diff --git a/main.py b/main.py
index 697ab20b..20c86acd 100644
--- a/main.py
+++ b/main.py
@@ -37,9 +37,9 @@ def search(q: str, n: Optional[int] = 5, t: Optional[str] = 'notes'):
 if __name__ == '__main__':
     # Setup Argument Parser
     parser = argparse.ArgumentParser(description="Expose API for Semantic Search")
-    parser.add_argument('--compressed-jsonl', '-j', required=True, type=pathlib.Path, help="Compressed JSONL formatted notes file to compute embeddings from")
-    parser.add_argument('--embeddings', '-e', required=True, type=pathlib.Path, help="File to save/load model embeddings to/from")
-    parser.add_argument('--verbose', action='store_true', default=False, help="Show verbose conversion logs. Default: false")
+    parser.add_argument('--compressed-jsonl', '-j', type=pathlib.Path, default=pathlib.Path(".notes.jsonl.gz"), help="Compressed JSONL formatted notes file to compute embeddings from")
+    parser.add_argument('--embeddings', '-e', type=pathlib.Path, default=pathlib.Path(".notes_embeddings.pt"), help="File to save/load model embeddings to/from")
+    parser.add_argument('--verbose', action='count', help="Show verbose conversion logs. Default: 0")
     args = parser.parse_args()
 
     # Initialize Model
diff --git a/processor/org_mode/org_to_jsonl.py b/processor/org_mode/org_to_jsonl.py
index 9e786abb..8e152c4b 100644
--- a/processor/org_mode/org_to_jsonl.py
+++ b/processor/org_mode/org_to_jsonl.py
@@ -11,9 +11,9 @@ import gzip
 
 
 # Define Functions
-def org_to_jsonl(org_files, org_file_filter, output_path, verbose=False):
+def org_to_jsonl(org_files, org_file_filter, output_path, verbose=0):
     # Get Org Files to Process
-    org_files = get_org_files(args.input_files, args.input_filter)
+    org_files = get_org_files(args.input_files, args.input_filter, verbose)
 
     # Extract Entries from specified Org files
     entries = extract_org_entries(org_files)
@@ -59,7 +59,7 @@ def load_jsonl(input_path, verbose=0):
 
     return data
 
-def get_org_files(org_files=None, org_file_filter=None):
+def get_org_files(org_files=None, org_file_filter=None, verbose=0):
     "Get Org files to process"
     absolute_org_files, filtered_org_files = set(), set()
     if org_files:
@@ -75,7 +75,7 @@ def get_org_files(org_files=None, org_file_filter=None):
     if any(files_with_non_org_extensions):
         print(f"[Warning] There maybe non org-mode files in the input set: {files_with_non_org_extensions}")
 
-    if args.verbose:
+    if args.verbose > 0:
         print(f'Processing files: {all_org_files}')
 
     return all_org_files
diff --git a/search_type/asymmetric.py b/search_type/asymmetric.py
index 74078563..aecaa775 100644
--- a/search_type/asymmetric.py
+++ b/search_type/asymmetric.py
@@ -20,7 +20,7 @@ def initialize_model():
 
     return bi_encoder, cross_encoder, top_k
 
-def extract_entries(notesfile, verbose=False):
+def extract_entries(notesfile, verbose=0):
     "Load entries from compressed jsonl"
     entries = []
     with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
@@ -34,24 +34,24 @@
             note_string = f'{note["Title"]}\t{note["Tags"] if "Tags" in note else ""}\n{note["Body"] if "Body" in note else ""}'
             entries.extend([note_string])
 
-    if verbose:
+    if verbose > 0:
         print(f"Loaded {len(entries)} entries from {notesfile}")
 
     return entries
 
 
-def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=False):
+def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):
     "Compute (and Save) Embeddings or Load Pre-Computed Embeddings"
     # Load pre-computed embeddings from file if exists
     if embeddings_file.exists() and not regenerate:
         corpus_embeddings = torch.load(get_absolute_path(embeddings_file))
-        if verbose:
+        if verbose > 0:
             print(f"Loaded embeddings from {embeddings_file}")
     else:
         # Else compute the corpus_embeddings from scratch, which can take a while
         corpus_embeddings = bi_encoder.encode(entries, convert_to_tensor=True, show_progress_bar=True)
         torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
-        if verbose:
+        if verbose > 0:
             print(f"Computed embeddings and save them to {embeddings_file}")
 
     return corpus_embeddings
@@ -147,7 +147,7 @@ if __name__ == '__main__':
     parser.add_argument('--embeddings', '-e', required=True, type=pathlib.Path, help="File to save/load model embeddings to/from")
     parser.add_argument('--results-count', '-n', default=5, type=int, help="Number of results to render. Default: 5")
     parser.add_argument('--interactive', action='store_true', default=False, help="Interactive mode allows user to run queries on the model. Default: true")
-    parser.add_argument('--verbose', action='store_true', default=False, help="Show verbose conversion logs. Default: false")
+    parser.add_argument('--verbose', action='count', help="Show verbose conversion logs. Default: 0")
     args = parser.parse_args()
 
     # Initialize Model