From c35c6fb0b301900046b2aae4b9a96e96a36f8226 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Mon, 16 Aug 2021 23:58:24 -0700
Subject: [PATCH] Reuse asymmetric.setup & input validation from asymmetric &
 org_to_jsonl

Create asymmetric.setup method to
- initialize model
- generate compressed jsonl
- compute embeddings

Put input_files, input_file_filter validation in org_to_jsonl for reuse
in main.py, asymmetric.py
---
 main.py                            | 28 +++-------------------------
 processor/org_mode/org_to_jsonl.py | 10 +++++-----
 search_type/asymmetric.py          | 35 +++++++++++++++++++++----------
 3 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/main.py b/main.py
index ed966bb6..8cb4169e 100644
--- a/main.py
+++ b/main.py
@@ -38,14 +38,8 @@ def search(q: str, n: Optional[int] = 5, t: Optional[str] = 'notes'):
 
 @app.get('/regenerate')
 def regenerate():
-    # Generate Compressed JSONL from Notes in Input Files
-    org_to_jsonl(args.input_files, args.input_filter, args.compressed_jsonl, args.verbose)
-
-    # Extract Entries from Compressed JSONL
-    extracted_entries = asymmetric.extract_entries(args.compressed_jsonl, args.verbose)
-
-    # Compute Embeddings from Extracted Entries
-    computed_embeddings = asymmetric.compute_embeddings(extracted_entries, bi_encoder, args.embeddings, regenerate=True, verbose=args.verbose)
+    # Extract Entries, Generate Embeddings
+    extracted_entries, computed_embeddings, _, _, _ = asymmetric.setup(args.input_files, args.input_filter, args.compressed_jsonl, args.embeddings, regenerate=True, verbose=args.verbose)
 
     # Now Update State
     # update state variables after regeneration complete
@@ -69,23 +63,7 @@ if __name__ == '__main__':
     parser.add_argument('--verbose', action='count', default=0, help="Show verbose conversion logs. Default: 0")
     args = parser.parse_args()
 
-    # Input Validation
-    if is_none_or_empty(args.input_files) and is_none_or_empty(args.input_filter):
-        print("At least one of org-files or org-file-filter is required to be specified")
-        exit(1)
-
-    # Initialize Model
-    bi_encoder, cross_encoder, top_k = asymmetric.initialize_model()
-
-    # Map notes in Org-Mode files to (compressed) JSONL formatted file
-    if not args.compressed_jsonl.exists() or args.regenerate:
-        org_to_jsonl(args.input_files, args.input_filter, args.compressed_jsonl, args.verbose)
-
-    # Extract Entries
-    entries = asymmetric.extract_entries(args.compressed_jsonl, args.verbose)
-
-    # Compute or Load Embeddings
-    corpus_embeddings = asymmetric.compute_embeddings(entries, bi_encoder, args.embeddings, regenerate=args.regenerate, verbose=args.verbose)
+    entries, corpus_embeddings, bi_encoder, cross_encoder, top_k = asymmetric.setup(args.input_files, args.input_filter, args.compressed_jsonl, args.embeddings, args.regenerate, args.verbose)
 
     # Start Application Server
     uvicorn.run(app)
diff --git a/processor/org_mode/org_to_jsonl.py b/processor/org_mode/org_to_jsonl.py
index ad6039ae..0fb009c0 100644
--- a/processor/org_mode/org_to_jsonl.py
+++ b/processor/org_mode/org_to_jsonl.py
@@ -12,6 +12,11 @@ import gzip
 
 # Define Functions
 def org_to_jsonl(org_files, org_file_filter, output_file, verbose=0):
+    # Input Validation
+    if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
+        print("At least one of org-files or org-file-filter is required to be specified")
+        exit(1)
+
     # Get Org Files to Process
     org_files = get_org_files(org_files, org_file_filter, verbose)
 
@@ -132,10 +137,5 @@ if __name__ == '__main__':
     parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs, Default: 0")
     args = parser.parse_args()
 
-    # Input Validation
-    if is_none_or_empty(args.input_files) and is_none_or_empty(args.input_filter):
-        print("At least one of org-files or org-file-filter is required to be specified")
-        exit(1)
-
     # Map notes in Org-Mode files to (compressed) JSONL formatted file
     org_to_jsonl(args.input_files, args.input_filter, args.output_file, args.verbose)
diff --git a/search_type/asymmetric.py b/search_type/asymmetric.py
index 2dd54f9a..4bdab236 100644
--- a/search_type/asymmetric.py
+++ b/search_type/asymmetric.py
@@ -11,6 +11,8 @@ import torch
 import argparse
 import pathlib
 from utils.helpers import get_absolute_path
+from processor.org_mode.org_to_jsonl import org_to_jsonl
+
 
 def initialize_model():
     "Initialize model for assymetric semantic search. That is, where query smaller than results"
@@ -140,24 +142,37 @@ def collate_results(hits, entries, count=5):
         in hits[0:count]]
 
 
+def setup(input_files, input_filter, compressed_jsonl, embeddings, regenerate=False, verbose=False):
+    # Initialize Model
+    bi_encoder, cross_encoder, top_k = initialize_model()
+
+    # Map notes in Org-Mode files to (compressed) JSONL formatted file
+    if not compressed_jsonl.exists() or regenerate:
+        org_to_jsonl(input_files, input_filter, compressed_jsonl, verbose)
+
+    # Extract Entries
+    entries = extract_entries(compressed_jsonl, verbose)
+
+    # Compute or Load Embeddings
+    corpus_embeddings = compute_embeddings(entries, bi_encoder, embeddings, regenerate=regenerate, verbose=verbose)
+
+    return entries, corpus_embeddings, bi_encoder, cross_encoder, top_k
+
+
 if __name__ == '__main__':
     # Setup Argument Parser
     parser = argparse.ArgumentParser(description="Map Org-Mode notes into (compressed) JSONL format")
-    parser.add_argument('--compressed-jsonl', '-j', required=True, type=pathlib.Path, help="Input file for compressed JSONL formatted notes to compute embeddings from")
-    parser.add_argument('--embeddings', '-e', required=True, type=pathlib.Path, help="File to save/load model embeddings to/from")
+    parser.add_argument('--input-files', '-i', nargs='*', help="List of org-mode files to process")
+    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for org-mode files to process")
+    parser.add_argument('--compressed-jsonl', '-j', type=pathlib.Path, default=pathlib.Path(".notes.jsonl.gz"), help="Compressed JSONL formatted notes file to compute embeddings from")
+    parser.add_argument('--embeddings', '-e', type=pathlib.Path, default=pathlib.Path(".notes_embeddings.pt"), help="File to save/load model embeddings to/from")
+    parser.add_argument('--regenerate', action='store_true', default=False, help="Regenerate embeddings from org-mode files. Default: false")
     parser.add_argument('--results-count', '-n', default=5, type=int, help="Number of results to render. Default: 5")
    parser.add_argument('--interactive', action='store_true', default=False, help="Interactive mode allows user to run queries on the model. Default: true")
     parser.add_argument('--verbose', action='count', default=0, help="Show verbose conversion logs. Default: 0")
     args = parser.parse_args()
 
-    # Initialize Model
-    bi_encoder, cross_encoder, top_k = initialize_model()
-
-    # Extract Entries
-    entries = extract_entries(args.compressed_jsonl, args.verbose)
-
-    # Compute or Load Embeddings
-    corpus_embeddings = compute_embeddings(entries, bi_encoder, args.embeddings, args.verbose)
+    entries, corpus_embeddings, bi_encoder, cross_encoder, top_k = setup(args.input_files, args.input_filter, args.compressed_jsonl, args.embeddings, args.regenerate, args.verbose)
 
     # Run User Queries on Entries in Interactive Mode
     while args.interactive: