2021-08-16 02:50:08 +02:00
from typing import Optional
from fastapi import FastAPI
2021-08-17 01:04:45 +02:00
from search_type import asymmetric
2021-08-17 03:52:38 +02:00
from processor . org_mode . org_to_jsonl import org_to_jsonl
from utils . helpers import is_none_or_empty
2021-08-16 04:09:50 +02:00
import argparse
import pathlib
2021-08-16 02:50:08 +02:00
import uvicorn
# FastAPI application instance: the route handlers in this file register on it
# via @app.get, and the script entry point passes it to uvicorn.run.
app = FastAPI ( )
@app.get('/search')
def search(q: str, n: Optional[int] = 5, t: Optional[str] = 'notes'):
    """Run a semantic search over the indexed notes.

    Reads the module-level search state (``corpus_embeddings``, ``entries``,
    ``bi_encoder``, ``cross_encoder``, ``top_k``) that the script entry point
    initializes before the server starts.

    Args:
        q: The user's search query.
        n: Result count, forwarded to ``asymmetric.collate_results``.
        t: Search type; only ``'notes'`` is handled.

    Returns:
        The value of ``asymmetric.collate_results`` for the matching hits, or
        an empty dict when the query is blank or the search type is not
        supported.
    """
    # Reject missing, empty and whitespace-only queries before touching the model.
    if q is None or not q.strip():
        print('No query param (q) passed in API call to initiate search')
        return {}

    # Only note search is implemented; any other type yields no results.
    if t != 'notes':
        return {}

    # query notes
    hits = asymmetric.query_notes(
        q,
        corpus_embeddings,
        entries,
        bi_encoder,
        cross_encoder,
        top_k)

    # collate and return results
    return asymmetric.collate_results(hits, entries, n)
2021-08-17 03:52:38 +02:00
@app.get('/regenerate')
def regenerate():
    """Rebuild the notes index from the input files and publish it to the live state.

    Uses the parsed command-line ``args`` and the ``bi_encoder`` set up by the
    script entry point. Embeddings are always recomputed (``regenerate=True``).

    Returns:
        A dict with ``status`` and ``message`` keys once regeneration completes.
    """
    global entries, corpus_embeddings

    # Generate Compressed JSONL from Notes in Input Files
    org_to_jsonl(args.input_files, args.input_filter, args.compressed_jsonl, args.verbose)

    # Extract Entries from Compressed JSONL
    extracted_entries = asymmetric.extract_entries(args.compressed_jsonl, args.verbose)

    # Compute Embeddings from Extracted Entries
    computed_embeddings = asymmetric.compute_embeddings(
        extracted_entries,
        bi_encoder,
        args.embeddings,
        regenerate=True,
        verbose=args.verbose)

    # Now Update State
    # Everything is built in locals first and swapped in only at the end, to
    # minimize the time the application is in an inconsistent, partially
    # updated state.
    entries, corpus_embeddings = extracted_entries, computed_embeddings

    return {'status': 'ok', 'message': 'regeneration completed'}
2021-08-17 03:52:38 +02:00
2021-08-16 02:50:08 +02:00
def _build_parser():
    """Create the command-line parser for the semantic search API server."""
    parser = argparse.ArgumentParser(description="Expose API for Semantic Search")
    parser.add_argument('--input-files', '-i', nargs='*', help="List of org-mode files to process")
    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for org-mode files to process")
    parser.add_argument('--compressed-jsonl', '-j', type=pathlib.Path, default=pathlib.Path(".notes.jsonl.gz"), help="Compressed JSONL formatted notes file to compute embeddings from")
    parser.add_argument('--embeddings', '-e', type=pathlib.Path, default=pathlib.Path(".notes_embeddings.pt"), help="File to save/load model embeddings to/from")
    parser.add_argument('--regenerate', action='store_true', default=False, help="Regenerate embeddings from org-mode files. Default: false")
    parser.add_argument('--verbose', action='count', default=0, help="Show verbose conversion logs. Default: 0")
    return parser


if __name__ == '__main__':
    # NOTE: the names assigned below (args, bi_encoder, cross_encoder, top_k,
    # entries, corpus_embeddings) are deliberately module-level: the route
    # handlers read them as globals.

    # Setup Argument Parser
    args = _build_parser().parse_args()

    # Input Validation
    if is_none_or_empty(args.input_files) and is_none_or_empty(args.input_filter):
        print("At least one of --input-files or --input-filter is required to be specified")
        # SystemExit instead of the site-provided exit(), which is meant for
        # interactive sessions.
        raise SystemExit(1)

    # Initialize Model
    bi_encoder, cross_encoder, top_k = asymmetric.initialize_model()

    # Map notes in Org-Mode files to (compressed) JSONL formatted file
    if not args.compressed_jsonl.exists() or args.regenerate:
        org_to_jsonl(args.input_files, args.input_filter, args.compressed_jsonl, args.verbose)

    # Extract Entries
    entries = asymmetric.extract_entries(args.compressed_jsonl, args.verbose)

    # Compute or Load Embeddings
    corpus_embeddings = asymmetric.compute_embeddings(
        entries,
        bi_encoder,
        args.embeddings,
        regenerate=args.regenerate,
        verbose=args.verbose)

    # Start Application Server
    uvicorn.run(app)