Decouple results shown to user and text the model is trained on
- Previously: the text the model was trained on was used to re-create a semblance of the original org-mode entry.
- Now:
  - Store the raw entry as another key:value pair in each entry's JSON.
  - Only return actual raw org entries in results.
  - Still create embeddings from the compiled entry text, as before.
- Also add a link to each entry, in file:<filename>::<line_number> form, in the property drawer of returned results. This can be used to jump to the actual entry in its original file.
This commit is contained in:
parent 7ee3007070
commit f4bde75249

4 changed files with 20 additions and 20 deletions
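
Taken together, the hunks below change the in-memory entry representation from a plain string to an [embedding_text, raw_text] pair: index 0 feeds the encoders, index 1 is what results return. A minimal sketch of the new flow, assuming the sentence-transformers library and an illustrative model name (the note fields and values are hypothetical, not khoj's actual data):

# Sketch only, not the actual khoj code: each entry pairs the compiled
# text used for embeddings (index 0) with the raw org-mode text that is
# returned to the user (index 1).
from sentence_transformers import SentenceTransformer  # assumed dependency

notes = [{"Title": "Demo", "Body": "Some body text",
          "Raw": "* Demo\nSome body text"}]  # hypothetical parsed notes

# Index 0: compiled string the model sees; index 1: raw entry shown to users.
entries = [[f'{note["Title"]}\n{note["Body"]}', note["Raw"]] for note in notes]

bi_encoder = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model
corpus_embeddings = bi_encoder.encode(
    [entry[0] for entry in entries],  # embed only the compiled text
    convert_to_tensor=True)

print(entries[0][1])  # results surface the raw org entry, stars and all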
@@ -48,7 +48,7 @@
 ;; extract entries from response as single string and convert to entries
 (format "%s"
         (mapcar
-          (lambda (args) (format "* %s" (cdr (assoc 'Entry args))))
+          (lambda (args) (format "%s" (cdr (assoc 'Entry args))))
          json-response))))

 (defun semantic-search--extract-entries-as-ledger (json-response)
@@ -120,15 +120,9 @@ def convert_org_entries_to_jsonl(entries, verbose=0):
         if verbose > 2:
             print(f"Body: {entry.Body()}")

-        for property_key in ('ID', 'QUERY', 'TYPE', 'CATEGORY'):
-            if entry.Property(property_key):
-                if 'Property' not in entry_dict:
-                    entry_dict['Property'] = dict()
-                entry_dict['Property'][property_key] = entry.Property(property_key)
-                if verbose > 2:
-                    print(f'Property: {entry_dict["PROPERTY"][property_key]}')
-
         if entry_dict:
+            entry_dict["Raw"] = f'{entry}'
+
             # Convert Dictionary to JSON and Append to JSONL string
             jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'

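After this hunk, every JSONL record carries the verbatim org text under a new "Raw" key next to the compiled fields. A hypothetical record (all values illustrative):

# Illustrative only: the shape of one JSONL record after this change.
import json

record = {
    "Title": "Demo heading",
    "Body": "Some body text\n",
    "Raw": "* Demo heading\nSome body text\n",  # new: verbatim org entry
}
print(json.dumps(record, ensure_ascii=False))  # one line of the .jsonl file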
@@ -77,7 +77,7 @@ def makelist(filename):
                     deadline_date = ''
                 thisNode.setProperties(propdict)
                 nodelist.append( thisNode )
-            propdict = dict()
+            propdict = {'SOURCE': f'file:{filename}::{ctr}'}
             level = hdng.group(1)
             heading = hdng.group(2)
             bodytext = ""
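The replacement seeds each heading's property dictionary with a SOURCE entry in Emacs' file:PATH::LINE link syntax, pointing at the line where the heading starts. For example (path and line number hypothetical):

# Hypothetical values showing the SOURCE link format built above.
filename, ctr = "~/notes/journal.org", 42
print(f'file:{filename}::{ctr}')  # -> file:~/notes/journal.org::42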
@@ -325,8 +325,14 @@ class Orgnode(object):
                 n = n + ':' + t
                 closecolon = ':'
             n = n + closecolon
-        # Need to output Scheduled Date, Deadline Date, property tags The
-        # following will output the text used to construct the object
-        n = n + "\n" + self.body
+        # Need to output Scheduled Date, Deadline Date, property tags The
+        # following will output the text used to construct the object
+        n = n + "\n"
+        n = n + ":PROPERTIES:\n"
+        for key, value in self.properties.items():
+            n = n + f":{key}: {value}\n"
+        n = n + ":END:\n"
+
+        n = n + self.body

         return n
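With the drawer emitted, the node's text representation now embeds its properties between the heading and the body, so the SOURCE link travels with each returned result. A standalone sketch of the same loop, with hypothetical heading, properties, and body:

# Mirrors the drawer-rendering loop above, with hypothetical values.
properties = {"SOURCE": "file:~/notes/journal.org::42", "ID": "abc-123"}
n = "* Demo heading"
n = n + "\n"
n = n + ":PROPERTIES:\n"
for key, value in properties.items():
    n = n + f":{key}: {value}\n"
n = n + ":END:\n"
n = n + "Some body text\n"
print(n)
# * Demo heading
# :PROPERTIES:
# :SOURCE: file:~/notes/journal.org::42
# :ID: abc-123
# :END:
# Some body text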
@@ -39,7 +39,7 @@ def extract_entries(notesfile, verbose=0):
             continue

         note_string = f'{note["Title"]}\t{note["Tags"] if "Tags" in note else ""}\n{note["Body"] if "Body" in note else ""}'
-        entries.extend([note_string])
+        entries.append([note_string, note["Raw"]])

     if verbose > 0:
         print(f"Loaded {len(entries)} entries from {notesfile}")
@@ -56,7 +56,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, v
         print(f"Loaded embeddings from {embeddings_file}")

     else:  # Else compute the corpus_embeddings from scratch, which can take a while
-        corpus_embeddings = bi_encoder.encode(entries, convert_to_tensor=True, show_progress_bar=True)
+        corpus_embeddings = bi_encoder.encode([entry[0] for entry in entries], convert_to_tensor=True, show_progress_bar=True)
         torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
         if verbose > 0:
             print(f"Computed embeddings and save them to {embeddings_file}")
@@ -79,12 +79,12 @@ def query_notes(raw_query, corpus_embeddings, entries, bi_encoder, cross_encoder
     hits = hits[0]  # Get the hits for the first query

     # Filter results using explicit filters
-    hits = explicit_filter(hits, entries, required_words, blocked_words)
+    hits = explicit_filter(hits, [entry[0] for entry in entries], required_words, blocked_words)
     if hits is None or len(hits) == 0:
         return hits

     # Score all retrieved entries using the cross-encoder
-    cross_inp = [[query, entries[hit['corpus_id']]] for hit in hits]
+    cross_inp = [[query, entries[hit['corpus_id']][0]] for hit in hits]
     cross_scores = cross_encoder.predict(cross_inp)

     # Store cross-encoder scores in results dictionary for ranking
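Retrieval, filtering, and re-ranking all keep operating on the compiled text at index 0; only the final payload switches to the raw entry. A minimal cross-encoder sketch using sentence-transformers (model name and data hypothetical):

# Sketch of re-ranking against the compiled text only.
from sentence_transformers import CrossEncoder  # assumed dependency

query = "demo"  # hypothetical inputs: entries are [compiled, raw] pairs
entries = [["Demo heading\nSome body text", "* Demo heading\nSome body text"]]
hits = [{"corpus_id": 0}]

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # illustrative
cross_inp = [[query, entries[hit["corpus_id"]][0]] for hit in hits]  # compiled text
cross_scores = cross_encoder.predict(cross_inp)  # one relevance score per hit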
@@ -127,20 +127,20 @@ def render_results(hits, entries, count=5, display_biencoder_results=False):
         print(f"Top-{count} Bi-Encoder Retrieval hits")
         hits = sorted(hits, key=lambda x: x['score'], reverse=True)
         for hit in hits[0:count]:
-            print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']]}")
+            print(f"Score: {hit['score']:.3f}\n------------\n{entries[hit['corpus_id']][0]}")

     # Output of top hits from re-ranker
     print("\n-------------------------\n")
     print(f"Top-{count} Cross-Encoder Re-ranker hits")
     hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
     for hit in hits[0:count]:
-        print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']]}")
+        print(f"CrossScore: {hit['cross-score']:.3f}\n-----------------\n{entries[hit['corpus_id']][0]}")


 def collate_results(hits, entries, count=5):
     return [
         {
-            "Entry": entries[hit['corpus_id']],
+            "Entry": entries[hit['corpus_id']][1],
             "Score": f"{hit['cross-score']:.3f}"
         }
         for hit
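End to end, collate_results now returns the raw org entry under "Entry", which is why the Emacs client in the first hunk dropped its "* %s" re-formatting: the raw text already starts with its heading stars. A hypothetical response item:

# Hypothetical collated result: "Entry" is the raw org text, heading
# stars and property drawer included, so clients render it verbatim.
result = {
    "Entry": "* Demo heading\n:PROPERTIES:\n:SOURCE: file:~/notes/journal.org::42\n:END:\nSome body text\n",
    "Score": "0.723",
}
print(result["Entry"])  # no "* %s" prefixing needed on the client side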