mirror of https://github.com/khoj-ai/khoj.git, synced 2025-02-17 08:04:21 +00:00

Merge pull request #25 from debanjum/users/debanjum/improve-semantic-search-on-ledger

Improve Extraction and Rendering of Semantic Search on Ledger

Commit 6a84ca965a: 3 changed files with 53 additions and 29 deletions
@@ -69,16 +69,20 @@
    (lambda (args) (format "\n** \n [[%s]]" (cdr (assoc 'Entry args))))
    json-response))))

-(defun semantic-search--extract-entries-as-ledger (json-response)
+(defun semantic-search--extract-entries-as-ledger (json-response query)
   "Convert json response from API to ledger entries"
   ;; remove leading (, ) or SPC from extracted entries string
+  (replace-regexp-in-string
+   "[\(\) ]$" ""
   (replace-regexp-in-string
    "^[\(\) ]" ""
    ;; extract entries from response as single string and convert to entries
-   (format "%s"
+   (format ";; %s\n\n%s\n"
+           query
            (mapcar
-            (lambda (args) (format "* %s" (cdr (assoc 'Entry args))))
-            json-response))))
+            (lambda (args)
+              (format "%s\n\n" (cdr (assoc 'Entry args))))
+            json-response)))))

 (defun semantic-search--buffer-name-to-search-type (buffer-name)
   (let ((file-extension (file-name-extension buffer-name)))
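The change above passes the search query through to the ledger extractor: results are now rendered with the query as a leading ;; comment and each entry separated by blank lines, instead of org-style "* entry" headings. As a rough, non-authoritative sketch (in Python, purely for illustration), assuming the API response is a list of objects with an Entry field, the rendering amounts to:

# Illustrative Python sketch of the rendering the updated elisp function produces.
# Assumes json_response is a list of dicts like {"Entry": "2021-08-01 * ..."} (hypothetical data).
def render_ledger_results(json_response, query):
    header = f";; {query}\n\n"                                    # query shown as a ledger comment
    body = "".join(f"{item['Entry']}\n\n" for item in json_response)
    return header + body

The elisp version additionally strips stray parentheses and spaces left over from printing the mapcar result, which the sketch above does not need.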
@@ -112,10 +116,11 @@
         (erase-buffer)
         (insert
          (cond ((or (equal search-type "notes") (equal search-type "music")) (semantic-search--extract-entries-as-org json-response query))
-               ((equal search-type "ledger") (semantic-search--extract-entries-as-ledger json-response))
+               ((equal search-type "ledger") (semantic-search--extract-entries-as-ledger json-response query))
                ((equal search-type "image") (semantic-search--extract-entries-as-images json-response query))
                (t (format "%s" json-response)))))
         (cond ((equal search-type "notes") (org-mode))
+              ((equal search-type "ledger") (beancount-mode))
               ((equal search-type "music") (progn (org-mode)
                                                   (org-music-mode)))
               ((equal search-type "image") (progn (org-mode)
@@ -6,6 +6,7 @@ import argparse
 import pathlib
 import glob
 import gzip
+import re

 # Internal Packages
 from src.processor.org_mode import orgnode
@@ -43,7 +44,8 @@ def dump_jsonl(jsonl_data, output_path, verbose=0):
         f.write(jsonl_data)

     if verbose > 0:
-        print(f'Wrote {len(jsonl_data)} lines to jsonl at {output_path}')
+        jsonl_entries = len(jsonl_data.split('\n'))
+        print(f'Wrote {jsonl_entries} lines to jsonl at {output_path}')


 def compress_jsonl_data(jsonl_data, output_path, verbose=0):
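The logging tweak above corrects what gets counted: len(jsonl_data) is the length of the serialized string in characters, while the number of JSONL records is the number of newline-separated lines. A minimal illustration:

# Minimal sketch of the difference between the old and new log counts.
jsonl_data = '{"a": 1}\n{"a": 2}'
print(len(jsonl_data))              # 17 -> characters in the string, not a record count
print(len(jsonl_data.split('\n')))  # 2  -> number of JSONL lines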
@@ -51,16 +53,31 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0):
         gzip_file.write(jsonl_data)

     if verbose > 0:
-        print(f'Wrote {len(jsonl_data)} lines to gzip compressed jsonl at {output_path}')
+        jsonl_entries = len(jsonl_data.split('\n'))
+        print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}')


 def load_jsonl(input_path, verbose=0):
     "Read List of JSON objects from JSON line file"
+    # Initialize Variables
     data = []
-    with open(get_absolute_path(input_path), 'r', encoding='utf-8') as f:
-        for line in f:
-            data.append(json.loads(line.rstrip('\n|\r')))
+    jsonl_file = None
+    escape_sequences = '\n|\r\t '
+
+    # Open JSONL file
+    if input_path.suffix == ".gz":
+        jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
+    elif input_path.suffix == ".jsonl":
+        jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
+
+    # Read JSONL file
+    for line in jsonl_file:
+        data.append(json.loads(line.strip(escape_sequences)))
+
+    # Close JSONL file
+    jsonl_file.close()
+
+    # Log JSONL entries loaded
     if verbose > 0:
         print(f'Loaded {len(data)} records from {input_path}')
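load_jsonl now chooses its opener from the file suffix, so plain and gzip-compressed JSONL go through one code path. Note it relies on input_path having a .suffix attribute, i.e. a pathlib.Path rather than a plain string; anything other than .jsonl or .gz leaves jsonl_file as None and fails on iteration. A usage sketch with hypothetical paths:

from pathlib import Path

# Hypothetical file paths; load_jsonl dispatches on the suffix.
plain_entries = load_jsonl(Path('ledger.jsonl'))        # read with open()
gzipped_entries = load_jsonl(Path('ledger.jsonl.gz'))   # read with gzip.open()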
@@ -79,7 +96,10 @@ def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbos

     all_beancount_files = absolute_beancount_files | filtered_beancount_files

-    files_with_non_beancount_extensions = {beancount_file for beancount_file in all_beancount_files if not beancount_file.endswith(".bean")}
+    files_with_non_beancount_extensions = {beancount_file
+                                           for beancount_file
+                                           in all_beancount_files
+                                           if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")}
     if any(files_with_non_beancount_extensions):
         print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")
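The warning now treats both .bean and .beancount as valid ledger extensions before flagging anything else. For example:

# Illustrative file names showing what the updated check flags.
all_beancount_files = {'finances.bean', 'personal.beancount', 'notes.org'}
flagged = {f for f in all_beancount_files
           if not f.endswith(".bean") and not f.endswith(".beancount")}
print(flagged)  # {'notes.org'}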
@@ -91,11 +111,19 @@ def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbos

 def extract_beancount_entries(beancount_files):
     "Extract entries from specified Beancount files"

+    # Initialize Regex for extracting Beancount Entries
+    date_regex = r'^\n?\d{4}-\d{2}-\d{2}'
+    empty_newline = r'^[\n\r\t ]*$'
+
     entries = []
     for beancount_file in beancount_files:
         with open(beancount_file) as f:
-            entries.extend(
-                f.read().split('\n\n'))
+            ledger_content = f.read()
+            entries.extend([entry.strip('\n|\r|\t| ')
+                            for entry
+                            in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
+                            if re.match(date_regex, entry)])

     return entries
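Rather than splitting on literal double newlines, entries are now split on any blank (whitespace-only) line and kept only if they begin with a date, which filters out non-transaction content such as option directives. A small sketch of the two regexes on made-up ledger text (relies on Python 3.7+, where re.split handles the zero-width blank-line matches):

import re

date_regex = r'^\n?\d{4}-\d{2}-\d{2}'
empty_newline = r'^[\n\r\t ]*$'

# Made-up ledger content: one option directive and two dated transactions.
ledger_content = (
    'option "title" "Personal"\n'
    '\n'
    '2021-08-01 * "Grocer" "Weekly shop"\n'
    '  Expenses:Food  20.00 USD\n'
    '\n'
    '2021-08-02 * "Landlord" "Rent"\n'
    '  Expenses:Rent  800.00 USD\n'
)

entries = [entry.strip('\n|\r|\t| ')
           for entry in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
           if re.match(date_regex, entry)]
print(len(entries))  # 2 -> only the dated transaction blocks are kept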
@@ -11,7 +11,7 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util

 # Internal Packages
 from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
-from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
+from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
@@ -40,18 +40,9 @@ def initialize_model(search_config: SymmetricSearchConfig):

 def extract_entries(notesfile, verbose=0):
     "Load entries from compressed jsonl"
-    entries = []
-    with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
-        for line in jsonl:
-            note = json.loads(line.strip())
-
-            note_string = f'{note["Title"]} \t {note["Tags"] if "Tags" in note else ""} \n {note["Body"] if "Body" in note else ""}'
-            entries.extend([note_string])
-
-    if verbose > 0:
-        print(f"Loaded {len(entries)} entries from {notesfile}")
-
-    return entries
+    return [f'{entry["Title"]}'
+            for entry
+            in load_jsonl(notesfile, verbose=verbose)]


 def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):
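With file handling delegated to load_jsonl, extract_entries in the ledger search module reduces to projecting the Title field out of each loaded record. Usage, with a hypothetical path:

from pathlib import Path

# Hypothetical compressed jsonl produced by the beancount processor.
entries = extract_entries(Path('ledger.jsonl.gz'), verbose=1)
# -> a list of the Title strings, one per JSONL record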