mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Update panchayat-to-jsonl changes to merge with master
This commit is contained in:
parent
f12ca56e93
commit
63f2312b84
3 changed files with 63 additions and 41 deletions
|
@ -74,7 +74,12 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
|||
# Initialize Panchayat Search
|
||||
if (t == SearchType.Panchayat or t == None) and config.content_type.panchayat:
|
||||
# Extract Entries, Generate Yaml Embeddings
|
||||
model.panchayat_search = text_search.setup(panchayat_to_jsonl, config.content_type.panchayat, search_config=config.search_type.asymmetric, regenerate=regenerate, verbose=verbose)
|
||||
model.panchayat_search = text_search.setup(
|
||||
panchayat_to_jsonl,
|
||||
config.content_type.panchayat,
|
||||
search_config=config.search_type.asymmetric,
|
||||
regenerate=regenerate,
|
||||
filters=[])
|
||||
|
||||
# Initialize Ledger Search
|
||||
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
|
||||
|
|
|
@ -2,8 +2,7 @@
|
|||
|
||||
# Standard Packages
|
||||
import json
|
||||
import argparse
|
||||
import pathlib
|
||||
import logging
|
||||
import glob
|
||||
import yaml
|
||||
|
||||
|
@ -11,37 +10,54 @@ import yaml
|
|||
from panchayat import vdb
|
||||
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from src.utils.rawconfig import TextContentConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def panchayat_constructor(loader, node):
|
||||
fields = loader.construct_mapping(node)
|
||||
return vdb.VDB(**fields)
|
||||
|
||||
|
||||
class VDBEntry():
|
||||
post_id: str
|
||||
body: str
|
||||
title: str
|
||||
author: str
|
||||
|
||||
def __init__(self, post_id, body, title, author):
|
||||
self.post_id = post_id
|
||||
self.body = body
|
||||
self.title = title
|
||||
self.author = author
|
||||
|
||||
|
||||
# Define Functions
|
||||
def panchayat_to_jsonl(yaml_files, yaml_file_filter, output_file, verbose=0):
|
||||
def panchayat_to_jsonl(config: TextContentConfig, previous_entries = None):
|
||||
|
||||
# Input Validation
|
||||
if is_none_or_empty(yaml_files) and is_none_or_empty(yaml_file_filter):
|
||||
print("At least one of markdown-files or markdown-file-filter is required to be specified")
|
||||
if is_none_or_empty(config.input_files) and is_none_or_empty(config.input_filter):
|
||||
print("At least one of input-files or input-file-filter is required to be specified")
|
||||
exit(1)
|
||||
|
||||
# Get Yaml Files to Process
|
||||
yaml_files = get_panchayat_files(yaml_files, yaml_file_filter, verbose)
|
||||
yaml_files = get_panchayat_files(config.input_files, config.input_filter)
|
||||
|
||||
output_file = config.compressed_jsonl
|
||||
|
||||
# Extract Entries from specified Yaml files
|
||||
entries = extract_panchayat_entries(yaml_files)
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_data = convert_panchayat_entries_to_jsonl(entries, verbose=verbose)
|
||||
jsonl_data = convert_panchayat_entries_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if output_file.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, output_file, verbose=verbose)
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
elif output_file.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, output_file, verbose=verbose)
|
||||
dump_jsonl(jsonl_data, output_file)
|
||||
|
||||
return entries
|
||||
return list(enumerate(entries))
|
||||
|
||||
|
||||
def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
|
||||
|
@ -50,9 +66,13 @@ def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
|
|||
if yaml_files:
|
||||
absolute_yaml_files = {get_absolute_path(yaml_file) for yaml_file in yaml_files}
|
||||
if yaml_file_filter:
|
||||
filtered_yaml_files = set(glob.glob(get_absolute_path(yaml_file_filter)))
|
||||
filtered_yaml_files = {
|
||||
filtered_file
|
||||
for filter in yaml_file_filter
|
||||
for filtered_file in glob.glob(get_absolute_path(filter))
|
||||
}
|
||||
|
||||
all_yaml_files = absolute_yaml_files | filtered_yaml_files
|
||||
all_yaml_files = sorted(absolute_yaml_files | filtered_yaml_files)
|
||||
|
||||
files_with_non_yaml_extensions = {
|
||||
yaml_file
|
||||
|
@ -62,7 +82,7 @@ def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
|
|||
}
|
||||
|
||||
if any(files_with_non_yaml_extensions):
|
||||
print(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_yaml_extensions}")
|
||||
logger.warn(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_yaml_extensions}")
|
||||
|
||||
if verbose > 0:
|
||||
print(f'Processing files: {all_yaml_files}')
|
||||
|
@ -77,7 +97,6 @@ def extract_panchayat_entries(yaml_files):
|
|||
for yaml_file in yaml_files:
|
||||
with open(yaml_file) as f:
|
||||
|
||||
# try:
|
||||
raw_data = yaml.load(f, Loader=yaml.UnsafeLoader)
|
||||
|
||||
seen_ids = set()
|
||||
|
@ -87,11 +106,18 @@ def extract_panchayat_entries(yaml_files):
|
|||
for subpost in all_subposts:
|
||||
if subpost.post_id not in seen_ids:
|
||||
seen_ids.add(subpost.post_id)
|
||||
entry = {
|
||||
"post_id": subpost.post_id,
|
||||
"author": subpost.author.username,
|
||||
"title": subpost.title,
|
||||
"body": subpost.body}
|
||||
|
||||
# entry = VDBEntry(post_id=subpost.post_id, body=subpost.body, title=subpost.title, author=subpost.author)
|
||||
# entry = {
|
||||
# 'post_id': subpost.post_id,
|
||||
# 'body': subpost.body,
|
||||
# 'title': subpost.title,
|
||||
# 'author': subpost.author.username
|
||||
# }
|
||||
entry = dict()
|
||||
entry['compiled'] = f'body: {subpost.body} author: {subpost.author.username} title: {subpost.title}'
|
||||
entry['raw'] = subpost.post_id
|
||||
# entry = f"""body: {subpost.body} title: {subpost.title} author: {subpost.author.username}"""
|
||||
entries.append(entry)
|
||||
|
||||
return entries
|
||||
|
@ -99,26 +125,17 @@ def extract_panchayat_entries(yaml_files):
|
|||
|
||||
def convert_panchayat_entries_to_jsonl(entries, verbose=0):
|
||||
"Convert each Panchayat Yaml entry to JSON and collate as JSONL"
|
||||
jsonl = ''
|
||||
for entry in entries:
|
||||
entry_dict = {'compiled': entry, 'raw': entry["post_id"]}
|
||||
# Convert Dictionary to JSON and Append to JSONL string
|
||||
jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
|
||||
|
||||
if verbose > 0:
|
||||
print(f"Converted {len(entries)} to jsonl format")
|
||||
|
||||
return jsonl
|
||||
# jsonl = ''
|
||||
# for entry in entries:
|
||||
# entry_dict = {'compiled': f'body: {entry["body"]} author: {entry["author"]} title: {entry["title"]}', 'raw': entry["post_id"]}
|
||||
# # Convert Dictionary to JSON and Append to JSONL string
|
||||
# jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Setup Argument Parser
|
||||
parser = argparse.ArgumentParser(description="Map Yaml entries into (compressed) JSONL format")
|
||||
parser.add_argument('--output-file', '-o', type=pathlib.Path, required=True, help="Output file for (compressed) JSONL formatted notes. Expected file extensions: jsonl or jsonl.gz")
|
||||
parser.add_argument('--input-files', '-i', nargs='*', help="List of yaml files to process")
|
||||
parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for yaml files to process")
|
||||
parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs, Default: 0")
|
||||
args = parser.parse_args()
|
||||
# if verbose > 0:
|
||||
# logger.info(f"Converted {len(entries)} to jsonl format")
|
||||
|
||||
# return jsonl
|
||||
|
||||
return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
|
||||
|
||||
# Map notes in Yaml files to (compressed) JSONL formatted file
|
||||
panchayat_to_jsonl(args.input_files, args.input_filter, args.output_file, args.verbose)
|
||||
|
|
|
@ -112,7 +112,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
|
|||
if (t == SearchType.Panchayat or t == None) and state.model.panchayat_search:
|
||||
# query Panchayat yaml files
|
||||
query_start = time.time()
|
||||
hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r, filters=[ExplicitFilter(), DateFilter()], verbose=state.verbose)
|
||||
hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r)
|
||||
query_end = time.time()
|
||||
|
||||
# collate and return results
|
||||
|
|
Loading…
Reference in a new issue