Mirror of https://github.com/khoj-ai/khoj.git, synced 2024-11-30 10:53:02 +01:00
Update panchayat-to-jsonl changes to merge with master

Commit 63f2312b84 (parent f12ca56e93)
3 changed files with 63 additions and 41 deletions
@@ -74,7 +74,12 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     # Initialize Panchayat Search
     if (t == SearchType.Panchayat or t == None) and config.content_type.panchayat:
         # Extract Entries, Generate Yaml Embeddings
-        model.panchayat_search = text_search.setup(panchayat_to_jsonl, config.content_type.panchayat, search_config=config.search_type.asymmetric, regenerate=regenerate, verbose=verbose)
+        model.panchayat_search = text_search.setup(
+            panchayat_to_jsonl,
+            config.content_type.panchayat,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[])

     # Initialize Ledger Search
     if (t == SearchType.Ledger or t == None) and config.content_type.ledger:

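The guard on this branch (and on the ledger branch below it) treats t == None as "initialize every configured content type". A small self-contained sketch of that dispatch rule, using a stand-in enum because the real SearchType and config objects live elsewhere in the repo:

    from enum import Enum

    # Stand-in for the project's SearchType enum; only the members this diff touches.
    class SearchType(str, Enum):
        Panchayat = "panchayat"
        Ledger = "ledger"

    def should_initialize_panchayat(t, panchayat_configured=True):
        # Same shape as the guard in the hunk above: set up panchayat search when it
        # was requested explicitly, or when no type was requested at all.
        return (t == SearchType.Panchayat or t == None) and panchayat_configured

    print(should_initialize_panchayat(None))               # True: no type given, set up everything configured
    print(should_initialize_panchayat(SearchType.Ledger))  # False: a different type was requested
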
@@ -2,8 +2,7 @@
 # Standard Packages
 import json
-import argparse
-import pathlib
+import logging
 import glob
 import yaml

@@ -11,37 +10,54 @@ import yaml
 from panchayat import vdb
 from src.utils.helpers import get_absolute_path, is_none_or_empty
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
+from src.utils.rawconfig import TextContentConfig

+logger = logging.getLogger(__name__)

 def panchayat_constructor(loader, node):
     fields = loader.construct_mapping(node)
     return vdb.VDB(**fields)


+class VDBEntry():
+    post_id: str
+    body: str
+    title: str
+    author: str
+
+    def __init__(self, post_id, body, title, author):
+        self.post_id = post_id
+        self.body = body
+        self.title = title
+        self.author = author
+
+
 # Define Functions
-def panchayat_to_jsonl(yaml_files, yaml_file_filter, output_file, verbose=0):
+def panchayat_to_jsonl(config: TextContentConfig, previous_entries = None):

     # Input Validation
-    if is_none_or_empty(yaml_files) and is_none_or_empty(yaml_file_filter):
-        print("At least one of markdown-files or markdown-file-filter is required to be specified")
+    if is_none_or_empty(config.input_files) and is_none_or_empty(config.input_filter):
+        print("At least one of input-files or input-file-filter is required to be specified")
         exit(1)

     # Get Markdown Files to Process
-    yaml_files = get_panchayat_files(yaml_files, yaml_file_filter, verbose)
+    yaml_files = get_panchayat_files(config.input_files, config.input_filter)
+
+    output_file = config.compressed_jsonl

     # Extract Entries from specified Markdown files
     entries = extract_panchayat_entries(yaml_files)

     # Process Each Entry from All Notes Files
-    jsonl_data = convert_panchayat_entries_to_jsonl(entries, verbose=verbose)
+    jsonl_data = convert_panchayat_entries_to_jsonl(entries)

     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
-        compress_jsonl_data(jsonl_data, output_file, verbose=verbose)
+        compress_jsonl_data(jsonl_data, output_file)
     elif output_file.suffix == ".jsonl":
-        dump_jsonl(jsonl_data, output_file, verbose=verbose)
+        dump_jsonl(jsonl_data, output_file)

-    return entries
+    return list(enumerate(entries))


 def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):

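With this change, panchayat_to_jsonl is driven by a single TextContentConfig instead of separate file, filter, output, and verbose arguments. A hedged sketch of a call site, assuming the function from the hunk above is in scope and using types.SimpleNamespace as a stand-in for TextContentConfig with only the three fields the new code actually reads (input_files, input_filter, compressed_jsonl); the paths are made up:

    from pathlib import Path
    from types import SimpleNamespace

    # Stand-in for TextContentConfig; only the fields read by panchayat_to_jsonl above
    # are populated, and the paths are hypothetical.
    config = SimpleNamespace(
        input_files=None,
        input_filter=["~/panchayat/dump/*.yml"],
        compressed_jsonl=Path("~/.khoj/content/panchayat.jsonl.gz").expanduser(),
    )

    # The new signature drops the explicit output file and verbose flag; the return
    # value is now a list of (id, entry) tuples rather than the bare entries list.
    entries_with_ids = panchayat_to_jsonl(config)
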
@@ -50,9 +66,13 @@ def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
     if yaml_files:
         absolute_yaml_files = {get_absolute_path(yaml_file) for yaml_file in yaml_files}
     if yaml_file_filter:
-        filtered_yaml_files = set(glob.glob(get_absolute_path(yaml_file_filter)))
+        filtered_yaml_files = {
+            filtered_file
+            for filter in yaml_file_filter
+            for filtered_file in glob.glob(get_absolute_path(filter))
+        }

-    all_yaml_files = absolute_yaml_files | filtered_yaml_files
+    all_yaml_files = sorted(absolute_yaml_files | filtered_yaml_files)

     files_with_non_yaml_extensions = {
         yaml_file

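The filter handling changes from a single glob string to a list of glob patterns, and the merged file set is now sorted. A self-contained illustration of the same double comprehension, with os.path.expanduser standing in for the project's get_absolute_path helper and made-up patterns:

    import glob
    import os

    # Hypothetical patterns; after this commit, the filter argument is a list.
    yaml_file_filter = ["~/notes/*.yml", "~/posts/*.yml"]

    # Same double comprehension as the hunk above: expand each pattern, then merge
    # all matches into one de-duplicated set.
    filtered_yaml_files = {
        filtered_file
        for pattern in yaml_file_filter
        for filtered_file in glob.glob(os.path.expanduser(pattern))
    }

    # sorted() mirrors the switch from a plain set union to a sorted list, which
    # makes the downstream processing order deterministic.
    print(sorted(filtered_yaml_files))
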
@@ -62,7 +82,7 @@ def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
     }

     if any(files_with_non_yaml_extensions):
-        print(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_yaml_extensions}")
+        logger.warn(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_yaml_extensions}")

     if verbose > 0:
         print(f'Processing files: {all_yaml_files}')

@@ -77,7 +97,6 @@ def extract_panchayat_entries(yaml_files):
     for yaml_file in yaml_files:
         with open(yaml_file) as f:
-            # try:
             raw_data = yaml.load(f, Loader=yaml.UnsafeLoader)

             seen_ids = set()

@@ -87,11 +106,18 @@ def extract_panchayat_entries(yaml_files):
             for subpost in all_subposts:
                 if subpost.post_id not in seen_ids:
                     seen_ids.add(subpost.post_id)
-                    entry = {
-                        "post_id": subpost.post_id,
-                        "author": subpost.author.username,
-                        "title": subpost.title,
-                        "body": subpost.body}
+                    # entry = VDBEntry(post_id=subpost.post_id, body=subpost.body, title=subpost.title, author=subpost.author)
+                    # entry = {
+                    #     'post_id': subpost.post_id,
+                    #     'body': subpost.body,
+                    #     'title': subpost.title,
+                    #     'author': subpost.author.username
+                    # }
+                    entry = dict()
+                    entry['compiled'] = f'body: {subpost.body} author: {subpost.author.username} title: {subpost.title}'
+                    entry['raw'] = subpost.post_id
+                    # entry = f"""body: {subpost.body} title: {subpost.title} author: {subpost.author.username}"""
                     entries.append(entry)

     return entries

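The extracted entries change shape here: instead of a dict mirroring the post fields, each entry now carries a 'compiled' string for embedding and a 'raw' post id for lookup. A runnable sketch of that shape with a made-up subpost (the real objects are panchayat vdb posts whose author has a username attribute):

    from types import SimpleNamespace

    # Hypothetical subpost; field names follow the attribute accesses in the hunk above.
    subpost = SimpleNamespace(
        post_id="abc123",
        body="Minutes of the village meeting",
        title="Gram sabha notes",
        author=SimpleNamespace(username="asra"),
    )

    # New entry shape: searchable text under 'compiled', the post id under 'raw'.
    entry = dict()
    entry['compiled'] = f'body: {subpost.body} author: {subpost.author.username} title: {subpost.title}'
    entry['raw'] = subpost.post_id

    print(entry)
    # {'compiled': 'body: Minutes of the village meeting author: asra title: Gram sabha notes', 'raw': 'abc123'}
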
@@ -99,26 +125,17 @@ def extract_panchayat_entries(yaml_files):

 def convert_panchayat_entries_to_jsonl(entries, verbose=0):
     "Convert each Panchayat Yaml entry to JSON and collate as JSONL"
-    jsonl = ''
-    for entry in entries:
-        entry_dict = {'compiled': entry, 'raw': entry["post_id"]}
-        # Convert Dictionary to JSON and Append to JSONL string
-        jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
-
-    if verbose > 0:
-        print(f"Converted {len(entries)} to jsonl format")
-
-    return jsonl
-
-
-if __name__ == '__main__':
-    # Setup Argument Parser
-    parser = argparse.ArgumentParser(description="Map Yaml entries into (compressed) JSONL format")
-    parser.add_argument('--output-file', '-o', type=pathlib.Path, required=True, help="Output file for (compressed) JSONL formatted notes. Expected file extensions: jsonl or jsonl.gz")
-    parser.add_argument('--input-files', '-i', nargs='*', help="List of yaml files to process")
-    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for yaml files to process")
-    parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs, Default: 0")
-    args = parser.parse_args()
-
-    # Map notes in Yaml files to (compressed) JSONL formatted file
-    panchayat_to_jsonl(args.input_files, args.input_filter, args.output_file, args.verbose)
+    # jsonl = ''
+    # for entry in entries:
+    #     entry_dict = {'compiled': f'body: {entry["body"]} author: {entry["author"]} title: {entry["title"]}', 'raw': entry["post_id"]}
+    #     # Convert Dictionary to JSON and Append to JSONL string
+    #     jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
+
+    # if verbose > 0:
+    #     logger.info(f"Converted {len(entries)} to jsonl format")
+
+    # return jsonl
+
+    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])

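Since extract_panchayat_entries now emits ready-made {'compiled', 'raw'} dicts, the converter reduces to serializing each dict as one JSON object per line. A self-contained example of the same join, with invented entry values:

    import json

    # Sample entries in the new {'compiled', 'raw'} shape; values are made up.
    entries = [
        {'compiled': 'body: first post author: asra title: hello', 'raw': 'post-1'},
        {'compiled': 'body: second post author: devi title: update', 'raw': 'post-2'},
    ]

    # Same serialization as the new return statement: one JSON object per line (JSONL).
    jsonl_data = ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
    print(jsonl_data, end='')
    # {"compiled": "body: first post author: asra title: hello", "raw": "post-1"}
    # {"compiled": "body: second post author: devi title: update", "raw": "post-2"}
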
@@ -112,7 +112,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
     if (t == SearchType.Panchayat or t == None) and state.model.panchayat_search:
         # query Panchayat yaml files
         query_start = time.time()
-        hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r, filters=[ExplicitFilter(), DateFilter()], verbose=state.verbose)
+        hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r)
         query_end = time.time()

         # collate and return results