diff --git a/src/configure.py b/src/configure.py
index 0006c9fe..6e6f2da5 100644
--- a/src/configure.py
+++ b/src/configure.py
@@ -74,7 +74,12 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     # Initialize Panchayat Search
     if (t == SearchType.Panchayat or t == None) and config.content_type.panchayat:
         # Extract Entries, Generate Yaml Embeddings
-        model.panchayat_search = text_search.setup(panchayat_to_jsonl, config.content_type.panchayat, search_config=config.search_type.asymmetric, regenerate=regenerate, verbose=verbose)
+        model.panchayat_search = text_search.setup(
+            panchayat_to_jsonl,
+            config.content_type.panchayat,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[])
 
     # Initialize Ledger Search
     if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
diff --git a/src/processor/panchayat/panchayat_to_jsonl.py b/src/processor/panchayat/panchayat_to_jsonl.py
index 66ea8b30..dd60c67c 100644
--- a/src/processor/panchayat/panchayat_to_jsonl.py
+++ b/src/processor/panchayat/panchayat_to_jsonl.py
@@ -2,8 +2,7 @@
 # Standard Packages
 import json
-import argparse
-import pathlib
+import logging
 import glob
 
 import yaml
 
@@ -11,37 +10,54 @@ import yaml
 from panchayat import vdb
 from src.utils.helpers import get_absolute_path, is_none_or_empty
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
+from src.utils.rawconfig import TextContentConfig
+logger = logging.getLogger(__name__)
 
 
 def panchayat_constructor(loader, node):
     fields = loader.construct_mapping(node)
     return vdb.VDB(**fields)
 
 
+class VDBEntry():
+    post_id: str
+    body: str
+    title: str
+    author: str
+
+    def __init__(self, post_id, body, title, author):
+        self.post_id = post_id
+        self.body = body
+        self.title = title
+        self.author = author
+
+
 # Define Functions
-def panchayat_to_jsonl(yaml_files, yaml_file_filter, output_file, verbose=0):
+def panchayat_to_jsonl(config: TextContentConfig, previous_entries = None):
     # Input Validation
-    if is_none_or_empty(yaml_files) and is_none_or_empty(yaml_file_filter):
-        print("At least one of markdown-files or markdown-file-filter is required to be specified")
+    if is_none_or_empty(config.input_files) and is_none_or_empty(config.input_filter):
+        print("At least one of input-files or input-file-filter is required to be specified")
         exit(1)
 
     # Get Markdown Files to Process
-    yaml_files = get_panchayat_files(yaml_files, yaml_file_filter, verbose)
+    yaml_files = get_panchayat_files(config.input_files, config.input_filter)
+
+    output_file = config.compressed_jsonl
 
     # Extract Entries from specified Markdown files
     entries = extract_panchayat_entries(yaml_files)
 
     # Process Each Entry from All Notes Files
-    jsonl_data = convert_panchayat_entries_to_jsonl(entries, verbose=verbose)
+    jsonl_data = convert_panchayat_entries_to_jsonl(entries)
 
     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
-        compress_jsonl_data(jsonl_data, output_file, verbose=verbose)
+        compress_jsonl_data(jsonl_data, output_file)
     elif output_file.suffix == ".jsonl":
-        dump_jsonl(jsonl_data, output_file, verbose=verbose)
+        dump_jsonl(jsonl_data, output_file)
 
-    return entries
+    return list(enumerate(entries))
 
 
 def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
@@ -50,9 +66,13 @@ def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
     if yaml_files:
         absolute_yaml_files = {get_absolute_path(yaml_file) for yaml_file in yaml_files}
     if yaml_file_filter:
-        filtered_yaml_files = set(glob.glob(get_absolute_path(yaml_file_filter)))
+        filtered_yaml_files = {
+            filtered_file
+            for filter in yaml_file_filter
+            for filtered_file in glob.glob(get_absolute_path(filter))
+        }
 
-    all_yaml_files = absolute_yaml_files | filtered_yaml_files
+    all_yaml_files = sorted(absolute_yaml_files | filtered_yaml_files)
 
     files_with_non_yaml_extensions = {
         yaml_file
@@ -62,7 +82,7 @@ def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
     }
 
     if any(files_with_non_yaml_extensions):
-        print(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_yaml_extensions}")
+        logger.warn(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_yaml_extensions}")
 
     if verbose > 0:
         print(f'Processing files: {all_yaml_files}')
@@ -77,7 +97,6 @@ def extract_panchayat_entries(yaml_files):
     for yaml_file in yaml_files:
         with open(yaml_file) as f:
-            # try:
             raw_data = yaml.load(f, Loader=yaml.UnsafeLoader)
 
             seen_ids = set()
@@ -87,11 +106,18 @@ def extract_panchayat_entries(yaml_files):
             for subpost in all_subposts:
                 if subpost.post_id not in seen_ids:
                     seen_ids.add(subpost.post_id)
-                    entry = {
-                        "post_id": subpost.post_id,
-                        "author": subpost.author.username,
-                        "title": subpost.title,
-                        "body": subpost.body}
+
+                    # entry = VDBEntry(post_id=subpost.post_id, body=subpost.body, title=subpost.title, author=subpost.author)
+                    # entry = {
+                    #     'post_id': subpost.post_id,
+                    #     'body': subpost.body,
+                    #     'title': subpost.title,
+                    #     'author': subpost.author.username
+                    # }
+                    entry = dict()
+                    entry['compiled'] = f'body: {subpost.body} author: {subpost.author.username} title: {subpost.title}'
+                    entry['raw'] = subpost.post_id
+                    # entry = f"""body: {subpost.body} title: {subpost.title} author: {subpost.author.username}"""
                     entries.append(entry)
 
     return entries
@@ -99,26 +125,17 @@ def extract_panchayat_entries(yaml_files):
 
 def convert_panchayat_entries_to_jsonl(entries, verbose=0):
     "Convert each Panchayat Yaml entry to JSON and collate as JSONL"
-    jsonl = ''
-    for entry in entries:
-        entry_dict = {'compiled': entry, 'raw': entry["post_id"]}
-        # Convert Dictionary to JSON and Append to JSONL string
-        jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
-
-    if verbose > 0:
-        print(f"Converted {len(entries)} to jsonl format")
-
-    return jsonl
+    # jsonl = ''
+    # for entry in entries:
+    #     entry_dict = {'compiled': f'body: {entry["body"]} author: {entry["author"]} title: {entry["title"]}', 'raw': entry["post_id"]}
+    #     # Convert Dictionary to JSON and Append to JSONL string
+    #     jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
 
-
-if __name__ == '__main__':
-    # Setup Argument Parser
-    parser = argparse.ArgumentParser(description="Map Yaml entries into (compressed) JSONL format")
-    parser.add_argument('--output-file', '-o', type=pathlib.Path, required=True, help="Output file for (compressed) JSONL formatted notes. Expected file extensions: jsonl or jsonl.gz")
-    parser.add_argument('--input-files', '-i', nargs='*', help="List of yaml files to process")
-    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for yaml files to process")
-    parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs, Default: 0")
-    args = parser.parse_args()
+    # if verbose > 0:
+    #     logger.info(f"Converted {len(entries)} to jsonl format")
+
+    # return jsonl
+
+    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
 
-    # Map notes in Yaml files to (compressed) JSONL formatted file
-    panchayat_to_jsonl(args.input_files, args.input_filter, args.output_file, args.verbose)
diff --git a/src/router.py b/src/router.py
index c3e65e17..948675c3 100644
--- a/src/router.py
+++ b/src/router.py
@@ -112,7 +112,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
     if (t == SearchType.Panchayat or t == None) and state.model.panchayat_search:
         # query Panchayat yaml files
        query_start = time.time()
-        hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r, filters=[ExplicitFilter(), DateFilter()], verbose=state.verbose)
+        hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r)
         query_end = time.time()
 
     # collate and return results
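
With this change, panchayat_to_jsonl is no longer an argparse-driven script: it is driven by a TextContentConfig and returns indexed (id, entry) pairs whose entries carry 'compiled' and 'raw' keys. The following is a minimal sketch of exercising the new entry point directly, assuming only the config fields the diff actually reads (input_files, input_filter, compressed_jsonl); the file paths and the SimpleNamespace stand-in for TextContentConfig are hypothetical.

    # Hypothetical smoke test for the refactored, config-driven entry point.
    # SimpleNamespace stands in for src.utils.rawconfig.TextContentConfig and
    # provides only the fields the diff reads; paths are illustrative.
    from pathlib import Path
    from types import SimpleNamespace

    from src.processor.panchayat.panchayat_to_jsonl import panchayat_to_jsonl

    config = SimpleNamespace(
        input_files=None,                                # or a list of yaml files
        input_filter=["tests/data/panchayat/*.yaml"],    # glob patterns (hypothetical)
        compressed_jsonl=Path("tests/data/panchayat.jsonl.gz"),
    )

    # Writes the (compressed) JSONL to config.compressed_jsonl and returns
    # [(0, {'compiled': '...', 'raw': <post_id>}), (1, {...}), ...]
    entries = panchayat_to_jsonl(config)
    for idx, entry in entries[:3]:
        print(idx, entry["raw"])

The stand-in config is illustrative only; in the application itself the config object comes from config.content_type.panchayat, as the configure.py hunk shows.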