Update panchayat-to-jsonl changes to merge with master

2024-11-23 15:38:55 +01:00 · 2022-09-15 20:08:54 +03:00 · 2022-09-15 20:08:54 +03:00 · 63f2312b84
commit 63f2312b84
parent f12ca56e93
3 changed files with 63 additions and 41 deletions
--- a/src/configure.py
+++ b/src/configure.py
@ -74,7 +74,12 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
    # Initialize Panchayat Search
    if (t == SearchType.Panchayat or t == None) and config.content_type.panchayat:
        # Extract Entries, Generate Yaml Embeddings
-        model.panchayat_search = text_search.setup(panchayat_to_jsonl, config.content_type.panchayat, search_config=config.search_type.asymmetric, regenerate=regenerate, verbose=verbose)
+        model.panchayat_search = text_search.setup(
+            panchayat_to_jsonl,
+            config.content_type.panchayat,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[])

    # Initialize Ledger Search
    if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
--- a/src/processor/panchayat/panchayat_to_jsonl.py
+++ b/src/processor/panchayat/panchayat_to_jsonl.py
@ -2,8 +2,7 @@

 # Standard Packages
 import json
-import argparse
-import pathlib
+import logging
 import glob
 import yaml

@ -11,37 +10,54 @@ import yaml
 from panchayat import vdb
 from src.utils.helpers import get_absolute_path, is_none_or_empty
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
+from src.utils.rawconfig import TextContentConfig

+logger = logging.getLogger(__name__)

 def panchayat_constructor(loader, node):
    fields = loader.construct_mapping(node)
    return vdb.VDB(**fields)


+class VDBEntry():
+    post_id: str
+    body: str
+    title: str
+    author: str
+
+    def __init__(self, post_id, body, title, author):
+        self.post_id = post_id
+        self.body = body
+        self.title = title
+        self.author = author
+
+
 # Define Functions
-def panchayat_to_jsonl(yaml_files, yaml_file_filter, output_file, verbose=0):
+def panchayat_to_jsonl(config: TextContentConfig, previous_entries = None):

    # Input Validation
-    if is_none_or_empty(yaml_files) and is_none_or_empty(yaml_file_filter):
-        print("At least one of markdown-files or markdown-file-filter is required to be specified")
+    if is_none_or_empty(config.input_files) and is_none_or_empty(config.input_filter):
+        print("At least one of input-files or input-file-filter is required to be specified")
        exit(1)

    # Get Markdown Files to Process
-    yaml_files = get_panchayat_files(yaml_files, yaml_file_filter, verbose)
+    yaml_files = get_panchayat_files(config.input_files, config.input_filter)
+
+    output_file = config.compressed_jsonl

    # Extract Entries from specified Markdown files
    entries = extract_panchayat_entries(yaml_files)

    # Process Each Entry from All Notes Files
-    jsonl_data = convert_panchayat_entries_to_jsonl(entries, verbose=verbose)
+    jsonl_data = convert_panchayat_entries_to_jsonl(entries)

    # Compress JSONL formatted Data
    if output_file.suffix == ".gz":
-        compress_jsonl_data(jsonl_data, output_file, verbose=verbose)
+        compress_jsonl_data(jsonl_data, output_file)
    elif output_file.suffix == ".jsonl":
-        dump_jsonl(jsonl_data, output_file, verbose=verbose)
+        dump_jsonl(jsonl_data, output_file)

-    return entries
+    return list(enumerate(entries))


 def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
@ -50,9 +66,13 @@ def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
    if yaml_files:
        absolute_yaml_files = {get_absolute_path(yaml_file) for yaml_file in yaml_files}
    if yaml_file_filter:
-        filtered_yaml_files = set(glob.glob(get_absolute_path(yaml_file_filter)))
+        filtered_yaml_files = {
+            filtered_file
+            for filter in yaml_file_filter
+            for filtered_file in glob.glob(get_absolute_path(filter))
+        }

-    all_yaml_files = absolute_yaml_files | filtered_yaml_files
+    all_yaml_files = sorted(absolute_yaml_files | filtered_yaml_files)

    files_with_non_yaml_extensions = {
        yaml_file
@ -62,7 +82,7 @@ def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
    }

    if any(files_with_non_yaml_extensions):
-        print(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_yaml_extensions}")
+        logger.warn(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_yaml_extensions}")

    if verbose > 0:
        print(f'Processing files: {all_yaml_files}')
@ -77,7 +97,6 @@ def extract_panchayat_entries(yaml_files):
    for yaml_file in yaml_files:
        with open(yaml_file) as f:

-            # try:
            raw_data = yaml.load(f, Loader=yaml.UnsafeLoader)

            seen_ids = set()
@ -87,11 +106,18 @@ def extract_panchayat_entries(yaml_files):
                for subpost in all_subposts:
                    if subpost.post_id not in seen_ids:
                        seen_ids.add(subpost.post_id)
-                        entry = {
-                            "post_id": subpost.post_id,
-                            "author": subpost.author.username,
-                            "title": subpost.title,
-                            "body": subpost.body}   
+
+                        # entry = VDBEntry(post_id=subpost.post_id, body=subpost.body, title=subpost.title, author=subpost.author)
+                        # entry = {
+                        #     'post_id': subpost.post_id,
+                        #     'body': subpost.body,
+                        #     'title': subpost.title,
+                        #     'author': subpost.author.username
+                        # }
+                        entry = dict()
+                        entry['compiled'] =  f'body: {subpost.body} author: {subpost.author.username} title: {subpost.title}'
+                        entry['raw'] = subpost.post_id
+                        # entry = f"""body: {subpost.body} title: {subpost.title} author: {subpost.author.username}"""
                        entries.append(entry)

    return entries
@ -99,26 +125,17 @@ def extract_panchayat_entries(yaml_files):

 def convert_panchayat_entries_to_jsonl(entries, verbose=0):
    "Convert each Panchayat Yaml entry to JSON and collate as JSONL"
-    jsonl = ''
-    for entry in entries:
-        entry_dict = {'compiled': entry, 'raw': entry["post_id"]}
-        # Convert Dictionary to JSON and Append to JSONL string
-        jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
-
-    if verbose > 0:
-        print(f"Converted {len(entries)} to jsonl format")
-
-    return jsonl
+    # jsonl = ''
+    # for entry in entries:
+    #     entry_dict = {'compiled': f'body: {entry["body"]} author: {entry["author"]} title: {entry["title"]}', 'raw': entry["post_id"]}
+    #     # Convert Dictionary to JSON and Append to JSONL string
+    #     jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'


-if __name__ == '__main__':
-    # Setup Argument Parser
-    parser = argparse.ArgumentParser(description="Map Yaml entries into (compressed) JSONL format")
-    parser.add_argument('--output-file', '-o', type=pathlib.Path, required=True, help="Output file for (compressed) JSONL formatted notes. Expected file extensions: jsonl or jsonl.gz")
-    parser.add_argument('--input-files', '-i', nargs='*', help="List of yaml files to process")
-    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for yaml files to process")
-    parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs, Default: 0")
-    args = parser.parse_args()
+    # if verbose > 0:
+    #     logger.info(f"Converted {len(entries)} to jsonl format")
+
+    # return jsonl
+
+    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])

-    # Map notes in Yaml files to (compressed) JSONL formatted file
-    panchayat_to_jsonl(args.input_files, args.input_filter, args.output_file, args.verbose)
--- a/src/router.py
+++ b/src/router.py
@ -112,7 +112,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
    if (t == SearchType.Panchayat or t == None) and state.model.panchayat_search:
        # query Panchayat yaml files
        query_start = time.time()
-        hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r, filters=[ExplicitFilter(), DateFilter()], verbose=state.verbose)
+        hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r)
        query_end = time.time()

        # collate and return results