Update panchayat-to-jsonl changes to merge with master

Saba 2022-09-15 20:08:54 +03:00
parent f12ca56e93
commit 63f2312b84
3 changed files with 63 additions and 41 deletions

View file

@@ -74,7 +74,12 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     # Initialize Panchayat Search
     if (t == SearchType.Panchayat or t == None) and config.content_type.panchayat:
         # Extract Entries, Generate Yaml Embeddings
-        model.panchayat_search = text_search.setup(panchayat_to_jsonl, config.content_type.panchayat, search_config=config.search_type.asymmetric, regenerate=regenerate, verbose=verbose)
+        model.panchayat_search = text_search.setup(
+            panchayat_to_jsonl,
+            config.content_type.panchayat,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[])

     # Initialize Ledger Search
     if (t == SearchType.Ledger or t == None) and config.content_type.ledger:

View file

@@ -2,8 +2,7 @@
 # Standard Packages
 import json
-import argparse
-import pathlib
+import logging
 import glob
 import yaml
@@ -11,37 +10,54 @@ import yaml
 from panchayat import vdb
 from src.utils.helpers import get_absolute_path, is_none_or_empty
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
+from src.utils.rawconfig import TextContentConfig
+
+logger = logging.getLogger(__name__)


 def panchayat_constructor(loader, node):
     fields = loader.construct_mapping(node)
     return vdb.VDB(**fields)


+class VDBEntry():
+    post_id: str
+    body: str
+    title: str
+    author: str
+
+    def __init__(self, post_id, body, title, author):
+        self.post_id = post_id
+        self.body = body
+        self.title = title
+        self.author = author
+
+
 # Define Functions
-def panchayat_to_jsonl(yaml_files, yaml_file_filter, output_file, verbose=0):
+def panchayat_to_jsonl(config: TextContentConfig, previous_entries = None):
     # Input Validation
-    if is_none_or_empty(yaml_files) and is_none_or_empty(yaml_file_filter):
-        print("At least one of markdown-files or markdown-file-filter is required to be specified")
+    if is_none_or_empty(config.input_files) and is_none_or_empty(config.input_filter):
+        print("At least one of input-files or input-file-filter is required to be specified")
         exit(1)

     # Get Markdown Files to Process
-    yaml_files = get_panchayat_files(yaml_files, yaml_file_filter, verbose)
+    yaml_files = get_panchayat_files(config.input_files, config.input_filter)
+
+    output_file = config.compressed_jsonl

     # Extract Entries from specified Markdown files
     entries = extract_panchayat_entries(yaml_files)

     # Process Each Entry from All Notes Files
-    jsonl_data = convert_panchayat_entries_to_jsonl(entries, verbose=verbose)
+    jsonl_data = convert_panchayat_entries_to_jsonl(entries)

     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
-        compress_jsonl_data(jsonl_data, output_file, verbose=verbose)
+        compress_jsonl_data(jsonl_data, output_file)
     elif output_file.suffix == ".jsonl":
-        dump_jsonl(jsonl_data, output_file, verbose=verbose)
+        dump_jsonl(jsonl_data, output_file)

-    return entries
+    return list(enumerate(entries))
@@ -50,9 +66,13 @@ def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
     if yaml_files:
         absolute_yaml_files = {get_absolute_path(yaml_file) for yaml_file in yaml_files}
     if yaml_file_filter:
-        filtered_yaml_files = set(glob.glob(get_absolute_path(yaml_file_filter)))
+        filtered_yaml_files = {
+            filtered_file
+            for filter in yaml_file_filter
+            for filtered_file in glob.glob(get_absolute_path(filter))
+        }

-    all_yaml_files = absolute_yaml_files | filtered_yaml_files
+    all_yaml_files = sorted(absolute_yaml_files | filtered_yaml_files)

     files_with_non_yaml_extensions = {
         yaml_file
@@ -62,7 +82,7 @@ def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
     }

     if any(files_with_non_yaml_extensions):
-        print(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_yaml_extensions}")
+        logger.warn(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_yaml_extensions}")

     if verbose > 0:
         print(f'Processing files: {all_yaml_files}')
@@ -77,7 +97,6 @@ def extract_panchayat_entries(yaml_files):
     for yaml_file in yaml_files:
         with open(yaml_file) as f:
-            # try:
             raw_data = yaml.load(f, Loader=yaml.UnsafeLoader)

             seen_ids = set()
@@ -87,11 +106,18 @@ def extract_panchayat_entries(yaml_files):
             for subpost in all_subposts:
                 if subpost.post_id not in seen_ids:
                     seen_ids.add(subpost.post_id)
-                    entry = {
-                        "post_id": subpost.post_id,
-                        "author": subpost.author.username,
-                        "title": subpost.title,
-                        "body": subpost.body}
+                    # entry = VDBEntry(post_id=subpost.post_id, body=subpost.body, title=subpost.title, author=subpost.author)
+                    # entry = {
+                    #     'post_id': subpost.post_id,
+                    #     'body': subpost.body,
+                    #     'title': subpost.title,
+                    #     'author': subpost.author.username
+                    # }
+                    entry = dict()
+                    entry['compiled'] = f'body: {subpost.body} author: {subpost.author.username} title: {subpost.title}'
+                    entry['raw'] = subpost.post_id
+                    # entry = f"""body: {subpost.body} title: {subpost.title} author: {subpost.author.username}"""
                     entries.append(entry)

     return entries
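
For one subpost, the rewritten extraction loop produces a plain two-key dict rather than a VDBEntry. An illustrative example with made-up subpost values (only the 'compiled'/'raw' shape is taken from the diff):

    entry = {
        'compiled': 'body: How do we archive old posts? author: saba title: Archival policy',
        'raw': 'post-1234',  # the subpost's post_id, kept as the raw reference for search results
    }
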
@@ -99,26 +125,17 @@ def extract_panchayat_entries(yaml_files):
 def convert_panchayat_entries_to_jsonl(entries, verbose=0):
     "Convert each Panchayat Yaml entry to JSON and collate as JSONL"
-    jsonl = ''
-    for entry in entries:
-        entry_dict = {'compiled': entry, 'raw': entry["post_id"]}
-        # Convert Dictionary to JSON and Append to JSONL string
-        jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
-
-    if verbose > 0:
-        print(f"Converted {len(entries)} to jsonl format")
-
-    return jsonl
-
-
-if __name__ == '__main__':
-    # Setup Argument Parser
-    parser = argparse.ArgumentParser(description="Map Yaml entries into (compressed) JSONL format")
-    parser.add_argument('--output-file', '-o', type=pathlib.Path, required=True, help="Output file for (compressed) JSONL formatted notes. Expected file extensions: jsonl or jsonl.gz")
-    parser.add_argument('--input-files', '-i', nargs='*', help="List of yaml files to process")
-    parser.add_argument('--input-filter', type=str, default=None, help="Regex filter for yaml files to process")
-    parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs, Default: 0")
-    args = parser.parse_args()
-
-    # Map notes in Yaml files to (compressed) JSONL formatted file
-    panchayat_to_jsonl(args.input_files, args.input_filter, args.output_file, args.verbose)
+    # jsonl = ''
+    # for entry in entries:
+    #     entry_dict = {'compiled': f'body: {entry["body"]} author: {entry["author"]} title: {entry["title"]}', 'raw': entry["post_id"]}
+    #     # Convert Dictionary to JSON and Append to JSONL string
+    #     jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
+
+    # if verbose > 0:
+    #     logger.info(f"Converted {len(entries)} to jsonl format")

+    # return jsonl
+    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
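
With the argparse entry point removed, the converter is driven entirely by a TextContentConfig. A minimal usage sketch of the new call shape, using only the config fields visible in this diff (input_files, input_filter, compressed_jsonl); the converter's module path, the example paths, and any further fields the real config may require are assumptions, not taken from this commit:

    from pathlib import Path

    from src.utils.rawconfig import TextContentConfig
    from src.processor.panchayat.panchayat_to_jsonl import panchayat_to_jsonl  # assumed module path

    config = TextContentConfig(
        input_files=None,                              # explicit yaml files, if any
        input_filter=['~/panchayat/**/*.yaml'],        # list of glob filters (hypothetical path)
        compressed_jsonl=Path('panchayat.jsonl.gz'),   # .gz suffix routes through compress_jsonl_data
        # the real TextContentConfig may require further fields (e.g. an embeddings file path)
    )

    # Returns the extracted entries paired with their index, i.e. list(enumerate(entries));
    # each entry is a dict holding the compiled search text and the post_id under 'raw'.
    entries_with_ids = panchayat_to_jsonl(config, previous_entries=None)
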

View file

@@ -112,7 +112,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
     if (t == SearchType.Panchayat or t == None) and state.model.panchayat_search:
         # query Panchayat yaml files
         query_start = time.time()
-        hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r, filters=[ExplicitFilter(), DateFilter()], verbose=state.verbose)
+        hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r)
         query_end = time.time()

         # collate and return results
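
Read together with the first hunk, filtering moves from query time to index setup: text_search.setup now receives an explicit filters list (empty for panchayat content), while text_search.query is called without the filters and verbose arguments. A sketch of the assumed calling convention after this commit, with both signatures inferred from the hunks above rather than from the text_search source:

    # At configuration time, filters are registered once with the search model
    model.panchayat_search = text_search.setup(
        panchayat_to_jsonl,                            # config-driven extractor
        config.content_type.panchayat,                 # TextContentConfig for panchayat content
        search_config=config.search_type.asymmetric,
        regenerate=regenerate,
        filters=[])                                    # no filters registered for panchayat

    # At query time, no per-call filter list is passed any more
    hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r)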