From 21a9fbcea3c56e4367de41b176287e85f1f6fec2 Mon Sep 17 00:00:00 2001 From: Saba Date: Wed, 28 Dec 2022 09:50:44 -0300 Subject: [PATCH] Update panchayat yaml to jsonl file to compile additional attributes - Update typing for list to use the List object from typing module - Parse number of upvotes, created date - Add support for word filter and date filter on compiled entries --- config/khoj_sample.yml | 2 +- src/configure.py | 2 +- src/processor/ledger/beancount_to_jsonl.py | 5 +++-- src/processor/markdown/markdown_to_jsonl.py | 3 ++- src/processor/org_mode/org_to_jsonl.py | 4 +++- src/processor/panchayat/panchayat_to_jsonl.py | 17 +++++++---------- src/search_filter/base_filter.py | 4 +++- src/search_type/text_search.py | 3 ++- src/utils/config.py | 4 +++- src/utils/state.py | 4 +++- 10 files changed, 28 insertions(+), 20 deletions(-) diff --git a/config/khoj_sample.yml b/config/khoj_sample.yml index 5ce68a9c..0bbd1877 100644 --- a/config/khoj_sample.yml +++ b/config/khoj_sample.yml @@ -2,7 +2,7 @@ content-type: panchayat: input-files: null - input-filter: "/home/saba/projects/panchayat/panchayat/instance/*.yaml" + input-filter: ["/home/saba/projects/panchayat/panchayat/instance/*.yaml"] compressed-jsonl: "./khoj/embeddings/panchyat.jsonl.gz" embeddings-file: "./khoj/embeddings/panchayat_embeddings.pt" diff --git a/src/configure.py b/src/configure.py index 6e6f2da5..4862391d 100644 --- a/src/configure.py +++ b/src/configure.py @@ -79,7 +79,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, config.content_type.panchayat, search_config=config.search_type.asymmetric, regenerate=regenerate, - filters=[]) + filters=[DateFilter(entry_key='compiled'), WordFilter(entry_key='compiled')]) # Initialize Ledger Search if (t == SearchType.Ledger or t == None) and config.content_type.ledger: diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index 7b8b9bba..afd037b6 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -12,6 +12,7 @@ from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_ from src.utils.constants import empty_escape_sequences from src.utils.jsonl import dump_jsonl, compress_jsonl_data from src.utils.rawconfig import TextContentConfig +from typing import List logger = logging.getLogger(__name__) @@ -112,7 +113,7 @@ def extract_beancount_transactions(beancount_files): return entries, dict(transaction_to_file_map) -def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]: +def convert_transactions_to_maps(entries: List[str], transaction_to_file_map) -> List[dict]: "Convert each Beancount transaction into a dictionary" entry_maps = [] for entry in entries: @@ -123,6 +124,6 @@ def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> return entry_maps -def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str: +def convert_transaction_maps_to_jsonl(entries: List[dict]) -> str: "Convert each Beancount transaction dictionary to JSON and collate as JSONL" return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index 22f5ea17..fb777e4e 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -13,6 +13,7 @@ from src.utils.constants import empty_escape_sequences from src.utils.jsonl import dump_jsonl, compress_jsonl_data from src.utils.rawconfig import TextContentConfig +from typing import List logger = logging.getLogger(__name__) @@ -111,7 +112,7 @@ def extract_markdown_entries(markdown_files): return entries, dict(entry_to_file_map) -def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]: +def convert_markdown_entries_to_maps(entries: List[str], entry_to_file_map) -> List[dict]: "Convert each Markdown entries into a dictionary" entry_maps = [] for entry in entries: diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 43f4acef..98165f46 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -14,6 +14,8 @@ from src.utils.jsonl import dump_jsonl, compress_jsonl_data from src.utils import state from src.utils.rawconfig import TextContentConfig +from typing import List + logger = logging.getLogger(__name__) @@ -105,7 +107,7 @@ def extract_org_entries(org_files): return entries, dict(entry_to_file_map) -def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]: +def convert_org_nodes_to_entries(entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> List[dict]: "Convert Org-Mode entries into list of dictionary" entry_maps = [] for entry in entries: diff --git a/src/processor/panchayat/panchayat_to_jsonl.py b/src/processor/panchayat/panchayat_to_jsonl.py index dd60c67c..e1b33ff4 100644 --- a/src/processor/panchayat/panchayat_to_jsonl.py +++ b/src/processor/panchayat/panchayat_to_jsonl.py @@ -106,18 +106,15 @@ def extract_panchayat_entries(yaml_files): for subpost in all_subposts: if subpost.post_id not in seen_ids: seen_ids.add(subpost.post_id) - - # entry = VDBEntry(post_id=subpost.post_id, body=subpost.body, title=subpost.title, author=subpost.author) - # entry = { - # 'post_id': subpost.post_id, - # 'body': subpost.body, - # 'title': subpost.title, - # 'author': subpost.author.username - # } entry = dict() - entry['compiled'] = f'body: {subpost.body} author: {subpost.author.username} title: {subpost.title}' + + entry['compiled'] = f"""body: {subpost.body} + author: {subpost.author.username} + title: {subpost.title} + created: {subpost.created} + upvotes: {len(subpost.upvotes)}""" + entry['raw'] = subpost.post_id - # entry = f"""body: {subpost.body} title: {subpost.title} author: {subpost.author.username}""" entries.append(entry) return entries diff --git a/src/search_filter/base_filter.py b/src/search_filter/base_filter.py index 2550b32e..bbc2a4a6 100644 --- a/src/search_filter/base_filter.py +++ b/src/search_filter/base_filter.py @@ -1,6 +1,8 @@ # Standard Packages from abc import ABC, abstractmethod +from typing import List, Set, Tuple + class BaseFilter(ABC): @abstractmethod @@ -12,5 +14,5 @@ class BaseFilter(ABC): pass @abstractmethod - def apply(self, query:str, raw_entries:list[str]) -> tuple[str, set[int]]: + def apply(self, query:str, raw_entries:List[str]) -> Tuple[str, Set[int]]: pass \ No newline at end of file diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index d4d8a9d4..ca62f7e2 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -14,6 +14,7 @@ from src.utils.config import TextSearchModel from src.utils.rawconfig import TextSearchConfig, TextContentConfig from src.utils.jsonl import load_jsonl +from typing import List logger = logging.getLogger(__name__) @@ -179,7 +180,7 @@ def collate_results(hits, entries, count=5): in hits[0:count]] -def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel: +def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: List[BaseFilter] = []) -> TextSearchModel: # Initialize Model bi_encoder, cross_encoder, top_k = initialize_model(search_config) diff --git a/src/utils/config.py b/src/utils/config.py index 316a3d64..617517c8 100644 --- a/src/utils/config.py +++ b/src/utils/config.py @@ -7,6 +7,8 @@ from pathlib import Path from src.utils.rawconfig import ConversationProcessorConfig from src.search_filter.base_filter import BaseFilter +from typing import List + class SearchType(str, Enum): Org = "org" @@ -22,7 +24,7 @@ class ProcessorType(str, Enum): class TextSearchModel(): - def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: list[BaseFilter], top_k): + def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: List[BaseFilter], top_k): self.entries = entries self.corpus_embeddings = corpus_embeddings self.bi_encoder = bi_encoder diff --git a/src/utils/state.py b/src/utils/state.py index 283d2b5a..9677f635 100644 --- a/src/utils/state.py +++ b/src/utils/state.py @@ -10,6 +10,8 @@ from src.utils.config import SearchModels, ProcessorConfigModel from src.utils.helpers import LRU from src.utils.rawconfig import FullConfig +from typing import List + # Application Global State config = FullConfig() model = SearchModels() @@ -18,7 +20,7 @@ config_file: Path = None verbose: int = 0 host: str = None port: int = None -cli_args: list[str] = None +cli_args: List[str] = None query_cache = LRU() if torch.cuda.is_available():