Update the Panchayat YAML-to-JSONL converter to compile additional attributes

- Update list type hints to use the `List` object from the `typing` module
- Parse the number of upvotes and the created date of each post
- Add support for the word filter and date filter on compiled entries
2024-11-23 15:38:55 +01:00 · 2022-12-28 09:50:44 -03:00 · 2022-12-28 09:50:44 -03:00 · 21a9fbcea3
commit 21a9fbcea3
parent 21eb58156c
10 changed files with 28 additions and 20 deletions
--- a/config/khoj_sample.yml
+++ b/config/khoj_sample.yml
@ -2,7 +2,7 @@ content-type:
  panchayat:
    input-files: null
-    input-filter: "/home/saba/projects/panchayat/panchayat/instance/*.yaml"
+    input-filter: ["/home/saba/projects/panchayat/panchayat/instance/*.yaml"]
    compressed-jsonl: "./khoj/embeddings/panchyat.jsonl.gz"
    embeddings-file: "./khoj/embeddings/panchayat_embeddings.pt"
--- a/src/configure.py
+++ b/src/configure.py
@ -79,7 +79,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
            config.content_type.panchayat,
            search_config=config.search_type.asymmetric,
            regenerate=regenerate,
-            filters=[])
+            filters=[DateFilter(entry_key='compiled'), WordFilter(entry_key='compiled')])
    # Initialize Ledger Search
    if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@ -12,6 +12,7 @@ from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils.rawconfig import TextContentConfig
 from typing import List
 logger = logging.getLogger(__name__)
@ -112,7 +113,7 @@ def extract_beancount_transactions(beancount_files):
    return entries, dict(transaction_to_file_map)
-def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]:
+def convert_transactions_to_maps(entries: List[str], transaction_to_file_map) -> List[dict]:
    "Convert each Beancount transaction into a dictionary"
    entry_maps = []
    for entry in entries:
@ -123,6 +124,6 @@ def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) ->
    return entry_maps
-def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str:
+def convert_transaction_maps_to_jsonl(entries: List[dict]) -> str:
    "Convert each Beancount transaction dictionary to JSON and collate as JSONL"
    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
--- a/src/processor/markdown/markdown_to_jsonl.py
+++ b/src/processor/markdown/markdown_to_jsonl.py
@ -13,6 +13,7 @@ from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils.rawconfig import TextContentConfig
 from typing import List
 logger = logging.getLogger(__name__)
@ -111,7 +112,7 @@ def extract_markdown_entries(markdown_files):
    return entries, dict(entry_to_file_map)
-def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
+def convert_markdown_entries_to_maps(entries: List[str], entry_to_file_map) -> List[dict]:
    "Convert each Markdown entries into a dictionary"
    entry_maps = []
    for entry in entries:
--- a/src/processor/org_mode/org_to_jsonl.py
+++ b/src/processor/org_mode/org_to_jsonl.py
@ -14,6 +14,8 @@ from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils import state
 from src.utils.rawconfig import TextContentConfig
 from typing import List
 logger = logging.getLogger(__name__)
@ -105,7 +107,7 @@ def extract_org_entries(org_files):
    return entries, dict(entry_to_file_map)
-def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
+def convert_org_nodes_to_entries(entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> List[dict]:
    "Convert Org-Mode entries into list of dictionary"
    entry_maps = []
    for entry in entries:
--- a/src/processor/panchayat/panchayat_to_jsonl.py
+++ b/src/processor/panchayat/panchayat_to_jsonl.py
@ -106,18 +106,15 @@ def extract_panchayat_entries(yaml_files):
                for subpost in all_subposts:
                    if subpost.post_id not in seen_ids:
                        seen_ids.add(subpost.post_id)
                        # entry = VDBEntry(post_id=subpost.post_id, body=subpost.body, title=subpost.title, author=subpost.author)
                        # entry = {
                        #     'post_id': subpost.post_id,
                        #     'body': subpost.body,
                        #     'title': subpost.title,
                        #     'author': subpost.author.username
                        # }
                        entry = dict()
-                        entry['compiled'] =  f'body: {subpost.body} author: {subpost.author.username} title: {subpost.title}'
+                        
                        entry['compiled'] = f"""body: {subpost.body}
                            author: {subpost.author.username}
                            title: {subpost.title}
                            created: {subpost.created}
                            upvotes: {len(subpost.upvotes)}"""
                        entry['raw'] = subpost.post_id
                        # entry = f"""body: {subpost.body} title: {subpost.title} author: {subpost.author.username}"""
                        entries.append(entry)
    return entries
--- a/src/search_filter/base_filter.py
+++ b/src/search_filter/base_filter.py
@ -1,6 +1,8 @@
 # Standard Packages
 from abc import ABC, abstractmethod
 from typing import List, Set, Tuple
 class BaseFilter(ABC):
    @abstractmethod
@ -12,5 +14,5 @@ class BaseFilter(ABC):
        pass
    @abstractmethod
-    def apply(self, query:str, raw_entries:list[str]) -> tuple[str, set[int]]:
+    def apply(self, query:str, raw_entries:List[str]) -> Tuple[str, Set[int]]:
        pass
--- a/src/search_type/text_search.py
+++ b/src/search_type/text_search.py
@ -14,6 +14,7 @@ from src.utils.config import TextSearchModel
 from src.utils.rawconfig import TextSearchConfig, TextContentConfig
 from src.utils.jsonl import load_jsonl
 from typing import List
 logger = logging.getLogger(__name__)
@ -179,7 +180,7 @@ def collate_results(hits, entries, count=5):
        in hits[0:count]]
-def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel:
+def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: List[BaseFilter] = []) -> TextSearchModel:
    # Initialize Model
    bi_encoder, cross_encoder, top_k = initialize_model(search_config)
--- a/src/utils/config.py
+++ b/src/utils/config.py
@ -7,6 +7,8 @@ from pathlib import Path
 from src.utils.rawconfig import ConversationProcessorConfig
 from src.search_filter.base_filter import BaseFilter
 from typing import List
 class SearchType(str, Enum):
    Org = "org"
@ -22,7 +24,7 @@ class ProcessorType(str, Enum):
 class TextSearchModel():
-    def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: list[BaseFilter], top_k):
+    def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: List[BaseFilter], top_k):
        self.entries = entries
        self.corpus_embeddings = corpus_embeddings
        self.bi_encoder = bi_encoder
--- a/src/utils/state.py
+++ b/src/utils/state.py
@ -10,6 +10,8 @@ from src.utils.config import SearchModels, ProcessorConfigModel
 from src.utils.helpers import LRU
 from src.utils.rawconfig import FullConfig
 from typing import List
 # Application Global State
 config = FullConfig()
 model = SearchModels()
@ -18,7 +20,7 @@ config_file: Path = None
 verbose: int = 0
 host: str = None
 port: int = None
-cli_args: list[str] = None
+cli_args: List[str] = None
 query_cache = LRU()
 if torch.cuda.is_available():