From 21a9fbcea3c56e4367de41b176287e85f1f6fec2 Mon Sep 17 00:00:00 2001
From: Saba <narmiabas@gmail.com>
Date: Wed, 28 Dec 2022 09:50:44 -0300
Subject: [PATCH] Update panchayat yaml to jsonl file to compile additional
 attributes

- Use `List`, `Set` and `Tuple` from the typing module in type annotations
  instead of the builtin generics (`list[...]`, `set[...]`, `tuple[...]`)
- Include the created date and the number of upvotes in each compiled
  panchayat entry
- Enable the date filter and word filter on the compiled panchayat entries
---
 config/khoj_sample.yml                        |  2 +-
 src/configure.py                              |  2 +-
 src/processor/ledger/beancount_to_jsonl.py    |  5 +++--
 src/processor/markdown/markdown_to_jsonl.py   |  3 ++-
 src/processor/org_mode/org_to_jsonl.py        |  4 +++-
 src/processor/panchayat/panchayat_to_jsonl.py | 17 +++++++----------
 src/search_filter/base_filter.py              |  4 +++-
 src/search_type/text_search.py                |  3 ++-
 src/utils/config.py                           |  4 +++-
 src/utils/state.py                            |  4 +++-
 10 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/config/khoj_sample.yml b/config/khoj_sample.yml
index 5ce68a9c..0bbd1877 100644
--- a/config/khoj_sample.yml
+++ b/config/khoj_sample.yml
@@ -2,7 +2,7 @@ content-type:
 
   panchayat:
     input-files: null
-    input-filter: "/home/saba/projects/panchayat/panchayat/instance/*.yaml"
+    input-filter: ["/home/saba/projects/panchayat/panchayat/instance/*.yaml"]
     compressed-jsonl: "./khoj/embeddings/panchyat.jsonl.gz"
     embeddings-file: "./khoj/embeddings/panchayat_embeddings.pt"
 
diff --git a/src/configure.py b/src/configure.py
index 6e6f2da5..4862391d 100644
--- a/src/configure.py
+++ b/src/configure.py
@@ -79,7 +79,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             config.content_type.panchayat,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
-            filters=[])
+            filters=[DateFilter(entry_key='compiled'), WordFilter(entry_key='compiled')])
 
     # Initialize Ledger Search
     if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py
index 7b8b9bba..afd037b6 100644
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@@ -12,6 +12,7 @@ from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils.rawconfig import TextContentConfig
+from typing import List
 
 
 logger = logging.getLogger(__name__)
@@ -112,7 +113,7 @@ def extract_beancount_transactions(beancount_files):
     return entries, dict(transaction_to_file_map)
 
 
-def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]:
+def convert_transactions_to_maps(entries: List[str], transaction_to_file_map) -> List[dict]:
     "Convert each Beancount transaction into a dictionary"
     entry_maps = []
     for entry in entries:
@@ -123,6 +124,6 @@ def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) ->
     return entry_maps
 
 
-def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str:
+def convert_transaction_maps_to_jsonl(entries: List[dict]) -> str:
     "Convert each Beancount transaction dictionary to JSON and collate as JSONL"
     return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py
index 22f5ea17..fb777e4e 100644
--- a/src/processor/markdown/markdown_to_jsonl.py
+++ b/src/processor/markdown/markdown_to_jsonl.py
@@ -13,6 +13,7 @@ from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils.rawconfig import TextContentConfig
 
+from typing import List
 
 logger = logging.getLogger(__name__)
 
@@ -111,7 +112,7 @@ def extract_markdown_entries(markdown_files):
     return entries, dict(entry_to_file_map)
 
 
-def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
+def convert_markdown_entries_to_maps(entries: List[str], entry_to_file_map) -> List[dict]:
     "Convert each Markdown entries into a dictionary"
     entry_maps = []
     for entry in entries:
diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py
index 43f4acef..98165f46 100644
--- a/src/processor/org_mode/org_to_jsonl.py
+++ b/src/processor/org_mode/org_to_jsonl.py
@@ -14,6 +14,8 @@ from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils import state
 from src.utils.rawconfig import TextContentConfig
 
+from typing import List
+
 
 logger = logging.getLogger(__name__)
 
@@ -105,7 +107,7 @@ def extract_org_entries(org_files):
     return entries, dict(entry_to_file_map)
 
 
-def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
+def convert_org_nodes_to_entries(entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> List[dict]:
     "Convert Org-Mode entries into list of dictionary"
     entry_maps = []
     for entry in entries:
diff --git a/src/processor/panchayat/panchayat_to_jsonl.py b/src/processor/panchayat/panchayat_to_jsonl.py
index dd60c67c..e1b33ff4 100644
--- a/src/processor/panchayat/panchayat_to_jsonl.py
+++ b/src/processor/panchayat/panchayat_to_jsonl.py
@@ -106,18 +106,15 @@ def extract_panchayat_entries(yaml_files):
                 for subpost in all_subposts:
                     if subpost.post_id not in seen_ids:
                         seen_ids.add(subpost.post_id)
-
-                        # entry = VDBEntry(post_id=subpost.post_id, body=subpost.body, title=subpost.title, author=subpost.author)
-                        # entry = {
-                        #     'post_id': subpost.post_id,
-                        #     'body': subpost.body,
-                        #     'title': subpost.title,
-                        #     'author': subpost.author.username
-                        # }
                         entry = dict()
-                        entry['compiled'] =  f'body: {subpost.body} author: {subpost.author.username} title: {subpost.title}'
+                        
+                        entry['compiled'] = f"""body: {subpost.body}
+                            author: {subpost.author.username}
+                            title: {subpost.title}
+                            created: {subpost.created}
+                            upvotes: {len(subpost.upvotes)}"""
+                        
                         entry['raw'] = subpost.post_id
-                        # entry = f"""body: {subpost.body} title: {subpost.title} author: {subpost.author.username}"""
                         entries.append(entry)
 
     return entries
diff --git a/src/search_filter/base_filter.py b/src/search_filter/base_filter.py
index 2550b32e..bbc2a4a6 100644
--- a/src/search_filter/base_filter.py
+++ b/src/search_filter/base_filter.py
@@ -1,6 +1,8 @@
 # Standard Packages
 from abc import ABC, abstractmethod
 
+from typing import List, Set, Tuple
+
 
 class BaseFilter(ABC):
     @abstractmethod
@@ -12,5 +14,5 @@ class BaseFilter(ABC):
         pass
 
     @abstractmethod
-    def apply(self, query:str, raw_entries:list[str]) -> tuple[str, set[int]]:
+    def apply(self, query:str, raw_entries:List[str]) -> Tuple[str, Set[int]]:
         pass
\ No newline at end of file
diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py
index d4d8a9d4..ca62f7e2 100644
--- a/src/search_type/text_search.py
+++ b/src/search_type/text_search.py
@@ -14,6 +14,7 @@ from src.utils.config import TextSearchModel
 from src.utils.rawconfig import TextSearchConfig, TextContentConfig
 from src.utils.jsonl import load_jsonl
 
+from typing import List
 
 logger = logging.getLogger(__name__)
 
@@ -179,7 +180,7 @@ def collate_results(hits, entries, count=5):
         in hits[0:count]]
 
 
-def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel:
+def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: List[BaseFilter] = []) -> TextSearchModel:
     # Initialize Model
     bi_encoder, cross_encoder, top_k = initialize_model(search_config)
 
diff --git a/src/utils/config.py b/src/utils/config.py
index 316a3d64..617517c8 100644
--- a/src/utils/config.py
+++ b/src/utils/config.py
@@ -7,6 +7,8 @@ from pathlib import Path
 from src.utils.rawconfig import ConversationProcessorConfig
 from src.search_filter.base_filter import BaseFilter
 
+from typing import List
+
 
 class SearchType(str, Enum):
     Org = "org"
@@ -22,7 +24,7 @@ class ProcessorType(str, Enum):
 
 
 class TextSearchModel():
-    def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: list[BaseFilter], top_k):
+    def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: List[BaseFilter], top_k):
         self.entries = entries
         self.corpus_embeddings = corpus_embeddings
         self.bi_encoder = bi_encoder
diff --git a/src/utils/state.py b/src/utils/state.py
index 283d2b5a..9677f635 100644
--- a/src/utils/state.py
+++ b/src/utils/state.py
@@ -10,6 +10,8 @@ from src.utils.config import SearchModels, ProcessorConfigModel
 from src.utils.helpers import LRU
 from src.utils.rawconfig import FullConfig
 
+from typing import List
+
 # Application Global State
 config = FullConfig()
 model = SearchModels()
@@ -18,7 +20,7 @@ config_file: Path = None
 verbose: int = 0
 host: str = None
 port: int = None
-cli_args: list[str] = None
+cli_args: List[str] = None
 query_cache = LRU()
 
 if torch.cuda.is_available():