Update the panchayat YAML-to-JSONL converter to compile additional attributes

- Update list type hints to use the List type from the typing module
- Parse the number of upvotes and the created date of each post
- Add support for word filter and date filter on compiled entries
This commit is contained in:
Saba 2022-12-28 09:50:44 -03:00
parent 21eb58156c
commit 21a9fbcea3
10 changed files with 28 additions and 20 deletions

View file

@ -2,7 +2,7 @@ content-type:
panchayat: panchayat:
input-files: null input-files: null
input-filter: "/home/saba/projects/panchayat/panchayat/instance/*.yaml" input-filter: ["/home/saba/projects/panchayat/panchayat/instance/*.yaml"]
compressed-jsonl: "./khoj/embeddings/panchyat.jsonl.gz" compressed-jsonl: "./khoj/embeddings/panchyat.jsonl.gz"
embeddings-file: "./khoj/embeddings/panchayat_embeddings.pt" embeddings-file: "./khoj/embeddings/panchayat_embeddings.pt"

View file

@ -79,7 +79,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
config.content_type.panchayat, config.content_type.panchayat,
search_config=config.search_type.asymmetric, search_config=config.search_type.asymmetric,
regenerate=regenerate, regenerate=regenerate,
filters=[]) filters=[DateFilter(entry_key='compiled'), WordFilter(entry_key='compiled')])
# Initialize Ledger Search # Initialize Ledger Search
if (t == SearchType.Ledger or t == None) and config.content_type.ledger: if (t == SearchType.Ledger or t == None) and config.content_type.ledger:

View file

@ -12,6 +12,7 @@ from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_
from src.utils.constants import empty_escape_sequences from src.utils.constants import empty_escape_sequences
from src.utils.jsonl import dump_jsonl, compress_jsonl_data from src.utils.jsonl import dump_jsonl, compress_jsonl_data
from src.utils.rawconfig import TextContentConfig from src.utils.rawconfig import TextContentConfig
from typing import List
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -112,7 +113,7 @@ def extract_beancount_transactions(beancount_files):
return entries, dict(transaction_to_file_map) return entries, dict(transaction_to_file_map)
def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]: def convert_transactions_to_maps(entries: List[str], transaction_to_file_map) -> List[dict]:
"Convert each Beancount transaction into a dictionary" "Convert each Beancount transaction into a dictionary"
entry_maps = [] entry_maps = []
for entry in entries: for entry in entries:
@ -123,6 +124,6 @@ def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) ->
return entry_maps return entry_maps
def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str: def convert_transaction_maps_to_jsonl(entries: List[dict]) -> str:
"Convert each Beancount transaction dictionary to JSON and collate as JSONL" "Convert each Beancount transaction dictionary to JSON and collate as JSONL"
return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])

View file

@ -13,6 +13,7 @@ from src.utils.constants import empty_escape_sequences
from src.utils.jsonl import dump_jsonl, compress_jsonl_data from src.utils.jsonl import dump_jsonl, compress_jsonl_data
from src.utils.rawconfig import TextContentConfig from src.utils.rawconfig import TextContentConfig
from typing import List
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -111,7 +112,7 @@ def extract_markdown_entries(markdown_files):
return entries, dict(entry_to_file_map) return entries, dict(entry_to_file_map)
def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]: def convert_markdown_entries_to_maps(entries: List[str], entry_to_file_map) -> List[dict]:
"Convert each Markdown entries into a dictionary" "Convert each Markdown entries into a dictionary"
entry_maps = [] entry_maps = []
for entry in entries: for entry in entries:

View file

@ -14,6 +14,8 @@ from src.utils.jsonl import dump_jsonl, compress_jsonl_data
from src.utils import state from src.utils import state
from src.utils.rawconfig import TextContentConfig from src.utils.rawconfig import TextContentConfig
from typing import List
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -105,7 +107,7 @@ def extract_org_entries(org_files):
return entries, dict(entry_to_file_map) return entries, dict(entry_to_file_map)
def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]: def convert_org_nodes_to_entries(entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> List[dict]:
"Convert Org-Mode entries into list of dictionary" "Convert Org-Mode entries into list of dictionary"
entry_maps = [] entry_maps = []
for entry in entries: for entry in entries:

View file

@ -106,18 +106,15 @@ def extract_panchayat_entries(yaml_files):
for subpost in all_subposts: for subpost in all_subposts:
if subpost.post_id not in seen_ids: if subpost.post_id not in seen_ids:
seen_ids.add(subpost.post_id) seen_ids.add(subpost.post_id)
# entry = VDBEntry(post_id=subpost.post_id, body=subpost.body, title=subpost.title, author=subpost.author)
# entry = {
# 'post_id': subpost.post_id,
# 'body': subpost.body,
# 'title': subpost.title,
# 'author': subpost.author.username
# }
entry = dict() entry = dict()
entry['compiled'] = f'body: {subpost.body} author: {subpost.author.username} title: {subpost.title}'
entry['compiled'] = f"""body: {subpost.body}
author: {subpost.author.username}
title: {subpost.title}
created: {subpost.created}
upvotes: {len(subpost.upvotes)}"""
entry['raw'] = subpost.post_id entry['raw'] = subpost.post_id
# entry = f"""body: {subpost.body} title: {subpost.title} author: {subpost.author.username}"""
entries.append(entry) entries.append(entry)
return entries return entries

View file

@ -1,6 +1,8 @@
# Standard Packages # Standard Packages
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import List, Set, Tuple
class BaseFilter(ABC): class BaseFilter(ABC):
@abstractmethod @abstractmethod
@ -12,5 +14,5 @@ class BaseFilter(ABC):
pass pass
@abstractmethod @abstractmethod
def apply(self, query:str, raw_entries:list[str]) -> tuple[str, set[int]]: def apply(self, query:str, raw_entries:List[str]) -> Tuple[str, Set[int]]:
pass pass

View file

@ -14,6 +14,7 @@ from src.utils.config import TextSearchModel
from src.utils.rawconfig import TextSearchConfig, TextContentConfig from src.utils.rawconfig import TextSearchConfig, TextContentConfig
from src.utils.jsonl import load_jsonl from src.utils.jsonl import load_jsonl
from typing import List
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -179,7 +180,7 @@ def collate_results(hits, entries, count=5):
in hits[0:count]] in hits[0:count]]
def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel: def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: List[BaseFilter] = []) -> TextSearchModel:
# Initialize Model # Initialize Model
bi_encoder, cross_encoder, top_k = initialize_model(search_config) bi_encoder, cross_encoder, top_k = initialize_model(search_config)

View file

@ -7,6 +7,8 @@ from pathlib import Path
from src.utils.rawconfig import ConversationProcessorConfig from src.utils.rawconfig import ConversationProcessorConfig
from src.search_filter.base_filter import BaseFilter from src.search_filter.base_filter import BaseFilter
from typing import List
class SearchType(str, Enum): class SearchType(str, Enum):
Org = "org" Org = "org"
@ -22,7 +24,7 @@ class ProcessorType(str, Enum):
class TextSearchModel(): class TextSearchModel():
def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: list[BaseFilter], top_k): def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: List[BaseFilter], top_k):
self.entries = entries self.entries = entries
self.corpus_embeddings = corpus_embeddings self.corpus_embeddings = corpus_embeddings
self.bi_encoder = bi_encoder self.bi_encoder = bi_encoder

View file

@ -10,6 +10,8 @@ from src.utils.config import SearchModels, ProcessorConfigModel
from src.utils.helpers import LRU from src.utils.helpers import LRU
from src.utils.rawconfig import FullConfig from src.utils.rawconfig import FullConfig
from typing import List
# Application Global State # Application Global State
config = FullConfig() config = FullConfig()
model = SearchModels() model = SearchModels()
@ -18,7 +20,7 @@ config_file: Path = None
verbose: int = 0 verbose: int = 0
host: str = None host: str = None
port: int = None port: int = None
cli_args: list[str] = None cli_args: List[str] = None
query_cache = LRU() query_cache = LRU()
if torch.cuda.is_available(): if torch.cuda.is_available():