Update panchayat yaml to jsonl file to compile additional attributes
- Update typing for list to use the List object from typing module
- Parse number of upvotes, created date
- Add support for word filter and date filter on compiled entries
parent 21eb58156c
commit 21a9fbcea3

10 changed files with 28 additions and 20 deletions
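The repeated list[...] to List[...] substitutions in the hunks below are consistent with keeping the code running on Python 3.8, where subscripting built-in types in annotations fails at runtime (that rationale is inferred, not stated in the commit). A minimal sketch, with to_maps as a made-up stand-in rather than one of the repo's functions:

    from typing import List

    # Under Python 3.8, built-in generics raise when the annotation is evaluated:
    #   def to_maps(entries: list[str]) -> list[dict]: ...
    #   TypeError: 'type' object is not subscriptable
    # The typing-module aliases work on 3.7+:
    def to_maps(entries: List[str]) -> List[dict]:
        return [{'raw': entry} for entry in entries]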
@@ -2,7 +2,7 @@ content-type:
   panchayat:
     input-files: null
-    input-filter: "/home/saba/projects/panchayat/panchayat/instance/*.yaml"
+    input-filter: ["/home/saba/projects/panchayat/panchayat/instance/*.yaml"]
     compressed-jsonl: "./khoj/embeddings/panchyat.jsonl.gz"
     embeddings-file: "./khoj/embeddings/panchayat_embeddings.pt"
@@ -79,7 +79,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             config.content_type.panchayat,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
-            filters=[])
+            filters=[DateFilter(entry_key='compiled'), WordFilter(entry_key='compiled')])

     # Initialize Ledger Search
     if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
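With both filters keyed on 'compiled', query-time constraints are matched against each entry's compiled text rather than its raw field. A hypothetical query, assuming khoj's usual filter syntax (dt>= for the date filter, +"term" / -"term" for the word filter):

    query = 'road repair +"ward" dt>="2022-09-01"'
    # DateFilter(entry_key='compiled') can now match against the
    # "created: ..." line embedded in entry['compiled'];
    # WordFilter(entry_key='compiled') keeps entries whose compiled
    # text contains "ward" and drops the rest.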
@@ -12,6 +12,7 @@ from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils.rawconfig import TextContentConfig
+from typing import List


 logger = logging.getLogger(__name__)
@@ -112,7 +113,7 @@ def extract_beancount_transactions(beancount_files):
     return entries, dict(transaction_to_file_map)


-def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]:
+def convert_transactions_to_maps(entries: List[str], transaction_to_file_map) -> List[dict]:
     "Convert each Beancount transaction into a dictionary"
     entry_maps = []
     for entry in entries:
@@ -123,6 +124,6 @@ def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) ->
     return entry_maps


-def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str:
+def convert_transaction_maps_to_jsonl(entries: List[dict]) -> str:
     "Convert each Beancount transaction dictionary to JSON and collate as JSONL"
     return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
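For illustration, the JSONL collation shown above simply emits one JSON object per line. The sample entry map here is hypothetical, since this diff does not show which keys beancount entries carry:

    import json

    def convert_transaction_maps_to_jsonl(entries):
        "Convert each Beancount transaction dictionary to JSON and collate as JSONL"
        return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])

    print(convert_transaction_maps_to_jsonl([{'compiled': '2022-09-01 * "Lunch" 12.00 USD'}]))
    # Output: {"compiled": "2022-09-01 * \"Lunch\" 12.00 USD"}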
@@ -13,6 +13,7 @@ from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils.rawconfig import TextContentConfig
+from typing import List


 logger = logging.getLogger(__name__)
@@ -111,7 +112,7 @@ def extract_markdown_entries(markdown_files):
     return entries, dict(entry_to_file_map)


-def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
+def convert_markdown_entries_to_maps(entries: List[str], entry_to_file_map) -> List[dict]:
     "Convert each Markdown entries into a dictionary"
     entry_maps = []
     for entry in entries:
@@ -14,6 +14,8 @@ from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils import state
 from src.utils.rawconfig import TextContentConfig

+from typing import List
+

 logger = logging.getLogger(__name__)
@@ -105,7 +107,7 @@ def extract_org_entries(org_files):
     return entries, dict(entry_to_file_map)


-def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
+def convert_org_nodes_to_entries(entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> List[dict]:
     "Convert Org-Mode entries into list of dictionary"
     entry_maps = []
     for entry in entries:
@@ -106,18 +106,15 @@ def extract_panchayat_entries(yaml_files):
     for subpost in all_subposts:
         if subpost.post_id not in seen_ids:
             seen_ids.add(subpost.post_id)

-            # entry = VDBEntry(post_id=subpost.post_id, body=subpost.body, title=subpost.title, author=subpost.author)
-            # entry = {
-            #     'post_id': subpost.post_id,
-            #     'body': subpost.body,
-            #     'title': subpost.title,
-            #     'author': subpost.author.username
-            # }
             entry = dict()
-            entry['compiled'] = f'body: {subpost.body} author: {subpost.author.username} title: {subpost.title}'
+            entry['compiled'] = f"""body: {subpost.body}
+author: {subpost.author.username}
+title: {subpost.title}
+created: {subpost.created}
+upvotes: {len(subpost.upvotes)}"""
+
             entry['raw'] = subpost.post_id
-            # entry = f"""body: {subpost.body} title: {subpost.title} author: {subpost.author.username}"""
             entries.append(entry)

     return entries
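A hypothetical subpost makes the new compiled shape concrete; the created: and upvotes: lines are what give the date and word filters something to match on:

    # All values below are made up for illustration.
    body, username, title = 'Road repair scheduled for Monday', 'saba', 'Ward 4 notice'
    created, upvotes = '2022-09-01', ['user_a', 'user_b', 'user_c']

    entry = dict()
    entry['compiled'] = f"""body: {body}
    author: {username}
    title: {title}
    created: {created}
    upvotes: {len(upvotes)}"""
    entry['raw'] = 'post_42'  # raw keeps only the post id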
@@ -1,6 +1,8 @@
 # Standard Packages
 from abc import ABC, abstractmethod

+from typing import List, Set, Tuple
+

 class BaseFilter(ABC):
     @abstractmethod
@@ -12,5 +14,5 @@ class BaseFilter(ABC):
         pass

     @abstractmethod
-    def apply(self, query:str, raw_entries:list[str]) -> tuple[str, set[int]]:
+    def apply(self, query:str, raw_entries:List[str]) -> Tuple[str, Set[int]]:
         pass
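As a sketch of the contract only (not khoj's actual filter implementations, and assuming the class's other abstract method, hidden by this hunk, is a can_filter predicate): apply strips its filter tokens from the query and returns the pruned query plus the indices of entries that survive.

    import re
    from abc import ABC, abstractmethod
    from typing import List, Set, Tuple

    class BaseFilter(ABC):
        @abstractmethod
        def can_filter(self, raw_query: str) -> bool: ...

        @abstractmethod
        def apply(self, query: str, raw_entries: List[str]) -> Tuple[str, Set[int]]: ...

    class RequiredWordFilter(BaseFilter):
        "Illustrative filter: keep only entries containing every +\"term\" in the query."
        def can_filter(self, raw_query: str) -> bool:
            return '+"' in raw_query

        def apply(self, query: str, raw_entries: List[str]) -> Tuple[str, Set[int]]:
            required = re.findall(r'\+"([^"]+)"', query)
            pruned_query = re.sub(r'\+"[^"]+"', '', query).strip()
            included = {idx for idx, entry in enumerate(raw_entries)
                        if all(word.lower() in entry.lower() for word in required)}
            return pruned_query, included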
@@ -14,6 +14,7 @@ from src.utils.config import TextSearchModel
 from src.utils.rawconfig import TextSearchConfig, TextContentConfig
 from src.utils.jsonl import load_jsonl

+from typing import List

 logger = logging.getLogger(__name__)
@@ -179,7 +180,7 @@ def collate_results(hits, entries, count=5):
         in hits[0:count]]


-def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel:
+def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: List[BaseFilter] = []) -> TextSearchModel:
     # Initialize Model
     bi_encoder, cross_encoder, top_k = initialize_model(search_config)
@@ -7,6 +7,8 @@ from pathlib import Path
 from src.utils.rawconfig import ConversationProcessorConfig
 from src.search_filter.base_filter import BaseFilter

+from typing import List
+

 class SearchType(str, Enum):
     Org = "org"
@@ -22,7 +24,7 @@ class ProcessorType(str, Enum):


 class TextSearchModel():
-    def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: list[BaseFilter], top_k):
+    def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: List[BaseFilter], top_k):
         self.entries = entries
         self.corpus_embeddings = corpus_embeddings
         self.bi_encoder = bi_encoder
@@ -10,6 +10,8 @@ from src.utils.config import SearchModels, ProcessorConfigModel
 from src.utils.helpers import LRU
 from src.utils.rawconfig import FullConfig

+from typing import List
+
 # Application Global State
 config = FullConfig()
 model = SearchModels()
@@ -18,7 +20,7 @@ config_file: Path = None
 verbose: int = 0
 host: str = None
 port: int = None
-cli_args: list[str] = None
+cli_args: List[str] = None
 query_cache = LRU()

 if torch.cuda.is_available():