mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Setup File Filter for Markdown and Ledger content types
- Pass file associated with entries in markdown, beancount to json converters - Add File, Word, Date Filters to Ledger, Markdown Types - Word, Date Filters were accidentally removed from the above types yesterday - File Filter is the only newly added filter
This commit is contained in:
parent
94cf3e97f3
commit
490157cafa
5 changed files with 62 additions and 28 deletions
|
@ -42,30 +42,62 @@ def configure_server(args, required=False):
|
||||||
def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: SearchType = None):
|
def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: SearchType = None):
|
||||||
# Initialize Org Notes Search
|
# Initialize Org Notes Search
|
||||||
if (t == SearchType.Org or t == None) and config.content_type.org:
|
if (t == SearchType.Org or t == None) and config.content_type.org:
|
||||||
filter_directory = resolve_absolute_path(config.content_type.org.compressed_jsonl.parent)
|
|
||||||
filters = [DateFilter(), WordFilter(filter_directory, search_type=SearchType.Org), FileFilter()]
|
|
||||||
# Extract Entries, Generate Notes Embeddings
|
# Extract Entries, Generate Notes Embeddings
|
||||||
model.orgmode_search = text_search.setup(org_to_jsonl, config.content_type.org, search_config=config.search_type.asymmetric, regenerate=regenerate, filters=filters)
|
model.orgmode_search = text_search.setup(
|
||||||
|
org_to_jsonl,
|
||||||
|
config.content_type.org,
|
||||||
|
search_config=config.search_type.asymmetric,
|
||||||
|
regenerate=regenerate,
|
||||||
|
filters=[
|
||||||
|
DateFilter(),
|
||||||
|
WordFilter(config.content_type.org.compressed_jsonl.parent, SearchType.Org),
|
||||||
|
FileFilter(),
|
||||||
|
])
|
||||||
|
|
||||||
# Initialize Org Music Search
|
# Initialize Org Music Search
|
||||||
if (t == SearchType.Music or t == None) and config.content_type.music:
|
if (t == SearchType.Music or t == None) and config.content_type.music:
|
||||||
# Extract Entries, Generate Music Embeddings
|
# Extract Entries, Generate Music Embeddings
|
||||||
model.music_search = text_search.setup(org_to_jsonl, config.content_type.music, search_config=config.search_type.asymmetric, regenerate=regenerate)
|
model.music_search = text_search.setup(
|
||||||
|
org_to_jsonl,
|
||||||
|
config.content_type.music,
|
||||||
|
search_config=config.search_type.asymmetric,
|
||||||
|
regenerate=regenerate)
|
||||||
|
|
||||||
# Initialize Markdown Search
|
# Initialize Markdown Search
|
||||||
if (t == SearchType.Markdown or t == None) and config.content_type.markdown:
|
if (t == SearchType.Markdown or t == None) and config.content_type.markdown:
|
||||||
# Extract Entries, Generate Markdown Embeddings
|
# Extract Entries, Generate Markdown Embeddings
|
||||||
model.markdown_search = text_search.setup(markdown_to_jsonl, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate)
|
model.markdown_search = text_search.setup(
|
||||||
|
markdown_to_jsonl,
|
||||||
|
config.content_type.markdown,
|
||||||
|
search_config=config.search_type.asymmetric,
|
||||||
|
regenerate=regenerate,
|
||||||
|
filters=[
|
||||||
|
DateFilter(),
|
||||||
|
WordFilter(config.content_type.markdown.compressed_jsonl.parent, SearchType.Markdown),
|
||||||
|
FileFilter(),
|
||||||
|
])
|
||||||
|
|
||||||
# Initialize Ledger Search
|
# Initialize Ledger Search
|
||||||
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
|
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
|
||||||
# Extract Entries, Generate Ledger Embeddings
|
# Extract Entries, Generate Ledger Embeddings
|
||||||
model.ledger_search = text_search.setup(beancount_to_jsonl, config.content_type.ledger, search_config=config.search_type.symmetric, regenerate=regenerate)
|
model.ledger_search = text_search.setup(
|
||||||
|
beancount_to_jsonl,
|
||||||
|
config.content_type.ledger,
|
||||||
|
search_config=config.search_type.symmetric,
|
||||||
|
regenerate=regenerate,
|
||||||
|
filters=[
|
||||||
|
DateFilter(),
|
||||||
|
WordFilter(config.content_type.ledger.compressed_jsonl.parent, SearchType.Ledger),
|
||||||
|
FileFilter(),
|
||||||
|
])
|
||||||
|
|
||||||
# Initialize Image Search
|
# Initialize Image Search
|
||||||
if (t == SearchType.Image or t == None) and config.content_type.image:
|
if (t == SearchType.Image or t == None) and config.content_type.image:
|
||||||
# Extract Entries, Generate Image Embeddings
|
# Extract Entries, Generate Image Embeddings
|
||||||
model.image_search = image_search.setup(config.content_type.image, search_config=config.search_type.image, regenerate=regenerate)
|
model.image_search = image_search.setup(
|
||||||
|
config.content_type.image,
|
||||||
|
search_config=config.search_type.image,
|
||||||
|
regenerate=regenerate)
|
||||||
|
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
|
@ -28,10 +28,10 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file):
|
||||||
beancount_files = get_beancount_files(beancount_files, beancount_file_filter)
|
beancount_files = get_beancount_files(beancount_files, beancount_file_filter)
|
||||||
|
|
||||||
# Extract Entries from specified Beancount files
|
# Extract Entries from specified Beancount files
|
||||||
entries = extract_beancount_entries(beancount_files)
|
entries, transaction_to_file_map = extract_beancount_entries(beancount_files)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
jsonl_data = convert_beancount_entries_to_jsonl(entries)
|
jsonl_data = convert_beancount_entries_to_jsonl(entries, transaction_to_file_map)
|
||||||
|
|
||||||
# Compress JSONL formatted Data
|
# Compress JSONL formatted Data
|
||||||
if output_file.suffix == ".gz":
|
if output_file.suffix == ".gz":
|
||||||
|
@ -74,22 +74,24 @@ def extract_beancount_entries(beancount_files):
|
||||||
empty_newline = f'^[{empty_escape_sequences}]*$'
|
empty_newline = f'^[{empty_escape_sequences}]*$'
|
||||||
|
|
||||||
entries = []
|
entries = []
|
||||||
|
transaction_to_file_map = []
|
||||||
for beancount_file in beancount_files:
|
for beancount_file in beancount_files:
|
||||||
with open(beancount_file) as f:
|
with open(beancount_file) as f:
|
||||||
ledger_content = f.read()
|
ledger_content = f.read()
|
||||||
entries.extend([entry.strip(empty_escape_sequences)
|
transactions_per_file = [entry.strip(empty_escape_sequences)
|
||||||
for entry
|
for entry
|
||||||
in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
|
in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
|
||||||
if re.match(transaction_regex, entry)])
|
if re.match(transaction_regex, entry)]
|
||||||
|
transaction_to_file_map += [beancount_file]*len(transactions_per_file)
|
||||||
return entries
|
entries.extend(transactions_per_file)
|
||||||
|
return entries, transaction_to_file_map
|
||||||
|
|
||||||
|
|
||||||
def convert_beancount_entries_to_jsonl(entries):
|
def convert_beancount_entries_to_jsonl(entries, transaction_to_file_map):
|
||||||
"Convert each Beancount transaction to JSON and collate as JSONL"
|
"Convert each Beancount transaction to JSON and collate as JSONL"
|
||||||
jsonl = ''
|
jsonl = ''
|
||||||
for entry in entries:
|
for entry_id, entry in enumerate(entries):
|
||||||
entry_dict = {'compiled': entry, 'raw': entry}
|
entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry_id]}'}
|
||||||
# Convert Dictionary to JSON and Append to JSONL string
|
# Convert Dictionary to JSON and Append to JSONL string
|
||||||
jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
|
jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
|
||||||
|
|
||||||
|
|
|
@ -28,10 +28,10 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
|
||||||
markdown_files = get_markdown_files(markdown_files, markdown_file_filter)
|
markdown_files = get_markdown_files(markdown_files, markdown_file_filter)
|
||||||
|
|
||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
entries = extract_markdown_entries(markdown_files)
|
entries, entry_to_file_map = extract_markdown_entries(markdown_files)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
jsonl_data = convert_markdown_entries_to_jsonl(entries)
|
jsonl_data = convert_markdown_entries_to_jsonl(entries, entry_to_file_map)
|
||||||
|
|
||||||
# Compress JSONL formatted Data
|
# Compress JSONL formatted Data
|
||||||
if output_file.suffix == ".gz":
|
if output_file.suffix == ".gz":
|
||||||
|
@ -74,21 +74,24 @@ def extract_markdown_entries(markdown_files):
|
||||||
markdown_heading_regex = r'^#'
|
markdown_heading_regex = r'^#'
|
||||||
|
|
||||||
entries = []
|
entries = []
|
||||||
|
entry_to_file_map = []
|
||||||
for markdown_file in markdown_files:
|
for markdown_file in markdown_files:
|
||||||
with open(markdown_file) as f:
|
with open(markdown_file) as f:
|
||||||
markdown_content = f.read()
|
markdown_content = f.read()
|
||||||
entries.extend([f'#{entry.strip(empty_escape_sequences)}'
|
markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}'
|
||||||
for entry
|
for entry
|
||||||
in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)])
|
in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)]
|
||||||
|
entry_to_file_map += [markdown_file]*len(markdown_entries_per_file)
|
||||||
|
entries.extend(markdown_entries_per_file)
|
||||||
|
|
||||||
return entries
|
return entries, entry_to_file_map
|
||||||
|
|
||||||
|
|
||||||
def convert_markdown_entries_to_jsonl(entries):
|
def convert_markdown_entries_to_jsonl(entries, entry_to_file_map):
|
||||||
"Convert each Markdown entries to JSON and collate as JSONL"
|
"Convert each Markdown entries to JSON and collate as JSONL"
|
||||||
jsonl = ''
|
jsonl = ''
|
||||||
for entry in entries:
|
for entry_id, entry in enumerate(entries):
|
||||||
entry_dict = {'compiled': entry, 'raw': entry}
|
entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'}
|
||||||
# Convert Dictionary to JSON and Append to JSONL string
|
# Convert Dictionary to JSON and Append to JSONL string
|
||||||
jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
|
jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
|
||||||
|
|
||||||
|
|
|
@ -45,7 +45,7 @@ class DateFilter(BaseFilter):
|
||||||
continue
|
continue
|
||||||
self.date_to_entry_ids[date_in_entry].add(id)
|
self.date_to_entry_ids[date_in_entry].add(id)
|
||||||
end = time.time()
|
end = time.time()
|
||||||
logger.debug(f"Created file filter index: {end - start} seconds")
|
logger.debug(f"Created date filter index: {end - start} seconds")
|
||||||
|
|
||||||
|
|
||||||
def can_filter(self, raw_query):
|
def can_filter(self, raw_query):
|
||||||
|
|
|
@ -4,9 +4,6 @@ import time
|
||||||
import pickle
|
import pickle
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
# External Packages
|
|
||||||
import torch
|
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from src.search_filter.base_filter import BaseFilter
|
from src.search_filter.base_filter import BaseFilter
|
||||||
from src.utils.helpers import LRU, resolve_absolute_path
|
from src.utils.helpers import LRU, resolve_absolute_path
|
||||||
|
|
Loading…
Reference in a new issue