diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index dd797c31..9dbdc093 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -15,7 +15,6 @@ from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data from khoj.utils.rawconfig import Entry -from khoj.utils import state logger = logging.getLogger(__name__) @@ -38,7 +37,7 @@ class GithubToJsonl(TextToJsonl): else: return - def process(self, previous_entries=None): + def process(self, previous_entries=[]): current_entries = [] for repo in self.config.repos: current_entries += self.process_repo(repo) diff --git a/src/khoj/processor/jsonl/jsonl_to_jsonl.py b/src/khoj/processor/jsonl/jsonl_to_jsonl.py index f743d5d5..c033f522 100644 --- a/src/khoj/processor/jsonl/jsonl_to_jsonl.py +++ b/src/khoj/processor/jsonl/jsonl_to_jsonl.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) class JsonlToJsonl(TextToJsonl): # Define Functions - def process(self, previous_entries=None): + def process(self, previous_entries=[]): # Extract required fields from config input_jsonl_files, input_jsonl_filter, output_file = ( self.config.input_files, @@ -38,15 +38,9 @@ class JsonlToJsonl(TextToJsonl): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, - previous_entries, - key="compiled", - logger=logger, - ) + entries_with_ids = TextToJsonl.mark_entries_for_update( + current_entries, previous_entries, key="compiled", logger=logger + ) with timer("Write entries to JSONL file", logger): # Process Each Entry from All Notes Files diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 21cbda72..2da5bd4c 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -23,7 +23,7 @@ class MarkdownToJsonl(TextToJsonl): self.config = config # Define Functions - def process(self, previous_entries=None): + def process(self, previous_entries=[]): # Extract required fields from config markdown_files, markdown_file_filter, output_file = ( self.config.input_files, @@ -51,12 +51,9 @@ class MarkdownToJsonl(TextToJsonl): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger - ) + entries_with_ids = TextToJsonl.mark_entries_for_update( + current_entries, previous_entries, key="compiled", logger=logger + ) with timer("Write markdown entries to JSONL file", logger): # Process Each Entry from All Notes Files diff --git a/src/khoj/processor/notion/notion_to_jsonl.py b/src/khoj/processor/notion/notion_to_jsonl.py index 20a11cd7..d4cd78f3 100644 --- a/src/khoj/processor/notion/notion_to_jsonl.py +++ b/src/khoj/processor/notion/notion_to_jsonl.py @@ -80,7 +80,7 @@ class NotionToJsonl(TextToJsonl): self.body_params = {"page_size": 100} - def process(self, previous_entries=None): + def process(self, previous_entries=[]): current_entries = [] # Get all pages @@ -240,12 +240,9 @@ class NotionToJsonl(TextToJsonl): def update_entries_with_ids(self, current_entries, previous_entries): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger - ) + entries_with_ids = TextToJsonl.mark_entries_for_update( + current_entries, previous_entries, key="compiled", logger=logger + ) with timer("Write Notion entries to JSONL file", logger): # Process Each Entry from all Notion entries diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index 608fdd41..b00a6c50 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -22,7 +22,7 @@ class OrgToJsonl(TextToJsonl): self.config = config # Define Functions - def process(self, previous_entries: List[Entry] = None): + def process(self, previous_entries: List[Entry] = []): # Extract required fields from config org_files, org_file_filter, output_file = ( self.config.input_files, @@ -51,9 +51,7 @@ class OrgToJsonl(TextToJsonl): current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) # Identify, mark and merge any new entries with previous entries - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: + with timer("Identify new or updated entries", logger): entries_with_ids = TextToJsonl.mark_entries_for_update( current_entries, previous_entries, key="compiled", logger=logger ) diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py index c5c395bc..e41fd976 100644 --- a/src/khoj/processor/pdf/pdf_to_jsonl.py +++ b/src/khoj/processor/pdf/pdf_to_jsonl.py @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) class PdfToJsonl(TextToJsonl): # Define Functions - def process(self, previous_entries=None): + def process(self, previous_entries=[]): # Extract required fields from config pdf_files, pdf_file_filter, output_file = ( self.config.input_files, @@ -45,12 +45,9 @@ class PdfToJsonl(TextToJsonl): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger - ) + entries_with_ids = TextToJsonl.mark_entries_for_update( + current_entries, previous_entries, key="compiled", logger=logger + ) with timer("Write PDF entries to JSONL file", logger): # Process Each Entry from All Notes Files diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py index a4d01cf5..ff169fac 100644 --- a/src/khoj/processor/text_to_jsonl.py +++ b/src/khoj/processor/text_to_jsonl.py @@ -17,7 +17,7 @@ class TextToJsonl(ABC): self.config = config @abstractmethod - def process(self, previous_entries: List[Entry] = None) -> List[Tuple[int, Entry]]: + def process(self, previous_entries: List[Entry] = []) -> List[Tuple[int, Entry]]: ... @staticmethod diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index a77be6e1..edc735f2 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -176,10 +176,10 @@ def setup( ) -> TextContent: # Map notes in text files to (compressed) JSONL formatted file config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl) - previous_entries = ( - extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() and not regenerate else None - ) - entries_with_indices = text_to_jsonl(config).process(previous_entries or []) + previous_entries = [] + if config.compressed_jsonl.exists() and not regenerate: + previous_entries = extract_entries(config.compressed_jsonl) + entries_with_indices = text_to_jsonl(config).process(previous_entries) # Extract Updated Entries entries = extract_entries(config.compressed_jsonl)