From ed693afd685664dac8b073ab856ce676060354aa Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sun, 7 Jul 2024 02:12:37 +0530
Subject: [PATCH] Reduce max embedding chunk size to fit token limit of
 standard bert variants

Reduce the embedding model's max prompt size from 256 to 128 words. A
word can span 3-4 tokens, so 128 * 4 = 512 tokens should be the upper
limit when splitting text into chunks.
---
 src/khoj/processor/content/docx/docx_to_entries.py         | 2 +-
 src/khoj/processor/content/github/github_to_entries.py     | 2 +-
 src/khoj/processor/content/images/image_to_entries.py      | 2 +-
 src/khoj/processor/content/markdown/markdown_to_entries.py | 6 +++---
 src/khoj/processor/content/notion/notion_to_entries.py     | 2 +-
 src/khoj/processor/content/org_mode/org_to_entries.py      | 6 +++---
 src/khoj/processor/content/pdf/pdf_to_entries.py           | 2 +-
 .../processor/content/plaintext/plaintext_to_entries.py    | 2 +-
 src/khoj/processor/content/text_to_entries.py              | 2 +-
 tests/test_text_search.py                                  | 4 ++--
 10 files changed, 15 insertions(+), 15 deletions(-)
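Reviewer note (this section is ignored by git am): a minimal sketch of the
word-to-token budget the commit message describes. The names below are
illustrative, not part of the khoj codebase.

# Why 128 words: fit the 512 token context window of standard BERT
# variants, assuming a conservative worst case of ~4 tokens per word.
MODEL_MAX_TOKENS = 512  # context window of standard BERT variants
TOKENS_PER_WORD = 4     # pessimistic bound; typical English prose averages far fewer

def max_words_per_chunk(max_tokens: int = MODEL_MAX_TOKENS, tokens_per_word: int = TOKENS_PER_WORD) -> int:
    # Largest word count whose worst-case tokenization still fits the model
    return max_tokens // tokens_per_word

assert max_words_per_chunk() == 128  # hence max_tokens=128 throughout this patch

The conservative bound trades slightly smaller chunks for a guarantee that
the tokenizer never silently truncates text past the model's context window.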
diff --git a/src/khoj/processor/content/docx/docx_to_entries.py b/src/khoj/processor/content/docx/docx_to_entries.py
index ab28066d..06aa085d 100644
--- a/src/khoj/processor/content/docx/docx_to_entries.py
+++ b/src/khoj/processor/content/docx/docx_to_entries.py
@@ -36,7 +36,7 @@ class DocxToEntries(TextToEntries):
 
         # Split entries by max tokens supported by model
         with timer("Split entries by max token size supported by model", logger):
-            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=128)
 
         # Identify, mark and merge any new entries with previous entries
         with timer("Identify new or updated entries", logger):
diff --git a/src/khoj/processor/content/github/github_to_entries.py b/src/khoj/processor/content/github/github_to_entries.py
index 2aa63d4e..cc388c20 100644
--- a/src/khoj/processor/content/github/github_to_entries.py
+++ b/src/khoj/processor/content/github/github_to_entries.py
@@ -95,7 +95,7 @@ class GithubToEntries(TextToEntries):
         )
 
         with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
-            current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
+            current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=128)
 
         return current_entries
 
diff --git a/src/khoj/processor/content/images/image_to_entries.py b/src/khoj/processor/content/images/image_to_entries.py
index 20705a0f..8f6ab8ef 100644
--- a/src/khoj/processor/content/images/image_to_entries.py
+++ b/src/khoj/processor/content/images/image_to_entries.py
@@ -37,7 +37,7 @@ class ImageToEntries(TextToEntries):
 
         # Split entries by max tokens supported by model
         with timer("Split entries by max token size supported by model", logger):
-            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=128)
 
         # Identify, mark and merge any new entries with previous entries
         with timer("Identify new or updated entries", logger):
diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py
index f18e1e21..7cf63b56 100644
--- a/src/khoj/processor/content/markdown/markdown_to_entries.py
+++ b/src/khoj/processor/content/markdown/markdown_to_entries.py
@@ -30,7 +30,7 @@ class MarkdownToEntries(TextToEntries):
         else:
             deletion_file_names = None
 
-        max_tokens = 256
+        max_tokens = 128
         # Extract Entries from specified Markdown files
         with timer("Extract entries from specified Markdown files", logger):
             file_to_text_map, current_entries = MarkdownToEntries.extract_markdown_entries(files, max_tokens)
@@ -56,7 +56,7 @@ class MarkdownToEntries(TextToEntries):
         return num_new_embeddings, num_deleted_embeddings
 
     @staticmethod
-    def extract_markdown_entries(markdown_files, max_tokens=256) -> Tuple[Dict, List[Entry]]:
+    def extract_markdown_entries(markdown_files, max_tokens=128) -> Tuple[Dict, List[Entry]]:
         "Extract entries by heading from specified Markdown files"
         entries: List[str] = []
         entry_to_file_map: List[Tuple[str, str]] = []
@@ -81,7 +81,7 @@ class MarkdownToEntries(TextToEntries):
         markdown_file: str,
         entries: List[str],
         entry_to_file_map: List[Tuple[str, str]],
-        max_tokens=256,
+        max_tokens=128,
         ancestry: Dict[int, str] = {},
     ) -> Tuple[List[str], List[Tuple[str, str]]]:
         # Prepend the markdown section's heading ancestry
diff --git a/src/khoj/processor/content/notion/notion_to_entries.py b/src/khoj/processor/content/notion/notion_to_entries.py
index 57456ed5..9a6a70a4 100644
--- a/src/khoj/processor/content/notion/notion_to_entries.py
+++ b/src/khoj/processor/content/notion/notion_to_entries.py
@@ -112,7 +112,7 @@ class NotionToEntries(TextToEntries):
                 page_entries = self.process_page(p_or_d)
             current_entries.extend(page_entries)
 
-        current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
+        current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=128)
 
         return self.update_entries_with_ids(current_entries, user=user)
diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py
index c528244d..2de67e90 100644
--- a/src/khoj/processor/content/org_mode/org_to_entries.py
+++ b/src/khoj/processor/content/org_mode/org_to_entries.py
@@ -31,7 +31,7 @@ class OrgToEntries(TextToEntries):
             deletion_file_names = None
 
         # Extract Entries from specified Org files
-        max_tokens = 256
+        max_tokens = 128
         with timer("Extract entries from specified Org files", logger):
             file_to_text_map, current_entries = self.extract_org_entries(files, max_tokens=max_tokens)
 
@@ -56,7 +56,7 @@ class OrgToEntries(TextToEntries):
 
     @staticmethod
     def extract_org_entries(
-        org_files: dict[str, str], index_heading_entries: bool = False, max_tokens=256
+        org_files: dict[str, str], index_heading_entries: bool = False, max_tokens=128
     ) -> Tuple[Dict, List[Entry]]:
         "Extract entries from specified Org files"
         file_to_text_map, entries, entry_to_file_map = OrgToEntries.extract_org_nodes(org_files, max_tokens)
@@ -90,7 +90,7 @@ class OrgToEntries(TextToEntries):
         org_file: str,
         entries: List[List[Orgnode]],
         entry_to_file_map: List[Tuple[Orgnode, str]],
-        max_tokens=256,
+        max_tokens=128,
         ancestry: Dict[int, str] = {},
     ) -> Tuple[List[List[Orgnode]], List[Tuple[Orgnode, str]]]:
         """Parse org_content from org_file into OrgNode entries
diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py
index 45ff7261..016b30df 100644
--- a/src/khoj/processor/content/pdf/pdf_to_entries.py
+++ b/src/khoj/processor/content/pdf/pdf_to_entries.py
@@ -39,7 +39,7 @@ class PdfToEntries(TextToEntries):
 
         # Split entries by max tokens supported by model
         with timer("Split entries by max token size supported by model", logger):
-            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=128)
 
         # Identify, mark and merge any new entries with previous entries
         with timer("Identify new or updated entries", logger):
diff --git a/src/khoj/processor/content/plaintext/plaintext_to_entries.py b/src/khoj/processor/content/plaintext/plaintext_to_entries.py
index 2c994899..ee400d85 100644
--- a/src/khoj/processor/content/plaintext/plaintext_to_entries.py
+++ b/src/khoj/processor/content/plaintext/plaintext_to_entries.py
@@ -36,7 +36,7 @@ class PlaintextToEntries(TextToEntries):
 
         # Split entries by max tokens supported by model
         with timer("Split entries by max token size supported by model", logger):
-            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256, raw_is_compiled=True)
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=128, raw_is_compiled=True)
 
         # Identify, mark and merge any new entries with previous entries
         with timer("Identify new or updated entries", logger):
diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py
index cdb2e207..75619443 100644
--- a/src/khoj/processor/content/text_to_entries.py
+++ b/src/khoj/processor/content/text_to_entries.py
@@ -63,7 +63,7 @@ class TextToEntries(ABC):
 
     @staticmethod
     def split_entries_by_max_tokens(
-        entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500, raw_is_compiled: bool = False
+        entries: List[Entry], max_tokens: int = 128, max_word_length: int = 500, raw_is_compiled: bool = False
     ) -> List[Entry]:
         "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
         chunked_entries: List[Entry] = []
diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index 915425bf..4f041649 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -172,7 +172,7 @@ async def test_text_search(search_config: SearchConfig):
 def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog):
     # Arrange
     # Insert org-mode entry with size exceeding max token limit to new org file
-    max_tokens = 256
+    max_tokens = 128
     new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
     with open(new_file_to_index, "w") as f:
         f.write(f"* Entry more than {max_tokens} words\n")
@@ -224,7 +224,7 @@ conda activate khoj
         user=default_user,
     )
 
-    max_tokens = 256
+    max_tokens = 128
     new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
     with open(new_file_to_index, "w") as f:
         f.write(f"* Entry more than {max_tokens} words\n")
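
Reviewer note: the chunking this patch tightens is plain word-window
splitting, as exercised by the test above. Below is a simplified,
self-contained model of that behavior; the real
TextToEntries.split_entries_by_max_tokens additionally carries Entry
metadata and clips words longer than max_word_length.

def split_by_max_words(compiled: str, max_words: int = 128) -> list[str]:
    # Split on whitespace and emit fixed-size word windows, so each
    # chunk stays within the embedding model's worst-case token budget
    words = compiled.split()
    return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]

chunks = split_by_max_words("word " * 300)
assert [len(c.split()) for c in chunks] == [128, 128, 44]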