Parse markdown file as single entry if it fits within max token limits

These changes improve the entry context available to the search model.
Specifically, this should improve entry context from short knowledge trees,
that is, knowledge bases with small files.

Previously we split all markdown files by their headings,
even if the file was small enough to fit entirely within the max token
limits of the search model. This reduced the context available
to the search model for selecting the appropriate entries for a given query,
especially from short knowledge trees.
This commit is contained in:
Debanjum Singh Solanky 2024-02-10 14:34:09 +05:30
parent d8f01876e5
commit 982ac1859c
2 changed files with 41 additions and 9 deletions

View file

@ -31,13 +31,14 @@ class MarkdownToEntries(TextToEntries):
else: else:
deletion_file_names = None deletion_file_names = None
max_tokens = 256
# Extract Entries from specified Markdown files # Extract Entries from specified Markdown files
with timer("Extract entries from specified Markdown files", logger): with timer("Extract entries from specified Markdown files", logger):
current_entries = MarkdownToEntries.extract_markdown_entries(files) current_entries = MarkdownToEntries.extract_markdown_entries(files, max_tokens)
# Split entries by max tokens supported by model # Split entries by max tokens supported by model
with timer("Split entries by max token size supported by model", logger): with timer("Split entries by max token size supported by model", logger):
current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens)
# Identify, mark and merge any new entries with previous entries # Identify, mark and merge any new entries with previous entries
with timer("Identify new or updated entries", logger): with timer("Identify new or updated entries", logger):
@ -55,7 +56,7 @@ class MarkdownToEntries(TextToEntries):
return num_new_embeddings, num_deleted_embeddings return num_new_embeddings, num_deleted_embeddings
@staticmethod @staticmethod
def extract_markdown_entries(markdown_files) -> List[Entry]: def extract_markdown_entries(markdown_files, max_tokens=256) -> List[Entry]:
"Extract entries by heading from specified Markdown files" "Extract entries by heading from specified Markdown files"
entries: List[str] = [] entries: List[str] = []
entry_to_file_map: List[Tuple[str, Path]] = [] entry_to_file_map: List[Tuple[str, Path]] = []
@ -63,7 +64,7 @@ class MarkdownToEntries(TextToEntries):
try: try:
markdown_content = markdown_files[markdown_file] markdown_content = markdown_files[markdown_file]
entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file( entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
markdown_content, markdown_file, entries, entry_to_file_map markdown_content, markdown_file, entries, entry_to_file_map, max_tokens
) )
except Exception as e: except Exception as e:
logger.warning( logger.warning(
@ -74,8 +75,17 @@ class MarkdownToEntries(TextToEntries):
@staticmethod @staticmethod
def process_single_markdown_file( def process_single_markdown_file(
markdown_content: str, markdown_file: Path, entries: List[str], entry_to_file_map: List[Tuple[str, Path]] markdown_content: str,
markdown_file: Path,
entries: List[str],
entry_to_file_map: List[Tuple[str, Path]],
max_tokens=256,
): ):
if len(TextToEntries.tokenizer(markdown_content)) <= max_tokens:
entry_to_file_map += [(markdown_content, markdown_file)]
entries.extend([markdown_content])
return entries, entry_to_file_map
headers_to_split_on = [("#", "1"), ("##", "2"), ("###", "3"), ("####", "4"), ("#####", "5"), ("######", "6")] headers_to_split_on = [("#", "1"), ("##", "2"), ("###", "3"), ("####", "4"), ("#####", "5"), ("######", "6")]
reversed_headers_to_split_on = list(reversed(headers_to_split_on)) reversed_headers_to_split_on = list(reversed(headers_to_split_on))
markdown_entries_per_file: List[str] = [] markdown_entries_per_file: List[str] = []

View file

@ -20,7 +20,7 @@ def test_extract_markdown_with_no_headings(tmp_path):
# Act # Act
# Extract Entries from specified Markdown files # Extract Entries from specified Markdown files
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)
# Assert # Assert
assert len(entries) == 1 assert len(entries) == 1
@ -45,7 +45,7 @@ def test_extract_single_markdown_entry(tmp_path):
# Act # Act
# Extract Entries from specified Markdown files # Extract Entries from specified Markdown files
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)
# Assert # Assert
assert len(entries) == 1 assert len(entries) == 1
@ -68,7 +68,7 @@ def test_extract_multiple_markdown_entries(tmp_path):
# Act # Act
# Extract Entries from specified Markdown files # Extract Entries from specified Markdown files
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)
# Assert # Assert
assert len(entries) == 2 assert len(entries) == 2
@ -127,7 +127,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
# Act # Act
# Extract Entries from specified Markdown files # Extract Entries from specified Markdown files
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)
# Assert # Assert
assert len(entries) == 3 assert len(entries) == 3
@ -161,6 +161,28 @@ body line 2
assert entries[2].raw == "# Heading 1\n## Heading 2\nbody line 2", "Ensure raw entry includes heading ancestory" assert entries[2].raw == "# Heading 1\n## Heading 2\nbody line 2", "Ensure raw entry includes heading ancestory"
def test_parse_markdown_file_into_single_entry_if_small(tmp_path):
"Parse markdown file into single entry if it fits within the token limits."
# Arrange
entry = f"""
# Heading 1
body line 1
## Subheading 1.1
body line 1.1
"""
data = {
f"{tmp_path}": entry,
}
# Act
# Extract Entries from specified Markdown files
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=12)
# Assert
assert len(entries) == 1
assert entries[0].raw == entry
# Helper Functions # Helper Functions
def create_file(tmp_path: Path, entry=None, filename="test.md"): def create_file(tmp_path: Path, entry=None, filename="test.md"):
markdown_file = tmp_path / filename markdown_file = tmp_path / filename