mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-01-07 03:58:08 +00:00
36 lines
1.2 KiB
Python
36 lines
1.2 KiB
Python
|
from khoj.processor.content.docx.docx_to_entries import DocxToEntries
|
||
|
|
||
|
|
||
|
def test_single_page_docx_to_jsonl():
    """Extract entries from the single page DOCX fixture and check the result."""
    # Arrange: load the DOCX fixture as raw bytes, keyed by its file path
    docx_path = "tests/data/docx/iceland.docx"
    with open(docx_path, "rb") as docx_file:
        docx_files = {docx_path: docx_file.read()}

    # Act: extract entries from the in-memory DOCX file
    extracted = DocxToEntries.extract_docx_entries(docx_files=docx_files)

    # Assert: expected text is present and the result has the expected sizes
    leading_item = extracted[0][docx_path][0]
    assert "The Icelandic horse" in leading_item
    assert len(extracted) == 2
    assert len(extracted[1]) == 1
|
||
|
|
||
|
|
||
|
def test_multi_page_docx_to_jsonl():
    """Extract entries from the multi page DOCX fixture and check the result."""
    # Arrange: load the DOCX fixture as raw bytes, keyed by its file path
    docx_path = "tests/data/docx/bangalore.docx"
    with open(docx_path, "rb") as docx_file:
        docx_files = {docx_path: docx_file.read()}

    # Act: extract entries from the in-memory DOCX file
    extracted = DocxToEntries.extract_docx_entries(docx_files=docx_files)

    # Assert: expected text is present and the result has the expected sizes
    leading_item = extracted[0][docx_path][0]
    assert "Bangalore" in leading_item
    assert len(extracted) == 2
    assert len(extracted[1]) == 1
|