mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Support Indexing Docx Files (#801)
* Add support for indexing docx files and associated unit tests --------- Co-authored-by: sabaimran <narmiabas@gmail.com>
This commit is contained in:
parent
d4e5c95711
commit
bd3b590153
15 changed files with 193 additions and 4 deletions
|
@ -87,6 +87,7 @@ dependencies = [
|
|||
"cron-descriptor == 1.4.3",
|
||||
"django_apscheduler == 0.6.2",
|
||||
"anthropic == 0.26.1",
|
||||
"docx2txt == 0.8"
|
||||
]
|
||||
dynamic = ["version"]
|
||||
|
||||
|
|
|
@ -306,6 +306,7 @@ class Entry(BaseModel):
|
|||
NOTION = "notion"
|
||||
GITHUB = "github"
|
||||
CONVERSATION = "conversation"
|
||||
DOCX = "docx"
|
||||
|
||||
class EntrySource(models.TextChoices):
|
||||
COMPUTER = "computer"
|
||||
|
|
7
src/khoj/interface/web/assets/icons/docx.svg
Normal file
7
src/khoj/interface/web/assets/icons/docx.svg
Normal file
|
@ -0,0 +1,7 @@
|
|||
<svg xmlns="http://www.w3.org/2000/svg" fill="#FFF" stroke-miterlimit="10" stroke-width="2" viewBox="0 0 96 96">
|
||||
<path stroke="#979593" d="M67.1716 7H27c-1.1046 0-2 .8954-2 2v78c0 1.1046.8954 2 2 2h58c1.1046 0 2-.8954 2-2V26.8284c0-.5304-.2107-1.0391-.5858-1.4142L68.5858 7.5858C68.2107 7.2107 67.702 7 67.1716 7z"/>
|
||||
<path fill="none" stroke="#979593" d="M67 7v18c0 1.1046.8954 2 2 2h18"/>
|
||||
<path fill="#C8C6C4" d="M79 61H48v-2h31c.5523 0 1 .4477 1 1s-.4477 1-1 1zm0-6H48v-2h31c.5523 0 1 .4477 1 1s-.4477 1-1 1zm0-6H48v-2h31c.5523 0 1 .4477 1 1s-.4477 1-1 1zm0-6H48v-2h31c.5523 0 1 .4477 1 1s-.4477 1-1 1zm0 24H48v-2h31c.5523 0 1 .4477 1 1s-.4477 1-1 1z"/>
|
||||
<path fill="#185ABD" d="M12 74h32c2.2091 0 4-1.7909 4-4V38c0-2.2091-1.7909-4-4-4H12c-2.2091 0-4 1.7909-4 4v32c0 2.2091 1.7909 4 4 4z"/>
|
||||
<path d="M21.6245 60.6455c.0661.522.109.9769.1296 1.3657h.0762c.0306-.3685.0889-.8129.1751-1.3349.0862-.5211.1703-.961.2517-1.319L25.7911 44h4.5702l3.6562 15.1272c.183.7468.3353 1.6973.457 2.8532h.0608c.0508-.7979.1777-1.7184.3809-2.7615L37.8413 44H42l-5.1183 22h-4.86l-3.4885-14.5744c-.1016-.4197-.2158-.9663-.3428-1.6417-.127-.6745-.2057-1.1656-.236-1.4724h-.0608c-.0407.358-.1195.8896-.2364 1.595-.1169.7062-.211 1.2273-.2819 1.565L24.1 66h-4.9357L14 44h4.2349l3.1843 15.3882c.0709.3165.1392.7362.2053 1.2573z"/>
|
||||
</svg>
|
After Width: | Height: | Size: 1.3 KiB |
|
@ -48,8 +48,8 @@ Get the Khoj [Desktop](https://khoj.dev/downloads), [Obsidian](https://docs.khoj
|
|||
|
||||
To get started, just start typing below. You can also type / to see a list of commands.
|
||||
`.trim()
|
||||
const allowedExtensions = ['text/org', 'text/markdown', 'text/plain', 'text/html', 'application/pdf'];
|
||||
const allowedFileEndings = ['org', 'md', 'txt', 'html', 'pdf'];
|
||||
const allowedExtensions = ['text/org', 'text/markdown', 'text/plain', 'text/html', 'application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'];
|
||||
const allowedFileEndings = ['org', 'md', 'txt', 'html', 'pdf', 'docx'];
|
||||
let chatOptions = [];
|
||||
function createCopyParentText(message) {
|
||||
return function(event) {
|
||||
|
|
|
@ -73,6 +73,8 @@
|
|||
image_name = "pdf.svg"
|
||||
else if (fileExtension === "markdown" || fileExtension === "md")
|
||||
image_name = "markdown.svg"
|
||||
else if (fileExtension === "docx")
|
||||
image_name = "docx.svg"
|
||||
else
|
||||
image_name = "plaintext.svg"
|
||||
|
||||
|
|
0
src/khoj/processor/content/docx/__init__.py
Normal file
0
src/khoj/processor/content/docx/__init__.py
Normal file
110
src/khoj/processor/content/docx/docx_to_entries.py
Normal file
110
src/khoj/processor/content/docx/docx_to_entries.py
Normal file
|
@ -0,0 +1,110 @@
|
|||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from langchain_community.document_loaders import Docx2txtLoader
|
||||
|
||||
from khoj.database.models import Entry as DbEntry
|
||||
from khoj.database.models import KhojUser
|
||||
from khoj.processor.content.text_to_entries import TextToEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocxToEntries(TextToEntries):
    """Extract text from DOCX files and index it as searchable entries."""

    def __init__(self):
        super().__init__()

    # Define Functions
    def process(
        self, files: dict[str, bytes] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
    ) -> Tuple[int, int]:
        """Extract, split and index entries from the given DOCX files.

        Args:
            files: Map of file name to the raw bytes of that DOCX file.
            full_corpus: When False, files with empty content are collected as deletions
                and excluded from extraction.
            user: User passed through to `update_embeddings`.
            regenerate: Passed through to `update_embeddings`.

        Returns:
            Tuple of (number of new embeddings, number of deleted embeddings)
            as reported by `update_embeddings`.
        """
        # Extract required fields from config
        if not full_corpus:
            # Files sent with empty content are treated as deletions, not as content to index
            deletion_file_names = {file for file in files if files[file] == b""}
            files_to_process = set(files) - deletion_file_names
            files = {file: files[file] for file in files_to_process}
        else:
            deletion_file_names = None

        # Extract Entries from specified Docx files
        with timer("Extract entries from specified DOCX files", logger):
            file_to_text_map, current_entries = DocxToEntries.extract_docx_entries(files)

        # Split entries by max tokens supported by model
        with timer("Split entries by max token size supported by model", logger):
            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)

        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated entries", logger):
            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
                current_entries,
                DbEntry.EntryType.DOCX,
                DbEntry.EntrySource.COMPUTER,
                "compiled",
                logger,
                deletion_file_names,
                user,
                regenerate=regenerate,
                file_to_text_map=file_to_text_map,
            )

        return num_new_embeddings, num_deleted_embeddings

    @staticmethod
    def extract_docx_entries(docx_files) -> Tuple[Dict, List[Entry]]:
        """Extract entries from specified DOCX files.

        Args:
            docx_files: Map of file name to the raw bytes of that DOCX file.

        Returns:
            Tuple of (map of file name to the list of texts extracted from it, list of entries).
            A file that cannot be processed is logged and left out of both.
        """
        # Local import keeps this block self-contained; tempfile is only needed here
        import tempfile

        entry_to_location_map: List[Tuple[str, str]] = []
        file_to_text_map = dict()
        for docx_file in docx_files:
            # Bound before the try so the cleanup below is safe even if file creation fails
            tmp_file = None
            try:
                # The loader is handed a file path, so write the bytes to a uniquely named temp file.
                # A timestamp-named file in the working directory could collide and needs a writable cwd.
                with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
                    tmp_file = f.name
                    f.write(docx_files[docx_file])

                # Load the content using Docx2txtLoader
                loader = Docx2txtLoader(tmp_file)
                docx_entries_per_file = loader.load()

                # Convert the loaded entries into the desired format
                docx_texts = [page.page_content for page in docx_entries_per_file]

                entry_to_location_map += zip(docx_texts, [docx_file] * len(docx_texts))
                file_to_text_map[docx_file] = docx_texts
            except Exception as e:
                logger.warning(f"Unable to process file: {docx_file}. This file will not be indexed.")
                logger.warning(e, exc_info=True)
            finally:
                if tmp_file is not None and os.path.exists(tmp_file):
                    os.remove(tmp_file)

        # Pair each text with its own file directly. Looking the file up by text would
        # attribute identical text found in several files to only the last of them.
        entries = [DocxToEntries._to_entry(text, filename) for text, filename in entry_to_location_map]
        logger.debug(f"Converted {len(entries)} DOCX entries to dictionaries")

        return file_to_text_map, entries

    @staticmethod
    def convert_docx_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
        """Convert each DOCX entry into an Entry.

        Args:
            parsed_entries: Texts extracted from DOCX files.
            entry_to_file_map: Map of entry text to the file name it came from.
                Being keyed on text, it cannot tell apart identical text from different files.

        Returns:
            One Entry per item in `parsed_entries`, in the same order.
        """
        entries = [
            DocxToEntries._to_entry(parsed_entry, entry_to_file_map[parsed_entry]) for parsed_entry in parsed_entries
        ]

        logger.debug(f"Converted {len(parsed_entries)} DOCX entries to dictionaries")

        return entries

    @staticmethod
    def _to_entry(parsed_entry: str, entry_filename: str) -> Entry:
        """Build the Entry for one extracted text and the file it came from."""
        # Prefix the compiled entry with its file name for context to the model
        heading = f"{entry_filename}\n"
        return Entry(
            compiled=f"{heading}{parsed_entry}",
            raw=parsed_entry,
            heading=heading,
            file=f"{entry_filename}",
        )
|
|
@ -7,6 +7,7 @@ from pydantic import BaseModel
|
|||
from starlette.authentication import requires
|
||||
|
||||
from khoj.database.models import GithubConfig, KhojUser, NotionConfig
|
||||
from khoj.processor.content.docx.docx_to_entries import DocxToEntries
|
||||
from khoj.processor.content.github.github_to_entries import GithubToEntries
|
||||
from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
|
||||
from khoj.processor.content.notion.notion_to_entries import NotionToEntries
|
||||
|
@ -40,6 +41,7 @@ class IndexerInput(BaseModel):
|
|||
markdown: Optional[dict[str, str]] = None
|
||||
pdf: Optional[dict[str, bytes]] = None
|
||||
plaintext: Optional[dict[str, str]] = None
|
||||
docx: Optional[dict[str, bytes]] = None
|
||||
|
||||
|
||||
@indexer.post("/update")
|
||||
|
@ -63,7 +65,7 @@ async def update(
|
|||
),
|
||||
):
|
||||
user = request.user.object
|
||||
index_files: Dict[str, Dict[str, str]] = {"org": {}, "markdown": {}, "pdf": {}, "plaintext": {}}
|
||||
index_files: Dict[str, Dict[str, str]] = {"org": {}, "markdown": {}, "pdf": {}, "plaintext": {}, "docx": {}}
|
||||
try:
|
||||
logger.info(f"📬 Updating content index via API call by {client} client")
|
||||
for file in files:
|
||||
|
@ -79,6 +81,7 @@ async def update(
|
|||
markdown=index_files["markdown"],
|
||||
pdf=index_files["pdf"],
|
||||
plaintext=index_files["plaintext"],
|
||||
docx=index_files["docx"],
|
||||
)
|
||||
|
||||
if state.config == None:
|
||||
|
@ -93,6 +96,7 @@ async def update(
|
|||
org=None,
|
||||
markdown=None,
|
||||
pdf=None,
|
||||
docx=None,
|
||||
image=None,
|
||||
github=None,
|
||||
notion=None,
|
||||
|
@ -129,6 +133,7 @@ async def update(
|
|||
"num_markdown": len(index_files["markdown"]),
|
||||
"num_pdf": len(index_files["pdf"]),
|
||||
"num_plaintext": len(index_files["plaintext"]),
|
||||
"num_docx": len(index_files["docx"]),
|
||||
}
|
||||
|
||||
update_telemetry_state(
|
||||
|
@ -295,6 +300,20 @@ def configure_content(
|
|||
logger.error(f"🚨 Failed to setup Notion: {e}", exc_info=True)
|
||||
success = False
|
||||
|
||||
try:
|
||||
if (search_type == state.SearchType.All.value or search_type == state.SearchType.Docx.value) and files["docx"]:
|
||||
logger.info("📄 Setting up search for docx")
|
||||
text_search.setup(
|
||||
DocxToEntries,
|
||||
files.get("docx"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
user=user,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"🚨 Failed to setup docx: {e}", exc_info=True)
|
||||
success = False
|
||||
|
||||
# Invalidate Query Cache
|
||||
if user:
|
||||
state.query_cache[user.uuid] = LRU()
|
||||
|
|
|
@ -28,6 +28,7 @@ class SearchType(str, Enum):
|
|||
Github = "github"
|
||||
Notion = "notion"
|
||||
Plaintext = "plaintext"
|
||||
Docx = "docx"
|
||||
|
||||
|
||||
class ProcessorType(str, Enum):
|
||||
|
|
|
@ -115,6 +115,8 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
|
|||
return "org", encoding
|
||||
elif file_type in ["application/pdf"]:
|
||||
return "pdf", encoding
|
||||
elif file_type in ["application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]:
|
||||
return "docx", encoding
|
||||
elif file_type in ["image/jpeg"]:
|
||||
return "jpeg", encoding
|
||||
elif file_type in ["image/png"]:
|
||||
|
|
|
@ -65,6 +65,7 @@ class ContentConfig(ConfigBase):
|
|||
plaintext: Optional[TextContentConfig] = None
|
||||
github: Optional[GithubContentConfig] = None
|
||||
notion: Optional[NotionContentConfig] = None
|
||||
docx: Optional[TextContentConfig] = None
|
||||
|
||||
|
||||
class ImageSearchConfig(ConfigBase):
|
||||
|
|
BIN
tests/data/docx/bangalore.docx
vendored
Normal file
BIN
tests/data/docx/bangalore.docx
vendored
Normal file
Binary file not shown.
BIN
tests/data/docx/iceland.docx
vendored
Normal file
BIN
tests/data/docx/iceland.docx
vendored
Normal file
Binary file not shown.
|
@ -61,7 +61,7 @@ def test_search_with_invalid_content_type(client):
|
|||
@pytest.mark.django_db(transaction=True)
|
||||
def test_search_with_valid_content_type(client):
|
||||
headers = {"Authorization": "Bearer kk-secret"}
|
||||
for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plaintext"]:
|
||||
for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plaintext", "docx"]:
|
||||
# Act
|
||||
response = client.get(f"/api/search?q=random&t={content_type}", headers=headers)
|
||||
# Assert
|
||||
|
@ -480,6 +480,14 @@ def get_sample_files_data():
|
|||
("files", ("path/to/filename1.txt", "<html>my first web page</html>", "text/plain")),
|
||||
("files", ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain")),
|
||||
("files", ("path/to/filename.md", "# Notes from client call", "text/markdown")),
|
||||
(
|
||||
"files",
|
||||
(
|
||||
"path/to/filename.docx",
|
||||
"## Studying anthropological records from the Fatimid caliphate",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
),
|
||||
),
|
||||
(
|
||||
"files",
|
||||
("path/to/filename1.md", "## Studying anthropological records from the Fatimid caliphate", "text/markdown"),
|
||||
|
|
37
tests/test_docx_to_entries.py
Normal file
37
tests/test_docx_to_entries.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
import os
|
||||
|
||||
from khoj.processor.content.docx.docx_to_entries import DocxToEntries
|
||||
|
||||
|
||||
def test_single_page_docx_to_jsonl():
    """Extract the text of a single page DOCX file as one entry."""
    # Arrange
    # Read the sample DOCX file into memory as bytes
    docx_path = "tests/data/docx/iceland.docx"
    with open(docx_path, "rb") as f:
        docx_bytes = f.read()

    data = {docx_path: docx_bytes}

    # Act
    # Extract Entries from specified Docx files
    file_to_text_map, entries = DocxToEntries.extract_docx_entries(docx_files=data)

    # Assert
    # Unpacking above already checks that a (file_to_text_map, entries) pair is returned
    assert "The Icelandic horse" in file_to_text_map[docx_path][0]
    assert len(entries) == 1
|
||||
|
||||
|
||||
def test_multi_page_docx_to_jsonl():
    """Extract the text of a multi page DOCX file as one entry."""
    # Arrange
    # Read the sample DOCX file into memory as bytes
    docx_path = "tests/data/docx/bangalore.docx"
    with open(docx_path, "rb") as f:
        docx_bytes = f.read()

    data = {docx_path: docx_bytes}

    # Act
    # Extract Entries from specified Docx files
    file_to_text_map, entries = DocxToEntries.extract_docx_entries(docx_files=data)

    # Assert
    # Unpacking above already checks that a (file_to_text_map, entries) pair is returned
    assert "Bangalore" in file_to_text_map[docx_path][0]
    assert len(entries) == 1
|
Loading…
Reference in a new issue