Add better parsing for XML files

2025-02-17 08:04:21 +00:00 · 2023-10-25 14:42:43 -07:00 · 2023-10-25 14:42:43 -07:00 · 08654163cb
commit 08654163cb
parent 025dc6e3e0
4 changed files with 37 additions and 4 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -71,6 +71,7 @@ dependencies = [
    "google-auth == 2.23.3",
    "python-multipart == 0.0.6",
    "gunicorn == 21.2.0",
+    "lxml == 4.9.3",
 ]
 dynamic = ["version"]

--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@ -94,6 +94,15 @@
            }).join("\n");
        }

+        function render_xml(query, data) {
+            return data.map(function (item) {
+                return `<div class="results-xml">` +
+                    `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` +
+                    `<xml>${item.entry}</xml>` +
+                    `</div>`
+            }).join("\n");
+        }
+
        function render_multiple(query, data, type) {
            let html = "";
            data.forEach(item => {
@ -113,6 +122,8 @@
                    html += `<div class="results-notion">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
                } else if (item.additional.file.endsWith(".html")) {
                    html += render_html(query, [item]);
+                } else if (item.additional.file.endsWith(".xml")) {
+                    html += render_xml(query, [item])
                } else {
                    html += `<div class="results-plugin">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
                }
--- a/src/khoj/processor/plaintext/plaintext_to_jsonl.py
+++ b/src/khoj/processor/plaintext/plaintext_to_jsonl.py
@ -2,6 +2,8 @@
 import logging
 from pathlib import Path
 from typing import List, Tuple
+from bs4 import BeautifulSoup
+

 # Internal Packages
 from khoj.processor.text_to_jsonl import TextEmbeddings
@ -28,6 +30,19 @@ class PlaintextToJsonl(TextEmbeddings):
        else:
            deletion_file_names = None

+        with timer("Scrub plaintext files and extract text", logger):
+            for file in files:
+                try:
+                    plaintext_content = files[file]
+                    if file.endswith(("html", "htm", "xml")):
+                        plaintext_content = PlaintextToJsonl.extract_html_content(
+                            plaintext_content, file.split(".")[-1]
+                        )
+                    files[file] = plaintext_content
+                except Exception as e:
+                    logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
+                    logger.warning(e, exc_info=True)
+
        # Extract Entries from specified plaintext files
        with timer("Parse entries from plaintext files", logger):
            current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(files)
@ -50,6 +65,15 @@ class PlaintextToJsonl(TextEmbeddings):

        return num_new_embeddings, num_deleted_embeddings

+    @staticmethod
+    def extract_html_content(markup_content: str, markup_type: str):
+        "Extract content from HTML"
+        if markup_type == "xml":
+            soup = BeautifulSoup(markup_content, "xml")
+        else:
+            soup = BeautifulSoup(markup_content, "html.parser")
+        return soup.get_text(strip=True, separator="\n")
+
    @staticmethod
    def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]:
        "Convert each plaintext entries into a dictionary"
--- a/src/khoj/routers/indexer.py
+++ b/src/khoj/routers/indexer.py
@ -163,13 +163,10 @@ async def update(

 def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
    # Run Validation Checks
-    if search_config is None:
-        logger.warning("🚨 No Search configuration available.")
-        return None
    if search_models is None:
        search_models = SearchModels()

-    if search_config.image:
+    if search_config and search_config.image:
        logger.info("🔍 🌄 Setting up image search model")
        search_models.image_search = image_search.initialize_model(search_config.image)