Add better parsing for XML files

This commit is contained in:
sabaimran 2023-10-25 14:42:43 -07:00
parent 025dc6e3e0
commit 08654163cb
4 changed files with 37 additions and 4 deletions

View file

@ -71,6 +71,7 @@ dependencies = [
"google-auth == 2.23.3",
"python-multipart == 0.0.6",
"gunicorn == 21.2.0",
"lxml == 4.9.3",
]
dynamic = ["version"]

View file

@ -94,6 +94,15 @@
}).join("\n");
}
/**
 * Render search results that come from XML files as HTML snippets.
 *
 * Each result becomes a `<div class="results-xml">` holding a linked heading
 * followed by the entry text wrapped in an `<xml>` element. The per-item
 * snippets are joined with newlines.
 *
 * Every interpolated value is HTML-escaped so that text extracted from
 * indexed files is displayed literally instead of being parsed as markup.
 *
 * @param query The search query. Not used by this renderer; accepted so the
 *     function has the same call shape as the other render_* helpers.
 * @param data Array of result items; each item is read for `entry`,
 *     `additional.file` and `additional.heading`.
 * @returns The concatenated HTML string ("" for an empty array).
 */
function render_xml(query, data) {
    // Replace HTML metacharacters with entities. `&` must be handled first so
    // the entities produced by the later replacements are not double-escaped.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;")
        .replace(/'/g, "&#39;");

    return data.map(function (item) {
        return `<div class="results-xml">` +
            `<b><a href="${escapeHtml(item.additional.file)}">${escapeHtml(item.additional.heading)}</a></b>` +
            `<xml>${escapeHtml(item.entry)}</xml>` +
            `</div>`;
    }).join("\n");
}
function render_multiple(query, data, type) {
let html = "";
data.forEach(item => {
@ -113,6 +122,8 @@
html += `<div class="results-notion">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
} else if (item.additional.file.endsWith(".html")) {
html += render_html(query, [item]);
} else if (item.additional.file.endsWith(".xml")) {
html += render_xml(query, [item])
} else {
html += `<div class="results-plugin">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
}

View file

@ -2,6 +2,8 @@
import logging
from pathlib import Path
from typing import List, Tuple
from bs4 import BeautifulSoup
# Internal Packages
from khoj.processor.text_to_jsonl import TextEmbeddings
@ -28,6 +30,19 @@ class PlaintextToJsonl(TextEmbeddings):
else:
deletion_file_names = None
with timer("Scrub plaintext files and extract text", logger):
for file in files:
try:
plaintext_content = files[file]
if file.endswith(("html", "htm", "xml")):
plaintext_content = PlaintextToJsonl.extract_html_content(
plaintext_content, file.split(".")[-1]
)
files[file] = plaintext_content
except Exception as e:
logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
logger.warning(e, exc_info=True)
# Extract Entries from specified plaintext files
with timer("Parse entries from plaintext files", logger):
current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(files)
@ -50,6 +65,15 @@ class PlaintextToJsonl(TextEmbeddings):
return num_new_embeddings, num_deleted_embeddings
@staticmethod
def extract_html_content(markup_content: str, markup_type: str):
    """Extract the text content from an HTML or XML document.

    markup_content: raw markup to parse.
    markup_type: "xml" selects BeautifulSoup's "xml" parser; any other value
        falls back to "html.parser".

    Returns the result of BeautifulSoup's get_text(strip=True, separator="\\n")
    on the parsed document.
    """
    # NOTE(review): the "xml" feature presumably relies on the lxml package being installed - confirm.
    parser_name = "xml" if markup_type == "xml" else "html.parser"
    return BeautifulSoup(markup_content, parser_name).get_text(strip=True, separator="\n")
@staticmethod
def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]:
"Convert each plaintext entries into a dictionary"

View file

@ -163,13 +163,10 @@ async def update(
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
# Run Validation Checks
if search_config is None:
logger.warning("🚨 No Search configuration available.")
return None
if search_models is None:
search_models = SearchModels()
if search_config.image:
if search_config and search_config.image:
logger.info("🔍 🌄 Setting up image search model")
search_models.image_search = image_search.initialize_model(search_config.image)