mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-02-17 08:04:21 +00:00
Add better parsing for XML files
This commit is contained in:
parent
025dc6e3e0
commit
08654163cb
4 changed files with 37 additions and 4 deletions
|
@ -71,6 +71,7 @@ dependencies = [
|
|||
"google-auth == 2.23.3",
|
||||
"python-multipart == 0.0.6",
|
||||
"gunicorn == 21.2.0",
|
||||
"lxml == 4.9.3",
|
||||
]
|
||||
dynamic = ["version"]
|
||||
|
||||
|
|
|
@ -94,6 +94,15 @@
|
|||
}).join("\n");
|
||||
}
|
||||
|
||||
// Render XML search results as HTML cards.
// query: the search query string (unused here; kept for signature parity with
//        the other render_* helpers dispatched from render_multiple).
// data:  array of result items, each with .entry (raw XML text) and
//        .additional.{file, heading}.
// Returns one HTML string with a <div class="results-xml"> card per item.
function render_xml(query, data) {
    // Escape markup-significant characters so the raw XML entry is DISPLAYED
    // as text rather than parsed (and silently swallowed) by the browser.
    // This also prevents untrusted entry content from injecting markup.
    function escapeHtml(text) {
        return String(text)
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;");
    }
    return data.map(function (item) {
        return `<div class="results-xml">` +
            `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` +
            `<xml>${escapeHtml(item.entry)}</xml>` +
            `</div>`;
    }).join("\n");
}
|
||||
|
||||
function render_multiple(query, data, type) {
|
||||
let html = "";
|
||||
data.forEach(item => {
|
||||
|
@ -113,6 +122,8 @@
|
|||
html += `<div class="results-notion">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
|
||||
} else if (item.additional.file.endsWith(".html")) {
|
||||
html += render_html(query, [item]);
|
||||
} else if (item.additional.file.endsWith(".xml")) {
|
||||
html += render_xml(query, [item])
|
||||
} else {
|
||||
html += `<div class="results-plugin">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
|
||||
}
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
|
@ -28,6 +30,19 @@ class PlaintextToJsonl(TextEmbeddings):
|
|||
else:
|
||||
deletion_file_names = None
|
||||
|
||||
with timer("Scrub plaintext files and extract text", logger):
|
||||
for file in files:
|
||||
try:
|
||||
plaintext_content = files[file]
|
||||
if file.endswith(("html", "htm", "xml")):
|
||||
plaintext_content = PlaintextToJsonl.extract_html_content(
|
||||
plaintext_content, file.split(".")[-1]
|
||||
)
|
||||
files[file] = plaintext_content
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
|
||||
logger.warning(e, exc_info=True)
|
||||
|
||||
# Extract Entries from specified plaintext files
|
||||
with timer("Parse entries from plaintext files", logger):
|
||||
current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(files)
|
||||
|
@ -50,6 +65,15 @@ class PlaintextToJsonl(TextEmbeddings):
|
|||
|
||||
return num_new_embeddings, num_deleted_embeddings
|
||||
|
||||
@staticmethod
def extract_html_content(markup_content: str, markup_type: str) -> str:
    """Strip markup from HTML or XML content and return its plain text.

    Uses the lxml-backed "xml" parser when markup_type is "xml";
    otherwise falls back to Python's builtin "html.parser".
    Text fragments are joined with newlines and whitespace-trimmed.
    """
    parser = "xml" if markup_type == "xml" else "html.parser"
    soup = BeautifulSoup(markup_content, parser)
    return soup.get_text(strip=True, separator="\n")
|
||||
|
||||
@staticmethod
|
||||
def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]:
|
||||
"Convert each plaintext entries into a dictionary"
|
||||
|
|
|
@ -163,13 +163,10 @@ async def update(
|
|||
|
||||
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
|
||||
# Run Validation Checks
|
||||
if search_config is None:
|
||||
logger.warning("🚨 No Search configuration available.")
|
||||
return None
|
||||
if search_models is None:
|
||||
search_models = SearchModels()
|
||||
|
||||
if search_config.image:
|
||||
if search_config and search_config.image:
|
||||
logger.info("🔍 🌄 Setting up image search model")
|
||||
search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue