mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Fix plaintext HTML parsing and rendering (#464)
* Store conversation command options in an Enum * Move to slash commands instead of using @ to specify general commands * Calculate conversation command once & pass it as arg to child funcs * Add /notes command to respond using only knowledge base as context. This prevents the chat model from trying to respond using only its general world knowledge, without any references pulled from the indexed knowledge base * Test general and notes slash commands in openai chat director tests --------- Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
This commit is contained in:
parent
7919787fb7
commit
b45e1d8c0d
12 changed files with 2168 additions and 48 deletions
|
@ -36,6 +36,7 @@ classifiers = [
|
|||
"Topic :: Text Processing :: Linguistic",
|
||||
]
|
||||
dependencies = [
|
||||
"bs4 >= 0.0.1",
|
||||
"dateparser >= 1.1.1",
|
||||
"defusedxml == 0.7.1",
|
||||
"fastapi == 0.77.1",
|
||||
|
|
|
@ -34,28 +34,6 @@
|
|||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
@media only screen and (max-width: 600px) {
|
||||
body {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr;
|
||||
grid-template-rows: 1fr auto auto auto minmax(80px, 100%);
|
||||
}
|
||||
body > * {
|
||||
grid-column: 1;
|
||||
}
|
||||
|
||||
div.filler {
|
||||
display: none;
|
||||
}
|
||||
|
||||
body.khoj-configure {
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
div.section {
|
||||
padding: 12px;
|
||||
}
|
||||
}
|
||||
|
||||
img.khoj-logo {
|
||||
max-width: none!important;
|
||||
|
@ -74,6 +52,11 @@
|
|||
justify-self: center;
|
||||
}
|
||||
|
||||
|
||||
div.section.general-settings {
|
||||
justify-self: center;
|
||||
}
|
||||
|
||||
div.instructions {
|
||||
font-size: large;
|
||||
}
|
||||
|
@ -103,6 +86,26 @@
|
|||
box-shadow: 0px 1px 3px 0px rgba(0,0,0,0.1),0px 1px 2px -1px rgba(0,0,0,0.1);
|
||||
overflow: hidden;
|
||||
}
|
||||
div.finalize-buttons {
|
||||
display: grid;
|
||||
gap: 8px;
|
||||
padding: 24px 16px;
|
||||
width: 320px;
|
||||
background: white;
|
||||
border-radius: 4px;
|
||||
overflow: hidden;
|
||||
}
|
||||
div#features-hint-text {
|
||||
width: 640px;
|
||||
opacity: 0;
|
||||
transition: opacity 0.5s ease-in-out;
|
||||
overflow: hidden;
|
||||
height: 0;
|
||||
}
|
||||
div#features-hint-text.show {
|
||||
opacity: 1;
|
||||
height: auto;
|
||||
}
|
||||
.card-title-row {
|
||||
display: grid;
|
||||
grid-template-columns: auto 1fr;
|
||||
|
@ -213,5 +216,32 @@
|
|||
grid-template-columns: 1fr;
|
||||
}
|
||||
}
|
||||
@media only screen and (max-width: 600px) {
|
||||
body {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr;
|
||||
grid-template-rows: 1fr auto auto auto minmax(80px, 100%);
|
||||
}
|
||||
body > * {
|
||||
grid-column: 1;
|
||||
}
|
||||
|
||||
div.filler {
|
||||
display: none;
|
||||
}
|
||||
|
||||
body.khoj-configure {
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
div.section {
|
||||
padding: 12px;
|
||||
}
|
||||
|
||||
div#features-hint-text {
|
||||
width: 320px;
|
||||
}
|
||||
|
||||
}
|
||||
</style>
|
||||
</html>
|
||||
|
|
|
@ -104,15 +104,18 @@
|
|||
newResponseText.appendChild(loadingSpinner);
|
||||
document.getElementById("chat-body").scrollTop = document.getElementById("chat-body").scrollHeight;
|
||||
|
||||
let chatTooltip = document.getElementById("chat-tooltip");
|
||||
chatTooltip.style.display = "none";
|
||||
|
||||
let chatInput = document.getElementById("chat-input");
|
||||
chatInput.classList.remove("option-enabled");
|
||||
|
||||
// Call specified Khoj API which returns a streamed response of type text/plain
|
||||
fetch(url)
|
||||
.then(response => {
|
||||
const reader = response.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
|
||||
let chatTooltip = document.getElementById("chat-tooltip");
|
||||
chatTooltip.style.display = "none";
|
||||
|
||||
function readStream() {
|
||||
reader.read().then(({ done, value }) => {
|
||||
if (done) {
|
||||
|
@ -176,17 +179,16 @@
|
|||
} else if (chatInput.value.startsWith("/")) {
|
||||
const firstWord = chatInput.value.split(" ")[0];
|
||||
if (firstWord.substring(1) in chatOptions) {
|
||||
// Add a div element around the text.
|
||||
let chatTooltip = document.getElementById("chat-tooltip");
|
||||
chatTooltip.style.display = "block";
|
||||
chatTooltip.innerHTML = "Mode: " + firstWord.substring(1);
|
||||
chatInput.classList.add("option-enabled");
|
||||
} else {
|
||||
let chatTooltip = document.getElementById("chat-tooltip");
|
||||
chatTooltip.style.display = "none";
|
||||
chatInput.classList.remove("option-enabled");
|
||||
}
|
||||
let chatTooltip = document.getElementById("chat-tooltip");
|
||||
chatTooltip.style.display = "none";
|
||||
} else {
|
||||
let chatTooltip = document.getElementById("chat-tooltip");
|
||||
chatTooltip.style.display = "none";
|
||||
chatInput.classList.remove("option-enabled");
|
||||
}
|
||||
|
||||
autoResize();
|
||||
|
@ -288,7 +290,8 @@
|
|||
<!-- Chat Footer -->
|
||||
<div id="chat-footer">
|
||||
<div id="chat-tooltip" style="display: none;"></div>
|
||||
<textarea id="chat-input" class="option" oninput="onChatInput()" onkeyup=incrementalChat(event) autofocus="autofocus" placeholder="What is the meaning of life?"></textarea>
|
||||
<textarea id="chat-input" class="option" oninput="onChatInput()" onkeyup=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter.">
|
||||
</textarea>
|
||||
</div>
|
||||
</body>
|
||||
|
||||
|
@ -428,6 +431,19 @@
|
|||
line-height: 1.5em;
|
||||
margin: 0;
|
||||
}
|
||||
#chat-input:focus {
|
||||
outline: none !important;
|
||||
}
|
||||
|
||||
.option-enabled {
|
||||
box-shadow: 0 0 12px rgb(119, 156, 46);
|
||||
}
|
||||
|
||||
.option-enabled:focus {
|
||||
outline: none !important;
|
||||
border:1px solid #475569;
|
||||
box-shadow: 0 0 16px var(--primary);
|
||||
}
|
||||
|
||||
a.inline-chat-link {
|
||||
color: #475569;
|
||||
|
|
|
@ -219,6 +219,7 @@
|
|||
</div>
|
||||
<div class="section">
|
||||
<h2 class="section-title">Features</h2>
|
||||
<div id="features-hint-text"></div>
|
||||
<div class="section-cards">
|
||||
<div class="card">
|
||||
<div class="card-title-row">
|
||||
|
@ -285,16 +286,22 @@
|
|||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="section">
|
||||
<div class="section general-settings">
|
||||
<div id="results-count" title="Number of items to show in search and use for chat response">
|
||||
<label for="results-count-slider">Results Count: <span id="results-count-value">5</span></label>
|
||||
<input type="range" id="results-count-slider" name="results-count-slider" min="1" max="10" step="1" value="5">
|
||||
</div>
|
||||
<div id="status" style="display: none;"></div>
|
||||
</div>
|
||||
<div class="section finalize-actions">
|
||||
<button id="configure" type="submit" title="Update index with the latest changes">⚙️ Configure</button>
|
||||
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button>
|
||||
<div class="section finalize-actions general-settings">
|
||||
<div class="section-cards">
|
||||
<div class="finalize-buttons">
|
||||
<button id="configure" type="submit" title="Update index with the latest changes">⚙️ Configure</button>
|
||||
</div>
|
||||
<div class="finalize-buttons">
|
||||
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
|
@ -329,9 +336,16 @@
|
|||
function toggleEnableLocalLLLM(enable) {
|
||||
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
|
||||
var toggleEnableLocalLLLMButton = document.getElementById("toggle-enable-offline-chat");
|
||||
var featuresHintText = document.getElementById("features-hint-text");
|
||||
toggleEnableLocalLLLMButton.classList.remove("disabled");
|
||||
toggleEnableLocalLLLMButton.classList.add("enabled");
|
||||
|
||||
if (enable) {
|
||||
featuresHintText.style.display = "block";
|
||||
featuresHintText.innerHTML = "An open source model is being downloaded in the background. Hang tight, this may take a few minutes ⏳.";
|
||||
featuresHintText.classList.add("show");
|
||||
}
|
||||
|
||||
fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
|
@ -372,6 +386,8 @@
|
|||
disableLocalLLLMButton.classList.remove("enabled");
|
||||
}
|
||||
|
||||
featuresHintText.classList.remove("show");
|
||||
featuresHintText.innerHTML = "";
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
@ -56,6 +56,44 @@
|
|||
}).join("\n");
|
||||
}
|
||||
|
||||
function render_html(query, data) {
|
||||
return data.map(function (item) {
|
||||
let document = new DOMParser().parseFromString(item.entry, "text/html");
|
||||
// Scrub the HTML to remove any script tags and associated content
|
||||
let script_tags = document.querySelectorAll("script");
|
||||
for (let i = 0; i < script_tags.length; i++) {
|
||||
script_tags[i].remove();
|
||||
}
|
||||
// Scrub the HTML to remove any style tags and associated content
|
||||
let style_tags = document.querySelectorAll("style");
|
||||
for (let i = 0; i < style_tags.length; i++) {
|
||||
style_tags[i].remove();
|
||||
}
|
||||
// Scrub the HTML to remove any noscript tags and associated content
|
||||
let noscript_tags = document.querySelectorAll("noscript");
|
||||
for (let i = 0; i < noscript_tags.length; i++) {
|
||||
noscript_tags[i].remove();
|
||||
}
|
||||
// Scrub the HTML to remove any iframe tags and associated content
|
||||
let iframe_tags = document.querySelectorAll("iframe");
|
||||
for (let i = 0; i < iframe_tags.length; i++) {
|
||||
iframe_tags[i].remove();
|
||||
}
|
||||
// Scrub the HTML to remove any object tags and associated content
|
||||
let object_tags = document.querySelectorAll("object");
|
||||
for (let i = 0; i < object_tags.length; i++) {
|
||||
object_tags[i].remove();
|
||||
}
|
||||
// Scrub the HTML to remove any embed tags and associated content
|
||||
let embed_tags = document.querySelectorAll("embed");
|
||||
for (let i = 0; i < embed_tags.length; i++) {
|
||||
embed_tags[i].remove();
|
||||
}
|
||||
let scrubbedHTML = document.body.outerHTML;
|
||||
return `<div class="results-html">` + scrubbedHTML + `</div>`;
|
||||
}).join("\n");
|
||||
}
|
||||
|
||||
function render_multiple(query, data, type) {
|
||||
let html = "";
|
||||
data.forEach(item => {
|
||||
|
@ -73,6 +111,8 @@
|
|||
html += render_pdf(query, [item]);
|
||||
} else if (item.additional.file.includes("notion.so")) {
|
||||
html += `<div class="results-notion">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
|
||||
} else if (item.additional.file.endsWith(".html")) {
|
||||
html += render_html(query, [item]);
|
||||
} else {
|
||||
html += `<div class="results-plugin">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
|
||||
}
|
||||
|
@ -247,7 +287,7 @@
|
|||
</div>
|
||||
|
||||
<!--Add Text Box To Enter Query, Trigger Incremental Search OnChange -->
|
||||
<input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="What is the meaning of life?">
|
||||
<input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="Search directly from your knowledge base">
|
||||
|
||||
<div id="options">
|
||||
<!--Add Dropdown to Select Query Type -->
|
||||
|
@ -338,6 +378,7 @@
|
|||
}
|
||||
.results-pdf,
|
||||
.results-notion,
|
||||
.results-html,
|
||||
.results-plugin {
|
||||
text-align: left;
|
||||
white-space: pre-line;
|
||||
|
@ -398,6 +439,7 @@
|
|||
div.results-notion,
|
||||
div.results-org,
|
||||
div.results-plugin,
|
||||
div.results-html,
|
||||
div.results-pdf {
|
||||
text-align: left;
|
||||
box-shadow: 2px 2px 2px var(--primary-hover);
|
||||
|
|
|
@ -122,6 +122,7 @@ def converse(
|
|||
if conversation_command == ConversationCommand.General:
|
||||
conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query)
|
||||
elif conversation_command == ConversationCommand.Notes and is_none_or_empty(compiled_references):
|
||||
completion_func(chat_response=prompts.no_notes_found.format())
|
||||
return iter([prompts.no_notes_found.format()])
|
||||
else:
|
||||
conversation_primer = prompts.notes_conversation.format(
|
||||
|
|
|
@ -237,10 +237,10 @@ Q:"""
|
|||
help_message = PromptTemplate.from_template(
|
||||
"""
|
||||
**/help**: Show this help message.
|
||||
**/notes**: Search only against the information in your knowledge base. This is the default method.
|
||||
**/general**: Search general knowledge with the LLM. This will not search against your notes.
|
||||
**/notes**: Chat using the information in your knowledge base. This is the default method.
|
||||
**/general**: Chat using general knowledge with the LLM. This will not search against your notes.
|
||||
|
||||
You are using the **{model}** model. To change the model, go to your <a href="/config">settings</a> page.
|
||||
You are using the **{model}** model.
|
||||
**version**: {version}
|
||||
""".strip()
|
||||
)
|
||||
|
|
|
@ -91,15 +91,21 @@ class PlaintextToJsonl(TextToJsonl):
|
|||
|
||||
for plaintext_file in plaintext_files:
|
||||
with open(plaintext_file, "r") as f:
|
||||
try:
|
||||
plaintext_content = f.read()
|
||||
entry_to_file_map.append((plaintext_content, plaintext_file))
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to process file: {plaintext_file}. This file will not be indexed.")
|
||||
logger.warning(e, exc_info=True)
|
||||
plaintext_content = f.read()
|
||||
if plaintext_file.endswith(("html", "htm", "xml")):
|
||||
plaintext_content = PlaintextToJsonl.extract_html_content(plaintext_content)
|
||||
entry_to_file_map.append((plaintext_content, plaintext_file))
|
||||
|
||||
return dict(entry_to_file_map)
|
||||
|
||||
@staticmethod
|
||||
def extract_html_content(html_content: str):
|
||||
"Extract content from HTML"
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
soup = BeautifulSoup(html_content, "html.parser")
|
||||
return soup.get_text(strip=True, separator="\n")
|
||||
|
||||
@staticmethod
|
||||
def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]:
|
||||
"Convert each plaintext entries into a dictionary"
|
||||
|
|
|
@ -222,5 +222,5 @@ class ConversationCommand(str, Enum):
|
|||
command_descriptions = {
|
||||
ConversationCommand.General: "This command allows you to search talk with the LLM without including context from your knowledge base.",
|
||||
ConversationCommand.Notes: "This command allows you to search talk with the LLM while including context from your knowledge base.",
|
||||
ConversationCommand.Help: "This command displays a help message.",
|
||||
ConversationCommand.Help: "This command displays a help message with all available commands and other metadata.",
|
||||
}
|
||||
|
|
|
@ -109,6 +109,13 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
|
|||
)
|
||||
}
|
||||
|
||||
content_config.plaintext = TextContentConfig(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
|
||||
compressed_jsonl=content_dir.joinpath("plaintext.jsonl.gz"),
|
||||
embeddings_file=content_dir.joinpath("plaintext_embeddings.pt"),
|
||||
)
|
||||
|
||||
content_config.github = GithubContentConfig(
|
||||
pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
|
||||
repos=[
|
||||
|
|
1984
tests/data/plaintext/wikipedia_tardigrades.html
Normal file
1984
tests/data/plaintext/wikipedia_tardigrades.html
Normal file
File diff suppressed because one or more lines are too long
|
@ -18,7 +18,7 @@ def test_plaintext_file(tmp_path):
|
|||
|
||||
# Act
|
||||
# Extract Entries from specified plaintext files
|
||||
file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[plaintextfile])
|
||||
file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[str(plaintextfile)])
|
||||
|
||||
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
|
||||
|
||||
|
@ -75,6 +75,23 @@ def test_get_plaintext_files(tmp_path):
|
|||
assert set(extracted_plaintext_files) == set(expected_files)
|
||||
|
||||
|
||||
def test_parse_html_plaintext_file(content_config):
|
||||
"Ensure HTML files are parsed correctly"
|
||||
# Arrange
|
||||
# Setup input-files, input-filters
|
||||
input_files = content_config.plaintext.input_files
|
||||
input_filter = content_config.plaintext.input_filter
|
||||
|
||||
# Act
|
||||
extracted_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)
|
||||
file_to_entries = PlaintextToJsonl.extract_plaintext_entries(extracted_plaintext_files)
|
||||
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
|
||||
|
||||
# Assert
|
||||
assert len(maps) == 1
|
||||
assert "<div>" not in maps[0].raw
|
||||
|
||||
|
||||
# Helper Functions
|
||||
def create_file(tmp_path: Path, entry=None, filename="test.md"):
|
||||
file_ = tmp_path / filename
|
||||
|
|
Loading…
Reference in a new issue