Fix plaintext HTML parsing and rendering (#464)

* Store conversation command options in an Enum
* Move to slash commands instead of using @ to specify general commands
* Calculate conversation command once & pass it as arg to child funcs
* Add /notes command to respond using only knowledge base as context
This prevents the chat model from trying to respond using only its
general world knowledge, without any references pulled from the
indexed knowledge base
* Test general and notes slash commands in openai chat director tests
---------

Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
This commit is contained in:
sabaimran 2023-08-27 11:24:30 -07:00 committed by GitHub
parent 7919787fb7
commit b45e1d8c0d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 2168 additions and 48 deletions

View file

@ -36,6 +36,7 @@ classifiers = [
"Topic :: Text Processing :: Linguistic", "Topic :: Text Processing :: Linguistic",
] ]
dependencies = [ dependencies = [
"bs4 >= 0.0.1",
"dateparser >= 1.1.1", "dateparser >= 1.1.1",
"defusedxml == 0.7.1", "defusedxml == 0.7.1",
"fastapi == 0.77.1", "fastapi == 0.77.1",

View file

@ -34,28 +34,6 @@
margin: 0; margin: 0;
padding: 0; padding: 0;
} }
@media only screen and (max-width: 600px) {
body {
display: grid;
grid-template-columns: 1fr;
grid-template-rows: 1fr auto auto auto minmax(80px, 100%);
}
body > * {
grid-column: 1;
}
div.filler {
display: none;
}
body.khoj-configure {
padding: 0;
}
div.section {
padding: 12px;
}
}
img.khoj-logo { img.khoj-logo {
max-width: none!important; max-width: none!important;
@ -74,6 +52,11 @@
justify-self: center; justify-self: center;
} }
div.section.general-settings {
justify-self: center;
}
div.instructions { div.instructions {
font-size: large; font-size: large;
} }
@ -103,6 +86,26 @@
box-shadow: 0px 1px 3px 0px rgba(0,0,0,0.1),0px 1px 2px -1px rgba(0,0,0,0.1); box-shadow: 0px 1px 3px 0px rgba(0,0,0,0.1),0px 1px 2px -1px rgba(0,0,0,0.1);
overflow: hidden; overflow: hidden;
} }
/* White, rounded, fixed-width grid container used to wrap a finalize action button. */
div.finalize-buttons {
display: grid;
gap: 8px;
padding: 24px 16px;
width: 320px;
background: white;
border-radius: 4px;
overflow: hidden;
}
/* Hint text element: fully transparent and collapsed to zero height by default. */
div#features-hint-text {
width: 640px;
opacity: 0;
/* Only opacity is listed in the transition; the height change is not animated. */
transition: opacity 0.5s ease-in-out;
overflow: hidden;
height: 0;
}
/* With the "show" class the hint becomes opaque and takes its natural height. */
div#features-hint-text.show {
opacity: 1;
height: auto;
}
.card-title-row { .card-title-row {
display: grid; display: grid;
grid-template-columns: auto 1fr; grid-template-columns: auto 1fr;
@ -213,5 +216,32 @@
grid-template-columns: 1fr; grid-template-columns: 1fr;
} }
} }
/* Overrides for screens at most 600px wide: single-column grid layout. */
@media only screen and (max-width: 600px) {
body {
display: grid;
grid-template-columns: 1fr;
grid-template-rows: 1fr auto auto auto minmax(80px, 100%);
}
/* Place every direct child of body in the single column. */
body > * {
grid-column: 1;
}
/* Filler elements are hidden at this width. */
div.filler {
display: none;
}
body.khoj-configure {
padding: 0;
}
div.section {
padding: 12px;
}
/* Use a 320px wide hint text at this width. */
div#features-hint-text {
width: 320px;
}
}
</style> </style>
</html> </html>

View file

@ -104,15 +104,18 @@
newResponseText.appendChild(loadingSpinner); newResponseText.appendChild(loadingSpinner);
document.getElementById("chat-body").scrollTop = document.getElementById("chat-body").scrollHeight; document.getElementById("chat-body").scrollTop = document.getElementById("chat-body").scrollHeight;
let chatTooltip = document.getElementById("chat-tooltip");
chatTooltip.style.display = "none";
let chatInput = document.getElementById("chat-input");
chatInput.classList.remove("option-enabled");
// Call specified Khoj API which returns a streamed response of type text/plain // Call specified Khoj API which returns a streamed response of type text/plain
fetch(url) fetch(url)
.then(response => { .then(response => {
const reader = response.body.getReader(); const reader = response.body.getReader();
const decoder = new TextDecoder(); const decoder = new TextDecoder();
let chatTooltip = document.getElementById("chat-tooltip");
chatTooltip.style.display = "none";
function readStream() { function readStream() {
reader.read().then(({ done, value }) => { reader.read().then(({ done, value }) => {
if (done) { if (done) {
@ -176,17 +179,16 @@
} else if (chatInput.value.startsWith("/")) { } else if (chatInput.value.startsWith("/")) {
const firstWord = chatInput.value.split(" ")[0]; const firstWord = chatInput.value.split(" ")[0];
if (firstWord.substring(1) in chatOptions) { if (firstWord.substring(1) in chatOptions) {
// Add a div element around the text. chatInput.classList.add("option-enabled");
let chatTooltip = document.getElementById("chat-tooltip");
chatTooltip.style.display = "block";
chatTooltip.innerHTML = "Mode: " + firstWord.substring(1);
} else { } else {
let chatTooltip = document.getElementById("chat-tooltip"); chatInput.classList.remove("option-enabled");
chatTooltip.style.display = "none";
} }
let chatTooltip = document.getElementById("chat-tooltip");
chatTooltip.style.display = "none";
} else { } else {
let chatTooltip = document.getElementById("chat-tooltip"); let chatTooltip = document.getElementById("chat-tooltip");
chatTooltip.style.display = "none"; chatTooltip.style.display = "none";
chatInput.classList.remove("option-enabled");
} }
autoResize(); autoResize();
@ -288,7 +290,8 @@
<!-- Chat Footer --> <!-- Chat Footer -->
<div id="chat-footer"> <div id="chat-footer">
<div id="chat-tooltip" style="display: none;"></div> <div id="chat-tooltip" style="display: none;"></div>
<textarea id="chat-input" class="option" oninput="onChatInput()" onkeyup=incrementalChat(event) autofocus="autofocus" placeholder="What is the meaning of life?"></textarea> <textarea id="chat-input" class="option" oninput="onChatInput()" onkeyup=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter.">
</textarea>
</div> </div>
</body> </body>
@ -428,6 +431,19 @@
line-height: 1.5em; line-height: 1.5em;
margin: 0; margin: 0;
} }
/* Suppress the browser's default focus outline on the chat input. */
#chat-input:focus {
outline: none !important;
}
/* Glow shown on any element that carries the "option-enabled" class. */
.option-enabled {
box-shadow: 0 0 12px rgb(119, 156, 46);
}
/* While focused as well: no outline, a solid border and a larger glow in the --primary colour. */
.option-enabled:focus {
outline: none !important;
border:1px solid #475569;
box-shadow: 0 0 16px var(--primary);
}
a.inline-chat-link { a.inline-chat-link {
color: #475569; color: #475569;

View file

@ -219,6 +219,7 @@
</div> </div>
<div class="section"> <div class="section">
<h2 class="section-title">Features</h2> <h2 class="section-title">Features</h2>
<div id="features-hint-text"></div>
<div class="section-cards"> <div class="section-cards">
<div class="card"> <div class="card">
<div class="card-title-row"> <div class="card-title-row">
@ -285,16 +286,22 @@
</div> </div>
</div> </div>
</div> </div>
<div class="section"> <div class="section general-settings">
<div id="results-count" title="Number of items to show in search and use for chat response"> <div id="results-count" title="Number of items to show in search and use for chat response">
<label for="results-count-slider">Results Count: <span id="results-count-value">5</span></label> <label for="results-count-slider">Results Count: <span id="results-count-value">5</span></label>
<input type="range" id="results-count-slider" name="results-count-slider" min="1" max="10" step="1" value="5"> <input type="range" id="results-count-slider" name="results-count-slider" min="1" max="10" step="1" value="5">
</div> </div>
<div id="status" style="display: none;"></div> <div id="status" style="display: none;"></div>
</div> </div>
<div class="section finalize-actions"> <div class="section finalize-actions general-settings">
<button id="configure" type="submit" title="Update index with the latest changes">⚙️ Configure</button> <div class="section-cards">
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button> <div class="finalize-buttons">
<button id="configure" type="submit" title="Update index with the latest changes">⚙️ Configure</button>
</div>
<div class="finalize-buttons">
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button>
</div>
</div>
</div> </div>
</div> </div>
<script> <script>
@ -329,9 +336,16 @@
function toggleEnableLocalLLLM(enable) { function toggleEnableLocalLLLM(enable) {
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1]; const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
var toggleEnableLocalLLLMButton = document.getElementById("toggle-enable-offline-chat"); var toggleEnableLocalLLLMButton = document.getElementById("toggle-enable-offline-chat");
var featuresHintText = document.getElementById("features-hint-text");
toggleEnableLocalLLLMButton.classList.remove("disabled"); toggleEnableLocalLLLMButton.classList.remove("disabled");
toggleEnableLocalLLLMButton.classList.add("enabled"); toggleEnableLocalLLLMButton.classList.add("enabled");
if (enable) {
featuresHintText.style.display = "block";
featuresHintText.innerHTML = "An open source model is being downloaded in the background. Hang tight, this may take a few minutes ⏳.";
featuresHintText.classList.add("show");
}
fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, { fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, {
method: 'POST', method: 'POST',
headers: { headers: {
@ -372,6 +386,8 @@
disableLocalLLLMButton.classList.remove("enabled"); disableLocalLLLMButton.classList.remove("enabled");
} }
featuresHintText.classList.remove("show");
featuresHintText.innerHTML = "";
} }
}) })
} }

View file

@ -56,6 +56,44 @@
}).join("\n"); }).join("\n");
} }
/**
 * Render HTML search results as scrubbed markup wrapped in result containers.
 *
 * Each entry is parsed into a detached document, stripped of content that could
 * execute or restyle the host page, and serialised back to a string.
 *
 * @param {string} query - The search query. Unused here; kept so the signature
 *     matches the other render_* helpers.
 * @param {Array<{entry: string}>} data - Result items whose `entry` holds raw HTML.
 * @returns {string} One `<div class="results-html">` per item, joined by newlines.
 */
function render_html(query, data) {
    // Elements removed together with all of their content.
    const blockedTags = "script, style, noscript, iframe, object, embed";
    // Attributes whose value is a URL and could therefore carry a javascript: scheme.
    const urlAttributes = ["href", "src", "action", "formaction", "xlink:href"];

    return data.map(function (item) {
        // Named parsedDoc so the page's global `document` is not shadowed.
        const parsedDoc = new DOMParser().parseFromString(item.entry, "text/html");

        // querySelectorAll returns a static list, so removing nodes while iterating is safe.
        parsedDoc.querySelectorAll(blockedTags).forEach(node => node.remove());

        // Removing <script> alone is not enough: inline handlers (onerror, onclick, ...)
        // and javascript: URLs still run once the markup is inserted into the live page.
        parsedDoc.querySelectorAll("*").forEach(node => {
            // Copy the attribute list first; it is live and shrinks as attributes are removed.
            Array.from(node.attributes).forEach(attr => {
                const attrName = attr.name.toLowerCase();
                // Browsers ignore whitespace/control characters inside the scheme, so drop them before testing.
                const attrValue = attr.value.replace(/[\u0000-\u0020]+/g, "").toLowerCase();
                const isEventHandler = attrName.startsWith("on");
                const isScriptUrl = urlAttributes.includes(attrName) && attrValue.startsWith("javascript:");
                if (isEventHandler || isScriptUrl) {
                    node.removeAttribute(attr.name);
                }
            });
        });

        const scrubbedHTML = parsedDoc.body.outerHTML;
        return `<div class="results-html">` + scrubbedHTML + `</div>`;
    }).join("\n");
}
function render_multiple(query, data, type) { function render_multiple(query, data, type) {
let html = ""; let html = "";
data.forEach(item => { data.forEach(item => {
@ -73,6 +111,8 @@
html += render_pdf(query, [item]); html += render_pdf(query, [item]);
} else if (item.additional.file.includes("notion.so")) { } else if (item.additional.file.includes("notion.so")) {
html += `<div class="results-notion">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`; html += `<div class="results-notion">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
} else if (item.additional.file.endsWith(".html")) {
html += render_html(query, [item]);
} else { } else {
html += `<div class="results-plugin">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`; html += `<div class="results-plugin">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
} }
@ -247,7 +287,7 @@
</div> </div>
<!--Add Text Box To Enter Query, Trigger Incremental Search OnChange --> <!--Add Text Box To Enter Query, Trigger Incremental Search OnChange -->
<input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="What is the meaning of life?"> <input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="Search directly from your knowledge base">
<div id="options"> <div id="options">
<!--Add Dropdown to Select Query Type --> <!--Add Dropdown to Select Query Type -->
@ -338,6 +378,7 @@
} }
.results-pdf, .results-pdf,
.results-notion, .results-notion,
.results-html,
.results-plugin { .results-plugin {
text-align: left; text-align: left;
white-space: pre-line; white-space: pre-line;
@ -398,6 +439,7 @@
div.results-notion, div.results-notion,
div.results-org, div.results-org,
div.results-plugin, div.results-plugin,
div.results-html,
div.results-pdf { div.results-pdf {
text-align: left; text-align: left;
box-shadow: 2px 2px 2px var(--primary-hover); box-shadow: 2px 2px 2px var(--primary-hover);

View file

@ -122,6 +122,7 @@ def converse(
if conversation_command == ConversationCommand.General: if conversation_command == ConversationCommand.General:
conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query) conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query)
elif conversation_command == ConversationCommand.Notes and is_none_or_empty(compiled_references): elif conversation_command == ConversationCommand.Notes and is_none_or_empty(compiled_references):
completion_func(chat_response=prompts.no_notes_found.format())
return iter([prompts.no_notes_found.format()]) return iter([prompts.no_notes_found.format()])
else: else:
conversation_primer = prompts.notes_conversation.format( conversation_primer = prompts.notes_conversation.format(

View file

@ -237,10 +237,10 @@ Q:"""
help_message = PromptTemplate.from_template( help_message = PromptTemplate.from_template(
""" """
**/help**: Show this help message. **/help**: Show this help message.
**/notes**: Search only against the information in your knowledge base. This is the default method. **/notes**: Chat using the information in your knowledge base. This is the default method.
**/general**: Search general knowledge with the LLM. This will not search against your notes. **/general**: Chat using general knowledge with the LLM. This will not search against your notes.
You are using the **{model}** model. To change the model, go to your <a href="/config">settings</a> page. You are using the **{model}** model.
**version**: {version} **version**: {version}
""".strip() """.strip()
) )

View file

@ -91,15 +91,21 @@ class PlaintextToJsonl(TextToJsonl):
for plaintext_file in plaintext_files: for plaintext_file in plaintext_files:
with open(plaintext_file, "r") as f: with open(plaintext_file, "r") as f:
try: plaintext_content = f.read()
plaintext_content = f.read() if plaintext_file.endswith(("html", "htm", "xml")):
entry_to_file_map.append((plaintext_content, plaintext_file)) plaintext_content = PlaintextToJsonl.extract_html_content(plaintext_content)
except Exception as e: entry_to_file_map.append((plaintext_content, plaintext_file))
logger.warning(f"Unable to process file: {plaintext_file}. This file will not be indexed.")
logger.warning(e, exc_info=True)
return dict(entry_to_file_map) return dict(entry_to_file_map)
@staticmethod
def extract_html_content(html_content: str):
    """Return the text content of an HTML string with the markup removed.

    The markup is parsed with BeautifulSoup's "html.parser" backend and the
    text is collected via ``get_text`` using a newline separator and
    ``strip=True``.
    """
    # Imported lazily, inside the function, as in the original implementation.
    from bs4 import BeautifulSoup

    parsed_markup = BeautifulSoup(html_content, "html.parser")
    text_only = parsed_markup.get_text(separator="\n", strip=True)
    return text_only
@staticmethod @staticmethod
def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]: def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]:
"Convert each plaintext entries into a dictionary" "Convert each plaintext entries into a dictionary"

View file

@ -222,5 +222,5 @@ class ConversationCommand(str, Enum):
command_descriptions = { command_descriptions = {
ConversationCommand.General: "This command allows you to search talk with the LLM without including context from your knowledge base.", ConversationCommand.General: "This command allows you to search talk with the LLM without including context from your knowledge base.",
ConversationCommand.Notes: "This command allows you to search talk with the LLM while including context from your knowledge base.", ConversationCommand.Notes: "This command allows you to search talk with the LLM while including context from your knowledge base.",
ConversationCommand.Help: "This command displays a help message.", ConversationCommand.Help: "This command displays a help message with all available commands and other metadata.",
} }

View file

@ -109,6 +109,13 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
) )
} }
content_config.plaintext = TextContentConfig(
input_files=None,
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
compressed_jsonl=content_dir.joinpath("plaintext.jsonl.gz"),
embeddings_file=content_dir.joinpath("plaintext_embeddings.pt"),
)
content_config.github = GithubContentConfig( content_config.github = GithubContentConfig(
pat_token=os.getenv("GITHUB_PAT_TOKEN", ""), pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
repos=[ repos=[

File diff suppressed because one or more lines are too long

View file

@ -18,7 +18,7 @@ def test_plaintext_file(tmp_path):
# Act # Act
# Extract Entries from specified plaintext files # Extract Entries from specified plaintext files
file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[plaintextfile]) file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[str(plaintextfile)])
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries) maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
@ -75,6 +75,23 @@ def test_get_plaintext_files(tmp_path):
assert set(extracted_plaintext_files) == set(expected_files) assert set(extracted_plaintext_files) == set(expected_files)
def test_parse_html_plaintext_file(content_config):
    "Verify that entries extracted from the configured plaintext files contain no raw <div> markup"
    # Arrange
    # Read the input files and filters from the plaintext content config
    configured_files = content_config.plaintext.input_files
    configured_filter = content_config.plaintext.input_filter

    # Act
    # Resolve the files, extract their entries, then convert the entries to maps
    matched_files = PlaintextToJsonl.get_plaintext_files(configured_files, configured_filter)
    entries_by_file = PlaintextToJsonl.extract_plaintext_entries(matched_files)
    entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(entries_by_file)

    # Assert
    # Exactly one entry is expected and its raw text must not contain a <div> tag
    assert len(entries) == 1
    first_entry = entries[0]
    assert "<div>" not in first_entry.raw
# Helper Functions # Helper Functions
def create_file(tmp_path: Path, entry=None, filename="test.md"): def create_file(tmp_path: Path, entry=None, filename="test.md"):
file_ = tmp_path / filename file_ = tmp_path / filename