Fix plaintext HTML parsing and rendering (#464)

* Store conversation command options in an Enum
* Move to slash commands instead of using @ to specify general commands
* Calculate conversation command once & pass it as arg to child funcs
* Add /notes command to respond using only knowledge base as context
This prevents the chat model from trying to respond using only its
general world knowledge, without any references pulled from the
indexed knowledge base
* Test general and notes slash commands in openai chat director tests
---------

Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
This commit is contained in:
sabaimran 2023-08-27 11:24:30 -07:00 committed by GitHub
parent 7919787fb7
commit b45e1d8c0d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 2168 additions and 48 deletions

View file

@ -36,6 +36,7 @@ classifiers = [
"Topic :: Text Processing :: Linguistic",
]
dependencies = [
"bs4 >= 0.0.1",
"dateparser >= 1.1.1",
"defusedxml == 0.7.1",
"fastapi == 0.77.1",

View file

@ -34,28 +34,6 @@
margin: 0;
padding: 0;
}
@media only screen and (max-width: 600px) {
body {
display: grid;
grid-template-columns: 1fr;
grid-template-rows: 1fr auto auto auto minmax(80px, 100%);
}
body > * {
grid-column: 1;
}
div.filler {
display: none;
}
body.khoj-configure {
padding: 0;
}
div.section {
padding: 12px;
}
}
img.khoj-logo {
max-width: none!important;
@ -74,6 +52,11 @@
justify-self: center;
}
div.section.general-settings {
justify-self: center;
}
div.instructions {
font-size: large;
}
@ -103,6 +86,26 @@
box-shadow: 0px 1px 3px 0px rgba(0,0,0,0.1),0px 1px 2px -1px rgba(0,0,0,0.1);
overflow: hidden;
}
div.finalize-buttons {
display: grid;
gap: 8px;
padding: 24px 16px;
width: 320px;
background: white;
border-radius: 4px;
overflow: hidden;
}
div#features-hint-text {
width: 640px;
opacity: 0;
transition: opacity 0.5s ease-in-out;
overflow: hidden;
height: 0;
}
div#features-hint-text.show {
opacity: 1;
height: auto;
}
.card-title-row {
display: grid;
grid-template-columns: auto 1fr;
@ -213,5 +216,32 @@
grid-template-columns: 1fr;
}
}
@media only screen and (max-width: 600px) {
body {
display: grid;
grid-template-columns: 1fr;
grid-template-rows: 1fr auto auto auto minmax(80px, 100%);
}
body > * {
grid-column: 1;
}
div.filler {
display: none;
}
body.khoj-configure {
padding: 0;
}
div.section {
padding: 12px;
}
div#features-hint-text {
width: 320px;
}
}
</style>
</html>

View file

@ -104,15 +104,18 @@
newResponseText.appendChild(loadingSpinner);
document.getElementById("chat-body").scrollTop = document.getElementById("chat-body").scrollHeight;
let chatTooltip = document.getElementById("chat-tooltip");
chatTooltip.style.display = "none";
let chatInput = document.getElementById("chat-input");
chatInput.classList.remove("option-enabled");
// Call specified Khoj API which returns a streamed response of type text/plain
fetch(url)
.then(response => {
const reader = response.body.getReader();
const decoder = new TextDecoder();
let chatTooltip = document.getElementById("chat-tooltip");
chatTooltip.style.display = "none";
function readStream() {
reader.read().then(({ done, value }) => {
if (done) {
@ -176,17 +179,16 @@
} else if (chatInput.value.startsWith("/")) {
const firstWord = chatInput.value.split(" ")[0];
if (firstWord.substring(1) in chatOptions) {
// Add a div element around the text.
let chatTooltip = document.getElementById("chat-tooltip");
chatTooltip.style.display = "block";
chatTooltip.innerHTML = "Mode: " + firstWord.substring(1);
chatInput.classList.add("option-enabled");
} else {
let chatTooltip = document.getElementById("chat-tooltip");
chatTooltip.style.display = "none";
chatInput.classList.remove("option-enabled");
}
let chatTooltip = document.getElementById("chat-tooltip");
chatTooltip.style.display = "none";
} else {
let chatTooltip = document.getElementById("chat-tooltip");
chatTooltip.style.display = "none";
chatInput.classList.remove("option-enabled");
}
autoResize();
@ -288,7 +290,8 @@
<!-- Chat Footer -->
<div id="chat-footer">
<div id="chat-tooltip" style="display: none;"></div>
<textarea id="chat-input" class="option" oninput="onChatInput()" onkeyup=incrementalChat(event) autofocus="autofocus" placeholder="What is the meaning of life?"></textarea>
<textarea id="chat-input" class="option" oninput="onChatInput()" onkeyup=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter.">
</textarea>
</div>
</body>
@ -428,6 +431,19 @@
line-height: 1.5em;
margin: 0;
}
#chat-input:focus {
outline: none !important;
}
.option-enabled {
box-shadow: 0 0 12px rgb(119, 156, 46);
}
.option-enabled:focus {
outline: none !important;
border:1px solid #475569;
box-shadow: 0 0 16px var(--primary);
}
a.inline-chat-link {
color: #475569;

View file

@ -219,6 +219,7 @@
</div>
<div class="section">
<h2 class="section-title">Features</h2>
<div id="features-hint-text"></div>
<div class="section-cards">
<div class="card">
<div class="card-title-row">
@ -285,18 +286,24 @@
</div>
</div>
</div>
<div class="section">
<div class="section general-settings">
<div id="results-count" title="Number of items to show in search and use for chat response">
<label for="results-count-slider">Results Count: <span id="results-count-value">5</span></label>
<input type="range" id="results-count-slider" name="results-count-slider" min="1" max="10" step="1" value="5">
</div>
<div id="status" style="display: none;"></div>
</div>
<div class="section finalize-actions">
<div class="section finalize-actions general-settings">
<div class="section-cards">
<div class="finalize-buttons">
<button id="configure" type="submit" title="Update index with the latest changes">⚙️ Configure</button>
</div>
<div class="finalize-buttons">
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button>
</div>
</div>
</div>
</div>
<script>
function clearContentType(content_type) {
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
@ -329,9 +336,16 @@
function toggleEnableLocalLLLM(enable) {
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
var toggleEnableLocalLLLMButton = document.getElementById("toggle-enable-offline-chat");
var featuresHintText = document.getElementById("features-hint-text");
toggleEnableLocalLLLMButton.classList.remove("disabled");
toggleEnableLocalLLLMButton.classList.add("enabled");
if (enable) {
featuresHintText.style.display = "block";
featuresHintText.innerHTML = "An open source model is being downloaded in the background. Hang tight, this may take a few minutes ⏳.";
featuresHintText.classList.add("show");
}
fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, {
method: 'POST',
headers: {
@ -372,6 +386,8 @@
disableLocalLLLMButton.classList.remove("enabled");
}
featuresHintText.classList.remove("show");
featuresHintText.innerHTML = "";
}
})
}

View file

@ -56,6 +56,44 @@
}).join("\n");
}
// Render HTML search results as scrubbed markup, one `results-html` wrapper per entry.
//
// Parameters:
//   query - the search query; kept for signature parity with the call site, unused here
//   data  - array of result items; each item's `entry` holds the raw HTML to display
// Returns:
//   a string with one `<div class="results-html">…</div>` per item, joined by newlines
function render_html(query, data) {
    // Elements that can run code, restyle the host page or embed foreign content
    const blockedTags = "script, style, noscript, iframe, object, embed";
    // Attributes whose value is a URL and could therefore carry a `javascript:` payload
    const urlAttributes = ["href", "src", "action", "formaction", "xlink:href"];

    return data.map(function (item) {
        // Parse into a detached document so nothing runs while scrubbing.
        // Named `parsedDoc` so the global `document` is not shadowed.
        const parsedDoc = new DOMParser().parseFromString(item.entry, "text/html");

        // Drop every blocked element together with its content
        parsedDoc.querySelectorAll(blockedTags).forEach(node => node.remove());

        // Removing tags is not enough: inline event handlers (onerror, onclick, ...)
        // and `javascript:` URLs would still fire once the markup is inserted into the page
        parsedDoc.querySelectorAll("*").forEach(node => {
            Array.from(node.attributes).forEach(attribute => {
                const name = attribute.name.toLowerCase();
                // Browsers ignore whitespace/control characters inside the URL scheme
                const value = attribute.value.replace(/[\u0000-\u0020]/g, "").toLowerCase();
                const isEventHandler = name.startsWith("on");
                const isScriptUrl = urlAttributes.includes(name) && value.startsWith("javascript:");
                if (isEventHandler || isScriptUrl) {
                    node.removeAttribute(attribute.name);
                }
            });
        });

        const scrubbedHTML = parsedDoc.body.outerHTML;
        return `<div class="results-html">` + scrubbedHTML + `</div>`;
    }).join("\n");
}
function render_multiple(query, data, type) {
let html = "";
data.forEach(item => {
@ -73,6 +111,8 @@
html += render_pdf(query, [item]);
} else if (item.additional.file.includes("notion.so")) {
html += `<div class="results-notion">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
} else if (item.additional.file.endsWith(".html")) {
html += render_html(query, [item]);
} else {
html += `<div class="results-plugin">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
}
@ -247,7 +287,7 @@
</div>
<!--Add Text Box To Enter Query, Trigger Incremental Search OnChange -->
<input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="What is the meaning of life?">
<input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="Search directly from your knowledge base">
<div id="options">
<!--Add Dropdown to Select Query Type -->
@ -338,6 +378,7 @@
}
.results-pdf,
.results-notion,
.results-html,
.results-plugin {
text-align: left;
white-space: pre-line;
@ -398,6 +439,7 @@
div.results-notion,
div.results-org,
div.results-plugin,
div.results-html,
div.results-pdf {
text-align: left;
box-shadow: 2px 2px 2px var(--primary-hover);

View file

@ -122,6 +122,7 @@ def converse(
if conversation_command == ConversationCommand.General:
conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query)
elif conversation_command == ConversationCommand.Notes and is_none_or_empty(compiled_references):
completion_func(chat_response=prompts.no_notes_found.format())
return iter([prompts.no_notes_found.format()])
else:
conversation_primer = prompts.notes_conversation.format(

View file

@ -237,10 +237,10 @@ Q:"""
help_message = PromptTemplate.from_template(
"""
**/help**: Show this help message.
**/notes**: Search only against the information in your knowledge base. This is the default method.
**/general**: Search general knowledge with the LLM. This will not search against your notes.
**/notes**: Chat using the information in your knowledge base. This is the default method.
**/general**: Chat using general knowledge with the LLM. This will not search against your notes.
You are using the **{model}** model. To change the model, go to your <a href="/config">settings</a> page.
You are using the **{model}** model.
**version**: {version}
""".strip()
)

View file

@ -91,15 +91,21 @@ class PlaintextToJsonl(TextToJsonl):
for plaintext_file in plaintext_files:
with open(plaintext_file, "r") as f:
try:
plaintext_content = f.read()
if plaintext_file.endswith(("html", "htm", "xml")):
plaintext_content = PlaintextToJsonl.extract_html_content(plaintext_content)
entry_to_file_map.append((plaintext_content, plaintext_file))
except Exception as e:
logger.warning(f"Unable to process file: {plaintext_file}. This file will not be indexed.")
logger.warning(e, exc_info=True)
return dict(entry_to_file_map)
@staticmethod
def extract_html_content(html_content: str):
    """Return the text of an HTML document with its markup removed.

    The markup is parsed with BeautifulSoup's "html.parser" backend. Each text
    fragment is stripped of surrounding whitespace and the fragments are
    joined with newlines.
    """
    from bs4 import BeautifulSoup

    parsed_markup = BeautifulSoup(html_content, "html.parser")
    text_only = parsed_markup.get_text(separator="\n", strip=True)
    return text_only
@staticmethod
def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]:
"Convert each plaintext entries into a dictionary"

View file

@ -222,5 +222,5 @@ class ConversationCommand(str, Enum):
# One-line description of each conversation command, keyed by its ConversationCommand member
command_descriptions = {
    ConversationCommand.General: "This command allows you to talk with the LLM without including context from your knowledge base.",
    ConversationCommand.Notes: "This command allows you to talk with the LLM while including context from your knowledge base.",
    ConversationCommand.Help: "This command displays a help message with all available commands and other metadata.",
}

View file

@ -109,6 +109,13 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
)
}
content_config.plaintext = TextContentConfig(
input_files=None,
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
compressed_jsonl=content_dir.joinpath("plaintext.jsonl.gz"),
embeddings_file=content_dir.joinpath("plaintext_embeddings.pt"),
)
content_config.github = GithubContentConfig(
pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
repos=[

File diff suppressed because one or more lines are too long

View file

@ -18,7 +18,7 @@ def test_plaintext_file(tmp_path):
# Act
# Extract Entries from specified plaintext files
file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[plaintextfile])
file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[str(plaintextfile)])
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
@ -75,6 +75,23 @@ def test_get_plaintext_files(tmp_path):
assert set(extracted_plaintext_files) == set(expected_files)
def test_parse_html_plaintext_file(content_config):
    """Verify HTML files picked up by the plaintext config are indexed without markup."""
    # Arrange
    # Read the plaintext input files and glob filters from the shared fixture
    plaintext_files = content_config.plaintext.input_files
    plaintext_filter = content_config.plaintext.input_filter

    # Act
    # Resolve the matching files, extract their entries and convert them to entry maps
    matched_files = PlaintextToJsonl.get_plaintext_files(plaintext_files, plaintext_filter)
    entries_by_file = PlaintextToJsonl.extract_plaintext_entries(matched_files)
    entry_maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entries_by_file)

    # Assert
    # Exactly one entry is produced and no raw <div> tag survives extraction
    assert len(entry_maps) == 1
    assert "<div>" not in entry_maps[0].raw
# Helper Functions
def create_file(tmp_path: Path, entry=None, filename="test.md"):
file_ = tmp_path / filename