mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Fix plaintext HTML parsing and rendering (#464)
* Store conversation command options in an Enum * Move to slash commands instead of using @ to specify general commands * Calculate conversation command once & pass it as arg to child funcs * Add /notes command to respond using only knowledge base as context This prevents the chat model from trying to respond using only its general world knowledge, without any references pulled from the indexed knowledge base * Test general and notes slash commands in openai chat director tests --------- Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
This commit is contained in:
parent
7919787fb7
commit
b45e1d8c0d
12 changed files with 2168 additions and 48 deletions
|
@ -36,6 +36,7 @@ classifiers = [
|
||||||
"Topic :: Text Processing :: Linguistic",
|
"Topic :: Text Processing :: Linguistic",
|
||||||
]
|
]
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"bs4 >= 0.0.1",
|
||||||
"dateparser >= 1.1.1",
|
"dateparser >= 1.1.1",
|
||||||
"defusedxml == 0.7.1",
|
"defusedxml == 0.7.1",
|
||||||
"fastapi == 0.77.1",
|
"fastapi == 0.77.1",
|
||||||
|
|
|
@ -34,28 +34,6 @@
|
||||||
margin: 0;
|
margin: 0;
|
||||||
padding: 0;
|
padding: 0;
|
||||||
}
|
}
|
||||||
@media only screen and (max-width: 600px) {
|
|
||||||
body {
|
|
||||||
display: grid;
|
|
||||||
grid-template-columns: 1fr;
|
|
||||||
grid-template-rows: 1fr auto auto auto minmax(80px, 100%);
|
|
||||||
}
|
|
||||||
body > * {
|
|
||||||
grid-column: 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
div.filler {
|
|
||||||
display: none;
|
|
||||||
}
|
|
||||||
|
|
||||||
body.khoj-configure {
|
|
||||||
padding: 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
div.section {
|
|
||||||
padding: 12px;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
img.khoj-logo {
|
img.khoj-logo {
|
||||||
max-width: none!important;
|
max-width: none!important;
|
||||||
|
@ -74,6 +52,11 @@
|
||||||
justify-self: center;
|
justify-self: center;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
div.section.general-settings {
|
||||||
|
justify-self: center;
|
||||||
|
}
|
||||||
|
|
||||||
div.instructions {
|
div.instructions {
|
||||||
font-size: large;
|
font-size: large;
|
||||||
}
|
}
|
||||||
|
@ -103,6 +86,26 @@
|
||||||
box-shadow: 0px 1px 3px 0px rgba(0,0,0,0.1),0px 1px 2px -1px rgba(0,0,0,0.1);
|
box-shadow: 0px 1px 3px 0px rgba(0,0,0,0.1),0px 1px 2px -1px rgba(0,0,0,0.1);
|
||||||
overflow: hidden;
|
overflow: hidden;
|
||||||
}
|
}
|
||||||
|
div.finalize-buttons {
|
||||||
|
display: grid;
|
||||||
|
gap: 8px;
|
||||||
|
padding: 24px 16px;
|
||||||
|
width: 320px;
|
||||||
|
background: white;
|
||||||
|
border-radius: 4px;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
div#features-hint-text {
|
||||||
|
width: 640px;
|
||||||
|
opacity: 0;
|
||||||
|
transition: opacity 0.5s ease-in-out;
|
||||||
|
overflow: hidden;
|
||||||
|
height: 0;
|
||||||
|
}
|
||||||
|
div#features-hint-text.show {
|
||||||
|
opacity: 1;
|
||||||
|
height: auto;
|
||||||
|
}
|
||||||
.card-title-row {
|
.card-title-row {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: auto 1fr;
|
grid-template-columns: auto 1fr;
|
||||||
|
@ -213,5 +216,32 @@
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@media only screen and (max-width: 600px) {
|
||||||
|
body {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
grid-template-rows: 1fr auto auto auto minmax(80px, 100%);
|
||||||
|
}
|
||||||
|
body > * {
|
||||||
|
grid-column: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
div.filler {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
body.khoj-configure {
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
div.section {
|
||||||
|
padding: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
div#features-hint-text {
|
||||||
|
width: 320px;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
</html>
|
</html>
|
||||||
|
|
|
@ -104,15 +104,18 @@
|
||||||
newResponseText.appendChild(loadingSpinner);
|
newResponseText.appendChild(loadingSpinner);
|
||||||
document.getElementById("chat-body").scrollTop = document.getElementById("chat-body").scrollHeight;
|
document.getElementById("chat-body").scrollTop = document.getElementById("chat-body").scrollHeight;
|
||||||
|
|
||||||
|
let chatTooltip = document.getElementById("chat-tooltip");
|
||||||
|
chatTooltip.style.display = "none";
|
||||||
|
|
||||||
|
let chatInput = document.getElementById("chat-input");
|
||||||
|
chatInput.classList.remove("option-enabled");
|
||||||
|
|
||||||
// Call specified Khoj API which returns a streamed response of type text/plain
|
// Call specified Khoj API which returns a streamed response of type text/plain
|
||||||
fetch(url)
|
fetch(url)
|
||||||
.then(response => {
|
.then(response => {
|
||||||
const reader = response.body.getReader();
|
const reader = response.body.getReader();
|
||||||
const decoder = new TextDecoder();
|
const decoder = new TextDecoder();
|
||||||
|
|
||||||
let chatTooltip = document.getElementById("chat-tooltip");
|
|
||||||
chatTooltip.style.display = "none";
|
|
||||||
|
|
||||||
function readStream() {
|
function readStream() {
|
||||||
reader.read().then(({ done, value }) => {
|
reader.read().then(({ done, value }) => {
|
||||||
if (done) {
|
if (done) {
|
||||||
|
@ -176,17 +179,16 @@
|
||||||
} else if (chatInput.value.startsWith("/")) {
|
} else if (chatInput.value.startsWith("/")) {
|
||||||
const firstWord = chatInput.value.split(" ")[0];
|
const firstWord = chatInput.value.split(" ")[0];
|
||||||
if (firstWord.substring(1) in chatOptions) {
|
if (firstWord.substring(1) in chatOptions) {
|
||||||
// Add a div element around the text.
|
chatInput.classList.add("option-enabled");
|
||||||
let chatTooltip = document.getElementById("chat-tooltip");
|
|
||||||
chatTooltip.style.display = "block";
|
|
||||||
chatTooltip.innerHTML = "Mode: " + firstWord.substring(1);
|
|
||||||
} else {
|
} else {
|
||||||
let chatTooltip = document.getElementById("chat-tooltip");
|
chatInput.classList.remove("option-enabled");
|
||||||
chatTooltip.style.display = "none";
|
|
||||||
}
|
}
|
||||||
|
let chatTooltip = document.getElementById("chat-tooltip");
|
||||||
|
chatTooltip.style.display = "none";
|
||||||
} else {
|
} else {
|
||||||
let chatTooltip = document.getElementById("chat-tooltip");
|
let chatTooltip = document.getElementById("chat-tooltip");
|
||||||
chatTooltip.style.display = "none";
|
chatTooltip.style.display = "none";
|
||||||
|
chatInput.classList.remove("option-enabled");
|
||||||
}
|
}
|
||||||
|
|
||||||
autoResize();
|
autoResize();
|
||||||
|
@ -288,7 +290,8 @@
|
||||||
<!-- Chat Footer -->
|
<!-- Chat Footer -->
|
||||||
<div id="chat-footer">
|
<div id="chat-footer">
|
||||||
<div id="chat-tooltip" style="display: none;"></div>
|
<div id="chat-tooltip" style="display: none;"></div>
|
||||||
<textarea id="chat-input" class="option" oninput="onChatInput()" onkeyup=incrementalChat(event) autofocus="autofocus" placeholder="What is the meaning of life?"></textarea>
|
<textarea id="chat-input" class="option" oninput="onChatInput()" onkeyup=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter.">
|
||||||
|
</textarea>
|
||||||
</div>
|
</div>
|
||||||
</body>
|
</body>
|
||||||
|
|
||||||
|
@ -428,6 +431,19 @@
|
||||||
line-height: 1.5em;
|
line-height: 1.5em;
|
||||||
margin: 0;
|
margin: 0;
|
||||||
}
|
}
|
||||||
|
#chat-input:focus {
|
||||||
|
outline: none !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
.option-enabled {
|
||||||
|
box-shadow: 0 0 12px rgb(119, 156, 46);
|
||||||
|
}
|
||||||
|
|
||||||
|
.option-enabled:focus {
|
||||||
|
outline: none !important;
|
||||||
|
border:1px solid #475569;
|
||||||
|
box-shadow: 0 0 16px var(--primary);
|
||||||
|
}
|
||||||
|
|
||||||
a.inline-chat-link {
|
a.inline-chat-link {
|
||||||
color: #475569;
|
color: #475569;
|
||||||
|
|
|
@ -219,6 +219,7 @@
|
||||||
</div>
|
</div>
|
||||||
<div class="section">
|
<div class="section">
|
||||||
<h2 class="section-title">Features</h2>
|
<h2 class="section-title">Features</h2>
|
||||||
|
<div id="features-hint-text"></div>
|
||||||
<div class="section-cards">
|
<div class="section-cards">
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="card-title-row">
|
<div class="card-title-row">
|
||||||
|
@ -285,16 +286,22 @@
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="section">
|
<div class="section general-settings">
|
||||||
<div id="results-count" title="Number of items to show in search and use for chat response">
|
<div id="results-count" title="Number of items to show in search and use for chat response">
|
||||||
<label for="results-count-slider">Results Count: <span id="results-count-value">5</span></label>
|
<label for="results-count-slider">Results Count: <span id="results-count-value">5</span></label>
|
||||||
<input type="range" id="results-count-slider" name="results-count-slider" min="1" max="10" step="1" value="5">
|
<input type="range" id="results-count-slider" name="results-count-slider" min="1" max="10" step="1" value="5">
|
||||||
</div>
|
</div>
|
||||||
<div id="status" style="display: none;"></div>
|
<div id="status" style="display: none;"></div>
|
||||||
</div>
|
</div>
|
||||||
<div class="section finalize-actions">
|
<div class="section finalize-actions general-settings">
|
||||||
<button id="configure" type="submit" title="Update index with the latest changes">⚙️ Configure</button>
|
<div class="section-cards">
|
||||||
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button>
|
<div class="finalize-buttons">
|
||||||
|
<button id="configure" type="submit" title="Update index with the latest changes">⚙️ Configure</button>
|
||||||
|
</div>
|
||||||
|
<div class="finalize-buttons">
|
||||||
|
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
|
@ -329,9 +336,16 @@
|
||||||
function toggleEnableLocalLLLM(enable) {
|
function toggleEnableLocalLLLM(enable) {
|
||||||
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
|
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
|
||||||
var toggleEnableLocalLLLMButton = document.getElementById("toggle-enable-offline-chat");
|
var toggleEnableLocalLLLMButton = document.getElementById("toggle-enable-offline-chat");
|
||||||
|
var featuresHintText = document.getElementById("features-hint-text");
|
||||||
toggleEnableLocalLLLMButton.classList.remove("disabled");
|
toggleEnableLocalLLLMButton.classList.remove("disabled");
|
||||||
toggleEnableLocalLLLMButton.classList.add("enabled");
|
toggleEnableLocalLLLMButton.classList.add("enabled");
|
||||||
|
|
||||||
|
if (enable) {
|
||||||
|
featuresHintText.style.display = "block";
|
||||||
|
featuresHintText.innerHTML = "An open source model is being downloaded in the background. Hang tight, this may take a few minutes ⏳.";
|
||||||
|
featuresHintText.classList.add("show");
|
||||||
|
}
|
||||||
|
|
||||||
fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, {
|
fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: {
|
headers: {
|
||||||
|
@ -372,6 +386,8 @@
|
||||||
disableLocalLLLMButton.classList.remove("enabled");
|
disableLocalLLLMButton.classList.remove("enabled");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
featuresHintText.classList.remove("show");
|
||||||
|
featuresHintText.innerHTML = "";
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -56,6 +56,44 @@
|
||||||
}).join("\n");
|
}).join("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function render_html(query, data) {
|
||||||
|
return data.map(function (item) {
|
||||||
|
let document = new DOMParser().parseFromString(item.entry, "text/html");
|
||||||
|
// Scrub the HTML to remove any script tags and associated content
|
||||||
|
let script_tags = document.querySelectorAll("script");
|
||||||
|
for (let i = 0; i < script_tags.length; i++) {
|
||||||
|
script_tags[i].remove();
|
||||||
|
}
|
||||||
|
// Scrub the HTML to remove any style tags and associated content
|
||||||
|
let style_tags = document.querySelectorAll("style");
|
||||||
|
for (let i = 0; i < style_tags.length; i++) {
|
||||||
|
style_tags[i].remove();
|
||||||
|
}
|
||||||
|
// Scrub the HTML to remove any noscript tags and associated content
|
||||||
|
let noscript_tags = document.querySelectorAll("noscript");
|
||||||
|
for (let i = 0; i < noscript_tags.length; i++) {
|
||||||
|
noscript_tags[i].remove();
|
||||||
|
}
|
||||||
|
// Scrub the HTML to remove any iframe tags and associated content
|
||||||
|
let iframe_tags = document.querySelectorAll("iframe");
|
||||||
|
for (let i = 0; i < iframe_tags.length; i++) {
|
||||||
|
iframe_tags[i].remove();
|
||||||
|
}
|
||||||
|
// Scrub the HTML to remove any object tags and associated content
|
||||||
|
let object_tags = document.querySelectorAll("object");
|
||||||
|
for (let i = 0; i < object_tags.length; i++) {
|
||||||
|
object_tags[i].remove();
|
||||||
|
}
|
||||||
|
// Scrub the HTML to remove any embed tags and associated content
|
||||||
|
let embed_tags = document.querySelectorAll("embed");
|
||||||
|
for (let i = 0; i < embed_tags.length; i++) {
|
||||||
|
embed_tags[i].remove();
|
||||||
|
}
|
||||||
|
let scrubbedHTML = document.body.outerHTML;
|
||||||
|
return `<div class="results-html">` + scrubbedHTML + `</div>`;
|
||||||
|
}).join("\n");
|
||||||
|
}
|
||||||
|
|
||||||
function render_multiple(query, data, type) {
|
function render_multiple(query, data, type) {
|
||||||
let html = "";
|
let html = "";
|
||||||
data.forEach(item => {
|
data.forEach(item => {
|
||||||
|
@ -73,6 +111,8 @@
|
||||||
html += render_pdf(query, [item]);
|
html += render_pdf(query, [item]);
|
||||||
} else if (item.additional.file.includes("notion.so")) {
|
} else if (item.additional.file.includes("notion.so")) {
|
||||||
html += `<div class="results-notion">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
|
html += `<div class="results-notion">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
|
||||||
|
} else if (item.additional.file.endsWith(".html")) {
|
||||||
|
html += render_html(query, [item]);
|
||||||
} else {
|
} else {
|
||||||
html += `<div class="results-plugin">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
|
html += `<div class="results-plugin">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
|
||||||
}
|
}
|
||||||
|
@ -247,7 +287,7 @@
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!--Add Text Box To Enter Query, Trigger Incremental Search OnChange -->
|
<!--Add Text Box To Enter Query, Trigger Incremental Search OnChange -->
|
||||||
<input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="What is the meaning of life?">
|
<input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="Search directly from your knowledge base">
|
||||||
|
|
||||||
<div id="options">
|
<div id="options">
|
||||||
<!--Add Dropdown to Select Query Type -->
|
<!--Add Dropdown to Select Query Type -->
|
||||||
|
@ -338,6 +378,7 @@
|
||||||
}
|
}
|
||||||
.results-pdf,
|
.results-pdf,
|
||||||
.results-notion,
|
.results-notion,
|
||||||
|
.results-html,
|
||||||
.results-plugin {
|
.results-plugin {
|
||||||
text-align: left;
|
text-align: left;
|
||||||
white-space: pre-line;
|
white-space: pre-line;
|
||||||
|
@ -398,6 +439,7 @@
|
||||||
div.results-notion,
|
div.results-notion,
|
||||||
div.results-org,
|
div.results-org,
|
||||||
div.results-plugin,
|
div.results-plugin,
|
||||||
|
div.results-html,
|
||||||
div.results-pdf {
|
div.results-pdf {
|
||||||
text-align: left;
|
text-align: left;
|
||||||
box-shadow: 2px 2px 2px var(--primary-hover);
|
box-shadow: 2px 2px 2px var(--primary-hover);
|
||||||
|
|
|
@ -122,6 +122,7 @@ def converse(
|
||||||
if conversation_command == ConversationCommand.General:
|
if conversation_command == ConversationCommand.General:
|
||||||
conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query)
|
conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query)
|
||||||
elif conversation_command == ConversationCommand.Notes and is_none_or_empty(compiled_references):
|
elif conversation_command == ConversationCommand.Notes and is_none_or_empty(compiled_references):
|
||||||
|
completion_func(chat_response=prompts.no_notes_found.format())
|
||||||
return iter([prompts.no_notes_found.format()])
|
return iter([prompts.no_notes_found.format()])
|
||||||
else:
|
else:
|
||||||
conversation_primer = prompts.notes_conversation.format(
|
conversation_primer = prompts.notes_conversation.format(
|
||||||
|
|
|
@ -237,10 +237,10 @@ Q:"""
|
||||||
help_message = PromptTemplate.from_template(
|
help_message = PromptTemplate.from_template(
|
||||||
"""
|
"""
|
||||||
**/help**: Show this help message.
|
**/help**: Show this help message.
|
||||||
**/notes**: Search only against the information in your knowledge base. This is the default method.
|
**/notes**: Chat using the information in your knowledge base. This is the default method.
|
||||||
**/general**: Search general knowledge with the LLM. This will not search against your notes.
|
**/general**: Chat using general knowledge with the LLM. This will not search against your notes.
|
||||||
|
|
||||||
You are using the **{model}** model. To change the model, go to your <a href="/config">settings</a> page.
|
You are using the **{model}** model.
|
||||||
**version**: {version}
|
**version**: {version}
|
||||||
""".strip()
|
""".strip()
|
||||||
)
|
)
|
||||||
|
|
|
@ -91,15 +91,21 @@ class PlaintextToJsonl(TextToJsonl):
|
||||||
|
|
||||||
for plaintext_file in plaintext_files:
|
for plaintext_file in plaintext_files:
|
||||||
with open(plaintext_file, "r") as f:
|
with open(plaintext_file, "r") as f:
|
||||||
try:
|
plaintext_content = f.read()
|
||||||
plaintext_content = f.read()
|
if plaintext_file.endswith(("html", "htm", "xml")):
|
||||||
entry_to_file_map.append((plaintext_content, plaintext_file))
|
plaintext_content = PlaintextToJsonl.extract_html_content(plaintext_content)
|
||||||
except Exception as e:
|
entry_to_file_map.append((plaintext_content, plaintext_file))
|
||||||
logger.warning(f"Unable to process file: {plaintext_file}. This file will not be indexed.")
|
|
||||||
logger.warning(e, exc_info=True)
|
|
||||||
|
|
||||||
return dict(entry_to_file_map)
|
return dict(entry_to_file_map)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def extract_html_content(html_content: str):
|
||||||
|
"Extract content from HTML"
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html_content, "html.parser")
|
||||||
|
return soup.get_text(strip=True, separator="\n")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]:
|
def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]:
|
||||||
"Convert each plaintext entries into a dictionary"
|
"Convert each plaintext entries into a dictionary"
|
||||||
|
|
|
@ -222,5 +222,5 @@ class ConversationCommand(str, Enum):
|
||||||
command_descriptions = {
|
command_descriptions = {
|
||||||
ConversationCommand.General: "This command allows you to search talk with the LLM without including context from your knowledge base.",
|
ConversationCommand.General: "This command allows you to search talk with the LLM without including context from your knowledge base.",
|
||||||
ConversationCommand.Notes: "This command allows you to search talk with the LLM while including context from your knowledge base.",
|
ConversationCommand.Notes: "This command allows you to search talk with the LLM while including context from your knowledge base.",
|
||||||
ConversationCommand.Help: "This command displays a help message.",
|
ConversationCommand.Help: "This command displays a help message with all available commands and other metadata.",
|
||||||
}
|
}
|
||||||
|
|
|
@ -109,6 +109,13 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
content_config.plaintext = TextContentConfig(
|
||||||
|
input_files=None,
|
||||||
|
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
|
||||||
|
compressed_jsonl=content_dir.joinpath("plaintext.jsonl.gz"),
|
||||||
|
embeddings_file=content_dir.joinpath("plaintext_embeddings.pt"),
|
||||||
|
)
|
||||||
|
|
||||||
content_config.github = GithubContentConfig(
|
content_config.github = GithubContentConfig(
|
||||||
pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
|
pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
|
||||||
repos=[
|
repos=[
|
||||||
|
|
1984
tests/data/plaintext/wikipedia_tardigrades.html
Normal file
1984
tests/data/plaintext/wikipedia_tardigrades.html
Normal file
File diff suppressed because one or more lines are too long
|
@ -18,7 +18,7 @@ def test_plaintext_file(tmp_path):
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified plaintext files
|
# Extract Entries from specified plaintext files
|
||||||
file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[plaintextfile])
|
file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[str(plaintextfile)])
|
||||||
|
|
||||||
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
|
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
|
||||||
|
|
||||||
|
@ -75,6 +75,23 @@ def test_get_plaintext_files(tmp_path):
|
||||||
assert set(extracted_plaintext_files) == set(expected_files)
|
assert set(extracted_plaintext_files) == set(expected_files)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_html_plaintext_file(content_config):
|
||||||
|
"Ensure HTML files are parsed correctly"
|
||||||
|
# Arrange
|
||||||
|
# Setup input-files, input-filters
|
||||||
|
input_files = content_config.plaintext.input_files
|
||||||
|
input_filter = content_config.plaintext.input_filter
|
||||||
|
|
||||||
|
# Act
|
||||||
|
extracted_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)
|
||||||
|
file_to_entries = PlaintextToJsonl.extract_plaintext_entries(extracted_plaintext_files)
|
||||||
|
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert len(maps) == 1
|
||||||
|
assert "<div>" not in maps[0].raw
|
||||||
|
|
||||||
|
|
||||||
# Helper Functions
|
# Helper Functions
|
||||||
def create_file(tmp_path: Path, entry=None, filename="test.md"):
|
def create_file(tmp_path: Path, entry=None, filename="test.md"):
|
||||||
file_ = tmp_path / filename
|
file_ = tmp_path / filename
|
||||||
|
|
Loading…
Reference in a new issue