mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 09:25:06 +01:00
Merge branch 'features/include-full-file-in-convo-with-filter' of github.com:khoj-ai/khoj into features/include-full-file-in-convo-with-filter
This commit is contained in:
commit
47937d5148
4 changed files with 53 additions and 74 deletions
|
@ -257,9 +257,6 @@ export const ChatInputArea = forwardRef<HTMLTextAreaElement, ChatInputProps>((pr
|
||||||
setConvertedAttachedFiles(data);
|
setConvertedAttachedFiles(data);
|
||||||
});
|
});
|
||||||
|
|
||||||
const totalSize = Array.from(files).reduce((acc, file) => acc + file.size, 0);
|
|
||||||
const totalSizeInMB = totalSize / (1024 * 1024);
|
|
||||||
|
|
||||||
// Set focus to the input for user message after uploading files
|
// Set focus to the input for user message after uploading files
|
||||||
chatInputRef?.current?.focus();
|
chatInputRef?.current?.focus();
|
||||||
}
|
}
|
||||||
|
@ -612,6 +609,7 @@ export const ChatInputArea = forwardRef<HTMLTextAreaElement, ChatInputProps>((pr
|
||||||
>
|
>
|
||||||
<input
|
<input
|
||||||
type="file"
|
type="file"
|
||||||
|
accept=".pdf,.doc,.docx,.txt,.md,.org,.jpg,.jpeg,.png,.webp"
|
||||||
multiple={true}
|
multiple={true}
|
||||||
ref={fileInputRef}
|
ref={fileInputRef}
|
||||||
onChange={handleFileChange}
|
onChange={handleFileChange}
|
||||||
|
|
|
@ -1,7 +1,5 @@
|
||||||
import logging
|
import logging
|
||||||
import os
|
import tempfile
|
||||||
from datetime import datetime
|
|
||||||
from random import randint
|
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
from langchain_community.document_loaders import Docx2txtLoader
|
from langchain_community.document_loaders import Docx2txtLoader
|
||||||
|
@ -94,26 +92,20 @@ class DocxToEntries(TextToEntries):
|
||||||
def extract_text(docx_file):
|
def extract_text(docx_file):
|
||||||
"""Extract text from specified DOCX file"""
|
"""Extract text from specified DOCX file"""
|
||||||
try:
|
try:
|
||||||
timestamp_now = datetime.utcnow().timestamp()
|
|
||||||
random_suffix = randint(0, 1000)
|
|
||||||
tmp_file = f"tmp_docx_file_{timestamp_now}_{random_suffix}.docx"
|
|
||||||
docx_entry_by_pages = []
|
docx_entry_by_pages = []
|
||||||
with open(tmp_file, "wb") as f:
|
# Create temp file with .docx extension that gets auto-deleted
|
||||||
bytes_content = docx_file
|
with tempfile.NamedTemporaryFile(suffix=".docx", delete=True) as tmp:
|
||||||
f.write(bytes_content)
|
tmp.write(docx_file)
|
||||||
|
tmp.flush() # Ensure all data is written
|
||||||
|
|
||||||
# Load the content using Docx2txtLoader
|
# Load the content using Docx2txtLoader
|
||||||
loader = Docx2txtLoader(tmp_file)
|
loader = Docx2txtLoader(tmp.name)
|
||||||
docx_entries_per_file = loader.load()
|
docx_entries_per_file = loader.load()
|
||||||
|
|
||||||
# Convert the loaded entries into the desired format
|
|
||||||
docx_entry_by_pages = [page.page_content for page in docx_entries_per_file]
|
|
||||||
|
|
||||||
|
# Convert the loaded entries into the desired format
|
||||||
|
docx_entry_by_pages = [page.page_content for page in docx_entries_per_file]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Unable to extract text from file: {docx_file}")
|
logger.warning(f"Unable to extract text from file: {docx_file}")
|
||||||
logger.warning(e, exc_info=True)
|
logger.warning(e, exc_info=True)
|
||||||
finally:
|
|
||||||
if os.path.exists(f"{tmp_file}"):
|
|
||||||
os.remove(f"{tmp_file}")
|
|
||||||
|
|
||||||
return docx_entry_by_pages
|
return docx_entry_by_pages
|
||||||
|
|
|
@ -1,14 +1,10 @@
|
||||||
import base64
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import tempfile
|
||||||
from datetime import datetime
|
from io import BytesIO
|
||||||
from random import randint
|
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
from langchain_community.document_loaders import PyMuPDFLoader
|
from langchain_community.document_loaders import PyMuPDFLoader
|
||||||
|
|
||||||
# importing FileObjectAdapter so that we can add new files and debug file object db.
|
|
||||||
# from khoj.database.adapters import FileObjectAdapters
|
|
||||||
from khoj.database.models import Entry as DbEntry
|
from khoj.database.models import Entry as DbEntry
|
||||||
from khoj.database.models import KhojUser
|
from khoj.database.models import KhojUser
|
||||||
from khoj.processor.content.text_to_entries import TextToEntries
|
from khoj.processor.content.text_to_entries import TextToEntries
|
||||||
|
@ -97,26 +93,19 @@ class PdfToEntries(TextToEntries):
|
||||||
def extract_text(pdf_file):
|
def extract_text(pdf_file):
|
||||||
"""Extract text from specified PDF files"""
|
"""Extract text from specified PDF files"""
|
||||||
try:
|
try:
|
||||||
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path
|
# Create temp file with .pdf extension that gets auto-deleted
|
||||||
timestamp_now = datetime.utcnow().timestamp()
|
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmpf:
|
||||||
random_suffix = randint(0, 1000)
|
tmpf.write(pdf_file)
|
||||||
tmp_file = f"tmp_pdf_file_{timestamp_now}_{random_suffix}.pdf"
|
tmpf.flush() # Ensure all data is written
|
||||||
pdf_entry_by_pages = []
|
|
||||||
with open(f"{tmp_file}", "wb") as f:
|
# Load the content using PyMuPDFLoader
|
||||||
f.write(pdf_file)
|
loader = PyMuPDFLoader(tmpf.name, extract_images=True)
|
||||||
try:
|
pdf_entries_per_file = loader.load()
|
||||||
loader = PyMuPDFLoader(f"{tmp_file}", extract_images=False)
|
|
||||||
pdf_entry_by_pages = [page.page_content for page in loader.load()]
|
# Convert the loaded entries into the desired format
|
||||||
except ImportError:
|
pdf_entry_by_pages = [page.page_content for page in pdf_entries_per_file]
|
||||||
loader = PyMuPDFLoader(f"{tmp_file}")
|
|
||||||
pdf_entry_by_pages = [
|
|
||||||
page.page_content for page in loader.load()
|
|
||||||
] # page_content items list for a given pdf.
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
|
logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
|
||||||
logger.warning(e, exc_info=True)
|
logger.warning(e, exc_info=True)
|
||||||
finally:
|
|
||||||
if os.path.exists(f"{tmp_file}"):
|
|
||||||
os.remove(f"{tmp_file}")
|
|
||||||
|
|
||||||
return pdf_entry_by_pages
|
return pdf_entry_by_pages
|
||||||
|
|
|
@ -140,6 +140,35 @@ def construct_iteration_history(
|
||||||
return previous_iterations_history
|
return previous_iterations_history
|
||||||
|
|
||||||
|
|
||||||
|
def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="AI") -> str:
|
||||||
|
chat_history = ""
|
||||||
|
for chat in conversation_history.get("chat", [])[-n:]:
|
||||||
|
if chat["by"] == "khoj" and chat["intent"].get("type") in ["remember", "reminder", "summarize"]:
|
||||||
|
chat_history += f"User: {chat['intent']['query']}\n"
|
||||||
|
|
||||||
|
if chat["intent"].get("inferred-queries"):
|
||||||
|
chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
|
||||||
|
|
||||||
|
chat_history += f"{agent_name}: {chat['message']}\n\n"
|
||||||
|
elif chat["by"] == "khoj" and ("text-to-image" in chat["intent"].get("type")):
|
||||||
|
chat_history += f"User: {chat['intent']['query']}\n"
|
||||||
|
chat_history += f"{agent_name}: [generated image redacted for space]\n"
|
||||||
|
elif chat["by"] == "khoj" and ("excalidraw" in chat["intent"].get("type")):
|
||||||
|
chat_history += f"User: {chat['intent']['query']}\n"
|
||||||
|
chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n"
|
||||||
|
elif chat["by"] == "you":
|
||||||
|
raw_attached_files = chat.get("attachedFiles")
|
||||||
|
if raw_attached_files:
|
||||||
|
attached_files: Dict[str, str] = {}
|
||||||
|
for file in raw_attached_files:
|
||||||
|
attached_files[file["name"]] = file["content"]
|
||||||
|
|
||||||
|
attached_file_context = gather_raw_attached_files(attached_files)
|
||||||
|
chat_history += f"User: {attached_file_context}\n"
|
||||||
|
|
||||||
|
return chat_history
|
||||||
|
|
||||||
|
|
||||||
def construct_tool_chat_history(
|
def construct_tool_chat_history(
|
||||||
previous_iterations: List[InformationCollectionIteration], tool: ConversationCommand = None
|
previous_iterations: List[InformationCollectionIteration], tool: ConversationCommand = None
|
||||||
) -> Dict[str, list]:
|
) -> Dict[str, list]:
|
||||||
|
@ -540,35 +569,6 @@ def get_image_from_url(image_url: str, type="pil"):
|
||||||
return ImageWithType(content=None, type=None)
|
return ImageWithType(content=None, type=None)
|
||||||
|
|
||||||
|
|
||||||
def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="AI") -> str:
|
|
||||||
chat_history = ""
|
|
||||||
for chat in conversation_history.get("chat", [])[-n:]:
|
|
||||||
if chat["by"] == "khoj" and chat["intent"].get("type") in ["remember", "reminder", "summarize"]:
|
|
||||||
chat_history += f"User: {chat['intent']['query']}\n"
|
|
||||||
|
|
||||||
if chat["intent"].get("inferred-queries"):
|
|
||||||
chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
|
|
||||||
|
|
||||||
chat_history += f"{agent_name}: {chat['message']}\n\n"
|
|
||||||
elif chat["by"] == "khoj" and ("text-to-image" in chat["intent"].get("type")):
|
|
||||||
chat_history += f"User: {chat['intent']['query']}\n"
|
|
||||||
chat_history += f"{agent_name}: [generated image redacted for space]\n"
|
|
||||||
elif chat["by"] == "khoj" and ("excalidraw" in chat["intent"].get("type")):
|
|
||||||
chat_history += f"User: {chat['intent']['query']}\n"
|
|
||||||
chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n"
|
|
||||||
elif chat["by"] == "you":
|
|
||||||
raw_attached_files = chat.get("attachedFiles")
|
|
||||||
if raw_attached_files:
|
|
||||||
attached_files: Dict[str, str] = {}
|
|
||||||
for file in raw_attached_files:
|
|
||||||
attached_files[file["name"]] = file["content"]
|
|
||||||
|
|
||||||
attached_file_context = gather_raw_attached_files(attached_files)
|
|
||||||
chat_history += f"User: {attached_file_context}\n"
|
|
||||||
|
|
||||||
return chat_history
|
|
||||||
|
|
||||||
|
|
||||||
def commit_conversation_trace(
|
def commit_conversation_trace(
|
||||||
session: list[ChatMessage],
|
session: list[ChatMessage],
|
||||||
response: str | list[dict],
|
response: str | list[dict],
|
||||||
|
|
Loading…
Reference in a new issue