Merge branch 'features/include-full-file-in-convo-with-filter' of github.com:khoj-ai/khoj into features/include-full-file-in-convo-with-filter

sabaimran 2024-11-11 09:34:08 -08:00
commit 47937d5148
4 changed files with 53 additions and 74 deletions

View file

@@ -257,9 +257,6 @@ export const ChatInputArea = forwardRef<HTMLTextAreaElement, ChatInputProps>((pr
             setConvertedAttachedFiles(data);
         });

-        const totalSize = Array.from(files).reduce((acc, file) => acc + file.size, 0);
-        const totalSizeInMB = totalSize / (1024 * 1024);
-
         // Set focus to the input for user message after uploading files
         chatInputRef?.current?.focus();
     }
@@ -612,6 +609,7 @@ export const ChatInputArea = forwardRef<HTMLTextAreaElement, ChatInputProps>((pr
                 >
                     <input
                         type="file"
+                        accept=".pdf,.doc,.docx,.txt,.md,.org,.jpg,.jpeg,.png,.webp"
                         multiple={true}
                         ref={fileInputRef}
                         onChange={handleFileChange}

View file

@@ -1,7 +1,5 @@
 import logging
-import os
-from datetime import datetime
-from random import randint
+import tempfile
 from typing import Dict, List, Tuple

 from langchain_community.document_loaders import Docx2txtLoader
@@ -94,26 +92,20 @@ class DocxToEntries(TextToEntries):
     def extract_text(docx_file):
         """Extract text from specified DOCX file"""
         try:
-            timestamp_now = datetime.utcnow().timestamp()
-            random_suffix = randint(0, 1000)
-            tmp_file = f"tmp_docx_file_{timestamp_now}_{random_suffix}.docx"
             docx_entry_by_pages = []
-            with open(tmp_file, "wb") as f:
-                bytes_content = docx_file
-                f.write(bytes_content)
+            # Create temp file with .docx extension that gets auto-deleted
+            with tempfile.NamedTemporaryFile(suffix=".docx", delete=True) as tmp:
+                tmp.write(docx_file)
+                tmp.flush()  # Ensure all data is written

                 # Load the content using Docx2txtLoader
-                loader = Docx2txtLoader(tmp_file)
+                loader = Docx2txtLoader(tmp.name)
                 docx_entries_per_file = loader.load()

                 # Convert the loaded entries into the desired format
                 docx_entry_by_pages = [page.page_content for page in docx_entries_per_file]
         except Exception as e:
             logger.warning(f"Unable to extract text from file: {docx_file}")
             logger.warning(e, exc_info=True)
-        finally:
-            if os.path.exists(f"{tmp_file}"):
-                os.remove(f"{tmp_file}")
         return docx_entry_by_pages
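
Aside: the tempfile.NamedTemporaryFile(delete=True) pattern above gives a path-based loader a real file to read while guaranteeing cleanup even when the loader raises, replacing the manually named temp file and the finally/os.remove block. A minimal standalone sketch of the same idea (the helper name and the load_from_path callback are illustrative, not part of this change):

import tempfile

def load_bytes_via_temp_path(data: bytes, suffix: str, load_from_path) -> list:
    # Write in-memory bytes to a named temp file so a loader that expects a
    # file path can read them; the file is removed when the context exits.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=True) as tmp:
        tmp.write(data)
        tmp.flush()  # ensure the loader sees the full content via tmp.name
        return load_from_path(tmp.name)

One known caveat of this pattern: on Windows, a NamedTemporaryFile that is still open generally cannot be reopened by name, so the loader call must be made while the file is open on Unix-like systems but may need different handling there.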

View file

@ -1,14 +1,10 @@
import base64
import logging import logging
import os import tempfile
from datetime import datetime from io import BytesIO
from random import randint
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
from langchain_community.document_loaders import PyMuPDFLoader from langchain_community.document_loaders import PyMuPDFLoader
# importing FileObjectAdapter so that we can add new files and debug file object db.
# from khoj.database.adapters import FileObjectAdapters
from khoj.database.models import Entry as DbEntry from khoj.database.models import Entry as DbEntry
from khoj.database.models import KhojUser from khoj.database.models import KhojUser
from khoj.processor.content.text_to_entries import TextToEntries from khoj.processor.content.text_to_entries import TextToEntries
@@ -97,26 +93,19 @@ class PdfToEntries(TextToEntries):
     def extract_text(pdf_file):
         """Extract text from specified PDF files"""
         try:
-            # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path
-            timestamp_now = datetime.utcnow().timestamp()
-            random_suffix = randint(0, 1000)
-            tmp_file = f"tmp_pdf_file_{timestamp_now}_{random_suffix}.pdf"
-            pdf_entry_by_pages = []
-            with open(f"{tmp_file}", "wb") as f:
-                f.write(pdf_file)
-                try:
-                    loader = PyMuPDFLoader(f"{tmp_file}", extract_images=False)
-                    pdf_entry_by_pages = [page.page_content for page in loader.load()]
-                except ImportError:
-                    loader = PyMuPDFLoader(f"{tmp_file}")
-                    pdf_entry_by_pages = [
-                        page.page_content for page in loader.load()
-                    ] # page_content items list for a given pdf.
+            # Create temp file with .pdf extension that gets auto-deleted
+            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmpf:
+                tmpf.write(pdf_file)
+                tmpf.flush()  # Ensure all data is written
+
+                # Load the content using PyMuPDFLoader
+                loader = PyMuPDFLoader(tmpf.name, extract_images=True)
+                pdf_entries_per_file = loader.load()
+
+                # Convert the loaded entries into the desired format
+                pdf_entry_by_pages = [page.page_content for page in pdf_entries_per_file]
         except Exception as e:
             logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
             logger.warning(e, exc_info=True)
-        finally:
-            if os.path.exists(f"{tmp_file}"):
-                os.remove(f"{tmp_file}")

         return pdf_entry_by_pages
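
For reference, a usage sketch of the updated extractor, assuming extract_text remains a static method on PdfToEntries and receives the raw bytes of a PDF; the file path and module path below are assumptions, not taken from this diff. Note that the rewritten call enables extract_images=True and drops the old ImportError fallback, so the optional image-extraction dependency is presumably expected to be installed.

from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries  # module path assumed

# Read a PDF into memory and hand its raw bytes to the extractor (sketch).
with open("example.pdf", "rb") as f:
    pdf_bytes = f.read()

pages = PdfToEntries.extract_text(pdf_bytes)
print(f"Extracted text from {len(pages)} page(s)")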

View file

@@ -140,6 +140,35 @@ def construct_iteration_history(
     return previous_iterations_history


+def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="AI") -> str:
+    chat_history = ""
+    for chat in conversation_history.get("chat", [])[-n:]:
+        if chat["by"] == "khoj" and chat["intent"].get("type") in ["remember", "reminder", "summarize"]:
+            chat_history += f"User: {chat['intent']['query']}\n"
+
+            if chat["intent"].get("inferred-queries"):
+                chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
+
+            chat_history += f"{agent_name}: {chat['message']}\n\n"
+        elif chat["by"] == "khoj" and ("text-to-image" in chat["intent"].get("type")):
+            chat_history += f"User: {chat['intent']['query']}\n"
+            chat_history += f"{agent_name}: [generated image redacted for space]\n"
+        elif chat["by"] == "khoj" and ("excalidraw" in chat["intent"].get("type")):
+            chat_history += f"User: {chat['intent']['query']}\n"
+            chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n"
+        elif chat["by"] == "you":
+            raw_attached_files = chat.get("attachedFiles")
+            if raw_attached_files:
+                attached_files: Dict[str, str] = {}
+                for file in raw_attached_files:
+                    attached_files[file["name"]] = file["content"]
+
+                attached_file_context = gather_raw_attached_files(attached_files)
+                chat_history += f"User: {attached_file_context}\n"
+
+    return chat_history
+
+
 def construct_tool_chat_history(
     previous_iterations: List[InformationCollectionIteration], tool: ConversationCommand = None
 ) -> Dict[str, list]:
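
The function body added above matches the one removed in the next hunk, so this change simply relocates construct_chat_history earlier in the module. To illustrate the shape of input it expects, a small sketch (the conversation dict below is invented for illustration):

conversation_history = {
    "chat": [
        {
            "by": "khoj",
            "intent": {"type": "remember", "query": "What did I write about habits?"},
            "message": "You noted that small habits compound over time.",
        }
    ]
}

# Renders the last n exchanges as a plain-text transcript for prompting.
print(construct_chat_history(conversation_history, n=4, agent_name="Khoj"))
# User: What did I write about habits?
# Khoj: You noted that small habits compound over time.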
@@ -540,35 +569,6 @@ def get_image_from_url(image_url: str, type="pil"):
     return ImageWithType(content=None, type=None)


-def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="AI") -> str:
-    chat_history = ""
-    for chat in conversation_history.get("chat", [])[-n:]:
-        if chat["by"] == "khoj" and chat["intent"].get("type") in ["remember", "reminder", "summarize"]:
-            chat_history += f"User: {chat['intent']['query']}\n"
-
-            if chat["intent"].get("inferred-queries"):
-                chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
-
-            chat_history += f"{agent_name}: {chat['message']}\n\n"
-        elif chat["by"] == "khoj" and ("text-to-image" in chat["intent"].get("type")):
-            chat_history += f"User: {chat['intent']['query']}\n"
-            chat_history += f"{agent_name}: [generated image redacted for space]\n"
-        elif chat["by"] == "khoj" and ("excalidraw" in chat["intent"].get("type")):
-            chat_history += f"User: {chat['intent']['query']}\n"
-            chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n"
-        elif chat["by"] == "you":
-            raw_attached_files = chat.get("attachedFiles")
-            if raw_attached_files:
-                attached_files: Dict[str, str] = {}
-                for file in raw_attached_files:
-                    attached_files[file["name"]] = file["content"]
-
-                attached_file_context = gather_raw_attached_files(attached_files)
-                chat_history += f"User: {attached_file_context}\n"
-
-    return chat_history
-
-
 def commit_conversation_trace(
     session: list[ChatMessage],
     response: str | list[dict],