From 459318be13ea2b6ff3b7ace01982ba32fdc0f48e Mon Sep 17 00:00:00 2001 From: sabaimran Date: Sat, 9 Nov 2024 18:46:34 -0800 Subject: [PATCH] Add random suffixes to decrease any clash probability when writing tmp files to disk --- src/khoj/processor/content/docx/docx_to_entries.py | 4 +++- src/khoj/processor/content/pdf/pdf_to_entries.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/khoj/processor/content/docx/docx_to_entries.py b/src/khoj/processor/content/docx/docx_to_entries.py index a2948caa..9ca9ef1f 100644 --- a/src/khoj/processor/content/docx/docx_to_entries.py +++ b/src/khoj/processor/content/docx/docx_to_entries.py @@ -1,6 +1,7 @@ import logging import os from datetime import datetime +from random import random from typing import Dict, List, Tuple from langchain_community.document_loaders import Docx2txtLoader @@ -94,7 +95,8 @@ class DocxToEntries(TextToEntries): """Extract text from specified DOCX file""" try: timestamp_now = datetime.utcnow().timestamp() - tmp_file = f"tmp_docx_file_{timestamp_now}.docx" + random_suffix = random.randint(0, 1000) + tmp_file = f"tmp_docx_file_{timestamp_now}_{random_suffix}.docx" docx_entry_by_pages = [] with open(tmp_file, "wb") as f: bytes_content = docx_file diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py index 20b72b8c..a5de1335 100644 --- a/src/khoj/processor/content/pdf/pdf_to_entries.py +++ b/src/khoj/processor/content/pdf/pdf_to_entries.py @@ -2,6 +2,7 @@ import base64 import logging import os from datetime import datetime +from random import random from typing import Dict, List, Tuple from langchain_community.document_loaders import PyMuPDFLoader @@ -98,7 +99,8 @@ class PdfToEntries(TextToEntries): try: # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path timestamp_now = datetime.utcnow().timestamp() - tmp_file = 
f"tmp_pdf_file_{timestamp_now}.pdf" + random_suffix = random.randint(0, 1000) + tmp_file = f"tmp_pdf_file_{timestamp_now}_{random_suffix}.pdf" pdf_entry_by_pages = [] with open(f"{tmp_file}", "wb") as f: f.write(pdf_file)