Add random suffixes to decrease any clash probability when writing tmp files to disk

2024-11-23 23:48:56 +01:00 · 2024-11-09 18:46:34 -08:00 · 2024-11-09 18:46:34 -08:00 · 459318be13
commit 459318be13
parent dbf0c26247
2 changed files with 6 additions and 2 deletions
--- a/src/khoj/processor/content/docx/docx_to_entries.py
+++ b/src/khoj/processor/content/docx/docx_to_entries.py
@ -1,6 +1,7 @@
 import logging
 import os
 from datetime import datetime
 from random import random
 from typing import Dict, List, Tuple
 from langchain_community.document_loaders import Docx2txtLoader
@ -94,7 +95,8 @@ class DocxToEntries(TextToEntries):
        """Extract text from specified DOCX file"""
        try:
            timestamp_now = datetime.utcnow().timestamp()
-            tmp_file = f"tmp_docx_file_{timestamp_now}.docx"
+            random_suffix = random.randint(0, 1000)
            tmp_file = f"tmp_docx_file_{timestamp_now}_{random_suffix}.docx"
            docx_entry_by_pages = []
            with open(tmp_file, "wb") as f:
                bytes_content = docx_file
--- a/src/khoj/processor/content/pdf/pdf_to_entries.py
+++ b/src/khoj/processor/content/pdf/pdf_to_entries.py
@ -2,6 +2,7 @@ import base64
 import logging
 import os
 from datetime import datetime
 from random import random
 from typing import Dict, List, Tuple
 from langchain_community.document_loaders import PyMuPDFLoader
@ -98,7 +99,8 @@ class PdfToEntries(TextToEntries):
        try:
            # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path
            timestamp_now = datetime.utcnow().timestamp()
-            tmp_file = f"tmp_pdf_file_{timestamp_now}.pdf"
+            random_suffix = random.randint(0, 1000)
            tmp_file = f"tmp_pdf_file_{timestamp_now}_{random_suffix}.pdf"
            pdf_entry_by_pages = []
            with open(f"{tmp_file}", "wb") as f:
                f.write(pdf_file)