Add random suffixes to decrease the probability of name clashes when writing tmp files to disk

This commit is contained in:
sabaimran 2024-11-09 18:46:34 -08:00
parent dbf0c26247
commit 459318be13
2 changed files with 6 additions and 2 deletions

View file

@ -1,6 +1,7 @@
import logging import logging
import os import os
from datetime import datetime from datetime import datetime
from random import random
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
from langchain_community.document_loaders import Docx2txtLoader from langchain_community.document_loaders import Docx2txtLoader
@ -94,7 +95,8 @@ class DocxToEntries(TextToEntries):
"""Extract text from specified DOCX file""" """Extract text from specified DOCX file"""
try: try:
timestamp_now = datetime.utcnow().timestamp() timestamp_now = datetime.utcnow().timestamp()
tmp_file = f"tmp_docx_file_{timestamp_now}.docx" random_suffix = random.randint(0, 1000)
tmp_file = f"tmp_docx_file_{timestamp_now}_{random_suffix}.docx"
docx_entry_by_pages = [] docx_entry_by_pages = []
with open(tmp_file, "wb") as f: with open(tmp_file, "wb") as f:
bytes_content = docx_file bytes_content = docx_file

View file

@ -2,6 +2,7 @@ import base64
import logging import logging
import os import os
from datetime import datetime from datetime import datetime
from random import random
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
from langchain_community.document_loaders import PyMuPDFLoader from langchain_community.document_loaders import PyMuPDFLoader
@ -98,7 +99,8 @@ class PdfToEntries(TextToEntries):
try: try:
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path
timestamp_now = datetime.utcnow().timestamp() timestamp_now = datetime.utcnow().timestamp()
tmp_file = f"tmp_pdf_file_{timestamp_now}.pdf" random_suffix = random.randint(0, 1000)
tmp_file = f"tmp_pdf_file_{timestamp_now}_{random_suffix}.pdf"
pdf_entry_by_pages = [] pdf_entry_by_pages = []
with open(f"{tmp_file}", "wb") as f: with open(f"{tmp_file}", "wb") as f:
f.write(pdf_file) f.write(pdf_file)