mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Add random suffixes to decrease the probability of filename clashes when writing tmp files to disk
This commit is contained in:
parent
dbf0c26247
commit
459318be13
2 changed files with 6 additions and 2 deletions
|
@ -1,6 +1,7 @@
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from random import random
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
from langchain_community.document_loaders import Docx2txtLoader
|
from langchain_community.document_loaders import Docx2txtLoader
|
||||||
|
@ -94,7 +95,8 @@ class DocxToEntries(TextToEntries):
|
||||||
"""Extract text from specified DOCX file"""
|
"""Extract text from specified DOCX file"""
|
||||||
try:
|
try:
|
||||||
timestamp_now = datetime.utcnow().timestamp()
|
timestamp_now = datetime.utcnow().timestamp()
|
||||||
tmp_file = f"tmp_docx_file_{timestamp_now}.docx"
|
random_suffix = random.randint(0, 1000)
|
||||||
|
tmp_file = f"tmp_docx_file_{timestamp_now}_{random_suffix}.docx"
|
||||||
docx_entry_by_pages = []
|
docx_entry_by_pages = []
|
||||||
with open(tmp_file, "wb") as f:
|
with open(tmp_file, "wb") as f:
|
||||||
bytes_content = docx_file
|
bytes_content = docx_file
|
||||||
|
|
|
@ -2,6 +2,7 @@ import base64
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from random import random
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
from langchain_community.document_loaders import PyMuPDFLoader
|
from langchain_community.document_loaders import PyMuPDFLoader
|
||||||
|
@ -98,7 +99,8 @@ class PdfToEntries(TextToEntries):
|
||||||
try:
|
try:
|
||||||
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path
|
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path
|
||||||
timestamp_now = datetime.utcnow().timestamp()
|
timestamp_now = datetime.utcnow().timestamp()
|
||||||
tmp_file = f"tmp_pdf_file_{timestamp_now}.pdf"
|
random_suffix = random.randint(0, 1000)
|
||||||
|
tmp_file = f"tmp_pdf_file_{timestamp_now}_{random_suffix}.pdf"
|
||||||
pdf_entry_by_pages = []
|
pdf_entry_by_pages = []
|
||||||
with open(f"{tmp_file}", "wb") as f:
|
with open(f"{tmp_file}", "wb") as f:
|
||||||
f.write(pdf_file)
|
f.write(pdf_file)
|
||||||
|
|
Loading…
Reference in a new issue