mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Add an API that takes a document and converts it to plain text
This commit is contained in:
parent
3b1e8462cd
commit
394035136d
1 changed file with 56 additions and 5 deletions
|
@ -36,16 +36,18 @@ from khoj.database.models import (
|
||||||
LocalPlaintextConfig,
|
LocalPlaintextConfig,
|
||||||
NotionConfig,
|
NotionConfig,
|
||||||
)
|
)
|
||||||
|
from khoj.processor.content.docx.docx_to_entries import DocxToEntries
|
||||||
|
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
|
||||||
from khoj.routers.helpers import (
|
from khoj.routers.helpers import (
|
||||||
ApiIndexedDataLimiter,
|
ApiIndexedDataLimiter,
|
||||||
CommonQueryParams,
|
CommonQueryParams,
|
||||||
configure_content,
|
configure_content,
|
||||||
|
get_file_content,
|
||||||
get_user_config,
|
get_user_config,
|
||||||
update_telemetry_state,
|
update_telemetry_state,
|
||||||
)
|
)
|
||||||
from khoj.utils import constants, state
|
from khoj.utils import constants, state
|
||||||
from khoj.utils.config import SearchModels
|
from khoj.utils.config import SearchModels
|
||||||
from khoj.utils.helpers import get_file_type
|
|
||||||
from khoj.utils.rawconfig import (
|
from khoj.utils.rawconfig import (
|
||||||
ContentConfig,
|
ContentConfig,
|
||||||
FullConfig,
|
FullConfig,
|
||||||
|
@ -375,6 +377,54 @@ async def delete_content_source(
|
||||||
return {"status": "ok"}
|
return {"status": "ok"}
|
||||||
|
|
||||||
|
|
||||||
|
@api_content.post("/convert", status_code=200)
@requires(["authenticated"])
async def convert_documents(
    request: Request,
    files: List[UploadFile],
    client: Optional[str] = None,
):
    """Convert uploaded documents to text and return them as a JSON array.

    Each supported upload produces one object with the keys ``name``,
    ``content`` (the extracted text), ``file_type`` and ``size`` (length of
    the extracted text in UTF-8 bytes). Uploads whose detected type is not
    in the supported list are skipped with a warning.

    Args:
        request: Incoming request, forwarded to telemetry.
        files: Uploaded files to convert.
        client: Optional client identifier, used in log messages and telemetry.

    Returns:
        A ``Response`` with media type ``application/json`` whose body is the
        JSON-encoded list of converted files.
    """
    converted_files = []
    supported_files = ["org", "markdown", "pdf", "plaintext", "docx"]

    for file in files:
        file_data = get_file_content(file)

        if file_data.file_type not in supported_files:
            logger.warning(f"Skipped converting unsupported file type sent by {client} client: {file.filename}")
            continue

        if file_data.file_type == "docx":
            # Binary formats go straight to their extractor; decoding them as
            # text first is wasted work and can fail on arbitrary bytes.
            extracted_content = "\n".join(DocxToEntries.extract_text(file_data.content))
        elif file_data.file_type == "pdf":
            extracted_content = "\n".join(PdfToEntries.extract_text(file_data.content))
        elif file_data.encoding:
            extracted_content = file_data.content.decode(file_data.encoding)
        elif isinstance(file_data.content, (bytes, bytearray)):
            # No encoding was reported for a text upload. Fall back to UTF-8 so
            # the content is a str: bytes have no .encode() and are not JSON
            # serialisable, which previously made the request fail.
            extracted_content = file_data.content.decode("utf-8", errors="replace")
        else:
            extracted_content = file_data.content

        converted_files.append(
            {
                "name": file_data.name,
                "content": extracted_content,
                "file_type": file_data.file_type,
                "size": len(extracted_content.encode("utf-8")),
            }
        )

    update_telemetry_state(
        request=request,
        telemetry_type="api",
        api="convert_documents",
        client=client,
    )

    return Response(content=json.dumps(converted_files), media_type="application/json", status_code=200)
|
||||||
|
|
||||||
|
|
||||||
async def indexer(
|
async def indexer(
|
||||||
request: Request,
|
request: Request,
|
||||||
files: list[UploadFile],
|
files: list[UploadFile],
|
||||||
|
@ -398,10 +448,11 @@ async def indexer(
|
||||||
try:
|
try:
|
||||||
logger.info(f"📬 Updating content index via API call by {client} client")
|
logger.info(f"📬 Updating content index via API call by {client} client")
|
||||||
for file in files:
|
for file in files:
|
||||||
file_content = file.file.read()
|
file_data = get_file_content(file)
|
||||||
file_type, encoding = get_file_type(file.content_type, file_content)
|
if file_data.file_type in index_files:
|
||||||
if file_type in index_files:
|
index_files[file_data.file_type][file_data.filename] = (
|
||||||
index_files[file_type][file.filename] = file_content.decode(encoding) if encoding else file_content
|
file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
|
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue