mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Wire up the PDF-to-JSONL processor to the Khoj server layer (API, config)
- Specify PDF content to index via khoj.yml
- Index PDF content on app start and on reconfigure
- Expose PDF as a search type via the API
This commit is contained in:
parent
d63194c3a9
commit
acd14a5e41
9 changed files with 40 additions and 3 deletions
|
@ -21,6 +21,7 @@ services:
|
|||
- ./tests/data/ledger/:/data/ledger/
|
||||
- ./tests/data/music/:/data/music/
|
||||
- ./tests/data/markdown/:/data/markdown/
|
||||
- ./tests/data/pdf/:/data/pdf/
|
||||
# Embeddings and models are populated after the first run
|
||||
# You can set these volumes to point to empty directories on host
|
||||
- ./tests/data/embeddings/:/data/embeddings/
|
||||
|
|
|
@ -21,6 +21,7 @@ keywords = [
|
|||
"markdown",
|
||||
"beancount",
|
||||
"images",
|
||||
"pdf",
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
|
|
|
@ -15,6 +15,7 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
|
|||
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.utils import constants, state
|
||||
from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
|
||||
|
@ -132,6 +133,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
|||
filters=[DateFilter(), WordFilter(), FileFilter()],
|
||||
)
|
||||
|
||||
# Initialize PDF Search
|
||||
if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf:
|
||||
logger.info("💸 Setting up search for pdf")
|
||||
# Extract Entries, Generate PDF Embeddings
|
||||
model.pdf_search = text_search.setup(
|
||||
PdfToJsonl,
|
||||
config.content_type.pdf,
|
||||
search_config=config.search_type.asymmetric,
|
||||
regenerate=regenerate,
|
||||
filters=[DateFilter(), WordFilter(), FileFilter()],
|
||||
)
|
||||
|
||||
# Initialize Image Search
|
||||
if (t == state.SearchType.Image or t == None) and config.content_type.image:
|
||||
logger.info("🌄 Setting up search for images")
|
||||
|
|
|
@ -42,6 +42,8 @@ class FileBrowser(QtWidgets.QWidget):
|
|||
return "Beancount Files (*.bean *.beancount)"
|
||||
elif search_type == SearchType.Markdown:
|
||||
return "Markdown Files (*.md *.markdown)"
|
||||
elif search_type == SearchType.Pdf:
|
||||
return "Pdf Files (*.pdf)"
|
||||
elif search_type == SearchType.Music:
|
||||
return "Org-Music Files (*.org)"
|
||||
elif search_type == SearchType.Image:
|
||||
|
|
|
@ -109,6 +109,17 @@ def search(
|
|||
with timer("Collating results took", logger):
|
||||
results = text_search.collate_results(hits, entries, results_count)
|
||||
|
||||
elif (t == SearchType.Pdf or t == None) and state.model.pdf_search:
|
||||
# query pdf files
|
||||
with timer("Query took", logger):
|
||||
hits, entries = text_search.query(
|
||||
user_query, state.model.pdf_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
|
||||
)
|
||||
|
||||
# collate and return results
|
||||
with timer("Collating results took", logger):
|
||||
results = text_search.collate_results(hits, entries, results_count)
|
||||
|
||||
elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
|
||||
# query transactions
|
||||
with timer("Query took", logger):
|
||||
|
|
|
@ -22,6 +22,7 @@ class SearchType(str, Enum):
|
|||
Music = "music"
|
||||
Markdown = "markdown"
|
||||
Image = "image"
|
||||
Pdf = "pdf"
|
||||
|
||||
|
||||
class ProcessorType(str, Enum):
|
||||
|
@ -61,6 +62,7 @@ class SearchModels:
|
|||
ledger_search: TextSearchModel = None
|
||||
music_search: TextSearchModel = None
|
||||
markdown_search: TextSearchModel = None
|
||||
pdf_search: TextSearchModel = None
|
||||
image_search: ImageSearchModel = None
|
||||
plugin_search: Dict[str, TextSearchModel] = None
|
||||
|
||||
|
|
|
@ -28,6 +28,12 @@ default_config = {
|
|||
"compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz",
|
||||
"embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt",
|
||||
},
|
||||
"pdf": {
|
||||
"input-files": None,
|
||||
"input-filter": None,
|
||||
"compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
|
||||
"embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
|
||||
},
|
||||
"image": {
|
||||
"input-directories": None,
|
||||
"input-filter": None,
|
||||
|
|
|
@ -56,6 +56,7 @@ class ContentConfig(ConfigBase):
|
|||
image: Optional[ImageContentConfig]
|
||||
music: Optional[TextContentConfig]
|
||||
markdown: Optional[TextContentConfig]
|
||||
pdf: Optional[TextContentConfig]
|
||||
plugins: Optional[Dict[str, TextContentConfig]]
|
||||
|
||||
|
||||
|
|
|
@ -34,7 +34,7 @@ def test_search_with_invalid_content_type(client):
|
|||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_search_with_valid_content_type(client):
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
|
||||
# Act
|
||||
response = client.get(f"/api/search?q=random&t={content_type}")
|
||||
# Assert
|
||||
|
@ -52,7 +52,7 @@ def test_update_with_invalid_content_type(client):
|
|||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_update_with_valid_content_type(client):
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
|
||||
# Act
|
||||
response = client.get(f"/api/update?t={content_type}")
|
||||
# Assert
|
||||
|
@ -70,7 +70,7 @@ def test_regenerate_with_invalid_content_type(client):
|
|||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_regenerate_with_valid_content_type(client):
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
|
||||
# Act
|
||||
response = client.get(f"/api/update?force=true&t={content_type}")
|
||||
# Assert
|
||||
|
|
Loading…
Reference in a new issue