mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Wire up PDF to jsonl processor to Khoj server layer (API, config)
- Specify PDF content to index via khoj.yml
- Index PDF content on app start and reconfigure
- Expose PDF as a search type via API
This commit is contained in:
parent
d63194c3a9
commit
acd14a5e41
9 changed files with 40 additions and 3 deletions
|
@ -21,6 +21,7 @@ services:
|
||||||
- ./tests/data/ledger/:/data/ledger/
|
- ./tests/data/ledger/:/data/ledger/
|
||||||
- ./tests/data/music/:/data/music/
|
- ./tests/data/music/:/data/music/
|
||||||
- ./tests/data/markdown/:/data/markdown/
|
- ./tests/data/markdown/:/data/markdown/
|
||||||
|
- ./tests/data/pdf/:/data/pdf/
|
||||||
# Embeddings and models are populated after the first run
|
# Embeddings and models are populated after the first run
|
||||||
# You can set these volumes to point to empty directories on host
|
# You can set these volumes to point to empty directories on host
|
||||||
- ./tests/data/embeddings/:/data/embeddings/
|
- ./tests/data/embeddings/:/data/embeddings/
|
||||||
|
|
|
@ -21,6 +21,7 @@ keywords = [
|
||||||
"markdown",
|
"markdown",
|
||||||
"beancount",
|
"beancount",
|
||||||
"images",
|
"images",
|
||||||
|
"pdf",
|
||||||
]
|
]
|
||||||
classifiers = [
|
classifiers = [
|
||||||
"Development Status :: 4 - Beta",
|
"Development Status :: 4 - Beta",
|
||||||
|
|
|
@ -15,6 +15,7 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
|
||||||
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
||||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||||
|
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||||
from khoj.search_type import image_search, text_search
|
from khoj.search_type import image_search, text_search
|
||||||
from khoj.utils import constants, state
|
from khoj.utils import constants, state
|
||||||
from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
|
from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
|
||||||
|
@ -132,6 +133,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
||||||
filters=[DateFilter(), WordFilter(), FileFilter()],
|
filters=[DateFilter(), WordFilter(), FileFilter()],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Initialize PDF Search
|
||||||
|
if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf:
|
||||||
|
logger.info("💸 Setting up search for pdf")
|
||||||
|
# Extract Entries, Generate PDF Embeddings
|
||||||
|
model.pdf_search = text_search.setup(
|
||||||
|
PdfToJsonl,
|
||||||
|
config.content_type.pdf,
|
||||||
|
search_config=config.search_type.asymmetric,
|
||||||
|
regenerate=regenerate,
|
||||||
|
filters=[DateFilter(), WordFilter(), FileFilter()],
|
||||||
|
)
|
||||||
|
|
||||||
# Initialize Image Search
|
# Initialize Image Search
|
||||||
if (t == state.SearchType.Image or t == None) and config.content_type.image:
|
if (t == state.SearchType.Image or t == None) and config.content_type.image:
|
||||||
logger.info("🌄 Setting up search for images")
|
logger.info("🌄 Setting up search for images")
|
||||||
|
|
|
@ -42,6 +42,8 @@ class FileBrowser(QtWidgets.QWidget):
|
||||||
return "Beancount Files (*.bean *.beancount)"
|
return "Beancount Files (*.bean *.beancount)"
|
||||||
elif search_type == SearchType.Markdown:
|
elif search_type == SearchType.Markdown:
|
||||||
return "Markdown Files (*.md *.markdown)"
|
return "Markdown Files (*.md *.markdown)"
|
||||||
|
elif search_type == SearchType.Pdf:
|
||||||
|
return "Pdf Files (*.pdf)"
|
||||||
elif search_type == SearchType.Music:
|
elif search_type == SearchType.Music:
|
||||||
return "Org-Music Files (*.org)"
|
return "Org-Music Files (*.org)"
|
||||||
elif search_type == SearchType.Image:
|
elif search_type == SearchType.Image:
|
||||||
|
|
|
@ -109,6 +109,17 @@ def search(
|
||||||
with timer("Collating results took", logger):
|
with timer("Collating results took", logger):
|
||||||
results = text_search.collate_results(hits, entries, results_count)
|
results = text_search.collate_results(hits, entries, results_count)
|
||||||
|
|
||||||
|
elif (t == SearchType.Pdf or t == None) and state.model.pdf_search:
|
||||||
|
# query pdf files
|
||||||
|
with timer("Query took", logger):
|
||||||
|
hits, entries = text_search.query(
|
||||||
|
user_query, state.model.pdf_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
|
||||||
|
)
|
||||||
|
|
||||||
|
# collate and return results
|
||||||
|
with timer("Collating results took", logger):
|
||||||
|
results = text_search.collate_results(hits, entries, results_count)
|
||||||
|
|
||||||
elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
|
elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
|
||||||
# query transactions
|
# query transactions
|
||||||
with timer("Query took", logger):
|
with timer("Query took", logger):
|
||||||
|
|
|
@ -22,6 +22,7 @@ class SearchType(str, Enum):
|
||||||
Music = "music"
|
Music = "music"
|
||||||
Markdown = "markdown"
|
Markdown = "markdown"
|
||||||
Image = "image"
|
Image = "image"
|
||||||
|
Pdf = "pdf"
|
||||||
|
|
||||||
|
|
||||||
class ProcessorType(str, Enum):
|
class ProcessorType(str, Enum):
|
||||||
|
@ -61,6 +62,7 @@ class SearchModels:
|
||||||
ledger_search: TextSearchModel = None
|
ledger_search: TextSearchModel = None
|
||||||
music_search: TextSearchModel = None
|
music_search: TextSearchModel = None
|
||||||
markdown_search: TextSearchModel = None
|
markdown_search: TextSearchModel = None
|
||||||
|
pdf_search: TextSearchModel = None
|
||||||
image_search: ImageSearchModel = None
|
image_search: ImageSearchModel = None
|
||||||
plugin_search: Dict[str, TextSearchModel] = None
|
plugin_search: Dict[str, TextSearchModel] = None
|
||||||
|
|
||||||
|
|
|
@ -28,6 +28,12 @@ default_config = {
|
||||||
"compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz",
|
"compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz",
|
||||||
"embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt",
|
"embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt",
|
||||||
},
|
},
|
||||||
|
"pdf": {
|
||||||
|
"input-files": None,
|
||||||
|
"input-filter": None,
|
||||||
|
"compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
|
||||||
|
"embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
|
||||||
|
},
|
||||||
"image": {
|
"image": {
|
||||||
"input-directories": None,
|
"input-directories": None,
|
||||||
"input-filter": None,
|
"input-filter": None,
|
||||||
|
|
|
@ -56,6 +56,7 @@ class ContentConfig(ConfigBase):
|
||||||
image: Optional[ImageContentConfig]
|
image: Optional[ImageContentConfig]
|
||||||
music: Optional[TextContentConfig]
|
music: Optional[TextContentConfig]
|
||||||
markdown: Optional[TextContentConfig]
|
markdown: Optional[TextContentConfig]
|
||||||
|
pdf: Optional[TextContentConfig]
|
||||||
plugins: Optional[Dict[str, TextContentConfig]]
|
plugins: Optional[Dict[str, TextContentConfig]]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -34,7 +34,7 @@ def test_search_with_invalid_content_type(client):
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_search_with_valid_content_type(client):
|
def test_search_with_valid_content_type(client):
|
||||||
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
|
for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
|
||||||
# Act
|
# Act
|
||||||
response = client.get(f"/api/search?q=random&t={content_type}")
|
response = client.get(f"/api/search?q=random&t={content_type}")
|
||||||
# Assert
|
# Assert
|
||||||
|
@ -52,7 +52,7 @@ def test_update_with_invalid_content_type(client):
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_update_with_valid_content_type(client):
|
def test_update_with_valid_content_type(client):
|
||||||
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
|
for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
|
||||||
# Act
|
# Act
|
||||||
response = client.get(f"/api/update?t={content_type}")
|
response = client.get(f"/api/update?t={content_type}")
|
||||||
# Assert
|
# Assert
|
||||||
|
@ -70,7 +70,7 @@ def test_regenerate_with_invalid_content_type(client):
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_regenerate_with_valid_content_type(client):
|
def test_regenerate_with_valid_content_type(client):
|
||||||
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
|
for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
|
||||||
# Act
|
# Act
|
||||||
response = client.get(f"/api/update?force=true&t={content_type}")
|
response = client.get(f"/api/update?force=true&t={content_type}")
|
||||||
# Assert
|
# Assert
|
||||||
|
|
Loading…
Reference in a new issue