Wire up the PDF-to-JSONL processor to the Khoj server layer (API, config)

- Specify PDF content to index via khoj.yml
- Index PDF content on app start and on reconfigure
- Expose PDF as a search type via API
This commit is contained in:
Debanjum Singh Solanky 2023-06-01 09:55:48 +05:30
parent d63194c3a9
commit acd14a5e41
9 changed files with 40 additions and 3 deletions

View file

@ -21,6 +21,7 @@ services:
- ./tests/data/ledger/:/data/ledger/ - ./tests/data/ledger/:/data/ledger/
- ./tests/data/music/:/data/music/ - ./tests/data/music/:/data/music/
- ./tests/data/markdown/:/data/markdown/ - ./tests/data/markdown/:/data/markdown/
- ./tests/data/pdf/:/data/pdf/
# Embeddings and models are populated after the first run # Embeddings and models are populated after the first run
# You can set these volumes to point to empty directories on host # You can set these volumes to point to empty directories on host
- ./tests/data/embeddings/:/data/embeddings/ - ./tests/data/embeddings/:/data/embeddings/

View file

@ -21,6 +21,7 @@ keywords = [
"markdown", "markdown",
"beancount", "beancount",
"images", "images",
"pdf",
] ]
classifiers = [ classifiers = [
"Development Status :: 4 - Beta", "Development Status :: 4 - Beta",

View file

@ -15,6 +15,7 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
from khoj.search_type import image_search, text_search from khoj.search_type import image_search, text_search
from khoj.utils import constants, state from khoj.utils import constants, state
from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
@ -132,6 +133,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
filters=[DateFilter(), WordFilter(), FileFilter()], filters=[DateFilter(), WordFilter(), FileFilter()],
) )
# Initialize PDF Search
if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf:
logger.info("💸 Setting up search for pdf")
# Extract Entries, Generate PDF Embeddings
model.pdf_search = text_search.setup(
PdfToJsonl,
config.content_type.pdf,
search_config=config.search_type.asymmetric,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()],
)
# Initialize Image Search # Initialize Image Search
if (t == state.SearchType.Image or t == None) and config.content_type.image: if (t == state.SearchType.Image or t == None) and config.content_type.image:
logger.info("🌄 Setting up search for images") logger.info("🌄 Setting up search for images")

View file

@ -42,6 +42,8 @@ class FileBrowser(QtWidgets.QWidget):
return "Beancount Files (*.bean *.beancount)" return "Beancount Files (*.bean *.beancount)"
elif search_type == SearchType.Markdown: elif search_type == SearchType.Markdown:
return "Markdown Files (*.md *.markdown)" return "Markdown Files (*.md *.markdown)"
elif search_type == SearchType.Pdf:
return "Pdf Files (*.pdf)"
elif search_type == SearchType.Music: elif search_type == SearchType.Music:
return "Org-Music Files (*.org)" return "Org-Music Files (*.org)"
elif search_type == SearchType.Image: elif search_type == SearchType.Image:

View file

@ -109,6 +109,17 @@ def search(
with timer("Collating results took", logger): with timer("Collating results took", logger):
results = text_search.collate_results(hits, entries, results_count) results = text_search.collate_results(hits, entries, results_count)
elif (t == SearchType.Pdf or t == None) and state.model.pdf_search:
# query pdf files
with timer("Query took", logger):
hits, entries = text_search.query(
user_query, state.model.pdf_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
)
# collate and return results
with timer("Collating results took", logger):
results = text_search.collate_results(hits, entries, results_count)
elif (t == SearchType.Ledger or t == None) and state.model.ledger_search: elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
# query transactions # query transactions
with timer("Query took", logger): with timer("Query took", logger):

View file

@ -22,6 +22,7 @@ class SearchType(str, Enum):
Music = "music" Music = "music"
Markdown = "markdown" Markdown = "markdown"
Image = "image" Image = "image"
Pdf = "pdf"
class ProcessorType(str, Enum): class ProcessorType(str, Enum):
@ -61,6 +62,7 @@ class SearchModels:
ledger_search: TextSearchModel = None ledger_search: TextSearchModel = None
music_search: TextSearchModel = None music_search: TextSearchModel = None
markdown_search: TextSearchModel = None markdown_search: TextSearchModel = None
pdf_search: TextSearchModel = None
image_search: ImageSearchModel = None image_search: ImageSearchModel = None
plugin_search: Dict[str, TextSearchModel] = None plugin_search: Dict[str, TextSearchModel] = None

View file

@ -28,6 +28,12 @@ default_config = {
"compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz", "compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz",
"embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt", "embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt",
}, },
"pdf": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
"embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
},
"image": { "image": {
"input-directories": None, "input-directories": None,
"input-filter": None, "input-filter": None,

View file

@ -56,6 +56,7 @@ class ContentConfig(ConfigBase):
image: Optional[ImageContentConfig] image: Optional[ImageContentConfig]
music: Optional[TextContentConfig] music: Optional[TextContentConfig]
markdown: Optional[TextContentConfig] markdown: Optional[TextContentConfig]
pdf: Optional[TextContentConfig]
plugins: Optional[Dict[str, TextContentConfig]] plugins: Optional[Dict[str, TextContentConfig]]

View file

@ -34,7 +34,7 @@ def test_search_with_invalid_content_type(client):
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_search_with_valid_content_type(client): def test_search_with_valid_content_type(client):
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]: for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
# Act # Act
response = client.get(f"/api/search?q=random&t={content_type}") response = client.get(f"/api/search?q=random&t={content_type}")
# Assert # Assert
@ -52,7 +52,7 @@ def test_update_with_invalid_content_type(client):
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_update_with_valid_content_type(client): def test_update_with_valid_content_type(client):
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]: for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
# Act # Act
response = client.get(f"/api/update?t={content_type}") response = client.get(f"/api/update?t={content_type}")
# Assert # Assert
@ -70,7 +70,7 @@ def test_regenerate_with_invalid_content_type(client):
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_regenerate_with_valid_content_type(client): def test_regenerate_with_valid_content_type(client):
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]: for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
# Act # Act
response = client.get(f"/api/update?force=true&t={content_type}") response = client.get(f"/api/update?force=true&t={content_type}")
# Assert # Assert