mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Remove unused dump_jsonl method
The entries index is stored in gzipped jsonl files for each content type
This commit is contained in:
parent
9bcca43299
commit
6e70b914c2
8 changed files with 16 additions and 45 deletions
|
@ -13,7 +13,7 @@ from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
|
|||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
|
||||
|
@ -97,10 +97,7 @@ class GithubToJsonl(TextToJsonl):
|
|||
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if self.config.compressed_jsonl.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
||||
elif self.config.compressed_jsonl.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, self.config.compressed_jsonl)
|
||||
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ from typing import List
|
|||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.helpers import get_absolute_path, timer
|
||||
from khoj.utils.jsonl import load_jsonl, dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import load_jsonl, compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
|
||||
|
@ -48,10 +48,7 @@ class JsonlToJsonl(TextToJsonl):
|
|||
jsonl_data = JsonlToJsonl.convert_entries_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if output_file.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
elif output_file.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, output_file)
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ from typing import List
|
|||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
||||
from khoj.utils.constants import empty_escape_sequences
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry, TextContentConfig
|
||||
|
||||
|
||||
|
@ -61,10 +61,7 @@ class MarkdownToJsonl(TextToJsonl):
|
|||
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if output_file.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
elif output_file.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, output_file)
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ import requests
|
|||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry, NotionContentConfig
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
from enum import Enum
|
||||
|
@ -250,9 +250,6 @@ class NotionToJsonl(TextToJsonl):
|
|||
jsonl_data = TextToJsonl.convert_text_maps_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if self.config.compressed_jsonl.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
||||
elif self.config.compressed_jsonl.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, self.config.compressed_jsonl)
|
||||
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
||||
|
||||
return entries_with_ids
|
||||
|
|
|
@ -8,7 +8,7 @@ from typing import Iterable, List
|
|||
from khoj.processor.org_mode import orgnode
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry, TextContentConfig
|
||||
from khoj.utils import state
|
||||
|
||||
|
@ -62,10 +62,7 @@ class OrgToJsonl(TextToJsonl):
|
|||
jsonl_data = self.convert_org_entries_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if output_file.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
elif output_file.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, output_file)
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ from langchain.document_loaders import PyPDFLoader
|
|||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
|
||||
|
@ -55,10 +55,7 @@ class PdfToJsonl(TextToJsonl):
|
|||
jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if output_file.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
elif output_file.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, output_file)
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ def load_jsonl(input_path):
|
|||
# Open JSONL file
|
||||
if input_path.suffix == ".gz":
|
||||
jsonl_file = gzip.open(get_absolute_path(input_path), "rt", encoding="utf-8")
|
||||
elif input_path.suffix == ".jsonl":
|
||||
else:
|
||||
jsonl_file = open(get_absolute_path(input_path), "r", encoding="utf-8")
|
||||
|
||||
# Read JSONL file
|
||||
|
@ -36,17 +36,6 @@ def load_jsonl(input_path):
|
|||
return data
|
||||
|
||||
|
||||
def dump_jsonl(jsonl_data, output_path):
|
||||
"Write List of JSON objects to JSON line file"
|
||||
# Create output directory, if it doesn't exist
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
f.write(jsonl_data)
|
||||
|
||||
logger.debug(f"Wrote jsonl data to {output_path}")
|
||||
|
||||
|
||||
def compress_jsonl_data(jsonl_data, output_path):
|
||||
# Create output directory, if it doesn't exist
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
|
|
@ -90,7 +90,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
|
|||
content_config.org = TextContentConfig(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/org/*.org"],
|
||||
compressed_jsonl=content_dir.joinpath("notes.jsonl"),
|
||||
compressed_jsonl=content_dir.joinpath("notes.jsonl.gz"),
|
||||
embeddings_file=content_dir.joinpath("note_embeddings.pt"),
|
||||
)
|
||||
|
||||
|
@ -101,7 +101,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
|
|||
|
||||
content_config.plugins = {
|
||||
"plugin1": TextContentConfig(
|
||||
input_files=[content_dir.joinpath("notes.jsonl")],
|
||||
input_files=[content_dir.joinpath("notes.jsonl.gz")],
|
||||
input_filter=None,
|
||||
compressed_jsonl=content_dir.joinpath("plugin.jsonl.gz"),
|
||||
embeddings_file=content_dir.joinpath("plugin_embeddings.pt"),
|
||||
|
@ -142,7 +142,7 @@ def md_content_config(tmp_path_factory):
|
|||
content_config.markdown = TextContentConfig(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/markdown/*.markdown"],
|
||||
compressed_jsonl=content_dir.joinpath("markdown.jsonl"),
|
||||
compressed_jsonl=content_dir.joinpath("markdown.jsonl.gz"),
|
||||
embeddings_file=content_dir.joinpath("markdown_embeddings.pt"),
|
||||
)
|
||||
|
||||
|
|
Loading…
Reference in a new issue