Remove unused dump_jsonl method

The entries index is stored in gzipped jsonl files for each content type
This commit is contained in:
Debanjum Singh Solanky 2023-07-15 20:42:26 -07:00
parent 9bcca43299
commit 6e70b914c2
8 changed files with 16 additions and 45 deletions

View file

@ -13,7 +13,7 @@ from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils.jsonl import compress_jsonl_data
from khoj.utils.rawconfig import Entry
@ -97,10 +97,7 @@ class GithubToJsonl(TextToJsonl):
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
# Compress JSONL formatted Data
if self.config.compressed_jsonl.suffix == ".gz":
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
elif self.config.compressed_jsonl.suffix == ".jsonl":
dump_jsonl(jsonl_data, self.config.compressed_jsonl)
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
return entries_with_ids

View file

@ -7,7 +7,7 @@ from typing import List
# Internal Packages
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.helpers import get_absolute_path, timer
from khoj.utils.jsonl import load_jsonl, dump_jsonl, compress_jsonl_data
from khoj.utils.jsonl import load_jsonl, compress_jsonl_data
from khoj.utils.rawconfig import Entry
@ -48,10 +48,7 @@ class JsonlToJsonl(TextToJsonl):
jsonl_data = JsonlToJsonl.convert_entries_to_jsonl(entries)
# Compress JSONL formatted Data
if output_file.suffix == ".gz":
compress_jsonl_data(jsonl_data, output_file)
elif output_file.suffix == ".jsonl":
dump_jsonl(jsonl_data, output_file)
compress_jsonl_data(jsonl_data, output_file)
return entries_with_ids

View file

@ -10,7 +10,7 @@ from typing import List
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
from khoj.utils.constants import empty_escape_sequences
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils.jsonl import compress_jsonl_data
from khoj.utils.rawconfig import Entry, TextContentConfig
@ -61,10 +61,7 @@ class MarkdownToJsonl(TextToJsonl):
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
# Compress JSONL formatted Data
if output_file.suffix == ".gz":
compress_jsonl_data(jsonl_data, output_file)
elif output_file.suffix == ".jsonl":
dump_jsonl(jsonl_data, output_file)
compress_jsonl_data(jsonl_data, output_file)
return entries_with_ids

View file

@ -8,7 +8,7 @@ import requests
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry, NotionContentConfig
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils.jsonl import compress_jsonl_data
from khoj.utils.rawconfig import Entry
from enum import Enum
@ -250,9 +250,6 @@ class NotionToJsonl(TextToJsonl):
jsonl_data = TextToJsonl.convert_text_maps_to_jsonl(entries)
# Compress JSONL formatted Data
if self.config.compressed_jsonl.suffix == ".gz":
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
elif self.config.compressed_jsonl.suffix == ".jsonl":
dump_jsonl(jsonl_data, self.config.compressed_jsonl)
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
return entries_with_ids

View file

@ -8,7 +8,7 @@ from typing import Iterable, List
from khoj.processor.org_mode import orgnode
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils.jsonl import compress_jsonl_data
from khoj.utils.rawconfig import Entry, TextContentConfig
from khoj.utils import state
@ -62,10 +62,7 @@ class OrgToJsonl(TextToJsonl):
jsonl_data = self.convert_org_entries_to_jsonl(entries)
# Compress JSONL formatted Data
if output_file.suffix == ".gz":
compress_jsonl_data(jsonl_data, output_file)
elif output_file.suffix == ".jsonl":
dump_jsonl(jsonl_data, output_file)
compress_jsonl_data(jsonl_data, output_file)
return entries_with_ids

View file

@ -10,7 +10,7 @@ from langchain.document_loaders import PyPDFLoader
# Internal Packages
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils.jsonl import compress_jsonl_data
from khoj.utils.rawconfig import Entry
@ -55,10 +55,7 @@ class PdfToJsonl(TextToJsonl):
jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries)
# Compress JSONL formatted Data
if output_file.suffix == ".gz":
compress_jsonl_data(jsonl_data, output_file)
elif output_file.suffix == ".jsonl":
dump_jsonl(jsonl_data, output_file)
compress_jsonl_data(jsonl_data, output_file)
return entries_with_ids

View file

@ -20,7 +20,7 @@ def load_jsonl(input_path):
# Open JSONL file
if input_path.suffix == ".gz":
jsonl_file = gzip.open(get_absolute_path(input_path), "rt", encoding="utf-8")
elif input_path.suffix == ".jsonl":
else:
jsonl_file = open(get_absolute_path(input_path), "r", encoding="utf-8")
# Read JSONL file
@ -36,17 +36,6 @@ def load_jsonl(input_path):
return data
def dump_jsonl(jsonl_data, output_path):
"Write List of JSON objects to JSON line file"
# Create output directory, if it doesn't exist
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
f.write(jsonl_data)
logger.debug(f"Wrote jsonl data to {output_path}")
def compress_jsonl_data(jsonl_data, output_path):
# Create output directory, if it doesn't exist
output_path.parent.mkdir(parents=True, exist_ok=True)

View file

@ -90,7 +90,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
content_config.org = TextContentConfig(
input_files=None,
input_filter=["tests/data/org/*.org"],
compressed_jsonl=content_dir.joinpath("notes.jsonl"),
compressed_jsonl=content_dir.joinpath("notes.jsonl.gz"),
embeddings_file=content_dir.joinpath("note_embeddings.pt"),
)
@ -101,7 +101,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
content_config.plugins = {
"plugin1": TextContentConfig(
input_files=[content_dir.joinpath("notes.jsonl")],
input_files=[content_dir.joinpath("notes.jsonl.gz")],
input_filter=None,
compressed_jsonl=content_dir.joinpath("plugin.jsonl.gz"),
embeddings_file=content_dir.joinpath("plugin_embeddings.pt"),
@ -142,7 +142,7 @@ def md_content_config(tmp_path_factory):
content_config.markdown = TextContentConfig(
input_files=None,
input_filter=["tests/data/markdown/*.markdown"],
compressed_jsonl=content_dir.joinpath("markdown.jsonl"),
compressed_jsonl=content_dir.joinpath("markdown.jsonl.gz"),
embeddings_file=content_dir.joinpath("markdown_embeddings.pt"),
)