From f209e30a3b320f3a0fd616b7cb3fffe2e2cd0847 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 26 Dec 2022 13:14:15 -0300 Subject: [PATCH] Split entries by max tokens while converting Markdown entries To JSONL --- src/processor/markdown/markdown_to_jsonl.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index 5c4d660d..17482de5 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -35,6 +35,12 @@ class MarkdownToJsonl(TextToJsonl): end = time.time() logger.debug(f"Parse entries from Markdown files into dictionaries: {end - start} seconds") + # Split entries by max tokens supported by model + start = time.time() + current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) + end = time.time() + logger.debug(f"Split entries by max token size supported by model: {end - start} seconds") + # Identify, mark and merge any new entries with previous entries start = time.time() if not previous_entries: