From 5475a262d471bd7f698b1ee996cca224fed362d8 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Thu, 21 Nov 2024 14:27:39 -0800 Subject: [PATCH] Move truncate code context func for reusability across modules It needs to be used across routers and processors. It being in run_code tool makes it hard to be used in other chat provider contexts due to circular dependency issues created by send_message_to_model_wrapper func --- src/khoj/processor/tools/run_code.py | 26 +------------------------- src/khoj/routers/research.py | 3 ++- src/khoj/utils/helpers.py | 26 +++++++++++++++++++++++++- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/src/khoj/processor/tools/run_code.py b/src/khoj/processor/tools/run_code.py index 8f2f170a..22c8eca5 100644 --- a/src/khoj/processor/tools/run_code.py +++ b/src/khoj/processor/tools/run_code.py @@ -1,5 +1,4 @@ import base64 -import copy import datetime import json import logging @@ -20,7 +19,7 @@ from khoj.processor.conversation.utils import ( construct_chat_history, ) from khoj.routers.helpers import send_message_to_model_wrapper -from khoj.utils.helpers import is_none_or_empty, timer +from khoj.utils.helpers import is_none_or_empty, timer, truncate_code_context from khoj.utils.rawconfig import LocationData logger = logging.getLogger(__name__) @@ -180,26 +179,3 @@ async def execute_sandboxed_python(code: str, input_data: list[dict], sandbox_ur "std_err": f"Failed to execute code with {response.status}", "output_files": [], } - - -def truncate_code_context(original_code_results: dict[str, Any], max_chars=10000) -> dict[str, Any]: - """ - Truncate large output files and drop image file data from code results. - """ - # Create a deep copy of the code results to avoid modifying the original data - code_results = copy.deepcopy(original_code_results) - for code_result in code_results.values(): - for idx, output_file in enumerate(code_result["results"]["output_files"]): - # Drop image files from code results - if Path(output_file["filename"]).suffix in {".png", ".jpg", ".jpeg", ".webp"}: - code_result["results"]["output_files"][idx] = { - "filename": output_file["filename"], - "b64_data": "[placeholder for generated image data for brevity]", - } - # Truncate large output files - elif len(output_file["b64_data"]) > max_chars: - code_result["results"]["output_files"][idx] = { - "filename": output_file["filename"], - "b64_data": output_file["b64_data"][:max_chars] + "...", - } - return code_results diff --git a/src/khoj/routers/research.py b/src/khoj/routers/research.py index 562d48d9..efeb3787 100644 --- a/src/khoj/routers/research.py +++ b/src/khoj/routers/research.py @@ -16,7 +16,7 @@ from khoj.processor.conversation.utils import ( construct_tool_chat_history, ) from khoj.processor.tools.online_search import read_webpages, search_online -from khoj.processor.tools.run_code import run_code, truncate_code_context +from khoj.processor.tools.run_code import run_code from khoj.routers.api import extract_references_and_questions from khoj.routers.helpers import ( ChatEvent, @@ -28,6 +28,7 @@ from khoj.utils.helpers import ( function_calling_description_for_llm, is_none_or_empty, timer, + truncate_code_context, ) from khoj.utils.rawconfig import LocationData diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 187e9062..36e0e1d2 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -1,5 +1,6 @@ from __future__ import annotations # to avoid quoting type hints +import copy import datetime import io import ipaddress @@ -18,7 +19,7 @@ from itertools import islice from os import path from pathlib import Path from time import perf_counter -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union from urllib.parse import urlparse import psutil @@ -527,6 +528,29 @@ def convert_image_to_webp(image_bytes): return webp_image_bytes +def truncate_code_context(original_code_results: dict[str, Any], max_chars=10000) -> dict[str, Any]: + """ + Truncate large output files and drop image file data from code results. + """ + # Create a deep copy of the code results to avoid modifying the original data + code_results = copy.deepcopy(original_code_results) + for code_result in code_results.values(): + for idx, output_file in enumerate(code_result["results"]["output_files"]): + # Drop image files from code results + if Path(output_file["filename"]).suffix in {".png", ".jpg", ".jpeg", ".webp"}: + code_result["results"]["output_files"][idx] = { + "filename": output_file["filename"], + "b64_data": "[placeholder for generated image data for brevity]", + } + # Truncate large output files + elif len(output_file["b64_data"]) > max_chars: + code_result["results"]["output_files"][idx] = { + "filename": output_file["filename"], + "b64_data": output_file["b64_data"][:max_chars] + "...", + } + return code_results + + @lru_cache def tz_to_cc_map() -> dict[str, str]: """Create a mapping of timezone to country code"""