def truncate_code_context(original_code_results: dict[str, Any], max_chars=10000) -> dict[str, Any]:
    """
    Truncate large output files and drop image file data from code results.

    The input is deep-copied first, so the caller's data is never modified.

    Args:
        original_code_results: Mapping whose values are dicts holding a "results" dict.
            Each "results" dict may carry an "output_files" list of entries with
            "filename" and "b64_data" keys.
        max_chars: Maximum number of characters of file data to keep per output file.

    Returns:
        A copy of the code results in which image file data is replaced by a short
        placeholder and any other file data longer than max_chars is cut down to
        max_chars characters followed by "...". Replaced entries keep only the
        "filename" and "b64_data" keys.
    """
    image_suffixes = {".png", ".jpg", ".jpeg", ".webp"}
    # Create a deep copy of the code results to avoid modifying the original data
    code_results = copy.deepcopy(original_code_results)
    for code_result in code_results.values():
        # Results without any output files (missing or None) have nothing to trim
        output_files = code_result["results"].get("output_files") or []
        for idx, output_file in enumerate(output_files):
            filename = output_file["filename"]
            # Drop image files from code results.
            # Suffix is compared case-insensitively so files like "plot.PNG" are caught too.
            if Path(filename).suffix.lower() in image_suffixes:
                output_files[idx] = {
                    "filename": filename,
                    "b64_data": "[placeholder for generated image data for brevity]",
                }
                continue
            # Truncate large output files. Entries without data are left untouched.
            file_data = output_file.get("b64_data") or ""
            if len(file_data) > max_chars:
                output_files[idx] = {
                    "filename": filename,
                    "b64_data": file_data[:max_chars] + "...",
                }
    return code_results
this_iteration.warning: