Improve the prompt to generate images with DALLE3 and SD3

- Major - Ask for prompt in prose - Remove seed from SD3 image generation to improve diversity of output for a given prompt. Otherwise, for conversations with similar sounding prompts, the images would be almost exactly the same. This may be another indicator of SD3's inability to capture detailed instructions - Consistently use "prompt" wording instead of "query" in improved image generation prompts. Previously a mix of those terms was being used, which could confuse the chat model - Minor - Add day of week to prompt - Remove 2-5 sentence limit on instructions to SD3. It seems to be able to follow longer instructions, just with less fidelity than DALLE. And the 2-5 sentence instruction limit wasn't being adhered to - Improve ability to edit and improve the image based on follow-up instructions by the user - Align prompts for DALLE and SD3. The only difference is to wrap text to be rendered in quotes for SD3. This improves its ability to render requested text. DALLE cannot render text as well or consistently
2025-02-17 08:04:21 +00:00 · 2024-07-11 10:28:52 +05:30 · 2024-07-11 10:28:52 +05:30 · 6c1861b319
commit 6c1861b319
parent 260aa61818
2 changed files with 24 additions and 35 deletions
--- a/src/khoj/processor/conversation/prompts.py
+++ b/src/khoj/processor/conversation/prompts.py
@ -121,9 +121,9 @@ User's Notes:
 ## Image Generation
 ## --

-image_generation_improve_prompt_dalle = PromptTemplate.from_template(
-    """
-You are a talented creator. Generate a detailed prompt to generate an image based on the following description. Update the query below to improve the image generation. Add additional context to the query to improve the image generation. Make sure to retain any important information originally from the query. You are provided with the following information to help you generate the prompt:
+image_generation_improve_prompt_base = """
+You are a talented creator with the ability to describe images to compose in vivid, fine detail.
+Use the provided context and user prompt to generate a more detailed prompt to create an image:

 Today's Date: {current_date}
 User's Location: {location}
@ -137,39 +137,29 @@ Online References:
 Conversation Log:
 {chat_history}

-Query: {query}
+User Prompt: "{query}"

-Remember, now you are generating a prompt to improve the image generation. Add additional context to the query to improve the image generation. Make sure to retain any important information originally from the query. Use the additional context from the user's notes, online references and conversation log to improve the image generation.
-Improved Query:"""
+Now generate an improved prompt describing the image to generate in vivid, fine detail.
+- Use today's date, user's location, user's notes and online references to weave in any context that will improve the image generation.
+- Retain any important information and follow any instructions in the conversation log or user prompt.
+- Add specific, fine position details to compose the image.
+- Ensure your improved prompt is in prose format."""
+
+image_generation_improve_prompt_dalle = PromptTemplate.from_template(
+    f"""
+{image_generation_improve_prompt_base}
+
+Improved Prompt:
+""".strip()
 )

 image_generation_improve_prompt_sd = PromptTemplate.from_template(
-    """
-You are a talented creator. Write 2-5 sentences with precise image composition, position details to create an image.
-Use the provided context below to add specific, fine details to the image composition.
-Retain any important information and follow any instructions from the original prompt.
-Put any text to be rendered in the image within double quotes in your improved prompt.
-You are provided with the following context to help enhance the original prompt:
+    f"""
+{image_generation_improve_prompt_base}
+- If any text is to be rendered in the image put it within double quotes in your improved prompt.

-Today's Date: {current_date}
-User's Location: {location}
-
-User's Notes:
-{references}
-
-Online References:
-{online_results}
-
-Conversation Log:
-{chat_history}
-
-Original Prompt: "{query}"
-
-Now create an improved prompt using the context provided above to generate an image.
-Retain any important information and follow any instructions from the original prompt.
-Use the additional context from the user's notes, online references and conversation log to improve the image generation.
-
-Improved Prompt:"""
+Improved Prompt:
+""".strip()
 )

 ## Online Search Conversation
--- a/src/khoj/routers/helpers.py
+++ b/src/khoj/routers/helpers.py
@ -459,7 +459,7 @@ async def generate_better_image_prompt(
    Generate a better image prompt from the given query
    """

-    today_date = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d")
+    today_date = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d, %A")
    model_type = model_type or TextToImageModelConfig.ModelType.OPENAI

    if location_data:
@ -776,8 +776,8 @@ async def text_to_image(
            chat_history += f"Q: {chat['intent']['query']}\n"
            chat_history += f"A: {chat['message']}\n"
        elif chat["by"] == "khoj" and "text-to-image" in chat["intent"].get("type"):
-            chat_history += f"Q: Query: {chat['intent']['query']}\n"
-            chat_history += f"A: Improved Query: {chat['intent']['inferred-queries'][0]}\n"
+            chat_history += f"Q: Prompt: {chat['intent']['query']}\n"
+            chat_history += f"A: Improved Prompt: {chat['intent']['inferred-queries'][0]}\n"

    with timer("Improve the original user query", logger):
        if send_status_func:
@ -836,7 +836,6 @@ async def text_to_image(
                        "model": text2image_model,
                        "mode": "text-to-image",
                        "output_format": "png",
-                        "seed": 1032622926,
                        "aspect_ratio": "1:1",
                    },
                )