import json
import re
from torch import bfloat16
# import transformers
from duckduckgo_search import DDGS
# from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import json
from torch import bfloat16
# import transformers
from duckduckgo_search import DDGS

## Download the GGUF model
model_name = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
model_file = "mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf" 
model_id = "mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf" 
model_path = "/Users/sij/AI/Models/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf"
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Initialize the model
# model = transformers.AutoModelForCausalLM.from_pretrained(
#     model_id,
#     trust_remote_code=True,
#     torch_dtype=bfloat16,
#     device_map='auto'
# )
# model.eval()

# Initialize the tokenizer
# tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# # Define a text-generation pipeline 
# generate_text = transformers.pipeline(
#     model=model, tokenizer=tokenizer,
#     return_full_text=False,
#     task="text-generation",
#     temperature=0.1,
#     top_p=0.15,
#     top_k=0,
#     max_new_tokens=512,
#     repetition_penalty=1.1
# )

## tokenizer not carried over
## 

llm = Llama(
    model_path=model_path,
    n_ctx=8000,  # Context length to use
    n_threads=8,            # Number of CPU threads to use
    n_gpu_layers=2        # Number of model layers to offload to GPU
)

## Generation kwargs
generation_kwargs = {
    "max_tokens":20000,
    "stop":["</s>"],
    "echo":True, # Echo the prompt in the output
    "top_k":1 # This is essentially greedy decoding, since the model will always return the highest-probability token. Set this value > 1 for sampling decoding
}

# Define a function to use a tool based on the action dictionary
def use_tool(action: dict):
    tool_name = action["tool_name"]
    if tool_name == "Calculator":
        exec(action["input"])
        return f"Tool Output: {output}"
    
    elif tool_name == "Search":
        contexts = []
        with DDGS() as ddgs:
            results = ddgs.text(
                action["input"],
                region="wt-wt", safesearch="on",
                max_results=3
            )
            for r in results:
                contexts.append(r['body'])
        info = "\n---\n".join(contexts)
        return f"Tool Output: {info}"
    elif tool_name == "Final Answer":
        return "Assistant: "+action["input"]


# Function to format instruction prompt
def instruction_format(sys_message: str, query: str):
    return f'<s> [INST] {sys_message} [/INST]\nUser: {query}\nAssistant: ```json\n'


# Function to parse the generated action string into a dictionary
def format_output(input_text: str, prefix: str):
    # Remove the prefix from input_text
    if input_text.startswith(prefix):
        # Cutting off the prefix to isolate the JSON part
        trimmed_text = input_text[len(prefix):]
    else:
        print("Prefix not found at the beginning of input_text.")
        return None

    if trimmed_text.endswith('\n```'):
        json_str = trimmed_text[:-len('\n```')].strip()
    else:
        json_str = trimmed_text.strip()

    #    json_str = json_str[len('```json\n'):-len('\n```')].strip()

    print(f"Trimmed: {json_str}")

    try:
        json_data = json.loads(json_str)
        print(f"Parsed JSON: {json_data}\n")
        return json_data
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        return None


# Function to handle a single prompt, tool selection, and final action loop
def run(query: str):
    input_prompt = instruction_format(sys_msg, query)
    # res = generate_text(input_prompt #####)

    res = llm(input_prompt, **generation_kwargs)
    textthereof = res["choices"][0]["text"]
    action_dict = format_output(textthereof, input_prompt)

    response = use_tool(action_dict)
    full_text = f"{query}{res[0]['generated_text']}\n{response}"
    return response, full_text

# Example query
query = "Hi there, I'm stuck on a math problem, can you help? My question is what is the square root of 512 multiplied by 7?"
sys_msg = """[Your detailed system message or instructions here]"""  # You would replace this with your actual detailed instructions

# Running the example
out = run(query)

print(out[0])