# ig_bot/unsorted/test8.py
import json
import re
from torch import bfloat16
# import transformers
from duckduckgo_search import DDGS
# from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
## Download the GGUF model
model_name = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
model_file = "mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf"
model_path = "/Users/sij/AI/Models/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf"
# model_id is only used by the commented-out transformers path below
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Initialize the model
# model = transformers.AutoModelForCausalLM.from_pretrained(
#     model_id,
#     trust_remote_code=True,
#     torch_dtype=bfloat16,
#     device_map='auto'
# )
# model.eval()

# Initialize the tokenizer
# tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# # Define a text-generation pipeline
# generate_text = transformers.pipeline(
#     model=model, tokenizer=tokenizer,
#     return_full_text=False,
#     task="text-generation",
#     temperature=0.1,
#     top_p=0.15,
#     top_k=0,
#     max_new_tokens=512,
#     repetition_penalty=1.1
# )

## tokenizer not carried over
##
llm = Llama(
    model_path=model_path,
    n_ctx=8000,      # Context length to use
    n_threads=8,     # Number of CPU threads to use
    n_gpu_layers=2   # Number of model layers to offload to GPU
)
## Generation kwargs
generation_kwargs = {
    "max_tokens": 20000,
    "stop": ["</s>"],
    "echo": True,   # Echo the prompt in the output
    "top_k": 1      # Essentially greedy decoding: the model always returns the highest-probability token. Set this value > 1 for sampling decoding
}
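# Quick sanity check of the loaded model (commented out; assumes the GGUF file exists
# at model_path and that a short completion is enough to confirm the setup works):
# print(llm("[INST] Say hello. [/INST]", max_tokens=16)["choices"][0]["text"])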
# Define a function to use a tool based on the action dictionary
def use_tool(action: dict):
    tool_name = action["tool_name"]
    if tool_name == "Calculator":
        # exec() returns None, so run the generated code in a scratch namespace and
        # read back the `output` variable it is expected to assign
        scope = {}
        exec(action["input"], scope)
        output = scope.get("output")
        return f"Tool Output: {output}"
    elif tool_name == "Search":
        contexts = []
        with DDGS() as ddgs:
            results = ddgs.text(
                action["input"],
                region="wt-wt", safesearch="on",
                max_results=3
            )
            for r in results:
                contexts.append(r['body'])
        info = "\n---\n".join(contexts)
        return f"Tool Output: {info}"
    elif tool_name == "Final Answer":
        return "Assistant: " + action["input"]
# Function to format instruction prompt
def instruction_format(sys_message: str, query: str):
    return f'<s> [INST] {sys_message} [/INST]\nUser: {query}\nAssistant: ```json\n'
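# For reference, the rendered prompt looks roughly like this: Mixtral [INST] tags wrap
# the system message, then the user turn, then an opening ```json fence primes the model
# to reply with a JSON action object:
# <s> [INST] ...sys_message... [/INST]
# User: ...query...
# Assistant: ```json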
# Function to parse the generated action string into a dictionary
def format_output(input_text: str, prefix: str):
    # Remove the prefix from input_text
    if input_text.startswith(prefix):
        # Cutting off the prefix to isolate the JSON part
        trimmed_text = input_text[len(prefix):]
    else:
        print("Prefix not found at the beginning of input_text.")
        return None
    if trimmed_text.endswith('\n```'):
        json_str = trimmed_text[:-len('\n```')].strip()
    else:
        json_str = trimmed_text.strip()
    # json_str = json_str[len('```json\n'):-len('\n```')].strip()
    print(f"Trimmed: {json_str}")
    try:
        json_data = json.loads(json_str)
        print(f"Parsed JSON: {json_data}\n")
        return json_data
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        return None
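# Example of what format_output undoes: with echo=True the completion starts with the
# prompt itself, so stripping that prefix and any trailing ``` fence should leave only
# the JSON action object, e.g. '{"tool_name": "Calculator", "input": "..."}'.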
# Function to handle a single prompt, tool selection, and final action
def run(query: str):
    input_prompt = instruction_format(sys_msg, query)
    # res = generate_text(input_prompt)  # transformers pipeline variant (commented out above)
    res = llm(input_prompt, **generation_kwargs)
    textthereof = res["choices"][0]["text"]
    action_dict = format_output(textthereof, input_prompt)
    response = use_tool(action_dict)
    # llama_cpp returns a completion dict, not a pipeline-style list, so build the
    # transcript from the extracted text rather than res[0]['generated_text']
    full_text = f"{query}{textthereof}\n{response}"
    return response, full_text
# Example query
query = "Hi there, I'm stuck on a math problem, can you help? My question is what is the square root of 512 multiplied by 7?"
sys_msg = """[Your detailed system message or instructions here]""" # You would replace this with your actual detailed instructions
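# A hypothetical sketch of what the system message needs to convey (not the author's
# actual prompt): name the available tools and demand a JSON-only reply, e.g.
# sys_msg = """You can use three tools: Calculator, Search, and Final Answer.
# Reply only with a JSON object: {"tool_name": "<Calculator|Search|Final Answer>", "input": "<tool input>"}"""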
# Running the example
out = run(query)
print(out[0])