2023-03-15 00:44:16 +01:00
|
|
|
from datetime import datetime
|
|
|
|
|
2021-11-26 12:53:03 +01:00
|
|
|
import pytest
|
2023-07-27 01:27:08 +02:00
|
|
|
|
|
|
|
SKIP_TESTS = True
|
|
|
|
pytestmark = pytest.mark.skipif(
|
|
|
|
SKIP_TESTS,
|
Use llama.cpp for offline chat models
- Benefits of moving to llama-cpp-python from gpt4all:
- Support for all GGUF format chat models
- Support for AMD, Nvidia, Mac, Vulkan GPU machines (instead of just Vulkan, Mac)
- Supports models with more capabilities like tools, schema
enforcement, speculative decoding, image gen etc.
- Upgrade default chat model, prompt size, tokenizer for new supported
chat models
- Load offline chat model when present on disk without requiring internet
- Load model onto GPU if not disabled and device has GPU
- Load model onto CPU if loading model onto GPU fails
- Create helper function to check and load model from disk, when model
glob is present on disk.
`Llama.from_pretrained` needs internet to get repo info from
HuggingFace. This isn't required if the model is already downloaded.
Didn't find any existing HF or llama.cpp method that looked for model
glob on disk without internet.
2024-03-15 21:19:44 +01:00
|
|
|
reason="Disable in CI to avoid long test runs.",
|
2023-07-27 01:27:08 +02:00
|
|
|
)
|
|
|
|
|
2023-07-08 00:18:55 +02:00
|
|
|
import freezegun
|
2023-03-16 21:49:35 +01:00
|
|
|
from freezegun import freeze_time
|
2021-11-26 12:53:03 +01:00
|
|
|
|
2023-12-28 13:34:02 +01:00
|
|
|
from khoj.processor.conversation.offline.chat_model import (
|
|
|
|
converse_offline,
|
|
|
|
extract_questions_offline,
|
|
|
|
filter_questions,
|
|
|
|
)
|
|
|
|
from khoj.processor.conversation.offline.utils import download_model
|
2023-03-16 19:20:33 +01:00
|
|
|
from khoj.processor.conversation.utils import message_to_log
|
2024-03-06 09:18:41 +01:00
|
|
|
from khoj.routers.helpers import aget_relevant_output_modes
|
Use llama.cpp for offline chat models
- Benefits of moving to llama-cpp-python from gpt4all:
- Support for all GGUF format chat models
- Support for AMD, Nvidia, Mac, Vulkan GPU machines (instead of just Vulkan, Mac)
- Supports models with more capabilities like tools, schema
enforcement, speculative decoding, image gen etc.
- Upgrade default chat model, prompt size, tokenizer for new supported
chat models
- Load offline chat model when present on disk without requiring internet
- Load model onto GPU if not disabled and device has GPU
- Load model onto CPU if loading model onto GPU fails
- Create helper function to check and load model from disk, when model
glob is present on disk.
`Llama.from_pretrained` needs internet to get repo info from
HuggingFace. This isn't required if the model is already downloaded.
Didn't find any existing HF or llama.cpp method that looked for model
glob on disk without internet.
2024-03-15 21:19:44 +01:00
|
|
|
from khoj.utils.constants import default_offline_chat_model
|
2023-07-28 05:51:20 +02:00
|
|
|
|
2023-01-09 04:09:24 +01:00
|
|
|
|
2023-07-27 01:27:08 +02:00
|
|
|
@pytest.fixture(scope="session")
def loaded_model():
    """Provide the default offline chat model, created once per test session."""
    model = download_model(default_offline_chat_model, max_tokens=5000)
    return model
|
2023-07-27 01:27:08 +02:00
|
|
|
|
2021-11-26 12:53:03 +01:00
|
|
|
|
2023-07-08 00:18:55 +02:00
|
|
|
freezegun.configure(extend_ignore_list=["transformers"])
|
|
|
|
|
2021-11-26 12:53:03 +01:00
|
|
|
|
2023-03-16 19:20:33 +01:00
|
|
|
# Test
|
2023-03-16 21:49:35 +01:00
|
|
|
# ----------------------------------------------------------------------------------------------------
|
|
|
|
@pytest.mark.chatquality
@freeze_time("1984-04-02", ignore=["transformers"])
def test_extract_question_with_date_filter_from_relative_day(loaded_model):
    """A question about "yesterday" should produce a query date-filtered to 1984-04-01."""
    # Act
    queries = extract_questions_offline("Where did I go for dinner yesterday?", loaded_model=loaded_model)

    # Assert
    assert len(queries) >= 1
    # Accept single or double quoted dates, with an exclusive or inclusive upper bound
    accepted_filters = (
        ("dt>='1984-04-01'", "dt<'1984-04-02'"),
        ("dt>='1984-04-01'", "dt<='1984-04-01'"),
        ('dt>="1984-04-01"', 'dt<"1984-04-02"'),
        ('dt>="1984-04-01"', 'dt<="1984-04-01"'),
    )
    first_query = queries[0]
    assert any(lower in first_query and upper in first_query for lower, upper in accepted_filters)
|
2023-07-27 01:27:08 +02:00
|
|
|
|
2023-03-16 21:49:35 +01:00
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
2023-08-02 05:13:47 +02:00
|
|
|
@pytest.mark.xfail(reason="Search actor still isn't very date aware nor capable of formatting")
@pytest.mark.chatquality
@freeze_time("1984-04-02", ignore=["transformers"])
def test_extract_question_with_date_filter_from_relative_month(loaded_model):
    """A question about "last month" should produce a query date-filtered to March 1984."""
    # Act
    response = extract_questions_offline("Which countries did I visit last month?", loaded_model=loaded_model)

    # Assert
    assert len(response) >= 1
    # The user query should be the last question in the response.
    # Extracted questions are strings, so compare with the string itself rather than a one-item list,
    # which could never be equal.
    assert response[-1] == "Which countries did I visit last month?"
    # Accept single or double quoted dates, with an exclusive or inclusive upper bound
    assert any(
        [
            "dt>='1984-03-01'" in response[0] and "dt<'1984-04-01'" in response[0],
            "dt>='1984-03-01'" in response[0] and "dt<='1984-03-31'" in response[0],
            'dt>="1984-03-01"' in response[0] and 'dt<"1984-04-01"' in response[0],
            'dt>="1984-03-01"' in response[0] and 'dt<="1984-03-31"' in response[0],
        ]
    )
|
2023-03-16 21:49:35 +01:00
|
|
|
|
|
|
|
|
2023-08-01 21:24:43 +02:00
|
|
|
# ----------------------------------------------------------------------------------------------------
|
|
|
|
@pytest.mark.xfail(reason="Chat actor still isn't very date aware nor capable of formatting")
@pytest.mark.chatquality
@freeze_time("1984-04-02", ignore=["transformers"])
def test_extract_question_with_date_filter_from_relative_year(loaded_model):
    """A question about "this year" should produce a single query date-filtered to 1984."""
    # Act
    # Pass the session-scoped model like the sibling tests do, instead of relying on the callee's default
    response = extract_questions_offline("Which countries have I visited this year?", loaded_model=loaded_model)

    # Assert
    expected_responses = [
        ("dt>='1984-01-01'", ""),
        ("dt>='1984-01-01'", "dt<'1985-01-01'"),
        ("dt>='1984-01-01'", "dt<='1984-12-31'"),
    ]
    assert len(response) == 1
    assert any([start in response[0] and end in response[0] for start, end in expected_responses]), (
        "Expected date filter to limit to 1984 in response but got: " + response[0]
    )
|
|
|
|
|
|
|
|
|
2023-03-16 21:49:35 +01:00
|
|
|
# ----------------------------------------------------------------------------------------------------
|
|
|
|
@pytest.mark.chatquality
def test_extract_multiple_explicit_questions_from_message(loaded_model):
    """A message containing two explicit questions should yield queries covering both subjects."""
    # Act
    responses = extract_questions_offline("What is the Sun? What is the Moon?", loaded_model=loaded_model)

    # Assert
    assert len(responses) >= 2
    # Asserting a bare list comprehension only checks the list is non-empty, which always passes here.
    # Use any() so the test really requires some query to mention each subject.
    assert any("the Sun" in response for response in responses)
    assert any("the Moon" in response for response in responses)
|
2023-03-16 21:49:35 +01:00
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
|
|
|
@pytest.mark.chatquality
def test_extract_multiple_implicit_questions_from_message(loaded_model):
    """A comparison question should yield at most three queries, each about height."""
    # Act
    questions = extract_questions_offline("Is Carl taller than Ross?", loaded_model=loaded_model)

    # Assert
    height_keywords = ("height", "taller", "shorter", "heights", "who")
    assert len(questions) <= 3

    for question in questions:
        lowered = question.lower()
        assert any(keyword in lowered for keyword in height_keywords), (
            "Expected chat actor to ask follow-up questions about Carl and Ross, but got: " + question
        )
|
2023-03-17 01:14:41 +01:00
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
|
|
|
@pytest.mark.chatquality
def test_generate_search_query_using_question_from_chat_history(loaded_model):
    """Queries for a follow-up question should name people from the chat history."""
    # Arrange
    chat_turns = [
        ("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
    ]
    follow_up = "Does he have any sons?"
    failure_prefix = "Expected search queries using proper nouns and chat history for context, but got: "
    # Relation words a query must contain, keyed by the proper noun it mentions.
    # Insertion order matters: "Barbara" is checked before "Anderson".
    relations_by_name = {
        "Barbara": ("sibling", "brother"),
        "Anderson": ("son", "sons", "children", "family"),
    }

    # Act
    search_queries = extract_questions_offline(
        follow_up,
        conversation_log=populate_chat_history(chat_turns),
        loaded_model=loaded_model,
        use_history=True,
    )

    # Assert
    assert len(search_queries) >= 1
    # Ensure the generated search queries use proper nouns and chat history context
    for search_query in search_queries:
        matched_name = next((name for name in relations_by_name if name in search_query), None)
        # A query that names neither person fails outright
        assert matched_name is not None, failure_prefix + search_query
        assert any(relation in search_query for relation in relations_by_name[matched_name]), (
            failure_prefix + search_query
        )
|
2023-03-17 01:14:41 +01:00
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
|
|
|
@pytest.mark.chatquality
def test_generate_search_query_using_answer_from_chat_history(loaded_model):
    """The first query for "Is she a Doctor?" should name a person from the chat history."""
    # Arrange
    chat_turns = [
        ("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
    ]
    conversation_log = populate_chat_history(chat_turns)

    # Act
    search_queries = extract_questions_offline(
        "Is she a Doctor?",
        conversation_log=conversation_log,
        loaded_model=loaded_model,
        use_history=True,
    )

    # Assert
    known_names = ("Barbara", "Anderson")
    assert len(search_queries) >= 1
    first_query = search_queries[0]
    assert any(name in first_query for name in known_names), (
        "Expected chat actor to mention person's by name, but got: " + first_query
    )
|
2023-03-17 01:14:41 +01:00
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
2023-08-02 05:13:47 +02:00
|
|
|
@pytest.mark.xfail(reason="Search actor unable to create date filter using chat history and notes as context")
@pytest.mark.chatquality
def test_generate_search_query_with_date_and_context_from_chat_history(loaded_model):
    """The query should carry over both the place and the April 2000 date from chat history."""
    # Arrange
    chat_turns = [
        ("When did I visit Masai Mara?", "You visited Masai Mara in April 2000", []),
    ]
    conversation_log = populate_chat_history(chat_turns)

    # Act
    search_queries = extract_questions_offline(
        "What was the Pizza place we ate at over there?",
        conversation_log=conversation_log,
        loaded_model=loaded_model,
    )

    # Assert
    # Accept single or double quoted dates, with an exclusive or inclusive upper bound
    april_2000_filters = (
        ("dt>='2000-04-01'", "dt<'2000-05-01'"),
        ("dt>='2000-04-01'", "dt<='2000-04-30'"),
        ('dt>="2000-04-01"', 'dt<"2000-05-01"'),
        ('dt>="2000-04-01"', 'dt<="2000-04-30"'),
    )
    assert len(search_queries) == 1
    only_query = search_queries[0]
    assert "Masai Mara" in only_query
    assert any(lower in only_query and upper in only_query for lower, upper in april_2000_filters), (
        "Expected date filter to limit to April 2000 in response but got: " + only_query
    )
|
|
|
|
|
|
|
|
|
2021-11-26 12:53:03 +01:00
|
|
|
# ----------------------------------------------------------------------------------------------------
|
2023-03-15 01:56:14 +01:00
|
|
|
@pytest.mark.chatquality
def test_chat_with_no_chat_history_or_retrieved_content(loaded_model):
    """With no history or notes, the chat actor should still introduce itself as Khoj."""
    # Act
    chunks = converse_offline(
        references=[],  # Assume no context retrieved from notes for the user_query
        user_query="Hello, my name is Testatron. Who are you?",
        loaded_model=loaded_model,
    )
    reply = "".join(chunks)

    # Assert
    assistant_names = ("Khoj", "khoj", "KHOJ")
    assert len(reply) > 0
    assert any(name in reply for name in assistant_names), (
        "Expected assistants name, [K|k]hoj, in response but got: " + reply
    )
|
2021-11-26 12:53:03 +01:00
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
2023-03-15 01:56:14 +01:00
|
|
|
@pytest.mark.chatquality
def test_answer_from_chat_history_and_previously_retrieved_content(loaded_model):
    """Chat actor needs to use context in previous notes and chat history to answer question."""
    # Arrange
    introduction = (
        "Hello, my name is Testatron. Who are you?",
        "Hi, I am Khoj, a personal assistant. How can I help?",
        [],
    )
    # This earlier turn carries the note that mentions the birthplace
    birth_date_turn = (
        "When was I born?",
        "You were born on 1st April 1984.",
        ["Testatron was born on 1st April 1984 in Testville."],
    )
    conversation_log = populate_chat_history([introduction, birth_date_turn])

    # Act
    chunks = converse_offline(
        references=[],  # Assume no context retrieved from notes for the user_query
        user_query="Where was I born?",
        conversation_log=conversation_log,
        loaded_model=loaded_model,
    )
    reply = "".join(chunks)

    # Assert
    assert len(reply) > 0
    # Infer who I am and use that to infer I was born in Testville using chat history and previously retrieved notes
    assert "Testville" in reply
|
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
2023-03-15 01:56:14 +01:00
|
|
|
@pytest.mark.chatquality
def test_answer_from_chat_history_and_currently_retrieved_content(loaded_model):
    """Chat actor needs to use context across currently retrieved notes and chat history to answer question."""
    # Arrange
    chat_turns = [
        ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
        ("When was I born?", "You were born on 1st April 1984.", []),
    ]
    # Assume context retrieved from notes for the user_query
    retrieved_notes = [{"compiled": "Testatron was born on 1st April 1984 in Testville."}]
    conversation_log = populate_chat_history(chat_turns)

    # Act
    chunks = converse_offline(
        references=retrieved_notes,
        user_query="Where was I born?",
        conversation_log=conversation_log,
        loaded_model=loaded_model,
    )
    reply = "".join(chunks)

    # Assert
    assert len(reply) > 0
    assert "Testville" in reply
|
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
Use llama.cpp for offline chat models
- Benefits of moving to llama-cpp-python from gpt4all:
- Support for all GGUF format chat models
- Support for AMD, Nvidia, Mac, Vulkan GPU machines (instead of just Vulkan, Mac)
- Supports models with more capabilities like tools, schema
enforcement, speculative decoding, image gen etc.
- Upgrade default chat model, prompt size, tokenizer for new supported
chat models
- Load offline chat model when present on disk without requiring internet
- Load model onto GPU if not disabled and device has GPU
- Load model onto CPU if loading model onto GPU fails
- Create helper function to check and load model from disk, when model
glob is present on disk.
`Llama.from_pretrained` needs internet to get repo info from
HuggingFace. This isn't required if the model is already downloaded.
Didn't find any existing HF or llama.cpp method that looked for model
glob on disk without internet.
2024-03-15 21:19:44 +01:00
|
|
|
@pytest.mark.xfail(reason="Chat actor lies when it doesn't know the answer")
@pytest.mark.chatquality
def test_refuse_answering_unanswerable_question(loaded_model):
    """Chat actor should not try make up answers to unanswerable questions."""
    # Arrange
    chat_turns = [
        ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
        ("When was I born?", "You were born on 1st April 1984.", []),
    ]
    conversation_log = populate_chat_history(chat_turns)

    # Act
    chunks = converse_offline(
        references=[],  # Assume no context retrieved from notes for the user_query
        user_query="Where was I born?",
        conversation_log=conversation_log,
        loaded_model=loaded_model,
    )
    reply = "".join(chunks)

    # Assert
    # Any of these phrases counts as the actor admitting it lacks the answer
    refusal_phrases = (
        "don't know",
        "do not know",
        "no information",
        "do not have",
        "don't have",
        "cannot answer",
        "I'm sorry",
    )
    assert len(reply) > 0
    assert any(phrase in reply for phrase in refusal_phrases), (
        "Expected chat actor to say they don't know in response, but got: " + reply
    )
|
2023-03-15 00:44:16 +01:00
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
2023-03-15 01:56:14 +01:00
|
|
|
@pytest.mark.chatquality
def test_answer_requires_current_date_awareness(loaded_model):
    "Chat actor should be able to answer questions relative to current date using provided notes"
    # Arrange
    # Format today's date once so both of today's notes carry the same date,
    # even if the test happens to run across midnight.
    today = datetime.now().strftime("%Y-%m-%d")
    context = [
        {
            "compiled": f"""{today} "Naco Taco" "Tacos for Dinner"
Expenses:Food:Dining 10.00 USD"""
        },
        {
            "compiled": f"""{today} "Sagar Ratna" "Dosa for Lunch"
Expenses:Food:Dining 10.00 USD"""
        },
        # Notes with fixed dates need no interpolation, so they are plain strings
        {
            "compiled": """2020-04-01 "SuperMercado" "Bananas"
Expenses:Food:Groceries 10.00 USD"""
        },
        {
            "compiled": """2020-01-01 "Naco Taco" "Burittos for Dinner"
Expenses:Food:Dining 10.00 USD"""
        },
    ]

    # Act
    response_gen = converse_offline(
        references=context,  # Assume context retrieved from notes for the user_query
        user_query="What did I have for Dinner today?",
        loaded_model=loaded_model,
    )
    response = "".join(response_gen)

    # Assert
    expected_responses = ["tacos", "Tacos"]
    assert len(response) > 0
    assert any(expected_response in response for expected_response in expected_responses), (
        "Expected [T|t]acos in response, but got: " + response
    )
|
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
2023-03-15 01:56:14 +01:00
|
|
|
@pytest.mark.chatquality
def test_answer_requires_date_aware_aggregation_across_provided_notes(loaded_model):
    "Chat actor should be able to answer questions that require date aware aggregation across multiple notes"
    # Arrange
    # Format today's date once so both of today's notes carry the same date,
    # even if the test happens to run across midnight.
    today = datetime.now().strftime("%Y-%m-%d")
    context = [
        {
            "compiled": f"""# {today} "Naco Taco" "Tacos for Dinner"
Expenses:Food:Dining 10.00 USD"""
        },
        {
            "compiled": f"""{today} "Sagar Ratna" "Dosa for Lunch"
Expenses:Food:Dining 10.00 USD"""
        },
        # Notes with fixed dates need no interpolation, so they are plain strings
        {
            "compiled": """2020-04-01 "SuperMercado" "Bananas"
Expenses:Food:Groceries 10.00 USD"""
        },
        {
            "compiled": """2020-01-01 "Naco Taco" "Burittos for Dinner"
Expenses:Food:Dining 10.00 USD"""
        },
    ]

    # Act
    response_gen = converse_offline(
        references=context,  # Assume context retrieved from notes for the user_query
        user_query="How much did I spend on dining this year?",
        loaded_model=loaded_model,
    )
    response = "".join(response_gen)

    # Assert
    assert len(response) > 0
    # The two dining notes dated today are 10.00 USD each, so the reply should mention 20
    assert "20" in response
|
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
2023-03-15 01:56:14 +01:00
|
|
|
@pytest.mark.chatquality
def test_answer_general_question_not_in_chat_history_or_retrieved_content(loaded_model):
    "Chat actor should be able to answer general questions not requiring looking at chat history or notes"
    # Arrange
    # Prior turns are unrelated to the query, so the answer cannot come from them.
    chat_turns = [
        ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
        ("When was I born?", "You were born on 1st April 1984.", []),
        ("Where was I born?", "You were born Testville.", []),
    ]
    history = populate_chat_history(chat_turns)

    # Act
    chunks = converse_offline(
        references=[],  # Assume no context retrieved from notes for the user_query
        user_query="Write a haiku about unit testing in 3 lines",
        conversation_log=history,
        loaded_model=loaded_model,
    )
    response = "".join(chunks)

    # Assert
    # A haiku has 3 lines; use a lower bound because the model may emit extra newlines.
    line_count = len(response.splitlines())
    assert line_count >= 3

    lowered = response.lower()
    keywords = ("test", "testing")
    assert any(keyword in lowered for keyword in keywords), (
        "Expected [T|t]est in response, but got: " + response
    )
|
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
2023-03-15 01:56:14 +01:00
|
|
|
@pytest.mark.chatquality
def test_ask_for_clarification_if_not_enough_context_in_question(loaded_model):
    "Chat actor should ask for clarification if question cannot be answered unambiguously with the provided context"
    # Arrange
    # Three sisters are described, none marked as older, so the query is ambiguous.
    context = [
        {
            "compiled": """# Ramya
My sister, Ramya, is married to Kali Devi. They have 2 kids, Ravi and Rani."""
        },
        {
            "compiled": """# Fang
My sister, Fang Liu is married to Xi Li. They have 1 kid, Xiao Li."""
        },
        {
            "compiled": """# Aiyla
My sister, Aiyla is married to Tolga. They have 3 kids, Yildiz, Ali and Ahmet."""
        },
    ]

    # Act
    response_gen = converse_offline(
        references=context,  # Assume context retrieved from notes for the user_query
        user_query="How many kids does my older sister have?",
        loaded_model=loaded_model,
    )
    response = "".join(response_gen)

    # Assert
    # Compare case-insensitively instead of enumerating capitalisation variants,
    # so phrases like "which one" are accepted regardless of letter case.
    expected_responses = ["which sister", "which of your sister", "which one"]
    response_lower = response.lower()
    assert any(expected_response in response_lower for expected_response in expected_responses), (
        "Expected chat actor to ask for clarification in response, but got: " + response
    )
|
2023-03-16 21:49:35 +01:00
|
|
|
|
|
|
|
|
2024-03-23 17:39:38 +01:00
|
|
|
# ----------------------------------------------------------------------------------------------------
|
|
|
|
@pytest.mark.chatquality
def test_agent_prompt_should_be_used(loaded_model, offline_agent):
    "Chat actor should be tuned to think like an accountant based on the agent definition"
    # Arrange
    context = [
        {"compiled": "I went to the store and bought some bananas for 2.20"},
        {"compiled": "I went to the store and bought some apples for 1.30"},
        {"compiled": "I went to the store and bought some oranges for 6.00"},
    ]
    # Sum of the three purchases above (2.20 + 1.30 + 6.00), in both common renderings.
    expected_responses = ["9.50", "9.5"]

    # Act: ask without the agent
    response_gen = converse_offline(
        references=context,  # Assume context retrieved from notes for the user_query
        user_query="What did I buy?",
        loaded_model=loaded_model,
    )
    response = "".join(response_gen)

    # Assert that the model without the agent prompt does not include the summary of purchases
    assert all(expected_response not in response for expected_response in expected_responses), (
        "Expected chat actor without agent prompt to not summarize values of purchases, but got: " + response
    )

    # Act: ask the same question with the agent
    response_gen = converse_offline(
        references=context,  # Assume context retrieved from notes for the user_query
        user_query="What did I buy?",
        loaded_model=loaded_model,
        agent=offline_agent,
    )
    response = "".join(response_gen)

    # Assert that the model with the agent prompt does include the summary of purchases
    assert any(expected_response in response for expected_response in expected_responses), (
        "Expected chat actor to summarize values of purchases, but got: " + response
    )
|
|
|
|
|
|
|
|
|
2023-08-02 04:29:03 +02:00
|
|
|
# ----------------------------------------------------------------------------------------------------
|
|
|
|
def test_chat_does_not_exceed_prompt_size(loaded_model):
    "Ensure chat context and response together do not exceed max prompt size for the model"
    # Arrange
    overflow_marker = "ERROR: The prompt size exceeds the context window size and cannot be processed"
    # One long reference: the integers 0..2042 separated by single spaces.
    numbers_text = " ".join(map(str, range(2043)))
    context = [{"compiled": numbers_text}]

    # Act
    chunks = converse_offline(
        references=context,  # Assume context retrieved from notes for the user_query
        user_query="What numbers come after these?",
        loaded_model=loaded_model,
    )
    response = "".join(chunks)

    # Assert
    overflowed = overflow_marker in response
    assert not overflowed, (
        "Expected chat response to be within prompt limits, but got exceeded error: " + response
    )
|
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
2023-07-28 05:51:20 +02:00
|
|
|
def test_filter_questions():
    "Filtering should keep only the basketball question out of the three candidate strings"
    # Arrange
    answerable = "Who is on the basketball team?"
    candidates = [
        "I don't know how to answer that",
        "I cannot answer anything about the nuclear secrets",
        answerable,
    ]

    # Act
    kept = filter_questions(candidates)

    # Assert
    assert len(kept) == 1
    assert kept[0] == answerable
|
|
|
|
|
|
|
|
|
2024-03-06 09:18:41 +01:00
|
|
|
# ----------------------------------------------------------------------------------------------------
|
|
|
|
@pytest.mark.anyio
@pytest.mark.django_db(transaction=True)
async def test_use_text_response_mode(client_offline_chat):
    "Output mode chosen for a current-affairs question should be text"
    # Arrange
    query = "What's the latest in the Israel/Palestine conflict?"

    # Act
    output_mode = await aget_relevant_output_modes(query, {})

    # Assert
    chosen = output_mode.value
    assert chosen == "text"
|
2024-03-06 09:18:41 +01:00
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
|
|
|
@pytest.mark.anyio
@pytest.mark.django_db(transaction=True)
async def test_use_image_response_mode(client_offline_chat):
    "Output mode chosen for a request to paint a picture should be image"
    # Arrange
    query = "Paint a picture of the scenery in Timbuktu in the winter"

    # Act
    output_mode = await aget_relevant_output_modes(query, {})

    # Assert
    chosen = output_mode.value
    assert chosen == "image"
|
|
|
|
|
|
|
|
|
2023-03-16 21:49:35 +01:00
|
|
|
# Helpers
|
|
|
|
# ----------------------------------------------------------------------------------------------------
|
|
|
|
def populate_chat_history(message_list):
    "Build a conversation log dict from (user message, chat response, context) tuples"
    chat_log = {"chat": []}
    for query, reply, references in message_list:
        # Metadata recorded with each turn; the inferred query is the user message itself.
        metadata = {
            "context": references,
            "intent": {"query": query, "inferred-queries": f'["{query}"]'},
        }
        # The return value is unused; presumably message_to_log appends to the
        # list passed as conversation_log - verify against its implementation.
        message_to_log(query, reply, metadata, conversation_log=chat_log["chat"])
    return chat_log
|