khoj/tests/test_helpers.py

import os
import secrets

import numpy as np
import psutil
import pytest
from scipy.stats import linregress

from khoj.processor.embeddings import EmbeddingsModel
from khoj.processor.tools.online_search import read_webpage_with_olostep
from khoj.utils import helpers


def test_get_from_null_dict():
    """Check `helpers.get_from_dict` lookups on empty, flat and nested dictionaries."""
    # null handling
    # without any keys an empty dictionary comes back as an empty dictionary
    assert helpers.get_from_dict(dict()) == dict()
    # `None` is a singleton, so compare by identity rather than equality
    assert helpers.get_from_dict(dict(), None) is None

    # key present in nested dictionary
    # 1-level dictionary
    assert helpers.get_from_dict({"a": 1, "b": 2}, "a") == 1
    assert helpers.get_from_dict({"a": 1, "b": 2}, "c") is None

    # 2-level dictionary
    assert helpers.get_from_dict({"a": {"a_a": 1}, "b": 2}, "a") == {"a_a": 1}
    assert helpers.get_from_dict({"a": {"a_a": 1}, "b": 2}, "a", "a_a") == 1

    # key not present in nested dictionary
    # 2-level dictionary: "b" maps to a plain value, so descending into "b_a" yields None
    assert helpers.get_from_dict({"a": {"a_a": 1}, "b": 2}, "b", "b_a") is None


def test_merge_dicts():
    """Exercise `helpers.merge_dicts` with disjoint, empty and conflicting inputs."""
    # Disjoint keys: entries from both dictionaries appear in the result
    merged = helpers.merge_dicts(priority_dict={"a": 1}, default_dict={"b": 2})
    assert merged == {"a": 1, "b": 2}

    # Empty priority dict: the default dict entries are used
    merged = helpers.merge_dicts(priority_dict={}, default_dict={"b": 2})
    assert merged == {"b": 2}

    # Same key in both: the value from priority_dict is kept
    merged = helpers.merge_dicts(priority_dict={"a": 1}, default_dict={"a": 2})
    assert merged == {"a": 1}


def test_lru_cache():
    """Check initialization, capacity overflow and recency tracking of `helpers.LRU`."""
    # A cache created at full capacity holds exactly its initial items
    lru = helpers.LRU({"a": 1, "b": 2}, capacity=2)
    assert lru == {"a": 1, "b": 2}

    # Adding a third item to a capacity-2 cache drops 'a'
    lru["c"] = 3
    assert lru == {"b": 2, "c": 3}

    # Reading 'b' marks it as most recently used; the value itself is not needed
    _ = lru["b"]
    # The next overflow therefore drops 'c' and keeps 'b'
    lru["d"] = 4
    assert lru == {"b": 2, "d": 4}


@pytest.mark.skip(reason="Memory leak exists on GPU, MPS devices")
def test_encode_docs_memory_leak():
    """Check that repeatedly embedding documents does not steadily grow process memory.

    Embeds batches of random text, samples the resident set size (RSS) of the
    process after every batch and fits a line through the samples. The test
    fails when the fitted slope is 2 MB per iteration or more.
    """
    # Arrange
    iterations = 50
    batch_size = 20
    embeddings_model = EmbeddingsModel()
    # Process handle is loop-invariant: create it once, sample it every iteration
    process = psutil.Process()
    memory_usage_trend = []  # RSS in MB, one sample per iteration
    device = f"{helpers.get_device()}".upper()

    # Act
    # Encode random strings repeatedly and record memory usage trend
    for iteration in range(iterations):
        random_docs = [" ".join(secrets.token_hex(5) for _ in range(10)) for _ in range(batch_size)]
        # Only the memory effect of embedding matters; the embeddings themselves are discarded
        embeddings_model.embed_documents(random_docs)
        memory_usage_trend.append(process.memory_info().rss / (1024 * 1024))
        print(f"{iteration:02d}, {memory_usage_trend[-1]:.2f}", flush=True)

    # Calculate slope of line fitting memory usage history
    memory_usage_mb = np.array(memory_usage_trend)
    slope = linregress(np.arange(len(memory_usage_mb)), memory_usage_mb).slope
    print(f"Memory usage increased at ~{slope:.2f} MB per iteration on {device}")

    # Assert
    # If slope is positive memory utilization is increasing
    # Positive threshold of 2, from observing memory usage trend on MPS vs CPU device
    assert slope < 2, f"Memory leak suspected on {device}. Memory usage increased at ~{slope:.2f} MB per iteration"


@pytest.mark.skipif(os.getenv("OLOSTEP_API_KEY") is None, reason="OLOSTEP_API_KEY is not set")
def test_olostep_api():
    """Read a Wikipedia article via Olostep and check a known sentence is in the result.

    Skipped unless the OLOSTEP_API_KEY environment variable is set.
    """
    # Arrange
    page_url = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
    expected_excerpt = (
        "An alarm sent from the area near the fire also failed to register at the courthouse where the fire watchmen were"
    )

    # Act
    # NOTE(review): if read_webpage_with_olostep is a coroutine function, this call
    # needs to be awaited for the membership check below to work — confirm against its definition
    page_content = read_webpage_with_olostep(page_url)

    # Assert
    assert expected_excerpt in page_content
Scrape results from Serper results using Olostep (#627) * Initialize changes to incorporate web scraping logic after getting SERP results - Do some minor refactors to pass a symptom prompt to the openai model when making a query - integrate Olostep in order to perform the webscraping * Fix truncation error with new line, fix typing in olostep code * Use the authorization header for the token * Add a small hint/indicator for how to use Khoj's other modalities in the welcome prompt * Add more detailed error message if Olostep query fails * Add unit tests which invoke Olostep in chat director * Add test for olostep tool 2024-01-29 09:46:50 +01:00			`import os`
Test memory leak on MPS device when generating vector embeddings Slope threshold of 2.0 determined qualitatively on local Mac device Minor unused import and clean-up 2023-11-05 12:32:29 +01:00			`import secrets`

Add isort to the pre-commit configuration and apply it to the whole project (#595) * Apply isort to the entire repository * Fix missing import issues in text_to_entries * Fix imports in migration files 2023-12-28 13:34:02 +01:00			`import numpy as np`
			`import psutil`
Test memory leak on MPS device when generating vector embeddings Slope threshold of 2.0 determined qualitatively on local Mac device Minor unused import and clean-up 2023-11-05 12:32:29 +01:00			`import pytest`
Add isort to the pre-commit configuration and apply it to the whole project (#595) * Apply isort to the entire repository * Fix missing import issues in text_to_entries * Fix imports in migration files 2023-12-28 13:34:02 +01:00			`from scipy.stats import linregress`
Test memory leak on MPS device when generating vector embeddings Slope threshold of 2.0 determined qualitatively on local Mac device Minor unused import and clean-up 2023-11-05 12:32:29 +01:00
			`from khoj.processor.embeddings import EmbeddingsModel`
Read, extract information from web pages in parallel to lower response time - Time reading webpage, extract info from webpage steps for perf analysis - Deduplicate webpages to read gathered across separate google searches - Use aiohttp to make API requests non-blocking, pair with asyncio to parallelize all the online search webpage read and extract calls 2024-03-08 12:11:19 +01:00			`from khoj.processor.tools.online_search import read_webpage_with_olostep`
Use the src/ layout to fix packaging Khoj for PyPi - Why The khoj pypi packages should be installed in `khoj' directory. Previously it was being installed into `src' directory, which is a generic top level directory name that is discouraged from being used - Changes - move src/* to src/khoj/* - update `setup.py' to `find_packages' in `src' instead of project root - rename imports to form `from khoj.*' in complete project - update `constants.web_directory' path to use `khoj' directory - rename root logger to `khoj' in `main.py' - fix image_search tests to use the newly rename `khoj' logger - update config, docs, workflows to reference new path `src/khoj' 2023-02-14 21:50:51 +01:00			`from khoj.utils import helpers`
Add helpers to merge dictionaries and get keys deep inside a dictionary 2021-08-22 00:32:23 +02:00
Use Black to format Khoj server code and tests 2023-02-17 17:04:26 +01:00
Add helpers to merge dictionaries and get keys deep inside a dictionary 2021-08-22 00:32:23 +02:00			`def test_get_from_null_dict():`
			`# null handling`
			`assert helpers.get_from_dict(dict()) == dict()`
			`assert helpers.get_from_dict(dict(), None) == None`

			`# key present in nested dictionary`
			`# 1-level dictionary`
Use Black to format Khoj server code and tests 2023-02-17 17:04:26 +01:00			`assert helpers.get_from_dict({"a": 1, "b": 2}, "a") == 1`
			`assert helpers.get_from_dict({"a": 1, "b": 2}, "c") == None`
Add helpers to merge dictionaries and get keys deep inside a dictionary 2021-08-22 00:32:23 +02:00
			`# 2-level dictionary`
Use Black to format Khoj server code and tests 2023-02-17 17:04:26 +01:00			`assert helpers.get_from_dict({"a": {"a_a": 1}, "b": 2}, "a") == {"a_a": 1}`
			`assert helpers.get_from_dict({"a": {"a_a": 1}, "b": 2}, "a", "a_a") == 1`
Add helpers to merge dictionaries and get keys deep inside a dictionary 2021-08-22 00:32:23 +02:00
			`# key not present in nested dictionary`
			`# 2-level_dictionary`
Use Black to format Khoj server code and tests 2023-02-17 17:04:26 +01:00			`assert helpers.get_from_dict({"a": {"a_a": 1}, "b": 2}, "b", "b_a") == None`
Add helpers to merge dictionaries and get keys deep inside a dictionary 2021-08-22 00:32:23 +02:00

			`def test_merge_dicts():`
			`# basic merge of dicts with non-overlapping keys`
Use Black to format Khoj server code and tests 2023-02-17 17:04:26 +01:00			`assert helpers.merge_dicts(priority_dict={"a": 1}, default_dict={"b": 2}) == {"a": 1, "b": 2}`
Add helpers to merge dictionaries and get keys deep inside a dictionary 2021-08-22 00:32:23 +02:00
			`# use default dict items when not present in priority dict`
Use Black to format Khoj server code and tests 2023-02-17 17:04:26 +01:00			`assert helpers.merge_dicts(priority_dict={}, default_dict={"b": 2}) == {"b": 2}`
Add helpers to merge dictionaries and get keys deep inside a dictionary 2021-08-22 00:32:23 +02:00
			`# do not override existing key in priority_dict with default dict`
Use Black to format Khoj server code and tests 2023-02-17 17:04:26 +01:00			`assert helpers.merge_dicts(priority_dict={"a": 1}, default_dict={"a": 2}) == {"a": 1}`
Create LRU helper class for caching 2022-09-04 15:31:46 +02:00

			`def test_lru_cache():`
			`# Test initializing cache`
Use Black to format Khoj server code and tests 2023-02-17 17:04:26 +01:00			`cache = helpers.LRU({"a": 1, "b": 2}, capacity=2)`
			`assert cache == {"a": 1, "b": 2}`
Create LRU helper class for caching 2022-09-04 15:31:46 +02:00
			`# Test capacity overflow`
Use Black to format Khoj server code and tests 2023-02-17 17:04:26 +01:00			`cache["c"] = 3`
			`assert cache == {"b": 2, "c": 3}`
Create LRU helper class for caching 2022-09-04 15:31:46 +02:00
			`# Test delete least recently used item from LRU cache on capacity overflow`
Use Black to format Khoj server code and tests 2023-02-17 17:04:26 +01:00			`cache["b"] # accessing 'b' makes it the most recently used item`
			`cache["d"] = 4 # so 'c' is deleted from the cache instead of 'b'`
			`assert cache == {"b": 2, "d": 4}`
Test memory leak on MPS device when generating vector embeddings Slope threshold of 2.0 determined qualitatively on local Mac device Minor unused import and clean-up 2023-11-05 12:32:29 +01:00

			`@pytest.mark.skip(reason="Memory leak exists on GPU, MPS devices")`
			`def test_encode_docs_memory_leak():`
			`# Arrange`
			`iterations = 50`
			`batch_size = 20`
			`embeddings_model = EmbeddingsModel()`
			`memory_usage_trend = []`
Improve log messages in text_entries and memory leak unit test 2023-11-07 04:26:54 +01:00			`device = f"{helpers.get_device()}".upper()`
Test memory leak on MPS device when generating vector embeddings Slope threshold of 2.0 determined qualitatively on local Mac device Minor unused import and clean-up 2023-11-05 12:32:29 +01:00
			`# Act`
			`# Encode random strings repeatedly and record memory usage trend`
			`for iteration in range(iterations):`
			`random_docs = [" ".join(secrets.token_hex(5) for _ in range(10)) for _ in range(batch_size)]`
			`a = [embeddings_model.embed_documents(random_docs)]`
			`memory_usage_trend += [psutil.Process().memory_info().rss / (1024 * 1024)]`
			`print(f"{iteration:02d}, {memory_usage_trend[-1]:.2f}", flush=True)`

			`# Calculate slope of line fitting memory usage history`
			`memory_usage_trend = np.array(memory_usage_trend)`
			`slope, _, _, _, _ = linregress(np.arange(len(memory_usage_trend)), memory_usage_trend)`
Improve log messages in text_entries and memory leak unit test 2023-11-07 04:26:54 +01:00			`print(f"Memory usage increased at ~{slope:.2f} MB per iteration on {device}")`
Test memory leak on MPS device when generating vector embeddings Slope threshold of 2.0 determined qualitatively on local Mac device Minor unused import and clean-up 2023-11-05 12:32:29 +01:00
			`# Assert`
			`# If slope is positive memory utilization is increasing`
			`# Positive threshold of 2, from observing memory usage trend on MPS vs CPU device`
Improve log messages in text_entries and memory leak unit test 2023-11-07 04:26:54 +01:00			`assert slope < 2, f"Memory leak suspected on {device}. Memory usage increased at ~{slope:.2f} MB per iteration"`
Scrape results from Serper results using Olostep (#627) * Initialize changes to incorporate web scraping logic after getting SERP results - Do some minor refactors to pass a symptom prompt to the openai model when making a query - integrate Olostep in order to perform the webscraping * Fix truncation error with new line, fix typing in olostep code * Use the authorization header for the token * Add a small hint/indicator for how to use Khoj's other modalities in the welcome prompt * Add more detailed error message if Olostep query fails * Add unit tests which invoke Olostep in chat director * Add test for olostep tool 2024-01-29 09:46:50 +01:00

			`@pytest.mark.skipif(os.getenv("OLOSTEP_API_KEY") is None, reason="OLOSTEP_API_KEY is not set")`
			`def test_olostep_api():`
			`# Arrange`
			`website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"`

			`# Act`
Read, extract information from web pages in parallel to lower response time - Time reading webpage, extract info from webpage steps for perf analysis - Deduplicate webpages to read gathered across separate google searches - Use aiohttp to make API requests non-blocking, pair with asyncio to parallelize all the online search webpage read and extract calls 2024-03-08 12:11:19 +01:00			`response = read_webpage_with_olostep(website)`
Scrape results from Serper results using Olostep (#627) * Initialize changes to incorporate web scraping logic after getting SERP results - Do some minor refactors to pass a symptom prompt to the openai model when making a query - integrate Olostep in order to perform the webscraping * Fix truncation error with new line, fix typing in olostep code * Use the authorization header for the token * Add a small hint/indicator for how to use Khoj's other modalities in the welcome prompt * Add more detailed error message if Olostep query fails * Add unit tests which invoke Olostep in chat director * Add test for olostep tool 2024-01-29 09:46:50 +01:00
			`# Assert`
			`assert (`
			`"An alarm sent from the area near the fire also failed to register at the courthouse where the fire watchmen were"`
			`in response`
			`)`