Merge branch 'master' of github.com:debanjum/semantic-search into add-summarize-capability-to-chat-bot

- Fix openai_api_key being set in ConfigProcessorConfig
- Merge addition of config UI and config instantiation updates
This commit is contained in:
Debanjum Singh Solanky 2021-12-20 13:26:35 +05:30
commit 6dc2a99d35
19 changed files with 424 additions and 210 deletions

View file

@ -10,35 +10,50 @@ on:
jobs:
test:
name: Run Tests
runs-on: "macos-latest"
defaults:
run:
shell: bash -l {0}
strategy:
matrix:
include:
- os: ubuntu-latest
label: linux-64
prefix: /usr/share/miniconda3/envs/test
name: ${{ matrix.label }}
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v2
- name: Cache conda
uses: actions/cache@v2
env:
# Increase this value to reset cache if environment.yml has not changed
CACHE_NUMBER: 0
with:
path: ~/conda_pkgs_dir
key:
${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
hashFiles('environment.yml') }}
- uses: conda-incubator/setup-miniconda@v2
- name: Install Environment Dependencies
shell: bash -l {0}
run: sudo apt-get -y install libimage-exiftool-perl
- name: Setup Mambaforge
uses: conda-incubator/setup-miniconda@v2
with:
miniforge-variant: Mambaforge
miniforge-version: latest
activate-environment: test
use-mamba: true
environment-file: environment.yml
python-version: 3.8
auto-activate-base: false
use-only-tar-bz2: true
- name: Conda Info
run: |
conda info
conda list
- name: Set cache date
run: echo "DATE=$(date +'%Y%m%d')" >> $GITHUB_ENV
- uses: actions/cache@v2
with:
path: ${{ matrix.prefix }}
key: ${{ matrix.label }}-conda-${{ hashFiles('environment.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }}
env:
# Increase this value to reset cache if environment.yml has not changed
CACHE_NUMBER: 0
id: cache
- name: Update environment
run: mamba env update -n test -f environment.yml
if: steps.cache.outputs.cache-hit != 'true'
- name: Run Pytest
run: |
python -m pytest
shell: bash -l {0}
run: python -m pytest

4
.gitignore vendored
View file

@ -3,4 +3,6 @@ __pycache__
.emacs.desktop*
tests/data/.*
src/.data
.vscode
.vscode
*.gz
*.pt

View file

@ -16,6 +16,11 @@
conda activate semantic-search
#+end_src
*** Install Environmental Dependencies
#+begin_src shell
sudo apt-get -y install libimage-exiftool-perl
#+end_src
** Configure
Configure application search types and their underlying data source/files in ~sample_config.yml~
Use the ~sample_config.yml~ as reference

View file

@ -6,11 +6,15 @@ dependencies:
- numpy=1.*
- pytorch=1.*
- transformers=4.*
- sentence-transformers=2.0.0
- sentence-transformers=2.1.0
- fastapi=0.*
- uvicorn=0.*
- pyyaml=5.*
- pytest=6.*
- pillow=8.*
- torchvision=0.*
- openai=0.*
- openai=0.*
- pydantic=1.*
- jinja2=3.0.*
- aiofiles=0.*
- huggingface_hub=0.*

View file

@ -1,26 +1,49 @@
# Standard Packages
import sys
import json
import sys, json, yaml
from typing import Optional
# External Packages
import uvicorn
from fastapi import FastAPI
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
# Internal Packages
from src.search_type import asymmetric, symmetric_ledger, image_search
from src.utils.helpers import get_absolute_path, get_from_dict
from src.utils.cli import cli
from src.utils.config import SearchType, SearchModels, TextSearchConfig, ImageSearchConfig, SearchConfig, ProcessorConfig, ConversationProcessorConfig
from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
from src.utils.rawconfig import FullConfig
from src.processor.conversation.gpt import converse, message_to_log, message_to_prompt, understand, summarize
# Application Global State
config = FullConfig()
model = SearchModels()
search_config = SearchConfig()
processor_config = ProcessorConfig()
processor_config = ProcessorConfigModel()
config_file = ""
verbose = 0
app = FastAPI()
app.mount("/views", StaticFiles(directory="views"), name="views")
templates = Jinja2Templates(directory="views/")
@app.get('/ui', response_class=HTMLResponse)
def ui(request: Request):
    """Render the ``config.html`` template for the incoming request."""
    # The template response is given the request object under the 'request' key
    template_context = {'request': request}
    return templates.TemplateResponse("config.html", context=template_context)
# Return the application's current module-level config object.
# The route declares FullConfig as its response model.
# NOTE(review): the POST handler that follows reuses the name config_data, so after
# import the module attribute refers to that later function; both routes are still
# registered because the decorators run at definition time — confirm this is intended.
@app.get('/config', response_model=FullConfig)
def config_data():
return config
@app.post('/config')
async def config_data(updated_config: FullConfig):
    """Replace the in-memory config and persist it to the config file as YAML.

    The request body is validated as a FullConfig, stored in the module-level
    ``config`` and written to the path held in the module-level ``config_file``.
    Returns the config that was stored.
    """
    global config
    config = updated_config

    # Serialise through JSON first so the YAML dump only sees plain
    # dicts/lists/scalars keyed by the fields' aliases.
    plain_config = json.loads(config.json(by_alias=True))

    # utf-8 matches the encoding the CLI uses when reading the config file.
    # The `with` block closes the file; no explicit close() is needed.
    with open(config_file, 'w', encoding='utf-8') as outfile:
        yaml.dump(plain_config, outfile)

    return config
@app.get('/search')
def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
@ -60,7 +83,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
return image_search.collate_results(
hits,
model.image_search.image_names,
search_config.image.input_directory,
config.content_type.image.input_directory,
results_count)
else:
@ -69,22 +92,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
@app.get('/regenerate')
def regenerate(t: Optional[SearchType] = None):
if (t == SearchType.Notes or t == None) and search_config.notes:
# Extract Entries, Generate Embeddings
model.notes_search = asymmetric.setup(search_config.notes, regenerate=True)
if (t == SearchType.Music or t == None) and search_config.music:
# Extract Entries, Generate Song Embeddings
model.music_search = asymmetric.setup(search_config.music, regenerate=True)
if (t == SearchType.Ledger or t == None) and search_config.ledger:
# Extract Entries, Generate Embeddings
model.ledger_search = symmetric_ledger.setup(search_config.ledger, regenerate=True)
if (t == SearchType.Image or t == None) and search_config.image:
# Extract Images, Generate Embeddings
model.image_search = image_search.setup(search_config.image, regenerate=True)
initialize_search(config, regenerate=True, t=t)
return {'status': 'ok', 'message': 'regeneration completed'}
@ -111,37 +119,40 @@ def chat(q: str):
return {'status': 'ok', 'response': gpt_response}
def initialize_search(config, regenerate, verbose):
def initialize_search(config: FullConfig, regenerate: bool, t: SearchType = None):
model = SearchModels()
search_config = SearchConfig()
# Initialize Org Notes Search
search_config.notes = TextSearchConfig.create_from_dictionary(config, ('content-type', 'org'), verbose)
if search_config.notes:
model.notes_search = asymmetric.setup(search_config.notes, regenerate=regenerate)
if (t == SearchType.Notes or t == None) and config.content_type.org:
# Extract Entries, Generate Notes Embeddings
model.notes_search = asymmetric.setup(config.content_type.org, regenerate=regenerate, verbose=verbose)
# Initialize Org Music Search
search_config.music = TextSearchConfig.create_from_dictionary(config, ('content-type', 'music'), verbose)
if search_config.music:
model.music_search = asymmetric.setup(search_config.music, regenerate=regenerate)
if (t == SearchType.Music or t == None) and config.content_type.music:
# Extract Entries, Generate Music Embeddings
model.music_search = asymmetric.setup(config.content_type.music, regenerate=regenerate, verbose=verbose)
# Initialize Ledger Search
search_config.ledger = TextSearchConfig.create_from_dictionary(config, ('content-type', 'ledger'), verbose)
if search_config.ledger:
model.ledger_search = symmetric_ledger.setup(search_config.ledger, regenerate=regenerate)
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
# Extract Entries, Generate Ledger Embeddings
model.ledger_search = symmetric_ledger.setup(config.content_type.ledger, regenerate=regenerate, verbose=verbose)
# Initialize Image Search
search_config.image = ImageSearchConfig.create_from_dictionary(config, ('content-type', 'image'), verbose)
if search_config.image:
model.image_search = image_search.setup(search_config.image, regenerate=regenerate)
if (t == SearchType.Image or t == None) and config.content_type.image:
# Extract Entries, Generate Image Embeddings
model.image_search = image_search.setup(config.content_type.image, regenerate=regenerate, verbose=verbose)
return model, search_config
return model
def initialize_processor(config, verbose):
def initialize_processor(config: FullConfig):
if not config.processor:
return
processor_config = ProcessorConfigModel()
# Initialize Conversation Processor
processor_config = ProcessorConfig()
processor_config.conversation = ConversationProcessorConfig.create_from_dictionary(config, ('processor', 'conversation'), verbose)
processor_config.conversation = ConversationProcessorConfigModel(config.processor.conversation, verbose)
conversation_logfile = processor_config.conversation.conversation_logfile
if processor_config.conversation.verbose:
@ -195,11 +206,20 @@ if __name__ == '__main__':
# Load config from CLI
args = cli(sys.argv[1:])
# Initialize Search from Config
model, search_config = initialize_search(args.config, args.regenerate, args.verbose)
# Stores the file path to the config file.
config_file = args.config_file
# Store the verbose flag
verbose = args.verbose
# Store the raw config data.
config = args.config
# Initialize the search model from Config
model = initialize_search(args.config, args.regenerate)
# Initialize Processor from Config
processor_config = initialize_processor(args.config, args.verbose)
processor_config = initialize_processor(args.config)
# Start Application Server
if args.socket:

View file

@ -14,7 +14,8 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util
# Internal Packages
from src.utils.helpers import get_absolute_path, resolve_absolute_path
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
from src.utils.config import TextSearchModel, TextSearchConfig
from src.utils.config import TextSearchModel
from src.utils.rawconfig import TextSearchConfig
def initialize_model():
@ -58,7 +59,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, v
corpus_embeddings = bi_encoder.encode([entry[0] for entry in entries], convert_to_tensor=True, show_progress_bar=True)
torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
if verbose > 0:
print(f"Computed embeddings and save them to {embeddings_file}")
print(f"Computed embeddings and saved them to {embeddings_file}")
return corpus_embeddings
@ -148,22 +149,22 @@ def collate_results(hits, entries, count=5):
in hits[0:count]]
def setup(config: TextSearchConfig, regenerate: bool) -> TextSearchModel:
def setup(config: TextSearchConfig, regenerate: bool, verbose: bool=False) -> TextSearchModel:
# Initialize Model
bi_encoder, cross_encoder, top_k = initialize_model()
# Map notes in Org-Mode files to (compressed) JSONL formatted file
if not resolve_absolute_path(config.compressed_jsonl).exists() or regenerate:
org_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, config.verbose)
org_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, verbose)
# Extract Entries
entries = extract_entries(config.compressed_jsonl, config.verbose)
entries = extract_entries(config.compressed_jsonl, verbose)
top_k = min(len(entries), top_k) # top_k hits can't be more than the total entries in corpus
# Compute or Load Embeddings
corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=config.verbose)
corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=verbose)
return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=config.verbose)
return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=verbose)
if __name__ == '__main__':

View file

@ -10,9 +10,10 @@ from tqdm import trange
import torch
# Internal Packages
from src.utils.helpers import get_absolute_path, resolve_absolute_path
from src.utils.helpers import resolve_absolute_path
import src.utils.exiftool as exiftool
from src.utils.config import ImageSearchModel, ImageSearchConfig
from src.utils.config import ImageSearchModel
from src.utils.rawconfig import ImageSearchConfig
def initialize_model():
@ -153,13 +154,13 @@ def collate_results(hits, image_names, image_directory, count=5):
in hits[0:count]]
def setup(config: ImageSearchConfig, regenerate: bool) -> ImageSearchModel:
def setup(config: ImageSearchConfig, regenerate: bool, verbose: bool=False) -> ImageSearchModel:
# Initialize Model
encoder = initialize_model()
# Extract Entries
image_directory = resolve_absolute_path(config.input_directory, strict=True)
image_names = extract_entries(image_directory, config.verbose)
image_names = extract_entries(image_directory, verbose)
# Compute or Load Embeddings
embeddings_file = resolve_absolute_path(config.embeddings_file)
@ -170,13 +171,13 @@ def setup(config: ImageSearchConfig, regenerate: bool) -> ImageSearchModel:
batch_size=config.batch_size,
regenerate=regenerate,
use_xmp_metadata=config.use_xmp_metadata,
verbose=config.verbose)
verbose=verbose)
return ImageSearchModel(image_names,
image_embeddings,
image_metadata_embeddings,
encoder,
config.verbose)
verbose)
if __name__ == '__main__':

View file

@ -1,9 +1,6 @@
# Standard Packages
import json
import time
import gzip
import os
import sys
import re
import argparse
import pathlib
@ -15,11 +12,12 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util
# Internal Packages
from src.utils.helpers import get_absolute_path, resolve_absolute_path
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
from src.utils.config import TextSearchModel, TextSearchConfig
from src.utils.config import TextSearchModel
from src.utils.rawconfig import TextSearchConfig
def initialize_model():
"Initialize model for symetric semantic search. That is, where query of similar size to results"
"Initialize model for symmetric semantic search. That is, where query of similar size to results"
torch.set_num_threads(4)
bi_encoder = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2') # The encoder encodes all entries to use for semantic search
top_k = 30 # Number of entries we want to retrieve with the bi-encoder
@ -55,7 +53,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, v
corpus_embeddings = bi_encoder.encode(entries, convert_to_tensor=True, show_progress_bar=True)
torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
if verbose > 0:
print(f"Computed embeddings and save them to {embeddings_file}")
print(f"Computed embeddings and saved them to {embeddings_file}")
return corpus_embeddings
@ -143,22 +141,22 @@ def collate_results(hits, entries, count=5):
in hits[0:count]]
def setup(config: TextSearchConfig, regenerate: bool) -> TextSearchModel:
def setup(config: TextSearchConfig, regenerate: bool, verbose: bool) -> TextSearchModel:
# Initialize Model
bi_encoder, cross_encoder, top_k = initialize_model()
# Map notes in Org-Mode files to (compressed) JSONL formatted file
if not resolve_absolute_path(config.compressed_jsonl).exists() or regenerate:
beancount_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, config.verbose)
beancount_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, verbose)
# Extract Entries
entries = extract_entries(config.compressed_jsonl, config.verbose)
entries = extract_entries(config.compressed_jsonl, verbose)
top_k = min(len(entries), top_k)
# Compute or Load Embeddings
corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=config.verbose)
corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=verbose)
return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=config.verbose)
return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=verbose)
if __name__ == '__main__':

View file

@ -1,12 +1,14 @@
# Standard Packages
import argparse
import pathlib
import json
# External Packages
import yaml
# Internal Packages
from src.utils.helpers import is_none_or_empty, get_absolute_path, resolve_absolute_path, get_from_dict, merge_dicts
from src.utils.helpers import is_none_or_empty, get_absolute_path, resolve_absolute_path, merge_dicts
from src.utils.rawconfig import FullConfig
def cli(args=None):
if is_none_or_empty(args):
@ -35,12 +37,15 @@ def cli(args=None):
with open(get_absolute_path(args.config_file), 'r', encoding='utf-8') as config_file:
config_from_file = yaml.safe_load(config_file)
args.config = merge_dicts(priority_dict=config_from_file, default_dict=args.config)
args.config = FullConfig.parse_obj(args.config)
else:
args.config = FullConfig.parse_obj(args.config)
if args.org_files:
args.config['content-type']['org']['input-files'] = args.org_files
args.config.content_type.org.input_files = args.org_files
if args.org_filter:
args.config['content-type']['org']['input-filter'] = args.org_filter
args.config.content_type.org.input_filter = args.org_filter
return args

View file

@ -4,7 +4,7 @@ from dataclasses import dataclass
from pathlib import Path
# Internal Packages
from src.utils.helpers import get_from_dict
from src.utils.rawconfig import ConversationProcessorConfig
class SearchType(str, Enum):
@ -42,80 +42,15 @@ class SearchModels():
image_search: ImageSearchModel = None
class TextSearchConfig():
def __init__(self, input_files, input_filter, compressed_jsonl, embeddings_file, verbose):
self.input_files = input_files
self.input_filter = input_filter
self.compressed_jsonl = Path(compressed_jsonl)
self.embeddings_file = Path(embeddings_file)
class ConversationProcessorConfigModel():
    """Runtime state of the conversation processor, seeded from its raw config."""

    def __init__(self, processor_config: ConversationProcessorConfig, verbose: bool):
        # Values taken from the raw config; the logfile is wrapped in a Path
        api_key = processor_config.openai_api_key
        logfile = Path(processor_config.conversation_logfile)

        self.verbose = verbose
        # Session state starts out empty
        self.meta_log = []
        self.chat_session = ''
        self.conversation_logfile = logfile
        self.openai_api_key = api_key
def create_from_dictionary(config, key_tree, verbose):
text_config = get_from_dict(config, *key_tree)
search_enabled = text_config and ('input-files' in text_config or 'input-filter' in text_config)
if not search_enabled:
return None
return TextSearchConfig(
input_files = text_config['input-files'],
input_filter = text_config['input-filter'],
compressed_jsonl = Path(text_config['compressed-jsonl']),
embeddings_file = Path(text_config['embeddings-file']),
verbose = verbose)
class ImageSearchConfig():
def __init__(self, input_directory, embeddings_file, batch_size, use_xmp_metadata, verbose):
self.input_directory = input_directory
self.embeddings_file = Path(embeddings_file)
self.batch_size = batch_size
self.use_xmp_metadata = use_xmp_metadata
self.verbose = verbose
def create_from_dictionary(config, key_tree, verbose):
image_config = get_from_dict(config, *key_tree)
search_enabled = image_config and 'input-directory' in image_config
if not search_enabled:
return None
return ImageSearchConfig(
input_directory = Path(image_config['input-directory']),
embeddings_file = Path(image_config['embeddings-file']),
batch_size = image_config['batch-size'],
use_xmp_metadata = {'yes': True, 'no': False}[image_config['use-xmp-metadata']],
verbose = verbose)
@dataclass
class SearchConfig():
notes: TextSearchConfig = None
ledger: TextSearchConfig = None
music: TextSearchConfig = None
image: ImageSearchConfig = None
class ConversationProcessorConfig():
def __init__(self, conversation_logfile, chat_session, meta_log, openai_api_key, verbose):
self.openai_api_key = openai_api_key
self.conversation_logfile = conversation_logfile
self.chat_session = chat_session
self.meta_log = meta_log
self.verbose = verbose
def create_from_dictionary(config, key_tree, verbose):
conversation_config = get_from_dict(config, *key_tree)
if not conversation_config:
return None
return ConversationProcessorConfig(
openai_api_key = conversation_config['openai-api-key'],
chat_session = '',
meta_log = [],
conversation_logfile = Path(conversation_config['conversation-logfile']),
verbose = verbose)
@dataclass
class ProcessorConfig():
conversation: ConversationProcessorConfig = None
class ProcessorConfigModel():
conversation: ConversationProcessorConfigModel = None

View file

@ -4,6 +4,8 @@ import pathlib
def is_none_or_empty(item):
    """Return True when *item* is None or an iterable container of length zero.

    Objects that cannot report a length (numbers, iterators, generators)
    are never considered empty, so they return False instead of raising.
    """
    # Identity check: `== None` would dispatch to a custom __eq__ on the item
    if item is None:
        return True
    # Iterators expose __iter__ but not __len__; only measure sized iterables
    return hasattr(item, '__iter__') and hasattr(item, '__len__') and len(item) == 0
def to_snake_case_from_dash(item: str):
    """Return *item* with every underscore replaced by a dash.

    Note: despite the function's name, the conversion goes from
    snake_case to dash-case (e.g. ``input_files`` -> ``input-files``).
    """
    return '-'.join(item.split('_'))
def get_absolute_path(filepath):
    """Return *filepath* as an absolute path string, with ``~`` expanded."""
    expanded = pathlib.Path(filepath).expanduser()
    return str(expanded.absolute())

62
src/utils/rawconfig.py Normal file
View file

@ -0,0 +1,62 @@
# System Packages
from pathlib import Path
from typing import List, Optional
# External Packages
from pydantic import BaseModel
# Internal Packages
from src.utils.helpers import to_snake_case_from_dash
# Base class for every raw config model in this module.
# Its pydantic Config generates each field's alias by passing the field name
# to to_snake_case_from_dash, and allow_population_by_field_name additionally
# lets instances be populated using the plain field names.
class ConfigBase(BaseModel):
class Config:
alias_generator = to_snake_case_from_dash
allow_population_by_field_name = True
# Raw config with input files, an input filter and an embeddings file; all optional.
# NOTE(review): TextSearchConfig and ImageSearchConfig repeat these fields instead of
# inheriting from this class — confirm whether this class is still used anywhere.
class SearchConfig(ConfigBase):
input_files: Optional[List[str]]
input_filter: Optional[str]
embeddings_file: Optional[Path]
# Raw config for a text content type: the input files and/or filter to index,
# plus the compressed JSONL file and embeddings file. Every field is optional.
class TextSearchConfig(ConfigBase):
compressed_jsonl: Optional[Path]
input_files: Optional[List[str]]
input_filter: Optional[str]
embeddings_file: Optional[Path]
class ImageSearchConfig(ConfigBase):
    """Raw config for the image content type. Every field is optional."""

    # Declared as a boolean, not a string: the value is handed to the image
    # search setup as a flag, and a string such as 'no' or "False" is truthy.
    # Assumes pydantic 1.x (pinned in environment.yml), whose bool parsing
    # accepts yes/no style strings as well as real booleans.
    use_xmp_metadata: Optional[bool]
    batch_size: Optional[int]
    input_directory: Optional[Path]
    input_filter: Optional[str]
    embeddings_file: Optional[Path]
# Groups the per-content-type configs: org, ledger and music use TextSearchConfig,
# image uses ImageSearchConfig. Each entry is optional.
class ContentTypeConfig(ConfigBase):
org: Optional[TextSearchConfig]
ledger: Optional[TextSearchConfig]
image: Optional[ImageSearchConfig]
music: Optional[TextSearchConfig]
# Raw config holding two optional strings: encoder and cross_encoder.
# NOTE(review): presumably model identifiers for asymmetric search — confirm against the consumer.
class AsymmetricConfig(ConfigBase):
encoder: Optional[str]
cross_encoder: Optional[str]
# Raw config holding a single optional encoder string for the image search type.
class ImageSearchTypeConfig(ConfigBase):
encoder: Optional[str]
# Groups the per-search-type configs: asymmetric and image. Each entry is optional.
class SearchTypeConfig(ConfigBase):
asymmetric: Optional[AsymmetricConfig]
image: Optional[ImageSearchTypeConfig]
# Raw config for the conversation processor: three optional strings for the
# OpenAI API key, the conversation logfile and the conversation history.
class ConversationProcessorConfig(ConfigBase):
openai_api_key: Optional[str]
conversation_logfile: Optional[str]
conversation_history: Optional[str]
# Groups the processor configs; currently only the optional conversation processor.
class ProcessorConfigModel(ConfigBase):
conversation: Optional[ConversationProcessorConfig]
# Top-level raw config with three optional sections:
# content types, search types and processors.
class FullConfig(ConfigBase):
content_type: Optional[ContentTypeConfig]
search_type: Optional[SearchTypeConfig]
processor: Optional[ProcessorConfigModel]

View file

@ -3,8 +3,8 @@ import pytest
from pathlib import Path
# Internal Packages
from src.utils.config import SearchConfig, TextSearchConfig, ImageSearchConfig
from src.search_type import asymmetric, image_search
from src.utils.rawconfig import ContentTypeConfig, ImageSearchConfig, TextSearchConfig
@pytest.fixture(scope='session')
@ -12,44 +12,40 @@ def model_dir(tmp_path_factory):
model_dir = tmp_path_factory.mktemp('data')
# Generate Image Embeddings from Test Images
search_config = SearchConfig()
search_config = ContentTypeConfig()
search_config.image = ImageSearchConfig(
input_directory = Path('tests/data'),
input_directory = 'tests/data',
embeddings_file = model_dir.joinpath('.image_embeddings.pt'),
batch_size = 10,
use_xmp_metadata = False,
verbose = 2)
use_xmp_metadata = False)
image_search.setup(search_config.image, regenerate=False)
image_search.setup(search_config.image, regenerate=False, verbose=True)
# Generate Notes Embeddings from Test Notes
search_config.notes = TextSearchConfig(
input_files = [Path('tests/data/main_readme.org'), Path('tests/data/interface_emacs_readme.org')],
search_config.org = TextSearchConfig(
input_files = ['tests/data/main_readme.org', 'tests/data/interface_emacs_readme.org'],
input_filter = None,
compressed_jsonl = model_dir.joinpath('.notes.jsonl.gz'),
embeddings_file = model_dir.joinpath('.note_embeddings.pt'),
verbose = 0)
embeddings_file = model_dir.joinpath('.note_embeddings.pt'))
asymmetric.setup(search_config.notes, regenerate=False)
asymmetric.setup(search_config.org, regenerate=False, verbose=True)
return model_dir
@pytest.fixture(scope='session')
def search_config(model_dir):
search_config = SearchConfig()
search_config.notes = TextSearchConfig(
input_files = [Path('tests/data/main_readme.org'), Path('tests/data/interface_emacs_readme.org')],
search_config = ContentTypeConfig()
search_config.org = TextSearchConfig(
input_files = ['tests/data/main_readme.org', 'tests/data/interface_emacs_readme.org'],
input_filter = None,
compressed_jsonl = model_dir.joinpath('.notes.jsonl.gz'),
embeddings_file = model_dir.joinpath('.note_embeddings.pt'),
verbose = 2)
embeddings_file = model_dir.joinpath('.note_embeddings.pt'))
search_config.image = ImageSearchConfig(
input_directory = Path('tests/data'),
embeddings_file = Path('tests/data/.image_embeddings.pt'),
input_directory = 'tests/data',
embeddings_file = 'tests/data/.image_embeddings.pt',
batch_size = 10,
use_xmp_metadata = False,
verbose = 2)
use_xmp_metadata = False)
return search_config

View file

@ -8,7 +8,7 @@ from src.search_type import asymmetric
def test_asymmetric_setup(search_config):
# Act
# Regenerate notes embeddings during asymmetric setup
notes_model = asymmetric.setup(search_config.notes, regenerate=True)
notes_model = asymmetric.setup(search_config.org, regenerate=True)
# Assert
assert len(notes_model.entries) == 10
@ -18,7 +18,7 @@ def test_asymmetric_setup(search_config):
# ----------------------------------------------------------------------------------------------------
def test_asymmetric_search(search_config):
# Arrange
model.notes_search = asymmetric.setup(search_config.notes, regenerate=False)
model.notes_search = asymmetric.setup(search_config.org, regenerate=False)
query = "How to git install application?"
# Act

View file

@ -40,7 +40,7 @@ def test_cli_config_from_file():
assert actual_args.config_file == Path('tests/data/config.yml')
assert actual_args.regenerate == True
assert actual_args.config is not None
assert actual_args.config['content-type']['org']['input-files'] == ['~/first_from_config.org', '~/second_from_config.org']
assert actual_args.config.content_type.org.input_files == ['~/first_from_config.org', '~/second_from_config.org']
assert actual_args.verbose == 3
@ -54,7 +54,7 @@ def test_cli_config_from_cmd_args():
assert actual_args.org_files == ['first.org']
assert actual_args.config_file is None
assert actual_args.config is not None
assert actual_args.config['content-type']['org']['input-files'] == ['first.org']
assert actual_args.config.content_type.org.input_files == ['first.org']
# ----------------------------------------------------------------------------------------------------
@ -67,4 +67,4 @@ def test_cli_config_from_cmd_args_override_config_file():
assert actual_args.org_files == ['first.org']
assert actual_args.config_file == Path('tests/data/config.yml')
assert actual_args.config is not None
assert actual_args.config['content-type']['org']['input-files'] == ['first.org']
assert actual_args.config.content_type.org.input_files == ['first.org']

View file

@ -3,18 +3,19 @@ from pathlib import Path
# External Packages
from fastapi.testclient import TestClient
import pytest
# Internal Packages
from src.main import app, model, search_config as main_search_config
from src.main import app, model, config
from src.search_type import asymmetric, image_search
from src.utils.helpers import resolve_absolute_path
from src.utils.rawconfig import ContentTypeConfig
# Arrange
# ----------------------------------------------------------------------------------------------------
client = TestClient(app)
# Test
# ----------------------------------------------------------------------------------------------------
def test_search_with_invalid_search_type():
@ -29,9 +30,10 @@ def test_search_with_invalid_search_type():
# ----------------------------------------------------------------------------------------------------
def test_search_with_valid_search_type(search_config):
def test_search_with_valid_search_type(search_config: ContentTypeConfig):
# Arrange
main_search_config.image = search_config.image
config.content_type = search_config
# config.content_type.image = search_config.image
for search_type in ["notes", "ledger", "music", "image"]:
# Act
response = client.get(f"/search?q=random&t={search_type}")
@ -49,9 +51,9 @@ def test_regenerate_with_invalid_search_type():
# ----------------------------------------------------------------------------------------------------
def test_regenerate_with_valid_search_type(search_config):
def test_regenerate_with_valid_search_type(search_config: ContentTypeConfig):
# Arrange
main_search_config.image = search_config.image
config.content_type = search_config
for search_type in ["notes", "ledger", "music", "image"]:
# Act
response = client.get(f"/regenerate?t={search_type}")
@ -60,9 +62,10 @@ def test_regenerate_with_valid_search_type(search_config):
# ----------------------------------------------------------------------------------------------------
def test_image_search(search_config):
@pytest.mark.skip(reason="Flaky test. Search doesn't always return expected image path.")
def test_image_search(search_config: ContentTypeConfig):
# Arrange
main_search_config.image = search_config.image
config.content_type = search_config
model.image_search = image_search.setup(search_config.image, regenerate=False)
query_expected_image_pairs = [("brown kitten next to fallen plant", "kitten_park.jpg"),
("a horse and dog on a leash", "horse_dog.jpg"),
@ -82,9 +85,9 @@ def test_image_search(search_config):
# ----------------------------------------------------------------------------------------------------
def test_notes_search(search_config):
def test_notes_search(search_config: ContentTypeConfig):
# Arrange
model.notes_search = asymmetric.setup(search_config.notes, regenerate=False)
model.notes_search = asymmetric.setup(search_config.org, regenerate=False)
user_query = "How to git install application?"
# Act
@ -98,9 +101,9 @@ def test_notes_search(search_config):
# ----------------------------------------------------------------------------------------------------
def test_notes_search_with_include_filter(search_config):
def test_notes_search_with_include_filter(search_config: ContentTypeConfig):
# Arrange
model.notes_search = asymmetric.setup(search_config.notes, regenerate=False)
model.notes_search = asymmetric.setup(search_config.org, regenerate=False)
user_query = "How to git install application? +Emacs"
# Act
@ -114,9 +117,9 @@ def test_notes_search_with_include_filter(search_config):
# ----------------------------------------------------------------------------------------------------
def test_notes_search_with_exclude_filter(search_config):
def test_notes_search_with_exclude_filter(search_config: ContentTypeConfig):
# Arrange
model.notes_search = asymmetric.setup(search_config.notes, regenerate=False)
model.notes_search = asymmetric.setup(search_config.org, regenerate=False)
user_query = "How to git install application? -clone"
# Act

12
views/config.html Normal file
View file

@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="en">
    <head>
        <!-- Declare the encoding explicitly; the config script contains non-ASCII text -->
        <meta charset="utf-8">
        <title>Set directories for your config file.</title>
        <link rel="stylesheet" href="views/style.css">
    </head>
    <body>
        <!-- Populated by views/scripts/config.js with the fetched config -->
        <form id="config-form">
        </form>
        <!-- Lives outside the form, so it is marked as a plain button rather than a submit button -->
        <button id="config-regenerate" type="button">regenerate</button>
        <!-- Loaded at the end of the body so the elements above exist when the script runs -->
        <script src="views/scripts/config.js"></script>
    </body>
</html>

124
views/scripts/config.js Normal file
View file

@ -0,0 +1,124 @@
// Retrieve elements from the DOM.
// NOTE(review): no other line of this script reads showConfig — confirm whether
// an element with id "show-config" is still expected on the page.
var showConfig = document.getElementById("show-config");
var configForm = document.getElementById("config-form");
var regenerateButton = document.getElementById("config-regenerate");
// Global variables.
// rawConfig holds the config fetched from the server; the value editors update it
// in place and the form's submit handler posts it back.
var rawConfig = {};
// Placeholder text displayed in place of an empty config value.
var emptyValueDefault = "🖊️";
/**
 * Fetch the existing config, render it as an editable form and register
 * the form's submit handler, which posts the edited config back to the server.
 */
fetch("/config")
    .then(response => {
        // Do not render an error payload as if it were the config.
        if (!response.ok) {
            throw new Error("Unexpected response status " + response.status);
        }
        return response.json();
    })
    .then(data => {
        rawConfig = data;
        configForm.style.display = "block";
        processChildren(configForm, data);

        var submitButton = document.createElement("button");
        submitButton.type = "submit";
        submitButton.innerHTML = "update";
        configForm.appendChild(submitButton);

        // The config form's submit handler.
        configForm.addEventListener("submit", (event) => {
            event.preventDefault();
            console.log(rawConfig);
            fetch("/config", {
                method: "POST",
                credentials: "same-origin",
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify(rawConfig)
            })
                .then(response => response.json())
                .then((data) => console.log(data))
                // Surface network or parse failures instead of leaving an unhandled rejection.
                .catch(error => console.error("Failed to update config:", error));
        });
    })
    .catch(error => console.error("Failed to load config:", error));
/**
 * The click handler for the Regenerate button.
 * Disables the button while the request is in flight and always restores it
 * afterwards, whether the request succeeded or failed.
 */
regenerateButton.addEventListener("click", (event) => {
    event.preventDefault();
    regenerateButton.style.cursor = "progress";
    regenerateButton.disabled = true;
    fetch("/regenerate")
        .then(response => response.json())
        .then(data => console.log(data))
        .catch(error => console.error("Regeneration request failed:", error))
        // Runs on success and failure, so the button can never stay disabled.
        .finally(() => {
            regenerateButton.style.cursor = "pointer";
            regenerateButton.disabled = false;
        });
});
/**
 * Renders every entry of `data` as a child <div> of `element`.
 * Values that are objects (but not arrays) are rendered recursively as titled
 * groups; every other value is rendered as a single editable value node.
 * @param {the parent element} element
 * @param {the data to be rendered for this element and its children} data
 */
function processChildren(element, data) {
    for (const key in data) {
        const value = data[key];
        // True for objects other than arrays; false for primitives, null and arrays.
        const isNestedObject = value === Object(value) && !Array.isArray(value);

        const child = document.createElement("div");
        child.id = key;
        child.className = isNestedObject ? "config-element config-title" : "config-element";
        child.appendChild(document.createTextNode(key + ": "));

        if (isNestedObject) {
            processChildren(child, value);
        } else {
            child.appendChild(createValueNode(data, key));
        }
        element.appendChild(child);
    }
}
/**
 * Registers a click handler on an element that swaps it, in place,
 * for a text input pre-filled with the element's text.
 * @param {the original element to be replaced} original
 * @param {the source data to be rendered for the new element} data
 * @param {the key for this input in the source data} key
 */
function makeElementEditable(original, data, key) {
    function swapInEditor() {
        const editor = document.createElement("input");
        editor.type = "text";
        editor.className = "config-element-edit";
        // The placeholder stands for "no value", so start editing from an empty input.
        if (original.textContent == emptyValueDefault) {
            editor.value = "";
        } else {
            editor.value = original.textContent;
        }
        fixInputOnFocusOut(editor, data, key);
        original.parentNode.replaceChild(editor, original);
        editor.focus();
    }

    original.addEventListener("click", swapInEditor);
}
/**
 * Creates a node corresponding to the value of a config element.
 * Only genuinely missing values (null, undefined, empty string, empty array)
 * are shown as the placeholder; 0 and false are displayed as they are.
 * @param {the source data} data
 * @param {the key corresponding to this node's data} key
 * @returns A new element which corresponds to the value in some field.
 */
function createValueNode(data, key) {
    var value = data[key];
    // An empty array would otherwise render as empty text, leaving nothing to click on.
    var isEmpty = value === null
        || value === undefined
        || value === ""
        || (Array.isArray(value) && value.length === 0);

    var valueElement = document.createElement("span");
    valueElement.className = "config-element-value";
    valueElement.textContent = isEmpty ? emptyValueDefault : value;
    makeElementEditable(valueElement, data, key);
    return valueElement;
}
/**
 * Replaces an existing input element with an element with the same data, which is not an input.
 * When the input loses focus, its text is written back to the raw config, keeping the
 * type of the value it replaces: lists stay lists and numbers stay numbers.
 * @param {the original element to be replaced} original
 * @param {the source data} data
 * @param {the key corresponding to this node's data} key
 */
function fixInputOnFocusOut(original, data, key) {
    original.addEventListener("blur", () => {
        var text = (original.value != emptyValueDefault) ? original.value : "";
        var previous = data[key];

        if (Array.isArray(previous)) {
            // Arrays are displayed comma-joined, so split on commas to rebuild the list.
            // An emptied list is stored as null so it is shown as the placeholder again.
            var items = text.split(",")
                .map(item => item.trim())
                .filter(item => item !== "");
            data[key] = items.length > 0 ? items : null;
        } else if (typeof previous === "number") {
            // Keep numeric values numeric; text that is not a number is stored unchanged.
            var trimmed = text.trim();
            var parsed = Number(trimmed);
            if (trimmed === "") {
                data[key] = null;
            } else {
                data[key] = Number.isNaN(parsed) ? text : parsed;
            }
        } else {
            data[key] = text;
        }

        original.parentNode.replaceChild(createValueNode(data, key), original);
    })
}

29
views/style.css Normal file
View file

@ -0,0 +1,29 @@
/* Colour palette shared by the rules below. */
/* NOTE(review): --primary-color and --accent-color-0 are declared but not referenced
   by any rule in this stylesheet — confirm whether they are used elsewhere. */
:root {
--primary-color: #ffffff;
--bold-color: #2073ee;
--complementary-color: #124408;
--accent-color-0: #57f0b5;
}
/* Text inputs take up 40% of the width of their container. */
input[type=text] {
width: 40%;
}
/* One config entry: a key label followed by its value or nested entries. */
div.config-element {
color: var(--bold-color);
margin: 8px;
}
/* Entries that group nested entries get a bold label. */
div.config-title {
font-weight: bold;
}
/* Config values: normal weight even inside a bold title, with a pointer cursor. */
span.config-element-value {
color: var(--complementary-color);
font-weight: normal;
cursor: pointer;
}
button {
cursor: pointer;
}