Merge branch 'master' of github.com:debanjum/semantic-search into add-summarize-capability-to-chat-bot

- Fix openai_api_key being set in ConfigProcessorConfig
- Merge addition of config UI and config instantiation updates
commit 6dc2a99d35
Author: Debanjum Singh Solanky
Date: 2021-12-20 13:26:35 +05:30
19 changed files with 424 additions and 210 deletions


@@ -10,35 +10,50 @@ on:
 jobs:
   test:
-    name: Run Tests
-    runs-on: "macos-latest"
-    defaults:
-      run:
-        shell: bash -l {0}
+    strategy:
+      matrix:
+        include:
+          - os: ubuntu-latest
+            label: linux-64
+            prefix: /usr/share/miniconda3/envs/test
+    name: ${{ matrix.label }}
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v2
-      - name: Cache conda
-        uses: actions/cache@v2
-        env:
-          # Increase this value to reset cache if environment.yml has not changed
-          CACHE_NUMBER: 0
-        with:
-          path: ~/conda_pkgs_dir
-          key:
-            ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
-            hashFiles('environment.yml') }}
-      - uses: conda-incubator/setup-miniconda@v2
+      - name: Install Environment Dependencies
+        shell: bash -l {0}
+        run: sudo apt-get -y install libimage-exiftool-perl
+      - name: Setup Mambaforge
+        uses: conda-incubator/setup-miniconda@v2
         with:
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
           activate-environment: test
+          use-mamba: true
           environment-file: environment.yml
           python-version: 3.8
           auto-activate-base: false
           use-only-tar-bz2: true
-      - name: Conda Info
-        run: |
-          conda info
-          conda list
+      - name: Set cache date
+        run: echo "DATE=$(date +'%Y%m%d')" >> $GITHUB_ENV
+      - uses: actions/cache@v2
+        with:
+          path: ${{ matrix.prefix }}
+          key: ${{ matrix.label }}-conda-${{ hashFiles('environment.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }}
+        env:
+          # Increase this value to reset cache if environment.yml has not changed
+          CACHE_NUMBER: 0
+        id: cache
+      - name: Update environment
+        run: mamba env update -n test -f environment.yml
+        if: steps.cache.outputs.cache-hit != 'true'
       - name: Run Pytest
-        run: |
-          python -m pytest
+        shell: bash -l {0}
+        run: python -m pytest

.gitignore

@@ -4,3 +4,5 @@ __pycache__
 tests/data/.*
 src/.data
 .vscode
+*.gz
+*.pt


@@ -16,6 +16,11 @@
   conda activate semantic-search
 #+end_src

+*** Install Environmental Dependencies
+#+begin_src shell
+sudo apt-get -y install libimage-exiftool-perl
+#+end_src
+
 ** Configure
 Configure application search types and their underlying data source/files in ~sample_config.yml~
 Use the ~sample_config.yml~ as reference


@@ -6,7 +6,7 @@ dependencies:
   - numpy=1.*
   - pytorch=1.*
   - transformers=4.*
-  - sentence-transformers=2.0.0
+  - sentence-transformers=2.1.0
   - fastapi=0.*
   - uvicorn=0.*
   - pyyaml=5.*
@@ -14,3 +14,7 @@ dependencies:
   - pillow=8.*
   - torchvision=0.*
   - openai=0.*
+  - pydantic=1.*
+  - jinja2=3.0.*
+  - aiofiles=0.*
+  - huggingface_hub=0.*


@@ -1,26 +1,49 @@
 # Standard Packages
-import sys
-import json
+import sys, json, yaml
 from typing import Optional

 # External Packages
 import uvicorn
-from fastapi import FastAPI
+from fastapi import FastAPI, Request
+from fastapi.responses import HTMLResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates

 # Internal Packages
 from src.search_type import asymmetric, symmetric_ledger, image_search
 from src.utils.helpers import get_absolute_path, get_from_dict
 from src.utils.cli import cli
-from src.utils.config import SearchType, SearchModels, TextSearchConfig, ImageSearchConfig, SearchConfig, ProcessorConfig, ConversationProcessorConfig
+from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
+from src.utils.rawconfig import FullConfig
 from src.processor.conversation.gpt import converse, message_to_log, message_to_prompt, understand, summarize

 # Application Global State
+config = FullConfig()
 model = SearchModels()
-search_config = SearchConfig()
-processor_config = ProcessorConfig()
+processor_config = ProcessorConfigModel()
+config_file = ""
+verbose = 0
 app = FastAPI()
+app.mount("/views", StaticFiles(directory="views"), name="views")
+templates = Jinja2Templates(directory="views/")
+
+@app.get('/ui', response_class=HTMLResponse)
+def ui(request: Request):
+    return templates.TemplateResponse("config.html", context={'request': request})
+
+@app.get('/config', response_model=FullConfig)
+def config_data():
+    return config
+
+@app.post('/config')
+async def config_data(updated_config: FullConfig):
+    global config
+    config = updated_config
+    with open(config_file, 'w') as outfile:
+        yaml.dump(yaml.safe_load(config.json(by_alias=True)), outfile)
+        outfile.close()
+    return config

 @app.get('/search')
 def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
@@ -60,7 +83,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
         return image_search.collate_results(
             hits,
             model.image_search.image_names,
-            search_config.image.input_directory,
+            config.content_type.image.input_directory,
             results_count)

     else:
@@ -69,22 +92,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None):
 @app.get('/regenerate')
 def regenerate(t: Optional[SearchType] = None):
-    if (t == SearchType.Notes or t == None) and search_config.notes:
-        # Extract Entries, Generate Embeddings
-        model.notes_search = asymmetric.setup(search_config.notes, regenerate=True)
-
-    if (t == SearchType.Music or t == None) and search_config.music:
-        # Extract Entries, Generate Song Embeddings
-        model.music_search = asymmetric.setup(search_config.music, regenerate=True)
-
-    if (t == SearchType.Ledger or t == None) and search_config.ledger:
-        # Extract Entries, Generate Embeddings
-        model.ledger_search = symmetric_ledger.setup(search_config.ledger, regenerate=True)
-
-    if (t == SearchType.Image or t == None) and search_config.image:
-        # Extract Images, Generate Embeddings
-        model.image_search = image_search.setup(search_config.image, regenerate=True)
-
+    initialize_search(config, regenerate=True, t=t)
     return {'status': 'ok', 'message': 'regeneration completed'}
@@ -111,37 +119,40 @@ def chat(q: str):
     return {'status': 'ok', 'response': gpt_response}

-def initialize_search(config, regenerate, verbose):
+def initialize_search(config: FullConfig, regenerate: bool, t: SearchType = None):
     model = SearchModels()
-    search_config = SearchConfig()

     # Initialize Org Notes Search
-    search_config.notes = TextSearchConfig.create_from_dictionary(config, ('content-type', 'org'), verbose)
-    if search_config.notes:
-        model.notes_search = asymmetric.setup(search_config.notes, regenerate=regenerate)
+    if (t == SearchType.Notes or t == None) and config.content_type.org:
+        # Extract Entries, Generate Notes Embeddings
+        model.notes_search = asymmetric.setup(config.content_type.org, regenerate=regenerate, verbose=verbose)

     # Initialize Org Music Search
-    search_config.music = TextSearchConfig.create_from_dictionary(config, ('content-type', 'music'), verbose)
-    if search_config.music:
-        model.music_search = asymmetric.setup(search_config.music, regenerate=regenerate)
+    if (t == SearchType.Music or t == None) and config.content_type.music:
+        # Extract Entries, Generate Music Embeddings
+        model.music_search = asymmetric.setup(config.content_type.music, regenerate=regenerate, verbose=verbose)

     # Initialize Ledger Search
-    search_config.ledger = TextSearchConfig.create_from_dictionary(config, ('content-type', 'ledger'), verbose)
-    if search_config.ledger:
-        model.ledger_search = symmetric_ledger.setup(search_config.ledger, regenerate=regenerate)
+    if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
+        # Extract Entries, Generate Ledger Embeddings
+        model.ledger_search = symmetric_ledger.setup(config.content_type.ledger, regenerate=regenerate, verbose=verbose)

     # Initialize Image Search
-    search_config.image = ImageSearchConfig.create_from_dictionary(config, ('content-type', 'image'), verbose)
-    if search_config.image:
-        model.image_search = image_search.setup(search_config.image, regenerate=regenerate)
+    if (t == SearchType.Image or t == None) and config.content_type.image:
+        # Extract Entries, Generate Image Embeddings
+        model.image_search = image_search.setup(config.content_type.image, regenerate=regenerate, verbose=verbose)

-    return model, search_config
+    return model

-def initialize_processor(config, verbose):
+def initialize_processor(config: FullConfig):
+    if not config.processor:
+        return
+
+    processor_config = ProcessorConfigModel()
+
     # Initialize Conversation Processor
-    processor_config = ProcessorConfig()
-    processor_config.conversation = ConversationProcessorConfig.create_from_dictionary(config, ('processor', 'conversation'), verbose)
+    processor_config.conversation = ConversationProcessorConfigModel(config.processor.conversation, verbose)

     conversation_logfile = processor_config.conversation.conversation_logfile
     if processor_config.conversation.verbose:
@@ -195,11 +206,20 @@ if __name__ == '__main__':
     # Load config from CLI
     args = cli(sys.argv[1:])

-    # Initialize Search from Config
-    model, search_config = initialize_search(args.config, args.regenerate, args.verbose)
+    # Stores the file path to the config file.
+    config_file = args.config_file
+
+    # Store the verbose flag
+    verbose = args.verbose
+
+    # Store the raw config data.
+    config = args.config
+
+    # Initialize the search model from Config
+    model = initialize_search(args.config, args.regenerate)

     # Initialize Processor from Config
-    processor_config = initialize_processor(args.config, args.verbose)
+    processor_config = initialize_processor(args.config)

     # Start Application Server
     if args.socket:
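The new /config endpoints can be exercised with FastAPI's TestClient, the same client the tests below use. A minimal sketch, not part of this commit, assuming the module globals are set up the way the __main__ block above does it; the output path is illustrative only:

    # Minimal sketch: round-trip the configuration through the new endpoints.
    from fastapi.testclient import TestClient
    from src import main
    from src.utils.rawconfig import FullConfig

    main.config_file = "/tmp/semantic_search_config.yml"  # illustrative path for the POST handler to write to
    client = TestClient(main.app)

    current = client.get("/config").json()                # served from the in-memory FullConfig
    updated = FullConfig.parse_obj(current)
    response = client.post("/config", json=updated.dict(by_alias=True))
    assert response.status_code == 200                    # handler persists the YAML and echoes the config back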


@@ -14,7 +14,8 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util
 # Internal Packages
 from src.utils.helpers import get_absolute_path, resolve_absolute_path
 from src.processor.org_mode.org_to_jsonl import org_to_jsonl
-from src.utils.config import TextSearchModel, TextSearchConfig
+from src.utils.config import TextSearchModel
+from src.utils.rawconfig import TextSearchConfig

 def initialize_model():
@@ -58,7 +59,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, v
         corpus_embeddings = bi_encoder.encode([entry[0] for entry in entries], convert_to_tensor=True, show_progress_bar=True)
         torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
         if verbose > 0:
-            print(f"Computed embeddings and save them to {embeddings_file}")
+            print(f"Computed embeddings and saved them to {embeddings_file}")

     return corpus_embeddings
@@ -148,22 +149,22 @@ def collate_results(hits, entries, count=5):
         in hits[0:count]]

-def setup(config: TextSearchConfig, regenerate: bool) -> TextSearchModel:
+def setup(config: TextSearchConfig, regenerate: bool, verbose: bool=False) -> TextSearchModel:
     # Initialize Model
     bi_encoder, cross_encoder, top_k = initialize_model()

     # Map notes in Org-Mode files to (compressed) JSONL formatted file
     if not resolve_absolute_path(config.compressed_jsonl).exists() or regenerate:
-        org_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, config.verbose)
+        org_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, verbose)

     # Extract Entries
-    entries = extract_entries(config.compressed_jsonl, config.verbose)
+    entries = extract_entries(config.compressed_jsonl, verbose)
     top_k = min(len(entries), top_k)  # top_k hits can't be more than the total entries in corpus

     # Compute or Load Embeddings
-    corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=config.verbose)
+    corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=verbose)

-    return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=config.verbose)
+    return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=verbose)

 if __name__ == '__main__':
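With this change setup() takes the pydantic TextSearchConfig from src.utils.rawconfig and an explicit verbose flag instead of reading config.verbose. A minimal sketch of a call with the new signature, not part of this commit; the paths are illustrative and mirror the test fixtures in conftest.py below:

    # Minimal sketch: drive asymmetric.setup() with the new rawconfig model.
    from src.search_type import asymmetric
    from src.utils.rawconfig import TextSearchConfig

    org_config = TextSearchConfig(
        input_files = ['tests/data/main_readme.org', 'tests/data/interface_emacs_readme.org'],
        input_filter = None,
        compressed_jsonl = 'tests/data/.notes.jsonl.gz',    # illustrative; pydantic coerces str -> Path
        embeddings_file = 'tests/data/.note_embeddings.pt')

    # verbose is now a keyword argument rather than a field on the config object
    notes_search = asymmetric.setup(org_config, regenerate=False, verbose=True)
    print(len(notes_search.entries))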


@@ -10,9 +10,10 @@ from tqdm import trange
 import torch

 # Internal Packages
-from src.utils.helpers import get_absolute_path, resolve_absolute_path
+from src.utils.helpers import resolve_absolute_path
 import src.utils.exiftool as exiftool
-from src.utils.config import ImageSearchModel, ImageSearchConfig
+from src.utils.config import ImageSearchModel
+from src.utils.rawconfig import ImageSearchConfig

 def initialize_model():
@@ -153,13 +154,13 @@ def collate_results(hits, image_names, image_directory, count=5):
         in hits[0:count]]

-def setup(config: ImageSearchConfig, regenerate: bool) -> ImageSearchModel:
+def setup(config: ImageSearchConfig, regenerate: bool, verbose: bool=False) -> ImageSearchModel:
     # Initialize Model
     encoder = initialize_model()

     # Extract Entries
     image_directory = resolve_absolute_path(config.input_directory, strict=True)
-    image_names = extract_entries(image_directory, config.verbose)
+    image_names = extract_entries(image_directory, verbose)

     # Compute or Load Embeddings
     embeddings_file = resolve_absolute_path(config.embeddings_file)
@@ -170,13 +171,13 @@ def setup(config: ImageSearchConfig, regenerate: bool) -> ImageSearchModel:
         batch_size=config.batch_size,
         regenerate=regenerate,
         use_xmp_metadata=config.use_xmp_metadata,
-        verbose=config.verbose)
+        verbose=verbose)

     return ImageSearchModel(image_names,
                             image_embeddings,
                             image_metadata_embeddings,
                             encoder,
-                            config.verbose)
+                            verbose)

 if __name__ == '__main__':


@@ -1,9 +1,6 @@
 # Standard Packages
 import json
-import time
 import gzip
-import os
-import sys
 import re
 import argparse
 import pathlib
@@ -15,11 +12,12 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util
 # Internal Packages
 from src.utils.helpers import get_absolute_path, resolve_absolute_path
 from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
-from src.utils.config import TextSearchModel, TextSearchConfig
+from src.utils.config import TextSearchModel
+from src.utils.rawconfig import TextSearchConfig

 def initialize_model():
-    "Initialize model for symetric semantic search. That is, where query of similar size to results"
+    "Initialize model for symmetric semantic search. That is, where query of similar size to results"
     torch.set_num_threads(4)
     bi_encoder = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')  # The encoder encodes all entries to use for semantic search
     top_k = 30  # Number of entries we want to retrieve with the bi-encoder
@@ -55,7 +53,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, v
         corpus_embeddings = bi_encoder.encode(entries, convert_to_tensor=True, show_progress_bar=True)
         torch.save(corpus_embeddings, get_absolute_path(embeddings_file))
         if verbose > 0:
-            print(f"Computed embeddings and save them to {embeddings_file}")
+            print(f"Computed embeddings and saved them to {embeddings_file}")

     return corpus_embeddings
@@ -143,22 +141,22 @@ def collate_results(hits, entries, count=5):
         in hits[0:count]]

-def setup(config: TextSearchConfig, regenerate: bool) -> TextSearchModel:
+def setup(config: TextSearchConfig, regenerate: bool, verbose: bool) -> TextSearchModel:
     # Initialize Model
     bi_encoder, cross_encoder, top_k = initialize_model()

     # Map notes in Org-Mode files to (compressed) JSONL formatted file
     if not resolve_absolute_path(config.compressed_jsonl).exists() or regenerate:
-        beancount_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, config.verbose)
+        beancount_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, verbose)

     # Extract Entries
-    entries = extract_entries(config.compressed_jsonl, config.verbose)
+    entries = extract_entries(config.compressed_jsonl, verbose)
     top_k = min(len(entries), top_k)

     # Compute or Load Embeddings
-    corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=config.verbose)
+    corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=verbose)

-    return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=config.verbose)
+    return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=verbose)

 if __name__ == '__main__':


@@ -1,12 +1,14 @@
 # Standard Packages
 import argparse
 import pathlib
-import json

 # External Packages
 import yaml

 # Internal Packages
-from src.utils.helpers import is_none_or_empty, get_absolute_path, resolve_absolute_path, get_from_dict, merge_dicts
+from src.utils.helpers import is_none_or_empty, get_absolute_path, resolve_absolute_path, merge_dicts
+from src.utils.rawconfig import FullConfig

 def cli(args=None):
     if is_none_or_empty(args):
@@ -35,12 +37,15 @@ def cli(args=None):
         with open(get_absolute_path(args.config_file), 'r', encoding='utf-8') as config_file:
             config_from_file = yaml.safe_load(config_file)
             args.config = merge_dicts(priority_dict=config_from_file, default_dict=args.config)
+            args.config = FullConfig.parse_obj(args.config)
+    else:
+        args.config = FullConfig.parse_obj(args.config)

     if args.org_files:
-        args.config['content-type']['org']['input-files'] = args.org_files
+        args.config.content_type.org.input_files = args.org_files

     if args.org_filter:
-        args.config['content-type']['org']['input-filter'] = args.org_filter
+        args.config.content_type.org.input_filter = args.org_filter

     return args
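Since cli() now runs the merged dictionary through FullConfig.parse_obj, callers get typed attribute access instead of nested dict lookups. A minimal sketch of that parse step in isolation, not part of this commit; the file path is illustrative:

    # Minimal sketch of the parse step cli() performs on the merged config dict.
    import yaml
    from src.utils.rawconfig import FullConfig

    with open('tests/data/config.yml', 'r', encoding='utf-8') as f:   # illustrative path
        raw = yaml.safe_load(f)

    config = FullConfig.parse_obj(raw)            # dash-keyed YAML -> typed pydantic model
    print(config.content_type.org.input_files)    # replaces config['content-type']['org']['input-files']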


@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from pathlib import Path

 # Internal Packages
-from src.utils.helpers import get_from_dict
+from src.utils.rawconfig import ConversationProcessorConfig

 class SearchType(str, Enum):
@@ -42,80 +42,15 @@ class SearchModels():
     image_search: ImageSearchModel = None

-class TextSearchConfig():
-    def __init__(self, input_files, input_filter, compressed_jsonl, embeddings_file, verbose):
-        self.input_files = input_files
-        self.input_filter = input_filter
-        self.compressed_jsonl = Path(compressed_jsonl)
-        self.embeddings_file = Path(embeddings_file)
+class ConversationProcessorConfigModel():
+    def __init__(self, processor_config: ConversationProcessorConfig, verbose: bool):
+        self.openai_api_key = processor_config.openai_api_key
+        self.conversation_logfile = Path(processor_config.conversation_logfile)
+        self.chat_session = ''
+        self.meta_log = []
         self.verbose = verbose
-
-    def create_from_dictionary(config, key_tree, verbose):
-        text_config = get_from_dict(config, *key_tree)
-        search_enabled = text_config and ('input-files' in text_config or 'input-filter' in text_config)
-        if not search_enabled:
-            return None
-
-        return TextSearchConfig(
-            input_files = text_config['input-files'],
-            input_filter = text_config['input-filter'],
-            compressed_jsonl = Path(text_config['compressed-jsonl']),
-            embeddings_file = Path(text_config['embeddings-file']),
-            verbose = verbose)
-
-class ImageSearchConfig():
-    def __init__(self, input_directory, embeddings_file, batch_size, use_xmp_metadata, verbose):
-        self.input_directory = input_directory
-        self.embeddings_file = Path(embeddings_file)
-        self.batch_size = batch_size
-        self.use_xmp_metadata = use_xmp_metadata
-        self.verbose = verbose
-
-    def create_from_dictionary(config, key_tree, verbose):
-        image_config = get_from_dict(config, *key_tree)
-        search_enabled = image_config and 'input-directory' in image_config
-        if not search_enabled:
-            return None
-
-        return ImageSearchConfig(
-            input_directory = Path(image_config['input-directory']),
-            embeddings_file = Path(image_config['embeddings-file']),
-            batch_size = image_config['batch-size'],
-            use_xmp_metadata = {'yes': True, 'no': False}[image_config['use-xmp-metadata']],
-            verbose = verbose)
-

 @dataclass
-class SearchConfig():
-    notes: TextSearchConfig = None
-    ledger: TextSearchConfig = None
-    music: TextSearchConfig = None
-    image: ImageSearchConfig = None
-
-class ConversationProcessorConfig():
-    def __init__(self, conversation_logfile, chat_session, meta_log, openai_api_key, verbose):
-        self.openai_api_key = openai_api_key
-        self.conversation_logfile = conversation_logfile
-        self.chat_session = chat_session
-        self.meta_log = meta_log
-        self.verbose = verbose
-
-    def create_from_dictionary(config, key_tree, verbose):
-        conversation_config = get_from_dict(config, *key_tree)
-        if not conversation_config:
-            return None
-
-        return ConversationProcessorConfig(
-            openai_api_key = conversation_config['openai-api-key'],
-            chat_session = '',
-            meta_log = [],
-            conversation_logfile = Path(conversation_config['conversation-logfile']),
-            verbose = verbose)
-
-@dataclass
-class ProcessorConfig():
-    conversation: ConversationProcessorConfig = None
+class ProcessorConfigModel():
+    conversation: ConversationProcessorConfigModel = None


@@ -4,6 +4,8 @@ import pathlib
 def is_none_or_empty(item):
     return item == None or (hasattr(item, '__iter__') and len(item) == 0)

+def to_snake_case_from_dash(item: str):
+    return item.replace('_', '-')

 def get_absolute_path(filepath):
     return str(pathlib.Path(filepath).expanduser().absolute())

src/utils/rawconfig.py (new file)

@@ -0,0 +1,62 @@
# System Packages
from pathlib import Path
from typing import List, Optional

# External Packages
from pydantic import BaseModel

# Internal Packages
from src.utils.helpers import to_snake_case_from_dash

class ConfigBase(BaseModel):
    class Config:
        alias_generator = to_snake_case_from_dash
        allow_population_by_field_name = True

class SearchConfig(ConfigBase):
    input_files: Optional[List[str]]
    input_filter: Optional[str]
    embeddings_file: Optional[Path]

class TextSearchConfig(ConfigBase):
    compressed_jsonl: Optional[Path]
    input_files: Optional[List[str]]
    input_filter: Optional[str]
    embeddings_file: Optional[Path]

class ImageSearchConfig(ConfigBase):
    use_xmp_metadata: Optional[str]
    batch_size: Optional[int]
    input_directory: Optional[Path]
    input_filter: Optional[str]
    embeddings_file: Optional[Path]

class ContentTypeConfig(ConfigBase):
    org: Optional[TextSearchConfig]
    ledger: Optional[TextSearchConfig]
    image: Optional[ImageSearchConfig]
    music: Optional[TextSearchConfig]

class AsymmetricConfig(ConfigBase):
    encoder: Optional[str]
    cross_encoder: Optional[str]

class ImageSearchTypeConfig(ConfigBase):
    encoder: Optional[str]

class SearchTypeConfig(ConfigBase):
    asymmetric: Optional[AsymmetricConfig]
    image: Optional[ImageSearchTypeConfig]

class ConversationProcessorConfig(ConfigBase):
    openai_api_key: Optional[str]
    conversation_logfile: Optional[str]
    conversation_history: Optional[str]

class ProcessorConfigModel(ConfigBase):
    conversation: Optional[ConversationProcessorConfig]

class FullConfig(ConfigBase):
    content_type: Optional[ContentTypeConfig]
    search_type: Optional[SearchTypeConfig]
    processor: Optional[ProcessorConfigModel]
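A small sketch, not part of this commit, of what ConfigBase buys here: fields are declared in snake_case, the alias generator maps them to the dash-separated keys used in the YAML config, and by_alias=True restores those keys on the way back out, which is what the /config POST handler above relies on. Assumes pydantic 1.*, as pinned in environment.yml:

    # Minimal sketch: dash-keyed dict in, snake_case attributes inside, dash-keyed JSON/YAML out.
    from src.utils.rawconfig import FullConfig

    raw = {
        'content-type': {                            # dash-separated keys, as in the YAML config
            'org': {
                'input-files': ['~/notes.org'],      # illustrative values
                'compressed-jsonl': '.notes.jsonl.gz',
                'embeddings-file': '.note_embeddings.pt',
            }
        }
    }

    config = FullConfig.parse_obj(raw)               # aliases populate the snake_case fields
    print(config.content_type.org.input_files)       # ['~/notes.org']
    print(config.json(by_alias=True))                # dash-separated keys again, ready for yaml.dump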


@@ -3,8 +3,8 @@ import pytest
 from pathlib import Path

 # Internal Packages
-from src.utils.config import SearchConfig, TextSearchConfig, ImageSearchConfig
 from src.search_type import asymmetric, image_search
+from src.utils.rawconfig import ContentTypeConfig, ImageSearchConfig, TextSearchConfig

 @pytest.fixture(scope='session')
@@ -12,44 +12,40 @@ def model_dir(tmp_path_factory):
     model_dir = tmp_path_factory.mktemp('data')

     # Generate Image Embeddings from Test Images
-    search_config = SearchConfig()
+    search_config = ContentTypeConfig()
     search_config.image = ImageSearchConfig(
-        input_directory = Path('tests/data'),
+        input_directory = 'tests/data',
         embeddings_file = model_dir.joinpath('.image_embeddings.pt'),
         batch_size = 10,
-        use_xmp_metadata = False,
-        verbose = 2)
+        use_xmp_metadata = False)

-    image_search.setup(search_config.image, regenerate=False)
+    image_search.setup(search_config.image, regenerate=False, verbose=True)

     # Generate Notes Embeddings from Test Notes
-    search_config.notes = TextSearchConfig(
-        input_files = [Path('tests/data/main_readme.org'), Path('tests/data/interface_emacs_readme.org')],
+    search_config.org = TextSearchConfig(
+        input_files = ['tests/data/main_readme.org', 'tests/data/interface_emacs_readme.org'],
         input_filter = None,
         compressed_jsonl = model_dir.joinpath('.notes.jsonl.gz'),
-        embeddings_file = model_dir.joinpath('.note_embeddings.pt'),
-        verbose = 0)
+        embeddings_file = model_dir.joinpath('.note_embeddings.pt'))

-    asymmetric.setup(search_config.notes, regenerate=False)
+    asymmetric.setup(search_config.org, regenerate=False, verbose=True)

     return model_dir

 @pytest.fixture(scope='session')
 def search_config(model_dir):
-    search_config = SearchConfig()
-    search_config.notes = TextSearchConfig(
-        input_files = [Path('tests/data/main_readme.org'), Path('tests/data/interface_emacs_readme.org')],
+    search_config = ContentTypeConfig()
+    search_config.org = TextSearchConfig(
+        input_files = ['tests/data/main_readme.org', 'tests/data/interface_emacs_readme.org'],
        input_filter = None,
         compressed_jsonl = model_dir.joinpath('.notes.jsonl.gz'),
-        embeddings_file = model_dir.joinpath('.note_embeddings.pt'),
-        verbose = 2)
+        embeddings_file = model_dir.joinpath('.note_embeddings.pt'))

     search_config.image = ImageSearchConfig(
-        input_directory = Path('tests/data'),
-        embeddings_file = Path('tests/data/.image_embeddings.pt'),
+        input_directory = 'tests/data',
+        embeddings_file = 'tests/data/.image_embeddings.pt',
         batch_size = 10,
-        use_xmp_metadata = False,
-        verbose = 2)
+        use_xmp_metadata = False)

     return search_config


@@ -8,7 +8,7 @@ from src.search_type import asymmetric
 def test_asymmetric_setup(search_config):
     # Act
     # Regenerate notes embeddings during asymmetric setup
-    notes_model = asymmetric.setup(search_config.notes, regenerate=True)
+    notes_model = asymmetric.setup(search_config.org, regenerate=True)

     # Assert
     assert len(notes_model.entries) == 10
@@ -18,7 +18,7 @@ def test_asymmetric_setup(search_config):
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_search(search_config):
     # Arrange
-    model.notes_search = asymmetric.setup(search_config.notes, regenerate=False)
+    model.notes_search = asymmetric.setup(search_config.org, regenerate=False)
     query = "How to git install application?"

     # Act


@@ -40,7 +40,7 @@ def test_cli_config_from_file():
     assert actual_args.config_file == Path('tests/data/config.yml')
     assert actual_args.regenerate == True
     assert actual_args.config is not None
-    assert actual_args.config['content-type']['org']['input-files'] == ['~/first_from_config.org', '~/second_from_config.org']
+    assert actual_args.config.content_type.org.input_files == ['~/first_from_config.org', '~/second_from_config.org']
     assert actual_args.verbose == 3
@@ -54,7 +54,7 @@ def test_cli_config_from_cmd_args():
     assert actual_args.org_files == ['first.org']
     assert actual_args.config_file is None
     assert actual_args.config is not None
-    assert actual_args.config['content-type']['org']['input-files'] == ['first.org']
+    assert actual_args.config.content_type.org.input_files == ['first.org']

 # ----------------------------------------------------------------------------------------------------
@@ -67,4 +67,4 @@ def test_cli_config_from_cmd_args_override_config_file():
     assert actual_args.org_files == ['first.org']
     assert actual_args.config_file == Path('tests/data/config.yml')
     assert actual_args.config is not None
-    assert actual_args.config['content-type']['org']['input-files'] == ['first.org']
+    assert actual_args.config.content_type.org.input_files == ['first.org']


@@ -3,18 +3,19 @@ from pathlib import Path
 # External Packages
 from fastapi.testclient import TestClient
+import pytest

 # Internal Packages
-from src.main import app, model, search_config as main_search_config
+from src.main import app, model, config
 from src.search_type import asymmetric, image_search
 from src.utils.helpers import resolve_absolute_path
+from src.utils.rawconfig import ContentTypeConfig

 # Arrange
 # ----------------------------------------------------------------------------------------------------
 client = TestClient(app)

 # Test
 # ----------------------------------------------------------------------------------------------------
 def test_search_with_invalid_search_type():
@@ -29,9 +30,10 @@ def test_search_with_invalid_search_type():
 # ----------------------------------------------------------------------------------------------------
-def test_search_with_valid_search_type(search_config):
+def test_search_with_valid_search_type(search_config: ContentTypeConfig):
     # Arrange
-    main_search_config.image = search_config.image
+    config.content_type = search_config
+    # config.content_type.image = search_config.image
     for search_type in ["notes", "ledger", "music", "image"]:
         # Act
         response = client.get(f"/search?q=random&t={search_type}")
@@ -49,9 +51,9 @@ def test_regenerate_with_invalid_search_type():
 # ----------------------------------------------------------------------------------------------------
-def test_regenerate_with_valid_search_type(search_config):
+def test_regenerate_with_valid_search_type(search_config: ContentTypeConfig):
     # Arrange
-    main_search_config.image = search_config.image
+    config.content_type = search_config
     for search_type in ["notes", "ledger", "music", "image"]:
         # Act
         response = client.get(f"/regenerate?t={search_type}")
@@ -60,9 +62,10 @@ def test_regenerate_with_valid_search_type(search_config):
 # ----------------------------------------------------------------------------------------------------
-def test_image_search(search_config):
+@pytest.mark.skip(reason="Flaky test. Search doesn't always return expected image path.")
+def test_image_search(search_config: ContentTypeConfig):
     # Arrange
-    main_search_config.image = search_config.image
+    config.content_type = search_config
     model.image_search = image_search.setup(search_config.image, regenerate=False)
     query_expected_image_pairs = [("brown kitten next to fallen plant", "kitten_park.jpg"),
                                   ("a horse and dog on a leash", "horse_dog.jpg"),
@@ -82,9 +85,9 @@ def test_image_search(search_config):
 # ----------------------------------------------------------------------------------------------------
-def test_notes_search(search_config):
+def test_notes_search(search_config: ContentTypeConfig):
     # Arrange
-    model.notes_search = asymmetric.setup(search_config.notes, regenerate=False)
+    model.notes_search = asymmetric.setup(search_config.org, regenerate=False)
     user_query = "How to git install application?"

     # Act
@@ -98,9 +101,9 @@ def test_notes_search(search_config):
 # ----------------------------------------------------------------------------------------------------
-def test_notes_search_with_include_filter(search_config):
+def test_notes_search_with_include_filter(search_config: ContentTypeConfig):
     # Arrange
-    model.notes_search = asymmetric.setup(search_config.notes, regenerate=False)
+    model.notes_search = asymmetric.setup(search_config.org, regenerate=False)
     user_query = "How to git install application? +Emacs"

     # Act
@@ -114,9 +117,9 @@ def test_notes_search_with_include_filter(search_config):
 # ----------------------------------------------------------------------------------------------------
-def test_notes_search_with_exclude_filter(search_config):
+def test_notes_search_with_exclude_filter(search_config: ContentTypeConfig):
     # Arrange
-    model.notes_search = asymmetric.setup(search_config.notes, regenerate=False)
+    model.notes_search = asymmetric.setup(search_config.org, regenerate=False)
     user_query = "How to git install application? -clone"

     # Act

views/config.html (new file)

@@ -0,0 +1,12 @@
<!DOCTYPE html>
<head>
    <title>Set directories for your config file.</title>
    <link rel="stylesheet" href="views/style.css">
</head>
<body>
    <form id="config-form">
    </form>
    <button id="config-regenerate">regenerate</button>
</body>
<script src="views/scripts/config.js"></script>
</html>

views/scripts/config.js (new file)

@@ -0,0 +1,124 @@
// Retrieve elements from the DOM.
var showConfig = document.getElementById("show-config");
var configForm = document.getElementById("config-form");
var regenerateButton = document.getElementById("config-regenerate");

// Global variables.
var rawConfig = {};
var emptyValueDefault = "🖊️";

/**
 * Fetch the existing config file.
 */
fetch("/config")
    .then(response => response.json())
    .then(data => {
        rawConfig = data;

        configForm.style.display = "block";
        processChildren(configForm, data);

        var submitButton = document.createElement("button");
        submitButton.type = "submit";
        submitButton.innerHTML = "update";
        configForm.appendChild(submitButton);

        // The config form's submit handler.
        configForm.addEventListener("submit", (event) => {
            event.preventDefault();
            console.log(rawConfig);
            const response = fetch("/config", {
                method: "POST",
                credentials: "same-origin",
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify(rawConfig)
            }).then(response => response.json())
                .then((data) => console.log(data));
        });
    });

/**
 * The click handler for the Regenerate button.
 */
regenerateButton.addEventListener("click", (event) => {
    event.preventDefault();
    regenerateButton.style.cursor = "progress";
    regenerateButton.disabled = true;
    fetch("/regenerate")
        .then(response => response.json())
        .then(data => {
            regenerateButton.style.cursor = "pointer";
            regenerateButton.disabled = false;
            console.log(data);
        });
})

/**
 * Adds config elements to the DOM representing the sub-components
 * of one of the fields in the raw config file.
 * @param {the parent element} element
 * @param {the data to be rendered for this element and its children} data
 */
function processChildren(element, data) {
    for (let key in data) {
        var child = document.createElement("div");
        child.id = key;
        child.className = "config-element";
        child.appendChild(document.createTextNode(key + ": "));
        if (data[key] === Object(data[key]) && !Array.isArray(data[key])) {
            child.className += " config-title";
            processChildren(child, data[key]);
        } else {
            child.appendChild(createValueNode(data, key));
        }
        element.appendChild(child);
    }
}

/**
 * Takes an element, and replaces it with an editable
 * element with the same data in place.
 * @param {the original element to be replaced} original
 * @param {the source data to be rendered for the new element} data
 * @param {the key for this input in the source data} key
 */
function makeElementEditable(original, data, key) {
    original.addEventListener("click", () => {
        var inputNewText = document.createElement("input");
        inputNewText.type = "text";
        inputNewText.className = "config-element-edit";
        inputNewText.value = (original.textContent == emptyValueDefault) ? "" : original.textContent;
        fixInputOnFocusOut(inputNewText, data, key);
        original.parentNode.replaceChild(inputNewText, original);
        inputNewText.focus();
    });
}

/**
 * Creates a node corresponding to the value of a config element.
 * @param {the source data} data
 * @param {the key corresponding to this node's data} key
 * @returns A new element which corresponds to the value in some field.
 */
function createValueNode(data, key) {
    var valueElement = document.createElement("span");
    valueElement.className = "config-element-value";
    valueElement.textContent = !data[key] ? emptyValueDefault : data[key];
    makeElementEditable(valueElement, data, key);
    return valueElement;
}

/**
 * Replaces an existing input element with an element with the same data, which is not an input.
 * If the input data for this element was changed, update the corresponding data in the raw config.
 * @param {the original element to be replaced} original
 * @param {the source data} data
 * @param {the key corresponding to this node's data} key
 */
function fixInputOnFocusOut(original, data, key) {
    original.addEventListener("blur", () => {
        data[key] = (original.value != emptyValueDefault) ? original.value : "";
        original.parentNode.replaceChild(createValueNode(data, key), original);
    })
}

views/style.css (new file)

@@ -0,0 +1,29 @@
:root {
    --primary-color: #ffffff;
    --bold-color: #2073ee;
    --complementary-color: #124408;
    --accent-color-0: #57f0b5;
}

input[type=text] {
    width: 40%;
}

div.config-element {
    color: var(--bold-color);
    margin: 8px;
}

div.config-title {
    font-weight: bold;
}

span.config-element-value {
    color: var(--complementary-color);
    font-weight: normal;
    cursor: pointer;
}

button {
    cursor: pointer;
}