Merge sample_config.yml and docker_sample_config.yml into a single sample_config.yml

- Update readme to indicate how to update the new sample_config to run on test data
This commit is contained in:
Debanjum Singh Solanky 2022-01-29 01:32:12 -05:00
parent 2bc2780501
commit 3e889760c7
4 changed files with 31 additions and 73 deletions

View file

@ -50,8 +50,9 @@
#+end_src #+end_src
**** 3. Configure **** 3. Configure
- Configure application search types and their underlying data source/files in ~sample_config.yml~ - Configure files/directories to search in ~content-type~ section of ~sample_config.yml~
- Use the ~sample_config.yml~ as reference - To run application on test data, update file paths containing ~/data/~ to ~tests/data/~ in ~sample_config.yml~
- For example, replace ~/data/notes/*.org~ with ~tests/data/notes/*.org~
**** 4. Run **** 4. Run
Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML

View file

@ -20,13 +20,14 @@ services:
# points to the files you want to index. # points to the files you want to index.
# The path of the mounted directory (right hand side), # The path of the mounted directory (right hand side),
# must match the path prefix in your config file. # must match the path prefix in your config file.
- ./tests/data/:/data/notes/ - ./tests/data/notes/:/data/notes/
- ./tests/data/:/data/images/ - ./tests/data/images/:/data/images/
- ./tests/data/:/data/ledger/ - ./tests/data/ledger/:/data/ledger/
- ./tests/data/:/data/music/ - ./tests/data/music/:/data/music/
# It's ok if you don't have existing embeddings. # Embeddings and models are populated after the first run
# You can set this volume to point to an empty folder. # You can set these volumes to point to empty directories on host
- ./tests/data/embeddings/:/data/generated/ - ./tests/data/embeddings/:/data/generated/
- ./tests/data/models/:/data/models/
# Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/ # Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
command: --host="0.0.0.0" --port=8000 -c=docker_sample_config.yml -vv command: --host="0.0.0.0" --port=8000 -c=sample_config.yml -vv

View file

@ -1,47 +0,0 @@
# Data sources to index, grouped by content type.
content-type:
  # The /data/<folder>/ prefix on these paths is the container directory onto
  # which docker-compose mounts the local files.
  # If you change it, update the docker-compose volumes to match.
  org:
    input-files: null
    input-filter: "/data/notes/*.org"
    compressed-jsonl: "/data/generated/.notes.json.gz"
    embeddings-file: "/data/generated/.note_embeddings.pt"

  ledger:
    input-files: null
    input-filter: "/data/ledger/*.beancount"
    compressed-jsonl: "/data/generated/.transactions.jsonl.gz"
    embeddings-file: "/data/generated/.transaction_embeddings.pt"

  image:
    input-directory: "/data/images/"
    embeddings-file: "/data/generated/.image_embeddings.pt"
    batch-size: 50
    use-xmp-metadata: true

  music:
    input-files: null
    input-filter: "/data/music/*.org"
    compressed-jsonl: "/data/generated/.songs.jsonl.gz"
    embeddings-file: "/data/generated/.song_embeddings.pt"

# Encoder models for each search type, and the directory each model is stored in.
search-type:
  symmetric:
    encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2"
    cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
    model_directory: "/data/models/.symmetric"

  asymmetric:
    encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3"
    cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
    model_directory: "/data/models/.asymmetric"

  image:
    encoder: "clip-ViT-B-32"
    model_directory: "/data/models/.image_encoder"

processor:
  conversation:
    # NOTE(review): presumably left null unless the conversation processor is
    # used; confirm against the application's config loader.
    openai-api-key: null
    conversation-logfile: "/data/conversation/.conversation_logs.json"

View file

@ -1,44 +1,47 @@
content-type: content-type:
# The /data/<folder>/ prefix on these paths is the container directory onto
# which docker-compose mounts the local files.
# If you change it, update the docker-compose volumes to match.
org: org:
input-files: ["tests/data/main_readme.org", "tests/data/interface_emacs_readme.org"] input-files: null
input-filter: null input-filter: "/data/notes/*.org"
compressed-jsonl: "tests/data/.notes.json.gz" compressed-jsonl: "/data/generated/notes.json.gz"
embeddings-file: "tests/data/.note_embeddings.pt" embeddings-file: "/data/generated/note_embeddings.pt"
ledger: ledger:
input-files: null input-files: null
input-filter: tests/data/*.beancount input-filter: /data/ledger/*.beancount
compressed-jsonl: tests/data/.transactions.jsonl.gz compressed-jsonl: /data/generated/transactions.jsonl.gz
embeddings-file: tests/data/.transaction_embeddings.pt embeddings-file: /data/generated/transaction_embeddings.pt
image: image:
input-directory: "tests/data" input-directory: "/data/images/"
embeddings-file: "tests/data/.image_embeddings.pt" embeddings-file: "/data/generated/image_embeddings.pt"
batch-size: 50 batch-size: 50
use-xmp-metadata: false use-xmp-metadata: true
music: music:
input-files: ["tests/data/music.org"] input-files: ["/data/music/music.org"]
input-filter: null input-filter: null
compressed-jsonl: "tests/data/.songs.jsonl.gz" compressed-jsonl: "/data/generated/songs.jsonl.gz"
embeddings-file: "tests/data/.song_embeddings.pt" embeddings-file: "/data/generated/song_embeddings.pt"
search-type: search-type:
symmetric: symmetric:
encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2" encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2"
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2" cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
model_directory: "tests/data/.symmetric" model_directory: "/data/models/symmetric"
asymmetric: asymmetric:
encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3" encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3"
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2" cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
model_directory: "tests/data/.asymmetric" model_directory: "/data/models/asymmetric"
image: image:
encoder: "clip-ViT-B-32" encoder: "clip-ViT-B-32"
model_directory: "tests/data/.image_encoder" model_directory: "/data/models/image_encoder"
processor: processor:
conversation: conversation:
openai-api-key: null openai-api-key: null
conversation-logfile: "tests/data/.conversation_logs.json" conversation-logfile: "/data/generated/conversation_logs.json"