mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Merge sample_config, docker_sample_config yml into a single sample_config.yml
- Update readme to indicate how to update the new sample_config to run on test data
This commit is contained in:
parent
2bc2780501
commit
3e889760c7
4 changed files with 31 additions and 73 deletions
|
@ -50,8 +50,9 @@
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
**** 3. Configure
|
**** 3. Configure
|
||||||
- Configure application search types and their underlying data source/files in ~sample_config.yml~
|
- Configure files/directories to search in ~content-type~ section of ~sample_config.yml~
|
||||||
- Use the ~sample_config.yml~ as reference
|
- To run the application on test data, change file paths starting with ~/data/~ to ~tests/data/~ in ~sample_config.yml~
|
||||||
|
- Example: replace ~/data/notes/*.org~ with ~tests/data/notes/*.org~
|
||||||
|
|
||||||
**** 4. Run
|
**** 4. Run
|
||||||
Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML
|
Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML
|
||||||
|
|
|
@ -20,13 +20,14 @@ services:
|
||||||
# points to the files you want to index.
|
# points to the files you want to index.
|
||||||
# The path of the mounted directory (right hand side),
|
# The path of the mounted directory (right hand side),
|
||||||
# must match the path prefix in your config file.
|
# must match the path prefix in your config file.
|
||||||
- ./tests/data/:/data/notes/
|
- ./tests/data/notes/:/data/notes/
|
||||||
- ./tests/data/:/data/images/
|
- ./tests/data/images/:/data/images/
|
||||||
- ./tests/data/:/data/ledger/
|
- ./tests/data/ledger/:/data/ledger/
|
||||||
- ./tests/data/:/data/music/
|
- ./tests/data/music/:/data/music/
|
||||||
# It's ok if you don't have existing embeddings.
|
# Embeddings and models are populated after the first run
|
||||||
# You can set this volume to point to an empty folder.
|
# You can set these volumes to point to empty directories on the host
|
||||||
- ./tests/data/embeddings/:/data/generated/
|
- ./tests/data/embeddings/:/data/generated/
|
||||||
|
- ./tests/data/models/:/data/models/
|
||||||
|
|
||||||
# Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
|
# Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
|
||||||
command: --host="0.0.0.0" --port=8000 -c=docker_sample_config.yml -vv
|
command: --host="0.0.0.0" --port=8000 -c=sample_config.yml -vv
|
||||||
|
|
|
@ -1,47 +0,0 @@
|
||||||
content-type:
|
|
||||||
# The /data/folder/ prefix to the folders is here because this is
|
|
||||||
# the directory to which the local files are copied in the docker-compose.
|
|
||||||
# If changing, the docker-compose volumes should also be changed to match.
|
|
||||||
org:
|
|
||||||
input-files: null
|
|
||||||
input-filter: "/data/notes/*.org"
|
|
||||||
compressed-jsonl: "/data/generated/.notes.json.gz"
|
|
||||||
embeddings-file: "/data/generated/.note_embeddings.pt"
|
|
||||||
|
|
||||||
ledger:
|
|
||||||
input-files: null
|
|
||||||
input-filter: /data/ledger/*.beancount
|
|
||||||
compressed-jsonl: /data/generated/.transactions.jsonl.gz
|
|
||||||
embeddings-file: /data/generated/.transaction_embeddings.pt
|
|
||||||
|
|
||||||
image:
|
|
||||||
input-directory: "/data/images/"
|
|
||||||
embeddings-file: "/data/generated/.image_embeddings.pt"
|
|
||||||
batch-size: 50
|
|
||||||
use-xmp-metadata: true
|
|
||||||
|
|
||||||
music:
|
|
||||||
input-files: null
|
|
||||||
input-filter: "/data/music/*.org"
|
|
||||||
compressed-jsonl: "/data/generated/.songs.jsonl.gz"
|
|
||||||
embeddings-file: "/data/generated/.song_embeddings.pt"
|
|
||||||
|
|
||||||
search-type:
|
|
||||||
symmetric:
|
|
||||||
encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2"
|
|
||||||
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
|
||||||
model_directory: "/data/models/.symmetric"
|
|
||||||
|
|
||||||
asymmetric:
|
|
||||||
encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3"
|
|
||||||
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
|
||||||
model_directory: "/data/models/.asymmetric"
|
|
||||||
|
|
||||||
image:
|
|
||||||
encoder: "clip-ViT-B-32"
|
|
||||||
model_directory: "/data/models/.image_encoder"
|
|
||||||
|
|
||||||
processor:
|
|
||||||
conversation:
|
|
||||||
openai-api-key: null
|
|
||||||
conversation-logfile: "/data/conversation/.conversation_logs.json"
|
|
|
@ -1,44 +1,47 @@
|
||||||
content-type:
|
content-type:
|
||||||
|
# The /data/folder/ prefix to the folders is here because this is
|
||||||
|
# the directory where the local files are mounted by docker-compose.
|
||||||
|
# If you change this prefix, update the docker-compose volumes to match.
|
||||||
org:
|
org:
|
||||||
input-files: ["tests/data/main_readme.org", "tests/data/interface_emacs_readme.org"]
|
input-files: null
|
||||||
input-filter: null
|
input-filter: "/data/notes/*.org"
|
||||||
compressed-jsonl: "tests/data/.notes.json.gz"
|
compressed-jsonl: "/data/generated/notes.json.gz"
|
||||||
embeddings-file: "tests/data/.note_embeddings.pt"
|
embeddings-file: "/data/generated/note_embeddings.pt"
|
||||||
|
|
||||||
ledger:
|
ledger:
|
||||||
input-files: null
|
input-files: null
|
||||||
input-filter: tests/data/*.beancount
|
input-filter: /data/ledger/*.beancount
|
||||||
compressed-jsonl: tests/data/.transactions.jsonl.gz
|
compressed-jsonl: /data/generated/transactions.jsonl.gz
|
||||||
embeddings-file: tests/data/.transaction_embeddings.pt
|
embeddings-file: /data/generated/transaction_embeddings.pt
|
||||||
|
|
||||||
image:
|
image:
|
||||||
input-directory: "tests/data"
|
input-directory: "/data/images/"
|
||||||
embeddings-file: "tests/data/.image_embeddings.pt"
|
embeddings-file: "/data/generated/image_embeddings.pt"
|
||||||
batch-size: 50
|
batch-size: 50
|
||||||
use-xmp-metadata: false
|
use-xmp-metadata: true
|
||||||
|
|
||||||
music:
|
music:
|
||||||
input-files: ["tests/data/music.org"]
|
input-files: ["/data/music/music.org"]
|
||||||
input-filter: null
|
input-filter: null
|
||||||
compressed-jsonl: "tests/data/.songs.jsonl.gz"
|
compressed-jsonl: "/data/generated/songs.jsonl.gz"
|
||||||
embeddings-file: "tests/data/.song_embeddings.pt"
|
embeddings-file: "/data/generated/song_embeddings.pt"
|
||||||
|
|
||||||
search-type:
|
search-type:
|
||||||
symmetric:
|
symmetric:
|
||||||
encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2"
|
encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2"
|
||||||
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||||
model_directory: "tests/data/.symmetric"
|
model_directory: "/data/models/symmetric"
|
||||||
|
|
||||||
asymmetric:
|
asymmetric:
|
||||||
encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3"
|
encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3"
|
||||||
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||||
model_directory: "tests/data/.asymmetric"
|
model_directory: "/data/models/asymmetric"
|
||||||
|
|
||||||
image:
|
image:
|
||||||
encoder: "clip-ViT-B-32"
|
encoder: "clip-ViT-B-32"
|
||||||
model_directory: "tests/data/.image_encoder"
|
model_directory: "/data/models/image_encoder"
|
||||||
|
|
||||||
processor:
|
processor:
|
||||||
conversation:
|
conversation:
|
||||||
openai-api-key: null
|
openai-api-key: null
|
||||||
conversation-logfile: "tests/data/.conversation_logs.json"
|
conversation-logfile: "/data/generated/conversation_logs.json"
|
Loading…
Reference in a new issue