diff --git a/README.org b/README.org index 20f3301a..478fba82 100644 --- a/README.org +++ b/README.org @@ -50,8 +50,9 @@ #+end_src **** 3. Configure - - Configure application search types and their underlying data source/files in ~sample_config.yml~ - - Use the ~sample_config.yml~ as reference + - Configure files/directories to search in ~content-type~ section of ~sample_config.yml~ + - To run application on test data, update file paths containing ~/data/~ to ~tests/data/~ in ~sample_config.yml~ + - Example replace ~/data/notes/*.org~ with ~tests/data/notes/*.org~ **** 4. Run Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML diff --git a/docker-compose.yml b/docker-compose.yml index c1115d87..e9b1dbfb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,13 +20,14 @@ services: # points to the files you want to index. # The path of the mounted directory (right hand side), # must match the path prefix in your config file. - - ./tests/data/:/data/notes/ - - ./tests/data/:/data/images/ - - ./tests/data/:/data/ledger/ - - ./tests/data/:/data/music/ - # It's ok if you don't have existing embeddings. - # You can set this volume to point to an empty folder. + - ./tests/data/notes/:/data/notes/ + - ./tests/data/images/:/data/images/ + - ./tests/data/ledger/:/data/ledger/ + - ./tests/data/music/:/data/music/ + # Embeddings and models are populated after the first run + # You can set these volumes to point to empty directories on host - ./tests/data/embeddings/:/data/generated/ + - ./tests/data/models/:/data/models/ # Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/ - command: --host="0.0.0.0" --port=8000 -c=docker_sample_config.yml -vv + command: --host="0.0.0.0" --port=8000 -c=sample_config.yml -vv diff --git a/docker_sample_config.yml b/docker_sample_config.yml deleted file mode 100644 index e9e23b75..00000000 --- a/docker_sample_config.yml +++ /dev/null @@ -1,47 +0,0 @@ -content-type: - # The /data/folder/ prefix to the folders is here because this is - # the directory to which the local files are copied in the docker-compose. - # If changing, the docker-compose volumes should also be changed to match. - org: - input-files: null - input-filter: "/data/notes/*.org" - compressed-jsonl: "/data/generated/.notes.json.gz" - embeddings-file: "/data/generated/.note_embeddings.pt" - - ledger: - input-files: null - input-filter: /data/ledger/*.beancount - compressed-jsonl: /data/generated/.transactions.jsonl.gz - embeddings-file: /data/generated/.transaction_embeddings.pt - - image: - input-directory: "/data/images/" - embeddings-file: "/data/generated/.image_embeddings.pt" - batch-size: 50 - use-xmp-metadata: true - - music: - input-files: null - input-filter: "/data/music/*.org" - compressed-jsonl: "/data/generated/.songs.jsonl.gz" - embeddings-file: "/data/generated/.song_embeddings.pt" - -search-type: - symmetric: - encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2" - cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2" - model_directory: "/data/models/.symmetric" - - asymmetric: - encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3" - cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2" - model_directory: "/data/models/.asymmetric" - - image: - encoder: "clip-ViT-B-32" - model_directory: "/data/models/.image_encoder" - -processor: - conversation: - openai-api-key: null - conversation-logfile: "/data/conversation/.conversation_logs.json" \ No newline at end of file diff --git a/sample_config.yml b/sample_config.yml index 8805c984..b16759e8 100644 --- a/sample_config.yml +++ b/sample_config.yml @@ -1,44 +1,47 @@ content-type: + # The /data/folder/ prefix to the folders is here because this is + # the directory to which the local files are copied in the docker-compose. + # If changing, the docker-compose volumes should also be changed to match. org: - input-files: ["tests/data/main_readme.org", "tests/data/interface_emacs_readme.org"] - input-filter: null - compressed-jsonl: "tests/data/.notes.json.gz" - embeddings-file: "tests/data/.note_embeddings.pt" + input-files: null + input-filter: "/data/notes/*.org" + compressed-jsonl: "/data/generated/notes.json.gz" + embeddings-file: "/data/generated/note_embeddings.pt" ledger: input-files: null - input-filter: tests/data/*.beancount - compressed-jsonl: tests/data/.transactions.jsonl.gz - embeddings-file: tests/data/.transaction_embeddings.pt + input-filter: /data/ledger/*.beancount + compressed-jsonl: /data/generated/transactions.jsonl.gz + embeddings-file: /data/generated/transaction_embeddings.pt image: - input-directory: "tests/data" - embeddings-file: "tests/data/.image_embeddings.pt" + input-directory: "/data/images/" + embeddings-file: "/data/generated/image_embeddings.pt" batch-size: 50 - use-xmp-metadata: false + use-xmp-metadata: true music: - input-files: ["tests/data/music.org"] + input-files: ["/data/music/music.org"] input-filter: null - compressed-jsonl: "tests/data/.songs.jsonl.gz" - embeddings-file: "tests/data/.song_embeddings.pt" + compressed-jsonl: "/data/generated/songs.jsonl.gz" + embeddings-file: "/data/generated/song_embeddings.pt" search-type: symmetric: encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2" cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2" - model_directory: "tests/data/.symmetric" + model_directory: "/data/models/symmetric" asymmetric: encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3" cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2" - model_directory: "tests/data/.asymmetric" + model_directory: "/data/models/asymmetric" image: encoder: "clip-ViT-B-32" - model_directory: "tests/data/.image_encoder" + model_directory: "/data/models/image_encoder" processor: conversation: openai-api-key: null - conversation-logfile: "tests/data/.conversation_logs.json" \ No newline at end of file + conversation-logfile: "/data/generated/conversation_logs.json" \ No newline at end of file