mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Merge pull request #21 from debanjum/saba/dockerize
Add Docker support to semantic-search
This commit is contained in:
commit
d943d2be80
6 changed files with 169 additions and 31 deletions
29
Dockerfile
Normal file
29
Dockerfile
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
# syntax=docker/dockerfile:1
|
||||||
|
FROM continuumio/miniconda3:latest
|
||||||
|
|
||||||
|
# Install system dependencies.
|
||||||
|
RUN apt-get update -y && \
|
||||||
|
apt-get -y install libimage-exiftool-perl
|
||||||
|
|
||||||
|
# Add the local code to the /app directory and set it to be the working directory.
|
||||||
|
# Since we mount the /app directory as a volume in docker-compose.yml, this
|
||||||
|
# allows us to automatically update the code in the Docker image when it's changed.
|
||||||
|
ADD . /app
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Get the arguments from the docker-compose environment.
|
||||||
|
ARG PORT
|
||||||
|
EXPOSE ${PORT}
|
||||||
|
|
||||||
|
# Create the conda environment.
|
||||||
|
RUN conda env create -f environment.yml
|
||||||
|
|
||||||
|
# Use the conda environment we created to run the application.
|
||||||
|
# To enable the conda env, we cannot simply RUN `conda activate semantic-search`,
|
||||||
|
# since each RUN command in a Dockerfile is a separate bash shell.
|
||||||
|
# The environment would not carry forward.
|
||||||
|
# Instead, we'll use `conda run` to run the application.
|
||||||
|
# There are more arguments required for the script to run,
|
||||||
|
# but these should be passed in through the docker-compose.yml file.
|
||||||
|
ENTRYPOINT ["conda", "run", "--no-capture-output", "--name", "semantic-search", \
|
||||||
|
"python3", "-m", "src.main"]
|
88
README.org
88
README.org
|
@ -5,32 +5,52 @@
|
||||||
|
|
||||||
All data is processed locally. User can interface with semantic-search app via [[./src/interface/emacs/semantic-search.el][Emacs]], API or Commandline
|
All data is processed locally. User can interface with semantic-search app via [[./src/interface/emacs/semantic-search.el][Emacs]], API or Commandline
|
||||||
|
|
||||||
** Dependencies
|
** Setup
|
||||||
- Python3
|
|
||||||
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
|
|
||||||
|
|
||||||
** Install
|
*** Setup using Docker
|
||||||
#+begin_src shell
|
|
||||||
git clone https://github.com/debanjum/semantic-search && cd semantic-search
|
|
||||||
conda env create -f environment.yml
|
|
||||||
conda activate semantic-search
|
|
||||||
#+end_src
|
|
||||||
|
|
||||||
*** Install Environmental Dependencies
|
**** 1. Clone Repository
|
||||||
#+begin_src shell
|
#+begin_src shell
|
||||||
sudo apt-get -y install libimage-exiftool-perl
|
git clone https://github.com/debanjum/semantic-search && cd semantic-search
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
** Configure
|
**** 2. Configure
|
||||||
Configure application search types and their underlying data source/files in ~sample_config.yml~
|
Add Content Directories for Semantic Search to Docker-Compose
|
||||||
Use the ~sample_config.yml~ as reference
|
Update [[./docker-compose.yml][docker-compose.yml]] to mount your images, org-mode notes, ledger/beancount directories
|
||||||
|
If required, edit config settings in [[./docker_sample_config.yml][docker_sample_config.yml]].
|
||||||
|
|
||||||
** Run
|
**** 3. Run
|
||||||
Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML
|
#+begin_src shell
|
||||||
|
docker-compose up -d
|
||||||
|
#+end_src
|
||||||
|
|
||||||
#+begin_src shell
|
*** Setup on Local Machine
|
||||||
python3 -m src.main -c=sample_config.yml -vv
|
|
||||||
#+end_src
|
**** 1. Install Dependencies
|
||||||
|
1. Install Python3 [Required]
|
||||||
|
2. [[https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html][Install Conda]] [Required]
|
||||||
|
3. Install Exiftool [Optional]
|
||||||
|
#+begin_src shell
|
||||||
|
sudo apt-get -y install libimage-exiftool-perl
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
**** 2. Install Semantic Search
|
||||||
|
#+begin_src shell
|
||||||
|
git clone https://github.com/debanjum/semantic-search && cd semantic-search
|
||||||
|
conda env create -f environment.yml
|
||||||
|
conda activate semantic-search
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
**** 3. Configure
|
||||||
|
Configure application search types and their underlying data source/files in ~sample_config.yml~
|
||||||
|
Use the ~sample_config.yml~ as reference
|
||||||
|
|
||||||
|
**** 4. Run
|
||||||
|
Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML
|
||||||
|
|
||||||
|
#+begin_src shell
|
||||||
|
python3 -m src.main -c=sample_config.yml -vv
|
||||||
|
#+end_src
|
||||||
|
|
||||||
** Use
|
** Use
|
||||||
- *Semantic Search via Emacs*
|
- *Semantic Search via Emacs*
|
||||||
|
@ -39,19 +59,29 @@
|
||||||
|
|
||||||
- *Semantic Search via API*
|
- *Semantic Search via API*
|
||||||
- Query: ~GET~ [[http://localhost:8000/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:8000/search?q="What is the meaning of life"&t=notes]]
|
- Query: ~GET~ [[http://localhost:8000/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:8000/search?q="What is the meaning of life"&t=notes]]
|
||||||
- Regenerate Embeddings: ~GET~ [[http://localhost:8000/regenerate][http://localhost:8000/regenerate?t=image]]
|
- Regenerate Embeddings: ~GET~ [[http://localhost:8000/regenerate][http://localhost:8000/regenerate]]
|
||||||
- [[http://localhost:8000/docs][Semantic Search API Docs]]
|
- [[http://localhost:8000/docs][Semantic Search API Docs]]
|
||||||
|
|
||||||
|
- *UI to Edit Config*
|
||||||
|
- [[http://localhost:8000/ui][Config UI]]
|
||||||
|
|
||||||
** Upgrade
|
** Upgrade
|
||||||
#+begin_src shell
|
|
||||||
cd semantic-search
|
*** Using Docker
|
||||||
git pull origin master
|
#+begin_src shell
|
||||||
conda env update -f environment.yml
|
docker-compose up
|
||||||
conda activate semantic-search
|
#+end_src
|
||||||
#+end_src
|
|
||||||
|
*** On Local Machine
|
||||||
|
#+begin_src shell
|
||||||
|
cd semantic-search
|
||||||
|
git pull origin master
|
||||||
|
conda env update -f environment.yml
|
||||||
|
conda activate semantic-search
|
||||||
|
#+end_src
|
||||||
|
|
||||||
** Acknowledgments
|
** Acknowledgments
|
||||||
- [[https://huggingface.co/sentence-transformers/msmarco-MiniLM-L-6-v3][MiniLM Model]] for Asymmetric Text Search. See [[https://www.sbert.net/examples/applications/retrieve_rerank/README.html][SBert Documentation]]
|
- [[https://huggingface.co/sentence-transformers/msmarco-MiniLM-L-6-v3][MiniLM Model]] for Asymmetric Text Search. See [[https://www.sbert.net/examples/applications/retrieve_rerank/README.html][SBert Documentation]]
|
||||||
- [[https://github.com/openai/CLIP][OpenAI CLIP Model]] for Image Search. See [[https://www.sbert.net/examples/applications/image-search/README.html][SBert Documentation]]
|
- [[https://github.com/openai/CLIP][OpenAI CLIP Model]] for Image Search. See [[https://www.sbert.net/examples/applications/image-search/README.html][SBert Documentation]]
|
||||||
- Charles Cave for [[http://members.optusnet.com.au/~charles57/GTD/orgnode.html][OrgNode Parser]]
|
- Charles Cave for [[http://members.optusnet.com.au/~charles57/GTD/orgnode.html][OrgNode Parser]]
|
||||||
- Sven Marnach for [[https://github.com/smarnach/pyexiftool/blob/master/exiftool.py][PyExifTool]]
|
- Sven Marnach for [[https://github.com/smarnach/pyexiftool/blob/master/exiftool.py][PyExifTool]]
|
32
docker-compose.yml
Normal file
32
docker-compose.yml
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
version: "3.9"
|
||||||
|
services:
|
||||||
|
server:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
args:
|
||||||
|
- PORT=8000
|
||||||
|
ports:
|
||||||
|
# If changing the local port (left hand side), no other changes required.
|
||||||
|
# If changing the remote port (right hand side),
|
||||||
|
# change the port in the args in the build section,
|
||||||
|
# as well as the port in the command section to match
|
||||||
|
- "8000:8000"
|
||||||
|
working_dir: /app
|
||||||
|
volumes:
|
||||||
|
- .:/app
|
||||||
|
# These mounted volumes hold the raw data that should be indexed for search.
|
||||||
|
# The path in your local directory (left hand side)
|
||||||
|
# points to the files you want to index.
|
||||||
|
# The path of the mounted directory (right hand side),
|
||||||
|
# must match the path prefix in your config file.
|
||||||
|
- ./tests/data/:/data/notes/
|
||||||
|
- ./tests/data/:/data/images/
|
||||||
|
- ./tests/data/:/data/ledger/
|
||||||
|
- ./tests/data/:/data/music/
|
||||||
|
# It's ok if you don't have existing embeddings.
|
||||||
|
# You can set this volume to point to an empty folder.
|
||||||
|
- ./tests/data/:/data/generated/
|
||||||
|
|
||||||
|
# Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
|
||||||
|
command: --host="0.0.0.0" --port=8000 -c=docker_sample_config.yml -vv
|
47
docker_sample_config.yml
Normal file
47
docker_sample_config.yml
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
content-type:
|
||||||
|
# The /data/folder/ prefix to the folders is here because this is
|
||||||
|
# the directory to which the local files are copied in the docker-compose.
|
||||||
|
# If changing, the docker-compose volumes should also be changed to match.
|
||||||
|
org:
|
||||||
|
input-files: null
|
||||||
|
input-filter: "/data/notes/*.org"
|
||||||
|
compressed-jsonl: "/data/generated/.notes.json.gz"
|
||||||
|
embeddings-file: "/data/generated/.note_embeddings.pt"
|
||||||
|
|
||||||
|
ledger:
|
||||||
|
input-files: null
|
||||||
|
input-filter: /data/ledger/*.beancount
|
||||||
|
compressed-jsonl: /data/generated/.transactions.jsonl.gz
|
||||||
|
embeddings-file: /data/generated/.transaction_embeddings.pt
|
||||||
|
|
||||||
|
image:
|
||||||
|
input-directory: "/data/images/"
|
||||||
|
embeddings-file: "/data/generated/.image_embeddings.pt"
|
||||||
|
batch-size: 50
|
||||||
|
use-xmp-metadata: true
|
||||||
|
|
||||||
|
music:
|
||||||
|
input-files: null
|
||||||
|
input-filter: "/data/music/*.org"
|
||||||
|
compressed-jsonl: "/data/generated/.songs.jsonl.gz"
|
||||||
|
embeddings-file: "/data/generated/.song_embeddings.pt"
|
||||||
|
|
||||||
|
search-type:
|
||||||
|
symmetric:
|
||||||
|
encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2"
|
||||||
|
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||||
|
model_directory: "/data/models/.symmetric"
|
||||||
|
|
||||||
|
asymmetric:
|
||||||
|
encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3"
|
||||||
|
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||||
|
model_directory: "/data/models/.asymmetric"
|
||||||
|
|
||||||
|
image:
|
||||||
|
encoder: "clip-ViT-B-32"
|
||||||
|
model_directory: "/data/models/.image_encoder"
|
||||||
|
|
||||||
|
processor:
|
||||||
|
conversation:
|
||||||
|
openai-api-key: null
|
||||||
|
conversation-logfile: "/data/conversation/.conversation_logs.json"
|
|
@ -15,7 +15,7 @@ content-type:
|
||||||
input-directory: "tests/data"
|
input-directory: "tests/data"
|
||||||
embeddings-file: "tests/data/.image_embeddings.pt"
|
embeddings-file: "tests/data/.image_embeddings.pt"
|
||||||
batch-size: 50
|
batch-size: 50
|
||||||
use-xmp-metadata: "no"
|
use-xmp-metadata: false
|
||||||
|
|
||||||
music:
|
music:
|
||||||
input-files: ["tests/data/music.org"]
|
input-files: ["tests/data/music.org"]
|
||||||
|
|
|
@ -20,7 +20,7 @@ class TextContentConfig(ConfigBase):
|
||||||
embeddings_file: Optional[Path]
|
embeddings_file: Optional[Path]
|
||||||
|
|
||||||
class ImageContentConfig(ConfigBase):
|
class ImageContentConfig(ConfigBase):
|
||||||
use_xmp_metadata: Optional[str]
|
use_xmp_metadata: Optional[bool]
|
||||||
batch_size: Optional[int]
|
batch_size: Optional[int]
|
||||||
input_directory: Optional[Path]
|
input_directory: Optional[Path]
|
||||||
input_filter: Optional[str]
|
input_filter: Optional[str]
|
||||||
|
|
Loading…
Reference in a new issue