Merge pull request #21 from debanjum/saba/dockerize

Add Docker support to semantic-search
This commit is contained in:
Debanjum 2022-01-28 20:27:40 -08:00 committed by GitHub
commit d943d2be80
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 169 additions and 31 deletions

29
Dockerfile Normal file
View file

@ -0,0 +1,29 @@
# syntax=docker/dockerfile:1
FROM continuumio/miniconda3:latest

# Install system dependencies (exiftool is used to read image XMP metadata).
# Clean the apt lists in the same layer so the cache is not baked into the image.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends libimage-exiftool-perl && \
    rm -rf /var/lib/apt/lists/*

# Copy the local code to the /app directory and set it to be the working directory.
# COPY is preferred over ADD for plain local files (no implicit archive/URL handling).
# Since we mount the /app directory as a volume in docker-compose.yml, this
# allows us to automatically update the code in the Docker image when it's changed.
COPY . /app
WORKDIR /app

# Get the arguments from the docker-compose environment.
ARG PORT
EXPOSE ${PORT}

# Create the conda environment.
RUN conda env create -f environment.yml

# Use the conda environment we created to run the application.
# To enable the conda env, we cannot simply RUN `conda activate semantic-search`,
# since each RUN command in a Dockerfile is a separate bash shell.
# The environment would not carry forward.
# Instead, we'll use `conda run` to run the application.
# There are more arguments required for the script to run,
# but these should be passed in through the docker-compose.yml file.
ENTRYPOINT ["conda", "run", "--no-capture-output", "--name", "semantic-search", \
            "python3", "-m", "src.main"]

View file

@ -5,27 +5,47 @@
All data is processed locally. User can interface with semantic-search app via [[./src/interface/emacs/semantic-search.el][Emacs]], API or Commandline All data is processed locally. User can interface with semantic-search app via [[./src/interface/emacs/semantic-search.el][Emacs]], API or Commandline
** Dependencies ** Setup
- Python3
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
** Install *** Setup using Docker
**** 1. Clone Repository
#+begin_src shell
git clone https://github.com/debanjum/semantic-search && cd semantic-search
#+end_src
**** 2. Configure
Add Content Directories for Semantic Search to Docker-Compose
Update [[./docker-compose.yml][docker-compose.yml]] to mount your images, org-mode notes, ledger/beancount directories
If required, edit config settings in [[./docker_sample_config.yml][docker_sample_config.yml]].
**** 3. Run
#+begin_src shell
docker-compose up -d
#+end_src
*** Setup on Local Machine
**** 1. Install Dependencies
1. Install Python3 [Required]
2. [[https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html][Install Conda]] [Required]
3. Install Exiftool [Optional]
#+begin_src shell
sudo apt-get -y install libimage-exiftool-perl
#+end_src
**** 2. Install Semantic Search
#+begin_src shell #+begin_src shell
git clone https://github.com/debanjum/semantic-search && cd semantic-search git clone https://github.com/debanjum/semantic-search && cd semantic-search
conda env create -f environment.yml conda env create -f environment.yml
conda activate semantic-search conda activate semantic-search
#+end_src #+end_src
*** Install Environmental Dependencies **** 3. Configure
#+begin_src shell
sudo apt-get -y install libimage-exiftool-perl
#+end_src
** Configure
Configure application search types and their underlying data source/files in ~sample_config.yml~ Configure application search types and their underlying data source/files in ~sample_config.yml~
Use the ~sample_config.yml~ as reference Use the ~sample_config.yml~ as reference
** Run **** 4. Run
Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML
#+begin_src shell #+begin_src shell
@ -39,10 +59,20 @@
- *Semantic Search via API* - *Semantic Search via API*
- Query: ~GET~ [[http://localhost:8000/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:8000/search?q="What is the meaning of life"&t=notes]] - Query: ~GET~ [[http://localhost:8000/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:8000/search?q="What is the meaning of life"&t=notes]]
- Regenerate Embeddings: ~GET~ [[http://localhost:8000/regenerate][http://localhost:8000/regenerate?t=image]] - Regenerate Embeddings: ~GET~ [[http://localhost:8000/regenerate][http://localhost:8000/regenerate]]
- [[http://localhost:8000/docs][Semantic Search API Docs]] - [[http://localhost:8000/docs][Semantic Search API Docs]]
- *UI to Edit Config*
- [[https://localhost:8000/ui][Config UI]]
** Upgrade ** Upgrade
*** Using Docker
#+begin_src shell
docker-compose up
#+end_src
*** On Local Machine
#+begin_src shell #+begin_src shell
cd semantic-search cd semantic-search
git pull origin master git pull origin master

32
docker-compose.yml Normal file
View file

@ -0,0 +1,32 @@
version: "3.9"
services:
  server:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # Port the application listens on inside the container.
        - PORT=8000
    ports:
      # If changing the local port (left hand side), no other changes required.
      # If changing the remote port (right hand side),
      # change the port in the args in the build section,
      # as well as the port in the command section to match.
      - "8000:8000"
    working_dir: /app
    volumes:
      # Mount the source tree so code changes are picked up without a rebuild.
      - .:/app
      # These mounted volumes hold the raw data that should be indexed for search.
      # The path in your local directory (left hand side)
      # points to the files you want to index.
      # The path of the mounted directory (right hand side)
      # must match the path prefix in your config file.
      - ./tests/data/:/data/notes/
      - ./tests/data/:/data/images/
      - ./tests/data/:/data/ledger/
      - ./tests/data/:/data/music/
      # It's ok if you don't have existing embeddings.
      # You can set this volume to point to an empty folder.
      - ./tests/data/:/data/generated/
    # Use 0.0.0.0 to explicitly set the host ip for the service on the container.
    # https://pythonspeed.com/articles/docker-connection-refused/
    command: --host="0.0.0.0" --port=8000 -c=docker_sample_config.yml -vv

47
docker_sample_config.yml Normal file
View file

@ -0,0 +1,47 @@
content-type:
  # The /data/folder/ prefix to the folders is here because this is
  # the directory to which the local files are copied in the docker-compose.
  # If changing, the docker-compose volumes should also be changed to match.
  org:
    input-files: null
    input-filter: "/data/notes/*.org"
    compressed-jsonl: "/data/generated/.notes.json.gz"
    embeddings-file: "/data/generated/.note_embeddings.pt"
  ledger:
    input-files: null
    # Quoted for consistency with the other content-type sections.
    input-filter: "/data/ledger/*.beancount"
    compressed-jsonl: "/data/generated/.transactions.jsonl.gz"
    embeddings-file: "/data/generated/.transaction_embeddings.pt"
  image:
    input-directory: "/data/images/"
    embeddings-file: "/data/generated/.image_embeddings.pt"
    batch-size: 50
    use-xmp-metadata: true
  music:
    input-files: null
    input-filter: "/data/music/*.org"
    compressed-jsonl: "/data/generated/.songs.jsonl.gz"
    embeddings-file: "/data/generated/.song_embeddings.pt"

search-type:
  symmetric:
    encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2"
    cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
    # NOTE(review): model_directory is snake_case while the other keys are
    # kebab-case — presumably required by the consuming schema; confirm before renaming.
    model_directory: "/data/models/.symmetric"
  asymmetric:
    encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3"
    cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
    model_directory: "/data/models/.asymmetric"
  image:
    encoder: "clip-ViT-B-32"
    model_directory: "/data/models/.image_encoder"

processor:
  conversation:
    openai-api-key: null
    conversation-logfile: "/data/conversation/.conversation_logs.json"

View file

@ -15,7 +15,7 @@ content-type:
input-directory: "tests/data" input-directory: "tests/data"
embeddings-file: "tests/data/.image_embeddings.pt" embeddings-file: "tests/data/.image_embeddings.pt"
batch-size: 50 batch-size: 50
use-xmp-metadata: "no" use-xmp-metadata: false
music: music:
input-files: ["tests/data/music.org"] input-files: ["tests/data/music.org"]

View file

@ -20,7 +20,7 @@ class TextContentConfig(ConfigBase):
embeddings_file: Optional[Path] embeddings_file: Optional[Path]
class ImageContentConfig(ConfigBase): class ImageContentConfig(ConfigBase):
use_xmp_metadata: Optional[str] use_xmp_metadata: Optional[bool]
batch_size: Optional[int] batch_size: Optional[int]
input_directory: Optional[Path] input_directory: Optional[Path]
input_filter: Optional[str] input_filter: Optional[str]