diff --git a/.github/workflows/desktop.yml b/.github/workflows/desktop.yml new file mode 100644 index 00000000..1df8b7e6 --- /dev/null +++ b/.github/workflows/desktop.yml @@ -0,0 +1,48 @@ +name: desktop + +on: + push: + tags: + - "*" + branches: + - 'master' + paths: + - src/interface/desktop/** + - .github/workflows/desktop.yml + +jobs: + build: + name: 🖥️ Build, Release Desktop App + runs-on: ubuntu-latest + env: + TODESKTOP_ACCESS_TOKEN: ${{ secrets.TODESKTOP_ACCESS_TOKEN }} + TODESKTOP_EMAIL: ${{ secrets.TODESKTOP_EMAIL }} + defaults: + run: + shell: bash + working-directory: src/interface/desktop + steps: + - name: ⬇️ Checkout Code + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: ⤵️ Install Node + uses: actions/setup-node@v3 + with: + node-version: "lts/*" + + - name: ⚙️ Setup Desktop Build + run: | + yarn + npm install -g @todesktop/cli + sed -i "s/\"id\": \"\"/\"id\": \"${{ secrets.TODESKTOP_ID }}\"/g" todesktop.json + + - name: ⚙️ Build Desktop App + run: | + npx todesktop build + + - name: 📦 Release Desktop App + if: startsWith(github.ref, 'refs/tags/') + run: | + npx todesktop release --latest --force diff --git a/.github/workflows/dockerize_dev.yml b/.github/workflows/dockerize_dev.yml new file mode 100644 index 00000000..288fdb8a --- /dev/null +++ b/.github/workflows/dockerize_dev.yml @@ -0,0 +1,43 @@ +name: dockerize-dev + +on: + pull_request: + paths: + - src/khoj/** + - config/** + - pyproject.toml + - prod.Dockerfile + - .github/workflows/dockerize_dev.yml + workflow_dispatch: + +env: + DOCKER_IMAGE_TAG: 'dev' + +jobs: + build: + name: Build Production Docker Image, Push to Container Registry + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.PAT }} + + - 
name: 📦 Build and Push Docker Image + uses: docker/build-push-action@v2 + with: + context: . + file: prod.Dockerfile + platforms: linux/amd64 + push: true + tags: ghcr.io/${{ github.repository }}-cloud:${{ env.DOCKER_IMAGE_TAG }} + build-args: | + PORT=42110 diff --git a/.github/workflows/dockerize_production.yml b/.github/workflows/dockerize_production.yml new file mode 100644 index 00000000..2e1eea4b --- /dev/null +++ b/.github/workflows/dockerize_production.yml @@ -0,0 +1,47 @@ +name: dockerize-prod + +on: + push: + tags: + - "*" + branches: + - master + paths: + - src/khoj/** + - config/** + - pyproject.toml + - prod.Dockerfile + - .github/workflows/dockerize_production.yml + workflow_dispatch: + +env: + DOCKER_IMAGE_TAG: ${{ github.ref == 'refs/heads/master' && 'latest' || github.ref_name }} + +jobs: + build: + name: Build Production Docker Image, Push to Container Registry + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.PAT }} + + - name: 📦 Build and Push Docker Image + uses: docker/build-push-action@v2 + with: + context: . 
+ file: prod.Dockerfile + platforms: linux/amd64 + push: true + tags: ghcr.io/${{ github.repository }}-cloud:${{ env.DOCKER_IMAGE_TAG }} + build-args: | + PORT=42110 diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 00000000..a571e8a1 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,48 @@ +name: pre-commit + +on: + pull_request: + paths: + - src/** + - tests/** + - config/** + - pyproject.toml + - .pre-commit-config.yml + - .github/workflows/pre-commit.yml + push: + branches: + - master + paths: + - src/khoj/** + - tests/** + - config/** + - pyproject.toml + - .pre-commit-config.yml + - .github/workflows/pre-commit.yml + +jobs: + test: + name: Run Tests + runs-on: ubuntu-latest + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.11" + + - name: ⏬️ Install Dependencies + run: | + sudo apt update && sudo apt install -y libegl1 + python -m pip install --upgrade pip + + - name: ⬇️ Install Application + run: pip install --upgrade .[dev] + + - name: 🌡️ Validate Application + run: pre-commit run --hook-stage manual --all diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d8aa9be8..697579da 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,10 +2,8 @@ name: test on: pull_request: - branches: - - 'master' paths: - - src/khoj/** + - src/** - tests/** - config/** - pyproject.toml @@ -13,7 +11,7 @@ on: - .github/workflows/test.yml push: branches: - - 'master' + - master paths: - src/khoj/** - tests/** @@ -26,6 +24,7 @@ jobs: test: name: Run Tests runs-on: ubuntu-latest + container: ubuntu:jammy strategy: fail-fast: false matrix: - '3.9' - '3.10' - '3.11' + + services: + postgres: + image: ankane/pgvector + env: + POSTGRES_PASSWORD: postgres + POSTGRES_USER: postgres + ports: + - 5432:5432 + options: --health-cmd
pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 + steps: - uses: actions/checkout@v3 with: @@ -43,17 +53,37 @@ jobs: with: python-version: ${{ matrix.python_version }} - - name: ⏬️ Install Dependencies + - name: Install Git run: | - sudo apt update && sudo apt install -y libegl1 + apt update && apt install -y git + + - name: ⏬️ Install Dependencies + env: + DEBIAN_FRONTEND: noninteractive + run: | + apt update && apt install -y libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6 + + - name: ⬇️ Install Postgres + env: + DEBIAN_FRONTEND: noninteractive + run : | + apt install -y postgresql postgresql-client && apt install -y postgresql-server-dev-14 + + - name: ⬇️ Install pip + run: | + apt install -y python3-pip + python -m ensurepip --upgrade python -m pip install --upgrade pip - name: ⬇️ Install Application - run: pip install --upgrade .[dev] - - - name: 🌡️ Validate Application - run: pre-commit run --hook-stage manual --all + run: sed -i 's/dynamic = \["version"\]/version = "0.0.0"/' pyproject.toml && pip install --upgrade .[dev] - name: 🧪 Test Application + env: + POSTGRES_HOST: postgres + POSTGRES_PORT: 5432 + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres run: pytest timeout-minutes: 10 diff --git a/.gitignore b/.gitignore index 8e99392c..35315263 100644 --- a/.gitignore +++ b/.gitignore @@ -21,7 +21,8 @@ todesktop.json khoj_assistant.egg-info /config/khoj*.yml .pytest_cache -khoj.log +*.log +static # Obsidian plugin artifacts # --- diff --git a/Dockerfile b/Dockerfile index bdf9647f..9882a236 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,14 +5,23 @@ LABEL org.opencontainers.image.source https://github.com/khoj-ai/khoj # Install System Dependencies RUN apt update -y && apt -y install python3-pip git +WORKDIR /app + # Install Application -COPY . . +COPY pyproject.toml . +COPY README.md . 
RUN sed -i 's/dynamic = \["version"\]/version = "0.0.0"/' pyproject.toml && \ pip install --no-cache-dir . +# Copy Source Code +COPY . . + +# Set the PYTHONPATH environment variable in order for it to find the Django app. +ENV PYTHONPATH=/app/src:$PYTHONPATH + # Run the Application # There are more arguments required for the application to run, # but these should be passed in through the docker-compose.yml file. ARG PORT EXPOSE ${PORT} -ENTRYPOINT ["khoj"] +ENTRYPOINT ["python3", "src/khoj/main.py"] diff --git a/LICENSE b/LICENSE index 94a04532..0ad25db4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,23 +1,21 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 - Copyright (C) 2007 Free Software Foundation, Inc. + Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble - The GNU General Public License is a free, copyleft license for -software and other kinds of works. + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to +our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. +software for all its users. When we speak of free software, we are referring to freedom, not price. 
Our General Public Licenses are designed to make sure that you @@ -26,44 +24,34 @@ them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. 
+ The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. 
This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. The precise terms and conditions for copying, distribution and modification follow. @@ -72,7 +60,7 @@ modification follow. 0. Definitions. - "This License" refers to version 3 of the GNU General Public License. + "This License" refers to version 3 of the GNU Affero General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. @@ -549,35 +537,45 @@ to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. - 13. Use with the GNU Affero General Public License. + 13. Remote Network Interaction; Use with the GNU General Public License. + + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single +under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work. 
The terms of this License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General +Program specifies that a certain numbered version of the GNU Affero General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published +GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's +versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. @@ -619,3 +617,45 @@ Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. 
END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. 
diff --git a/docker-compose.yml b/docker-compose.yml index bc3da2a9..365d2572 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,29 @@ version: "3.9" services: + database: + image: ankane/pgvector + ports: + - "5432:5432" + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + volumes: + - khoj_db:/var/lib/postgresql/data/ + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 30s + timeout: 10s + retries: 5 server: + depends_on: + database: + condition: service_healthy + # Use the following line to use the latest version of khoj. Otherwise, it will build from source. image: ghcr.io/khoj-ai/khoj:latest + # Uncomment the following line to build from source. This will take a few minutes. Comment the next two lines out if you want to use the official image. + # build: + # context: . ports: # If changing the local port (left hand side), no other changes required. # If changing the remote port (right hand side), @@ -10,26 +32,23 @@ services: - "42110:42110" working_dir: /app volumes: - - .:/app - # These mounted volumes hold the raw data that should be indexed for search. - # The path in your local directory (left hand side) - # points to the files you want to index. - # The path of the mounted directory (right hand side), - # must match the path prefix in your config file. - - ./tests/data/org/:/data/org/ - - ./tests/data/images/:/data/images/ - - ./tests/data/markdown/:/data/markdown/ - - ./tests/data/pdf/:/data/pdf/ - # Embeddings and models are populated after the first run - # You can set these volumes to point to empty directories on host - - ./tests/data/embeddings/:/root/.khoj/content/ - - ./tests/data/models/:/root/.khoj/search/ - khoj_config:/root/.khoj/ - - sentence_tranformer_models:/root/.cache/torch/sentence_transformers + - khoj_models:/root/.cache/torch/sentence_transformers # Use 0.0.0.0 to explicitly set the host ip for the service on the container.
https://pythonspeed.com/articles/docker-connection-refused/ - command: --host="0.0.0.0" --port=42110 -vv + environment: + - POSTGRES_DB=postgres + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=postgres + - POSTGRES_HOST=database + - POSTGRES_PORT=5432 + - KHOJ_DJANGO_SECRET_KEY=secret + - KHOJ_DEBUG=True + - KHOJ_ADMIN_EMAIL=username@example.com + - KHOJ_ADMIN_PASSWORD=password + command: --host="0.0.0.0" --port=42110 -vv --anonymous-mode volumes: khoj_config: - sentence_tranformer_models: + khoj_db: + khoj_models: diff --git a/docs/README.md b/docs/README.md index 06d026a4..04a2226a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -9,7 +9,7 @@
-An AI personal assistant for your digital brain +An AI copilot for your Second Brain
@@ -24,30 +24,29 @@ ## Introduction -Welcome to the Khoj Docs! This is the best place to [get started](./setup.md) with Khoj. +Welcome to the Khoj Docs! This is the best place to get setup and explore Khoj's features. -- Khoj is a desktop application to [search](./search.md) and [chat](./chat.md) with your notes, documents and images -- It is an offline-first, open source AI personal assistant accessible from your [Emacs](./emacs.md), [Obsidian](./obsidian.md) or [Web browser](./web.md) -- It works with jpeg, markdown, [notion](./notion_integration.md) org-mode, pdf files and [github repositories](./github_integration.md) -- If you have more questions, check out the [FAQ](https://faq.khoj.dev/) - it's a live Khoj instance indexing our Github repository! +- Khoj is an open source, personal AI +- You can [chat](chat.md) with it about anything. When relevant, it'll use any notes or documents you shared with it to respond +- Quickly [find](search.md) relevant notes and documents using natural language +- It understands pdf, plaintext, markdown, org-mode files, [notion pages](notion_integration.md) and [github repositories](github_integration.md) +- Access it from your [Emacs](emacs.md), [Obsidian](obsidian.md), [Web browser](web.md) or the [Khoj Desktop app](desktop.md) +- You can self-host Khoj on your consumer hardware or share it with your family, friends or team from your private cloud ## Quickstart -[Click here](./setup.md) for full setup instructions - -```shell -pip install khoj-assistant && khoj -``` +- [Try Khoj Cloud](https://app.khoj.dev) to get started quickly +- [Read these instructions](./setup.md) to self-host a private instance of Khoj ## Overview    -#### [Search](./search.md) - - **Local**: Your personal data stays local. All search and indexing is done on your machine. +#### [Search](search.md) + - **Natural**: Use natural language queries to quickly find relevant notes and documents. 
- **Incremental**: Incremental search for a fast, search-as-you-type experience -#### [Chat](./chat.md) +#### [Chat](chat.md) - **Faster answers**: Find answers faster, smoother than search. No need to manually scan through your notes to find answers. - **Iterative discovery**: Iteratively explore and (re-)discover your notes - **Assisted creativity**: Smoothly weave across answers retrieval and content generation diff --git a/docs/_sidebar.md b/docs/_sidebar.md index 9e0b8849..348b785a 100644 --- a/docs/_sidebar.md +++ b/docs/_sidebar.md @@ -1,12 +1,13 @@ - Get Started - [Overview](README.md) - - [Install](setup.md) + - [Self-Host](setup.md) - [Demos](demos.md) - Use - [Features](features.md) - [Chat](chat.md) - [Search](search.md) - - Interfaces + - Clients + - [Desktop](desktop.md) - [Obsidian](obsidian.md) - [Emacs](emacs.md) - [Web](web.md) diff --git a/docs/advanced.md b/docs/advanced.md index a567783f..95dacf30 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -1,63 +1,11 @@ ## Advanced Usage -### Search across Different Languages + +### Search across Different Languages (Self-Hosting) To search for notes in multiple, different languages, you can use a [multi-lingual model](https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models).
For example, the [paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) supports [50+ languages](https://www.sbert.net/docs/pretrained_models.html#:~:text=we%20used%20the%20following%2050%2B%20languages), has good search quality and speed. To use it: -1. Manually update `search-type > asymmetric > encoder` to `paraphrase-multilingual-MiniLM-L12-v2` in your `~/.khoj/khoj.yml` file for now. See diff of `khoj.yml` below for illustration: - - ```diff - asymmetric: - - encoder: sentence-transformers/multi-qa-MiniLM-L6-cos-v1 - + encoder: paraphrase-multilingual-MiniLM-L12-v2 - cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2 - model_directory: "~/.khoj/search/asymmetric/" - ``` - -2. Regenerate your content index. For example, by opening [\/api/update?t=force](http://localhost:42110/api/update?t=force) - -### Access Khoj on Mobile -1. [Setup Khoj](/#/setup) on your personal server. This can be any always-on machine, i.e an old computer, RaspberryPi(?) etc -2. [Install](https://tailscale.com/kb/installation/) [Tailscale](tailscale.com/) on your personal server and phone -3. Open the Khoj web interface of the server from your phone browser.
It should be `http://tailscale-ip-of-server:42110` or `http://name-of-server:42110` if you've setup [MagicDNS](https://tailscale.com/kb/1081/magicdns/) -4. Click the [Add to Homescreen](https://developer.mozilla.org/en-US/docs/Web/Progressive_web_apps/Add_to_home_screen) button -5. Enjoy exploring your notes, documents and images from your phone! - -![](./assets/khoj_pwa_android.png?) - -### Use OpenAI Models for Search -#### Setup -1. Set `encoder-type`, `encoder` and `model-directory` under `asymmetric` and/or `symmetric` `search-type` in your `khoj.yml` (at `~/.khoj/khoj.yml`): - ```diff - asymmetric: - - encoder: "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" - + encoder: text-embedding-ada-002 - + encoder-type: khoj.utils.models.OpenAI - cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2" - - encoder-type: sentence_transformers.SentenceTransformer - - model_directory: "~/.khoj/search/asymmetric/" - + model-directory: null - ``` -2. [Setup your OpenAI API key in Khoj](/#/chat?id=setup) -3. Restart Khoj server to generate embeddings. It will take longer than with the offline search models. - -#### Warnings - This configuration *uses an online model* - - It will **send all notes to OpenAI** to generate embeddings - - **All queries will be sent to OpenAI** when you search with Khoj - - You will be **charged by OpenAI** based on the total tokens processed - - It *requires an active internet connection* to search and index - -### Bootstrap Khoj Search for Offline Usage later - -You can bootstrap Khoj pre-emptively to run on machines that do not have internet access. An example use-case would be to run Khoj on an air-gapped machine. -Note: *Only search can currently run in fully offline mode, not chat.* - -- With Internet - 1. 
Manually download the [asymmetric text](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1), [symmetric text](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) and [image search](https://huggingface.co/sentence-transformers/clip-ViT-B-32) models from HuggingFace - 2. Pip install khoj (and dependencies) in an associated virtualenv. E.g `python -m venv .venv && source .venv/bin/activate && pip install khoj-assistant` -- Without Internet - 1. Copy each of the search models into their respective folders, `asymmetric`, `symmetric` and `image` under the `~/.khoj/search/` directory on the air-gapped machine - 2. Copy the khoj virtual environment directory onto the air-gapped machine, activate the environment and start and khoj as normal. E.g `source .venv/bin/activate && khoj` +1. Manually update the search config in server's admin settings page. Go to [the search config](http://localhost:42110/server/admin/database/searchmodelconfig/). Either create a new one, if none exists, or update the existing one. Set the bi_encoder to `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` and the cross_encoder to `cross-encoder/ms-marco-MiniLM-L-6-v2`. +2. Regenerate your content index from all the relevant clients. This step is very important, as you'll need to re-encode all your content with the new model.
### Query Filters diff --git a/docs/assets/khoj_chat_on_desktop.png b/docs/assets/khoj_chat_on_desktop.png new file mode 100644 index 00000000..e8c10718 Binary files /dev/null and b/docs/assets/khoj_chat_on_desktop.png differ diff --git a/docs/assets/khoj_search_on_desktop.png b/docs/assets/khoj_search_on_desktop.png new file mode 100644 index 00000000..1dc3231d Binary files /dev/null and b/docs/assets/khoj_search_on_desktop.png differ diff --git a/docs/chat.md b/docs/chat.md index 2efd7b1b..4ea64c3f 100644 --- a/docs/chat.md +++ b/docs/chat.md @@ -1,13 +1,13 @@ -### Khoj Chat -#### Overview +## Khoj Chat +### Overview - Creates a personal assistant for you to inquire and engage with your notes - You can choose to use Online or Offline Chat depending on your requirements - Supports multi-turn conversations with the relevant notes for context - Shows reference notes used to generate a response -### Setup +### Setup (Self-Hosting) #### Offline Chat -Offline chat stays completely private and works without internet. But it is slower, lower quality and more compute intensive. +Offline chat stays completely private and works without internet using open-source models. > **System Requirements**: > - Minimum 8 GB RAM. Recommend **16Gb VRAM** @@ -15,9 +15,10 @@ Offline chat stays completely private and works without internet. But it is slow > - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required > - A Mac M1+ or [Vulcan supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times -- Open your [Khoj settings](http://localhost:42110/config/) and click *Enable* on the Offline Chat card +1. Open your [Khoj offline settings](http://localhost:42110/server/admin/database/offlinechatprocessorconversationconfig/) and click *Enable* on the Offline Chat configuration. +2. 
Open your [Chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/) and add a new option for the offline chat model you want to use. Make sure to use `Offline` as its type. We currently only support offline models that use the [Llama chat prompt](https://replicate.com/blog/how-to-prompt-llama#wrap-user-input-with-inst-inst-tags) format. We recommend using `mistral-7b-instruct-v0.1.Q4_0.gguf`. -![Configure offline chat](https://user-images.githubusercontent.com/6413477/257021364-8a2029f5-dc21-4de8-9af9-9ba6100d695c.mp4 ':include :type=mp4') +!> **Note**: Offline chat is not supported for a multi-user scenario. The host machine will encounter segmentation faults if multiple users try to use offline chat at the same time. #### Online Chat Online chat requires internet to use ChatGPT but is faster, higher quality and less compute intensive. @@ -25,14 +26,12 @@ Online chat requires internet to use ChatGPT but is faster, higher quality and l !> **Warning**: This will enable Khoj to send your chat queries and query relevant notes to OpenAI for processing 1. Get your [OpenAI API Key](https://platform.openai.com/account/api-keys) -2. Open your [Khoj Online Chat settings](http://localhost:42110/config/processor/conversation), add your OpenAI API key, and click *Save*. Then go to your [Khoj settings](http://localhost:42110/config) and click `Configure`. This will refresh Khoj with your OpenAI API key. - -![Configure online chat](https://user-images.githubusercontent.com/6413477/256998908-ac26e55e-13a2-45fb-9348-3b90a62f7687.mp4 ':include :type=mp4') - +2. Open your [Khoj Online Chat settings](http://localhost:42110/server/admin/database/openaiprocessorconversationconfig/). Add a new setting with your OpenAI API key, and click *Save*. Only one configuration will be used, so make sure that's the only one you have. +3. 
Open your [Chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/) and add a new option for the OpenAI chat model you want to use. Make sure to use `OpenAI` as its type. ### Use 1. Open Khoj Chat - - **On Web**: Open [/chat](http://localhost:42110/chat) in your web browser + - **On Web**: Open [/chat](https://app.khoj.dev/chat) in your web browser - **On Obsidian**: Search for *Khoj: Chat* in the [Command Palette](https://help.obsidian.md/Plugins/Command+palette) - **On Emacs**: Run `M-x khoj ` 2. Enter your queries to chat with Khoj. Use [slash commands](#commands) and [query filters](./advanced.md#query-filters) to change what Khoj uses to respond diff --git a/docs/desktop.md b/docs/desktop.md new file mode 100644 index 00000000..a28352db --- /dev/null +++ b/docs/desktop.md @@ -0,0 +1,23 @@ +

Khoj Logo Desktop

+ +> An AI copilot for your Second Brain + +## Features +- **Chat** + - **Faster answers**: Find answers quickly, from your private notes or the public internet + - **Assisted creativity**: Smoothly weave across retrieving answers and generating content + - **Iterative discovery**: Iteratively explore and re-discover your notes +- **Search** + - **Natural**: Advanced natural language understanding using Transformer based ML Models + - **Incremental**: Incremental search for a fast, search-as-you-type experience + +## Setup + +1. Install the [Khoj Desktop app](https://khoj.dev/downloads) for your OS +2. Generate an API key on the [Khoj Web App](https://app.khoj.dev/config#clients) +3. Set your Khoj API Key on the *Settings* page of the Khoj Desktop app +4. [Optional] Add any files, folders you'd like Khoj to be aware of on the *Settings* page and Click *Save* + +## Interface +![](./assets/khoj_chat_on_desktop.png ':size=600px') +![](./assets/khoj_search_on_desktop.png ':size=600px') diff --git a/docs/desktop_installation.md b/docs/desktop_installation.md index d79a282f..42a89383 100644 --- a/docs/desktop_installation.md +++ b/docs/desktop_installation.md @@ -28,5 +28,5 @@ For the Linux installation, you have to have `glibc` version 2.35 or higher. You If you decide you want to uninstall the application, you can uninstall it like any other application on your system. For example, on MacOS, you can drag the application to the trash. On Windows, you can uninstall it from the `Add or Remove Programs` menu. On Linux, you can uninstall it with `sudo apt remove khoj`. In addition to that, you might want to `rm -rf` the following directories: -- `~/.khoj` -- `~/.cache/gpt4all` + - `~/.khoj` + - `~/.cache/gpt4all` diff --git a/docs/development.md b/docs/development.md index dd1aad46..0d715dc4 100644 --- a/docs/development.md +++ b/docs/development.md @@ -25,13 +25,7 @@ pip install -e .'[dev]' khoj -vv ``` 2. 
Configure Khoj - - **Via the Settings UI**: Add files, directories to index the [Khoj settings](http://localhost:42110/config) UI once Khoj has started up. Once you've saved all your settings, click `Configure`. - - **Manually**: - - Copy the `config/khoj_sample.yml` to `~/.khoj/khoj.yml` - - Set `input-files` or `input-filter` in each relevant `content-type` section of `~/.khoj/khoj.yml` - - Set `input-directories` field in `image` `content-type` section - - Delete `content-type` and `processor` sub-section(s) irrelevant for your use-case - - Restart khoj + - **Via the Desktop application**: Add files, directories to index using the settings page of your desktop application. Click "Save" to immediately trigger indexing. Note: Wait after configuration for khoj to Load ML model, generate embeddings and expose API to query notes, images, documents etc specified in config YAML diff --git a/docs/emacs.md b/docs/emacs.md index 6492ecc4..d8e7e682 100644 --- a/docs/emacs.md +++ b/docs/emacs.md @@ -1,6 +1,6 @@ -

Khoj LogoEmacs

+

Khoj Logo Emacs

-> An AI personal assistance for your digital brain +> An AI copilot for your Second Brain in Emacs Melpa Stable Badge Melpa Badge @@ -10,14 +10,13 @@ ## Features +- **Chat** + - **Faster answers**: Find answers quickly, from your private notes or the public internet + - **Assisted creativity**: Smoothly weave across retrieving answers and generating content + - **Iterative discovery**: Iteratively explore and re-discover your notes - **Search** - **Natural**: Advanced natural language understanding using Transformer based ML Models - - **Local**: Your personal data stays local. All search, indexing is done on your machine* - **Incremental**: Incremental search for a fast, search-as-you-type experience -- **Chat** - - **Faster answers**: Find answers faster than search - - **Iterative discovery**: Iteratively explore and (re-)discover your notes - - **Assisted creativity**: Smoothly weave across answer retrieval and content generation ## Interface #### Search @@ -27,79 +26,76 @@ ![khoj chat on emacs](./assets/khoj_chat_on_emacs.png ':size=400px') ## Setup -- *Make sure [python](https://realpython.com/installing-python/) and [pip](https://pip.pypa.io/en/stable/installation/) are installed on your machine* +1. Generate an API key on the [Khoj Web App](https://app.khoj.dev/config#clients) +2. Add below snippet to your Emacs config file, usually at `~/.emacs.d/init.el` -- *khoj.el attempts to automatically install, start and configure the khoj server.* - If this fails, follow [these instructions](/setup) to manually setup the khoj server. -### Direct Install + + +#### **Direct Install** +*Khoj will index your org-agenda files, by default* + ```elisp +;; Install Khoj.el M-x package-install khoj + +; Set your Khoj API key +(setq khoj-api-key "YOUR_KHOJ_CLOUD_API_KEY") ``` -### Minimal Install -Add below snippet to your Emacs config file. -Indexes your org-agenda files, by default. 
+#### **Minimal Install** +*Khoj will index your org-agenda files, by default* ```elisp - ;; Install Khoj Package from MELPA Stable - (use-package khoj - :ensure t - :pin melpa-stable - :bind ("C-c s" . 'khoj)) -``` - -- Note: Install `khoj.el` from MELPA (instead of MELPA Stable) if you installed the pre-release version of khoj - - That is, use `:pin melpa` to install khoj.el in above snippet if khoj server was installed with `--pre` flag, i.e `pip install --pre khoj-assistant` - - Else use `:pin melpa-stable` to install khoj.el in above snippet if khoj was installed with `pip install khoj-assistant` - - This ensures both khoj.el and khoj app are from the same version (git tagged or latest) - -### Standard Install - Add below snippet to your Emacs config file. - Indexes the specified org files, directories. Sets up OpenAI API key for Khoj Chat - -```elisp -;; Install Khoj Package from MELPA Stable +;; Install Khoj client from MELPA Stable (use-package khoj :ensure t :pin melpa-stable :bind ("C-c s" . 'khoj) - :config (setq khoj-org-directories '("~/docs/org-roam" "~/docs/notes") - khoj-org-files '("~/docs/todo.org" "~/docs/work.org") - khoj-openai-api-key "YOUR_OPENAI_API_KEY")) ; required to enable chat + :config (setq khoj-api-key "YOUR_KHOJ_CLOUD_API_KEY")) ``` -### With [Straight.el](https://github.com/raxod502/straight.el) -Add below snippet to your Emacs config file. -Indexes the specified org files, directories. Sets up OpenAI API key for Khoj Chat +#### **Standard Install** +*Configures the specified org files, directories to be indexed by Khoj* ```elisp - ;; Install Khoj Package using Straight.el - (use-package khoj - :after org - :straight (khoj :type git :host github :repo "khoj-ai/khoj" :files (:defaults "src/interface/emacs/khoj.el")) - :bind ("C-c s" . 
'khoj) - :config (setq khoj-org-directories '("~/docs/org-roam" "~/docs/notes") - khoj-org-files '("~/docs/todo.org" "~/docs/work.org") - khoj-openai-api-key "YOUR_OPENAI_API_KEY" ; required to enable chat) - ``` +;; Install Khoj client from MELPA Stable +(use-package khoj + :ensure t + :pin melpa-stable + :bind ("C-c s" . 'khoj) + :config (setq khoj-api-key "YOUR_KHOJ_CLOUD_API_KEY" + khoj-org-directories '("~/docs/org-roam" "~/docs/notes") + khoj-org-files '("~/docs/todo.org" "~/docs/work.org"))) +``` +#### **Straight.el** +*Configures the specified org files, directories to be indexed by Khoj* + +```elisp +;; Install Khoj client using Straight.el +(use-package khoj + :after org + :straight (khoj :type git :host github :repo "khoj-ai/khoj" :files (:defaults "src/interface/emacs/khoj.el")) + :bind ("C-c s" . 'khoj) + :config (setq khoj-api-key "YOUR_KHOJ_CLOUD_API_KEY" + khoj-org-directories '("~/docs/org-roam" "~/docs/notes") + khoj-org-files '("~/docs/todo.org" "~/docs/work.org"))) +``` + + ## Use ### Search +See [Khoj Search](search.md) for details 1. Hit `C-c s s` (or `M-x khoj RET s`) to open khoj search - -2. Enter your query in natural language - - e.g "What is the meaning of life?", "My life goals for 2023" +2. Enter your query in natural language
+ E.g *"What is the meaning of life?"*, *"My life goals for 2023"* ### Chat +See [Khoj Chat](chat.md) for details 1. Hit `C-c s c` (or `M-x khoj RET c`) to open khoj chat - -2. Ask questions in a natural, conversational style - - E.g "When did I file my taxes last year?" - - See [Khoj Chat](/#/chat) for more details +2. Ask questions in a natural, conversational style
+ E.g *"When did I file my taxes last year?"* ### Find Similar Entries This feature finds entries similar to the one you are currently on. @@ -108,7 +104,6 @@ This feature finds entries similar to the one you are currently on. ### Advanced Usage - Add [query filters](https://github.com/khoj-ai/khoj/#query-filters) during search to narrow down results further - e.g `What is the meaning of life? -"god" +"none" dt>"last week"` - Use `C-c C-o 2` to open the current result at cursor in its source org file @@ -121,31 +116,21 @@ This feature finds entries similar to the one you are currently on. ![](./assets/khoj_emacs_menu.png) Hit `C-c s` (or `M-x khoj`) to open the khoj menu above. Then: - Hit `t` until you preferred content type is selected in the khoj menu - `Content Type` specifies the content to perform `Search`, `Update` or `Find Similar` actions on - Hit `n` twice and then enter number of results you want to see - `Results Count` is used by the `Search` and `Find Similar` actions - Hit `-f u` to `force` update the khoj content index - The `Force Update` switch is only used by the `Update` action ## Upgrade -### Upgrade Khoj Backend -```bash -pip install --upgrade khoj-assistant -``` -### Upgrade Khoj.el Use your Emacs package manager to upgrade `khoj.el` + -- For `khoj.el` from MELPA - - Method 1 - - Run `M-x package-list-packages` to list all packages - - Press `U` on `khoj` to mark it for upgrade - - Press `x` to execute the marked actions - - Method 2 - - Run `M-x package-refresh-content` - - Run `M-x package-reinstall khoj` +#### **With MELPA** +1. Run `M-x package-refresh-content` +2. 
Run `M-x package-reinstall khoj` -- For `khoj.el` from Straight - - Run `M-x straight-pull-package khoj` +#### **With Straight.el** +- Run `M-x straight-pull-package khoj` + + diff --git a/docs/features.md b/docs/features.md index 3bd8939f..f59e0657 100644 --- a/docs/features.md +++ b/docs/features.md @@ -1,10 +1,10 @@ ## Features -#### [Search](./search.md) +#### [Search](search.md) - **Local**: Your personal data stays local. All search and indexing is done on your machine. - **Incremental**: Incremental search for a fast, search-as-you-type experience -#### [Chat](./chat.md) +#### [Chat](chat.md) - **Faster answers**: Find answers faster, smoother than search. No need to manually scan through your notes to find answers. - **Iterative discovery**: Iteratively explore and (re-)discover your notes - **Assisted creativity**: Smoothly weave across answers retrieval and content generation diff --git a/docs/github_integration.md b/docs/github_integration.md index 6b8dce48..b7c8a4fe 100644 --- a/docs/github_integration.md +++ b/docs/github_integration.md @@ -1,14 +1,14 @@ -# Setup the Github integration +# 🧑🏾‍💻 Setup the Github integration The Github integration allows you to index as many repositories as you want. It's currently default configured to index Issues, Commits, and all Markdown/Org files in each repository. For large repositories, this takes a fairly long time, but it works well for smaller projects. # Configure your settings -1. Go to [http://localhost:42110/config](http://localhost:42110/config) and enter in settings for the data sources you want to index. You'll have to specify the file paths. +1. Go to [https://app.khoj.dev/config](https://app.khoj.dev/config) and enter in settings for the data sources you want to index. You'll have to specify the file paths. ## Use the Github plugin 1. 
Generate a [classic PAT (personal access token)](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) from [Github](https://github.com/settings/tokens) with `repo` and `admin:org` scopes at least. -2. Navigate to [http://localhost:42110/config/content_type/github](http://localhost:42110/config/content_type/github) to configure your Github settings. Enter in your PAT, along with details for each repository you want to index. +2. Navigate to [https://app.khoj.dev/config/content-source/github](https://app.khoj.dev/config/content-source/github) to configure your Github settings. Enter in your PAT, along with details for each repository you want to index. 3. Click `Save`. Go back to the settings page and click `Configure`. -4. Go to [http://localhost:42110/](http://localhost:42110/) and start searching! +4. Go to [https://app.khoj.dev/](https://app.khoj.dev/) and start searching! diff --git a/docs/index.html b/docs/index.html index 33ba0735..5c1d3466 100644 --- a/docs/index.html +++ b/docs/index.html @@ -5,6 +5,15 @@ Document + + + + + + + + + @@ -17,11 +26,13 @@ repo: 'https://github.com/khoj-ai/khoj', loadSidebar: true, themeColor: '#c2a600', + auto2top: true, // coverpage: true, } + diff --git a/docs/notion_integration.md b/docs/notion_integration.md index 5fee7ff6..6a309d41 100644 --- a/docs/notion_integration.md +++ b/docs/notion_integration.md @@ -8,7 +8,7 @@ We haven't setup a fancy integration with OAuth yet, so this integration still r ![setup_new_integration](https://github.com/khoj-ai/khoj/assets/65192171/b056e057-d4dc-47dc-aad3-57b59a22c68b) 3. Share all the workspaces that you want to integrate with the Khoj integration you just made in the previous step ![enable_workspace](https://github.com/khoj-ai/khoj/assets/65192171/98290303-b5b8-4cb0-b32c-f68c6923a3d0) -4. In the first step, you generated an API key. 
Use the newly generated API Key in your Khoj settings, by default at http://localhost:42110/config/content_type/notion. Click `Save`. -5. Click `Configure` in http://localhost:42110/config to index your Notion workspace(s). +4. In the first step, you generated an API key. Use the newly generated API Key in your Khoj settings, by default at https://app.khoj.dev/config/content-source/notion. Click `Save`. +5. Click `Configure` in https://app.khoj.dev/config to index your Notion workspace(s). That's it! You should be ready to start searching and chatting. Make sure you've configured your OpenAI API Key for chat. diff --git a/docs/obsidian.md b/docs/obsidian.md index 10e65ba1..478c4c60 100644 --- a/docs/obsidian.md +++ b/docs/obsidian.md @@ -1,16 +1,15 @@ -

Khoj LogoObsidian

+

Khoj Logo Obsidian

-> An AI personal assistant for your Digital Brain in Obsidian +> An AI copilot for your Second Brain in Obsidian ## Features +- **Chat** + - **Faster answers**: Find answers quickly, from your private notes or the public internet + - **Assisted creativity**: Smoothly weave across retrieving answers and generating content + - **Iterative discovery**: Iteratively explore and re-discover your notes - **Search** - **Natural**: Advanced natural language understanding using Transformer based ML Models - - **Local**: Your personal data stays local. All search and indexing is done on your machine. *Unlike chat which requires access to GPT.* - **Incremental**: Incremental search for a fast, search-as-you-type experience -- **Chat** - - **Faster answers**: Find answers faster and with less effort than search - - **Iterative discovery**: Iteratively explore and (re-)discover your notes - - **Assisted creativity**: Smoothly weave across answers retrieval and content generation ## Interface ![](./assets/khoj_search_on_obsidian.png ':size=400px') @@ -18,102 +17,37 @@ ## Setup -- *Make sure [python](https://realpython.com/installing-python/) and [pip](https://pip.pypa.io/en/stable/installation/) are installed on your machine* -- *Ensure you follow the ordering of the setup steps. Install the plugin after starting the khoj backend. This allows the plugin to configure the khoj backend* -### 1. Setup Backend -Open terminal/cmd and run below command to install and start the khoj backend -- On Linux/MacOS - ```shell - python -m pip install khoj-assistant && khoj - ``` - -- On Windows - ```shell - py -m pip install khoj-assistant && khoj - ``` - -### 2. Setup Plugin 1. Open [Khoj](https://obsidian.md/plugins?id=khoj) from the *Community plugins* tab in Obsidian settings panel 2. Click *Install*, then *Enable* on the Khoj plugin page in Obsidian - 3. [Optional] To enable Khoj Chat, set your [OpenAI API key](https://platform.openai.com/account/api-keys) in the Khoj plugin settings + 3. 
Generate an API key on the [Khoj Web App](https://app.khoj.dev/config#clients) + 4. Set your Khoj API Key in the Khoj plugin settings in Obsidian -See [official Obsidian plugin docs](https://help.obsidian.md/Extending+Obsidian/Community+plugins) for details +See the official [Obsidian Plugin Docs](https://help.obsidian.md/Extending+Obsidian/Community+plugins) for more details on installing Obsidian plugins. ## Use ### Chat Run *Khoj: Chat* from the [Command Palette](https://help.obsidian.md/Plugins/Command+palette) and ask questions in a natural, conversational style.
-E.g "When did I file my taxes last year?" - -Notes: -- *Using Khoj Chat will result in query relevant notes being shared with OpenAI for ChatGPT to respond.* -- *To use Khoj Chat, ensure you've set your [OpenAI API key](https://platform.openai.com/account/api-keys) in the Khoj plugin settings.* +E.g *"When did I file my taxes last year?"* See [Khoj Chat](/chat) for more details -### Search -Click the *Khoj search* icon 🔎 on the [Ribbon](https://help.obsidian.md/User+interface/Workspace/Ribbon) or run *Khoj: Search* from the [Command Palette](https://help.obsidian.md/Plugins/Command+palette) - -*Note: Ensure the khoj server is running in the background before searching. Execute `khoj` in your terminal if it is not already running* - -[search_demo](https://user-images.githubusercontent.com/6413477/218801155-cd67e8b4-a770-404a-8179-d6b61caa0f93.mp4 ':include :type=mp4') - -#### Query Filters - -Use structured query syntax to filter the natural language search results -- **Word Filter**: Get entries that include/exclude a specified term - - Entries that contain term_to_include: `+"term_to_include"` - - Entries that contain term_to_exclude: `-"term_to_exclude"` -- **Date Filter**: Get entries containing dates in YYYY-MM-DD format from specified date (range) - - Entries from April 1st 1984: `dt:"1984-04-01"` - - Entries after March 31st 1984: `dt>="1984-04-01"` - - Entries before April 2nd 1984 : `dt<="1984-04-01"` -- **File Filter**: Get entries from a specified file - - Entries from incoming.org file: `file:"incoming.org"` -- Combined Example - - `what is the meaning of life? file:"1984.org" dt>="1984-01-01" dt<="1985-01-01" -"big" -"brother"` - - Adds all filters to the natural language query. 
It should return entries - - from the file *1984.org* - - containing dates from the year *1984* - - excluding words *"big"* and *"brother"* - - that best match the natural language query *"what is the meaning of life?"* - ### Find Similar Notes To see other notes similar to the current one, run *Khoj: Find Similar Notes* from the [Command Palette](https://help.obsidian.md/Plugins/Command+palette) +### Search +Click the *Khoj search* icon 🔎 on the [Ribbon](https://help.obsidian.md/User+interface/Workspace/Ribbon) or run *Khoj: Search* from the [Command Palette](https://help.obsidian.md/Plugins/Command+palette) + +See [Khoj Search](/search) for more details. Use [query filters](/advanced#query-filters) to limit entries to search + +[search_demo](https://user-images.githubusercontent.com/6413477/218801155-cd67e8b4-a770-404a-8179-d6b61caa0f93.mp4 ':include :type=mp4') + ## Upgrade -### 1. Upgrade Backend - ```shell - pip install --upgrade khoj-assistant - ``` -### 2. Upgrade Plugin 1. Open *Community plugins* tab in Obsidian settings 2. Click the *Check for updates* button 3. Click the *Update* button next to Khoj, if available -## Demo -### Search Demo -[demo](https://github-production-user-asset-6210df.s3.amazonaws.com/6413477/240061700-3e33d8ea-25bb-46c8-a3bf-c92f78d0f56b.mp4 ':include :type=mp4') - -#### Description - -1. Install Khoj via `pip` and start Khoj backend - ```shell - python -m pip install khoj-assistant && khoj - ``` -2. 
Install Khoj plugin via Community Plugins settings pane on Obsidian app - - Check the new Khoj plugin settings - - Wait for Khoj backend to index markdown, PDF files in the current Vault - - Open Khoj plugin on Obsidian via Search button on Left Pane - - Search \"*Announce plugin to folks*\" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/) - - Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin) - - ## Troubleshooting - Open the Khoj plugin settings pane, to configure Khoj - Toggle Enable/Disable Khoj, if setting changes have not applied - Click *Update* button to force index to refresh, if results are failing or stale - -## Current Limitations -- The plugin loads the index of only one vault at a time.
- So notes across multiple vaults **cannot** be searched at the same time diff --git a/docs/search.md b/docs/search.md index 579034ec..f2387f06 100644 --- a/docs/search.md +++ b/docs/search.md @@ -1,7 +1,7 @@ ## Khoj Search ### Use 1. Open Khoj Search - - **On Web**: Open in your web browser + - **On Web**: Open in your web browser - **On Obsidian**: Click the *Khoj search* icon 🔎 on the [Ribbon](https://help.obsidian.md/User+interface/Workspace/Ribbon) or Search for *Khoj: Search* in the [Command Palette](https://help.obsidian.md/Plugins/Command+palette) - **On Emacs**: Run `M-x khoj ` 2. Query using natural language to find relevant entries from your knowledge base. Use [query filters](./advanced.md#query-filters) to limit entries to search diff --git a/docs/setup.md b/docs/setup.md index 2e02d271..a1d2c17c 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -3,41 +3,15 @@ These are the general setup instructions for Khoj. - Make sure [python](https://realpython.com/installing-python/) and [pip](https://pip.pypa.io/en/stable/installation/) are installed on your machine - Check the [Khoj Emacs docs](/emacs?id=setup) to setup Khoj with Emacs
- Its simpler as it can skip the server *install*, *run* and *configure* step below. + It's simpler as it can skip the server *install*, *run* and *configure* step below. - Check the [Khoj Obsidian docs](/obsidian?id=_2-setup-plugin) to setup Khoj with Obsidian
Its simpler as it can skip the *configure* step below. -### 1. Install +For Installation, you can either use Docker or install Khoj locally. -#### 1.1 Local Server Setup -Run the following command in your terminal to install the Khoj backend. +### 1. Installation (Docker) -- On Linux/MacOS - ```shell - python -m pip install khoj-assistant - ``` - -- On Windows - ```shell - py -m pip install khoj-assistant - ``` -For more detailed Windows installation and troubleshooting, see [Windows Install](./windows_install.md). - - -##### 1.1.1 Local Server Start - -Run the following command from your terminal to start the Khoj backend and open Khoj in your browser. - -```shell -khoj -``` - -Khoj should now be running at http://localhost:42110. You can see the web UI in your browser. - -Note: To start Khoj automatically in the background use [Task scheduler](https://www.windowscentral.com/how-create-automated-task-using-task-scheduler-windows-10) on Windows or [Cron](https://en.wikipedia.org/wiki/Cron) on Mac, Linux (e.g with `@reboot khoj`) - -#### 1.2 Local Docker Setup -Use the sample docker-compose [in Github](https://github.com/khoj-ai/khoj/blob/master/docker-compose.yml) to run Khoj in Docker. To start the container, run the following command in the same directory as the docker-compose.yml file. You'll have to configure the mounted directories to match your local knowledge base. +Use the sample docker-compose [in Github](https://github.com/khoj-ai/khoj/blob/master/docker-compose.yml) to run Khoj in Docker. Start by configuring all the environment variables to your choosing. Your admin account will automatically be created based on the admin credentials in that file, so pay attention to those. To start the container, run the following command in the same directory as the docker-compose.yml file. This will automatically setup the database and run the Khoj server. 
```shell docker-compose up @@ -45,27 +19,131 @@ docker-compose up Khoj should now be running at http://localhost:42110. You can see the web UI in your browser. -#### 1.3 Download the desktop client [Optional] +### 1. Installation (Local) -You can use our desktop executables to select file paths and folders to index. You can simply select the folders or files, and they'll be automatically uploaded to the server. Once you specify a file or file path, you don't need to update the configuration again; it will grab any data diffs dynamically over time. This part is currently optional, but may make setup and configuration slightly easier. It removes the need for setting up custom file paths for your Khoj data configurations. +#### Prerequisites -**To download the desktop client, go to https://download.khoj.dev** and the correct executable for your OS will automatically start downloading. Once downloaded, you can configure your folders for indexing using the settings tab. To set your chat configuration, you'll have to use the web interface for the Khoj server you setup in the previous step. +##### Install Postgres (with PgVector) -### 1.4 Use (deprecated) desktop builds +Khoj uses the `pgvector` package to store embeddings of your index in a Postgres database. In order to use this, you need to have Postgres installed. -Before `v0.12.0``, we had self-contained desktop builds that included both the server and the client. These were difficult to maintain, but are still available as part of earlier releases. To find setup instructions, see here: + -- [Desktop Installation](desktop_installation.md) -- [Windows Installation](windows_install.md) +#### **MacOS** -### 2. Configure -1. Set `File`, `Folder` and hit `Save` in each Plugins you want to enable for Search on the Khoj config page -2. Add your OpenAI API key to Chat Feature settings if you want to use Chat -3. Click `Configure` and wait. 
The app will download ML models and index the content for search and (optionally) chat +Install [Postgres.app](https://postgresapp.com/). This comes pre-installed with `pgvector` and relevant dependencies. -![configure demo](https://user-images.githubusercontent.com/6413477/255307879-61247d3f-c69a-46ef-b058-9bc533cb5c72.mp4 ':include :type=mp4') +#### **Windows** -### 3. Install Interface Plugins (Optional) +Use the [recommended installer](https://www.postgresql.org/download/windows/) + +#### **Linux** +From [official instructions](https://wiki.postgresql.org/wiki/Apt) + +```bash +sudo apt install -y postgresql-common +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh +sudo apt install postgresql-16 postgresql-16-pgvector +``` + +##### **From Source** +1. Follow instructions to [Install Postgres](https://www.postgresql.org/download/) +2. Follow instructions to [Install PgVector](https://github.com/pgvector/pgvector#installation) in case you need to manually install it. Reproduced instructions below for convenience. + +```bash +cd /tmp +git clone --branch v0.5.1 https://github.com/pgvector/pgvector.git +cd pgvector +make +make install # may need sudo +``` + + + + +##### Create the Khoj database + +Make sure to update your environment variables to match your Postgres configuration if you're using a different name. The default values should work for most people. + + + +#### **MacOS** +```bash +createdb khoj -U postgres +``` + +#### **Windows** +```bash +createdb khoj -U postgres +``` + +#### **Linux** +```bash +sudo -u postgres createdb khoj +``` + + + +#### Install package + +##### Local Server Setup +- *Make sure [python](https://realpython.com/installing-python/) and [pip](https://pip.pypa.io/en/stable/installation/) are installed on your machine* + +Run the following command in your terminal to install the Khoj backend. 
+ + + +#### **MacOS** + +```shell +python -m pip install khoj-assistant +``` + +#### **Windows** + +```shell +py -m pip install khoj-assistant +``` +For more detailed Windows installation and troubleshooting, see [Windows Install](./windows_install.md). + +#### **Linux** + +```shell +python -m pip install khoj-assistant +``` + + + +##### Local Server Start + +Run the following command from your terminal to start the Khoj backend and open Khoj in your browser. + +```shell +khoj --anonymous-mode +``` +`--anonymous-mode` allows you to run the server without setting up Google credentials for login. This allows you to use any of the clients without a login wall. If you want to use Google login, you can skip this flag, but you will have to add your Google developer credentials. + +On the first run, you will be prompted to input credentials for your admin account and do some basic configuration for your chat model settings. Once created, you can go to http://localhost:42110/server/admin and login with the credentials you just created. + +Khoj should now be running at http://localhost:42110. You can see the web UI in your browser. + +Note: To start Khoj automatically in the background use [Task scheduler](https://www.windowscentral.com/how-create-automated-task-using-task-scheduler-windows-10) on Windows or [Cron](https://en.wikipedia.org/wiki/Cron) on Mac, Linux (e.g with `@reboot khoj`) + + +### 2. Download the desktop client + +You can use our desktop executables to select file paths and folders to index. You can simply select the folders or files, and they'll be automatically uploaded to the server. Once you specify a file or file path, you don't need to update the configuration again; it will grab any data diffs dynamically over time. + +**To download the latest desktop client, go to https://download.khoj.dev** and the correct executable for your OS will automatically start downloading. 
Once downloaded, you can configure your folders for indexing using the settings tab. To set your chat configuration, you'll have to use the web interface for the Khoj server you setup in the previous step. + +To use the desktop client, you need to go to your Khoj server's settings page (http://localhost:42110/config) and copy the API key. Then, paste it into the desktop client's settings page. Once you've done that, you can select files and folders to index. + +### 3. Configure +1. Go to http://localhost:42110/server/admin and login with your admin credentials. Go to the ChatModelOptions if you want to add additional models for chat. +1. Select files and folders to index [using the desktop client](./setup.md?id=_2-download-the-desktop-client). When you click 'Save', the files will be sent to your server for indexing. + - Select Notion workspaces and Github repositories to index using the web interface. + +### 4. Install Client Plugins (Optional) Khoj exposes a web interface to search, chat and configure by default.
The optional steps below allow using Khoj from within an existing application like Obsidian or Emacs. @@ -75,9 +153,17 @@ The optional steps below allow using Khoj from within an existing application li - **Khoj Emacs**:
[Install](/emacs?id=setup) khoj.el +### 5. Use Khoj 🚀 + +You can head to http://localhost:42110 to use the web interface. You can also use the desktop client to search and chat. ## Upgrade ### Upgrade Khoj Server + + + +#### **Local Setup** + ```shell pip install --upgrade khoj-assistant ``` @@ -88,6 +174,16 @@ pip install --upgrade khoj-assistant pip install --upgrade --pre khoj-assistant ``` +#### **Docker** +From the same directory where you have your `docker-compose` file, this will fetch the latest build and upgrade your server. + +```shell +docker-compose up --build +``` + + + + ### Upgrade Khoj on Emacs - Use your Emacs Package Manager to Upgrade - See [khoj.el package setup](/emacs?id=setup) for details @@ -100,8 +196,8 @@ pip install --upgrade --pre khoj-assistant 1. (Optional) Hit `Ctrl-C` in the terminal running the khoj server to stop it 2. Delete the khoj directory in your home folder (i.e `~/.khoj` on Linux, Mac or `C:\Users\\.khoj` on Windows) 5. You might want to `rm -rf` the following directories: -- `~/.khoj` -- `~/.cache/gpt4all` + - `~/.khoj` + - `~/.cache/gpt4all` 3. Uninstall the khoj server with `pip uninstall khoj-assistant` 4. (Optional) Uninstall khoj.el or the khoj obsidian plugin in the standard way on Emacs, Obsidian diff --git a/docs/telemetry.md b/docs/telemetry.md index 11490c3a..060bcdab 100644 --- a/docs/telemetry.md +++ b/docs/telemetry.md @@ -1,4 +1,4 @@ -# Telemetry +# Telemetry (self-hosting) We collect some high level, anonymized metadata about usage of Khoj. This includes: - Client (Web, Emacs, Obsidian) diff --git a/docs/web.md b/docs/web.md index 90791c6e..21571f46 100644 --- a/docs/web.md +++ b/docs/web.md @@ -1,19 +1,18 @@ -

Khoj LogoWeb

+

Khoj Logo Web

-> An AI personal assistant for your Digital Brain +> An AI copilot for your Second Brain ## Features +- **Chat** + - **Faster answers**: Find answers quickly, from your private notes or the public internet + - **Assisted creativity**: Smoothly weave across retrieving answers and generating content + - **Iterative discovery**: Iteratively explore and re-discover your notes - **Search** - **Natural**: Advanced natural language understanding using Transformer based ML Models - - **Local**: Your personal data stays local. All search and indexing is done on your machine. *Unlike chat which requires access to GPT.* - **Incremental**: Incremental search for a fast, search-as-you-type experience -- **Chat** - - **Faster answers**: Find answers faster and with less effort than search - - **Iterative discovery**: Iteratively explore and (re-)discover your notes - - **Assisted creativity**: Smoothly weave across answers retrieval and content generation ## Setup -The Khoj web interface is the default interface. It comes packaged with the khoj server. +No setup required. The Khoj web app is the default interface to Khoj. You can access it from any web browser. 
Try it on [Khoj Cloud](https://app.khoj.dev) ## Interface ![](./assets/khoj_search_on_web.png ':size=400px') diff --git a/gunicorn-config.py b/gunicorn-config.py new file mode 100644 index 00000000..1760ae38 --- /dev/null +++ b/gunicorn-config.py @@ -0,0 +1,10 @@ +import multiprocessing + +bind = "0.0.0.0:42110" +workers = 4 +worker_class = "uvicorn.workers.UvicornWorker" +timeout = 120 +keep_alive = 60 +accesslog = "access.log" +errorlog = "error.log" +loglevel = "debug" diff --git a/manifest.json b/manifest.json index a1c1f913..4d019834 100644 --- a/manifest.json +++ b/manifest.json @@ -1,7 +1,7 @@ { "id": "khoj", "name": "Khoj", - "version": "0.14.0", + "version": "1.0.0", "minAppVersion": "0.15.0", "description": "An AI copilot for your Second Brain", "author": "Khoj Inc.", diff --git a/prod.Dockerfile b/prod.Dockerfile new file mode 100644 index 00000000..693a3a8b --- /dev/null +++ b/prod.Dockerfile @@ -0,0 +1,30 @@ +# Use Nvidia's latest Ubuntu 22.04 image as the base image +FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 + +LABEL org.opencontainers.image.source https://github.com/khoj-ai/khoj + +# Install System Dependencies +RUN apt update -y && apt -y install python3-pip git libsqlite3-0 ffmpeg libsm6 libxext6 + +WORKDIR /app + +# Install Application +COPY pyproject.toml . +COPY README.md . +RUN sed -i 's/dynamic = \["version"\]/version = "0.0.0"/' pyproject.toml && \ + TMPDIR=/home/cache/ pip install --cache-dir=/home/cache/ -e . + +# Copy Source Code +COPY . . + +RUN apt install vim -y + +# Set the PYTHONPATH environment variable in order for it to find the Django app. +ENV PYTHONPATH=/app/src:$PYTHONPATH + +# Run the Application +# There are more arguments required for the application to run, +# but these should be passed in through the docker-compose.yml file. 
+ARG PORT +EXPOSE ${PORT} +ENTRYPOINT [ "gunicorn", "-c", "gunicorn-config.py", "src.khoj.main:app" ] diff --git a/pyproject.toml b/pyproject.toml index bac662a4..a457aec6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "bs4 >= 0.0.1", "dateparser >= 1.1.1", "defusedxml == 0.7.1", - "fastapi == 0.77.1", + "fastapi >= 0.104.1", "python-multipart >= 0.0.5", "jinja2 == 3.1.2", "openai >= 0.27.0, < 1.0.0", @@ -54,14 +54,27 @@ dependencies = [ "transformers >= 4.28.0", "torch == 2.0.1", "uvicorn == 0.17.6", - "aiohttp == 3.8.5", - "langchain >= 0.0.187", + "aiohttp == 3.8.6", + "langchain >= 0.0.331", "requests >= 2.26.0", "bs4 >= 0.0.1", "anyio == 3.7.1", - "pymupdf >= 1.23.3", + "pymupdf >= 1.23.5", + "django == 4.2.7", + "authlib == 1.2.1", "gpt4all >= 2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'", "gpt4all >= 2.0.0; platform_system == 'Windows' or platform_system == 'Darwin'", + "itsdangerous == 2.1.2", + "httpx == 0.25.0", + "pgvector == 0.2.3", + "psycopg2-binary == 2.9.9", + "google-auth == 2.23.3", + "python-multipart == 0.0.6", + "gunicorn == 21.2.0", + "lxml == 4.9.3", + "tzdata == 2023.3", + "rapidocr-onnxruntime == 1.3.8", + "stripe == 7.3.0", ] dynamic = ["version"] @@ -81,12 +94,15 @@ test = [ "factory-boy >= 3.2.1", "trio >= 0.22.0", "pytest-xdist", + "psutil >= 5.8.0", ] dev = [ "khoj-assistant[test]", "mypy >= 1.0.1", "black >= 23.1.0", "pre-commit >= 3.0.4", + "pytest-django == 4.5.2", + "pytest-asyncio == 0.21.1", ] [tool.hatch.version] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..b3e418d0 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,6 @@ +[pytest] +DJANGO_SETTINGS_MODULE = app.settings +pythonpath = . 
src +testpaths = tests +markers = + chatquality: marks tests as chatquality (deselect with '-m "not chatquality"') diff --git a/src/app/README.md b/src/app/README.md new file mode 100644 index 00000000..14fc8501 --- /dev/null +++ b/src/app/README.md @@ -0,0 +1,94 @@ +# Django App + +Khoj uses Django as the backend framework primarily for its powerful ORM and the admin interface. The Django app is located in the `src/app` directory. We have one installed app, under the `/database/` directory. This app is responsible for all the database related operations and holds all of our models. You can find the extensive Django documentation [here](https://docs.djangoproject.com/en/4.2/) 🌈. + +## Setup (Docker) + +### Prerequisites +1. Ensure you have [Docker](https://docs.docker.com/get-docker/) installed. +2. Ensure you have [Docker Compose](https://docs.docker.com/compose/install/) installed. + +### Run + +Using the `docker-compose.yml` file in the root directory, you can run the Khoj app using the following command: +```bash +docker-compose up +``` + +## Setup (Local) + +### Install Postgres (with PgVector) + +#### MacOS +- Install the [Postgres.app](https://postgresapp.com/). + +#### Debian, Ubuntu +From [official instructions](https://wiki.postgresql.org/wiki/Apt) + +```bash +sudo apt install -y postgresql-common +sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh +sudo apt install postgresql-16 postgresql-16-pgvector +``` + +#### Windows +- Use the [recommended installer](https://www.postgresql.org/download/windows/) + +#### From Source +1. Follow instructions to [Install Postgres](https://www.postgresql.org/download/) +2. Follow instructions to [Install PgVector](https://github.com/pgvector/pgvector#installation) in case you need to manually install it. Reproduced instructions below for convenience. 
+ +```bash +cd /tmp +git clone --branch v0.5.1 https://github.com/pgvector/pgvector.git +cd pgvector +make +make install # may need sudo +``` + +### Create the Khoj database + +#### MacOS +```bash +createdb khoj -U postgres +``` + +#### Debian, Ubuntu +```bash +sudo -u postgres createdb khoj +``` + +- [Optional] To set default postgres user's password + - Execute `ALTER USER postgres PASSWORD 'my_secure_password';` using `psql` + - Run `export POSTGRES_PASSWORD=my_secure_password` in your terminal for Khoj to use it later + +### Install Khoj + +```bash +pip install -e '.[dev]' +``` + +### Make Khoj DB migrations + +This command will create the migrations for the database app. This command should be run whenever a new db model is added to the database app or an existing db model is modified (updated or deleted). + +```bash +python3 src/manage.py makemigrations +``` + +### Run Khoj DB migrations + +This command will run any pending migrations in your application. +```bash +python3 src/manage.py migrate +``` + +### Start Khoj Server + +While we're using Django for the ORM, we're still using the FastAPI server for the API. This command automatically scaffolds the Django application in the backend. + +*Note: Anonymous mode bypasses authentication for local, single-user usage.* + +```bash +python3 src/khoj/main.py --anonymous-mode +``` diff --git a/src/khoj/processor/jsonl/__init__.py b/src/app/__init__.py similarity index 100% rename from src/khoj/processor/jsonl/__init__.py rename to src/app/__init__.py diff --git a/src/app/settings.py b/src/app/settings.py new file mode 100644 index 00000000..0803081d --- /dev/null +++ b/src/app/settings.py @@ -0,0 +1,152 @@ +""" +Django settings for app project. + +Generated by 'django-admin startproject' using Django 4.2.5. 
+ +For more information on this file, see +https://docs.djangoproject.com/en/4.2/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/4.2/ref/settings/ +""" + +from pathlib import Path +import os + +# Build paths inside the project like this: BASE_DIR / 'subdir'. +BASE_DIR = Path(__file__).resolve().parent.parent.parent + + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = os.getenv("KHOJ_DJANGO_SECRET_KEY") + +# SECURITY WARNING: don't run with debug turned on in production! +DEBUG = os.getenv("KHOJ_DEBUG", "False") == "True" + +ALLOWED_HOSTS = [".khoj.dev", "localhost", "127.0.0.1", "[::1]", "beta.khoj.dev"] + +CSRF_TRUSTED_ORIGINS = [ + "https://app.khoj.dev", + "https://beta.khoj.dev", + "https://khoj.dev", + "https://*.khoj.dev", +] + +COOKIE_SAMESITE = "None" +if DEBUG: + SESSION_COOKIE_DOMAIN = "localhost" + CSRF_COOKIE_DOMAIN = "localhost" +else: + SESSION_COOKIE_DOMAIN = "khoj.dev" + CSRF_COOKIE_DOMAIN = "khoj.dev" + +SESSION_COOKIE_SECURE = True +CSRF_COOKIE_SECURE = True +COOKIE_SAMESITE = "None" +SESSION_COOKIE_SAMESITE = "None" + +# Application definition + +INSTALLED_APPS = [ + "django.contrib.auth", + "django.contrib.contenttypes", + "database.apps.DatabaseConfig", + "django.contrib.admin", + "django.contrib.sessions", + "django.contrib.messages", + "django.contrib.staticfiles", +] + +MIDDLEWARE = [ + "django.middleware.security.SecurityMiddleware", + "django.contrib.sessions.middleware.SessionMiddleware", + "django.middleware.common.CommonMiddleware", + "django.middleware.csrf.CsrfViewMiddleware", + "django.contrib.auth.middleware.AuthenticationMiddleware", + "django.contrib.messages.middleware.MessageMiddleware", + "django.middleware.clickjacking.XFrameOptionsMiddleware", +] + +ROOT_URLCONF = "app.urls" + +TEMPLATES = [ + { + 
"BACKEND": "django.template.backends.django.DjangoTemplates", + "APP_DIRS": True, + "DIRS": [os.path.join(BASE_DIR, "templates"), os.path.join(BASE_DIR, "templates", "account")], + "OPTIONS": { + "context_processors": [ + "django.template.context_processors.debug", + "django.template.context_processors.request", + "django.contrib.auth.context_processors.auth", + "django.contrib.messages.context_processors.messages", + ], + }, + }, +] + +WSGI_APPLICATION = "app.wsgi.application" + + +# Database +# https://docs.djangoproject.com/en/4.2/ref/settings/#databases + +DATABASES = { + "default": { + "ENGINE": "django.db.backends.postgresql", + "HOST": os.getenv("POSTGRES_HOST", "localhost"), + "PORT": os.getenv("POSTGRES_PORT", "5432"), + "USER": os.getenv("POSTGRES_USER", "postgres"), + "NAME": os.getenv("POSTGRES_DB", "khoj"), + "PASSWORD": os.getenv("POSTGRES_PASSWORD", "postgres"), + } +} + +# User Settings +AUTH_USER_MODEL = "database.KhojUser" + +# Password validation +# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators + +AUTH_PASSWORD_VALIDATORS = [ + { + "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, +] + + +# Internationalization +# https://docs.djangoproject.com/en/4.2/topics/i18n/ + +LANGUAGE_CODE = "en-us" + +TIME_ZONE = "UTC" + +USE_I18N = True + +USE_TZ = True + + +# Static files (CSS, JavaScript, Images) +# https://docs.djangoproject.com/en/4.2/howto/static-files/ + +STATIC_ROOT = BASE_DIR / "static" +STATICFILES_DIRS = [BASE_DIR / "src/khoj/interface/web"] +STATIC_URL = "/static/" + +# Default primary key field type +# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field + +DEFAULT_AUTO_FIELD = 
"django.db.models.BigAutoField" diff --git a/src/app/urls.py b/src/app/urls.py new file mode 100644 index 00000000..39b4b1ef --- /dev/null +++ b/src/app/urls.py @@ -0,0 +1,25 @@ +""" +URL configuration for app project. + +The `urlpatterns` list routes URLs to views. For more information please see: + https://docs.djangoproject.com/en/4.2/topics/http/urls/ +Examples: +Function views + 1. Add an import: from my_app import views + 2. Add a URL to urlpatterns: path('', views.home, name='home') +Class-based views + 1. Add an import: from other_app.views import Home + 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') +Including another URLconf + 1. Import the include() function: from django.urls import include, path + 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) +""" +from django.contrib import admin +from django.urls import path +from django.contrib.staticfiles.urls import staticfiles_urlpatterns + +urlpatterns = [ + path("admin/", admin.site.urls), +] + +urlpatterns += staticfiles_urlpatterns() diff --git a/src/app/wsgi.py b/src/app/wsgi.py new file mode 100644 index 00000000..cbdf4342 --- /dev/null +++ b/src/app/wsgi.py @@ -0,0 +1,16 @@ +""" +WSGI config for app project. + +It exposes the WSGI callable as a module-level variable named ``application``. 
+ +For more information on this file, see +https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "app.settings") + +application = get_wsgi_application() diff --git a/src/database/__init__.py b/src/database/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/database/adapters/__init__.py b/src/database/adapters/__init__.py new file mode 100644 index 00000000..4141d3bb --- /dev/null +++ b/src/database/adapters/__init__.py @@ -0,0 +1,481 @@ +import math +from typing import Optional, Type, List +from datetime import date, datetime +import secrets +from typing import Type, List +from datetime import date, timezone + +from django.db import models +from django.contrib.sessions.backends.db import SessionStore +from pgvector.django import CosineDistance +from django.db.models.manager import BaseManager +from django.db.models import Q +from torch import Tensor + +# Import sync_to_async from Django Channels +from asgiref.sync import sync_to_async + +from fastapi import HTTPException + +from database.models import ( + KhojUser, + GoogleUser, + KhojApiUser, + NotionConfig, + GithubConfig, + Entry, + GithubRepoConfig, + Conversation, + ChatModelOptions, + SearchModelConfig, + Subscription, + UserConversationConfig, + OpenAIProcessorConversationConfig, + OfflineChatProcessorConversationConfig, +) +from khoj.utils.helpers import generate_random_name +from khoj.search_filter.word_filter import WordFilter +from khoj.search_filter.file_filter import FileFilter +from khoj.search_filter.date_filter import DateFilter + + +async def set_notion_config(token: str, user: KhojUser): + notion_config = await NotionConfig.objects.filter(user=user).afirst() + if not notion_config: + notion_config = await NotionConfig.objects.acreate(token=token, user=user) + else: + notion_config.token = token + await notion_config.asave() + return notion_config 
+ + +async def create_khoj_token(user: KhojUser, name=None): + "Create Khoj API key for user" + token = f"kk-{secrets.token_urlsafe(32)}" + name = name or f"{generate_random_name().title()}" + return await KhojApiUser.objects.acreate(token=token, user=user, name=name) + + +def get_khoj_tokens(user: KhojUser): + "Get all Khoj API keys for user" + return list(KhojApiUser.objects.filter(user=user)) + + +async def delete_khoj_token(user: KhojUser, token: str): + "Delete Khoj API Key for user" + await KhojApiUser.objects.filter(token=token, user=user).adelete() + + +async def get_or_create_user(token: dict) -> KhojUser: + user = await get_user_by_token(token) + if not user: + user = await create_user_by_google_token(token) + return user + + +async def create_user_by_google_token(token: dict) -> KhojUser: + user, _ = await KhojUser.objects.filter(email=token.get("email")).aupdate_or_create( + defaults={"username": token.get("email"), "email": token.get("email")} + ) + await user.asave() + + await GoogleUser.objects.acreate( + sub=token.get("sub"), + azp=token.get("azp"), + email=token.get("email"), + name=token.get("name"), + given_name=token.get("given_name"), + family_name=token.get("family_name"), + picture=token.get("picture"), + locale=token.get("locale"), + user=user, + ) + + await Subscription.objects.acreate(user=user, type="trial") + + return user + + +def get_user_subscription(email: str) -> Optional[Subscription]: + return Subscription.objects.filter(user__email=email).first() + + +async def set_user_subscription( + email: str, is_recurring=None, renewal_date=None, type="standard" +) -> Optional[Subscription]: + user_subscription = await Subscription.objects.filter(user__email=email).afirst() + if not user_subscription: + user = await get_user_by_email(email) + if not user: + return None + user_subscription = await Subscription.objects.acreate( + user=user, type=type, is_recurring=is_recurring, renewal_date=renewal_date + ) + return user_subscription + elif 
user_subscription: + user_subscription.type = type + if is_recurring is not None: + user_subscription.is_recurring = is_recurring + if renewal_date is False: + user_subscription.renewal_date = None + elif renewal_date is not None: + user_subscription.renewal_date = renewal_date + await user_subscription.asave() + return user_subscription + else: + return None + + +def get_user_subscription_state(email: str) -> str: + """Get subscription state of user + Valid state transitions: trial -> subscribed <-> unsubscribed OR expired + """ + user_subscription = Subscription.objects.filter(user__email=email).first() + if not user_subscription: + return "trial" + elif user_subscription.type == Subscription.Type.TRIAL: + return "trial" + elif user_subscription.is_recurring and user_subscription.renewal_date >= datetime.now(tz=timezone.utc): + return "subscribed" + elif not user_subscription.is_recurring and user_subscription.renewal_date >= datetime.now(tz=timezone.utc): + return "unsubscribed" + elif not user_subscription.is_recurring and user_subscription.renewal_date < datetime.now(tz=timezone.utc): + return "expired" + return "invalid" + + +async def get_user_by_email(email: str) -> KhojUser: + return await KhojUser.objects.filter(email=email).afirst() + + +async def get_user_by_token(token: dict) -> KhojUser: + google_user = await GoogleUser.objects.filter(sub=token.get("sub")).select_related("user").afirst() + if not google_user: + return None + return google_user.user + + +async def retrieve_user(session_id: str) -> KhojUser: + session = SessionStore(session_key=session_id) + if not await sync_to_async(session.exists)(session_key=session_id): + raise HTTPException(status_code=401, detail="Invalid session") + session_data = await sync_to_async(session.load)() + user = await KhojUser.objects.filter(id=session_data.get("_auth_user_id")).afirst() + if not user: + raise HTTPException(status_code=401, detail="Invalid user") + return user + + +def get_all_users() -> 
BaseManager[KhojUser]: + return KhojUser.objects.all() + + +def get_user_github_config(user: KhojUser): + config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first() + return config + + +def get_user_notion_config(user: KhojUser): + config = NotionConfig.objects.filter(user=user).first() + return config + + +async def set_text_content_config(user: KhojUser, object: Type[models.Model], updated_config): + deduped_files = list(set(updated_config.input_files)) if updated_config.input_files else None + deduped_filters = list(set(updated_config.input_filter)) if updated_config.input_filter else None + await object.objects.filter(user=user).adelete() + await object.objects.acreate( + input_files=deduped_files, + input_filter=deduped_filters, + index_heading_entries=updated_config.index_heading_entries, + user=user, + ) + + +async def set_user_github_config(user: KhojUser, pat_token: str, repos: list): + config = await GithubConfig.objects.filter(user=user).afirst() + + if not config: + config = await GithubConfig.objects.acreate(pat_token=pat_token, user=user) + else: + config.pat_token = pat_token + await config.asave() + await config.githubrepoconfig.all().adelete() + + for repo in repos: + await GithubRepoConfig.objects.acreate( + name=repo["name"], owner=repo["owner"], branch=repo["branch"], github_config=config + ) + return config + + +def get_or_create_search_model(): + search_model = SearchModelConfig.objects.filter().first() + if not search_model: + search_model = SearchModelConfig.objects.create() + + return search_model + + +class ConversationAdapters: + @staticmethod + def get_conversation_by_user(user: KhojUser): + conversation = Conversation.objects.filter(user=user) + if conversation.exists(): + return conversation.first() + return Conversation.objects.create(user=user) + + @staticmethod + async def aget_conversation_by_user(user: KhojUser): + conversation = Conversation.objects.filter(user=user) + if await 
conversation.aexists(): + return await conversation.afirst() + return await Conversation.objects.acreate(user=user) + + @staticmethod + def has_any_conversation_config(user: KhojUser): + return ChatModelOptions.objects.filter(user=user).exists() + + @staticmethod + def get_openai_conversation_config(): + return OpenAIProcessorConversationConfig.objects.filter().first() + + @staticmethod + async def aget_openai_conversation_config(): + return await OpenAIProcessorConversationConfig.objects.filter().afirst() + + @staticmethod + def get_offline_chat_conversation_config(): + return OfflineChatProcessorConversationConfig.objects.filter().first() + + @staticmethod + async def aget_offline_chat_conversation_config(): + return await OfflineChatProcessorConversationConfig.objects.filter().afirst() + + @staticmethod + def has_valid_offline_conversation_config(): + return OfflineChatProcessorConversationConfig.objects.filter(enabled=True).exists() + + @staticmethod + def has_valid_openai_conversation_config(): + return OpenAIProcessorConversationConfig.objects.filter().exists() + + @staticmethod + async def aset_user_conversation_processor(user: KhojUser, conversation_processor_config_id: int): + config = await ChatModelOptions.objects.filter(id=conversation_processor_config_id).afirst() + if not config: + return None + new_config = await UserConversationConfig.objects.aupdate_or_create(user=user, defaults={"setting": config}) + return new_config + + @staticmethod + def get_conversation_config(user: KhojUser): + config = UserConversationConfig.objects.filter(user=user).first() + if not config: + return None + return config.setting + + @staticmethod + async def aget_conversation_config(user: KhojUser): + config = await UserConversationConfig.objects.filter(user=user).prefetch_related("setting").afirst() + if not config: + return None + return config.setting + + @staticmethod + def get_default_conversation_config(): + return ChatModelOptions.objects.filter().first() + + 
@staticmethod + async def aget_default_conversation_config(): + return await ChatModelOptions.objects.filter().afirst() + + @staticmethod + def save_conversation(user: KhojUser, conversation_log: dict): + conversation = Conversation.objects.filter(user=user) + if conversation.exists(): + conversation.update(conversation_log=conversation_log) + else: + Conversation.objects.create(user=user, conversation_log=conversation_log) + + @staticmethod + def get_conversation_processor_options(): + return ChatModelOptions.objects.all() + + @staticmethod + def set_conversation_processor_config(user: KhojUser, new_config: ChatModelOptions): + user_conversation_config, _ = UserConversationConfig.objects.get_or_create(user=user) + user_conversation_config.setting = new_config + user_conversation_config.save() + + @staticmethod + def has_offline_chat(): + return OfflineChatProcessorConversationConfig.objects.filter(enabled=True).exists() + + @staticmethod + async def ahas_offline_chat(): + return await OfflineChatProcessorConversationConfig.objects.filter(enabled=True).aexists() + + @staticmethod + async def get_offline_chat(): + return await ChatModelOptions.objects.filter(model_type="offline").afirst() + + @staticmethod + async def aget_user_conversation_config(user: KhojUser): + config = await UserConversationConfig.objects.filter(user=user).prefetch_related("setting").afirst() + if not config: + return None + return config.setting + + @staticmethod + async def has_openai_chat(): + return await OpenAIProcessorConversationConfig.objects.filter().aexists() + + @staticmethod + async def get_openai_chat(): + return await ChatModelOptions.objects.filter(model_type="openai").afirst() + + @staticmethod + async def get_openai_chat_config(): + return await OpenAIProcessorConversationConfig.objects.filter().afirst() + + +class EntryAdapters: + word_filer = WordFilter() + file_filter = FileFilter() + date_filter = DateFilter() + + @staticmethod + def does_entry_exist(user: KhojUser, 
hashed_value: str) -> bool: + return Entry.objects.filter(user=user, hashed_value=hashed_value).exists() + + @staticmethod + def delete_entry_by_file(user: KhojUser, file_path: str): + deleted_count, _ = Entry.objects.filter(user=user, file_path=file_path).delete() + return deleted_count + + @staticmethod + def delete_all_entries_by_type(user: KhojUser, file_type: str = None): + if file_type is None: + deleted_count, _ = Entry.objects.filter(user=user).delete() + else: + deleted_count, _ = Entry.objects.filter(user=user, file_type=file_type).delete() + return deleted_count + + @staticmethod + def delete_all_entries(user: KhojUser, file_source: str = None): + if file_source is None: + deleted_count, _ = Entry.objects.filter(user=user).delete() + else: + deleted_count, _ = Entry.objects.filter(user=user, file_source=file_source).delete() + return deleted_count + + @staticmethod + def get_existing_entry_hashes_by_file(user: KhojUser, file_path: str): + return Entry.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True) + + @staticmethod + def delete_entry_by_hash(user: KhojUser, hashed_values: List[str]): + Entry.objects.filter(user=user, hashed_value__in=hashed_values).delete() + + @staticmethod + def get_entries_by_date_filter(entry: BaseManager[Entry], start_date: date, end_date: date): + return entry.filter( + entrydates__date__gte=start_date, + entrydates__date__lte=end_date, + ) + + @staticmethod + def user_has_entries(user: KhojUser): + return Entry.objects.filter(user=user).exists() + + @staticmethod + async def auser_has_entries(user: KhojUser): + return await Entry.objects.filter(user=user).aexists() + + @staticmethod + async def adelete_entry_by_file(user: KhojUser, file_path: str): + return await Entry.objects.filter(user=user, file_path=file_path).adelete() + + @staticmethod + def aget_all_filenames_by_source(user: KhojUser, file_source: str): + return ( + Entry.objects.filter(user=user, file_source=file_source) + 
.distinct("file_path") + .values_list("file_path", flat=True) + ) + + @staticmethod + async def adelete_all_entries(user: KhojUser): + return await Entry.objects.filter(user=user).adelete() + + @staticmethod + def apply_filters(user: KhojUser, query: str, file_type_filter: str = None): + q_filter_terms = Q() + + explicit_word_terms = EntryAdapters.word_filer.get_filter_terms(query) + file_filters = EntryAdapters.file_filter.get_filter_terms(query) + date_filters = EntryAdapters.date_filter.get_query_date_range(query) + + if len(explicit_word_terms) == 0 and len(file_filters) == 0 and len(date_filters) == 0: + return Entry.objects.filter(user=user) + + for term in explicit_word_terms: + if term.startswith("+"): + q_filter_terms &= Q(raw__icontains=term[1:]) + elif term.startswith("-"): + q_filter_terms &= ~Q(raw__icontains=term[1:]) + + q_file_filter_terms = Q() + + if len(file_filters) > 0: + for term in file_filters: + q_file_filter_terms |= Q(file_path__regex=term) + + q_filter_terms &= q_file_filter_terms + + if len(date_filters) > 0: + min_date, max_date = date_filters + if min_date is not None: + # Convert the min_date timestamp to yyyy-mm-dd format + formatted_min_date = date.fromtimestamp(min_date).strftime("%Y-%m-%d") + q_filter_terms &= Q(embeddings_dates__date__gte=formatted_min_date) + if max_date is not None: + # Convert the max_date timestamp to yyyy-mm-dd format + formatted_max_date = date.fromtimestamp(max_date).strftime("%Y-%m-%d") + q_filter_terms &= Q(embeddings_dates__date__lte=formatted_max_date) + + relevant_entries = Entry.objects.filter(user=user).filter( + q_filter_terms, + ) + if file_type_filter: + relevant_entries = relevant_entries.filter(file_type=file_type_filter) + return relevant_entries + + @staticmethod + def search_with_embeddings( + user: KhojUser, + embeddings: Tensor, + max_results: int = 10, + file_type_filter: str = None, + raw_query: str = None, + max_distance: float = math.inf, + ): + relevant_entries = 
EntryAdapters.apply_filters(user, raw_query, file_type_filter) + relevant_entries = relevant_entries.filter(user=user).annotate( + distance=CosineDistance("embeddings", embeddings) + ) + relevant_entries = relevant_entries.filter(distance__lte=max_distance) + + if file_type_filter: + relevant_entries = relevant_entries.filter(file_type=file_type_filter) + relevant_entries = relevant_entries.order_by("distance") + return relevant_entries[:max_results] + + @staticmethod + def get_unique_file_types(user: KhojUser): + return Entry.objects.filter(user=user).values_list("file_type", flat=True).distinct() + + @staticmethod + def get_unique_file_sources(user: KhojUser): + return Entry.objects.filter(user=user).values_list("file_source", flat=True).distinct().all() diff --git a/src/database/admin.py b/src/database/admin.py new file mode 100644 index 00000000..8d2130ba --- /dev/null +++ b/src/database/admin.py @@ -0,0 +1,21 @@ +from django.contrib import admin +from django.contrib.auth.admin import UserAdmin + +# Register your models here. 
+ +from database.models import ( + KhojUser, + ChatModelOptions, + OpenAIProcessorConversationConfig, + OfflineChatProcessorConversationConfig, + SearchModelConfig, + Subscription, +) + +admin.site.register(KhojUser, UserAdmin) + +admin.site.register(ChatModelOptions) +admin.site.register(OpenAIProcessorConversationConfig) +admin.site.register(OfflineChatProcessorConversationConfig) +admin.site.register(SearchModelConfig) +admin.site.register(Subscription) diff --git a/src/database/apps.py b/src/database/apps.py new file mode 100644 index 00000000..a3b71b13 --- /dev/null +++ b/src/database/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class DatabaseConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "database" diff --git a/src/database/migrations/0001_khojuser.py b/src/database/migrations/0001_khojuser.py new file mode 100644 index 00000000..f1420575 --- /dev/null +++ b/src/database/migrations/0001_khojuser.py @@ -0,0 +1,98 @@ +# Generated by Django 4.2.5 on 2023-09-14 19:00 + +import django.contrib.auth.models +import django.contrib.auth.validators +from django.db import migrations, models +import django.utils.timezone + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("auth", "0012_alter_user_first_name_max_length"), + ] + + run_before = [ + ("admin", "0001_initial"), + ] + + operations = [ + migrations.CreateModel( + name="KhojUser", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("password", models.CharField(max_length=128, verbose_name="password")), + ("last_login", models.DateTimeField(blank=True, null=True, verbose_name="last login")), + ( + "is_superuser", + models.BooleanField( + default=False, + help_text="Designates that this user has all permissions without explicitly assigning them.", + verbose_name="superuser status", + ), + ), + ( + "username", + models.CharField( + error_messages={"unique": "A user 
with that username already exists."}, + help_text="Required. 150 characters or fewer. Letters, digits and @/./+/-/_ only.", + max_length=150, + unique=True, + validators=[django.contrib.auth.validators.UnicodeUsernameValidator()], + verbose_name="username", + ), + ), + ("first_name", models.CharField(blank=True, max_length=150, verbose_name="first name")), + ("last_name", models.CharField(blank=True, max_length=150, verbose_name="last name")), + ("email", models.EmailField(blank=True, max_length=254, verbose_name="email address")), + ( + "is_staff", + models.BooleanField( + default=False, + help_text="Designates whether the user can log into this admin site.", + verbose_name="staff status", + ), + ), + ( + "is_active", + models.BooleanField( + default=True, + help_text="Designates whether this user should be treated as active. Unselect this instead of deleting accounts.", + verbose_name="active", + ), + ), + ("date_joined", models.DateTimeField(default=django.utils.timezone.now, verbose_name="date joined")), + ( + "groups", + models.ManyToManyField( + blank=True, + help_text="The groups this user belongs to. 
A user will get all permissions granted to each of their groups.", + related_name="user_set", + related_query_name="user", + to="auth.group", + verbose_name="groups", + ), + ), + ( + "user_permissions", + models.ManyToManyField( + blank=True, + help_text="Specific permissions for this user.", + related_name="user_set", + related_query_name="user", + to="auth.permission", + verbose_name="user permissions", + ), + ), + ], + options={ + "verbose_name": "user", + "verbose_name_plural": "users", + "abstract": False, + }, + managers=[ + ("objects", django.contrib.auth.models.UserManager()), + ], + ), + ] diff --git a/src/database/migrations/0002_googleuser.py b/src/database/migrations/0002_googleuser.py new file mode 100644 index 00000000..478770d6 --- /dev/null +++ b/src/database/migrations/0002_googleuser.py @@ -0,0 +1,32 @@ +# Generated by Django 4.2.4 on 2023-09-18 23:24 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0001_khojuser"), + ] + + operations = [ + migrations.CreateModel( + name="GoogleUser", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("sub", models.CharField(max_length=200)), + ("azp", models.CharField(max_length=200)), + ("email", models.CharField(max_length=200)), + ("name", models.CharField(max_length=200)), + ("given_name", models.CharField(max_length=200)), + ("family_name", models.CharField(max_length=200)), + ("picture", models.CharField(max_length=200)), + ("locale", models.CharField(max_length=200)), + ( + "user", + models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + ], + ), + ] diff --git a/src/database/migrations/0003_vector_extension.py b/src/database/migrations/0003_vector_extension.py new file mode 100644 index 00000000..9de01df2 --- /dev/null +++ 
b/src/database/migrations/0003_vector_extension.py @@ -0,0 +1,10 @@ +from django.db import migrations +from pgvector.django import VectorExtension + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0002_googleuser"), + ] + + operations = [VectorExtension()] diff --git a/src/database/migrations/0004_content_types_and_more.py b/src/database/migrations/0004_content_types_and_more.py new file mode 100644 index 00000000..ec704e1f --- /dev/null +++ b/src/database/migrations/0004_content_types_and_more.py @@ -0,0 +1,180 @@ +# Generated by Django 4.2.5 on 2023-10-11 22:24 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion +import pgvector.django +import uuid + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0003_vector_extension"), + ] + + operations = [ + migrations.CreateModel( + name="GithubConfig", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("pat_token", models.CharField(max_length=200)), + ], + options={ + "abstract": False, + }, + ), + migrations.AddField( + model_name="khojuser", + name="uuid", + field=models.UUIDField(default=1234, verbose_name=models.UUIDField(default=uuid.uuid4, editable=False)), + preserve_default=False, + ), + migrations.CreateModel( + name="NotionConfig", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("token", models.CharField(max_length=200)), + ("user", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + options={ + "abstract": False, + }, + ), + migrations.CreateModel( + name="LocalPlaintextConfig", + fields=[ 
+ ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("input_files", models.JSONField(default=list, null=True)), + ("input_filter", models.JSONField(default=list, null=True)), + ("index_heading_entries", models.BooleanField(default=False)), + ("user", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + options={ + "abstract": False, + }, + ), + migrations.CreateModel( + name="LocalPdfConfig", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("input_files", models.JSONField(default=list, null=True)), + ("input_filter", models.JSONField(default=list, null=True)), + ("index_heading_entries", models.BooleanField(default=False)), + ("user", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + options={ + "abstract": False, + }, + ), + migrations.CreateModel( + name="LocalOrgConfig", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("input_files", models.JSONField(default=list, null=True)), + ("input_filter", models.JSONField(default=list, null=True)), + ("index_heading_entries", models.BooleanField(default=False)), + ("user", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + options={ + "abstract": False, + }, + ), + migrations.CreateModel( + name="LocalMarkdownConfig", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", 
models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("input_files", models.JSONField(default=list, null=True)), + ("input_filter", models.JSONField(default=list, null=True)), + ("index_heading_entries", models.BooleanField(default=False)), + ("user", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + options={ + "abstract": False, + }, + ), + migrations.CreateModel( + name="GithubRepoConfig", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("name", models.CharField(max_length=200)), + ("owner", models.CharField(max_length=200)), + ("branch", models.CharField(max_length=200)), + ( + "github_config", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="githubrepoconfig", + to="database.githubconfig", + ), + ), + ], + options={ + "abstract": False, + }, + ), + migrations.AddField( + model_name="githubconfig", + name="user", + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + migrations.CreateModel( + name="Embeddings", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("embeddings", pgvector.django.VectorField(dimensions=384)), + ("raw", models.TextField()), + ("compiled", models.TextField()), + ("heading", models.CharField(blank=True, default=None, max_length=1000, null=True)), + ( + "file_type", + models.CharField( + choices=[ + ("image", "Image"), + ("pdf", "Pdf"), + ("plaintext", "Plaintext"), + ("markdown", "Markdown"), + ("org", "Org"), + ("notion", "Notion"), + ("github", "Github"), + ("conversation", "Conversation"), + ], 
+ default="plaintext", + max_length=30, + ), + ), + ("file_path", models.CharField(blank=True, default=None, max_length=400, null=True)), + ("file_name", models.CharField(blank=True, default=None, max_length=400, null=True)), + ("url", models.URLField(blank=True, default=None, max_length=400, null=True)), + ("hashed_value", models.CharField(max_length=100)), + ( + "user", + models.ForeignKey( + blank=True, + default=None, + null=True, + on_delete=django.db.models.deletion.CASCADE, + to=settings.AUTH_USER_MODEL, + ), + ), + ], + options={ + "abstract": False, + }, + ), + ] diff --git a/src/database/migrations/0005_embeddings_corpus_id.py b/src/database/migrations/0005_embeddings_corpus_id.py new file mode 100644 index 00000000..984953d6 --- /dev/null +++ b/src/database/migrations/0005_embeddings_corpus_id.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.5 on 2023-10-13 02:39 + +from django.db import migrations, models +import uuid + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0004_content_types_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="embeddings", + name="corpus_id", + field=models.UUIDField(default=uuid.uuid4, editable=False), + ), + ] diff --git a/src/database/migrations/0006_embeddingsdates.py b/src/database/migrations/0006_embeddingsdates.py new file mode 100644 index 00000000..9d988ed8 --- /dev/null +++ b/src/database/migrations/0006_embeddingsdates.py @@ -0,0 +1,33 @@ +# Generated by Django 4.2.5 on 2023-10-13 19:28 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0005_embeddings_corpus_id"), + ] + + operations = [ + migrations.CreateModel( + name="EmbeddingsDates", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + 
("date", models.DateField()), + ( + "embeddings", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="embeddings_dates", + to="database.embeddings", + ), + ), + ], + options={ + "indexes": [models.Index(fields=["date"], name="database_em_date_a1ba47_idx")], + }, + ), + ] diff --git a/src/database/migrations/0007_add_conversation.py b/src/database/migrations/0007_add_conversation.py new file mode 100644 index 00000000..167b6cab --- /dev/null +++ b/src/database/migrations/0007_add_conversation.py @@ -0,0 +1,27 @@ +# Generated by Django 4.2.5 on 2023-10-18 05:31 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0006_embeddingsdates"), + ] + + operations = [ + migrations.CreateModel( + name="Conversation", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("conversation_log", models.JSONField()), + ("user", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + options={ + "abstract": False, + }, + ), + ] diff --git a/src/database/migrations/0008_alter_conversation_conversation_log.py b/src/database/migrations/0008_alter_conversation_conversation_log.py new file mode 100644 index 00000000..00f37385 --- /dev/null +++ b/src/database/migrations/0008_alter_conversation_conversation_log.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2.5 on 2023-10-18 16:46 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0007_add_conversation"), + ] + + operations = [ + migrations.AlterField( + model_name="conversation", + name="conversation_log", + field=models.JSONField(default=dict), + ), + ] diff --git 
a/src/database/migrations/0009_khojapiuser.py b/src/database/migrations/0009_khojapiuser.py new file mode 100644 index 00000000..86b09ab3 --- /dev/null +++ b/src/database/migrations/0009_khojapiuser.py @@ -0,0 +1,24 @@ +# Generated by Django 4.2.5 on 2023-10-26 17:02 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0008_alter_conversation_conversation_log"), + ] + + operations = [ + migrations.CreateModel( + name="KhojApiUser", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("token", models.CharField(max_length=50, unique=True)), + ("name", models.CharField(max_length=50)), + ("accessed_at", models.DateTimeField(default=None, null=True)), + ("user", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + ), + ] diff --git a/src/database/migrations/0010_chatmodeloptions_and_more.py b/src/database/migrations/0010_chatmodeloptions_and_more.py new file mode 100644 index 00000000..9f3a491a --- /dev/null +++ b/src/database/migrations/0010_chatmodeloptions_and_more.py @@ -0,0 +1,83 @@ +# Generated by Django 4.2.4 on 2023-11-01 17:41 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0009_khojapiuser"), + ] + + operations = [ + migrations.CreateModel( + name="ChatModelOptions", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("max_prompt_size", models.IntegerField(blank=True, default=None, null=True)), + ("tokenizer", models.CharField(blank=True, default=None, max_length=200, null=True)), + 
("chat_model", models.CharField(blank=True, default=None, max_length=200, null=True)), + ( + "model_type", + models.CharField( + choices=[("openai", "Openai"), ("offline", "Offline")], default="openai", max_length=200 + ), + ), + ], + options={ + "abstract": False, + }, + ), + migrations.CreateModel( + name="OfflineChatProcessorConversationConfig", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("enabled", models.BooleanField(default=False)), + ], + options={ + "abstract": False, + }, + ), + migrations.CreateModel( + name="OpenAIProcessorConversationConfig", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("api_key", models.CharField(max_length=200)), + ], + options={ + "abstract": False, + }, + ), + migrations.CreateModel( + name="UserConversationConfig", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "setting", + models.ForeignKey( + blank=True, + default=None, + null=True, + on_delete=django.db.models.deletion.CASCADE, + to="database.chatmodeloptions", + ), + ), + ( + "user", + models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + ], + options={ + "abstract": False, + }, + ), + ] diff --git a/src/database/migrations/0010_rename_embeddings_entry_and_more.py b/src/database/migrations/0010_rename_embeddings_entry_and_more.py new file mode 100644 index 00000000..f86b2caa --- /dev/null +++ b/src/database/migrations/0010_rename_embeddings_entry_and_more.py @@ -0,0 +1,30 @@ +# 
Generated by Django 4.2.5 on 2023-10-26 23:52 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0009_khojapiuser"), + ] + + operations = [ + migrations.RenameModel( + old_name="Embeddings", + new_name="Entry", + ), + migrations.RenameModel( + old_name="EmbeddingsDates", + new_name="EntryDates", + ), + migrations.RenameField( + model_name="entrydates", + old_name="embeddings", + new_name="entry", + ), + migrations.RenameIndex( + model_name="entrydates", + new_name="database_en_date_8d823c_idx", + old_name="database_em_date_a1ba47_idx", + ), + ] diff --git a/src/database/migrations/0011_merge_20231102_0138.py b/src/database/migrations/0011_merge_20231102_0138.py new file mode 100644 index 00000000..112c76a2 --- /dev/null +++ b/src/database/migrations/0011_merge_20231102_0138.py @@ -0,0 +1,12 @@ +# Generated by Django 4.2.5 on 2023-11-02 01:38 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0010_chatmodeloptions_and_more"), + ("database", "0010_rename_embeddings_entry_and_more"), + ] + + operations = [] diff --git a/src/database/migrations/0012_entry_file_source.py b/src/database/migrations/0012_entry_file_source.py new file mode 100644 index 00000000..187136ae --- /dev/null +++ b/src/database/migrations/0012_entry_file_source.py @@ -0,0 +1,21 @@ +# Generated by Django 4.2.5 on 2023-11-07 07:24 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0011_merge_20231102_0138"), + ] + + operations = [ + migrations.AddField( + model_name="entry", + name="file_source", + field=models.CharField( + choices=[("computer", "Computer"), ("notion", "Notion"), ("github", "Github")], + default="computer", + max_length=30, + ), + ), + ] diff --git a/src/database/migrations/0013_subscription.py b/src/database/migrations/0013_subscription.py new file mode 100644 index 00000000..931cea12 
--- /dev/null +++ b/src/database/migrations/0013_subscription.py @@ -0,0 +1,37 @@ +# Generated by Django 4.2.5 on 2023-11-09 01:27 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0012_entry_file_source"), + ] + + operations = [ + migrations.CreateModel( + name="Subscription", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "type", + models.CharField( + choices=[("trial", "Trial"), ("standard", "Standard")], default="trial", max_length=20 + ), + ), + ("is_recurring", models.BooleanField(default=False)), + ("renewal_date", models.DateTimeField(default=None, null=True)), + ( + "user", + models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + ], + options={ + "abstract": False, + }, + ), + ] diff --git a/src/database/migrations/0014_alter_googleuser_picture.py b/src/database/migrations/0014_alter_googleuser_picture.py new file mode 100644 index 00000000..a3d2ce05 --- /dev/null +++ b/src/database/migrations/0014_alter_googleuser_picture.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2.5 on 2023-11-09 08:11 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0013_subscription"), + ] + + operations = [ + migrations.AlterField( + model_name="googleuser", + name="picture", + field=models.CharField(default=None, max_length=200, null=True), + ), + ] diff --git a/src/database/migrations/0015_alter_subscription_user.py b/src/database/migrations/0015_alter_subscription_user.py new file mode 100644 index 00000000..e4ba6ab0 --- /dev/null +++ b/src/database/migrations/0015_alter_subscription_user.py @@ -0,0 +1,21 @@ +# Generated by Django 4.2.5 
on 2023-11-11 05:39 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0014_alter_googleuser_picture"), + ] + + operations = [ + migrations.AlterField( + model_name="subscription", + name="user", + field=models.OneToOneField( + on_delete=django.db.models.deletion.CASCADE, related_name="subscription", to=settings.AUTH_USER_MODEL + ), + ), + ] diff --git a/src/database/migrations/0016_alter_subscription_renewal_date.py b/src/database/migrations/0016_alter_subscription_renewal_date.py new file mode 100644 index 00000000..bc7c5ada --- /dev/null +++ b/src/database/migrations/0016_alter_subscription_renewal_date.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2.5 on 2023-11-11 06:15 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0015_alter_subscription_user"), + ] + + operations = [ + migrations.AlterField( + model_name="subscription", + name="renewal_date", + field=models.DateTimeField(blank=True, default=None, null=True), + ), + ] diff --git a/src/database/migrations/0017_searchmodel.py b/src/database/migrations/0017_searchmodel.py new file mode 100644 index 00000000..f150e12b --- /dev/null +++ b/src/database/migrations/0017_searchmodel.py @@ -0,0 +1,32 @@ +# Generated by Django 4.2.5 on 2023-11-14 23:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0016_alter_subscription_renewal_date"), + ] + + operations = [ + migrations.CreateModel( + name="SearchModel", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("name", models.CharField(default="default", max_length=200)), + ("model_type", 
models.CharField(choices=[("text", "Text")], default="text", max_length=200)), + ("bi_encoder", models.CharField(default="thenlper/gte-small", max_length=200)), + ( + "cross_encoder", + models.CharField( + blank=True, default="cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=200, null=True + ), + ), + ], + options={ + "abstract": False, + }, + ), + ] diff --git a/src/database/migrations/0018_searchmodelconfig_delete_searchmodel.py b/src/database/migrations/0018_searchmodelconfig_delete_searchmodel.py new file mode 100644 index 00000000..a8100370 --- /dev/null +++ b/src/database/migrations/0018_searchmodelconfig_delete_searchmodel.py @@ -0,0 +1,30 @@ +# Generated by Django 4.2.5 on 2023-11-16 01:13 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0017_searchmodel"), + ] + + operations = [ + migrations.CreateModel( + name="SearchModelConfig", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("name", models.CharField(default="default", max_length=200)), + ("model_type", models.CharField(choices=[("text", "Text")], default="text", max_length=200)), + ("bi_encoder", models.CharField(default="thenlper/gte-small", max_length=200)), + ("cross_encoder", models.CharField(default="cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=200)), + ], + options={ + "abstract": False, + }, + ), + migrations.DeleteModel( + name="SearchModel", + ), + ] diff --git a/src/database/migrations/__init__.py b/src/database/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/database/models/__init__.py b/src/database/models/__init__.py new file mode 100644 index 00000000..92848e5c --- /dev/null +++ b/src/database/models/__init__.py @@ -0,0 +1,181 @@ +import uuid + +from django.db import models +from 
django.contrib.auth.models import AbstractUser +from pgvector.django import VectorField + + +class BaseModel(models.Model): + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + abstract = True + + +class KhojUser(AbstractUser): + uuid = models.UUIDField(models.UUIDField(default=uuid.uuid4, editable=False)) + + def save(self, *args, **kwargs): + if not self.uuid: + self.uuid = uuid.uuid4() + super().save(*args, **kwargs) + + +class GoogleUser(models.Model): + user = models.OneToOneField(KhojUser, on_delete=models.CASCADE) + sub = models.CharField(max_length=200) + azp = models.CharField(max_length=200) + email = models.CharField(max_length=200) + name = models.CharField(max_length=200) + given_name = models.CharField(max_length=200) + family_name = models.CharField(max_length=200) + picture = models.CharField(max_length=200, null=True, default=None) + locale = models.CharField(max_length=200) + + def __str__(self): + return self.name + + +class KhojApiUser(models.Model): + """User issued API tokens to authenticate Khoj clients""" + + user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) + token = models.CharField(max_length=50, unique=True) + name = models.CharField(max_length=50) + accessed_at = models.DateTimeField(null=True, default=None) + + +class Subscription(BaseModel): + class Type(models.TextChoices): + TRIAL = "trial" + STANDARD = "standard" + + user = models.OneToOneField(KhojUser, on_delete=models.CASCADE, related_name="subscription") + type = models.CharField(max_length=20, choices=Type.choices, default=Type.TRIAL) + is_recurring = models.BooleanField(default=False) + renewal_date = models.DateTimeField(null=True, default=None, blank=True) + + +class NotionConfig(BaseModel): + token = models.CharField(max_length=200) + user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) + + +class GithubConfig(BaseModel): + pat_token = models.CharField(max_length=200) + user = 
models.ForeignKey(KhojUser, on_delete=models.CASCADE) + + +class GithubRepoConfig(BaseModel): + name = models.CharField(max_length=200) + owner = models.CharField(max_length=200) + branch = models.CharField(max_length=200) + github_config = models.ForeignKey(GithubConfig, on_delete=models.CASCADE, related_name="githubrepoconfig") + + +class LocalOrgConfig(BaseModel): + input_files = models.JSONField(default=list, null=True) + input_filter = models.JSONField(default=list, null=True) + index_heading_entries = models.BooleanField(default=False) + user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) + + +class LocalMarkdownConfig(BaseModel): + input_files = models.JSONField(default=list, null=True) + input_filter = models.JSONField(default=list, null=True) + index_heading_entries = models.BooleanField(default=False) + user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) + + +class LocalPdfConfig(BaseModel): + input_files = models.JSONField(default=list, null=True) + input_filter = models.JSONField(default=list, null=True) + index_heading_entries = models.BooleanField(default=False) + user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) + + +class LocalPlaintextConfig(BaseModel): + input_files = models.JSONField(default=list, null=True) + input_filter = models.JSONField(default=list, null=True) + index_heading_entries = models.BooleanField(default=False) + user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) + + +class SearchModelConfig(BaseModel): + class ModelType(models.TextChoices): + TEXT = "text" + + name = models.CharField(max_length=200, default="default") + model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.TEXT) + bi_encoder = models.CharField(max_length=200, default="thenlper/gte-small") + cross_encoder = models.CharField(max_length=200, default="cross-encoder/ms-marco-MiniLM-L-6-v2") + + +class OpenAIProcessorConversationConfig(BaseModel): + api_key = models.CharField(max_length=200) + + 
+class OfflineChatProcessorConversationConfig(BaseModel): + enabled = models.BooleanField(default=False) + + +class ChatModelOptions(BaseModel): + class ModelType(models.TextChoices): + OPENAI = "openai" + OFFLINE = "offline" + + max_prompt_size = models.IntegerField(default=None, null=True, blank=True) + tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True) + chat_model = models.CharField(max_length=200, default=None, null=True, blank=True) + model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OPENAI) + + +class UserConversationConfig(BaseModel): + user = models.OneToOneField(KhojUser, on_delete=models.CASCADE) + setting = models.ForeignKey(ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True) + + +class Conversation(BaseModel): + user = models.ForeignKey(KhojUser, on_delete=models.CASCADE) + conversation_log = models.JSONField(default=dict) + + +class Entry(BaseModel): + class EntryType(models.TextChoices): + IMAGE = "image" + PDF = "pdf" + PLAINTEXT = "plaintext" + MARKDOWN = "markdown" + ORG = "org" + NOTION = "notion" + GITHUB = "github" + CONVERSATION = "conversation" + + class EntrySource(models.TextChoices): + COMPUTER = "computer" + NOTION = "notion" + GITHUB = "github" + + user = models.ForeignKey(KhojUser, on_delete=models.CASCADE, default=None, null=True, blank=True) + embeddings = VectorField(dimensions=384) + raw = models.TextField() + compiled = models.TextField() + heading = models.CharField(max_length=1000, default=None, null=True, blank=True) + file_source = models.CharField(max_length=30, choices=EntrySource.choices, default=EntrySource.COMPUTER) + file_type = models.CharField(max_length=30, choices=EntryType.choices, default=EntryType.PLAINTEXT) + file_path = models.CharField(max_length=400, default=None, null=True, blank=True) + file_name = models.CharField(max_length=400, default=None, null=True, blank=True) + url = models.URLField(max_length=400, 
default=None, null=True, blank=True) + hashed_value = models.CharField(max_length=100) + corpus_id = models.UUIDField(default=uuid.uuid4, editable=False) + + +class EntryDates(BaseModel): + date = models.DateField() + entry = models.ForeignKey(Entry, on_delete=models.CASCADE, related_name="embeddings_dates") + + class Meta: + indexes = [ + models.Index(fields=["date"]), + ] diff --git a/src/database/tests.py b/src/database/tests.py new file mode 100644 index 00000000..7ce503c2 --- /dev/null +++ b/src/database/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/src/interface/desktop/about.html b/src/interface/desktop/about.html new file mode 100644 index 00000000..deff03e1 --- /dev/null +++ b/src/interface/desktop/about.html @@ -0,0 +1,88 @@ + + + + + Khoj - About + + + + + + + + +
+ +

Khoj for Desktop +

+
+
+ + +
+
+ © 2023 Khoj Inc. All rights reserved. +
+ + diff --git a/src/interface/desktop/assets/icons/favicon-20x20.png b/src/interface/desktop/assets/icons/favicon-20x20.png new file mode 100644 index 00000000..1a4ee0be Binary files /dev/null and b/src/interface/desktop/assets/icons/favicon-20x20.png differ diff --git a/src/interface/desktop/assets/icons/key.svg b/src/interface/desktop/assets/icons/key.svg new file mode 100644 index 00000000..437688fb --- /dev/null +++ b/src/interface/desktop/assets/icons/key.svg @@ -0,0 +1,4 @@ + + + + diff --git a/src/interface/desktop/assets/icons/khoj-logo-sideways-500.png b/src/interface/desktop/assets/icons/khoj-logo-sideways-500.png index 56648932..765d6e33 100644 Binary files a/src/interface/desktop/assets/icons/khoj-logo-sideways-500.png and b/src/interface/desktop/assets/icons/khoj-logo-sideways-500.png differ diff --git a/src/interface/desktop/assets/icons/link.svg b/src/interface/desktop/assets/icons/link.svg index ef484368..43852d95 100644 --- a/src/interface/desktop/assets/icons/link.svg +++ b/src/interface/desktop/assets/icons/link.svg @@ -1,5 +1,4 @@ - + - - + diff --git a/src/interface/desktop/assets/khoj.css b/src/interface/desktop/assets/khoj.css index 02f2493f..b2e048c5 100644 --- a/src/interface/desktop/assets/khoj.css +++ b/src/interface/desktop/assets/khoj.css @@ -2,29 +2,44 @@ /* Can be forced with data-theme="light" */ [data-theme="light"], :root:not([data-theme="dark"]) { - --primary: #ffb300; - --primary-hover: #ffa000; + --primary: #fee285; + --primary-hover: #fcc50b; --primary-focus: rgba(255, 179, 0, 0.125); --primary-inverse: rgba(0, 0, 0, 0.75); + --background-color: #f5f4f3; + --main-text-color: #475569; + --water: #44b9da; + --leaf: #7b990a; + --flower: #d1684e; } /* Amber Dark scheme (Auto) */ /* Automatically enabled if user has Dark mode enabled */ @media only screen and (prefers-color-scheme: dark) { :root:not([data-theme]) { - --primary: #ffb300; - --primary-hover: #ffc107; + --primary: #fee285; + --primary-hover: #fcc50b; --primary-focus: 
rgba(255, 179, 0, 0.25); --primary-inverse: rgba(0, 0, 0, 0.75); + --background-color: #f5f4f3; + --main-text-color: #475569; + --water: #44b9da; + --leaf: #7b990a; + --flower: #d1684e; } } /* Amber Dark scheme (Forced) */ /* Enabled if forced with data-theme="dark" */ [data-theme="dark"] { - --primary: #ffb300; - --primary-hover: #ffc107; + --primary: #fee285; + --primary-hover: #fcc50b; --primary-focus: rgba(255, 179, 0, 0.25); --primary-inverse: rgba(0, 0, 0, 0.75); + --background-color: #f5f4f3; + --main-text-color: #475569; + --water: #44b9da; + --leaf: #7b990a; + --flower: #d1684e; } /* Amber (Common styles) */ :root { @@ -37,8 +52,10 @@ .khoj-configure { display: grid; grid-template-columns: 1fr; - padding: 0 24px; + font-family: roboto, karma, segoe ui, sans-serif; + font-weight: 300; } + .khoj-header { display: grid; grid-auto-flow: column; @@ -64,7 +81,7 @@ a.khoj-logo { } .khoj-nav a { - color: #333; + color: var(--main-text-color); text-decoration: none; font-size: small; font-weight: normal; @@ -75,8 +92,9 @@ a.khoj-logo { } .khoj-nav a:hover { background-color: var(--primary-hover); + color: var(--main-text-color); } -.khoj-nav-selected { +a.khoj-nav-selected { background-color: var(--primary); } img.khoj-logo { @@ -85,21 +103,6 @@ img.khoj-logo { justify-self: center; } -a.khoj-banner { - color: black; - text-decoration: none; -} - -p.khoj-banner { - font-size: small; - margin: 0; - padding: 10px; -} - -p#khoj-banner { - display: inline; -} - @media only screen and (max-width: 600px) { div.khoj-header { display: grid; diff --git a/src/interface/desktop/assets/three.min.js b/src/interface/desktop/assets/three.min.js new file mode 100644 index 00000000..57018496 --- /dev/null +++ b/src/interface/desktop/assets/three.min.js @@ -0,0 +1,991 @@ +// threejs.org/license +'use strict';var THREE={REVISION:"77"};"function"===typeof define&&define.amd?define("three",THREE):"undefined"!==typeof exports&&"undefined"!==typeof module&&(module.exports=THREE);void 
0===Number.EPSILON&&(Number.EPSILON=Math.pow(2,-52));void 0===Math.sign&&(Math.sign=function(a){return 0>a?-1:0>16&255)/255;this.g=(a>>8&255)/255;this.b=(a&255)/255;return this},setRGB:function(a,b,c){this.r=a;this.g=b;this.b=c;return this},setHSL:function(){function a(a,c,d){0>d&&(d+=1);1d?c:d<2/3?a+6*(c-a)*(2/3-d):a}return function(b,c,d){b=THREE.Math.euclideanModulo(b,1);c=THREE.Math.clamp(c,0,1);d=THREE.Math.clamp(d,0,1);0===c?this.r=this.g=this.b=d:(c=.5>=d?d*(1+c):d+c-d*c,d=2*d-c,this.r=a(d,c,b+1/3),this.g=a(d,c,b),this.b=a(d,c,b-1/3));return this}}(),setStyle:function(a){function b(b){void 0!==b&&1>parseFloat(b)&&console.warn("THREE.Color: Alpha component of "+a+" will be ignored.")}var c;if(c=/^((?:rgb|hsl)a?)\(\s*([^\)]*)\)/.exec(a)){var d=c[2];switch(c[1]){case "rgb":case "rgba":if(c= +/^(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*(,\s*([0-9]*\.?[0-9]+)\s*)?$/.exec(d))return this.r=Math.min(255,parseInt(c[1],10))/255,this.g=Math.min(255,parseInt(c[2],10))/255,this.b=Math.min(255,parseInt(c[3],10))/255,b(c[5]),this;if(c=/^(\d+)\%\s*,\s*(\d+)\%\s*,\s*(\d+)\%\s*(,\s*([0-9]*\.?[0-9]+)\s*)?$/.exec(d))return this.r=Math.min(100,parseInt(c[1],10))/100,this.g=Math.min(100,parseInt(c[2],10))/100,this.b=Math.min(100,parseInt(c[3],10))/100,b(c[5]),this;break;case "hsl":case "hsla":if(c=/^([0-9]*\.?[0-9]+)\s*,\s*(\d+)\%\s*,\s*(\d+)\%\s*(,\s*([0-9]*\.?[0-9]+)\s*)?$/.exec(d)){var d= +parseFloat(c[1])/360,e=parseInt(c[2],10)/100,f=parseInt(c[3],10)/100;b(c[5]);return this.setHSL(d,e,f)}}}else if(c=/^\#([A-Fa-f0-9]+)$/.exec(a)){c=c[1];d=c.length;if(3===d)return this.r=parseInt(c.charAt(0)+c.charAt(0),16)/255,this.g=parseInt(c.charAt(1)+c.charAt(1),16)/255,this.b=parseInt(c.charAt(2)+c.charAt(2),16)/255,this;if(6===d)return this.r=parseInt(c.charAt(0)+c.charAt(1),16)/255,this.g=parseInt(c.charAt(2)+c.charAt(3),16)/255,this.b=parseInt(c.charAt(4)+c.charAt(5),16)/255,this}a&&0=h?k/(e+f):k/(2-e-f);switch(e){case 
b:g=(c-d)/k+(cf&&c>b?(c=2*Math.sqrt(1+c-f-b),this._w=(k-g)/c,this._x=.25*c,this._y=(a+e)/c,this._z=(d+h)/c):f>b?(c=2*Math.sqrt(1+f-c-b),this._w=(d-h)/c,this._x=(a+e)/c,this._y= +.25*c,this._z=(g+k)/c):(c=2*Math.sqrt(1+b-c-f),this._w=(e-a)/c,this._x=(d+h)/c,this._y=(g+k)/c,this._z=.25*c);this.onChangeCallback();return this},setFromUnitVectors:function(){var a,b;return function(c,d){void 0===a&&(a=new THREE.Vector3);b=c.dot(d)+1;1E-6>b?(b=0,Math.abs(c.x)>Math.abs(c.z)?a.set(-c.y,c.x,0):a.set(0,-c.z,c.y)):a.crossVectors(c,d);this._x=a.x;this._y=a.y;this._z=a.z;this._w=b;return this.normalize()}}(),inverse:function(){return this.conjugate().normalize()},conjugate:function(){this._x*= +-1;this._y*=-1;this._z*=-1;this.onChangeCallback();return this},dot:function(a){return this._x*a._x+this._y*a._y+this._z*a._z+this._w*a._w},lengthSq:function(){return this._x*this._x+this._y*this._y+this._z*this._z+this._w*this._w},length:function(){return Math.sqrt(this._x*this._x+this._y*this._y+this._z*this._z+this._w*this._w)},normalize:function(){var a=this.length();0===a?(this._z=this._y=this._x=0,this._w=1):(a=1/a,this._x*=a,this._y*=a,this._z*=a,this._w*=a);this.onChangeCallback();return this}, +multiply:function(a,b){return void 0!==b?(console.warn("THREE.Quaternion: .multiply() now only accepts one argument. 
Use .multiplyQuaternions( a, b ) instead."),this.multiplyQuaternions(a,b)):this.multiplyQuaternions(this,a)},premultiply:function(a){return this.multiplyQuaternions(a,this)},multiplyQuaternions:function(a,b){var c=a._x,d=a._y,e=a._z,f=a._w,g=b._x,h=b._y,k=b._z,l=b._w;this._x=c*l+f*g+d*k-e*h;this._y=d*l+f*h+e*g-c*k;this._z=e*l+f*k+c*h-d*g;this._w=f*l-c*g-d*h-e*k;this.onChangeCallback(); +return this},slerp:function(a,b){if(0===b)return this;if(1===b)return this.copy(a);var c=this._x,d=this._y,e=this._z,f=this._w,g=f*a._w+c*a._x+d*a._y+e*a._z;0>g?(this._w=-a._w,this._x=-a._x,this._y=-a._y,this._z=-a._z,g=-g):this.copy(a);if(1<=g)return this._w=f,this._x=c,this._y=d,this._z=e,this;var h=Math.sqrt(1-g*g);if(.001>Math.abs(h))return this._w=.5*(f+this._w),this._x=.5*(c+this._x),this._y=.5*(d+this._y),this._z=.5*(e+this._z),this;var k=Math.atan2(h,g),g=Math.sin((1-b)*k)/h,h=Math.sin(b* +k)/h;this._w=f*g+this._w*h;this._x=c*g+this._x*h;this._y=d*g+this._y*h;this._z=e*g+this._z*h;this.onChangeCallback();return this},equals:function(a){return a._x===this._x&&a._y===this._y&&a._z===this._z&&a._w===this._w},fromArray:function(a,b){void 0===b&&(b=0);this._x=a[b];this._y=a[b+1];this._z=a[b+2];this._w=a[b+3];this.onChangeCallback();return this},toArray:function(a,b){void 0===a&&(a=[]);void 0===b&&(b=0);a[b]=this._x;a[b+1]=this._y;a[b+2]=this._z;a[b+3]=this._w;return a},onChange:function(a){this.onChangeCallback= +a;return this},onChangeCallback:function(){}}; +Object.assign(THREE.Quaternion,{slerp:function(a,b,c,d){return c.copy(a).slerp(b,d)},slerpFlat:function(a,b,c,d,e,f,g){var h=c[d+0],k=c[d+1],l=c[d+2];c=c[d+3];d=e[f+0];var n=e[f+1],p=e[f+2];e=e[f+3];if(c!==e||h!==d||k!==n||l!==p){f=1-g;var m=h*d+k*n+l*p+c*e,q=0<=m?1:-1,r=1-m*m;r>Number.EPSILON&&(r=Math.sqrt(r),m=Math.atan2(r,m*q),f=Math.sin(f*m)/r,g=Math.sin(g*m)/r);q*=g;h=h*f+d*q;k=k*f+n*q;l=l*f+p*q;c=c*f+e*q;f===1-g&&(g=1/Math.sqrt(h*h+k*k+l*l+c*c),h*=g,k*=g,l*=g,c*=g)}a[b]=h;a[b+1]=k;a[b+2]=l; 
+a[b+3]=c}});THREE.Vector2=function(a,b){this.x=a||0;this.y=b||0}; +THREE.Vector2.prototype={constructor:THREE.Vector2,get width(){return this.x},set width(a){this.x=a},get height(){return this.y},set height(a){this.y=a},set:function(a,b){this.x=a;this.y=b;return this},setScalar:function(a){this.y=this.x=a;return this},setX:function(a){this.x=a;return this},setY:function(a){this.y=a;return this},setComponent:function(a,b){switch(a){case 0:this.x=b;break;case 1:this.y=b;break;default:throw Error("index is out of range: "+a);}},getComponent:function(a){switch(a){case 0:return this.x; +case 1:return this.y;default:throw Error("index is out of range: "+a);}},clone:function(){return new this.constructor(this.x,this.y)},copy:function(a){this.x=a.x;this.y=a.y;return this},add:function(a,b){if(void 0!==b)return console.warn("THREE.Vector2: .add() now only accepts one argument. Use .addVectors( a, b ) instead."),this.addVectors(a,b);this.x+=a.x;this.y+=a.y;return this},addScalar:function(a){this.x+=a;this.y+=a;return this},addVectors:function(a,b){this.x=a.x+b.x;this.y=a.y+b.y;return this}, +addScaledVector:function(a,b){this.x+=a.x*b;this.y+=a.y*b;return this},sub:function(a,b){if(void 0!==b)return console.warn("THREE.Vector2: .sub() now only accepts one argument. 
Use .subVectors( a, b ) instead."),this.subVectors(a,b);this.x-=a.x;this.y-=a.y;return this},subScalar:function(a){this.x-=a;this.y-=a;return this},subVectors:function(a,b){this.x=a.x-b.x;this.y=a.y-b.y;return this},multiply:function(a){this.x*=a.x;this.y*=a.y;return this},multiplyScalar:function(a){isFinite(a)?(this.x*=a, +this.y*=a):this.y=this.x=0;return this},divide:function(a){this.x/=a.x;this.y/=a.y;return this},divideScalar:function(a){return this.multiplyScalar(1/a)},min:function(a){this.x=Math.min(this.x,a.x);this.y=Math.min(this.y,a.y);return this},max:function(a){this.x=Math.max(this.x,a.x);this.y=Math.max(this.y,a.y);return this},clamp:function(a,b){this.x=Math.max(a.x,Math.min(b.x,this.x));this.y=Math.max(a.y,Math.min(b.y,this.y));return this},clampScalar:function(){var a,b;return function(c,d){void 0=== +a&&(a=new THREE.Vector2,b=new THREE.Vector2);a.set(c,c);b.set(d,d);return this.clamp(a,b)}}(),clampLength:function(a,b){var c=this.length();return this.multiplyScalar(Math.max(a,Math.min(b,c))/c)},floor:function(){this.x=Math.floor(this.x);this.y=Math.floor(this.y);return this},ceil:function(){this.x=Math.ceil(this.x);this.y=Math.ceil(this.y);return this},round:function(){this.x=Math.round(this.x);this.y=Math.round(this.y);return this},roundToZero:function(){this.x=0>this.x?Math.ceil(this.x):Math.floor(this.x); +this.y=0>this.y?Math.ceil(this.y):Math.floor(this.y);return this},negate:function(){this.x=-this.x;this.y=-this.y;return this},dot:function(a){return this.x*a.x+this.y*a.y},lengthSq:function(){return this.x*this.x+this.y*this.y},length:function(){return Math.sqrt(this.x*this.x+this.y*this.y)},lengthManhattan:function(){return Math.abs(this.x)+Math.abs(this.y)},normalize:function(){return this.divideScalar(this.length())},angle:function(){var a=Math.atan2(this.y,this.x);0>a&&(a+=2*Math.PI);return a}, +distanceTo:function(a){return Math.sqrt(this.distanceToSquared(a))},distanceToSquared:function(a){var b=this.x-a.x;a=this.y-a.y;return 
b*b+a*a},setLength:function(a){return this.multiplyScalar(a/this.length())},lerp:function(a,b){this.x+=(a.x-this.x)*b;this.y+=(a.y-this.y)*b;return this},lerpVectors:function(a,b,c){return this.subVectors(b,a).multiplyScalar(c).add(a)},equals:function(a){return a.x===this.x&&a.y===this.y},fromArray:function(a,b){void 0===b&&(b=0);this.x=a[b];this.y=a[b+1];return this}, +toArray:function(a,b){void 0===a&&(a=[]);void 0===b&&(b=0);a[b]=this.x;a[b+1]=this.y;return a},fromAttribute:function(a,b,c){void 0===c&&(c=0);b=b*a.itemSize+c;this.x=a.array[b];this.y=a.array[b+1];return this},rotateAround:function(a,b){var c=Math.cos(b),d=Math.sin(b),e=this.x-a.x,f=this.y-a.y;this.x=e*c-f*d+a.x;this.y=e*d+f*c+a.y;return this}};THREE.Vector3=function(a,b,c){this.x=a||0;this.y=b||0;this.z=c||0}; +THREE.Vector3.prototype={constructor:THREE.Vector3,set:function(a,b,c){this.x=a;this.y=b;this.z=c;return this},setScalar:function(a){this.z=this.y=this.x=a;return this},setX:function(a){this.x=a;return this},setY:function(a){this.y=a;return this},setZ:function(a){this.z=a;return this},setComponent:function(a,b){switch(a){case 0:this.x=b;break;case 1:this.y=b;break;case 2:this.z=b;break;default:throw Error("index is out of range: "+a);}},getComponent:function(a){switch(a){case 0:return this.x;case 1:return this.y; +case 2:return this.z;default:throw Error("index is out of range: "+a);}},clone:function(){return new this.constructor(this.x,this.y,this.z)},copy:function(a){this.x=a.x;this.y=a.y;this.z=a.z;return this},add:function(a,b){if(void 0!==b)return console.warn("THREE.Vector3: .add() now only accepts one argument. 
Use .addVectors( a, b ) instead."),this.addVectors(a,b);this.x+=a.x;this.y+=a.y;this.z+=a.z;return this},addScalar:function(a){this.x+=a;this.y+=a;this.z+=a;return this},addVectors:function(a, +b){this.x=a.x+b.x;this.y=a.y+b.y;this.z=a.z+b.z;return this},addScaledVector:function(a,b){this.x+=a.x*b;this.y+=a.y*b;this.z+=a.z*b;return this},sub:function(a,b){if(void 0!==b)return console.warn("THREE.Vector3: .sub() now only accepts one argument. Use .subVectors( a, b ) instead."),this.subVectors(a,b);this.x-=a.x;this.y-=a.y;this.z-=a.z;return this},subScalar:function(a){this.x-=a;this.y-=a;this.z-=a;return this},subVectors:function(a,b){this.x=a.x-b.x;this.y=a.y-b.y;this.z=a.z-b.z;return this}, +multiply:function(a,b){if(void 0!==b)return console.warn("THREE.Vector3: .multiply() now only accepts one argument. Use .multiplyVectors( a, b ) instead."),this.multiplyVectors(a,b);this.x*=a.x;this.y*=a.y;this.z*=a.z;return this},multiplyScalar:function(a){isFinite(a)?(this.x*=a,this.y*=a,this.z*=a):this.z=this.y=this.x=0;return this},multiplyVectors:function(a,b){this.x=a.x*b.x;this.y=a.y*b.y;this.z=a.z*b.z;return this},applyEuler:function(){var a;return function(b){!1===b instanceof THREE.Euler&& +console.error("THREE.Vector3: .applyEuler() now expects an Euler rotation rather than a Vector3 and order.");void 0===a&&(a=new THREE.Quaternion);return this.applyQuaternion(a.setFromEuler(b))}}(),applyAxisAngle:function(){var a;return function(b,c){void 0===a&&(a=new THREE.Quaternion);return this.applyQuaternion(a.setFromAxisAngle(b,c))}}(),applyMatrix3:function(a){var b=this.x,c=this.y,d=this.z;a=a.elements;this.x=a[0]*b+a[3]*c+a[6]*d;this.y=a[1]*b+a[4]*c+a[7]*d;this.z=a[2]*b+a[5]*c+a[8]*d;return this}, +applyMatrix4:function(a){var b=this.x,c=this.y,d=this.z;a=a.elements;this.x=a[0]*b+a[4]*c+a[8]*d+a[12];this.y=a[1]*b+a[5]*c+a[9]*d+a[13];this.z=a[2]*b+a[6]*c+a[10]*d+a[14];return this},applyProjection:function(a){var b=this.x,c=this.y,d=this.z;a=a.elements;var 
e=1/(a[3]*b+a[7]*c+a[11]*d+a[15]);this.x=(a[0]*b+a[4]*c+a[8]*d+a[12])*e;this.y=(a[1]*b+a[5]*c+a[9]*d+a[13])*e;this.z=(a[2]*b+a[6]*c+a[10]*d+a[14])*e;return this},applyQuaternion:function(a){var b=this.x,c=this.y,d=this.z,e=a.x,f=a.y,g=a.z;a= +a.w;var h=a*b+f*d-g*c,k=a*c+g*b-e*d,l=a*d+e*c-f*b,b=-e*b-f*c-g*d;this.x=h*a+b*-e+k*-g-l*-f;this.y=k*a+b*-f+l*-e-h*-g;this.z=l*a+b*-g+h*-f-k*-e;return this},project:function(){var a;return function(b){void 0===a&&(a=new THREE.Matrix4);a.multiplyMatrices(b.projectionMatrix,a.getInverse(b.matrixWorld));return this.applyProjection(a)}}(),unproject:function(){var a;return function(b){void 0===a&&(a=new THREE.Matrix4);a.multiplyMatrices(b.matrixWorld,a.getInverse(b.projectionMatrix));return this.applyProjection(a)}}(), +transformDirection:function(a){var b=this.x,c=this.y,d=this.z;a=a.elements;this.x=a[0]*b+a[4]*c+a[8]*d;this.y=a[1]*b+a[5]*c+a[9]*d;this.z=a[2]*b+a[6]*c+a[10]*d;return this.normalize()},divide:function(a){this.x/=a.x;this.y/=a.y;this.z/=a.z;return this},divideScalar:function(a){return this.multiplyScalar(1/a)},min:function(a){this.x=Math.min(this.x,a.x);this.y=Math.min(this.y,a.y);this.z=Math.min(this.z,a.z);return this},max:function(a){this.x=Math.max(this.x,a.x);this.y=Math.max(this.y,a.y);this.z= +Math.max(this.z,a.z);return this},clamp:function(a,b){this.x=Math.max(a.x,Math.min(b.x,this.x));this.y=Math.max(a.y,Math.min(b.y,this.y));this.z=Math.max(a.z,Math.min(b.z,this.z));return this},clampScalar:function(){var a,b;return function(c,d){void 0===a&&(a=new THREE.Vector3,b=new THREE.Vector3);a.set(c,c,c);b.set(d,d,d);return this.clamp(a,b)}}(),clampLength:function(a,b){var c=this.length();return this.multiplyScalar(Math.max(a,Math.min(b,c))/c)},floor:function(){this.x=Math.floor(this.x);this.y= +Math.floor(this.y);this.z=Math.floor(this.z);return this},ceil:function(){this.x=Math.ceil(this.x);this.y=Math.ceil(this.y);this.z=Math.ceil(this.z);return 
this},round:function(){this.x=Math.round(this.x);this.y=Math.round(this.y);this.z=Math.round(this.z);return this},roundToZero:function(){this.x=0>this.x?Math.ceil(this.x):Math.floor(this.x);this.y=0>this.y?Math.ceil(this.y):Math.floor(this.y);this.z=0>this.z?Math.ceil(this.z):Math.floor(this.z);return this},negate:function(){this.x=-this.x;this.y= +-this.y;this.z=-this.z;return this},dot:function(a){return this.x*a.x+this.y*a.y+this.z*a.z},lengthSq:function(){return this.x*this.x+this.y*this.y+this.z*this.z},length:function(){return Math.sqrt(this.x*this.x+this.y*this.y+this.z*this.z)},lengthManhattan:function(){return Math.abs(this.x)+Math.abs(this.y)+Math.abs(this.z)},normalize:function(){return this.divideScalar(this.length())},setLength:function(a){return this.multiplyScalar(a/this.length())},lerp:function(a,b){this.x+=(a.x-this.x)*b;this.y+= +(a.y-this.y)*b;this.z+=(a.z-this.z)*b;return this},lerpVectors:function(a,b,c){return this.subVectors(b,a).multiplyScalar(c).add(a)},cross:function(a,b){if(void 0!==b)return console.warn("THREE.Vector3: .cross() now only accepts one argument. 
Use .crossVectors( a, b ) instead."),this.crossVectors(a,b);var c=this.x,d=this.y,e=this.z;this.x=d*a.z-e*a.y;this.y=e*a.x-c*a.z;this.z=c*a.y-d*a.x;return this},crossVectors:function(a,b){var c=a.x,d=a.y,e=a.z,f=b.x,g=b.y,h=b.z;this.x=d*h-e*g;this.y=e*f-c*h; +this.z=c*g-d*f;return this},projectOnVector:function(){var a,b;return function(c){void 0===a&&(a=new THREE.Vector3);a.copy(c).normalize();b=this.dot(a);return this.copy(a).multiplyScalar(b)}}(),projectOnPlane:function(){var a;return function(b){void 0===a&&(a=new THREE.Vector3);a.copy(this).projectOnVector(b);return this.sub(a)}}(),reflect:function(){var a;return function(b){void 0===a&&(a=new THREE.Vector3);return this.sub(a.copy(b).multiplyScalar(2*this.dot(b)))}}(),angleTo:function(a){a=this.dot(a)/ +Math.sqrt(this.lengthSq()*a.lengthSq());return Math.acos(THREE.Math.clamp(a,-1,1))},distanceTo:function(a){return Math.sqrt(this.distanceToSquared(a))},distanceToSquared:function(a){var b=this.x-a.x,c=this.y-a.y;a=this.z-a.z;return b*b+c*c+a*a},setFromSpherical:function(a){var b=Math.sin(a.phi)*a.radius;this.x=b*Math.sin(a.theta);this.y=Math.cos(a.phi)*a.radius;this.z=b*Math.cos(a.theta);return this},setFromMatrixPosition:function(a){return this.setFromMatrixColumn(a,3)},setFromMatrixScale:function(a){var b= +this.setFromMatrixColumn(a,0).length(),c=this.setFromMatrixColumn(a,1).length();a=this.setFromMatrixColumn(a,2).length();this.x=b;this.y=c;this.z=a;return this},setFromMatrixColumn:function(a,b){if("number"===typeof a){console.warn("THREE.Vector3: setFromMatrixColumn now expects ( matrix, index ).");var c=a;a=b;b=c}return this.fromArray(a.elements,4*b)},equals:function(a){return a.x===this.x&&a.y===this.y&&a.z===this.z},fromArray:function(a,b){void 0===b&&(b=0);this.x=a[b];this.y=a[b+1];this.z=a[b+ +2];return this},toArray:function(a,b){void 0===a&&(a=[]);void 0===b&&(b=0);a[b]=this.x;a[b+1]=this.y;a[b+2]=this.z;return a},fromAttribute:function(a,b,c){void 
0===c&&(c=0);b=b*a.itemSize+c;this.x=a.array[b];this.y=a.array[b+1];this.z=a.array[b+2];return this}};THREE.Vector4=function(a,b,c,d){this.x=a||0;this.y=b||0;this.z=c||0;this.w=void 0!==d?d:1}; +THREE.Vector4.prototype={constructor:THREE.Vector4,set:function(a,b,c,d){this.x=a;this.y=b;this.z=c;this.w=d;return this},setScalar:function(a){this.w=this.z=this.y=this.x=a;return this},setX:function(a){this.x=a;return this},setY:function(a){this.y=a;return this},setZ:function(a){this.z=a;return this},setW:function(a){this.w=a;return this},setComponent:function(a,b){switch(a){case 0:this.x=b;break;case 1:this.y=b;break;case 2:this.z=b;break;case 3:this.w=b;break;default:throw Error("index is out of range: "+ +a);}},getComponent:function(a){switch(a){case 0:return this.x;case 1:return this.y;case 2:return this.z;case 3:return this.w;default:throw Error("index is out of range: "+a);}},clone:function(){return new this.constructor(this.x,this.y,this.z,this.w)},copy:function(a){this.x=a.x;this.y=a.y;this.z=a.z;this.w=void 0!==a.w?a.w:1;return this},add:function(a,b){if(void 0!==b)return console.warn("THREE.Vector4: .add() now only accepts one argument. Use .addVectors( a, b ) instead."),this.addVectors(a,b); +this.x+=a.x;this.y+=a.y;this.z+=a.z;this.w+=a.w;return this},addScalar:function(a){this.x+=a;this.y+=a;this.z+=a;this.w+=a;return this},addVectors:function(a,b){this.x=a.x+b.x;this.y=a.y+b.y;this.z=a.z+b.z;this.w=a.w+b.w;return this},addScaledVector:function(a,b){this.x+=a.x*b;this.y+=a.y*b;this.z+=a.z*b;this.w+=a.w*b;return this},sub:function(a,b){if(void 0!==b)return console.warn("THREE.Vector4: .sub() now only accepts one argument. 
Use .subVectors( a, b ) instead."),this.subVectors(a,b);this.x-= +a.x;this.y-=a.y;this.z-=a.z;this.w-=a.w;return this},subScalar:function(a){this.x-=a;this.y-=a;this.z-=a;this.w-=a;return this},subVectors:function(a,b){this.x=a.x-b.x;this.y=a.y-b.y;this.z=a.z-b.z;this.w=a.w-b.w;return this},multiplyScalar:function(a){isFinite(a)?(this.x*=a,this.y*=a,this.z*=a,this.w*=a):this.w=this.z=this.y=this.x=0;return this},applyMatrix4:function(a){var b=this.x,c=this.y,d=this.z,e=this.w;a=a.elements;this.x=a[0]*b+a[4]*c+a[8]*d+a[12]*e;this.y=a[1]*b+a[5]*c+a[9]*d+a[13]*e;this.z= +a[2]*b+a[6]*c+a[10]*d+a[14]*e;this.w=a[3]*b+a[7]*c+a[11]*d+a[15]*e;return this},divideScalar:function(a){return this.multiplyScalar(1/a)},setAxisAngleFromQuaternion:function(a){this.w=2*Math.acos(a.w);var b=Math.sqrt(1-a.w*a.w);1E-4>b?(this.x=1,this.z=this.y=0):(this.x=a.x/b,this.y=a.y/b,this.z=a.z/b);return this},setAxisAngleFromRotationMatrix:function(a){var b,c,d;a=a.elements;var e=a[0];d=a[4];var f=a[8],g=a[1],h=a[5],k=a[9];c=a[2];b=a[6];var l=a[10];if(.01>Math.abs(d-g)&&.01>Math.abs(f-c)&&.01> +Math.abs(k-b)){if(.1>Math.abs(d+g)&&.1>Math.abs(f+c)&&.1>Math.abs(k+b)&&.1>Math.abs(e+h+l-3))return this.set(1,0,0,0),this;a=Math.PI;e=(e+1)/2;h=(h+1)/2;l=(l+1)/2;d=(d+g)/4;f=(f+c)/4;k=(k+b)/4;e>h&&e>l?.01>e?(b=0,d=c=.707106781):(b=Math.sqrt(e),c=d/b,d=f/b):h>l?.01>h?(b=.707106781,c=0,d=.707106781):(c=Math.sqrt(h),b=d/c,d=k/c):.01>l?(c=b=.707106781,d=0):(d=Math.sqrt(l),b=f/d,c=k/d);this.set(b,c,d,a);return this}a=Math.sqrt((b-k)*(b-k)+(f-c)*(f-c)+(g-d)*(g-d));.001>Math.abs(a)&&(a=1);this.x=(b-k)/ +a;this.y=(f-c)/a;this.z=(g-d)/a;this.w=Math.acos((e+h+l-1)/2);return this},min:function(a){this.x=Math.min(this.x,a.x);this.y=Math.min(this.y,a.y);this.z=Math.min(this.z,a.z);this.w=Math.min(this.w,a.w);return this},max:function(a){this.x=Math.max(this.x,a.x);this.y=Math.max(this.y,a.y);this.z=Math.max(this.z,a.z);this.w=Math.max(this.w,a.w);return 
this},clamp:function(a,b){this.x=Math.max(a.x,Math.min(b.x,this.x));this.y=Math.max(a.y,Math.min(b.y,this.y));this.z=Math.max(a.z,Math.min(b.z,this.z)); +this.w=Math.max(a.w,Math.min(b.w,this.w));return this},clampScalar:function(){var a,b;return function(c,d){void 0===a&&(a=new THREE.Vector4,b=new THREE.Vector4);a.set(c,c,c,c);b.set(d,d,d,d);return this.clamp(a,b)}}(),floor:function(){this.x=Math.floor(this.x);this.y=Math.floor(this.y);this.z=Math.floor(this.z);this.w=Math.floor(this.w);return this},ceil:function(){this.x=Math.ceil(this.x);this.y=Math.ceil(this.y);this.z=Math.ceil(this.z);this.w=Math.ceil(this.w);return this},round:function(){this.x= +Math.round(this.x);this.y=Math.round(this.y);this.z=Math.round(this.z);this.w=Math.round(this.w);return this},roundToZero:function(){this.x=0>this.x?Math.ceil(this.x):Math.floor(this.x);this.y=0>this.y?Math.ceil(this.y):Math.floor(this.y);this.z=0>this.z?Math.ceil(this.z):Math.floor(this.z);this.w=0>this.w?Math.ceil(this.w):Math.floor(this.w);return this},negate:function(){this.x=-this.x;this.y=-this.y;this.z=-this.z;this.w=-this.w;return this},dot:function(a){return this.x*a.x+this.y*a.y+this.z* +a.z+this.w*a.w},lengthSq:function(){return this.x*this.x+this.y*this.y+this.z*this.z+this.w*this.w},length:function(){return Math.sqrt(this.x*this.x+this.y*this.y+this.z*this.z+this.w*this.w)},lengthManhattan:function(){return Math.abs(this.x)+Math.abs(this.y)+Math.abs(this.z)+Math.abs(this.w)},normalize:function(){return this.divideScalar(this.length())},setLength:function(a){return this.multiplyScalar(a/this.length())},lerp:function(a,b){this.x+=(a.x-this.x)*b;this.y+=(a.y-this.y)*b;this.z+=(a.z- +this.z)*b;this.w+=(a.w-this.w)*b;return this},lerpVectors:function(a,b,c){return this.subVectors(b,a).multiplyScalar(c).add(a)},equals:function(a){return a.x===this.x&&a.y===this.y&&a.z===this.z&&a.w===this.w},fromArray:function(a,b){void 0===b&&(b=0);this.x=a[b];this.y=a[b+1];this.z=a[b+2];this.w=a[b+3];return 
this},toArray:function(a,b){void 0===a&&(a=[]);void 0===b&&(b=0);a[b]=this.x;a[b+1]=this.y;a[b+2]=this.z;a[b+3]=this.w;return a},fromAttribute:function(a,b,c){void 0===c&&(c=0);b=b*a.itemSize+ +c;this.x=a.array[b];this.y=a.array[b+1];this.z=a.array[b+2];this.w=a.array[b+3];return this}};THREE.Euler=function(a,b,c,d){this._x=a||0;this._y=b||0;this._z=c||0;this._order=d||THREE.Euler.DefaultOrder};THREE.Euler.RotationOrders="XYZ YZX ZXY XZY YXZ ZYX".split(" ");THREE.Euler.DefaultOrder="XYZ"; +THREE.Euler.prototype={constructor:THREE.Euler,get x(){return this._x},set x(a){this._x=a;this.onChangeCallback()},get y(){return this._y},set y(a){this._y=a;this.onChangeCallback()},get z(){return this._z},set z(a){this._z=a;this.onChangeCallback()},get order(){return this._order},set order(a){this._order=a;this.onChangeCallback()},set:function(a,b,c,d){this._x=a;this._y=b;this._z=c;this._order=d||this._order;this.onChangeCallback();return this},clone:function(){return new this.constructor(this._x, +this._y,this._z,this._order)},copy:function(a){this._x=a._x;this._y=a._y;this._z=a._z;this._order=a._order;this.onChangeCallback();return this},setFromRotationMatrix:function(a,b,c){var d=THREE.Math.clamp,e=a.elements;a=e[0];var f=e[4],g=e[8],h=e[1],k=e[5],l=e[9],n=e[2],p=e[6],e=e[10];b=b||this._order;"XYZ"===b?(this._y=Math.asin(d(g,-1,1)),.99999>Math.abs(g)?(this._x=Math.atan2(-l,e),this._z=Math.atan2(-f,a)):(this._x=Math.atan2(p,k),this._z=0)):"YXZ"===b?(this._x=Math.asin(-d(l,-1,1)),.99999>Math.abs(l)? 
+(this._y=Math.atan2(g,e),this._z=Math.atan2(h,k)):(this._y=Math.atan2(-n,a),this._z=0)):"ZXY"===b?(this._x=Math.asin(d(p,-1,1)),.99999>Math.abs(p)?(this._y=Math.atan2(-n,e),this._z=Math.atan2(-f,k)):(this._y=0,this._z=Math.atan2(h,a))):"ZYX"===b?(this._y=Math.asin(-d(n,-1,1)),.99999>Math.abs(n)?(this._x=Math.atan2(p,e),this._z=Math.atan2(h,a)):(this._x=0,this._z=Math.atan2(-f,k))):"YZX"===b?(this._z=Math.asin(d(h,-1,1)),.99999>Math.abs(h)?(this._x=Math.atan2(-l,k),this._y=Math.atan2(-n,a)):(this._x= +0,this._y=Math.atan2(g,e))):"XZY"===b?(this._z=Math.asin(-d(f,-1,1)),.99999>Math.abs(f)?(this._x=Math.atan2(p,k),this._y=Math.atan2(g,a)):(this._x=Math.atan2(-l,e),this._y=0)):console.warn("THREE.Euler: .setFromRotationMatrix() given unsupported order: "+b);this._order=b;if(!1!==c)this.onChangeCallback();return this},setFromQuaternion:function(){var a;return function(b,c,d){void 0===a&&(a=new THREE.Matrix4);a.makeRotationFromQuaternion(b);return this.setFromRotationMatrix(a,c,d)}}(),setFromVector3:function(a, +b){return this.set(a.x,a.y,a.z,b||this._order)},reorder:function(){var a=new THREE.Quaternion;return function(b){a.setFromEuler(this);return this.setFromQuaternion(a,b)}}(),equals:function(a){return a._x===this._x&&a._y===this._y&&a._z===this._z&&a._order===this._order},fromArray:function(a){this._x=a[0];this._y=a[1];this._z=a[2];void 0!==a[3]&&(this._order=a[3]);this.onChangeCallback();return this},toArray:function(a,b){void 0===a&&(a=[]);void 0===b&&(b=0);a[b]=this._x;a[b+1]=this._y;a[b+2]=this._z; +a[b+3]=this._order;return a},toVector3:function(a){return a?a.set(this._x,this._y,this._z):new THREE.Vector3(this._x,this._y,this._z)},onChange:function(a){this.onChangeCallback=a;return this},onChangeCallback:function(){}};THREE.Line3=function(a,b){this.start=void 0!==a?a:new THREE.Vector3;this.end=void 0!==b?b:new THREE.Vector3}; +THREE.Line3.prototype={constructor:THREE.Line3,set:function(a,b){this.start.copy(a);this.end.copy(b);return 
this},clone:function(){return(new this.constructor).copy(this)},copy:function(a){this.start.copy(a.start);this.end.copy(a.end);return this},center:function(a){return(a||new THREE.Vector3).addVectors(this.start,this.end).multiplyScalar(.5)},delta:function(a){return(a||new THREE.Vector3).subVectors(this.end,this.start)},distanceSq:function(){return this.start.distanceToSquared(this.end)},distance:function(){return this.start.distanceTo(this.end)}, +at:function(a,b){var c=b||new THREE.Vector3;return this.delta(c).multiplyScalar(a).add(this.start)},closestPointToPointParameter:function(){var a=new THREE.Vector3,b=new THREE.Vector3;return function(c,d){a.subVectors(c,this.start);b.subVectors(this.end,this.start);var e=b.dot(b),e=b.dot(a)/e;d&&(e=THREE.Math.clamp(e,0,1));return e}}(),closestPointToPoint:function(a,b,c){a=this.closestPointToPointParameter(a,b);c=c||new THREE.Vector3;return this.delta(c).multiplyScalar(a).add(this.start)},applyMatrix4:function(a){this.start.applyMatrix4(a); +this.end.applyMatrix4(a);return this},equals:function(a){return a.start.equals(this.start)&&a.end.equals(this.end)}};THREE.Box2=function(a,b){this.min=void 0!==a?a:new THREE.Vector2(Infinity,Infinity);this.max=void 0!==b?b:new THREE.Vector2(-Infinity,-Infinity)}; +THREE.Box2.prototype={constructor:THREE.Box2,set:function(a,b){this.min.copy(a);this.max.copy(b);return this},setFromPoints:function(a){this.makeEmpty();for(var b=0,c=a.length;bthis.max.x||a.ythis.max.y?!1:!0},containsBox:function(a){return this.min.x<=a.min.x&&a.max.x<=this.max.x&&this.min.y<=a.min.y&&a.max.y<=this.max.y?!0:!1},getParameter:function(a,b){return(b||new THREE.Vector2).set((a.x-this.min.x)/(this.max.x-this.min.x),(a.y-this.min.y)/(this.max.y-this.min.y))},intersectsBox:function(a){return a.max.xthis.max.x||a.max.y +this.max.y?!1:!0},clampPoint:function(a,b){return(b||new THREE.Vector2).copy(a).clamp(this.min,this.max)},distanceToPoint:function(){var a=new THREE.Vector2;return function(b){return 
a.copy(b).clamp(this.min,this.max).sub(b).length()}}(),intersect:function(a){this.min.max(a.min);this.max.min(a.max);return this},union:function(a){this.min.min(a.min);this.max.max(a.max);return this},translate:function(a){this.min.add(a);this.max.add(a);return this},equals:function(a){return a.min.equals(this.min)&& +a.max.equals(this.max)}};THREE.Box3=function(a,b){this.min=void 0!==a?a:new THREE.Vector3(Infinity,Infinity,Infinity);this.max=void 0!==b?b:new THREE.Vector3(-Infinity,-Infinity,-Infinity)}; +THREE.Box3.prototype={constructor:THREE.Box3,set:function(a,b){this.min.copy(a);this.max.copy(b);return this},setFromArray:function(a){for(var b=Infinity,c=Infinity,d=Infinity,e=-Infinity,f=-Infinity,g=-Infinity,h=0,k=a.length;he&&(e=l);n>f&&(f=n);p>g&&(g=p)}this.min.set(b,c,d);this.max.set(e,f,g)},setFromPoints:function(a){this.makeEmpty();for(var b=0,c=a.length;bthis.max.x||a.ythis.max.y||a.z< +this.min.z||a.z>this.max.z?!1:!0},containsBox:function(a){return this.min.x<=a.min.x&&a.max.x<=this.max.x&&this.min.y<=a.min.y&&a.max.y<=this.max.y&&this.min.z<=a.min.z&&a.max.z<=this.max.z?!0:!1},getParameter:function(a,b){return(b||new THREE.Vector3).set((a.x-this.min.x)/(this.max.x-this.min.x),(a.y-this.min.y)/(this.max.y-this.min.y),(a.z-this.min.z)/(this.max.z-this.min.z))},intersectsBox:function(a){return a.max.xthis.max.x||a.max.ythis.max.y||a.max.z< +this.min.z||a.min.z>this.max.z?!1:!0},intersectsSphere:function(){var a;return function(b){void 0===a&&(a=new THREE.Vector3);this.clampPoint(b.center,a);return a.distanceToSquared(b.center)<=b.radius*b.radius}}(),intersectsPlane:function(a){var b,c;0=a.constant},clampPoint:function(a,b){return(b||new THREE.Vector3).copy(a).clamp(this.min,this.max)},distanceToPoint:function(){var a=new THREE.Vector3;return function(b){return a.copy(b).clamp(this.min,this.max).sub(b).length()}}(),getBoundingSphere:function(){var a=new THREE.Vector3;return function(b){b=b||new 
THREE.Sphere;b.center=this.center();b.radius=.5*this.size(a).length();return b}}(), +intersect:function(a){this.min.max(a.min);this.max.min(a.max);this.isEmpty()&&this.makeEmpty();return this},union:function(a){this.min.min(a.min);this.max.max(a.max);return this},applyMatrix4:function(){var a=[new THREE.Vector3,new THREE.Vector3,new THREE.Vector3,new THREE.Vector3,new THREE.Vector3,new THREE.Vector3,new THREE.Vector3,new THREE.Vector3];return function(b){if(this.isEmpty())return this;a[0].set(this.min.x,this.min.y,this.min.z).applyMatrix4(b);a[1].set(this.min.x,this.min.y,this.max.z).applyMatrix4(b); +a[2].set(this.min.x,this.max.y,this.min.z).applyMatrix4(b);a[3].set(this.min.x,this.max.y,this.max.z).applyMatrix4(b);a[4].set(this.max.x,this.min.y,this.min.z).applyMatrix4(b);a[5].set(this.max.x,this.min.y,this.max.z).applyMatrix4(b);a[6].set(this.max.x,this.max.y,this.min.z).applyMatrix4(b);a[7].set(this.max.x,this.max.y,this.max.z).applyMatrix4(b);this.setFromPoints(a);return this}}(),translate:function(a){this.min.add(a);this.max.add(a);return this},equals:function(a){return a.min.equals(this.min)&& +a.max.equals(this.max)}};THREE.Matrix3=function(){this.elements=new Float32Array([1,0,0,0,1,0,0,0,1]);0this.determinant()&&(g=-g);c.x=f[12];c.y=f[13];c.z=f[14];b.elements.set(this.elements);c=1/g;var f=1/h,l=1/k;b.elements[0]*=c;b.elements[1]*=c; +b.elements[2]*=c;b.elements[4]*=f;b.elements[5]*=f;b.elements[6]*=f;b.elements[8]*=l;b.elements[9]*=l;b.elements[10]*=l;d.setFromRotationMatrix(b);e.x=g;e.y=h;e.z=k;return this}}(),makeFrustum:function(a,b,c,d,e,f){var g=this.elements;g[0]=2*e/(b-a);g[4]=0;g[8]=(b+a)/(b-a);g[12]=0;g[1]=0;g[5]=2*e/(d-c);g[9]=(d+c)/(d-c);g[13]=0;g[2]=0;g[6]=0;g[10]=-(f+e)/(f-e);g[14]=-2*f*e/(f-e);g[3]=0;g[7]=0;g[11]=-1;g[15]=0;return this},makePerspective:function(a,b,c,d){a=c*Math.tan(THREE.Math.DEG2RAD*a*.5);var e= +-a;return this.makeFrustum(e*b,a*b,e,a,c,d)},makeOrthographic:function(a,b,c,d,e,f){var 
g=this.elements,h=1/(b-a),k=1/(c-d),l=1/(f-e);g[0]=2*h;g[4]=0;g[8]=0;g[12]=-((b+a)*h);g[1]=0;g[5]=2*k;g[9]=0;g[13]=-((c+d)*k);g[2]=0;g[6]=0;g[10]=-2*l;g[14]=-((f+e)*l);g[3]=0;g[7]=0;g[11]=0;g[15]=1;return this},equals:function(a){var b=this.elements;a=a.elements;for(var c=0;16>c;c++)if(b[c]!==a[c])return!1;return!0},fromArray:function(a){this.elements.set(a);return this},toArray:function(a,b){void 0===a&&(a=[]); +void 0===b&&(b=0);var c=this.elements;a[b]=c[0];a[b+1]=c[1];a[b+2]=c[2];a[b+3]=c[3];a[b+4]=c[4];a[b+5]=c[5];a[b+6]=c[6];a[b+7]=c[7];a[b+8]=c[8];a[b+9]=c[9];a[b+10]=c[10];a[b+11]=c[11];a[b+12]=c[12];a[b+13]=c[13];a[b+14]=c[14];a[b+15]=c[15];return a}};THREE.Ray=function(a,b){this.origin=void 0!==a?a:new THREE.Vector3;this.direction=void 0!==b?b:new THREE.Vector3}; +THREE.Ray.prototype={constructor:THREE.Ray,set:function(a,b){this.origin.copy(a);this.direction.copy(b);return this},clone:function(){return(new this.constructor).copy(this)},copy:function(a){this.origin.copy(a.origin);this.direction.copy(a.direction);return this},at:function(a,b){return(b||new THREE.Vector3).copy(this.direction).multiplyScalar(a).add(this.origin)},lookAt:function(a){this.direction.copy(a).sub(this.origin).normalize();return this},recast:function(){var a=new THREE.Vector3;return function(b){this.origin.copy(this.at(b, +a));return this}}(),closestPointToPoint:function(a,b){var c=b||new THREE.Vector3;c.subVectors(a,this.origin);var d=c.dot(this.direction);return 0>d?c.copy(this.origin):c.copy(this.direction).multiplyScalar(d).add(this.origin)},distanceToPoint:function(a){return Math.sqrt(this.distanceSqToPoint(a))},distanceSqToPoint:function(){var a=new THREE.Vector3;return function(b){var c=a.subVectors(b,this.origin).dot(this.direction);if(0>c)return this.origin.distanceToSquared(b);a.copy(this.direction).multiplyScalar(c).add(this.origin); +return a.distanceToSquared(b)}}(),distanceSqToSegment:function(){var a=new THREE.Vector3,b=new THREE.Vector3,c=new THREE.Vector3;return 
function(d,e,f,g){a.copy(d).add(e).multiplyScalar(.5);b.copy(e).sub(d).normalize();c.copy(this.origin).sub(a);var h=.5*d.distanceTo(e),k=-this.direction.dot(b),l=c.dot(this.direction),n=-c.dot(b),p=c.lengthSq(),m=Math.abs(1-k*k),q;0=-q?e<=q?(h=1/m,d*=h,e*=h,k=d*(d+k*e+2*l)+e*(k*d+e+2*n)+p):(e=h,d=Math.max(0,-(k*e+l)),k=-d*d+e*(e+2* +n)+p):(e=-h,d=Math.max(0,-(k*e+l)),k=-d*d+e*(e+2*n)+p):e<=-q?(d=Math.max(0,-(-k*h+l)),e=0f)return null;f=Math.sqrt(f-e);e=d-f;d+=f;return 0>e&&0>d?null:0>e?this.at(d,c):this.at(e,c)}}(),intersectsSphere:function(a){return this.distanceToPoint(a.center)<=a.radius},distanceToPlane:function(a){var b=a.normal.dot(this.direction);if(0===b)return 0===a.distanceToPoint(this.origin)?0:null;a=-(this.origin.dot(a.normal)+a.constant)/b;return 0<=a?a:null},intersectPlane:function(a,b){var c= +this.distanceToPlane(a);return null===c?null:this.at(c,b)},intersectsPlane:function(a){var b=a.distanceToPoint(this.origin);return 0===b||0>a.normal.dot(this.direction)*b?!0:!1},intersectBox:function(a,b){var c,d,e,f,g;d=1/this.direction.x;f=1/this.direction.y;g=1/this.direction.z;var h=this.origin;0<=d?(c=(a.min.x-h.x)*d,d*=a.max.x-h.x):(c=(a.max.x-h.x)*d,d*=a.min.x-h.x);0<=f?(e=(a.min.y-h.y)*f,f*=a.max.y-h.y):(e=(a.max.y-h.y)*f,f*=a.min.y-h.y);if(c>f||e>d)return null;if(e>c||c!==c)c=e;if(fg||e>d)return null;if(e>c||c!==c)c=e;if(gd?null:this.at(0<=c?c:d,b)},intersectsBox:function(){var a=new THREE.Vector3;return function(b){return null!==this.intersectBox(b,a)}}(),intersectTriangle:function(){var a=new THREE.Vector3,b=new THREE.Vector3,c=new THREE.Vector3,d=new THREE.Vector3;return function(e,f,g,h,k){b.subVectors(f,e);c.subVectors(g,e);d.crossVectors(b,c);f=this.direction.dot(d); +if(0f)h=-1,f=-f;else return null;a.subVectors(this.origin,e);e=h*this.direction.dot(c.crossVectors(a,c));if(0>e)return null;g=h*this.direction.dot(b.cross(a));if(0>g||e+g>f)return null;e=-h*a.dot(d);return 
0>e?null:this.at(e/f,k)}}(),applyMatrix4:function(a){this.direction.add(this.origin).applyMatrix4(a);this.origin.applyMatrix4(a);this.direction.sub(this.origin);this.direction.normalize();return this},equals:function(a){return a.origin.equals(this.origin)&&a.direction.equals(this.direction)}}; +THREE.Sphere=function(a,b){this.center=void 0!==a?a:new THREE.Vector3;this.radius=void 0!==b?b:0}; +THREE.Sphere.prototype={constructor:THREE.Sphere,set:function(a,b){this.center.copy(a);this.radius=b;return this},setFromPoints:function(){var a=new THREE.Box3;return function(b,c){var d=this.center;void 0!==c?d.copy(c):a.setFromPoints(b).center(d);for(var e=0,f=0,g=b.length;f=this.radius},containsPoint:function(a){return a.distanceToSquared(this.center)<=this.radius*this.radius},distanceToPoint:function(a){return a.distanceTo(this.center)-this.radius},intersectsSphere:function(a){var b=this.radius+a.radius;return a.center.distanceToSquared(this.center)<=b*b},intersectsBox:function(a){return a.intersectsSphere(this)},intersectsPlane:function(a){return Math.abs(this.center.dot(a.normal)-a.constant)<=this.radius},clampPoint:function(a,b){var c= +this.center.distanceToSquared(a),d=b||new THREE.Vector3;d.copy(a);c>this.radius*this.radius&&(d.sub(this.center).normalize(),d.multiplyScalar(this.radius).add(this.center));return d},getBoundingBox:function(a){a=a||new THREE.Box3;a.set(this.center,this.center);a.expandByScalar(this.radius);return a},applyMatrix4:function(a){this.center.applyMatrix4(a);this.radius*=a.getMaxScaleOnAxis();return this},translate:function(a){this.center.add(a);return this},equals:function(a){return a.center.equals(this.center)&& +a.radius===this.radius}};THREE.Frustum=function(a,b,c,d,e,f){this.planes=[void 0!==a?a:new THREE.Plane,void 0!==b?b:new THREE.Plane,void 0!==c?c:new THREE.Plane,void 0!==d?d:new THREE.Plane,void 0!==e?e:new THREE.Plane,void 0!==f?f:new THREE.Plane]}; +THREE.Frustum.prototype={constructor:THREE.Frustum,set:function(a,b,c,d,e,f){var 
g=this.planes;g[0].copy(a);g[1].copy(b);g[2].copy(c);g[3].copy(d);g[4].copy(e);g[5].copy(f);return this},clone:function(){return(new this.constructor).copy(this)},copy:function(a){for(var b=this.planes,c=0;6>c;c++)b[c].copy(a.planes[c]);return this},setFromMatrix:function(a){var b=this.planes,c=a.elements;a=c[0];var d=c[1],e=c[2],f=c[3],g=c[4],h=c[5],k=c[6],l=c[7],n=c[8],p=c[9],m=c[10],q=c[11],r=c[12],s=c[13],u=c[14], +c=c[15];b[0].setComponents(f-a,l-g,q-n,c-r).normalize();b[1].setComponents(f+a,l+g,q+n,c+r).normalize();b[2].setComponents(f+d,l+h,q+p,c+s).normalize();b[3].setComponents(f-d,l-h,q-p,c-s).normalize();b[4].setComponents(f-e,l-k,q-m,c-u).normalize();b[5].setComponents(f+e,l+k,q+m,c+u).normalize();return this},intersectsObject:function(){var a=new THREE.Sphere;return function(b){var c=b.geometry;null===c.boundingSphere&&c.computeBoundingSphere();a.copy(c.boundingSphere).applyMatrix4(b.matrixWorld);return this.intersectsSphere(a)}}(), +intersectsSprite:function(){var a=new THREE.Sphere;return function(b){a.center.set(0,0,0);a.radius=.7071067811865476;a.applyMatrix4(b.matrixWorld);return this.intersectsSphere(a)}}(),intersectsSphere:function(a){var b=this.planes,c=a.center;a=-a.radius;for(var d=0;6>d;d++)if(b[d].distanceToPoint(c)e;e++){var f=d[e];a.x=0g&&0>f)return!1}return!0}}(),containsPoint:function(a){for(var b=this.planes,c=0;6>c;c++)if(0>b[c].distanceToPoint(a))return!1;return!0}};THREE.Plane=function(a,b){this.normal=void 0!==a?a:new THREE.Vector3(1,0,0);this.constant=void 0!==b?b:0}; +THREE.Plane.prototype={constructor:THREE.Plane,set:function(a,b){this.normal.copy(a);this.constant=b;return this},setComponents:function(a,b,c,d){this.normal.set(a,b,c);this.constant=d;return this},setFromNormalAndCoplanarPoint:function(a,b){this.normal.copy(a);this.constant=-b.dot(this.normal);return this},setFromCoplanarPoints:function(){var a=new THREE.Vector3,b=new THREE.Vector3;return 
function(c,d,e){d=a.subVectors(e,d).cross(b.subVectors(c,d)).normalize();this.setFromNormalAndCoplanarPoint(d, +c);return this}}(),clone:function(){return(new this.constructor).copy(this)},copy:function(a){this.normal.copy(a.normal);this.constant=a.constant;return this},normalize:function(){var a=1/this.normal.length();this.normal.multiplyScalar(a);this.constant*=a;return this},negate:function(){this.constant*=-1;this.normal.negate();return this},distanceToPoint:function(a){return this.normal.dot(a)+this.constant},distanceToSphere:function(a){return this.distanceToPoint(a.center)-a.radius},projectPoint:function(a, +b){return this.orthoPoint(a,b).sub(a).negate()},orthoPoint:function(a,b){var c=this.distanceToPoint(a);return(b||new THREE.Vector3).copy(this.normal).multiplyScalar(c)},intersectLine:function(){var a=new THREE.Vector3;return function(b,c){var d=c||new THREE.Vector3,e=b.delta(a),f=this.normal.dot(e);if(0===f){if(0===this.distanceToPoint(b.start))return d.copy(b.start)}else return f=-(b.start.dot(this.normal)+this.constant)/f,0>f||1b&&0a&&0e;e++)8===e||13===e||18===e||23===e?b[e]="-":14===e?b[e]="4":(2>=c&&(c=33554432+16777216*Math.random()|0),d=c&15,c>>=4,b[e]=a[19===e?d&3|8:d]);return b.join("")}}(),clamp:function(a,b,c){return Math.max(b,Math.min(c,a))},euclideanModulo:function(a,b){return(a%b+b)%b},mapLinear:function(a,b,c, +d,e){return d+(a-b)*(e-d)/(c-b)},smoothstep:function(a,b,c){if(a<=b)return 0;if(a>=c)return 1;a=(a-b)/(c-b);return a*a*(3-2*a)},smootherstep:function(a,b,c){if(a<=b)return 0;if(a>=c)return 1;a=(a-b)/(c-b);return a*a*a*(a*(6*a-15)+10)},random16:function(){console.warn("THREE.Math.random16() has been deprecated. 
Use Math.random() instead.");return Math.random()},randInt:function(a,b){return a+Math.floor(Math.random()*(b-a+1))},randFloat:function(a,b){return a+Math.random()*(b-a)},randFloatSpread:function(a){return a* +(.5-Math.random())},degToRad:function(a){return a*THREE.Math.DEG2RAD},radToDeg:function(a){return a*THREE.Math.RAD2DEG},isPowerOfTwo:function(a){return 0===(a&a-1)&&0!==a},nearestPowerOfTwo:function(a){return Math.pow(2,Math.round(Math.log(a)/Math.LN2))},nextPowerOfTwo:function(a){a--;a|=a>>1;a|=a>>2;a|=a>>4;a|=a>>8;a|=a>>16;a++;return a}}; +THREE.Spline=function(a){function b(a,b,c,d,e,f,g){a=.5*(c-a);d=.5*(d-b);return(2*(b-c)+a+d)*g+(-3*(b-c)-2*a-d)*f+a*e+b}this.points=a;var c=[],d={x:0,y:0,z:0},e,f,g,h,k,l,n,p,m;this.initFromArray=function(a){this.points=[];for(var b=0;bthis.points.length-2?this.points.length-1:f+1;c[3]=f>this.points.length-3?this.points.length-1:f+ +2;l=this.points[c[0]];n=this.points[c[1]];p=this.points[c[2]];m=this.points[c[3]];h=g*g;k=g*h;d.x=b(l.x,n.x,p.x,m.x,g,h,k);d.y=b(l.y,n.y,p.y,m.y,g,h,k);d.z=b(l.z,n.z,p.z,m.z,g,h,k);return d};this.getControlPointsArray=function(){var a,b,c=this.points.length,d=[];for(a=0;a=b.x+b.y}}(); +THREE.Triangle.prototype={constructor:THREE.Triangle,set:function(a,b,c){this.a.copy(a);this.b.copy(b);this.c.copy(c);return this},setFromPointsAndIndices:function(a,b,c,d){this.a.copy(a[b]);this.b.copy(a[c]);this.c.copy(a[d]);return this},clone:function(){return(new this.constructor).copy(this)},copy:function(a){this.a.copy(a.a);this.b.copy(a.b);this.c.copy(a.c);return this},area:function(){var a=new THREE.Vector3,b=new THREE.Vector3;return function(){a.subVectors(this.c,this.b);b.subVectors(this.a, +this.b);return.5*a.cross(b).length()}}(),midpoint:function(a){return(a||new THREE.Vector3).addVectors(this.a,this.b).add(this.c).multiplyScalar(1/3)},normal:function(a){return THREE.Triangle.normal(this.a,this.b,this.c,a)},plane:function(a){return(a||new 
THREE.Plane).setFromCoplanarPoints(this.a,this.b,this.c)},barycoordFromPoint:function(a,b){return THREE.Triangle.barycoordFromPoint(a,this.a,this.b,this.c,b)},containsPoint:function(a){return THREE.Triangle.containsPoint(a,this.a,this.b,this.c)}, +closestPointToPoint:function(){var a,b,c,d;return function(e,f){void 0===a&&(a=new THREE.Plane,b=[new THREE.Line3,new THREE.Line3,new THREE.Line3],c=new THREE.Vector3,d=new THREE.Vector3);var g=f||new THREE.Vector3,h=Infinity;a.setFromCoplanarPoints(this.a,this.b,this.c);a.projectPoint(e,c);if(!0===this.containsPoint(c))g.copy(c);else{b[0].set(this.a,this.b);b[1].set(this.b,this.c);b[2].set(this.c,this.a);for(var k=0;k=e)break a;else{f=b[1];a=e)break b}d= +c;c=0}}for(;c>>1,ad;d++)if(e[d]===e[(d+1)%3]){a.push(f);break}for(f=a.length-1;0<=f;f--)for(e=a[f],this.faces.splice(e, +1),c=0,g=this.faceVertexUvs.length;cb||0===c)return;this._startTime=null;b*=c}b*=this._updateTimeScale(a);c=this._updateTime(b);a=this._updateWeight(a);if(0c.parameterPositions[1]&&(this.stopFading(),0===d&&(this.enabled=!1))}}return this._effectiveWeight=b},_updateTimeScale:function(a){var b=0;if(!this.paused){var b=this.timeScale,c=this._timeScaleInterpolant;if(null!==c){var d=c.evaluate(a)[0],b=b*d;a>c.parameterPositions[1]&&(this.stopWarping(),0===b?this.pause=!0: +this.timeScale=b)}}return this._effectiveTimeScale=b},_updateTime:function(a){var b=this.time+a;if(0===a)return b;var c=this._clip.duration,d=this.loop,e=this._loopCount;if(d===THREE.LoopOnce)a:{if(-1===e&&(this.loopCount=0,this._setEndings(!0,!0,!1)),b>=c)b=c;else if(0>b)b=0;else break a;this.clampWhenFinished?this.pause=!0:this.enabled=!1;this._mixer.dispatchEvent({type:"finished",action:this,direction:0>a?-1:1})}else{d=d===THREE.LoopPingPong;-1===e&&(0<=a?(e=0,this._setEndings(!0,0===this.repetitions, +d)):this._setEndings(0===this.repetitions,!0,d));if(b>=c||0>b){var 
f=Math.floor(b/c),b=b-c*f,e=e+Math.abs(f),g=this.repetitions-e;0>g?(this.clampWhenFinished?this.paused=!0:this.enabled=!1,b=0a,this._setEndings(a,!a,d)):this._setEndings(!1,!1,d),this._loopCount=e,this._mixer.dispatchEvent({type:"loop",action:this,loopDelta:f}))}if(d&&1===(e&1))return this.time=b,c-b}return this.time=b},_setEndings:function(a, +b,c){var d=this._interpolantSettings;c?(d.endingStart=THREE.ZeroSlopeEnding,d.endingEnd=THREE.ZeroSlopeEnding):(d.endingStart=a?this.zeroSlopeAtStart?THREE.ZeroSlopeEnding:THREE.ZeroCurvatureEnding:THREE.WrapAroundEnding,d.endingEnd=b?this.zeroSlopeAtEnd?THREE.ZeroSlopeEnding:THREE.ZeroCurvatureEnding:THREE.WrapAroundEnding)},_scheduleFading:function(a,b,c){var d=this._mixer,e=d.time,f=this._weightInterpolant;null===f&&(this._weightInterpolant=f=d._lendControlInterpolant());d=f.parameterPositions; +f=f.sampleValues;d[0]=e;f[0]=b;d[1]=e+a;f[1]=c;return this}};THREE.AnimationClip=function(a,b,c){this.name=a;this.tracks=c;this.duration=void 0!==b?b:-1;this.uuid=THREE.Math.generateUUID();0>this.duration&&this.resetDuration();this.trim();this.optimize()}; +THREE.AnimationClip.prototype={constructor:THREE.AnimationClip,resetDuration:function(){for(var a=0,b=0,c=this.tracks.length;b!==c;++b)var d=this.tracks[b],a=Math.max(a,d.times[d.times.length-1]);this.duration=a},trim:function(){for(var a=0;a=c){var p=c++,m=b[p];d[m.uuid]= +n;b[n]=m;d[l]=p;b[p]=k;k=0;for(l=f;k!==l;++k){var m=e[k],q=m[n];m[n]=m[p];m[p]=q}}}this.nCachedObjects_=c},uncache:function(a){for(var b=this._objects,c=b.length,d=this.nCachedObjects_,e=this._indicesByUUID,f=this._bindings,g=f.length,h=0,k=arguments.length;h!==k;++h){var l=arguments[h].uuid,n=e[l];if(void 0!==n)if(delete e[l],nb;)--f;++f;if(0!==e||f!==d)e>=f&&(f=Math.max(f,1),e=f-1),d=this.getValueSize(),this.times=THREE.AnimationUtils.arraySlice(c,e,f),this.values=THREE.AnimationUtils.arraySlice(this.values,e*d,f*d);return this},validate:function(){var 
a=!0,b=this.getValueSize();0!==b-Math.floor(b)&&(console.error("invalid value size in track", +this),a=!1);var c=this.times,b=this.values,d=c.length;0===d&&(console.error("track is empty",this),a=!1);for(var e=null,f=0;f!==d;f++){var g=c[f];if("number"===typeof g&&isNaN(g)){console.error("time is not a valid number",this,f,g);a=!1;break}if(null!==e&&e>g){console.error("out of order keys",this,f,g,e);a=!1;break}e=g}if(void 0!==b&&THREE.AnimationUtils.isTypedArray(b))for(f=0,c=b.length;f!==c;++f)if(d=b[f],isNaN(d)){console.error("value is not a valid number",this,f,d);a=!1;break}return a},optimize:function(){for(var a= +this.times,b=this.values,c=this.getValueSize(),d=1,e=1,f=a.length-1;e<=f;++e){var g=!1,h=a[e];if(h!==a[e+1]&&(1!==e||h!==h[0]))for(var k=e*c,l=k-c,n=k+c,h=0;h!==c;++h){var p=b[k+h];if(p!==b[l+h]||p!==b[n+h]){g=!0;break}}if(g){if(e!==d)for(a[d]=a[e],g=e*c,k=d*c,h=0;h!==c;++h)b[k+h]=b[g+h];++d}}d!==a.length&&(this.times=THREE.AnimationUtils.arraySlice(a,0,d),this.values=THREE.AnimationUtils.arraySlice(b,0,d*c));return this}}; +Object.assign(THREE.KeyframeTrack,{parse:function(a){if(void 0===a.type)throw Error("track type undefined, can not parse");var b=THREE.KeyframeTrack._getTrackTypeForValueTypeName(a.type);if(void 0===a.times){var c=[],d=[];THREE.AnimationUtils.flattenJSON(a.keys,c,d,"value");a.times=c;a.values=d}return void 0!==b.parse?b.parse(a):new b(a.name,a.times,a.values,a.interpolation)},toJSON:function(a){var b=a.constructor;if(void 0!==b.toJSON)b=b.toJSON(a);else{var b={name:a.name,times:THREE.AnimationUtils.convertArray(a.times, +Array),values:THREE.AnimationUtils.convertArray(a.values,Array)},c=a.getInterpolation();c!==a.DefaultInterpolation&&(b.interpolation=c)}b.type=a.ValueTypeName;return b},_getTrackTypeForValueTypeName:function(a){switch(a.toLowerCase()){case "scalar":case "double":case "float":case "number":case "integer":return THREE.NumberKeyframeTrack;case "vector":case "vector2":case "vector3":case "vector4":return 
THREE.VectorKeyframeTrack;case "color":return THREE.ColorKeyframeTrack;case "quaternion":return THREE.QuaternionKeyframeTrack; +case "bool":case "boolean":return THREE.BooleanKeyframeTrack;case "string":return THREE.StringKeyframeTrack}throw Error("Unsupported typeName: "+a);}});THREE.PropertyBinding=function(a,b,c){this.path=b;this.parsedPath=c||THREE.PropertyBinding.parseTrackName(b);this.node=THREE.PropertyBinding.findNode(a,this.parsedPath.nodeName)||a;this.rootNode=a}; +THREE.PropertyBinding.prototype={constructor:THREE.PropertyBinding,getValue:function(a,b){this.bind();this.getValue(a,b)},setValue:function(a,b){this.bind();this.setValue(a,b)},bind:function(){var a=this.node,b=this.parsedPath,c=b.objectName,d=b.propertyName,e=b.propertyIndex;a||(this.node=a=THREE.PropertyBinding.findNode(this.rootNode,b.nodeName)||this.rootNode);this.getValue=this._getValue_unavailable;this.setValue=this._setValue_unavailable;if(a){if(c){var f=b.objectIndex;switch(c){case "materials":if(!a.material){console.error(" can not bind to material as node does not have a material", +this);return}if(!a.material.materials){console.error(" can not bind to material.materials as node.material does not have a materials array",this);return}a=a.material.materials;break;case "bones":if(!a.skeleton){console.error(" can not bind to bones as node does not have a skeleton",this);return}a=a.skeleton.bones;for(c=0;cd&&this._mixBufferRegion(c,a,3*b,1-d,b);for(var d=b,f=b+b;d!==f;++d)if(c[d]!==c[d+b]){e.setValue(c,a); +break}},saveOriginalState:function(){var a=this.buffer,b=this.valueSize,c=3*b;this.binding.getValue(a,c);for(var d=b;d!==c;++d)a[d]=a[c+d%b];this.cumulativeWeight=0},restoreOriginalState:function(){this.binding.setValue(this.buffer,3*this.valueSize)},_select:function(a,b,c,d,e){if(.5<=d)for(d=0;d!==e;++d)a[b+d]=a[c+d]},_slerp:function(a,b,c,d,e){THREE.Quaternion.slerpFlat(a,b,a,b,a,c,d)},_lerp:function(a,b,c,d,e){for(var f=1-d,g=0;g!==e;++g){var h=b+g;a[h]=a[h]*f+a[c+g]*d}}}; 
+THREE.BooleanKeyframeTrack=function(a,b,c){THREE.KeyframeTrack.call(this,a,b,c)};THREE.BooleanKeyframeTrack.prototype=Object.assign(Object.create(THREE.KeyframeTrack.prototype),{constructor:THREE.BooleanKeyframeTrack,ValueTypeName:"bool",ValueBufferType:Array,DefaultInterpolation:THREE.InterpolateDiscrete,InterpolantFactoryMethodLinear:void 0,InterpolantFactoryMethodSmooth:void 0});THREE.ColorKeyframeTrack=function(a,b,c,d){THREE.KeyframeTrack.call(this,a,b,c,d)}; +THREE.ColorKeyframeTrack.prototype=Object.assign(Object.create(THREE.KeyframeTrack.prototype),{constructor:THREE.ColorKeyframeTrack,ValueTypeName:"color"});THREE.NumberKeyframeTrack=function(a,b,c,d){THREE.KeyframeTrack.call(this,a,b,c,d)};THREE.NumberKeyframeTrack.prototype=Object.assign(Object.create(THREE.KeyframeTrack.prototype),{constructor:THREE.NumberKeyframeTrack,ValueTypeName:"number"});THREE.QuaternionKeyframeTrack=function(a,b,c,d){THREE.KeyframeTrack.call(this,a,b,c,d)}; +THREE.QuaternionKeyframeTrack.prototype=Object.assign(Object.create(THREE.KeyframeTrack.prototype),{constructor:THREE.QuaternionKeyframeTrack,ValueTypeName:"quaternion",DefaultInterpolation:THREE.InterpolateLinear,InterpolantFactoryMethodLinear:function(a){return new THREE.QuaternionLinearInterpolant(this.times,this.values,this.getValueSize(),a)},InterpolantFactoryMethodSmooth:void 0});THREE.StringKeyframeTrack=function(a,b,c,d){THREE.KeyframeTrack.call(this,a,b,c,d)}; +THREE.StringKeyframeTrack.prototype=Object.assign(Object.create(THREE.KeyframeTrack.prototype),{constructor:THREE.StringKeyframeTrack,ValueTypeName:"string",ValueBufferType:Array,DefaultInterpolation:THREE.InterpolateDiscrete,InterpolantFactoryMethodLinear:void 0,InterpolantFactoryMethodSmooth:void 0});THREE.VectorKeyframeTrack=function(a,b,c,d){THREE.KeyframeTrack.call(this,a,b,c,d)}; +THREE.VectorKeyframeTrack.prototype=Object.assign(Object.create(THREE.KeyframeTrack.prototype),{constructor:THREE.VectorKeyframeTrack,ValueTypeName:"vector"}); 
+THREE.Audio=function(a){THREE.Object3D.call(this);this.type="Audio";this.context=a.context;this.source=this.context.createBufferSource();this.source.onended=this.onEnded.bind(this);this.gain=this.context.createGain();this.gain.connect(a.getInput());this.autoplay=!1;this.startTime=0;this.playbackRate=1;this.isPlaying=!1;this.hasPlaybackControl=!0;this.sourceType="empty";this.filters=[]}; +THREE.Audio.prototype=Object.assign(Object.create(THREE.Object3D.prototype),{constructor:THREE.Audio,getOutput:function(){return this.gain},setNodeSource:function(a){this.hasPlaybackControl=!1;this.sourceType="audioNode";this.source=a;this.connect();return this},setBuffer:function(a){this.source.buffer=a;this.sourceType="buffer";this.autoplay&&this.play();return this},play:function(){if(!0===this.isPlaying)console.warn("THREE.Audio: Audio is already playing.");else if(!1===this.hasPlaybackControl)console.warn("THREE.Audio: this Audio has no playback control."); +else{var a=this.context.createBufferSource();a.buffer=this.source.buffer;a.loop=this.source.loop;a.onended=this.source.onended;a.start(0,this.startTime);a.playbackRate.value=this.playbackRate;this.isPlaying=!0;this.source=a;return this.connect()}},pause:function(){if(!1===this.hasPlaybackControl)console.warn("THREE.Audio: this Audio has no playback control.");else return this.source.stop(),this.startTime=this.context.currentTime,this},stop:function(){if(!1===this.hasPlaybackControl)console.warn("THREE.Audio: this Audio has no playback control."); +else return this.source.stop(),this.startTime=0,this},connect:function(){if(0k.opacity&&(k.transparent=!0);c.setTextures(h);return c.parse(k)}}()}; +THREE.Loader.Handlers={handlers:[],add:function(a,b){this.handlers.push(a,b)},get:function(a){for(var b=this.handlers,c=0,d=b.length;cg;g++)m=v[k++],x=u[2*m],m=u[2*m+1],x=new 
THREE.Vector2(x,m),2!==g&&c.faceVertexUvs[d][h].push(x),0!==g&&c.faceVertexUvs[d][h+1].push(x);p&&(p=3*v[k++],q.normal.set(C[p++],C[p++],C[p]),s.normal.copy(q.normal));if(r)for(d=0;4>d;d++)p=3*v[k++],r=new THREE.Vector3(C[p++],C[p++],C[p]),2!==d&&q.vertexNormals.push(r),0!==d&&s.vertexNormals.push(r); +n&&(n=v[k++],n=w[n],q.color.setHex(n),s.color.setHex(n));if(b)for(d=0;4>d;d++)n=v[k++],n=w[n],2!==d&&q.vertexColors.push(new THREE.Color(n)),0!==d&&s.vertexColors.push(new THREE.Color(n));c.faces.push(q);c.faces.push(s)}else{q=new THREE.Face3;q.a=v[k++];q.b=v[k++];q.c=v[k++];h&&(h=v[k++],q.materialIndex=h);h=c.faces.length;if(d)for(d=0;dg;g++)m=v[k++],x=u[2*m],m=u[2*m+1],x=new THREE.Vector2(x,m),c.faceVertexUvs[d][h].push(x);p&&(p=3*v[k++],q.normal.set(C[p++], +C[p++],C[p]));if(r)for(d=0;3>d;d++)p=3*v[k++],r=new THREE.Vector3(C[p++],C[p++],C[p]),q.vertexNormals.push(r);n&&(n=v[k++],q.color.setHex(w[n]));if(b)for(d=0;3>d;d++)n=v[k++],q.vertexColors.push(new THREE.Color(w[n]));c.faces.push(q)}})(d);(function(){var b=void 0!==a.influencesPerVertex?a.influencesPerVertex:2;if(a.skinWeights)for(var d=0,g=a.skinWeights.length;dthis.opacity&&(d.opacity=this.opacity);!0===this.transparent&&(d.transparent=this.transparent);0a.x||1a.x?0:1;break;case THREE.MirroredRepeatWrapping:1===Math.abs(Math.floor(a.x)%2)?a.x=Math.ceil(a.x)-a.x:a.x-=Math.floor(a.x)}if(0>a.y||1a.y?0:1;break;case THREE.MirroredRepeatWrapping:1=== +Math.abs(Math.floor(a.y)%2)?a.y=Math.ceil(a.y)-a.y:a.y-=Math.floor(a.y)}this.flipY&&(a.y=1-a.y)}}};Object.assign(THREE.Texture.prototype,THREE.EventDispatcher.prototype);THREE.TextureIdCount=0; +THREE.DepthTexture=function(a,b,c,d,e,f,g,h,k){THREE.Texture.call(this,null,d,e,f,g,h,THREE.DepthFormat,c,k);this.image={width:a,height:b};this.type=void 0!==c?c:THREE.UnsignedShortType;this.magFilter=void 0!==g?g:THREE.NearestFilter;this.minFilter=void 
0!==h?h:THREE.NearestFilter;this.generateMipmaps=this.flipY=!1};THREE.DepthTexture.prototype=Object.create(THREE.Texture.prototype);THREE.DepthTexture.prototype.constructor=THREE.DepthTexture; +THREE.CanvasTexture=function(a,b,c,d,e,f,g,h,k){THREE.Texture.call(this,a,b,c,d,e,f,g,h,k);this.needsUpdate=!0};THREE.CanvasTexture.prototype=Object.create(THREE.Texture.prototype);THREE.CanvasTexture.prototype.constructor=THREE.CanvasTexture;THREE.CubeTexture=function(a,b,c,d,e,f,g,h,k,l){a=void 0!==a?a:[];b=void 0!==b?b:THREE.CubeReflectionMapping;THREE.Texture.call(this,a,b,c,d,e,f,g,h,k,l);this.flipY=!1};THREE.CubeTexture.prototype=Object.create(THREE.Texture.prototype); +THREE.CubeTexture.prototype.constructor=THREE.CubeTexture;Object.defineProperty(THREE.CubeTexture.prototype,"images",{get:function(){return this.image},set:function(a){this.image=a}});THREE.CompressedTexture=function(a,b,c,d,e,f,g,h,k,l,n,p){THREE.Texture.call(this,null,f,g,h,k,l,d,e,n,p);this.image={width:b,height:c};this.mipmaps=a;this.generateMipmaps=this.flipY=!1};THREE.CompressedTexture.prototype=Object.create(THREE.Texture.prototype);THREE.CompressedTexture.prototype.constructor=THREE.CompressedTexture; +THREE.DataTexture=function(a,b,c,d,e,f,g,h,k,l,n,p){THREE.Texture.call(this,null,f,g,h,k,l,d,e,n,p);this.image={data:a,width:b,height:c};this.magFilter=void 0!==k?k:THREE.NearestFilter;this.minFilter=void 0!==l?l:THREE.NearestFilter;this.generateMipmaps=this.flipY=!1};THREE.DataTexture.prototype=Object.create(THREE.Texture.prototype);THREE.DataTexture.prototype.constructor=THREE.DataTexture; +THREE.VideoTexture=function(a,b,c,d,e,f,g,h,k){function l(){requestAnimationFrame(l);a.readyState>=a.HAVE_CURRENT_DATA&&(n.needsUpdate=!0)}THREE.Texture.call(this,a,b,c,d,e,f,g,h,k);this.generateMipmaps=!1;var 
n=this;l()};THREE.VideoTexture.prototype=Object.create(THREE.Texture.prototype);THREE.VideoTexture.prototype.constructor=THREE.VideoTexture;THREE.Group=function(){THREE.Object3D.call(this);this.type="Group"};THREE.Group.prototype=Object.assign(Object.create(THREE.Object3D.prototype),{constructor:THREE.Group}); +THREE.Points=function(a,b){THREE.Object3D.call(this);this.type="Points";this.geometry=void 0!==a?a:new THREE.BufferGeometry;this.material=void 0!==b?b:new THREE.PointsMaterial({color:16777215*Math.random()})}; +THREE.Points.prototype=Object.assign(Object.create(THREE.Object3D.prototype),{constructor:THREE.Points,raycast:function(){var a=new THREE.Matrix4,b=new THREE.Ray,c=new THREE.Sphere;return function(d,e){function f(a,c){var f=b.distanceSqToPoint(a);if(fd.far||e.push({distance:m,distanceToRay:Math.sqrt(f),point:h.clone(),index:c,face:null,object:g})}}var g=this,h=this.geometry,k=this.matrixWorld,l=d.params.Points.threshold; +null===h.boundingSphere&&h.computeBoundingSphere();c.copy(h.boundingSphere);c.applyMatrix4(k);if(!1!==d.ray.intersectsSphere(c)){a.getInverse(k);b.copy(d.ray).applyMatrix4(a);var l=l/((this.scale.x+this.scale.y+this.scale.z)/3),n=l*l,l=new THREE.Vector3;if(h instanceof THREE.BufferGeometry){var p=h.index,h=h.attributes.position.array;if(null!==p)for(var m=p.array,p=0,q=m.length;pf||(n.applyMatrix4(this.matrixWorld),s=d.ray.origin.distanceTo(n),sd.far||e.push({distance:s,point:h.clone().applyMatrix4(this.matrixWorld),index:g,face:null,faceIndex:null,object:this}))}else for(g=0,r= +q.length/3-1;gf||(n.applyMatrix4(this.matrixWorld),s=d.ray.origin.distanceTo(n),sd.far||e.push({distance:s,point:h.clone().applyMatrix4(this.matrixWorld),index:g,face:null,faceIndex:null,object:this}))}else if(g instanceof THREE.Geometry)for(k=g.vertices,l=k.length,g=0;gf||(n.applyMatrix4(this.matrixWorld),s=d.ray.origin.distanceTo(n),sd.far|| 
+e.push({distance:s,point:h.clone().applyMatrix4(this.matrixWorld),index:g,face:null,faceIndex:null,object:this}))}}}(),clone:function(){return(new this.constructor(this.geometry,this.material)).copy(this)}});THREE.LineSegments=function(a,b){THREE.Line.call(this,a,b);this.type="LineSegments"};THREE.LineSegments.prototype=Object.assign(Object.create(THREE.Line.prototype),{constructor:THREE.LineSegments}); +THREE.Mesh=function(a,b){THREE.Object3D.call(this);this.type="Mesh";this.geometry=void 0!==a?a:new THREE.BufferGeometry;this.material=void 0!==b?b:new THREE.MeshBasicMaterial({color:16777215*Math.random()});this.drawMode=THREE.TrianglesDrawMode;this.updateMorphTargets()}; +THREE.Mesh.prototype=Object.assign(Object.create(THREE.Object3D.prototype),{constructor:THREE.Mesh,setDrawMode:function(a){this.drawMode=a},updateMorphTargets:function(){if(void 0!==this.geometry.morphTargets&&0b.far?null:{distance:c,point:x.clone(),object:a}}function c(c,d,e,f,l,p,n,s){g.fromArray(f,3*p);h.fromArray(f,3*n);k.fromArray(f,3*s);if(c=b(c,d,e,g,h,k,u))l&&(m.fromArray(l,2*p),q.fromArray(l,2*n),r.fromArray(l,2*s),c.uv=a(u,g,h,k,m,q,r)),c.face=new THREE.Face3(p,n,s,THREE.Triangle.normal(g,h,k)),c.faceIndex=p;return c}var d=new THREE.Matrix4,e=new THREE.Ray,f=new THREE.Sphere, +g=new THREE.Vector3,h=new THREE.Vector3,k=new THREE.Vector3,l=new THREE.Vector3,n=new THREE.Vector3,p=new THREE.Vector3,m=new THREE.Vector2,q=new THREE.Vector2,r=new THREE.Vector2,s=new THREE.Vector3,u=new THREE.Vector3,x=new THREE.Vector3;return function(s,x){var w=this.geometry,D=this.material,A=this.matrixWorld;if(void 0!==D&&(null===w.boundingSphere&&w.computeBoundingSphere(),f.copy(w.boundingSphere),f.applyMatrix4(A),!1!==s.ray.intersectsSphere(f)&&(d.getInverse(A),e.copy(s.ray).applyMatrix4(d), +null===w.boundingBox||!1!==e.intersectsBox(w.boundingBox)))){var y,B;if(w instanceof THREE.BufferGeometry){var G,z,D=w.index,A=w.attributes,w=A.position.array;void 0!==A.uv&&(y=A.uv.array);if(null!==D)for(var 
A=D.array,H=0,M=A.length;H= +d[e].distance)d[e-1].object.visible=!1,d[e].object.visible=!0;else break;for(;ethis.scale.x*this.scale.y/4||c.push({distance:Math.sqrt(d),point:this.position,face:null,object:this})}}(),clone:function(){return(new this.constructor(this.material)).copy(this)}}); +THREE.LensFlare=function(a,b,c,d,e){THREE.Object3D.call(this);this.lensFlares=[];this.positionScreen=new THREE.Vector3;this.customUpdateCallback=void 0;void 0!==a&&this.add(a,b,c,d,e)}; +THREE.LensFlare.prototype=Object.assign(Object.create(THREE.Object3D.prototype),{constructor:THREE.LensFlare,copy:function(a){THREE.Object3D.prototype.copy.call(this,a);this.positionScreen.copy(a.positionScreen);this.customUpdateCallback=a.customUpdateCallback;for(var b=0,c=a.lensFlares.length;bc;c++)t.deleteFramebuffer(b.__webglFramebuffer[c]),b.__webglDepthbuffer&&t.deleteRenderbuffer(b.__webglDepthbuffer[c]);else t.deleteFramebuffer(b.__webglFramebuffer), +b.__webglDepthbuffer&&t.deleteRenderbuffer(b.__webglDepthbuffer);T.delete(a.texture);T.delete(a)}ja.textures--}function h(a){a=a.target;a.removeEventListener("dispose",h);k(a);T.delete(a)}function k(a){var b=T.get(a).program;a.program=void 0;void 0!==b&&pa.releaseProgram(b)}function l(a,b){return Math.abs(b[0])-Math.abs(a[0])}function n(a,b){return a.object.renderOrder!==b.object.renderOrder?a.object.renderOrder-b.object.renderOrder:a.material.id!==b.material.id?a.material.id-b.material.id:a.z!== +b.z?a.z-b.z:a.id-b.id}function p(a,b){return a.object.renderOrder!==b.object.renderOrder?a.object.renderOrder-b.object.renderOrder:a.z!==b.z?b.z-a.z:a.id-b.id}function m(a,b,c,d,e){var g;c.transparent?(d=R,g=++F):(d=P,g=++Q);g=d[g];void 0!==g?(g.id=a.id,g.object=a,g.geometry=b,g.material=c,g.z=X.z,g.group=e):(g={id:a.id,object:a,geometry:b,material:c,z:X.z,group:e},d.push(g))}function q(a){if(!Ba.intersectsSphere(a))return!1;var b=ba.numPlanes;if(0===b)return!0;var c=W.clippingPlanes,d=a.center;a= +-a.radius;var e=0;do 
if(c[e].distanceToPoint(d)b||a.height>b){var c=b/Math.max(a.width,a.height),d=document.createElement("canvas");d.width=Math.floor(a.width*c);d.height=Math.floor(a.height*c);d.getContext("2d").drawImage(a,0,0,a.width,a.height,0,0,d.width,d.height);console.warn("THREE.WebGLRenderer: image is too big ("+a.width+"x"+a.height+ +"). Resized to "+d.width+"x"+d.height,a);return d}return a}function D(a){return THREE.Math.isPowerOfTwo(a.width)&&THREE.Math.isPowerOfTwo(a.height)}function A(a,b,c,d){var e=G(b.texture.format),f=G(b.texture.type);J.texImage2D(d,0,e,b.width,b.height,0,e,f,null);t.bindFramebuffer(t.FRAMEBUFFER,a);t.framebufferTexture2D(t.FRAMEBUFFER,c,d,T.get(b.texture).__webglTexture,0);t.bindFramebuffer(t.FRAMEBUFFER,null)}function y(a,b){t.bindRenderbuffer(t.RENDERBUFFER,a);b.depthBuffer&&!b.stencilBuffer?(t.renderbufferStorage(t.RENDERBUFFER, +t.DEPTH_COMPONENT16,b.width,b.height),t.framebufferRenderbuffer(t.FRAMEBUFFER,t.DEPTH_ATTACHMENT,t.RENDERBUFFER,a)):b.depthBuffer&&b.stencilBuffer?(t.renderbufferStorage(t.RENDERBUFFER,t.DEPTH_STENCIL,b.width,b.height),t.framebufferRenderbuffer(t.FRAMEBUFFER,t.DEPTH_STENCIL_ATTACHMENT,t.RENDERBUFFER,a)):t.renderbufferStorage(t.RENDERBUFFER,t.RGBA4,b.width,b.height);t.bindRenderbuffer(t.RENDERBUFFER,null)}function B(a){return a===THREE.NearestFilter||a===THREE.NearestMipMapNearestFilter||a===THREE.NearestMipMapLinearFilter? 
+t.NEAREST:t.LINEAR}function G(a){var b;if(a===THREE.RepeatWrapping)return t.REPEAT;if(a===THREE.ClampToEdgeWrapping)return t.CLAMP_TO_EDGE;if(a===THREE.MirroredRepeatWrapping)return t.MIRRORED_REPEAT;if(a===THREE.NearestFilter)return t.NEAREST;if(a===THREE.NearestMipMapNearestFilter)return t.NEAREST_MIPMAP_NEAREST;if(a===THREE.NearestMipMapLinearFilter)return t.NEAREST_MIPMAP_LINEAR;if(a===THREE.LinearFilter)return t.LINEAR;if(a===THREE.LinearMipMapNearestFilter)return t.LINEAR_MIPMAP_NEAREST;if(a=== +THREE.LinearMipMapLinearFilter)return t.LINEAR_MIPMAP_LINEAR;if(a===THREE.UnsignedByteType)return t.UNSIGNED_BYTE;if(a===THREE.UnsignedShort4444Type)return t.UNSIGNED_SHORT_4_4_4_4;if(a===THREE.UnsignedShort5551Type)return t.UNSIGNED_SHORT_5_5_5_1;if(a===THREE.UnsignedShort565Type)return t.UNSIGNED_SHORT_5_6_5;if(a===THREE.ByteType)return t.BYTE;if(a===THREE.ShortType)return t.SHORT;if(a===THREE.UnsignedShortType)return t.UNSIGNED_SHORT;if(a===THREE.IntType)return t.INT;if(a===THREE.UnsignedIntType)return t.UNSIGNED_INT; +if(a===THREE.FloatType)return t.FLOAT;b=V.get("OES_texture_half_float");if(null!==b&&a===THREE.HalfFloatType)return b.HALF_FLOAT_OES;if(a===THREE.AlphaFormat)return t.ALPHA;if(a===THREE.RGBFormat)return t.RGB;if(a===THREE.RGBAFormat)return t.RGBA;if(a===THREE.LuminanceFormat)return t.LUMINANCE;if(a===THREE.LuminanceAlphaFormat)return t.LUMINANCE_ALPHA;if(a===THREE.DepthFormat)return t.DEPTH_COMPONENT;if(a===THREE.AddEquation)return t.FUNC_ADD;if(a===THREE.SubtractEquation)return t.FUNC_SUBTRACT;if(a=== +THREE.ReverseSubtractEquation)return t.FUNC_REVERSE_SUBTRACT;if(a===THREE.ZeroFactor)return t.ZERO;if(a===THREE.OneFactor)return t.ONE;if(a===THREE.SrcColorFactor)return t.SRC_COLOR;if(a===THREE.OneMinusSrcColorFactor)return t.ONE_MINUS_SRC_COLOR;if(a===THREE.SrcAlphaFactor)return t.SRC_ALPHA;if(a===THREE.OneMinusSrcAlphaFactor)return t.ONE_MINUS_SRC_ALPHA;if(a===THREE.DstAlphaFactor)return t.DST_ALPHA;if(a===THREE.OneMinusDstAlphaFactor)return 
t.ONE_MINUS_DST_ALPHA;if(a===THREE.DstColorFactor)return t.DST_COLOR; +if(a===THREE.OneMinusDstColorFactor)return t.ONE_MINUS_DST_COLOR;if(a===THREE.SrcAlphaSaturateFactor)return t.SRC_ALPHA_SATURATE;b=V.get("WEBGL_compressed_texture_s3tc");if(null!==b){if(a===THREE.RGB_S3TC_DXT1_Format)return b.COMPRESSED_RGB_S3TC_DXT1_EXT;if(a===THREE.RGBA_S3TC_DXT1_Format)return b.COMPRESSED_RGBA_S3TC_DXT1_EXT;if(a===THREE.RGBA_S3TC_DXT3_Format)return b.COMPRESSED_RGBA_S3TC_DXT3_EXT;if(a===THREE.RGBA_S3TC_DXT5_Format)return b.COMPRESSED_RGBA_S3TC_DXT5_EXT}b=V.get("WEBGL_compressed_texture_pvrtc"); +if(null!==b){if(a===THREE.RGB_PVRTC_4BPPV1_Format)return b.COMPRESSED_RGB_PVRTC_4BPPV1_IMG;if(a===THREE.RGB_PVRTC_2BPPV1_Format)return b.COMPRESSED_RGB_PVRTC_2BPPV1_IMG;if(a===THREE.RGBA_PVRTC_4BPPV1_Format)return b.COMPRESSED_RGBA_PVRTC_4BPPV1_IMG;if(a===THREE.RGBA_PVRTC_2BPPV1_Format)return b.COMPRESSED_RGBA_PVRTC_2BPPV1_IMG}b=V.get("WEBGL_compressed_texture_etc1");if(null!==b&&a===THREE.RGB_ETC1_Format)return b.COMPRESSED_RGB_ETC1_WEBGL;b=V.get("EXT_blend_minmax");if(null!==b){if(a===THREE.MinEquation)return b.MIN_EXT; +if(a===THREE.MaxEquation)return b.MAX_EXT}return 0}console.log("THREE.WebGLRenderer",THREE.REVISION);a=a||{};var z=void 0!==a.canvas?a.canvas:document.createElement("canvas"),H=void 0!==a.context?a.context:null,M=void 0!==a.alpha?a.alpha:!1,O=void 0!==a.depth?a.depth:!0,N=void 0!==a.stencil?a.stencil:!0,E=void 0!==a.antialias?a.antialias:!1,K=void 0!==a.premultipliedAlpha?a.premultipliedAlpha:!0,I=void 0!==a.preserveDrawingBuffer?a.preserveDrawingBuffer:!1,L=[],P=[],Q=-1,R=[],F=-1,da=new Float32Array(8), 
+U=[],Y=[];this.domElement=z;this.context=null;this.sortObjects=this.autoClearStencil=this.autoClearDepth=this.autoClearColor=this.autoClear=!0;this.clippingPlanes=[];this.localClippingEnabled=!1;this.gammaFactor=2;this.physicallyCorrectLights=this.gammaOutput=this.gammaInput=!1;this.toneMapping=THREE.LinearToneMapping;this.toneMappingWhitePoint=this.toneMappingExposure=1;this.maxMorphTargets=8;this.maxMorphNormals=4;this.autoScaleCubemaps=!0;var W=this,fa=null,la=null,ga=null,Z=-1,oa="",ea=null,ra= +new THREE.Vector4,Aa=null,ma=new THREE.Vector4,ta=0,aa=new THREE.Color(0),ia=0,va=z.width,wa=z.height,$=1,ya=new THREE.Vector4(0,0,va,wa),Ca=!1,na=new THREE.Vector4(0,0,va,wa),Ba=new THREE.Frustum,ba=new THREE.WebGLClipping,ua=!1,za=!1,ka=new THREE.Sphere,sa=new THREE.Matrix4,X=new THREE.Vector3,S={hash:"",ambient:[0,0,0],directional:[],directionalShadowMap:[],directionalShadowMatrix:[],spot:[],spotShadowMap:[],spotShadowMatrix:[],point:[],pointShadowMap:[],pointShadowMatrix:[],hemi:[],shadows:[]}, +ja={geometries:0,textures:0},ha={calls:0,vertices:0,faces:0,points:0};this.info={render:ha,memory:ja,programs:null};var t;try{M={alpha:M,depth:O,stencil:N,antialias:E,premultipliedAlpha:K,preserveDrawingBuffer:I};t=H||z.getContext("webgl",M)||z.getContext("experimental-webgl",M);if(null===t){if(null!==z.getContext("webgl"))throw"Error creating WebGL context with your selected attributes.";throw"Error creating WebGL context.";}void 0===t.getShaderPrecisionFormat&&(t.getShaderPrecisionFormat=function(){return{rangeMin:1, +rangeMax:1,precision:1}});z.addEventListener("webglcontextlost",e,!1)}catch(Fa){console.error("THREE.WebGLRenderer: "+Fa)}var Da="undefined"!==typeof WebGL2RenderingContext&&t instanceof WebGL2RenderingContext,V=new 
THREE.WebGLExtensions(t);V.get("WEBGL_depth_texture");V.get("OES_texture_float");V.get("OES_texture_float_linear");V.get("OES_texture_half_float");V.get("OES_texture_half_float_linear");V.get("OES_standard_derivatives");V.get("ANGLE_instanced_arrays");V.get("OES_element_index_uint")&& +(THREE.BufferGeometry.MaxIndex=4294967296);var ca=new THREE.WebGLCapabilities(t,V,a),J=new THREE.WebGLState(t,V,G),T=new THREE.WebGLProperties,qa=new THREE.WebGLObjects(t,T,this.info),pa=new THREE.WebGLPrograms(this,ca),xa=new THREE.WebGLLights;this.info.programs=pa.programs;var Ga=new THREE.WebGLBufferRenderer(t,V,ha),Ha=new THREE.WebGLIndexedBufferRenderer(t,V,ha);c();this.context=t;this.capabilities=ca;this.extensions=V;this.properties=T;this.state=J;var Ea=new THREE.WebGLShadowMap(this,S,qa); +this.shadowMap=Ea;var Ia=new THREE.SpritePlugin(this,U),Ja=new THREE.LensFlarePlugin(this,Y);this.getContext=function(){return t};this.getContextAttributes=function(){return t.getContextAttributes()};this.forceContextLoss=function(){V.get("WEBGL_lose_context").loseContext()};this.getMaxAnisotropy=function(){var a;return function(){if(void 0!==a)return a;var b=V.get("EXT_texture_filter_anisotropic");return a=null!==b?t.getParameter(b.MAX_TEXTURE_MAX_ANISOTROPY_EXT):0}}();this.getPrecision=function(){return ca.precision}; +this.getPixelRatio=function(){return $};this.setPixelRatio=function(a){void 0!==a&&($=a,this.setSize(na.z,na.w,!1))};this.getSize=function(){return{width:va,height:wa}};this.setSize=function(a,b,c){va=a;wa=b;z.width=a*$;z.height=b*$;!1!==c&&(z.style.width=a+"px",z.style.height=b+"px");this.setViewport(0,0,a,b)};this.setViewport=function(a,b,c,d){J.viewport(na.set(a,b,c,d))};this.setScissor=function(a,b,c,d){J.scissor(ya.set(a,b,c,d))};this.setScissorTest=function(a){J.setScissorTest(Ca=a)};this.getClearColor= +function(){return aa};this.setClearColor=function(a,c){aa.set(a);ia=void 0!==c?c:1;b(aa.r,aa.g,aa.b,ia)};this.getClearAlpha=function(){return 
ia};this.setClearAlpha=function(a){ia=a;b(aa.r,aa.g,aa.b,ia)};this.clear=function(a,b,c){var d=0;if(void 0===a||a)d|=t.COLOR_BUFFER_BIT;if(void 0===b||b)d|=t.DEPTH_BUFFER_BIT;if(void 0===c||c)d|=t.STENCIL_BUFFER_BIT;t.clear(d)};this.clearColor=function(){this.clear(!0,!1,!1)};this.clearDepth=function(){this.clear(!1,!0,!1)};this.clearStencil=function(){this.clear(!1, +!1,!0)};this.clearTarget=function(a,b,c,d){this.setRenderTarget(a);this.clear(b,c,d)};this.resetGLState=d;this.dispose=function(){z.removeEventListener("webglcontextlost",e,!1)};this.renderBufferImmediate=function(a,b,c){J.initAttributes();var d=T.get(a);a.hasPositions&&!d.position&&(d.position=t.createBuffer());a.hasNormals&&!d.normal&&(d.normal=t.createBuffer());a.hasUvs&&!d.uv&&(d.uv=t.createBuffer());a.hasColors&&!d.color&&(d.color=t.createBuffer());b=b.getAttributes();a.hasPositions&&(t.bindBuffer(t.ARRAY_BUFFER, +d.position),t.bufferData(t.ARRAY_BUFFER,a.positionArray,t.DYNAMIC_DRAW),J.enableAttribute(b.position),t.vertexAttribPointer(b.position,3,t.FLOAT,!1,0,0));if(a.hasNormals){t.bindBuffer(t.ARRAY_BUFFER,d.normal);if("MeshPhongMaterial"!==c.type&&"MeshStandardMaterial"!==c.type&&"MeshPhysicalMaterial"!==c.type&&c.shading===THREE.FlatShading)for(var e=0,f=3*a.count;e=ca.maxTextures&&console.warn("WebGLRenderer: trying to use "+ +a+" texture units while this GPU supports only "+ca.maxTextures);ta+=1;return a};this.setTexture2D=function(){var a=!1;return function(b,c){b instanceof THREE.WebGLRenderTarget&&(a||(console.warn("THREE.WebGLRenderer.setTexture2D: don't use render targets as textures. 
Use their .texture property instead."),a=!0),b=b.texture);var d=b,e=T.get(d);if(0m;m++)k[m]=!W.autoScaleCubemaps||g||h?h?d.image[m].image:d.image[m]:w(d.image[m],ca.maxCubemapSize);var l=D(k[0]),n=G(d.format),p=G(d.type);C(t.TEXTURE_CUBE_MAP,d,l);for(m=0;6>m;m++)if(g)for(var q,r=k[m].mipmaps, +s=0,x=r.length;sf;f++)b.__webglFramebuffer[f]=t.createFramebuffer()}else b.__webglFramebuffer=t.createFramebuffer();if(d){J.bindTexture(t.TEXTURE_CUBE_MAP,c.__webglTexture);C(t.TEXTURE_CUBE_MAP,a.texture,e);for(f=0;6>f;f++)A(b.__webglFramebuffer[f], +a,t.COLOR_ATTACHMENT0,t.TEXTURE_CUBE_MAP_POSITIVE_X+f);a.texture.generateMipmaps&&e&&t.generateMipmap(t.TEXTURE_CUBE_MAP);J.bindTexture(t.TEXTURE_CUBE_MAP,null)}else J.bindTexture(t.TEXTURE_2D,c.__webglTexture),C(t.TEXTURE_2D,a.texture,e),A(b.__webglFramebuffer,a,t.COLOR_ATTACHMENT0,t.TEXTURE_2D),a.texture.generateMipmaps&&e&&t.generateMipmap(t.TEXTURE_2D),J.bindTexture(t.TEXTURE_2D,null);if(a.depthBuffer){b=T.get(a);c=a instanceof THREE.WebGLRenderTargetCube;if(a.depthTexture){if(c)throw Error("target.depthTexture not supported in Cube render targets"); +if(a instanceof THREE.WebGLRenderTargetCube)throw Error("Depth Texture with cube render targets is not supported!");t.bindFramebuffer(t.FRAMEBUFFER,b.__webglFramebuffer);if(!(a.depthTexture instanceof THREE.DepthTexture))throw Error("renderTarget.depthTexture must be an instance of THREE.DepthTexture");T.get(a.depthTexture).__webglTexture&&a.depthTexture.image.width===a.width&&a.depthTexture.image.height===a.height||(a.depthTexture.image.width=a.width,a.depthTexture.image.height=a.height,a.depthTexture.needsUpdate= +!0);W.setTexture2D(a.depthTexture,0);b=T.get(a.depthTexture).__webglTexture;t.framebufferTexture2D(t.FRAMEBUFFER,t.DEPTH_ATTACHMENT,t.TEXTURE_2D,b,0)}else if(c)for(b.__webglDepthbuffer=[],c=0;6>c;c++)t.bindFramebuffer(t.FRAMEBUFFER,b.__webglFramebuffer[c]),b.__webglDepthbuffer[c]=t.createRenderbuffer(),y(b.__webglDepthbuffer[c],a);else 
t.bindFramebuffer(t.FRAMEBUFFER,b.__webglFramebuffer),b.__webglDepthbuffer=t.createRenderbuffer(),y(b.__webglDepthbuffer,a);t.bindFramebuffer(t.FRAMEBUFFER,null)}}b= +a instanceof THREE.WebGLRenderTargetCube;a?(c=T.get(a),c=b?c.__webglFramebuffer[a.activeCubeFace]:c.__webglFramebuffer,ra.copy(a.scissor),Aa=a.scissorTest,ma.copy(a.viewport)):(c=null,ra.copy(ya).multiplyScalar($),Aa=Ca,ma.copy(na).multiplyScalar($));ga!==c&&(t.bindFramebuffer(t.FRAMEBUFFER,c),ga=c);J.scissor(ra);J.setScissorTest(Aa);J.viewport(ma);b&&(b=T.get(a.texture),t.framebufferTexture2D(t.FRAMEBUFFER,t.COLOR_ATTACHMENT0,t.TEXTURE_CUBE_MAP_POSITIVE_X+a.activeCubeFace,b.__webglTexture,a.activeMipMapLevel))}; +this.readRenderTargetPixels=function(a,b,c,d,e,f){if(!1===a instanceof THREE.WebGLRenderTarget)console.error("THREE.WebGLRenderer.readRenderTargetPixels: renderTarget is not THREE.WebGLRenderTarget.");else{var g=T.get(a).__webglFramebuffer;if(g){var h=!1;g!==ga&&(t.bindFramebuffer(t.FRAMEBUFFER,g),h=!0);try{var k=a.texture;k.format!==THREE.RGBAFormat&&G(k.format)!==t.getParameter(t.IMPLEMENTATION_COLOR_READ_FORMAT)?console.error("THREE.WebGLRenderer.readRenderTargetPixels: renderTarget is not in RGBA or implementation defined format."): +k.type===THREE.UnsignedByteType||G(k.type)===t.getParameter(t.IMPLEMENTATION_COLOR_READ_TYPE)||k.type===THREE.FloatType&&V.get("WEBGL_color_buffer_float")||k.type===THREE.HalfFloatType&&V.get("EXT_color_buffer_half_float")?t.checkFramebufferStatus(t.FRAMEBUFFER)===t.FRAMEBUFFER_COMPLETE?0<=b&&b<=a.width-d&&0<=c&&c<=a.height-e&&t.readPixels(b,c,d,e,G(k.format),G(k.type),f):console.error("THREE.WebGLRenderer.readRenderTargetPixels: readPixels from renderTarget failed. 
Framebuffer not complete."):console.error("THREE.WebGLRenderer.readRenderTargetPixels: renderTarget is not in UnsignedByteType or implementation defined type.")}finally{h&& +t.bindFramebuffer(t.FRAMEBUFFER,ga)}}}}}; +THREE.WebGLRenderTarget=function(a,b,c){this.uuid=THREE.Math.generateUUID();this.width=a;this.height=b;this.scissor=new THREE.Vector4(0,0,a,b);this.scissorTest=!1;this.viewport=new THREE.Vector4(0,0,a,b);c=c||{};void 0===c.minFilter&&(c.minFilter=THREE.LinearFilter);this.texture=new THREE.Texture(void 0,void 0,c.wrapS,c.wrapT,c.magFilter,c.minFilter,c.format,c.type,c.anisotropy,c.encoding);this.depthBuffer=void 0!==c.depthBuffer?c.depthBuffer:!0;this.stencilBuffer=void 0!==c.stencilBuffer?c.stencilBuffer: +!0;this.depthTexture=null}; +Object.assign(THREE.WebGLRenderTarget.prototype,THREE.EventDispatcher.prototype,{setSize:function(a,b){if(this.width!==a||this.height!==b)this.width=a,this.height=b,this.dispose();this.viewport.set(0,0,a,b);this.scissor.set(0,0,a,b)},clone:function(){return(new this.constructor).copy(this)},copy:function(a){this.width=a.width;this.height=a.height;this.viewport.copy(a.viewport);this.texture=a.texture.clone();this.depthBuffer=a.depthBuffer;this.stencilBuffer=a.stencilBuffer;this.depthTexture=a.depthTexture; +return this},dispose:function(){this.dispatchEvent({type:"dispose"})}});THREE.WebGLRenderTargetCube=function(a,b,c){THREE.WebGLRenderTarget.call(this,a,b,c);this.activeMipMapLevel=this.activeCubeFace=0};THREE.WebGLRenderTargetCube.prototype=Object.create(THREE.WebGLRenderTarget.prototype);THREE.WebGLRenderTargetCube.prototype.constructor=THREE.WebGLRenderTargetCube; +THREE.WebGLBufferRenderer=function(a,b,c){var d;this.setMode=function(a){d=a};this.render=function(b,f){a.drawArrays(d,b,f);c.calls++;c.vertices+=f;d===a.TRIANGLES&&(c.faces+=f/3)};this.renderInstances=function(e){var f=b.get("ANGLE_instanced_arrays");if(null===f)console.error("THREE.WebGLBufferRenderer: using THREE.InstancedBufferGeometry but hardware does 
not support extension ANGLE_instanced_arrays.");else{var g=e.attributes.position,h=0,h=g instanceof THREE.InterleavedBufferAttribute?g.data.count: +g.count;f.drawArraysInstancedANGLE(d,0,h,e.maxInstancedCount);c.calls++;c.vertices+=h*e.maxInstancedCount;d===a.TRIANGLES&&(c.faces+=e.maxInstancedCount*h/3)}}}; +THREE.WebGLClipping=function(){function a(){l.value!==d&&(l.value=d,l.needsUpdate=0c){var d=b;b=c;c=d}d=a[b];return void 0===d?(a[b]=[c],!0):-1===d.indexOf(c)?(d.push(c),!0):!1}var f=new THREE.WebGLGeometries(a,b,c);this.getAttributeBuffer=function(a){return a instanceof THREE.InterleavedBufferAttribute?b.get(a.data).__webglBuffer:b.get(a).__webglBuffer};this.getWireframeAttribute= +function(c){var f=b.get(c);if(void 0!==f.wireframe)return f.wireframe;var k=[],l=c.index,n=c.attributes;c=n.position;if(null!==l)for(var n={},l=l.array,p=0,m=l.length;p/g,function(a,b){var c=THREE.ShaderChunk[b];if(void 0===c)throw Error("Can not resolve #include <"+ +b+">");return k(c)})}function l(a){return a.replace(/for \( int i \= (\d+)\; i < (\d+)\; i \+\+ \) \{([\s\S]+?)(?=\})\}/g,function(a,b,c,d){a="";for(b=parseInt(b);b=e||0 0 ) {\nfloat depth = gl_FragCoord.z / gl_FragCoord.w;\nfloat fogFactor = 0.0;\nif ( fogType == 1 ) {\nfogFactor = smoothstep( fogNear, fogFar, depth );\n} else {\nconst float LOG2 = 1.442695;\nfogFactor = exp2( - fogDensity * fogDensity * depth * depth * LOG2 );\nfogFactor = 1.0 - clamp( fogFactor, 0.0, 1.0 );\n}\ngl_FragColor = mix( gl_FragColor, vec4( fogColor, gl_FragColor.w ), fogFactor );\n}\n}"].join("\n")); 
+w.compileShader(K);w.compileShader(I);w.attachShader(E,K);w.attachShader(E,I);w.linkProgram(E);B=E;x=w.getAttribLocation(B,"position");v=w.getAttribLocation(B,"uv");c=w.getUniformLocation(B,"uvOffset");d=w.getUniformLocation(B,"uvScale");e=w.getUniformLocation(B,"rotation");f=w.getUniformLocation(B,"scale");g=w.getUniformLocation(B,"color");h=w.getUniformLocation(B,"map");k=w.getUniformLocation(B,"opacity");l=w.getUniformLocation(B,"modelViewMatrix");n=w.getUniformLocation(B,"projectionMatrix");p= +w.getUniformLocation(B,"fogType");m=w.getUniformLocation(B,"fogDensity");q=w.getUniformLocation(B,"fogNear");r=w.getUniformLocation(B,"fogFar");s=w.getUniformLocation(B,"fogColor");u=w.getUniformLocation(B,"alphaTest");E=document.createElement("canvas");E.width=8;E.height=8;K=E.getContext("2d");K.fillStyle="white";K.fillRect(0,0,8,8);G=new THREE.Texture(E);G.needsUpdate=!0}w.useProgram(B);D.initAttributes();D.enableAttribute(x);D.enableAttribute(v);D.disableUnusedAttributes();D.disable(w.CULL_FACE); +D.enable(w.BLEND);w.bindBuffer(w.ARRAY_BUFFER,A);w.vertexAttribPointer(x,2,w.FLOAT,!1,16,0);w.vertexAttribPointer(v,2,w.FLOAT,!1,16,8);w.bindBuffer(w.ELEMENT_ARRAY_BUFFER,y);w.uniformMatrix4fv(n,!1,N.projectionMatrix.elements);D.activeTexture(w.TEXTURE0);w.uniform1i(h,0);K=E=0;(I=O.fog)?(w.uniform3f(s,I.color.r,I.color.g,I.color.b),I instanceof THREE.Fog?(w.uniform1f(q,I.near),w.uniform1f(r,I.far),w.uniform1i(p,1),K=E=1):I instanceof THREE.FogExp2&&(w.uniform1f(m,I.density),w.uniform1i(p,2),K=E=2)): +(w.uniform1i(p,0),K=E=0);for(var I=0,L=b.length;Ic)return null;var d=[],e=[],f=[],g,h,k;if(0=l--){console.warn("THREE.ShapeUtils: Unable to triangulate polygon! 
in triangulate()");break}g=h;c<=g&&(g=0);h=g+1;c<=h&&(h=0);k=h+1;c<=k&&(k=0);var n;a:{var p= +n=void 0,m=void 0,q=void 0,r=void 0,s=void 0,u=void 0,x=void 0,v=void 0,p=a[e[g]].x,m=a[e[g]].y,q=a[e[h]].x,r=a[e[h]].y,s=a[e[k]].x,u=a[e[k]].y;if(Number.EPSILON>(q-p)*(u-m)-(r-m)*(s-p))n=!1;else{var C=void 0,w=void 0,D=void 0,A=void 0,y=void 0,B=void 0,G=void 0,z=void 0,H=void 0,M=void 0,H=z=G=v=x=void 0,C=s-q,w=u-r,D=p-s,A=m-u,y=q-p,B=r-m;for(n=0;n=-Number.EPSILON&& +z>=-Number.EPSILON&&G>=-Number.EPSILON)){n=!1;break a}n=!0}}if(n){d.push([a[e[g]],a[e[h]],a[e[k]]]);f.push([e[g],e[h],e[k]]);g=h;for(k=h+1;kNumber.EPSILON){if(0B||B> +y)return[];k=l*n-k*p;if(0>k||k>y)return[]}else{if(0d?[]:k===d?f?[]:[g]:a<=d?[g,h]:[g,l]}function e(a,b,c,d){var e=b.x-a.x,f=b.y-a.y;b=c.x-a.x;c=c.y-a.y;var g=d.x-a.x;d=d.y-a.y;a=e*c-f*b;e=e*d-f*g;return Math.abs(a)>Number.EPSILON?(b=g*c-d*b,0f&&(f=d);var g=a+1;g>d&&(g=0);d=e(h[a],h[f],h[g],k[b]);if(!d)return!1;d=k.length-1;f=b-1;0>f&&(f=d);g=b+1;g>d&&(g=0);return(d=e(k[b],k[f],k[g],h[a]))?!0:!1}function f(a,b){var c,e;for(c=0;cN){console.log("Infinite Loop! 
Holes left:"+l.length+", Probably Hole outside Shape!");break}for(p=z;ph;h++)l=k[h].x+":"+k[h].y,l=n[l],void 0!==l&&(k[h]=l);return p.concat()},isClockWise:function(a){return 0>THREE.ShapeUtils.area(a)},b2:function(){return function(a,b,c,d){var e=1-a;return e*e*b+2*(1-a)*a*c+a*a*d}}(),b3:function(){return function(a,b,c,d,e){var f= +1-a,g=1-a;return f*f*f*b+3*g*g*a*c+3*(1-a)*a*a*d+a*a*a*e}}()};THREE.Curve=function(){}; +THREE.Curve.prototype={constructor:THREE.Curve,getPoint:function(a){console.warn("THREE.Curve: Warning, getPoint() not implemented!");return null},getPointAt:function(a){a=this.getUtoTmapping(a);return this.getPoint(a)},getPoints:function(a){a||(a=5);var b,c=[];for(b=0;b<=a;b++)c.push(this.getPoint(b/a));return c},getSpacedPoints:function(a){a||(a=5);var b,c=[];for(b=0;b<=a;b++)c.push(this.getPointAt(b/a));return c},getLength:function(){var a=this.getLengths();return a[a.length-1]},getLengths:function(a){a|| +(a=this.__arcLengthDivisions?this.__arcLengthDivisions:200);if(this.cacheArcLengths&&this.cacheArcLengths.length===a+1&&!this.needsUpdate)return this.cacheArcLengths;this.needsUpdate=!1;var b=[],c,d=this.getPoint(0),e,f=0;b.push(0);for(e=1;e<=a;e++)c=this.getPoint(e/a),f+=c.distanceTo(d),b.push(f),d=c;return this.cacheArcLengths=b},updateArcLengths:function(){this.needsUpdate=!0;this.getLengths()},getUtoTmapping:function(a,b){var c=this.getLengths(),d=0,e=c.length,f;f=b?b:a*c[e-1];for(var g=0,h=e- +1,k;g<=h;)if(d=Math.floor(g+(h-g)/2),k=c[d]-f,0>k)g=d+1;else if(0b&&(b=0);1=b)return a=this.curves[d],b=1-(c[d]-b)/a.getLength(),a.getPointAt(b);d++}return null},getLength:function(){var a= +this.getCurveLengths();return a[a.length-1]},getCurveLengths:function(){if(this.cacheLengths&&this.cacheLengths.length===this.curves.length)return this.cacheLengths;for(var a=[],b=0,c=0,d=this.curves.length;cNumber.EPSILON){if(0>l&&(g=b[f],k=-k,h=b[e],l=-l),!(a.yh.y))if(a.y===g.y){if(a.x=== 
+g.x)return!0}else{e=l*(a.x-g.x)-k*(a.y-g.y);if(0===e)return!0;0>e||(d=!d)}}else if(a.y===g.y&&(h.x<=a.x&&a.x<=g.x||g.x<=a.x&&a.x<=h.x))return!0}return d}var e=THREE.ShapeUtils.isClockWise,f=function(a){for(var b=[],c=new THREE.Path,d=0,e=a.length;db.length-2?b.length-1:c+1],b=b[c>b.length-3?b.length-1:c+2],c=THREE.CurveUtils.interpolate;return new THREE.Vector2(c(d.x,e.x,f.x,b.x,a),c(d.y,e.y,f.y,b.y,a))}; +THREE.EllipseCurve=function(a,b,c,d,e,f,g,h){this.aX=a;this.aY=b;this.xRadius=c;this.yRadius=d;this.aStartAngle=e;this.aEndAngle=f;this.aClockwise=g;this.aRotation=h||0};THREE.EllipseCurve.prototype=Object.create(THREE.Curve.prototype);THREE.EllipseCurve.prototype.constructor=THREE.EllipseCurve; +THREE.EllipseCurve.prototype.getPoint=function(a){var b=this.aEndAngle-this.aStartAngle;0>b&&(b+=2*Math.PI);b>2*Math.PI&&(b-=2*Math.PI);b=!0===this.aClockwise?this.aEndAngle+(1-a)*(2*Math.PI-b):this.aStartAngle+a*b;a=this.aX+this.xRadius*Math.cos(b);var c=this.aY+this.yRadius*Math.sin(b);if(0!==this.aRotation){var b=Math.cos(this.aRotation),d=Math.sin(this.aRotation),e=a;a=(e-this.aX)*b-(c-this.aY)*d+this.aX;c=(e-this.aX)*d+(c-this.aY)*b+this.aY}return new THREE.Vector2(a,c)}; +THREE.ArcCurve=function(a,b,c,d,e,f){THREE.EllipseCurve.call(this,a,b,c,c,d,e,f)};THREE.ArcCurve.prototype=Object.create(THREE.EllipseCurve.prototype);THREE.ArcCurve.prototype.constructor=THREE.ArcCurve;THREE.LineCurve3=THREE.Curve.create(function(a,b){this.v1=a;this.v2=b},function(a){var b=new THREE.Vector3;b.subVectors(this.v2,this.v1);b.multiplyScalar(a);b.add(this.v1);return b}); +THREE.QuadraticBezierCurve3=THREE.Curve.create(function(a,b,c){this.v0=a;this.v1=b;this.v2=c},function(a){var b=THREE.ShapeUtils.b2;return new THREE.Vector3(b(a,this.v0.x,this.v1.x,this.v2.x),b(a,this.v0.y,this.v1.y,this.v2.y),b(a,this.v0.z,this.v1.z,this.v2.z))}); +THREE.CubicBezierCurve3=THREE.Curve.create(function(a,b,c,d){this.v0=a;this.v1=b;this.v2=c;this.v3=d},function(a){var b=THREE.ShapeUtils.b3;return new 
THREE.Vector3(b(a,this.v0.x,this.v1.x,this.v2.x,this.v3.x),b(a,this.v0.y,this.v1.y,this.v2.y,this.v3.y),b(a,this.v0.z,this.v1.z,this.v2.z,this.v3.z))}); +THREE.SplineCurve3=THREE.Curve.create(function(a){console.warn("THREE.SplineCurve3 will be deprecated. Please use THREE.CatmullRomCurve3");this.points=void 0==a?[]:a},function(a){var b=this.points;a*=b.length-1;var c=Math.floor(a);a-=c;var d=b[0==c?c:c-1],e=b[c],f=b[c>b.length-2?b.length-1:c+1],b=b[c>b.length-3?b.length-1:c+2],c=THREE.CurveUtils.interpolate;return new THREE.Vector3(c(d.x,e.x,f.x,b.x,a),c(d.y,e.y,f.y,b.y,a),c(d.z,e.z,f.z,b.z,a))}); +THREE.CatmullRomCurve3=function(){function a(){}var b=new THREE.Vector3,c=new a,d=new a,e=new a;a.prototype.init=function(a,b,c,d){this.c0=a;this.c1=c;this.c2=-3*a+3*b-2*c-d;this.c3=2*a-2*b+c+d};a.prototype.initNonuniformCatmullRom=function(a,b,c,d,e,n,p){a=((b-a)/e-(c-a)/(e+n)+(c-b)/n)*n;d=((c-b)/n-(d-b)/(n+p)+(d-c)/p)*n;this.init(b,c,a,d)};a.prototype.initCatmullRom=function(a,b,c,d,e){this.init(b,c,e*(c-a),e*(d-b))};a.prototype.calc=function(a){var b=a*a;return this.c0+this.c1*a+this.c2*b+this.c3* +b*a};return THREE.Curve.create(function(a){this.points=a||[];this.closed=!1},function(a){var g=this.points,h,k;k=g.length;2>k&&console.log("duh, you need at least 2 points");a*=k-(this.closed?0:1);h=Math.floor(a);a-=h;this.closed?h+=0h&&(h=1);1E-4>k&&(k=h);1E-4>m&&(m=h);c.initNonuniformCatmullRom(l.x,n.x,p.x,g.x,k,h,m);d.initNonuniformCatmullRom(l.y,n.y,p.y,g.y,k,h,m);e.initNonuniformCatmullRom(l.z,n.z,p.z,g.z,k,h,m)}else"catmullrom"===this.type&&(k=void 0!==this.tension?this.tension:.5,c.initCatmullRom(l.x,n.x,p.x,g.x, +k),d.initCatmullRom(l.y,n.y,p.y,g.y,k),e.initCatmullRom(l.z,n.z,p.z,g.z,k));return new THREE.Vector3(c.calc(a),d.calc(a),e.calc(a))})}();THREE.ClosedSplineCurve3=function(a){console.warn("THREE.ClosedSplineCurve3 has been deprecated. 
Please use THREE.CatmullRomCurve3.");THREE.CatmullRomCurve3.call(this,a);this.type="catmullrom";this.closed=!0};THREE.ClosedSplineCurve3.prototype=Object.create(THREE.CatmullRomCurve3.prototype); +THREE.BoxGeometry=function(a,b,c,d,e,f){THREE.Geometry.call(this);this.type="BoxGeometry";this.parameters={width:a,height:b,depth:c,widthSegments:d,heightSegments:e,depthSegments:f};this.fromBufferGeometry(new THREE.BoxBufferGeometry(a,b,c,d,e,f));this.mergeVertices()};THREE.BoxGeometry.prototype=Object.create(THREE.Geometry.prototype);THREE.BoxGeometry.prototype.constructor=THREE.BoxGeometry;THREE.CubeGeometry=THREE.BoxGeometry; +THREE.BoxBufferGeometry=function(a,b,c,d,e,f){function g(a,b,c,d,e,f,g,k,l,M,O){var N=f/l,E=g/M,K=f/2,I=g/2,L=k/2;g=l+1;for(var P=M+1,Q=f=0,R=new THREE.Vector3,F=0;Fm;m++){e[0]=p[g[m]];e[1]=p[g[(m+1)%3]];e.sort(c);var q=e.toString();void 0===f[q]?f[q]={vert1:e[0],vert2:e[1],face1:l, +face2:void 0}:f[q].face2=l}e=[];for(q in f)if(g=f[q],void 0===g.face2||h[g.face1].normal.dot(h[g.face2].normal)<=d)l=k[g.vert1],e.push(l.x),e.push(l.y),e.push(l.z),l=k[g.vert2],e.push(l.x),e.push(l.y),e.push(l.z);this.addAttribute("position",new THREE.BufferAttribute(new Float32Array(e),3))};THREE.EdgesGeometry.prototype=Object.create(THREE.BufferGeometry.prototype);THREE.EdgesGeometry.prototype.constructor=THREE.EdgesGeometry; +THREE.ExtrudeGeometry=function(a,b){"undefined"!==typeof a&&(THREE.Geometry.call(this),this.type="ExtrudeGeometry",a=Array.isArray(a)?a:[a],this.addShapeList(a,b),this.computeFaceNormals())};THREE.ExtrudeGeometry.prototype=Object.create(THREE.Geometry.prototype);THREE.ExtrudeGeometry.prototype.constructor=THREE.ExtrudeGeometry;THREE.ExtrudeGeometry.prototype.addShapeList=function(a,b){for(var c=a.length,d=0;dNumber.EPSILON){var k=Math.sqrt(h),l=Math.sqrt(f*f+g*g),h=b.x-e/k;b=b.y+d/k;f=((c.x-g/l-h)*g-(c.y+f/l-b)*f)/(d*g-e*f);c=h+d*f-a.x;a=b+e*f-a.y;d=c*c+a*a;if(2>=d)return new THREE.Vector2(c,a);d=Math.sqrt(d/2)}else a=!1,d>Number.EPSILON? 
+f>Number.EPSILON&&(a=!0):d<-Number.EPSILON?f<-Number.EPSILON&&(a=!0):Math.sign(e)===Math.sign(g)&&(a=!0),a?(c=-e,a=d,d=Math.sqrt(h)):(c=d,a=e,d=Math.sqrt(h/2));return new THREE.Vector2(c/d,a/d)}function e(a,b){var c,d;for(F=a.length;0<=--F;){c=F;d=F-1;0>d&&(d=a.length-1);for(var e=0,f=q+2*n,e=0;eMath.abs(b.y-c.y)?[new THREE.Vector2(b.x,1-b.z),new THREE.Vector2(c.x,1-c.z),new THREE.Vector2(d.x,1-d.z),new THREE.Vector2(e.x,1-e.z)]:[new THREE.Vector2(b.y,1-b.z),new THREE.Vector2(c.y,1-c.z),new THREE.Vector2(d.y, +1-d.z),new THREE.Vector2(e.y,1-e.z)]}};THREE.ShapeGeometry=function(a,b){THREE.Geometry.call(this);this.type="ShapeGeometry";!1===Array.isArray(a)&&(a=[a]);this.addShapeList(a,b);this.computeFaceNormals()};THREE.ShapeGeometry.prototype=Object.create(THREE.Geometry.prototype);THREE.ShapeGeometry.prototype.constructor=THREE.ShapeGeometry;THREE.ShapeGeometry.prototype.addShapeList=function(a,b){for(var c=0,d=a.length;cNumber.EPSILON&&(h.normalize(),d=Math.acos(THREE.Math.clamp(e[l-1].dot(e[l]),-1,1)),f[l].applyMatrix4(k.makeRotationAxis(h,d))),g[l].crossVectors(e[l],f[l]);if(c)for(d=Math.acos(THREE.Math.clamp(f[0].dot(f[b-1]),-1,1)),d/=b-1,0c&&1===a.x&&(a=new THREE.Vector2(a.x-1,a.y));0===b.x&&0===b.z&&(a=new THREE.Vector2(c/ +2/Math.PI+.5,a.y));return a.clone()}THREE.Geometry.call(this);this.type="PolyhedronGeometry";this.parameters={vertices:a,indices:b,radius:c,detail:d};c=c||1;d=d||0;for(var k=this,l=0,n=a.length;lq&&(.2>d&&(b[0].x+=1),.2>a&&(b[1].x+=1),.2>p&&(b[2].x+=1));l=0;for(n=this.vertices.length;lp;p++){c[0]=n[e[p]];c[1]=n[e[(p+1)%3]];c.sort(b);var m=c.toString();void 0===d[m]&&(k[2*h]=c[0],k[2*h+1]=c[1],d[m]=!0,h++)}c=new Float32Array(6*h);a=0;for(l=h;ap;p++)d=f[k[2*a+p]],h=6*a+3*p,c[h+0]=d.x,c[h+1]=d.y, +c[h+2]=d.z;this.addAttribute("position",new THREE.BufferAttribute(c,3))}else if(a instanceof THREE.BufferGeometry){if(null!==a.index){l=a.index.array;f=a.attributes.position;e=a.groups;h=0;0===e.length&&a.addGroup(0,l.length);k=new 
Uint32Array(2*l.length);g=0;for(n=e.length;gp;p++)c[0]=l[a+p],c[1]=l[a+(p+1)%3],c.sort(b),m=c.toString(),void 0===d[m]&&(k[2*h]=c[0],k[2*h+1]=c[1],d[m]=!0,h++)}c=new Float32Array(6*h);a=0;for(l=h;a< +l;a++)for(p=0;2>p;p++)h=6*a+3*p,d=k[2*a+p],c[h+0]=f.getX(d),c[h+1]=f.getY(d),c[h+2]=f.getZ(d)}else for(f=a.attributes.position.array,h=f.length/3,k=h/3,c=new Float32Array(6*h),a=0,l=k;ap;p++)h=18*a+6*p,k=9*a+3*p,c[h+0]=f[k],c[h+1]=f[k+1],c[h+2]=f[k+2],d=9*a+(p+1)%3*3,c[h+3]=f[d],c[h+4]=f[d+1],c[h+5]=f[d+2];this.addAttribute("position",new THREE.BufferAttribute(c,3))}};THREE.WireframeGeometry.prototype=Object.create(THREE.BufferGeometry.prototype); +THREE.WireframeGeometry.prototype.constructor=THREE.WireframeGeometry;THREE.AxisHelper=function(a){a=a||1;var b=new Float32Array([0,0,0,a,0,0,0,0,0,0,a,0,0,0,0,0,0,a]),c=new Float32Array([1,0,0,1,.6,0,0,1,0,.6,1,0,0,0,1,0,.6,1]);a=new THREE.BufferGeometry;a.addAttribute("position",new THREE.BufferAttribute(b,3));a.addAttribute("color",new THREE.BufferAttribute(c,3));b=new THREE.LineBasicMaterial({vertexColors:THREE.VertexColors});THREE.LineSegments.call(this,a,b)};THREE.AxisHelper.prototype=Object.create(THREE.LineSegments.prototype); +THREE.AxisHelper.prototype.constructor=THREE.AxisHelper; +THREE.ArrowHelper=function(){var a=new THREE.BufferGeometry;a.addAttribute("position",new THREE.Float32Attribute([0,0,0,0,1,0],3));var b=new THREE.CylinderBufferGeometry(0,.5,1,5,1);b.translate(0,-.5,0);return function(c,d,e,f,g,h){THREE.Object3D.call(this);void 0===f&&(f=16776960);void 0===e&&(e=1);void 0===g&&(g=.2*e);void 0===h&&(h=.2*g);this.position.copy(d);this.line=new THREE.Line(a,new THREE.LineBasicMaterial({color:f}));this.line.matrixAutoUpdate=!1;this.add(this.line);this.cone=new THREE.Mesh(b, +new 
THREE.MeshBasicMaterial({color:f}));this.cone.matrixAutoUpdate=!1;this.add(this.cone);this.setDirection(c);this.setLength(e,g,h)}}();THREE.ArrowHelper.prototype=Object.create(THREE.Object3D.prototype);THREE.ArrowHelper.prototype.constructor=THREE.ArrowHelper; +THREE.ArrowHelper.prototype.setDirection=function(){var a=new THREE.Vector3,b;return function(c){.99999c.y?this.quaternion.set(1,0,0,0):(a.set(c.z,0,-c.x).normalize(),b=Math.acos(c.y),this.quaternion.setFromAxisAngle(a,b))}}();THREE.ArrowHelper.prototype.setLength=function(a,b,c){void 0===b&&(b=.2*a);void 0===c&&(c=.2*b);this.line.scale.set(1,Math.max(0,a-b),1);this.line.updateMatrix();this.cone.scale.set(c,b,c);this.cone.position.y=a;this.cone.updateMatrix()}; +THREE.ArrowHelper.prototype.setColor=function(a){this.line.material.color.copy(a);this.cone.material.color.copy(a)};THREE.BoxHelper=function(a){var b=new Uint16Array([0,1,1,2,2,3,3,0,4,5,5,6,6,7,7,4,0,4,1,5,2,6,3,7]),c=new Float32Array(24),d=new THREE.BufferGeometry;d.setIndex(new THREE.BufferAttribute(b,1));d.addAttribute("position",new THREE.BufferAttribute(c,3));THREE.LineSegments.call(this,d,new THREE.LineBasicMaterial({color:16776960}));void 0!==a&&this.update(a)};THREE.BoxHelper.prototype=Object.create(THREE.LineSegments.prototype); +THREE.BoxHelper.prototype.constructor=THREE.BoxHelper; +THREE.BoxHelper.prototype.update=function(){var a=new THREE.Box3;return function(b){b instanceof THREE.Box3?a.copy(b):a.setFromObject(b);if(!a.isEmpty()){b=a.min;var c=a.max,d=this.geometry.attributes.position,e=d.array;e[0]=c.x;e[1]=c.y;e[2]=c.z;e[3]=b.x;e[4]=c.y;e[5]=c.z;e[6]=b.x;e[7]=b.y;e[8]=c.z;e[9]=c.x;e[10]=b.y;e[11]=c.z;e[12]=c.x;e[13]=c.y;e[14]=b.z;e[15]=b.x;e[16]=c.y;e[17]=b.z;e[18]=b.x;e[19]=b.y;e[20]=b.z;e[21]=c.x;e[22]=b.y;e[23]=b.z;d.needsUpdate=!0;this.geometry.computeBoundingSphere()}}}(); +THREE.BoundingBoxHelper=function(a,b){var c=void 0!==b?b:8947848;this.object=a;this.box=new THREE.Box3;THREE.Mesh.call(this,new THREE.BoxGeometry(1,1,1),new 
THREE.MeshBasicMaterial({color:c,wireframe:!0}))};THREE.BoundingBoxHelper.prototype=Object.create(THREE.Mesh.prototype);THREE.BoundingBoxHelper.prototype.constructor=THREE.BoundingBoxHelper;THREE.BoundingBoxHelper.prototype.update=function(){this.box.setFromObject(this.object);this.box.size(this.scale);this.box.center(this.position)}; +THREE.CameraHelper=function(a){function b(a,b,d){c(a,d);c(b,d)}function c(a,b){d.vertices.push(new THREE.Vector3);d.colors.push(new THREE.Color(b));void 0===f[a]&&(f[a]=[]);f[a].push(d.vertices.length-1)}var d=new THREE.Geometry,e=new THREE.LineBasicMaterial({color:16777215,vertexColors:THREE.FaceColors}),f={};b("n1","n2",16755200);b("n2","n4",16755200);b("n4","n3",16755200);b("n3","n1",16755200);b("f1","f2",16755200);b("f2","f4",16755200);b("f4","f3",16755200);b("f3","f1",16755200);b("n1","f1",16755200); +b("n2","f2",16755200);b("n3","f3",16755200);b("n4","f4",16755200);b("p","n1",16711680);b("p","n2",16711680);b("p","n3",16711680);b("p","n4",16711680);b("u1","u2",43775);b("u2","u3",43775);b("u3","u1",43775);b("c","t",16777215);b("p","c",3355443);b("cn1","cn2",3355443);b("cn3","cn4",3355443);b("cf1","cf2",3355443);b("cf3","cf4",3355443);THREE.LineSegments.call(this,d,e);this.camera=a;this.camera.updateProjectionMatrix();this.matrix=a.matrixWorld;this.matrixAutoUpdate=!1;this.pointMap=f;this.update()}; +THREE.CameraHelper.prototype=Object.create(THREE.LineSegments.prototype);THREE.CameraHelper.prototype.constructor=THREE.CameraHelper; +THREE.CameraHelper.prototype.update=function(){function a(a,g,h,k){d.set(g,h,k).unproject(e);a=c[a];if(void 0!==a)for(g=0,h=a.length;gd;d++)c.faces[d].color=this.colors[4>d?0:1];d=new THREE.MeshBasicMaterial({vertexColors:THREE.FaceColors,wireframe:!0});this.lightSphere=new THREE.Mesh(c,d);this.add(this.lightSphere);this.update()}; 
+THREE.HemisphereLightHelper.prototype=Object.create(THREE.Object3D.prototype);THREE.HemisphereLightHelper.prototype.constructor=THREE.HemisphereLightHelper;THREE.HemisphereLightHelper.prototype.dispose=function(){this.lightSphere.geometry.dispose();this.lightSphere.material.dispose()}; +THREE.HemisphereLightHelper.prototype.update=function(){var a=new THREE.Vector3;return function(){this.colors[0].copy(this.light.color).multiplyScalar(this.light.intensity);this.colors[1].copy(this.light.groundColor).multiplyScalar(this.light.intensity);this.lightSphere.lookAt(a.setFromMatrixPosition(this.light.matrixWorld).negate());this.lightSphere.geometry.colorsNeedUpdate=!0}}(); +THREE.PointLightHelper=function(a,b){this.light=a;this.light.updateMatrixWorld();var c=new THREE.SphereBufferGeometry(b,4,2),d=new THREE.MeshBasicMaterial({wireframe:!0,fog:!1});d.color.copy(this.light.color).multiplyScalar(this.light.intensity);THREE.Mesh.call(this,c,d);this.matrix=this.light.matrixWorld;this.matrixAutoUpdate=!1};THREE.PointLightHelper.prototype=Object.create(THREE.Mesh.prototype);THREE.PointLightHelper.prototype.constructor=THREE.PointLightHelper; +THREE.PointLightHelper.prototype.dispose=function(){this.geometry.dispose();this.material.dispose()};THREE.PointLightHelper.prototype.update=function(){this.material.color.copy(this.light.color).multiplyScalar(this.light.intensity)}; +THREE.SkeletonHelper=function(a){this.bones=this.getBoneList(a);for(var b=new THREE.Geometry,c=0;cc;c++,d++){var e=c/32*Math.PI*2,f=d/32*Math.PI*2;b.push(Math.cos(e),Math.sin(e),1,Math.cos(f),Math.sin(f),1)}a.addAttribute("position",new THREE.Float32Attribute(b,3));b=new THREE.LineBasicMaterial({fog:!1});this.cone=new THREE.LineSegments(a, 
+b);this.add(this.cone);this.update()};THREE.SpotLightHelper.prototype=Object.create(THREE.Object3D.prototype);THREE.SpotLightHelper.prototype.constructor=THREE.SpotLightHelper;THREE.SpotLightHelper.prototype.dispose=function(){this.cone.geometry.dispose();this.cone.material.dispose()}; +THREE.SpotLightHelper.prototype.update=function(){var a=new THREE.Vector3,b=new THREE.Vector3;return function(){var c=this.light.distance?this.light.distance:1E3,d=c*Math.tan(this.light.angle);this.cone.scale.set(d,d,c);a.setFromMatrixPosition(this.light.matrixWorld);b.setFromMatrixPosition(this.light.target.matrixWorld);this.cone.lookAt(b.sub(a));this.cone.material.color.copy(this.light.color).multiplyScalar(this.light.intensity)}}(); +THREE.VertexNormalsHelper=function(a,b,c,d){this.object=a;this.size=void 0!==b?b:1;a=void 0!==c?c:16711680;d=void 0!==d?d:1;b=0;c=this.object.geometry;c instanceof THREE.Geometry?b=3*c.faces.length:c instanceof THREE.BufferGeometry&&(b=c.attributes.normal.count);c=new THREE.BufferGeometry;b=new THREE.Float32Attribute(6*b,3);c.addAttribute("position",b);THREE.LineSegments.call(this,c,new THREE.LineBasicMaterial({color:a,linewidth:d}));this.matrixAutoUpdate=!1;this.update()}; +THREE.VertexNormalsHelper.prototype=Object.create(THREE.LineSegments.prototype);THREE.VertexNormalsHelper.prototype.constructor=THREE.VertexNormalsHelper; +THREE.VertexNormalsHelper.prototype.update=function(){var a=new THREE.Vector3,b=new THREE.Vector3,c=new THREE.Matrix3;return function(){var d=["a","b","c"];this.object.updateMatrixWorld(!0);c.getNormalMatrix(this.object.matrixWorld);var e=this.object.matrixWorld,f=this.geometry.attributes.position,g=this.object.geometry;if(g instanceof THREE.Geometry)for(var h=g.vertices,k=g.faces,l=g=0,n=k.length;lh.end&&(h.end=f);c||(c=k)}}for(k in d)h=d[k],this.createAnimation(k,h.start,h.end,a);this.firstAnimation=c}; 
+THREE.MorphBlendMesh.prototype.setAnimationDirectionForward=function(a){if(a=this.animationsMap[a])a.direction=1,a.directionBackwards=!1};THREE.MorphBlendMesh.prototype.setAnimationDirectionBackward=function(a){if(a=this.animationsMap[a])a.direction=-1,a.directionBackwards=!0};THREE.MorphBlendMesh.prototype.setAnimationFPS=function(a,b){var c=this.animationsMap[a];c&&(c.fps=b,c.duration=(c.end-c.start)/c.fps)}; +THREE.MorphBlendMesh.prototype.setAnimationDuration=function(a,b){var c=this.animationsMap[a];c&&(c.duration=b,c.fps=(c.end-c.start)/c.duration)};THREE.MorphBlendMesh.prototype.setAnimationWeight=function(a,b){var c=this.animationsMap[a];c&&(c.weight=b)};THREE.MorphBlendMesh.prototype.setAnimationTime=function(a,b){var c=this.animationsMap[a];c&&(c.time=b)};THREE.MorphBlendMesh.prototype.getAnimationTime=function(a){var b=0;if(a=this.animationsMap[a])b=a.time;return b}; +THREE.MorphBlendMesh.prototype.getAnimationDuration=function(a){var b=-1;if(a=this.animationsMap[a])b=a.duration;return b};THREE.MorphBlendMesh.prototype.playAnimation=function(a){var b=this.animationsMap[a];b?(b.time=0,b.active=!0):console.warn("THREE.MorphBlendMesh: animation["+a+"] undefined in .playAnimation()")};THREE.MorphBlendMesh.prototype.stopAnimation=function(a){if(a=this.animationsMap[a])a.active=!1}; +THREE.MorphBlendMesh.prototype.update=function(a){for(var b=0,c=this.animationsList.length;bd.duration||0>d.time)d.direction*=-1,d.time>d.duration&&(d.time=d.duration,d.directionBackwards=!0),0>d.time&&(d.time=0,d.directionBackwards=!1)}else d.time%=d.duration,0>d.time&&(d.time+=d.duration);var f=d.start+THREE.Math.clamp(Math.floor(d.time/e),0,d.length-1),g=d.weight;f!==d.currentFrame&& 
+(this.morphTargetInfluences[d.lastFrame]=0,this.morphTargetInfluences[d.currentFrame]=1*g,this.morphTargetInfluences[f]=0,d.lastFrame=d.currentFrame,d.currentFrame=f);e=d.time%e/e;d.directionBackwards&&(e=1-e);d.currentFrame!==d.lastFrame?(this.morphTargetInfluences[d.currentFrame]=e*g,this.morphTargetInfluences[d.lastFrame]=(1-e)*g):this.morphTargetInfluences[d.currentFrame]=g}}}; diff --git a/src/interface/desktop/chat.html b/src/interface/desktop/chat.html index 21a1a416..ebf93195 100644 --- a/src/interface/desktop/chat.html +++ b/src/interface/desktop/chat.html @@ -8,6 +8,8 @@ + + -
+
+ @@ -284,8 +399,7 @@ @@ -298,8 +412,8 @@ } body { display: grid; - background: #fff; - color: #475569; + background: var(--background-color); + color: var(--main-text-color); text-align: center; font-family: roboto, karma, segoe ui, sans-serif; font-size: small; @@ -433,6 +547,83 @@ box-shadow: 0 0 12px rgb(119, 156, 46); } + div.collapsed { + display: none; + } + + div.expanded { + display: block; + } + + div.reference { + display: grid; + grid-template-rows: auto; + grid-auto-flow: row; + grid-column-gap: 10px; + grid-row-gap: 10px; + margin: 10px; + } + + div.expanded.reference-section { + display: grid; + grid-template-rows: auto; + grid-auto-flow: row; + grid-column-gap: 10px; + grid-row-gap: 10px; + margin: 10px; + } + + button.reference-button { + background: var(--background-color); + color: var(--main-text-color); + border: 1px solid var(--main-text-color); + border-radius: 5px; + padding: 5px; + font-size: 14px; + font-weight: 300; + line-height: 1.5em; + cursor: pointer; + transition: background 0.2s ease-in-out; + text-align: left; + max-height: 75px; + transition: max-height 0.3s ease-in-out; + overflow: hidden; + } + button.reference-button.expanded { + max-height: none; + } + + button.reference-button::before { + content: "▶"; + margin-right: 5px; + display: inline-block; + transition: transform 0.3s ease-in-out; + } + + button.reference-button:active:before, + button.reference-button[aria-expanded="true"]::before { + transform: rotate(90deg); + } + + button.reference-expand-button { + background: var(--background-color); + color: var(--main-text-color); + border: 1px dotted var(--main-text-color); + border-radius: 5px; + padding: 5px; + font-size: 14px; + font-weight: 300; + line-height: 1.5em; + cursor: pointer; + transition: background 0.4s ease-in-out; + text-align: left; + } + + button.reference-expand-button:hover { + background: var(--primary-hover); + } + + .option-enabled:focus { outline: none !important; border:1px solid #475569; @@ -445,6 
+636,11 @@ border-bottom: 1px dotted #475569; } + div.khoj-empty-container { + padding: 0; + margin: 0; + } + @media (pointer: coarse), (hover: none) { abbr[title] { position: relative; @@ -481,12 +677,6 @@ margin: 4px; grid-template-columns: auto; } - a.khoj-banner { - display: block; - } - p.khoj-banner { - padding: 0; - } } @media only screen and (min-width: 600px) { body { @@ -498,11 +688,6 @@ } } - div.khoj-banner-container { - padding: 0px; - margin: 0px; - } - div#chat-tooltip { text-align: left; font-size: medium; @@ -524,23 +709,6 @@ text-align: center; } - button#khoj-banner-submit, - input#khoj-banner-email { - padding: 10px; - border-radius: 5px; - border: 1px solid #475569; - background: #f9fafc; - } - - button#khoj-banner-submit:hover, - input#khoj-banner-email:hover { - box-shadow: 0 0 11px #aaa; - } - div.khoj-banner-container-hidden { - margin: 0px; - padding: 0px; - } - div.programmatic-output { background-color: #f5f5f5; border: 1px solid #ddd; diff --git a/src/interface/desktop/config.html b/src/interface/desktop/config.html index 04599bb1..fb39fbb8 100644 --- a/src/interface/desktop/config.html +++ b/src/interface/desktop/config.html @@ -2,89 +2,105 @@ - Khoj - Search + Khoj - Settings - - + -
- -
- - -
+ +
-
-
- File -

- Host -

+
+
+
+ Khoj Server URL +

+ Server URL +

+
+
+ +
+
+ Khoj Access Key +

+ API Key +

+
+
+ +
-
- -
-
- File -

- Files -

+
+
+
+ File +

+ Files + +

+
+
+
+
+
+ - +
-
-
-
-
- -
-
- Folder -

- Folders -

+
+
+
+ Folder +

+ Folders + +

+
+
+
+
+
+ - +
+
+
+
+
+
-
-
-
- -
-
- -
-
- - -
- -
-
+
+ +
+
+
@@ -93,7 +109,7 @@ body { display: grid; grid-template-columns: 1fr; - grid-template-rows: 1fr auto auto auto minmax(80px, 100%); + grid-template-rows: 1fr auto; font-size: small!important; } body > * { @@ -104,8 +120,7 @@ body { display: grid; grid-template-columns: 1fr min(70vw, 100%) 1fr; - grid-template-rows: 1fr auto auto auto minmax(80px, 100%); - padding-top: 60vw; + grid-template-rows: 80px auto; } body > * { grid-column: 2; @@ -114,7 +129,7 @@ body, input { padding: 0px; margin: 0px; - background: #fff; + background: var(--background-color); color: #475569; font-family: roboto, karma, segoe ui, sans-serif; font-size: small; @@ -126,11 +141,6 @@ margin: 10px; } - div.page { - padding: 0px; - margin: 0px; - } - svg { transition: transform 0.3s ease-in-out; } @@ -167,19 +177,19 @@ } } - #khoj-host-url { + .card-input { padding: 4px; - box-shadow: 0 0 2px 1px rgba(0, 0, 0, 0.2); + box-shadow: 0 0 2px 1px rgba(0, 0, 0, 0.3); border: none; + width: 450px; } .card { display: grid; - /* grid-template-rows: repeat(3, 1fr); */ gap: 8px; padding: 24px 16px; - width: 100%; - background: white; + width: 450px; + background: var(--background-color); border: 1px solid rgb(229, 229, 229); border-radius: 4px; box-shadow: 0px 1px 3px 0px rgba(0,0,0,0.1),0px 1px 2px -1px rgba(0,0,0,0.1); @@ -188,15 +198,15 @@ .section-cards { display: grid; - grid-template-columns: repeat(1, 1fr); gap: 16px; - justify-items: start; + justify-items: center; margin: 0; - width: auto; } - - div.configuration { - width: auto; + .section-action-row { + display: grid; + grid-auto-flow: column; + gap: 16px; + height: fit-content; } .card-title-row { @@ -247,7 +257,7 @@ } .primary-button { border: none; - color: white; + color: var(--background-color); padding: 15px 32px; text-align: center; text-decoration: none; @@ -256,7 +266,7 @@ } button.card-button.disabled { - color: rgb(255, 136, 136); + color: var(--flower); background: transparent; font-size: small; cursor: pointer; @@ -268,11 +278,7 @@ } 
button.card-button.happy { - color: rgb(0, 146, 0); - } - - button.card-button.happy { - color: rgb(0, 146, 0); + color: var(--leaf); } img.configured-icon { @@ -296,13 +302,14 @@ div.folder-element { display: grid; grid-template-columns: auto 1fr; - box-shadow: 0 0 2px 1px rgba(0, 0, 0, 0.2); + border: 1px solid rgb(229, 229, 229); + border-radius: 4px; + box-shadow: 0px 1px 3px 0px rgba(0,0,0,0.1),0px 1px 2px -1px rgba(0,0,0,0.8); padding: 4px; margin-bottom: 8px; } div.content-name { - width: 500px; overflow-wrap: break-word; } @@ -315,7 +322,7 @@ background-color: rgb(253 214 214); border-radius: 3px; border: none; - color: rgb(207, 67, 59); + color: var(--flower); padding: 4px; } @@ -324,14 +331,14 @@ background-color: rgb(255 235 235); border-radius: 3px; border: none; - color: rgb(207, 67, 59); + color: var(--flower); padding: 4px; cursor: pointer; } - #sync-data { - background-color: #ffb300; + button.sync-data { + background-color: var(--primary); border: none; - color: white; + color: var(--main-text-color); padding: 12px; text-align: center; text-decoration: none; @@ -340,43 +347,20 @@ border-radius: 4px; cursor: pointer; transition: background-color 0.3s ease; - box-shadow: 0px 5px 0px #f9f5de; + box-shadow: 0px 5px 0px var(--background-color); } - #sync-data:hover { - background-color: #ffcc00; - box-shadow: 0px 3px 0px #f9f5de; + button.sync-data:hover { + background-color: var(--primary-hover); + box-shadow: 0px 3px 0px var(--background-color); + } + .sync-force-toggle { + align-content: center; + display: grid; + grid-auto-flow: column; + gap: 4px; } - diff --git a/src/interface/desktop/loading-animation.js b/src/interface/desktop/loading-animation.js new file mode 100644 index 00000000..d9bffb57 --- /dev/null +++ b/src/interface/desktop/loading-animation.js @@ -0,0 +1,129 @@ +var $wrap = document.getElementById('loading-animation'), + +canvassize = 380, + +length = 40, +radius = 6.8, + +rotatevalue = 0.02, +acceleration = 0, +animatestep = 0, 
+toend = false, + +pi2 = Math.PI*2, + +group = new THREE.Group(), +mesh, ringcover, ring, + +camera, scene, renderer; + + +camera = new THREE.PerspectiveCamera(65, 1, 1, 10000); +camera.position.z = 120; + +scene = new THREE.Scene(); +// scene.add(new THREE.AxisHelper(30)); +scene.add(group); + +mesh = new THREE.Mesh( + new THREE.TubeGeometry(new (THREE.Curve.create(function() {}, + function(percent) { + + var x = length*Math.sin(pi2*percent), + y = radius*Math.cos(pi2*3*percent), + z, t; + + t = percent%0.25/0.25; + t = percent%0.25-(2*(1-t)*t* -0.0185 +t*t*0.25); + if (Math.floor(percent/0.25) == 0 || Math.floor(percent/0.25) == 2) { + t *= -1; + } + z = radius*Math.sin(pi2*2* (percent-t)); + + return new THREE.Vector3(x, y, z); + + } + ))(), 200, 1.1, 2, true), + new THREE.MeshBasicMaterial({ + color: 0xfcc50b + // , wireframe: true + }) +); +group.add(mesh); + +ringcover = new THREE.Mesh(new THREE.PlaneGeometry(50, 15, 1), new THREE.MeshBasicMaterial({color: 0xd1684e, opacity: 0, transparent: true})); +ringcover.position.x = length+1; +ringcover.rotation.y = Math.PI/2; +group.add(ringcover); + +ring = new THREE.Mesh(new THREE.RingGeometry(4.3, 5.55, 32), new THREE.MeshBasicMaterial({color: 0xfcc50b, opacity: 0, transparent: true})); +ring.position.x = length+1.1; +ring.rotation.y = Math.PI/2; +group.add(ring); + +// fake shadow +(function() { + var plain, i; + for (i = 0; i < 10; i++) { + plain = new THREE.Mesh(new THREE.PlaneGeometry(length*2+1, radius*3, 1), new THREE.MeshBasicMaterial({color: 0xd1684e, transparent: true, opacity: 0.15})); + plain.position.z = -2.5+i*0.5; + group.add(plain); + } +})(); + +renderer = new THREE.WebGLRenderer({ + antialias: true +}); +renderer.setPixelRatio(window.devicePixelRatio); +renderer.setSize(canvassize, canvassize); +renderer.setClearColor('#d1684e'); + + +$wrap.appendChild(renderer.domElement); + +function start() { + toend = true; +} + +function back() { + toend = false; +} + +function tilt(percent) { + 
group.rotation.y = percent*0.5; +} + +function render() { + var progress; + + animatestep = Math.max(0, Math.min(240, toend ? animatestep+1 : animatestep-4)); + acceleration = easing(animatestep, 0, 1, 240); + + if (acceleration > 0.35) { + progress = (acceleration-0.35)/0.65; + group.rotation.y = -Math.PI/2 *progress; + group.position.z = 20*progress; + progress = Math.max(0, (acceleration-0.99)/0.01); + mesh.material.opacity = 1-progress; + ringcover.material.opacity = ring.material.opacity = progress; + ring.scale.x = ring.scale.y = 0.9 + 0.1*progress; + } + + renderer.render(scene, camera); + +} + +function animate() { + mesh.rotation.x += rotatevalue + acceleration*Math.sin(Math.PI*acceleration); + render(); + requestAnimationFrame(animate); +} + +function easing(t, b, c, d) { + if ((t /= d/2) < 1) + return c/2*t*t+b; + return c/2*((t-=2)*t*t+2)+b; +} + +animate(); +setTimeout(start, 30); diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index fd75e3a7..eb355a5f 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -1,5 +1,6 @@ -const { app, BrowserWindow, ipcMain } = require('electron'); +const { app, BrowserWindow, ipcMain, Tray, Menu, nativeImage, shell } = require('electron'); const todesktop = require("@todesktop/runtime"); +const khojPackage = require('./package.json'); todesktop.init(); @@ -9,7 +10,7 @@ const {dialog} = require('electron'); const cron = require('cron').CronJob; const axios = require('axios'); -const KHOJ_URL = 'http://127.0.0.1:42110' +const KHOJ_URL = 'https://app.khoj.dev'; const Store = require('electron-store'); @@ -42,6 +43,10 @@ const schema = { }, default: [] }, + khojToken: { + type: 'string', + default: '' + }, hostURL: { type: 'string', default: KHOJ_URL @@ -62,8 +67,8 @@ const schema = { } }; +let syncing = false; var state = {} - const store = new Store({ schema }); console.log(store); @@ -106,6 +111,15 @@ function filenameToMimeType (filename) { } function pushDataToKhoj 
(regenerate = false) { + // Don't sync if token or hostURL is not set or if already syncing + if (store.get('khojToken') === '' || store.get('hostURL') === '' || syncing === true) { + const win = BrowserWindow.getAllWindows()[0]; + if (win) win.webContents.send('update-state', state); + return; + } else { + syncing = true; + } + let filesToPush = []; const files = store.get('files') || []; const folders = store.get('folders') || []; @@ -168,7 +182,7 @@ function pushDataToKhoj (regenerate = false) { if (!!formData?.entries()?.next().value) { const hostURL = store.get('hostURL') || KHOJ_URL; const headers = { - 'x-api-key': 'secret' + 'Authorization': `Bearer ${store.get("khojToken")}` }; axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers }) .then(response => { @@ -188,11 +202,13 @@ function pushDataToKhoj (regenerate = false) { }) .finally(() => { // Syncing complete + syncing = false; const win = BrowserWindow.getAllWindows()[0]; if (win) win.webContents.send('update-state', state); }); } else { // Syncing complete + syncing = false; const win = BrowserWindow.getAllWindows()[0]; if (win) win.webContents.send('update-state', state); } @@ -246,6 +262,15 @@ async function handleFileOpen (type) { } } +async function getToken () { + return store.get('khojToken'); +} + +async function setToken (event, token) { + store.set('khojToken', token); + return store.get('khojToken'); +} + async function getFiles () { return store.get('files'); } @@ -255,6 +280,12 @@ async function getFolders () { } async function setURL (event, url) { + // Sanitize the URL. Remove trailing slash if present. Add http:// if not present. 
+ url = url.replace(/\/$/, ""); + if (!url.match(/^[a-zA-Z]+:\/\//)) { + url = `http://${url}`; + } + store.set('hostURL', url); return store.get('hostURL'); } @@ -287,10 +318,26 @@ async function syncData (regenerate = false) { } } -const createWindow = () => { - const win = new BrowserWindow({ +async function deleteAllFiles () { + try { + store.set('files', []); + store.set('folders', []); + pushDataToKhoj(true); + const date = new Date(); + console.log('Pushing data to Khoj at: ', date); + } catch (err) { + console.error(err); + } +} + + +let firstRun = true; +let win = null; +const createWindow = (tab = 'chat.html') => { + win = new BrowserWindow({ width: 800, height: 800, + show: false, // titleBarStyle: 'hidden', webPreferences: { preload: path.join(__dirname, 'preload.js'), @@ -311,12 +358,30 @@ const createWindow = () => { win.setResizable(true); win.setOpacity(0.95); - win.setBackgroundColor('#FFFFFF'); + win.setBackgroundColor('#f5f4f3'); win.setHasShadow(true); job.start(); - win.loadFile('index.html') + win.loadFile(tab) + + if (firstRun === true) { + firstRun = false; + + // Create splash screen + var splash = new BrowserWindow({width: 400, height: 400, transparent: true, frame: false, alwaysOnTop: true}); + splash.setOpacity(1.0); + splash.setBackgroundColor('#d16b4e'); + splash.loadFile('splash.html'); + + // Show splash screen on app load + win.once('ready-to-show', () => { + setTimeout(function(){ splash.close(); win.show(); }, 4500); + }); + } else { + // Show main window directly if not first run + win.once('ready-to-show', () => { win.show(); }); + } } app.whenReady().then(() => { @@ -331,6 +396,14 @@ app.whenReady().then(() => { event.reply('update-state', arg); }); + ipcMain.on('navigate', (event, page) => { + win.loadFile(page); + }); + + ipcMain.on('navigateToWebApp', (event, page) => { + shell.openExternal(`${store.get('hostURL')}/${page}`); + }); + ipcMain.handle('getFiles', getFiles); ipcMain.handle('getFolders', getFolders); @@ -340,19 
+413,24 @@ app.whenReady().then(() => { ipcMain.handle('setURL', setURL); ipcMain.handle('getURL', getURL); + ipcMain.handle('setToken', setToken); + ipcMain.handle('getToken', getToken); + ipcMain.handle('syncData', (event, regenerate) => { syncData(regenerate); }); + ipcMain.handle('deleteAllFiles', deleteAllFiles); createWindow() app.setAboutPanelOptions({ applicationName: "Khoj", - applicationVersion: "0.0.1", - version: "0.0.1", - authors: "Khoj Team", + applicationVersion: khojPackage.version, + version: khojPackage.version, + authors: "Saba Imran, Debanjum Singh Solanky and contributors", website: "https://khoj.dev", - iconPath: path.join(__dirname, 'assets', 'khoj.png') + copyright: "GPL v3", + iconPath: path.join(__dirname, 'assets', 'icons', 'favicon-128x128.png') }); app.on('ready', async() => { @@ -375,3 +453,71 @@ app.whenReady().then(() => { app.on('window-all-closed', () => { if (process.platform !== 'darwin') app.quit() }) + +/* +** About Page +*/ + +let aboutWindow; + +function openAboutWindow() { + if (aboutWindow) { aboutWindow.focus(); return; } + + aboutWindow = new BrowserWindow({ + width: 400, + height: 400, + titleBarStyle: 'hidden', + show: false, + webPreferences: { + preload: path.join(__dirname, 'preload.js'), + nodeIntegration: true, + }, + }); + + aboutWindow.loadFile('about.html'); + + // Pass OS, Khoj version to About page + aboutWindow.webContents.on('did-finish-load', () => { + aboutWindow.webContents.send('appInfo', { version: khojPackage.version, platform: process.platform }); + }); + + // Open links in external browser + aboutWindow.webContents.setWindowOpenHandler(({ url }) => { + shell.openExternal(url); + return { action: 'deny' }; + }); + + aboutWindow.once('ready-to-show', () => { aboutWindow.show(); }); + aboutWindow.on('closed', () => { aboutWindow = null; }); +} + +/* +** System Tray Icon +*/ + +let tray + +openWindow = (page) => { + if (BrowserWindow.getAllWindows().length === 0) { + createWindow(page); + } else { + 
win.loadFile(page); win.show(); + } +} + +app.whenReady().then(() => { + const icon = nativeImage.createFromPath('assets/icons/favicon-20x20.png') + tray = new Tray(icon) + + const contextMenu = Menu.buildFromTemplate([ + { label: 'Chat', type: 'normal', click: () => { openWindow('chat.html'); }}, + { label: 'Search', type: 'normal', click: () => { openWindow('search.html') }}, + { label: 'Configure', type: 'normal', click: () => { openWindow('config.html') }}, + { type: 'separator' }, + { label: 'About Khoj', type: 'normal', click: () => { openAboutWindow(); } }, + { label: 'Quit', type: 'normal', click: () => { app.quit() } } + ]) + + tray.setToolTip('Khoj') + tray.setContextMenu(contextMenu) +}) diff --git a/src/interface/desktop/package.json b/src/interface/desktop/package.json index e043d982..04e1016b 100644 --- a/src/interface/desktop/package.json +++ b/src/interface/desktop/package.json @@ -1,6 +1,6 @@ { "name": "Khoj", - "version": "0.14.0", + "version": "1.0.0", "description": "An AI copilot for your Second Brain", "author": "Saba Imran, Debanjum Singh Solanky ", "license": "GPL-3.0-or-later", @@ -10,14 +10,14 @@ "main": "main.js", "private": false, "devDependencies": { - "electron": "25.8.1" + "electron": "25.8.4" }, "scripts": { "start": "yarn electron ." 
}, "dependencies": { "@todesktop/runtime": "^1.3.0", - "axios": "^1.5.0", + "axios": "^1.6.0", "cron": "^2.4.3", "electron-store": "^8.1.0", "fs": "^0.0.1-security" diff --git a/src/interface/desktop/preload.js b/src/interface/desktop/preload.js index aea16b9f..1d4c6ec0 100644 --- a/src/interface/desktop/preload.js +++ b/src/interface/desktop/preload.js @@ -45,5 +45,20 @@ contextBridge.exposeInMainWorld('hostURLAPI', { }) contextBridge.exposeInMainWorld('syncDataAPI', { - syncData: (regenerate) => ipcRenderer.invoke('syncData', regenerate) + syncData: (regenerate) => ipcRenderer.invoke('syncData', regenerate), + deleteAllFiles: () => ipcRenderer.invoke('deleteAllFiles') +}) + +contextBridge.exposeInMainWorld('tokenAPI', { + setToken: (token) => ipcRenderer.invoke('setToken', token), + getToken: () => ipcRenderer.invoke('getToken') +}) + +contextBridge.exposeInMainWorld('appInfoAPI', { + getInfo: (callback) => ipcRenderer.on('appInfo', callback) +}) + +contextBridge.exposeInMainWorld('navigateAPI', { + navigateToSettings: () => ipcRenderer.send('navigate', 'config.html'), + navigateToWebSettings: () => ipcRenderer.send('navigateToWebApp', 'config'), }) diff --git a/src/interface/desktop/renderer.js b/src/interface/desktop/renderer.js index d7486dae..7d0d906e 100644 --- a/src/interface/desktop/renderer.js +++ b/src/interface/desktop/renderer.js @@ -61,6 +61,7 @@ toggleFoldersButton.addEventListener('click', () => { function makeFileElement(file) { let fileElement = document.createElement("div"); fileElement.classList.add("file-element"); + let fileNameElement = document.createElement("div"); fileNameElement.classList.add("content-name"); fileNameElement.innerHTML = file.path; @@ -82,6 +83,7 @@ function makeFileElement(file) { function makeFolderElement(folder) { let folderElement = document.createElement("div"); folderElement.classList.add("folder-element"); + let folderNameElement = document.createElement("div"); folderNameElement.classList.add("content-name"); 
folderNameElement.innerHTML = folder.path; @@ -153,11 +155,14 @@ window.updateStateAPI.onUpdateState((event, state) => { loadingBar.style.display = 'none'; let syncStatusElement = document.getElementById("sync-status"); const currentTime = new Date(); + nextSyncTime = new Date(); + nextSyncTime.setMinutes(Math.ceil((nextSyncTime.getMinutes() + 1) / 10) * 10); if (state.completed == false) { syncStatusElement.innerHTML = `Sync was unsuccessful at ${currentTime.toLocaleTimeString()}. Contact team@khoj.dev to report this issue.`; return; } - syncStatusElement.innerHTML = `Last synced at ${currentTime.toLocaleTimeString()}`; + const options = { hour: '2-digit', minute: '2-digit' }; + syncStatusElement.innerHTML = `⏱️ Synced at ${currentTime.toLocaleTimeString(undefined, options)}. Next sync at ${nextSyncTime.toLocaleTimeString(undefined, options)}.`; }); const urlInput = document.getElementById('khoj-host-url'); @@ -174,6 +179,7 @@ urlInput.addEventListener('blur', async () => { new URL(urlInputValue); } catch (e) { console.log(e); + alert('Please enter a valid URL'); return; } @@ -181,10 +187,25 @@ urlInput.addEventListener('blur', async () => { urlInput.value = url; }); -const syncButton = document.getElementById('sync-data'); -const syncForceToggle = document.getElementById('sync-force'); -syncButton.addEventListener('click', async () => { - loadingBar.style.display = 'block'; - const regenerate = syncForceToggle.checked; - await window.syncDataAPI.syncData(regenerate); +const khojKeyInput = document.getElementById('khoj-access-key'); +(async function() { + const token = await window.tokenAPI.getToken(); + khojKeyInput.value = token; +})(); + +khojKeyInput.addEventListener('blur', async () => { + const token = await window.tokenAPI.setToken(khojKeyInput.value.trim()); + khojKeyInput.value = token; +}); + +const syncForceButton = document.getElementById('sync-force'); +syncForceButton.addEventListener('click', async () => { + loadingBar.style.display = 'block'; + 
await window.syncDataAPI.syncData(true); +}); + +const deleteAllButton = document.getElementById('delete-all'); +deleteAllButton.addEventListener('click', async () => { + loadingBar.style.display = 'block'; + await window.syncDataAPI.deleteAllFiles(); }); diff --git a/src/interface/desktop/index.html b/src/interface/desktop/search.html similarity index 85% rename from src/interface/desktop/index.html rename to src/interface/desktop/search.html index 6b3c7e32..aa8aa662 100644 --- a/src/interface/desktop/index.html +++ b/src/interface/desktop/search.html @@ -10,6 +10,7 @@ + - diff --git a/src/interface/desktop/splash.html b/src/interface/desktop/splash.html new file mode 100644 index 00000000..5cc32ab6 --- /dev/null +++ b/src/interface/desktop/splash.html @@ -0,0 +1,15 @@ + + + + + Khoj + + + + + + +
+ + + diff --git a/src/interface/desktop/utils.js b/src/interface/desktop/utils.js new file mode 100644 index 00000000..8f9c0aeb --- /dev/null +++ b/src/interface/desktop/utils.js @@ -0,0 +1,26 @@ +console.log(`%c %s`, "font-family:monospace", ` + __ __ __ __ ______ __ _____ __ +/\\ \\/ / /\\ \\_\\ \\ /\\ __ \\ /\\ \\ /\\ __ \\ /\\ \\ +\\ \\ _"-. \\ \\ __ \\ \\ \\ \\/\\ \\ _\\_\\ \\ \\ \\ __ \\ \\ \\ \\ + \\ \\_\\ \\_\\ \\ \\_\\ \\_\\ \\ \\_____\\ /\\_____\\ \\ \\_\\ \\_\\ \\ \\_\\ + \\/_/\\/_/ \\/_/\\/_/ \\/_____/ \\/_____/ \\/_/\\/_/ \\/_/ + +Greetings traveller, + +I am ✨Khoj✨, your open-source, personal AI copilot. + +See my source code at https://github.com/khoj-ai/khoj +Read my operating manual at https://docs.khoj.dev +`); + + +window.appInfoAPI.getInfo((_, info) => { + let khojVersionElement = document.getElementById("about-page-version"); + if (khojVersionElement) { + khojVersionElement.innerHTML = `${info.version}`; + } + let khojTitleElement = document.getElementById("about-page-title"); + if (khojTitleElement) { + khojTitleElement.innerHTML = 'Khoj for ' + (info.platform === 'win32' ? 'Windows' : info.platform === 'darwin' ? 
'macOS' : 'Linux') + ''; + } +}); diff --git a/src/interface/desktop/yarn.lock b/src/interface/desktop/yarn.lock index 8591b00d..57583e13 100644 --- a/src/interface/desktop/yarn.lock +++ b/src/interface/desktop/yarn.lock @@ -163,10 +163,10 @@ atomically@^1.7.0: resolved "https://registry.yarnpkg.com/atomically/-/atomically-1.7.0.tgz#c07a0458432ea6dbc9a3506fffa424b48bccaafe" integrity sha512-Xcz9l0z7y9yQ9rdDaxlmaI4uJHf/T8g9hOEzJcsEqX2SjCj4J20uK7+ldkDHMbpJDK76wF7xEIgxc/vSlsfw5w== -axios@^1.5.0: - version "1.5.0" - resolved "https://registry.yarnpkg.com/axios/-/axios-1.5.0.tgz#f02e4af823e2e46a9768cfc74691fdd0517ea267" - integrity sha512-D4DdjDo5CY50Qms0qGQTTw6Q44jl7zRwY7bthds06pUGfChBCTcQs+N743eFWGEd6pRTMd6A+I87aWyFV5wiZQ== +axios@^1.6.0: + version "1.6.2" + resolved "https://registry.yarnpkg.com/axios/-/axios-1.6.2.tgz#de67d42c755b571d3e698df1b6504cde9b0ee9f2" + integrity sha512-7i24Ri4pmDRfJTR7LDBhsOTtcm+9kjX5WiY1X3wIisx6G9So3pfMkEiU7emUBe46oceVImccTEM3k6C5dbVW8A== dependencies: follow-redirects "^1.15.0" form-data "^4.0.0" @@ -379,10 +379,10 @@ electron-updater@^4.6.1: lodash.isequal "^4.5.0" semver "^7.3.5" -electron@25.8.1: - version "25.8.1" - resolved "https://registry.yarnpkg.com/electron/-/electron-25.8.1.tgz#092fab5a833db4d9240d4d6f36218cf7ca954f86" - integrity sha512-GtcP1nMrROZfFg0+mhyj1hamrHvukfF6of2B/pcWxmWkd5FVY1NJib0tlhiorFZRzQN5Z+APLPr7aMolt7i2AQ== +electron@25.8.4: + version "25.8.4" + resolved "https://registry.yarnpkg.com/electron/-/electron-25.8.4.tgz#b50877aac7d96323920437baf309ad86382cb455" + integrity sha512-hUYS3RGdaa6E1UWnzeGnsdsBYOggwMMg4WGxNGvAoWtmRrr6J1BsjFW/yRq4WsJHJce2HdzQXtz4OGXV6yUCLg== dependencies: "@electron/get" "^2.0.0" "@types/node" "^18.11.18" diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 74e080c5..f427c197 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -6,7 +6,7 @@ ;; Saba Imran ;; Description: An AI copilot for your Second Brain ;; Keywords: search, chat, org-mode, 
outlines, markdown, pdf, image -;; Version: 0.14.0 +;; Version: 1.0.0 ;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1")) ;; URL: https://github.com/khoj-ai/khoj/tree/master/src/interface/emacs @@ -63,7 +63,7 @@ ;; Khoj Static Configuration ;; ------------------------- -(defcustom khoj-server-url "http://localhost:42110" +(defcustom khoj-server-url "https://app.khoj.dev" "Location of Khoj API server." :group 'khoj :type 'string) @@ -93,8 +93,8 @@ :group 'khoj :type 'number) -(defcustom khoj-server-api-key "secret" - "API Key to Khoj server." +(defcustom khoj-api-key nil + "API Key to your Khoj. Default at https://app.khoj.dev/config#clients." :group 'khoj :type 'string) @@ -246,26 +246,6 @@ for example), set this to the full interpreter path." :type '(repeat string) :group 'khoj) -(defcustom khoj-chat-model "gpt-3.5-turbo" - "Specify chat model to use for chat with khoj." - :type 'string - :group 'khoj) - -(defcustom khoj-openai-api-key nil - "OpenAI API key used to configure chat on khoj server." - :type 'string - :group 'khoj) - -(defcustom khoj-chat-offline nil - "Use offline model to chat with khoj." - :type 'boolean - :group 'khoj) - -(defcustom khoj-offline-chat-model nil - "Specify chat model to use for offline chat with khoj." - :type 'string - :group 'khoj) - (defcustom khoj-auto-setup t "Automate install, configure and start of khoj server. Auto invokes setup steps on calling main entrypoint." @@ -319,8 +299,7 @@ Auto invokes setup steps on calling main entrypoint." :filter (lambda (process msg) (cond ((string-match (format "Uvicorn running on %s" khoj-server-url) msg) (progn - (setq khoj--server-ready? t) - (khoj--server-configure))) + (setq khoj--server-ready? t))) ((string-match "Batches: " msg) (when (string-match "\\([0-9]+\\.[0-9]+\\|\\([0-9]+\\)\\)%?" msg) (message "khoj.el: %s updating index %s" @@ -383,106 +362,13 @@ Auto invokes setup steps on calling main entrypoint." 
(when (not (khoj--server-started?)) (khoj--server-start))) -(defun khoj--get-directory-from-config (config keys &optional level) - "Extract directory under specified KEYS in CONFIG and trim it to LEVEL. -CONFIG is json obtained from Khoj config API." - (let ((item config)) - (dolist (key keys) - (setq item (cdr (assoc key item)))) - (-> item - (split-string "/") - (butlast (or level nil)) - (string-join "/")))) - -(defun khoj--server-configure () - "Configure the Khoj server for search and chat." - (interactive) - (let* ((url-request-method "GET") - (current-config - (with-temp-buffer - (url-insert-file-contents (format "%s/api/config/data" khoj-server-url)) - (ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false)))) - (default-config - (with-temp-buffer - (url-insert-file-contents (format "%s/api/config/data/default" khoj-server-url)) - (ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false)))) - (default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile))) - (chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor default-config)))))) - (enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config)))))) - (offline-chat-model (or khoj-offline-chat-model (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config)))))) - (config (or current-config default-config))) - - ;; Configure processors - (cond - ((not khoj-openai-api-key) - (let* ((processor (assoc 'processor config)) - (conversation (assoc 'conversation processor)) - (openai (assoc 'openai conversation))) - (when openai - ;; Unset the `openai' field in the khoj conversation processor 
config - (message "khoj.el: Disable Chat using OpenAI as your OpenAI API key got removed from config") - (setcdr conversation (delq openai (cdr conversation))) - (push conversation (cdr processor)) - (push processor config)))) - - ;; If khoj backend isn't configured yet - ((not current-config) - (message "khoj.el: Khoj not configured yet.") - (setq config (delq (assoc 'processor config) config)) - (cl-pushnew `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir)) - (offline-chat . ((enable-offline-chat . ,enable-offline-chat) - (chat-model . ,offline-chat-model))) - (openai . ((chat-model . ,chat-model) - (api-key . ,khoj-openai-api-key))))))) - config)) - - ;; Else if chat isn't configured in khoj backend - ((not (alist-get 'conversation (alist-get 'processor config))) - (message "khoj.el: Chat not configured yet.") - (let ((new-processor-type (alist-get 'processor config))) - (setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type)) - (cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir)) - (offline-chat . ((enable-offline-chat . ,enable-offline-chat) - (chat-model . ,offline-chat-model))) - (openai . ((chat-model . ,chat-model) - (api-key . ,khoj-openai-api-key))))) - new-processor-type) - (setq config (delq (assoc 'processor config) config)) - (cl-pushnew `(processor . 
,new-processor-type) config))) - - ;; Else if chat configuration in khoj backend has gone stale - ((not (and (equal (alist-get 'api-key (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-openai-api-key) - (equal (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-chat-model) - (equal (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) enable-offline-chat) - (equal (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) offline-chat-model))) - (message "khoj.el: Chat configuration has gone stale.") - (let* ((chat-directory (khoj--get-directory-from-config config '(processor conversation conversation-logfile))) - (new-processor-type (alist-get 'processor config))) - (setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type)) - (cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" chat-directory)) - (offline-chat . ((enable-offline-chat . ,enable-offline-chat) - (chat-model . ,offline-chat-model))) - (openai . ((chat-model . ,khoj-chat-model) - (api-key . ,khoj-openai-api-key))))) - new-processor-type) - (setq config (delq (assoc 'processor config) config)) - (cl-pushnew `(processor . ,new-processor-type) config)))) - - ;; Update server with latest configuration, if required - (cond ((not current-config) - (khoj--post-new-config config) - (message "khoj.el: ⚙️ Generated new khoj server configuration.")) - ((not (equal config current-config)) - (khoj--post-new-config config) - (message "khoj.el: ⚙️ Updated khoj server configuration."))))) - (defun khoj-setup (&optional interact) - "Install, start and configure Khoj server. Get permission if INTERACT is non-nil." + "Install and start Khoj server. Get permission if INTERACT is non-nil." 
(interactive "p") ;; Setup khoj server if not running (let* ((not-started (not (khoj--server-started?))) (permitted (if (and not-started interact) - (y-or-n-p "Could not connect to Khoj server. Should I install, start and configure it for you?") + (y-or-n-p "Could not connect to Khoj server. Should I install, start it for you?") t))) ;; If user permits setup of khoj server from khoj.el (when permitted @@ -491,12 +377,9 @@ CONFIG is json obtained from Khoj config API." (khoj--server-setup)) ;; Wait until server is ready - ;; As server can be started but not ready to use/configure + ;; As server can be started but not ready to use (while (not khoj--server-ready?) - (sit-for 0.5)) - - ;; Configure server once it's ready - (khoj--server-configure)))) + (sit-for 0.5))))) ;; ------------------- @@ -516,7 +399,7 @@ CONFIG is json obtained from Khoj config API." (let ((url-request-method "POST") (url-request-data (khoj--render-files-as-request-body files-to-index khoj--indexed-files boundary)) (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary)) - ("x-api-key" . ,khoj-server-api-key)))) + ("Authorization" . ,(format "Bearer %s" khoj-api-key))))) (with-current-buffer (url-retrieve (format "%s/api/v1/index/update?%s&force=%s&client=emacs" khoj-server-url type-query (or force "false")) ;; render response from indexing API endpoint on server @@ -690,19 +573,22 @@ Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request. "Configure khoj server with provided CONFIG." ;; POST provided config to khoj server (let ((url-request-method "POST") - (url-request-extra-headers '(("Content-Type" . "application/json"))) + (url-request-extra-headers `(("Content-Type" . "application/json") + ("Authorization" . 
,(format "Bearer %s" khoj-api-key)))) (url-request-data (encode-coding-string (json-encode-alist config) 'utf-8)) (config-url (format "%s/api/config/data" khoj-server-url))) (with-current-buffer (url-retrieve-synchronously config-url) (buffer-string))) ;; Update index on khoj server after configuration update - (let ((khoj--server-ready? nil)) + (let ((khoj--server-ready? nil) + (url-request-extra-headers `(("Authorization" . ,(format "\"Bearer %s\"" khoj-api-key))))) (url-retrieve (format "%s/api/update?client=emacs" khoj-server-url) #'identity))) (defun khoj--get-enabled-content-types () "Get content types enabled for search from API." (let ((config-url (format "%s/api/config/types" khoj-server-url)) - (url-request-method "GET")) + (url-request-method "GET") + (url-request-extra-headers `(("Authorization" . ,(format "Bearer %s" khoj-api-key))))) (with-temp-buffer (url-insert-file-contents config-url) (thread-last @@ -722,7 +608,8 @@ Render results in BUFFER-NAME using QUERY, CONTENT-TYPE." ;; get json response from api (with-current-buffer buffer-name (let ((inhibit-read-only t) - (url-request-method "GET")) + (url-request-method "GET") + (url-request-extra-headers `(("Authorization" . ,(format "Bearer %s" khoj-api-key))))) (erase-buffer) (url-insert-file-contents query-url))) ;; render json response into formatted entries @@ -848,6 +735,7 @@ Render results in BUFFER-NAME using QUERY, CONTENT-TYPE." "Send QUERY to Khoj Chat API." (let* ((url-request-method "GET") (encoded-query (url-hexify-string query)) + (url-request-extra-headers `(("Authorization" . ,(format "Bearer %s" khoj-api-key)))) (query-url (format "%s/api/chat?q=%s&n=%s&client=emacs" khoj-server-url encoded-query khoj-results-count))) (with-temp-buffer (condition-case ex @@ -862,6 +750,7 @@ Render results in BUFFER-NAME using QUERY, CONTENT-TYPE." (defun khoj--get-chat-history-api () "Send QUERY to Khoj Chat History API." 
(let* ((url-request-method "GET") + (url-request-extra-headers `(("Authorization" . ,(format "Bearer %s" khoj-api-key)))) (query-url (format "%s/api/chat/history?client=emacs" khoj-server-url))) (with-temp-buffer (condition-case ex diff --git a/src/interface/obsidian/manifest.json b/src/interface/obsidian/manifest.json index a1c1f913..4d019834 100644 --- a/src/interface/obsidian/manifest.json +++ b/src/interface/obsidian/manifest.json @@ -1,7 +1,7 @@ { "id": "khoj", "name": "Khoj", - "version": "0.14.0", + "version": "1.0.0", "minAppVersion": "0.15.0", "description": "An AI copilot for your Second Brain", "author": "Khoj Inc.", diff --git a/src/interface/obsidian/package.json b/src/interface/obsidian/package.json index e862b23c..a0a0df3b 100644 --- a/src/interface/obsidian/package.json +++ b/src/interface/obsidian/package.json @@ -1,6 +1,6 @@ { "name": "Khoj", - "version": "0.14.0", + "version": "1.0.0", "description": "An AI copilot for your Second Brain", "author": "Debanjum Singh Solanky, Saba Imran ", "license": "GPL-3.0-or-later", diff --git a/src/interface/obsidian/src/chat_modal.ts b/src/interface/obsidian/src/chat_modal.ts index d390cbf2..a8008048 100644 --- a/src/interface/obsidian/src/chat_modal.ts +++ b/src/interface/obsidian/src/chat_modal.ts @@ -142,7 +142,8 @@ export class KhojChatModal extends Modal { async getChatHistory(): Promise { // Get chat history from Khoj backend let chatUrl = `${this.setting.khojUrl}/api/chat/history?client=obsidian`; - let response = await request(chatUrl); + let headers = { "Authorization": `Bearer ${this.setting.khojApiKey}` }; + let response = await request({ url: chatUrl, headers: headers }); let chatLogs = JSON.parse(response).response; chatLogs.forEach((chatLog: any) => { this.renderMessageWithReferences(chatLog.message, chatLog.by, chatLog.context, new Date(chatLog.created)); @@ -168,7 +169,8 @@ export class KhojChatModal extends Modal { method: "GET", headers: { "Access-Control-Allow-Origin": "*", - "Content-Type": 
"text/event-stream" + "Content-Type": "text/event-stream", + "Authorization": `Bearer ${this.setting.khojApiKey}`, }, }) diff --git a/src/interface/obsidian/src/main.ts b/src/interface/obsidian/src/main.ts index 1fbed55f..26b0a5a1 100644 --- a/src/interface/obsidian/src/main.ts +++ b/src/interface/obsidian/src/main.ts @@ -1,8 +1,8 @@ -import { Notice, Plugin, TFile } from 'obsidian'; +import { Notice, Plugin, request } from 'obsidian'; import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings' import { KhojSearchModal } from 'src/search_modal' import { KhojChatModal } from 'src/chat_modal' -import { configureKhojBackend, updateContentIndex } from './utils'; +import { updateContentIndex } from './utils'; export default class Khoj extends Plugin { @@ -39,9 +39,9 @@ export default class Khoj extends Plugin { id: 'chat', name: 'Chat', checkCallback: (checking) => { - if (!checking && this.settings.connectedToBackend && (!!this.settings.openaiApiKey || this.settings.enableOfflineChat)) + if (!checking && this.settings.connectedToBackend) new KhojChatModal(this.app, this.settings).open(); - return !!this.settings.openaiApiKey || this.settings.enableOfflineChat; + return this.settings.connectedToBackend; } }); @@ -70,16 +70,27 @@ export default class Khoj extends Plugin { // Load khoj obsidian plugin settings this.settings = Object.assign({}, DEFAULT_SETTINGS, await this.loadData()); - if (this.settings.autoConfigure) { - // Load, configure khoj server settings - await configureKhojBackend(this.app.vault, this.settings); + // Check if khoj backend is configured, note if cannot connect to backend + let headers = { "Authorization": `Bearer ${this.settings.khojApiKey}` }; + + if (this.settings.khojUrl === "https://app.khoj.dev") { + if (this.settings.khojApiKey === "") { + new Notice(`❗️Khoj API key is not configured. 
Please visit https://app.khoj.dev/config#clients to get an API key.`); + return; + } + + await request({ url: this.settings.khojUrl ,method: "GET", headers: headers }) + .then(response => { + this.settings.connectedToBackend = true; + }) + .catch(error => { + this.settings.connectedToBackend = false; + new Notice(`❗️Ensure Khoj backend is running and Khoj URL is pointing to it in the plugin settings.\n\n${error}`); + }); } } async saveSettings() { - if (this.settings.autoConfigure) { - await configureKhojBackend(this.app.vault, this.settings, false); - } this.saveData(this.settings); } diff --git a/src/interface/obsidian/src/search_modal.ts b/src/interface/obsidian/src/search_modal.ts index 56133153..e841360e 100644 --- a/src/interface/obsidian/src/search_modal.ts +++ b/src/interface/obsidian/src/search_modal.ts @@ -90,10 +90,11 @@ export class KhojSearchModal extends SuggestModal { // Query Khoj backend for search results let encodedQuery = encodeURIComponent(query); let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}&client=obsidian`; + let headers = { 'Authorization': `Bearer ${this.setting.khojApiKey}` } // Get search results for markdown and pdf files - let mdResponse = await request(`${searchUrl}&t=markdown`); - let pdfResponse = await request(`${searchUrl}&t=pdf`); + let mdResponse = await request({ url: `${searchUrl}&t=markdown`, headers: headers }); + let pdfResponse = await request({ url: `${searchUrl}&t=pdf`, headers: headers }); // Parse search results let mdData = JSON.parse(mdResponse) diff --git a/src/interface/obsidian/src/settings.ts b/src/interface/obsidian/src/settings.ts index c3f40905..9150c438 100644 --- a/src/interface/obsidian/src/settings.ts +++ b/src/interface/obsidian/src/settings.ts @@ -3,22 +3,20 @@ import Khoj from 'src/main'; import { updateContentIndex } from './utils'; export interface KhojSetting { - enableOfflineChat: boolean; - openaiApiKey: string; resultsCount: 
number; khojUrl: string; + khojApiKey: string; connectedToBackend: boolean; autoConfigure: boolean; lastSyncedFiles: TFile[]; } export const DEFAULT_SETTINGS: KhojSetting = { - enableOfflineChat: false, resultsCount: 6, - khojUrl: 'http://127.0.0.1:42110', + khojUrl: 'https://app.khoj.dev', + khojApiKey: '', connectedToBackend: false, autoConfigure: true, - openaiApiKey: '', lastSyncedFiles: [] } @@ -49,21 +47,12 @@ export class KhojSettingTab extends PluginSettingTab { containerEl.firstElementChild?.setText(this.getBackendStatusMessage()); })); new Setting(containerEl) - .setName('OpenAI API Key') - .setDesc('Use OpenAI for Khoj Chat with your API key.') + .setName('Khoj API Key') + .setDesc('Use Khoj Cloud with your Khoj API Key') .addText(text => text - .setValue(`${this.plugin.settings.openaiApiKey}`) + .setValue(`${this.plugin.settings.khojApiKey}`) .onChange(async (value) => { - this.plugin.settings.openaiApiKey = value.trim(); - await this.plugin.saveSettings(); - })); - new Setting(containerEl) - .setName('Enable Offline Chat') - .setDesc('Chat privately without an internet connection. 
Enabling this will use offline chat even if OpenAI is configured.') - .addToggle(toggle => toggle - .setValue(this.plugin.settings.enableOfflineChat) - .onChange(async (value) => { - this.plugin.settings.enableOfflineChat = value; + this.plugin.settings.khojApiKey = value.trim(); await this.plugin.saveSettings(); })); new Setting(containerEl) @@ -78,8 +67,8 @@ export class KhojSettingTab extends PluginSettingTab { await this.plugin.saveSettings(); })); new Setting(containerEl) - .setName('Auto Configure') - .setDesc('Automatically configure the Khoj backend.') + .setName('Auto Sync') + .setDesc('Automatically index your vault with Khoj.') .addToggle(toggle => toggle .setValue(this.plugin.settings.autoConfigure) .onChange(async (value) => { @@ -88,7 +77,7 @@ export class KhojSettingTab extends PluginSettingTab { })); let indexVaultSetting = new Setting(containerEl); indexVaultSetting - .setName('Index Vault') + .setName('Force Sync') .setDesc('Manually force Khoj to re-index your Obsidian Vault.') .addButton(button => button .setButtonText('Update') diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index eb3d4d12..7eda8a47 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -1,4 +1,4 @@ -import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal, TFile } from 'obsidian'; +import { FileSystemAdapter, Notice, Vault, Modal, TFile } from 'obsidian'; import { KhojSetting } from 'src/settings' export function getVaultAbsolutePath(vault: Vault): string { @@ -9,26 +9,6 @@ export function getVaultAbsolutePath(vault: Vault): string { return ''; } -type OpenAIType = null | { - "chat-model": string; - "api-key": string; -}; - -type OfflineChatType = null | { - "chat-model": string; - "enable-offline-chat": boolean; -}; - -interface ProcessorData { - conversation: { - "conversation-logfile": string; - openai: OpenAIType; - "offline-chat": OfflineChatType; - "tokenizer": null | string; 
- "max-prompt-size": null | number; - }; -} - function fileExtensionToMimeType (extension: string): string { switch (extension) { case 'pdf': @@ -78,7 +58,7 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, { method: 'POST', headers: { - 'x-api-key': 'secret', + 'Authorization': `Bearer ${setting.khojApiKey}`, }, body: formData, }); @@ -92,100 +72,6 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las return files; } -export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) { - let khojConfigUrl = `${setting.khojUrl}/api/config/data`; - - // Check if khoj backend is configured, note if cannot connect to backend - let khoj_already_configured = await request(khojConfigUrl) - .then(response => { - setting.connectedToBackend = true; - return response !== "null" - }) - .catch(error => { - setting.connectedToBackend = false; - if (notify) - new Notice(`❗️Ensure Khoj backend is running and Khoj URL is pointing to it in the plugin settings.\n\n${error}`); - }) - // Short-circuit configuring khoj if unable to connect to khoj backend - if (!setting.connectedToBackend) return; - - // Set index name from the path of the current vault - // Get default config fields from khoj backend - let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response)); - let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]); - let khojDefaultOpenAIChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"]; - let khojDefaultOfflineChatModelName = defaultConfig["processor"]["conversation"]["offline-chat"]["chat-model"]; - - // Get current config if khoj backend configured, else get default config from khoj backend - await 
request(khoj_already_configured ? khojConfigUrl : `${khojConfigUrl}/default`) - .then(response => JSON.parse(response)) - .then(data => { - let conversationLogFile = data?.["processor"]?.["conversation"]?.["conversation-logfile"] ?? `${khojDefaultChatDirectory}/conversation.json`; - let processorData: ProcessorData = { - "conversation": { - "conversation-logfile": conversationLogFile, - "openai": null, - "offline-chat": { - "chat-model": khojDefaultOfflineChatModelName, - "enable-offline-chat": setting.enableOfflineChat, - }, - "tokenizer": null, - "max-prompt-size": null, - } - } - - // If the Open AI API Key was configured in the plugin settings - if (!!setting.openaiApiKey) { - let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultOpenAIChatModelName; - processorData = { - "conversation": { - "conversation-logfile": conversationLogFile, - "openai": { - "chat-model": openAIChatModel, - "api-key": setting.openaiApiKey, - }, - "offline-chat": { - "chat-model": khojDefaultOfflineChatModelName, - "enable-offline-chat": setting.enableOfflineChat, - }, - "tokenizer": null, - "max-prompt-size": null, - }, - } - } - - // Set khoj processor config to conversation processor config - data["processor"] = processorData; - - // Save updated config and refresh index on khoj backend - updateKhojBackend(setting.khojUrl, data); - if (!khoj_already_configured) - console.log(`Khoj: Created khoj backend config:\n${JSON.stringify(data)}`) - else - console.log(`Khoj: Updated khoj backend config:\n${JSON.stringify(data)}`) - }) - .catch(error => { - if (notify) - new Notice(`❗️Failed to configure Khoj backend. 
Contact developer on Github.\n\nError: ${error}`); - }) -} - -export async function updateKhojBackend(khojUrl: string, khojConfig: Object) { - // POST khojConfig to khojConfigUrl - let requestContent: RequestUrlParam = { - url: `${khojUrl}/api/config/data`, - body: JSON.stringify(khojConfig), - method: 'POST', - contentType: 'application/json', - }; - // Save khojConfig on khoj backend at khojConfigUrl - request(requestContent); -} - -function getIndexDirectoryFromBackendConfig(filepath: string) { - return filepath.split("/").slice(0, -1).join("/"); -} - export async function createNote(name: string, newLeaf = false): Promise { try { let pathPrefix: string diff --git a/src/interface/obsidian/styles.css b/src/interface/obsidian/styles.css index 3e3808f7..d322804d 100644 --- a/src/interface/obsidian/styles.css +++ b/src/interface/obsidian/styles.css @@ -8,7 +8,7 @@ If your plugin does not need CSS, delete this file. */ :root { - --khoj-chat-primary: #ffb300; + --khoj-chat-primary: #fee285; --khoj-chat-dark-grey: #475569; } diff --git a/src/interface/obsidian/versions.json b/src/interface/obsidian/versions.json index 8deb4367..06efeecb 100644 --- a/src/interface/obsidian/versions.json +++ b/src/interface/obsidian/versions.json @@ -26,5 +26,6 @@ "0.12.2": "0.15.0", "0.12.3": "0.15.0", "0.13.0": "0.15.0", - "0.14.0": "0.15.0" + "0.14.0": "0.15.0", + "1.0.0": "0.15.0" } diff --git a/src/khoj/configure.py b/src/khoj/configure.py index a67b2403..5ed92727 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -1,43 +1,97 @@ # Standard Packages -import sys import logging import json from enum import Enum from typing import Optional import requests +import os # External Packages import schedule -from fastapi.staticfiles import StaticFiles +from starlette.middleware.sessions import SessionMiddleware +from starlette.middleware.authentication import AuthenticationMiddleware +from starlette.requests import HTTPConnection + +from starlette.authentication import ( + 
AuthCredentials, + AuthenticationBackend, + SimpleUser, + UnauthenticatedUser, +) # Internal Packages +from database.models import KhojUser, Subscription +from database.adapters import get_all_users, get_or_create_search_model +from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel +from khoj.routers.indexer import configure_content, load_content, configure_search from khoj.utils import constants, state from khoj.utils.config import ( SearchType, - ProcessorConfigModel, - ConversationProcessorConfigModel, ) -from khoj.utils.helpers import resolve_absolute_path, merge_dicts from khoj.utils.fs_syncer import collect_files -from khoj.utils.rawconfig import FullConfig, OfflineChatProcessorConfig, ProcessorConfig, ConversationProcessorConfig -from khoj.routers.indexer import configure_content, load_content, configure_search +from khoj.utils.rawconfig import FullConfig logger = logging.getLogger(__name__) -def initialize_server(config: Optional[FullConfig], required=False): - if config is None and required: - logger.error( - f"🚨 Exiting as Khoj is not configured.\nConfigure it via http://{state.host}:{state.port}/config or by editing {state.config_file}." - ) - sys.exit(1) - elif config is None: - logger.warning( - f"🚨 Khoj is not configured.\nConfigure it via http://{state.host}:{state.port}/config, plugins or by editing {state.config_file}." 
- ) - return None +class AuthenticatedKhojUser(SimpleUser): + def __init__(self, user): + self.object = user + super().__init__(user.email) + +class UserAuthenticationBackend(AuthenticationBackend): + def __init__( + self, + ): + from database.models import KhojUser, KhojApiUser + + self.khojuser_manager = KhojUser.objects + self.khojapiuser_manager = KhojApiUser.objects + self._initialize_default_user() + super().__init__() + + def _initialize_default_user(self): + if not self.khojuser_manager.filter(username="default").exists(): + default_user = self.khojuser_manager.create_user( + username="default", + email="default@example.com", + password="default", + ) + Subscription.objects.create(user=default_user, type="standard", renewal_date="2100-04-01") + + async def authenticate(self, request: HTTPConnection): + current_user = request.session.get("user") + if current_user and current_user.get("email"): + user = ( + await self.khojuser_manager.filter(email=current_user.get("email")) + .prefetch_related("subscription") + .afirst() + ) + if user: + return AuthCredentials(["authenticated"]), AuthenticatedKhojUser(user) + if len(request.headers.get("Authorization", "").split("Bearer ")) == 2: + # Get bearer token from header + bearer_token = request.headers["Authorization"].split("Bearer ")[1] + # Get user owning token + user_with_token = ( + await self.khojapiuser_manager.filter(token=bearer_token) + .select_related("user") + .prefetch_related("user__subscription") + .afirst() + ) + if user_with_token: + return AuthCredentials(["authenticated"]), AuthenticatedKhojUser(user_with_token.user) + if state.anonymous_mode: + user = await self.khojuser_manager.filter(username="default").prefetch_related("subscription").afirst() + if user: + return AuthCredentials(["authenticated"]), AuthenticatedKhojUser(user) + + return AuthCredentials(), UnauthenticatedUser() + + +def initialize_server(config: Optional[FullConfig]): try: configure_server(config, init=True) except Exception as 
e: @@ -45,32 +99,30 @@ def initialize_server(config: Optional[FullConfig], required=False): def configure_server( - config: FullConfig, regenerate: bool = False, search_type: Optional[SearchType] = None, init=False + config: FullConfig, + regenerate: bool = False, + search_type: Optional[SearchType] = None, + init=False, + user: KhojUser = None, ): # Update Config + if config == None: + logger.info(f"🚨 Khoj is not configured.\nInitializing it with a default config.") + config = FullConfig() state.config = config - # Initialize Processor from Config - try: - state.processor_config = configure_processor(state.config.processor) - except Exception as e: - logger.error(f"🚨 Failed to configure processor", exc_info=True) - raise e - # Initialize Search Models from Config and initialize content try: - state.config_lock.acquire() - state.SearchType = configure_search_types(state.config) + state.embeddings_model = EmbeddingsModel(get_or_create_search_model().bi_encoder) + state.cross_encoder_model = CrossEncoderModel(get_or_create_search_model().cross_encoder) + state.SearchType = configure_search_types() state.search_models = configure_search(state.search_models, state.config.search_type) - initialize_content(regenerate, search_type, init) + initialize_content(regenerate, search_type, init, user) except Exception as e: - logger.error(f"🚨 Failed to configure search models", exc_info=True) raise e - finally: - state.config_lock.release() -def initialize_content(regenerate: bool, search_type: Optional[SearchType] = None, init=False): +def initialize_content(regenerate: bool, search_type: Optional[SearchType] = None, init=False, user: KhojUser = None): # Initialize Content from Config if state.search_models: try: @@ -79,17 +131,19 @@ def initialize_content(regenerate: bool, search_type: Optional[SearchType] = Non state.content_index = load_content(state.config.content_type, state.content_index, state.search_models) else: logger.info("📬 Updating content index...") - all_files = 
collect_files(state.config.content_type) - state.content_index = configure_content( + all_files = collect_files(user=user) + state.content_index, status = configure_content( state.content_index, state.config.content_type, all_files, state.search_models, regenerate, search_type, + user=user, ) + if not status: + raise RuntimeError("Failed to update content index") except Exception as e: - logger.error(f"🚨 Failed to index content", exc_info=True) raise e @@ -99,134 +153,50 @@ def configure_routes(app): from khoj.routers.api_beta import api_beta from khoj.routers.web_client import web_client from khoj.routers.indexer import indexer + from khoj.routers.auth import auth_router + from khoj.routers.subscription import subscription_router - app.mount("/static", StaticFiles(directory=constants.web_directory), name="static") app.include_router(api, prefix="/api") app.include_router(api_beta, prefix="/api/beta") app.include_router(indexer, prefix="/api/v1/index") + if state.billing_enabled: + logger.info("💳 Enabled Billing") + app.include_router(subscription_router, prefix="/api/subscription") app.include_router(web_client) + app.include_router(auth_router, prefix="/auth") -if not state.demo: +def configure_middleware(app): + app.add_middleware(AuthenticationMiddleware, backend=UserAuthenticationBackend()) + app.add_middleware(SessionMiddleware, secret_key=os.environ.get("KHOJ_DJANGO_SECRET_KEY", "!secret")) - @schedule.repeat(schedule.every(61).minutes) - def update_search_index(): - try: - logger.info("📬 Updating content index via Scheduler") - all_files = collect_files(state.config.content_type) - state.content_index = configure_content( - state.content_index, state.config.content_type, all_files, state.search_models + +@schedule.repeat(schedule.every(61).minutes) +def update_search_index(): + try: + logger.info("📬 Updating content index via Scheduler") + for user in get_all_users(): + all_files = collect_files(user=user) + state.content_index, success = configure_content( 
+ state.content_index, state.config.content_type, all_files, state.search_models, user=user ) - logger.info("📪 Content index updated via Scheduler") - except Exception as e: - logger.error(f"🚨 Error updating content index via Scheduler: {e}", exc_info=True) + all_files = collect_files(user=None) + state.content_index, success = configure_content( + state.content_index, state.config.content_type, all_files, state.search_models, user=None + ) + if not success: + raise RuntimeError("Failed to update content index") + logger.info("📪 Content index updated via Scheduler") + except Exception as e: + logger.error(f"🚨 Error updating content index via Scheduler: {e}", exc_info=True) -def configure_search_types(config: FullConfig): +def configure_search_types(): # Extract core search types core_search_types = {e.name: e.value for e in SearchType} - # Extract configured plugin search types - plugin_search_types = {} - if config.content_type and config.content_type.plugins: - plugin_search_types = {plugin_type: plugin_type for plugin_type in config.content_type.plugins.keys()} # Dynamically generate search type enum by merging core search types with configured plugin search types - return Enum("SearchType", merge_dicts(core_search_types, plugin_search_types)) - - -def configure_processor( - processor_config: Optional[ProcessorConfig], state_processor_config: Optional[ProcessorConfigModel] = None -): - if not processor_config: - logger.warning("🚨 No Processor configuration available.") - return None - - processor = ProcessorConfigModel() - - # Initialize Conversation Processor - logger.info("💬 Setting up conversation processor") - processor.conversation = configure_conversation_processor(processor_config, state_processor_config) - - return processor - - -def configure_conversation_processor( - processor_config: Optional[ProcessorConfig], state_processor_config: Optional[ProcessorConfigModel] = None -): - if ( - not processor_config - or not processor_config.conversation - or not 
processor_config.conversation.conversation_logfile - ): - default_config = constants.default_config - default_conversation_logfile = resolve_absolute_path( - default_config["processor"]["conversation"]["conversation-logfile"] # type: ignore - ) - conversation_logfile = resolve_absolute_path(default_conversation_logfile) - conversation_config = processor_config.conversation if processor_config else None - conversation_processor = ConversationProcessorConfigModel( - conversation_config=ConversationProcessorConfig( - conversation_logfile=conversation_logfile, - openai=(conversation_config.openai if (conversation_config is not None) else None), - offline_chat=conversation_config.offline_chat if conversation_config else OfflineChatProcessorConfig(), - ) - ) - else: - conversation_processor = ConversationProcessorConfigModel( - conversation_config=processor_config.conversation, - ) - conversation_logfile = resolve_absolute_path(conversation_processor.conversation_logfile) - - # Load Conversation Logs from Disk - if state_processor_config and state_processor_config.conversation and state_processor_config.conversation.meta_log: - conversation_processor.meta_log = state_processor_config.conversation.meta_log - conversation_processor.chat_session = state_processor_config.conversation.chat_session - logger.debug(f"Loaded conversation logs from state") - return conversation_processor - - if conversation_logfile.is_file(): - # Load Metadata Logs from Conversation Logfile - with conversation_logfile.open("r") as f: - conversation_processor.meta_log = json.load(f) - logger.debug(f"Loaded conversation logs from {conversation_logfile}") - else: - # Initialize Conversation Logs - conversation_processor.meta_log = {} - conversation_processor.chat_session = [] - - return conversation_processor - - -@schedule.repeat(schedule.every(17).minutes) -def save_chat_session(): - # No need to create empty log file - if not ( - state.processor_config - and state.processor_config.conversation - 
and state.processor_config.conversation.meta_log - and state.processor_config.conversation.chat_session - ): - return - - # Summarize Conversation Logs for this Session - conversation_log = state.processor_config.conversation.meta_log - session = { - "session-start": conversation_log.get("session", [{"session-end": 0}])[-1]["session-end"], - "session-end": len(conversation_log["chat"]), - } - if "session" in conversation_log: - conversation_log["session"].append(session) - else: - conversation_log["session"] = [session] - - # Save Conversation Metadata Logs to Disk - conversation_logfile = resolve_absolute_path(state.processor_config.conversation.conversation_logfile) - conversation_logfile.parent.mkdir(parents=True, exist_ok=True) # create conversation directory if doesn't exist - with open(conversation_logfile, "w+", encoding="utf-8") as logfile: - json.dump(conversation_log, logfile, indent=2) - - state.processor_config.conversation.chat_session = [] - logger.info("📩 Saved current chat session to conversation logs") + return Enum("SearchType", core_search_types) @schedule.repeat(schedule.every(59).minutes) diff --git a/src/khoj/interface/web/assets/icons/computer.png b/src/khoj/interface/web/assets/icons/computer.png new file mode 100644 index 00000000..12473485 Binary files /dev/null and b/src/khoj/interface/web/assets/icons/computer.png differ diff --git a/src/khoj/interface/web/assets/icons/copy-solid.svg b/src/khoj/interface/web/assets/icons/copy-solid.svg new file mode 100644 index 00000000..da7020be --- /dev/null +++ b/src/khoj/interface/web/assets/icons/copy-solid.svg @@ -0,0 +1 @@ + diff --git a/src/khoj/interface/web/assets/icons/credit-card.png b/src/khoj/interface/web/assets/icons/credit-card.png new file mode 100644 index 00000000..487dba5c Binary files /dev/null and b/src/khoj/interface/web/assets/icons/credit-card.png differ diff --git a/src/khoj/interface/web/assets/icons/key.svg b/src/khoj/interface/web/assets/icons/key.svg new file mode 100644 
index 00000000..437688fb --- /dev/null +++ b/src/khoj/interface/web/assets/icons/key.svg @@ -0,0 +1,4 @@ + + + + diff --git a/src/khoj/interface/web/assets/icons/khoj-logo-sideways-500.png b/src/khoj/interface/web/assets/icons/khoj-logo-sideways-500.png index 56648932..765d6e33 100644 Binary files a/src/khoj/interface/web/assets/icons/khoj-logo-sideways-500.png and b/src/khoj/interface/web/assets/icons/khoj-logo-sideways-500.png differ diff --git a/src/khoj/interface/web/assets/icons/trash-solid.svg b/src/khoj/interface/web/assets/icons/trash-solid.svg new file mode 100644 index 00000000..768d80f8 --- /dev/null +++ b/src/khoj/interface/web/assets/icons/trash-solid.svg @@ -0,0 +1 @@ + diff --git a/src/khoj/interface/web/assets/khoj.css b/src/khoj/interface/web/assets/khoj.css index a84d562f..a29c02a6 100644 --- a/src/khoj/interface/web/assets/khoj.css +++ b/src/khoj/interface/web/assets/khoj.css @@ -2,29 +2,44 @@ /* Can be forced with data-theme="light" */ [data-theme="light"], :root:not([data-theme="dark"]) { - --primary: #ffb300; - --primary-hover: #ffa000; + --primary: #fee285; + --primary-hover: #fcc50b; --primary-focus: rgba(255, 179, 0, 0.125); --primary-inverse: rgba(0, 0, 0, 0.75); + --background-color: #f5f4f3; + --main-text-color: #475569; + --water: #44b9da; + --leaf: #7b990a; + --flower: #ffaeae; } /* Amber Dark scheme (Auto) */ /* Automatically enabled if user has Dark mode enabled */ @media only screen and (prefers-color-scheme: dark) { :root:not([data-theme]) { - --primary: #ffb300; - --primary-hover: #ffc107; + --primary: #fee285; + --primary-hover: #fcc50b; --primary-focus: rgba(255, 179, 0, 0.25); --primary-inverse: rgba(0, 0, 0, 0.75); + --background-color: #f5f4f3; + --main-text-color: #475569; + --water: #44b9da; + --leaf: #7b990a; + --flower: #ffaeae; } } /* Amber Dark scheme (Forced) */ /* Enabled if forced with data-theme="dark" */ [data-theme="dark"] { - --primary: #ffb300; - --primary-hover: #ffc107; + --primary: #fee285; + --primary-hover: 
#fcc50b; --primary-focus: rgba(255, 179, 0, 0.25); --primary-inverse: rgba(0, 0, 0, 0.75); + --background-color: #f5f4f3; + --main-text-color: #475569; + --water: #44b9da; + --leaf: #7b990a; + --flower: #ffaeae; } /* Amber (Common styles) */ :root { @@ -37,8 +52,11 @@ .khoj-configure { display: grid; grid-template-columns: 1fr; - padding: 0 24px; + font-family: roboto, karma, segoe ui, sans-serif; + font-weight: 300; } + +.khoj-footer, .khoj-header { display: grid; grid-auto-flow: column; @@ -46,6 +64,9 @@ padding: 16px 0; margin: 0 0 16px 0; } +.khoj-footer { + margin: 16px 0 0 0; +} nav.khoj-nav { display: grid; @@ -64,7 +85,7 @@ a.khoj-logo { } .khoj-nav a { - color: #333; + color: var(--main-text-color); text-decoration: none; font-size: 20px; font-weight: normal; @@ -85,22 +106,88 @@ img.khoj-logo { justify-self: center; } -a.khoj-banner { +/* Dropdown in navigation menu*/ +#khoj-nav-menu-container { + display: flex; + align-items: center; +} +.khoj-nav-dropdown-content { + display: block; + grid-auto-flow: row; + position: absolute; + background-color: var(--background-color); + min-width: 160px; + box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2); + right: 15vw; + top: 64px; + z-index: 1; + opacity: 0; + transition: opacity 0.1s ease-in-out; + pointer-events: none; + text-align: left; +} +.khoj-nav-dropdown-content.show { + opacity: 1; + pointer-events: auto; +} +.khoj-nav-dropdown-content a { color: black; + padding: 12px 16px; text-decoration: none; + display: block; +} +.khoj-nav-dropdown-content a:hover { + background-color: var(--primary-hover); +} +.khoj-nav-username { + padding: 12px 16px; + text-decoration: none; + display: block; + font-weight: bold; +} +.circle { + border-radius: 50%; + border: 3px dotted var(--main-text-color); + width: 40px; + height: 40px; + padding: 3px; + cursor: pointer; +} +.circle:hover { + background-color: var(--primary-hover); +} +.user-initial { + background-color: var(--background-color); + color: black; + display: grid; + 
justify-content: center; + align-items: center; + font-size: 20px; + box-sizing: unset; + width: 40px; + height: 40px; +} +.subscribed { + border: 3px solid var(--primary-hover); } -p.khoj-banner { - font-size: medium; - margin: 0; - padding: 10px; +@media screen and (max-width: 700px) { + .khoj-nav-dropdown-content { + display: block; + grid-auto-flow: row; + position: absolute; + background-color: var(--background-color); + min-width: 160px; + box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2); + right: 10px; + z-index: 1; + opacity: 0; + transition: opacity 0.1s ease-in-out; + pointer-events: none; + } } -p#khoj-banner { - display: inline; -} - -@media only screen and (max-width: 600px) { +@media only screen and (max-width: 700px) { div.khoj-header { display: grid; grid-auto-flow: column; diff --git a/src/khoj/interface/web/assets/utils.js b/src/khoj/interface/web/assets/utils.js new file mode 100644 index 00000000..0d513146 --- /dev/null +++ b/src/khoj/interface/web/assets/utils.js @@ -0,0 +1,31 @@ +// Toggle the navigation menu +function toggleMenu() { + var menu = document.getElementById("khoj-nav-menu"); + menu.classList.toggle("show"); +} + +// Close the dropdown menu if the user clicks outside of it +document.addEventListener('click', function(event) { + let menu = document.getElementById("khoj-nav-menu"); + let menuContainer = document.getElementById("khoj-nav-menu-container"); + let isClickOnMenu = menuContainer.contains(event.target) || menuContainer === event.target; + if (isClickOnMenu === false && menu.classList.contains("show")) { + menu.classList.remove("show"); + } +}); + +console.log(`%c %s`, "font-family:monospace", ` + __ __ __ __ ______ __ _____ __ +/\\ \\/ / /\\ \\_\\ \\ /\\ __ \\ /\\ \\ /\\ __ \\ /\\ \\ +\\ \\ _"-. 
\\ \\ __ \\ \\ \\ \\/\\ \\ _\\_\\ \\ \\ \\ __ \\ \\ \\ \\ + \\ \\_\\ \\_\\ \\ \\_\\ \\_\\ \\ \\_____\\ /\\_____\\ \\ \\_\\ \\_\\ \\ \\_\\ + \\/_/\\/_/ \\/_/\\/_/ \\/_____/ \\/_____/ \\/_/\\/_/ \\/_/ + + +Greetings traveller, + +I am ✨Khoj✨, your open-source, personal AI copilot. + +See my source code at https://github.com/khoj-ai/khoj +Read my operating manual at https://docs.khoj.dev +`); diff --git a/src/khoj/interface/web/base_config.html b/src/khoj/interface/web/base_config.html index 5b643d58..d9546249 100644 --- a/src/khoj/interface/web/base_config.html +++ b/src/khoj/interface/web/base_config.html @@ -8,19 +8,15 @@ +
-
- - -
+ + + {% import 'utils.html' as utils %} + {{ utils.heading_pane(user_photo, username, is_active, has_documents) }} +
@@ -28,6 +24,9 @@ {% endblock %}
+ diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html index 2230e901..82e3233d 100644 --- a/src/khoj/interface/web/chat.html +++ b/src/khoj/interface/web/chat.html @@ -8,7 +8,18 @@ + -
- {% if demo %} - - -

- Enroll in Khoj cloud to get your own assistant -

-
- - - {% endif %} +
+ -
- {% if demo %} - - {% else %} - - {% endif %} - -
+ {% import 'utils.html' as utils %} + {{ utils.heading_pane(user_photo, username, is_active, has_documents) }}
@@ -295,11 +383,12 @@ - + - diff --git a/src/khoj/interface/web/config.html b/src/khoj/interface/web/config.html index d41ca26b..01a3786f 100644 --- a/src/khoj/interface/web/config.html +++ b/src/khoj/interface/web/config.html @@ -2,435 +2,335 @@ {% block content %}
-
-

Plugins

+
+

Content

+
+ Computer +

+ Files + Configured +

+
+
+

Manage files from your computer

+
+ +
+
Github

Github - {% if current_config.content_type.github %} - {% if current_model_state.github == False %} - Not Configured - {% else %} - Configured - {% endif %} - {% endif %} + Configured

Set repositories to index

- {% if current_config.content_type.github %} -
+
- {% endif %} +
Notion

Notion - {% if current_config.content_type.notion %} - {% if current_model_state.notion == False %} - Not Configured - {% else %} - Configured - {% endif %} - {% endif %} + Configured

-

Configure your settings from Notion

+

Sync your Notion pages

- {% if current_config.content_type.notion %} -
+
- {% endif %} +
-
-
- markdown -

- Markdown - {% if current_config.content_type.markdown %} - {% if current_model_state.markdown == False%} - Not Configured - {% else %} - Configured - {% endif %} - {% endif %} -

+
+
+ +
+
+
+
+
-
-

Set markdown files to index

+
+
- - {% if current_config.content_type.markdown %} -
- -
- {% endif %} -
-
-
- org -

- Org - {% if current_config.content_type.org %} - {% if current_model_state.org == False %} - Not Configured - {% else %} - Configured - {% endif %} - {% endif %} -

-
-
-

Set org files to index

-
- - {% if current_config.content_type.org %} -
- -
- {% endif %} -
-
-
- PDF -

- PDF - {% if current_config.content_type.pdf %} - {% if current_model_state.pdf == False %} - Not Configured - {% else %} - Configured - {% endif %} - {% endif %} -

-
-
-

Set PDF files to index

-
- - {% if current_config.content_type.pdf %} -
- -
- {% endif %} -
-
-
- Plaintext -

- Plaintext - {% if current_config.content_type.plaintext %} - {% if current_model_state.plaintext == False %} - Not Configured - {% else %} - Configured - {% endif %} - {% endif %} -

-
-
-

Set Plaintext files to index

-
- - {% if current_config.content_type.plaintext %} -
- -
- {% endif %}
-
+

Features

- Chat + Chat

Chat - {% if current_config.processor and current_config.processor.conversation.openai %} - {% if current_model_state.conversation_openai == False %} - Not Configured - {% else %} - Configured - {% endif %} - {% endif %}

-

Setup online chat using OpenAI

+
+
+
+
+
+

Clients

+
+
+ API Key +

API Keys

+
+
+

Manage access from your client apps to Khoj

+
+ + + + + + + + + +
NameKeyActions
+
+ +
+
+
+ {% if billing_enabled %} +
+

Billing

+
+ -
-
- Chat -

- Offline Chat - Configured - {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat and not current_model_state.conversation_gpt4all %} - Not Configured - {% endif %} -

-
-
-

Setup offline chat

-
-
- -
-
- -
-
-
-
-
-
-
-
-
- - -
- -
-
-
-
- -
-
-
+ {% endif %} +
{% endblock %} diff --git a/src/khoj/interface/web/content_source_computer_input.html b/src/khoj/interface/web/content_source_computer_input.html new file mode 100644 index 00000000..72aa3810 --- /dev/null +++ b/src/khoj/interface/web/content_source_computer_input.html @@ -0,0 +1,130 @@ +{% extends "base_config.html" %} +{% block content %} +
+
+

+ files + Files +
+

Manage files from your computer

+

Download the Khoj Desktop app to sync documents from your computer

+
+

+
+
+ +
+
+
+
+
+
+ + +{% endblock %} diff --git a/src/khoj/interface/web/content_type_github_input.html b/src/khoj/interface/web/content_source_github_input.html similarity index 75% rename from src/khoj/interface/web/content_type_github_input.html rename to src/khoj/interface/web/content_source_github_input.html index 1e5d2bc4..cce0d083 100644 --- a/src/khoj/interface/web/content_type_github_input.html +++ b/src/khoj/interface/web/content_source_github_input.html @@ -38,24 +38,6 @@ {% endfor %}
- - - - - - - - - -
- - - -
- - - -
@@ -64,6 +46,9 @@
+ + diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/search.html similarity index 80% rename from src/khoj/interface/web/index.html rename to src/khoj/interface/web/search.html index cb2bae49..5331ea92 100644 --- a/src/khoj/interface/web/index.html +++ b/src/khoj/interface/web/search.html @@ -10,6 +10,7 @@ + - {% if demo %} - - - {% endif %} -
- {% if demo %} - - {% else %} - - {% endif %} - -
+ {% import 'utils.html' as utils %} + {{ utils.heading_pane(user_photo, username, is_active, has_documents) }} @@ -297,9 +284,12 @@
+ - diff --git a/src/khoj/interface/web/utils.html b/src/khoj/interface/web/utils.html new file mode 100644 index 00000000..4579368b --- /dev/null +++ b/src/khoj/interface/web/utils.html @@ -0,0 +1,34 @@ +{% macro heading_pane(user_photo, username, is_active, has_documents) -%} +
+ + +
+{%- endmacro %} diff --git a/src/khoj/main.py b/src/khoj/main.py index 4c759c2a..9fa65fc3 100644 --- a/src/khoj/main.py +++ b/src/khoj/main.py @@ -3,11 +3,6 @@ import os import sys import locale -if sys.stdout is None: - sys.stdout = open(os.devnull, "w") -if sys.stderr is None: - sys.stderr = open(os.devnull, "w") - import logging import threading import warnings @@ -19,19 +14,32 @@ warnings.filterwarnings("ignore", message=r"legacy way to download files from th # External Packages import uvicorn +import django from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles from rich.logging import RichHandler import schedule -# Internal Packages -from khoj.configure import configure_routes, initialize_server -from khoj.utils import state -from khoj.utils.cli import cli +from django.core.asgi import get_asgi_application +from django.core.management import call_command + +# Initialize Django +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "app.settings") +django.setup() + +# Initialize Django Database +call_command("migrate", "--noinput") + +# Initialize Django Static Files +call_command("collectstatic", "--noinput") # Initialize the Application Server app = FastAPI() +# Get Django Application +django_app = get_asgi_application() + # Add CORS middleware app.add_middleware( CORSMiddleware, @@ -44,6 +52,12 @@ app.add_middleware( # Set Locale locale.setlocale(locale.LC_ALL, "") +# Internal Packages. We do this after setting up Django so that Django features are accessible to the app. 
+from khoj.configure import configure_routes, initialize_server, configure_middleware +from khoj.utils import state +from khoj.utils.cli import cli +from khoj.utils.initialization import initialization + # Setup Logger rich_handler = RichHandler(rich_tracebacks=True) rich_handler.setFormatter(fmt=logging.Formatter(fmt="%(message)s", datefmt="[%X]")) @@ -52,7 +66,7 @@ logging.basicConfig(handlers=[rich_handler]) logger = logging.getLogger("khoj") -def run(): +def run(should_start_server=True): # Turn Tokenizers Parallelism Off. App does not support it. os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -61,8 +75,7 @@ def run(): args = cli(state.cli_args) set_state(args) - # Create app directory, if it doesn't exist - state.config_file.parent.mkdir(parents=True, exist_ok=True) + logger.info(f"🚒 Initializing Khoj v{state.khoj_version}") # Set Logging Level if args.verbose == 0: @@ -70,6 +83,11 @@ def run(): elif args.verbose >= 1: logger.setLevel(logging.DEBUG) + initialization() + + # Create app directory, if it doesn't exist + state.config_file.parent.mkdir(parents=True, exist_ok=True) + # Set Log File fh = logging.FileHandler(state.config_file.parent / "khoj.log", encoding="utf-8") fh.setLevel(logging.DEBUG) @@ -82,8 +100,22 @@ def run(): # Start Server configure_routes(app) - initialize_server(args.config, required=False) - start_server(app, host=args.host, port=args.port, socket=args.socket) + + # Mount Django and Static Files + app.mount("/server", django_app, name="server") + static_dir = "static" + if not os.path.exists(static_dir): + os.mkdir(static_dir) + app.mount(f"/{static_dir}", StaticFiles(directory=static_dir), name=static_dir) + + # Configure Middleware + configure_middleware(app) + + initialize_server(args.config) + + # If the server is started through gunicorn (external to the script), don't start the server + if should_start_server: + start_server(app, host=args.host, port=args.port, socket=args.socket) def set_state(args): @@ -92,7 +124,7 @@ def 
set_state(args): state.verbose = args.verbose state.host = args.host state.port = args.port - state.demo = args.demo + state.anonymous_mode = args.anonymous_mode state.khoj_version = version("khoj-assistant") state.chat_on_gpu = args.chat_on_gpu @@ -115,3 +147,5 @@ def poll_task_scheduler(): if __name__ == "__main__": run() +else: + run(should_start_server=False) diff --git a/src/khoj/migrations/migrate_server_pg.py b/src/khoj/migrations/migrate_server_pg.py new file mode 100644 index 00000000..434e27d7 --- /dev/null +++ b/src/khoj/migrations/migrate_server_pg.py @@ -0,0 +1,139 @@ +""" +The application config currently looks like this: +app: + should-log-telemetry: true +content-type: + ... +processor: + conversation: + conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json + max-prompt-size: null + offline-chat: + chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf + enable-offline-chat: false + openai: + api-key: sk-blah + chat-model: gpt-3.5-turbo + tokenizer: null +search-type: + asymmetric: + cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2 + encoder: sentence-transformers/multi-qa-MiniLM-L6-cos-v1 + encoder-type: null + model-directory: /Users/si/.khoj/search/asymmetric + image: + encoder: sentence-transformers/clip-ViT-B-32 + encoder-type: null + model-directory: /Users/si/.khoj/search/image + symmetric: + cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2 + encoder: sentence-transformers/all-MiniLM-L6-v2 + encoder-type: null + model-directory: ~/.khoj/search/symmetric +version: 0.14.0 + + +The new version will looks like this: +app: + should-log-telemetry: true +processor: + conversation: + offline-chat: + enabled: false + openai: + api-key: sk-blah + chat-model-options: + - chat-model: gpt-3.5-turbo + tokenizer: null + type: openai + - chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf + tokenizer: null + type: offline +search-type: + asymmetric: + cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2 + encoder: 
sentence-transformers/multi-qa-MiniLM-L6-cos-v1 +version: 0.15.0 +""" + +import logging +from packaging import version + +from khoj.utils.yaml import load_config_from_file, save_config_to_file +from database.models import ( + OpenAIProcessorConversationConfig, + OfflineChatProcessorConversationConfig, + ChatModelOptions, + SearchModelConfig, +) + +logger = logging.getLogger(__name__) + + +def migrate_server_pg(args): + schema_version = "0.15.0" + raw_config = load_config_from_file(args.config_file) + previous_version = raw_config.get("version") + + if previous_version is None or version.parse(previous_version) < version.parse(schema_version): + logger.info( + f"Migrating configuration used for version {previous_version} to latest version for server with postgres in {args.version_no}" + ) + raw_config["version"] = schema_version + + if raw_config is None: + return args + + if "search-type" in raw_config and raw_config["search-type"]: + if "asymmetric" in raw_config["search-type"]: + # Delete all existing search models + SearchModelConfig.objects.filter(model_type=SearchModelConfig.ModelType.TEXT).delete() + # Create new search model from existing Khoj YAML config + asymmetric_search = raw_config["search-type"]["asymmetric"] + SearchModelConfig.objects.create( + name="default", + model_type=SearchModelConfig.ModelType.TEXT, + bi_encoder=asymmetric_search.get("encoder"), + cross_encoder=asymmetric_search.get("cross-encoder"), + ) + + if "processor" in raw_config and raw_config["processor"] and "conversation" in raw_config["processor"]: + processor_conversation = raw_config["processor"]["conversation"] + + if "offline-chat" in raw_config["processor"]["conversation"]: + offline_chat = raw_config["processor"]["conversation"]["offline-chat"] + OfflineChatProcessorConversationConfig.objects.create( + enabled=offline_chat.get("enable-offline-chat"), + ) + ChatModelOptions.objects.create( + chat_model=offline_chat.get("chat-model"), + 
tokenizer=processor_conversation.get("tokenizer"), + max_prompt_size=processor_conversation.get("max-prompt-size"), + model_type=ChatModelOptions.ModelType.OFFLINE, + ) + + if ( + "openai" in raw_config["processor"]["conversation"] + and raw_config["processor"]["conversation"]["openai"] + ): + openai = raw_config["processor"]["conversation"]["openai"] + + if openai.get("api-key") is None: + logger.error("OpenAI API Key is not set. Will not be migrating OpenAI config.") + else: + if openai.get("chat-model") is None: + openai["chat-model"] = "gpt-3.5-turbo" + + OpenAIProcessorConversationConfig.objects.create( + api_key=openai.get("api-key"), + ) + ChatModelOptions.objects.create( + chat_model=openai.get("chat-model"), + tokenizer=processor_conversation.get("tokenizer"), + max_prompt_size=processor_conversation.get("max-prompt-size"), + model_type=ChatModelOptions.ModelType.OPENAI, + ) + + save_config_to_file(raw_config, args.config_file) + + return args diff --git a/src/khoj/processor/conversation/gpt4all/chat_model.py b/src/khoj/processor/conversation/gpt4all/chat_model.py index 04a004f0..d3eaa01a 100644 --- a/src/khoj/processor/conversation/gpt4all/chat_model.py +++ b/src/khoj/processor/conversation/gpt4all/chat_model.py @@ -55,10 +55,10 @@ def extract_questions_offline( last_year = datetime.now().year - 1 last_christmas_date = f"{last_year}-12-25" next_christmas_date = f"{datetime.now().year}-12-25" - system_prompt = prompts.extract_questions_system_prompt_llamav2.format( - message=(prompts.system_prompt_message_extract_questions_llamav2) + system_prompt = prompts.system_prompt_extract_questions_gpt4all.format( + message=(prompts.system_prompt_message_extract_questions_gpt4all) ) - example_questions = prompts.extract_questions_llamav2_sample.format( + example_questions = prompts.extract_questions_gpt4all_sample.format( query=text, chat_history=chat_history, current_date=current_date, @@ -150,14 +150,14 @@ def converse_offline( elif conversation_command == 
ConversationCommand.General or is_none_or_empty(compiled_references_message): conversation_primer = user_query else: - conversation_primer = prompts.notes_conversation_llamav2.format( + conversation_primer = prompts.notes_conversation_gpt4all.format( query=user_query, references=compiled_references_message ) # Setup Prompt with Primer or Conversation History messages = generate_chatml_messages_with_context( conversation_primer, - prompts.system_prompt_message_llamav2, + prompts.system_prompt_message_gpt4all, conversation_log, model_name=model, max_prompt_size=max_prompt_size, @@ -183,16 +183,16 @@ def llm_thread(g, messages: List[ChatMessage], model: Any): conversation_history = messages[1:-1] formatted_messages = [ - prompts.chat_history_llamav2_from_assistant.format(message=message.content) + prompts.khoj_message_gpt4all.format(message=message.content) if message.role == "assistant" - else prompts.chat_history_llamav2_from_user.format(message=message.content) + else prompts.user_message_gpt4all.format(message=message.content) for message in conversation_history ] stop_words = [""] chat_history = "".join(formatted_messages) - templated_system_message = prompts.system_prompt_llamav2.format(message=system_message.content) - templated_user_message = prompts.general_conversation_llamav2.format(query=user_message.content) + templated_system_message = prompts.system_prompt_gpt4all.format(message=system_message.content) + templated_user_message = prompts.user_message_gpt4all.format(message=user_message.content) prompted_message = templated_system_message + chat_history + templated_user_message state.chat_lock.acquire() diff --git a/src/khoj/processor/conversation/gpt4all/utils.py b/src/khoj/processor/conversation/gpt4all/utils.py index 45a1158e..0b876b26 100644 --- a/src/khoj/processor/conversation/gpt4all/utils.py +++ b/src/khoj/processor/conversation/gpt4all/utils.py @@ -2,24 +2,33 @@ import logging from khoj.utils import state - logger = logging.getLogger(__name__) 
def download_model(model_name: str): try: - from gpt4all import GPT4All + import gpt4all except ModuleNotFoundError as e: logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.") raise e - # Use GPU for Chat Model, if available - try: - device = "gpu" if state.chat_on_gpu else "cpu" - model = GPT4All(model_name=model_name, device=device) - logger.debug(f"Loaded {model_name} chat model to {device.upper()}") - except ValueError: - model = GPT4All(model_name=model_name) - logger.debug(f"Loaded {model_name} chat model to CPU.") + # Download the chat model + chat_model_config = gpt4all.GPT4All.retrieve_model(model_name=model_name, allow_download=True) - return model + # Decide whether to load model to GPU or CPU + try: + # Try load chat model to GPU if: + # 1. Loading chat model to GPU isn't disabled via CLI and + # 2. Machine has GPU + # 3. GPU has enough free memory to load the chat model + device = ( + "gpu" if state.chat_on_gpu and gpt4all.pyllmodel.LLModel().list_gpu(chat_model_config["path"]) else "cpu" + ) + except ValueError: + device = "cpu" + + # Now load the downloaded chat model onto appropriate device + chat_model = gpt4all.GPT4All(model_name=model_name, device=device, allow_download=False) + logger.debug(f"Loaded chat model to {device.upper()}.") + + return chat_model diff --git a/src/khoj/processor/conversation/openai/gpt.py b/src/khoj/processor/conversation/openai/gpt.py index 73b4f176..31cfda1e 100644 --- a/src/khoj/processor/conversation/openai/gpt.py +++ b/src/khoj/processor/conversation/openai/gpt.py @@ -1,5 +1,6 @@ # Standard Packages import logging +import json from datetime import datetime, timedelta from typing import Optional @@ -20,27 +21,6 @@ from khoj.utils.helpers import ConversationCommand, is_none_or_empty logger = logging.getLogger(__name__) -def summarize(session, model, api_key=None, temperature=0.5, max_tokens=200): - """ - Summarize conversation session using the specified OpenAI chat 
model - """ - messages = [ChatMessage(content=prompts.summarize_chat.format(), role="system")] + session - - # Get Response from GPT - logger.debug(f"Prompt for GPT: {messages}") - response = completion_with_backoff( - messages=messages, - model_name=model, - temperature=temperature, - max_tokens=max_tokens, - model_kwargs={"stop": ['"""'], "frequency_penalty": 0.2}, - openai_api_key=api_key, - ) - - # Extract, Clean Message from GPT's Response - return str(response.content).replace("\n\n", "") - - def extract_questions( text, model: Optional[str] = "gpt-4", @@ -52,6 +32,10 @@ def extract_questions( """ Infer search queries to retrieve relevant notes to answer user query """ + + def _valid_question(question: str): + return not is_none_or_empty(question) and question != "[]" + # Extract Past User Message and Inferred Questions from Conversation Log chat_history = "".join( [ @@ -91,7 +75,7 @@ def extract_questions( # Extract, Clean Message from GPT's Response try: - questions = ( + split_questions = ( response.content.strip(empty_escape_sequences) .replace("['", '["') .replace("']", '"]') @@ -100,9 +84,18 @@ def extract_questions( .replace('"]', "") .split('", "') ) + questions = [] + + for question in split_questions: + if question not in questions and _valid_question(question): + questions.append(question) + + if is_none_or_empty(questions): + raise ValueError("GPT returned empty JSON") except: logger.warning(f"GPT returned invalid JSON. 
Falling back to using user message as search query.\n{response}") questions = [text] + logger.debug(f"Extracted Questions by GPT: {questions}") return questions @@ -131,16 +124,14 @@ def converse( completion_func(chat_response=prompts.no_notes_found.format()) return iter([prompts.no_notes_found.format()]) elif conversation_command == ConversationCommand.General or is_none_or_empty(compiled_references): - conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query) + conversation_primer = prompts.general_conversation.format(query=user_query) else: - conversation_primer = prompts.notes_conversation.format( - current_date=current_date, query=user_query, references=compiled_references - ) + conversation_primer = prompts.notes_conversation.format(query=user_query, references=compiled_references) # Setup Prompt with Primer or Conversation History messages = generate_chatml_messages_with_context( conversation_primer, - prompts.personality.format(), + prompts.personality.format(current_date=current_date), conversation_log, model, max_prompt_size, @@ -157,4 +148,5 @@ def converse( temperature=temperature, openai_api_key=api_key, completion_func=completion_func, + model_kwargs={"stop": ["Notes:\n["]}, ) diff --git a/src/khoj/processor/conversation/openai/utils.py b/src/khoj/processor/conversation/openai/utils.py index 130532e0..dce72e1f 100644 --- a/src/khoj/processor/conversation/openai/utils.py +++ b/src/khoj/processor/conversation/openai/utils.py @@ -69,15 +69,15 @@ def completion_with_backoff(**kwargs): reraise=True, ) def chat_completion_with_backoff( - messages, compiled_references, model_name, temperature, openai_api_key=None, completion_func=None + messages, compiled_references, model_name, temperature, openai_api_key=None, completion_func=None, model_kwargs=None ): g = ThreadedGenerator(compiled_references, completion_func=completion_func) - t = Thread(target=llm_thread, args=(g, messages, model_name, temperature, 
openai_api_key)) + t = Thread(target=llm_thread, args=(g, messages, model_name, temperature, openai_api_key, model_kwargs)) t.start() return g -def llm_thread(g, messages, model_name, temperature, openai_api_key=None): +def llm_thread(g, messages, model_name, temperature, openai_api_key=None, model_kwargs=None): callback_handler = StreamingChatCallbackHandler(g) chat = ChatOpenAI( streaming=True, @@ -86,6 +86,7 @@ def llm_thread(g, messages, model_name, temperature, openai_api_key=None): model_name=model_name, # type: ignore temperature=temperature, openai_api_key=openai_api_key or os.getenv("OPENAI_API_KEY"), + model_kwargs=model_kwargs, request_timeout=20, max_retries=1, client=None, diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index d487609d..f6b84804 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -4,30 +4,50 @@ from langchain.prompts import PromptTemplate ## Personality ## -- -personality = PromptTemplate.from_template("You are Khoj, a smart, inquisitive and helpful personal assistant.") +personality = PromptTemplate.from_template( + """ +You are Khoj, a smart, inquisitive and helpful personal assistant. +Use your general knowledge and the past conversation with the user as context to inform your responses. +You were created by Khoj Inc. with the following capabilities: +- You *CAN REMEMBER ALL NOTES and PERSONAL INFORMATION FOREVER* that the user ever shares with you. +- You cannot set reminders. +- Say "I don't know" or "I don't understand" if you don't know what to say or if you don't know the answer to a question. +- Ask crisp follow-up questions to get additional context, when the answer cannot be inferred from the provided notes or past conversations. +- Sometimes the user will share personal information that needs to be remembered, like an account ID or a residential address. These can be acknowledged with a simple "Got it" or "Okay". 
+ +Note: More information about you, the company or other Khoj apps can be found at https://khoj.dev. +Today is {current_date} in UTC. +""".strip() +) ## General Conversation ## -- general_conversation = PromptTemplate.from_template( """ -Using your general knowledge and our past conversations as context, answer the following question. -Current Date: {current_date} - -Question: {query} +{query} """.strip() ) + no_notes_found = PromptTemplate.from_template( """ I'm sorry, I couldn't find any relevant notes to respond to your message. """.strip() ) -system_prompt_message_llamav2 = f"""You are Khoj, a smart, inquisitive and helpful personal assistant. +no_entries_found = PromptTemplate.from_template( + """ + It looks like you haven't added any notes yet. No worries, you can fix that by downloading the Khoj app from here. +""".strip() +) + +## Conversation Prompts for GPT4All Models +## -- +system_prompt_message_gpt4all = f"""You are Khoj, a smart, inquisitive and helpful personal assistant. Using your general knowledge and our past conversations as context, answer the following question. If you do not know the answer, say 'I don't know.'""" -system_prompt_message_extract_questions_llamav2 = f"""You are Khoj, a kind and intelligent personal assistant. When the user asks you a question, you ask follow-up questions to clarify the necessary information you need in order to answer from the user's perspective. +system_prompt_message_extract_questions_gpt4all = f"""You are Khoj, a kind and intelligent personal assistant. When the user asks you a question, you ask follow-up questions to clarify the necessary information you need in order to answer from the user's perspective. - Write the question as if you can search for the answer on the user's personal notes. - Try to be as specific as possible. Instead of saying "they" or "it" or "he", use the name of the person or thing you are referring to. 
For example, instead of saying "Which store did they go to?", say "Which store did Alice and Bob go to?". - Add as much context from the previous questions and notes as required into your search queries. @@ -35,61 +55,47 @@ system_prompt_message_extract_questions_llamav2 = f"""You are Khoj, a kind and i What follow-up questions, if any, will you need to ask to answer the user's question? """ -system_prompt_llamav2 = PromptTemplate.from_template( +system_prompt_gpt4all = PromptTemplate.from_template( """ [INST] <> {message} <>Hi there! [/INST] Hello! How can I help you today? """ ) -extract_questions_system_prompt_llamav2 = PromptTemplate.from_template( +system_prompt_extract_questions_gpt4all = PromptTemplate.from_template( """ [INST] <> {message} <>[/INST]""" ) -general_conversation_llamav2 = PromptTemplate.from_template( - """ -[INST] {query} [/INST] -""".strip() -) - -chat_history_llamav2_from_user = PromptTemplate.from_template( +user_message_gpt4all = PromptTemplate.from_template( """ [INST] {message} [/INST] """.strip() ) -chat_history_llamav2_from_assistant = PromptTemplate.from_template( +khoj_message_gpt4all = PromptTemplate.from_template( """ {message} """.strip() ) -conversation_llamav2 = PromptTemplate.from_template( - """ -[INST] {query} [/INST] -""".strip() -) - ## Notes Conversation ## -- notes_conversation = PromptTemplate.from_template( """ -Using my personal notes and our past conversations as context, answer the following question. -Ask crisp follow-up questions to get additional context, when the answer cannot be inferred from the provided notes or past conversations. -These questions should end with a question mark. -Current Date: {current_date} +Use my personal notes and our past conversations to inform your response. +Ask crisp follow-up questions to get additional context, when a helpful response cannot be provided from the provided notes or past conversations. 
Notes: {references} -Question: {query} +Query: {query} """.strip() ) -notes_conversation_llamav2 = PromptTemplate.from_template( +notes_conversation_gpt4all = PromptTemplate.from_template( """ User's Notes: {references} @@ -98,13 +104,6 @@ Question: {query} ) -## Summarize Chat -## -- -summarize_chat = PromptTemplate.from_template( - f"{personality.format()} Summarize the conversation from your first person perspective" -) - - ## Summarize Notes ## -- summarize_notes = PromptTemplate.from_template( @@ -132,7 +131,10 @@ Question: {user_query} Answer (in second person):""" ) -extract_questions_llamav2_sample = PromptTemplate.from_template( + +## Extract Questions +## -- +extract_questions_gpt4all_sample = PromptTemplate.from_template( """ [INST] <>Current Date: {current_date}<> [/INST] [INST] How was my trip to Cambodia? [/INST] @@ -157,8 +159,6 @@ Use these notes from the user's previous conversations to provide a response: ) -## Extract Questions -## -- extract_questions = PromptTemplate.from_template( """ You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the user's notes. @@ -255,7 +255,7 @@ help_message = PromptTemplate.from_template( **/default**: Chat using your knowledge base and Khoj's general knowledge for context. **/help**: Show this help message. -You are using the **{model}** model. +You are using the **{model}** model on the **{device}**. 
**version**: {version} """.strip() ) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index b0d401fa..ecd4f8ad 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -151,17 +151,20 @@ def truncate_messages( ) system_message = messages.pop() + assert type(system_message.content) == str system_message_tokens = len(encoder.encode(system_message.content)) - tokens = sum([len(encoder.encode(message.content)) for message in messages]) + tokens = sum([len(encoder.encode(message.content)) for message in messages if type(message.content) == str]) while (tokens + system_message_tokens) > max_prompt_size and len(messages) > 1: messages.pop() - tokens = sum([len(encoder.encode(message.content)) for message in messages]) + assert type(system_message.content) == str + tokens = sum([len(encoder.encode(message.content)) for message in messages if type(message.content) == str]) # Truncate current message if still over max supported prompt size by model if (tokens + system_message_tokens) > max_prompt_size: - current_message = "\n".join(messages[0].content.split("\n")[:-1]) - original_question = "\n".join(messages[0].content.split("\n")[-1:]) + assert type(system_message.content) == str + current_message = "\n".join(messages[0].content.split("\n")[:-1]) if type(messages[0].content) == str else "" + original_question = "\n".join(messages[0].content.split("\n")[-1:]) if type(messages[0].content) == str else "" original_question_tokens = len(encoder.encode(original_question)) remaining_tokens = max_prompt_size - original_question_tokens - system_message_tokens truncated_message = encoder.decode(encoder.encode(current_message)[:remaining_tokens]).strip() diff --git a/src/khoj/processor/embeddings.py b/src/khoj/processor/embeddings.py new file mode 100644 index 00000000..392d402f --- /dev/null +++ b/src/khoj/processor/embeddings.py @@ -0,0 +1,32 @@ +from typing import List + +from 
sentence_transformers import SentenceTransformer, CrossEncoder +from torch import nn + +from khoj.utils.helpers import get_device +from khoj.utils.rawconfig import SearchResponse + + +class EmbeddingsModel: + def __init__(self, model_name: str = "thenlper/gte-small"): + self.encode_kwargs = {"normalize_embeddings": True} + self.model_kwargs = {"device": get_device()} + self.model_name = model_name + self.embeddings_model = SentenceTransformer(self.model_name, **self.model_kwargs) + + def embed_query(self, query): + return self.embeddings_model.encode([query], show_progress_bar=False, **self.encode_kwargs)[0] + + def embed_documents(self, docs): + return self.embeddings_model.encode(docs, show_progress_bar=True, **self.encode_kwargs).tolist() + + +class CrossEncoderModel: + def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"): + self.model_name = model_name + self.cross_encoder_model = CrossEncoder(model_name=self.model_name, device=get_device()) + + def predict(self, query, hits: List[SearchResponse], key: str = "compiled"): + cross_inp = [[query, hit.additional[key]] for hit in hits] + cross_scores = self.cross_encoder_model.predict(cross_inp, activation_fct=nn.Sigmoid()) + return cross_scores diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_entries.py similarity index 83% rename from src/khoj/processor/github/github_to_jsonl.py rename to src/khoj/processor/github/github_to_entries.py index bcd2e530..56279453 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_entries.py @@ -2,7 +2,7 @@ import logging import time from datetime import datetime -from typing import Dict, List, Union +from typing import Dict, List, Union, Tuple # External Packages import requests @@ -10,20 +10,32 @@ import requests # Internal Packages from khoj.utils.helpers import timer from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig -from 
khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl -from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl -from khoj.processor.text_to_jsonl import TextToJsonl -from khoj.utils.jsonl import compress_jsonl_data -from khoj.utils.rawconfig import Entry +from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries +from khoj.processor.org_mode.org_to_entries import OrgToEntries +from khoj.processor.text_to_entries import TextToEntries +from database.models import Entry as DbEntry, GithubConfig, KhojUser logger = logging.getLogger(__name__) -class GithubToJsonl(TextToJsonl): - def __init__(self, config: GithubContentConfig): +class GithubToEntries(TextToEntries): + def __init__(self, config: GithubConfig): super().__init__(config) - self.config = config + raw_repos = config.githubrepoconfig.all() + repos = [] + for repo in raw_repos: + repos.append( + GithubRepoConfig( + name=repo.name, + owner=repo.owner, + branch=repo.branch, + ) + ) + self.config = GithubContentConfig( + pat_token=config.pat_token, + repos=repos, + ) self.session = requests.Session() self.session.headers.update({"Authorization": f"token {self.config.pat_token}"}) @@ -37,7 +49,9 @@ class GithubToJsonl(TextToJsonl): else: return - def process(self, previous_entries=[], files=None, full_corpus=True): + def process( + self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False + ) -> Tuple[int, int]: if self.config.pat_token is None or self.config.pat_token == "": logger.error(f"Github PAT token is not set. Skipping github content") raise ValueError("Github PAT token is not set. 
Skipping github content") @@ -45,7 +59,7 @@ class GithubToJsonl(TextToJsonl): for repo in self.config.repos: current_entries += self.process_repo(repo) - return self.update_entries_with_ids(current_entries, previous_entries) + return self.update_entries_with_ids(current_entries, user=user) def process_repo(self, repo: GithubRepoConfig): repo_url = f"https://api.github.com/repos/{repo.owner}/{repo.name}" @@ -63,43 +77,42 @@ class GithubToJsonl(TextToJsonl): current_entries = [] with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger): - current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps( - *GithubToJsonl.extract_markdown_entries(markdown_files) + current_entries = MarkdownToEntries.convert_markdown_entries_to_maps( + *GithubToEntries.extract_markdown_entries(markdown_files) ) with timer(f"Extract org entries from github repo {repo_shorthand}", logger): - current_entries += OrgToJsonl.convert_org_nodes_to_entries(*GithubToJsonl.extract_org_entries(org_files)) + current_entries += OrgToEntries.convert_org_nodes_to_entries( + *GithubToEntries.extract_org_entries(org_files) + ) with timer(f"Extract commit messages from github repo {repo_shorthand}", logger): current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo) with timer(f"Extract issues from github repo {repo_shorthand}", logger): - issue_entries = GithubToJsonl.convert_issues_to_entries( - *GithubToJsonl.extract_github_issues(self.get_issues(repo_url)) + issue_entries = GithubToEntries.convert_issues_to_entries( + *GithubToEntries.extract_github_issues(self.get_issues(repo_url)) ) current_entries += issue_entries with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger): - current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256) + current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256) return current_entries - def update_entries_with_ids(self, 
current_entries, previous_entries): + def update_entries_with_ids(self, current_entries, user: KhojUser = None): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger + num_new_embeddings, num_deleted_embeddings = self.update_embeddings( + current_entries, + DbEntry.EntryType.GITHUB, + DbEntry.EntrySource.GITHUB, + key="compiled", + logger=logger, + user=user, ) - with timer("Write github entries to JSONL file", logger): - # Process Each Entry from All Notes Files - entries = list(map(lambda entry: entry[1], entries_with_ids)) - jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) - - # Compress JSONL formatted Data - compress_jsonl_data(jsonl_data, self.config.compressed_jsonl) - - return entries_with_ids + return num_new_embeddings, num_deleted_embeddings def get_files(self, repo_url: str, repo: GithubRepoConfig): # Get the contents of the repository @@ -274,7 +287,7 @@ class GithubToJsonl(TextToJsonl): entries = [] entry_to_file_map = [] for doc in markdown_files: - entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file( + entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file( doc["content"], doc["path"], entries, entry_to_file_map ) return entries, dict(entry_to_file_map) @@ -285,7 +298,7 @@ class GithubToJsonl(TextToJsonl): entry_to_file_map = [] for doc in org_files: - entries, entry_to_file_map = OrgToJsonl.process_single_org_file( + entries, entry_to_file_map = OrgToEntries.process_single_org_file( doc["content"], doc["path"], entries, entry_to_file_map ) return entries, dict(entry_to_file_map) diff --git a/src/khoj/processor/jsonl/jsonl_to_jsonl.py b/src/khoj/processor/jsonl/jsonl_to_jsonl.py deleted file mode 100644 index 4a6fab99..00000000 --- a/src/khoj/processor/jsonl/jsonl_to_jsonl.py +++ /dev/null @@ -1,91 
+0,0 @@ -# Standard Packages -import glob -import logging -from pathlib import Path -from typing import List - -# Internal Packages -from khoj.processor.text_to_jsonl import TextToJsonl -from khoj.utils.helpers import get_absolute_path, timer -from khoj.utils.jsonl import load_jsonl, compress_jsonl_data -from khoj.utils.rawconfig import Entry - - -logger = logging.getLogger(__name__) - - -class JsonlToJsonl(TextToJsonl): - # Define Functions - def process(self, previous_entries=[], files: dict[str, str] = {}, full_corpus: bool = True): - # Extract required fields from config - input_jsonl_files, input_jsonl_filter, output_file = ( - self.config.input_files, - self.config.input_filter, - self.config.compressed_jsonl, - ) - - # Get Jsonl Input Files to Process - all_input_jsonl_files = JsonlToJsonl.get_jsonl_files(input_jsonl_files, input_jsonl_filter) - - # Extract Entries from specified jsonl files - with timer("Parse entries from jsonl files", logger): - input_jsons = JsonlToJsonl.extract_jsonl_entries(all_input_jsonl_files) - current_entries = list(map(Entry.from_dict, input_jsons)) - - # Split entries by max tokens supported by model - with timer("Split entries by max token size supported by model", logger): - current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) - - # Identify, mark and merge any new entries with previous entries - with timer("Identify new or updated entries", logger): - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger - ) - - with timer("Write entries to JSONL file", logger): - # Process Each Entry from All Notes Files - entries = list(map(lambda entry: entry[1], entries_with_ids)) - jsonl_data = JsonlToJsonl.convert_entries_to_jsonl(entries) - - # Compress JSONL formatted Data - compress_jsonl_data(jsonl_data, output_file) - - return entries_with_ids - - @staticmethod - def get_jsonl_files(jsonl_files=None, jsonl_file_filters=None): - "Get 
all jsonl files to process" - absolute_jsonl_files, filtered_jsonl_files = set(), set() - if jsonl_files: - absolute_jsonl_files = {get_absolute_path(jsonl_file) for jsonl_file in jsonl_files} - if jsonl_file_filters: - filtered_jsonl_files = { - filtered_file - for jsonl_file_filter in jsonl_file_filters - for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True) - } - - all_jsonl_files = sorted(absolute_jsonl_files | filtered_jsonl_files) - - files_with_non_jsonl_extensions = { - jsonl_file for jsonl_file in all_jsonl_files if not jsonl_file.endswith(".jsonl") - } - if any(files_with_non_jsonl_extensions): - print(f"[Warning] There maybe non jsonl files in the input set: {files_with_non_jsonl_extensions}") - - logger.debug(f"Processing files: {all_jsonl_files}") - - return all_jsonl_files - - @staticmethod - def extract_jsonl_entries(jsonl_files): - "Extract entries from specified jsonl files" - entries = [] - for jsonl_file in jsonl_files: - entries.extend(load_jsonl(Path(jsonl_file))) - return entries - - @staticmethod - def convert_entries_to_jsonl(entries: List[Entry]): - "Convert each entry to JSON and collate as JSONL" - return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_entries.py similarity index 77% rename from src/khoj/processor/markdown/markdown_to_jsonl.py rename to src/khoj/processor/markdown/markdown_to_entries.py index c2f0f0bf..0dd71740 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_entries.py @@ -3,29 +3,28 @@ import logging import re import urllib3 from pathlib import Path -from typing import List +from typing import Tuple, List # Internal Packages -from khoj.processor.text_to_jsonl import TextToJsonl +from khoj.processor.text_to_entries import TextToEntries from khoj.utils.helpers import timer from khoj.utils.constants import 
empty_escape_sequences -from khoj.utils.jsonl import compress_jsonl_data -from khoj.utils.rawconfig import Entry, TextContentConfig +from khoj.utils.rawconfig import Entry +from database.models import Entry as DbEntry, KhojUser logger = logging.getLogger(__name__) -class MarkdownToJsonl(TextToJsonl): - def __init__(self, config: TextContentConfig): - super().__init__(config) - self.config = config +class MarkdownToEntries(TextToEntries): + def __init__(self): + super().__init__() # Define Functions - def process(self, previous_entries=[], files=None, full_corpus: bool = True): + def process( + self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False + ) -> Tuple[int, int]: # Extract required fields from config - output_file = self.config.compressed_jsonl - if not full_corpus: deletion_file_names = set([file for file in files if files[file] == ""]) files_to_process = set(files) - deletion_file_names @@ -35,8 +34,8 @@ class MarkdownToJsonl(TextToJsonl): # Extract Entries from specified Markdown files with timer("Parse entries from Markdown files into dictionaries", logger): - current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps( - *MarkdownToJsonl.extract_markdown_entries(files) + current_entries = MarkdownToEntries.convert_markdown_entries_to_maps( + *MarkdownToEntries.extract_markdown_entries(files) ) # Split entries by max tokens supported by model @@ -45,19 +44,18 @@ class MarkdownToJsonl(TextToJsonl): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger, deletion_filenames=deletion_file_names + num_new_embeddings, num_deleted_embeddings = self.update_embeddings( + current_entries, + DbEntry.EntryType.MARKDOWN, + DbEntry.EntrySource.COMPUTER, + "compiled", + logger, + deletion_file_names, + user, + 
regenerate=regenerate, ) - with timer("Write markdown entries to JSONL file", logger): - # Process Each Entry from All Notes Files - entries = list(map(lambda entry: entry[1], entries_with_ids)) - jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) - - # Compress JSONL formatted Data - compress_jsonl_data(jsonl_data, output_file) - - return entries_with_ids + return num_new_embeddings, num_deleted_embeddings @staticmethod def extract_markdown_entries(markdown_files): @@ -70,7 +68,7 @@ class MarkdownToJsonl(TextToJsonl): for markdown_file in markdown_files: try: markdown_content = markdown_files[markdown_file] - entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file( + entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file( markdown_content, markdown_file, entries, entry_to_file_map ) except Exception as e: diff --git a/src/khoj/processor/notion/notion_to_jsonl.py b/src/khoj/processor/notion/notion_to_entries.py similarity index 89% rename from src/khoj/processor/notion/notion_to_jsonl.py rename to src/khoj/processor/notion/notion_to_entries.py index 0df56c37..7a88e2a1 100644 --- a/src/khoj/processor/notion/notion_to_jsonl.py +++ b/src/khoj/processor/notion/notion_to_entries.py @@ -1,5 +1,6 @@ # Standard Packages import logging +from typing import Tuple # External Packages import requests @@ -7,9 +8,9 @@ import requests # Internal Packages from khoj.utils.helpers import timer from khoj.utils.rawconfig import Entry, NotionContentConfig -from khoj.processor.text_to_jsonl import TextToJsonl -from khoj.utils.jsonl import compress_jsonl_data +from khoj.processor.text_to_entries import TextToEntries from khoj.utils.rawconfig import Entry +from database.models import Entry as DbEntry, KhojUser, NotionConfig from enum import Enum @@ -49,10 +50,12 @@ class NotionBlockType(Enum): CALLOUT = "callout" -class NotionToJsonl(TextToJsonl): - def __init__(self, config: NotionContentConfig): +class 
NotionToEntries(TextToEntries): + def __init__(self, config: NotionConfig): super().__init__(config) - self.config = config + self.config = NotionContentConfig( + token=config.token, + ) self.session = requests.Session() self.session.headers.update({"Authorization": f"Bearer {config.token}", "Notion-Version": "2022-02-22"}) self.unsupported_block_types = [ @@ -80,7 +83,9 @@ class NotionToJsonl(TextToJsonl): self.body_params = {"page_size": 100} - def process(self, previous_entries=[], files=None, full_corpus=True): + def process( + self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False + ) -> Tuple[int, int]: current_entries = [] # Get all pages @@ -112,7 +117,7 @@ class NotionToJsonl(TextToJsonl): page_entries = self.process_page(p_or_d) current_entries.extend(page_entries) - return self.update_entries_with_ids(current_entries, previous_entries) + return self.update_entries_with_ids(current_entries, user) def process_page(self, page): page_id = page["id"] @@ -241,19 +246,16 @@ class NotionToJsonl(TextToJsonl): title = None return title, content - def update_entries_with_ids(self, current_entries, previous_entries): + def update_entries_with_ids(self, current_entries, user: KhojUser = None): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger + num_new_embeddings, num_deleted_embeddings = self.update_embeddings( + current_entries, + DbEntry.EntryType.NOTION, + DbEntry.EntrySource.NOTION, + key="compiled", + logger=logger, + user=user, ) - with timer("Write Notion entries to JSONL file", logger): - # Process Each Entry from all Notion entries - entries = list(map(lambda entry: entry[1], entries_with_ids)) - jsonl_data = TextToJsonl.convert_text_maps_to_jsonl(entries) - - # Compress JSONL formatted Data - 
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl) - - return entries_with_ids + return num_new_embeddings, num_deleted_embeddings diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_entries.py similarity index 82% rename from src/khoj/processor/org_mode/org_to_jsonl.py rename to src/khoj/processor/org_mode/org_to_entries.py index 2f22add4..04ce97e4 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_entries.py @@ -5,28 +5,26 @@ from typing import Iterable, List, Tuple # Internal Packages from khoj.processor.org_mode import orgnode -from khoj.processor.text_to_jsonl import TextToJsonl +from khoj.processor.text_to_entries import TextToEntries from khoj.utils.helpers import timer -from khoj.utils.jsonl import compress_jsonl_data -from khoj.utils.rawconfig import Entry, TextContentConfig +from khoj.utils.rawconfig import Entry from khoj.utils import state +from database.models import Entry as DbEntry, KhojUser logger = logging.getLogger(__name__) -class OrgToJsonl(TextToJsonl): - def __init__(self, config: TextContentConfig): - super().__init__(config) - self.config = config +class OrgToEntries(TextToEntries): + def __init__(self): + super().__init__() # Define Functions def process( - self, previous_entries: List[Entry] = [], files: dict[str, str] = None, full_corpus: bool = True - ) -> List[Tuple[int, Entry]]: + self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False + ) -> Tuple[int, int]: # Extract required fields from config - output_file = self.config.compressed_jsonl - index_heading_entries = self.config.index_heading_entries + index_heading_entries = False if not full_corpus: deletion_file_names = set([file for file in files if files[file] == ""]) @@ -47,19 +45,18 @@ class OrgToJsonl(TextToJsonl): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", 
logger): - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger, deletion_filenames=deletion_file_names + num_new_embeddings, num_deleted_embeddings = self.update_embeddings( + current_entries, + DbEntry.EntryType.ORG, + DbEntry.EntrySource.COMPUTER, + "compiled", + logger, + deletion_file_names, + user, + regenerate=regenerate, ) - # Process Each Entry from All Notes Files - with timer("Write org entries to JSONL file", logger): - entries = map(lambda entry: entry[1], entries_with_ids) - jsonl_data = self.convert_org_entries_to_jsonl(entries) - - # Compress JSONL formatted Data - compress_jsonl_data(jsonl_data, output_file) - - return entries_with_ids + return num_new_embeddings, num_deleted_embeddings @staticmethod def extract_org_entries(org_files: dict[str, str]): diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_entries.py similarity index 68% rename from src/khoj/processor/pdf/pdf_to_jsonl.py rename to src/khoj/processor/pdf/pdf_to_entries.py index c24d9940..3a47096a 100644 --- a/src/khoj/processor/pdf/pdf_to_jsonl.py +++ b/src/khoj/processor/pdf/pdf_to_entries.py @@ -1,28 +1,31 @@ # Standard Packages import os import logging -from typing import List +from typing import List, Tuple import base64 # External Packages from langchain.document_loaders import PyMuPDFLoader # Internal Packages -from khoj.processor.text_to_jsonl import TextToJsonl +from khoj.processor.text_to_entries import TextToEntries from khoj.utils.helpers import timer -from khoj.utils.jsonl import compress_jsonl_data from khoj.utils.rawconfig import Entry +from database.models import Entry as DbEntry, KhojUser logger = logging.getLogger(__name__) -class PdfToJsonl(TextToJsonl): - # Define Functions - def process(self, previous_entries=[], files: dict[str, str] = None, full_corpus: bool = True): - # Extract required fields from config - output_file = self.config.compressed_jsonl +class 
PdfToEntries(TextToEntries): + def __init__(self): + super().__init__() + # Define Functions + def process( + self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False + ) -> Tuple[int, int]: + # Extract required fields from config if not full_corpus: deletion_file_names = set([file for file in files if files[file] == ""]) files_to_process = set(files) - deletion_file_names @@ -32,7 +35,7 @@ class PdfToJsonl(TextToJsonl): # Extract Entries from specified Pdf files with timer("Parse entries from PDF files into dictionaries", logger): - current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(files)) + current_entries = PdfToEntries.convert_pdf_entries_to_maps(*PdfToEntries.extract_pdf_entries(files)) # Split entries by max tokens supported by model with timer("Split entries by max token size supported by model", logger): @@ -40,19 +43,18 @@ class PdfToJsonl(TextToJsonl): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger, deletion_filenames=deletion_file_names + num_new_embeddings, num_deleted_embeddings = self.update_embeddings( + current_entries, + DbEntry.EntryType.PDF, + DbEntry.EntrySource.COMPUTER, + "compiled", + logger, + deletion_file_names, + user, + regenerate=regenerate, ) - with timer("Write PDF entries to JSONL file", logger): - # Process Each Entry from All Notes Files - entries = list(map(lambda entry: entry[1], entries_with_ids)) - jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries) - - # Compress JSONL formatted Data - compress_jsonl_data(jsonl_data, output_file) - - return entries_with_ids + return num_new_embeddings, num_deleted_embeddings @staticmethod def extract_pdf_entries(pdf_files): @@ -62,18 +64,22 @@ class PdfToJsonl(TextToJsonl): entry_to_location_map = [] 
for pdf_file in pdf_files: try: - # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path + # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path tmp_file = f"tmp_pdf_file.pdf" with open(f"{tmp_file}", "wb") as f: bytes = pdf_files[pdf_file] f.write(bytes) - loader = PyMuPDFLoader(f"{tmp_file}") - pdf_entries_per_file = [page.page_content for page in loader.load()] + try: + loader = PyMuPDFLoader(f"{tmp_file}", extract_images=True) + pdf_entries_per_file = [page.page_content for page in loader.load()] + except ImportError: + loader = PyMuPDFLoader(f"{tmp_file}") + pdf_entries_per_file = [page.page_content for page in loader.load()] entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file)) entries.extend(pdf_entries_per_file) except Exception as e: logger.warning(f"Unable to process file: {pdf_file}. 
This file will not be indexed.") - logger.warning(e) + logger.warning(e, exc_info=True) finally: if os.path.exists(f"{tmp_file}"): os.remove(f"{tmp_file}") diff --git a/src/khoj/processor/plaintext/plaintext_to_entries.py b/src/khoj/processor/plaintext/plaintext_to_entries.py new file mode 100644 index 00000000..d42dae30 --- /dev/null +++ b/src/khoj/processor/plaintext/plaintext_to_entries.py @@ -0,0 +1,96 @@ +# Standard Packages +import logging +from pathlib import Path +from typing import List, Tuple +from bs4 import BeautifulSoup + + +# Internal Packages +from khoj.processor.text_to_entries import TextToEntries +from khoj.utils.helpers import timer +from khoj.utils.rawconfig import Entry +from database.models import Entry as DbEntry, KhojUser + + +logger = logging.getLogger(__name__) + + +class PlaintextToEntries(TextToEntries): + def __init__(self): + super().__init__() + + # Define Functions + def process( + self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False + ) -> Tuple[int, int]: + if not full_corpus: + deletion_file_names = set([file for file in files if files[file] == ""]) + files_to_process = set(files) - deletion_file_names + files = {file: files[file] for file in files_to_process} + else: + deletion_file_names = None + + with timer("Scrub plaintext files and extract text", logger): + for file in files: + try: + plaintext_content = files[file] + if file.endswith(("html", "htm", "xml")): + plaintext_content = PlaintextToEntries.extract_html_content( + plaintext_content, file.split(".")[-1] + ) + files[file] = plaintext_content + except Exception as e: + logger.warning(f"Unable to read file: {file} as plaintext. 
Skipping file.") + logger.warning(e, exc_info=True) + + # Extract Entries from specified plaintext files + with timer("Parse entries from plaintext files", logger): + current_entries = PlaintextToEntries.convert_plaintext_entries_to_maps(files) + + # Split entries by max tokens supported by model + with timer("Split entries by max token size supported by model", logger): + current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) + + # Identify, mark and merge any new entries with previous entries + with timer("Identify new or updated entries", logger): + num_new_embeddings, num_deleted_embeddings = self.update_embeddings( + current_entries, + DbEntry.EntryType.PLAINTEXT, + DbEntry.EntrySource.COMPUTER, + key="compiled", + logger=logger, + deletion_filenames=deletion_file_names, + user=user, + regenerate=regenerate, + ) + + return num_new_embeddings, num_deleted_embeddings + + @staticmethod + def extract_html_content(markup_content: str, markup_type: str): + "Extract content from HTML" + if markup_type == "xml": + soup = BeautifulSoup(markup_content, "xml") + else: + soup = BeautifulSoup(markup_content, "html.parser") + return soup.get_text(strip=True, separator="\n") + + @staticmethod + def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]: + "Convert each plaintext entries into a dictionary" + entries = [] + for file, entry in entry_to_file_map.items(): + entries.append( + Entry( + raw=entry, + file=file, + compiled=f"{Path(file).stem}\n{entry}", + heading=Path(file).stem, + ) + ) + return entries + + @staticmethod + def convert_entries_to_jsonl(entries: List[Entry]): + "Convert each entry to JSON and collate as JSONL" + return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/processor/plaintext/plaintext_to_jsonl.py b/src/khoj/processor/plaintext/plaintext_to_jsonl.py deleted file mode 100644 index 3acb656e..00000000 --- a/src/khoj/processor/plaintext/plaintext_to_jsonl.py +++ 
/dev/null @@ -1,72 +0,0 @@ -# Standard Packages -import logging -from pathlib import Path -from typing import List, Tuple - -# Internal Packages -from khoj.processor.text_to_jsonl import TextToJsonl -from khoj.utils.helpers import timer -from khoj.utils.jsonl import compress_jsonl_data -from khoj.utils.rawconfig import Entry - - -logger = logging.getLogger(__name__) - - -class PlaintextToJsonl(TextToJsonl): - # Define Functions - def process( - self, previous_entries: List[Entry] = [], files: dict[str, str] = None, full_corpus: bool = True - ) -> List[Tuple[int, Entry]]: - output_file = self.config.compressed_jsonl - - if not full_corpus: - deletion_file_names = set([file for file in files if files[file] == ""]) - files_to_process = set(files) - deletion_file_names - files = {file: files[file] for file in files_to_process} - else: - deletion_file_names = None - - # Extract Entries from specified plaintext files - with timer("Parse entries from plaintext files", logger): - current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(files) - - # Split entries by max tokens supported by model - with timer("Split entries by max token size supported by model", logger): - current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) - - # Identify, mark and merge any new entries with previous entries - with timer("Identify new or updated entries", logger): - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger, deletion_filenames=deletion_file_names - ) - - with timer("Write entries to JSONL file", logger): - # Process Each Entry from All Notes Files - entries = list(map(lambda entry: entry[1], entries_with_ids)) - plaintext_data = PlaintextToJsonl.convert_entries_to_jsonl(entries) - - # Compress JSONL formatted Data - compress_jsonl_data(plaintext_data, output_file) - - return entries_with_ids - - @staticmethod - def convert_plaintext_entries_to_maps(entry_to_file_map: 
dict) -> List[Entry]: - "Convert each plaintext entries into a dictionary" - entries = [] - for file, entry in entry_to_file_map.items(): - entries.append( - Entry( - raw=entry, - file=file, - compiled=f"{Path(file).stem}\n{entry}", - heading=Path(file).stem, - ) - ) - return entries - - @staticmethod - def convert_entries_to_jsonl(entries: List[Entry]): - "Convert each entry to JSON and collate as JSONL" - return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/processor/text_to_entries.py b/src/khoj/processor/text_to_entries.py new file mode 100644 index 00000000..ac42105a --- /dev/null +++ b/src/khoj/processor/text_to_entries.py @@ -0,0 +1,235 @@ +# Standard Packages +from abc import ABC, abstractmethod +import hashlib +from itertools import repeat +import logging +import uuid +from tqdm import tqdm +from typing import Callable, List, Tuple, Set, Any +from khoj.utils import state +from khoj.utils.helpers import is_none_or_empty, timer, batcher + + +# Internal Packages +from khoj.utils.rawconfig import Entry +from khoj.search_filter.date_filter import DateFilter +from database.models import KhojUser, Entry as DbEntry, EntryDates +from database.adapters import EntryAdapters + + +logger = logging.getLogger(__name__) + + +class TextToEntries(ABC): + def __init__(self, config: Any = None): + self.embeddings_model = state.embeddings_model + self.config = config + self.date_filter = DateFilter() + + @abstractmethod + def process( + self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False + ) -> Tuple[int, int]: + ... 
+ + @staticmethod + def hash_func(key: str) -> Callable: + return lambda entry: hashlib.md5(bytes(getattr(entry, key), encoding="utf-8")).hexdigest() + + @staticmethod + def split_entries_by_max_tokens( + entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500 + ) -> List[Entry]: + "Split entries if compiled entry length exceeds the max tokens supported by the ML model." + chunked_entries: List[Entry] = [] + for entry in entries: + # Split entry into words + compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""] + + # Drop long words instead of having entry truncated to maintain quality of entry processed by models + compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length] + corpus_id = uuid.uuid4() + + # Split entry into chunks of max tokens + for chunk_index in range(0, len(compiled_entry_words), max_tokens): + compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens] + compiled_entry_chunk = " ".join(compiled_entry_words_chunk) + + # Prepend heading to all other chunks, the first chunk already has heading from original entry + if chunk_index > 0: + # Snip heading to avoid crossing max_tokens limit + # Keep last 100 characters of heading as entry heading more important than filename + snipped_heading = entry.heading[-100:] + compiled_entry_chunk = f"{snipped_heading}.\n{compiled_entry_chunk}" + + chunked_entries.append( + Entry( + compiled=compiled_entry_chunk, + raw=entry.raw, + heading=entry.heading, + file=entry.file, + corpus_id=corpus_id, + ) + ) + + return chunked_entries + + def update_embeddings( + self, + current_entries: List[Entry], + file_type: str, + file_source: str, + key="compiled", + logger: logging.Logger = None, + deletion_filenames: Set[str] = None, + user: KhojUser = None, + regenerate: bool = False, + ): + with timer("Constructed current entry hashes in", logger): + hashes_by_file = dict[str, set[str]]() + current_entry_hashes = 
list(map(TextToEntries.hash_func(key), current_entries)) + hash_to_current_entries = dict(zip(current_entry_hashes, current_entries)) + for entry in tqdm(current_entries, desc="Hashing Entries"): + hashes_by_file.setdefault(entry.file, set()).add(TextToEntries.hash_func(key)(entry)) + + num_deleted_entries = 0 + if regenerate: + with timer("Cleared existing dataset for regeneration in", logger): + logger.debug(f"Deleting all entries for file type {file_type}") + num_deleted_entries = EntryAdapters.delete_all_entries_by_type(user, file_type) + + hashes_to_process = set() + with timer("Identified entries to add to database in", logger): + for file in tqdm(hashes_by_file, desc="Identify new entries"): + hashes_for_file = hashes_by_file[file] + existing_entries = DbEntry.objects.filter( + user=user, hashed_value__in=hashes_for_file, file_type=file_type + ) + existing_entry_hashes = set([entry.hashed_value for entry in existing_entries]) + hashes_to_process |= hashes_for_file - existing_entry_hashes + + embeddings = [] + with timer("Generated embeddings for entries to add to database in", logger): + entries_to_process = [hash_to_current_entries[hashed_val] for hashed_val in hashes_to_process] + data_to_embed = [getattr(entry, key) for entry in entries_to_process] + embeddings += self.embeddings_model.embed_documents(data_to_embed) + + added_entries: list[DbEntry] = [] + with timer("Added entries to database in", logger): + num_items = len(hashes_to_process) + assert num_items == len(embeddings) + batch_size = min(200, num_items) + entry_batches = zip(hashes_to_process, embeddings) + + for entry_batch in tqdm(batcher(entry_batches, batch_size), desc="Add entries to database"): + batch_embeddings_to_create = [] + for entry_hash, new_entry in entry_batch: + entry = hash_to_current_entries[entry_hash] + batch_embeddings_to_create.append( + DbEntry( + user=user, + embeddings=new_entry, + raw=entry.raw, + compiled=entry.compiled, + heading=entry.heading[:1000], # Truncate to 
max chars of field allowed + file_path=entry.file, + file_source=file_source, + file_type=file_type, + hashed_value=entry_hash, + corpus_id=entry.corpus_id, + ) + ) + added_entries += DbEntry.objects.bulk_create(batch_embeddings_to_create) + logger.debug(f"Added {len(added_entries)} {file_type} entries to database") + + new_dates = [] + with timer("Indexed dates from added entries in", logger): + for added_entry in added_entries: + dates_in_entries = zip(self.date_filter.extract_dates(added_entry.raw), repeat(added_entry)) + dates_to_create = [ + EntryDates(date=date, entry=added_entry) + for date, added_entry in dates_in_entries + if not is_none_or_empty(date) + ] + new_dates += EntryDates.objects.bulk_create(dates_to_create) + logger.debug(f"Indexed {len(new_dates)} dates from added {file_type} entries") + + with timer("Deleted entries identified by server from database in", logger): + for file in hashes_by_file: + existing_entry_hashes = EntryAdapters.get_existing_entry_hashes_by_file(user, file) + to_delete_entry_hashes = set(existing_entry_hashes) - hashes_by_file[file] + num_deleted_entries += len(to_delete_entry_hashes) + EntryAdapters.delete_entry_by_hash(user, hashed_values=list(to_delete_entry_hashes)) + + with timer("Deleted entries requested by clients from database in", logger): + if deletion_filenames is not None: + for file_path in deletion_filenames: + deleted_count = EntryAdapters.delete_entry_by_file(user, file_path) + num_deleted_entries += deleted_count + + return len(added_entries), num_deleted_entries + + @staticmethod + def mark_entries_for_update( + current_entries: List[Entry], + previous_entries: List[Entry], + key="compiled", + logger: logging.Logger = None, + deletion_filenames: Set[str] = None, + ): + # Hash all current and previous entries to identify new entries + with timer("Hash previous, current entries", logger): + current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries)) + previous_entry_hashes = 
list(map(TextToEntries.hash_func(key), previous_entries)) + if deletion_filenames is not None: + deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames] + deletion_entry_hashes = list(map(TextToEntries.hash_func(key), deletion_entries)) + else: + deletion_entry_hashes = [] + + with timer("Identify, Mark, Combine new, existing entries", logger): + hash_to_current_entries = dict(zip(current_entry_hashes, current_entries)) + hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries)) + + # All entries that did not exist in the previous set are to be added + new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes) + # All entries that exist in both current and previous sets are kept + existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes) + # All entries that exist in the previous set but not in the current set should be preserved + remaining_entry_hashes = set(previous_entry_hashes) - set(current_entry_hashes) + # All entries that exist in the previous set and also in the deletions set should be removed + to_delete_entry_hashes = set(previous_entry_hashes) & set(deletion_entry_hashes) + + preserving_entry_hashes = existing_entry_hashes + + if deletion_filenames is not None: + preserving_entry_hashes = ( + (existing_entry_hashes | remaining_entry_hashes) + if len(deletion_entry_hashes) == 0 + else (set(previous_entry_hashes) - to_delete_entry_hashes) + ) + + # load new entries in the order in which they are processed for a stable sort + new_entries = [ + (current_entry_hashes.index(entry_hash), hash_to_current_entries[entry_hash]) + for entry_hash in new_entry_hashes + ] + new_entries_sorted = sorted(new_entries, key=lambda e: e[0]) + # Mark new entries with -1 id to flag for later embeddings generation + new_entries_sorted = [(-1, entry[1]) for entry in new_entries_sorted] + + # Set id of existing entries to their previous ids to reuse their existing encoded embeddings + 
existing_entries = [ + (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash]) + for entry_hash in preserving_entry_hashes + ] + existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0]) + + entries_with_ids = existing_entries_sorted + new_entries_sorted + + return entries_with_ids + + @staticmethod + def convert_text_maps_to_jsonl(entries: List[Entry]) -> str: + # Convert each entry to JSON and write to JSONL file + return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py deleted file mode 100644 index 98f5986f..00000000 --- a/src/khoj/processor/text_to_jsonl.py +++ /dev/null @@ -1,128 +0,0 @@ -# Standard Packages -from abc import ABC, abstractmethod -import hashlib -import logging -from typing import Callable, List, Tuple, Set -from khoj.utils.helpers import timer - -# Internal Packages -from khoj.utils.rawconfig import Entry, TextConfigBase - - -logger = logging.getLogger(__name__) - - -class TextToJsonl(ABC): - def __init__(self, config: TextConfigBase): - self.config = config - - @abstractmethod - def process( - self, previous_entries: List[Entry] = [], files: dict[str, str] = None, full_corpus: bool = True - ) -> List[Tuple[int, Entry]]: - ... - - @staticmethod - def hash_func(key: str) -> Callable: - return lambda entry: hashlib.md5(bytes(getattr(entry, key), encoding="utf-8")).hexdigest() - - @staticmethod - def split_entries_by_max_tokens( - entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500 - ) -> List[Entry]: - "Split entries if compiled entry length exceeds the max tokens supported by the ML model." 
- chunked_entries: List[Entry] = [] - for entry in entries: - # Split entry into words - compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""] - - # Drop long words instead of having entry truncated to maintain quality of entry processed by models - compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length] - - # Split entry into chunks of max tokens - for chunk_index in range(0, len(compiled_entry_words), max_tokens): - compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens] - compiled_entry_chunk = " ".join(compiled_entry_words_chunk) - - # Prepend heading to all other chunks, the first chunk already has heading from original entry - if chunk_index > 0: - # Snip heading to avoid crossing max_tokens limit - # Keep last 100 characters of heading as entry heading more important than filename - snipped_heading = entry.heading[-100:] - compiled_entry_chunk = f"{snipped_heading}.\n{compiled_entry_chunk}" - - chunked_entries.append( - Entry( - compiled=compiled_entry_chunk, - raw=entry.raw, - heading=entry.heading, - file=entry.file, - ) - ) - - return chunked_entries - - @staticmethod - def mark_entries_for_update( - current_entries: List[Entry], - previous_entries: List[Entry], - key="compiled", - logger: logging.Logger = None, - deletion_filenames: Set[str] = None, - ): - # Hash all current and previous entries to identify new entries - with timer("Hash previous, current entries", logger): - current_entry_hashes = list(map(TextToJsonl.hash_func(key), current_entries)) - previous_entry_hashes = list(map(TextToJsonl.hash_func(key), previous_entries)) - if deletion_filenames is not None: - deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames] - deletion_entry_hashes = list(map(TextToJsonl.hash_func(key), deletion_entries)) - else: - deletion_entry_hashes = [] - - with timer("Identify, Mark, Combine new, existing entries", logger): - 
hash_to_current_entries = dict(zip(current_entry_hashes, current_entries)) - hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries)) - - # All entries that did not exist in the previous set are to be added - new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes) - # All entries that exist in both current and previous sets are kept - existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes) - # All entries that exist in the previous set but not in the current set should be preserved - remaining_entry_hashes = set(previous_entry_hashes) - set(current_entry_hashes) - # All entries that exist in the previous set and also in the deletions set should be removed - to_delete_entry_hashes = set(previous_entry_hashes) & set(deletion_entry_hashes) - - preserving_entry_hashes = existing_entry_hashes - - if deletion_filenames is not None: - preserving_entry_hashes = ( - (existing_entry_hashes | remaining_entry_hashes) - if len(deletion_entry_hashes) == 0 - else (set(previous_entry_hashes) - to_delete_entry_hashes) - ) - - # load new entries in the order in which they are processed for a stable sort - new_entries = [ - (current_entry_hashes.index(entry_hash), hash_to_current_entries[entry_hash]) - for entry_hash in new_entry_hashes - ] - new_entries_sorted = sorted(new_entries, key=lambda e: e[0]) - # Mark new entries with -1 id to flag for later embeddings generation - new_entries_sorted = [(-1, entry[1]) for entry in new_entries_sorted] - - # Set id of existing entries to their previous ids to reuse their existing encoded embeddings - existing_entries = [ - (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash]) - for entry_hash in preserving_entry_hashes - ] - existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0]) - - entries_with_ids = existing_entries_sorted + new_entries_sorted - - return entries_with_ids - - @staticmethod - def convert_text_maps_to_jsonl(entries: 
List[Entry]) -> str: - # Convert each entry to JSON and write to JSONL file - return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 345429e8..6d67fcbe 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -2,332 +2,327 @@ import concurrent.futures import math import time -import yaml import logging import json -from typing import List, Optional, Union, Any +from typing import Annotated, List, Optional, Union, Any # External Packages -from fastapi import APIRouter, HTTPException, Header, Request -from sentence_transformers import util +from fastapi import APIRouter, Depends, HTTPException, Header, Request +from starlette.authentication import requires +from asgiref.sync import sync_to_async # Internal Packages -from khoj.configure import configure_processor, configure_server +from khoj.configure import configure_server from khoj.search_type import image_search, text_search from khoj.search_filter.date_filter import DateFilter from khoj.search_filter.file_filter import FileFilter from khoj.search_filter.word_filter import WordFilter -from khoj.utils.config import TextSearchModel +from khoj.utils.config import TextSearchModel, GPT4AllProcessorModel from khoj.utils.helpers import ConversationCommand, is_none_or_empty, timer, command_descriptions from khoj.utils.rawconfig import ( - ContentConfig, FullConfig, - ProcessorConfig, SearchConfig, SearchResponse, - TextContentConfig, - OpenAIProcessorConfig, GithubContentConfig, NotionContentConfig, - ConversationProcessorConfig, - OfflineChatProcessorConfig, ) -from khoj.utils.helpers import resolve_absolute_path from khoj.utils.state import SearchType from khoj.utils import state, constants -from khoj.utils.yaml import save_config_to_file_updated_state +from khoj.utils.helpers import AsyncIteratorWrapper, get_device from fastapi.responses import StreamingResponse, Response from khoj.routers.helpers import ( + CommonQueryParams, 
get_conversation_command, - perform_chat_checks, - generate_chat_response, + validate_conversation_config, + agenerate_chat_response, update_telemetry_state, + is_ready_to_chat, + ApiUserRateLimiter, ) -from khoj.processor.conversation.prompts import help_message +from khoj.processor.conversation.prompts import help_message, no_entries_found from khoj.processor.conversation.openai.gpt import extract_questions from khoj.processor.conversation.gpt4all.chat_model import extract_questions_offline from fastapi.requests import Request +from database import adapters +from database.adapters import EntryAdapters, ConversationAdapters +from database.models import ( + LocalMarkdownConfig, + LocalOrgConfig, + LocalPdfConfig, + LocalPlaintextConfig, + KhojUser, + Entry as DbEntry, + GithubConfig, + NotionConfig, + ChatModelOptions, +) + # Initialize Router api = APIRouter() logger = logging.getLogger(__name__) -# If it's a demo instance, prevent updating any of the configuration. -if not state.demo: - def _initialize_config(): - if state.config is None: - state.config = FullConfig() - state.config.search_type = SearchConfig.parse_obj(constants.default_config["search-type"]) - if state.processor_config is None: - state.processor_config = configure_processor(state.config.processor) +def map_config_to_object(content_source: str): + if content_source == DbEntry.EntrySource.GITHUB: + return GithubConfig + if content_source == DbEntry.EntrySource.GITHUB: + return NotionConfig + if content_source == DbEntry.EntrySource.COMPUTER: + return "Computer" - @api.get("/config/data", response_model=FullConfig) - def get_config_data(): - return state.config - @api.post("/config/data") - async def set_config_data( - request: Request, - updated_config: FullConfig, - client: Optional[str] = None, - ): - state.config = updated_config - with open(state.config_file, "w") as outfile: - yaml.dump(yaml.safe_load(state.config.json(by_alias=True)), outfile) - outfile.close() - - 
configuration_update_metadata = dict() - - if state.config.content_type is not None: - configuration_update_metadata["github"] = state.config.content_type.github is not None - configuration_update_metadata["notion"] = state.config.content_type.notion is not None - configuration_update_metadata["org"] = state.config.content_type.org is not None - configuration_update_metadata["pdf"] = state.config.content_type.pdf is not None - configuration_update_metadata["markdown"] = state.config.content_type.markdown is not None - configuration_update_metadata["plugins"] = state.config.content_type.plugins is not None - - if state.config.processor is not None: - configuration_update_metadata["conversation_processor"] = state.config.processor.conversation is not None - - update_telemetry_state( - request=request, - telemetry_type="api", - api="set_config", - client=client, - metadata=configuration_update_metadata, - ) - return state.config - - @api.post("/config/data/content_type/github", status_code=200) - async def set_content_config_github_data( - request: Request, - updated_config: Union[GithubContentConfig, None], - client: Optional[str] = None, - ): - _initialize_config() - - if not state.config.content_type: - state.config.content_type = ContentConfig(**{"github": updated_config}) - else: - state.config.content_type.github = updated_config - - update_telemetry_state( - request=request, - telemetry_type="api", - api="set_content_config", - client=client, - metadata={"content_type": "github"}, - ) - - try: - save_config_to_file_updated_state() - return {"status": "ok"} - except Exception as e: - return {"status": "error", "message": str(e)} - - @api.post("/config/data/content_type/notion", status_code=200) - async def set_content_config_notion_data( - request: Request, - updated_config: Union[NotionContentConfig, None], - client: Optional[str] = None, - ): - _initialize_config() - - if not state.config.content_type: - state.config.content_type = ContentConfig(**{"notion": 
updated_config}) - else: - state.config.content_type.notion = updated_config - - update_telemetry_state( - request=request, - telemetry_type="api", - api="set_content_config", - client=client, - metadata={"content_type": "notion"}, - ) - - try: - save_config_to_file_updated_state() - return {"status": "ok"} - except Exception as e: - return {"status": "error", "message": str(e)} - - @api.post("/delete/config/data/content_type/{content_type}", status_code=200) - async def remove_content_config_data( - request: Request, - content_type: str, - client: Optional[str] = None, - ): - if not state.config or not state.config.content_type: - return {"status": "ok"} - - update_telemetry_state( - request=request, - telemetry_type="api", - api="delete_content_config", - client=client, - metadata={"content_type": content_type}, - ) - - if state.config.content_type: - state.config.content_type[content_type] = None - - if content_type == "github": - state.content_index.github = None - elif content_type == "notion": - state.content_index.notion = None - elif content_type == "plugins": - state.content_index.plugins = None - elif content_type == "pdf": - state.content_index.pdf = None - elif content_type == "markdown": - state.content_index.markdown = None - elif content_type == "org": - state.content_index.org = None - elif content_type == "plaintext": - state.content_index.plaintext = None - else: - logger.warning(f"Request to delete unknown content type: {content_type} via API") - - try: - save_config_to_file_updated_state() - return {"status": "ok"} - except Exception as e: - return {"status": "error", "message": str(e)} - - @api.post("/delete/config/data/processor/conversation/openai", status_code=200) - async def remove_processor_conversation_config_data( - request: Request, - client: Optional[str] = None, - ): - if ( - not state.config - or not state.config.processor - or not state.config.processor.conversation - or not state.config.processor.conversation.openai - ): - return 
{"status": "ok"} - - state.config.processor.conversation.openai = None - state.processor_config = configure_processor(state.config.processor, state.processor_config) - - update_telemetry_state( - request=request, - telemetry_type="api", - api="delete_processor_openai_config", - client=client, - metadata={"processor_conversation_type": "openai"}, - ) - - try: - save_config_to_file_updated_state() - return {"status": "ok"} - except Exception as e: - return {"status": "error", "message": str(e)} - - @api.post("/config/data/content_type/{content_type}", status_code=200) - async def set_content_config_data( - request: Request, - content_type: str, - updated_config: Union[TextContentConfig, None], - client: Optional[str] = None, - ): - _initialize_config() - - if not state.config.content_type: - state.config.content_type = ContentConfig(**{content_type: updated_config}) - else: - state.config.content_type[content_type] = updated_config - - update_telemetry_state( - request=request, - telemetry_type="api", - api="set_content_config", - client=client, - metadata={"content_type": content_type}, - ) - - try: - save_config_to_file_updated_state() - return {"status": "ok"} - except Exception as e: - return {"status": "error", "message": str(e)} - - @api.post("/config/data/processor/conversation/openai", status_code=200) - async def set_processor_openai_config_data( - request: Request, - updated_config: Union[OpenAIProcessorConfig, None], - client: Optional[str] = None, - ): - _initialize_config() - - if not state.config.processor or not state.config.processor.conversation: - default_config = constants.default_config - default_conversation_logfile = resolve_absolute_path( - default_config["processor"]["conversation"]["conversation-logfile"] # type: ignore +async def map_config_to_db(config: FullConfig, user: KhojUser): + if config.content_type: + if config.content_type.org: + await LocalOrgConfig.objects.filter(user=user).adelete() + await LocalOrgConfig.objects.acreate( + 
input_files=config.content_type.org.input_files, + input_filter=config.content_type.org.input_filter, + index_heading_entries=config.content_type.org.index_heading_entries, + user=user, ) - conversation_logfile = resolve_absolute_path(default_conversation_logfile) - state.config.processor = ProcessorConfig(conversation=ConversationProcessorConfig(conversation_logfile=conversation_logfile)) # type: ignore - - assert state.config.processor.conversation is not None - state.config.processor.conversation.openai = updated_config - state.processor_config = configure_processor(state.config.processor, state.processor_config) - - update_telemetry_state( - request=request, - telemetry_type="api", - api="set_processor_config", - client=client, - metadata={"processor_conversation_type": "conversation"}, - ) - - try: - save_config_to_file_updated_state() - return {"status": "ok"} - except Exception as e: - return {"status": "error", "message": str(e)} - - @api.post("/config/data/processor/conversation/offline_chat", status_code=200) - async def set_processor_enable_offline_chat_config_data( - request: Request, - enable_offline_chat: bool, - offline_chat_model: Optional[str] = None, - client: Optional[str] = None, - ): - _initialize_config() - - if not state.config.processor or not state.config.processor.conversation: - default_config = constants.default_config - default_conversation_logfile = resolve_absolute_path( - default_config["processor"]["conversation"]["conversation-logfile"] # type: ignore + if config.content_type.markdown: + await LocalMarkdownConfig.objects.filter(user=user).adelete() + await LocalMarkdownConfig.objects.acreate( + input_files=config.content_type.markdown.input_files, + input_filter=config.content_type.markdown.input_filter, + index_heading_entries=config.content_type.markdown.index_heading_entries, + user=user, + ) + if config.content_type.pdf: + await LocalPdfConfig.objects.filter(user=user).adelete() + await LocalPdfConfig.objects.acreate( + 
input_files=config.content_type.pdf.input_files, + input_filter=config.content_type.pdf.input_filter, + index_heading_entries=config.content_type.pdf.index_heading_entries, + user=user, + ) + if config.content_type.plaintext: + await LocalPlaintextConfig.objects.filter(user=user).adelete() + await LocalPlaintextConfig.objects.acreate( + input_files=config.content_type.plaintext.input_files, + input_filter=config.content_type.plaintext.input_filter, + index_heading_entries=config.content_type.plaintext.index_heading_entries, + user=user, + ) + if config.content_type.github: + await adapters.set_user_github_config( + user=user, + pat_token=config.content_type.github.pat_token, + repos=config.content_type.github.repos, + ) + if config.content_type.notion: + await adapters.set_notion_config( + user=user, + token=config.content_type.notion.token, ) - conversation_logfile = resolve_absolute_path(default_conversation_logfile) - state.config.processor = ProcessorConfig(conversation=ConversationProcessorConfig(conversation_logfile=conversation_logfile)) # type: ignore - assert state.config.processor.conversation is not None - if state.config.processor.conversation.offline_chat is None: - state.config.processor.conversation.offline_chat = OfflineChatProcessorConfig() - state.config.processor.conversation.offline_chat.enable_offline_chat = enable_offline_chat - if offline_chat_model is not None: - state.config.processor.conversation.offline_chat.chat_model = offline_chat_model - state.processor_config = configure_processor(state.config.processor, state.processor_config) +def _initialize_config(): + if state.config is None: + state.config = FullConfig() + state.config.search_type = SearchConfig.model_validate(constants.default_config["search-type"]) - update_telemetry_state( - request=request, - telemetry_type="api", - api="set_processor_config", - client=client, - metadata={"processor_conversation_type": f"{'enable' if enable_offline_chat else 'disable'}_local_llm"}, + 
+@api.get("/config/data", response_model=FullConfig) +@requires(["authenticated"]) +def get_config_data(request: Request): + user = request.user.object + EntryAdapters.get_unique_file_types(user) + + return state.config + + +@api.post("/config/data") +@requires(["authenticated"]) +async def set_config_data( + request: Request, + updated_config: FullConfig, + client: Optional[str] = None, +): + user = request.user.object + await map_config_to_db(updated_config, user) + + configuration_update_metadata = {} + + enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user) + + if state.config.content_type is not None: + configuration_update_metadata["github"] = "github" in enabled_content + configuration_update_metadata["notion"] = "notion" in enabled_content + configuration_update_metadata["org"] = "org" in enabled_content + configuration_update_metadata["pdf"] = "pdf" in enabled_content + configuration_update_metadata["markdown"] = "markdown" in enabled_content + + if state.config.processor is not None: + configuration_update_metadata["conversation_processor"] = state.config.processor.conversation is not None + + update_telemetry_state( + request=request, + telemetry_type="api", + api="set_config", + client=client, + metadata=configuration_update_metadata, + ) + return state.config + + +@api.post("/config/data/content-source/github", status_code=200) +@requires(["authenticated"]) +async def set_content_config_github_data( + request: Request, + updated_config: Union[GithubContentConfig, None], + client: Optional[str] = None, +): + _initialize_config() + + user = request.user.object + + try: + await adapters.set_user_github_config( + user=user, + pat_token=updated_config.pat_token, + repos=updated_config.repos, ) + except Exception as e: + logger.error(e, exc_info=True) + raise HTTPException(status_code=500, detail="Failed to set Github config") - try: - save_config_to_file_updated_state() - return {"status": "ok"} - except Exception as e: - return 
{"status": "error", "message": str(e)} + update_telemetry_state( + request=request, + telemetry_type="api", + api="set_content_config", + client=client, + metadata={"content_type": "github"}, + ) + + return {"status": "ok"} + + +@api.post("/config/data/content-source/notion", status_code=200) +@requires(["authenticated"]) +async def set_content_config_notion_data( + request: Request, + updated_config: Union[NotionContentConfig, None], + client: Optional[str] = None, +): + _initialize_config() + + user = request.user.object + + try: + await adapters.set_notion_config( + user=user, + token=updated_config.token, + ) + except Exception as e: + logger.error(e, exc_info=True) + raise HTTPException(status_code=500, detail="Failed to set Github config") + + update_telemetry_state( + request=request, + telemetry_type="api", + api="set_content_config", + client=client, + metadata={"content_type": "notion"}, + ) + + return {"status": "ok"} + + +@api.delete("/config/data/content-source/{content_source}", status_code=200) +@requires(["authenticated"]) +async def remove_content_source_data( + request: Request, + content_source: str, + client: Optional[str] = None, +): + user = request.user.object + + update_telemetry_state( + request=request, + telemetry_type="api", + api="delete_content_config", + client=client, + metadata={"content_source": content_source}, + ) + + content_object = map_config_to_object(content_source) + if content_object is None: + raise ValueError(f"Invalid content source: {content_source}") + elif content_object != "Computer": + await content_object.objects.filter(user=user).adelete() + await sync_to_async(EntryAdapters.delete_all_entries)(user, content_source) + + enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user) + return {"status": "ok"} + + +@api.delete("/config/data/file", status_code=200) +@requires(["authenticated"]) +async def remove_file_data( + request: Request, + filename: str, + client: Optional[str] = None, +): + 
user = request.user.object + + update_telemetry_state( + request=request, + telemetry_type="api", + api="delete_file", + client=client, + ) + + await EntryAdapters.adelete_entry_by_file(user, filename) + + return {"status": "ok"} + + +@api.get("/config/data/{content_source}", response_model=List[str]) +@requires(["authenticated"]) +async def get_all_filenames( + request: Request, + content_source: str, + client: Optional[str] = None, +): + user = request.user.object + + update_telemetry_state( + request=request, + telemetry_type="api", + api="get_all_filenames", + client=client, + ) + + return await sync_to_async(list)(EntryAdapters.aget_all_filenames_by_source(user, content_source)) # type: ignore[call-arg] + + +@api.post("/config/data/conversation/model", status_code=200) +@requires(["authenticated"]) +async def update_chat_model( + request: Request, + id: str, + client: Optional[str] = None, +): + user = request.user.object + + new_config = await ConversationAdapters.aset_user_conversation_processor(user, int(id)) + + update_telemetry_state( + request=request, + telemetry_type="api", + api="set_conversation_chat_model", + client=client, + metadata={"processor_conversation_type": "conversation"}, + ) + + if new_config is None: + return {"status": "error", "message": "Model not found"} + + return {"status": "ok"} # Create Routes @@ -337,41 +332,38 @@ def get_default_config_data(): @api.get("/config/types", response_model=List[str]) -def get_config_types(): - """Get configured content types""" - if state.config is None or state.config.content_type is None: - raise HTTPException( - status_code=500, - detail="Content types not configured. 
Configure at least one content type on server and restart it.", - ) +@requires(["authenticated"]) +def get_config_types( + request: Request, +): + user = request.user.object + enabled_file_types = EntryAdapters.get_unique_file_types(user) + configured_content_types = list(enabled_file_types) + + if state.config and state.config.content_type: + for ctype in state.config.content_type.dict(exclude_none=True): + configured_content_types.append(ctype) - configured_content_types = state.config.content_type.dict(exclude_none=True) return [ search_type.value for search_type in SearchType - if ( - search_type.value in configured_content_types - and getattr(state.content_index, search_type.value) is not None - ) - or ("plugins" in configured_content_types and search_type.name in configured_content_types["plugins"]) - or search_type == SearchType.All + if (search_type.value in configured_content_types) or search_type == SearchType.All ] @api.get("/search", response_model=List[SearchResponse]) +@requires(["authenticated"]) async def search( q: str, request: Request, + common: CommonQueryParams, n: Optional[int] = 5, t: Optional[SearchType] = SearchType.All, r: Optional[bool] = False, - score_threshold: Optional[Union[float, None]] = None, + max_distance: Optional[Union[float, None]] = None, dedupe: Optional[bool] = True, - client: Optional[str] = None, - user_agent: Optional[str] = Header(None), - referer: Optional[str] = Header(None), - host: Optional[str] = Header(None), ): + user = request.user.object start_time = time.time() # Run validation checks @@ -379,21 +371,19 @@ async def search( if q is None or q == "": logger.warning(f"No query param (q) passed in API call to initiate search") return results - if not state.search_models or not any(state.search_models.__dict__.values()): - logger.warning(f"No search models loaded. 
Configure a search model before initiating search") - return results # initialize variables user_query = q.strip() results_count = n or 5 - score_threshold = score_threshold if score_threshold is not None else -math.inf + max_distance = max_distance or math.inf search_futures: List[concurrent.futures.Future] = [] # return cached results, if available - query_cache_key = f"{user_query}-{n}-{t}-{r}-{score_threshold}-{dedupe}" - if query_cache_key in state.query_cache: - logger.debug(f"Return response from query cache") - return state.query_cache[query_cache_key] + if user: + query_cache_key = f"{user_query}-{n}-{t}-{r}-{max_distance}-{dedupe}" + if query_cache_key in state.query_cache[user.uuid]: + logger.debug(f"Return response from query cache") + return state.query_cache[user.uuid][query_cache_key] # Encode query with filter terms removed defiltered_query = user_query @@ -407,84 +397,31 @@ async def search( ] if text_search_models: with timer("Encoding query took", logger=logger): - encoded_asymmetric_query = util.normalize_embeddings( - text_search_models[0].bi_encoder.encode( - [defiltered_query], - convert_to_tensor=True, - device=state.device, - ) - ) + encoded_asymmetric_query = state.embeddings_model.embed_query(defiltered_query) with concurrent.futures.ThreadPoolExecutor() as executor: - if (t == SearchType.Org or t == SearchType.All) and state.content_index.org and state.search_models.text_search: - # query org-mode notes - search_futures += [ - executor.submit( - text_search.query, - user_query, - state.search_models.text_search, - state.content_index.org, - question_embedding=encoded_asymmetric_query, - rank_results=r or False, - score_threshold=score_threshold, - dedupe=dedupe or True, - ) - ] - - if ( - (t == SearchType.Markdown or t == SearchType.All) - and state.content_index.markdown - and state.search_models.text_search - ): + if t in [ + SearchType.All, + SearchType.Org, + SearchType.Markdown, + SearchType.Github, + SearchType.Notion, + 
SearchType.Plaintext, + SearchType.Pdf, + ]: # query markdown notes search_futures += [ executor.submit( text_search.query, + user, user_query, - state.search_models.text_search, - state.content_index.markdown, + t, question_embedding=encoded_asymmetric_query, - rank_results=r or False, - score_threshold=score_threshold, - dedupe=dedupe or True, + max_distance=max_distance, ) ] - if ( - (t == SearchType.Github or t == SearchType.All) - and state.content_index.github - and state.search_models.text_search - ): - # query github issues - search_futures += [ - executor.submit( - text_search.query, - user_query, - state.search_models.text_search, - state.content_index.github, - question_embedding=encoded_asymmetric_query, - rank_results=r or False, - score_threshold=score_threshold, - dedupe=dedupe or True, - ) - ] - - if (t == SearchType.Pdf or t == SearchType.All) and state.content_index.pdf and state.search_models.text_search: - # query pdf files - search_futures += [ - executor.submit( - text_search.query, - user_query, - state.search_models.text_search, - state.content_index.pdf, - question_embedding=encoded_asymmetric_query, - rank_results=r or False, - score_threshold=score_threshold, - dedupe=dedupe or True, - ) - ] - - if (t == SearchType.Image) and state.content_index.image and state.search_models.image_search: + elif (t == SearchType.Image) and state.content_index.image and state.search_models.image_search: # query images search_futures += [ executor.submit( @@ -493,71 +430,6 @@ async def search( results_count, state.search_models.image_search, state.content_index.image, - score_threshold=score_threshold, - ) - ] - - if ( - (t == SearchType.All or t in SearchType) - and state.content_index.plugins - and state.search_models.plugin_search - ): - # query specified plugin type - # Get plugin content, search model for specified search type, or the first one if none specified - plugin_search = state.search_models.plugin_search.get(t.value) or next( - 
iter(state.search_models.plugin_search.values()) - ) - plugin_content = state.content_index.plugins.get(t.value) or next( - iter(state.content_index.plugins.values()) - ) - search_futures += [ - executor.submit( - text_search.query, - user_query, - plugin_search, - plugin_content, - question_embedding=encoded_asymmetric_query, - rank_results=r or False, - score_threshold=score_threshold, - dedupe=dedupe or True, - ) - ] - - if ( - (t == SearchType.Notion or t == SearchType.All) - and state.content_index.notion - and state.search_models.text_search - ): - # query notion pages - search_futures += [ - executor.submit( - text_search.query, - user_query, - state.search_models.text_search, - state.content_index.notion, - question_embedding=encoded_asymmetric_query, - rank_results=r or False, - score_threshold=score_threshold, - dedupe=dedupe or True, - ) - ] - - if ( - (t == SearchType.Plaintext or t == SearchType.All) - and state.content_index.plaintext - and state.search_models.text_search - ): - # query plaintext files - search_futures += [ - executor.submit( - text_search.query, - user_query, - state.search_models.text_search, - state.content_index.plaintext, - question_embedding=encoded_asymmetric_query, - rank_results=r or False, - score_threshold=score_threshold, - dedupe=dedupe or True, ) ] @@ -576,28 +448,26 @@ async def search( count=results_count, ) else: - hits, entries = await search_future.result() + hits = await search_future.result() # Collate results - results += text_search.collate_results(hits, entries, results_count) + results += text_search.collate_results(hits, dedupe=dedupe) - # Sort results across all content types and take top results - results = sorted(results, key=lambda x: float(x.score), reverse=True)[:results_count] + # Sort results across all content types and take top results + results = text_search.rerank_and_sort_results(results, query=defiltered_query, rank_results=r)[ + :results_count + ] # Cache results - 
state.query_cache[query_cache_key] = results + if user: + state.query_cache[user.uuid][query_cache_key] = results update_telemetry_state( request=request, telemetry_type="api", api="search", - client=client, - user_agent=user_agent, - referer=referer, - host=host, + **common.__dict__, ) - state.previous_query = user_query - end_time = time.time() logger.debug(f"🔍 Search took: {end_time - start_time:.3f} seconds") @@ -605,21 +475,20 @@ async def search( @api.get("/update") +@requires(["authenticated"]) def update( request: Request, + common: CommonQueryParams, t: Optional[SearchType] = None, force: Optional[bool] = False, - client: Optional[str] = None, - user_agent: Optional[str] = Header(None), - referer: Optional[str] = Header(None), - host: Optional[str] = Header(None), ): + user = request.user.object if not state.config: error_msg = f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/config, plugins or by editing {state.config_file}." logger.warning(error_msg) raise HTTPException(status_code=500, detail=error_msg) try: - configure_server(state.config, regenerate=force, search_type=t) + configure_server(state.config, regenerate=force, search_type=t, user=user) except Exception as e: error_msg = f"🚨 Failed to update server via API: {e}" logger.error(error_msg, exc_info=True) @@ -630,8 +499,6 @@ def update( components.append("Search models") if state.content_index: components.append("Content index") - if state.processor_config: - components.append("Conversation processor") components_msg = ", ".join(components) logger.info(f"📪 {components_msg} updated via API") @@ -639,50 +506,39 @@ def update( request=request, telemetry_type="api", api="update", - client=client, - user_agent=user_agent, - referer=referer, - host=host, + **common.__dict__, ) return {"status": "ok", "message": "khoj reloaded"} @api.get("/chat/history") +@requires(["authenticated"]) def chat_history( request: Request, - client: Optional[str] = None, - user_agent: Optional[str] = 
Header(None), - referer: Optional[str] = Header(None), - host: Optional[str] = Header(None), + common: CommonQueryParams, ): - perform_chat_checks() + user = request.user.object + validate_conversation_config() # Load Conversation History - meta_log = {} - if state.processor_config.conversation: - meta_log = state.processor_config.conversation.meta_log + meta_log = ConversationAdapters.get_conversation_by_user(user=user).conversation_log update_telemetry_state( request=request, telemetry_type="api", api="chat", - client=client, - user_agent=user_agent, - referer=referer, - host=host, + **common.__dict__, ) return {"status": "ok", "response": meta_log.get("chat", [])} @api.get("/chat/options", response_class=Response) +@requires(["authenticated"]) async def chat_options( request: Request, - client: Optional[str] = None, - user_agent: Optional[str] = Header(None), - referer: Optional[str] = Header(None), - host: Optional[str] = Header(None), + common: CommonQueryParams, ) -> Response: cmd_options = {} for cmd in ConversationCommand: @@ -692,49 +548,69 @@ async def chat_options( request=request, telemetry_type="api", api="chat_options", - client=client, - user_agent=user_agent, - referer=referer, - host=host, + **common.__dict__, ) return Response(content=json.dumps(cmd_options), media_type="application/json", status_code=200) @api.get("/chat", response_class=Response) +@requires(["authenticated"]) async def chat( request: Request, + common: CommonQueryParams, q: str, n: Optional[int] = 5, - client: Optional[str] = None, + d: Optional[float] = 0.18, stream: Optional[bool] = False, - user_agent: Optional[str] = Header(None), - referer: Optional[str] = Header(None), - host: Optional[str] = Header(None), + rate_limiter_per_minute=Depends(ApiUserRateLimiter(requests=30, window=60)), + rate_limiter_per_day=Depends(ApiUserRateLimiter(requests=500, window=60 * 60 * 24)), ) -> Response: - perform_chat_checks() + user = request.user.object + + await is_ready_to_chat(user) 
conversation_command = get_conversation_command(query=q, any_references=True) q = q.replace(f"/{conversation_command.value}", "").strip() + meta_log = (await ConversationAdapters.aget_conversation_by_user(user)).conversation_log + compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions( - request, q, (n or 5), conversation_command + request, common, meta_log, q, (n or 5), (d or math.inf), conversation_command ) if conversation_command == ConversationCommand.Default and is_none_or_empty(compiled_references): conversation_command = ConversationCommand.General - if conversation_command == ConversationCommand.Help: - model_type = "offline" if state.processor_config.conversation.offline_chat.enable_offline_chat else "openai" - formatted_help = help_message.format(model=model_type, version=state.khoj_version) + elif conversation_command == ConversationCommand.Help: + conversation_config = await ConversationAdapters.aget_user_conversation_config(user) + if conversation_config == None: + conversation_config = await ConversationAdapters.aget_default_conversation_config() + model_type = conversation_config.model_type + formatted_help = help_message.format(model=model_type, version=state.khoj_version, device=get_device()) return StreamingResponse(iter([formatted_help]), media_type="text/event-stream", status_code=200) + elif conversation_command == ConversationCommand.Notes and not await EntryAdapters.auser_has_entries(user): + no_entries_found_format = no_entries_found.format() + return StreamingResponse(iter([no_entries_found_format]), media_type="text/event-stream", status_code=200) + # Get the (streamed) chat response from the LLM of choice. 
- llm_response = generate_chat_response( + llm_response, chat_metadata = await agenerate_chat_response( defiltered_query, - meta_log=state.processor_config.conversation.meta_log, - compiled_references=compiled_references, - inferred_queries=inferred_queries, - conversation_command=conversation_command, + meta_log, + compiled_references, + inferred_queries, + conversation_command, + user, + ) + + chat_metadata.update({"conversation_command": conversation_command.value}) + + update_telemetry_state( + request=request, + telemetry_type="api", + api="chat", + metadata=chat_metadata, + **common.__dict__, ) if llm_response is None: @@ -743,70 +619,83 @@ async def chat( if stream: return StreamingResponse(llm_response, media_type="text/event-stream", status_code=200) + iterator = AsyncIteratorWrapper(llm_response) + # Get the full response from the generator if the stream is not requested. aggregated_gpt_response = "" - while True: - try: - aggregated_gpt_response += next(llm_response) - except StopIteration: + async for item in iterator: + if item is None: break + aggregated_gpt_response += item actual_response = aggregated_gpt_response.split("### compiled references:")[0] response_obj = {"response": actual_response, "context": compiled_references} - update_telemetry_state( - request=request, - telemetry_type="api", - api="chat", - client=client, - user_agent=user_agent, - referer=referer, - host=host, - ) - return Response(content=json.dumps(response_obj), media_type="application/json", status_code=200) async def extract_references_and_questions( request: Request, + common: CommonQueryParams, + meta_log: dict, q: str, n: int, + d: float, conversation_type: ConversationCommand = ConversationCommand.Default, ): - # Load Conversation History - meta_log = state.processor_config.conversation.meta_log + user = request.user.object if request.user.is_authenticated else None # Initialize Variables compiled_references: List[Any] = [] inferred_queries: List[str] = [] - if 
state.content_index is None: + if conversation_type == ConversationCommand.General: + return compiled_references, inferred_queries, q + + if not await sync_to_async(EntryAdapters.user_has_entries)(user=user): logger.warning( "No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes." ) return compiled_references, inferred_queries, q - if conversation_type == ConversationCommand.General: - return compiled_references, inferred_queries, q - # Extract filter terms from user message defiltered_query = q for filter in [DateFilter(), WordFilter(), FileFilter()]: defiltered_query = filter.defilter(defiltered_query) filters_in_query = q.replace(defiltered_query, "").strip() + using_offline_chat = False + # Infer search queries from user message with timer("Extracting search queries took", logger): # If we've reached here, either the user has enabled offline chat or the openai model is enabled. - if state.processor_config.conversation.offline_chat.enable_offline_chat: - loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model + offline_chat_config = await ConversationAdapters.aget_offline_chat_conversation_config() + conversation_config = await ConversationAdapters.aget_conversation_config(user) + if conversation_config is None: + conversation_config = await ConversationAdapters.aget_default_conversation_config() + openai_chat_config = await ConversationAdapters.aget_openai_conversation_config() + if ( + offline_chat_config + and offline_chat_config.enabled + and conversation_config.model_type == ChatModelOptions.ModelType.OFFLINE + ): + using_offline_chat = True + offline_chat = await ConversationAdapters.get_offline_chat() + chat_model = offline_chat.chat_model + if state.gpt4all_processor_config is None: + state.gpt4all_processor_config = GPT4AllProcessorModel(chat_model=chat_model) + + loaded_model = state.gpt4all_processor_config.loaded_model + 
inferred_queries = extract_questions_offline( defiltered_query, loaded_model=loaded_model, conversation_log=meta_log, should_extract_questions=False ) - elif state.processor_config.conversation.openai_model: - api_key = state.processor_config.conversation.openai_model.api_key - chat_model = state.processor_config.conversation.openai_model.chat_model + elif openai_chat_config and conversation_config.model_type == ChatModelOptions.ModelType.OPENAI: + openai_chat_config = await ConversationAdapters.get_openai_chat_config() + openai_chat = await ConversationAdapters.get_openai_chat() + api_key = openai_chat_config.api_key + chat_model = openai_chat.chat_model inferred_queries = extract_questions( defiltered_query, model=chat_model, api_key=api_key, conversation_log=meta_log ) @@ -815,17 +704,19 @@ async def extract_references_and_questions( with timer("Searching knowledge base took", logger): result_list = [] for query in inferred_queries: - n_items = min(n, 3) if state.processor_config.conversation.offline_chat.enable_offline_chat else n + n_items = min(n, 3) if using_offline_chat else n result_list.extend( await search( f"{query} {filters_in_query}", request=request, n=n_items, r=True, - score_threshold=-5.0, + max_distance=d, dedupe=False, + common=common, ) ) + result_list = text_search.deduplicated_search_responses(result_list) compiled_references = [item.additional["compiled"] for item in result_list] return compiled_references, inferred_queries, defiltered_query diff --git a/src/khoj/routers/auth.py b/src/khoj/routers/auth.py new file mode 100644 index 00000000..a9a88325 --- /dev/null +++ b/src/khoj/routers/auth.py @@ -0,0 +1,120 @@ +# Standard Packages +import logging +import os +from typing import Optional + +# External Packages +from fastapi import APIRouter +from starlette.config import Config +from starlette.requests import Request +from starlette.responses import HTMLResponse, RedirectResponse, Response +from starlette.authentication import requires +from 
authlib.integrations.starlette_client import OAuth, OAuthError + +from google.oauth2 import id_token +from google.auth.transport import requests as google_requests + +# Internal Packages +from database.adapters import get_khoj_tokens, get_or_create_user, create_khoj_token, delete_khoj_token +from database.models import KhojApiUser +from khoj.routers.helpers import update_telemetry_state +from khoj.utils import state + + +logger = logging.getLogger(__name__) + +auth_router = APIRouter() + +if not state.anonymous_mode and not (os.environ.get("GOOGLE_CLIENT_ID") and os.environ.get("GOOGLE_CLIENT_SECRET")): + logger.warn( + "🚨 Use --anonymous-mode flag to disable Google OAuth or set GOOGLE_CLIENT_ID, GOOGLE_CLIENT_SECRET environment variables to enable it" + ) +else: + config = Config(environ=os.environ) + + oauth = OAuth(config) + + CONF_URL = "https://accounts.google.com/.well-known/openid-configuration" + oauth.register(name="google", server_metadata_url=CONF_URL, client_kwargs={"scope": "openid email profile"}) + + +@auth_router.get("/login") +async def login_get(request: Request): + redirect_uri = str(request.app.url_path_for("auth")) + return await oauth.google.authorize_redirect(request, redirect_uri) + + +@auth_router.post("/login") +async def login(request: Request): + redirect_uri = str(request.app.url_path_for("auth")) + return await oauth.google.authorize_redirect(request, redirect_uri) + + +@auth_router.post("/token") +@requires(["authenticated"], redirect="login_page") +async def generate_token(request: Request, token_name: Optional[str] = None): + "Generate API token for given user" + if token_name: + token = await create_khoj_token(user=request.user.object, name=token_name) + else: + token = await create_khoj_token(user=request.user.object) + return { + "token": token.token, + "name": token.name, + } + + +@auth_router.get("/token") +@requires(["authenticated"], redirect="login_page") +def get_tokens(request: Request): + "Get API tokens enabled for given 
user" + tokens = get_khoj_tokens(user=request.user.object) + return tokens + + +@auth_router.delete("/token") +@requires(["authenticated"], redirect="login_page") +async def delete_token(request: Request, token: str) -> str: + "Delete API token for given user" + return await delete_khoj_token(user=request.user.object, token=token) + + +@auth_router.post("/redirect") +async def auth(request: Request): + form = await request.form() + credential = form.get("credential") + + csrf_token_cookie = request.cookies.get("g_csrf_token") + if not csrf_token_cookie: + return Response("Missing CSRF token", status_code=400) + csrf_token_body = form.get("g_csrf_token") + if not csrf_token_body: + return Response("Missing CSRF token", status_code=400) + if csrf_token_cookie != csrf_token_body: + return Response("Invalid CSRF token", status_code=400) + + try: + idinfo = id_token.verify_oauth2_token(credential, google_requests.Request(), os.environ["GOOGLE_CLIENT_ID"]) + except OAuthError as error: + return HTMLResponse(f"

{error.error}

") + khoj_user = await get_or_create_user(idinfo) + if khoj_user: + request.session["user"] = dict(idinfo) + + if not khoj_user.last_login: + update_telemetry_state( + request=request, + telemetry_type="api", + api="create_user", + metadata={"user_id": str(khoj_user.uuid)}, + ) + logger.log(logging.INFO, f"New User Created: {khoj_user.uuid}") + RedirectResponse(url="/?status=welcome") + + return RedirectResponse(url="/") + + +@auth_router.get("/logout") +async def logout(request: Request): + request.session.pop("user", None) + return RedirectResponse(url="/") diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 6b42f29c..272f962d 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -1,33 +1,60 @@ -import logging +# Standard Packages +import asyncio +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor from datetime import datetime from functools import partial -from typing import Iterator, List, Optional, Union +import logging +from time import time +from typing import Annotated, Iterator, List, Optional, Union, Tuple, Dict -from fastapi import HTTPException, Request +# External Packages +from fastapi import HTTPException, Header, Request, Depends +# Internal Packages from khoj.utils import state -from khoj.utils.helpers import ConversationCommand, timer, log_telemetry +from khoj.utils.config import GPT4AllProcessorModel +from khoj.utils.helpers import ConversationCommand, log_telemetry from khoj.processor.conversation.openai.gpt import converse from khoj.processor.conversation.gpt4all.chat_model import converse_offline -from khoj.processor.conversation.utils import reciprocal_conversation_to_chatml, message_to_log, ThreadedGenerator +from khoj.processor.conversation.utils import message_to_log, ThreadedGenerator +from database.models import KhojUser, Subscription +from database.adapters import ConversationAdapters + logger = logging.getLogger(__name__) +executor = 
ThreadPoolExecutor(max_workers=1) -def perform_chat_checks(): + +def validate_conversation_config(): if ( - state.processor_config - and state.processor_config.conversation - and ( - state.processor_config.conversation.openai_model - or state.processor_config.conversation.gpt4all_model.loaded_model - ) + ConversationAdapters.has_valid_offline_conversation_config() + or ConversationAdapters.has_valid_openai_conversation_config() ): + if ConversationAdapters.get_default_conversation_config() is None: + raise HTTPException(status_code=500, detail="Contact the server administrator to set a default chat model.") return - raise HTTPException( - status_code=500, detail="Set your OpenAI API key or enable Local LLM via Khoj settings and restart it." - ) + raise HTTPException(status_code=500, detail="Set your OpenAI API key or enable Local LLM via Khoj settings.") + + +async def is_ready_to_chat(user: KhojUser): + has_offline_config = await ConversationAdapters.ahas_offline_chat() + has_openai_config = await ConversationAdapters.has_openai_chat() + user_conversation_config = await ConversationAdapters.aget_user_conversation_config(user) + + if has_offline_config and user_conversation_config and user_conversation_config.model_type == "offline": + chat_model = user_conversation_config.chat_model + if state.gpt4all_processor_config is None: + logger.info("Loading Offline Chat Model...") + state.gpt4all_processor_config = GPT4AllProcessorModel(chat_model=chat_model) + return True + + ready = has_openai_config or has_offline_config + + if not ready: + raise HTTPException(status_code=500, detail="Set your OpenAI API key or enable Local LLM via Khoj settings.") def update_telemetry_state( @@ -40,11 +67,16 @@ def update_telemetry_state( host: Optional[str] = None, metadata: Optional[dict] = None, ): + user: KhojUser = request.user.object if request.user.is_authenticated else None + subscription: Subscription = user.subscription if user and user.subscription else None user_state = { 
"client_host": request.client.host if request.client else None, "user_agent": user_agent or "unknown", "referer": referer or "unknown", "host": host or "unknown", + "server_id": str(user.uuid) if user else None, + "subscription_type": subscription.type if subscription else None, + "is_recurring": subscription.is_recurring if subscription else None, } if metadata: @@ -71,13 +103,23 @@ def get_conversation_command(query: str, any_references: bool = False) -> Conver return ConversationCommand.Default +async def construct_conversation_logs(user: KhojUser): + return (await ConversationAdapters.aget_conversation_by_user(user)).conversation_log + + +async def agenerate_chat_response(*args): + loop = asyncio.get_event_loop() + return await loop.run_in_executor(executor, generate_chat_response, *args) + + def generate_chat_response( q: str, meta_log: dict, compiled_references: List[str] = [], inferred_queries: List[str] = [], conversation_command: ConversationCommand = ConversationCommand.Default, -) -> Union[ThreadedGenerator, Iterator[str]]: + user: KhojUser = None, +) -> Tuple[Union[ThreadedGenerator, Iterator[str]], Dict[str, str]]: def _save_to_conversation_log( q: str, chat_response: str, @@ -86,23 +128,22 @@ def generate_chat_response( inferred_queries: List[str], meta_log, ): - state.processor_config.conversation.chat_session += reciprocal_conversation_to_chatml([q, chat_response]) - state.processor_config.conversation.meta_log["chat"] = message_to_log( + updated_conversation = message_to_log( user_message=q, chat_response=chat_response, user_message_metadata={"created": user_message_time}, khoj_message_metadata={"context": compiled_references, "intent": {"inferred-queries": inferred_queries}}, conversation_log=meta_log.get("chat", []), ) - - # Load Conversation History - meta_log = state.processor_config.conversation.meta_log + ConversationAdapters.save_conversation(user, {"chat": updated_conversation}) # Initialize Variables user_message_time = 
datetime.now().strftime("%Y-%m-%d %H:%M:%S") chat_response = None logger.debug(f"Conversation Type: {conversation_command.name}") + metadata = {} + try: partial_completion = partial( _save_to_conversation_log, @@ -113,8 +154,16 @@ def generate_chat_response( meta_log=meta_log, ) - if state.processor_config.conversation.offline_chat.enable_offline_chat: - loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model + offline_chat_config = ConversationAdapters.get_offline_chat_conversation_config() + conversation_config = ConversationAdapters.get_conversation_config(user) + if conversation_config is None: + conversation_config = ConversationAdapters.get_default_conversation_config() + openai_chat_config = ConversationAdapters.get_openai_conversation_config() + if offline_chat_config and offline_chat_config.enabled and conversation_config.model_type == "offline": + if state.gpt4all_processor_config is None or state.gpt4all_processor_config.loaded_model is None: + state.gpt4all_processor_config = GPT4AllProcessorModel(conversation_config.chat_model) + + loaded_model = state.gpt4all_processor_config.loaded_model chat_response = converse_offline( references=compiled_references, user_query=q, @@ -122,14 +171,14 @@ def generate_chat_response( conversation_log=meta_log, completion_func=partial_completion, conversation_command=conversation_command, - model=state.processor_config.conversation.offline_chat.chat_model, - max_prompt_size=state.processor_config.conversation.max_prompt_size, - tokenizer_name=state.processor_config.conversation.tokenizer, + model=conversation_config.chat_model, + max_prompt_size=conversation_config.max_prompt_size, + tokenizer_name=conversation_config.tokenizer, ) - elif state.processor_config.conversation.openai_model: - api_key = state.processor_config.conversation.openai_model.api_key - chat_model = state.processor_config.conversation.openai_model.chat_model + elif openai_chat_config and conversation_config.model_type == 
"openai": + api_key = openai_chat_config.api_key + chat_model = conversation_config.chat_model chat_response = converse( compiled_references, q, @@ -138,12 +187,54 @@ def generate_chat_response( api_key=api_key, completion_func=partial_completion, conversation_command=conversation_command, - max_prompt_size=state.processor_config.conversation.max_prompt_size, - tokenizer_name=state.processor_config.conversation.tokenizer, + max_prompt_size=conversation_config.max_prompt_size, + tokenizer_name=conversation_config.tokenizer, ) + metadata.update({"chat_model": conversation_config.chat_model}) + except Exception as e: logger.error(e, exc_info=True) raise HTTPException(status_code=500, detail=str(e)) - return chat_response + return chat_response, metadata + + +class ApiUserRateLimiter: + def __init__(self, requests: int, window: int): + self.requests = requests + self.window = window + self.cache: dict[str, list[float]] = defaultdict(list) + + def __call__(self, request: Request): + user: KhojUser = request.user.object + user_requests = self.cache[user.uuid] + + # Remove requests outside of the time window + cutoff = time() - self.window + while user_requests and user_requests[0] < cutoff: + user_requests.pop(0) + + # Check if the user has exceeded the rate limit + if len(user_requests) >= self.requests: + raise HTTPException(status_code=429, detail="Too Many Requests") + + # Add the current request to the cache + user_requests.append(time()) + + +class CommonQueryParamsClass: + def __init__( + self, + client: Optional[str] = None, + user_agent: Optional[str] = Header(None), + referer: Optional[str] = Header(None), + host: Optional[str] = Header(None), + ): + self.client = client + self.user_agent = user_agent + self.referer = referer + self.host = host + + +CommonQueryParams = Annotated[CommonQueryParamsClass, Depends()] diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index a9656050..ccb65063 100644 --- a/src/khoj/routers/indexer.py +++ 
b/src/khoj/routers/indexer.py @@ -1,39 +1,40 @@ # Standard Packages import logging from typing import Optional, Union, Dict +import asyncio # External Packages -from fastapi import APIRouter, HTTPException, Header, Request, Response, UploadFile +from fastapi import APIRouter, Header, Request, Response, UploadFile from pydantic import BaseModel -from khoj.routers.helpers import update_telemetry_state +from starlette.authentication import requires # Internal Packages from khoj.utils import state, constants -from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl -from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl -from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl -from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl -from khoj.processor.github.github_to_jsonl import GithubToJsonl -from khoj.processor.notion.notion_to_jsonl import NotionToJsonl -from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl -from khoj.utils.rawconfig import ContentConfig, TextContentConfig +from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries +from khoj.processor.org_mode.org_to_entries import OrgToEntries +from khoj.processor.pdf.pdf_to_entries import PdfToEntries +from khoj.processor.github.github_to_entries import GithubToEntries +from khoj.processor.notion.notion_to_entries import NotionToEntries +from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries from khoj.search_type import text_search, image_search +from khoj.routers.helpers import update_telemetry_state from khoj.utils.yaml import save_config_to_file_updated_state from khoj.utils.config import SearchModels -from khoj.utils.constants import default_config from khoj.utils.helpers import LRU, get_file_type from khoj.utils.rawconfig import ( ContentConfig, FullConfig, SearchConfig, ) -from khoj.search_filter.date_filter import DateFilter -from khoj.search_filter.word_filter import WordFilter -from khoj.search_filter.file_filter import 
FileFilter from khoj.utils.config import ( ContentIndex, SearchModels, ) +from database.models import ( + KhojUser, + GithubConfig, + NotionConfig, +) logger = logging.getLogger(__name__) @@ -57,25 +58,23 @@ class IndexerInput(BaseModel): @indexer.post("/update") +@requires(["authenticated"]) async def update( request: Request, files: list[UploadFile], - x_api_key: str = Header(None), force: bool = False, - t: Optional[Union[state.SearchType, str]] = None, + t: Optional[Union[state.SearchType, str]] = state.SearchType.All, client: Optional[str] = None, user_agent: Optional[str] = Header(None), referer: Optional[str] = Header(None), host: Optional[str] = Header(None), ): - if x_api_key != "secret": - raise HTTPException(status_code=401, detail="Invalid API Key") - state.config_lock.acquire() + user = request.user.object try: logger.info(f"📬 Updating content index via API call by {client} client") org_files: Dict[str, str] = {} markdown_files: Dict[str, str] = {} - pdf_files: Dict[str, str] = {} + pdf_files: Dict[str, bytes] = {} plaintext_files: Dict[str, str] = {} for file in files: @@ -86,13 +85,13 @@ async def update( elif file_type == "markdown": dict_to_update = markdown_files elif file_type == "pdf": - dict_to_update = pdf_files + dict_to_update = pdf_files # type: ignore elif file_type == "plaintext": dict_to_update = plaintext_files if dict_to_update is not None: dict_to_update[file.filename] = ( - file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read() + file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read() # type: ignore ) else: logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}") @@ -120,30 +119,35 @@ async def update( github=None, notion=None, plaintext=None, - plugins=None, ) state.config.content_type = default_content_config save_config_to_file_updated_state() configure_search(state.search_models, state.config.search_type) # Extract required fields from config - 
state.content_index = configure_content( + loop = asyncio.get_event_loop() + state.content_index, success = await loop.run_in_executor( + None, + configure_content, state.content_index, state.config.content_type, indexer_input.dict(), state.search_models, - regenerate=force, - t=t, - full_corpus=False, + force, + t, + False, + user, ) - + if not success: + raise RuntimeError("Failed to update content index") + logger.info(f"Finished processing batch indexing request") except Exception as e: + logger.error(f"Failed to process batch indexing request: {e}", exc_info=True) logger.error( f"🚨 Failed to {force} update {t} content index triggered via API call by {client} client: {e}", exc_info=True, ) - finally: - state.config_lock.release() + return Response(content="Failed", status_code=500) update_telemetry_state( request=request, @@ -156,23 +160,16 @@ async def update( ) logger.info(f"📪 Content index updated via API call by {client} client") + return Response(content="OK", status_code=200) def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]: # Run Validation Checks - if search_config is None: - logger.warning("🚨 No Search configuration available.") - return None if search_models is None: search_models = SearchModels() - # Initialize Search Models - if search_config.asymmetric: - logger.info("🔍 📜 Setting up text search model") - search_models.text_search = text_search.initialize_model(search_config.asymmetric) - - if search_config.image: + if search_config and search_config.image: logger.info("🔍 🌄 Setting up image search model") search_models.image_search = image_search.initialize_model(search_config.image) @@ -185,153 +182,105 @@ def configure_content( files: Optional[dict[str, dict[str, str]]], search_models: SearchModels, regenerate: bool = False, - t: Optional[Union[state.SearchType, str]] = None, + t: Optional[state.SearchType] = state.SearchType.All, full_corpus: bool = True, -) -> 
Optional[ContentIndex]: - def has_valid_text_config(config: TextContentConfig): - return config.input_files or config.input_filter + user: KhojUser = None, +) -> tuple[Optional[ContentIndex], bool]: + content_index = ContentIndex() - # Run Validation Checks - if content_config is None: - logger.warning("🚨 No Content configuration available.") - return None - if content_index is None: - content_index = ContentIndex() + success = True + if t is not None and t in [type.value for type in state.SearchType]: + t = state.SearchType(t) - if t in [type.value for type in state.SearchType]: - t = state.SearchType(t).value + if t is not None and not t.value in [type.value for type in state.SearchType]: + logger.warning(f"🚨 Invalid search type: {t}") + return None, False - assert type(t) == str or t == None, f"Invalid search type: {t}" + search_type = t.value if t else None if files is None: - logger.warning(f"🚨 No files to process for {t} search.") - return None + logger.warning(f"🚨 No files to process for {search_type} search.") + return None, True try: # Initialize Org Notes Search - if ( - (t == None or t == state.SearchType.Org.value) - and ((content_config.org and has_valid_text_config(content_config.org)) or files["org"]) - and search_models.text_search - ): - if content_config.org == None: - logger.info("🦄 No configuration for orgmode notes. 
Using default configuration.") - default_configuration = default_config["content-type"]["org"] # type: ignore - content_config.org = TextContentConfig( - compressed_jsonl=default_configuration["compressed-jsonl"], - embeddings_file=default_configuration["embeddings-file"], - ) - + if (search_type == state.SearchType.All.value or search_type == state.SearchType.Org.value) and files["org"]: logger.info("🦄 Setting up search for orgmode notes") # Extract Entries, Generate Notes Embeddings - content_index.org = text_search.setup( - OrgToJsonl, + text_search.setup( + OrgToEntries, files.get("org"), - content_config.org, - search_models.text_search.bi_encoder, regenerate=regenerate, - filters=[DateFilter(), WordFilter(), FileFilter()], full_corpus=full_corpus, + user=user, ) except Exception as e: logger.error(f"🚨 Failed to setup org: {e}", exc_info=True) + success = False try: # Initialize Markdown Search - if ( - (t == None or t == state.SearchType.Markdown.value) - and ((content_config.markdown and has_valid_text_config(content_config.markdown)) or files["markdown"]) - and search_models.text_search - and files["markdown"] - ): - if content_config.markdown == None: - logger.info("💎 No configuration for markdown notes. 
Using default configuration.") - default_configuration = default_config["content-type"]["markdown"] # type: ignore - content_config.markdown = TextContentConfig( - compressed_jsonl=default_configuration["compressed-jsonl"], - embeddings_file=default_configuration["embeddings-file"], - ) - + if (search_type == state.SearchType.All.value or search_type == state.SearchType.Markdown.value) and files[ + "markdown" + ]: logger.info("💎 Setting up search for markdown notes") # Extract Entries, Generate Markdown Embeddings - content_index.markdown = text_search.setup( - MarkdownToJsonl, + text_search.setup( + MarkdownToEntries, files.get("markdown"), - content_config.markdown, - search_models.text_search.bi_encoder, regenerate=regenerate, - filters=[DateFilter(), WordFilter(), FileFilter()], full_corpus=full_corpus, + user=user, ) except Exception as e: logger.error(f"🚨 Failed to setup markdown: {e}", exc_info=True) + success = False try: # Initialize PDF Search - if ( - (t == None or t == state.SearchType.Pdf.value) - and ((content_config.pdf and has_valid_text_config(content_config.pdf)) or files["pdf"]) - and search_models.text_search - and files["pdf"] - ): - if content_config.pdf == None: - logger.info("🖨️ No configuration for pdf notes. 
Using default configuration.") - default_configuration = default_config["content-type"]["pdf"] # type: ignore - content_config.pdf = TextContentConfig( - compressed_jsonl=default_configuration["compressed-jsonl"], - embeddings_file=default_configuration["embeddings-file"], - ) - + if (search_type == state.SearchType.All.value or search_type == state.SearchType.Pdf.value) and files["pdf"]: logger.info("🖨️ Setting up search for pdf") # Extract Entries, Generate PDF Embeddings - content_index.pdf = text_search.setup( - PdfToJsonl, + text_search.setup( + PdfToEntries, files.get("pdf"), - content_config.pdf, - search_models.text_search.bi_encoder, regenerate=regenerate, - filters=[DateFilter(), WordFilter(), FileFilter()], full_corpus=full_corpus, + user=user, ) except Exception as e: logger.error(f"🚨 Failed to setup PDF: {e}", exc_info=True) + success = False try: # Initialize Plaintext Search - if ( - (t == None or t == state.SearchType.Plaintext.value) - and ((content_config.plaintext and has_valid_text_config(content_config.plaintext)) or files["plaintext"]) - and search_models.text_search - and files["plaintext"] - ): - if content_config.plaintext == None: - logger.info("📄 No configuration for plaintext notes. 
Using default configuration.") - default_configuration = default_config["content-type"]["plaintext"] # type: ignore - content_config.plaintext = TextContentConfig( - compressed_jsonl=default_configuration["compressed-jsonl"], - embeddings_file=default_configuration["embeddings-file"], - ) - + if (search_type == state.SearchType.All.value or search_type == state.SearchType.Plaintext.value) and files[ + "plaintext" + ]: logger.info("📄 Setting up search for plaintext") # Extract Entries, Generate Plaintext Embeddings - content_index.plaintext = text_search.setup( - PlaintextToJsonl, + text_search.setup( + PlaintextToEntries, files.get("plaintext"), - content_config.plaintext, - search_models.text_search.bi_encoder, regenerate=regenerate, - filters=[DateFilter(), WordFilter(), FileFilter()], full_corpus=full_corpus, + user=user, ) except Exception as e: logger.error(f"🚨 Failed to setup plaintext: {e}", exc_info=True) + success = False try: # Initialize Image Search - if (t == None or t == state.SearchType.Image.value) and content_config.image and search_models.image_search: + if ( + (search_type == state.SearchType.All.value or search_type == state.SearchType.Image.value) + and content_config + and content_config.image + and search_models.image_search + ): logger.info("🌄 Setting up search for images") # Extract Entries, Generate Image Embeddings content_index.image = image_search.setup( @@ -340,64 +289,53 @@ def configure_content( except Exception as e: logger.error(f"🚨 Failed to setup images: {e}", exc_info=True) + success = False try: - if (t == None or t == state.SearchType.Github.value) and content_config.github and search_models.text_search: + github_config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first() + if ( + search_type == state.SearchType.All.value or search_type == state.SearchType.Github.value + ) and github_config is not None: logger.info("🐙 Setting up search for github") # Extract Entries, Generate Github Embeddings 
- content_index.github = text_search.setup( - GithubToJsonl, + text_search.setup( + GithubToEntries, None, - content_config.github, - search_models.text_search.bi_encoder, regenerate=regenerate, - filters=[DateFilter(), WordFilter(), FileFilter()], full_corpus=full_corpus, + user=user, + config=github_config, ) except Exception as e: logger.error(f"🚨 Failed to setup GitHub: {e}", exc_info=True) + success = False try: # Initialize Notion Search - if (t == None or t in state.SearchType.Notion.value) and content_config.notion and search_models.text_search: + notion_config = NotionConfig.objects.filter(user=user).first() + if ( + search_type == state.SearchType.All.value or search_type in state.SearchType.Notion.value + ) and notion_config: logger.info("🔌 Setting up search for notion") - content_index.notion = text_search.setup( - NotionToJsonl, + text_search.setup( + NotionToEntries, None, - content_config.notion, - search_models.text_search.bi_encoder, regenerate=regenerate, - filters=[DateFilter(), WordFilter(), FileFilter()], full_corpus=full_corpus, + user=user, + config=notion_config, ) except Exception as e: logger.error(f"🚨 Failed to setup GitHub: {e}", exc_info=True) - - try: - # Initialize External Plugin Search - if t == None and content_config.plugins and search_models.text_search: - logger.info("🔌 Setting up search for plugins") - content_index.plugins = {} - for plugin_type, plugin_config in content_config.plugins.items(): - content_index.plugins[plugin_type] = text_search.setup( - JsonlToJsonl, - None, - plugin_config, - search_models.text_search.bi_encoder, - regenerate=regenerate, - filters=[DateFilter(), WordFilter(), FileFilter()], - full_corpus=full_corpus, - ) - - except Exception as e: - logger.error(f"🚨 Failed to setup Plugin: {e}", exc_info=True) + success = False # Invalidate Query Cache - state.query_cache = LRU() + if user: + state.query_cache[user.uuid] = LRU() - return content_index + return content_index, success def load_content( @@ 
-405,51 +343,15 @@ def load_content( content_index: Optional[ContentIndex], search_models: SearchModels, ): - logger.info(f"Loading content from existing embeddings...") if content_config is None: logger.warning("🚨 No Content configuration available.") return None if content_index is None: content_index = ContentIndex() - if content_config.org: - logger.info("🦄 Loading orgmode notes") - content_index.org = text_search.load(content_config.org, filters=[DateFilter(), WordFilter(), FileFilter()]) - if content_config.markdown: - logger.info("💎 Loading markdown notes") - content_index.markdown = text_search.load( - content_config.markdown, filters=[DateFilter(), WordFilter(), FileFilter()] - ) - if content_config.pdf: - logger.info("🖨️ Loading pdf") - content_index.pdf = text_search.load(content_config.pdf, filters=[DateFilter(), WordFilter(), FileFilter()]) - if content_config.plaintext: - logger.info("📄 Loading plaintext") - content_index.plaintext = text_search.load( - content_config.plaintext, filters=[DateFilter(), WordFilter(), FileFilter()] - ) if content_config.image: logger.info("🌄 Loading images") content_index.image = image_search.setup( content_config.image, search_models.image_search.image_encoder, regenerate=False ) - if content_config.github: - logger.info("🐙 Loading github") - content_index.github = text_search.load( - content_config.github, filters=[DateFilter(), WordFilter(), FileFilter()] - ) - if content_config.notion: - logger.info("🔌 Loading notion") - content_index.notion = text_search.load( - content_config.notion, filters=[DateFilter(), WordFilter(), FileFilter()] - ) - if content_config.plugins: - logger.info("🔌 Loading plugins") - content_index.plugins = {} - for plugin_type, plugin_config in content_config.plugins.items(): - content_index.plugins[plugin_type] = text_search.load( - plugin_config, filters=[DateFilter(), WordFilter(), FileFilter()] - ) - - state.query_cache = LRU() return content_index diff --git 
a/src/khoj/routers/subscription.py b/src/khoj/routers/subscription.py new file mode 100644 index 00000000..62e50d72 --- /dev/null +++ b/src/khoj/routers/subscription.py @@ -0,0 +1,105 @@ +# Standard Packages +from datetime import datetime, timezone +import logging +import os + +# External Packages +from asgiref.sync import sync_to_async +from fastapi import APIRouter, Request +from starlette.authentication import requires +import stripe + +# Internal Packages +from database import adapters + + +# Stripe integration for Khoj Cloud Subscription +stripe.api_key = os.getenv("STRIPE_API_KEY") +endpoint_secret = os.getenv("STRIPE_SIGNING_SECRET") +logger = logging.getLogger(__name__) +subscription_router = APIRouter() + + +@subscription_router.post("") +async def subscribe(request: Request): + """Webhook for Stripe to send subscription events to Khoj Cloud""" + event = None + try: + payload = await request.body() + sig_header = request.headers["stripe-signature"] + event = stripe.Webhook.construct_event(payload, sig_header, endpoint_secret) + except ValueError as e: + # Invalid payload + raise e + except stripe.error.SignatureVerificationError as e: + # Invalid signature + raise e + + event_type = event["type"] + if event_type not in { + "invoice.paid", + "customer.subscription.updated", + "customer.subscription.deleted", + }: + logger.warn(f"Unhandled Stripe event type: {event['type']}") + return {"success": False} + + # Retrieve the customer's details + subscription = event["data"]["object"] + customer_id = subscription["customer"] + customer = stripe.Customer.retrieve(customer_id) + customer_email = customer["email"] + + # Handle valid stripe webhook events + success = True + if event_type in {"invoice.paid"}: + # Mark the user as subscribed and update the next renewal date on payment + subscription = stripe.Subscription.list(customer=customer_id).data[0] + renewal_date = datetime.fromtimestamp(subscription["current_period_end"], tz=timezone.utc) + user = await 
adapters.set_user_subscription(customer_email, is_recurring=True, renewal_date=renewal_date) + success = user is not None + elif event_type in {"customer.subscription.updated"}: + user_subscription = await sync_to_async(adapters.get_user_subscription)(customer_email) + # Allow updating subscription status if paid user + if user_subscription and user_subscription.renewal_date: + # Mark user as unsubscribed or resubscribed + is_recurring = not subscription["cancel_at_period_end"] + updated_user = await adapters.set_user_subscription(customer_email, is_recurring=is_recurring) + success = updated_user is not None + elif event_type in {"customer.subscription.deleted"}: + # Reset the user to trial state + user = await adapters.set_user_subscription( + customer_email, is_recurring=False, renewal_date=False, type="trial" + ) + success = user is not None + + logger.info(f'Stripe subscription {event["type"]} for {customer["email"]}') + return {"success": success} + + +@subscription_router.patch("") +@requires(["authenticated"]) +async def update_subscription(request: Request, email: str, operation: str): + # Retrieve the customer's details + customers = stripe.Customer.list(email=email).auto_paging_iter() + customer = next(customers, None) + if customer is None: + return {"success": False, "message": "Customer not found"} + + if operation == "cancel": + customer_id = customer.id + for subscription in stripe.Subscription.list(customer=customer_id): + stripe.Subscription.modify(subscription.id, cancel_at_period_end=True) + return {"success": True} + + elif operation == "resubscribe": + subscriptions = stripe.Subscription.list(customer=customer.id).auto_paging_iter() + # Find the subscription that is set to cancel at the end of the period + for subscription in subscriptions: + if subscription.cancel_at_period_end: + # Update the subscription to not cancel at the end of the period + stripe.Subscription.modify(subscription.id, cancel_at_period_end=False) + return {"success": 
True} + return {"success": False, "message": "No subscription found that is set to cancel"} + + return {"success": False, "message": "Invalid operation"} diff --git a/src/khoj/routers/web_client.py b/src/khoj/routers/web_client.py index 492a263c..f30499d8 100644 --- a/src/khoj/routers/web_client.py +++ b/src/khoj/routers/web_client.py @@ -1,184 +1,266 @@ +# System Packages +import json +import os + # External Packages from fastapi import APIRouter from fastapi import Request -from fastapi.responses import HTMLResponse, FileResponse +from fastapi.responses import HTMLResponse, FileResponse, RedirectResponse from fastapi.templating import Jinja2Templates -from khoj.utils.rawconfig import TextContentConfig, OpenAIProcessorConfig, FullConfig +from starlette.authentication import requires +from database import adapters +from database.models import KhojUser +from khoj.utils.rawconfig import ( + GithubContentConfig, + GithubRepoConfig, + NotionContentConfig, +) # Internal Packages from khoj.utils import constants, state - -import json - +from database.adapters import ( + EntryAdapters, + get_user_github_config, + get_user_notion_config, + ConversationAdapters, + get_user_subscription_state, +) # Initialize Router web_client = APIRouter() templates = Jinja2Templates(directory=constants.web_directory) -VALID_TEXT_CONTENT_TYPES = ["org", "markdown", "pdf", "plaintext"] - # Create Routes @web_client.get("/", response_class=FileResponse) +@requires(["authenticated"], redirect="login_page") def index(request: Request): - return templates.TemplateResponse("index.html", context={"request": request, "demo": state.demo}) + user = request.user.object + user_picture = request.session.get("user", {}).get("picture") + user_subscription_state = get_user_subscription_state(user.email) + has_documents = EntryAdapters.user_has_entries(user=user) + + return templates.TemplateResponse( + "chat.html", + context={ + "request": request, + "username": user.username, + "user_photo": user_picture, 
+ "is_active": user_subscription_state == "subscribed" or user_subscription_state == "unsubscribed", + "has_documents": has_documents, + }, + ) + + +@web_client.post("/", response_class=FileResponse) +@requires(["authenticated"], redirect="login_page") +def index_post(request: Request): + user = request.user.object + user_picture = request.session.get("user", {}).get("picture") + user_subscription_state = get_user_subscription_state(user.email) + has_documents = EntryAdapters.user_has_entries(user=user) + + return templates.TemplateResponse( + "chat.html", + context={ + "request": request, + "username": user.username, + "user_photo": user_picture, + "is_active": user_subscription_state == "subscribed" or user_subscription_state == "unsubscribed", + "has_documents": has_documents, + }, + ) + + +@web_client.get("/search", response_class=FileResponse) +@requires(["authenticated"], redirect="login_page") +def search_page(request: Request): + user = request.user.object + user_picture = request.session.get("user", {}).get("picture") + user_subscription_state = get_user_subscription_state(user.email) + has_documents = EntryAdapters.user_has_entries(user=user) + + return templates.TemplateResponse( + "search.html", + context={ + "request": request, + "username": user.username, + "user_photo": user_picture, + "is_active": user_subscription_state == "subscribed" or user_subscription_state == "unsubscribed", + "has_documents": has_documents, + }, + ) @web_client.get("/chat", response_class=FileResponse) +@requires(["authenticated"], redirect="login_page") def chat_page(request: Request): - return templates.TemplateResponse("chat.html", context={"request": request, "demo": state.demo}) + user = request.user.object + user_picture = request.session.get("user", {}).get("picture") + user_subscription_state = get_user_subscription_state(user.email) + has_documents = EntryAdapters.user_has_entries(user=user) + + return templates.TemplateResponse( + "chat.html", + context={ + 
"request": request, + "username": user.username, + "user_photo": user_picture, + "is_active": user_subscription_state == "subscribed" or user_subscription_state == "unsubscribed", + "has_documents": has_documents, + }, + ) -if not state.demo: +@web_client.get("/login", response_class=FileResponse) +def login_page(request: Request): + if request.user.is_authenticated: + next_url = request.query_params.get("next", "/") + return RedirectResponse(url=next_url) + google_client_id = os.environ.get("GOOGLE_CLIENT_ID") + redirect_uri = str(request.app.url_path_for("auth")) + return templates.TemplateResponse( + "login.html", + context={ + "request": request, + "google_client_id": google_client_id, + "redirect_uri": redirect_uri, + }, + ) - @web_client.get("/config", response_class=HTMLResponse) - def config_page(request: Request): - default_full_config = FullConfig( - content_type=None, - search_type=None, - processor=None, - ) - current_config = state.config or json.loads(default_full_config.json()) - successfully_configured = { - "pdf": False, - "markdown": False, - "org": False, - "image": False, - "github": False, - "notion": False, - "plaintext": False, - "enable_offline_model": False, - "conversation_openai": False, - "conversation_gpt4all": False, - } +@web_client.get("/config", response_class=HTMLResponse) +@requires(["authenticated"], redirect="login_page") +def config_page(request: Request): + user: KhojUser = request.user.object + user_picture = request.session.get("user", {}).get("picture") + has_documents = EntryAdapters.user_has_entries(user=user) - if state.content_index: - successfully_configured.update( - { - "pdf": state.content_index.pdf is not None, - "markdown": state.content_index.markdown is not None, - "org": state.content_index.org is not None, - "image": state.content_index.image is not None, - "github": state.content_index.github is not None, - "notion": state.content_index.notion is not None, - "plaintext": state.content_index.plaintext is not 
None, - } + user_subscription_state = get_user_subscription_state(user.email) + user_subscription = adapters.get_user_subscription(user.email) + subscription_renewal_date = ( + user_subscription.renewal_date.strftime("%d %b %Y") + if user_subscription and user_subscription.renewal_date + else None + ) + + enabled_content_source = set(EntryAdapters.get_unique_file_sources(user)) + successfully_configured = { + "computer": ("computer" in enabled_content_source), + "github": ("github" in enabled_content_source), + "notion": ("notion" in enabled_content_source), + } + + selected_conversation_config = ConversationAdapters.get_conversation_config(user) + conversation_options = ConversationAdapters.get_conversation_processor_options().all() + all_conversation_options = list() + for conversation_option in conversation_options: + all_conversation_options.append({"chat_model": conversation_option.chat_model, "id": conversation_option.id}) + + return templates.TemplateResponse( + "config.html", + context={ + "request": request, + "current_model_state": successfully_configured, + "anonymous_mode": state.anonymous_mode, + "username": user.username, + "conversation_options": all_conversation_options, + "selected_conversation_config": selected_conversation_config.id if selected_conversation_config else None, + "user_photo": user_picture, + "billing_enabled": state.billing_enabled, + "subscription_state": user_subscription_state, + "subscription_renewal_date": subscription_renewal_date, + "khoj_cloud_subscription_url": os.getenv("KHOJ_CLOUD_SUBSCRIPTION_URL"), + "is_active": user_subscription_state == "subscribed" or user_subscription_state == "unsubscribed", + "has_documents": has_documents, + }, + ) + + +@web_client.get("/config/content-source/github", response_class=HTMLResponse) +@requires(["authenticated"], redirect="login_page") +def github_config_page(request: Request): + user = request.user.object + user_picture = request.session.get("user", {}).get("picture") + 
user_subscription_state = get_user_subscription_state(user.email) + has_documents = EntryAdapters.user_has_entries(user=user) + current_github_config = get_user_github_config(user) + + if current_github_config: + raw_repos = current_github_config.githubrepoconfig.all() + repos = [] + for repo in raw_repos: + repos.append( + GithubRepoConfig( + name=repo.name, + owner=repo.owner, + branch=repo.branch, + ) ) - - if state.processor_config and state.processor_config.conversation: - successfully_configured.update( - { - "conversation_openai": state.processor_config.conversation.openai_model is not None, - "conversation_gpt4all": state.processor_config.conversation.gpt4all_model.loaded_model is not None, - } - ) - - return templates.TemplateResponse( - "config.html", - context={ - "request": request, - "current_config": current_config, - "current_model_state": successfully_configured, - }, - ) - - @web_client.get("/config/content_type/github", response_class=HTMLResponse) - def github_config_page(request: Request): - default_copy = constants.default_config.copy() - default_github = default_copy["content-type"]["github"] # type: ignore - - default_config = TextContentConfig( - compressed_jsonl=default_github["compressed-jsonl"], - embeddings_file=default_github["embeddings-file"], - ) - - current_config = ( - state.config.content_type.github - if state.config and state.config.content_type and state.config.content_type.github - else default_config - ) - - current_config = json.loads(current_config.json()) - - return templates.TemplateResponse( - "content_type_github_input.html", context={"request": request, "current_config": current_config} - ) - - @web_client.get("/config/content_type/notion", response_class=HTMLResponse) - def notion_config_page(request: Request): - default_copy = constants.default_config.copy() - default_notion = default_copy["content-type"]["notion"] # type: ignore - - default_config = TextContentConfig( - 
compressed_jsonl=default_notion["compressed-jsonl"], - embeddings_file=default_notion["embeddings-file"], - ) - - current_config = ( - state.config.content_type.notion - if state.config and state.config.content_type and state.config.content_type.notion - else default_config - ) - - current_config = json.loads(current_config.json()) - - return templates.TemplateResponse( - "content_type_notion_input.html", context={"request": request, "current_config": current_config} - ) - - @web_client.get("/config/content_type/{content_type}", response_class=HTMLResponse) - def content_config_page(request: Request, content_type: str): - if content_type not in VALID_TEXT_CONTENT_TYPES: - return templates.TemplateResponse("config.html", context={"request": request}) - - default_copy = constants.default_config.copy() - default_content_type = default_copy["content-type"][content_type] # type: ignore - - default_config = TextContentConfig( - compressed_jsonl=default_content_type["compressed-jsonl"], - embeddings_file=default_content_type["embeddings-file"], - ) - - current_config = ( - state.config.content_type[content_type] - if state.config and state.config.content_type and state.config.content_type[content_type] # type: ignore - else default_config + current_config = GithubContentConfig( + pat_token=current_github_config.pat_token, + repos=repos, ) current_config = json.loads(current_config.json()) + else: + current_config = {} # type: ignore - return templates.TemplateResponse( - "content_type_input.html", - context={ - "request": request, - "current_config": current_config, - "content_type": content_type, - }, - ) + return templates.TemplateResponse( + "content_source_github_input.html", + context={ + "request": request, + "current_config": current_config, + "username": user.username, + "user_photo": user_picture, + "is_active": user_subscription_state == "subscribed" or user_subscription_state == "unsubscribed", + "has_documents": has_documents, + }, + ) - 
@web_client.get("/config/processor/conversation/openai", response_class=HTMLResponse) - def conversation_processor_config_page(request: Request): - default_copy = constants.default_config.copy() - default_processor_config = default_copy["processor"]["conversation"]["openai"] # type: ignore - default_openai_config = OpenAIProcessorConfig( - api_key="", - chat_model=default_processor_config["chat-model"], - ) - current_processor_openai_config = ( - state.config.processor.conversation.openai - if state.config - and state.config.processor - and state.config.processor.conversation - and state.config.processor.conversation.openai - else default_openai_config - ) - current_processor_openai_config = json.loads(current_processor_openai_config.json()) +@web_client.get("/config/content-source/notion", response_class=HTMLResponse) +@requires(["authenticated"], redirect="login_page") +def notion_config_page(request: Request): + user = request.user.object + user_picture = request.session.get("user", {}).get("picture") + user_subscription_state = adapters.get_user_subscription(user.email) + has_documents = EntryAdapters.user_has_entries(user=user) + current_notion_config = get_user_notion_config(user) - return templates.TemplateResponse( - "processor_conversation_input.html", - context={ - "request": request, - "current_config": current_processor_openai_config, - }, - ) + current_config = NotionContentConfig( + token=current_notion_config.token if current_notion_config else "", + ) + + current_config = json.loads(current_config.json()) + + return templates.TemplateResponse( + "content_source_notion_input.html", + context={ + "request": request, + "current_config": current_config, + "username": user.username, + "user_photo": user_picture, + "is_active": user_subscription_state == "subscribed" or user_subscription_state == "unsubscribed", + "has_documents": has_documents, + }, + ) + + +@web_client.get("/config/content-source/computer", response_class=HTMLResponse) 
+@requires(["authenticated"], redirect="login_page") +def computer_config_page(request: Request): + user = request.user.object + user_picture = request.session.get("user", {}).get("picture") + user_subscription_state = get_user_subscription_state(user.email) + has_documents = EntryAdapters.user_has_entries(user=user) + + return templates.TemplateResponse( + "content_source_computer_input.html", + context={ + "request": request, + "username": user.username, + "user_photo": user_picture, + "is_active": user_subscription_state == "subscribed" or user_subscription_state == "unsubscribed", + "has_documents": has_documents, + }, + ) diff --git a/src/khoj/search_filter/base_filter.py b/src/khoj/search_filter/base_filter.py index 470f7341..ae596587 100644 --- a/src/khoj/search_filter/base_filter.py +++ b/src/khoj/search_filter/base_filter.py @@ -1,16 +1,9 @@ # Standard Packages from abc import ABC, abstractmethod -from typing import List, Set, Tuple - -# Internal Packages -from khoj.utils.rawconfig import Entry +from typing import List class BaseFilter(ABC): - @abstractmethod - def load(self, entries: List[Entry], *args, **kwargs): - ... - @abstractmethod def get_filter_terms(self, query: str) -> List[str]: ... @@ -18,10 +11,6 @@ class BaseFilter(ABC): def can_filter(self, raw_query: str) -> bool: return len(self.get_filter_terms(raw_query)) > 0 - @abstractmethod - def apply(self, query: str, entries: List[Entry]) -> Tuple[str, Set[int]]: - ... - @abstractmethod def defilter(self, query: str) -> str: ... 
diff --git a/src/khoj/search_filter/date_filter.py b/src/khoj/search_filter/date_filter.py index 39e7bec3..1d90b9f5 100644 --- a/src/khoj/search_filter/date_filter.py +++ b/src/khoj/search_filter/date_filter.py @@ -25,72 +25,42 @@ class DateFilter(BaseFilter): # - dt>="last week" # - dt:"2 years ago" date_regex = r"dt([:><=]{1,2})[\"'](.*?)[\"']" + raw_date_regex = r"\d{4}-\d{2}-\d{2}" def __init__(self, entry_key="compiled"): self.entry_key = entry_key self.date_to_entry_ids = defaultdict(set) self.cache = LRU() - def load(self, entries, *args, **kwargs): - with timer("Created date filter index", logger): - for id, entry in enumerate(entries): - # Extract dates from entry - for date_in_entry_string in re.findall(r"\d{4}-\d{2}-\d{2}", getattr(entry, self.entry_key)): - # Convert date string in entry to unix timestamp - try: - date_in_entry = datetime.strptime(date_in_entry_string, "%Y-%m-%d").timestamp() - except ValueError: - continue - except OSError: - logger.debug(f"OSError: Ignoring unprocessable date in entry: {date_in_entry_string}") - continue - self.date_to_entry_ids[date_in_entry].add(id) + def extract_dates(self, content): + pattern_matched_dates = re.findall(self.raw_date_regex, content) + + # Filter down to valid dates + valid_dates = [] + for date_str in pattern_matched_dates: + try: + valid_dates.append(datetime.strptime(date_str, "%Y-%m-%d")) + except ValueError: + continue + + return valid_dates def get_filter_terms(self, query: str) -> List[str]: "Get all filter terms in query" return [f"dt{item[0]}'{item[1]}'" for item in re.findall(self.date_regex, query)] + def get_query_date_range(self, query) -> List: + with timer("Extract date range to filter from query", logger): + query_daterange = self.extract_date_range(query) + + return query_daterange + def defilter(self, query): # remove date range filter from query query = re.sub(rf"\s+{self.date_regex}", " ", query) query = re.sub(r"\s{2,}", " ", query).strip() # remove multiple spaces return query 
- def apply(self, query, entries): - "Find entries containing any dates that fall within date range specified in query" - # extract date range specified in date filter of query - with timer("Extract date range to filter from query", logger): - query_daterange = self.extract_date_range(query) - - # if no date in query, return all entries - if query_daterange == []: - return query, set(range(len(entries))) - - query = self.defilter(query) - - # return results from cache if exists - cache_key = tuple(query_daterange) - if cache_key in self.cache: - logger.debug(f"Return date filter results from cache") - entries_to_include = self.cache[cache_key] - return query, entries_to_include - - if not self.date_to_entry_ids: - self.load(entries) - - # find entries containing any dates that fall with date range specified in query - with timer("Mark entries satisfying filter", logger): - entries_to_include = set() - for date_in_entry in self.date_to_entry_ids.keys(): - # Check if date in entry is within date range specified in query - if query_daterange[0] <= date_in_entry < query_daterange[1]: - entries_to_include |= self.date_to_entry_ids[date_in_entry] - - # cache results - self.cache[cache_key] = entries_to_include - - return query, entries_to_include - def extract_date_range(self, query): # find date range filter in query date_range_matches = re.findall(self.date_regex, query) @@ -138,6 +108,15 @@ class DateFilter(BaseFilter): if effective_date_range == [0, inf] or effective_date_range[0] > effective_date_range[1]: return [] else: + # If the first element is 0, replace it with None + + if effective_date_range[0] == 0: + effective_date_range[0] = None + + # If the second element is inf, replace it with None + if effective_date_range[1] == inf: + effective_date_range[1] = None + return effective_date_range def parse(self, date_str, relative_base=None): @@ -148,14 +127,18 @@ class DateFilter(BaseFilter): clean_date_str = re.sub("|".join(future_strings), "", date_str) # parse 
date passed in query date filter - parsed_date = dtparse.parse( - clean_date_str, - settings={ - "RELATIVE_BASE": relative_base or datetime.now(), - "PREFER_DAY_OF_MONTH": "first", - "PREFER_DATES_FROM": prefer_dates_from, - }, - ) + try: + parsed_date = dtparse.parse( + clean_date_str, + settings={ + "RELATIVE_BASE": relative_base or datetime.now(), + "PREFER_DAY_OF_MONTH": "first", + "PREFER_DATES_FROM": prefer_dates_from, + }, + ) + except Exception as e: + logger.error(f"Failed to parse date string: {date_str} with error: {e}") + return None if parsed_date is None: return None diff --git a/src/khoj/search_filter/file_filter.py b/src/khoj/search_filter/file_filter.py index 420bf9e7..291838ea 100644 --- a/src/khoj/search_filter/file_filter.py +++ b/src/khoj/search_filter/file_filter.py @@ -21,62 +21,13 @@ class FileFilter(BaseFilter): self.file_to_entry_map = defaultdict(set) self.cache = LRU() - def load(self, entries, *args, **kwargs): - with timer("Created file filter index", logger): - for id, entry in enumerate(entries): - self.file_to_entry_map[getattr(entry, self.entry_key)].add(id) - def get_filter_terms(self, query: str) -> List[str]: "Get all filter terms in query" - return [f'file:"{term}"' for term in re.findall(self.file_filter_regex, query)] + return [f"{self.convert_to_regex(term)}" for term in re.findall(self.file_filter_regex, query)] + + def convert_to_regex(self, file_filter: str) -> str: + "Convert file filter to regex" + return file_filter.replace(".", r"\.").replace("*", r".*") def defilter(self, query: str) -> str: return re.sub(self.file_filter_regex, "", query).strip() - - def apply(self, query, entries): - # Extract file filters from raw query - with timer("Extract files_to_search from query", logger): - raw_files_to_search = re.findall(self.file_filter_regex, query) - if not raw_files_to_search: - return query, set(range(len(entries))) - - # Convert simple file filters with no path separator into regex - # e.g. 
"file:notes.org" -> "file:.*notes.org" - files_to_search = [] - for file in sorted(raw_files_to_search): - if "/" not in file and "\\" not in file and "*" not in file: - files_to_search += [f"*{file}"] - else: - files_to_search += [file] - - # Remove filter terms from original query - query = self.defilter(query) - - # Return item from cache if exists - cache_key = tuple(files_to_search) - if cache_key in self.cache: - logger.debug(f"Return file filter results from cache") - included_entry_indices = self.cache[cache_key] - return query, included_entry_indices - - if not self.file_to_entry_map: - self.load(entries, regenerate=False) - - # Mark entries that contain any blocked_words for exclusion - with timer("Mark entries satisfying filter", logger): - included_entry_indices = set.union( - *[ - self.file_to_entry_map[entry_file] - for entry_file in self.file_to_entry_map.keys() - for search_file in files_to_search - if fnmatch.fnmatch(entry_file, search_file) - ], - set(), - ) - if not included_entry_indices: - return query, {} - - # Cache results - self.cache[cache_key] = included_entry_indices - - return query, included_entry_indices diff --git a/src/khoj/search_filter/word_filter.py b/src/khoj/search_filter/word_filter.py index ebf64b34..b2053dbe 100644 --- a/src/khoj/search_filter/word_filter.py +++ b/src/khoj/search_filter/word_filter.py @@ -6,7 +6,7 @@ from typing import List # Internal Packages from khoj.search_filter.base_filter import BaseFilter -from khoj.utils.helpers import LRU, timer +from khoj.utils.helpers import LRU logger = logging.getLogger(__name__) @@ -22,21 +22,6 @@ class WordFilter(BaseFilter): self.word_to_entry_index = defaultdict(set) self.cache = LRU() - def load(self, entries, *args, **kwargs): - with timer("Created word filter index", logger): - self.cache = {} # Clear cache on filter (re-)load - entry_splitter = ( - r",|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\'" - ) - # Create map of 
words to entries they exist in - for entry_index, entry in enumerate(entries): - for word in re.split(entry_splitter, getattr(entry, self.entry_key).lower()): - if word == "": - continue - self.word_to_entry_index[word].add(entry_index) - - return self.word_to_entry_index - def get_filter_terms(self, query: str) -> List[str]: "Get all filter terms in query" required_terms = [f"+{required_term}" for required_term in re.findall(self.required_regex, query)] @@ -45,47 +30,3 @@ class WordFilter(BaseFilter): def defilter(self, query: str) -> str: return re.sub(self.blocked_regex, "", re.sub(self.required_regex, "", query)).strip() - - def apply(self, query, entries): - "Find entries containing required and not blocked words specified in query" - # Separate natural query from required, blocked words filters - with timer("Extract required, blocked filters from query", logger): - required_words = set([word.lower() for word in re.findall(self.required_regex, query)]) - blocked_words = set([word.lower() for word in re.findall(self.blocked_regex, query)]) - query = self.defilter(query) - - if len(required_words) == 0 and len(blocked_words) == 0: - return query, set(range(len(entries))) - - # Return item from cache if exists - cache_key = tuple(sorted(required_words)), tuple(sorted(blocked_words)) - if cache_key in self.cache: - logger.debug(f"Return word filter results from cache") - included_entry_indices = self.cache[cache_key] - return query, included_entry_indices - - if not self.word_to_entry_index: - self.load(entries, regenerate=False) - - # mark entries that contain all required_words for inclusion - with timer("Mark entries satisfying filter", logger): - entries_with_all_required_words = set(range(len(entries))) - if len(required_words) > 0: - entries_with_all_required_words = set.intersection( - *[self.word_to_entry_index.get(word, set()) for word in required_words] - ) - - # mark entries that contain any blocked_words for exclusion - entries_with_any_blocked_words = 
set() - if len(blocked_words) > 0: - entries_with_any_blocked_words = set.union( - *[self.word_to_entry_index.get(word, set()) for word in blocked_words] - ) - - # get entries satisfying inclusion and exclusion filters - included_entry_indices = entries_with_all_required_words - entries_with_any_blocked_words - - # Cache results - self.cache[cache_key] = included_entry_indices - - return query, included_entry_indices diff --git a/src/khoj/search_type/image_search.py b/src/khoj/search_type/image_search.py index 8b92d9db..8c0a3cdb 100644 --- a/src/khoj/search_type/image_search.py +++ b/src/khoj/search_type/image_search.py @@ -146,7 +146,7 @@ def extract_metadata(image_name): async def query( - raw_query, count, search_model: ImageSearchModel, content: ImageContent, score_threshold: float = -math.inf + raw_query, count, search_model: ImageSearchModel, content: ImageContent, score_threshold: float = math.inf ): # Set query to image content if query is of form file:/path/to/file.png if raw_query.startswith("file:") and pathlib.Path(raw_query[5:]).is_file(): @@ -167,7 +167,8 @@ async def query( # Compute top_k ranked images based on cosine-similarity b/w query and all image embeddings. 
with timer("Search Time", logger): image_hits = { - result["corpus_id"]: {"image_score": result["score"], "score": result["score"]} + # Map scores to distance metric by multiplying by -1 + result["corpus_id"]: {"image_score": -1 * result["score"], "score": -1 * result["score"]} for result in util.semantic_search(query_embedding, content.image_embeddings, top_k=count)[0] } @@ -204,7 +205,7 @@ async def query( ] # Filter results by score threshold - hits = [hit for hit in hits if hit["image_score"] >= score_threshold] + hits = [hit for hit in hits if hit["image_score"] <= score_threshold] # Sort the images based on their combined metadata, image scores return sorted(hits, key=lambda hit: hit["score"], reverse=True) @@ -228,7 +229,7 @@ def collate_results(hits, image_names, output_directory, image_files_url, count= # Add the image metadata to the results results += [ - SearchResponse.parse_obj( + SearchResponse.model_validate( { "entry": f"{image_files_url}/{target_image_name}", "score": f"{hit['score']:.9f}", @@ -236,6 +237,7 @@ def collate_results(hits, image_names, output_directory, image_files_url, count= "image_score": f"{hit['image_score']:.9f}", "metadata_score": f"{hit['metadata_score']:.9f}", }, + "corpus_id": str(hit["corpus_id"]), } ) ] diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index 2890baa9..7e295903 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -6,50 +6,33 @@ from typing import List, Tuple, Type, Union # External Packages import torch -from sentence_transformers import SentenceTransformer, CrossEncoder, util -from khoj.processor.text_to_jsonl import TextToJsonl -from khoj.search_filter.base_filter import BaseFilter +from sentence_transformers import util + +from asgiref.sync import sync_to_async + # Internal Packages from khoj.utils import state -from khoj.utils.helpers import get_absolute_path, is_none_or_empty, resolve_absolute_path, load_model, timer -from 
khoj.utils.config import TextContent, TextSearchModel +from khoj.utils.helpers import get_absolute_path, timer from khoj.utils.models import BaseEncoder -from khoj.utils.rawconfig import SearchResponse, TextSearchConfig, TextConfigBase, Entry +from khoj.utils.state import SearchType +from khoj.utils.rawconfig import SearchResponse, Entry from khoj.utils.jsonl import load_jsonl - +from khoj.processor.text_to_entries import TextToEntries +from database.adapters import EntryAdapters +from database.models import KhojUser, Entry as DbEntry logger = logging.getLogger(__name__) - -def initialize_model(search_config: TextSearchConfig): - "Initialize model for semantic search on text" - torch.set_num_threads(4) - - # If model directory is configured - if search_config.model_directory: - # Convert model directory to absolute path - search_config.model_directory = resolve_absolute_path(search_config.model_directory) - # Create model directory if it doesn't exist - search_config.model_directory.parent.mkdir(parents=True, exist_ok=True) - - # The bi-encoder encodes all entries to use for semantic search - bi_encoder = load_model( - model_dir=search_config.model_directory, - model_name=search_config.encoder, - model_type=search_config.encoder_type or SentenceTransformer, - device=f"{state.device}", - ) - - # The cross-encoder re-ranks the results to improve quality - cross_encoder = load_model( - model_dir=search_config.model_directory, - model_name=search_config.cross_encoder, - model_type=CrossEncoder, - device=f"{state.device}", - ) - - return TextSearchModel(bi_encoder, cross_encoder) +search_type_to_embeddings_type = { + SearchType.Org.value: DbEntry.EntryType.ORG, + SearchType.Markdown.value: DbEntry.EntryType.MARKDOWN, + SearchType.Plaintext.value: DbEntry.EntryType.PLAINTEXT, + SearchType.Pdf.value: DbEntry.EntryType.PDF, + SearchType.Github.value: DbEntry.EntryType.GITHUB, + SearchType.Notion.value: DbEntry.EntryType.NOTION, + SearchType.All.value: None, +} def 
extract_entries(jsonl_file) -> List[Entry]: @@ -117,171 +100,130 @@ def load_embeddings( async def query( + user: KhojUser, raw_query: str, - search_model: TextSearchModel, - content: TextContent, + type: SearchType = SearchType.All, question_embedding: Union[torch.Tensor, None] = None, - rank_results: bool = False, - score_threshold: float = -math.inf, - dedupe: bool = True, + max_distance: float = math.inf, ) -> Tuple[List[dict], List[Entry]]: "Search for entries that answer the query" - if ( - content.entries is None - or len(content.entries) == 0 - or content.corpus_embeddings is None - or len(content.corpus_embeddings) == 0 - ): - return [], [] - query, entries, corpus_embeddings = raw_query, content.entries, content.corpus_embeddings + file_type = search_type_to_embeddings_type[type.value] - # Filter query, entries and embeddings before semantic search - query, entries, corpus_embeddings = apply_filters(query, entries, corpus_embeddings, content.filters) - - # If no entries left after filtering, return empty results - if entries is None or len(entries) == 0: - return [], [] - # If query only had filters it'll be empty now. So short-circuit and return results. 
- if query.strip() == "": - hits = [{"corpus_id": id, "score": 1.0} for id, _ in enumerate(entries)] - return hits, entries + query = raw_query # Encode the query using the bi-encoder if question_embedding is None: with timer("Query Encode Time", logger, state.device): - question_embedding = search_model.bi_encoder.encode([query], convert_to_tensor=True, device=state.device) - question_embedding = util.normalize_embeddings(question_embedding) + question_embedding = state.embeddings_model.embed_query(query) # Find relevant entries for the query - top_k = min(len(entries), search_model.top_k or 10) # top_k hits can't be more than the total entries in corpus + top_k = 10 with timer("Search Time", logger, state.device): - hits = util.semantic_search(question_embedding, corpus_embeddings, top_k, score_function=util.dot_score)[0] + hits = EntryAdapters.search_with_embeddings( + user=user, + embeddings=question_embedding, + max_results=top_k, + file_type_filter=file_type, + raw_query=raw_query, + max_distance=max_distance, + ).all() + hits = await sync_to_async(list)(hits) # type: ignore[call-arg] + + return hits + + +def collate_results(hits, dedupe=True): + hit_ids = set() + for hit in hits: + if dedupe and hit.corpus_id in hit_ids: + continue + + else: + hit_ids.add(hit.corpus_id) + yield SearchResponse.parse_obj( + { + "entry": hit.raw, + "score": hit.distance, + "corpus_id": str(hit.corpus_id), + "additional": { + "file": hit.file_path, + "compiled": hit.compiled, + "heading": hit.heading, + }, + } + ) + + +def deduplicated_search_responses(hits: List[SearchResponse]): + hit_ids = set() + for hit in hits: + if hit.corpus_id in hit_ids: + continue + + else: + hit_ids.add(hit.corpus_id) + yield SearchResponse.model_validate( + { + "entry": hit.entry, + "score": hit.score, + "corpus_id": hit.corpus_id, + "additional": { + "file": hit.additional["file"], + "compiled": hit.additional["compiled"], + "heading": hit.additional["heading"], + }, + } + ) + + +def 
rerank_and_sort_results(hits, query, rank_results): + # If we have more than one result and reranking is enabled + rank_results = rank_results and len(list(hits)) > 1 # Score all retrieved entries using the cross-encoder - if rank_results and search_model.cross_encoder: - hits = cross_encoder_score(search_model.cross_encoder, query, entries, hits) + if rank_results: + hits = cross_encoder_score(query, hits) - # Filter results by score threshold - hits = [hit for hit in hits if hit.get("cross-score", hit.get("score")) >= score_threshold] + # Sort results by cross-encoder score followed by bi-encoder score + hits = sort_results(rank_results=rank_results, hits=hits) - # Order results by cross-encoder score followed by bi-encoder score - hits = sort_results(rank_results, hits) - - # Deduplicate entries by raw entry text before showing to users - if dedupe: - hits = deduplicate_results(entries, hits) - - return hits, entries - - -def collate_results(hits, entries: List[Entry], count=5) -> List[SearchResponse]: - return [ - SearchResponse.parse_obj( - { - "entry": entries[hit["corpus_id"]].raw, - "score": f"{hit.get('cross-score') or hit.get('score')}", - "additional": { - "file": entries[hit["corpus_id"]].file, - "compiled": entries[hit["corpus_id"]].compiled, - "heading": entries[hit["corpus_id"]].heading, - }, - } - ) - for hit in hits[0:count] - ] + return hits def setup( - text_to_jsonl: Type[TextToJsonl], + text_to_entries: Type[TextToEntries], files: dict[str, str], - config: TextConfigBase, - bi_encoder: BaseEncoder, regenerate: bool, - filters: List[BaseFilter] = [], - normalize: bool = True, full_corpus: bool = True, -) -> TextContent: - # Map notes in text files to (compressed) JSONL formatted file - config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl) - previous_entries = [] - if config.compressed_jsonl.exists() and not regenerate: - previous_entries = extract_entries(config.compressed_jsonl) - entries_with_indices = 
text_to_jsonl(config).process( - previous_entries=previous_entries, files=files, full_corpus=full_corpus - ) - - # Extract Updated Entries - entries = extract_entries(config.compressed_jsonl) - if is_none_or_empty(entries): - config_params = ", ".join([f"{key}={value}" for key, value in config.dict().items()]) - raise ValueError( - f"No valid entries found in specified configuration: {config_params}, with files: {files.keys()}" + user: KhojUser = None, + config=None, +) -> None: + if config: + num_new_embeddings, num_deleted_embeddings = text_to_entries(config).process( + files=files, full_corpus=full_corpus, user=user, regenerate=regenerate + ) + else: + num_new_embeddings, num_deleted_embeddings = text_to_entries().process( + files=files, full_corpus=full_corpus, user=user, regenerate=regenerate ) - # Compute or Load Embeddings - config.embeddings_file = resolve_absolute_path(config.embeddings_file) - corpus_embeddings = compute_embeddings( - entries_with_indices, bi_encoder, config.embeddings_file, regenerate=regenerate, normalize=normalize - ) + if files: + file_names = [file_name for file_name in files] - for filter in filters: - filter.load(entries, regenerate=regenerate) - - return TextContent(entries, corpus_embeddings, filters) + logger.info( + f"Deleted {num_deleted_embeddings} entries. 
Created {num_new_embeddings} new entries for user {user} from files {file_names}" + ) -def load( - config: TextConfigBase, - filters: List[BaseFilter] = [], -) -> TextContent: - # Map notes in text files to (compressed) JSONL formatted file - config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl) - entries = extract_entries(config.compressed_jsonl) - - # Compute or Load Embeddings - config.embeddings_file = resolve_absolute_path(config.embeddings_file) - corpus_embeddings = load_embeddings(config.embeddings_file) - - for filter in filters: - filter.load(entries, regenerate=False) - - return TextContent(entries, corpus_embeddings, filters) - - -def apply_filters( - query: str, entries: List[Entry], corpus_embeddings: torch.Tensor, filters: List[BaseFilter] -) -> Tuple[str, List[Entry], torch.Tensor]: - """Filter query, entries and embeddings before semantic search""" - - with timer("Total Filter Time", logger, state.device): - included_entry_indices = set(range(len(entries))) - filters_in_query = [filter for filter in filters if filter.can_filter(query)] - for filter in filters_in_query: - query, included_entry_indices_by_filter = filter.apply(query, entries) - included_entry_indices.intersection_update(included_entry_indices_by_filter) - - # Get entries (and associated embeddings) satisfying all filters - if not included_entry_indices: - return "", [], torch.tensor([], device=state.device) - else: - entries = [entries[id] for id in included_entry_indices] - corpus_embeddings = torch.index_select( - corpus_embeddings, 0, torch.tensor(list(included_entry_indices), device=state.device) - ) - - return query, entries, corpus_embeddings - - -def cross_encoder_score(cross_encoder: CrossEncoder, query: str, entries: List[Entry], hits: List[dict]) -> List[dict]: +def cross_encoder_score(query: str, hits: List[SearchResponse]) -> List[SearchResponse]: """Score all retrieved entries using the cross-encoder""" with timer("Cross-Encoder Predict Time", logger, 
state.device): - cross_inp = [[query, entries[hit["corpus_id"]].compiled] for hit in hits] - cross_scores = cross_encoder.predict(cross_inp) + cross_scores = state.cross_encoder_model.predict(query, hits) - # Store cross-encoder scores in results dictionary for ranking + # Convert cross-encoder scores to distances and pass in hits for reranking for idx in range(len(cross_scores)): - hits[idx]["cross-score"] = cross_scores[idx] + hits[idx]["cross_score"] = 1 - cross_scores[idx] return hits @@ -289,25 +231,7 @@ def cross_encoder_score(cross_encoder: CrossEncoder, query: str, entries: List[E def sort_results(rank_results: bool, hits: List[dict]) -> List[dict]: """Order results by cross-encoder score followed by bi-encoder score""" with timer("Rank Time", logger, state.device): - hits.sort(key=lambda x: x["score"], reverse=True) # sort by bi-encoder score + hits.sort(key=lambda x: x["score"]) # sort by bi-encoder score if rank_results: - hits.sort(key=lambda x: x["cross-score"], reverse=True) # sort by cross-encoder score - return hits - - -def deduplicate_results(entries: List[Entry], hits: List[dict]) -> List[dict]: - """Deduplicate entries by raw entry text before showing to users - Compiled entries are split by max tokens supported by ML models. 
- This can result in duplicate hits, entries shown to user.""" - - with timer("Deduplication Time", logger, state.device): - seen, original_hits_count = set(), len(hits) - hits = [ - hit - for hit in hits - if entries[hit["corpus_id"]].raw not in seen and not seen.add(entries[hit["corpus_id"]].raw) # type: ignore[func-returns-value] - ] - duplicate_hits = original_hits_count - len(hits) - - logger.debug(f"Removed {duplicate_hits} duplicates") + hits.sort(key=lambda x: x["cross_score"]) # sort by cross-encoder score return hits diff --git a/src/khoj/utils/cli.py b/src/khoj/utils/cli.py index 9f129b17..2e4c9d59 100644 --- a/src/khoj/utils/cli.py +++ b/src/khoj/utils/cli.py @@ -2,6 +2,10 @@ import argparse import pathlib from importlib.metadata import version +import os +import logging + +logger = logging.getLogger(__name__) # Internal Packages from khoj.utils.helpers import resolve_absolute_path @@ -11,13 +15,14 @@ from khoj.migrations.migrate_processor_config_openai import migrate_processor_co from khoj.migrations.migrate_offline_model import migrate_offline_model from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema from khoj.migrations.migrate_offline_chat_default_model import migrate_offline_chat_default_model +from khoj.migrations.migrate_server_pg import migrate_server_pg def cli(args=None): # Setup Argument Parser for the Commandline Interface parser = argparse.ArgumentParser(description="Start Khoj; An AI personal assistant for your Digital Brain") parser.add_argument( - "--config-file", "-c", default="~/.khoj/khoj.yml", type=pathlib.Path, help="YAML file to configure Khoj" + "--config-file", default="~/.khoj/khoj.yml", type=pathlib.Path, help="YAML file to configure Khoj" ) parser.add_argument( "--regenerate", @@ -37,9 +42,17 @@ def cli(args=None): parser.add_argument( "--disable-chat-on-gpu", action="store_true", default=False, help="Disable using GPU for the offline chat model" ) - parser.add_argument("--demo", 
action="store_true", default=False, help="Run Khoj in demo mode") + parser.add_argument( + "--anonymous-mode", + action="store_true", + default=False, + help="Run Khoj in anonymous mode. This does not require any login for connecting users.", + ) - args = parser.parse_args(args) + args, remaining_args = parser.parse_known_args(args) + + if len(remaining_args) > 0: + logger.info(f"⚠️ Ignoring unknown commandline args: {remaining_args}") # Set default values for arguments args.chat_on_gpu = not args.disable_chat_on_gpu @@ -58,6 +71,8 @@ def cli(args=None): else: args = run_migrations(args) args.config = parse_config_from_file(args.config_file) + if os.environ.get("KHOJ_DEBUG"): + args.config.app.should_log_telemetry = False return args @@ -69,6 +84,7 @@ def run_migrations(args): migrate_offline_model, migrate_offline_chat_schema, migrate_offline_chat_default_model, + migrate_server_pg, ] for migration in migrations: args = migration(args) diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index 5b3b9f6e..7795d695 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -5,23 +5,21 @@ from enum import Enum import logging from dataclasses import dataclass -from pathlib import Path -from typing import TYPE_CHECKING, Dict, List, Optional, Union, Any -from khoj.processor.conversation.gpt4all.utils import download_model +from typing import TYPE_CHECKING, List, Optional, Union, Any # External Packages import torch -from khoj.utils.rawconfig import OfflineChatProcessorConfig +# Internal Packages +from khoj.processor.conversation.gpt4all.utils import download_model + logger = logging.getLogger(__name__) # Internal Packages if TYPE_CHECKING: from sentence_transformers import CrossEncoder - from khoj.search_filter.base_filter import BaseFilter from khoj.utils.models import BaseEncoder - from khoj.utils.rawconfig import ConversationProcessorConfig, Entry, OpenAIProcessorConfig class SearchType(str, Enum): @@ -41,9 +39,7 @@ class ProcessorType(str, 
Enum): @dataclass class TextContent: - entries: List[Entry] - corpus_embeddings: torch.Tensor - filters: List[BaseFilter] + enabled: bool @dataclass @@ -67,21 +63,13 @@ class ImageSearchModel: @dataclass class ContentIndex: - org: Optional[TextContent] = None - markdown: Optional[TextContent] = None - pdf: Optional[TextContent] = None - github: Optional[TextContent] = None - notion: Optional[TextContent] = None image: Optional[ImageContent] = None - plaintext: Optional[TextContent] = None - plugins: Optional[Dict[str, TextContent]] = None @dataclass class SearchModels: text_search: Optional[TextSearchModel] = None image_search: Optional[ImageSearchModel] = None - plugin_search: Optional[Dict[str, TextSearchModel]] = None @dataclass @@ -89,31 +77,16 @@ class GPT4AllProcessorConfig: loaded_model: Union[Any, None] = None -class ConversationProcessorConfigModel: +class GPT4AllProcessorModel: def __init__( self, - conversation_config: ConversationProcessorConfig, + chat_model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin", ): - self.openai_model = conversation_config.openai - self.gpt4all_model = GPT4AllProcessorConfig() - self.offline_chat = conversation_config.offline_chat or OfflineChatProcessorConfig() - self.max_prompt_size = conversation_config.max_prompt_size - self.tokenizer = conversation_config.tokenizer - self.conversation_logfile = Path(conversation_config.conversation_logfile) - self.chat_session: List[str] = [] - self.meta_log: dict = {} - - if self.offline_chat.enable_offline_chat: - try: - self.gpt4all_model.loaded_model = download_model(self.offline_chat.chat_model) - except Exception as e: - self.offline_chat.enable_offline_chat = False - self.gpt4all_model.loaded_model = None - logger.error(f"Error while loading offline chat model: {e}", exc_info=True) - else: - self.gpt4all_model.loaded_model = None - - -@dataclass -class ProcessorConfigModel: - conversation: Union[ConversationProcessorConfigModel, None] = None + self.chat_model = chat_model + 
self.loaded_model = None + try: + self.loaded_model = download_model(self.chat_model) + except ValueError as e: + self.loaded_model = None + logger.error(f"Error while loading offline chat model: {e}", exc_info=True) + raise e diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index 7f534bf6..8a106153 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -5,138 +5,18 @@ web_directory = app_root_directory / "khoj/interface/web/" empty_escape_sequences = "\n|\r|\t| " app_env_filepath = "~/.khoj/env" telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry" +content_directory = "~/.khoj/content/" +default_offline_chat_model = "mistral-7b-instruct-v0.1.Q4_0.gguf" empty_config = { - "content-type": { - "org": { - "input-files": None, - "input-filter": None, - "compressed-jsonl": "~/.khoj/content/org/org.jsonl.gz", - "embeddings-file": "~/.khoj/content/org/org_embeddings.pt", - "index-heading-entries": False, - }, - "markdown": { - "input-files": None, - "input-filter": None, - "compressed-jsonl": "~/.khoj/content/markdown/markdown.jsonl.gz", - "embeddings-file": "~/.khoj/content/markdown/markdown_embeddings.pt", - }, - "pdf": { - "input-files": None, - "input-filter": None, - "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz", - "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt", - }, - "plaintext": { - "input-files": None, - "input-filter": None, - "compressed-jsonl": "~/.khoj/content/plaintext/plaintext.jsonl.gz", - "embeddings-file": "~/.khoj/content/plaintext/plaintext_embeddings.pt", - }, - }, "search-type": { - "symmetric": { - "encoder": "sentence-transformers/all-MiniLM-L6-v2", - "cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2", - "model_directory": "~/.khoj/search/symmetric/", - }, - "asymmetric": { - "encoder": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", - "cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2", - "model_directory": "~/.khoj/search/asymmetric/", - }, "image": 
{"encoder": "sentence-transformers/clip-ViT-B-32", "model_directory": "~/.khoj/search/image/"}, }, - "processor": { - "conversation": { - "openai": { - "api-key": None, - "chat-model": "gpt-3.5-turbo", - }, - "offline-chat": { - "enable-offline-chat": False, - "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf", - }, - "tokenizer": None, - "max-prompt-size": None, - "conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json", - } - }, } # default app config to use default_config = { - "content-type": { - "org": { - "input-files": None, - "input-filter": None, - "compressed-jsonl": "~/.khoj/content/org/org.jsonl.gz", - "embeddings-file": "~/.khoj/content/org/org_embeddings.pt", - "index-heading-entries": False, - }, - "markdown": { - "input-files": None, - "input-filter": None, - "compressed-jsonl": "~/.khoj/content/markdown/markdown.jsonl.gz", - "embeddings-file": "~/.khoj/content/markdown/markdown_embeddings.pt", - }, - "pdf": { - "input-files": None, - "input-filter": None, - "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz", - "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt", - }, - "image": { - "input-directories": None, - "input-filter": None, - "embeddings-file": "~/.khoj/content/image/image_embeddings.pt", - "batch-size": 50, - "use-xmp-metadata": False, - }, - "github": { - "pat-token": None, - "repos": [], - "compressed-jsonl": "~/.khoj/content/github/github.jsonl.gz", - "embeddings-file": "~/.khoj/content/github/github_embeddings.pt", - }, - "notion": { - "token": None, - "compressed-jsonl": "~/.khoj/content/notion/notion.jsonl.gz", - "embeddings-file": "~/.khoj/content/notion/notion_embeddings.pt", - }, - "plaintext": { - "input-files": None, - "input-filter": None, - "compressed-jsonl": "~/.khoj/content/plaintext/plaintext.jsonl.gz", - "embeddings-file": "~/.khoj/content/plaintext/plaintext_embeddings.pt", - }, - }, "search-type": { - "symmetric": { - "encoder": "sentence-transformers/all-MiniLM-L6-v2", - 
"cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2", - "model_directory": "~/.khoj/search/symmetric/", - }, - "asymmetric": { - "encoder": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", - "cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2", - "model_directory": "~/.khoj/search/asymmetric/", - }, "image": {"encoder": "sentence-transformers/clip-ViT-B-32", "model_directory": "~/.khoj/search/image/"}, }, - "processor": { - "conversation": { - "openai": { - "api-key": None, - "chat-model": "gpt-3.5-turbo", - }, - "offline-chat": { - "enable-offline-chat": False, - "chat-model": "mistral-7b-instruct-v0.1.Q4_0.gguf", - }, - "tokenizer": None, - "max-prompt-size": None, - "conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json", - } - }, } diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py index 74619581..fc7e4a2d 100644 --- a/src/khoj/utils/fs_syncer.py +++ b/src/khoj/utils/fs_syncer.py @@ -5,25 +5,39 @@ from typing import Optional from bs4 import BeautifulSoup from khoj.utils.helpers import get_absolute_path, is_none_or_empty -from khoj.utils.rawconfig import TextContentConfig, ContentConfig +from khoj.utils.rawconfig import TextContentConfig from khoj.utils.config import SearchType +from database.models import LocalMarkdownConfig, LocalOrgConfig, LocalPdfConfig, LocalPlaintextConfig logger = logging.getLogger(__name__) -def collect_files(config: ContentConfig, search_type: Optional[SearchType] = SearchType.All): +def collect_files(search_type: Optional[SearchType] = SearchType.All, user=None) -> dict: files = {} + if search_type == SearchType.All or search_type == SearchType.Org: - files["org"] = get_org_files(config.org) if config.org else {} + org_config = LocalOrgConfig.objects.filter(user=user).first() + files["org"] = get_org_files(construct_config_from_db(org_config)) if org_config else {} if search_type == SearchType.All or search_type == SearchType.Markdown: - files["markdown"] = 
get_markdown_files(config.markdown) if config.markdown else {} + markdown_config = LocalMarkdownConfig.objects.filter(user=user).first() + files["markdown"] = get_markdown_files(construct_config_from_db(markdown_config)) if markdown_config else {} if search_type == SearchType.All or search_type == SearchType.Plaintext: - files["plaintext"] = get_plaintext_files(config.plaintext) if config.plaintext else {} + plaintext_config = LocalPlaintextConfig.objects.filter(user=user).first() + files["plaintext"] = get_plaintext_files(construct_config_from_db(plaintext_config)) if plaintext_config else {} if search_type == SearchType.All or search_type == SearchType.Pdf: - files["pdf"] = get_pdf_files(config.pdf) if config.pdf else {} + pdf_config = LocalPdfConfig.objects.filter(user=user).first() + files["pdf"] = get_pdf_files(construct_config_from_db(pdf_config)) if pdf_config else {} return files +def construct_config_from_db(db_config) -> TextContentConfig: + return TextContentConfig( + input_files=db_config.input_files, + input_filter=db_config.input_filter, + index_heading_entries=db_config.index_heading_entries, + ) + + def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: def is_plaintextfile(file: str): "Check if file is plaintext file" diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 9209ff67..cc35aae0 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -5,16 +5,18 @@ import datetime from enum import Enum from importlib import import_module from importlib.metadata import version +from itertools import islice import logging from os import path import os from pathlib import Path import platform -import sys +import random from time import perf_counter import torch from typing import Optional, Union, TYPE_CHECKING import uuid +from asgiref.sync import sync_to_async # Internal Packages from khoj.utils import constants @@ -29,6 +31,28 @@ if TYPE_CHECKING: from khoj.utils.rawconfig import AppConfig +class 
AsyncIteratorWrapper: + def __init__(self, obj): + self._it = iter(obj) + + def __aiter__(self): + return self + + async def __anext__(self): + try: + value = await self.next_async() + except StopAsyncIteration: + return + return value + + @sync_to_async + def next_async(self): + try: + return next(self._it) + except StopIteration: + raise StopAsyncIteration + + def is_none_or_empty(item): return item == None or (hasattr(item, "__iter__") and len(item) == 0) or item == "" @@ -209,10 +233,12 @@ def log_telemetry( if not app_config or not app_config.should_log_telemetry: return [] + if properties.get("server_id") is None: + properties["server_id"] = get_server_id() + # Populate telemetry data to log request_body = { "telemetry_type": telemetry_type, - "server_id": get_server_id(), "server_version": version("khoj-assistant"), "os": platform.system(), "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), @@ -229,6 +255,18 @@ def log_telemetry( return request_body +def get_device() -> torch.device: + """Get device to run model on""" + if torch.cuda.is_available(): + # Use CUDA GPU + return torch.device("cuda:0") + elif torch.backends.mps.is_available(): + # Use Apple M1 Metal Acceleration + return torch.device("mps") + else: + return torch.device("cpu") + + class ConversationCommand(str, Enum): Default = "default" General = "general" @@ -242,3 +280,39 @@ command_descriptions = { ConversationCommand.Default: "The default command when no command specified. 
It intelligently auto-switches between general and notes mode.", ConversationCommand.Help: "Display a help message with all available commands and other metadata.", } + + +def generate_random_name(): + # List of adjectives and nouns to choose from + adjectives = [ + "happy", + "serendipitous", + "exuberant", + "calm", + "brave", + "scared", + "energetic", + "chivalrous", + "kind", + "suave", + ] + nouns = ["dog", "cat", "falcon", "whale", "turtle", "rabbit", "hamster", "snake", "spider", "elephant"] + + # Select two random words from the lists + adjective = random.choice(adjectives) + noun = random.choice(nouns) + + # Combine the words to form a name + name = f"{adjective} {noun}" + + return name + + +def batcher(iterable, max_n): + "Split an iterable into chunks of size max_n" + it = iter(iterable) + while True: + chunk = list(islice(it, max_n)) + if not chunk: + return + yield (x for x in chunk if x is not None) diff --git a/src/khoj/utils/initialization.py b/src/khoj/utils/initialization.py new file mode 100644 index 00000000..c797f848 --- /dev/null +++ b/src/khoj/utils/initialization.py @@ -0,0 +1,98 @@ +import logging +import os + +from database.models import ( + KhojUser, + OfflineChatProcessorConversationConfig, + OpenAIProcessorConversationConfig, + ChatModelOptions, +) + +from khoj.utils.constants import default_offline_chat_model + +from database.adapters import ConversationAdapters + + +logger = logging.getLogger(__name__) + + +def initialization(): + def _create_admin_user(): + logger.info( + "👩‍✈️ Setting up admin user. These credentials will allow you to configure your server at /server/admin." 
+ ) + email_addr = os.getenv("KHOJ_ADMIN_EMAIL") or input("Email: ") + password = os.getenv("KHOJ_ADMIN_PASSWORD") or input("Password: ") + admin_user = KhojUser.objects.create_superuser(email=email_addr, username=email_addr, password=password) + logger.info(f"👩‍✈️ Created admin user: {admin_user.email}") + + def _create_chat_configuration(): + logger.info( + "🗣️ Configure chat models available to your server. You can always update these at /server/admin using the credentials of your admin account" + ) + try: + # Some environments don't support interactive input. We catch the exception and return if that's the case. The admin can still configure their settings from the admin page. + input() + except EOFError: + return + + try: + # Note: gpt4all package is not available on all devices. + # So ensure gpt4all package is installed before continuing this step. + import gpt4all + + use_offline_model = input("Use offline chat model? (y/n): ") + if use_offline_model == "y": + logger.info("🗣️ Setting up offline chat model") + OfflineChatProcessorConversationConfig.objects.create(enabled=True) + + offline_chat_model = input( + f"Enter the name of the offline chat model you want to use, based on the models in HuggingFace (press enter to use the default: {default_offline_chat_model}): " + ) + if offline_chat_model == "": + ChatModelOptions.objects.create( + chat_model=default_offline_chat_model, model_type=ChatModelOptions.ModelType.OFFLINE + ) + else: + max_tokens = input("Enter the maximum number of tokens to use for the offline chat model:") + tokenizer = input("Enter the tokenizer to use for the offline chat model:") + ChatModelOptions.objects.create( + chat_model=offline_chat_model, + model_type=ChatModelOptions.ModelType.OFFLINE, + max_prompt_size=max_tokens, + tokenizer=tokenizer, + ) + except ModuleNotFoundError as e: + logger.warning("Offline models are not supported on this device.") + + use_openai_model = input("Use OpenAI chat model? 
(y/n): ") + + if use_openai_model == "y": + logger.info("🗣️ Setting up OpenAI chat model") + api_key = input("Enter your OpenAI API key: ") + OpenAIProcessorConversationConfig.objects.create(api_key=api_key) + openai_chat_model = input("Enter the name of the OpenAI chat model you want to use: ") + max_tokens = input("Enter the maximum number of tokens to use for the OpenAI chat model:") + ChatModelOptions.objects.create( + chat_model=openai_chat_model, model_type=ChatModelOptions.ModelType.OPENAI, max_tokens=max_tokens + ) + + logger.info("🗣️ Chat model configuration complete") + + admin_user = KhojUser.objects.filter(is_staff=True).first() + if admin_user is None: + while True: + try: + _create_admin_user() + break + except Exception as e: + logger.error(f"🚨 Failed to create admin user: {e}", exc_info=True) + + chat_config = ConversationAdapters.get_default_conversation_config() + if admin_user is None and chat_config is None: + while True: + try: + _create_chat_configuration() + break + except Exception as e: + logger.error(f"🚨 Failed to create chat configuration: {e}", exc_info=True) diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index cc4fe208..4c97aedd 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -1,19 +1,20 @@ # System Packages import json from pathlib import Path -from typing import List, Dict, Optional, Union, Any +from typing import List, Dict, Optional +import uuid # External Packages -from pydantic import BaseModel, validator +from pydantic import BaseModel # Internal Packages -from khoj.utils.helpers import to_snake_case_from_dash, is_none_or_empty +from khoj.utils.helpers import to_snake_case_from_dash class ConfigBase(BaseModel): class Config: alias_generator = to_snake_case_from_dash - allow_population_by_field_name = True + populate_by_name = True def __getitem__(self, item): return getattr(self, item) @@ -27,9 +28,9 @@ class TextConfigBase(ConfigBase): embeddings_file: Path -class 
TextContentConfig(TextConfigBase): - input_files: Optional[List[Path]] - input_filter: Optional[List[str]] +class TextContentConfig(ConfigBase): + input_files: Optional[List[Path]] = None + input_filter: Optional[List[str]] = None index_heading_entries: Optional[bool] = False @@ -39,51 +40,41 @@ class GithubRepoConfig(ConfigBase): branch: Optional[str] = "master" -class GithubContentConfig(TextConfigBase): +class GithubContentConfig(ConfigBase): pat_token: str repos: List[GithubRepoConfig] -class NotionContentConfig(TextConfigBase): +class NotionContentConfig(ConfigBase): token: str class ImageContentConfig(ConfigBase): - input_directories: Optional[List[Path]] - input_filter: Optional[List[str]] + input_directories: Optional[List[Path]] = None + input_filter: Optional[List[str]] = None embeddings_file: Path use_xmp_metadata: bool batch_size: int class ContentConfig(ConfigBase): - org: Optional[TextContentConfig] - image: Optional[ImageContentConfig] - markdown: Optional[TextContentConfig] - pdf: Optional[TextContentConfig] - plaintext: Optional[TextContentConfig] - github: Optional[GithubContentConfig] - plugins: Optional[Dict[str, TextContentConfig]] - notion: Optional[NotionContentConfig] - - -class TextSearchConfig(ConfigBase): - encoder: str - cross_encoder: str - encoder_type: Optional[str] - model_directory: Optional[Path] + org: Optional[TextContentConfig] = None + image: Optional[ImageContentConfig] = None + markdown: Optional[TextContentConfig] = None + pdf: Optional[TextContentConfig] = None + plaintext: Optional[TextContentConfig] = None + github: Optional[GithubContentConfig] = None + notion: Optional[NotionContentConfig] = None class ImageSearchConfig(ConfigBase): encoder: str - encoder_type: Optional[str] - model_directory: Optional[Path] + encoder_type: Optional[str] = None + model_directory: Optional[Path] = None class SearchConfig(ConfigBase): - asymmetric: Optional[TextSearchConfig] - symmetric: Optional[TextSearchConfig] - image: 
Optional[ImageSearchConfig] + image: Optional[ImageSearchConfig] = None class OpenAIProcessorConfig(ConfigBase): @@ -97,33 +88,34 @@ class OfflineChatProcessorConfig(ConfigBase): class ConversationProcessorConfig(ConfigBase): - conversation_logfile: Path - openai: Optional[OpenAIProcessorConfig] - offline_chat: Optional[OfflineChatProcessorConfig] - max_prompt_size: Optional[int] - tokenizer: Optional[str] + openai: Optional[OpenAIProcessorConfig] = None + offline_chat: Optional[OfflineChatProcessorConfig] = None + max_prompt_size: Optional[int] = None + tokenizer: Optional[str] = None class ProcessorConfig(ConfigBase): - conversation: Optional[ConversationProcessorConfig] + conversation: Optional[ConversationProcessorConfig] = None class AppConfig(ConfigBase): - should_log_telemetry: bool + should_log_telemetry: bool = True class FullConfig(ConfigBase): content_type: Optional[ContentConfig] = None search_type: Optional[SearchConfig] = None processor: Optional[ProcessorConfig] = None - app: Optional[AppConfig] = AppConfig(should_log_telemetry=True) + app: Optional[AppConfig] = AppConfig() version: Optional[str] = None class SearchResponse(ConfigBase): entry: str - score: str - additional: Optional[dict] + score: float + cross_score: Optional[float] = None + additional: Optional[dict] = None + corpus_id: str class Entry: @@ -131,14 +123,21 @@ class Entry: compiled: str heading: Optional[str] file: Optional[str] + corpus_id: str def __init__( - self, raw: str = None, compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None + self, + raw: str = None, + compiled: str = None, + heading: Optional[str] = None, + file: Optional[str] = None, + corpus_id: uuid.UUID = None, ): self.raw = raw self.compiled = compiled self.heading = heading self.file = file + self.corpus_id = str(corpus_id) def to_json(self) -> str: return json.dumps(self.__dict__, ensure_ascii=False) @@ -153,4 +152,5 @@ class Entry: compiled=dictionary["compiled"], 
file=dictionary.get("file", None), heading=dictionary.get("heading", None), + corpus_id=dictionary.get("corpus_id", None), ) diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py index e9b2ca6c..91f5f0ce 100644 --- a/src/khoj/utils/state.py +++ b/src/khoj/utils/state.py @@ -1,44 +1,41 @@ # Standard Packages +import os import threading from typing import List, Dict -from packaging import version +from collections import defaultdict # External Packages -import torch from pathlib import Path +from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel # Internal Packages from khoj.utils import config as utils_config -from khoj.utils.config import ContentIndex, SearchModels, ProcessorConfigModel -from khoj.utils.helpers import LRU +from khoj.utils.config import ContentIndex, SearchModels, GPT4AllProcessorModel +from khoj.utils.helpers import LRU, get_device from khoj.utils.rawconfig import FullConfig # Application Global State config = FullConfig() search_models = SearchModels() +embeddings_model: EmbeddingsModel = None +cross_encoder_model: CrossEncoderModel = None content_index = ContentIndex() -processor_config = ProcessorConfigModel() +gpt4all_processor_config: GPT4AllProcessorModel = None config_file: Path = None verbose: int = 0 host: str = None port: int = None cli_args: List[str] = None -query_cache = LRU() -config_lock = threading.Lock() +query_cache: Dict[str, LRU] = defaultdict(LRU) chat_lock = threading.Lock() SearchType = utils_config.SearchType telemetry: List[Dict[str, str]] = [] -previous_query: str = None -demo: bool = False khoj_version: str = None +device = get_device() chat_on_gpu: bool = True - - -if torch.cuda.is_available(): - # Use CUDA GPU - device = torch.device("cuda:0") -elif version.parse(torch.__version__) >= version.parse("1.13.0.dev") and torch.backends.mps.is_available(): - # Use Apple M1 Metal Acceleration - device = torch.device("mps") -else: - device = torch.device("cpu") +anonymous_mode: bool = False 
+billing_enabled: bool = ( + os.getenv("STRIPE_API_KEY") is not None + and os.getenv("STRIPE_SIGNING_SECRET") is not None + and os.getenv("KHOJ_CLOUD_SUBSCRIPTION_URL") is not None +) diff --git a/src/khoj/utils/yaml.py b/src/khoj/utils/yaml.py index abfe270a..36546688 100644 --- a/src/khoj/utils/yaml.py +++ b/src/khoj/utils/yaml.py @@ -39,7 +39,7 @@ def load_config_from_file(yaml_config_file: Path) -> dict: def parse_config_from_string(yaml_config: dict) -> FullConfig: "Parse and validate config in YML string" - return FullConfig.parse_obj(yaml_config) + return FullConfig.model_validate(yaml_config) def parse_config_from_file(yaml_config_file): diff --git a/src/manage.py b/src/manage.py new file mode 100755 index 00000000..1a64b14a --- /dev/null +++ b/src/manage.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +"""Django's command-line utility for administrative tasks.""" +import os +import sys + + +def main(): + """Run administrative tasks.""" + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "app.settings") + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?" 
+ ) from exc + execute_from_command_line(sys.argv) + + +if __name__ == "__main__": + main() diff --git a/tests/conftest.py b/tests/conftest.py index 8b661f50..e7a73d6c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,61 +1,65 @@ # External Packages import os -from copy import deepcopy from fastapi.testclient import TestClient from pathlib import Path import pytest +from fastapi.staticfiles import StaticFiles +from fastapi import FastAPI +import os +from fastapi import FastAPI + # Internal Packages -from khoj.main import app -from khoj.configure import configure_processor, configure_routes, configure_search_types -from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl -from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl +from khoj.configure import configure_routes, configure_search_types, configure_middleware +from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel +from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries from khoj.search_type import image_search, text_search from khoj.utils.config import SearchModels +from khoj.utils.constants import web_directory from khoj.utils.helpers import resolve_absolute_path from khoj.utils.rawconfig import ( ContentConfig, - ConversationProcessorConfig, - OfflineChatProcessorConfig, - OpenAIProcessorConfig, - ProcessorConfig, - TextContentConfig, - GithubContentConfig, - GithubRepoConfig, ImageContentConfig, SearchConfig, - TextSearchConfig, ImageSearchConfig, ) from khoj.utils import state, fs_syncer from khoj.routers.indexer import configure_content -from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl -from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl -from khoj.search_filter.date_filter import DateFilter -from khoj.search_filter.word_filter import WordFilter -from khoj.search_filter.file_filter import FileFilter +from khoj.processor.org_mode.org_to_entries import OrgToEntries +from database.models import ( + 
KhojApiUser, + LocalOrgConfig, + LocalMarkdownConfig, + LocalPlaintextConfig, + GithubConfig, + KhojUser, + GithubRepoConfig, +) + +from tests.helpers import ( + UserFactory, + ChatModelOptionsFactory, + OpenAIProcessorConversationConfigFactory, + OfflineChatProcessorConversationConfigFactory, + UserConversationProcessorConfigFactory, + SubscriptionFactory, +) + + +@pytest.fixture(autouse=True) +def enable_db_access_for_all_tests(db): + pass @pytest.fixture(scope="session") def search_config() -> SearchConfig: + state.embeddings_model = EmbeddingsModel() + state.cross_encoder_model = CrossEncoderModel() + model_dir = resolve_absolute_path("~/.khoj/search") model_dir.mkdir(parents=True, exist_ok=True) search_config = SearchConfig() - search_config.symmetric = TextSearchConfig( - encoder="sentence-transformers/all-MiniLM-L6-v2", - cross_encoder="cross-encoder/ms-marco-MiniLM-L-6-v2", - model_directory=model_dir / "symmetric/", - encoder_type=None, - ) - - search_config.asymmetric = TextSearchConfig( - encoder="sentence-transformers/multi-qa-MiniLM-L6-cos-v1", - cross_encoder="cross-encoder/ms-marco-MiniLM-L-6-v2", - model_directory=model_dir / "asymmetric/", - encoder_type=None, - ) - search_config.image = ImageSearchConfig( encoder="sentence-transformers/clip-ViT-B-32", model_directory=model_dir / "image/", @@ -65,17 +69,102 @@ def search_config() -> SearchConfig: return search_config +@pytest.mark.django_db +@pytest.fixture +def default_user(): + user = UserFactory() + SubscriptionFactory(user=user) + return user + + +@pytest.mark.django_db +@pytest.fixture +def default_user2(): + if KhojUser.objects.filter(username="default").exists(): + return KhojUser.objects.get(username="default") + + user = KhojUser.objects.create( + username="default", + email="default@example.com", + password="default", + ) + SubscriptionFactory(user=user) + return user + + +@pytest.mark.django_db +@pytest.fixture +def default_user3(): + """ + This user should not have any data associated 
with it + """ + if KhojUser.objects.filter(username="default3").exists(): + return KhojUser.objects.get(username="default3") + + user = KhojUser.objects.create( + username="default3", + email="default3@example.com", + password="default3", + ) + SubscriptionFactory(user=user) + return user + + +@pytest.mark.django_db +@pytest.fixture +def api_user(default_user): + if KhojApiUser.objects.filter(user=default_user).exists(): + return KhojApiUser.objects.get(user=default_user) + + return KhojApiUser.objects.create( + user=default_user, + name="api-key", + token="kk-secret", + ) + + +@pytest.mark.django_db +@pytest.fixture +def api_user2(default_user2): + if KhojApiUser.objects.filter(user=default_user2).exists(): + return KhojApiUser.objects.get(user=default_user2) + + return KhojApiUser.objects.create( + user=default_user2, + name="api-key", + token="kk-diff-secret", + ) + + +@pytest.mark.django_db +@pytest.fixture +def api_user3(default_user3): + if KhojApiUser.objects.filter(user=default_user3).exists(): + return KhojApiUser.objects.get(user=default_user3) + + return KhojApiUser.objects.create( + user=default_user3, + name="api-key", + token="kk-diff-secret-3", + ) + + @pytest.fixture(scope="session") def search_models(search_config: SearchConfig): search_models = SearchModels() - search_models.text_search = text_search.initialize_model(search_config.asymmetric) search_models.image_search = image_search.initialize_model(search_config.image) return search_models -@pytest.fixture(scope="session") -def content_config(tmp_path_factory, search_models: SearchModels, search_config: SearchConfig): +@pytest.fixture +def anyio_backend(): + return "asyncio" + + +@pytest.mark.django_db +@pytest.fixture(scope="function") +def content_config(tmp_path_factory, search_models: SearchModels, default_user: KhojUser): content_dir = tmp_path_factory.mktemp("content") # Generate Image Embeddings from Test Images @@ -90,217 +179,188 @@ def content_config(tmp_path_factory, search_models: 
SearchModels, search_config: image_search.setup(content_config.image, search_models.image_search.image_encoder, regenerate=False) - # Generate Notes Embeddings from Test Notes - content_config.org = TextContentConfig( + LocalOrgConfig.objects.create( input_files=None, input_filter=["tests/data/org/*.org"], - compressed_jsonl=content_dir.joinpath("notes.jsonl.gz"), - embeddings_file=content_dir.joinpath("note_embeddings.pt"), + index_heading_entries=False, + user=default_user, ) - filters = [DateFilter(), WordFilter(), FileFilter()] - text_search.setup( - OrgToJsonl, - get_sample_data("org"), - content_config.org, - search_models.text_search.bi_encoder, - regenerate=False, - filters=filters, - ) - - content_config.plugins = { - "plugin1": TextContentConfig( - input_files=[content_dir.joinpath("notes.jsonl.gz")], - input_filter=None, - compressed_jsonl=content_dir.joinpath("plugin.jsonl.gz"), - embeddings_file=content_dir.joinpath("plugin_embeddings.pt"), - ) - } + text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user) if os.getenv("GITHUB_PAT_TOKEN"): - content_config.github = GithubContentConfig( - pat_token=os.getenv("GITHUB_PAT_TOKEN", ""), - repos=[ - GithubRepoConfig( - owner="khoj-ai", - name="lantern", - branch="master", - ) - ], - compressed_jsonl=content_dir.joinpath("github.jsonl.gz"), - embeddings_file=content_dir.joinpath("github_embeddings.pt"), + GithubConfig.objects.create( + pat_token=os.getenv("GITHUB_PAT_TOKEN"), + user=default_user, ) - content_config.plaintext = TextContentConfig( + GithubRepoConfig.objects.create( + owner="khoj-ai", + name="lantern", + branch="master", + github_config=GithubConfig.objects.get(user=default_user), + ) + + LocalPlaintextConfig.objects.create( input_files=None, input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"], - compressed_jsonl=content_dir.joinpath("plaintext.jsonl.gz"), - 
embeddings_file=content_dir.joinpath("plaintext_embeddings.pt"), - ) - - content_config.github = GithubContentConfig( - pat_token=os.getenv("GITHUB_PAT_TOKEN", ""), - repos=[ - GithubRepoConfig( - owner="khoj-ai", - name="lantern", - branch="master", - ) - ], - compressed_jsonl=content_dir.joinpath("github.jsonl.gz"), - embeddings_file=content_dir.joinpath("github_embeddings.pt"), - ) - - filters = [DateFilter(), WordFilter(), FileFilter()] - text_search.setup( - JsonlToJsonl, - None, - content_config.plugins["plugin1"], - search_models.text_search.bi_encoder, - regenerate=False, - filters=filters, + user=default_user, ) return content_config @pytest.fixture(scope="session") -def md_content_config(tmp_path_factory): - content_dir = tmp_path_factory.mktemp("content") - - # Generate Embeddings for Markdown Content - content_config = ContentConfig() - content_config.markdown = TextContentConfig( +def md_content_config(): + markdown_config = LocalMarkdownConfig.objects.create( input_files=None, input_filter=["tests/data/markdown/*.markdown"], - compressed_jsonl=content_dir.joinpath("markdown.jsonl.gz"), - embeddings_file=content_dir.joinpath("markdown_embeddings.pt"), ) - return content_config + return markdown_config -@pytest.fixture(scope="session") -def processor_config(tmp_path_factory): - openai_api_key = os.getenv("OPENAI_API_KEY") - processor_dir = tmp_path_factory.mktemp("processor") - - # The conversation processor is the only configured processor - # It needs an OpenAI API key to work. 
- if not openai_api_key: - return - - # Setup conversation processor, if OpenAI API key is set - processor_config = ProcessorConfig() - processor_config.conversation = ConversationProcessorConfig( - openai=OpenAIProcessorConfig(api_key=openai_api_key), - conversation_logfile=processor_dir.joinpath("conversation_logs.json"), - ) - - return processor_config - - -@pytest.fixture(scope="session") -def processor_config_offline_chat(tmp_path_factory): - processor_dir = tmp_path_factory.mktemp("processor") - - # Setup conversation processor - processor_config = ProcessorConfig() - offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True, chat_model="mistral-7b-instruct-v0.1.Q4_0.gguf") - processor_config.conversation = ConversationProcessorConfig( - offline_chat=offline_chat, - conversation_logfile=processor_dir.joinpath("conversation_logs.json"), - ) - - return processor_config - - -@pytest.fixture(scope="session") -def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, processor_config: ProcessorConfig): +@pytest.fixture(scope="function") +def chat_client(search_config: SearchConfig, default_user2: KhojUser): # Initialize app state - state.config.content_type = md_content_config state.config.search_type = search_config - state.SearchType = configure_search_types(state.config) + state.SearchType = configure_search_types() + + LocalMarkdownConfig.objects.create( + input_files=None, + input_filter=["tests/data/markdown/*.markdown"], + user=default_user2, + ) # Index Markdown Content for Search - state.search_models.text_search = text_search.initialize_model(search_config.asymmetric) - all_files = fs_syncer.collect_files(state.config.content_type) - state.content_index = configure_content( - state.content_index, state.config.content_type, all_files, state.search_models + all_files = fs_syncer.collect_files(user=default_user2) + state.content_index, _ = configure_content( + state.content_index, state.config.content_type, all_files, 
state.search_models, user=default_user2 ) # Initialize Processor from Config - state.processor_config = configure_processor(processor_config) + if os.getenv("OPENAI_API_KEY"): + chat_model = ChatModelOptionsFactory(chat_model="gpt-3.5-turbo", model_type="openai") + OpenAIProcessorConversationConfigFactory() + UserConversationProcessorConfigFactory(user=default_user2, setting=chat_model) + + state.anonymous_mode = True + + app = FastAPI() configure_routes(app) + configure_middleware(app) + app.mount("/static", StaticFiles(directory=web_directory), name="static") return TestClient(app) @pytest.fixture(scope="function") -def client(content_config: ContentConfig, search_config: SearchConfig, processor_config: ProcessorConfig): +def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUser): + # Initialize app state + state.config.search_type = search_config + state.SearchType = configure_search_types() + + # Initialize Processor from Config + if os.getenv("OPENAI_API_KEY"): + chat_model = ChatModelOptionsFactory(chat_model="gpt-3.5-turbo", model_type="openai") + OpenAIProcessorConversationConfigFactory() + UserConversationProcessorConfigFactory(user=default_user2, setting=chat_model) + + state.anonymous_mode = True + + app = FastAPI() + + configure_routes(app) + configure_middleware(app) + app.mount("/static", StaticFiles(directory=web_directory), name="static") + return TestClient(app) + + +@pytest.fixture(scope="function") +def fastapi_app(): + app = FastAPI() + configure_routes(app) + configure_middleware(app) + app.mount("/static", StaticFiles(directory=web_directory), name="static") + return app + + +@pytest.fixture(scope="function") +def client( + content_config: ContentConfig, + search_config: SearchConfig, + api_user: KhojApiUser, +): state.config.content_type = content_config state.config.search_type = search_config - state.SearchType = configure_search_types(state.config) + state.SearchType = configure_search_types() + 
state.embeddings_model = EmbeddingsModel() + state.cross_encoder_model = CrossEncoderModel() # These lines help us Mock the Search models for these search types - state.search_models.text_search = text_search.initialize_model(search_config.asymmetric) state.search_models.image_search = image_search.initialize_model(search_config.image) - state.content_index.org = text_search.setup( - OrgToJsonl, + text_search.setup( + OrgToEntries, get_sample_data("org"), - content_config.org, - state.search_models.text_search.bi_encoder, regenerate=False, + user=api_user.user, ) state.content_index.image = image_search.setup( content_config.image, state.search_models.image_search, regenerate=False ) - state.content_index.plaintext = text_search.setup( - PlaintextToJsonl, + text_search.setup( + PlaintextToEntries, get_sample_data("plaintext"), - content_config.plaintext, - state.search_models.text_search.bi_encoder, regenerate=False, + user=api_user.user, ) - state.processor_config = configure_processor(processor_config) + state.anonymous_mode = False + app = FastAPI() configure_routes(app) + configure_middleware(app) + app.mount("/static", StaticFiles(directory=web_directory), name="static") return TestClient(app) @pytest.fixture(scope="function") -def client_offline_chat( - search_config: SearchConfig, - processor_config_offline_chat: ProcessorConfig, - content_config: ContentConfig, - md_content_config, -): +def client_offline_chat(search_config: SearchConfig, default_user2: KhojUser): # Initialize app state - state.config.content_type = md_content_config state.config.search_type = search_config - state.SearchType = configure_search_types(state.config) + state.SearchType = configure_search_types() - # Index Markdown Content for Search - state.search_models.text_search = text_search.initialize_model(search_config.asymmetric) - state.search_models.image_search = image_search.initialize_model(search_config.image) + LocalMarkdownConfig.objects.create( + input_files=None, + 
input_filter=["tests/data/markdown/*.markdown"], + user=default_user2, + ) - all_files = fs_syncer.collect_files(state.config.content_type) - state.content_index = configure_content( - state.content_index, state.config.content_type, all_files, state.search_models + all_files = fs_syncer.collect_files(user=default_user2) + configure_content( + state.content_index, state.config.content_type, all_files, state.search_models, user=default_user2 ) # Initialize Processor from Config - state.processor_config = configure_processor(processor_config_offline_chat) + OfflineChatProcessorConversationConfigFactory(enabled=True) + UserConversationProcessorConfigFactory(user=default_user2) + + state.anonymous_mode = True + + app = FastAPI() configure_routes(app) + configure_middleware(app) + app.mount("/static", StaticFiles(directory=web_directory), name="static") return TestClient(app) @pytest.fixture(scope="function") -def new_org_file(content_config: ContentConfig): +def new_org_file(default_user: KhojUser, content_config: ContentConfig): # Setup - new_org_file = Path(content_config.org.input_filter[0]).parent / "new_file.org" + org_config = LocalOrgConfig.objects.filter(user=default_user).first() + input_filters = org_config.input_filter + new_org_file = Path(input_filters[0]).parent / "new_file.org" new_org_file.touch() yield new_org_file @@ -311,11 +371,9 @@ def new_org_file(content_config: ContentConfig): @pytest.fixture(scope="function") -def org_config_with_only_new_file(content_config: ContentConfig, new_org_file: Path): - new_org_config = deepcopy(content_config.org) - new_org_config.input_files = [f"{new_org_file}"] - new_org_config.input_filter = None - return new_org_config +def org_config_with_only_new_file(new_org_file: Path, default_user: KhojUser): + LocalOrgConfig.objects.update(input_files=[str(new_org_file)], input_filter=None) + return LocalOrgConfig.objects.filter(user=default_user).first() @pytest.fixture(scope="function") diff --git a/tests/data/config.yml 
b/tests/data/config.yml index 06978cf1..bb6736ab 100644 --- a/tests/data/config.yml +++ b/tests/data/config.yml @@ -9,20 +9,9 @@ content-type: input-filter: - '*.org' - ~/notes/*.org - plugins: - content_plugin_1: - compressed-jsonl: content_plugin_1.jsonl.gz - embeddings-file: content_plugin_1_embeddings.pt - input-files: - - content_plugin_1_new.jsonl.gz - content_plugin_2: - compressed-jsonl: content_plugin_2.jsonl.gz - embeddings-file: content_plugin_2_embeddings.pt - input-filter: - - '*2_new.jsonl.gz' enable-offline-chat: false search-type: asymmetric: cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2 encoder: sentence-transformers/msmarco-MiniLM-L-6-v3 -version: 0.10.1 +version: 0.15.0 diff --git a/tests/data/pdf/ocr_samples.pdf b/tests/data/pdf/ocr_samples.pdf new file mode 100644 index 00000000..100f60e0 Binary files /dev/null and b/tests/data/pdf/ocr_samples.pdf differ diff --git a/tests/helpers.py b/tests/helpers.py new file mode 100644 index 00000000..079eb475 --- /dev/null +++ b/tests/helpers.py @@ -0,0 +1,92 @@ +import factory +import os + +from database.models import ( + KhojUser, + KhojApiUser, + ChatModelOptions, + OfflineChatProcessorConversationConfig, + OpenAIProcessorConversationConfig, + SearchModelConfig, + UserConversationConfig, + Conversation, + Subscription, +) + + +class UserFactory(factory.django.DjangoModelFactory): + class Meta: + model = KhojUser + + username = factory.Faker("name") + email = factory.Faker("email") + password = factory.Faker("password") + uuid = factory.Faker("uuid4") + + +class ApiUserFactory(factory.django.DjangoModelFactory): + class Meta: + model = KhojApiUser + + user = None + name = factory.Faker("name") + token = factory.Faker("password") + + +class ChatModelOptionsFactory(factory.django.DjangoModelFactory): + class Meta: + model = ChatModelOptions + + max_prompt_size = 2000 + tokenizer = None + chat_model = "mistral-7b-instruct-v0.1.Q4_0.gguf" + model_type = "offline" + + +class 
UserConversationProcessorConfigFactory(factory.django.DjangoModelFactory): + class Meta: + model = UserConversationConfig + + user = factory.SubFactory(UserFactory) + setting = factory.SubFactory(ChatModelOptionsFactory) + + +class OfflineChatProcessorConversationConfigFactory(factory.django.DjangoModelFactory): + class Meta: + model = OfflineChatProcessorConversationConfig + + enabled = True + + +class OpenAIProcessorConversationConfigFactory(factory.django.DjangoModelFactory): + class Meta: + model = OpenAIProcessorConversationConfig + + api_key = os.getenv("OPENAI_API_KEY") + + +class ConversationFactory(factory.django.DjangoModelFactory): + class Meta: + model = Conversation + + user = factory.SubFactory(UserFactory) + + +class SearchModelFactory(factory.django.DjangoModelFactory): + class Meta: + model = SearchModelConfig + + name = "default" + model_type = "text" + bi_encoder = "thenlper/gte-small" + cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2" + + +class SubscriptionFactory(factory.django.DjangoModelFactory): + class Meta: + model = Subscription + + user = factory.SubFactory(UserFactory) + type = "standard" + is_recurring = False + renewal_date = "2100-04-01" diff --git a/tests/test_cli.py b/tests/test_cli.py index 9de3a853..e3daa2c6 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -25,7 +25,7 @@ def test_cli_invalid_config_file_path(): non_existent_config_file = f"non-existent-khoj-{random()}.yml" # Act - actual_args = cli([f"-c={non_existent_config_file}"]) + actual_args = cli([f"--config-file={non_existent_config_file}"]) # Assert assert actual_args.config_file == resolve_absolute_path(non_existent_config_file) @@ -35,7 +35,7 @@ def test_cli_invalid_config_file_path(): # ---------------------------------------------------------------------------------------------------- def test_cli_config_from_file(): # Act - actual_args = cli(["-c=tests/data/config.yml", "--regenerate", "-vvv"]) + actual_args = 
cli(["--config-file=tests/data/config.yml", "--regenerate", "-vvv"]) # Assert assert actual_args.config_file == resolve_absolute_path(Path("tests/data/config.yml")) @@ -48,14 +48,3 @@ def test_cli_config_from_file(): Path("~/first_from_config.org"), Path("~/second_from_config.org"), ] - assert len(actual_args.config.content_type.plugins.keys()) == 2 - assert actual_args.config.content_type.plugins["content_plugin_1"].input_files == [ - Path("content_plugin_1_new.jsonl.gz") - ] - assert actual_args.config.content_type.plugins["content_plugin_2"].input_filter == ["*2_new.jsonl.gz"] - assert actual_args.config.content_type.plugins["content_plugin_1"].compressed_jsonl == Path( - "content_plugin_1.jsonl.gz" - ) - assert actual_args.config.content_type.plugins["content_plugin_2"].embeddings_file == Path( - "content_plugin_2_embeddings.pt" - ) diff --git a/tests/test_client.py b/tests/test_client.py index 55bf09f7..f642a727 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -2,69 +2,135 @@ from io import BytesIO from PIL import Image from urllib.parse import quote - +import pytest # External Packages from fastapi.testclient import TestClient +from fastapi import FastAPI import pytest # Internal Packages -from khoj.main import app from khoj.configure import configure_routes, configure_search_types from khoj.utils import state from khoj.utils.state import search_models, content_index, config from khoj.search_type import text_search, image_search from khoj.utils.rawconfig import ContentConfig, SearchConfig -from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl -from khoj.search_filter.word_filter import WordFilter -from khoj.search_filter.file_filter import FileFilter +from khoj.processor.org_mode.org_to_entries import OrgToEntries +from database.models import KhojUser, KhojApiUser +from database.adapters import EntryAdapters # Test # ---------------------------------------------------------------------------------------------------- -def 
test_search_with_invalid_content_type(client): +@pytest.mark.django_db(transaction=True) +def test_search_with_no_auth_key(client): # Arrange user_query = quote("How to call Khoj from Emacs?") # Act - response = client.get(f"/api/search?q={user_query}&t=invalid_content_type") + response = client.get(f"/api/search?q={user_query}") + + # Assert + assert response.status_code == 403 + + +@pytest.mark.django_db(transaction=True) +def test_search_with_invalid_auth_key(client): + # Arrange + headers = {"Authorization": "Bearer invalid-token"} + user_query = quote("How to call Khoj from Emacs?") + + # Act + response = client.get(f"/api/search?q={user_query}", headers=headers) + + # Assert + assert response.status_code == 403 + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) +def test_search_with_invalid_content_type(client): + # Arrange + headers = {"Authorization": "Bearer kk-secret"} + user_query = quote("How to call Khoj from Emacs?") + + # Act + response = client.get(f"/api/search?q={user_query}&t=invalid_content_type", headers=headers) # Assert assert response.status_code == 422 # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) def test_search_with_valid_content_type(client): - for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plugin1"]: + headers = {"Authorization": "Bearer kk-secret"} + for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plaintext"]: # Act - response = client.get(f"/api/search?q=random&t={content_type}") + response = client.get(f"/api/search?q=random&t={content_type}", headers=headers) # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}" # 
---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) +def test_index_update_with_no_auth_key(client): + # Arrange + files = get_sample_files_data() + + # Act + response = client.post("/api/v1/index/update", files=files) + + # Assert + assert response.status_code == 403 + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) +def test_index_update_with_invalid_auth_key(client): + # Arrange + files = get_sample_files_data() + headers = {"Authorization": "Bearer kk-invalid-token"} + + # Act + response = client.post("/api/v1/index/update", files=files, headers=headers) + + # Assert + assert response.status_code == 403 + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) def test_update_with_invalid_content_type(client): + # Arrange + headers = {"Authorization": "Bearer kk-secret"} + # Act - response = client.get(f"/api/update?t=invalid_content_type") + response = client.get(f"/api/update?t=invalid_content_type", headers=headers) # Assert assert response.status_code == 422 # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) def test_regenerate_with_invalid_content_type(client): + # Arrange + headers = {"Authorization": "Bearer kk-secret"} + # Act - response = client.get(f"/api/update?force=true&t=invalid_content_type") + response = client.get(f"/api/update?force=true&t=invalid_content_type", headers=headers) # Assert assert response.status_code == 422 # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) def test_index_update(client): # Arrange files = get_sample_files_data() - headers = {"x-api-key": "secret"} + 
headers = {"Authorization": "Bearer kk-secret"} # Act response = client.post("/api/v1/index/update", files=files, headers=headers) @@ -74,88 +140,75 @@ def test_index_update(client): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) def test_regenerate_with_valid_content_type(client): - for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]: + for content_type in ["all", "org", "markdown", "image", "pdf", "notion"]: # Arrange files = get_sample_files_data() - headers = {"x-api-key": "secret"} + headers = {"Authorization": "Bearer kk-secret"} # Act response = client.post(f"/api/v1/index/update?t={content_type}", files=files, headers=headers) + # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}" # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) def test_regenerate_with_github_fails_without_pat(client): # Act - response = client.get(f"/api/update?force=true&t=github") + headers = {"Authorization": "Bearer kk-secret"} + response = client.get(f"/api/update?force=true&t=github", headers=headers) # Arrange files = get_sample_files_data() - headers = {"x-api-key": "secret"} # Act response = client.post(f"/api/v1/index/update?t=github", files=files, headers=headers) + # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github" # ---------------------------------------------------------------------------------------------------- -@pytest.mark.skip(reason="Flaky test on parallel test runs") -def test_get_configured_types_via_api(client): +@pytest.mark.django_db +def test_get_configured_types_via_api(client, sample_org_data): # Act - response = client.get(f"/api/config/types") + text_search.setup(OrgToEntries, sample_org_data, 
regenerate=False) + + enabled_types = EntryAdapters.get_unique_file_types(user=None).all().values_list("file_type", flat=True) # Assert - assert response.status_code == 200 - assert response.json() == ["all", "org", "image", "plaintext", "plugin1"] + assert list(enabled_types) == ["org"] # ---------------------------------------------------------------------------------------------------- -def test_get_configured_types_with_only_plugin_content_config(content_config): +@pytest.mark.django_db(transaction=True) +def test_get_api_config_types(client, sample_org_data, default_user: KhojUser): # Arrange - config.content_type = ContentConfig() - config.content_type.plugins = content_config.plugins - state.SearchType = configure_search_types(config) - - configure_routes(app) - client = TestClient(app) + headers = {"Authorization": "Bearer kk-secret"} + text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user) # Act - response = client.get(f"/api/config/types") + response = client.get(f"/api/config/types", headers=headers) # Assert assert response.status_code == 200 - assert response.json() == ["all", "plugin1"] + assert response.json() == ["all", "org", "image", "plaintext"] # ---------------------------------------------------------------------------------------------------- -def test_get_configured_types_with_no_plugin_content_config(content_config): +@pytest.mark.django_db(transaction=True) +def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI): # Arrange - config.content_type = content_config - config.content_type.plugins = None - state.SearchType = configure_search_types(config) + state.anonymous_mode = True + if state.config and state.config.content_type: + state.config.content_type = None + state.search_models = configure_search_types() - configure_routes(app) - client = TestClient(app) - - # Act - response = client.get(f"/api/config/types") - - # Assert - assert response.status_code == 200 - assert "plugin1" not in 
response.json() - - -# ---------------------------------------------------------------------------------------------------- -def test_get_configured_types_with_no_content_config(): - # Arrange - config.content_type = ContentConfig() - state.SearchType = configure_search_types(config) - - configure_routes(app) - client = TestClient(app) + configure_routes(fastapi_app) + client = TestClient(fastapi_app) # Act response = client.get(f"/api/config/types") @@ -166,8 +219,10 @@ def test_get_configured_types_with_no_content_config(): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) def test_image_search(client, content_config: ContentConfig, search_config: SearchConfig): # Arrange + headers = {"Authorization": "Bearer kk-secret"} search_models.image_search = image_search.initialize_model(search_config.image) content_index.image = image_search.setup( content_config.image, search_models.image_search.image_encoder, regenerate=False @@ -180,7 +235,7 @@ def test_image_search(client, content_config: ContentConfig, search_config: Sear for query, expected_image_name in query_expected_image_pairs: # Act - response = client.get(f"/api/search?q={query}&n=1&t=image") + response = client.get(f"/api/search?q={query}&n=1&t=image", headers=headers) # Assert assert response.status_code == 200 @@ -192,43 +247,57 @@ def test_image_search(client, content_config: ContentConfig, search_config: Sear # ---------------------------------------------------------------------------------------------------- -def test_notes_search(client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data): +@pytest.mark.django_db(transaction=True) +def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser): # Arrange - search_models.text_search = text_search.initialize_model(search_config.asymmetric) - content_index.org = text_search.setup( - OrgToJsonl, 
sample_org_data, content_config.org, search_models.text_search.bi_encoder, regenerate=False - ) + headers = {"Authorization": "Bearer kk-secret"} + text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user) user_query = quote("How to git install application?") # Act - response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true") + response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.18", headers=headers) # Assert assert response.status_code == 200 - # assert actual_data contains "Khoj via Emacs" entry + + assert len(response.json()) == 1, "Expected only 1 result" search_result = response.json()[0]["entry"] - assert "git clone https://github.com/khoj-ai/khoj" in search_result + assert "git clone https://github.com/khoj-ai/khoj" in search_result, "Expected 'git clone' in search result" # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) +def test_notes_search_no_results(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser): + # Arrange + headers = {"Authorization": "Bearer kk-secret"} + text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user) + user_query = quote("How to find my goat?") + + # Act + response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.18", headers=headers) + + # Assert + assert response.status_code == 200 + assert response.json() == [], "Expected no results" + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) def test_notes_search_with_only_filters( - client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data + client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data, default_user: KhojUser ): # Arrange - filters = [WordFilter(), FileFilter()] - 
search_models.text_search = text_search.initialize_model(search_config.asymmetric) - content_index.org = text_search.setup( - OrgToJsonl, + headers = {"Authorization": "Bearer kk-secret"} + text_search.setup( + OrgToEntries, sample_org_data, - content_config.org, - search_models.text_search.bi_encoder, regenerate=False, - filters=filters, + user=default_user, ) user_query = quote('+"Emacs" file:"*.org"') # Act - response = client.get(f"/api/search?q={user_query}&n=1&t=org") + response = client.get(f"/api/search?q={user_query}&n=1&t=org", headers=headers) # Assert assert response.status_code == 200 @@ -238,19 +307,15 @@ def test_notes_search_with_only_filters( # ---------------------------------------------------------------------------------------------------- -def test_notes_search_with_include_filter( - client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data -): +@pytest.mark.django_db(transaction=True) +def test_notes_search_with_include_filter(client, sample_org_data, default_user: KhojUser): # Arrange - filters = [WordFilter()] - search_models.text_search = text_search.initialize_model(search_config.asymmetric) - content_index.org = text_search.setup( - OrgToJsonl, sample_org_data, content_config.org, search_models.text_search, regenerate=False, filters=filters - ) + headers = {"Authorization": "Bearer kk-secret"} + text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user) user_query = quote('How to git install application? 
+"Emacs"') # Act - response = client.get(f"/api/search?q={user_query}&n=1&t=org") + response = client.get(f"/api/search?q={user_query}&n=1&t=org", headers=headers) # Assert assert response.status_code == 200 @@ -260,24 +325,20 @@ def test_notes_search_with_include_filter( # ---------------------------------------------------------------------------------------------------- -def test_notes_search_with_exclude_filter( - client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data -): +@pytest.mark.django_db(transaction=True) +def test_notes_search_with_exclude_filter(client, sample_org_data, default_user: KhojUser): # Arrange - filters = [WordFilter()] - search_models.text_search = text_search.initialize_model(search_config.asymmetric) - content_index.org = text_search.setup( - OrgToJsonl, + headers = {"Authorization": "Bearer kk-secret"} + text_search.setup( + OrgToEntries, sample_org_data, - content_config.org, - search_models.text_search.bi_encoder, regenerate=False, - filters=filters, + user=default_user, ) user_query = quote('How to git install application? 
-"clone"') # Act - response = client.get(f"/api/search?q={user_query}&n=1&t=org") + response = client.get(f"/api/search?q={user_query}&n=1&t=org", headers=headers) # Assert assert response.status_code == 200 @@ -286,22 +347,56 @@ def test_notes_search_with_exclude_filter( assert "clone" not in search_result +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) +def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser): + # Arrange + headers = {"Authorization": "Bearer kk-token"} # Token for default_user2 + text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user) + user_query = quote("How to git install application?") + + # Act + response = client.get(f"/api/search?q={user_query}&n=1&t=org", headers=headers) + + # Assert + assert response.status_code == 403 + # assert actual response has no data as the default_user is different from the user making the query (anonymous) + assert len(response.json()) == 1 and response.json()["detail"] == "Forbidden" + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) +def test_user_no_data_returns_empty(client, sample_org_data, api_user3: KhojApiUser): + # Arrange + token = api_user3.token + headers = {"Authorization": "Bearer " + token} + user_query = quote("How to git install application?") + + # Act + response = client.get(f"/api/search?q={user_query}&n=1&t=org", headers=headers) + + # Assert + assert response.status_code == 200 + # assert actual response has no data as the default_user3, though other users have data + assert len(response.json()) == 0 + assert response.json() == [] + + def get_sample_files_data(): - return { - "files": ("path/to/filename.org", "* practicing piano", "text/org"), - "files": ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org"), - 
"files": ("path/to/filename2.org", "* how to build a search engine", "text/org"), - "files": ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf"), - "files": ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf"), - "files": ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf"), - "files": ("path/to/filename.txt", "data,column,value", "text/plain"), - "files": ("path/to/filename1.txt", "my first web page", "text/plain"), - "files": ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain"), - "files": ("path/to/filename.md", "# Notes from client call", "text/markdown"), - "files": ( - "path/to/filename1.md", - "## Studying anthropological records from the Fatimid caliphate", - "text/markdown", + return [ + ("files", ("path/to/filename.org", "* practicing piano", "text/org")), + ("files", ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org")), + ("files", ("path/to/filename2.org", "* how to build a search engine", "text/org")), + ("files", ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf")), + ("files", ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf")), + ("files", ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf")), + ("files", ("path/to/filename.txt", "data,column,value", "text/plain")), + ("files", ("path/to/filename1.txt", "my first web page", "text/plain")), + ("files", ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain")), + ("files", ("path/to/filename.md", "# Notes from client call", "text/markdown")), + ( + "files", + ("path/to/filename1.md", "## Studying anthropological records from the Fatimid caliphate", "text/markdown"), ), - "files": ("path/to/filename2.md", "**Understanding science through the lens of art**", "text/markdown"), - } + ("files", ("path/to/filename2.md", "**Understanding science 
through the lens of art**", "text/markdown")), + ] diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index d0d05bc5..f1f26d28 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -1,53 +1,12 @@ # Standard Packages import re from datetime import datetime -from math import inf # External Packages import pytest # Internal Packages from khoj.search_filter.date_filter import DateFilter -from khoj.utils.rawconfig import Entry - - -@pytest.mark.filterwarnings("ignore:The localize method is no longer necessary.") -def test_date_filter(): - entries = [ - Entry(compiled="Entry with no date", raw="Entry with no date"), - Entry(compiled="April Fools entry: 1984-04-01", raw="April Fools entry: 1984-04-01"), - Entry(compiled="Entry with date:1984-04-02", raw="Entry with date:1984-04-02"), - ] - - q_with_no_date_filter = "head tail" - ret_query, entry_indices = DateFilter().apply(q_with_no_date_filter, entries) - assert ret_query == "head tail" - assert entry_indices == {0, 1, 2} - - q_with_dtrange_non_overlapping_at_boundary = 'head dt>"1984-04-01" dt<"1984-04-02" tail' - ret_query, entry_indices = DateFilter().apply(q_with_dtrange_non_overlapping_at_boundary, entries) - assert ret_query == "head tail" - assert entry_indices == set() - - query_with_overlapping_dtrange = 'head dt>"1984-04-01" dt<"1984-04-03" tail' - ret_query, entry_indices = DateFilter().apply(query_with_overlapping_dtrange, entries) - assert ret_query == "head tail" - assert entry_indices == {2} - - query_with_overlapping_dtrange = 'head dt>="1984-04-01" dt<"1984-04-02" tail' - ret_query, entry_indices = DateFilter().apply(query_with_overlapping_dtrange, entries) - assert ret_query == "head tail" - assert entry_indices == {1} - - query_with_overlapping_dtrange = 'head dt>"1984-04-01" dt<="1984-04-02" tail' - ret_query, entry_indices = DateFilter().apply(query_with_overlapping_dtrange, entries) - assert ret_query == "head tail" - assert entry_indices == {2} - - 
query_with_overlapping_dtrange = 'head dt>="1984-04-01" dt<="1984-04-02" tail' - ret_query, entry_indices = DateFilter().apply(query_with_overlapping_dtrange, entries) - assert ret_query == "head tail" - assert entry_indices == {1, 2} @pytest.mark.filterwarnings("ignore:The localize method is no longer necessary.") @@ -56,8 +15,8 @@ def test_extract_date_range(): datetime(1984, 1, 5, 0, 0, 0).timestamp(), datetime(1984, 1, 7, 0, 0, 0).timestamp(), ] - assert DateFilter().extract_date_range('head dt<="1984-01-01"') == [0, datetime(1984, 1, 2, 0, 0, 0).timestamp()] - assert DateFilter().extract_date_range('head dt>="1984-01-01"') == [datetime(1984, 1, 1, 0, 0, 0).timestamp(), inf] + assert DateFilter().extract_date_range('head dt<="1984-01-01"') == [None, datetime(1984, 1, 2, 0, 0, 0).timestamp()] + assert DateFilter().extract_date_range('head dt>="1984-01-01"') == [datetime(1984, 1, 1, 0, 0, 0).timestamp(), None] assert DateFilter().extract_date_range('head dt:"1984-01-01"') == [ datetime(1984, 1, 1, 0, 0, 0).timestamp(), datetime(1984, 1, 2, 0, 0, 0).timestamp(), diff --git a/tests/test_file_filter.py b/tests/test_file_filter.py index ed632d32..f5a903f8 100644 --- a/tests/test_file_filter.py +++ b/tests/test_file_filter.py @@ -6,97 +6,73 @@ from khoj.utils.rawconfig import Entry def test_no_file_filter(): # Arrange file_filter = FileFilter() - entries = arrange_content() q_with_no_filter = "head tail" # Act can_filter = file_filter.can_filter(q_with_no_filter) - ret_query, entry_indices = file_filter.apply(q_with_no_filter, entries) # Assert assert can_filter == False - assert ret_query == "head tail" - assert entry_indices == {0, 1, 2, 3} def test_file_filter_with_non_existent_file(): # Arrange file_filter = FileFilter() - entries = arrange_content() q_with_no_filter = 'head file:"nonexistent.org" tail' # Act can_filter = file_filter.can_filter(q_with_no_filter) - ret_query, entry_indices = file_filter.apply(q_with_no_filter, entries) # Assert assert can_filter == 
True - assert ret_query == "head tail" - assert entry_indices == {} def test_single_file_filter(): # Arrange file_filter = FileFilter() - entries = arrange_content() q_with_no_filter = 'head file:"file 1.org" tail' # Act can_filter = file_filter.can_filter(q_with_no_filter) - ret_query, entry_indices = file_filter.apply(q_with_no_filter, entries) # Assert assert can_filter == True - assert ret_query == "head tail" - assert entry_indices == {0, 2} def test_file_filter_with_partial_match(): # Arrange file_filter = FileFilter() - entries = arrange_content() q_with_no_filter = 'head file:"1.org" tail' # Act can_filter = file_filter.can_filter(q_with_no_filter) - ret_query, entry_indices = file_filter.apply(q_with_no_filter, entries) # Assert assert can_filter == True - assert ret_query == "head tail" - assert entry_indices == {0, 2} def test_file_filter_with_regex_match(): # Arrange file_filter = FileFilter() - entries = arrange_content() q_with_no_filter = 'head file:"*.org" tail' # Act can_filter = file_filter.can_filter(q_with_no_filter) - ret_query, entry_indices = file_filter.apply(q_with_no_filter, entries) # Assert assert can_filter == True - assert ret_query == "head tail" - assert entry_indices == {0, 1, 2, 3} def test_multiple_file_filter(): # Arrange file_filter = FileFilter() - entries = arrange_content() q_with_no_filter = 'head tail file:"file 1.org" file:"file2.org"' # Act can_filter = file_filter.can_filter(q_with_no_filter) - ret_query, entry_indices = file_filter.apply(q_with_no_filter, entries) # Assert assert can_filter == True - assert ret_query == "head tail" - assert entry_indices == {0, 1, 2, 3} def test_get_file_filter_terms(): @@ -108,7 +84,7 @@ def test_get_file_filter_terms(): filter_terms = file_filter.get_filter_terms(q_with_filter_terms) # Assert - assert filter_terms == ['file:"file 1.org"', 'file:"/path/to/dir/*.org"'] + assert filter_terms == ["file 1\\.org", "/path/to/dir/.*\\.org"] def arrange_content(): diff --git 
a/tests/test_gpt4all_chat_director.py b/tests/test_gpt4all_chat_director.py index 3e72a7e2..d978fc99 100644 --- a/tests/test_gpt4all_chat_director.py +++ b/tests/test_gpt4all_chat_director.py @@ -9,8 +9,7 @@ from faker import Faker # Internal Packages from khoj.processor.conversation import prompts from khoj.processor.conversation.utils import message_to_log -from khoj.utils import state - +from tests.helpers import ConversationFactory SKIP_TESTS = True pytestmark = pytest.mark.skipif( @@ -23,7 +22,7 @@ fake = Faker() # Helpers # ---------------------------------------------------------------------------------------------------- -def populate_chat_history(message_list): +def populate_chat_history(message_list, user): # Generate conversation logs conversation_log = {"chat": []} for user_message, llm_message, context in message_list: @@ -33,14 +32,15 @@ def populate_chat_history(message_list): {"context": context, "intent": {"query": user_message, "inferred-queries": f'["{user_message}"]'}}, ) - # Update Conversation Metadata Logs in Application State - state.processor_config.conversation.meta_log = conversation_log + # Update Conversation Metadata Logs in Database + ConversationFactory(user=user, conversation_log=conversation_log) # Tests # ---------------------------------------------------------------------------------------------------- @pytest.mark.xfail(AssertionError, reason="Chat director not capable of answering this question yet") @pytest.mark.chatquality +@pytest.mark.django_db(transaction=True) def test_chat_with_no_chat_history_or_retrieved_content_gpt4all(client_offline_chat): # Act response = client_offline_chat.get(f'/api/chat?q="Hello, my name is Testatron. 
Who are you?"&stream=true') @@ -56,13 +56,14 @@ def test_chat_with_no_chat_history_or_retrieved_content_gpt4all(client_offline_c # ---------------------------------------------------------------------------------------------------- @pytest.mark.chatquality -def test_answer_from_chat_history(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_answer_from_chat_history(client_offline_chat, default_user2): # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []), ("When was I born?", "You were born on 1st April 1984.", []), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = client_offline_chat.get(f'/api/chat?q="What is my name?"&stream=true') @@ -78,7 +79,8 @@ def test_answer_from_chat_history(client_offline_chat): # ---------------------------------------------------------------------------------------------------- @pytest.mark.chatquality -def test_answer_from_currently_retrieved_content(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_answer_from_currently_retrieved_content(client_offline_chat, default_user2): # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. 
How can I help?", []), @@ -88,7 +90,7 @@ def test_answer_from_currently_retrieved_content(client_offline_chat): ["Testatron was born on 1st April 1984 in Testville."], ), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = client_offline_chat.get(f'/api/chat?q="Where was Xi Li born?"') @@ -101,7 +103,8 @@ def test_answer_from_currently_retrieved_content(client_offline_chat): # ---------------------------------------------------------------------------------------------------- @pytest.mark.chatquality -def test_answer_from_chat_history_and_previously_retrieved_content(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_answer_from_chat_history_and_previously_retrieved_content(client_offline_chat, default_user2): # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []), @@ -111,7 +114,7 @@ def test_answer_from_chat_history_and_previously_retrieved_content(client_offlin ["Testatron was born on 1st April 1984 in Testville."], ), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = client_offline_chat.get(f'/api/chat?q="Where was I born?"') @@ -130,13 +133,14 @@ def test_answer_from_chat_history_and_previously_retrieved_content(client_offlin reason="Chat director not capable of answering this question yet because it requires extract_questions", ) @pytest.mark.chatquality -def test_answer_from_chat_history_and_currently_retrieved_content(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_answer_from_chat_history_and_currently_retrieved_content(client_offline_chat, default_user2): # Arrange message_list = [ ("Hello, my name is Xi Li. Who are you?", "Hi, I am Khoj, a personal assistant. 
How can I help?", []), ("When was I born?", "You were born on 1st April 1984.", []), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = client_offline_chat.get(f'/api/chat?q="Where was I born?"') @@ -154,14 +158,15 @@ def test_answer_from_chat_history_and_currently_retrieved_content(client_offline # ---------------------------------------------------------------------------------------------------- @pytest.mark.xfail(AssertionError, reason="Chat director not capable of answering this question yet") @pytest.mark.chatquality -def test_no_answer_in_chat_history_or_retrieved_content(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_no_answer_in_chat_history_or_retrieved_content(client_offline_chat, default_user2): "Chat director should say don't know as not enough contexts in chat history or retrieved to answer question" # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. 
How can I help?", []), ("When was I born?", "You were born on 1st April 1984.", []), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = client_offline_chat.get(f'/api/chat?q="Where was I born?"&stream=true') @@ -177,11 +182,12 @@ def test_no_answer_in_chat_history_or_retrieved_content(client_offline_chat): # ---------------------------------------------------------------------------------------------------- @pytest.mark.chatquality -def test_answer_using_general_command(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_answer_using_general_command(client_offline_chat, default_user2): # Arrange query = urllib.parse.quote("/general Where was Xi Li born?") message_list = [] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = client_offline_chat.get(f"/api/chat?q={query}&stream=true") @@ -194,11 +200,12 @@ def test_answer_using_general_command(client_offline_chat): # ---------------------------------------------------------------------------------------------------- @pytest.mark.chatquality -def test_answer_from_retrieved_content_using_notes_command(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_answer_from_retrieved_content_using_notes_command(client_offline_chat, default_user2): # Arrange query = urllib.parse.quote("/notes Where was Xi Li born?") message_list = [] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = client_offline_chat.get(f"/api/chat?q={query}&stream=true") @@ -211,12 +218,13 @@ def test_answer_from_retrieved_content_using_notes_command(client_offline_chat): # ---------------------------------------------------------------------------------------------------- @pytest.mark.chatquality -def test_answer_using_file_filter(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def 
test_answer_using_file_filter(client_offline_chat, default_user2): # Arrange no_answer_query = urllib.parse.quote('Where was Xi Li born? file:"Namita.markdown"') answer_query = urllib.parse.quote('Where was Xi Li born? file:"Xi Li.markdown"') message_list = [] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act no_answer_response = client_offline_chat.get(f"/api/chat?q={no_answer_query}&stream=true").content.decode("utf-8") @@ -229,11 +237,12 @@ def test_answer_using_file_filter(client_offline_chat): # ---------------------------------------------------------------------------------------------------- @pytest.mark.chatquality -def test_answer_not_known_using_notes_command(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_answer_not_known_using_notes_command(client_offline_chat, default_user2): # Arrange query = urllib.parse.quote("/notes Where was Testatron born?") message_list = [] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = client_offline_chat.get(f"/api/chat?q={query}&stream=true") @@ -247,6 +256,7 @@ def test_answer_not_known_using_notes_command(client_offline_chat): # ---------------------------------------------------------------------------------------------------- @pytest.mark.xfail(AssertionError, reason="Chat director not capable of answering time aware questions yet") @pytest.mark.chatquality +@pytest.mark.django_db(transaction=True) @freeze_time("2023-04-01") def test_answer_requires_current_date_awareness(client_offline_chat): "Chat actor should be able to answer questions relative to current date using provided notes" @@ -265,6 +275,7 @@ def test_answer_requires_current_date_awareness(client_offline_chat): # ---------------------------------------------------------------------------------------------------- @pytest.mark.xfail(AssertionError, reason="Chat director not capable of answering this question yet") 
@pytest.mark.chatquality +@pytest.mark.django_db(transaction=True) @freeze_time("2023-04-01") def test_answer_requires_date_aware_aggregation_across_provided_notes(client_offline_chat): "Chat director should be able to answer questions that require date aware aggregation across multiple notes" @@ -280,14 +291,15 @@ def test_answer_requires_date_aware_aggregation_across_provided_notes(client_off # ---------------------------------------------------------------------------------------------------- @pytest.mark.xfail(AssertionError, reason="Chat director not capable of answering this question yet") @pytest.mark.chatquality -def test_answer_general_question_not_in_chat_history_or_retrieved_content(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_answer_general_question_not_in_chat_history_or_retrieved_content(client_offline_chat, default_user2): # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []), ("When was I born?", "You were born on 1st April 1984.", []), ("Where was I born?", "You were born Testville.", []), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = client_offline_chat.get( @@ -307,7 +319,8 @@ def test_answer_general_question_not_in_chat_history_or_retrieved_content(client # ---------------------------------------------------------------------------------------------------- @pytest.mark.xfail(reason="Chat director not consistently capable of asking for clarification yet.") @pytest.mark.chatquality -def test_ask_for_clarification_if_not_enough_context_in_question(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_ask_for_clarification_if_not_enough_context_in_question(client_offline_chat, default_user2): # Act response = client_offline_chat.get(f'/api/chat?q="What is the name of Namitas older son"&stream=true') response_message = response.content.decode("utf-8") @@ -328,14 
+341,15 @@ def test_ask_for_clarification_if_not_enough_context_in_question(client_offline_ # ---------------------------------------------------------------------------------------------------- @pytest.mark.xfail(reason="Chat director not capable of answering this question yet") @pytest.mark.chatquality -def test_answer_in_chat_history_beyond_lookback_window(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_answer_in_chat_history_beyond_lookback_window(client_offline_chat, default_user2): # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []), ("When was I born?", "You were born on 1st April 1984.", []), ("Where was I born?", "You were born Testville.", []), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = client_offline_chat.get(f'/api/chat?q="What is my name?"&stream=true') @@ -350,11 +364,12 @@ def test_answer_in_chat_history_beyond_lookback_window(client_offline_chat): @pytest.mark.chatquality -def test_answer_chat_history_very_long(client_offline_chat): +@pytest.mark.django_db(transaction=True) +def test_answer_chat_history_very_long(client_offline_chat, default_user2): # Arrange message_list = [(" ".join([fake.paragraph() for _ in range(50)]), fake.sentence(), []) for _ in range(10)] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = client_offline_chat.get(f'/api/chat?q="What is my name?"&stream=true') @@ -368,6 +383,7 @@ def test_answer_chat_history_very_long(client_offline_chat): # ---------------------------------------------------------------------------------------------------- @pytest.mark.xfail(AssertionError, reason="Chat director not capable of answering this question yet") @pytest.mark.chatquality +@pytest.mark.django_db(transaction=True) def test_answer_requires_multiple_independent_searches(client_offline_chat): "Chat director 
should be able to answer by doing multiple independent searches for required information" # Act diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 622592b1..fdd29b02 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,3 +1,14 @@ +# Standard Packages +import numpy as np +import psutil +from scipy.stats import linregress +import secrets + +# External Packages +import pytest + +# Internal Packages +from khoj.processor.embeddings import EmbeddingsModel from khoj.utils import helpers @@ -44,3 +55,31 @@ def test_lru_cache(): cache["b"] # accessing 'b' makes it the most recently used item cache["d"] = 4 # so 'c' is deleted from the cache instead of 'b' assert cache == {"b": 2, "d": 4} + + +@pytest.mark.skip(reason="Memory leak exists on GPU, MPS devices") +def test_encode_docs_memory_leak(): + # Arrange + iterations = 50 + batch_size = 20 + embeddings_model = EmbeddingsModel() + memory_usage_trend = [] + device = f"{helpers.get_device()}".upper() + + # Act + # Encode random strings repeatedly and record memory usage trend + for iteration in range(iterations): + random_docs = [" ".join(secrets.token_hex(5) for _ in range(10)) for _ in range(batch_size)] + a = [embeddings_model.embed_documents(random_docs)] + memory_usage_trend += [psutil.Process().memory_info().rss / (1024 * 1024)] + print(f"{iteration:02d}, {memory_usage_trend[-1]:.2f}", flush=True) + + # Calculate slope of line fitting memory usage history + memory_usage_trend = np.array(memory_usage_trend) + slope, _, _, _, _ = linregress(np.arange(len(memory_usage_trend)), memory_usage_trend) + print(f"Memory usage increased at ~{slope:.2f} MB per iteration on {device}") + + # Assert + # If slope is positive memory utilization is increasing + # Positive threshold of 2, from observing memory usage trend on MPS vs CPU device + assert slope < 2, f"Memory leak suspected on {device}. 
Memory usage increased at ~{slope:.2f} MB per iteration" diff --git a/tests/test_jsonl_to_jsonl.py b/tests/test_jsonl_to_jsonl.py deleted file mode 100644 index b52b5fc9..00000000 --- a/tests/test_jsonl_to_jsonl.py +++ /dev/null @@ -1,78 +0,0 @@ -# Internal Packages -from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl -from khoj.utils.rawconfig import Entry - - -def test_process_entries_from_single_input_jsonl(tmp_path): - "Convert multiple jsonl entries from single file to entries." - # Arrange - input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"} -{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"} -""" - input_jsonl_file = create_file(tmp_path, input_jsonl) - - # Act - # Process Each Entry from All Notes Files - input_jsons = JsonlToJsonl.extract_jsonl_entries([input_jsonl_file]) - entries = list(map(Entry.from_dict, input_jsons)) - output_jsonl = JsonlToJsonl.convert_entries_to_jsonl(entries) - - # Assert - assert len(entries) == 2 - assert output_jsonl == input_jsonl - - -def test_process_entries_from_multiple_input_jsonls(tmp_path): - "Convert multiple jsonl entries from single file to entries." 
- # Arrange - input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}""" - input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}""" - input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl") - input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl") - - # Act - # Process Each Entry from All Notes Files - input_jsons = JsonlToJsonl.extract_jsonl_entries([input_jsonl_file_1, input_jsonl_file_2]) - entries = list(map(Entry.from_dict, input_jsons)) - output_jsonl = JsonlToJsonl.convert_entries_to_jsonl(entries) - - # Assert - assert len(entries) == 2 - assert output_jsonl == f"{input_jsonl_1}\n{input_jsonl_2}\n" - - -def test_get_jsonl_files(tmp_path): - "Ensure JSONL files specified via input-filter, input-files extracted" - # Arrange - # Include via input-filter globs - group1_file1 = create_file(tmp_path, filename="group1-file1.jsonl") - group1_file2 = create_file(tmp_path, filename="group1-file2.jsonl") - group2_file1 = create_file(tmp_path, filename="group2-file1.jsonl") - group2_file2 = create_file(tmp_path, filename="group2-file2.jsonl") - # Include via input-file field - file1 = create_file(tmp_path, filename="notes.jsonl") - # Not included by any filter - create_file(tmp_path, filename="not-included-jsonl.jsonl") - create_file(tmp_path, filename="not-included-text.txt") - - expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1])) - - # Setup input-files, input-filters - input_files = [tmp_path / "notes.jsonl"] - input_filter = [tmp_path / "group1*.jsonl", tmp_path / "group2*.jsonl"] - - # Act - extracted_org_files = JsonlToJsonl.get_jsonl_files(input_files, input_filter) - - # Assert - assert len(extracted_org_files) == 5 - assert extracted_org_files == expected_files - - -# Helper Functions -def create_file(tmp_path, 
entry=None, filename="test.jsonl"): - jsonl_file = tmp_path / filename - jsonl_file.touch() - if entry: - jsonl_file.write_text(entry) - return jsonl_file diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_entries.py similarity index 82% rename from tests/test_markdown_to_jsonl.py rename to tests/test_markdown_to_entries.py index a1a458ef..4593b23a 100644 --- a/tests/test_markdown_to_jsonl.py +++ b/tests/test_markdown_to_entries.py @@ -4,7 +4,7 @@ from pathlib import Path import os # Internal Packages -from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl +from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries from khoj.utils.fs_syncer import get_markdown_files from khoj.utils.rawconfig import TextContentConfig @@ -23,11 +23,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=data) + entry_nodes, file_to_entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) # Process Each Entry from All Notes Files - jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl( - MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries) + jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl( + MarkdownToEntries.convert_markdown_entries_to_maps(entry_nodes, file_to_entries) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -52,11 +52,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data) + entries, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data) # Process Each Entry from All Notes Files - jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl( - 
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map) + jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl( + MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -81,11 +81,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data) - entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map) + entry_strings, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data) + entries = MarkdownToEntries.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map) # Process Each Entry from All Notes Files - jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) + jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -144,7 +144,7 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Act # Extract Entries from specified Markdown files - entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=data) + entries, _ = MarkdownToEntries.extract_markdown_entries(markdown_files=data) # Assert assert len(entries) == 2 diff --git a/tests/test_multiple_users.py b/tests/test_multiple_users.py new file mode 100644 index 00000000..95a2535f --- /dev/null +++ b/tests/test_multiple_users.py @@ -0,0 +1,111 @@ +# Standard Modules +from io import BytesIO +from PIL import Image +from urllib.parse import quote +import pytest + +# External Packages +from fastapi.testclient import TestClient +from fastapi import FastAPI, UploadFile +from io import BytesIO +import pytest + +# Internal Packages +from khoj.configure import configure_routes, configure_search_types +from khoj.utils 
import state +from khoj.utils.state import search_models, content_index, config +from khoj.search_type import text_search, image_search +from khoj.utils.rawconfig import ContentConfig, SearchConfig +from khoj.processor.org_mode.org_to_entries import OrgToEntries +from database.models import KhojUser, KhojApiUser +from database.adapters import EntryAdapters + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) +def test_search_for_user2_returns_empty(client, api_user2: KhojApiUser): + token = api_user2.token + headers = {"Authorization": f"Bearer {token}"} + for content_type in ["all", "org", "markdown", "pdf", "github", "notion", "plaintext"]: + # Act + response = client.get(f"/api/search?q=random&t={content_type}", headers=headers) + # Assert + assert response.text == "[]" + assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}" + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) +def test_index_update_with_user2(client, api_user2: KhojApiUser): + # Arrange + files = get_sample_files_data() + source_file_symbol = set([f[1][0] for f in files]) + + headers = {"Authorization": f"Bearer {api_user2.token}"} + update_response = client.post("/api/v1/index/update", files=files, headers=headers) + search_response = client.get("/api/search?q=hardware&t=all", headers=headers) + results = search_response.json() + + # Assert + assert update_response.status_code == 200 + assert len(results) == 5 + for result in results: + assert result["additional"]["file"] in source_file_symbol + + +@pytest.mark.django_db(transaction=True) +def test_index_update_with_user2_inaccessible_user1(client, api_user2: KhojApiUser, api_user: KhojApiUser): + # Arrange + files = get_sample_files_data() + source_file_symbol = set([f[1][0] for f in files]) + + 
headers = {"Authorization": f"Bearer {api_user2.token}"} + update_response = client.post("/api/v1/index/update", files=files, headers=headers) + + # Act + headers = {"Authorization": f"Bearer {api_user.token}"} + search_response = client.get("/api/search?q=hardware&t=all", headers=headers) + results = search_response.json() + + # Assert + assert update_response.status_code == 200 + assert len(results) == 4 + for result in results: + assert result["additional"]["file"] not in source_file_symbol + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) +def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser): + # Arrange + headers = {"Authorization": "Bearer kk-token"} # Token for default_user2 + text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user) + user_query = quote("How to git install application?") + + # Act + response = client.get(f"/api/search?q={user_query}&n=1&t=org", headers=headers) + + # Assert + assert response.status_code == 403 + # assert actual response has no data as the default_user is different from the user making the query (anonymous) + assert len(response.json()) == 1 and response.json()["detail"] == "Forbidden" + + +def get_sample_files_data(): + return [ + ("files", ("path/to/filename.org", "* practicing piano", "text/org")), + ("files", ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org")), + ("files", ("path/to/filename2.org", "* how to build a search engine", "text/org")), + ("files", ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf")), + ("files", ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf")), + ("files", ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf")), + ("files", ("path/to/filename.txt", "data,column,value", "text/plain")), + ("files", 
("path/to/filename1.txt", "my first web page", "text/plain")), + ("files", ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain")), + ("files", ("path/to/filename.md", "# Notes from client call", "text/markdown")), + ( + "files", + ("path/to/filename1.md", "## Studying anthropological records from the Fatimid caliphate", "text/markdown"), + ), + ("files", ("path/to/filename2.md", "**Understanding science through the lens of art**", "text/markdown")), + ] diff --git a/tests/test_openai_chat_director.py b/tests/test_openai_chat_director.py index abbd1831..07c4e0d8 100644 --- a/tests/test_openai_chat_director.py +++ b/tests/test_openai_chat_director.py @@ -9,8 +9,8 @@ from khoj.processor.conversation import prompts # Internal Packages from khoj.processor.conversation.utils import message_to_log -from khoj.utils import state - +from tests.helpers import ConversationFactory +from database.models import KhojUser # Initialize variables for tests api_key = os.getenv("OPENAI_API_KEY") @@ -23,7 +23,7 @@ if api_key is None: # Helpers # ---------------------------------------------------------------------------------------------------- -def populate_chat_history(message_list): +def populate_chat_history(message_list, user=None): # Generate conversation logs conversation_log = {"chat": []} for user_message, gpt_message, context in message_list: @@ -33,13 +33,14 @@ def populate_chat_history(message_list): {"context": context, "intent": {"query": user_message, "inferred-queries": f'["{user_message}"]'}}, ) - # Update Conversation Metadata Logs in Application State - state.processor_config.conversation.meta_log = conversation_log + # Update Conversation Metadata Logs in Database + ConversationFactory(user=user, conversation_log=conversation_log) # Tests # ---------------------------------------------------------------------------------------------------- @pytest.mark.chatquality +@pytest.mark.django_db(transaction=True) def 
test_chat_with_no_chat_history_or_retrieved_content(chat_client): # Act response = chat_client.get(f'/api/chat?q="Hello, my name is Testatron. Who are you?"&stream=true') @@ -54,14 +55,15 @@ def test_chat_with_no_chat_history_or_retrieved_content(chat_client): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality -def test_answer_from_chat_history(chat_client): +def test_answer_from_chat_history(chat_client, default_user2: KhojUser): # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []), ("When was I born?", "You were born on 1st April 1984.", []), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = chat_client.get(f'/api/chat?q="What is my name?"&stream=true') @@ -76,8 +78,9 @@ def test_answer_from_chat_history(chat_client): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality -def test_answer_from_currently_retrieved_content(chat_client): +def test_answer_from_currently_retrieved_content(chat_client, default_user2: KhojUser): # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. 
How can I help?", []), @@ -87,7 +90,7 @@ def test_answer_from_currently_retrieved_content(chat_client): ["Testatron was born on 1st April 1984 in Testville."], ), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = chat_client.get(f'/api/chat?q="Where was Xi Li born?"') @@ -99,8 +102,9 @@ def test_answer_from_currently_retrieved_content(chat_client): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality -def test_answer_from_chat_history_and_previously_retrieved_content(chat_client): +def test_answer_from_chat_history_and_previously_retrieved_content(chat_client_no_background, default_user2: KhojUser): # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []), @@ -110,10 +114,10 @@ def test_answer_from_chat_history_and_previously_retrieved_content(chat_client): ["Testatron was born on 1st April 1984 in Testville."], ), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act - response = chat_client.get(f'/api/chat?q="Where was I born?"') + response = chat_client_no_background.get(f'/api/chat?q="Where was I born?"') response_message = response.content.decode("utf-8") # Assert @@ -125,14 +129,15 @@ def test_answer_from_chat_history_and_previously_retrieved_content(chat_client): # ---------------------------------------------------------------------------------------------------- @pytest.mark.xfail(AssertionError, reason="Chat director not capable of answering this question yet") +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality -def test_answer_from_chat_history_and_currently_retrieved_content(chat_client): +def test_answer_from_chat_history_and_currently_retrieved_content(chat_client, default_user2: KhojUser): # Arrange message_list = [ ("Hello, my name is Xi Li. 
Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []), ("When was I born?", "You were born on 1st April 1984.", []), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = chat_client.get(f'/api/chat?q="Where was I born?"') @@ -148,15 +153,16 @@ def test_answer_from_chat_history_and_currently_retrieved_content(chat_client): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality -def test_no_answer_in_chat_history_or_retrieved_content(chat_client): +def test_no_answer_in_chat_history_or_retrieved_content(chat_client, default_user2: KhojUser): "Chat director should say don't know as not enough contexts in chat history or retrieved to answer question" # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []), ("When was I born?", "You were born on 1st April 1984.", []), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = chat_client.get(f'/api/chat?q="Where was I born?"&stream=true') @@ -171,12 +177,13 @@ def test_no_answer_in_chat_history_or_retrieved_content(chat_client): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality -def test_answer_using_general_command(chat_client): +def test_answer_using_general_command(chat_client, default_user2: KhojUser): # Arrange query = urllib.parse.quote("/general Where was Xi Li born?") message_list = [] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = chat_client.get(f"/api/chat?q={query}&stream=true") @@ -188,12 +195,13 @@ def test_answer_using_general_command(chat_client): # 
---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality -def test_answer_from_retrieved_content_using_notes_command(chat_client): +def test_answer_from_retrieved_content_using_notes_command(chat_client, default_user2: KhojUser): # Arrange query = urllib.parse.quote("/notes Where was Xi Li born?") message_list = [] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = chat_client.get(f"/api/chat?q={query}&stream=true") @@ -205,24 +213,26 @@ def test_answer_from_retrieved_content_using_notes_command(chat_client): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality -def test_answer_not_known_using_notes_command(chat_client): +def test_answer_not_known_using_notes_command(chat_client_no_background, default_user2: KhojUser): # Arrange query = urllib.parse.quote("/notes Where was Testatron born?") message_list = [] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act - response = chat_client.get(f"/api/chat?q={query}&stream=true") + response = chat_client_no_background.get(f"/api/chat?q={query}&stream=true") response_message = response.content.decode("utf-8") # Assert assert response.status_code == 200 - assert response_message == prompts.no_notes_found.format() + assert response_message == prompts.no_entries_found.format() # ---------------------------------------------------------------------------------------------------- @pytest.mark.xfail(AssertionError, reason="Chat director not capable of answering time aware questions yet") +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality @freeze_time("2023-04-01") def test_answer_requires_current_date_awareness(chat_client): @@ -240,11 +250,13 @@ def 
test_answer_requires_current_date_awareness(chat_client): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality @freeze_time("2023-04-01") def test_answer_requires_date_aware_aggregation_across_provided_notes(chat_client): "Chat director should be able to answer questions that require date aware aggregation across multiple notes" # Act + response = chat_client.get(f'/api/chat?q="How much did I spend on dining this year?"&stream=true') response_message = response.content.decode("utf-8") @@ -254,15 +266,16 @@ def test_answer_requires_date_aware_aggregation_across_provided_notes(chat_clien # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality -def test_answer_general_question_not_in_chat_history_or_retrieved_content(chat_client): +def test_answer_general_question_not_in_chat_history_or_retrieved_content(chat_client, default_user2: KhojUser): # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. 
How can I help?", []), ("When was I born?", "You were born on 1st April 1984.", []), ("Where was I born?", "You were born Testville.", []), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = chat_client.get( @@ -280,10 +293,12 @@ def test_answer_general_question_not_in_chat_history_or_retrieved_content(chat_c # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality -def test_ask_for_clarification_if_not_enough_context_in_question(chat_client): +def test_ask_for_clarification_if_not_enough_context_in_question(chat_client_no_background): # Act - response = chat_client.get(f'/api/chat?q="What is the name of Namitas older son"&stream=true') + + response = chat_client_no_background.get(f'/api/chat?q="What is the name of Namitas older son"&stream=true') response_message = response.content.decode("utf-8") # Assert @@ -292,6 +307,8 @@ def test_ask_for_clarification_if_not_enough_context_in_question(chat_client): "which one is", "which of namita's sons", "the birth order", + "provide more context", + "provide me with more context", ] assert response.status_code == 200 assert any([expected_response in response_message.lower() for expected_response in expected_responses]), ( @@ -301,15 +318,16 @@ def test_ask_for_clarification_if_not_enough_context_in_question(chat_client): # ---------------------------------------------------------------------------------------------------- @pytest.mark.xfail(reason="Chat director not capable of answering this question yet") +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality -def test_answer_in_chat_history_beyond_lookback_window(chat_client): +def test_answer_in_chat_history_beyond_lookback_window(chat_client, default_user2: KhojUser): # Arrange message_list = [ ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. 
How can I help?", []), ("When was I born?", "You were born on 1st April 1984.", []), ("Where was I born?", "You were born Testville.", []), ] - populate_chat_history(message_list) + populate_chat_history(message_list, default_user2) # Act response = chat_client.get(f'/api/chat?q="What is my name?"&stream=true') @@ -324,6 +342,7 @@ def test_answer_in_chat_history_beyond_lookback_window(chat_client): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) @pytest.mark.chatquality def test_answer_requires_multiple_independent_searches(chat_client): "Chat director should be able to answer by doing multiple independent searches for required information" @@ -340,10 +359,12 @@ def test_answer_requires_multiple_independent_searches(chat_client): # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) def test_answer_using_file_filter(chat_client): "Chat should be able to use search filters in the query" # Act query = urllib.parse.quote('Is Xi older than Namita? 
file:"Namita.markdown" file:"Xi Li.markdown"') + response = chat_client.get(f"/api/chat?q={query}&stream=true") response_message = response.content.decode("utf-8") diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_entries.py similarity index 80% rename from tests/test_org_to_jsonl.py rename to tests/test_org_to_entries.py index abf20d09..1eddcf95 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_entries.py @@ -3,8 +3,8 @@ import json import os # Internal Packages -from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl -from khoj.processor.text_to_jsonl import TextToJsonl +from khoj.processor.org_mode.org_to_entries import OrgToEntries +from khoj.processor.text_to_entries import TextToEntries from khoj.utils.helpers import is_none_or_empty from khoj.utils.rawconfig import Entry from khoj.utils.fs_syncer import get_org_files @@ -29,9 +29,9 @@ def test_configure_heading_entry_to_jsonl(tmp_path): for index_heading_entries in [True, False]: # Act # Extract entries into jsonl from specified Org files - jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( - OrgToJsonl.convert_org_nodes_to_entries( - *OrgToJsonl.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries + jsonl_string = OrgToEntries.convert_org_entries_to_jsonl( + OrgToEntries.convert_org_nodes_to_entries( + *OrgToEntries.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries ) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -59,12 +59,12 @@ def test_entry_split_when_exceeds_max_words(tmp_path): # Act # Extract Entries from specified Org files - entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data) + entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data) # Split each entry from specified Org files by max words - jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( - TextToJsonl.split_entries_by_max_tokens( - 
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4 + jsonl_string = OrgToEntries.convert_org_entries_to_jsonl( + TextToEntries.split_entries_by_max_tokens( + OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4 ) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -86,7 +86,7 @@ def test_entry_split_drops_large_words(): # Act # Split entry by max words and drop words larger than max word length - processed_entry = TextToJsonl.split_entries_by_max_tokens([entry], max_word_length=5)[0] + processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0] # Assert # "Heading" dropped from compiled version because its over the set max word limit @@ -109,11 +109,11 @@ def test_entry_with_body_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data) + entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data) # Process Each Entry from All Notes Files - jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( - OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map) + jsonl_string = OrgToEntries.convert_org_entries_to_jsonl( + OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -136,11 +136,11 @@ Intro text # Act # Extract Entries from specified Org files - entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data) + entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data) # Process Each Entry from All Notes Files - entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries) - jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries) + entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries) + jsonl_string = 
OrgToEntries.convert_org_entries_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -160,11 +160,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data) + entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data) # Process Each Entry from All Notes Files - entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries) - jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries) + entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries) + jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -224,7 +224,7 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Act # Extract Entries from specified Org files - entries, _ = OrgToJsonl.extract_org_entries(org_files=data) + entries, _ = OrgToEntries.extract_org_entries(org_files=data) # Assert assert len(entries) == 2 diff --git a/tests/test_pdf_to_jsonl.py b/tests/test_pdf_to_entries.py similarity index 71% rename from tests/test_pdf_to_jsonl.py rename to tests/test_pdf_to_entries.py index b9b26986..3ab44639 100644 --- a/tests/test_pdf_to_jsonl.py +++ b/tests/test_pdf_to_entries.py @@ -3,7 +3,7 @@ import json import os # Internal Packages -from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl +from khoj.processor.pdf.pdf_to_entries import PdfToEntries from khoj.utils.fs_syncer import get_pdf_files from khoj.utils.rawconfig import TextContentConfig @@ -18,11 +18,11 @@ def test_single_page_pdf_to_jsonl(): pdf_bytes = f.read() data = {"tests/data/pdf/singlepage.pdf": pdf_bytes} - entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data) + entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data) # Process 
Each Entry from All Pdf Files - jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl( - PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map) + jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl( + PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -38,11 +38,11 @@ def test_multi_page_pdf_to_jsonl(): pdf_bytes = f.read() data = {"tests/data/pdf/multipage.pdf": pdf_bytes} - entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data) + entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data) # Process Each Entry from All Pdf Files - jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl( - PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map) + jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl( + PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -50,6 +50,23 @@ def test_multi_page_pdf_to_jsonl(): assert len(jsonl_data) == 6 +def test_ocr_page_pdf_to_jsonl(): + "Convert multiple pages from single PDF file to jsonl." 
+ # Act + # Extract Entries from specified Pdf files + with open("tests/data/pdf/ocr_samples.pdf", "rb") as f: + pdf_bytes = f.read() + + data = {"tests/data/pdf/ocr_samples.pdf": pdf_bytes} + entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data) + + # Process Each Entry from All Pdf Files + entries = PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map) + + assert len(entries) == 1 + assert "playing on a strip of marsh" in entries[0].raw + + def test_get_pdf_files(tmp_path): "Ensure Pdf files specified via input-filter, input-files extracted" # Arrange diff --git a/tests/test_plaintext_to_jsonl.py b/tests/test_plaintext_to_entries.py similarity index 85% rename from tests/test_plaintext_to_jsonl.py rename to tests/test_plaintext_to_entries.py index a6da30e1..23b0d652 100644 --- a/tests/test_plaintext_to_jsonl.py +++ b/tests/test_plaintext_to_entries.py @@ -6,7 +6,8 @@ from pathlib import Path # Internal Packages from khoj.utils.fs_syncer import get_plaintext_files from khoj.utils.rawconfig import TextContentConfig -from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl +from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries +from database.models import LocalPlaintextConfig, KhojUser def test_plaintext_file(tmp_path): @@ -26,14 +27,14 @@ def test_plaintext_file(tmp_path): f"{plaintextfile}": entry, } - maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entry_to_file_map=data) + maps = PlaintextToEntries.convert_plaintext_entries_to_maps(entry_to_file_map=data) # Convert each entry.file to absolute path to make them JSON serializable for map in maps: map.file = str(Path(map.file).absolute()) # Process Each Entry from All Notes Files - jsonl_string = PlaintextToJsonl.convert_entries_to_jsonl(maps) + jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(maps) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -91,14 +92,15 @@ def 
test_get_plaintext_files(tmp_path): assert set(extracted_plaintext_files.keys()) == set(expected_files) -def test_parse_html_plaintext_file(content_config): +def test_parse_html_plaintext_file(content_config, default_user: KhojUser): "Ensure HTML files are parsed correctly" # Arrange # Setup input-files, input-filters - extracted_plaintext_files = get_plaintext_files(content_config.plaintext) + config = LocalPlaintextConfig.objects.filter(user=default_user).first() + extracted_plaintext_files = get_plaintext_files(config=config) # Act - maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(extracted_plaintext_files) + maps = PlaintextToEntries.convert_plaintext_entries_to_maps(extracted_plaintext_files) # Assert assert len(maps) == 1 diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 179718fa..3d729ab5 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -1,25 +1,27 @@ # System Packages import logging -import locale from pathlib import Path import os +import asyncio # External Packages import pytest # Internal Packages -from khoj.utils.state import content_index, search_models from khoj.search_type import text_search -from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl -from khoj.processor.github.github_to_jsonl import GithubToJsonl -from khoj.utils.config import SearchModels -from khoj.utils.fs_syncer import get_org_files -from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig +from khoj.utils.rawconfig import ContentConfig, SearchConfig +from khoj.processor.org_mode.org_to_entries import OrgToEntries +from khoj.processor.github.github_to_entries import GithubToEntries +from khoj.utils.fs_syncer import collect_files, get_org_files +from database.models import LocalOrgConfig, KhojUser, Entry, GithubConfig + +logger = logging.getLogger(__name__) # Test # ---------------------------------------------------------------------------------------------------- -def 
test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig): +@pytest.mark.django_db +def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: LocalOrgConfig): # Arrange # Ensure file mentioned in org.input-files is missing single_new_file = Path(org_config_with_only_new_file.input_files[0]) @@ -32,98 +34,148 @@ def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_n # ---------------------------------------------------------------------------------------------------- -def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path: Path): +@pytest.mark.django_db +def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path, default_user: KhojUser): # Arrange orgfile = tmp_path / "directory.org" / "file.org" orgfile.parent.mkdir() with open(orgfile, "w") as f: f.write("* Heading\n- List item\n") - org_content_config = TextContentConfig( - input_filter=[f"{tmp_path}/**/*"], compressed_jsonl="test.jsonl", embeddings_file="test.pt" + + LocalOrgConfig.objects.create( + input_filter=[f"{tmp_path}/**/*"], + input_files=None, + user=default_user, ) # Act - # should not raise IsADirectoryError and return orgfile - assert get_org_files(org_content_config) == {f"{orgfile}": "* Heading\n- List item\n"} + org_files = collect_files(user=default_user)["org"] + + # Assert + # should return orgfile and not raise IsADirectoryError + assert org_files == {f"{orgfile}": "* Heading\n- List item\n"} # ---------------------------------------------------------------------------------------------------- -def test_text_search_setup_with_empty_file_raises_error( - org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig +@pytest.mark.django_db +def test_text_search_setup_with_empty_file_creates_no_entries( + org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog ): # Arrange data = get_org_files(org_config_with_only_new_file) + 
# Act # Generate notes embeddings during asymmetric setup - with pytest.raises(ValueError, match=r"^No valid entries found*"): - text_search.setup(OrgToJsonl, data, org_config_with_only_new_file, search_config.asymmetric, regenerate=True) - - -# ---------------------------------------------------------------------------------------------------- -def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels): - # Arrange - data = get_org_files(content_config.org) - - # Act - # Regenerate notes embeddings during asymmetric setup - notes_model = text_search.setup( - OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True - ) + with caplog.at_level(logging.INFO): + text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) # Assert - assert len(notes_model.entries) == 10 - assert len(notes_model.corpus_embeddings) == 10 + assert "Deleted 3 entries. Created 0 new entries for user " in caplog.records[-1].message + verify_embeddings(0, default_user) # ---------------------------------------------------------------------------------------------------- -def test_text_index_same_if_content_unchanged(content_config: ContentConfig, search_models: SearchModels, caplog): +@pytest.mark.django_db +def test_text_indexer_deletes_embedding_before_regenerate( + content_config: ContentConfig, default_user: KhojUser, caplog +): # Arrange - caplog.set_level(logging.INFO, logger="khoj") + org_config = LocalOrgConfig.objects.filter(user=default_user).first() + data = get_org_files(org_config) - data = get_org_files(content_config.org) + # Act + # Generate notes embeddings during asymmetric setup + with caplog.at_level(logging.DEBUG): + text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) + + # Assert + assert "Deleting all entries for file type org" in caplog.text + assert "Deleted 3 entries. 
Created 10 new entries for user " in caplog.records[-1].message + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db +def test_text_search_setup_batch_processes(content_config: ContentConfig, default_user: KhojUser, caplog): + # Arrange + org_config = LocalOrgConfig.objects.filter(user=default_user).first() + data = get_org_files(org_config) + + # Act + # Generate notes embeddings during asymmetric setup + with caplog.at_level(logging.DEBUG): + text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) + + # Assert + assert "Deleted 3 entries. Created 10 new entries for user " in caplog.records[-1].message + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db +def test_text_index_same_if_content_unchanged(content_config: ContentConfig, default_user: KhojUser, caplog): + # Arrange + org_config = LocalOrgConfig.objects.filter(user=default_user).first() + data = get_org_files(org_config) # Act # Generate initial notes embeddings during asymmetric setup - text_search.setup(OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True) + with caplog.at_level(logging.DEBUG): + text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) initial_logs = caplog.text caplog.clear() # Clear logs # Run asymmetric setup again with no changes to data source. Ensure index is not updated - text_search.setup(OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=False) + with caplog.at_level(logging.DEBUG): + text_search.setup(OrgToEntries, data, regenerate=False, user=default_user) final_logs = caplog.text # Assert - assert "Creating index from scratch." in initial_logs - assert "Creating index from scratch." 
not in final_logs + assert "Deleting all entries for file type org" in initial_logs + assert "Deleting all entries for file type org" not in final_logs # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db @pytest.mark.anyio -async def test_text_search(content_config: ContentConfig, search_config: SearchConfig): +# @pytest.mark.asyncio +async def test_text_search(search_config: SearchConfig): # Arrange - data = get_org_files(content_config.org) - - search_models.text_search = text_search.initialize_model(search_config.asymmetric) - content_index.org = text_search.setup( - OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True + default_user = await KhojUser.objects.acreate( + username="test_user", password="test_password", email="test@example.com" ) + org_config = await LocalOrgConfig.objects.acreate( + input_files=None, + input_filter=["tests/data/org/*.org"], + index_heading_entries=False, + user=default_user, + ) + data = get_org_files(org_config) + + loop = asyncio.get_event_loop() + await loop.run_in_executor( + None, + text_search.setup, + OrgToEntries, + data, + True, + True, + default_user, + ) + query = "How to git install application?" 
# Act - hits, entries = await text_search.query( - query, search_model=search_models.text_search, content=content_index.org, rank_results=True - ) - - results = text_search.collate_results(hits, entries, count=1) + hits = await text_search.query(default_user, query) + results = text_search.collate_results(hits) + results = sorted(results, key=lambda x: float(x.score))[:1] # Assert - # search results should contain "git clone" entry search_result = results[0].entry - assert "git clone" in search_result + assert "git clone" in search_result, 'search result did not contain "git clone" entry' # ---------------------------------------------------------------------------------------------------- -def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContentConfig, search_models: SearchModels): +@pytest.mark.django_db +def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog): # Arrange # Insert org-mode entry with size exceeding max token limit to new org file max_tokens = 256 @@ -137,47 +189,45 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContent # Act # reload embeddings, entries, notes model after adding new org-mode file - initial_notes_model = text_search.setup( - OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False - ) + with caplog.at_level(logging.INFO): + text_search.setup(OrgToEntries, data, regenerate=False, user=default_user) # Assert - # verify newly added org-mode entry is split by max tokens - assert len(initial_notes_model.entries) == 2 - assert len(initial_notes_model.corpus_embeddings) == 2 + assert ( + "Deleted 0 entries. 
Created 2 new entries for user " in caplog.records[-1].message + ), "new entry not split by max tokens" # ---------------------------------------------------------------------------------------------------- -# @pytest.mark.skip(reason="Flaky due to compressed_jsonl file being rewritten by other tests") +@pytest.mark.django_db def test_entry_chunking_by_max_tokens_not_full_corpus( - org_config_with_only_new_file: TextContentConfig, search_models: SearchModels + org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog ): # Arrange # Insert org-mode entry with size exceeding max token limit to new org file data = { "readme.org": """ * Khoj - /Allow natural language search on user content like notes, images using transformer based models/ +/Allow natural language search on user content like notes, images using transformer based models/ - All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline +All data is processed locally. 
User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline ** Dependencies - - Python3 - - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]] +- Python3 +- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]] ** Install - #+begin_src shell - git clone https://github.com/khoj-ai/khoj && cd khoj - conda env create -f environment.yml - conda activate khoj - #+end_src""" +#+begin_src shell +git clone https://github.com/khoj-ai/khoj && cd khoj +conda env create -f environment.yml +conda activate khoj +#+end_src""" } text_search.setup( - OrgToJsonl, + OrgToEntries, data, - org_config_with_only_new_file, - search_models.text_search.bi_encoder, regenerate=False, + user=default_user, ) max_tokens = 256 @@ -191,64 +241,58 @@ def test_entry_chunking_by_max_tokens_not_full_corpus( # Act # reload embeddings, entries, notes model after adding new org-mode file - initial_notes_model = text_search.setup( - OrgToJsonl, - data, - org_config_with_only_new_file, - search_models.text_search.bi_encoder, - regenerate=False, - full_corpus=False, - ) + with caplog.at_level(logging.INFO): + text_search.setup( + OrgToEntries, + data, + regenerate=False, + full_corpus=False, + user=default_user, + ) # Assert - # verify newly added org-mode entry is split by max tokens - assert len(initial_notes_model.entries) == 5 - assert len(initial_notes_model.corpus_embeddings) == 5 + assert ( + "Deleted 0 entries. 
Created 2 new entries for user " in caplog.records[-1].message + ), "new entry not split by max tokens" # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db def test_regenerate_index_with_new_entry( - content_config: ContentConfig, search_models: SearchModels, new_org_file: Path + content_config: ContentConfig, new_org_file: Path, default_user: KhojUser, caplog ): # Arrange - data = get_org_files(content_config.org) - initial_notes_model = text_search.setup( - OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True - ) - - assert len(initial_notes_model.entries) == 10 - assert len(initial_notes_model.corpus_embeddings) == 10 + org_config = LocalOrgConfig.objects.filter(user=default_user).first() + initial_data = get_org_files(org_config) # append org-mode entry to first org input file in config - content_config.org.input_files = [f"{new_org_file}"] + org_config.input_files = [f"{new_org_file}"] with open(new_org_file, "w") as f: f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n") - data = get_org_files(content_config.org) + final_data = get_org_files(org_config) # Act + with caplog.at_level(logging.INFO): + text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user) + initial_logs = caplog.text + caplog.clear() # Clear logs + # regenerate notes jsonl, model embeddings and model to include entry from new file - regenerated_notes_model = text_search.setup( - OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True - ) + with caplog.at_level(logging.INFO): + text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user) + final_logs = caplog.text # Assert - assert len(regenerated_notes_model.entries) == 11 - assert len(regenerated_notes_model.corpus_embeddings) == 11 - - # verify new entry appended to index, without disrupting 
order or content of existing entries - error_details = compare_index(initial_notes_model, regenerated_notes_model) - if error_details: - pytest.fail(error_details, False) - - # Cleanup - # reset input_files in config to empty list - content_config.org.input_files = [] + assert "Deleted 3 entries. Created 10 new entries for user " in initial_logs + assert "Deleted 10 entries. Created 11 new entries for user " in final_logs + verify_embeddings(11, default_user) # ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db def test_update_index_with_duplicate_entries_in_stable_order( - org_config_with_only_new_file: TextContentConfig, search_models: SearchModels + org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog ): # Arrange new_file_to_index = Path(org_config_with_only_new_file.input_files[0]) @@ -261,31 +305,30 @@ def test_update_index_with_duplicate_entries_in_stable_order( data = get_org_files(org_config_with_only_new_file) # Act - # load embeddings, entries, notes model after adding new org-mode file - initial_index = text_search.setup( - OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True - ) + # generate embeddings, entries, notes model from scratch after adding new org-mode file + with caplog.at_level(logging.INFO): + text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) + initial_logs = caplog.text + caplog.clear() # Clear logs data = get_org_files(org_config_with_only_new_file) - # update embeddings, entries, notes model after adding new org-mode file - updated_index = text_search.setup( - OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False - ) + # update embeddings, entries, notes model with no new changes + with caplog.at_level(logging.INFO): + text_search.setup(OrgToEntries, data, regenerate=False, user=default_user) + final_logs = 
caplog.text # Assert # verify only 1 entry added even if there are multiple duplicate entries - assert len(initial_index.entries) == len(updated_index.entries) == 1 - assert len(initial_index.corpus_embeddings) == len(updated_index.corpus_embeddings) == 1 + assert "Deleted 3 entries. Created 1 new entries for user " in initial_logs + assert "Deleted 0 entries. Created 0 new entries for user " in final_logs - # verify the same entry is added even when there are multiple duplicate entries - error_details = compare_index(initial_index, updated_index) - if error_details: - pytest.fail(error_details) + verify_embeddings(1, default_user) # ---------------------------------------------------------------------------------------------------- -def test_update_index_with_deleted_entry(org_config_with_only_new_file: TextContentConfig, search_models: SearchModels): +@pytest.mark.django_db +def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog): # Arrange new_file_to_index = Path(org_config_with_only_new_file.input_files[0]) @@ -293,101 +336,84 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: TextCont new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n" with open(new_file_to_index, "w") as f: f.write(f"{new_entry}{new_entry} -- Tatooine") - data = get_org_files(org_config_with_only_new_file) - - # load embeddings, entries, notes model after adding new org file with 2 entries - initial_index = text_search.setup( - OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True - ) + initial_data = get_org_files(org_config_with_only_new_file) # update embeddings, entries, notes model after removing an entry from the org file with open(new_file_to_index, "w") as f: f.write(f"{new_entry}") - data = get_org_files(org_config_with_only_new_file) + final_data = 
get_org_files(org_config_with_only_new_file) # Act - updated_index = text_search.setup( - OrgToJsonl, data, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False - ) + # load embeddings, entries, notes model after adding new org file with 2 entries + with caplog.at_level(logging.INFO): + text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user) + initial_logs = caplog.text + caplog.clear() # Clear logs + + with caplog.at_level(logging.INFO): + text_search.setup(OrgToEntries, final_data, regenerate=False, user=default_user) + final_logs = caplog.text # Assert # verify only 1 entry added even if there are multiple duplicate entries - assert len(initial_index.entries) == len(updated_index.entries) + 1 - assert len(initial_index.corpus_embeddings) == len(updated_index.corpus_embeddings) + 1 + assert "Deleted 3 entries. Created 2 new entries for user " in initial_logs + assert "Deleted 1 entries. Created 0 new entries for user " in final_logs - # verify the same entry is added even when there are multiple duplicate entries - error_details = compare_index(updated_index, initial_index) - if error_details: - pytest.fail(error_details) + verify_embeddings(1, default_user) # ---------------------------------------------------------------------------------------------------- -def test_update_index_with_new_entry(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path): +@pytest.mark.django_db +def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser, caplog): # Arrange - data = get_org_files(content_config.org) - initial_notes_model = text_search.setup( - OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=True, normalize=False - ) + org_config = LocalOrgConfig.objects.filter(user=default_user).first() + data = get_org_files(org_config) + with caplog.at_level(logging.INFO): + text_search.setup(OrgToEntries, 
data, regenerate=True, user=default_user) + initial_logs = caplog.text + caplog.clear() # Clear logs # append org-mode entry to first org input file in config with open(new_org_file, "w") as f: new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n" f.write(new_entry) - data = get_org_files(content_config.org) + data = get_org_files(org_config) # Act # update embeddings, entries with the newly added note - content_config.org.input_files = [f"{new_org_file}"] - final_notes_model = text_search.setup( - OrgToJsonl, data, content_config.org, search_models.text_search.bi_encoder, regenerate=False, normalize=False - ) + with caplog.at_level(logging.INFO): + text_search.setup(OrgToEntries, data, regenerate=False, user=default_user) + final_logs = caplog.text # Assert - assert len(final_notes_model.entries) == len(initial_notes_model.entries) + 1 - assert len(final_notes_model.corpus_embeddings) == len(initial_notes_model.corpus_embeddings) + 1 - - # verify new entry appended to index, without disrupting order or content of existing entries - error_details = compare_index(initial_notes_model, final_notes_model) - if error_details: - pytest.fail(error_details, False) - - # Cleanup - # reset input_files in config to empty list - content_config.org.input_files = [] + assert "Deleted 3 entries. Created 10 new entries for user " in initial_logs + assert "Deleted 0 entries. 
Created 1 new entries for user " in final_logs + verify_embeddings(11, default_user) # ---------------------------------------------------------------------------------------------------- @pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set") -def test_text_search_setup_github(content_config: ContentConfig, search_models: SearchModels): +def test_text_search_setup_github(content_config: ContentConfig, default_user: KhojUser): + # Arrange + github_config = GithubConfig.objects.filter(user=default_user).first() + # Act # Regenerate github embeddings to test asymmetric setup without caching - github_model = text_search.setup( - GithubToJsonl, content_config.github, search_models.text_search.bi_encoder, regenerate=True + text_search.setup( + GithubToEntries, + {}, + regenerate=True, + user=default_user, + config=github_config, ) # Assert - assert len(github_model.entries) > 1 + embeddings = Entry.objects.filter(user=default_user, file_type="github").count() + assert embeddings > 1 -def compare_index(initial_notes_model, final_notes_model): - mismatched_entries, mismatched_embeddings = [], [] - for index in range(len(initial_notes_model.entries)): - if initial_notes_model.entries[index].to_json() != final_notes_model.entries[index].to_json(): - mismatched_entries.append(index) - - # verify new entry embedding appended to embeddings tensor, without disrupting order or content of existing embeddings - for index in range(len(initial_notes_model.corpus_embeddings)): - if not initial_notes_model.corpus_embeddings[index].allclose(final_notes_model.corpus_embeddings[index]): - mismatched_embeddings.append(index) - - error_details = "" - if mismatched_entries: - mismatched_entries_str = ",".join(map(str, mismatched_entries)) - error_details += f"Entries at {mismatched_entries_str} not equal\n" - if mismatched_embeddings: - mismatched_embeddings_str = ", ".join(map(str, mismatched_embeddings)) - error_details += f"Embeddings at 
{mismatched_embeddings_str} not equal\n" - - return error_details +def verify_embeddings(expected_count, user): + embeddings = Entry.objects.filter(user=user, file_type="org").count() + assert embeddings == expected_count diff --git a/tests/test_word_filter.py b/tests/test_word_filter.py index 4d29af05..253caedd 100644 --- a/tests/test_word_filter.py +++ b/tests/test_word_filter.py @@ -2,6 +2,7 @@ from khoj.search_filter.word_filter import WordFilter from khoj.utils.rawconfig import Entry + # Test # ---------------------------------------------------------------------------------------------------- def test_no_word_filter(): @@ -21,54 +22,44 @@ def test_no_word_filter(): # ---------------------------------------------------------------------------------------------------- + + def test_word_exclude_filter(): # Arrange word_filter = WordFilter() - entries = arrange_content() q_with_exclude_filter = 'head -"exclude_word" tail' # Act can_filter = word_filter.can_filter(q_with_exclude_filter) - ret_query, entry_indices = word_filter.apply(q_with_exclude_filter, entries) # Assert assert can_filter == True - assert ret_query == "head tail" - assert entry_indices == {0, 2} # ---------------------------------------------------------------------------------------------------- def test_word_include_filter(): # Arrange word_filter = WordFilter() - entries = arrange_content() query_with_include_filter = 'head +"include_word" tail' # Act can_filter = word_filter.can_filter(query_with_include_filter) - ret_query, entry_indices = word_filter.apply(query_with_include_filter, entries) # Assert assert can_filter == True - assert ret_query == "head tail" - assert entry_indices == {2, 3} # ---------------------------------------------------------------------------------------------------- def test_word_include_and_exclude_filter(): # Arrange word_filter = WordFilter() - entries = arrange_content() query_with_include_and_exclude_filter = 'head +"include_word" -"exclude_word" tail' # Act 
can_filter = word_filter.can_filter(query_with_include_and_exclude_filter) - ret_query, entry_indices = word_filter.apply(query_with_include_and_exclude_filter, entries) # Assert assert can_filter == True - assert ret_query == "head tail" - assert entry_indices == {2} # ---------------------------------------------------------------------------------------------------- diff --git a/versions.json b/versions.json index 8deb4367..06efeecb 100644 --- a/versions.json +++ b/versions.json @@ -26,5 +26,6 @@ "0.12.2": "0.15.0", "0.12.3": "0.15.0", "0.13.0": "0.15.0", - "0.14.0": "0.15.0" + "0.14.0": "0.15.0", + "1.0.0": "0.15.0" }