diff --git a/documentation/docs/miscellaneous/advanced.md b/documentation/docs/miscellaneous/advanced.md index b2023c1b..532ba7cd 100644 --- a/documentation/docs/miscellaneous/advanced.md +++ b/documentation/docs/miscellaneous/advanced.md @@ -7,7 +7,7 @@ sidebar_position: 3 ## Search across Different Languages (Self-Hosting) To search for notes in multiple, different languages, you can use a [multi-lingual model](https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models).
For example, the [paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) supports [50+ languages](https://www.sbert.net/docs/pretrained_models.html#:~:text=we%20used%20the%20following%2050%2B%20languages), has good search quality and speed. To use it: -1. Manually update the search config in server's admin settings page. Go to [the search config](http://localhost:42110/server/admin/database/searchmodelconfig/). Either create a new one, if none exists, or update the existing one. Set the bi_encoder to `sentence-transformers/multi-qa-MiniLM-L6-cos-v1` and the cross_encoder to `cross-encoder/ms-marco-MiniLM-L-6-v2`. +1. Manually update the search config in the server's admin settings page. Go to [the search config](http://localhost:42110/server/admin/database/searchmodelconfig/). Either create a new one, if none exists, or update the existing one. Set the bi_encoder to `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` and the cross_encoder to `mixedbread-ai/mxbai-rerank-xsmall-v1`. 2. Regenerate your content index from all the relevant clients. This step is very important, as you'll need to re-encode all your content with the new model. 
## Query Filters diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 58b8b729..a98b641e 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -185,7 +185,7 @@ class SearchModelConfig(BaseModel): bi_encoder_model_config = models.JSONField(default=dict) bi_encoder_query_encode_config = models.JSONField(default=dict) bi_encoder_docs_encode_config = models.JSONField(default=dict) - cross_encoder = models.CharField(max_length=200, default="cross-encoder/ms-marco-MiniLM-L-6-v2") + cross_encoder = models.CharField(max_length=200, default="mixedbread-ai/mxbai-rerank-xsmall-v1") embeddings_inference_endpoint = models.CharField(max_length=200, default=None, null=True, blank=True) embeddings_inference_endpoint_api_key = models.CharField(max_length=200, default=None, null=True, blank=True) cross_encoder_inference_endpoint = models.CharField(max_length=200, default=None, null=True, blank=True) diff --git a/src/khoj/processor/embeddings.py b/src/khoj/processor/embeddings.py index 19e986af..701bbfac 100644 --- a/src/khoj/processor/embeddings.py +++ b/src/khoj/processor/embeddings.py @@ -92,7 +92,7 @@ class EmbeddingsModel: class CrossEncoderModel: def __init__( self, - model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", + model_name: str = "mixedbread-ai/mxbai-rerank-xsmall-v1", cross_encoder_inference_endpoint: str = None, cross_encoder_inference_endpoint_api_key: str = None, ): diff --git a/tests/helpers.py b/tests/helpers.py index 642f05dd..68673596 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -75,7 +75,7 @@ class SearchModelFactory(factory.django.DjangoModelFactory): name = "default" model_type = "text" bi_encoder = "thenlper/gte-small" - cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2" + cross_encoder = "mixedbread-ai/mxbai-rerank-xsmall-v1" class SubscriptionFactory(factory.django.DjangoModelFactory):