mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-30 10:53:02 +01:00
Use a more accurate model for symmetric semantic search
- The all-MiniLM-L6-v2 model is more accurate.
- The exact previous model isn't benchmarked, so this comparison is based on the performance of the closest benchmarked model to it. It seems like the new model may be similar in speed and size.
- On a very preliminary evaluation, the new model seems faster, with pretty decent results.
This commit is contained in:
parent
4a90972e38
commit
989526ae54
4 changed files with 4 additions and 4 deletions
|
@ -28,7 +28,7 @@ content-type:
|
||||||
|
|
||||||
search-type:
|
search-type:
|
||||||
symmetric:
|
symmetric:
|
||||||
encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2"
|
encoder: "sentence-transformers/all-MiniLM-L6-v2"
|
||||||
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||||
model_directory: "/data/models/symmetric"
|
model_directory: "/data/models/symmetric"
|
||||||
|
|
||||||
|
|
|
@ -59,7 +59,7 @@ if __name__ == '__main__':
|
||||||
parser.add_argument('--dataset', type=str, default="./.dataset", help="Path to dataset to generate index from")
|
parser.add_argument('--dataset', type=str, default="./.dataset", help="Path to dataset to generate index from")
|
||||||
parser.add_argument('--column', type=str, default="DATA", help="Name of dataset column to index")
|
parser.add_argument('--column', type=str, default="DATA", help="Name of dataset column to index")
|
||||||
parser.add_argument('--num_results', type=int, default=10, help="Number of most suitable matches to show")
|
parser.add_argument('--num_results', type=int, default=10, help="Number of most suitable matches to show")
|
||||||
parser.add_argument('--model_name', type=str, default='paraphrase-distilroberta-base-v1', help="Specify name of the SentenceTransformer model to use for encoding")
|
parser.add_argument('--model_name', type=str, default='all-MiniLM-L6-v2', help="Specify name of the SentenceTransformer model to use for encoding")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
model = SentenceTransformer(args.model_name)
|
model = SentenceTransformer(args.model_name)
|
||||||
|
|
|
@ -79,7 +79,7 @@ default_config = {
|
||||||
{
|
{
|
||||||
'symmetric':
|
'symmetric':
|
||||||
{
|
{
|
||||||
'encoder': "sentence-transformers/paraphrase-MiniLM-L6-v2",
|
'encoder': "sentence-transformers/all-MiniLM-L6-v2",
|
||||||
'cross-encoder': "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
'cross-encoder': "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||||
'model_directory': None
|
'model_directory': None
|
||||||
},
|
},
|
||||||
|
|
|
@ -14,7 +14,7 @@ def search_config(tmp_path_factory):
|
||||||
search_config = SearchConfig()
|
search_config = SearchConfig()
|
||||||
|
|
||||||
search_config.asymmetric = SymmetricSearchConfig(
|
search_config.asymmetric = SymmetricSearchConfig(
|
||||||
encoder = "sentence-transformers/paraphrase-MiniLM-L6-v2",
|
encoder = "sentence-transformers/all-MiniLM-L6-v2",
|
||||||
cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||||
model_directory = model_dir
|
model_directory = model_dir
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue