Use a more accurate model for symmetric semantic search

- The all-MiniLM-L6-v2 is more accurate
  - The exact previous model isn't benchmarked, so this is based on the
    performance of the closest model to it. It seems like the new model
    may be similar in speed and size

- On very preliminary evaluation of the model, the new model seems
  faster, with pretty decent results
This commit is contained in:
Debanjum Singh Solanky 2022-07-18 20:16:40 +04:00
parent 4a90972e38
commit 989526ae54
4 changed files with 4 additions and 4 deletions

View file

@ -28,7 +28,7 @@ content-type:
search-type: search-type:
symmetric: symmetric:
encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2" encoder: "sentence-transformers/all-MiniLM-L6-v2"
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2" cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
model_directory: "/data/models/symmetric" model_directory: "/data/models/symmetric"

View file

@ -59,7 +59,7 @@ if __name__ == '__main__':
parser.add_argument('--dataset', type=str, default="./.dataset", help="Path to dataset to generate index from") parser.add_argument('--dataset', type=str, default="./.dataset", help="Path to dataset to generate index from")
parser.add_argument('--column', type=str, default="DATA", help="Name of dataset column to index") parser.add_argument('--column', type=str, default="DATA", help="Name of dataset column to index")
parser.add_argument('--num_results', type=int, default=10, help="Number of most suitable matches to show") parser.add_argument('--num_results', type=int, default=10, help="Number of most suitable matches to show")
parser.add_argument('--model_name', type=str, default='paraphrase-distilroberta-base-v1', help="Specify name of the SentenceTransformer model to use for encoding") parser.add_argument('--model_name', type=str, default='all-MiniLM-L6-v2', help="Specify name of the SentenceTransformer model to use for encoding")
args = parser.parse_args() args = parser.parse_args()
model = SentenceTransformer(args.model_name) model = SentenceTransformer(args.model_name)

View file

@ -79,7 +79,7 @@ default_config = {
{ {
'symmetric': 'symmetric':
{ {
'encoder': "sentence-transformers/paraphrase-MiniLM-L6-v2", 'encoder': "sentence-transformers/all-MiniLM-L6-v2",
'cross-encoder': "cross-encoder/ms-marco-MiniLM-L-6-v2", 'cross-encoder': "cross-encoder/ms-marco-MiniLM-L-6-v2",
'model_directory': None 'model_directory': None
}, },

View file

@ -14,7 +14,7 @@ def search_config(tmp_path_factory):
search_config = SearchConfig() search_config = SearchConfig()
search_config.asymmetric = SymmetricSearchConfig( search_config.asymmetric = SymmetricSearchConfig(
encoder = "sentence-transformers/paraphrase-MiniLM-L6-v2", encoder = "sentence-transformers/all-MiniLM-L6-v2",
cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2", cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
model_directory = model_dir model_directory = model_dir
) )