From e0d8398b2736897cf6eb459200a50453090e59c9 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 16 Jul 2022 03:39:33 +0400 Subject: [PATCH] Normalize metadata match score to work better with image match score - Metadata match scores were consistently higher than image match scores by a factor of ~3x. This resulted in all results coming from the metadata match with the query and none from the image match with the query. - Scaling the metadata match scores down by a scaling factor seems to more consistently give a blend of results from both image and metadata matches --- src/search_type/image_search.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/search_type/image_search.py b/src/search_type/image_search.py index 1cbe0c8f..d97754c3 100644 --- a/src/search_type/image_search.py +++ b/src/search_type/image_search.py @@ -102,7 +102,7 @@ def extract_metadata(image_name, verbose=0): image_metadata = et.get_tags(["XMP:Subject", "XMP:Description"], str(image_name)) image_metadata_subjects = set([subject.split(":")[1] for subject in image_metadata.get("XMP:Subject", "") if ":" in subject]) image_processed_metadata = image_metadata.get("XMP:Description", "") + ". 
" + ", ".join(image_metadata_subjects) - if verbose > 1: + if verbose > 2: print(f"{image_name}:\t{image_processed_metadata}") return image_processed_metadata @@ -135,13 +135,14 @@ def query(raw_query, count, model: ImageSearchModel): # Sum metadata, image scores of the highest ranked images for corpus_id, score in metadata_hits.items(): + scaling_factor = 0.33 if 'corpus_id' in image_hits: image_hits[corpus_id].update({ 'metadata_score': score, - 'score': image_hits[corpus_id].get('score', 0) + score, + 'score': image_hits[corpus_id].get('score', 0) + scaling_factor*score, }) else: - image_hits[corpus_id] = {'metadata_score': score, 'score': score} + image_hits[corpus_id] = {'metadata_score': score, 'score': scaling_factor*score} # Reformat results in original form from sentence transformer semantic_search() hits = [