Normalize metadata match score to work better with image match score

- Metadata match scores were consistently higher than image match
  scores by a factor of ~3x. This resulted in all results coming
  from the metadata match with the query and none from the image
  match with the query.
- Scaling the metadata match scores down by a scaling factor seems
  to more consistently give a blend of results from both image and
  metadata matches.
This commit is contained in:
Debanjum Singh Solanky 2022-07-16 03:39:33 +04:00
parent a3fc82817d
commit e0d8398b27

View file

@ -102,7 +102,7 @@ def extract_metadata(image_name, verbose=0):
image_metadata = et.get_tags(["XMP:Subject", "XMP:Description"], str(image_name)) image_metadata = et.get_tags(["XMP:Subject", "XMP:Description"], str(image_name))
image_metadata_subjects = set([subject.split(":")[1] for subject in image_metadata.get("XMP:Subject", "") if ":" in subject]) image_metadata_subjects = set([subject.split(":")[1] for subject in image_metadata.get("XMP:Subject", "") if ":" in subject])
image_processed_metadata = image_metadata.get("XMP:Description", "") + ". " + ", ".join(image_metadata_subjects) image_processed_metadata = image_metadata.get("XMP:Description", "") + ". " + ", ".join(image_metadata_subjects)
if verbose > 1: if verbose > 2:
print(f"{image_name}:\t{image_processed_metadata}") print(f"{image_name}:\t{image_processed_metadata}")
return image_processed_metadata return image_processed_metadata
@ -135,13 +135,14 @@ def query(raw_query, count, model: ImageSearchModel):
# Sum metadata, image scores of the highest ranked images # Sum metadata, image scores of the highest ranked images
for corpus_id, score in metadata_hits.items(): for corpus_id, score in metadata_hits.items():
scaling_factor = 0.33
if 'corpus_id' in image_hits: if 'corpus_id' in image_hits:
image_hits[corpus_id].update({ image_hits[corpus_id].update({
'metadata_score': score, 'metadata_score': score,
'score': image_hits[corpus_id].get('score', 0) + score, 'score': image_hits[corpus_id].get('score', 0) + scaling_factor*score,
}) })
else: else:
image_hits[corpus_id] = {'metadata_score': score, 'score': score} image_hits[corpus_id] = {'metadata_score': score, 'score': scaling_factor*score}
# Reformat results in original form from sentence transformer semantic_search() # Reformat results in original form from sentence transformer semantic_search()
hits = [ hits = [