mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Normalize metadata match score to work better with image match score
- Metadata match scores were consistently higher than image match scores by a factor of ~3x. This resulted in all results coming from the metadata match with the query and none from the image match with the query. - Scaling the metadata match scores down by a scaling factor seems to more consistently give a blend of results from both image and metadata matches.
This commit is contained in:
parent
a3fc82817d
commit
e0d8398b27
1 changed files with 4 additions and 3 deletions
|
@ -102,7 +102,7 @@ def extract_metadata(image_name, verbose=0):
|
||||||
image_metadata = et.get_tags(["XMP:Subject", "XMP:Description"], str(image_name))
|
image_metadata = et.get_tags(["XMP:Subject", "XMP:Description"], str(image_name))
|
||||||
image_metadata_subjects = set([subject.split(":")[1] for subject in image_metadata.get("XMP:Subject", "") if ":" in subject])
|
image_metadata_subjects = set([subject.split(":")[1] for subject in image_metadata.get("XMP:Subject", "") if ":" in subject])
|
||||||
image_processed_metadata = image_metadata.get("XMP:Description", "") + ". " + ", ".join(image_metadata_subjects)
|
image_processed_metadata = image_metadata.get("XMP:Description", "") + ". " + ", ".join(image_metadata_subjects)
|
||||||
if verbose > 1:
|
if verbose > 2:
|
||||||
print(f"{image_name}:\t{image_processed_metadata}")
|
print(f"{image_name}:\t{image_processed_metadata}")
|
||||||
return image_processed_metadata
|
return image_processed_metadata
|
||||||
|
|
||||||
|
@ -135,13 +135,14 @@ def query(raw_query, count, model: ImageSearchModel):
|
||||||
|
|
||||||
# Sum metadata, image scores of the highest ranked images
|
# Sum metadata, image scores of the highest ranked images
|
||||||
for corpus_id, score in metadata_hits.items():
|
for corpus_id, score in metadata_hits.items():
|
||||||
|
scaling_factor = 0.33
|
||||||
if 'corpus_id' in image_hits:
|
if 'corpus_id' in image_hits:
|
||||||
image_hits[corpus_id].update({
|
image_hits[corpus_id].update({
|
||||||
'metadata_score': score,
|
'metadata_score': score,
|
||||||
'score': image_hits[corpus_id].get('score', 0) + score,
|
'score': image_hits[corpus_id].get('score', 0) + scaling_factor*score,
|
||||||
})
|
})
|
||||||
else:
|
else:
|
||||||
image_hits[corpus_id] = {'metadata_score': score, 'score': score}
|
image_hits[corpus_id] = {'metadata_score': score, 'score': scaling_factor*score}
|
||||||
|
|
||||||
# Reformat results in original form from sentence transformer semantic_search()
|
# Reformat results in original form from sentence transformer semantic_search()
|
||||||
hits = [
|
hits = [
|
||||||
|
|
Loading…
Reference in a new issue