mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Get XMP metadata from image using Pillow. Remove ExifTool dependency
- Pillow already supports reading XMP metadata from Images - Removes need to maintain my fork of unmaintained PyExiftool - This also removes dependency on system Exiftool package for XMP metadata extraction - Add test to verify XMP metadata extracted from test images - Remove references to Exiftool from Documentation
This commit is contained in:
parent
8f57a62675
commit
bf1ae038cb
10 changed files with 35 additions and 350 deletions
|
@ -4,7 +4,7 @@ LABEL org.opencontainers.image.source https://github.com/debanjum/khoj
|
|||
|
||||
# Install System Dependencies
|
||||
RUN apt-get update -y && \
|
||||
apt-get -y install libimage-exiftool-perl python3-pyqt5
|
||||
apt-get -y install python3-pyqt5
|
||||
|
||||
# Copy Application to Container
|
||||
COPY . /app
|
||||
|
|
|
@ -208,11 +208,7 @@ docker-compose build --pull
|
|||
|
||||
#### Using Conda
|
||||
##### 1. Install Dependencies
|
||||
- [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) \[Required\]
|
||||
- Install Exiftool \[Optional\]
|
||||
``` shell
|
||||
sudo apt -y install libimage-exiftool-perl
|
||||
```
|
||||
- [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html)
|
||||
|
||||
##### 2. Install Khoj
|
||||
```shell
|
||||
|
@ -254,4 +250,3 @@ pytest
|
|||
- Charles Cave for [OrgNode Parser](http://members.optusnet.com.au/~charles57/GTD/orgnode.html)
|
||||
- [Org.js](https://mooz.github.io/org-js/) to render Org-mode results on the Web interface
|
||||
- [Markdown-it](https://github.com/markdown-it/markdown-it) to render Markdown results on the Web interface
|
||||
- Sven Marnach for [PyExifTool](https://github.com/smarnach/pyexiftool/blob/master/exiftool.py)
|
||||
|
|
1
setup.py
1
setup.py
|
@ -40,6 +40,7 @@ setup(
|
|||
"aiofiles == 0.8.0",
|
||||
"dateparser == 1.1.1",
|
||||
"pyqt6 == 6.3.1",
|
||||
"defusedxml == 0.7.1",
|
||||
],
|
||||
include_package_data=True,
|
||||
entry_points={"console_scripts": ["khoj = src.main:run"]},
|
||||
|
|
|
@ -13,8 +13,7 @@ from tqdm import trange
|
|||
import torch
|
||||
|
||||
# Internal Packages
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
|
||||
import src.utils.exiftool as exiftool
|
||||
from src.utils.helpers import get_absolute_path, get_from_dict, resolve_absolute_path, load_model
|
||||
from src.utils.config import ImageSearchModel
|
||||
from src.utils.rawconfig import ImageContentConfig, ImageSearchConfig
|
||||
|
||||
|
@ -122,11 +121,12 @@ def compute_metadata_embeddings(image_names, encoder, embeddings_file, batch_siz
|
|||
|
||||
|
||||
def extract_metadata(image_name):
|
||||
with exiftool.ExifTool() as et:
|
||||
image_metadata = et.get_tags(["XMP:Subject", "XMP:Description"], str(image_name))
|
||||
image_metadata_subjects = set([subject.split(":")[1] for subject in image_metadata.get("XMP:Subject", "") if ":" in subject])
|
||||
image_xmp_metadata = Image.open(image_name).getxmp()
|
||||
image_description = get_from_dict(image_xmp_metadata, 'xmpmeta', 'RDF', 'Description', 'description', 'Alt', 'li', 'text')
|
||||
image_subjects = get_from_dict(image_xmp_metadata, 'xmpmeta', 'RDF', 'Description', 'subject', 'Bag', 'li')
|
||||
image_metadata_subjects = set([subject.split(":")[1] for subject in image_subjects if ":" in subject])
|
||||
|
||||
image_processed_metadata = image_metadata.get("XMP:Description", "")
|
||||
image_processed_metadata = image_description
|
||||
if len(image_metadata_subjects) > 0:
|
||||
image_processed_metadata += ". " + ", ".join(image_metadata_subjects)
|
||||
|
||||
|
|
|
@ -1,325 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# PyExifTool <http://github.com/smarnach/pyexiftool>
|
||||
# Copyright 2012 Sven Marnach
|
||||
|
||||
# This file is part of PyExifTool.
|
||||
#
|
||||
# PyExifTool is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the licence, or
|
||||
# (at your option) any later version, or the BSD licence.
|
||||
#
|
||||
# PyExifTool is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
#
|
||||
# See COPYING.GPL or COPYING.BSD for more details.
|
||||
|
||||
"""
|
||||
PyExifTool is a Python library to communicate with an instance of Phil
|
||||
Harvey's excellent ExifTool_ command-line application. The library
|
||||
provides the class :py:class:`ExifTool` that runs the command-line
|
||||
tool in batch mode and features methods to send commands to that
|
||||
program, including methods to extract meta-information from one or
|
||||
more image files. Since ``exiftool`` is run in batch mode, only a
|
||||
single instance needs to be launched and can be reused for many
|
||||
queries. This is much more efficient than launching a separate
|
||||
process for every single query.
|
||||
|
||||
.. _ExifTool: http://www.sno.phy.queensu.ca/~phil/exiftool/
|
||||
|
||||
The source code can be checked out from the github repository with
|
||||
|
||||
::
|
||||
|
||||
git clone git://github.com/smarnach/pyexiftool.git
|
||||
|
||||
Alternatively, you can download a tarball_. There haven't been any
|
||||
releases yet.
|
||||
|
||||
.. _tarball: https://github.com/smarnach/pyexiftool/tarball/master
|
||||
|
||||
PyExifTool is licenced under GNU GPL version 3 or later.
|
||||
|
||||
Example usage::
|
||||
|
||||
import exiftool
|
||||
|
||||
files = ["a.jpg", "b.png", "c.tif"]
|
||||
with exiftool.ExifTool() as et:
|
||||
metadata = et.get_metadata_batch(files)
|
||||
for d in metadata:
|
||||
print("{:20.20} {:20.20}".format(d["SourceFile"],
|
||||
d["EXIF:DateTimeOriginal"]))
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import sys
|
||||
import subprocess
|
||||
import os
|
||||
import json
|
||||
import warnings
|
||||
import codecs
|
||||
|
||||
# Python 2 ships ``basestring``; on Python 3 the lookup fails and we fall
# back to a tuple that ``isinstance`` accepts for both string flavours.
try:
    basestring
except NameError:
    basestring = (bytes, str)

executable = "exiftool"
"""The name of the executable to run.

If the executable is not located in one of the paths listed in the
``PATH`` environment variable, the full path should be given here.
"""

# Marker that terminates the output of one batch of commands.
# The standard value should be fine.
sentinel = b"{ready}"

# Number of bytes requested per read from exiftool.  The standard value
# should be fine, though other values might give better performance in
# some cases.
block_size = 4096
|
||||
|
||||
# This code has been adapted from Lib/os.py in the Python source tree
|
||||
# (sha1 265e36e277f3)
|
||||
def _fscodec():
    """Build and return the ``fsencode`` helper for this interpreter.

    The filesystem encoding and error handler are resolved once, here, and
    captured by the returned closure.  ``surrogateescape`` is used whenever
    the codec registry knows it and the filesystem encoding is not ``mbcs``;
    otherwise the handler stays ``strict``.
    """
    encoding = sys.getfilesystemencoding()
    errors = "strict"
    if encoding != "mbcs":
        try:
            codecs.lookup_error("surrogateescape")
        except LookupError:
            # Handler not registered on this interpreter: keep "strict".
            pass
        else:
            errors = "surrogateescape"

    def fsencode(filename):
        """
        Encode filename to the filesystem encoding with 'surrogateescape' error
        handler, return bytes unchanged. On Windows, use 'strict' error handler if
        the file system encoding is 'mbcs' (which is the default encoding).

        Objects implementing ``__fspath__`` (for example ``pathlib.Path``)
        are converted to their ``str``/``bytes`` form first.
        """
        if isinstance(filename, bytes):
            return filename
        if hasattr(filename, "__fspath__"):
            # Path-like object: unwrap it instead of failing on ``.encode``.
            filename = filename.__fspath__()
            if isinstance(filename, bytes):
                return filename
        return filename.encode(encoding, errors)

    return fsencode


fsencode = _fscodec()
del _fscodec
|
||||
|
||||
class ExifTool(object):
    """Run the `exiftool` command-line tool and communicate to it.

    You can pass the file name of the ``exiftool`` executable as an
    argument to the constructor.  The default value ``exiftool`` will
    only work if the executable is in your ``PATH``.

    Most methods of this class are only available after calling
    :py:meth:`start()`, which will actually launch the subprocess.  To
    avoid leaving the subprocess running, make sure to call
    :py:meth:`terminate()` method when finished using the instance.
    This method will also be implicitly called when the instance is
    garbage collected, but there are circumstances when this won't ever
    happen, so you should not rely on the implicit process
    termination.  Subprocesses won't be automatically terminated if
    the parent process exits, so a leaked subprocess will stay around
    until manually killed.

    A convenient way to make sure that the subprocess is terminated is
    to use the :py:class:`ExifTool` instance as a context manager::

        with ExifTool() as et:
            ...

    .. warning:: Note that there is very little error handling.
       Nonsensical options will be silently ignored by exiftool, so
       there's not much that can be done in that regard.  You should
       avoid passing non-existent files to any of the methods, since
       this will lead to undefined behaviour.

    .. py:attribute:: running

       A Boolean value indicating whether this instance is currently
       associated with a running subprocess.
    """

    def __init__(self, executable_=None):
        # Fall back to the module-level default when no path is given.
        if executable_ is None:
            self.executable = executable
        else:
            self.executable = executable_
        self.running = False

    def start(self):
        """Start an ``exiftool`` process in batch mode for this instance.

        This method will issue a ``UserWarning`` if the subprocess is
        already running.  The process is started with the ``-G`` and
        ``-n`` as common arguments, which are automatically included
        in every command you run with :py:meth:`execute()`.
        """
        if self.running:
            warnings.warn("ExifTool already running; doing nothing.")
            return
        # stderr is sent to the null device; only stdout is read back.
        with open(os.devnull, "w") as devnull:
            self._process = subprocess.Popen(
                [self.executable, "-stay_open", "True", "-@", "-",
                 "-common_args", "-G", "-n"],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                stderr=devnull)
        self.running = True

    def terminate(self):
        """Terminate the ``exiftool`` process of this instance.

        If the subprocess isn't running, this method will do nothing.
        """
        # ``getattr`` keeps ``__del__`` safe on an instance whose
        # ``__init__`` never got as far as setting ``running``.
        if not getattr(self, "running", False):
            return
        try:
            self._process.stdin.write(b"-stay_open\nFalse\n")
            self._process.stdin.flush()
        except (IOError, OSError):
            # The subprocess is already gone (broken pipe), so there is
            # nobody left to tell; carry on and release the handles.
            pass
        self._process.communicate()
        del self._process
        self.running = False

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.terminate()

    def __del__(self):
        self.terminate()

    def execute(self, *params):
        """Execute the given batch of parameters with ``exiftool``.

        This method accepts any number of parameters and sends them to
        the attached ``exiftool`` process.  The process must be
        running, otherwise ``ValueError`` is raised.  The final
        ``-execute`` necessary to actually run the batch is appended
        automatically; see the documentation of :py:meth:`start()` for
        the common options.  The ``exiftool`` output is read up to the
        end-of-output sentinel and returned as a raw ``bytes`` object,
        excluding the sentinel.

        The parameters must also be raw ``bytes``, in whatever
        encoding exiftool accepts.  For filenames, this should be the
        system's filesystem encoding.

        ``IOError`` is raised if the subprocess closes its output
        before the sentinel has been received.

        .. note:: This is considered a low-level method, and should
           rarely be needed by application developers.
        """
        if not self.running:
            raise ValueError("ExifTool instance not running.")
        self._process.stdin.write(b"\n".join(params + (b"-execute\n",)))
        self._process.stdin.flush()
        output = b""
        fd = self._process.stdout.fileno()
        # Only the tail is inspected for the sentinel on each pass.
        while not output[-32:].strip().endswith(sentinel):
            chunk = os.read(fd, block_size)
            if not chunk:
                # An empty read means end-of-file: without this check the
                # loop would spin forever waiting for a sentinel that can
                # no longer arrive.
                raise IOError("exiftool closed its output before sending "
                              "the end-of-output sentinel")
            output += chunk
        return output.strip()[:-len(sentinel)]

    def execute_json(self, *params):
        """Execute the given batch of parameters and parse the JSON output.

        This method is similar to :py:meth:`execute()`.  It
        automatically adds the parameter ``-j`` to request JSON output
        from ``exiftool`` and parses the output.  The return value is
        a list of dictionaries, mapping tag names to the corresponding
        values.  All keys are Unicode strings with the tag names
        including the ExifTool group name in the format <group>:<tag>.
        The values can have multiple types.  All strings occurring as
        values will be Unicode strings.  Each dictionary contains the
        name of the file it corresponds to in the key ``"SourceFile"``.

        The parameters to this function must be either raw strings
        (type ``str`` in Python 2.x, type ``bytes`` in Python 3.x) or
        Unicode strings (type ``unicode`` in Python 2.x, type ``str``
        in Python 3.x).  Unicode strings will be encoded using
        system's filesystem encoding.  This behaviour means you can
        pass in filenames according to the convention of the
        respective Python version – as raw strings in Python 2.x and
        as Unicode strings in Python 3.x.
        """
        params = map(fsencode, params)
        return json.loads(self.execute(b"-j", *params).decode("utf-8"))

    def get_metadata_batch(self, filenames):
        """Return all meta-data for the given files.

        The return value will have the format described in the
        documentation of :py:meth:`execute_json()`.
        """
        return self.execute_json(*filenames)

    def get_metadata(self, filename):
        """Return meta-data for a single file.

        The returned dictionary has the format described in the
        documentation of :py:meth:`execute_json()`.
        """
        return self.execute_json(filename)[0]

    def get_tags_batch(self, tags, filenames):
        """Return only specified tags for the given files.

        The first argument is an iterable of tags.  The tag names may
        include group names, as usual in the format <group>:<tag>.

        The second argument is an iterable of file names.

        The format of the return value is the same as for
        :py:meth:`execute_json()`.
        """
        # Explicitly ruling out strings here because passing in a
        # string would lead to strange and hard-to-find errors
        if isinstance(tags, basestring):
            raise TypeError("The argument 'tags' must be "
                            "an iterable of strings")
        if isinstance(filenames, basestring):
            raise TypeError("The argument 'filenames' must be "
                            "an iterable of strings")
        params = ["-" + t for t in tags]
        params.extend(filenames)
        return self.execute_json(*params)

    def get_tags(self, tags, filename):
        """Return only specified tags for a single file.

        The returned dictionary has the format described in the
        documentation of :py:meth:`execute_json()`.
        """
        return self.get_tags_batch(tags, [filename])[0]

    def get_tag_batch(self, tag, filenames):
        """Extract a single tag from the given files.

        The first argument is a single tag name, as usual in the
        format <group>:<tag>.

        The second argument is an iterable of file names.

        The return value is a list of tag values or ``None`` for
        non-existent tags, in the same order as ``filenames``.
        """
        data = self.get_tags_batch([tag], filenames)
        result = []
        for d in data:
            # Drop the file-name entry so the only value left (if any)
            # is the requested tag; tolerate entries that lack the key.
            d.pop("SourceFile", None)
            result.append(next(iter(d.values()), None))
        return result

    def get_tag(self, tag, filename):
        """Extract a single tag from a single file.

        The return value is the value of the specified tag, or
        ``None`` if this tag was not found in the file.
        """
        return self.get_tag_batch(tag, [filename])[0]
|
Binary file not shown.
Before Width: | Height: | Size: 170 KiB After Width: | Height: | Size: 173 KiB |
Binary file not shown.
Before Width: | Height: | Size: 330 KiB After Width: | Height: | Size: 334 KiB |
Binary file not shown.
Before Width: | Height: | Size: 268 KiB After Width: | Height: | Size: 271 KiB |
|
@ -96,12 +96,6 @@ docker-compose build --pull
|
|||
Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html)
|
||||
\[Required\]
|
||||
|
||||
3. Install Exiftool \[Optional\]
|
||||
|
||||
``` shell
|
||||
sudo apt-get -y install libimage-exiftool-perl
|
||||
```
|
||||
|
||||
2. Install Khoj
|
||||
|
||||
``` shell
|
||||
|
@ -149,5 +143,3 @@ conda activate khoj
|
|||
Documentation](https://www.sbert.net/examples/applications/image-search/README.html)
|
||||
- Charles Cave for [OrgNode
|
||||
Parser](http://members.optusnet.com.au/~charles57/GTD/orgnode.html)
|
||||
- Sven Marnach for
|
||||
[PyExifTool](https://github.com/smarnach/pyexiftool/blob/master/exiftool.py)
|
||||
|
|
|
@ -25,6 +25,28 @@ def test_image_search_setup(content_config: ContentConfig, search_config: Search
|
|||
assert len(image_search_model.image_embeddings) == 3
|
||||
|
||||
|
||||
def test_image_metadata(content_config: ContentConfig):
    "Verify XMP Description and Subjects Extracted from Image"
    # Arrange: each entry pairs the snippets expected in the extracted
    # metadata with the test image they should come from.
    expected_metadata_image_name_pairs = [
        (["Billi Ka Bacha.", "Cat", "Grass"], "kitten_park.jpg"),
        (["Pasture.", "Horse", "Dog"], "horse_dog.jpg"),
        (["Guinea Pig Eating Celery.", "Rodent", "Whiskers"], "guineapig_grass.jpg"),
    ]
    image_directory = content_config.image.input_directories[0]

    for expected_snippets, image_name in expected_metadata_image_name_pairs:
        test_image_path = Path(image_directory / image_name)

        # Act
        actual_metadata = image_search.extract_metadata(test_image_path)

        # Assert
        for expected_snippet in expected_snippets:
            assert expected_snippet in actual_metadata
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_image_search(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
|
|
Loading…
Reference in a new issue