Get XMP metadata from image using Pillow. Remove ExifTool dependency

- Pillow already supports reading XMP metadata from images
- Removes need to maintain my fork of unmaintained PyExifTool
- This also removes dependency on system ExifTool package for XMP metadata extraction
- Add test to verify XMP metadata extracted from test images
- Remove references to ExifTool from documentation
2024-11-23 15:38:55 +01:00 · 2022-09-14 13:22:27 +03:00 · 2022-09-14 13:22:27 +03:00 · bf1ae038cb
commit bf1ae038cb
parent 8f57a62675
10 changed files with 35 additions and 350 deletions
--- a/2
+++ b/2
@ -4,7 +4,7 @@ LABEL org.opencontainers.image.source https://github.com/debanjum/khoj

 # Install System Dependencies
 RUN apt-get update -y && \
-    apt-get -y install libimage-exiftool-perl python3-pyqt5
+    apt-get -y install python3-pyqt5

 # Copy Application to Container
 COPY . /app
--- a/Readme.md
+++ b/Readme.md
@ -208,11 +208,7 @@ docker-compose build --pull

 #### Using Conda
 ##### 1. Install Dependencies
- [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) \[Required\]
- Install Exiftool \[Optional\]
-  ``` shell
-  sudo apt -y install libimage-exiftool-perl
-  ```
+- [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html)

 ##### 2. Install Khoj
 ```shell
@ -254,4 +250,3 @@ pytest
 - Charles Cave for [OrgNode Parser](http://members.optusnet.com.au/~charles57/GTD/orgnode.html)
 - [Org.js](https://mooz.github.io/org-js/) to render Org-mode results on the Web interface
 - [Markdown-it](https://github.com/markdown-it/markdown-it) to render Markdown results on the Web interface
- Sven Marnach for [PyExifTool](https://github.com/smarnach/pyexiftool/blob/master/exiftool.py)
--- a/setup.py
+++ b/setup.py
@ -40,6 +40,7 @@ setup(
        "aiofiles == 0.8.0",
        "dateparser == 1.1.1",
        "pyqt6 == 6.3.1",
+        "defusedxml == 0.7.1",
    ],
    include_package_data=True,
    entry_points={"console_scripts": ["khoj = src.main:run"]},
--- a/src/search_type/image_search.py
+++ b/src/search_type/image_search.py
@ -13,8 +13,7 @@ from tqdm import trange
 import torch

 # Internal Packages
-from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
-import src.utils.exiftool as exiftool
+from src.utils.helpers import get_absolute_path, get_from_dict, resolve_absolute_path, load_model
 from src.utils.config import ImageSearchModel
 from src.utils.rawconfig import ImageContentConfig, ImageSearchConfig

@ -122,11 +121,12 @@ def compute_metadata_embeddings(image_names, encoder, embeddings_file, batch_siz


 def extract_metadata(image_name):
-    with exiftool.ExifTool() as et:
-        image_metadata = et.get_tags(["XMP:Subject", "XMP:Description"], str(image_name))
-        image_metadata_subjects = set([subject.split(":")[1] for subject in image_metadata.get("XMP:Subject", "") if ":" in subject])
+    image_xmp_metadata = Image.open(image_name).getxmp()
+    image_description = get_from_dict(image_xmp_metadata, 'xmpmeta', 'RDF', 'Description', 'description', 'Alt', 'li', 'text')
+    image_subjects = get_from_dict(image_xmp_metadata, 'xmpmeta', 'RDF', 'Description', 'subject', 'Bag', 'li')
+    image_metadata_subjects = set([subject.split(":")[1] for subject in image_subjects if ":" in subject])

-        image_processed_metadata = image_metadata.get("XMP:Description", "")
+    image_processed_metadata = image_description
    if len(image_metadata_subjects) > 0:
        image_processed_metadata += ". " + ", ".join(image_metadata_subjects)

--- a/src/utils/exiftool.py
+++ b/src/utils/exiftool.py
@ -1,325 +0,0 @@
-# -*- coding: utf-8 -*-
-# PyExifTool <http://github.com/smarnach/pyexiftool>
-# Copyright 2012 Sven Marnach
-
-# This file is part of PyExifTool.
-#
-# PyExifTool is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the licence, or
-# (at your option) any later version, or the BSD licence.
-#
-# PyExifTool is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See COPYING.GPL or COPYING.BSD for more details.
-
-"""
-PyExifTool is a Python library to communicate with an instance of Phil
-Harvey's excellent ExifTool_ command-line application.  The library
-provides the class :py:class:`ExifTool` that runs the command-line
-tool in batch mode and features methods to send commands to that
-program, including methods to extract meta-information from one or
-more image files.  Since ``exiftool`` is run in batch mode, only a
-single instance needs to be launched and can be reused for many
-queries.  This is much more efficient than launching a separate
-process for every single query.
-
-.. _ExifTool: http://www.sno.phy.queensu.ca/~phil/exiftool/
-
-The source code can be checked out from the github repository with
-
-::
-
-    git clone git://github.com/smarnach/pyexiftool.git
-
-Alternatively, you can download a tarball_.  There haven't been any
-releases yet.
-
-.. _tarball: https://github.com/smarnach/pyexiftool/tarball/master
-
-PyExifTool is licenced under GNU GPL version 3 or later.
-
-Example usage::
-
-    import exiftool
-
-    files = ["a.jpg", "b.png", "c.tif"]
-    with exiftool.ExifTool() as et:
-        metadata = et.get_metadata_batch(files)
-    for d in metadata:
-        print("{:20.20} {:20.20}".format(d["SourceFile"],
-                                         d["EXIF:DateTimeOriginal"]))
-"""
-
-from __future__ import unicode_literals
-
-import sys
-import subprocess
-import os
-import json
-import warnings
-import codecs
-
-try:        # Py3k compatibility
-    basestring
-except NameError:
-    basestring = (bytes, str)
-
-executable = "exiftool"
-"""The name of the executable to run.
-
-If the executable is not located in one of the paths listed in the
-``PATH`` environment variable, the full path should be given here.
-"""
-
-# Sentinel indicating the end of the output of a sequence of commands.
-# The standard value should be fine.
-sentinel = b"{ready}"
-
-# The block size when reading from exiftool.  The standard value
-# should be fine, though other values might give better performance in
-# some cases.
-block_size = 4096
-
-# This code has been adapted from Lib/os.py in the Python source tree
-# (sha1 265e36e277f3)
-def _fscodec():
-    encoding = sys.getfilesystemencoding()
-    errors = "strict"
-    if encoding != "mbcs":
-        try:
-            codecs.lookup_error("surrogateescape")
-        except LookupError:
-            pass
-        else:
-            errors = "surrogateescape"
-
-    def fsencode(filename):
-        """
-        Encode filename to the filesystem encoding with 'surrogateescape' error
-        handler, return bytes unchanged. On Windows, use 'strict' error handler if
-        the file system encoding is 'mbcs' (which is the default encoding).
-        """
-        if isinstance(filename, bytes):
-            return filename
-        else:
-            return filename.encode(encoding, errors)
-
-    return fsencode
-
-fsencode = _fscodec()
-del _fscodec
-
-class ExifTool(object):
-    """Run the `exiftool` command-line tool and communicate to it.
-
-    You can pass the file name of the ``exiftool`` executable as an
-    argument to the constructor.  The default value ``exiftool`` will
-    only work if the executable is in your ``PATH``.
-
-    Most methods of this class are only available after calling
-    :py:meth:`start()`, which will actually launch the subprocess.  To
-    avoid leaving the subprocess running, make sure to call
-    :py:meth:`terminate()` method when finished using the instance.
-    This method will also be implicitly called when the instance is
-    garbage collected, but there are circumstance when this won't ever
-    happen, so you should not rely on the implicit process
-    termination.  Subprocesses won't be automatically terminated if
-    the parent process exits, so a leaked subprocess will stay around
-    until manually killed.
-
-    A convenient way to make sure that the subprocess is terminated is
-    to use the :py:class:`ExifTool` instance as a context manager::
-
-        with ExifTool() as et:
-            ...
-
-    .. warning:: Note that there is no error handling.  Nonsensical
-       options will be silently ignored by exiftool, so there's not
-       much that can be done in that regard.  You should avoid passing
-       non-existent files to any of the methods, since this will lead
-       to undefied behaviour.
-
-    .. py:attribute:: running
-
-       A Boolean value indicating whether this instance is currently
-       associated with a running subprocess.
-    """
-
-    def __init__(self, executable_=None):
-        if executable_ is None:
-            self.executable = executable
-        else:
-            self.executable = executable_
-        self.running = False
-
-    def start(self):
-        """Start an ``exiftool`` process in batch mode for this instance.
-
-        This method will issue a ``UserWarning`` if the subprocess is
-        already running.  The process is started with the ``-G`` and
-        ``-n`` as common arguments, which are automatically included
-        in every command you run with :py:meth:`execute()`.
-        """
-        if self.running:
-            warnings.warn("ExifTool already running; doing nothing.")
-            return
-        with open(os.devnull, "w") as devnull:
-            self._process = subprocess.Popen(
-                [self.executable, "-stay_open", "True",  "-@", "-",
-                 "-common_args", "-G", "-n"],
-                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
-                stderr=devnull)
-        self.running = True
-
-    def terminate(self):
-        """Terminate the ``exiftool`` process of this instance.
-
-        If the subprocess isn't running, this method will do nothing.
-        """
-        if not self.running:
-            return
-        self._process.stdin.write(b"-stay_open\nFalse\n")
-        self._process.stdin.flush()
-        self._process.communicate()
-        del self._process
-        self.running = False
-
-    def __enter__(self):
-        self.start()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.terminate()
-
-    def __del__(self):
-        self.terminate()
-
-    def execute(self, *params):
-        """Execute the given batch of parameters with ``exiftool``.
-
-        This method accepts any number of parameters and sends them to
-        the attached ``exiftool`` process.  The process must be
-        running, otherwise ``ValueError`` is raised.  The final
-        ``-execute`` necessary to actually run the batch is appended
-        automatically; see the documentation of :py:meth:`start()` for
-        the common options.  The ``exiftool`` output is read up to the
-        end-of-output sentinel and returned as a raw ``bytes`` object,
-        excluding the sentinel.
-
-        The parameters must also be raw ``bytes``, in whatever
-        encoding exiftool accepts.  For filenames, this should be the
-        system's filesystem encoding.
-
-        .. note:: This is considered a low-level method, and should
-           rarely be needed by application developers.
-        """
-        if not self.running:
-            raise ValueError("ExifTool instance not running.")
-        self._process.stdin.write(b"\n".join(params + (b"-execute\n",)))
-        self._process.stdin.flush()
-        output = b""
-        fd = self._process.stdout.fileno()
-        while not output[-32:].strip().endswith(sentinel):
-            output += os.read(fd, block_size)
-        return output.strip()[:-len(sentinel)]
-
-    def execute_json(self, *params):
-        """Execute the given batch of parameters and parse the JSON output.
-
-        This method is similar to :py:meth:`execute()`.  It
-        automatically adds the parameter ``-j`` to request JSON output
-        from ``exiftool`` and parses the output.  The return value is
-        a list of dictionaries, mapping tag names to the corresponding
-        values.  All keys are Unicode strings with the tag names
-        including the ExifTool group name in the format <group>:<tag>.
-        The values can have multiple types.  All strings occurring as
-        values will be Unicode strings.  Each dictionary contains the
-        name of the file it corresponds to in the key ``"SourceFile"``.
-
-        The parameters to this function must be either raw strings
-        (type ``str`` in Python 2.x, type ``bytes`` in Python 3.x) or
-        Unicode strings (type ``unicode`` in Python 2.x, type ``str``
-        in Python 3.x).  Unicode strings will be encoded using
-        system's filesystem encoding.  This behaviour means you can
-        pass in filenames according to the convention of the
-        respective Python version – as raw strings in Python 2.x and
-        as Unicode strings in Python 3.x.
-        """
-        params = map(fsencode, params)
-        return json.loads(self.execute(b"-j", *params).decode("utf-8"))
-
-    def get_metadata_batch(self, filenames):
-        """Return all meta-data for the given files.
-
-        The return value will have the format described in the
-        documentation of :py:meth:`execute_json()`.
-        """
-        return self.execute_json(*filenames)
-
-    def get_metadata(self, filename):
-        """Return meta-data for a single file.
-
-        The returned dictionary has the format described in the
-        documentation of :py:meth:`execute_json()`.
-        """
-        return self.execute_json(filename)[0]
-
-    def get_tags_batch(self, tags, filenames):
-        """Return only specified tags for the given files.
-
-        The first argument is an iterable of tags.  The tag names may
-        include group names, as usual in the format <group>:<tag>.
-
-        The second argument is an iterable of file names.
-
-        The format of the return value is the same as for
-        :py:meth:`execute_json()`.
-        """
-        # Explicitly ruling out strings here because passing in a
-        # string would lead to strange and hard-to-find errors
-        if isinstance(tags, basestring):
-            raise TypeError("The argument 'tags' must be "
-                            "an iterable of strings")
-        if isinstance(filenames, basestring):
-            raise TypeError("The argument 'filenames' must be "
-                            "an iterable of strings")
-        params = ["-" + t for t in tags]
-        params.extend(filenames)
-        return self.execute_json(*params)
-
-    def get_tags(self, tags, filename):
-        """Return only specified tags for a single file.
-
-        The returned dictionary has the format described in the
-        documentation of :py:meth:`execute_json()`.
-        """
-        return self.get_tags_batch(tags, [filename])[0]
-
-    def get_tag_batch(self, tag, filenames):
-        """Extract a single tag from the given files.
-
-        The first argument is a single tag name, as usual in the
-        format <group>:<tag>.
-
-        The second argument is an iterable of file names.
-
-        The return value is a list of tag values or ``None`` for
-        non-existent tags, in the same order as ``filenames``.
-        """
-        data = self.get_tags_batch([tag], filenames)
-        result = []
-        for d in data:
-            d.pop("SourceFile")
-            result.append(next(iter(d.values()), None))
-        return result
-
-    def get_tag(self, tag, filename):
-        """Extract a single tag from a single file.
-
-        The return value is the value of the specified tag, or
-        ``None`` if this tag was not found in the file.
-        """
-        return self.get_tag_batch(tag, [filename])[0]
--- a/tests/data/images/guineapig_grass.jpg
+++ b/tests/data/images/guineapig_grass.jpg
--- a/tests/data/images/horse_dog.jpg
+++ b/tests/data/images/horse_dog.jpg
--- a/tests/data/images/kitten_park.jpg
+++ b/tests/data/images/kitten_park.jpg
--- a/tests/data/markdown/main_readme.md
+++ b/tests/data/markdown/main_readme.md
@ -96,12 +96,6 @@ docker-compose build --pull
        Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html)
        \[Required\]

-    3.  Install Exiftool \[Optional\]
-
-        ``` shell
-        sudo apt-get -y install libimage-exiftool-perl
-        ```
-
 2.  2\. Install Khoj

    ``` shell
@ -149,5 +143,3 @@ conda activate khoj
    Documentation](https://www.sbert.net/examples/applications/image-search/README.html)
 -   Charles Cave for [OrgNode
    Parser](http://members.optusnet.com.au/~charles57/GTD/orgnode.html)
-   Sven Marnach for
-    [PyExifTool](https://github.com/smarnach/pyexiftool/blob/master/exiftool.py)
--- a/tests/test_image_search.py
+++ b/tests/test_image_search.py
@ -25,6 +25,28 @@ def test_image_search_setup(content_config: ContentConfig, search_config: Search
    assert len(image_search_model.image_embeddings) == 3


+def test_image_metadata(content_config: ContentConfig):
+    "Verify XMP Description and Subjects Extracted from Image"
+    # Arrange
+    expected_metadata_image_name_pairs = [
+        (["Billi Ka Bacha.", "Cat", "Grass"], "kitten_park.jpg"),
+        (["Pasture.", "Horse", "Dog"], "horse_dog.jpg"),
+        (["Guinea Pig Eating Celery.", "Rodent", "Whiskers"], "guineapig_grass.jpg")]
+
+    test_image_paths = [
+        Path(content_config.image.input_directories[0] / image_name[1])
+        for image_name in expected_metadata_image_name_pairs
+    ]
+
+    for expected_metadata, test_image_path in zip(expected_metadata_image_name_pairs, test_image_paths):
+        # Act
+        actual_metadata = image_search.extract_metadata(test_image_path)
+
+        # Assert
+        for expected_snippet in expected_metadata[0]:
+            assert expected_snippet in actual_metadata
+
+
 # ----------------------------------------------------------------------------------------------------
 def test_image_search(content_config: ContentConfig, search_config: SearchConfig):
    # Arrange