From bf1ae038cb2b799f6f28568a95c8a5f8495a1d6f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 14 Sep 2022 13:22:27 +0300 Subject: [PATCH] Get XMP metadata from image using Pillow. Remove ExifTool dependency - Pillow already supports reading XMP metadata from Images - Removes need to maintain my fork of unmaintained PyExiftool - This also removes dependency on system Exiftool package for XMP metadata extraction - Add test to verify XMP metadata extracted from test images - Remove references to Exiftool from Documentation --- Dockerfile | 2 +- Readme.md | 7 +- setup.py | 1 + src/search_type/image_search.py | 20 +- src/utils/exiftool.py | 325 -------------------------- tests/data/images/guineapig_grass.jpg | Bin 173963 -> 177493 bytes tests/data/images/horse_dog.jpg | Bin 338007 -> 341658 bytes tests/data/images/kitten_park.jpg | Bin 274076 -> 277596 bytes tests/data/markdown/main_readme.md | 8 - tests/test_image_search.py | 22 ++ 10 files changed, 35 insertions(+), 350 deletions(-) delete mode 100644 src/utils/exiftool.py diff --git a/Dockerfile b/Dockerfile index e1cd9321..d1d23687 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ LABEL org.opencontainers.image.source https://github.com/debanjum/khoj # Install System Dependencies RUN apt-get update -y && \ - apt-get -y install libimage-exiftool-perl python3-pyqt5 + apt-get -y install python3-pyqt5 # Copy Application to Container COPY . /app diff --git a/Readme.md b/Readme.md index c47e07a5..da09b2ce 100644 --- a/Readme.md +++ b/Readme.md @@ -208,11 +208,7 @@ docker-compose build --pull #### Using Conda ##### 1. Install Dependencies -- [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) \[Required\] -- Install Exiftool \[Optional\] - ``` shell - sudo apt -y install libimage-exiftool-perl - ``` +- [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) ##### 2. Install Khoj ```shell @@ -254,4 +250,3 @@ pytest - Charles Cave for [OrgNode Parser](http://members.optusnet.com.au/~charles57/GTD/orgnode.html) - [Org.js](https://mooz.github.io/org-js/) to render Org-mode results on the Web interface - [Markdown-it](https://github.com/markdown-it/markdown-it) to render Markdown results on the Web interface -- Sven Marnach for [PyExifTool](https://github.com/smarnach/pyexiftool/blob/master/exiftool.py) diff --git a/setup.py b/setup.py index 3de0fb75..16fd0061 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ setup( "aiofiles == 0.8.0", "dateparser == 1.1.1", "pyqt6 == 6.3.1", + "defusedxml == 0.7.1", ], include_package_data=True, entry_points={"console_scripts": ["khoj = src.main:run"]}, diff --git a/src/search_type/image_search.py b/src/search_type/image_search.py index c9dcdd6b..a86cc42c 100644 --- a/src/search_type/image_search.py +++ b/src/search_type/image_search.py @@ -13,8 +13,7 @@ from tqdm import trange import torch # Internal Packages -from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model -import src.utils.exiftool as exiftool +from src.utils.helpers import get_absolute_path, get_from_dict, resolve_absolute_path, load_model from src.utils.config import ImageSearchModel from src.utils.rawconfig import ImageContentConfig, ImageSearchConfig @@ -122,17 +121,18 @@ def compute_metadata_embeddings(image_names, encoder, embeddings_file, batch_siz def extract_metadata(image_name): - with exiftool.ExifTool() as et: - image_metadata = et.get_tags(["XMP:Subject", "XMP:Description"], str(image_name)) - image_metadata_subjects = set([subject.split(":")[1] for subject in image_metadata.get("XMP:Subject", "") if ":" in subject]) + image_xmp_metadata = Image.open(image_name).getxmp() + image_description = get_from_dict(image_xmp_metadata, 'xmpmeta', 'RDF', 'Description', 'description', 'Alt', 'li', 'text') + image_subjects = get_from_dict(image_xmp_metadata, 'xmpmeta', 'RDF', 'Description', 'subject', 'Bag', 'li') + image_metadata_subjects = set([subject.split(":")[1] for subject in image_subjects if ":" in subject]) - image_processed_metadata = image_metadata.get("XMP:Description", "") - if len(image_metadata_subjects) > 0: - image_processed_metadata += ". " + ", ".join(image_metadata_subjects) + image_processed_metadata = image_description + if len(image_metadata_subjects) > 0: + image_processed_metadata += ". " + ", ".join(image_metadata_subjects) - logger.debug(f"{image_name}:\t{image_processed_metadata}") + logger.debug(f"{image_name}:\t{image_processed_metadata}") - return image_processed_metadata + return image_processed_metadata def query(raw_query, count, model: ImageSearchModel): diff --git a/src/utils/exiftool.py b/src/utils/exiftool.py deleted file mode 100644 index 8a11daa1..00000000 --- a/src/utils/exiftool.py +++ /dev/null @@ -1,325 +0,0 @@ -# -*- coding: utf-8 -*- -# PyExifTool -# Copyright 2012 Sven Marnach - -# This file is part of PyExifTool. -# -# PyExifTool is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the licence, or -# (at your option) any later version, or the BSD licence. -# -# PyExifTool is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -# -# See COPYING.GPL or COPYING.BSD for more details. - -""" -PyExifTool is a Python library to communicate with an instance of Phil -Harvey's excellent ExifTool_ command-line application. The library -provides the class :py:class:`ExifTool` that runs the command-line -tool in batch mode and features methods to send commands to that -program, including methods to extract meta-information from one or -more image files. Since ``exiftool`` is run in batch mode, only a -single instance needs to be launched and can be reused for many -queries. This is much more efficient than launching a separate -process for every single query. - -.. _ExifTool: http://www.sno.phy.queensu.ca/~phil/exiftool/ - -The source code can be checked out from the github repository with - -:: - - git clone git://github.com/smarnach/pyexiftool.git - -Alternatively, you can download a tarball_. There haven't been any -releases yet. - -.. _tarball: https://github.com/smarnach/pyexiftool/tarball/master - -PyExifTool is licenced under GNU GPL version 3 or later. - -Example usage:: - - import exiftool - - files = ["a.jpg", "b.png", "c.tif"] - with exiftool.ExifTool() as et: - metadata = et.get_metadata_batch(files) - for d in metadata: - print("{:20.20} {:20.20}".format(d["SourceFile"], - d["EXIF:DateTimeOriginal"])) -""" - -from __future__ import unicode_literals - -import sys -import subprocess -import os -import json -import warnings -import codecs - -try: # Py3k compatibility - basestring -except NameError: - basestring = (bytes, str) - -executable = "exiftool" -"""The name of the executable to run. - -If the executable is not located in one of the paths listed in the -``PATH`` environment variable, the full path should be given here. -""" - -# Sentinel indicating the end of the output of a sequence of commands. -# The standard value should be fine. -sentinel = b"{ready}" - -# The block size when reading from exiftool. The standard value -# should be fine, though other values might give better performance in -# some cases. -block_size = 4096 - -# This code has been adapted from Lib/os.py in the Python source tree -# (sha1 265e36e277f3) -def _fscodec(): - encoding = sys.getfilesystemencoding() - errors = "strict" - if encoding != "mbcs": - try: - codecs.lookup_error("surrogateescape") - except LookupError: - pass - else: - errors = "surrogateescape" - - def fsencode(filename): - """ - Encode filename to the filesystem encoding with 'surrogateescape' error - handler, return bytes unchanged. On Windows, use 'strict' error handler if - the file system encoding is 'mbcs' (which is the default encoding). - """ - if isinstance(filename, bytes): - return filename - else: - return filename.encode(encoding, errors) - - return fsencode - -fsencode = _fscodec() -del _fscodec - -class ExifTool(object): - """Run the `exiftool` command-line tool and communicate to it. - - You can pass the file name of the ``exiftool`` executable as an - argument to the constructor. The default value ``exiftool`` will - only work if the executable is in your ``PATH``. - - Most methods of this class are only available after calling - :py:meth:`start()`, which will actually launch the subprocess. To - avoid leaving the subprocess running, make sure to call - :py:meth:`terminate()` method when finished using the instance. - This method will also be implicitly called when the instance is - garbage collected, but there are circumstance when this won't ever - happen, so you should not rely on the implicit process - termination. Subprocesses won't be automatically terminated if - the parent process exits, so a leaked subprocess will stay around - until manually killed. - - A convenient way to make sure that the subprocess is terminated is - to use the :py:class:`ExifTool` instance as a context manager:: - - with ExifTool() as et: - ... - - .. warning:: Note that there is no error handling. Nonsensical - options will be silently ignored by exiftool, so there's not - much that can be done in that regard. You should avoid passing - non-existent files to any of the methods, since this will lead - to undefied behaviour. - - .. py:attribute:: running - - A Boolean value indicating whether this instance is currently - associated with a running subprocess. - """ - - def __init__(self, executable_=None): - if executable_ is None: - self.executable = executable - else: - self.executable = executable_ - self.running = False - - def start(self): - """Start an ``exiftool`` process in batch mode for this instance. - - This method will issue a ``UserWarning`` if the subprocess is - already running. The process is started with the ``-G`` and - ``-n`` as common arguments, which are automatically included - in every command you run with :py:meth:`execute()`. - """ - if self.running: - warnings.warn("ExifTool already running; doing nothing.") - return - with open(os.devnull, "w") as devnull: - self._process = subprocess.Popen( - [self.executable, "-stay_open", "True", "-@", "-", - "-common_args", "-G", "-n"], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=devnull) - self.running = True - - def terminate(self): - """Terminate the ``exiftool`` process of this instance. - - If the subprocess isn't running, this method will do nothing. - """ - if not self.running: - return - self._process.stdin.write(b"-stay_open\nFalse\n") - self._process.stdin.flush() - self._process.communicate() - del self._process - self.running = False - - def __enter__(self): - self.start() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.terminate() - - def __del__(self): - self.terminate() - - def execute(self, *params): - """Execute the given batch of parameters with ``exiftool``. - - This method accepts any number of parameters and sends them to - the attached ``exiftool`` process. The process must be - running, otherwise ``ValueError`` is raised. The final - ``-execute`` necessary to actually run the batch is appended - automatically; see the documentation of :py:meth:`start()` for - the common options. The ``exiftool`` output is read up to the - end-of-output sentinel and returned as a raw ``bytes`` object, - excluding the sentinel. - - The parameters must also be raw ``bytes``, in whatever - encoding exiftool accepts. For filenames, this should be the - system's filesystem encoding. - - .. note:: This is considered a low-level method, and should - rarely be needed by application developers. - """ - if not self.running: - raise ValueError("ExifTool instance not running.") - self._process.stdin.write(b"\n".join(params + (b"-execute\n",))) - self._process.stdin.flush() - output = b"" - fd = self._process.stdout.fileno() - while not output[-32:].strip().endswith(sentinel): - output += os.read(fd, block_size) - return output.strip()[:-len(sentinel)] - - def execute_json(self, *params): - """Execute the given batch of parameters and parse the JSON output. - - This method is similar to :py:meth:`execute()`. It - automatically adds the parameter ``-j`` to request JSON output - from ``exiftool`` and parses the output. The return value is - a list of dictionaries, mapping tag names to the corresponding - values. All keys are Unicode strings with the tag names - including the ExifTool group name in the format :. - The values can have multiple types. All strings occurring as - values will be Unicode strings. Each dictionary contains the - name of the file it corresponds to in the key ``"SourceFile"``. - - The parameters to this function must be either raw strings - (type ``str`` in Python 2.x, type ``bytes`` in Python 3.x) or - Unicode strings (type ``unicode`` in Python 2.x, type ``str`` - in Python 3.x). Unicode strings will be encoded using - system's filesystem encoding. This behaviour means you can - pass in filenames according to the convention of the - respective Python version – as raw strings in Python 2.x and - as Unicode strings in Python 3.x. - """ - params = map(fsencode, params) - return json.loads(self.execute(b"-j", *params).decode("utf-8")) - - def get_metadata_batch(self, filenames): - """Return all meta-data for the given files. - - The return value will have the format described in the - documentation of :py:meth:`execute_json()`. - """ - return self.execute_json(*filenames) - - def get_metadata(self, filename): - """Return meta-data for a single file. - - The returned dictionary has the format described in the - documentation of :py:meth:`execute_json()`. - """ - return self.execute_json(filename)[0] - - def get_tags_batch(self, tags, filenames): - """Return only specified tags for the given files. - - The first argument is an iterable of tags. The tag names may - include group names, as usual in the format :. - - The second argument is an iterable of file names. - - The format of the return value is the same as for - :py:meth:`execute_json()`. - """ - # Explicitly ruling out strings here because passing in a - # string would lead to strange and hard-to-find errors - if isinstance(tags, basestring): - raise TypeError("The argument 'tags' must be " - "an iterable of strings") - if isinstance(filenames, basestring): - raise TypeError("The argument 'filenames' must be " - "an iterable of strings") - params = ["-" + t for t in tags] - params.extend(filenames) - return self.execute_json(*params) - - def get_tags(self, tags, filename): - """Return only specified tags for a single file. - - The returned dictionary has the format described in the - documentation of :py:meth:`execute_json()`. - """ - return self.get_tags_batch(tags, [filename])[0] - - def get_tag_batch(self, tag, filenames): - """Extract a single tag from the given files. - - The first argument is a single tag name, as usual in the - format :. - - The second argument is an iterable of file names. - - The return value is a list of tag values or ``None`` for - non-existent tags, in the same order as ``filenames``. - """ - data = self.get_tags_batch([tag], filenames) - result = [] - for d in data: - d.pop("SourceFile") - result.append(next(iter(d.values()), None)) - return result - - def get_tag(self, tag, filename): - """Extract a single tag from a single file. - - The return value is the value of the specified tag, or - ``None`` if this tag was not found in the file. - """ - return self.get_tag_batch(tag, [filename])[0] diff --git a/tests/data/images/guineapig_grass.jpg b/tests/data/images/guineapig_grass.jpg index a94668daba67e53f37310b021f1cda7e25912757..a916c3820d625c8d381250c7cc33a6e1489f579d 100644 GIT binary patch delta 3602 zcmcJSJ#W)M7{{G7iI4>>h=G^kBnBg-dL}LD0k=7b>b#m%kAhy_^=V` z2ZGS;3XcRqxF%FfH-$2d?$BeWbXTa-Yl$9e>0S~WrH8^2U#I8ttx?QWbZiB>MKIXa zHL!tw-O|7sF^GG_Z{DY8rsLPsljz3Iq3=5gLd$Dl)$WsqVw*6;4s14-q0m_k9jpw9 z5BfyYEm`{e{f7i}RhD{ByUSf?jU4LR$1Ztu{Ayo09w=>9T3r!4Aqq{$Bt8bAX;>Z# zWr>?38nb060WR^Qfh@f+u|^PLW!SfE12kKWCoPG(xatAgTi+ac(3vcak{u3*jp5^l z?P{>uZnxob3$|Kys;GNMmXE`_Z ziAKHYK$e)H$A)||a_qn?l4aoq{dYu(q5LhMV{I&hm6y0r4771X#>~$WjBiJ1mnP}Y zP@p|qr8StP*mAMw%@Xtub#Fl23p5dX=Qz<#JhWz+Y`Jz|sfzucxHDz(?ss%;ma1pl zD!U&gMIWyV+6<;pUh}e}ZESk}v`l)Wk20;ku+tH5^{92c)#rVU^G|JF`bls({U%tL zd}MLk=}x@*h_3Jm+w1clS^gp@nXD5?mY>o7|3X5UZZQXb4$2((g($iC3!;qk(~vpT zf80Eor76i`CMe3!N6BE5g?#*?lmwRv()`6yQrHPdnx9e?4JRnm?F#X8Qws6tK*_;h zDCLZwg%qm)neD_iOPLU)rvJ~s3MB=c2ruK$hcW?8O=JF|DI9nq8P`6vwYaDj*cpp+W+6u!O*eG;>_YMP~)HBG0NQfY91b)yuI zV@)jo_2o9@EW~brE1NrN+0=DKhUS8UW@bH{W*9K|>Zbe}8u-?DS#QEi zoG!5pE1Y)(3o#p9^u}cGxAES=-1Rq~KfF(Vi#z=C(Tne2?#{ITg1INfjUu4XNmA$w z+Q(O))+|fsSXMPtf}~YoO4J&xE$D0}m1aSH#nuI}4lTL@tFp>7|9<{rXj$T!^1V{J zq(6i;dHukEI|t8p#e=$-lbDr4B5!kcLvKJ!pzVg@s3vFg3~}es80T4rCK{{GGaC(| z3ONp6_^zfYbS9fx&N9fwkoLIkwIcRFuc3%%yv4R!tyF6%r5RN=lgs7Ube7F#lSq*? z_f<=jAJJyxF<9fiz#^5lxOiW6xn^B1fD^j z?IvKT*(3$g0j>(w5U}7D5UIV8J?)T#L@=)g>-*vP@H8s0e$%8W`U%2af|ZWH)6>K8 zRzmuwpI5k%`YCrvV?`?*ci*Jdq7|Lqi=GcQV_!w&;W_xC1N|Cua0Bl$$>n!$+MNp& zFX{nl0L+Ncd7Jp#MSo{f42H;PXa>`G1 i{xU%o{ZDE!v6aHxL+afjg`XIDE}$p=a`SX!U}ORr2sE;pS)iR+ UfDwq9fS4JGS++9^uufM40CmV6I{*Lx diff --git a/tests/data/images/kitten_park.jpg b/tests/data/images/kitten_park.jpg index 98899d653c3cfb96671885df6c11b17aecbc213c..b3e2b4652859afd984dbefdb868129ddd7002741 100644 GIT binary patch delta 3571 zcmchaO>fgM7{`;1NvIMqiE@F9B@RZ2#%p1RM$IO+vB98#*rZ9jIkDH8IxkAxHS`j3 zn8c9}z}Mi!fiD7weFCoBz@yr`}X%H(B?c?l z>f_j=K8kOX=bytHn|&cX9U&(QG)!ERYL;uG5PPVpEg+?{9D3LqPytNZV~zoTeftgp zBL>`A>@IY@RoZ9I$3EQ}zuvaS1FKEo@=~o6>Y?q~RA3O=+zIs1fQh&+=dl?R-$TuIyNwoFsMTsn zk4Esp5jbo(!F{Pv1Bu2O4J@B|g1HWeHL>Xqg#lp=WYWYkV3bLFL!T$Y#6px)n>r$p zxo9G(2SA5dIysh$O)29k7s(=B$*jx3b&j2$0mBBNdw9qNTsmEl4vJ61S@L3(t}>kt zIuDB3!O(n9trV5Swt{=7xJ$3FNxA-3qQxow$-y#*ReWYUO625WQh{~K8Nsw%!gtud z>z6w1^jR>Veo!W~Wn<1GAY5Qm!iIJuU}H*vg+=LW>v-gp$!_>KC>7Z7-JwG)cc1zl z6rHHWnOto-vS0E^Z<2E8=dw9T<_C~n(EoNYR7*})RDhQBl$n{ Tv3^ij59%`q_1UlWYoD~g@EM0- delta 29 icmccfLtxHbfeE6`S9#m7@-hN36A&|Rzsk$ft_J|hbPNjs diff --git a/tests/data/markdown/main_readme.md b/tests/data/markdown/main_readme.md index 60e92410..45e289b1 100644 --- a/tests/data/markdown/main_readme.md +++ b/tests/data/markdown/main_readme.md @@ -96,12 +96,6 @@ docker-compose build --pull Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) \[Required\] - 3. Install Exiftool \[Optional\] - - ``` shell - sudo apt-get -y install libimage-exiftool-perl - ``` - 2. 2\. Install Khoj ``` shell @@ -149,5 +143,3 @@ conda activate khoj Documentation](https://www.sbert.net/examples/applications/image-search/README.html) - Charles Cave for [OrgNode Parser](http://members.optusnet.com.au/~charles57/GTD/orgnode.html) -- Sven Marnach for - [PyExifTool](https://github.com/smarnach/pyexiftool/blob/master/exiftool.py) diff --git a/tests/test_image_search.py b/tests/test_image_search.py index ad374da1..e1a56b44 100644 --- a/tests/test_image_search.py +++ b/tests/test_image_search.py @@ -25,6 +25,28 @@ def test_image_search_setup(content_config: ContentConfig, search_config: Search assert len(image_search_model.image_embeddings) == 3 +def test_image_metadata(content_config: ContentConfig): + "Verify XMP Description and Subjects Extracted from Image" + # Arrange + expected_metadata_image_name_pairs = [ + (["Billi Ka Bacha.", "Cat", "Grass"], "kitten_park.jpg"), + (["Pasture.", "Horse", "Dog"], "horse_dog.jpg"), + (["Guinea Pig Eating Celery.", "Rodent", "Whiskers"], "guineapig_grass.jpg")] + + test_image_paths = [ + Path(content_config.image.input_directories[0] / image_name[1]) + for image_name in expected_metadata_image_name_pairs + ] + + for expected_metadata, test_image_path in zip(expected_metadata_image_name_pairs, test_image_paths): + # Act + actual_metadata = image_search.extract_metadata(test_image_path) + + # Assert + for expected_snippet in expected_metadata[0]: + assert expected_snippet in actual_metadata + + # ---------------------------------------------------------------------------------------------------- def test_image_search(content_config: ContentConfig, search_config: SearchConfig): # Arrange