diff --git a/Dockerfile b/Dockerfile index e1cd9321..d1d23687 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ LABEL org.opencontainers.image.source https://github.com/debanjum/khoj # Install System Dependencies RUN apt-get update -y && \ - apt-get -y install libimage-exiftool-perl python3-pyqt5 + apt-get -y install python3-pyqt5 # Copy Application to Container COPY . /app diff --git a/Readme.md b/Readme.md index c47e07a5..da09b2ce 100644 --- a/Readme.md +++ b/Readme.md @@ -208,11 +208,7 @@ docker-compose build --pull #### Using Conda ##### 1. Install Dependencies -- [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) \[Required\] -- Install Exiftool \[Optional\] - ``` shell - sudo apt -y install libimage-exiftool-perl - ``` +- [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) ##### 2. Install Khoj ```shell @@ -254,4 +250,3 @@ pytest - Charles Cave for [OrgNode Parser](http://members.optusnet.com.au/~charles57/GTD/orgnode.html) - [Org.js](https://mooz.github.io/org-js/) to render Org-mode results on the Web interface - [Markdown-it](https://github.com/markdown-it/markdown-it) to render Markdown results on the Web interface -- Sven Marnach for [PyExifTool](https://github.com/smarnach/pyexiftool/blob/master/exiftool.py) diff --git a/setup.py b/setup.py index 3de0fb75..16fd0061 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ setup( "aiofiles == 0.8.0", "dateparser == 1.1.1", "pyqt6 == 6.3.1", + "defusedxml == 0.7.1", ], include_package_data=True, entry_points={"console_scripts": ["khoj = src.main:run"]}, diff --git a/src/main.py b/src/main.py index dc3b12c3..378758b2 100644 --- a/src/main.py +++ b/src/main.py @@ -8,6 +8,7 @@ from platform import system # Ignore non-actionable warnings warnings.filterwarnings("ignore", message=r'snapshot_download.py has been made private', category=FutureWarning) +warnings.filterwarnings("ignore", message=r'legacy way to download files from the HF hub,', category=FutureWarning) # External Packages import uvicorn diff --git a/src/search_filter/base_filter.py b/src/search_filter/base_filter.py index 2550b32e..5b0ed62c 100644 --- a/src/search_filter/base_filter.py +++ b/src/search_filter/base_filter.py @@ -4,13 +4,10 @@ from abc import ABC, abstractmethod class BaseFilter(ABC): @abstractmethod - def load(self, *args, **kwargs): - pass + def load(self, *args, **kwargs): ... @abstractmethod - def can_filter(self, raw_query:str) -> bool: - pass + def can_filter(self, raw_query:str) -> bool: ... @abstractmethod - def apply(self, query:str, raw_entries:list[str]) -> tuple[str, set[int]]: - pass \ No newline at end of file + def apply(self, query:str, raw_entries:list[str]) -> tuple[str, set[int]]: ... diff --git a/src/search_type/image_search.py b/src/search_type/image_search.py index c9dcdd6b..a86cc42c 100644 --- a/src/search_type/image_search.py +++ b/src/search_type/image_search.py @@ -13,8 +13,7 @@ from tqdm import trange import torch # Internal Packages -from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model -import src.utils.exiftool as exiftool +from src.utils.helpers import get_absolute_path, get_from_dict, resolve_absolute_path, load_model from src.utils.config import ImageSearchModel from src.utils.rawconfig import ImageContentConfig, ImageSearchConfig @@ -122,17 +121,18 @@ def compute_metadata_embeddings(image_names, encoder, embeddings_file, batch_siz def extract_metadata(image_name): - with exiftool.ExifTool() as et: - image_metadata = et.get_tags(["XMP:Subject", "XMP:Description"], str(image_name)) - image_metadata_subjects = set([subject.split(":")[1] for subject in image_metadata.get("XMP:Subject", "") if ":" in subject]) + image_xmp_metadata = Image.open(image_name).getxmp() + image_description = get_from_dict(image_xmp_metadata, 'xmpmeta', 'RDF', 'Description', 'description', 'Alt', 'li', 'text') + image_subjects = get_from_dict(image_xmp_metadata, 'xmpmeta', 'RDF', 'Description', 'subject', 'Bag', 'li') + image_metadata_subjects = set([subject.split(":")[1] for subject in image_subjects if ":" in subject]) - image_processed_metadata = image_metadata.get("XMP:Description", "") - if len(image_metadata_subjects) > 0: - image_processed_metadata += ". " + ", ".join(image_metadata_subjects) + image_processed_metadata = image_description + if len(image_metadata_subjects) > 0: + image_processed_metadata += ". " + ", ".join(image_metadata_subjects) - logger.debug(f"{image_name}:\t{image_processed_metadata}") + logger.debug(f"{image_name}:\t{image_processed_metadata}") - return image_processed_metadata + return image_processed_metadata def query(raw_query, count, model: ImageSearchModel): diff --git a/src/utils/exiftool.py b/src/utils/exiftool.py deleted file mode 100644 index 8a11daa1..00000000 --- a/src/utils/exiftool.py +++ /dev/null @@ -1,325 +0,0 @@ -# -*- coding: utf-8 -*- -# PyExifTool -# Copyright 2012 Sven Marnach - -# This file is part of PyExifTool. -# -# PyExifTool is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the licence, or -# (at your option) any later version, or the BSD licence. -# -# PyExifTool is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -# -# See COPYING.GPL or COPYING.BSD for more details. - -""" -PyExifTool is a Python library to communicate with an instance of Phil -Harvey's excellent ExifTool_ command-line application. The library -provides the class :py:class:`ExifTool` that runs the command-line -tool in batch mode and features methods to send commands to that -program, including methods to extract meta-information from one or -more image files. Since ``exiftool`` is run in batch mode, only a -single instance needs to be launched and can be reused for many -queries. This is much more efficient than launching a separate -process for every single query. - -.. _ExifTool: http://www.sno.phy.queensu.ca/~phil/exiftool/ - -The source code can be checked out from the github repository with - -:: - - git clone git://github.com/smarnach/pyexiftool.git - -Alternatively, you can download a tarball_. There haven't been any -releases yet. - -.. _tarball: https://github.com/smarnach/pyexiftool/tarball/master - -PyExifTool is licenced under GNU GPL version 3 or later. - -Example usage:: - - import exiftool - - files = ["a.jpg", "b.png", "c.tif"] - with exiftool.ExifTool() as et: - metadata = et.get_metadata_batch(files) - for d in metadata: - print("{:20.20} {:20.20}".format(d["SourceFile"], - d["EXIF:DateTimeOriginal"])) -""" - -from __future__ import unicode_literals - -import sys -import subprocess -import os -import json -import warnings -import codecs - -try: # Py3k compatibility - basestring -except NameError: - basestring = (bytes, str) - -executable = "exiftool" -"""The name of the executable to run. - -If the executable is not located in one of the paths listed in the -``PATH`` environment variable, the full path should be given here. -""" - -# Sentinel indicating the end of the output of a sequence of commands. -# The standard value should be fine. -sentinel = b"{ready}" - -# The block size when reading from exiftool. The standard value -# should be fine, though other values might give better performance in -# some cases. -block_size = 4096 - -# This code has been adapted from Lib/os.py in the Python source tree -# (sha1 265e36e277f3) -def _fscodec(): - encoding = sys.getfilesystemencoding() - errors = "strict" - if encoding != "mbcs": - try: - codecs.lookup_error("surrogateescape") - except LookupError: - pass - else: - errors = "surrogateescape" - - def fsencode(filename): - """ - Encode filename to the filesystem encoding with 'surrogateescape' error - handler, return bytes unchanged. On Windows, use 'strict' error handler if - the file system encoding is 'mbcs' (which is the default encoding). - """ - if isinstance(filename, bytes): - return filename - else: - return filename.encode(encoding, errors) - - return fsencode - -fsencode = _fscodec() -del _fscodec - -class ExifTool(object): - """Run the `exiftool` command-line tool and communicate to it. - - You can pass the file name of the ``exiftool`` executable as an - argument to the constructor. The default value ``exiftool`` will - only work if the executable is in your ``PATH``. - - Most methods of this class are only available after calling - :py:meth:`start()`, which will actually launch the subprocess. To - avoid leaving the subprocess running, make sure to call - :py:meth:`terminate()` method when finished using the instance. - This method will also be implicitly called when the instance is - garbage collected, but there are circumstance when this won't ever - happen, so you should not rely on the implicit process - termination. Subprocesses won't be automatically terminated if - the parent process exits, so a leaked subprocess will stay around - until manually killed. - - A convenient way to make sure that the subprocess is terminated is - to use the :py:class:`ExifTool` instance as a context manager:: - - with ExifTool() as et: - ... - - .. warning:: Note that there is no error handling. Nonsensical - options will be silently ignored by exiftool, so there's not - much that can be done in that regard. You should avoid passing - non-existent files to any of the methods, since this will lead - to undefied behaviour. - - .. py:attribute:: running - - A Boolean value indicating whether this instance is currently - associated with a running subprocess. - """ - - def __init__(self, executable_=None): - if executable_ is None: - self.executable = executable - else: - self.executable = executable_ - self.running = False - - def start(self): - """Start an ``exiftool`` process in batch mode for this instance. - - This method will issue a ``UserWarning`` if the subprocess is - already running. The process is started with the ``-G`` and - ``-n`` as common arguments, which are automatically included - in every command you run with :py:meth:`execute()`. - """ - if self.running: - warnings.warn("ExifTool already running; doing nothing.") - return - with open(os.devnull, "w") as devnull: - self._process = subprocess.Popen( - [self.executable, "-stay_open", "True", "-@", "-", - "-common_args", "-G", "-n"], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=devnull) - self.running = True - - def terminate(self): - """Terminate the ``exiftool`` process of this instance. - - If the subprocess isn't running, this method will do nothing. - """ - if not self.running: - return - self._process.stdin.write(b"-stay_open\nFalse\n") - self._process.stdin.flush() - self._process.communicate() - del self._process - self.running = False - - def __enter__(self): - self.start() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.terminate() - - def __del__(self): - self.terminate() - - def execute(self, *params): - """Execute the given batch of parameters with ``exiftool``. - - This method accepts any number of parameters and sends them to - the attached ``exiftool`` process. The process must be - running, otherwise ``ValueError`` is raised. The final - ``-execute`` necessary to actually run the batch is appended - automatically; see the documentation of :py:meth:`start()` for - the common options. The ``exiftool`` output is read up to the - end-of-output sentinel and returned as a raw ``bytes`` object, - excluding the sentinel. - - The parameters must also be raw ``bytes``, in whatever - encoding exiftool accepts. For filenames, this should be the - system's filesystem encoding. - - .. note:: This is considered a low-level method, and should - rarely be needed by application developers. - """ - if not self.running: - raise ValueError("ExifTool instance not running.") - self._process.stdin.write(b"\n".join(params + (b"-execute\n",))) - self._process.stdin.flush() - output = b"" - fd = self._process.stdout.fileno() - while not output[-32:].strip().endswith(sentinel): - output += os.read(fd, block_size) - return output.strip()[:-len(sentinel)] - - def execute_json(self, *params): - """Execute the given batch of parameters and parse the JSON output. - - This method is similar to :py:meth:`execute()`. It - automatically adds the parameter ``-j`` to request JSON output - from ``exiftool`` and parses the output. The return value is - a list of dictionaries, mapping tag names to the corresponding - values. All keys are Unicode strings with the tag names - including the ExifTool group name in the format :. - The values can have multiple types. All strings occurring as - values will be Unicode strings. Each dictionary contains the - name of the file it corresponds to in the key ``"SourceFile"``. - - The parameters to this function must be either raw strings - (type ``str`` in Python 2.x, type ``bytes`` in Python 3.x) or - Unicode strings (type ``unicode`` in Python 2.x, type ``str`` - in Python 3.x). Unicode strings will be encoded using - system's filesystem encoding. This behaviour means you can - pass in filenames according to the convention of the - respective Python version – as raw strings in Python 2.x and - as Unicode strings in Python 3.x. - """ - params = map(fsencode, params) - return json.loads(self.execute(b"-j", *params).decode("utf-8")) - - def get_metadata_batch(self, filenames): - """Return all meta-data for the given files. - - The return value will have the format described in the - documentation of :py:meth:`execute_json()`. - """ - return self.execute_json(*filenames) - - def get_metadata(self, filename): - """Return meta-data for a single file. - - The returned dictionary has the format described in the - documentation of :py:meth:`execute_json()`. - """ - return self.execute_json(filename)[0] - - def get_tags_batch(self, tags, filenames): - """Return only specified tags for the given files. - - The first argument is an iterable of tags. The tag names may - include group names, as usual in the format :. - - The second argument is an iterable of file names. - - The format of the return value is the same as for - :py:meth:`execute_json()`. - """ - # Explicitly ruling out strings here because passing in a - # string would lead to strange and hard-to-find errors - if isinstance(tags, basestring): - raise TypeError("The argument 'tags' must be " - "an iterable of strings") - if isinstance(filenames, basestring): - raise TypeError("The argument 'filenames' must be " - "an iterable of strings") - params = ["-" + t for t in tags] - params.extend(filenames) - return self.execute_json(*params) - - def get_tags(self, tags, filename): - """Return only specified tags for a single file. - - The returned dictionary has the format described in the - documentation of :py:meth:`execute_json()`. - """ - return self.get_tags_batch(tags, [filename])[0] - - def get_tag_batch(self, tag, filenames): - """Extract a single tag from the given files. - - The first argument is a single tag name, as usual in the - format :. - - The second argument is an iterable of file names. - - The return value is a list of tag values or ``None`` for - non-existent tags, in the same order as ``filenames``. - """ - data = self.get_tags_batch([tag], filenames) - result = [] - for d in data: - d.pop("SourceFile") - result.append(next(iter(d.values()), None)) - return result - - def get_tag(self, tag, filename): - """Extract a single tag from a single file. - - The return value is the value of the specified tag, or - ``None`` if this tag was not found in the file. - """ - return self.get_tag_batch(tag, [filename])[0] diff --git a/tests/data/images/guineapig_grass.jpg b/tests/data/images/guineapig_grass.jpg index a94668da..a916c382 100644 Binary files a/tests/data/images/guineapig_grass.jpg and b/tests/data/images/guineapig_grass.jpg differ diff --git a/tests/data/images/horse_dog.jpg b/tests/data/images/horse_dog.jpg index 3937613a..9caf69ec 100644 Binary files a/tests/data/images/horse_dog.jpg and b/tests/data/images/horse_dog.jpg differ diff --git a/tests/data/images/kitten_park.jpg b/tests/data/images/kitten_park.jpg index 98899d65..b3e2b465 100644 Binary files a/tests/data/images/kitten_park.jpg and b/tests/data/images/kitten_park.jpg differ diff --git a/tests/data/markdown/main_readme.md b/tests/data/markdown/main_readme.md index 60e92410..45e289b1 100644 --- a/tests/data/markdown/main_readme.md +++ b/tests/data/markdown/main_readme.md @@ -96,12 +96,6 @@ docker-compose build --pull Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) \[Required\] - 3. Install Exiftool \[Optional\] - - ``` shell - sudo apt-get -y install libimage-exiftool-perl - ``` - 2. 2\. Install Khoj ``` shell @@ -149,5 +143,3 @@ conda activate khoj Documentation](https://www.sbert.net/examples/applications/image-search/README.html) - Charles Cave for [OrgNode Parser](http://members.optusnet.com.au/~charles57/GTD/orgnode.html) -- Sven Marnach for - [PyExifTool](https://github.com/smarnach/pyexiftool/blob/master/exiftool.py) diff --git a/tests/test_image_search.py b/tests/test_image_search.py index ad374da1..e1a56b44 100644 --- a/tests/test_image_search.py +++ b/tests/test_image_search.py @@ -25,6 +25,28 @@ def test_image_search_setup(content_config: ContentConfig, search_config: Search assert len(image_search_model.image_embeddings) == 3 +def test_image_metadata(content_config: ContentConfig): + "Verify XMP Description and Subjects Extracted from Image" + # Arrange + expected_metadata_image_name_pairs = [ + (["Billi Ka Bacha.", "Cat", "Grass"], "kitten_park.jpg"), + (["Pasture.", "Horse", "Dog"], "horse_dog.jpg"), + (["Guinea Pig Eating Celery.", "Rodent", "Whiskers"], "guineapig_grass.jpg")] + + test_image_paths = [ + Path(content_config.image.input_directories[0] / image_name[1]) + for image_name in expected_metadata_image_name_pairs + ] + + for expected_metadata, test_image_path in zip(expected_metadata_image_name_pairs, test_image_paths): + # Act + actual_metadata = image_search.extract_metadata(test_image_path) + + # Assert + for expected_snippet in expected_metadata[0]: + assert expected_snippet in actual_metadata + + # ---------------------------------------------------------------------------------------------------- def test_image_search(content_config: ContentConfig, search_config: SearchConfig): # Arrange