From 7923903d2108911e18f1606676f025caaf5b96e1 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 29 Mar 2024 18:22:41 +0530 Subject: [PATCH] Improve date filter regexes to extract structured, natural, partial dates - Much faster than using dateparser - It took 2x-4x as long for the improved regexes to extract 1-15% more dates - Whereas it took 33x to 100x as long for dateparser to extract 65%-400% more dates - Improve date extractor tests to test deduping dates, natural, structured date extraction from content - Extract some natural, partial dates and more structured dates Using regex is much faster than using dateparser. It's a little crude but should pay off in performance. Supports dates of form: - (Day-of-Month) Month|AbbreviatedMonth Year|2DigitYear - Month|AbbreviatedMonth (Day-of-Month) Year|2DigitYear --- src/khoj/search_filter/date_filter.py | 97 ++++++++++++++++++--------- tests/test_date_filter.py | 70 ++++++++++++++++--- 2 files changed, 128 insertions(+), 39 deletions(-) diff --git a/src/khoj/search_filter/date_filter.py b/src/khoj/search_filter/date_filter.py index 6d1d0e18..0f53abbe 100644 --- a/src/khoj/search_filter/date_filter.py +++ b/src/khoj/search_filter/date_filter.py @@ -1,9 +1,10 @@ +import calendar import logging import re from collections import defaultdict from datetime import datetime, timedelta from math import inf -from typing import List +from typing import List, Tuple import dateparser as dtparse from dateparser.search import search_dates @@ -23,48 +24,82 @@ class DateFilter(BaseFilter): # - dt>="last week" # - dt:"2 years ago" date_regex = r"dt([:><=]{1,2})[\"'](.*?)[\"']" - raw_date_regex = r"\d{4}[-/]\d{2}[-/]\d{2}" def __init__(self, entry_key="compiled"): self.entry_key = entry_key self.date_to_entry_ids = defaultdict(set) self.cache = LRU() + self.dtparser_regexes = self.compile_date_regexes() + self.dtparser_ordinal_suffixes = re.compile(r"(st|nd|rd|th)") self.dtparser_settings = { "PREFER_DAY_OF_MONTH": "first", "DATE_ORDER": 
"YMD", # Prefer YMD and DMY over MDY when parsing ambiguous dates } + def compile_date_regexes(self): + months = calendar.month_name[1:] + abbr_months = calendar.month_abbr[1:] + # Extract natural dates from content like 1st April 1984, 31 April 84, Apr 4th 1984, 13 Apr 84 + dBY_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(months) + r") \d{4}\b", re.IGNORECASE) + dBy_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(months) + r") \d{2}\b", re.IGNORECASE) + BdY_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{1,2}(?:st|nd|rd|th)? \d{4}\b", re.IGNORECASE) + Bdy_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{1,2}(?:st|nd|rd|th)? \d{2}\b", re.IGNORECASE) + dbY_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(abbr_months) + r") \d{4}\b", re.IGNORECASE) + dby_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(abbr_months) + r") \d{2}\b", re.IGNORECASE) + bdY_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{1,2}(?:st|nd|rd|th)? \d{4}\b", re.IGNORECASE) + bdy_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{1,2}(?:st|nd|rd|th)? 
\d{2}\b", re.IGNORECASE) + # Extract natural of form Month, Year like January 2021, Jan 2021, Jan 21 + BY_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{4}\b", re.IGNORECASE) + By_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{2}\b", re.IGNORECASE) + bY_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{4}\b", re.IGNORECASE) + by_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{2}\b", re.IGNORECASE) + # Extract structured dates from content like 1984-04-01, 1984/04/01, 01-04-1984, 01/04/1984, 01.04.1984, 01-04-84, 01/04/84 + Ymd_date_regex = re.compile(r"\b\d{4}[-\/]\d{2}[-\/]\d{2}\b", re.IGNORECASE) + dmY_date_regex = re.compile(r"\b\d{2}[-\/]\d{2}[-\/]\d{4}\b", re.IGNORECASE) + dmy_date_regex = re.compile(r"\b\d{2}[-\/]\d{2}[-\/]\d{2}\b", re.IGNORECASE) + dmY_dot_date_regex = re.compile(r"\b\d{2}[\.]\d{2}[\.]\d{4}\b", re.IGNORECASE) + + # Combine date formatter and date identifier regex pairs + dtparser_regexes: List[Tuple[str, re.Pattern[str]]] = [ + # Structured dates + ("%Y-%m-%d", Ymd_date_regex), + ("%Y/%m/%d", Ymd_date_regex), + ("%d-%m-%Y", dmY_date_regex), + ("%d/%m/%Y", dmY_date_regex), + ("%d.%m.%Y", dmY_dot_date_regex), + ("%d-%m-%y", dmy_date_regex), + ("%d/%m/%y", dmy_date_regex), + # Natural dates + ("%d %B %Y", dBY_regex), + ("%d %B %y", dBy_regex), + ("%B %d %Y", BdY_regex), + ("%B %d %y", Bdy_regex), + ("%d %b %Y", dbY_regex), + ("%d %b %y", dby_regex), + ("%b %d %Y", bdY_regex), + ("%b %d %y", bdy_regex), + # Partial natural dates + ("%B %Y", BY_regex), + ("%B %y", By_regex), + ("%b %Y", bY_regex), + ("%b %y", by_regex), + ] + return dtparser_regexes + def extract_dates(self, content): - "Extract all natural and structured dates across formats and locales from content" - excluded_parsers = ["relative-time"] - dtparser_settings = merge_dicts( - { - # Exclude relative dates for date extraction from content as very ambiguous - "PARSERS": [parser for parser in default_parsers if parser not in 
excluded_parsers], - "RETURN_AS_TIMEZONE_AWARE": False, - }, - self.dtparser_settings, - ) - try: - valid_dates = [ - dt_item[1] for dt_item in search_dates(content, settings=dtparser_settings, languages=["en"]) or [] - ] - return valid_dates - except Exception as e: - logger.warning( - f"Failed to extract natural dates from content with error: {e}. Fallback to regex based extraction." - ) + "Extract natural and structured dates from content" + valid_dates = set() + for date_format, date_regex in self.dtparser_regexes: + matched_dates = date_regex.findall(content) + for date_str in matched_dates: + # Remove ordinal suffixes to parse date + date_str = self.dtparser_ordinal_suffixes.sub("", date_str) + try: + valid_dates.add(datetime.strptime(date_str, date_format)) + except ValueError: + continue - # Fallback to extracting YYYY-MM-DD format dates from content - pattern_matched_dates = re.findall(self.raw_date_regex, content) - valid_dates = [] - for date_str in pattern_matched_dates: - try: - valid_dates.append(datetime.strptime(date_str, "%Y-%m-%d")) - except ValueError: - continue - - return valid_dates + return list(valid_dates) def get_filter_terms(self, query: str) -> List[str]: "Get all filter terms in query" diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index c990a1fa..1e11348f 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -147,16 +147,70 @@ def test_date_extraction(): assert extracted_dates == [], "Expected relative date to be ignored" extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 1984-04-01 tail") - assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only YMD structured date to be extracted" + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only Y-m-d structured date to be extracted" - extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 01-04-1984 tail") - assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], 
"Expected DMY structured date to be extracted" + extracted_dates = DateFilter().extract_dates("head SCHEDULED: 01-04-1984 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d-m-Y structured date to be extracted" - extracted_dates = DateFilter().extract_dates("head Updates from April 1984 tail") - assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected natural date to be extracted" + extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 1984/04/01 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only Y/m/d structured date to be extracted" - extracted_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 mer 09:50]--[1984-04-01 mer 10:10] => 24:20") - expected_dates = [datetime(1984, 4, 1, 9, 50, 0), datetime(1984, 4, 1, 10, 10, 0)] + extracted_dates = DateFilter().extract_dates("head SCHEDULED: 01/04/1984 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d/m/Y structured date to be extracted" + + extracted_dates = DateFilter().extract_dates("head DEADLINE: 01.04.1984 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d.m.Y structured date to be extracted" + + extracted_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 Sun 09:50]--[1984-04-01 Sun 10:10] => 24:20") + assert extracted_dates == [ + datetime(1984, 4, 1, 0, 0, 0) + ], "Expected single deduplicated date extracted from logbook entry" + + extracted_dates = DateFilter().extract_dates("CLOCK: [1984/03/31 mer 09:50]--[1984/04/01 mer 10:10] => 24:20") + expected_dates = [datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 3, 31, 0, 0, 0)] assert all( [dt in extracted_dates for dt in expected_dates] - ), "Expected multiple non-english dates extracted from logbook entry" + ), "Expected multiple different dates extracted from logbook entry" + + +def test_natual_date_extraction(): + extracted_dates = DateFilter().extract_dates("head 1 April 1984 tail") + assert 
datetime(1984, 4, 1, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted" + + extracted_dates = DateFilter().extract_dates("head 1st April 1984 tail") + assert datetime(1984, 4, 1, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted" + + extracted_dates = DateFilter().extract_dates("head 2nd Apr 1984 tail") + assert datetime(1984, 4, 2, 0, 0, 0) in extracted_dates, "Expected natural date with short month to be extracted" + + extracted_dates = DateFilter().extract_dates("head 4th Apr 1984 tail") + assert datetime(1984, 4, 4, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted" + + extracted_dates = DateFilter().extract_dates("head 11th april 1984 tail") + assert ( + datetime(1984, 4, 11, 0, 0, 0) in extracted_dates + ), "Expected natural date with lowercase month to be extracted" + + extracted_dates = DateFilter().extract_dates("head 23rd april 84 tail") + assert datetime(1984, 4, 23, 0, 0, 0) in extracted_dates, "Expected natural date with 2-digit year to be extracted" + + extracted_dates = DateFilter().extract_dates("head 31st march 84 tail") + assert datetime(1984, 3, 31, 0, 0, 0) in extracted_dates, "Expected natural date with 2-digit year to be extracted" + + extracted_dates = DateFilter().extract_dates("head April 1984 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected partial natural date to be extracted" + + extracted_dates = DateFilter().extract_dates("head Apr 1984 tail") + assert extracted_dates == [ + datetime(1984, 4, 1, 0, 0, 0) + ], "Expected partial natural date with short month to be extracted" + + extracted_dates = DateFilter().extract_dates("head apr 1984 tail") + assert extracted_dates == [ + datetime(1984, 4, 1, 0, 0, 0) + ], "Expected partial natural date with lowercase month to be extracted" + + extracted_dates = DateFilter().extract_dates("head apr 84 tail") + assert extracted_dates == [ + datetime(1984, 4, 1, 0, 0, 0) + ], "Expected partial natural date with 2-digit 
year to be extracted"