From 104eeea27453bb1d1ca0f125e634a15c40402bf1 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 29 Mar 2024 13:04:49 +0530 Subject: [PATCH 1/2] Extract natural language and locale specific dates in content Previously we just extracted dates in YYYY-MM-DD format from content for date filtering during search. Use dateparser to extract dates across locales and natural language. This should improve notes returned as context when chat searches knowledge base with date filters. Fall back to regex for date parsing from content if dateparser fails - Limit natural date extractor capabilities to improve performance - Assume language is English. Language detection otherwise takes a REALLY long time - Do not extract unix timestamps, timezone - This isn't required, as we just use the date and approximate dates as UTC --- src/khoj/search_filter/date_filter.py | 46 +++++++++++++++++++-------- tests/test_date_filter.py | 28 +++++++++++++++- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/src/khoj/search_filter/date_filter.py b/src/khoj/search_filter/date_filter.py index 85b77076..6d1d0e18 100644 --- a/src/khoj/search_filter/date_filter.py +++ b/src/khoj/search_filter/date_filter.py @@ -6,10 +6,12 @@ from math import inf from typing import List import dateparser as dtparse +from dateparser.search import search_dates +from dateparser_data.settings import default_parsers from dateutil.relativedelta import relativedelta from khoj.search_filter.base_filter import BaseFilter -from khoj.utils.helpers import LRU, timer +from khoj.utils.helpers import LRU, merge_dicts, timer logger = logging.getLogger(__name__) @@ -21,17 +23,40 @@ class DateFilter(BaseFilter): # - dt>="last week" # - dt:"2 years ago" date_regex = r"dt([:><=]{1,2})[\"'](.*?)[\"']" - raw_date_regex = r"\d{4}-\d{2}-\d{2}" + raw_date_regex = r"\d{4}[-/]\d{2}[-/]\d{2}" def __init__(self, entry_key="compiled"): self.entry_key = entry_key self.date_to_entry_ids = defaultdict(set) self.cache = LRU() + 
self.dtparser_settings = { + "PREFER_DAY_OF_MONTH": "first", + "DATE_ORDER": "YMD", # Prefer YMD and DMY over MDY when parsing ambiguous dates + } def extract_dates(self, content): - pattern_matched_dates = re.findall(self.raw_date_regex, content) + "Extract all natural and structured dates across formats and locales from content" + excluded_parsers = ["relative-time"] + dtparser_settings = merge_dicts( + { + # Exclude relative dates for date extraction from content as very ambiguous + "PARSERS": [parser for parser in default_parsers if parser not in excluded_parsers], + "RETURN_AS_TIMEZONE_AWARE": False, + }, + self.dtparser_settings, + ) + try: + valid_dates = [ + dt_item[1] for dt_item in search_dates(content, settings=dtparser_settings, languages=["en"]) or [] + ] + return valid_dates + except Exception as e: + logger.warning( + f"Failed to extract natural dates from content with error: {e}. Fallback to regex based extraction." + ) - # Filter down to valid dates + # Fallback to extracting YYYY-MM-DD format dates from content + pattern_matched_dates = re.findall(self.raw_date_regex, content) valid_dates = [] for date_str in pattern_matched_dates: try: @@ -120,18 +145,13 @@ class DateFilter(BaseFilter): # clean date string to handle future date parsing by date parser future_strings = ["later", "from now", "from today"] prefer_dates_from = {True: "future", False: "past"}[any([True for fstr in future_strings if fstr in date_str])] - clean_date_str = re.sub("|".join(future_strings), "", date_str) + dtquery_settings = {"RELATIVE_BASE": relative_base or datetime.now(), "PREFER_DATES_FROM": prefer_dates_from} + dtparser_settings = merge_dicts(dtquery_settings, self.dtparser_settings) # parse date passed in query date filter + clean_date_str = re.sub("|".join(future_strings), "", date_str) try: - parsed_date = dtparse.parse( - clean_date_str, - settings={ - "RELATIVE_BASE": relative_base or datetime.now(), - "PREFER_DAY_OF_MONTH": "first", - "PREFER_DATES_FROM": 
prefer_dates_from, - }, - ) + parsed_date = dtparse.parse(clean_date_str, settings=dtparser_settings) except Exception as e: logger.error(f"Failed to parse date string: {date_str} with error: {e}") return None diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index adb6c2c9..c990a1fa 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -116,7 +116,7 @@ def test_date_filter_regex(): assert dtrange_match == [] -def test_get_file_filter_terms(): +def test_get_date_filter_terms(): dtrange_match = DateFilter().get_filter_terms('multi word head dt>"today" dt:"1984-01-01"') assert dtrange_match == ["dt>'today'", "dt:'1984-01-01'"] @@ -134,3 +134,29 @@ def test_get_file_filter_terms(): dtrange_match = DateFilter().get_filter_terms("head tail") assert dtrange_match == [] + + +def test_date_extraction(): + extracted_dates = DateFilter().extract_dates("") + assert extracted_dates == [], "Expected to handle empty string" + + extracted_dates = DateFilter().extract_dates("head tail") + assert extracted_dates == [], "Expected to handle no dates" + + extracted_dates = DateFilter().extract_dates("head CREATED: today tail") + assert extracted_dates == [], "Expected relative date to be ignored" + + extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 1984-04-01 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only YMD structured date to be extracted" + + extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 01-04-1984 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected DMY structured date to be extracted" + + extracted_dates = DateFilter().extract_dates("head Updates from April 1984 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected natural date to be extracted" + + extracted_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 mer 09:50]--[1984-04-01 mer 10:10] => 24:20") + expected_dates = [datetime(1984, 4, 1, 9, 
50, 0), datetime(1984, 4, 1, 10, 10, 0)] + assert all( + [dt in extracted_dates for dt in expected_dates] + ), "Expected multiple non-english dates extracted from logbook entry" From 7923903d2108911e18f1606676f025caaf5b96e1 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 29 Mar 2024 18:22:41 +0530 Subject: [PATCH 2/2] Improve date filter regexes to extract structured, natural, partial dates - Much faster than using dateparser - It took 2x-4x for the improved regex to extract 1-15% more dates - whereas it took 33x to 100x for dateparser to extract 65% - 400% more dates - Improve date extractor tests to test deduping dates, natural, structured date extraction from content - Extract some natural, partial dates and more structured dates Using regex is much faster than using dateparser. It's a little crude but should pay off in performance. Supports dates of the form: - (Day-of-Month) Month|AbbreviatedMonth Year|2DigitYear - Month|AbbreviatedMonth (Day-of-Month) Year|2DigitYear --- src/khoj/search_filter/date_filter.py | 97 ++++++++++++++++++--------- tests/test_date_filter.py | 70 ++++++++++++++++--- 2 files changed, 128 insertions(+), 39 deletions(-) diff --git a/src/khoj/search_filter/date_filter.py b/src/khoj/search_filter/date_filter.py index 6d1d0e18..0f53abbe 100644 --- a/src/khoj/search_filter/date_filter.py +++ b/src/khoj/search_filter/date_filter.py @@ -1,9 +1,10 @@ +import calendar import logging import re from collections import defaultdict from datetime import datetime, timedelta from math import inf -from typing import List +from typing import List, Tuple import dateparser as dtparse from dateparser.search import search_dates @@ -23,48 +24,82 @@ class DateFilter(BaseFilter): # - dt>="last week" # - dt:"2 years ago" date_regex = r"dt([:><=]{1,2})[\"'](.*?)[\"']" - raw_date_regex = r"\d{4}[-/]\d{2}[-/]\d{2}" def __init__(self, entry_key="compiled"): self.entry_key = entry_key self.date_to_entry_ids = defaultdict(set) self.cache = LRU() + 
self.dtparser_regexes = self.compile_date_regexes() + self.dtparser_ordinal_suffixes = re.compile(r"(st|nd|rd|th)") self.dtparser_settings = { "PREFER_DAY_OF_MONTH": "first", "DATE_ORDER": "YMD", # Prefer YMD and DMY over MDY when parsing ambiguous dates } + def compile_date_regexes(self): + months = calendar.month_name[1:] + abbr_months = calendar.month_abbr[1:] + # Extract natural dates from content like 1st April 1984, 31 April 84, Apr 4th 1984, 13 Apr 84 + dBY_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(months) + r") \d{4}\b", re.IGNORECASE) + dBy_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(months) + r") \d{2}\b", re.IGNORECASE) + BdY_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{1,2}(?:st|nd|rd|th)? \d{4}\b", re.IGNORECASE) + Bdy_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{1,2}(?:st|nd|rd|th)? \d{2}\b", re.IGNORECASE) + dbY_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(abbr_months) + r") \d{4}\b", re.IGNORECASE) + dby_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(abbr_months) + r") \d{2}\b", re.IGNORECASE) + bdY_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{1,2}(?:st|nd|rd|th)? \d{4}\b", re.IGNORECASE) + bdy_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{1,2}(?:st|nd|rd|th)? 
\d{2}\b", re.IGNORECASE) + # Extract natural of form Month, Year like January 2021, Jan 2021, Jan 21 + BY_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{4}\b", re.IGNORECASE) + By_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{2}\b", re.IGNORECASE) + bY_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{4}\b", re.IGNORECASE) + by_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{2}\b", re.IGNORECASE) + # Extract structured dates from content like 1984-04-01, 1984/04/01, 01-04-1984, 01/04/1984, 01.04.1984, 01-04-84, 01/04/84 + Ymd_date_regex = re.compile(r"\b\d{4}[-\/]\d{2}[-\/]\d{2}\b", re.IGNORECASE) + dmY_date_regex = re.compile(r"\b\d{2}[-\/]\d{2}[-\/]\d{4}\b", re.IGNORECASE) + dmy_date_regex = re.compile(r"\b\d{2}[-\/]\d{2}[-\/]\d{2}\b", re.IGNORECASE) + dmY_dot_date_regex = re.compile(r"\b\d{2}[\.]\d{2}[\.]\d{4}\b", re.IGNORECASE) + + # Combine date formatter and date identifier regex pairs + dtparser_regexes: List[Tuple[str, re.Pattern[str]]] = [ + # Structured dates + ("%Y-%m-%d", Ymd_date_regex), + ("%Y/%m/%d", Ymd_date_regex), + ("%d-%m-%Y", dmY_date_regex), + ("%d/%m/%Y", dmY_date_regex), + ("%d.%m.%Y", dmY_dot_date_regex), + ("%d-%m-%y", dmy_date_regex), + ("%d/%m/%y", dmy_date_regex), + # Natural dates + ("%d %B %Y", dBY_regex), + ("%d %B %y", dBy_regex), + ("%B %d %Y", BdY_regex), + ("%B %d %y", Bdy_regex), + ("%d %b %Y", dbY_regex), + ("%d %b %y", dby_regex), + ("%b %d %Y", bdY_regex), + ("%b %d %y", bdy_regex), + # Partial natural dates + ("%B %Y", BY_regex), + ("%B %y", By_regex), + ("%b %Y", bY_regex), + ("%b %y", by_regex), + ] + return dtparser_regexes + def extract_dates(self, content): - "Extract all natural and structured dates across formats and locales from content" - excluded_parsers = ["relative-time"] - dtparser_settings = merge_dicts( - { - # Exclude relative dates for date extraction from content as very ambiguous - "PARSERS": [parser for parser in default_parsers if parser not in 
excluded_parsers], - "RETURN_AS_TIMEZONE_AWARE": False, - }, - self.dtparser_settings, - ) - try: - valid_dates = [ - dt_item[1] for dt_item in search_dates(content, settings=dtparser_settings, languages=["en"]) or [] - ] - return valid_dates - except Exception as e: - logger.warning( - f"Failed to extract natural dates from content with error: {e}. Fallback to regex based extraction." - ) + "Extract natural and structured dates from content" + valid_dates = set() + for date_format, date_regex in self.dtparser_regexes: + matched_dates = date_regex.findall(content) + for date_str in matched_dates: + # Remove ordinal suffixes to parse date + date_str = self.dtparser_ordinal_suffixes.sub("", date_str) + try: + valid_dates.add(datetime.strptime(date_str, date_format)) + except ValueError: + continue - # Fallback to extracting YYYY-MM-DD format dates from content - pattern_matched_dates = re.findall(self.raw_date_regex, content) - valid_dates = [] - for date_str in pattern_matched_dates: - try: - valid_dates.append(datetime.strptime(date_str, "%Y-%m-%d")) - except ValueError: - continue - - return valid_dates + return list(valid_dates) def get_filter_terms(self, query: str) -> List[str]: "Get all filter terms in query" diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index c990a1fa..1e11348f 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -147,16 +147,70 @@ def test_date_extraction(): assert extracted_dates == [], "Expected relative date to be ignored" extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 1984-04-01 tail") - assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only YMD structured date to be extracted" + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only Y-m-d structured date to be extracted" - extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 01-04-1984 tail") - assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], 
"Expected DMY structured date to be extracted" + extracted_dates = DateFilter().extract_dates("head SCHEDULED: 01-04-1984 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d-m-Y structured date to be extracted" - extracted_dates = DateFilter().extract_dates("head Updates from April 1984 tail") - assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected natural date to be extracted" + extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 1984/04/01 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only Y/m/d structured date to be extracted" - extracted_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 mer 09:50]--[1984-04-01 mer 10:10] => 24:20") - expected_dates = [datetime(1984, 4, 1, 9, 50, 0), datetime(1984, 4, 1, 10, 10, 0)] + extracted_dates = DateFilter().extract_dates("head SCHEDULED: 01/04/1984 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d/m/Y structured date to be extracted" + + extracted_dates = DateFilter().extract_dates("head DEADLINE: 01.04.1984 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d.m.Y structured date to be extracted" + + extracted_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 Sun 09:50]--[1984-04-01 Sun 10:10] => 24:20") + assert extracted_dates == [ + datetime(1984, 4, 1, 0, 0, 0) + ], "Expected single deduplicated date extracted from logbook entry" + + extracted_dates = DateFilter().extract_dates("CLOCK: [1984/03/31 mer 09:50]--[1984/04/01 mer 10:10] => 24:20") + expected_dates = [datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 3, 31, 0, 0, 0)] assert all( [dt in extracted_dates for dt in expected_dates] - ), "Expected multiple non-english dates extracted from logbook entry" + ), "Expected multiple different dates extracted from logbook entry" + + +def test_natual_date_extraction(): + extracted_dates = DateFilter().extract_dates("head 1 April 1984 tail") + assert 
datetime(1984, 4, 1, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted" + + extracted_dates = DateFilter().extract_dates("head 1st April 1984 tail") + assert datetime(1984, 4, 1, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted" + + extracted_dates = DateFilter().extract_dates("head 2nd Apr 1984 tail") + assert datetime(1984, 4, 2, 0, 0, 0) in extracted_dates, "Expected natural date with short month to be extracted" + + extracted_dates = DateFilter().extract_dates("head 4th Apr 1984 tail") + assert datetime(1984, 4, 4, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted" + + extracted_dates = DateFilter().extract_dates("head 11th april 1984 tail") + assert ( + datetime(1984, 4, 11, 0, 0, 0) in extracted_dates + ), "Expected natural date with lowercase month to be extracted" + + extracted_dates = DateFilter().extract_dates("head 23rd april 84 tail") + assert datetime(1984, 4, 23, 0, 0, 0) in extracted_dates, "Expected natural date with 2-digit year to be extracted" + + extracted_dates = DateFilter().extract_dates("head 31st march 84 tail") + assert datetime(1984, 3, 31, 0, 0, 0) in extracted_dates, "Expected natural date with 2-digit year to be extracted" + + extracted_dates = DateFilter().extract_dates("head April 1984 tail") + assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected partial natural date to be extracted" + + extracted_dates = DateFilter().extract_dates("head Apr 1984 tail") + assert extracted_dates == [ + datetime(1984, 4, 1, 0, 0, 0) + ], "Expected partial natural date with short month to be extracted" + + extracted_dates = DateFilter().extract_dates("head apr 1984 tail") + assert extracted_dates == [ + datetime(1984, 4, 1, 0, 0, 0) + ], "Expected partial natural date with lowercase month to be extracted" + + extracted_dates = DateFilter().extract_dates("head apr 84 tail") + assert extracted_dates == [ + datetime(1984, 4, 1, 0, 0, 0) + ], "Expected partial natural date with 2-digit 
year to be extracted"