Extract natural language and locale-specific dates from content

Previously, we only extracted dates in YYYY-MM-DD format from content
for date filtering during search.

Use dateparser to extract dates across locales and natural language formats.

This should improve the notes returned as context when chat searches
the knowledge base with date filters.

Fallback to regex for date parsing from content if dateparser fails

- Limit natural date extractor capabilities to improve performance
  - Assume the language is English.
    Language detection otherwise takes a REALLY long time
  - Do not extract Unix timestamps or timezones
    - This isn't required, as we only use the date and approximate dates as UTC
This commit is contained in:
Debanjum Singh Solanky 2024-03-29 13:04:49 +05:30
parent 90c5b3c410
commit 104eeea274
2 changed files with 60 additions and 14 deletions

View file

@ -6,10 +6,12 @@ from math import inf
from typing import List
import dateparser as dtparse
from dateparser.search import search_dates
from dateparser_data.settings import default_parsers
from dateutil.relativedelta import relativedelta
from khoj.search_filter.base_filter import BaseFilter
from khoj.utils.helpers import LRU, timer
from khoj.utils.helpers import LRU, merge_dicts, timer
logger = logging.getLogger(__name__)
@ -21,17 +23,40 @@ class DateFilter(BaseFilter):
# - dt>="last week"
# - dt:"2 years ago"
date_regex = r"dt([:><=]{1,2})[\"'](.*?)[\"']"
raw_date_regex = r"\d{4}-\d{2}-\d{2}"
raw_date_regex = r"\d{4}[-/]\d{2}[-/]\d{2}"
def __init__(self, entry_key="compiled"):
    """Initialise the filter's lookup structures and base dateparser settings.

    Args:
        entry_key: Stored unchanged on the instance as ``self.entry_key``.
    """
    # Base dateparser settings kept on the instance so its methods can
    # merge them with their own per-call settings.
    base_settings = {
        "PREFER_DAY_OF_MONTH": "first",
        "DATE_ORDER": "YMD",  # Prefer YMD and DMY over MDY when parsing ambiguous dates
    }
    self.dtparser_settings = base_settings

    self.cache = LRU()
    self.date_to_entry_ids = defaultdict(set)
    self.entry_key = entry_key
def extract_dates(self, content):
pattern_matched_dates = re.findall(self.raw_date_regex, content)
"Extract all natural and structured dates across formats and locales from content"
excluded_parsers = ["relative-time"]
dtparser_settings = merge_dicts(
{
# Exclude relative dates for date extraction from content as very ambiguous
"PARSERS": [parser for parser in default_parsers if parser not in excluded_parsers],
"RETURN_AS_TIMEZONE_AWARE": False,
},
self.dtparser_settings,
)
try:
valid_dates = [
dt_item[1] for dt_item in search_dates(content, settings=dtparser_settings, languages=["en"]) or []
]
return valid_dates
except Exception as e:
logger.warning(
f"Failed to extract natural dates from content with error: {e}. Fallback to regex based extraction."
)
# Filter down to valid dates
# Fallback to extracting YYYY-MM-DD format dates from content
pattern_matched_dates = re.findall(self.raw_date_regex, content)
valid_dates = []
for date_str in pattern_matched_dates:
try:
@ -120,18 +145,13 @@ class DateFilter(BaseFilter):
# clean date string to handle future date parsing by date parser
future_strings = ["later", "from now", "from today"]
prefer_dates_from = {True: "future", False: "past"}[any([True for fstr in future_strings if fstr in date_str])]
clean_date_str = re.sub("|".join(future_strings), "", date_str)
dtquery_settings = {"RELATIVE_BASE": relative_base or datetime.now(), "PREFER_DATES_FROM": prefer_dates_from}
dtparser_settings = merge_dicts(dtquery_settings, self.dtparser_settings)
# parse date passed in query date filter
clean_date_str = re.sub("|".join(future_strings), "", date_str)
try:
parsed_date = dtparse.parse(
clean_date_str,
settings={
"RELATIVE_BASE": relative_base or datetime.now(),
"PREFER_DAY_OF_MONTH": "first",
"PREFER_DATES_FROM": prefer_dates_from,
},
)
parsed_date = dtparse.parse(clean_date_str, settings=dtparser_settings)
except Exception as e:
logger.error(f"Failed to parse date string: {date_str} with error: {e}")
return None

View file

@ -116,7 +116,7 @@ def test_date_filter_regex():
assert dtrange_match == []
def test_get_file_filter_terms():
def test_get_date_filter_terms():
dtrange_match = DateFilter().get_filter_terms('multi word head dt>"today" dt:"1984-01-01"')
assert dtrange_match == ["dt>'today'", "dt:'1984-01-01'"]
@ -134,3 +134,29 @@ def test_get_file_filter_terms():
dtrange_match = DateFilter().get_filter_terms("head tail")
assert dtrange_match == []
def test_date_extraction():
    """Exercise DateFilter.extract_dates on empty, relative, structured and natural date content."""
    april_first_1984 = datetime(1984, 4, 1, 0, 0, 0)

    # (content, expected dates, failure message) for the exact-match cases
    exact_match_cases = [
        ("", [], "Expected to handle empty string"),
        ("head tail", [], "Expected to handle no dates"),
        ("head CREATED: today tail", [], "Expected relative date to be ignored"),
        (
            "head CREATED: today SCHEDULED: 1984-04-01 tail",
            [april_first_1984],
            "Expected only YMD structured date to be extracted",
        ),
        (
            "head CREATED: today SCHEDULED: 01-04-1984 tail",
            [april_first_1984],
            "Expected DMY structured date to be extracted",
        ),
        (
            "head Updates from April 1984 tail",
            [april_first_1984],
            "Expected natural date to be extracted",
        ),
    ]
    for content, expected, failure_message in exact_match_cases:
        # A fresh filter per case, so no state carries over between checks
        assert DateFilter().extract_dates(content) == expected, failure_message

    # Logbook entry: only require that both clock times are among the results
    logbook_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 mer 09:50]--[1984-04-01 mer 10:10] => 24:20")
    clock_times = (datetime(1984, 4, 1, 9, 50, 0), datetime(1984, 4, 1, 10, 10, 0))
    assert all(
        clock_time in logbook_dates for clock_time in clock_times
    ), "Expected multiple non-english dates extracted from logbook entry"