mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Improve date filter regexes to extract structured, natural, partial dates
- Much faster than using dateparser - It took 2x-4x the time for the improved regex to extract 1-15% more dates - Whereas it took 33x to 100x the time for dateparser to extract 65%-400% more dates - Improve date extractor tests to cover deduplicating dates and extracting natural and structured dates from content - Extract some natural, partial dates and more structured dates. Using regex is much faster than using dateparser. It's a little crude but should pay off in performance. Supports dates of the form: - (Day-of-Month) Month|AbbreviatedMonth Year|2DigitYear - Month|AbbreviatedMonth (Day-of-Month) Year|2DigitYear
This commit is contained in:
parent
104eeea274
commit
7923903d21
2 changed files with 128 additions and 39 deletions
|
@ -1,9 +1,10 @@
|
|||
import calendar
|
||||
import logging
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timedelta
|
||||
from math import inf
|
||||
from typing import List
|
||||
from typing import List, Tuple
|
||||
|
||||
import dateparser as dtparse
|
||||
from dateparser.search import search_dates
|
||||
|
@ -23,48 +24,82 @@ class DateFilter(BaseFilter):
|
|||
# - dt>="last week"
|
||||
# - dt:"2 years ago"
|
||||
date_regex = r"dt([:><=]{1,2})[\"'](.*?)[\"']"
|
||||
raw_date_regex = r"\d{4}[-/]\d{2}[-/]\d{2}"
|
||||
|
||||
def __init__(self, entry_key="compiled"):
|
||||
self.entry_key = entry_key
|
||||
self.date_to_entry_ids = defaultdict(set)
|
||||
self.cache = LRU()
|
||||
self.dtparser_regexes = self.compile_date_regexes()
|
||||
self.dtparser_ordinal_suffixes = re.compile(r"(st|nd|rd|th)")
|
||||
self.dtparser_settings = {
|
||||
"PREFER_DAY_OF_MONTH": "first",
|
||||
"DATE_ORDER": "YMD", # Prefer YMD and DMY over MDY when parsing ambiguous dates
|
||||
}
|
||||
|
||||
def extract_dates(self, content):
|
||||
"Extract all natural and structured dates across formats and locales from content"
|
||||
excluded_parsers = ["relative-time"]
|
||||
dtparser_settings = merge_dicts(
|
||||
{
|
||||
# Exclude relative dates for date extraction from content as very ambiguous
|
||||
"PARSERS": [parser for parser in default_parsers if parser not in excluded_parsers],
|
||||
"RETURN_AS_TIMEZONE_AWARE": False,
|
||||
},
|
||||
self.dtparser_settings,
|
||||
)
|
||||
try:
|
||||
valid_dates = [
|
||||
dt_item[1] for dt_item in search_dates(content, settings=dtparser_settings, languages=["en"]) or []
|
||||
]
|
||||
return valid_dates
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed to extract natural dates from content with error: {e}. Fallback to regex based extraction."
|
||||
)
|
||||
def compile_date_regexes(self):
|
||||
months = calendar.month_name[1:]
|
||||
abbr_months = calendar.month_abbr[1:]
|
||||
# Extract natural dates from content like 1st April 1984, 31 April 84, Apr 4th 1984, 13 Apr 84
|
||||
dBY_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(months) + r") \d{4}\b", re.IGNORECASE)
|
||||
dBy_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(months) + r") \d{2}\b", re.IGNORECASE)
|
||||
BdY_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{1,2}(?:st|nd|rd|th)? \d{4}\b", re.IGNORECASE)
|
||||
Bdy_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{1,2}(?:st|nd|rd|th)? \d{2}\b", re.IGNORECASE)
|
||||
dbY_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(abbr_months) + r") \d{4}\b", re.IGNORECASE)
|
||||
dby_regex = re.compile(r"\b\d{1,2}(?:st|nd|rd|th)? (?:" + "|".join(abbr_months) + r") \d{2}\b", re.IGNORECASE)
|
||||
bdY_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{1,2}(?:st|nd|rd|th)? \d{4}\b", re.IGNORECASE)
|
||||
bdy_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{1,2}(?:st|nd|rd|th)? \d{2}\b", re.IGNORECASE)
|
||||
# Extract partial natural dates of the form Month Year, like January 2021, Jan 2021, Jan 21
|
||||
BY_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{4}\b", re.IGNORECASE)
|
||||
By_regex = re.compile(r"\b(?:" + "|".join(months) + r") \d{2}\b", re.IGNORECASE)
|
||||
bY_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{4}\b", re.IGNORECASE)
|
||||
by_regex = re.compile(r"\b(?:" + "|".join(abbr_months) + r") \d{2}\b", re.IGNORECASE)
|
||||
# Extract structured dates from content like 1984-04-01, 1984/04/01, 01-04-1984, 01/04/1984, 01.04.1984, 01-04-84, 01/04/84
|
||||
Ymd_date_regex = re.compile(r"\b\d{4}[-\/]\d{2}[-\/]\d{2}\b", re.IGNORECASE)
|
||||
dmY_date_regex = re.compile(r"\b\d{2}[-\/]\d{2}[-\/]\d{4}\b", re.IGNORECASE)
|
||||
dmy_date_regex = re.compile(r"\b\d{2}[-\/]\d{2}[-\/]\d{2}\b", re.IGNORECASE)
|
||||
dmY_dot_date_regex = re.compile(r"\b\d{2}[\.]\d{2}[\.]\d{4}\b", re.IGNORECASE)
|
||||
|
||||
# Fallback to extracting YYYY-MM-DD format dates from content
|
||||
pattern_matched_dates = re.findall(self.raw_date_regex, content)
|
||||
valid_dates = []
|
||||
for date_str in pattern_matched_dates:
|
||||
# Combine date formatter and date identifier regex pairs
|
||||
dtparser_regexes: List[Tuple[str, re.Pattern[str]]] = [
|
||||
# Structured dates
|
||||
("%Y-%m-%d", Ymd_date_regex),
|
||||
("%Y/%m/%d", Ymd_date_regex),
|
||||
("%d-%m-%Y", dmY_date_regex),
|
||||
("%d/%m/%Y", dmY_date_regex),
|
||||
("%d.%m.%Y", dmY_dot_date_regex),
|
||||
("%d-%m-%y", dmy_date_regex),
|
||||
("%d/%m/%y", dmy_date_regex),
|
||||
# Natural dates
|
||||
("%d %B %Y", dBY_regex),
|
||||
("%d %B %y", dBy_regex),
|
||||
("%B %d %Y", BdY_regex),
|
||||
("%B %d %y", Bdy_regex),
|
||||
("%d %b %Y", dbY_regex),
|
||||
("%d %b %y", dby_regex),
|
||||
("%b %d %Y", bdY_regex),
|
||||
("%b %d %y", bdy_regex),
|
||||
# Partial natural dates
|
||||
("%B %Y", BY_regex),
|
||||
("%B %y", By_regex),
|
||||
("%b %Y", bY_regex),
|
||||
("%b %y", by_regex),
|
||||
]
|
||||
return dtparser_regexes
|
||||
|
||||
def extract_dates(self, content):
|
||||
"Extract natural and structured dates from content"
|
||||
valid_dates = set()
|
||||
for date_format, date_regex in self.dtparser_regexes:
|
||||
matched_dates = date_regex.findall(content)
|
||||
for date_str in matched_dates:
|
||||
# Remove ordinal suffixes to parse date
|
||||
date_str = self.dtparser_ordinal_suffixes.sub("", date_str)
|
||||
try:
|
||||
valid_dates.append(datetime.strptime(date_str, "%Y-%m-%d"))
|
||||
valid_dates.add(datetime.strptime(date_str, date_format))
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
return valid_dates
|
||||
return list(valid_dates)
|
||||
|
||||
def get_filter_terms(self, query: str) -> List[str]:
|
||||
"Get all filter terms in query"
|
||||
|
|
|
@ -147,16 +147,70 @@ def test_date_extraction():
|
|||
assert extracted_dates == [], "Expected relative date to be ignored"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 1984-04-01 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only YMD structured date to be extracted"
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only Y-m-d structured date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 01-04-1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected DMY structured date to be extracted"
|
||||
extracted_dates = DateFilter().extract_dates("head SCHEDULED: 01-04-1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d-m-Y structured date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head Updates from April 1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected natural date to be extracted"
|
||||
extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 1984/04/01 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only Y/m/d structured date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 mer 09:50]--[1984-04-01 mer 10:10] => 24:20")
|
||||
expected_dates = [datetime(1984, 4, 1, 9, 50, 0), datetime(1984, 4, 1, 10, 10, 0)]
|
||||
extracted_dates = DateFilter().extract_dates("head SCHEDULED: 01/04/1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d/m/Y structured date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head DEADLINE: 01.04.1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d.m.Y structured date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 Sun 09:50]--[1984-04-01 Sun 10:10] => 24:20")
|
||||
assert extracted_dates == [
|
||||
datetime(1984, 4, 1, 0, 0, 0)
|
||||
], "Expected single deduplicated date extracted from logbook entry"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("CLOCK: [1984/03/31 mer 09:50]--[1984/04/01 mer 10:10] => 24:20")
|
||||
expected_dates = [datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 3, 31, 0, 0, 0)]
|
||||
assert all(
|
||||
[dt in extracted_dates for dt in expected_dates]
|
||||
), "Expected multiple non-english dates extracted from logbook entry"
|
||||
), "Expected multiple different dates extracted from logbook entry"
|
||||
|
||||
|
||||
def test_natual_date_extraction():
|
||||
extracted_dates = DateFilter().extract_dates("head 1 April 1984 tail")
|
||||
assert datetime(1984, 4, 1, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 1st April 1984 tail")
|
||||
assert datetime(1984, 4, 1, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 2nd Apr 1984 tail")
|
||||
assert datetime(1984, 4, 2, 0, 0, 0) in extracted_dates, "Expected natural date with short month to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 4th Apr 1984 tail")
|
||||
assert datetime(1984, 4, 4, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 11th april 1984 tail")
|
||||
assert (
|
||||
datetime(1984, 4, 11, 0, 0, 0) in extracted_dates
|
||||
), "Expected natural date with lowercase month to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 23rd april 84 tail")
|
||||
assert datetime(1984, 4, 23, 0, 0, 0) in extracted_dates, "Expected natural date with 2-digit year to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 31st march 84 tail")
|
||||
assert datetime(1984, 3, 31, 0, 0, 0) in extracted_dates, "Expected natural date with 2-digit year to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head April 1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected partial natural date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head Apr 1984 tail")
|
||||
assert extracted_dates == [
|
||||
datetime(1984, 4, 1, 0, 0, 0)
|
||||
], "Expected partial natural date with short month to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head apr 1984 tail")
|
||||
assert extracted_dates == [
|
||||
datetime(1984, 4, 1, 0, 0, 0)
|
||||
], "Expected partial natural date with lowercase month to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head apr 84 tail")
|
||||
assert extracted_dates == [
|
||||
datetime(1984, 4, 1, 0, 0, 0)
|
||||
], "Expected partial natural date with 2-digit year to be extracted"
|
||||
|
|
Loading…
Reference in a new issue