mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-02-17 08:04:21 +00:00
Allow Indexing Heading Entries. Improve Org, TextToJsonl Parser
### Summary
- Set `index_heading_entries` field in `~/.khoj/khoj.yml` to `true` to index entries with empty body

### Main Changes
#### Make Indexing Org-Mode Entries with Empty Body Configurable
- 253c9ea Set `index_heading_entries` field in `khoj.yml` to index entries with no body

### Fix, Improve OrgNode, TextToJsonl Parser
- 9d369ae Fix `OrgNode` render of entries with property drawers and empty body
- 1d3b3d5 Convert field get/set methods in `OrgNode` class to `@property`
- db37e38 Create `OrgNode` `hasBody` method. Use it in `org_to_jsonl` checks
- b4878d7 Extract entries from scratch when regenerate requested
- 52e3dd9 Pass the whole `TextContentConfig` as argument to `text_to_jsonl` methods
- e951ba3 Raise exception when org file not found

Resolves #87
This commit is contained in:
commit
182fbbd8df
9 changed files with 308 additions and 219 deletions
|
@ -13,13 +13,17 @@ import time
|
|||
from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
|
||||
from src.utils.constants import empty_escape_sequences
|
||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from src.utils.rawconfig import TextContentConfig
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Define Functions
|
||||
def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, previous_entries=None):
|
||||
def beancount_to_jsonl(config: TextContentConfig, previous_entries=None):
|
||||
# Extract required fields from config
|
||||
beancount_files, beancount_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
|
||||
|
||||
# Input Validation
|
||||
if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter):
|
||||
print("At least one of beancount-files or beancount-file-filter is required to be specified")
|
||||
|
|
|
@ -13,13 +13,17 @@ import time
|
|||
from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
|
||||
from src.utils.constants import empty_escape_sequences
|
||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from src.utils.rawconfig import TextContentConfig
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Define Functions
|
||||
def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, previous_entries=None):
|
||||
def markdown_to_jsonl(config: TextContentConfig, previous_entries=None):
|
||||
# Extract required fields from config
|
||||
markdown_files, markdown_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
|
||||
|
||||
# Input Validation
|
||||
if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
|
||||
print("At least one of markdown-files or markdown-file-filter is required to be specified")
|
||||
|
|
|
@ -14,13 +14,18 @@ from src.processor.org_mode import orgnode
|
|||
from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
|
||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from src.utils import state
|
||||
from src.utils.rawconfig import TextContentConfig
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Define Functions
|
||||
def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None):
|
||||
def org_to_jsonl(config: TextContentConfig, previous_entries=None):
|
||||
# Extract required fields from config
|
||||
org_files, org_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
|
||||
index_heading_entries = config.index_heading_entries
|
||||
|
||||
# Input Validation
|
||||
if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
|
||||
print("At least one of org-files or org-file-filter is required to be specified")
|
||||
|
@ -37,7 +42,7 @@ def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None)
|
|||
logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds")
|
||||
|
||||
start = time.time()
|
||||
current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
||||
current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries)
|
||||
end = time.time()
|
||||
logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")
|
||||
|
||||
|
@ -96,40 +101,40 @@ def extract_org_entries(org_files):
|
|||
return entries, dict(entry_to_file_map)
|
||||
|
||||
|
||||
def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map) -> list[dict]:
|
||||
def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
|
||||
"Convert Org-Mode entries into list of dictionary"
|
||||
entry_maps = []
|
||||
for entry in entries:
|
||||
entry_dict = dict()
|
||||
|
||||
# Ignore title notes i.e notes with just headings and empty body
|
||||
if not entry.Body() or re.sub(r'\n|\t|\r| ', '', entry.Body()) == "":
|
||||
if not entry.hasBody and not index_heading_entries:
|
||||
# Ignore title notes i.e notes with just headings and empty body
|
||||
continue
|
||||
|
||||
entry_dict["compiled"] = f'{entry.Heading()}.'
|
||||
entry_dict["compiled"] = f'{entry.heading}.'
|
||||
if state.verbose > 2:
|
||||
logger.debug(f"Title: {entry.Heading()}")
|
||||
logger.debug(f"Title: {entry.heading}")
|
||||
|
||||
if entry.Tags():
|
||||
tags_str = " ".join(entry.Tags())
|
||||
if entry.tags:
|
||||
tags_str = " ".join(entry.tags)
|
||||
entry_dict["compiled"] += f'\t {tags_str}.'
|
||||
if state.verbose > 2:
|
||||
logger.debug(f"Tags: {tags_str}")
|
||||
|
||||
if entry.Closed():
|
||||
entry_dict["compiled"] += f'\n Closed on {entry.Closed().strftime("%Y-%m-%d")}.'
|
||||
if entry.closed:
|
||||
entry_dict["compiled"] += f'\n Closed on {entry.closed.strftime("%Y-%m-%d")}.'
|
||||
if state.verbose > 2:
|
||||
logger.debug(f'Closed: {entry.Closed().strftime("%Y-%m-%d")}')
|
||||
logger.debug(f'Closed: {entry.closed.strftime("%Y-%m-%d")}')
|
||||
|
||||
if entry.Scheduled():
|
||||
entry_dict["compiled"] += f'\n Scheduled for {entry.Scheduled().strftime("%Y-%m-%d")}.'
|
||||
if entry.scheduled:
|
||||
entry_dict["compiled"] += f'\n Scheduled for {entry.scheduled.strftime("%Y-%m-%d")}.'
|
||||
if state.verbose > 2:
|
||||
logger.debug(f'Scheduled: {entry.Scheduled().strftime("%Y-%m-%d")}')
|
||||
logger.debug(f'Scheduled: {entry.scheduled.strftime("%Y-%m-%d")}')
|
||||
|
||||
if entry.Body():
|
||||
entry_dict["compiled"] += f'\n {entry.Body()}'
|
||||
if entry.hasBody:
|
||||
entry_dict["compiled"] += f'\n {entry.body}'
|
||||
if state.verbose > 2:
|
||||
logger.debug(f"Body: {entry.Body()}")
|
||||
logger.debug(f"Body: {entry.body}")
|
||||
|
||||
if entry_dict:
|
||||
entry_dict["raw"] = f'{entry}'
|
||||
|
|
|
@ -33,12 +33,12 @@ headline and associated text from an org-mode file, and routines for
|
|||
constructing data structures of these classes.
|
||||
"""
|
||||
|
||||
import re, sys
|
||||
import re
|
||||
import datetime
|
||||
from pathlib import Path
|
||||
from os.path import relpath
|
||||
|
||||
indent_regex = re.compile(r'^\s*')
|
||||
indent_regex = re.compile(r'^ *')
|
||||
|
||||
def normalize_filename(filename):
|
||||
"Normalize and escape filename for rendering"
|
||||
|
@ -57,12 +57,7 @@ def makelist(filename):
|
|||
"""
|
||||
ctr = 0
|
||||
|
||||
try:
|
||||
f = open(filename, 'r')
|
||||
except IOError:
|
||||
print(f"Unable to open file {filename}")
|
||||
print("Program terminating.")
|
||||
sys.exit(1)
|
||||
f = open(filename, 'r')
|
||||
|
||||
todos = { "TODO": "", "WAITING": "", "ACTIVE": "",
|
||||
"DONE": "", "CANCELLED": "", "FAILED": ""} # populated from #+SEQ_TODO line
|
||||
|
@ -74,41 +69,41 @@ def makelist(filename):
|
|||
sched_date = ''
|
||||
deadline_date = ''
|
||||
logbook = list()
|
||||
nodelist = []
|
||||
propdict = dict()
|
||||
nodelist: list[Orgnode] = list()
|
||||
property_map = dict()
|
||||
in_properties_drawer = False
|
||||
in_logbook_drawer = False
|
||||
file_title = f'{filename}'
|
||||
|
||||
for line in f:
|
||||
ctr += 1
|
||||
hdng = re.search(r'^(\*+)\s(.*?)\s*$', line)
|
||||
if hdng: # we are processing a heading line
|
||||
heading_search = re.search(r'^(\*+)\s(.*?)\s*$', line)
|
||||
if heading_search: # we are processing a heading line
|
||||
if heading: # if we have are on second heading, append first heading to headings list
|
||||
thisNode = Orgnode(level, heading, bodytext, tags)
|
||||
if closed_date:
|
||||
thisNode.setClosed(closed_date)
|
||||
thisNode.closed = closed_date
|
||||
closed_date = ''
|
||||
if sched_date:
|
||||
thisNode.setScheduled(sched_date)
|
||||
thisNode.scheduled = sched_date
|
||||
sched_date = ""
|
||||
if deadline_date:
|
||||
thisNode.setDeadline(deadline_date)
|
||||
thisNode.deadline = deadline_date
|
||||
deadline_date = ''
|
||||
if logbook:
|
||||
thisNode.setLogbook(logbook)
|
||||
thisNode.logbook = logbook
|
||||
logbook = list()
|
||||
thisNode.setProperties(propdict)
|
||||
thisNode.properties = property_map
|
||||
nodelist.append( thisNode )
|
||||
propdict = {'LINE': f'file:{normalize_filename(filename)}::{ctr}'}
|
||||
level = hdng.group(1)
|
||||
heading = hdng.group(2)
|
||||
property_map = {'LINE': f'file:{normalize_filename(filename)}::{ctr}'}
|
||||
level = heading_search.group(1)
|
||||
heading = heading_search.group(2)
|
||||
bodytext = ""
|
||||
tags = list() # set of all tags in headline
|
||||
tagsrch = re.search(r'(.*?)\s*:([a-zA-Z0-9].*?):$',heading)
|
||||
if tagsrch:
|
||||
heading = tagsrch.group(1)
|
||||
parsedtags = tagsrch.group(2)
|
||||
tag_search = re.search(r'(.*?)\s*:([a-zA-Z0-9].*?):$',heading)
|
||||
if tag_search:
|
||||
heading = tag_search.group(1)
|
||||
parsedtags = tag_search.group(2)
|
||||
if parsedtags:
|
||||
for parsedtag in parsedtags.split(':'):
|
||||
if parsedtag != '': tags.append(parsedtag)
|
||||
|
@ -153,13 +148,13 @@ def makelist(filename):
|
|||
logbook += [(clocked_in, clocked_out)]
|
||||
line = ""
|
||||
|
||||
prop_srch = re.search(r'^\s*:([a-zA-Z0-9]+):\s*(.*?)\s*$', line)
|
||||
if prop_srch:
|
||||
property_search = re.search(r'^\s*:([a-zA-Z0-9]+):\s*(.*?)\s*$', line)
|
||||
if property_search:
|
||||
# Set ID property to an id based org-mode link to the entry
|
||||
if prop_srch.group(1) == 'ID':
|
||||
propdict['ID'] = f'id:{prop_srch.group(2)}'
|
||||
if property_search.group(1) == 'ID':
|
||||
property_map['ID'] = f'id:{property_search.group(2)}'
|
||||
else:
|
||||
propdict[prop_srch.group(1)] = prop_srch.group(2)
|
||||
property_map[property_search.group(1)] = property_search.group(2)
|
||||
continue
|
||||
|
||||
cd_re = re.search(r'CLOSED:\s*\[([0-9]{4})-([0-9]{2})-([0-9]{2})', line)
|
||||
|
@ -184,39 +179,38 @@ def makelist(filename):
|
|||
|
||||
# write out last node
|
||||
thisNode = Orgnode(level, heading or file_title, bodytext, tags)
|
||||
thisNode.setProperties(propdict)
|
||||
thisNode.properties = property_map
|
||||
if sched_date:
|
||||
thisNode.setScheduled(sched_date)
|
||||
thisNode.scheduled = sched_date
|
||||
if deadline_date:
|
||||
thisNode.setDeadline(deadline_date)
|
||||
thisNode.deadline = deadline_date
|
||||
if closed_date:
|
||||
thisNode.setClosed(closed_date)
|
||||
thisNode.closed = closed_date
|
||||
if logbook:
|
||||
thisNode.setLogbook(logbook)
|
||||
thisNode.logbook = logbook
|
||||
nodelist.append( thisNode )
|
||||
|
||||
# using the list of TODO keywords found in the file
|
||||
# process the headings searching for TODO keywords
|
||||
for n in nodelist:
|
||||
h = n.Heading()
|
||||
todoSrch = re.search(r'([A-Z]+)\s(.*?)$', h)
|
||||
if todoSrch:
|
||||
if todoSrch.group(1) in todos:
|
||||
n.setHeading( todoSrch.group(2) )
|
||||
n.setTodo ( todoSrch.group(1) )
|
||||
todo_search = re.search(r'([A-Z]+)\s(.*?)$', n.heading)
|
||||
if todo_search:
|
||||
if todo_search.group(1) in todos:
|
||||
n.heading = todo_search.group(2)
|
||||
n.todo = todo_search.group(1)
|
||||
|
||||
# extract, set priority from heading, update heading if necessary
|
||||
prtysrch = re.search(r'^\[\#(A|B|C)\] (.*?)$', n.Heading())
|
||||
if prtysrch:
|
||||
n.setPriority(prtysrch.group(1))
|
||||
n.setHeading(prtysrch.group(2))
|
||||
priority_search = re.search(r'^\[\#(A|B|C)\] (.*?)$', n.heading)
|
||||
if priority_search:
|
||||
n.priority = priority_search.group(1)
|
||||
n.heading = priority_search.group(2)
|
||||
|
||||
# Set SOURCE property to a file+heading based org-mode link to the entry
|
||||
if n.Level() == 0:
|
||||
if n.level == 0:
|
||||
n.properties['LINE'] = f'file:{normalize_filename(filename)}::0'
|
||||
n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}]]'
|
||||
else:
|
||||
escaped_heading = n.Heading().replace("[","\\[").replace("]","\\]")
|
||||
escaped_heading = n.heading.replace("[","\\[").replace("]","\\]")
|
||||
n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}::*{escaped_heading}]]'
|
||||
|
||||
return nodelist
|
||||
|
@ -234,199 +228,235 @@ class Orgnode(object):
|
|||
first tag. The makelist routine postprocesses the list to
|
||||
identify TODO tags and updates headline and todo fields.
|
||||
"""
|
||||
self.level = len(level)
|
||||
self.headline = headline
|
||||
self.body = body
|
||||
self.tags = tags # All tags in the headline
|
||||
self.todo = ""
|
||||
self.prty = "" # empty of A, B or C
|
||||
self.scheduled = "" # Scheduled date
|
||||
self.deadline = "" # Deadline date
|
||||
self.closed = "" # Closed date
|
||||
self.properties = dict()
|
||||
self.logbook = list() # List of clock-in, clock-out tuples representing logbook entries
|
||||
self._level = len(level)
|
||||
self._heading = headline
|
||||
self._body = body
|
||||
self._tags = tags # All tags in the headline
|
||||
self._todo = ""
|
||||
self._priority = "" # empty of A, B or C
|
||||
self._scheduled = "" # Scheduled date
|
||||
self._deadline = "" # Deadline date
|
||||
self._closed = "" # Closed date
|
||||
self._properties = dict()
|
||||
self._logbook = list() # List of clock-in, clock-out tuples representing logbook entries
|
||||
|
||||
# Look for priority in headline and transfer to prty field
|
||||
|
||||
def Heading(self):
|
||||
@property
|
||||
def heading(self):
|
||||
"""
|
||||
Return the Heading text of the node without the TODO tag
|
||||
"""
|
||||
return self.headline
|
||||
return self._heading
|
||||
|
||||
def setHeading(self, newhdng):
|
||||
@heading.setter
|
||||
def heading(self, newhdng):
|
||||
"""
|
||||
Change the heading to the supplied string
|
||||
"""
|
||||
self.headline = newhdng
|
||||
self._heading = newhdng
|
||||
|
||||
def Body(self):
|
||||
@property
|
||||
def body(self):
|
||||
"""
|
||||
Returns all lines of text of the body of this node except the
|
||||
Property Drawer
|
||||
"""
|
||||
return self.body
|
||||
return self._body
|
||||
|
||||
def Level(self):
|
||||
@property
|
||||
def hasBody(self):
|
||||
"""
|
||||
Returns True if node has non empty body, else False
|
||||
"""
|
||||
return self._body and re.sub(r'\n|\t|\r| ', '', self._body) != ''
|
||||
|
||||
@property
|
||||
def level(self):
|
||||
"""
|
||||
Returns an integer corresponding to the level of the node.
|
||||
Top level (one asterisk) has a level of 1.
|
||||
"""
|
||||
return self.level
|
||||
return self._level
|
||||
|
||||
def Priority(self):
|
||||
@property
|
||||
def priority(self):
|
||||
"""
|
||||
Returns the priority of this headline: 'A', 'B', 'C' or empty
|
||||
string if priority has not been set.
|
||||
"""
|
||||
return self.prty
|
||||
return self._priority
|
||||
|
||||
def setPriority(self, newprty):
|
||||
@priority.setter
|
||||
def priority(self, new_priority):
|
||||
"""
|
||||
Change the value of the priority of this headline.
|
||||
Valid values are '', 'A', 'B', 'C'
|
||||
"""
|
||||
self.prty = newprty
|
||||
self._priority = new_priority
|
||||
|
||||
def Tags(self):
|
||||
@property
|
||||
def tags(self):
|
||||
"""
|
||||
Returns the list of all tags
|
||||
For example, :HOME:COMPUTER: would return ['HOME', 'COMPUTER']
|
||||
"""
|
||||
return self.tags
|
||||
return self._tags
|
||||
|
||||
def hasTag(self, srch):
|
||||
@property
|
||||
def hasTag(self, tag):
|
||||
"""
|
||||
Returns True if the supplied tag is present in this headline
|
||||
For example, hasTag('COMPUTER') on a headline containing
|
||||
:HOME:COMPUTER: would return True.
|
||||
"""
|
||||
return srch in self.tags
|
||||
return tag in self._tags
|
||||
|
||||
def setTags(self, newtags):
|
||||
@tags.setter
|
||||
def tags(self, newtags):
|
||||
"""
|
||||
Store all the tags found in the headline.
|
||||
"""
|
||||
self.tags = newtags
|
||||
self._tags = newtags
|
||||
|
||||
def Todo(self):
|
||||
@property
|
||||
def todo(self):
|
||||
"""
|
||||
Return the value of the TODO tag
|
||||
"""
|
||||
return self.todo
|
||||
return self._todo
|
||||
|
||||
def setTodo(self, value):
|
||||
@todo.setter
|
||||
def todo(self, new_todo):
|
||||
"""
|
||||
Set the value of the TODO tag to the supplied string
|
||||
"""
|
||||
self.todo = value
|
||||
self._todo = new_todo
|
||||
|
||||
def setProperties(self, dictval):
|
||||
@property
|
||||
def properties(self):
|
||||
"""
|
||||
Return the dictionary of properties
|
||||
"""
|
||||
return self._properties
|
||||
|
||||
@properties.setter
|
||||
def properties(self, new_properties):
|
||||
"""
|
||||
Sets all properties using the supplied dictionary of
|
||||
name/value pairs
|
||||
"""
|
||||
self.properties = dictval
|
||||
self._properties = new_properties
|
||||
|
||||
def Property(self, keyval):
|
||||
def Property(self, property_key):
|
||||
"""
|
||||
Returns the value of the requested property or an empty string if the
|
||||
property does not exist.
|
||||
"""
|
||||
return self.properties.get(keyval, "")
|
||||
return self._properties.get(property_key, "")
|
||||
|
||||
def setScheduled(self, dateval):
|
||||
@property
|
||||
def scheduled(self):
|
||||
"""
|
||||
Set the scheduled date using the supplied date object
|
||||
Return the scheduled date
|
||||
"""
|
||||
self.scheduled = dateval
|
||||
return self._scheduled
|
||||
|
||||
def Scheduled(self):
|
||||
@scheduled.setter
|
||||
def scheduled(self, new_scheduled):
|
||||
"""
|
||||
Return the scheduled date object or null if nonexistent
|
||||
Set the scheduled date to the new scheduled date
|
||||
"""
|
||||
return self.scheduled
|
||||
self._scheduled = new_scheduled
|
||||
|
||||
def setDeadline(self, dateval):
|
||||
@property
|
||||
def deadline(self):
|
||||
"""
|
||||
Set the deadline (due) date using the supplied date object
|
||||
Return the deadline date
|
||||
"""
|
||||
self.deadline = dateval
|
||||
return self._deadline
|
||||
|
||||
def Deadline(self):
|
||||
@deadline.setter
|
||||
def deadline(self, new_deadline):
|
||||
"""
|
||||
Return the deadline date object or null if nonexistent
|
||||
Set the deadline (due) date to the new deadline date
|
||||
"""
|
||||
return self.deadline
|
||||
self._deadline = new_deadline
|
||||
|
||||
def setClosed(self, dateval):
|
||||
@property
|
||||
def closed(self):
|
||||
"""
|
||||
Set the closed date using the supplied date object
|
||||
Return the closed date
|
||||
"""
|
||||
self.closed = dateval
|
||||
return self._closed
|
||||
|
||||
def Closed(self):
|
||||
@closed.setter
|
||||
def closed(self, new_closed):
|
||||
"""
|
||||
Return the closed date object or null if nonexistent
|
||||
Set the closed date to the new closed date
|
||||
"""
|
||||
return self.closed
|
||||
self._closed = new_closed
|
||||
|
||||
def setLogbook(self, logbook):
|
||||
"""
|
||||
Set the logbook with list of clocked-in, clocked-out tuples for the entry
|
||||
"""
|
||||
self.logbook = logbook
|
||||
|
||||
def Logbook(self):
|
||||
@property
|
||||
def logbook(self):
|
||||
"""
|
||||
Return the logbook with all clocked-in, clocked-out date object pairs or empty list if nonexistent
|
||||
"""
|
||||
return self.logbook
|
||||
return self._logbook
|
||||
|
||||
@logbook.setter
|
||||
def logbook(self, new_logbook):
|
||||
"""
|
||||
Set the logbook with list of clocked-in, clocked-out tuples for the entry
|
||||
"""
|
||||
self._logbook = new_logbook
|
||||
|
||||
def __repr__(self):
|
||||
"""
|
||||
Print the level, heading text and tag of a node and the body
|
||||
text as used to construct the node.
|
||||
"""
|
||||
# This method is not completed yet.
|
||||
# Output heading line
|
||||
n = ''
|
||||
for _ in range(0, self.level):
|
||||
for _ in range(0, self._level):
|
||||
n = n + '*'
|
||||
n = n + ' '
|
||||
if self.todo:
|
||||
n = n + self.todo + ' '
|
||||
if self.prty:
|
||||
n = n + '[#' + self.prty + '] '
|
||||
n = n + self.headline
|
||||
if self._todo:
|
||||
n = n + self._todo + ' '
|
||||
if self._priority:
|
||||
n = n + '[#' + self._priority + '] '
|
||||
n = n + self._heading
|
||||
n = "%-60s " % n # hack - tags will start in column 62
|
||||
closecolon = ''
|
||||
for t in self.tags:
|
||||
for t in self._tags:
|
||||
n = n + ':' + t
|
||||
closecolon = ':'
|
||||
n = n + closecolon
|
||||
n = n + "\n"
|
||||
|
||||
# Get body indentation from first line of body
|
||||
indent = indent_regex.match(self.body).group()
|
||||
indent = indent_regex.match(self._body).group()
|
||||
|
||||
# Output Closed Date, Scheduled Date, Deadline Date
|
||||
if self.closed or self.scheduled or self.deadline:
|
||||
if self._closed or self._scheduled or self._deadline:
|
||||
n = n + indent
|
||||
if self.closed:
|
||||
n = n + f'CLOSED: [{self.closed.strftime("%Y-%m-%d %a")}] '
|
||||
if self.scheduled:
|
||||
n = n + f'SCHEDULED: <{self.scheduled.strftime("%Y-%m-%d %a")}> '
|
||||
if self.deadline:
|
||||
n = n + f'DEADLINE: <{self.deadline.strftime("%Y-%m-%d %a")}> '
|
||||
if self.closed or self.scheduled or self.deadline:
|
||||
if self._closed:
|
||||
n = n + f'CLOSED: [{self._closed.strftime("%Y-%m-%d %a")}] '
|
||||
if self._scheduled:
|
||||
n = n + f'SCHEDULED: <{self._scheduled.strftime("%Y-%m-%d %a")}> '
|
||||
if self._deadline:
|
||||
n = n + f'DEADLINE: <{self._deadline.strftime("%Y-%m-%d %a")}> '
|
||||
if self._closed or self._scheduled or self._deadline:
|
||||
n = n + '\n'
|
||||
|
||||
# Output Property Drawer
|
||||
n = n + indent + ":PROPERTIES:\n"
|
||||
for key, value in self.properties.items():
|
||||
for key, value in self._properties.items():
|
||||
n = n + indent + f":{key}: {value}\n"
|
||||
n = n + indent + ":END:\n"
|
||||
|
||||
n = n + self.body
|
||||
# Output Body
|
||||
if self.hasBody:
|
||||
n = n + self._body
|
||||
|
||||
return n
|
||||
|
|
|
@ -182,8 +182,8 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon
|
|||
|
||||
# Map notes in text files to (compressed) JSONL formatted file
|
||||
config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl)
|
||||
previous_entries = extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() else None
|
||||
entries_with_indices = text_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, previous_entries)
|
||||
previous_entries = extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() and not regenerate else None
|
||||
entries_with_indices = text_to_jsonl(config, previous_entries)
|
||||
|
||||
# Extract Updated Entries
|
||||
entries = extract_entries(config.compressed_jsonl)
|
||||
|
|
|
@ -18,6 +18,7 @@ class TextContentConfig(ConfigBase):
|
|||
input_filter: Optional[str]
|
||||
compressed_jsonl: Path
|
||||
embeddings_file: Path
|
||||
index_heading_entries: Optional[bool] = False
|
||||
|
||||
@validator('input_filter')
|
||||
def input_filter_or_files_required(cls, input_filter, values, **kwargs):
|
||||
|
|
|
@ -6,28 +6,33 @@ from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, co
|
|||
from src.utils.helpers import is_none_or_empty
|
||||
|
||||
|
||||
def test_entry_with_empty_body_line_to_jsonl(tmp_path):
|
||||
'''Ensure entries with empty body are ignored.
|
||||
def test_configure_heading_entry_to_jsonl(tmp_path):
|
||||
'''Ensure entries with empty body are ignored, unless explicitly configured to index heading entries.
|
||||
Property drawers not considered Body. Ignore control characters for evaluating if Body empty.'''
|
||||
# Arrange
|
||||
entry = f'''*** Heading
|
||||
:PROPERTIES:
|
||||
:ID: 42-42-42
|
||||
:END:
|
||||
\t\r
|
||||
\t \r
|
||||
'''
|
||||
orgfile = create_file(tmp_path, entry)
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile])
|
||||
for index_heading_entries in [True, False]:
|
||||
# Act
|
||||
# Extract entries into jsonl from specified Org files
|
||||
jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(
|
||||
*extract_org_entries(org_files=[orgfile]),
|
||||
index_heading_entries=index_heading_entries))
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
||||
jsonl_data = convert_org_entries_to_jsonl(entries)
|
||||
|
||||
# Assert
|
||||
assert is_none_or_empty(jsonl_data)
|
||||
# Assert
|
||||
if index_heading_entries:
|
||||
# Entry with empty body indexed when index_heading_entries set to True
|
||||
assert len(jsonl_data) == 1
|
||||
else:
|
||||
# Entry with empty body ignored when index_heading_entries set to False
|
||||
assert is_none_or_empty(jsonl_data)
|
||||
|
||||
|
||||
def test_entry_with_body_to_jsonl(tmp_path):
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
# Standard Packages
|
||||
import datetime
|
||||
from os.path import relpath
|
||||
from pathlib import Path
|
||||
|
||||
# Internal Packages
|
||||
from src.processor.org_mode import orgnode
|
||||
|
@ -20,14 +18,14 @@ def test_parse_entry_with_no_headings(tmp_path):
|
|||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert entries[0].Heading() == f'{orgfile}'
|
||||
assert entries[0].Tags() == list()
|
||||
assert entries[0].Body() == "Body Line 1"
|
||||
assert entries[0].Priority() == ""
|
||||
assert entries[0].heading == f'{orgfile}'
|
||||
assert entries[0].tags == list()
|
||||
assert entries[0].body == "Body Line 1"
|
||||
assert entries[0].priority == ""
|
||||
assert entries[0].Property("ID") == ""
|
||||
assert entries[0].Closed() == ""
|
||||
assert entries[0].Scheduled() == ""
|
||||
assert entries[0].Deadline() == ""
|
||||
assert entries[0].closed == ""
|
||||
assert entries[0].scheduled == ""
|
||||
assert entries[0].deadline == ""
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
|
@ -44,14 +42,14 @@ Body Line 1'''
|
|||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert entries[0].Heading() == "Heading"
|
||||
assert entries[0].Tags() == list()
|
||||
assert entries[0].Body() == "Body Line 1"
|
||||
assert entries[0].Priority() == ""
|
||||
assert entries[0].heading == "Heading"
|
||||
assert entries[0].tags == list()
|
||||
assert entries[0].body == "Body Line 1"
|
||||
assert entries[0].priority == ""
|
||||
assert entries[0].Property("ID") == ""
|
||||
assert entries[0].Closed() == ""
|
||||
assert entries[0].Scheduled() == ""
|
||||
assert entries[0].Deadline() == ""
|
||||
assert entries[0].closed == ""
|
||||
assert entries[0].scheduled == ""
|
||||
assert entries[0].deadline == ""
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
|
@ -77,16 +75,44 @@ Body Line 2'''
|
|||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert entries[0].Heading() == "Heading"
|
||||
assert entries[0].Todo() == "DONE"
|
||||
assert entries[0].Tags() == ["Tag1", "TAG2", "tag3"]
|
||||
assert entries[0].Body() == "- Clocked Log 1\nBody Line 1\nBody Line 2"
|
||||
assert entries[0].Priority() == "A"
|
||||
assert entries[0].heading == "Heading"
|
||||
assert entries[0].todo == "DONE"
|
||||
assert entries[0].tags == ["Tag1", "TAG2", "tag3"]
|
||||
assert entries[0].body == "- Clocked Log 1\nBody Line 1\nBody Line 2"
|
||||
assert entries[0].priority == "A"
|
||||
assert entries[0].Property("ID") == "id:123-456-789-4234-1231"
|
||||
assert entries[0].Closed() == datetime.date(1984,4,1)
|
||||
assert entries[0].Scheduled() == datetime.date(1984,4,1)
|
||||
assert entries[0].Deadline() == datetime.date(1984,4,1)
|
||||
assert entries[0].Logbook() == [(datetime.datetime(1984,4,1,9,0,0), datetime.datetime(1984,4,1,12,0,0))]
|
||||
assert entries[0].closed == datetime.date(1984,4,1)
|
||||
assert entries[0].scheduled == datetime.date(1984,4,1)
|
||||
assert entries[0].deadline == datetime.date(1984,4,1)
|
||||
assert entries[0].logbook == [(datetime.datetime(1984,4,1,9,0,0), datetime.datetime(1984,4,1,12,0,0))]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_render_entry_with_property_drawer_and_empty_body(tmp_path):
|
||||
"Render heading entry with property drawer"
|
||||
# Arrange
|
||||
entry_to_render = f'''
|
||||
*** [#A] Heading1 :tag1:
|
||||
:PROPERTIES:
|
||||
:ID: 111-111-111-1111-1111
|
||||
:END:
|
||||
\t\r \n
|
||||
'''
|
||||
orgfile = create_file(tmp_path, entry_to_render)
|
||||
|
||||
expected_entry = f'''*** [#A] Heading1 :tag1:
|
||||
:PROPERTIES:
|
||||
:LINE: file:{orgfile}::2
|
||||
:ID: id:111-111-111-1111-1111
|
||||
:SOURCE: [[file:{orgfile}::*Heading1]]
|
||||
:END:
|
||||
'''
|
||||
|
||||
# Act
|
||||
parsed_entries = orgnode.makelist(orgfile)
|
||||
|
||||
# Assert
|
||||
assert f'{parsed_entries[0]}' == expected_entry
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
|
@ -109,7 +135,7 @@ Body Line 2
|
|||
|
||||
# Assert
|
||||
# SOURCE link rendered with Heading
|
||||
assert f':SOURCE: [[file:{orgfile}::*{entries[0].Heading()}]]' in f'{entries[0]}'
|
||||
assert f':SOURCE: [[file:{orgfile}::*{entries[0].heading}]]' in f'{entries[0]}'
|
||||
# ID link rendered with ID
|
||||
assert f':ID: id:123-456-789-4234-1231' in f'{entries[0]}'
|
||||
# LINE link rendered with line number
|
||||
|
@ -134,7 +160,7 @@ Body Line 1'''
|
|||
# Assert
|
||||
assert len(entries) == 1
|
||||
# parsed heading from entry
|
||||
assert entries[0].Heading() == "Heading[1]"
|
||||
assert entries[0].heading == "Heading[1]"
|
||||
# ensure SOURCE link has square brackets in filename, heading escaped in rendered entries
|
||||
escaped_orgfile = f'{orgfile}'.replace("[1]", "\\[1\\]")
|
||||
assert f':SOURCE: [[file:{escaped_orgfile}::*Heading\[1\]' in f'{entries[0]}'
|
||||
|
@ -176,16 +202,16 @@ Body 2
|
|||
# Assert
|
||||
assert len(entries) == 2
|
||||
for index, entry in enumerate(entries):
|
||||
assert entry.Heading() == f"Heading{index+1}"
|
||||
assert entry.Todo() == "FAILED" if index == 0 else "CANCELLED"
|
||||
assert entry.Tags() == [f"tag{index+1}"]
|
||||
assert entry.Body() == f"- Clocked Log {index+1}\nBody {index+1}\n\n"
|
||||
assert entry.Priority() == "A"
|
||||
assert entry.heading == f"Heading{index+1}"
|
||||
assert entry.todo == "FAILED" if index == 0 else "CANCELLED"
|
||||
assert entry.tags == [f"tag{index+1}"]
|
||||
assert entry.body == f"- Clocked Log {index+1}\nBody {index+1}\n\n"
|
||||
assert entry.priority == "A"
|
||||
assert entry.Property("ID") == f"id:123-456-789-4234-000{index+1}"
|
||||
assert entry.Closed() == datetime.date(1984,4,index+1)
|
||||
assert entry.Scheduled() == datetime.date(1984,4,index+1)
|
||||
assert entry.Deadline() == datetime.date(1984,4,index+1)
|
||||
assert entry.Logbook() == [(datetime.datetime(1984,4,index+1,9,0,0), datetime.datetime(1984,4,index+1,12,0,0))]
|
||||
assert entry.closed == datetime.date(1984,4,index+1)
|
||||
assert entry.scheduled == datetime.date(1984,4,index+1)
|
||||
assert entry.deadline == datetime.date(1984,4,index+1)
|
||||
assert entry.logbook == [(datetime.datetime(1984,4,index+1,9,0,0), datetime.datetime(1984,4,index+1,12,0,0))]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
|
@ -201,14 +227,14 @@ Body Line 1'''
|
|||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert entries[0].Heading() == f'{orgfile}'
|
||||
assert entries[0].Tags() == list()
|
||||
assert entries[0].Body() == "Body Line 1"
|
||||
assert entries[0].Priority() == ""
|
||||
assert entries[0].heading == f'{orgfile}'
|
||||
assert entries[0].tags == list()
|
||||
assert entries[0].body == "Body Line 1"
|
||||
assert entries[0].priority == ""
|
||||
assert entries[0].Property("ID") == ""
|
||||
assert entries[0].Closed() == ""
|
||||
assert entries[0].Scheduled() == ""
|
||||
assert entries[0].Deadline() == ""
|
||||
assert entries[0].closed == ""
|
||||
assert entries[0].scheduled == ""
|
||||
assert entries[0].deadline == ""
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
|
@ -224,14 +250,14 @@ Body Line 1'''
|
|||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert entries[0].Heading() == 'test'
|
||||
assert entries[0].Tags() == list()
|
||||
assert entries[0].Body() == "Body Line 1"
|
||||
assert entries[0].Priority() == ""
|
||||
assert entries[0].heading == 'test'
|
||||
assert entries[0].tags == list()
|
||||
assert entries[0].body == "Body Line 1"
|
||||
assert entries[0].priority == ""
|
||||
assert entries[0].Property("ID") == ""
|
||||
assert entries[0].Closed() == ""
|
||||
assert entries[0].Scheduled() == ""
|
||||
assert entries[0].Deadline() == ""
|
||||
assert entries[0].closed == ""
|
||||
assert entries[0].scheduled == ""
|
||||
assert entries[0].deadline == ""
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
|
@ -248,14 +274,14 @@ Body Line 1
|
|||
|
||||
# Assert
|
||||
assert len(entries) == 1
|
||||
assert entries[0].Heading() == 'title1 title2'
|
||||
assert entries[0].Tags() == list()
|
||||
assert entries[0].Body() == "Body Line 1\n"
|
||||
assert entries[0].Priority() == ""
|
||||
assert entries[0].heading == 'title1 title2'
|
||||
assert entries[0].tags == list()
|
||||
assert entries[0].body == "Body Line 1\n"
|
||||
assert entries[0].priority == ""
|
||||
assert entries[0].Property("ID") == ""
|
||||
assert entries[0].Closed() == ""
|
||||
assert entries[0].Scheduled() == ""
|
||||
assert entries[0].Deadline() == ""
|
||||
assert entries[0].closed == ""
|
||||
assert entries[0].scheduled == ""
|
||||
assert entries[0].deadline == ""
|
||||
|
||||
|
||||
# Helper Functions
|
||||
|
|
|
@ -13,6 +13,20 @@ from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
|||
|
||||
|
||||
# Test
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_asymmetric_setup_with_missing_file_raises_error(content_config: ContentConfig, search_config: SearchConfig):
    """Check that text search setup raises FileNotFoundError for an org file path this test never creates."""
    # Arrange
    # Build a path alongside the configured org input filter; nothing here writes that file
    org_directory = Path(content_config.org.input_filter).parent
    missing_org_file = org_directory / "new_file_to_index.org"

    # Work on a copy so the shared content_config fixture is left untouched
    org_config = deepcopy(content_config.org)
    org_config.input_files = [str(missing_org_file)]
    org_config.input_filter = None

    # Act, Assert
    # Regenerating the notes embeddings is expected to fail on the listed file
    with pytest.raises(FileNotFoundError):
        text_search.setup(
            org_to_jsonl,
            org_config,
            search_config.asymmetric,
            regenerate=True,
        )
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
|
|
Loading…
Add table
Reference in a new issue