mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Add org processor to generate compressed jsonl from org-mode files
The corpus embeddings are generated from this compressed JSONL using the specified transformer ML model
This commit is contained in:
parent
b74cb9a104
commit
354c541b62
3 changed files with 454 additions and 0 deletions
0
processor/org-mode/__init__.py
Normal file
0
processor/org-mode/__init__.py
Normal file
122
processor/org-mode/org-to-jsonl.py
Normal file
122
processor/org-mode/org-to-jsonl.py
Normal file
|
@ -0,0 +1,122 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# Import Modules
|
||||||
|
import orgnode
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
import pathlib
|
||||||
|
import glob
|
||||||
|
import gzip
|
||||||
|
|
||||||
|
|
||||||
|
# Define Functions
|
||||||
|
def dump_jsonl(jsonl_data, output_path, verbose=0):
    """Write JSON Lines text to ``output_path`` as UTF-8.

    Args:
        jsonl_data: A string holding one JSON document per line.
        output_path: Destination file path; an existing file is overwritten.
        verbose: Verbosity level. ``None`` (what argparse's ``count`` action
            yields when the flag is absent) is treated as 0.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(jsonl_data)

    if verbose and verbose > 0:
        # jsonl_data is a string, so len() would report characters.
        # Count the non-empty '\n'-separated lines to report records instead.
        record_count = sum(1 for line in jsonl_data.split('\n') if line.strip())
        print(f'Wrote {record_count} records to jsonl at {output_path}')
|
||||||
|
|
||||||
|
|
||||||
|
def compress_jsonl_data(jsonl_data, output_path, verbose=0):
    """Write JSON Lines text to ``<output_path>.gz`` as gzip-compressed UTF-8.

    Args:
        jsonl_data: A string holding one JSON document per line.
        output_path: Destination path *without* the ``.gz`` suffix; the suffix
            is appended here.
        verbose: Verbosity level. ``None`` is treated as 0.
    """
    # Pin the encoding: the text may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    with gzip.open(f'{output_path}.gz', 'wt', encoding='utf-8') as gzip_file:
        gzip_file.write(jsonl_data)

    if verbose and verbose > 0:
        # jsonl_data is a string, so len() would report characters.
        record_count = sum(1 for line in jsonl_data.split('\n') if line.strip())
        print(f'Wrote {record_count} records to gzip compressed jsonl at {output_path}.gz')
|
||||||
|
|
||||||
|
|
||||||
|
def load_jsonl(input_path, verbose=0):
    """Read a UTF-8 JSON Lines file and return its decoded records.

    Args:
        input_path: Path of the JSON Lines file to read.
        verbose: Verbosity level. ``None`` is treated as 0.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() removes the line terminator; str.rstrip takes a set of
            # characters, so a '\n|\r' argument would also target '|'.
            payload = line.strip()
            if not payload:
                # Tolerate blank or trailing empty lines instead of failing.
                continue
            data.append(json.loads(payload))

    if verbose and verbose > 0:
        print(f'Loaded {len(data)} records from {input_path}')

    return data
|
||||||
|
|
||||||
|
|
||||||
|
def get_org_files(org_directory, org_files=None, org_file_filter="*.org"):
    """Collect the Org files to process from ``org_directory``.

    Args:
        org_directory: Directory to search, as a ``pathlib.Path`` or a string.
            A leading ``~`` is expanded.
        org_files: Optional iterable of file paths relative to
            ``org_directory``. When non-empty, only these files are kept.
        org_file_filter: Glob pattern (not a regex) applied inside
            ``org_directory``.

    Returns:
        A set of matching files whose names do not start with ``.``. The
        elements are ``pathlib.Path`` objects when ``org_files`` is empty or
        ``None``, and plain path strings when ``org_files`` is given.
    """
    # Accept plain strings as well as Path objects.
    expanded_org_directory = pathlib.Path(org_directory).expanduser()
    candidates = {
        org_file
        for org_file in expanded_org_directory.glob(org_file_filter)
        if not org_file.name.startswith('.')
    }

    if not org_files:
        return candidates

    # Build the lookup set once rather than once per candidate file.
    requested = set(org_files)
    return {
        str(org_file)
        for org_file in candidates
        if str(org_file.relative_to(expanded_org_directory)) in requested
    }
|
||||||
|
|
||||||
|
|
||||||
|
def extract_org_entries(org_files):
    """Parse every given Org file and return all of their entries in one list.

    Each path is converted with ``str`` and handed to ``orgnode.makelist``;
    the nodes it returns are concatenated in iteration order of ``org_files``.
    """
    collected = []
    for path in org_files:
        parsed_nodes = orgnode.makelist(str(path))
        collected.extend(parsed_nodes)
    return collected
|
||||||
|
|
||||||
|
def convert_org_entries_to_jsonl(entries, jsonl_file, verbose=0):
    """Serialize Org entries to a JSON Lines string.

    Args:
        entries: Iterable of objects exposing ``Heading()``, ``Tags()`` and
            ``Body()`` methods.
        jsonl_file: Not used by this function; kept so existing callers that
            pass it keep working.
        verbose: Verbosity level. ``None`` (argparse ``count`` without the
            flag) is treated as 0. Titles and tags are printed above 1,
            bodies above 2.

    Returns:
        One JSON object per entry, each terminated by a newline. Every object
        has a "Title" key; "Tags" (space-joined) and "Body" are present only
        when the entry has them. Non-ASCII text is written unescaped.
    """
    verbosity = verbose or 0
    lines = []

    for entry in entries:
        heading = entry.Heading()
        entry_dict = {"Title": heading}
        if verbosity > 1:
            print(f"Title: {heading}")

        tags = entry.Tags()
        if tags:
            tags_str = " ".join(tags)
            entry_dict["Tags"] = tags_str
            if verbosity > 1:
                print(f"Tags: {tags_str}")

        body = entry.Body()
        if body:
            entry_dict["Body"] = body
            if verbosity > 2:
                print(f"Body: {body}")

        # "Title" is always set, so every entry yields exactly one line.
        lines.append(json.dumps(entry_dict, ensure_ascii=False))

    # Join once at the end rather than growing a string inside the loop.
    return ''.join(f'{line}\n' for line in lines)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Setup argument parser
    parser = argparse.ArgumentParser(description="Map Org-Mode notes into JSONL format")
    parser.add_argument('--jsonl-file', type=pathlib.Path, required=True, help="Output file for JSONL formatted notes")
    parser.add_argument('--org-directory', default=pathlib.Path("./"), type=pathlib.Path, help="Input directory from which to retrieve Org-Mode files to convert. Default: Current directory.")
    parser.add_argument('--org-files', '-f', nargs='+', help="List of org mode files to process. Requires file path relative to org_directory")
    # The filter is passed to Path.glob, so it is a glob pattern, not a regex.
    parser.add_argument('--org-file-filter', type=str, default="*.org", help="Glob pattern to filter org files in org_directory to process. Default: All org files in org_directory")
    parser.add_argument('--compress', action='store_true', default=False, help="Compress output to gzipped jsonl file")
    # default=0 is required: without it the 'count' action yields None when
    # -v is absent, and the numeric verbosity comparisons downstream fail.
    parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs")
    args = parser.parse_args()

    # Get Org Files to Process
    org_files = get_org_files(args.org_directory, args.org_files, args.org_file_filter)

    # Extract Entries from specified Org files
    entries = extract_org_entries(org_files)

    # Process Each Entry from All Notes Files
    jsonl_data = convert_org_entries_to_jsonl(entries, str(args.jsonl_file), verbose=args.verbose)

    # Write the JSONL data, gzip-compressed when requested
    if args.compress:
        compress_jsonl_data(jsonl_data, str(args.jsonl_file.expanduser()), verbose=args.verbose)
    else:
        dump_jsonl(jsonl_data, str(args.jsonl_file.expanduser()), verbose=args.verbose)
|
332
processor/org-mode/orgnode.py
Normal file
332
processor/org-mode/orgnode.py
Normal file
|
@ -0,0 +1,332 @@
|
||||||
|
# Copyright (c) 2010 Charles Cave
|
||||||
|
#
|
||||||
|
# Permission is hereby granted, free of charge, to any person
|
||||||
|
# obtaining a copy of this software and associated documentation
|
||||||
|
# files (the "Software"), to deal in the Software without
|
||||||
|
# restriction, including without limitation the rights to use, copy,
|
||||||
|
# modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||||
|
# of the Software, and to permit persons to whom the Software is
|
||||||
|
# furnished to do so, subject to the following conditions:
|
||||||
|
#
|
||||||
|
# The above copyright notice and this permission notice shall be
|
||||||
|
# included in all copies or substantial portions of the Software.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||||
|
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||||
|
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
# SOFTWARE.
|
||||||
|
|
||||||
|
# Program written by Charles Cave (charlesweb@optusnet.com.au)
|
||||||
|
# February - March 2009
|
||||||
|
# Version 2 - June 2009
|
||||||
|
# Added support for all tags, TODO priority and checking existence of a tag
|
||||||
|
# More information at
|
||||||
|
# http://members.optusnet.com.au/~charles57/GTD
|
||||||
|
|
||||||
|
"""
|
||||||
|
The Orgnode module consists of the Orgnode class for representing a
|
||||||
|
headline and associated text from an org-mode file, and routines for
|
||||||
|
constructing data structures of these classes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re, sys
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
def makelist(filename):
    """
    Read an org-mode file and return a list of Orgnode objects
    created from this file.

    The file is read as UTF-8. If it cannot be opened, a message is printed
    and the program exits with status 1. A file that contains no headline
    yields an empty list. Text before the first headline is discarded.

    For each headline the node records its level, heading text, tags (only
    recognised when the headline ends in two or more ``:tag:`` entries),
    body text, properties, and SCHEDULED / DEADLINE dates. Lines starting
    with ``#``, CLOCK lines and the contents of PROPERTIES drawers are left
    out of the body. A leading TODO keyword and a ``[#A]``-style priority
    are moved from the heading into the node's todo and priority fields.
    """
    try:
        # Read everything inside a context manager so the handle is always closed.
        with open(filename, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except IOError:
        print(f"Unable to open file {filename}")
        print("Program terminating.")
        sys.exit(1)

    # Patterns are raw strings (no invalid-escape warnings) and compiled once.
    heading_re = re.compile(r'^(\*+)\s(.*?)\s*$')
    tags_re = re.compile(r'(.*?)\s*:([a-zA-Z0-9].*?):([a-zA-Z0-9].*?):$')
    seq_todo_re = re.compile(r'([A-Z]+)\(')
    clock_re = re.compile(r'CLOCK: \[[0-9]{4}-[0-9]{2}-[0-9]{2}')
    property_re = re.compile(r'^\s*:(.*?):\s*(.*?)\s*$')
    scheduled_re = re.compile(r'SCHEDULED:\s+<([0-9]+)\-([0-9]+)\-([0-9]+)')
    deadline_re = re.compile(r'DEADLINE:\s*<(\d+)\-(\d+)\-(\d+)')
    todo_re = re.compile(r'([A-Z]+)\s(.*?)$')
    priority_re = re.compile(r'^\[\#(A|B|C)\] (.*?)$')

    todos = {"TODO": "", "WAITING": "", "ACTIVE": "",
             "DONE": "", "CANCELLED": "", "FAILED": ""}  # extended from #+SEQ_TODO lines
    level = None         # raw asterisks of the current headline; None until one is seen
    heading = ""
    bodytext = ""
    tag1 = ""            # The first tag enclosed in ::
    alltags = []         # list of all tags in headline
    sched_date = ''
    deadline_date = ''
    nodelist = []
    propdict = dict()
    in_properties_drawer = False

    for line in lines:
        hdng = heading_re.search(line)
        if hdng:
            # A new headline closes the node collected so far.
            if heading:
                nodelist.append(_orgnode_from_state(
                    level, heading, bodytext, tag1, alltags,
                    propdict, sched_date, deadline_date))

            # Start every node with clean per-node state.
            propdict = dict()
            sched_date = ''
            deadline_date = ''
            level = hdng.group(1)
            heading = hdng.group(2)
            bodytext = ""
            tag1 = ""
            alltags = []
            tagsrch = tags_re.search(heading)
            if tagsrch:
                heading = tagsrch.group(1)
                tag1 = tagsrch.group(2)
                alltags.append(tag1)
                # group(3) may hold several further tags separated by ':'
                alltags.extend(t for t in tagsrch.group(3).split(':') if t != '')
            continue

        # Non-headline line
        if line[:10] == '#+SEQ_TODO':
            for kw in seq_todo_re.findall(line):
                todos[kw] = ""

        # Drawer delimiters themselves never reach the body or the properties.
        if re.search(':PROPERTIES:', line):
            in_properties_drawer = True
            continue
        if in_properties_drawer and re.search(':END:', line):
            in_properties_drawer = False
            continue

        # Ignore Clocking Lines
        if clock_re.search(line):
            continue

        if not in_properties_drawer and line[:1] != '#':
            bodytext = bodytext + line

        prop_srch = property_re.search(line)
        if prop_srch:
            propdict[prop_srch.group(1)] = prop_srch.group(2)
            continue

        sd_re = scheduled_re.search(line)
        if sd_re:
            sched_date = datetime.date(int(sd_re.group(1)),
                                       int(sd_re.group(2)),
                                       int(sd_re.group(3)))
        dd_re = deadline_re.search(line)
        if dd_re:
            deadline_date = datetime.date(int(dd_re.group(1)),
                                          int(dd_re.group(2)),
                                          int(dd_re.group(3)))

    # Write out the last node, but only if the file had a headline at all;
    # otherwise there is no level string to build a node from.
    if level is not None:
        nodelist.append(_orgnode_from_state(
            level, heading, bodytext, tag1, alltags,
            propdict, sched_date, deadline_date))

    # Using the TODO keywords known for this file, move a leading keyword
    # and a priority cookie out of each heading.
    for n in nodelist:
        # match() anchors at the start: a keyword counts only when it leads
        # the heading, so capitalised words later in the text are left alone.
        todo_srch = todo_re.match(n.Heading())
        if todo_srch and todo_srch.group(1) in todos:
            n.setHeading(todo_srch.group(2))
            n.setTodo(todo_srch.group(1))
        prtysrch = priority_re.search(n.Heading())
        if prtysrch:
            n.setPriority(prtysrch.group(1))
            n.setHeading(prtysrch.group(2))

    return nodelist


def _orgnode_from_state(level, heading, bodytext, tag1, alltags,
                        propdict, sched_date, deadline_date):
    """Build one Orgnode from the parser state accumulated for a headline."""
    node = Orgnode(level, heading, bodytext, tag1, alltags)
    node.setProperties(propdict)
    if sched_date:
        node.setScheduled(sched_date)
    if deadline_date:
        node.setDeadline(deadline_date)
    return node
|
||||||
|
|
||||||
|
######################
|
||||||
|
class Orgnode(object):
    """
    Orgnode class represents a headline, tags and text associated
    with the headline.
    """

    def __init__(self, level, headline, body, tag, alltags):
        """
        Create an Orgnode object given the parameters of level (as the
        raw asterisks), headline text (including the TODO tag), body
        text, first tag and the list of all tags. The makelist routine
        postprocesses the list to identify TODO tags and updates the
        headline and todo fields.
        """
        self.level = len(level)   # number of asterisks
        self.headline = headline
        self.body = body
        self.tag = tag            # The first tag in the list
        # All tags in the headline; a dict keeps them unique and in order
        self.tags = dict.fromkeys(alltags, '')
        self.todo = ""
        self.prty = ""            # empty or A, B or C
        self.scheduled = ""       # Scheduled date
        self.deadline = ""        # Deadline date
        self.properties = dict()

    def Heading(self):
        """
        Return the Heading text of the node without the TODO tag
        """
        return self.headline

    def setHeading(self, newhdng):
        """
        Change the heading to the supplied string
        """
        self.headline = newhdng

    def Body(self):
        """
        Returns the body text stored for this node.
        """
        return self.body

    def Level(self):
        """
        Returns an integer corresponding to the level of the node.
        Top level (one asterisk) has a level of 1.
        """
        return self.level

    def Priority(self):
        """
        Returns the priority of this headline: 'A', 'B', 'C' or empty
        string if priority has not been set.
        """
        return self.prty

    def setPriority(self, newprty):
        """
        Change the value of the priority of this headline.
        Valid values are '', 'A', 'B', 'C'
        """
        self.prty = newprty

    def Tag(self):
        """
        Returns the value of the first tag.
        For example, :HOME:COMPUTER: would return HOME
        """
        return self.tag

    def Tags(self):
        """
        Returns a list of all tags
        For example, :HOME:COMPUTER: would return ['HOME', 'COMPUTER']
        """
        # Return a real list, as documented, rather than a dict view.
        return list(self.tags)

    def hasTag(self, srch):
        """
        Returns True if the supplied tag is present in this headline
        For example, hasTag('COMPUTER') on a heading containing
        :HOME:COMPUTER: would return True.
        """
        return srch in self.tags

    def setTag(self, newtag):
        """
        Change the value of the first tag to the supplied string
        """
        self.tag = newtag

    def setTags(self, taglist):
        """
        Add all the supplied tags to the tags of this headline. The
        first-tag field is left unchanged; use setTag to change it.
        """
        for t in taglist:
            self.tags[t] = ''

    def Todo(self):
        """
        Return the value of the TODO tag
        """
        return self.todo

    def setTodo(self, value):
        """
        Set the value of the TODO tag to the supplied string
        """
        self.todo = value

    def setProperties(self, dictval):
        """
        Sets all properties using the supplied dictionary of
        name/value pairs
        """
        self.properties = dictval

    def Property(self, keyval):
        """
        Returns the value of the requested property or an empty string
        if the property does not exist.
        """
        return self.properties.get(keyval, "")

    def setScheduled(self, dateval):
        """
        Set the scheduled date using the supplied date object
        """
        self.scheduled = dateval

    def Scheduled(self):
        """
        Return the scheduled date object or an empty string if not set
        """
        return self.scheduled

    def setDeadline(self, dateval):
        """
        Set the deadline (due) date using the supplied date object
        """
        self.deadline = dateval

    def Deadline(self):
        """
        Return the deadline date object or an empty string if not set
        """
        return self.deadline

    def __repr__(self):
        """
        Return the headline line (asterisks, TODO keyword, priority,
        heading text and tags) followed by the body text as used to
        construct the node. Scheduled date, deadline date and
        properties are not included.
        """
        headline = '*' * self.level + ' ' + self.todo + ' '
        if self.prty:
            headline += '[#' + self.prty + '] '
        headline += self.headline
        text = "%-60s " % headline   # hack - tags will start in column 62
        if self.tags:
            text += ':' + ':'.join(self.tags) + ':'
        return text + "\n" + self.body
|
Loading…
Reference in a new issue