Add org processor to generate compressed jsonl from org-mode files

The corpus embeddings are generated from this compressed JSONL using the specified transformer ML model
2025-02-18 22:04:20 +00:00 · 2021-08-15 22:49:09 -07:00 · 2021-08-15 22:49:09 -07:00 · 354c541b62
commit 354c541b62
parent b74cb9a104
3 changed files with 454 additions and 0 deletions
--- a/processor/org-mode/init.py
+++ b/processor/org-mode/init.py
--- a/processor/org-mode/org-to-jsonl.py
+++ b/processor/org-mode/org-to-jsonl.py
@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+
+# Import Modules
+import orgnode
+import json
+import argparse
+import pathlib
+import glob
+import gzip
+
+
+# Define Functions
+def dump_jsonl(jsonl_data, output_path, verbose=0):
+    """Write a JSONL-formatted string to a UTF-8 text file.
+
+    Args:
+        jsonl_data: Newline-terminated JSON records as a single string.
+        output_path: Destination path; an existing file is overwritten.
+        verbose: Verbosity level. Any value above 0 prints a summary line.
+            ``None`` (what argparse passes when ``-v`` is absent) is
+            treated as 0.
+    """
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(jsonl_data)
+
+    if verbose and verbose > 0:
+        # Each record ends with '\n'; len(jsonl_data) would count characters.
+        record_count = jsonl_data.count('\n')
+        print(f'Wrote {record_count} records to jsonl at {output_path}')
+
+
+def compress_jsonl_data(jsonl_data, output_path, verbose=0):
+    """Write a JSONL-formatted string to a gzip-compressed UTF-8 file.
+
+    Args:
+        jsonl_data: Newline-terminated JSON records as a single string.
+        output_path: Base path; the data is written to ``{output_path}.gz``.
+        verbose: Verbosity level. Any value above 0 prints a summary line.
+            ``None`` is treated as 0.
+    """
+    # Encode explicitly so the compressed output matches dump_jsonl (UTF-8)
+    # instead of depending on the platform's default encoding.
+    with gzip.open(f'{output_path}.gz', 'wt', encoding='utf-8') as gzip_file:
+        gzip_file.write(jsonl_data)
+
+    if verbose and verbose > 0:
+        # Each record ends with '\n'; len(jsonl_data) would count characters.
+        record_count = jsonl_data.count('\n')
+        print(f'Wrote {record_count} records to gzip compressed jsonl at {output_path}.gz')
+
+
+def load_jsonl(input_path, verbose=0):
+    """Read a JSON-lines file and return its records as a list.
+
+    Args:
+        input_path: Path of the UTF-8 encoded JSONL file to read.
+        verbose: Verbosity level. Any value above 0 prints a summary line.
+            ``None`` is treated as 0.
+
+    Returns:
+        A list with one decoded JSON value per non-blank line, in file order.
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    data = []
+    with open(input_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            # Strip surrounding whitespace only; the previous character set
+            # '\n|\r' also removed literal '|' characters.
+            record = line.strip()
+            if not record:
+                # Blank lines carry no record and would fail to decode.
+                continue
+            data.append(json.loads(record))
+
+    if verbose and verbose > 0:
+        print(f'Loaded {len(data)} records from {input_path}')
+
+    return data
+
+
+def get_org_files(org_directory, org_files=None, org_file_filter="*.org"):
+    """Collect the Org files to process from a directory.
+
+    Args:
+        org_directory: ``pathlib.Path`` of the directory to search; ``~`` is
+            expanded.
+        org_files: Optional iterable of file paths, relative to
+            ``org_directory``, to restrict the result to.
+        org_file_filter: Glob pattern (not a regex) handed to ``Path.glob``.
+
+    Returns:
+        A set of matching files whose names do not start with ``.``.
+        Elements are ``pathlib.Path`` objects when ``org_files`` is empty
+        or ``None``, and ``str`` paths when it is given. This difference
+        is kept for backward compatibility.
+    """
+    expanded_org_directory = org_directory.expanduser()
+    filtered_org_files = {
+        org_file
+        for org_file in expanded_org_directory.glob(org_file_filter)
+        if not org_file.name.startswith('.')
+    }
+
+    # Filter to User specified Org files when set by User
+    if org_files:
+        # Build the lookup set once rather than once per candidate file.
+        requested_org_files = set(org_files)
+        filtered_org_files = {
+            str(org_file)
+            for org_file in filtered_org_files
+            if str(org_file.relative_to(expanded_org_directory)) in requested_org_files
+        }
+
+    return filtered_org_files
+
+
+def extract_org_entries(org_files):
+    """Parse every given Org file and return all of their entries.
+
+    Each path is converted to ``str`` and passed to ``orgnode.makelist``;
+    the resulting node lists are concatenated in iteration order.
+    """
+    return [
+        node
+        for org_path in org_files
+        for node in orgnode.makelist(str(org_path))
+    ]
+
+def convert_org_entries_to_jsonl(entries, jsonl_file, verbose=0):
+    """Convert Org entries into a JSONL-formatted string.
+
+    Each entry becomes one JSON object on its own line. It always has a
+    ``"Title"`` key (from ``entry.Heading()``); ``"Tags"`` (space-joined)
+    and ``"Body"`` are added only when the entry has them.
+
+    Args:
+        entries: Iterable of objects exposing ``Heading()``, ``Tags()`` and
+            ``Body()``.
+        jsonl_file: Unused by this function; kept so existing callers work.
+        verbose: Verbosity level. Above 1 prints titles and tags, above 2
+            also prints bodies. ``None`` is treated as 0.
+
+    Returns:
+        The records as a single string, each terminated by a newline.
+    """
+    # argparse's 'count' action yields None when -v is absent.
+    verbosity = verbose or 0
+    lines = []
+    for entry in entries:
+        title = entry.Heading()
+        entry_dict = {"Title": title}
+        if verbosity > 1:
+            print(f"Title: {title}")
+
+        tags = entry.Tags()
+        if tags:
+            tags_str = " ".join(tags)
+            entry_dict["Tags"] = tags_str
+            if verbosity > 1:
+                print(f"Tags: {tags_str}")
+
+        body = entry.Body()
+        if body:
+            entry_dict["Body"] = body
+            if verbosity > 2:
+                print(f"Body: {body}")
+
+        # Convert Dictionary to JSON; lines are joined once at the end to
+        # avoid repeated string concatenation.
+        lines.append(f'{json.dumps(entry_dict, ensure_ascii=False)}\n')
+
+    return ''.join(lines)
+
+
+if __name__ == '__main__':
+    # Command-line entry point: collect Org files, parse them into entries,
+    # convert the entries to JSONL and write the result plain or gzipped.
+
+    # Setup argument parser
+    parser = argparse.ArgumentParser(description="Map Org-Mode notes into JSONL format")
+    parser.add_argument('--jsonl-file', type=pathlib.Path, required=True, help="Output file for JSONL formatted notes")
+    parser.add_argument('--org-directory', default=pathlib.Path("./"), type=pathlib.Path, help="Input directory from which to retrieve Org-Mode files to convert. Default: Current directory.")
+    parser.add_argument('--org-files', '-f', nargs='+', help="List of org mode files to process. Requires file path relative to org_directory")
+    # The filter is handed to Path.glob, so it is a glob pattern, not a regex.
+    parser.add_argument('--org-file-filter', type=str, default="*.org", help="Glob pattern to filter org files in org_directory to process. Default: All org files in org_directory")
+    parser.add_argument('--compress', action='store_true', default=False, help="Compress output to gzipped jsonl file")
+    # default=0 is required: without it the 'count' action yields None when
+    # -v is absent, and the `verbose > N` comparisons downstream raise
+    # TypeError.
+    parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs")
+    args = parser.parse_args()
+
+    # Get Org Files to Process
+    org_files = get_org_files(args.org_directory, args.org_files, args.org_file_filter)
+
+    # Extract Entries from specified Org files
+    entries = extract_org_entries(org_files)
+
+    # Process Each Entry from All Notes Files
+    jsonl_data = convert_org_entries_to_jsonl(entries, str(args.jsonl_file), verbose=args.verbose)
+
+    # Write JSONL formatted Data, compressed when requested
+    if args.compress:
+        compress_jsonl_data(jsonl_data, str(args.jsonl_file.expanduser()), verbose=args.verbose)
+    else:
+        dump_jsonl(jsonl_data, str(args.jsonl_file.expanduser()), verbose=args.verbose)
--- a/processor/org-mode/orgnode.py
+++ b/processor/org-mode/orgnode.py
@ -0,0 +1,332 @@
+# Copyright (c) 2010 Charles Cave
+#
+#  Permission  is  hereby  granted,  free  of charge,  to  any  person
+#  obtaining  a copy  of  this software  and associated  documentation
+#  files   (the  "Software"),   to  deal   in  the   Software  without
+#  restriction, including without limitation  the rights to use, copy,
+#  modify, merge, publish,  distribute, sublicense, and/or sell copies
+#  of  the Software, and  to permit  persons to  whom the  Software is
+#  furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be
+#  included in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+#  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+#  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+#  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+#  SOFTWARE.
+
+# Program written by Charles Cave   (charlesweb@optusnet.com.au)
+# February - March 2009
+# Version 2 - June 2009
+#   Added support for all tags, TODO priority and checking existence of a tag
+# More information at
+#    http://members.optusnet.com.au/~charles57/GTD
+
+"""
+The Orgnode module consists of the Orgnode class for representing a
+headline and associated text from an org-mode file, and routines for
+constructing data structures of these classes.
+"""
+
+import re, sys
+import datetime
+
+def makelist(filename):
+    """
+    Read an org-mode file and return a list of Orgnode objects
+    created from this file.
+
+    One node is produced per headline. Lines following a headline become
+    its body, except comment lines starting with '#', CLOCK lines and the
+    contents of PROPERTIES drawers. Text before the first headline is
+    discarded once a headline is seen; a file without any headline yields
+    a single node with an empty heading holding the collected text.
+
+    If the file cannot be opened, a message is printed and the process
+    exits with status 1 (SystemExit).
+    """
+    try:
+        f = open(filename, 'r')
+    except IOError:
+        print(f"Unable to open file {filename}")
+        print("Program terminating.")
+        sys.exit(1)
+
+    todos         = { "TODO": "", "WAITING": "", "ACTIVE": "",
+                      "DONE": "", "CANCELLED": "", "FAILED": ""} # extended from #+SEQ_TODO line
+    # Raw asterisks of the current headline. Kept as a string (not 0) so
+    # Orgnode can take len() of it even when the file has no headline.
+    level         = ""
+    heading       = ""
+    bodytext      = ""
+    tag1          = ""      # The first tag enclosed in ::
+    alltags       = []      # list of all tags in headline
+    sched_date    = ''
+    deadline_date = ''
+    nodelist      = []
+    propdict      = dict()
+    in_properties_drawer = False
+
+    # `with` guarantees the file handle is closed once parsing finishes.
+    with f:
+        for line in f:
+            hdng = re.search(r'^(\*+)\s(.*?)\s*$', line)
+            if hdng:
+                if heading:  # flush the node collected for the previous headline
+                    thisNode = Orgnode(level, heading, bodytext, tag1, alltags)
+                    if sched_date:
+                        thisNode.setScheduled(sched_date)
+                        sched_date = ""
+                    if deadline_date:
+                        thisNode.setDeadline(deadline_date)
+                        deadline_date = ''
+                    thisNode.setProperties(propdict)
+                    nodelist.append(thisNode)
+                    propdict = dict()
+                level = hdng.group(1)
+                heading = hdng.group(2)
+                bodytext = ""
+                tag1 = ""
+                alltags = []       # list of all tags in headline
+                tagsrch = re.search(r'(.*?)\s*:([a-zA-Z0-9].*?):([a-zA-Z0-9].*?):$', heading)
+                if tagsrch:
+                    heading = tagsrch.group(1)
+                    tag1 = tagsrch.group(2)
+                    alltags.append(tag1)
+                    for t in tagsrch.group(3).split(':'):
+                        if t != '':
+                            alltags.append(t)
+                else:
+                    # The pattern above needs at least two tags. Also accept
+                    # a single trailing tag such as "Heading :home:". The tag
+                    # must be preceded by whitespace and contain no colon.
+                    single_tag = re.search(r'^(.*?)\s+:([A-Za-z0-9_@#%]+):$', heading)
+                    if single_tag:
+                        heading = single_tag.group(1)
+                        tag1 = single_tag.group(2)
+                        alltags.append(tag1)
+            else:      # we are processing a non-heading line
+                if line[:10] == '#+SEQ_TODO':
+                    kwlist = re.findall(r'([A-Z]+)\(', line)
+                    for kw in kwlist:
+                        todos[kw] = ""
+
+                # Drawer delimiter lines are skipped; lines inside the drawer
+                # are kept out of the body but still reach the property
+                # match below.
+                if re.search(':PROPERTIES:', line):
+                    in_properties_drawer = True
+                    continue
+                if in_properties_drawer and re.search(':END:', line):
+                    in_properties_drawer = False
+                    continue
+
+                # Ignore Clocking Lines
+                if re.search(r'CLOCK: \[[0-9]{4}-[0-9]{2}-[0-9]{2}', line):
+                    continue
+
+                if not in_properties_drawer and line[:1] != '#':
+                    bodytext = bodytext + line
+
+                prop_srch = re.search(r'^\s*:(.*?):\s*(.*?)\s*$', line)
+                if prop_srch:
+                    propdict[prop_srch.group(1)] = prop_srch.group(2)
+                    continue
+                sd_re = re.search(r'SCHEDULED:\s+<([0-9]+)\-([0-9]+)\-([0-9]+)', line)
+                if sd_re:
+                    sched_date = datetime.date(int(sd_re.group(1)),
+                                               int(sd_re.group(2)),
+                                               int(sd_re.group(3)))
+                dd_re = re.search(r'DEADLINE:\s*<(\d+)\-(\d+)\-(\d+)', line)
+                if dd_re:
+                    deadline_date = datetime.date(int(dd_re.group(1)),
+                                                  int(dd_re.group(2)),
+                                                  int(dd_re.group(3)))
+
+    # write out last node; skipped only when the file had neither a
+    # headline nor any body text, which would give an entirely empty node
+    if level or bodytext:
+        thisNode = Orgnode(level, heading, bodytext, tag1, alltags)
+        thisNode.setProperties(propdict)
+        if sched_date:
+            thisNode.setScheduled(sched_date)
+        if deadline_date:
+            thisNode.setDeadline(deadline_date)
+        nodelist.append(thisNode)
+
+    # using the list of TODO keywords found in the file
+    # process the headings searching for TODO keywords
+    for n in nodelist:
+        # Anchored at the start of the heading: a keyword in the middle of
+        # the text must not be taken as the TODO state.
+        todoSrch = re.match(r'([A-Z]+)\s(.*?)$', n.Heading())
+        if todoSrch and todoSrch.group(1) in todos:
+            n.setHeading(todoSrch.group(2))
+            n.setTodo(todoSrch.group(1))
+        prtysrch = re.search(r'^\[\#(A|B|C)\] (.*?)$', n.Heading())
+        if prtysrch:
+            n.setPriority(prtysrch.group(1))
+            n.setHeading(prtysrch.group(2))
+
+    return nodelist
+
+######################
+class Orgnode(object):
+    """
+    Orgnode class represents a headline, tags and text associated
+    with the headline.
+    """
+    def __init__(self, level, headline, body, tag, alltags):
+        """
+        Create an Orgnode object given the parameters of level (as the
+        raw asterisks, or an integer depth), headline text (including
+        the TODO tag), body text, first tag and the list of all tags.
+        The makelist routine postprocesses the list to identify TODO
+        tags and priorities and updates the headline, todo and
+        priority fields.
+        """
+        # Accept an integer depth as well as the raw asterisk string so a
+        # caller passing 0 for "no headline" does not fail on len().
+        self.level = level if isinstance(level, int) else len(level)
+        self.headline = headline
+        self.body = body
+        self.tag = tag            # The first tag in the list
+        self.tags = dict()        # All tags in the headline (keys only)
+        self.todo = ""
+        self.prty = ""            # empty or A, B or C
+        self.scheduled = ""       # Scheduled date
+        self.deadline = ""        # Deadline date
+        self.properties = dict()
+        for t in alltags:
+            self.tags[t] = ''
+
+    def Heading(self):
+        """
+        Return the Heading text of the node without the TODO tag
+        """
+        return self.headline
+
+    def setHeading(self, newhdng):
+        """
+        Change the heading to the supplied string
+        """
+        self.headline = newhdng
+
+    def Body(self):
+        """
+        Returns all lines of text of the body of this node except the
+        Property Drawer
+        """
+        return self.body
+
+    def Level(self):
+        """
+        Returns an integer corresponding to the level of the node.
+        Top level (one asterisk) has a level of 1.
+        """
+        return self.level
+
+    def Priority(self):
+        """
+        Returns the priority of this headline: 'A', 'B', 'C' or empty
+        string if priority has not been set.
+        """
+        return self.prty
+
+    def setPriority(self, newprty):
+        """
+        Change the value of the priority of this headline.
+        Valid values are '', 'A', 'B', 'C'
+        """
+        self.prty = newprty
+
+    def Tag(self):
+        """
+        Returns the value of the first tag.
+        For example, :HOME:COMPUTER: would return HOME
+        """
+        return self.tag
+
+    def Tags(self):
+        """
+        Returns a list of all tags, in the order they were added.
+        For example, :HOME:COMPUTER: would return ['HOME', 'COMPUTER']
+        """
+        # Return a real list, as documented, rather than a dict keys view.
+        return list(self.tags)
+
+    def hasTag(self, srch):
+        """
+        Returns True if the supplied tag is present in this headline
+        For example, hasTag('COMPUTER') on a headline containing
+        :HOME:COMPUTER: would return True.
+        """
+        return srch in self.tags
+
+    def setTag(self, newtag):
+        """
+        Change the value of the first tag to the supplied string
+        """
+        self.tag = newtag
+
+    def setTags(self, taglist):
+        """
+        Add all the supplied tags to the tags of this headline.
+        Existing tags are kept and the first tag (see Tag) is not
+        changed by this method.
+        """
+        for t in taglist:
+            self.tags[t] = ''
+
+    def Todo(self):
+        """
+        Return the value of the TODO tag
+        """
+        return self.todo
+
+    def setTodo(self, value):
+        """
+        Set the value of the TODO tag to the supplied string
+        """
+        self.todo = value
+
+    def setProperties(self, dictval):
+        """
+        Sets all properties using the supplied dictionary of
+        name/value pairs
+        """
+        self.properties = dictval
+
+    def Property(self, keyval):
+        """
+        Returns the value of the requested property or an empty string
+        if the property does not exist.
+        """
+        return self.properties.get(keyval, "")
+
+    def setScheduled(self, dateval):
+        """
+        Set the scheduled date using the supplied date object
+        """
+        self.scheduled = dateval
+
+    def Scheduled(self):
+        """
+        Return the scheduled date object or an empty string if
+        nonexistent
+        """
+        return self.scheduled
+
+    def setDeadline(self, dateval):
+        """
+        Set the deadline (due) date using the supplied date object
+        """
+        self.deadline = dateval
+
+    def Deadline(self):
+        """
+        Return the deadline date object or an empty string if
+        nonexistent
+        """
+        return self.deadline
+
+    def __repr__(self):
+        """
+        Return the level, heading text and tags of a node followed by
+        the body text as used to construct the node.
+        """
+        # This method is not completed yet: scheduled date, deadline date
+        # and properties are not part of the output.
+        n = '*' * self.level + ' ' + self.todo + ' '
+        if self.prty:
+            n = n + '[#' + self.prty + '] '
+        n = n + self.headline
+        n = "%-60s " % n     # hack - tags will start in column 62
+        if self.tags:
+            n = n + ':' + ':'.join(self.tags) + ':'
+        # The following will output the text used to construct the object
+        n = n + "\n" + self.body
+
+        return n