From 7ff1bd9f8b6543eac784c46310ae69073b31ec05 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 Apr 2024 01:49:15 +0530 Subject: [PATCH] Send more text file types from Desktop app and improve indexing them - Allow syncing more file types from desktop app to index on server - Use `file-type' package to identify valid text file types on Desktop app - Split plaintext entries into smaller logical units than a whole file Since the text splitting upgrades in #645, compiled chunks have more logical splits like paragraph, sentence. Show those (potentially) smaller snippets to the user as references - Tangential Fix: Initialize unbound currentTime variable for error log timestamp --- src/interface/desktop/main.js | 33 +++++--- src/interface/desktop/package.json | 3 +- src/interface/desktop/yarn.lock | 75 ++++++++++++++++++- .../content/plaintext/plaintext_to_entries.py | 2 +- src/khoj/processor/content/text_to_entries.py | 4 +- 5 files changed, 101 insertions(+), 16 deletions(-) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index d561a2d5..55cb78a1 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -1,4 +1,5 @@ const { app, BrowserWindow, ipcMain, Tray, Menu, nativeImage, shell } = require('electron'); +const FileType = require('file-type'); const todesktop = require("@todesktop/runtime"); const khojPackage = require('./package.json'); @@ -111,22 +112,31 @@ function filenameToMimeType (filename) { } } -function processDirectory(filesToPush, folder) { +async function isPlainTextFile(filePath) { + const fileType = await FileType.fromFile(filePath); + if (!fileType) { + return false; + } + return fileType.mime.startsWith('text/'); +} + +async function processDirectory(filesToPush, folder) { const files = fs.readdirSync(folder.path, { withFileTypes: true, recursive: true }); for (const file of files) { - if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) { + const filePath = path.join(folder.path, file.name); + if (file.isFile() && await isPlainTextFile(filePath)) { console.log(`Add ${file.name} in ${folder.path} for indexing`); - filesToPush.push(path.join(folder.path, file.name)); + filesToPush.push(filePath); } if (file.isDirectory()) { - processDirectory(filesToPush, {'path': path.join(folder.path, file.name)}); + await processDirectory(filesToPush, {'path': path.join(folder.path, file.name)}); } } } -function pushDataToKhoj (regenerate = false) { +async function pushDataToKhoj (regenerate = false) { // Don't sync if token or hostURL is not set or if already syncing if (store.get('khojToken') === '' || store.get('hostURL') === '' || syncing === true) { const win = BrowserWindow.getAllWindows()[0]; @@ -148,7 +158,7 @@ function pushDataToKhoj (regenerate = false) { // Collect paths of all indexable files in configured folders for (const folder of folders) { - processDirectory(filesToPush, folder); + await processDirectory(filesToPush, folder); } const lastSync = store.get('lastSync') || []; @@ -222,6 +232,7 @@ function pushDataToKhoj (regenerate = false) { } else if (error?.code === 'ECONNREFUSED') { state["error"] = `Could not connect to Khoj server. Ensure you can connect to it at ${error.address}:${error.port}.`; } else { + currentTime = new Date(); state["error"] = `Sync was unsuccessful at ${currentTime.toLocaleTimeString()}. Contact team@khoj.dev to report this issue.`; } }) @@ -240,7 +251,7 @@ pushDataToKhoj(); async function handleFileOpen (type) { let { canceled, filePaths } = {canceled: true, filePaths: []}; if (type === 'file') { - ({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files", extensions: validFileTypes}] })); + ({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files" }] })); } else if (type === 'folder') { ({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openDirectory' ]})); } @@ -331,7 +342,7 @@ async function removeFolder (event, folderPath) { async function syncData (regenerate = false) { try { - pushDataToKhoj(regenerate); + await pushDataToKhoj(regenerate); const date = new Date(); console.log('Pushing data to Khoj at: ', date); } catch (err) { @@ -343,7 +354,7 @@ async function deleteAllFiles () { try { store.set('files', []); store.set('folders', []); - pushDataToKhoj(true); + await pushDataToKhoj(true); const date = new Date(); console.log('Pushing data to Khoj at: ', date); } catch (err) { @@ -366,9 +377,9 @@ const createWindow = (tab = 'chat.html') => { } }) - const job = new cron('0 */10 * * * *', function() { + const job = new cron('0 */10 * * * *', async function() { try { - pushDataToKhoj(); + await pushDataToKhoj(); const date = new Date(); console.log('Pushing data to Khoj at: ', date); win.webContents.send('update-state', state); diff --git a/src/interface/desktop/package.json b/src/interface/desktop/package.json index 79caa6c2..7f57618a 100644 --- a/src/interface/desktop/package.json +++ b/src/interface/desktop/package.json @@ -20,6 +20,7 @@ "axios": "^1.6.4", "cron": "^2.4.3", "electron-store": "^8.1.0", - "fs": "^0.0.1-security" + "fs": "^0.0.1-security", + "file-type": "^16.2.0" } } diff --git a/src/interface/desktop/yarn.lock b/src/interface/desktop/yarn.lock index 539bfa97..1b599356 100644 --- a/src/interface/desktop/yarn.lock +++ b/src/interface/desktop/yarn.lock @@ -62,6 +62,11 @@ lodash.once "^4.1.1" semver "^7.3.2" +"@tokenizer/token@^0.3.0": + version "0.3.0" + resolved "https://registry.yarnpkg.com/@tokenizer/token/-/token-0.3.0.tgz#fe98a93fe789247e998c75e74e9c7c63217aa276" + integrity sha512-OvjF+z51L3ov0OyAU0duzsYuvO01PH7x4t6DJx+guahgTnBHkhJdG7soQeTSFLWN3efnHyibZ4Z8l2EuWwJN3A== + "@types/cacheable-request@^6.0.1": version "6.0.3" resolved "https://registry.yarnpkg.com/@types/cacheable-request/-/cacheable-request-6.0.3.tgz#a430b3260466ca7b5ca5bfd735693b36e7a9d183" @@ -471,6 +476,15 @@ fd-slicer@~1.1.0: dependencies: pend "~1.2.0" +file-type@^16.2.0: + version "16.5.4" + resolved "https://registry.yarnpkg.com/file-type/-/file-type-16.5.4.tgz#474fb4f704bee427681f98dd390058a172a6c2fd" + integrity sha512-/yFHK0aGjFEgDJjEKP0pWCplsPFPhwyfwevf/pVxiN0tmE4L9LmwWxWukdJSHdoCli4VgQLehjJtwQBnqmsKcw== + dependencies: + readable-web-to-node-stream "^3.0.0" + strtok3 "^6.2.4" + token-types "^4.1.1" + fill-range@^7.0.1: version "7.0.1" resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.0.1.tgz#1919a6a7c75fe38b2c7c77e5198535da9acdda40" @@ -668,6 +682,11 @@ human-signals@^2.1.0: resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0" integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw== +ieee754@^1.2.1: + version "1.2.1" + resolved "https://registry.yarnpkg.com/ieee754/-/ieee754-1.2.1.tgz#8eb7a10a63fff25d15a57b001586d177d1b0d352" + integrity sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA== + ignore@^5.2.0: version "5.2.4" resolved "https://registry.yarnpkg.com/ignore/-/ignore-5.2.4.tgz#a291c0c6178ff1b960befe47fcdec301674a6324" @@ -686,7 +705,7 @@ inflight@^1.0.4: once "^1.3.0" wrappy "1" -inherits@2: +inherits@2, inherits@^2.0.3: version "2.0.4" resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c" integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ== @@ -979,6 +998,11 @@ path-type@^4.0.0: resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b" integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw== +peek-readable@^4.1.0: + version "4.1.0" + resolved "https://registry.yarnpkg.com/peek-readable/-/peek-readable-4.1.0.tgz#4ece1111bf5c2ad8867c314c81356847e8a62e72" + integrity sha512-ZI3LnwUv5nOGbQzD9c2iDG6toheuXSZP5esSHBjopsXH4dg19soufvpUGA3uohi5anFtGb2lhAVdHzH6R/Evvg== + pend@~1.2.0: version "1.2.0" resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50" @@ -1029,6 +1053,22 @@ quick-lru@^5.1.1: resolved "https://registry.yarnpkg.com/quick-lru/-/quick-lru-5.1.1.tgz#366493e6b3e42a3a6885e2e99d18f80fb7a8c932" integrity sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA== +readable-stream@^3.6.0: + version "3.6.2" + resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.2.tgz#56a9b36ea965c00c5a93ef31eb111a0f11056967" + integrity sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA== + dependencies: + inherits "^2.0.3" + string_decoder "^1.1.1" + util-deprecate "^1.0.1" + +readable-web-to-node-stream@^3.0.0: + version "3.0.2" + resolved "https://registry.yarnpkg.com/readable-web-to-node-stream/-/readable-web-to-node-stream-3.0.2.tgz#5d52bb5df7b54861fd48d015e93a2cb87b3ee0bb" + integrity sha512-ePeK6cc1EcKLEhJFt/AebMCLL+GgSKhuygrZ/GLaKZYEecIgIECf4UaUuaByiGtzckwR4ain9VzUh95T1exYGw== + dependencies: + readable-stream "^3.6.0" + require-from-string@^2.0.2: version "2.0.2" resolved "https://registry.yarnpkg.com/require-from-string/-/require-from-string-2.0.2.tgz#89a7fdd938261267318eafe14f9c32e598c36909" @@ -1077,6 +1117,11 @@ run-parallel@^1.1.9: dependencies: queue-microtask "^1.2.2" +safe-buffer@~5.2.0: + version "5.2.1" + resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6" + integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ== + sax@^1.2.4: version "1.2.4" resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9" @@ -1133,11 +1178,26 @@ sprintf-js@^1.1.2: resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.1.2.tgz#da1765262bf8c0f571749f2ad6c26300207ae673" integrity sha512-VE0SOVEHCk7Qc8ulkWw3ntAzXuqf7S2lvwQaDLRnUeIEaKNQJzV6BwmLKhOqT61aGhfUMrXeaBk+oDGCzvhcug== +string_decoder@^1.1.1: + version "1.3.0" + resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e" + integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA== + dependencies: + safe-buffer "~5.2.0" + strip-final-newline@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/strip-final-newline/-/strip-final-newline-2.0.0.tgz#89b852fb2fcbe936f6f4b3187afb0a12c1ab58ad" integrity sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA== +strtok3@^6.2.4: + version "6.3.0" + resolved "https://registry.yarnpkg.com/strtok3/-/strtok3-6.3.0.tgz#358b80ffe6d5d5620e19a073aa78ce947a90f9a0" + integrity sha512-fZtbhtvI9I48xDSywd/somNqgUHl2L2cstmXCCif0itOf96jeW18MBSyrLuNicYQVkvpOxkZtkzujiTJ9LW5Jw== + dependencies: + "@tokenizer/token" "^0.3.0" + peek-readable "^4.1.0" + sumchecker@^3.0.1: version "3.0.1" resolved "https://registry.yarnpkg.com/sumchecker/-/sumchecker-3.0.1.tgz#6377e996795abb0b6d348e9b3e1dfb24345a8e42" @@ -1152,6 +1212,14 @@ to-regex-range@^5.0.1: dependencies: is-number "^7.0.0" +token-types@^4.1.1: + version "4.2.1" + resolved "https://registry.yarnpkg.com/token-types/-/token-types-4.2.1.tgz#0f897f03665846982806e138977dbe72d44df753" + integrity sha512-6udB24Q737UD/SDsKAHI9FCRP7Bqc9D/MQUV02ORQg5iskjtLJlZJNdN4kKtcdtwCeWIwIHDGaUsTsCCAa8sFQ== + dependencies: + "@tokenizer/token" "^0.3.0" + ieee754 "^1.2.1" + type-fest@^0.13.1: version "0.13.1" resolved "https://registry.yarnpkg.com/type-fest/-/type-fest-0.13.1.tgz#0172cb5bce80b0bd542ea348db50c7e21834d934" @@ -1179,6 +1247,11 @@ uri-js@^4.2.2: dependencies: punycode "^2.1.0" +util-deprecate@^1.0.1: + version "1.0.2" + resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf" + integrity sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw== + which@^2.0.1: version "2.0.2" resolved "https://registry.yarnpkg.com/which/-/which-2.0.2.tgz#7c6a8dd0a636a0327e10b59c9286eee93f3f51b1" diff --git a/src/khoj/processor/content/plaintext/plaintext_to_entries.py b/src/khoj/processor/content/plaintext/plaintext_to_entries.py index 4fb0dd2e..45ac5047 100644 --- a/src/khoj/processor/content/plaintext/plaintext_to_entries.py +++ b/src/khoj/processor/content/plaintext/plaintext_to_entries.py @@ -47,7 +47,7 @@ class PlaintextToEntries(TextToEntries): # Split entries by max tokens supported by model with timer("Split entries by max token size supported by model", logger): - current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) + current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256, raw_is_compiled=True) # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index edd814f6..361d0220 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -59,7 +59,7 @@ class TextToEntries(ABC): @staticmethod def split_entries_by_max_tokens( - entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500 + entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500, raw_is_compiled: bool = False ) -> List[Entry]: "Split entries if compiled entry length exceeds the max tokens supported by the ML model." chunked_entries: List[Entry] = [] @@ -94,7 +94,7 @@ class TextToEntries(ABC): # Clean entry of unwanted characters like \0 character compiled_entry_chunk = TextToEntries.clean_field(compiled_entry_chunk) - entry.raw = TextToEntries.clean_field(entry.raw) + entry.raw = compiled_entry_chunk if raw_is_compiled else TextToEntries.clean_field(entry.raw) entry.heading = TextToEntries.clean_field(entry.heading) entry.file = TextToEntries.clean_field(entry.file)