Send more text file types from Desktop app and improve indexing them

- Allow syncing more file types from desktop app to index on server
  - Use `file-type' package to identify valid text file types on Desktop app

- Split plaintext entries into smaller logical units than a whole file
  Since the text splitting upgrades in #645, compiled chunks have more
  logical splits like paragraph, sentence.
  Show those (potentially) smaller snippets to the user as references

- Tangential Fix:
  Initialize unbound currentTime variable for error log timestamp
This commit is contained in:
Debanjum Singh Solanky 2024-04-03 01:49:15 +05:30
parent 89915dcb4c
commit 7ff1bd9f8b
5 changed files with 101 additions and 16 deletions

View file

@ -1,4 +1,5 @@
const { app, BrowserWindow, ipcMain, Tray, Menu, nativeImage, shell } = require('electron');
const FileType = require('file-type');
const todesktop = require("@todesktop/runtime");
const khojPackage = require('./package.json');
@ -111,22 +112,31 @@ function filenameToMimeType (filename) {
}
}
function processDirectory(filesToPush, folder) {
async function isPlainTextFile(filePath) {
const fileType = await FileType.fromFile(filePath);
if (!fileType) {
return false;
}
return fileType.mime.startsWith('text/');
}
async function processDirectory(filesToPush, folder) {
const files = fs.readdirSync(folder.path, { withFileTypes: true, recursive: true });
for (const file of files) {
if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) {
const filePath = path.join(folder.path, file.name);
if (file.isFile() && await isPlainTextFile(filePath)) {
console.log(`Add ${file.name} in ${folder.path} for indexing`);
filesToPush.push(path.join(folder.path, file.name));
filesToPush.push(filePath);
}
if (file.isDirectory()) {
processDirectory(filesToPush, {'path': path.join(folder.path, file.name)});
await processDirectory(filesToPush, {'path': path.join(folder.path, file.name)});
}
}
}
function pushDataToKhoj (regenerate = false) {
async function pushDataToKhoj (regenerate = false) {
// Don't sync if token or hostURL is not set or if already syncing
if (store.get('khojToken') === '' || store.get('hostURL') === '' || syncing === true) {
const win = BrowserWindow.getAllWindows()[0];
@ -148,7 +158,7 @@ function pushDataToKhoj (regenerate = false) {
// Collect paths of all indexable files in configured folders
for (const folder of folders) {
processDirectory(filesToPush, folder);
await processDirectory(filesToPush, folder);
}
const lastSync = store.get('lastSync') || [];
@ -222,6 +232,7 @@ function pushDataToKhoj (regenerate = false) {
} else if (error?.code === 'ECONNREFUSED') {
state["error"] = `Could not connect to Khoj server. Ensure you can connect to it at ${error.address}:${error.port}.`;
} else {
currentTime = new Date();
state["error"] = `Sync was unsuccessful at ${currentTime.toLocaleTimeString()}. Contact team@khoj.dev to report this issue.`;
}
})
@ -240,7 +251,7 @@ pushDataToKhoj();
async function handleFileOpen (type) {
let { canceled, filePaths } = {canceled: true, filePaths: []};
if (type === 'file') {
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files", extensions: validFileTypes}] }));
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files" }] }));
} else if (type === 'folder') {
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openDirectory' ]}));
}
@ -331,7 +342,7 @@ async function removeFolder (event, folderPath) {
async function syncData (regenerate = false) {
try {
pushDataToKhoj(regenerate);
await pushDataToKhoj(regenerate);
const date = new Date();
console.log('Pushing data to Khoj at: ', date);
} catch (err) {
@ -343,7 +354,7 @@ async function deleteAllFiles () {
try {
store.set('files', []);
store.set('folders', []);
pushDataToKhoj(true);
await pushDataToKhoj(true);
const date = new Date();
console.log('Pushing data to Khoj at: ', date);
} catch (err) {
@ -366,9 +377,9 @@ const createWindow = (tab = 'chat.html') => {
}
})
const job = new cron('0 */10 * * * *', function() {
const job = new cron('0 */10 * * * *', async function() {
try {
pushDataToKhoj();
await pushDataToKhoj();
const date = new Date();
console.log('Pushing data to Khoj at: ', date);
win.webContents.send('update-state', state);

View file

@ -20,6 +20,7 @@
"axios": "^1.6.4",
"cron": "^2.4.3",
"electron-store": "^8.1.0",
"fs": "^0.0.1-security"
"fs": "^0.0.1-security",
"file-type": "^16.2.0"
}
}

View file

@ -62,6 +62,11 @@
lodash.once "^4.1.1"
semver "^7.3.2"
"@tokenizer/token@^0.3.0":
version "0.3.0"
resolved "https://registry.yarnpkg.com/@tokenizer/token/-/token-0.3.0.tgz#fe98a93fe789247e998c75e74e9c7c63217aa276"
integrity sha512-OvjF+z51L3ov0OyAU0duzsYuvO01PH7x4t6DJx+guahgTnBHkhJdG7soQeTSFLWN3efnHyibZ4Z8l2EuWwJN3A==
"@types/cacheable-request@^6.0.1":
version "6.0.3"
resolved "https://registry.yarnpkg.com/@types/cacheable-request/-/cacheable-request-6.0.3.tgz#a430b3260466ca7b5ca5bfd735693b36e7a9d183"
@ -471,6 +476,15 @@ fd-slicer@~1.1.0:
dependencies:
pend "~1.2.0"
file-type@^16.2.0:
version "16.5.4"
resolved "https://registry.yarnpkg.com/file-type/-/file-type-16.5.4.tgz#474fb4f704bee427681f98dd390058a172a6c2fd"
integrity sha512-/yFHK0aGjFEgDJjEKP0pWCplsPFPhwyfwevf/pVxiN0tmE4L9LmwWxWukdJSHdoCli4VgQLehjJtwQBnqmsKcw==
dependencies:
readable-web-to-node-stream "^3.0.0"
strtok3 "^6.2.4"
token-types "^4.1.1"
fill-range@^7.0.1:
version "7.0.1"
resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.0.1.tgz#1919a6a7c75fe38b2c7c77e5198535da9acdda40"
@ -668,6 +682,11 @@ human-signals@^2.1.0:
resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0"
integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==
ieee754@^1.2.1:
version "1.2.1"
resolved "https://registry.yarnpkg.com/ieee754/-/ieee754-1.2.1.tgz#8eb7a10a63fff25d15a57b001586d177d1b0d352"
integrity sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==
ignore@^5.2.0:
version "5.2.4"
resolved "https://registry.yarnpkg.com/ignore/-/ignore-5.2.4.tgz#a291c0c6178ff1b960befe47fcdec301674a6324"
@ -686,7 +705,7 @@ inflight@^1.0.4:
once "^1.3.0"
wrappy "1"
inherits@2:
inherits@2, inherits@^2.0.3:
version "2.0.4"
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
@ -979,6 +998,11 @@ path-type@^4.0.0:
resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b"
integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==
peek-readable@^4.1.0:
version "4.1.0"
resolved "https://registry.yarnpkg.com/peek-readable/-/peek-readable-4.1.0.tgz#4ece1111bf5c2ad8867c314c81356847e8a62e72"
integrity sha512-ZI3LnwUv5nOGbQzD9c2iDG6toheuXSZP5esSHBjopsXH4dg19soufvpUGA3uohi5anFtGb2lhAVdHzH6R/Evvg==
pend@~1.2.0:
version "1.2.0"
resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50"
@ -1029,6 +1053,22 @@ quick-lru@^5.1.1:
resolved "https://registry.yarnpkg.com/quick-lru/-/quick-lru-5.1.1.tgz#366493e6b3e42a3a6885e2e99d18f80fb7a8c932"
integrity sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==
readable-stream@^3.6.0:
version "3.6.2"
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.2.tgz#56a9b36ea965c00c5a93ef31eb111a0f11056967"
integrity sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==
dependencies:
inherits "^2.0.3"
string_decoder "^1.1.1"
util-deprecate "^1.0.1"
readable-web-to-node-stream@^3.0.0:
version "3.0.2"
resolved "https://registry.yarnpkg.com/readable-web-to-node-stream/-/readable-web-to-node-stream-3.0.2.tgz#5d52bb5df7b54861fd48d015e93a2cb87b3ee0bb"
integrity sha512-ePeK6cc1EcKLEhJFt/AebMCLL+GgSKhuygrZ/GLaKZYEecIgIECf4UaUuaByiGtzckwR4ain9VzUh95T1exYGw==
dependencies:
readable-stream "^3.6.0"
require-from-string@^2.0.2:
version "2.0.2"
resolved "https://registry.yarnpkg.com/require-from-string/-/require-from-string-2.0.2.tgz#89a7fdd938261267318eafe14f9c32e598c36909"
@ -1077,6 +1117,11 @@ run-parallel@^1.1.9:
dependencies:
queue-microtask "^1.2.2"
safe-buffer@~5.2.0:
version "5.2.1"
resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6"
integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==
sax@^1.2.4:
version "1.2.4"
resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9"
@ -1133,11 +1178,26 @@ sprintf-js@^1.1.2:
resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.1.2.tgz#da1765262bf8c0f571749f2ad6c26300207ae673"
integrity sha512-VE0SOVEHCk7Qc8ulkWw3ntAzXuqf7S2lvwQaDLRnUeIEaKNQJzV6BwmLKhOqT61aGhfUMrXeaBk+oDGCzvhcug==
string_decoder@^1.1.1:
version "1.3.0"
resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e"
integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==
dependencies:
safe-buffer "~5.2.0"
strip-final-newline@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/strip-final-newline/-/strip-final-newline-2.0.0.tgz#89b852fb2fcbe936f6f4b3187afb0a12c1ab58ad"
integrity sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==
strtok3@^6.2.4:
version "6.3.0"
resolved "https://registry.yarnpkg.com/strtok3/-/strtok3-6.3.0.tgz#358b80ffe6d5d5620e19a073aa78ce947a90f9a0"
integrity sha512-fZtbhtvI9I48xDSywd/somNqgUHl2L2cstmXCCif0itOf96jeW18MBSyrLuNicYQVkvpOxkZtkzujiTJ9LW5Jw==
dependencies:
"@tokenizer/token" "^0.3.0"
peek-readable "^4.1.0"
sumchecker@^3.0.1:
version "3.0.1"
resolved "https://registry.yarnpkg.com/sumchecker/-/sumchecker-3.0.1.tgz#6377e996795abb0b6d348e9b3e1dfb24345a8e42"
@ -1152,6 +1212,14 @@ to-regex-range@^5.0.1:
dependencies:
is-number "^7.0.0"
token-types@^4.1.1:
version "4.2.1"
resolved "https://registry.yarnpkg.com/token-types/-/token-types-4.2.1.tgz#0f897f03665846982806e138977dbe72d44df753"
integrity sha512-6udB24Q737UD/SDsKAHI9FCRP7Bqc9D/MQUV02ORQg5iskjtLJlZJNdN4kKtcdtwCeWIwIHDGaUsTsCCAa8sFQ==
dependencies:
"@tokenizer/token" "^0.3.0"
ieee754 "^1.2.1"
type-fest@^0.13.1:
version "0.13.1"
resolved "https://registry.yarnpkg.com/type-fest/-/type-fest-0.13.1.tgz#0172cb5bce80b0bd542ea348db50c7e21834d934"
@ -1179,6 +1247,11 @@ uri-js@^4.2.2:
dependencies:
punycode "^2.1.0"
util-deprecate@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"
integrity sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==
which@^2.0.1:
version "2.0.2"
resolved "https://registry.yarnpkg.com/which/-/which-2.0.2.tgz#7c6a8dd0a636a0327e10b59c9286eee93f3f51b1"

View file

@ -47,7 +47,7 @@ class PlaintextToEntries(TextToEntries):
# Split entries by max tokens supported by model
with timer("Split entries by max token size supported by model", logger):
current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256, raw_is_compiled=True)
# Identify, mark and merge any new entries with previous entries
with timer("Identify new or updated entries", logger):

View file

@ -59,7 +59,7 @@ class TextToEntries(ABC):
@staticmethod
def split_entries_by_max_tokens(
entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500
entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500, raw_is_compiled: bool = False
) -> List[Entry]:
"Split entries if compiled entry length exceeds the max tokens supported by the ML model."
chunked_entries: List[Entry] = []
@ -94,7 +94,7 @@ class TextToEntries(ABC):
# Clean entry of unwanted characters like \0 character
compiled_entry_chunk = TextToEntries.clean_field(compiled_entry_chunk)
entry.raw = TextToEntries.clean_field(entry.raw)
entry.raw = compiled_entry_chunk if raw_is_compiled else TextToEntries.clean_field(entry.raw)
entry.heading = TextToEntries.clean_field(entry.heading)
entry.file = TextToEntries.clean_field(entry.file)