Batch send of index files

This commit is contained in:
Desmond 2024-05-08 13:38:40 +08:00
parent 4395ed8065
commit 192cd53003

View file

@ -416,10 +416,47 @@ Auto invokes setup steps on calling main entrypoint."
(files-to-index (or file-paths
(append (mapcan (lambda (dir) (directory-files-recursively dir "\\.\\(org\\|md\\|markdown\\|pdf\\|txt\\|rst\\|xml\\|htm\\|html\\)$")) content-directories) content-files)))
(type-query (if (or (equal content-type "all") (not content-type)) "" (format "t=%s" content-type)))
(delete-files (khoj--get-delete-file-list khoj--indexed-files files-to-index))
(inhibit-message t)
(message-log-max nil))
(message-log-max nil)
(batch-size 50))
(dolist (files (khoj--split-file-list files-to-index batch-size))
(khoj--send-index-update-request (khoj--render-update-files-as-request-body files boundary) boundary content-type type-query force))
(when delete-files
(khoj--send-index-update-request (khoj--render-delete-files-as-request-body delete-files boundary) boundary content-type type-query force))
(setq khoj--indexed-files files-to-index)))
(defun khoj--get-delete-file-list (indexed-files upload-files)
"Get delete file list. when `INDEXED-FILES' no longer in `UPLOAD-FILES'.
delete them. return delete-file-list."
(let (delete-files '())
(dolist (indexed-file indexed-files)
(when (not (member indexed-file upload-files))
(push indexed-file delete-files)))
delete-files))
(defun khoj--split-file-list (file-list size)
"Split `FILE-LIST' into subgroups of `SIZE' files each."
(let ((subgroups '())
(current-group '()))
(dolist (file file-list)
(if (= (length current-group) size)
;; If the current group has size files, start a new group
(progn
(push current-group subgroups)
(setq current-group '()))
(push file current-group)))
;; Add the last group if it's not empty
(when current-group
(push (nreverse current-group) subgroups))
(nreverse subgroups))) ; Reverse to maintain the original order of file-list
(defun khoj--send-index-update-request (body boundary &optional content-type type-query force)
"Send `BODY' request to khoj server. 'TYPE-QUERY' is appended to the URL.
Use `BOUNDARY' to add headder conte
nt-type."
(let ((url-request-method "POST")
(url-request-data (khoj--render-files-as-request-body files-to-index khoj--indexed-files boundary))
(url-request-data body)
(url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary))
("Authorization" . ,(format "Bearer %s" khoj-api-key)))))
(with-current-buffer
@ -437,10 +474,9 @@ Auto invokes setup steps on calling main entrypoint."
(if content-type (format "%s " content-type) "all")
(string-trim (format "%s %s" (nth 1 (nth 1 status)) (nth 2 (nth 1 status))))
(if (> (- (point-max) (point)) 0) (format ". Response: %s" (string-trim (buffer-substring-no-properties (point) (point-max)))) ""))))))
nil t t)))
(setq khoj--indexed-files files-to-index)))
nil t t))))
(defun khoj--render-files-as-request-body (files-to-index previously-indexed-files boundary)
(defun khoj--render-update-files-as-request-body (files-to-index boundary)
"Render `FILES-TO-INDEX', `PREVIOUSLY-INDEXED-FILES' as multi-part form body.
Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request."
(with-temp-buffer
@ -448,32 +484,43 @@ Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request.
(insert "\n")
(dolist (file-to-index files-to-index)
;; find file content-type. Choose from org, markdown, pdf, plaintext
(let ((content-type (cond ((string-match "\\.org$" file-to-index) "text/org")
((string-match "\\.\\(md\\|markdown\\)$" file-to-index) "text/markdown")
((string-match "\\.pdf$" file-to-index) "application/pdf")
(t "text/plain"))))
(let ((content-type (khoj--filename-to-mime-type file-to-index))
(file-name (encode-coding-string file-to-index 'utf-8)))
(insert (format "--%s\r\n" boundary))
(insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index))
(insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-name))
(insert (format "Content-Type: %s\r\n\r\n" content-type))
(insert (with-temp-buffer
(insert-file-contents-literally file-to-index)
(buffer-string)))
(insert "\r\n")))
(dolist (file-to-index previously-indexed-files)
(when (not (member file-to-index files-to-index))
;; find file content-type. Choose from org, markdown, pdf, plaintext
(let ((content-type (cond ((string-match "\\.org$" file-to-index) "text/org")
((string-match "\\.\\(md\\|markdown\\)$" file-to-index) "text/markdown")
((string-match "\\.pdf$" file-to-index) "application/pdf")
(t "text/plain"))))
(insert (format "--%s\r\n" boundary))
(insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index))
(insert "Content-Type: text/org\r\n\r\n")
(insert "")
(insert "\r\n"))))
(insert (format "--%s--\r\n" boundary))
(buffer-string)))
(defun khoj--render-delete-files-as-request-body (delete-files boundary)
"Render `DELETE-FILES' as multi-part form body.
Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request."
(with-temp-buffer
(set-buffer-multibyte nil)
(insert "\n")
(debug delete-files)
(dolist (file-to-index delete-files)
(let ((content-type (khoj--filename-to-mime-type file-to-index))
(file-name (encode-coding-string file-to-index 'utf-8)))
(insert (format "--%s\r\n" boundary))
(insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-name))
(insert "Content-Type: %s\r\n\r\n" content-type)
(insert "")
(insert "\r\n")))
(insert (format "--%s--\r\n" boundary))
(buffer-string)))
(defun khoj--filename-to-mime-type (file-name)
"`FILE-NAME' to mimeType."
(cond ((string-match "\\.org$" file-name) "text/org")
((string-match "\\.\\(md\\|markdown\\)$" file-name) "text/markdown")
((string-match "\\.pdf$" file-name) "application/pdf")
(t "text/plain")))
;; Cancel any running indexing timer, first
(when khoj--index-timer
(cancel-timer khoj--index-timer))