From 48cb8f2897b27913100347e19e241677375aeebf Mon Sep 17 00:00:00 2001 From: Timothy Carambat <rambat1010@gmail.com> Date: Wed, 7 Feb 2024 15:17:32 -0800 Subject: [PATCH] Add support to upload rawText document via api (#692) * Add support to upload rawText document via api * update API doc endpoint with correct textContent key * update response swagger doc --- collector/index.js | 24 +++ collector/processRawText/index.js | 69 +++++++++ server/endpoints/api/document/index.js | 187 +++++++++++++++++++++++- server/endpoints/api/workspace/index.js | 16 +- server/swagger/openapi.json | 147 ++++++++++++++++++- server/utils/files/documentProcessor.js | 20 +++ 6 files changed, 453 insertions(+), 10 deletions(-) create mode 100644 collector/processRawText/index.js diff --git a/collector/index.js b/collector/index.js index 062d78959..9ebe5f1ce 100644 --- a/collector/index.js +++ b/collector/index.js @@ -12,6 +12,7 @@ const { processSingleFile } = require("./processSingleFile"); const { processLink } = require("./processLink"); const { wipeCollectorStorage } = require("./utils/files"); const extensions = require("./extensions"); +const { processRawText } = require("./processRawText"); const app = express(); app.use(cors({ origin: true })); @@ -66,6 +67,29 @@ app.post("/process-link", async function (request, response) { return; }); +app.post("/process-raw-text", async function (request, response) { + const { textContent, metadata } = reqBody(request); + try { + const { + success, + reason, + documents = [], + } = await processRawText(textContent, metadata); + response + .status(200) + .json({ filename: metadata.title, success, reason, documents }); + } catch (e) { + console.error(e); + response.status(200).json({ + filename: metadata?.title || "Unknown-doc.txt", + success: false, + reason: "A processing error occurred.", + documents: [], + }); + } + return; +}); + extensions(app); app.get("/accepts", function (_, response) { diff --git a/collector/processRawText/index.js b/collector/processRawText/index.js new file mode 100644 index 000000000..d435c9e7e --- /dev/null +++ b/collector/processRawText/index.js @@ -0,0 +1,69 @@ +const { v4 } = require("uuid"); +const { writeToServerDocuments } = require("../utils/files"); +const { tokenizeString } = require("../utils/tokenizer"); +const { default: slugify } = require("slugify"); + +// Will remove the last .extension from the input +// and stringify the input + move to lowercase. +function stripAndSlug(input) { + if (!input.includes('.')) return slugify(input, { lower: true }); + return slugify(input.split('.').slice(0, -1).join('-'), { lower: true }) +} + +const METADATA_KEYS = { + possible: { + url: ({ url, title }) => { + let validUrl; + try { + const u = new URL(url); + validUrl = ["https:", "http:"].includes(u.protocol); + } catch { } + + if (validUrl) return `web://${url.toLowerCase()}.website`; + return `file://${stripAndSlug(title)}.txt`; + }, + title: ({ title }) => `${stripAndSlug(title)}.txt`, + docAuthor: ({ docAuthor }) => { return typeof docAuthor === 'string' ? docAuthor : 'no author specified' }, + description: ({ description }) => { return typeof description === 'string' ? description : 'no description found' }, + docSource: ({ docSource }) => { return typeof docSource === 'string' ? docSource : 'no source set' }, + chunkSource: ({ chunkSource, title }) => { return typeof chunkSource === 'string' ? 
chunkSource : `${stripAndSlug(title)}.txt` }, + published: ({ published }) => { + if (isNaN(Number(published))) return new Date().toLocaleString(); + return new Date(Number(published)).toLocaleString() + }, + } +} + +async function processRawText(textContent, metadata) { + console.log(`-- Working Raw Text doc ${metadata.title} --`); + if (!textContent || textContent.length === 0) { + return { + success: false, + reason: "textContent was empty - nothing to process.", + documents: [], + }; + } + + const data = { + id: v4(), + url: METADATA_KEYS.possible.url(metadata), + title: METADATA_KEYS.possible.title(metadata), + docAuthor: METADATA_KEYS.possible.docAuthor(metadata), + description: METADATA_KEYS.possible.description(metadata), + docSource: METADATA_KEYS.possible.docSource(metadata), + chunkSource: METADATA_KEYS.possible.chunkSource(metadata), + published: METADATA_KEYS.possible.published(metadata), + wordCount: textContent.split(" ").length, + pageContent: textContent, + token_count_estimate: tokenizeString(textContent).length, + }; + + const document = writeToServerDocuments( + data, + `raw-${stripAndSlug(metadata.title)}-${data.id}` + ); + console.log(`[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`); + return { success: true, reason: null, documents: [document] }; +} + +module.exports = { processRawText } \ No newline at end of file diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index b72debbdb..c210fff4a 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -6,6 +6,7 @@ const { acceptedFileTypes, processDocument, processLink, + processRawText, } = require("../../../utils/files/documentProcessor"); const { viewLocalFiles, @@ -90,6 +91,7 @@ function apiDocumentEndpoints(app) { error: `Document processing API is not online. Document ${originalname} will not be processed automatically.`, }) .end(); + return; } const { success, reason, documents } = @@ -127,7 +129,7 @@ function apiDocumentEndpoints(app) { #swagger.requestBody = { description: 'Link of web address to be scraped.', required: true, - type: 'file', + type: 'object', content: { "application/json": { schema: { @@ -186,6 +188,7 @@ function apiDocumentEndpoints(app) { error: `Document processing API is not online. Link ${link} will not be processed automatically.`, }) .end(); + return; } const { success, reason, documents } = await processLink(link); @@ -212,6 +215,138 @@ function apiDocumentEndpoints(app) { } ); + app.post( + "/v1/document/raw-text", + [validApiKey], + async (request, response) => { + /* + #swagger.tags = ['Documents'] + #swagger.description = 'Upload a file by specifying its raw text content and metadata values without having to upload a file.' + #swagger.requestBody = { + description: 'Text content and metadata of the file to be saved to the system. 
Use metadata-schema endpoint to get the possible metadata keys', + required: true, + type: 'object', + content: { + "application/json": { + schema: { + type: 'object', + example: { + "textContent": "This is the raw text that will be saved as a document in AnythingLLM.", + "metadata": { + keyOne: "valueOne", + keyTwo: "valueTwo", + etc: "etc" + } + } + } + } + } + } + #swagger.responses[200] = { + content: { + "application/json": { + schema: { + type: 'object', + example: { + success: true, + error: null, + documents: [ + { + "id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc", + "url": "file://my-document.txt", + "title": "hello-world.txt", + "docAuthor": "no author found", + "description": "No description found.", + "docSource": "My custom description set during upload", + "chunkSource": "no chunk source specified", + "published": "1/16/2024, 3:46:33 PM", + "wordCount": 252, + "pageContent": "AnythingLLM is the best....", + "token_count_estimate": 447, + "location": "custom-documents/raw-my-doc-text-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json" + } + ] + } + } + } + } + } + #swagger.responses[403] = { + schema: { + "$ref": "#/definitions/InvalidAPIKey" + } + } + */ + try { + const requiredMetadata = ["title"]; + const { textContent, metadata = {} } = reqBody(request); + const processingOnline = await checkProcessorAlive(); + + if (!processingOnline) { + response + .status(500) + .json({ + success: false, + error: `Document processing API is not online. Request will not be processed.`, + }) + .end(); + return; + } + + if ( + !requiredMetadata.every( + (reqKey) => + Object.keys(metadata).includes(reqKey) && !!metadata[reqKey] + ) + ) { + response + .status(422) + .json({ + success: false, + error: `You are missing required metadata key:value pairs in your request. Required metadata key:values are ${requiredMetadata + .map((v) => `'${v}'`) + .join(", ")}`, + }) + .end(); + return; + } + + if (!textContent || textContent?.length === 0) { + response + .status(422) + .json({ + success: false, + error: `The 'textContent' key cannot have an empty value.`, + }) + .end(); + return; + } + + const { success, reason, documents } = await processRawText( + textContent, + metadata + ); + if (!success) { + response + .status(500) + .json({ success: false, error: reason, documents }) + .end(); + return; + } + + console.log( + `Document created successfully. It is now available in documents.` + ); + await Telemetry.sendTelemetry("raw_document_uploaded"); + await EventLogs.logEvent("api_raw_document_uploaded"); + response.status(200).json({ success: true, error: null, documents }); + } catch (e) { + console.log(e.message, e); + response.sendStatus(500).end(); + } + } + ); + app.get("/v1/documents", [validApiKey], async (_, response) => { /* #swagger.tags = ['Documents'] @@ -367,6 +502,56 @@ function apiDocumentEndpoints(app) { } } ); + + app.get( + "/v1/document/metadata-schema", + [validApiKey], + async (_, response) => { + /* + #swagger.tags = ['Documents'] + #swagger.description = 'Get the known available metadata schema for when doing a raw-text upload and the acceptable type of value for each key.' 
+ #swagger.responses[200] = { + content: { + "application/json": { + schema: { + type: 'object', + example: { + "schema": { + "keyOne": "string | number | nullable", + "keyTwo": "string | number | nullable", + "specialKey": "number", + "title": "string", + } + } + } + } + } + } + #swagger.responses[403] = { + schema: { + "$ref": "#/definitions/InvalidAPIKey" + } + } + */ + try { + response.status(200).json({ + schema: { + // If you are updating this be sure to update the collector METADATA_KEYS constant in /processRawText. + url: "string | nullable", + title: "string", + docAuthor: "string | nullable", + description: "string | nullable", + docSource: "string | nullable", + chunkSource: "string | nullable", + published: "epoch timestamp in ms | nullable", + }, + }); + } catch (e) { + console.log(e.message, e); + response.sendStatus(500).end(); + } + } + ); } module.exports = { apiDocumentEndpoints }; diff --git a/server/endpoints/api/workspace/index.js b/server/endpoints/api/workspace/index.js index 885d0f1ae..b846f3f8d 100644 --- a/server/endpoints/api/workspace/index.js +++ b/server/endpoints/api/workspace/index.js @@ -26,17 +26,17 @@ function apiWorkspaceEndpoints(app) { #swagger.tags = ['Workspaces'] #swagger.description = 'Create a new workspace' #swagger.requestBody = { - description: 'JSON object containing new display name of workspace.', - required: true, - type: 'object', - content: { - "application/json": { - example: { - name: "My New Workspace", - } + description: 'JSON object containing new display name of workspace.', + required: true, + type: 'object', + content: { + "application/json": { + example: { + name: "My New Workspace", } } } + } #swagger.responses[200] = { content: { "application/json": { diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index fd2ef8898..4554a7aad 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -973,7 +973,7 @@ "requestBody": { "description": "Link of web address to be scraped.", "required": true, - "type": "file", + "type": "object", "content": { "application/json": { "schema": { @@ -987,6 +987,96 @@ } } }, + "/v1/document/raw-text": { + "post": { + "tags": [ + "Documents" + ], + "description": "Upload a file by specifying its raw text content and metadata values without having to upload a file.", + "parameters": [ + { + "name": "Authorization", + "in": "header", + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "type": "object", + "example": { + "success": true, + "error": null, + "documents": [ + { + "id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc", + "url": "file://my-document.txt", + "title": "hello-world.txt", + "docAuthor": "no author found", + "description": "No description found.", + "docSource": "My custom description set during upload", + "chunkSource": "no chunk source specified", + "published": "1/16/2024, 3:46:33 PM", + "wordCount": 252, + "pageContent": "AnythingLLM is the best....", + "token_count_estimate": 447, + "location": "custom-documents/raw-my-doc-text-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json" + } + ] + } + } + } + } + }, + "403": { + "description": "Forbidden", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InvalidAPIKey" + } + }, + "application/xml": { + "schema": { + "$ref": "#/components/schemas/InvalidAPIKey" + } + } + } + }, + "422": { + "description": "Unprocessable Entity" + }, + "500": { + "description": "Internal Server Error" + } 
+ }, + "requestBody": { + "description": "Text content and metadata of the file to be saved to the system. Use metadata-schema endpoint to get the possible metadata keys", + "required": true, + "type": "object", + "content": { + "application/json": { + "schema": { + "type": "object", + "example": { + "textContent": "This is the raw text that will be saved as a document in AnythingLLM.", + "metadata": { + "keyOne": "valueOne", + "keyTwo": "valueTwo", + "etc": "etc" + } + } + } + } + } + } + } + }, "/v1/documents": { "get": { "tags": [ @@ -1195,6 +1285,61 @@ } } }, + "/v1/document/metadata-schema": { + "get": { + "tags": [ + "Documents" + ], + "description": "Get the known available metadata schema for when doing a raw-text upload and the acceptable type of value for each key.", + "parameters": [ + { + "name": "Authorization", + "in": "header", + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "type": "object", + "example": { + "schema": { + "keyOne": "string | number | nullable", + "keyTwo": "string | number | nullable", + "specialKey": "number", + "title": "string" + } + } + } + } + } + }, + "403": { + "description": "Forbidden", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InvalidAPIKey" + } + }, + "application/xml": { + "schema": { + "$ref": "#/components/schemas/InvalidAPIKey" + } + } + } + }, + "500": { + "description": "Internal Server Error" + } + } + } + }, "/v1/workspace/new": { "post": { "tags": [ diff --git a/server/utils/files/documentProcessor.js b/server/utils/files/documentProcessor.js index 27d0f5f2b..ef8eba17c 100644 --- a/server/utils/files/documentProcessor.js +++ b/server/utils/files/documentProcessor.js @@ -59,6 +59,25 @@ async function processLink(link = "") { }); } +async function processRawText(textContent = "", metadata = {}) { + return await fetch(`${PROCESSOR_API}/process-raw-text`, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ textContent, metadata }), + }) + .then((res) => { + if (!res.ok) throw new Error("Response could not be completed"); + return res.json(); + }) + .then((res) => res) + .catch((e) => { + console.log(e.message); + return { success: false, reason: e.message, documents: [] }; + }); +} + // We will not ever expose the document processor to the frontend API so instead we relay // all requests through the server. You can use this function to directly expose a specific endpoint // on the document processor. @@ -85,6 +104,7 @@ module.exports = { checkProcessorAlive, processDocument, processLink, + processRawText, acceptedFileTypes, forwardExtensionRequest, };
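
For illustration, a minimal sketch of invoking the new collector helper directly, assuming it is run from the repository root so the require path resolves (the filename sketch.js is hypothetical). In the shipped flow the server never requires this module itself; it relays requests to the collector's /process-raw-text endpoint via server/utils/files/documentProcessor.js. The resolved values in the comments follow the METADATA_KEYS fallbacks added in collector/processRawText/index.js.

    // sketch.js - hypothetical, run from the repo root with collector storage configured
    const { processRawText } = require("./collector/processRawText");

    processRawText("Some raw text to store.", { title: "Hello World.txt" }).then(
      ({ success, reason, documents }) => {
        // With only a title supplied, the METADATA_KEYS fallbacks resolve to:
        //   url         -> "file://hello-world.txt"
        //   title       -> "hello-world.txt"
        //   docAuthor   -> "no author specified"
        //   description -> "no description found"
        //   docSource   -> "no source set"
        //   chunkSource -> "hello-world.txt"
        //   published   -> the current date/time
        console.log(success, reason, documents);
      }
    );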
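
A minimal end-to-end sketch of the developer API endpoints added in server/endpoints/api/document/index.js. The base URL, the Bearer authorization scheme, and the API_KEY environment variable are assumptions for illustration - substitute the values for your own instance. Per the endpoint code above, "title" is the only required metadata key, and /v1/document/metadata-schema lists the other accepted keys.

    // upload-raw-text.example.js - hypothetical client, Node 18+ (global fetch)
    const BASE_URL = "http://localhost:3001/api"; // assumption: default local server URL
    const headers = {
      Authorization: `Bearer ${process.env.API_KEY}`, // assumption: Bearer scheme with your API key
      "Content-Type": "application/json",
    };

    (async () => {
      // Optional: inspect which metadata keys the server accepts for raw-text uploads.
      const schema = await fetch(`${BASE_URL}/v1/document/metadata-schema`, { headers }).then(
        (res) => res.json()
      );
      console.log("accepted metadata keys:", schema);

      // Upload raw text; "title" is the only required metadata key.
      const result = await fetch(`${BASE_URL}/v1/document/raw-text`, {
        method: "POST",
        headers,
        body: JSON.stringify({
          textContent: "This is the raw text that will be saved as a document in AnythingLLM.",
          metadata: { title: "hello-world.txt", docAuthor: "Jane Doe" },
        }),
      }).then((res) => res.json());
      console.log(result); // { success, error, documents: [...] } on success
    })();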