Add support to upload rawText document via api ()

* Add support to upload rawText document via api

* update API doc endpoint with correct textContent key

* update response swagger doc
This commit is contained in:
Timothy Carambat 2024-02-07 15:17:32 -08:00 committed by GitHub
parent 5e11173aab
commit 48cb8f2897
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 453 additions and 10 deletions
collector
index.js
processRawText
server
endpoints/api
document
workspace
swagger
utils/files

View file

@ -12,6 +12,7 @@ const { processSingleFile } = require("./processSingleFile");
const { processLink } = require("./processLink");
const { wipeCollectorStorage } = require("./utils/files");
const extensions = require("./extensions");
const { processRawText } = require("./processRawText");
const app = express();
app.use(cors({ origin: true }));
@ -66,6 +67,29 @@ app.post("/process-link", async function (request, response) {
return;
});
app.post("/process-raw-text", async function (request, response) {
// Accepts raw text plus metadata and persists it as a document via the
// processRawText pipeline. Always answers HTTP 200 — failures are
// reported through the `success`/`reason` fields of the JSON payload.
const { textContent, metadata } = reqBody(request);
try {
const outcome = await processRawText(textContent, metadata);
const { success, reason, documents = [] } = outcome;
response
.status(200)
.json({ filename: metadata.title, success, reason, documents });
} catch (e) {
console.error(e);
response.status(200).json({
filename: metadata?.title || "Unknown-doc.txt",
success: false,
reason: "A processing error occurred.",
documents: [],
});
}
return;
});
extensions(app);
app.get("/accepts", function (_, response) {

View file

@ -0,0 +1,69 @@
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../utils/files");
const { tokenizeString } = require("../utils/tokenizer");
const { default: slugify } = require("slugify");
// Will remove the last .extension from the input
// and stringify the input + move to lowercase.
// Drops a trailing ".extension" (if any) from the input, then slugifies
// the remainder to a lowercase, URL-safe string.
function stripAndSlug(input) {
const hasExtension = input.includes(".");
if (!hasExtension) return slugify(input, { lower: true });
const withoutExtension = input.split(".").slice(0, -1).join("-");
return slugify(withoutExtension, { lower: true });
}
// Normalizers for every metadata key a caller may supply on a raw-text
// upload. Each handler receives the whole metadata object and returns a
// safe value, falling back to a sensible default when the key is missing
// or malformed. Keep this in sync with the /v1/document/metadata-schema
// endpoint on the server.
const METADATA_KEYS = {
possible: {
url: ({ url, title }) => {
let validUrl;
try {
const u = new URL(url);
validUrl = ["https:", "http:"].includes(u.protocol);
} catch { }
if (validUrl) return `web://${url.toLowerCase()}.website`;
return `file://${stripAndSlug(title)}.txt`;
},
title: ({ title }) => `${stripAndSlug(title)}.txt`,
docAuthor: ({ docAuthor }) => { return typeof docAuthor === 'string' ? docAuthor : 'no author specified' },
description: ({ description }) => { return typeof description === 'string' ? description : 'no description found' },
docSource: ({ docSource }) => { return typeof docSource === 'string' ? docSource : 'no source set' },
chunkSource: ({ chunkSource, title }) => { return typeof chunkSource === 'string' ? chunkSource : `${stripAndSlug(title)}.txt` },
published: ({ published }) => {
// Number(null) and Number("") coerce to 0, which would silently
// resolve to the 1970 epoch date — treat those as "not provided"
// alongside non-numeric values and default to the current time.
if (published === null || published === "" || isNaN(Number(published)))
return new Date().toLocaleString();
return new Date(Number(published)).toLocaleString()
},
}
}
/**
 * Persists caller-supplied raw text as a document on the server, applying
 * METADATA_KEYS defaults to every metadata field.
 * @param {string} textContent - The raw body of the document to save.
 * @param {object} metadata - Caller metadata; `title` is expected by the
 *   key normalizers (assumed validated upstream — TODO confirm).
 * @returns {Promise<{success: boolean, reason: string|null, documents: object[]}>}
 */
async function processRawText(textContent, metadata) {
console.log(`-- Working Raw Text doc ${metadata.title} --`);
if (!textContent || textContent.length === 0) {
return {
success: false,
reason: "textContent was empty - nothing to process.",
documents: [],
};
}
const data = {
id: v4(),
url: METADATA_KEYS.possible.url(metadata),
title: METADATA_KEYS.possible.title(metadata),
docAuthor: METADATA_KEYS.possible.docAuthor(metadata),
description: METADATA_KEYS.possible.description(metadata),
docSource: METADATA_KEYS.possible.docSource(metadata),
chunkSource: METADATA_KEYS.possible.chunkSource(metadata),
published: METADATA_KEYS.possible.published(metadata),
// Split on any run of whitespace (newlines, tabs, repeated spaces) so
// multi-line text is counted correctly; a plain " " split undercounts
// words separated by newlines and overcounts runs of spaces.
wordCount: textContent.split(/\s+/).filter(Boolean).length,
pageContent: textContent,
token_count_estimate: tokenizeString(textContent).length,
};
const document = writeToServerDocuments(
data,
`raw-${stripAndSlug(metadata.title)}-${data.id}`
);
console.log(`[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`);
return { success: true, reason: null, documents: [document] };
}
module.exports = { processRawText }

View file

@ -6,6 +6,7 @@ const {
acceptedFileTypes,
processDocument,
processLink,
processRawText,
} = require("../../../utils/files/documentProcessor");
const {
viewLocalFiles,
@ -90,6 +91,7 @@ function apiDocumentEndpoints(app) {
error: `Document processing API is not online. Document ${originalname} will not be processed automatically.`,
})
.end();
return;
}
const { success, reason, documents } =
@ -127,7 +129,7 @@ function apiDocumentEndpoints(app) {
#swagger.requestBody = {
description: 'Link of web address to be scraped.',
required: true,
type: 'file',
type: 'object',
content: {
"application/json": {
schema: {
@ -186,6 +188,7 @@ function apiDocumentEndpoints(app) {
error: `Document processing API is not online. Link ${link} will not be processed automatically.`,
})
.end();
return;
}
const { success, reason, documents } = await processLink(link);
@ -212,6 +215,138 @@ function apiDocumentEndpoints(app) {
}
);
app.post(
"/v1/document/raw-text",
[validApiKey],
async (request, response) => {
/*
#swagger.tags = ['Documents']
#swagger.description = 'Upload a file by specifying its raw text content and metadata values without having to upload a file.'
#swagger.requestBody = {
description: 'Text content and metadata of the file to be saved to the system. Use metadata-schema endpoint to get the possible metadata keys',
required: true,
type: 'object',
content: {
"application/json": {
schema: {
type: 'object',
example: {
"textContent": "This is the raw text that will be saved as a document in AnythingLLM.",
"metadata": {
keyOne: "valueOne",
keyTwo: "valueTwo",
etc: "etc"
}
}
}
}
}
}
#swagger.responses[200] = {
content: {
"application/json": {
schema: {
type: 'object',
example: {
success: true,
error: null,
documents: [
{
"id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
"url": "file://my-document.txt",
"title": "hello-world.txt",
"docAuthor": "no author found",
"description": "No description found.",
"docSource": "My custom description set during upload",
"chunkSource": "no chunk source specified",
"published": "1/16/2024, 3:46:33PM",
"wordCount": 252,
"pageContent": "AnythingLLM is the best....",
"token_count_estimate": 447,
"location": "custom-documents/raw-my-doc-text-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
}
]
}
}
}
}
}
#swagger.responses[403] = {
schema: {
"$ref": "#/definitions/InvalidAPIKey"
}
}
*/
try {
// 'title' is the only mandatory metadata key — the collector uses it
// to derive the saved filename and several default metadata values.
const requiredMetadata = ["title"];
const { textContent, metadata = {} } = reqBody(request);
// The collector service performs the actual document write; fail fast
// with a 500 if it is unreachable.
const processingOnline = await checkProcessorAlive();
if (!processingOnline) {
response
.status(500)
.json({
success: false,
error: `Document processing API is not online. Request will not be processed.`,
})
.end();
return;
}
// 422 when any required metadata key is missing or falsy.
if (
!requiredMetadata.every(
(reqKey) =>
Object.keys(metadata).includes(reqKey) && !!metadata[reqKey]
)
) {
response
.status(422)
.json({
success: false,
error: `You are missing required metadata key:value pairs in your request. Required metadata key:values are ${requiredMetadata
.map((v) => `'${v}'`)
.join(", ")}`,
})
.end();
return;
}
// 422 when there is no text to persist.
if (!textContent || textContent?.length === 0) {
response
.status(422)
.json({
success: false,
error: `The 'textContent' key cannot have an empty value.`,
})
.end();
return;
}
// Relay to the collector, which returns the saved document record(s).
const { success, reason, documents } = await processRawText(
textContent,
metadata
);
if (!success) {
response
.status(500)
.json({ success: false, error: reason, documents })
.end();
return;
}
console.log(
`Document created successfully. It is now available in documents.`
);
await Telemetry.sendTelemetry("raw_document_uploaded");
await EventLogs.logEvent("api_raw_document_uploaded");
response.status(200).json({ success: true, error: null, documents });
} catch (e) {
console.log(e.message, e);
response.sendStatus(500).end();
}
}
);
app.get("/v1/documents", [validApiKey], async (_, response) => {
/*
#swagger.tags = ['Documents']
@ -367,6 +502,56 @@ function apiDocumentEndpoints(app) {
}
}
);
app.get(
"/v1/document/metadata-schema",
[validApiKey],
async (_, response) => {
/*
#swagger.tags = ['Documents']
#swagger.description = 'Get the known available metadata schema for when doing a raw-text upload and the acceptable type of value for each key.'
#swagger.responses[200] = {
content: {
"application/json": {
schema: {
type: 'object',
example: {
"schema": {
"keyOne": "string | number | nullable",
"keyTwo": "string | number | nullable",
"specialKey": "number",
"title": "string",
}
}
}
}
}
}
#swagger.responses[403] = {
schema: {
"$ref": "#/definitions/InvalidAPIKey"
}
}
*/
try {
// If you are updating this be sure to update the collector METADATA_KEYS constant in /processRawText.
const schema = {
url: "string | nullable",
title: "string",
docAuthor: "string | nullable",
description: "string | nullable",
docSource: "string | nullable",
chunkSource: "string | nullable",
published: "epoch timestamp in ms | nullable",
};
response.status(200).json({ schema });
} catch (e) {
console.log(e.message, e);
response.sendStatus(500).end();
}
}
);
}
module.exports = { apiDocumentEndpoints };

View file

@ -26,17 +26,17 @@ function apiWorkspaceEndpoints(app) {
#swagger.tags = ['Workspaces']
#swagger.description = 'Create a new workspace'
#swagger.requestBody = {
description: 'JSON object containing new display name of workspace.',
required: true,
type: 'object',
content: {
"application/json": {
example: {
name: "My New Workspace",
}
description: 'JSON object containing new display name of workspace.',
required: true,
type: 'object',
content: {
"application/json": {
example: {
name: "My New Workspace",
}
}
}
}
#swagger.responses[200] = {
content: {
"application/json": {

View file

@ -973,7 +973,7 @@
"requestBody": {
"description": "Link of web address to be scraped.",
"required": true,
"type": "file",
"type": "object",
"content": {
"application/json": {
"schema": {
@ -987,6 +987,96 @@
}
}
},
"/v1/document/raw-text": {
"post": {
"tags": [
"Documents"
],
"description": "Upload a file by specifying its raw text content and metadata values without having to upload a file.",
"parameters": [
{
"name": "Authorization",
"in": "header",
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"type": "object",
"example": {
"success": true,
"error": null,
"documents": [
{
"id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
"url": "file://my-document.txt",
"title": "hello-world.txt",
"docAuthor": "no author found",
"description": "No description found.",
"docSource": "My custom description set during upload",
"chunkSource": "no chunk source specified",
"published": "1/16/2024, 3:46:33PM",
"wordCount": 252,
"pageContent": "AnythingLLM is the best....",
"token_count_estimate": 447,
"location": "custom-documents/raw-my-doc-text-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
}
]
}
}
}
}
},
"403": {
"description": "Forbidden",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/InvalidAPIKey"
}
},
"application/xml": {
"schema": {
"$ref": "#/components/schemas/InvalidAPIKey"
}
}
}
},
"422": {
"description": "Unprocessable Entity"
},
"500": {
"description": "Internal Server Error"
}
},
"requestBody": {
"description": "Text content and metadata of the file to be saved to the system. Use metadata-schema endpoint to get the possible metadata keys",
"required": true,
"type": "object",
"content": {
"application/json": {
"schema": {
"type": "object",
"example": {
"textContent": "This is the raw text that will be saved as a document in AnythingLLM.",
"metadata": {
"keyOne": "valueOne",
"keyTwo": "valueTwo",
"etc": "etc"
}
}
}
}
}
}
}
},
"/v1/documents": {
"get": {
"tags": [
@ -1195,6 +1285,61 @@
}
}
},
"/v1/document/metadata-schema": {
"get": {
"tags": [
"Documents"
],
"description": "Get the known available metadata schema for when doing a raw-text upload and the acceptable type of value for each key.",
"parameters": [
{
"name": "Authorization",
"in": "header",
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"type": "object",
"example": {
"schema": {
"keyOne": "string | number | nullable",
"keyTwo": "string | number | nullable",
"specialKey": "number",
"title": "string"
}
}
}
}
}
},
"403": {
"description": "Forbidden",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/InvalidAPIKey"
}
},
"application/xml": {
"schema": {
"$ref": "#/components/schemas/InvalidAPIKey"
}
}
}
},
"500": {
"description": "Internal Server Error"
}
}
}
},
"/v1/workspace/new": {
"post": {
"tags": [

View file

@ -59,6 +59,25 @@ async function processLink(link = "") {
});
}
/**
 * Relays a raw-text ingestion request to the collector service so the text
 * is saved as a document without a file upload.
 * @param {string} textContent - Raw document body to persist.
 * @param {object} metadata - Metadata keys for the document (see the
 *   /v1/document/metadata-schema endpoint for accepted keys).
 * @returns {Promise<{success: boolean, reason: string|null, documents: object[]}>}
 *   Collector response, or a failure object when the collector is
 *   unreachable or responds non-OK.
 */
async function processRawText(textContent = "", metadata = {}) {
return await fetch(`${PROCESSOR_API}/process-raw-text`, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ textContent, metadata }),
})
.then((res) => {
if (!res.ok) throw new Error("Response could not be completed");
return res.json();
})
// NOTE: the former `.then((res) => res)` pass-through was a no-op and
// has been removed; behavior is unchanged.
.catch((e) => {
console.log(e.message);
return { success: false, reason: e.message, documents: [] };
});
}
// We will not ever expose the document processor to the frontend API so instead we relay
// all requests through the server. You can use this function to directly expose a specific endpoint
// on the document processor.
@ -85,6 +104,7 @@ module.exports = {
checkProcessorAlive,
processDocument,
processLink,
processRawText,
acceptedFileTypes,
forwardExtensionRequest,
};