From 48cb8f2897b27913100347e19e241677375aeebf Mon Sep 17 00:00:00 2001 From: Timothy Carambat <rambat1010@gmail.com> Date: Wed, 7 Feb 2024 15:17:32 -0800 Subject: [PATCH] Add support to upload rawText document via api (#692) * Add support to upload rawText document via api * update API doc endpoint with correct textContent key * update response swagger doc --- collector/index.js | 24 +++ collector/processRawText/index.js | 69 +++++++++ server/endpoints/api/document/index.js | 187 +++++++++++++++++++++++- server/endpoints/api/workspace/index.js | 16 +- server/swagger/openapi.json | 147 ++++++++++++++++++- server/utils/files/documentProcessor.js | 20 +++ 6 files changed, 453 insertions(+), 10 deletions(-) create mode 100644 collector/processRawText/index.js diff --git a/collector/index.js b/collector/index.js index 062d78959..9ebe5f1ce 100644 --- a/collector/index.js +++ b/collector/index.js @@ -12,6 +12,7 @@ const { processSingleFile } = require("./processSingleFile"); const { processLink } = require("./processLink"); const { wipeCollectorStorage } = require("./utils/files"); const extensions = require("./extensions"); +const { processRawText } = require("./processRawText"); const app = express(); app.use(cors({ origin: true })); @@ -66,6 +67,29 @@ app.post("/process-link", async function (request, response) { return; }); +app.post("/process-raw-text", async function (request, response) { + const { textContent, metadata } = reqBody(request); + try { + const { + success, + reason, + documents = [], + } = await processRawText(textContent, metadata); + response + .status(200) + .json({ filename: metadata.title, success, reason, documents }); + } catch (e) { + console.error(e); + response.status(200).json({ + filename: metadata?.title || "Unknown-doc.txt", + success: false, + reason: "A processing error occurred.", + documents: [], + }); + } + return; +}); + extensions(app); app.get("/accepts", function (_, response) { diff --git a/collector/processRawText/index.js b/collector/processRawText/index.js new file mode 100644 index 000000000..d435c9e7e --- /dev/null +++ b/collector/processRawText/index.js @@ -0,0 +1,69 @@ +const { v4 } = require("uuid"); +const { writeToServerDocuments } = require("../utils/files"); +const { tokenizeString } = require("../utils/tokenizer"); +const { default: slugify } = require("slugify"); + +// Will remove the last .extension from the input +// and stringify the input + move to lowercase. +function stripAndSlug(input) { + if (!input.includes('.')) return slugify(input, { lower: true }); + return slugify(input.split('.').slice(0, -1).join('-'), { lower: true }) +} + +const METADATA_KEYS = { + possible: { + url: ({ url, title }) => { + let validUrl; + try { + const u = new URL(url); + validUrl = ["https:", "http:"].includes(u.protocol); + } catch { } + + if (validUrl) return `web://${url.toLowerCase()}.website`; + return `file://${stripAndSlug(title)}.txt`; + }, + title: ({ title }) => `${stripAndSlug(title)}.txt`, + docAuthor: ({ docAuthor }) => { return typeof docAuthor === 'string' ? docAuthor : 'no author specified' }, + description: ({ description }) => { return typeof description === 'string' ? description : 'no description found' }, + docSource: ({ docSource }) => { return typeof docSource === 'string' ? docSource : 'no source set' }, + chunkSource: ({ chunkSource, title }) => { return typeof chunkSource === 'string' ? 
chunkSource : `${stripAndSlug(title)}.txt` }, + published: ({ published }) => { + if (isNaN(Number(published))) return new Date().toLocaleString(); + return new Date(Number(published)).toLocaleString() + }, + } +} + +async function processRawText(textContent, metadata) { + console.log(`-- Working Raw Text doc ${metadata.title} --`); + if (!textContent || textContent.length === 0) { + return { + success: false, + reason: "textContent was empty - nothing to process.", + documents: [], + }; + } + + const data = { + id: v4(), + url: METADATA_KEYS.possible.url(metadata), + title: METADATA_KEYS.possible.title(metadata), + docAuthor: METADATA_KEYS.possible.docAuthor(metadata), + description: METADATA_KEYS.possible.description(metadata), + docSource: METADATA_KEYS.possible.docSource(metadata), + chunkSource: METADATA_KEYS.possible.chunkSource(metadata), + published: METADATA_KEYS.possible.published(metadata), + wordCount: textContent.split(" ").length, + pageContent: textContent, + token_count_estimate: tokenizeString(textContent).length, + }; + + const document = writeToServerDocuments( + data, + `raw-${stripAndSlug(metadata.title)}-${data.id}` + ); + console.log(`[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`); + return { success: true, reason: null, documents: [document] }; +} + +module.exports = { processRawText } \ No newline at end of file diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index b72debbdb..c210fff4a 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -6,6 +6,7 @@ const { acceptedFileTypes, processDocument, processLink, + processRawText, } = require("../../../utils/files/documentProcessor"); const { viewLocalFiles, @@ -90,6 +91,7 @@ function apiDocumentEndpoints(app) { error: `Document processing API is not online. Document ${originalname} will not be processed automatically.`, }) .end(); + return; } const { success, reason, documents } = @@ -127,7 +129,7 @@ function apiDocumentEndpoints(app) { #swagger.requestBody = { description: 'Link of web address to be scraped.', required: true, - type: 'file', + type: 'object', content: { "application/json": { schema: { @@ -186,6 +188,7 @@ function apiDocumentEndpoints(app) { error: `Document processing API is not online. Link ${link} will not be processed automatically.`, }) .end(); + return; } const { success, reason, documents } = await processLink(link); @@ -212,6 +215,138 @@ function apiDocumentEndpoints(app) { } ); + app.post( + "/v1/document/raw-text", + [validApiKey], + async (request, response) => { + /* + #swagger.tags = ['Documents'] + #swagger.description = 'Upload a file by specifying its raw text content and metadata values without having to upload a file.' + #swagger.requestBody = { + description: 'Text content and metadata of the file to be saved to the system. 
Use metadata-schema endpoint to get the possible metadata keys', + required: true, + type: 'object', + content: { + "application/json": { + schema: { + type: 'object', + example: { + "textContent": "This is the raw text that will be saved as a document in AnythingLLM.", + "metadata": { + keyOne: "valueOne", + keyTwo: "valueTwo", + etc: "etc" + } + } + } + } + } + } + #swagger.responses[200] = { + content: { + "application/json": { + schema: { + type: 'object', + example: { + success: true, + error: null, + documents: [ + { + "id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc", + "url": "file://my-document.txt", + "title": "hello-world.txt", + "docAuthor": "no author found", + "description": "No description found.", + "docSource": "My custom description set during upload", + "chunkSource": "no chunk source specified", + "published": "1/16/2024, 3:46:33 PM", + "wordCount": 252, + "pageContent": "AnythingLLM is the best....", + "token_count_estimate": 447, + "location": "custom-documents/raw-my-doc-text-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json" + } + ] + } + } + } + } + } + #swagger.responses[403] = { + schema: { + "$ref": "#/definitions/InvalidAPIKey" + } + } + */ + try { + const requiredMetadata = ["title"]; + const { textContent, metadata = {} } = reqBody(request); + const processingOnline = await checkProcessorAlive(); + + if (!processingOnline) { + response + .status(500) + .json({ + success: false, + error: `Document processing API is not online. Request will not be processed.`, + }) + .end(); + return; + } + + if ( + !requiredMetadata.every( + (reqKey) => + Object.keys(metadata).includes(reqKey) && !!metadata[reqKey] + ) + ) { + response + .status(422) + .json({ + success: false, + error: `You are missing required metadata key:value pairs in your request. Required metadata key:values are ${requiredMetadata + .map((v) => `'${v}'`) + .join(", ")}`, + }) + .end(); + return; + } + + if (!textContent || textContent?.length === 0) { + response + .status(422) + .json({ + success: false, + error: `The 'textContent' key cannot have an empty value.`, + }) + .end(); + return; + } + + const { success, reason, documents } = await processRawText( + textContent, + metadata + ); + if (!success) { + response + .status(500) + .json({ success: false, error: reason, documents }) + .end(); + return; + } + + console.log( + `Document created successfully. It is now available in documents.` + ); + await Telemetry.sendTelemetry("raw_document_uploaded"); + await EventLogs.logEvent("api_raw_document_uploaded"); + response.status(200).json({ success: true, error: null, documents }); + } catch (e) { + console.log(e.message, e); + response.sendStatus(500).end(); + } + } + ); + app.get("/v1/documents", [validApiKey], async (_, response) => { /* #swagger.tags = ['Documents'] @@ -367,6 +502,56 @@ function apiDocumentEndpoints(app) { } } ); + + app.get( + "/v1/document/metadata-schema", + [validApiKey], + async (_, response) => { + /* + #swagger.tags = ['Documents'] + #swagger.description = 'Get the known available metadata schema for when doing a raw-text upload and the acceptable type of value for each key.' 
+ #swagger.responses[200] = { + content: { + "application/json": { + schema: { + type: 'object', + example: { + "schema": { + "keyOne": "string | number | nullable", + "keyTwo": "string | number | nullable", + "specialKey": "number", + "title": "string", + } + } + } + } + } + } + #swagger.responses[403] = { + schema: { + "$ref": "#/definitions/InvalidAPIKey" + } + } + */ + try { + response.status(200).json({ + schema: { + // If you are updating this be sure to update the collector METADATA_KEYS constant in /processRawText. + url: "string | nullable", + title: "string", + docAuthor: "string | nullable", + description: "string | nullable", + docSource: "string | nullable", + chunkSource: "string | nullable", + published: "epoch timestamp in ms | nullable", + }, + }); + } catch (e) { + console.log(e.message, e); + response.sendStatus(500).end(); + } + } + ); } module.exports = { apiDocumentEndpoints }; diff --git a/server/endpoints/api/workspace/index.js b/server/endpoints/api/workspace/index.js index 885d0f1ae..b846f3f8d 100644 --- a/server/endpoints/api/workspace/index.js +++ b/server/endpoints/api/workspace/index.js @@ -26,17 +26,17 @@ function apiWorkspaceEndpoints(app) { #swagger.tags = ['Workspaces'] #swagger.description = 'Create a new workspace' #swagger.requestBody = { - description: 'JSON object containing new display name of workspace.', - required: true, - type: 'object', - content: { - "application/json": { - example: { - name: "My New Workspace", - } + description: 'JSON object containing new display name of workspace.', + required: true, + type: 'object', + content: { + "application/json": { + example: { + name: "My New Workspace", } } } + } #swagger.responses[200] = { content: { "application/json": { diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index fd2ef8898..4554a7aad 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -973,7 +973,7 @@ "requestBody": { "description": "Link of web address to be scraped.", "required": true, - "type": "file", + "type": "object", "content": { "application/json": { "schema": { @@ -987,6 +987,96 @@ } } }, + "/v1/document/raw-text": { + "post": { + "tags": [ + "Documents" + ], + "description": "Upload a file by specifying its raw text content and metadata values without having to upload a file.", + "parameters": [ + { + "name": "Authorization", + "in": "header", + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "type": "object", + "example": { + "success": true, + "error": null, + "documents": [ + { + "id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc", + "url": "file://my-document.txt", + "title": "hello-world.txt", + "docAuthor": "no author found", + "description": "No description found.", + "docSource": "My custom description set during upload", + "chunkSource": "no chunk source specified", + "published": "1/16/2024, 3:46:33 PM", + "wordCount": 252, + "pageContent": "AnythingLLM is the best....", + "token_count_estimate": 447, + "location": "custom-documents/raw-my-doc-text-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json" + } + ] + } + } + } + } + }, + "403": { + "description": "Forbidden", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InvalidAPIKey" + } + }, + "application/xml": { + "schema": { + "$ref": "#/components/schemas/InvalidAPIKey" + } + } + } + }, + "422": { + "description": "Unprocessable Entity" + }, + "500": { + "description": "Internal Server Error" + } 
+ }, + "requestBody": { + "description": "Text content and metadata of the file to be saved to the system. Use metadata-schema endpoint to get the possible metadata keys", + "required": true, + "type": "object", + "content": { + "application/json": { + "schema": { + "type": "object", + "example": { + "textContent": "This is the raw text that will be saved as a document in AnythingLLM.", + "metadata": { + "keyOne": "valueOne", + "keyTwo": "valueTwo", + "etc": "etc" + } + } + } + } + } + } + } + }, "/v1/documents": { "get": { "tags": [ @@ -1195,6 +1285,61 @@ } } }, + "/v1/document/metadata-schema": { + "get": { + "tags": [ + "Documents" + ], + "description": "Get the known available metadata schema for when doing a raw-text upload and the acceptable type of value for each key.", + "parameters": [ + { + "name": "Authorization", + "in": "header", + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "type": "object", + "example": { + "schema": { + "keyOne": "string | number | nullable", + "keyTwo": "string | number | nullable", + "specialKey": "number", + "title": "string" + } + } + } + } + } + }, + "403": { + "description": "Forbidden", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InvalidAPIKey" + } + }, + "application/xml": { + "schema": { + "$ref": "#/components/schemas/InvalidAPIKey" + } + } + } + }, + "500": { + "description": "Internal Server Error" + } + } + } + }, "/v1/workspace/new": { "post": { "tags": [ diff --git a/server/utils/files/documentProcessor.js b/server/utils/files/documentProcessor.js index 27d0f5f2b..ef8eba17c 100644 --- a/server/utils/files/documentProcessor.js +++ b/server/utils/files/documentProcessor.js @@ -59,6 +59,25 @@ async function processLink(link = "") { }); } +async function processRawText(textContent = "", metadata = {}) { + return await fetch(`${PROCESSOR_API}/process-raw-text`, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ textContent, metadata }), + }) + .then((res) => { + if (!res.ok) throw new Error("Response could not be completed"); + return res.json(); + }) + .then((res) => res) + .catch((e) => { + console.log(e.message); + return { success: false, reason: e.message, documents: [] }; + }); +} + // We will not ever expose the document processor to the frontend API so instead we relay // all requests through the server. You can use this function to directly expose a specific endpoint // on the document processor. @@ -85,6 +104,7 @@ module.exports = { checkProcessorAlive, processDocument, processLink, + processRawText, acceptedFileTypes, forwardExtensionRequest, };
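
For illustration, a minimal sketch of invoking the new collector helper directly, assuming it is run from the repository root so the require path resolves (the filename sketch.js is hypothetical). In the shipped flow the server never requires this module itself; it relays requests to the collector's /process-raw-text endpoint via server/utils/files/documentProcessor.js. The resolved values in the comments follow the METADATA_KEYS fallbacks added in collector/processRawText/index.js.

    // sketch.js - hypothetical, run from the repo root with collector storage configured
    const { processRawText } = require("./collector/processRawText");

    processRawText("Some raw text to store.", { title: "Hello World.txt" }).then(
      ({ success, reason, documents }) => {
        // With only a title supplied, the METADATA_KEYS fallbacks resolve to:
        //   url         -> "file://hello-world.txt"
        //   title       -> "hello-world.txt"
        //   docAuthor   -> "no author specified"
        //   description -> "no description found"
        //   docSource   -> "no source set"
        //   chunkSource -> "hello-world.txt"
        //   published   -> the current date/time
        console.log(success, reason, documents);
      }
    );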
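
A minimal end-to-end sketch of the developer API endpoints added in server/endpoints/api/document/index.js. The base URL, the Bearer authorization scheme, and the API_KEY environment variable are assumptions for illustration - substitute the values for your own instance. Per the endpoint code above, "title" is the only required metadata key, and /v1/document/metadata-schema lists the other accepted keys.

    // upload-raw-text.example.js - hypothetical client, Node 18+ (global fetch)
    const BASE_URL = "http://localhost:3001/api"; // assumption: default local server URL
    const headers = {
      Authorization: `Bearer ${process.env.API_KEY}`, // assumption: Bearer scheme with your API key
      "Content-Type": "application/json",
    };

    (async () => {
      // Optional: inspect which metadata keys the server accepts for raw-text uploads.
      const schema = await fetch(`${BASE_URL}/v1/document/metadata-schema`, { headers }).then(
        (res) => res.json()
      );
      console.log("accepted metadata keys:", schema);

      // Upload raw text; "title" is the only required metadata key.
      const result = await fetch(`${BASE_URL}/v1/document/raw-text`, {
        method: "POST",
        headers,
        body: JSON.stringify({
          textContent: "This is the raw text that will be saved as a document in AnythingLLM.",
          metadata: { title: "hello-world.txt", docAuthor: "Jane Doe" },
        }),
      }).then((res) => res.json());
      console.log(result); // { success, error, documents: [...] } on success
    })();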