Add support to upload rawText document via api ()

* Add support to upload rawText document via api

* update API doc endpoint with correct textContent key

* update response swagger doc
This commit is contained in:
Timothy Carambat 2024-02-07 15:17:32 -08:00 committed by GitHub
parent 5e11173aab
commit 48cb8f2897
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 453 additions and 10 deletions
collector
index.js
processRawText
server
endpoints/api
document
workspace
swagger
utils/files

View file

@ -12,6 +12,7 @@ const { processSingleFile } = require("./processSingleFile");
const { processLink } = require("./processLink");
const { wipeCollectorStorage } = require("./utils/files");
const extensions = require("./extensions");
const { processRawText } = require("./processRawText");
const app = express();
app.use(cors({ origin: true }));
@ -66,6 +67,29 @@ app.post("/process-link", async function (request, response) {
return;
});
app.post("/process-raw-text", async function (request, response) {
// Accepts raw text plus metadata and persists it as a document via the
// processRawText pipeline. Always answers HTTP 200 — failures are
// reported through the `success`/`reason` fields of the JSON payload.
const { textContent, metadata } = reqBody(request);
try {
const outcome = await processRawText(textContent, metadata);
const { success, reason, documents = [] } = outcome;
response
.status(200)
.json({ filename: metadata.title, success, reason, documents });
} catch (e) {
console.error(e);
response.status(200).json({
filename: metadata?.title || "Unknown-doc.txt",
success: false,
reason: "A processing error occurred.",
documents: [],
});
}
return;
});
extensions(app);
app.get("/accepts", function (_, response) {

View file

@ -0,0 +1,69 @@
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../utils/files");
const { tokenizeString } = require("../utils/tokenizer");
const { default: slugify } = require("slugify");
// Will remove the last .extension from the input
// and stringify the input + move to lowercase.
// Drops a trailing ".extension" (if any) from the input, then slugifies
// the remainder to a lowercase, URL-safe string.
function stripAndSlug(input) {
const hasExtension = input.includes(".");
if (!hasExtension) return slugify(input, { lower: true });
const withoutExtension = input.split(".").slice(0, -1).join("-");
return slugify(withoutExtension, { lower: true });
}
// Normalizers for every metadata key a caller may supply on a raw-text
// upload. Each handler receives the whole metadata object and returns a
// safe value, falling back to a sensible default when the key is missing
// or malformed. Keep this in sync with the /v1/document/metadata-schema
// endpoint on the server.
const METADATA_KEYS = {
possible: {
url: ({ url, title }) => {
let validUrl;
try {
const u = new URL(url);
validUrl = ["https:", "http:"].includes(u.protocol);
} catch { }
if (validUrl) return `web://${url.toLowerCase()}.website`;
return `file://${stripAndSlug(title)}.txt`;
},
title: ({ title }) => `${stripAndSlug(title)}.txt`,
docAuthor: ({ docAuthor }) => { return typeof docAuthor === 'string' ? docAuthor : 'no author specified' },
description: ({ description }) => { return typeof description === 'string' ? description : 'no description found' },
docSource: ({ docSource }) => { return typeof docSource === 'string' ? docSource : 'no source set' },
chunkSource: ({ chunkSource, title }) => { return typeof chunkSource === 'string' ? chunkSource : `${stripAndSlug(title)}.txt` },
published: ({ published }) => {
// Number(null) and Number("") coerce to 0, which would silently
// resolve to the 1970 epoch date — treat those as "not provided"
// alongside non-numeric values and default to the current time.
if (published === null || published === "" || isNaN(Number(published)))
return new Date().toLocaleString();
return new Date(Number(published)).toLocaleString()
},
}
}
/**
 * Persists caller-supplied raw text as a document on the server, applying
 * METADATA_KEYS defaults to every metadata field.
 * @param {string} textContent - The raw body of the document to save.
 * @param {object} metadata - Caller metadata; `title` is expected by the
 *   key normalizers (assumed validated upstream — TODO confirm).
 * @returns {Promise<{success: boolean, reason: string|null, documents: object[]}>}
 */
async function processRawText(textContent, metadata) {
console.log(`-- Working Raw Text doc ${metadata.title} --`);
if (!textContent || textContent.length === 0) {
return {
success: false,
reason: "textContent was empty - nothing to process.",
documents: [],
};
}
const data = {
id: v4(),
url: METADATA_KEYS.possible.url(metadata),
title: METADATA_KEYS.possible.title(metadata),
docAuthor: METADATA_KEYS.possible.docAuthor(metadata),
description: METADATA_KEYS.possible.description(metadata),
docSource: METADATA_KEYS.possible.docSource(metadata),
chunkSource: METADATA_KEYS.possible.chunkSource(metadata),
published: METADATA_KEYS.possible.published(metadata),
// Split on any run of whitespace (newlines, tabs, repeated spaces) so
// multi-line text is counted correctly; a plain " " split undercounts
// words separated by newlines and overcounts runs of spaces.
wordCount: textContent.split(/\s+/).filter(Boolean).length,
pageContent: textContent,
token_count_estimate: tokenizeString(textContent).length,
};
const document = writeToServerDocuments(
data,
`raw-${stripAndSlug(metadata.title)}-${data.id}`
);
console.log(`[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`);
return { success: true, reason: null, documents: [document] };
}
module.exports = { processRawText }

View file

@ -6,6 +6,7 @@ const {
acceptedFileTypes,
processDocument,
processLink,
processRawText,
} = require("../../../utils/files/documentProcessor");
const {
viewLocalFiles,
@ -90,6 +91,7 @@ function apiDocumentEndpoints(app) {
error: `Document processing API is not online. Document ${originalname} will not be processed automatically.`,
})
.end();
return;
}
const { success, reason, documents } =
@ -127,7 +129,7 @@ function apiDocumentEndpoints(app) {
#swagger.requestBody = {
description: 'Link of web address to be scraped.',
required: true,
type: 'file',
type: 'object',
content: {
"application/json": {
schema: {
@ -186,6 +188,7 @@ function apiDocumentEndpoints(app) {
error: `Document processing API is not online. Link ${link} will not be processed automatically.`,
})
.end();
return;
}
const { success, reason, documents } = await processLink(link);
@ -212,6 +215,138 @@ function apiDocumentEndpoints(app) {
}
);
app.post(
"/v1/document/raw-text",
[validApiKey],
async (request, response) => {
/*
#swagger.tags = ['Documents']
#swagger.description = 'Upload a file by specifying its raw text content and metadata values without having to upload a file.'
#swagger.requestBody = {
description: 'Text content and metadata of the file to be saved to the system. Use metadata-schema endpoint to get the possible metadata keys',
required: true,
type: 'object',
content: {
"application/json": {
schema: {
type: 'object',
example: {
"textContent": "This is the raw text that will be saved as a document in AnythingLLM.",
"metadata": {
keyOne: "valueOne",
keyTwo: "valueTwo",
etc: "etc"
}
}
}
}
}
}
#swagger.responses[200] = {
content: {
"application/json": {
schema: {
type: 'object',
example: {
success: true,
error: null,
documents: [
{
"id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
"url": "file://my-document.txt",
"title": "hello-world.txt",
"docAuthor": "no author found",
"description": "No description found.",
"docSource": "My custom description set during upload",
"chunkSource": "no chunk source specified",
"published": "1/16/2024, 3:46:33PM",
"wordCount": 252,
"pageContent": "AnythingLLM is the best....",
"token_count_estimate": 447,
"location": "custom-documents/raw-my-doc-text-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
}
]
}
}
}
}
}
#swagger.responses[403] = {
schema: {
"$ref": "#/definitions/InvalidAPIKey"
}
}
*/
try {
// 'title' is the only mandatory metadata key — the collector uses it
// to derive the saved filename and several default metadata values.
const requiredMetadata = ["title"];
const { textContent, metadata = {} } = reqBody(request);
// The collector service performs the actual document write; fail fast
// with a 500 if it is unreachable.
const processingOnline = await checkProcessorAlive();
if (!processingOnline) {
response
.status(500)
.json({
success: false,
error: `Document processing API is not online. Request will not be processed.`,
})
.end();
return;
}
// 422 when any required metadata key is missing or falsy.
if (
!requiredMetadata.every(
(reqKey) =>
Object.keys(metadata).includes(reqKey) && !!metadata[reqKey]
)
) {
response
.status(422)
.json({
success: false,
error: `You are missing required metadata key:value pairs in your request. Required metadata key:values are ${requiredMetadata
.map((v) => `'${v}'`)
.join(", ")}`,
})
.end();
return;
}
// 422 when there is no text to persist.
if (!textContent || textContent?.length === 0) {
response
.status(422)
.json({
success: false,
error: `The 'textContent' key cannot have an empty value.`,
})
.end();
return;
}
// Relay to the collector, which returns the saved document record(s).
const { success, reason, documents } = await processRawText(
textContent,
metadata
);
if (!success) {
response
.status(500)
.json({ success: false, error: reason, documents })
.end();
return;
}
console.log(
`Document created successfully. It is now available in documents.`
);
await Telemetry.sendTelemetry("raw_document_uploaded");
await EventLogs.logEvent("api_raw_document_uploaded");
response.status(200).json({ success: true, error: null, documents });
} catch (e) {
console.log(e.message, e);
response.sendStatus(500).end();
}
}
);
app.get("/v1/documents", [validApiKey], async (_, response) => {
/*
#swagger.tags = ['Documents']
@ -367,6 +502,56 @@ function apiDocumentEndpoints(app) {
}
}
);
app.get(
"/v1/document/metadata-schema",
[validApiKey],
async (_, response) => {
/*
#swagger.tags = ['Documents']
#swagger.description = 'Get the known available metadata schema for when doing a raw-text upload and the acceptable type of value for each key.'
#swagger.responses[200] = {
content: {
"application/json": {
schema: {
type: 'object',
example: {
"schema": {
"keyOne": "string | number | nullable",
"keyTwo": "string | number | nullable",
"specialKey": "number",
"title": "string",
}
}
}
}
}
}
#swagger.responses[403] = {
schema: {
"$ref": "#/definitions/InvalidAPIKey"
}
}
*/
try {
// If you are updating this be sure to update the collector METADATA_KEYS constant in /processRawText.
const schema = {
url: "string | nullable",
title: "string",
docAuthor: "string | nullable",
description: "string | nullable",
docSource: "string | nullable",
chunkSource: "string | nullable",
published: "epoch timestamp in ms | nullable",
};
response.status(200).json({ schema });
} catch (e) {
console.log(e.message, e);
response.sendStatus(500).end();
}
}
);
}
module.exports = { apiDocumentEndpoints };

View file

@ -26,17 +26,17 @@ function apiWorkspaceEndpoints(app) {
#swagger.tags = ['Workspaces']
#swagger.description = 'Create a new workspace'
#swagger.requestBody = {
description: 'JSON object containing new display name of workspace.',
required: true,
type: 'object',
content: {
"application/json": {
example: {
name: "My New Workspace",
}
description: 'JSON object containing new display name of workspace.',
required: true,
type: 'object',
content: {
"application/json": {
example: {
name: "My New Workspace",
}
}
}
}
#swagger.responses[200] = {
content: {
"application/json": {

View file

@ -973,7 +973,7 @@
"requestBody": {
"description": "Link of web address to be scraped.",
"required": true,
"type": "file",
"type": "object",
"content": {
"application/json": {
"schema": {
@ -987,6 +987,96 @@
}
}
},
"/v1/document/raw-text": {
"post": {
"tags": [
"Documents"
],
"description": "Upload a file by specifying its raw text content and metadata values without having to upload a file.",
"parameters": [
{
"name": "Authorization",
"in": "header",
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"type": "object",
"example": {
"success": true,
"error": null,
"documents": [
{
"id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
"url": "file://my-document.txt",
"title": "hello-world.txt",
"docAuthor": "no author found",
"description": "No description found.",
"docSource": "My custom description set during upload",
"chunkSource": "no chunk source specified",
"published": "1/16/2024, 3:46:33PM",
"wordCount": 252,
"pageContent": "AnythingLLM is the best....",
"token_count_estimate": 447,
"location": "custom-documents/raw-my-doc-text-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
}
]
}
}
}
}
},
"403": {
"description": "Forbidden",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/InvalidAPIKey"
}
},
"application/xml": {
"schema": {
"$ref": "#/components/schemas/InvalidAPIKey"
}
}
}
},
"422": {
"description": "Unprocessable Entity"
},
"500": {
"description": "Internal Server Error"
}
},
"requestBody": {
"description": "Text content and metadata of the file to be saved to the system. Use metadata-schema endpoint to get the possible metadata keys",
"required": true,
"type": "object",
"content": {
"application/json": {
"schema": {
"type": "object",
"example": {
"textContent": "This is the raw text that will be saved as a document in AnythingLLM.",
"metadata": {
"keyOne": "valueOne",
"keyTwo": "valueTwo",
"etc": "etc"
}
}
}
}
}
}
}
},
"/v1/documents": {
"get": {
"tags": [
@ -1195,6 +1285,61 @@
}
}
},
"/v1/document/metadata-schema": {
"get": {
"tags": [
"Documents"
],
"description": "Get the known available metadata schema for when doing a raw-text upload and the acceptable type of value for each key.",
"parameters": [
{
"name": "Authorization",
"in": "header",
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"type": "object",
"example": {
"schema": {
"keyOne": "string | number | nullable",
"keyTwo": "string | number | nullable",
"specialKey": "number",
"title": "string"
}
}
}
}
}
},
"403": {
"description": "Forbidden",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/InvalidAPIKey"
}
},
"application/xml": {
"schema": {
"$ref": "#/components/schemas/InvalidAPIKey"
}
}
}
},
"500": {
"description": "Internal Server Error"
}
}
}
},
"/v1/workspace/new": {
"post": {
"tags": [

View file

@ -59,6 +59,25 @@ async function processLink(link = "") {
});
}
/**
 * Relays a raw-text ingestion request to the collector service so the text
 * is saved as a document without a file upload.
 * @param {string} textContent - Raw document body to persist.
 * @param {object} metadata - Metadata keys for the document (see the
 *   /v1/document/metadata-schema endpoint for accepted keys).
 * @returns {Promise<{success: boolean, reason: string|null, documents: object[]}>}
 *   Collector response, or a failure object when the collector is
 *   unreachable or responds non-OK.
 */
async function processRawText(textContent = "", metadata = {}) {
return await fetch(`${PROCESSOR_API}/process-raw-text`, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ textContent, metadata }),
})
.then((res) => {
if (!res.ok) throw new Error("Response could not be completed");
return res.json();
})
// NOTE: the former `.then((res) => res)` pass-through was a no-op and
// has been removed; behavior is unchanged.
.catch((e) => {
console.log(e.message);
return { success: false, reason: e.message, documents: [] };
});
}
// We will not ever expose the document processor to the frontend API so instead we relay
// all requests through the server. You can use this function to directly expose a specific endpoint
// on the document processor.
@ -85,6 +104,7 @@ module.exports = {
checkProcessorAlive,
processDocument,
processLink,
processRawText,
acceptedFileTypes,
forwardExtensionRequest,
};