mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-03-15 14:42:23 +00:00
Add support to upload rawText document via api (#692)
* Add support to upload rawText document via api * update API doc endpoint with correct textContent key * update response swagger doc
This commit is contained in:
parent
5e11173aab
commit
48cb8f2897
6 changed files with 453 additions and 10 deletions
collector
server
|
@ -12,6 +12,7 @@ const { processSingleFile } = require("./processSingleFile");
|
|||
const { processLink } = require("./processLink");
|
||||
const { wipeCollectorStorage } = require("./utils/files");
|
||||
const extensions = require("./extensions");
|
||||
const { processRawText } = require("./processRawText");
|
||||
const app = express();
|
||||
|
||||
app.use(cors({ origin: true }));
|
||||
|
@ -66,6 +67,29 @@ app.post("/process-link", async function (request, response) {
|
|||
return;
|
||||
});
|
||||
|
||||
// POST /process-raw-text
// Accepts `{ textContent, metadata }` in the request body, hands both to
// processRawText, and always answers with HTTP 200 carrying
// `{ filename, success, reason, documents }`. Failures are reported through
// `success: false` rather than through the status code.
app.post("/process-raw-text", async function (request, response) {
  // Declared outside the try so the catch block can still report the title.
  let metadata;
  try {
    // Body parsing happens inside the try: a throw here (or a non-object body)
    // would otherwise reject this async handler's promise before any response
    // is written.
    const body = reqBody(request);
    metadata = body?.metadata;

    const {
      success,
      reason,
      documents = [],
    } = await processRawText(body?.textContent, metadata);

    response
      .status(200)
      .json({ filename: metadata?.title, success, reason, documents });
  } catch (e) {
    console.error(e);
    response.status(200).json({
      filename: metadata?.title || "Unknown-doc.txt",
      success: false,
      reason: "A processing error occurred.",
      documents: [],
    });
  }
  return;
});
|
||||
|
||||
extensions(app);
|
||||
|
||||
app.get("/accepts", function (_, response) {
|
||||
|
|
69
collector/processRawText/index.js
Normal file
69
collector/processRawText/index.js
Normal file
|
@ -0,0 +1,69 @@
|
|||
const { v4 } = require("uuid");
|
||||
const { writeToServerDocuments } = require("../utils/files");
|
||||
const { tokenizeString } = require("../utils/tokenizer");
|
||||
const { default: slugify } = require("slugify");
|
||||
|
||||
// Removes the last ".extension" segment from the input (any remaining dots
// become dashes) and slugifies the result in lowercase.
// Non-string values such as a numeric title are coerced to a string first.
// null/undefined still raise a TypeError, as before, but with a clear message.
function stripAndSlug(input) {
  if (input === null || input === undefined) {
    throw new TypeError("stripAndSlug requires a non-empty input to slugify.");
  }

  const text = String(input);
  if (!text.includes(".")) return slugify(text, { lower: true });

  // Drop only the final dot-delimited segment: "a.b.txt" -> "a-b".
  const withoutExtension = text.split(".").slice(0, -1).join("-");
  return slugify(withoutExtension, { lower: true });
}
|
||||
|
||||
// Normalizers for every metadata key a raw-text upload may carry. Each entry
// receives the caller-supplied metadata object and returns the value stored on
// the document, falling back to a default when the key is missing or has the
// wrong type.
// The server's /v1/document/metadata-schema endpoint lists these same keys, so
// keep the two in sync when adding or removing one.
const METADATA_KEYS = {
  possible: {
    // http(s) URLs are stored as `web://<url>.website`; anything else falls
    // back to a `file://` name derived from the title.
    url: ({ url, title }) => {
      let validUrl = false;
      try {
        const parsed = new URL(url);
        validUrl = ["https:", "http:"].includes(parsed.protocol);
      } catch {
        // Not parseable as a URL - intentionally fall through to file://.
      }

      // String() guards against URL-like objects that parse but lack toLowerCase.
      if (validUrl) return `web://${String(url).toLowerCase()}.website`;
      return `file://${stripAndSlug(title)}.txt`;
    },
    title: ({ title }) => `${stripAndSlug(title)}.txt`,
    docAuthor: ({ docAuthor }) =>
      typeof docAuthor === "string" ? docAuthor : "no author specified",
    description: ({ description }) =>
      typeof description === "string" ? description : "no description found",
    docSource: ({ docSource }) =>
      typeof docSource === "string" ? docSource : "no source set",
    chunkSource: ({ chunkSource, title }) =>
      typeof chunkSource === "string"
        ? chunkSource
        : `${stripAndSlug(title)}.txt`,
    // Expects an epoch timestamp in milliseconds (number or numeric string).
    // null, blank, non-numeric, or out-of-range values resolve to "now".
    // Previously null/"" became the 1970 epoch and oversized numbers produced
    // the literal string "Invalid Date".
    published: ({ published }) => {
      const isCandidate =
        typeof published === "number" ||
        (typeof published === "string" && published.trim() !== "");
      const parsed = isCandidate ? new Date(Number(published)) : null;
      const date =
        parsed !== null && !Number.isNaN(parsed.getTime())
          ? parsed
          : new Date();
      return date.toLocaleString();
    },
  },
};
|
||||
|
||||
// Builds a document record from raw text plus caller-supplied metadata and
// hands it to writeToServerDocuments.
//
// textContent: the raw text to store; must be a non-empty string.
// metadata:    object of metadata values; `title` is required because the
//              stored title, chunk source and output name are derived from it.
//
// Resolves to `{ success, reason, documents }`. Invalid input resolves to
// `success: false` with a specific reason instead of throwing, so callers can
// relay the reason rather than a generic processing error.
async function processRawText(textContent, metadata) {
  console.log(`-- Working Raw Text doc ${metadata?.title} --`);

  if (!textContent || textContent.length === 0) {
    return {
      success: false,
      reason: "textContent was empty - nothing to process.",
      documents: [],
    };
  }

  // split() and the tokenizer below only make sense for strings.
  if (typeof textContent !== "string") {
    return {
      success: false,
      reason: "textContent must be a string.",
      documents: [],
    };
  }

  const title = metadata?.title;
  if (title === null || title === undefined || title === "") {
    return {
      success: false,
      reason: "metadata.title is required to save a raw text document.",
      documents: [],
    };
  }

  const normalize = METADATA_KEYS.possible;
  const data = {
    id: v4(),
    url: normalize.url(metadata),
    title: normalize.title(metadata),
    docAuthor: normalize.docAuthor(metadata),
    description: normalize.description(metadata),
    docSource: normalize.docSource(metadata),
    chunkSource: normalize.chunkSource(metadata),
    published: normalize.published(metadata),
    wordCount: textContent.split(" ").length,
    pageContent: textContent,
    token_count_estimate: tokenizeString(textContent).length,
  };

  // The generated id is part of the name so repeated titles do not collide.
  const document = writeToServerDocuments(
    data,
    `raw-${stripAndSlug(title)}-${data.id}`
  );
  console.log(`[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
}
|
||||
|
||||
module.exports = { processRawText }
|
|
@ -6,6 +6,7 @@ const {
|
|||
acceptedFileTypes,
|
||||
processDocument,
|
||||
processLink,
|
||||
processRawText,
|
||||
} = require("../../../utils/files/documentProcessor");
|
||||
const {
|
||||
viewLocalFiles,
|
||||
|
@ -90,6 +91,7 @@ function apiDocumentEndpoints(app) {
|
|||
error: `Document processing API is not online. Document ${originalname} will not be processed automatically.`,
|
||||
})
|
||||
.end();
|
||||
return;
|
||||
}
|
||||
|
||||
const { success, reason, documents } =
|
||||
|
@ -127,7 +129,7 @@ function apiDocumentEndpoints(app) {
|
|||
#swagger.requestBody = {
|
||||
description: 'Link of web address to be scraped.',
|
||||
required: true,
|
||||
type: 'file',
|
||||
type: 'object',
|
||||
content: {
|
||||
"application/json": {
|
||||
schema: {
|
||||
|
@ -186,6 +188,7 @@ function apiDocumentEndpoints(app) {
|
|||
error: `Document processing API is not online. Link ${link} will not be processed automatically.`,
|
||||
})
|
||||
.end();
|
||||
return;
|
||||
}
|
||||
|
||||
const { success, reason, documents } = await processLink(link);
|
||||
|
@ -212,6 +215,138 @@ function apiDocumentEndpoints(app) {
|
|||
}
|
||||
);
|
||||
|
||||
// POST /v1/document/raw-text
// Creates a document from raw text and metadata supplied in the request body.
// Responds 422 when required metadata or textContent is missing, 500 when the
// document processor is offline or processing fails, and 200 with the created
// documents otherwise.
app.post(
  "/v1/document/raw-text",
  [validApiKey],
  async (request, response) => {
    /*
    #swagger.tags = ['Documents']
    #swagger.description = 'Upload a file by specifying its raw text content and metadata values without having to upload a file.'
    #swagger.requestBody = {
      description: 'Text content and metadata of the file to be saved to the system. Use metadata-schema endpoint to get the possible metadata keys',
      required: true,
      type: 'object',
      content: {
        "application/json": {
          schema: {
            type: 'object',
            example: {
              "textContent": "This is the raw text that will be saved as a document in AnythingLLM.",
              "metadata": {
                keyOne: "valueOne",
                keyTwo: "valueTwo",
                etc: "etc"
              }
            }
          }
        }
      }
    }
    #swagger.responses[200] = {
      content: {
        "application/json": {
          schema: {
            type: 'object',
            example: {
              success: true,
              error: null,
              documents: [
                {
                  "id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
                  "url": "file://my-document.txt",
                  "title": "hello-world.txt",
                  "docAuthor": "no author found",
                  "description": "No description found.",
                  "docSource": "My custom description set during upload",
                  "chunkSource": "no chunk source specified",
                  "published": "1/16/2024, 3:46:33 PM",
                  "wordCount": 252,
                  "pageContent": "AnythingLLM is the best....",
                  "token_count_estimate": 447,
                  "location": "custom-documents/raw-my-doc-text-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
                }
              ]
            }
          }
        }
      }
    }
    #swagger.responses[403] = {
      schema: {
        "$ref": "#/definitions/InvalidAPIKey"
      }
    }
    */
    try {
      const requiredMetadata = ["title"];
      const { textContent, metadata: suppliedMetadata } = reqBody(request);

      // A destructuring default only covers `undefined`. An explicit JSON
      // `null` (or any non-object) must also be treated as "no metadata" so
      // the request is rejected with 422 below instead of throwing into the
      // 500 handler.
      const metadata =
        suppliedMetadata !== null &&
        typeof suppliedMetadata === "object" &&
        !Array.isArray(suppliedMetadata)
          ? suppliedMetadata
          : {};

      const processingOnline = await checkProcessorAlive();
      if (!processingOnline) {
        response
          .status(500)
          .json({
            success: false,
            error: `Document processing API is not online. Request will not be processed.`,
          })
          .end();
        return;
      }

      // Every required key must be present and hold a truthy value.
      const hasRequiredMetadata = requiredMetadata.every(
        (reqKey) =>
          Object.keys(metadata).includes(reqKey) && !!metadata[reqKey]
      );
      if (!hasRequiredMetadata) {
        response
          .status(422)
          .json({
            success: false,
            error: `You are missing required metadata key:value pairs in your request. Required metadata key:values are ${requiredMetadata
              .map((v) => `'${v}'`)
              .join(", ")}`,
          })
          .end();
        return;
      }

      if (!textContent || textContent?.length === 0) {
        response
          .status(422)
          .json({
            success: false,
            error: `The 'textContent' key cannot have an empty value.`,
          })
          .end();
        return;
      }

      const { success, reason, documents } = await processRawText(
        textContent,
        metadata
      );
      if (!success) {
        response
          .status(500)
          .json({ success: false, error: reason, documents })
          .end();
        return;
      }

      console.log(
        `Document created successfully. It is now available in documents.`
      );
      await Telemetry.sendTelemetry("raw_document_uploaded");
      await EventLogs.logEvent("api_raw_document_uploaded");
      response.status(200).json({ success: true, error: null, documents });
    } catch (e) {
      console.log(e.message, e);
      response.sendStatus(500).end();
    }
  }
);
|
||||
|
||||
app.get("/v1/documents", [validApiKey], async (_, response) => {
|
||||
/*
|
||||
#swagger.tags = ['Documents']
|
||||
|
@ -367,6 +502,56 @@ function apiDocumentEndpoints(app) {
|
|||
}
|
||||
}
|
||||
);
|
||||
|
||||
// GET /v1/document/metadata-schema
// Returns the metadata keys accepted by a raw-text upload together with a
// short description of the value each key accepts.
app.get(
  "/v1/document/metadata-schema",
  [validApiKey],
  async (_, response) => {
    /*
    #swagger.tags = ['Documents']
    #swagger.description = 'Get the known available metadata schema for when doing a raw-text upload and the acceptable type of value for each key.'
    #swagger.responses[200] = {
      content: {
        "application/json": {
          schema: {
            type: 'object',
            example: {
              "schema": {
                "keyOne": "string | number | nullable",
                "keyTwo": "string | number | nullable",
                "specialKey": "number",
                "title": "string",
              }
            }
          }
        }
      }
    }
    #swagger.responses[403] = {
      schema: {
        "$ref": "#/definitions/InvalidAPIKey"
      }
    }
    */

    // If you are updating this be sure to update the collector METADATA_KEYS
    // constant in /processRawText.
    const metadataSchema = {
      url: "string | nullable",
      title: "string",
      docAuthor: "string | nullable",
      description: "string | nullable",
      docSource: "string | nullable",
      chunkSource: "string | nullable",
      published: "epoch timestamp in ms | nullable",
    };

    try {
      response.status(200).json({ schema: metadataSchema });
    } catch (e) {
      console.log(e.message, e);
      response.sendStatus(500).end();
    }
  }
);
|
||||
}
|
||||
|
||||
module.exports = { apiDocumentEndpoints };
|
||||
|
|
|
@ -973,7 +973,7 @@
|
|||
"requestBody": {
|
||||
"description": "Link of web address to be scraped.",
|
||||
"required": true,
|
||||
"type": "file",
|
||||
"type": "object",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
|
@ -987,6 +987,96 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/v1/document/raw-text": {
|
||||
"post": {
|
||||
"tags": [
|
||||
"Documents"
|
||||
],
|
||||
"description": "Upload a file by specifying its raw text content and metadata values without having to upload a file.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "Authorization",
|
||||
"in": "header",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"example": {
|
||||
"success": true,
|
||||
"error": null,
|
||||
"documents": [
|
||||
{
|
||||
"id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
|
||||
"url": "file://my-document.txt",
|
||||
"title": "hello-world.txt",
|
||||
"docAuthor": "no author found",
|
||||
"description": "No description found.",
|
||||
"docSource": "My custom description set during upload",
|
||||
"chunkSource": "no chunk source specified",
|
||||
"published": "1/16/2024, 3:46:33 PM",
|
||||
"wordCount": 252,
|
||||
"pageContent": "AnythingLLM is the best....",
|
||||
"token_count_estimate": 447,
|
||||
"location": "custom-documents/raw-my-doc-text-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"403": {
|
||||
"description": "Forbidden",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/InvalidAPIKey"
|
||||
}
|
||||
},
|
||||
"application/xml": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/InvalidAPIKey"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"422": {
|
||||
"description": "Unprocessable Entity"
|
||||
},
|
||||
"500": {
|
||||
"description": "Internal Server Error"
|
||||
}
|
||||
},
|
||||
"requestBody": {
|
||||
"description": "Text content and metadata of the file to be saved to the system. Use metadata-schema endpoint to get the possible metadata keys",
|
||||
"required": true,
|
||||
"type": "object",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"example": {
|
||||
"textContent": "This is the raw text that will be saved as a document in AnythingLLM.",
|
||||
"metadata": {
|
||||
"keyOne": "valueOne",
|
||||
"keyTwo": "valueTwo",
|
||||
"etc": "etc"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/documents": {
|
||||
"get": {
|
||||
"tags": [
|
||||
|
@ -1195,6 +1285,61 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/v1/document/metadata-schema": {
|
||||
"get": {
|
||||
"tags": [
|
||||
"Documents"
|
||||
],
|
||||
"description": "Get the known available metadata schema for when doing a raw-text upload and the acceptable type of value for each key.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "Authorization",
|
||||
"in": "header",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"example": {
|
||||
"schema": {
|
||||
"keyOne": "string | number | nullable",
|
||||
"keyTwo": "string | number | nullable",
|
||||
"specialKey": "number",
|
||||
"title": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"403": {
|
||||
"description": "Forbidden",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/InvalidAPIKey"
|
||||
}
|
||||
},
|
||||
"application/xml": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/InvalidAPIKey"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Internal Server Error"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/workspace/new": {
|
||||
"post": {
|
||||
"tags": [
|
||||
|
|
|
@ -59,6 +59,25 @@ async function processLink(link = "") {
|
|||
});
|
||||
}
|
||||
|
||||
// Relays raw text and its metadata to the document processor's
// /process-raw-text endpoint as a JSON POST.
// Resolves to the processor's parsed JSON reply. A non-OK status or any
// failure while fetching or parsing is logged and resolves to
// `{ success: false, reason, documents: [] }` instead of rejecting.
async function processRawText(textContent = "", metadata = {}) {
  // Built before the try so an error while assembling the request is not
  // converted into a failure result.
  const endpoint = `${PROCESSOR_API}/process-raw-text`;
  const requestInit = {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ textContent, metadata }),
  };

  try {
    const reply = await fetch(endpoint, requestInit);
    if (!reply.ok) throw new Error("Response could not be completed");
    return await reply.json();
  } catch (e) {
    console.log(e.message);
    return { success: false, reason: e.message, documents: [] };
  }
}
|
||||
|
||||
// We will not ever expose the document processor to the frontend API so instead we relay
|
||||
// all requests through the server. You can use this function to directly expose a specific endpoint
|
||||
// on the document processor.
|
||||
|
@ -85,6 +104,7 @@ module.exports = {
|
|||
checkProcessorAlive,
|
||||
processDocument,
|
||||
processLink,
|
||||
processRawText,
|
||||
acceptedFileTypes,
|
||||
forwardExtensionRequest,
|
||||
};
|
||||
|
|
Loading…
Add table
Reference in a new issue