mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-04-17 18:18:11 +00:00
Support more Confluence URL formats (#2118)
* support more Confluence URL formats
* use pattern matching for Confluence URLs and manual splitting as fallback
* rework entire Confluence flow to prevent issues with custom, local, and cloud spaces
* remove dep

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
parent
44dddcd4af
commit
4488744850
5 changed files with 77 additions and 113 deletions
collector
frontend/src
components/Modals/ManageWorkspace/DataConnectors/Connectors/Confluence
models
|
@ -59,6 +59,7 @@ async function resyncConfluence({ chunkSource }, response) {
|
|||
const { success, reason, content } = await fetchConfluencePage({
|
||||
pageUrl: `https:${source.pathname}`, // need to add back the real protocol
|
||||
baseUrl: source.searchParams.get('baseUrl'),
|
||||
spaceKey: source.searchParams.get('spaceKey'),
|
||||
accessToken: source.searchParams.get('token'),
|
||||
username: source.searchParams.get('username'),
|
||||
});
|
||||
|
|
|
@ -72,8 +72,9 @@ class ConfluencePagesLoader {
|
|||
}
|
||||
}
|
||||
|
||||
// https://developer.atlassian.com/cloud/confluence/rest/v2/intro/#auth
|
||||
async fetchAllPagesInSpace(start = 0, limit = this.limit) {
|
||||
const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`;
|
||||
const url = `${this.baseUrl}/wiki/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`;
|
||||
const data = await this.fetchConfluenceData(url);
|
||||
if (data.size === 0) {
|
||||
return [];
|
||||
|
|
|
@ -2,7 +2,6 @@ const fs = require("fs");
|
|||
const path = require("path");
|
||||
const { default: slugify } = require("slugify");
|
||||
const { v4 } = require("uuid");
|
||||
const UrlPattern = require("url-pattern");
|
||||
const { writeToServerDocuments, sanitizeFileName } = require("../../files");
|
||||
const { tokenizeString } = require("../../tokenizer");
|
||||
const { ConfluencePagesLoader } = require("./ConfluenceLoader");
|
||||
|
@ -13,8 +12,11 @@ const { ConfluencePagesLoader } = require("./ConfluenceLoader");
|
|||
* @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
|
||||
* @returns
|
||||
*/
|
||||
async function loadConfluence({ pageUrl, username, accessToken }, response) {
|
||||
if (!pageUrl || !username || !accessToken) {
|
||||
async function loadConfluence(
|
||||
{ baseUrl = null, spaceKey = null, username = null, accessToken = null },
|
||||
response
|
||||
) {
|
||||
if (!baseUrl || !spaceKey || !username || !accessToken) {
|
||||
return {
|
||||
success: false,
|
||||
reason:
|
||||
|
@ -22,19 +24,24 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
|
|||
};
|
||||
}
|
||||
|
||||
const { valid, result } = validSpaceUrl(pageUrl);
|
||||
if (!valid) {
|
||||
if (!validBaseUrl(baseUrl)) {
|
||||
return {
|
||||
success: false,
|
||||
reason:
|
||||
"Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/* or https://customDomain/display/~SPACEID/*",
|
||||
reason: "Provided base URL is not a valid URL.",
|
||||
};
|
||||
}
|
||||
|
||||
const { apiBase: baseUrl, spaceKey, subdomain } = result;
|
||||
console.log(`-- Working Confluence ${baseUrl} --`);
|
||||
if (!spaceKey) {
|
||||
return {
|
||||
success: false,
|
||||
reason: "You need to provide a Confluence space key.",
|
||||
};
|
||||
}
|
||||
|
||||
const { origin, hostname } = new URL(baseUrl);
|
||||
console.log(`-- Working Confluence ${origin} --`);
|
||||
const loader = new ConfluencePagesLoader({
|
||||
baseUrl,
|
||||
baseUrl: origin, // Use the origin to avoid issues with subdomains, ports, protocols, etc.
|
||||
spaceKey,
|
||||
username,
|
||||
accessToken,
|
||||
|
@ -59,7 +66,7 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
|
|||
};
|
||||
}
|
||||
const outFolder = slugify(
|
||||
`${subdomain}-confluence-${v4().slice(0, 4)}`
|
||||
`confluence-${origin}-${v4().slice(0, 4)}`
|
||||
).toLowerCase();
|
||||
|
||||
const outFolderPath =
|
||||
|
@ -80,11 +87,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
|
|||
id: v4(),
|
||||
url: doc.metadata.url + ".page",
|
||||
title: doc.metadata.title || doc.metadata.source,
|
||||
docAuthor: subdomain,
|
||||
docAuthor: origin,
|
||||
description: doc.metadata.title,
|
||||
docSource: `${subdomain} Confluence`,
|
||||
docSource: `${origin} Confluence`,
|
||||
chunkSource: generateChunkSource(
|
||||
{ doc, baseUrl, accessToken, username },
|
||||
{ doc, baseUrl: origin, spaceKey, accessToken, username },
|
||||
response.locals.encryptionWorker
|
||||
),
|
||||
published: new Date().toLocaleString(),
|
||||
|
@ -120,10 +127,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
|
|||
async function fetchConfluencePage({
|
||||
pageUrl,
|
||||
baseUrl,
|
||||
spaceKey,
|
||||
username,
|
||||
accessToken,
|
||||
}) {
|
||||
if (!pageUrl || !baseUrl || !username || !accessToken) {
|
||||
if (!pageUrl || !baseUrl || !spaceKey || !username || !accessToken) {
|
||||
return {
|
||||
success: false,
|
||||
content: null,
|
||||
|
@ -132,20 +140,25 @@ async function fetchConfluencePage({
|
|||
};
|
||||
}
|
||||
|
||||
const { valid, result } = validSpaceUrl(pageUrl);
|
||||
if (!valid) {
|
||||
if (!validBaseUrl(baseUrl)) {
|
||||
return {
|
||||
success: false,
|
||||
content: null,
|
||||
reason:
|
||||
"Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*",
|
||||
reason: "Provided base URL is not a valid URL.",
|
||||
};
|
||||
}
|
||||
|
||||
if (!spaceKey) {
|
||||
return {
|
||||
success: false,
|
||||
content: null,
|
||||
reason: "You need to provide a Confluence space key.",
|
||||
};
|
||||
}
|
||||
|
||||
console.log(`-- Working Confluence Page ${pageUrl} --`);
|
||||
const { spaceKey } = result;
|
||||
const loader = new ConfluencePagesLoader({
|
||||
baseUrl,
|
||||
baseUrl, // Should be the origin of the baseUrl
|
||||
spaceKey,
|
||||
username,
|
||||
accessToken,
|
||||
|
@ -190,91 +203,17 @@ async function fetchConfluencePage({
|
|||
}
|
||||
|
||||
/**
|
||||
* A match result for a url-pattern of a Confluence URL
|
||||
* @typedef {Object} ConfluenceMatchResult
|
||||
* @property {string} subdomain - the subdomain of an organization's Confluence space
|
||||
* @property {string} spaceKey - the spaceKey of an organization that determines the documents to collect.
|
||||
* @property {string} apiBase - the correct REST API url to use for loader.
|
||||
* Validates if the provided baseUrl is a valid URL at all.
|
||||
* @param {string} baseUrl
|
||||
* @returns {boolean}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Generates the correct API base URL for interfacing with the Confluence REST API
|
||||
* depending on the URL pattern being used since there are various ways to host/access a
|
||||
* Confluence space.
|
||||
* @param {ConfluenceMatchResult} matchResult - result from `url-pattern`.match
|
||||
* @param {boolean} isCustomDomain - determines if we need to coerce the subpath of the provided URL
|
||||
* @returns {string} - the resulting REST API URL
|
||||
*/
|
||||
function generateAPIBaseUrl(matchResult = {}, isCustomDomain = false) {
|
||||
const { subdomain } = matchResult;
|
||||
if (isCustomDomain) return `https://${subdomain}`;
|
||||
return `https://${subdomain}.atlassian.net/wiki`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates and parses the correct information from a given Confluence URL
|
||||
* @param {string} spaceUrl - The organization's Confluence URL to parse
|
||||
* @returns {{
|
||||
* valid: boolean,
|
||||
* result: (ConfluenceMatchResult|null),
|
||||
* }}
|
||||
*/
|
||||
function validSpaceUrl(spaceUrl = "") {
|
||||
let matchResult;
|
||||
const patterns = {
|
||||
default: new UrlPattern(
|
||||
"https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
|
||||
),
|
||||
subdomain: new UrlPattern(
|
||||
"https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*"
|
||||
),
|
||||
custom: new UrlPattern(
|
||||
"https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*"
|
||||
),
|
||||
};
|
||||
|
||||
// If using the default Atlassian Confluence URL pattern.
|
||||
// We can proceed because the Library/API can use this base url scheme.
|
||||
matchResult = patterns.default.match(spaceUrl);
|
||||
if (matchResult)
|
||||
return {
|
||||
valid: matchResult.hasOwnProperty("spaceKey"),
|
||||
result: {
|
||||
...matchResult,
|
||||
apiBase: generateAPIBaseUrl(matchResult),
|
||||
},
|
||||
};
|
||||
|
||||
// If using a custom subdomain Confluence URL pattern.
|
||||
// We need to attach the customDomain as a property to the match result
|
||||
// so we can form the correct REST API base from the subdomain.
|
||||
matchResult = patterns.subdomain.match(spaceUrl);
|
||||
if (matchResult) {
|
||||
return {
|
||||
valid: matchResult.hasOwnProperty("spaceKey"),
|
||||
result: {
|
||||
...matchResult,
|
||||
apiBase: generateAPIBaseUrl(matchResult),
|
||||
},
|
||||
};
|
||||
function validBaseUrl(baseUrl) {
|
||||
try {
|
||||
new URL(baseUrl);
|
||||
return true;
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// If using a base FQDN Confluence URL pattern.
|
||||
// We need to attach the customDomain as a property to the match result
|
||||
// so we can form the correct REST API base from the root domain since /display/ is basically a URL mask.
|
||||
matchResult = patterns.custom.match(spaceUrl);
|
||||
if (matchResult) {
|
||||
return {
|
||||
valid: matchResult.hasOwnProperty("spaceKey"),
|
||||
result: {
|
||||
...matchResult,
|
||||
apiBase: generateAPIBaseUrl(matchResult, true),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
// No match
|
||||
return { valid: false, result: null };
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -286,11 +225,12 @@ function validSpaceUrl(spaceUrl = "") {
|
|||
* @returns {string}
|
||||
*/
|
||||
function generateChunkSource(
|
||||
{ doc, baseUrl, accessToken, username },
|
||||
{ doc, baseUrl, spaceKey, accessToken, username },
|
||||
encryptionWorker
|
||||
) {
|
||||
const payload = {
|
||||
baseUrl,
|
||||
spaceKey,
|
||||
token: accessToken,
|
||||
username,
|
||||
};
|
||||
|
|
|
@ -22,7 +22,8 @@ export default function ConfluenceOptions() {
|
|||
}
|
||||
);
|
||||
const { data, error } = await System.dataConnectors.confluence.collect({
|
||||
pageUrl: form.get("pageUrl"),
|
||||
baseUrl: form.get("baseUrl"),
|
||||
spaceKey: form.get("spaceKey"),
|
||||
username: form.get("username"),
|
||||
accessToken: form.get("accessToken"),
|
||||
});
|
||||
|
@ -56,17 +57,37 @@ export default function ConfluenceOptions() {
|
|||
<div className="flex flex-col pr-10">
|
||||
<div className="flex flex-col gap-y-1 mb-4">
|
||||
<label className="text-white text-sm font-bold flex gap-x-2 items-center">
|
||||
<p className="font-bold text-white">Confluence Page URL</p>
|
||||
<p className="font-bold text-white">Confluence base URL</p>
|
||||
</label>
|
||||
<p className="text-xs font-normal text-white/50">
|
||||
URL of a page in the Confluence space.
|
||||
This is the base URL of your Confluence space.
|
||||
</p>
|
||||
</div>
|
||||
<input
|
||||
type="url"
|
||||
name="pageUrl"
|
||||
name="baseUrl"
|
||||
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
|
||||
placeholder="https://example.atlassian.net/wiki/spaces/~7120208c08555d52224113949698b933a3bb56/pages/851969/Test+anythingLLM+page"
|
||||
placeholder="eg: https://example.atlassian.net, http://localhost:8211, etc..."
|
||||
required={true}
|
||||
autoComplete="off"
|
||||
spellCheck={false}
|
||||
/>
|
||||
</div>
|
||||
<div className="flex flex-col pr-10">
|
||||
<div className="flex flex-col gap-y-1 mb-4">
|
||||
<label className="text-white text-sm font-bold">
|
||||
Confluence space key
|
||||
</label>
|
||||
<p className="text-xs font-normal text-white/50">
|
||||
This is the spaces key of your confluence instance that will
|
||||
be used. Usually begins with ~
|
||||
</p>
|
||||
</div>
|
||||
<input
|
||||
type="text"
|
||||
name="spaceKey"
|
||||
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
|
||||
placeholder="eg: ~7120208c08555d52224113949698b933a3bb56"
|
||||
required={true}
|
||||
autoComplete="off"
|
||||
spellCheck={false}
|
||||
|
|
|
@ -119,12 +119,13 @@ const DataConnector = {
|
|||
},
|
||||
|
||||
confluence: {
|
||||
collect: async function ({ pageUrl, username, accessToken }) {
|
||||
collect: async function ({ baseUrl, spaceKey, username, accessToken }) {
|
||||
return await fetch(`${API_BASE}/ext/confluence`, {
|
||||
method: "POST",
|
||||
headers: baseHeaders(),
|
||||
body: JSON.stringify({
|
||||
pageUrl,
|
||||
baseUrl,
|
||||
spaceKey,
|
||||
username,
|
||||
accessToken,
|
||||
}),
|
||||
|
|
Loading…
Add table
Reference in a new issue