mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-03-14 22:22:22 +00:00
Adding url uploads to document picker (#375)
* WIP adding url uploads to document picker * fix manual script for uploading url to custom-documents * fix metadata for url scraping * wip url parsing * update how async link scraping works * docker-compose defaults added * no autocomplete on URLs --------- Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
parent
d766d128a2
commit
7edfccaf9a
12 changed files with 227 additions and 28 deletions
|
@ -11,5 +11,6 @@ collector/outputs/**
|
|||
**/__pycache__/
|
||||
**/.env
|
||||
**/.env.*
|
||||
**/bundleinspector.html
|
||||
!docker/.env.example
|
||||
!frontend/.env.production
|
|
@ -2,6 +2,7 @@ import os
|
|||
from flask import Flask, json, request
|
||||
from scripts.watch.process_single import process_single
|
||||
from scripts.watch.filetypes import ACCEPTED_MIMES
|
||||
from scripts.link import process_single_link
|
||||
api = Flask(__name__)
|
||||
|
||||
WATCH_DIRECTORY = "hotdir"
|
||||
|
@ -13,6 +14,15 @@ def process_file():
|
|||
success, reason = process_single(WATCH_DIRECTORY, target_filename)
|
||||
return json.dumps({'filename': target_filename, 'success': success, 'reason': reason})
|
||||
|
||||
@api.route('/process-link', methods=['POST'])
async def process_link():
    """Scrape a single URL submitted as JSON ({"link": ...}) and report the outcome."""
    # Pull the target URL out of the JSON request body.
    payload = request.json
    target_url = payload.get('link')
    print(f"Processing {target_url}")
    success, reason = await process_single_link(target_url)
    return json.dumps({'url': target_url, 'success': success, 'reason': reason})
|
||||
|
||||
|
||||
@api.route('/accepts', methods=['GET'])
def get_accepted_filetypes():
    """Expose the collector's accepted MIME types as a JSON payload."""
    return json.dumps(ACCEPTED_MIMES)
|
||||
|
|
|
@ -5,6 +5,7 @@ alive-progress==3.1.2
|
|||
anyio==3.7.0
|
||||
appdirs==1.4.4
|
||||
argilla==1.8.0
|
||||
asgiref==3.7.2
|
||||
async-timeout==4.0.2
|
||||
attrs==23.1.0
|
||||
backoff==2.2.1
|
||||
|
|
|
@ -2,7 +2,7 @@ import os, json, tempfile
|
|||
from urllib.parse import urlparse
|
||||
from requests_html import HTMLSession
|
||||
from langchain.document_loaders import UnstructuredHTMLLoader
|
||||
from .link_utils import append_meta
|
||||
from .link_utils import append_meta, AsyncHTMLSessionFixed
|
||||
from .utils import tokenize, ada_v2_cost
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
@ -39,8 +39,8 @@ def link():
|
|||
output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
|
||||
output_path = f"./outputs/website-logs"
|
||||
|
||||
transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
|
||||
transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
|
||||
transaction_output_filename = f"website-{source.path.replace('/','_')}.json"
|
||||
transaction_output_dir = f"../server/storage/documents/custom-documents"
|
||||
|
||||
if os.path.isdir(output_path) == False:
|
||||
os.makedirs(output_path)
|
||||
|
@ -64,6 +64,57 @@ def link():
|
|||
print(f"////////////////////////////")
|
||||
exit(0)
|
||||
|
||||
async def process_single_link(url):
    """Fetch `url` with a headless browser, extract its text content, and
    persist it as a custom-document JSON file for the server.

    Args:
        url: The web page URL to scrape.

    Returns:
        A (success: bool, reason: str) tuple describing the outcome.
    """
    session = None
    try:
        print(f"Working on {url}...")
        session = AsyncHTMLSessionFixed()
        req = await session.get(url)
        # Render the page so JS-generated content is present in the HTML.
        await req.html.arender()
        await session.close()
        session = None  # Closed cleanly; nothing for the except handler to reap.

        if not req.ok:
            return False, "Could not reach this URL."

        # Round-trip the rendered HTML through a temp file so the
        # Unstructured loader can parse it from disk.
        full_text = None
        with tempfile.NamedTemporaryFile(mode="w") as tmp:
            tmp.write(req.html.html)
            tmp.flush()  # Ensure the loader sees the full document on disk.
            loader = UnstructuredHTMLLoader(tmp.name)
            data = loader.load()[0]
            full_text = data.page_content

        if not full_text:
            return False, "Could not parse any meaningful data from this URL."

        # metadata_only=True returns the metadata dict rather than a text blob.
        link_meta = append_meta(req, full_text, True)

        source = urlparse(req.url)
        transaction_output_dir = "../server/storage/documents/custom-documents"
        transaction_output_filename = f"website-{source.netloc}-{source.path.replace('/', '_')}.json"

        os.makedirs(transaction_output_dir, exist_ok=True)
        file_path = os.path.join(transaction_output_dir, transaction_output_filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(link_meta, file, ensure_ascii=False, indent=4)

        return True, "Content fetched and saved."
    except Exception as e:
        if session is not None:
            # Original code forgot to await this, leaking the coroutine and
            # leaving the browser session hanging.
            await session.close()
        return False, str(e)
|
||||
|
||||
def crawler():
|
||||
prompt = "Paste in root URI of the pages of interest: "
|
||||
new_link = input(prompt)
|
||||
|
@ -146,8 +197,8 @@ def parse_links(links):
|
|||
output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
|
||||
output_path = f"./outputs/website-logs"
|
||||
|
||||
transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
|
||||
transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
|
||||
transaction_output_filename = f"website-{source.path.replace('/','_')}.json"
|
||||
transaction_output_dir = f"../server/storage/documents/custom-documents"
|
||||
|
||||
if not os.path.isdir(output_path):
|
||||
os.makedirs(output_path)
|
||||
|
|
|
@ -1,22 +1,45 @@
|
|||
import json
|
||||
import json, pyppeteer
|
||||
from datetime import datetime
|
||||
from .watch.utils import guid
|
||||
from dotenv import load_dotenv
|
||||
from .watch.utils import guid
|
||||
from .utils import tokenize
|
||||
from requests_html import AsyncHTMLSession
|
||||
|
||||
load_dotenv()
|
||||
|
||||
def normalize_url(url):
    """Ensure the URL carries a trailing ``.web`` marker (idempotent)."""
    return url if url.endswith('.web') else f"{url}.web"
|
||||
|
||||
def append_meta(request, text, metadata_only=False):
    """Build the document metadata for a scraped web page.

    Args:
        request: A rendered requests-html response (provides .url and .html).
        text: The extracted page text content.
        metadata_only: When True, return the metadata dict itself; otherwise
            return a combined "metadata + text" string blob.

    Returns:
        dict when metadata_only is True, else str.
    """
    # Hoist the element lookups so each selector runs only once.
    title_el = request.html.find('title', first=True)
    desc_el = request.html.find('meta[name="description"]', first=True)
    published_el = request.html.find('meta[property="article:published_time"]', first=True)

    meta = {
        'id': guid(),
        'url': normalize_url(request.url),
        'title': title_el.text if title_el is not None else '',
        'docAuthor': 'N/A',
        'description': desc_el.attrs.get('content') if desc_el is not None else '',
        'docSource': 'web page',
        'chunkSource': request.url,
        # Fall back to "now" when the page declares no publish time.
        'published': published_el.attrs.get('content') if published_el is not None else datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
        'wordCount': len(text.split(' ')),
        'pageContent': text,
        'token_count_estimate': len(tokenize(text)),
    }
    return meta if metadata_only else "Article JSON Metadata:\n" + json.dumps(meta) + "\n\n\nText Content:\n" + text
||||
|
||||
class AsyncHTMLSessionFixed(AsyncHTMLSession):
    """AsyncHTMLSession variant that launches pyppeteer with explicit flags
    (no sandbox by default, signal handlers disabled) so rendering works
    inside containers and off the main thread.

    pip3 install websockets==6.0 --force-reinstall
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Callers may override the chromium flags via browser_args=[...].
        self.__browser_args = kwargs.get("browser_args", ["--no-sandbox"])

    @property
    async def browser(self):
        # Lazily launch the headless browser on first access and cache it.
        if not hasattr(self, "_browser"):
            self._browser = await pyppeteer.launch(
                ignoreHTTPSErrors=not self.verify,
                headless=True,
                handleSIGINT=False,
                handleSIGTERM=False,
                handleSIGHUP=False,
                args=self.__browser_args,
            )
        return self._browser
|
|
@ -15,14 +15,14 @@ services:
|
|||
context: ../.
|
||||
dockerfile: ./docker/Dockerfile
|
||||
args:
|
||||
ARG_UID: ${UID}
|
||||
ARG_GID: ${GID}
|
||||
ARG_UID: ${UID:-1000}
|
||||
ARG_GID: ${GID:-1000}
|
||||
volumes:
|
||||
- "./.env:/app/server/.env"
|
||||
- "../server/storage:/app/server/storage"
|
||||
- "../collector/hotdir/:/app/collector/hotdir"
|
||||
- "../collector/outputs/:/app/collector/outputs"
|
||||
user: "${UID}:${GID}"
|
||||
user: "${UID:-1000}:${GID:-1000}"
|
||||
ports:
|
||||
- "3001:3001"
|
||||
env_file:
|
||||
|
|
|
@ -3,6 +3,7 @@ import PreLoader from "../../../../Preloader";
|
|||
import { useEffect, useState } from "react";
|
||||
import FolderRow from "./FolderRow";
|
||||
import pluralize from "pluralize";
|
||||
import Workspace from "../../../../../models/workspace";
|
||||
|
||||
export default function Directory({
|
||||
files,
|
||||
|
@ -139,6 +140,7 @@ export default function Directory({
|
|||
fileTypes={fileTypes}
|
||||
workspace={workspace}
|
||||
fetchKeys={fetchKeys}
|
||||
setLoading={setLoading}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -5,10 +5,38 @@ import System from "../../../../../models/system";
|
|||
import { useDropzone } from "react-dropzone";
|
||||
import { v4 } from "uuid";
|
||||
import FileUploadProgress from "./FileUploadProgress";
|
||||
import Workspace from "../../../../../models/workspace";
|
||||
|
||||
export default function UploadFile({ workspace, fileTypes, fetchKeys }) {
|
||||
export default function UploadFile({
|
||||
workspace,
|
||||
fileTypes,
|
||||
fetchKeys,
|
||||
setLoading,
|
||||
}) {
|
||||
const [ready, setReady] = useState(false);
|
||||
const [files, setFiles] = useState([]);
|
||||
const [fetchingUrl, setFetchingUrl] = useState(false);
|
||||
|
||||
const handleSendLink = async (e) => {
|
||||
e.preventDefault();
|
||||
setLoading(true);
|
||||
setFetchingUrl(true);
|
||||
const formEl = e.target;
|
||||
const form = new FormData(formEl);
|
||||
const { response, data } = await Workspace.uploadLink(
|
||||
workspace.slug,
|
||||
form.get("link")
|
||||
);
|
||||
if (!response.ok) {
|
||||
showToast(`Error uploading link: ${data.error}`, "error");
|
||||
} else {
|
||||
fetchKeys(true);
|
||||
showToast("Link uploaded successfully", "success");
|
||||
formEl.reset();
|
||||
}
|
||||
setLoading(false);
|
||||
setFetchingUrl(false);
|
||||
};
|
||||
|
||||
const handleUploadSuccess = () => {
|
||||
fetchKeys(true);
|
||||
|
@ -103,6 +131,26 @@ export default function UploadFile({ workspace, fileTypes, fetchKeys }) {
|
|||
</div>
|
||||
)}
|
||||
</div>
|
||||
<div className="text-center text-white text-opacity-50 text-xs font-medium w-[560px] py-2">
|
||||
or submit a link
|
||||
</div>
|
||||
<form onSubmit={handleSendLink} className="flex gap-x-2">
|
||||
<input
|
||||
disabled={fetchingUrl}
|
||||
name="link"
|
||||
type="url"
|
||||
className="disabled:bg-zinc-600 disabled:text-slate-300 bg-zinc-900 text-white text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-3/4 p-2.5"
|
||||
placeholder={"https://example.com"}
|
||||
autoComplete="off"
|
||||
/>
|
||||
<button
|
||||
disabled={fetchingUrl}
|
||||
type="submit"
|
||||
className="disabled:bg-white/20 disabled:text-slate-300 disabled:border-slate-400 disabled:cursor-wait bg bg-transparent hover:bg-slate-200 hover:text-slate-800 w-auto border border-white text-sm text-white p-2.5 rounded-lg transition-all duration-300"
|
||||
>
|
||||
{fetchingUrl ? "Fetching..." : "Fetch website"}
|
||||
</button>
|
||||
</form>
|
||||
<div className="mt-6 text-center text-white text-opacity-80 text-xs font-medium w-[560px]">
|
||||
These files will be uploaded to the document processor running on this
|
||||
AnythingLLM instance. These files are not sent or shared with a third
|
||||
|
|
|
@ -138,6 +138,16 @@ const Workspace = {
|
|||
const data = await response.json();
|
||||
return { response, data };
|
||||
},
|
||||
  // POST a user-submitted URL to the workspace's upload-link endpoint and
  // hand back both the raw response and its parsed JSON body so callers
  // can inspect response.ok alongside the payload.
  uploadLink: async function (slug, link) {
    const response = await fetch(`${API_BASE}/workspace/${slug}/upload-link`, {
      method: "POST",
      body: JSON.stringify({ link }),
      headers: baseHeaders(),
    });

    const data = await response.json();
    return { response, data };
  },
|
||||
|
||||
// TODO: Deprecated and should be removed from frontend.
|
||||
sendChat: async function ({ slug }, message, mode = "query") {
|
||||
|
|
|
@ -8,8 +8,7 @@ export function formatDate(dateString) {
|
|||
}
|
||||
|
||||
// Return the text after the final "." in a path; optional chaining guards
// null/undefined inputs, falling back to "file" when nothing usable exists.
// (Removes the unreachable pre-diff regex implementation left above the
// new return statement.)
export function getFileExtension(path) {
  return path?.split(".")?.slice(-1)?.[0] || "file";
}
|
||||
|
||||
export function truncate(str, n) {
|
||||
|
|
|
@ -9,6 +9,7 @@ const { setupMulter } = require("../utils/files/multer");
|
|||
const {
|
||||
checkPythonAppAlive,
|
||||
processDocument,
|
||||
processLink,
|
||||
} = require("../utils/files/documentProcessor");
|
||||
const { validatedRequest } = require("../utils/middleware/validatedRequest");
|
||||
const { Telemetry } = require("../models/telemetry");
|
||||
|
@ -107,6 +108,38 @@ function workspaceEndpoints(app) {
|
|||
}
|
||||
);
|
||||
|
||||
app.post(
|
||||
"/workspace/:slug/upload-link",
|
||||
[validatedRequest],
|
||||
async (request, response) => {
|
||||
const { link = "" } = reqBody(request);
|
||||
const processingOnline = await checkPythonAppAlive();
|
||||
|
||||
if (!processingOnline) {
|
||||
response
|
||||
.status(500)
|
||||
.json({
|
||||
success: false,
|
||||
error: `Python processing API is not online. Link ${link} will not be processed automatically.`,
|
||||
})
|
||||
.end();
|
||||
return;
|
||||
}
|
||||
|
||||
const { success, reason } = await processLink(link);
|
||||
if (!success) {
|
||||
response.status(500).json({ success: false, error: reason }).end();
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(
|
||||
`Link ${link} uploaded processed and successfully. It is now available in documents.`
|
||||
);
|
||||
await Telemetry.sendTelemetry("link_uploaded");
|
||||
response.status(200).json({ success: true, error: null });
|
||||
}
|
||||
);
|
||||
|
||||
app.post(
|
||||
"/workspace/:slug/update-embeddings",
|
||||
[validatedRequest],
|
||||
|
|
|
@ -39,8 +39,29 @@ async function processDocument(filename = "") {
|
|||
});
|
||||
}
|
||||
|
||||
// Ask the Python collector to scrape and ingest a single URL.
// Always resolves to an object shaped like { success, reason } so callers
// can safely destructure the result (the original returned a bare `false`
// for empty links, which made `const { success, reason } = ...` yield
// undefined for both fields).
async function processLink(link = "") {
  if (!link) return { success: false, reason: "No link provided" };
  return await fetch(`${PYTHON_API}/process-link`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ link }),
  })
    .then((res) => {
      if (!res.ok) throw new Error("Response could not be completed");
      return res.json();
    })
    .catch((e) => {
      console.log(e.message);
      return { success: false, reason: e.message };
    });
}
|
||||
|
||||
module.exports = {
|
||||
checkPythonAppAlive,
|
||||
processDocument,
|
||||
processLink,
|
||||
acceptedFileTypes,
|
||||
};
|
||||
|
|
Loading…
Add table
Reference in a new issue