mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-05-02 17:07:13 +00:00
patch link scrape tool schema
This commit is contained in:
parent
085745c5e4
commit
c5dc68633b
2 changed files with 9 additions and 7 deletions
collector/scripts
|
@ -2,7 +2,7 @@ import os, json, tempfile
|
|||
from urllib.parse import urlparse
|
||||
from requests_html import HTMLSession
|
||||
from langchain.document_loaders import UnstructuredHTMLLoader
|
||||
from .link_utils import append_meta
|
||||
from .link_utils import append_meta
|
||||
from .utils import tokenize, ada_v2_cost
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
@ -47,10 +47,6 @@ def link():
|
|||
os.makedirs(transaction_output_dir)
|
||||
|
||||
full_text = append_meta(req, full_text)
|
||||
tokenCount = len(tokenize(full_text))
|
||||
link['pageContent'] = full_text
|
||||
link['token_count_estimate'] = tokenCount
|
||||
|
||||
with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
|
||||
json.dump(link, file, ensure_ascii=True, indent=4)
|
||||
|
||||
|
@ -159,8 +155,6 @@ def parse_links(links):
|
|||
|
||||
full_text = append_meta(req, full_text)
|
||||
tokenCount = len(tokenize(full_text))
|
||||
link['pageContent'] = full_text
|
||||
link['token_count_estimate'] = tokenCount
|
||||
totalTokens += tokenCount
|
||||
|
||||
with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
|
||||
|
|
|
@ -1,14 +1,22 @@
|
|||
import json
|
||||
from datetime import datetime
|
||||
from dotenv import load_dotenv
|
||||
from .watch.utils import guid
|
||||
from .utils import tokenize
|
||||
load_dotenv()
|
||||
|
||||
def append_meta(request, text, metadata_only=False):
    """Build metadata for a scraped web page and optionally prepend it to the text.

    Args:
        request: Response-like object exposing ``url`` and an ``html``
            attribute whose ``find(selector, first=True)`` returns the first
            matching element or ``None`` (presumably a requests_html
            response -- confirm against callers).
        text: Extracted page text the metadata describes.
        metadata_only: When truthy, return the metadata dict itself instead
            of the annotated text.

    Returns:
        The metadata ``dict`` when ``metadata_only`` is truthy; otherwise a
        string made of an "Article JSON Metadata" header, the JSON-encoded
        metadata, and the original text.
    """
    html = request.html

    # Query each selector once and reuse the result rather than searching
    # the document twice (once to test for presence, once to read).
    title_element = html.find('title', first=True)
    title = title_element.text if title_element is not None else ''

    # A meta tag that exists but has no ``content`` attribute is treated the
    # same as a missing tag, so the metadata never carries a null value.
    description = _first_meta_content(html, 'meta[name="description"]')
    if description is None:
        description = ''

    published = _first_meta_content(
        html, 'meta[property="article:published_time"]'
    )
    if published is None:
        # No publication time on the page: fall back to the scrape time.
        published = datetime.today().strftime('%Y-%m-%d %H:%M:%S')

    meta = {
        'id': guid(),
        'url': request.url,
        'title': title,
        'docAuthor': 'N/A',
        'docSource': 'webpage',
        'chunkSource': request.url,
        'description': description,
        'published': published,
        # Splits on single spaces only, so consecutive spaces and empty text
        # still contribute entries to the count.
        'wordCount': len(text.split(' ')),
        'pageContent': text,
        'token_count_estimate': len(tokenize(text)),
    }

    if metadata_only:
        return meta
    return "Article JSON Metadata:\n" + json.dumps(meta) + "\n\n\nText Content:\n" + text


def _first_meta_content(html, selector):
    """Return the ``content`` attribute of the first match for ``selector``, or ``None``."""
    element = html.find(selector, first=True)
    if element is None:
        return None
    return element.attrs.get('content')
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue