patch link scrape tool schema

This commit is contained in:
timothycarambat 2023-11-14 16:41:39 -08:00
parent 085745c5e4
commit c5dc68633b
2 changed files with 9 additions and 7 deletions
collector/scripts

View file

@ -2,7 +2,7 @@ import os, json, tempfile
from urllib.parse import urlparse
from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta
from .link_utils import append_meta
from .utils import tokenize, ada_v2_cost
import requests
from bs4 import BeautifulSoup
@ -47,10 +47,6 @@ def link():
os.makedirs(transaction_output_dir)
full_text = append_meta(req, full_text)
tokenCount = len(tokenize(full_text))
link['pageContent'] = full_text
link['token_count_estimate'] = tokenCount
with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
json.dump(link, file, ensure_ascii=True, indent=4)
@ -159,8 +155,6 @@ def parse_links(links):
full_text = append_meta(req, full_text)
tokenCount = len(tokenize(full_text))
link['pageContent'] = full_text
link['token_count_estimate'] = tokenCount
totalTokens += tokenCount
with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:

View file

@ -1,14 +1,22 @@
import json
from datetime import datetime
from dotenv import load_dotenv
from .watch.utils import guid
from .utils import tokenize
load_dotenv()
def append_meta(request, text, metadata_only = False):
    """Build a metadata record for a scraped web page.

    Args:
        request: a requests_html response whose ``.html`` is the parsed DOM
            and whose ``.url`` is the page URL.  # assumes requests_html HTMLResponse — TODO confirm at caller
        text: the extracted page text.
        metadata_only: when True, return the metadata dict itself; when
            False (default), return a single string with the JSON metadata
            header prepended to the text.

    Returns:
        dict when ``metadata_only`` is truthy, otherwise str.
    """
    # Hoist each DOM query so every selector is evaluated exactly once
    # (the original re-ran the same .find() per conditional branch).
    title_el = request.html.find('title', first=True)
    description_el = request.html.find('meta[name="description"]', first=True)
    published_el = request.html.find('meta[property="article:published_time"]', first=True)

    meta = {
        'id': guid(),
        'url': request.url,
        'title': title_el.text if title_el is not None else '',
        'docAuthor': 'N/A',
        'docSource': 'webpage',
        'chunkSource': request.url,
        'description': description_el.attrs.get('content') if description_el is not None else '',
        # Fall back to "now" when the page declares no publish time.
        'published': published_el.attrs.get('content') if published_el is not None else datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
        'wordCount': len(text.split(' ')),
        'pageContent': text,
        'token_count_estimate': len(tokenize(text)),
    }
    if metadata_only:
        return meta
    return "Article JSON Metadata:\n"+json.dumps(meta)+"\n\n\nText Content:\n" + text