mirror of https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-03-15 22:52:23 +00:00
Add id to all metadata to prevent errors in frontend document picker (#378)
add id to all metadata to prevent errors in frontend document picker

Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
parent
73f342eb19
commit
f40309cfdb
3 changed files with 20 additions and 14 deletions
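The change is the same across all three collector scripts below: each imports guid from .watch.utils and stamps a unique 'id' onto the metadata dict it emits, so the frontend document picker has a stable identifier for every collected document. As a minimal sketch of that pattern (not the actual helper: the implementation assumes guid() simply wraps uuid4, and the real .watch.utils.guid may differ):

from uuid import uuid4

def guid():
  # Assumed implementation for illustration; the actual .watch.utils.guid may differ.
  return str(uuid4())

def append_meta(publication, text):
  # Every metadata dict now carries a unique 'id'; the other fields shown
  # match the Substack collector excerpt below, remaining fields and the
  # handling of `text` are elided here.
  return {
    'id': guid(),
    'url': publication.get('canonical_url'),
    'thumbnail': publication.get('cover_image'),
    'title': publication.get('title'),
  }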
collector/scripts (Substack collector script)

@@ -1,13 +1,14 @@
import os, json, requests, tempfile
from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader
from .watch.utils import guid

def fetch_all_publications(subdomain):
  file_path = f"./outputs/substack-logs/substack-{subdomain}.json"

  if os.path.isdir("./outputs/substack-logs") == False:
    os.makedirs("./outputs/substack-logs")

  if os.path.exists(file_path):
    with open(file_path, "r") as file:
      print(f"Returning cached data for substack {subdomain}.substack.com. If you do not wish to use stored data then delete the file for this newsletter to allow refetching.")

@@ -24,7 +25,7 @@ def fetch_all_publications(subdomain):
      print("Bad response - exiting collection")
      collecting = False
      continue

    data = response.json()
    if(len(data) == 0):

@@ -34,11 +35,11 @@ def fetch_all_publications(subdomain):
    for publication in data:
      publications.append(publication)
    offset = len(publications)

  with open(file_path, 'w+', encoding='utf-8') as json_file:
    json.dump(publications, json_file, ensure_ascii=True, indent=2)
  print(f"{len(publications)} publications found for author {subdomain}.substack.com. Saved to substack-logs/channel-{subdomain}.json")

  return publications

def only_valid_publications(publications= []):

@@ -60,7 +61,7 @@ def get_content(article_link):
  if(req.ok == False):
    print("Could not reach this url!")
    return None

  req.html.render()

  full_text = None

@@ -75,6 +76,7 @@ def get_content(article_link):

def append_meta(publication, text):
  meta = {
    'id': guid(),
    'url': publication.get('canonical_url'),
    'thumbnail': publication.get('cover_image'),
    'title': publication.get('title'),
collector/scripts (Twitter collector script)

@@ -7,13 +7,14 @@ import os, time
import pandas as pd
import json
from .utils import tokenize, ada_v2_cost
from .watch.utils import guid

def twitter():
  # get user and number of tweets to read
  username = input("user timeline to read from (blank to ignore): ")
  searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ")
  tweetCount = input("Gather the last number of tweets: ")

  # Read your API keys to call the API.
  consumer_key = os.environ.get("TW_CONSUMER_KEY")
  consumer_secret = os.environ.get("TW_CONSUMER_SECRET")

@@ -43,7 +44,7 @@ def twitter():
    [tweet.id, tweet.user.screen_name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text]
    for tweet in tweets
  ]

  # Creation of column list to rename the columns in the dataframe
  columns = ["id", "Screen Name", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"]

@@ -76,7 +77,7 @@ def twitter():
  with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
    json.dump(meta_link, file, ensure_ascii=True, indent=4)

  # print(f"{transaction_output_dir}/{transaction_output_filename}")
  print(f"{tokenCount} tokens written over {tweets_df.shape[0]} records.")

@@ -92,6 +93,7 @@ def twitter_meta(row, metadata_only = False):
  url = f"http://twitter.com/anyuser/status/{row['id']}"
  title = f"Tweet {row['id']}"
  meta = {
    'id': guid(),
    'url': url,
    'title': title,
    'description': 'Tweet from ' + row["Screen Name"],
collector/scripts (YouTube collector script)

@@ -1,6 +1,7 @@
import json, requests, os, re
from slugify import slugify
from dotenv import load_dotenv
from .watch.utils import guid
load_dotenv()

def is_yt_short(videoId):

@@ -20,13 +21,13 @@ def get_channel_id(channel_link):
    if(response.ok == False):
      print("Handle => ChannelId mapping endpoint is too slow - use regular youtube.com/channel URL")
      return None

    json_data = response.json()
    return json_data.get('items')[0].get('id')
  else:
    pattern = r"youtube\.com/channel/([\w-]+)"
    match = re.search(pattern, channel_link)
    return match.group(1) if match else None

def clean_text(text):

@@ -34,6 +35,7 @@ def clean_text(text):

def append_meta(video, duration, text):
  meta = {
    'id': guid(),
    'youtubeURL': f"https://youtube.com/watch?v={video.get('id')}",
    'thumbnail': video.get('thumbnail'),
    'description': video.get('description'),

@@ -63,7 +65,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
  if(os.getenv('GOOGLE_APIS_KEY') == None):
    print("GOOGLE_APIS_KEY env variable not set!")
    exit(1)

  done = False
  currentPage = None
  pageTokens = []

@@ -93,7 +95,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
  for item in response.get('items'):
    if 'id' in item and 'videoId' in item.get('id'):
      if is_yt_short(item.get('id').get('videoId')):
        print(f"Filtering out YT Short {item.get('id').get('videoId')}")
        continue

@@ -109,12 +111,12 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
        'published': item.get('snippet').get('publishTime'),
      }
      items.append(newItem)

    pageTokens.append(currentPage)

  data['items'] = items
  with open(file_path, 'w+', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=True, indent=2)
  print(f"{len(items)} videos found for channel {data.get('channelTitle')}. Saved to channel-logs/channel-{channel_id}.json")

  return data
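As an illustrative check (not part of this commit), one could walk the collector output directory and confirm that every saved JSON record now carries a non-empty id, which is what the frontend document picker relies on. The ./outputs path here is an assumption based on the paths visible in the excerpts above:

import json, os

def records_missing_id(output_dir="./outputs"):
  # Walk the collector output directory and report any JSON files whose
  # records still lack the 'id' field the frontend picker expects.
  missing = []
  for root, _dirs, files in os.walk(output_dir):
    for name in files:
      if not name.endswith(".json"):
        continue
      with open(os.path.join(root, name), "r", encoding="utf-8") as fh:
        data = json.load(fh)
      records = data if isinstance(data, list) else [data]
      for record in records:
        if isinstance(record, dict) and not record.get("id"):
          missing.append(os.path.join(root, name))
  return missing

if __name__ == "__main__":
  bad = records_missing_id()
  print(bad if bad else "All collected records carry an id.")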