anything-llm/collector/utils/extensions/YoutubeTranscript/index.js
Timothy Carambat d1ca16f7f8
Add tokenizer improvements via Singleton class and estimation ()
* Add tokenizer improvements via Singleton class
linting

* dev build

* Estimation fallback when string exceeds a fixed byte size

* Add notice to tiktoken on backend
2025-01-30 17:55:03 -08:00

142 lines
3.6 KiB
JavaScript

const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const { YoutubeLoader } = require("./YoutubeLoader");
/**
 * Checks whether a link points at a single YouTube video.
 *
 * The link is normalized to `protocol//host/pathname` plus only the `v`
 * query parameter (any other query parameters are dropped), then matched
 * against the short (`https://youtu.be/<id>`) and full
 * (`https://youtube.com/watch?v=<id>`) forms, each with an optional `www.`
 * prefix. A link is valid only when one of the patterns yields a video id.
 *
 * @param {string} link - Candidate URL supplied by the caller.
 * @returns {boolean} `true` when a video id could be extracted, else `false`.
 */
function validYoutubeVideoUrl(link) {
  // `new URL` throws a TypeError on unparsable input (empty string, missing
  // scheme, non-string values). A validator should answer `false` for those
  // instead of crashing its caller, so only the parse itself is guarded.
  let parsed;
  try {
    parsed = new URL(link);
  } catch {
    return false;
  }

  // Loaded lazily, and only once the input is known to be a parsable URL.
  const UrlPattern = require("url-pattern");

  const videoQuery = parsed.searchParams.has("v")
    ? `?v=${parsed.searchParams.get("v")}`
    : "";
  const url = `${parsed.protocol}//${parsed.host}${parsed.pathname}${videoQuery}`;

  const shortPatternMatch = new UrlPattern(
    "https\\://(www.)youtu.be/(:videoId)"
  ).match(url);
  const fullPatternMatch = new UrlPattern(
    "https\\://(www.)youtube.com/watch?v=(:videoId)"
  ).match(url);

  // `(:videoId)` is optional in both patterns, so a pattern can match without
  // yielding an id; such links are still rejected.
  return Boolean(shortPatternMatch?.videoId || fullPatternMatch?.videoId);
}
/**
 * Loads the transcript and video metadata for a YouTube URL.
 *
 * Never rejects on loader failures: a rejected load is converted into a
 * failure result whose `reason` is the text after "Error:" in the error
 * message when present, otherwise the message itself.
 *
 * @param {{ url: string }} params - Object carrying the video URL.
 * @returns {Promise<{success: boolean, reason: string|null, content: string|null, metadata: object}>}
 *   On success `content` is the first document's `pageContent` and
 *   `metadata` is its `metadata`; on failure `content` is `null` and
 *   `metadata` is an empty object.
 */
async function fetchVideoTranscriptContent({ url }) {
  // Every failure path shares the same shape; only the reason differs.
  const failure = (reason) => ({
    success: false,
    reason,
    content: null,
    metadata: {},
  });

  if (!validYoutubeVideoUrl(url)) {
    return failure("Invalid URL. Should be youtu.be or youtube.com/watch.");
  }

  console.log(`-- Working YouTube ${url} --`);
  const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true });

  // Fold resolve/reject into one result object so the code below needs no
  // try/catch around the load.
  const onLoaded = (loadedDocs) => ({ docs: loadedDocs, error: null });
  const onFailed = (e) => ({
    docs: [],
    error: e.message?.split("Error:")?.[1] || e.message,
  });
  const { docs, error } = await loader.load().then(onLoaded, onFailed);

  if (!docs.length || error) {
    return failure(error ?? "No transcript found for that YouTube video.");
  }

  const { metadata, pageContent: content } = docs[0];
  if (!content.length) {
    return failure("No transcript could be parsed for that YouTube video.");
  }

  return { success: true, reason: null, content, metadata };
}
/**
 * Fetches a YouTube video's transcript and stores it as a server document.
 *
 * The document is placed in a folder named after the slugified
 * "<author> YouTube transcripts" string. In development the folder lives
 * under the repository's `server/storage/documents`; otherwise it lives
 * under `documents/` inside `process.env.STORAGE_DIR`.
 *
 * @param {{ url: string }} params - Object carrying the video URL.
 * @returns {Promise<{success: boolean, reason: string|null, data?: {title: string, author: string, destination: string}}>}
 *   On failure `success` is `false` and `reason` explains why (no `data`).
 *   On success `reason` is `null` and `data` describes the saved document.
 */
async function loadYouTubeTranscript({ url }) {
  const transcriptResults = await fetchVideoTranscriptContent({ url });
  if (!transcriptResults.success) {
    return {
      success: false,
      reason:
        transcriptResults.reason ||
        "An unknown error occurred during transcription retrieval",
    };
  }

  const { content, metadata } = transcriptResults;
  const outFolder = slugify(
    `${metadata.author} YouTube transcripts`
  ).toLowerCase();
  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../../server/storage/documents/${outFolder}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);

  if (!fs.existsSync(outFolderPath))
    fs.mkdirSync(outFolderPath, { recursive: true });

  const data = {
    id: v4(),
    url: url + ".youtube",
    title: metadata.title || url,
    docAuthor: metadata.author,
    description: metadata.description,
    docSource: url,
    chunkSource: `youtube://${url}`,
    published: new Date().toLocaleString(),
    // Simple space-delimited count; other whitespace does not split words.
    wordCount: content.split(" ").length,
    pageContent: content,
    token_count_estimate: tokenizeString(content),
  };

  console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`);
  writeToServerDocuments(
    data,
    `${slugify(metadata.title)}-${data.id}`,
    outFolderPath
  );

  return {
    success: true,
    // A successful result has no failure reason. This was previously a
    // leftover placeholder string ("test"); `null` matches what
    // fetchVideoTranscriptContent reports on success.
    reason: null,
    data: {
      title: metadata.title,
      author: metadata.author,
      destination: outFolder,
    },
  };
}
// Public API of this module: the save-to-storage entry point and the
// fetch-only transcript helper. validYoutubeVideoUrl is not exported.
module.exports = {
loadYouTubeTranscript,
fetchVideoTranscriptContent,
};