Auto-update: Mon Aug 5 17:56:51 PDT 2024

This commit is contained in:
sanj 2024-08-05 17:56:51 -07:00
parent 954ad967e9
commit 4f25c3a5fc

View file

@@ -26,6 +26,10 @@ import ipaddress
from scipy.spatial import cKDTree
from dateutil.parser import parse as dateutil_parse
from docx import Document
import aiohttp
from bs4 import BeautifulSoup
from readability import Document as ReadabilityDocument
from markdownify import markdownify as md
from sshtunnel import SSHTunnelForwarder
from urllib.parse import urlparse
from fastapi import Depends, HTTPException, Request, UploadFile
@@ -550,28 +554,28 @@ async def run_ssh_command(server, command):
async def html_to_markdown(url: Optional[str] = None, source: Optional[str] = None) -> Optional[str]:
    """Convert an HTML document to Markdown.

    Exactly one of *url* or *source* should be supplied; *source* takes
    precedence when both are given.

    Args:
        url: Address of a page to fetch asynchronously via aiohttp.
        source: Raw HTML to convert directly.

    Returns:
        The Markdown text, or None when neither argument is supplied.
    """
    if source:
        html_content = source
    elif url:
        # Fetch the page asynchronously; the session is closed by the
        # context manager even if the request raises.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                html_content = await response.text()
    else:
        err("Unable to convert nothing to markdown.")
        return None

    # Use readability to extract the main article content.
    # ReadabilityDocument is the aliased readability.Document — the plain
    # name `Document` is taken by python-docx in this module.
    doc = ReadabilityDocument(html_content)
    cleaned_html = doc.summary()

    # Parse the cleaned HTML with BeautifulSoup for any additional processing.
    soup = BeautifulSoup(cleaned_html, 'html.parser')

    # Remove any remaining unwanted elements.
    for element in soup(['script', 'style']):
        element.decompose()

    # Convert to markdown with ATX-style (#) headings.
    markdown_content = md(str(soup), heading_style="ATX")
    return markdown_content