Auto-update: Mon Aug 5 17:56:51 PDT 2024
This commit is contained in:
parent
954ad967e9
commit
4f25c3a5fc
1 changed file with 29 additions and 25 deletions
|
@ -26,6 +26,10 @@ import ipaddress
|
||||||
from scipy.spatial import cKDTree
|
from scipy.spatial import cKDTree
|
||||||
from dateutil.parser import parse as dateutil_parse
|
from dateutil.parser import parse as dateutil_parse
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
import aiohttp
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from readability import Document as ReadabilityDocument
|
||||||
|
from markdownify import markdownify as md
|
||||||
from sshtunnel import SSHTunnelForwarder
|
from sshtunnel import SSHTunnelForwarder
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from fastapi import Depends, HTTPException, Request, UploadFile
|
from fastapi import Depends, HTTPException, Request, UploadFile
|
||||||
|
@ -550,28 +554,28 @@ async def run_ssh_command(server, command):
|
||||||
|
|
||||||
|
|
||||||
async def html_to_markdown(url: Optional[str] = None, source: Optional[str] = None) -> Optional[str]:
    """Convert an HTML document to Markdown.

    The HTML is taken from ``source`` when it is non-empty; otherwise it is
    downloaded from ``url``. The main content is extracted with readability,
    any leftover ``<script>``/``<style>`` elements are dropped, and the
    result is rendered as Markdown with ATX-style (``#``) headings.

    Args:
        url: Address to download the HTML from. Only used when ``source``
            is empty or ``None``.
        source: Raw HTML text. Takes precedence over ``url``.

    Returns:
        The Markdown text, or ``None`` when neither ``source`` nor ``url``
        was supplied or the download answered with an HTTP error status.
        Both failure cases are reported through ``err``.
    """
    if source:
        html_content = source
    elif url:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                # Without this check a 4xx/5xx error page would be converted
                # and returned as if it were the requested article.
                if response.status >= 400:
                    err(f"Unable to fetch {url} for markdown conversion: HTTP {response.status}")
                    return None
                html_content = await response.text()
    else:
        err("Unable to convert nothing to markdown.")
        return None

    # Use readability to extract the main content. This must be readability's
    # Document (imported as ReadabilityDocument); the bare name ``Document``
    # in this module is the one imported from ``docx``.
    doc = ReadabilityDocument(html_content)
    cleaned_html = doc.summary()

    # Parse the cleaned HTML with BeautifulSoup for any additional processing
    soup = BeautifulSoup(cleaned_html, 'html.parser')

    # Remove any remaining unwanted elements
    for element in soup(['script', 'style']):
        element.decompose()

    # Convert to markdown
    return md(str(soup), heading_style="ATX")
|
||||||
|
|
Loading…
Reference in a new issue