Auto-update: Mon Aug 5 17:56:51 PDT 2024

This commit is contained in:
sanj 2024-08-05 17:56:51 -07:00
parent 954ad967e9
commit 4f25c3a5fc

View file

@@ -26,6 +26,10 @@ import ipaddress
from scipy.spatial import cKDTree
from dateutil.parser import parse as dateutil_parse
from docx import Document
import aiohttp
from bs4 import BeautifulSoup
from readability import Document as ReadabilityDocument
from markdownify import markdownify as md
from sshtunnel import SSHTunnelForwarder
from urllib.parse import urlparse
from fastapi import Depends, HTTPException, Request, UploadFile
@@ -561,7 +565,7 @@ async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]
return None
# Use readability to extract the main content
-doc = Document(html_content)
+doc = ReadabilityDocument(html_content)
cleaned_html = doc.summary()
# Parse the cleaned HTML with BeautifulSoup for any additional processing