Auto-update: Mon Aug 5 17:56:51 PDT 2024
This commit is contained in:
parent
954ad967e9
commit
4f25c3a5fc
1 changed file with 29 additions and 25 deletions
|
@@ -26,6 +26,10 @@ import ipaddress
|
|||
from scipy.spatial import cKDTree
|
||||
from dateutil.parser import parse as dateutil_parse
|
||||
from docx import Document
|
||||
import aiohttp
|
||||
from bs4 import BeautifulSoup
|
||||
from readability import Document as ReadabilityDocument
|
||||
from markdownify import markdownify as md
|
||||
from sshtunnel import SSHTunnelForwarder
|
||||
from urllib.parse import urlparse
|
||||
from fastapi import Depends, HTTPException, Request, UploadFile
|
||||
|
@@ -561,7 +565,7 @@ async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]
|
|||
return None
|
||||
|
||||
# Use readability to extract the main content
|
||||
- doc = Document(html_content)
|
||||
+ doc = ReadabilityDocument(html_content)
|
||||
cleaned_html = doc.summary()
|
||||
|
||||
# Parse the cleaned HTML with BeautifulSoup for any additional processing
|
||||
|
|
Loading…
Reference in a new issue