Auto-update: Mon Aug 5 17:56:51 PDT 2024
This commit is contained in:
parent
954ad967e9
commit
4f25c3a5fc
1 changed file with 29 additions and 25 deletions
|
@@ -26,6 +26,10 @@ import ipaddress
|
||||||
from scipy.spatial import cKDTree
|
from scipy.spatial import cKDTree
|
||||||
from dateutil.parser import parse as dateutil_parse
|
from dateutil.parser import parse as dateutil_parse
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
import aiohttp
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from readability import Document as ReadabilityDocument
|
||||||
|
from markdownify import markdownify as md
|
||||||
from sshtunnel import SSHTunnelForwarder
|
from sshtunnel import SSHTunnelForwarder
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from fastapi import Depends, HTTPException, Request, UploadFile
|
from fastapi import Depends, HTTPException, Request, UploadFile
|
||||||
|
@@ -561,7 +565,7 @@ async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Use readability to extract the main content
|
# Use readability to extract the main content
|
||||||
doc = Document(html_content)
|
doc = ReadabilityDocument(html_content)
|
||||||
cleaned_html = doc.summary()
|
cleaned_html = doc.summary()
|
||||||
|
|
||||||
# Parse the cleaned HTML with BeautifulSoup for any additional processing
|
# Parse the cleaned HTML with BeautifulSoup for any additional processing
|
||||||
|
|
Loading…
Reference in a new issue