async def html_to_markdown(url: Optional[str] = None, source: Optional[str] = None) -> Optional[str]:
    """Convert an HTML document to Markdown, keeping only its main content.

    The HTML comes from ``source`` when it is non-empty; otherwise it is
    downloaded from ``url``. The markup is reduced to its main content with
    readability, ``<script>``/``<style>`` elements are stripped, and the
    result is rendered with ATX-style (``#``) headings.

    Args:
        url: Address to download the HTML from. Only used when ``source``
            is empty or ``None``.
        source: Raw HTML to convert. Takes precedence over ``url``.

    Returns:
        The Markdown text, or ``None`` (after reporting the reason through
        ``err``) when neither argument is given, when the server answers
        with an HTTP error status, or when the HTML is blank.

    Raises:
        Connection-level failures raised by aiohttp while downloading are
        not caught here and propagate to the caller.
    """
    if source:
        html_content = source
    elif url:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                # Without this check a 404/500 error page would be converted
                # and returned as though it were the requested article.
                if response.status >= 400:
                    err(f"Unable to convert {url} to markdown: HTTP {response.status}.")
                    return None
                html_content = await response.text()
    else:
        err("Unable to convert nothing to markdown.")
        return None

    # NOTE(review): readability appears to raise on an empty document, so
    # blank input is reported through the same None path as the other
    # failures instead of reaching the parser - confirm against the library.
    if not html_content.strip():
        err("Unable to convert empty HTML to markdown.")
        return None

    # Use readability to extract the main content
    doc = ReadabilityDocument(html_content)
    cleaned_html = doc.summary()

    # Parse the cleaned HTML with BeautifulSoup for any additional processing
    soup = BeautifulSoup(cleaned_html, 'html.parser')

    # Remove any remaining unwanted elements
    for element in soup(['script', 'style']):
        element.decompose()

    # Convert to markdown
    return md(str(soup), heading_style="ATX")