Auto-update: Mon Aug 5 17:56:51 PDT 2024

2024-08-05 17:56:51 -07:00 · 2024-08-05 17:56:51 -07:00 · 4f25c3a5fc
commit 4f25c3a5fc
parent 954ad967e9
1 changed file with 29 additions and 25 deletions
--- a/sijapi/utilities.py
+++ b/sijapi/utilities.py
@@ -26,6 +26,10 @@ import ipaddress
 from scipy.spatial import cKDTree
 from dateutil.parser import parse as dateutil_parse
 from docx import Document
+import aiohttp
+from bs4 import BeautifulSoup
+from readability import Document as ReadabilityDocument
+from markdownify import markdownify as md
 from sshtunnel import SSHTunnelForwarder
 from urllib.parse import urlparse
 from fastapi import Depends, HTTPException, Request, UploadFile
@@ -550,28 +554,28 @@ async def run_ssh_command(server, command):


 async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
-        if source:
-            html_content = source
-        elif url:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(url) as response:
-                    html_content = await response.text()
-        else:
-            err(f"Unable to convert nothing to markdown.")
-            return None
-        
-        # Use readability to extract the main content
-        doc = Document(html_content)
-        cleaned_html = doc.summary()
-        
-        # Parse the cleaned HTML with BeautifulSoup for any additional processing
-        soup = BeautifulSoup(cleaned_html, 'html.parser')
-        
-        # Remove any remaining unwanted elements
-        for element in soup(['script', 'style']):
-            element.decompose()
-        
-        # Convert to markdown
-        markdown_content = md(str(soup), heading_style="ATX")
-        
-        return markdown_content
+    if source:
+        html_content = source
+    elif url:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                html_content = await response.text()
+    else:
+        err(f"Unable to convert nothing to markdown.")
+        return None
+    
+    # Use readability to extract the main content
+    doc = ReadabilityDocument(html_content)
+    cleaned_html = doc.summary()
+    
+    # Parse the cleaned HTML with BeautifulSoup for any additional processing
+    soup = BeautifulSoup(cleaned_html, 'html.parser')
+    
+    # Remove any remaining unwanted elements
+    for element in soup(['script', 'style']):
+        element.decompose()
+    
+    # Convert to markdown
+    markdown_content = md(str(soup), heading_style="ATX")
+    
+    return markdown_content