From 4f25c3a5fc472867873354cfed23303f5fdf9637 Mon Sep 17 00:00:00 2001
From: sanj <67624670+iodrift@users.noreply.github.com>
Date: Mon, 5 Aug 2024 17:56:51 -0700
Subject: [PATCH] Auto-update: Mon Aug  5 17:56:51 PDT 2024

---
 sijapi/utilities.py | 54 ++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/sijapi/utilities.py b/sijapi/utilities.py
index 367a477..4894286 100644
--- a/sijapi/utilities.py
+++ b/sijapi/utilities.py
@@ -26,6 +26,10 @@ import ipaddress
 from scipy.spatial import cKDTree
 from dateutil.parser import parse as dateutil_parse
 from docx import Document
+import aiohttp
+from bs4 import BeautifulSoup
+from readability import Document as ReadabilityDocument
+from markdownify import markdownify as md
 from sshtunnel import SSHTunnelForwarder
 from urllib.parse import urlparse
 from fastapi import Depends, HTTPException, Request, UploadFile
@@ -550,28 +554,28 @@ async def run_ssh_command(server, command):
 
 
 async def html_to_markdown(url: str = None, source: str = None) -> Optional[str]:
-        if source:
-            html_content = source
-        elif url:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(url) as response:
-                    html_content = await response.text()
-        else:
-            err(f"Unable to convert nothing to markdown.")
-            return None
-        
-        # Use readability to extract the main content
-        doc = Document(html_content)
-        cleaned_html = doc.summary()
-        
-        # Parse the cleaned HTML with BeautifulSoup for any additional processing
-        soup = BeautifulSoup(cleaned_html, 'html.parser')
-        
-        # Remove any remaining unwanted elements
-        for element in soup(['script', 'style']):
-            element.decompose()
-        
-        # Convert to markdown
-        markdown_content = md(str(soup), heading_style="ATX")
-        
-        return markdown_content
\ No newline at end of file
+    """Convert HTML to Markdown, reading from `source` or fetching `url`.
+
+    `source` takes precedence when truthy; otherwise `url` is downloaded with
+    aiohttp. Reports via `err` and returns None when neither one is given.
+    """
+    if source:
+        html_content = source
+    elif url:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                html_content = await response.text()
+    else:
+        err("Unable to convert nothing to markdown.")
+        return None
+
+    # readability keeps the main article content; ReadabilityDocument is
+    # aliased so it does not collide with docx.Document imported above.
+    cleaned_html = ReadabilityDocument(html_content).summary()
+
+    # Strip any script/style elements that survived extraction.
+    soup = BeautifulSoup(cleaned_html, 'html.parser')
+    for element in soup(['script', 'style']):
+        element.decompose()
+
+    return md(str(soup), heading_style="ATX")