Added bget for recursive site downloads
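bget drives headless Chrome through Selenium, saves pages and linked files into
per-host subfolders, and can follow links to a configurable depth. An illustrative
invocation (flag names taken from the script's argparse definitions below; the URL
is a placeholder):

    bget https://example.com --all --recurse 2 --output downloads
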
parent 42fec25e89
commit 428718fe70
1 changed file with 182 additions and 0 deletions
bget (executable file, 182 additions)
@@ -0,0 +1,182 @@
#!/usr/bin/env python3
import os
import sys
import time
import argparse
import requests
from urllib.parse import urlparse, urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
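# Third-party dependencies (all on PyPI): selenium, requests, webdriver-manager,
# and optionally selenium-stealth for the stealth tweaks below.
# ChromeDriverManager is imported but not used here; main() points Service at a
# fixed chromedriver path instead.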

# Optionally import selenium-stealth for additional anti-detection measures.
try:
    from selenium_stealth import stealth
except ImportError:
    stealth = None

# Global sets to track visited URLs and downloaded files
visited_pages = set()
downloaded_files = set()

def get_local_path(url, output_dir):
    """
    Map a URL to a local file path, creating subfolders from the host (netloc)
    and the URL path. URLs whose path ends with "/" are saved as index.html.
    Filename collisions are avoided by appending a counter.
    """
    parsed = urlparse(url)
    netloc = parsed.netloc
    path = parsed.path

    if path.endswith("/") or not os.path.basename(path):
        filename = "index.html"
        folder = os.path.join(output_dir, netloc, path.lstrip("/"))
    else:
        filename = os.path.basename(path)
        folder = os.path.join(output_dir, netloc, os.path.dirname(path.lstrip("/")))

    os.makedirs(folder, exist_ok=True)
    local_file = os.path.join(folder, filename)
    counter = 1
    base, ext = os.path.splitext(local_file)
    while os.path.exists(local_file):
        local_file = f"{base}_{counter}{ext}"
        counter += 1
    return local_file

def download_file(url, output_dir, session):
    """
    Downloads a file from the given URL using requests and writes it to a local file.
    """
    if url in downloaded_files:
        return
    downloaded_files.add(url)
    print(f"Downloading file: {url}")
    local_path = get_local_path(url, output_dir)
    try:
        with session.get(url, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(local_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        print(f"Saved to: {local_path}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")

def crawl(url, depth, args, driver, session, output_dir):
    """
    Crawl an HTML page with Selenium, optionally downloading page content and
    recursively processing found links.
    """
    if url in visited_pages:
        return
    visited_pages.add(url)
    print(f"Crawling: {url} (depth {depth})")
    try:
        driver.get(url)
        # simple wait - adjust as needed
        time.sleep(2)
    except Exception as e:
        print(f"Error loading {url}: {e}")
        return

    # Optionally download the HTML page itself if --all is specified
    if args.all:
        local_path = get_local_path(url, output_dir)
        try:
            with open(local_path, "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            print(f"Page saved to: {local_path}")
        except Exception as e:
            print(f"Failed to save {url}: {e}")

    # Find all <a> tags
    try:
        anchors = driver.find_elements(By.TAG_NAME, "a")
    except Exception as e:
        print(f"Error finding links on {url}: {e}")
        return

    for a in anchors:
        href = a.get_attribute("href")
        if not href:
            continue
        # Join relative URLs with current page URL.
        href = urljoin(url, href)
        # Skip if already visited (for downloads or crawling)
        if href in visited_pages or href in downloaded_files:
            continue

        # Try to determine content-type via a HEAD request
        try:
            head = session.head(href, allow_redirects=True, timeout=5)
            content_type = head.headers.get("Content-Type", "")
        except Exception:
            content_type = ""

        if "text/html" in content_type.lower():
            # Recurse if depth allows
            if depth > 0:
                crawl(href, depth - 1, args, driver, session, output_dir)
        else:
            # Download if --all flag is enabled
            if args.all:
                download_file(href, output_dir, session)

def main():
    parser = argparse.ArgumentParser(description="Selenium-based downloader with recursion and subfoldering")
    parser.add_argument("url", help="URL to fetch")
    parser.add_argument("-a", "--all", action="store_true", help="Download all links on the page")
    parser.add_argument("-r", "--recurse", type=int, default=0,
                        help="Recursively crawl linked pages up to the provided depth")
    parser.add_argument("-o", "--output", default="downloads", help="Output directory")
    args = parser.parse_args()

    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    user_agent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/115.0.0.0 Safari/537.36")
    chrome_options.add_argument(f"--user-agent={user_agent}")
    # Specify custom Chrome binary if needed.
    chrome_options.binary_location = "/Applications/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing"

    # Use chromedriver from a known location (e.g., Brew installation)
    service = Service("/usr/local/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Optionally apply stealth configurations
    if stealth:
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
                )

    # Create a requests session and transfer cookies from Selenium driver
    session = requests.Session()
    for cookie in driver.get_cookies():
        session.cookies.set(cookie["name"], cookie["value"], domain=cookie.get("domain"))
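    # Note: no page has been visited yet at this point, so the transferred cookie
    # set is typically empty and the requests session starts with a clean cookie jar.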

    # Ensure the output directory exists.
    os.makedirs(args.output, exist_ok=True)

    # Start crawling / downloading
    crawl(args.url, args.recurse, args, driver, session, args.output)

    driver.quit()


if __name__ == "__main__":
    main()