feat: add Bunny.net HLS support to scraper and downloader (yt-dlp + pycryptodomex)

This commit is contained in:
HugeFrog24
2026-03-19 08:39:11 +01:00
parent 904d79eb35
commit 0525bd3139
6 changed files with 254 additions and 36 deletions
+38 -7
View File
@@ -14,6 +14,8 @@ import argparse
from pathlib import Path
import re
import shutil
import subprocess
import sys
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any
@@ -31,6 +33,7 @@ from check_clashes import (
load_video_map,
save_video_map,
is_valid_url,
is_hls_url,
VIDEO_MAP_FILE,
)
from config import SITES
@@ -206,6 +209,30 @@ def reorganize(
# ── Download ─────────────────────────────────────────────────────────
def download_hls(url: str, dest: Path, referer: str = "") -> tuple[str, int]:
    """Download an HLS stream via yt-dlp. Returns (status, bytes_written)."""
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Already downloaded on a previous run — nothing to do.
    if dest.exists():
        return "ok", 0
    args = [
        sys.executable,
        "-m",
        "yt_dlp",
        "--quiet",
        "--no-warnings",
        "--referer",
        referer or "https://player.mediadelivery.net/",
        "-o",
        str(dest),
        url,
    ]
    try:
        result = subprocess.run(args, capture_output=True, text=True)
        if result.returncode == 0:
            if dest.exists():
                return "ok", dest.stat().st_size
            return "error: output file missing after yt-dlp", 0
        # Surface the last line of yt-dlp's output as the error summary.
        output = (result.stderr or result.stdout).strip().splitlines()
        return f"error: {output[-1] if output else 'yt-dlp failed'}", 0
    except Exception as e:
        return f"error: {e}", 0
def download_one(
session: requests.Session,
url: str,
@@ -384,7 +411,7 @@ def main() -> None:
saved = read_mode(args.output)
mode_changed = saved is not None and saved != mode
print(f"[+] {len(urls)} MP4 URLs from {VIDEO_MAP_FILE}")
print(f"[+] {len(urls)} video URLs from {VIDEO_MAP_FILE}")
print(f"[+] Naming mode: {mode}" + (" (changed!)" if mode_changed else ""))
# Handle reorganize
@@ -445,7 +472,7 @@ def main() -> None:
}
newly_fetched: dict[str, int | None] = {}
uncached_pending = [u for u in pending if u not in cached_sizes]
uncached_pending = [u for u in pending if u not in cached_sizes and not is_hls_url(u)]
session = make_session()
if uncached_pending:
print(
@@ -462,17 +489,19 @@ def main() -> None:
total_bytes = sum(sized.values())
print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")
if already:
uncached_already = [u for u in already if u not in cached_sizes]
already_sizes: dict[str, int | None] = {}
already_to_verify = [u for u in already if not is_hls_url(u)]
if already_to_verify:
uncached_already = [u for u in already_to_verify if u not in cached_sizes]
if uncached_already:
print(
f"[+] Verifying {len(already)} existing files ({len(uncached_already)} uncached)…"
f"[+] Verifying {len(already_to_verify)} existing files ({len(uncached_already)} uncached)…"
)
fetched_already = fetch_sizes(uncached_already, workers=20, url_referers=url_referers)
newly_fetched.update(fetched_already)
already_sizes: dict[str, int | None] = {**cached_sizes, **fetched_already}
already_sizes = {**cached_sizes, **fetched_already}
else:
print(f"[+] Verifying {len(already)} existing files (all sizes cached)…")
print(f"[+] Verifying {len(already_to_verify)} existing files (all sizes cached)…")
already_sizes = dict(cached_sizes)
mismatched = 0
@@ -505,6 +534,8 @@ def main() -> None:
def do_download(url: str) -> tuple[str, tuple[str, int]]:
dest = paths[url]
if is_hls_url(url):
return url, download_hls(url, dest, url_referers.get(url, ""))
expected = remote_sizes.get(url)
return url, download_one(
session, url, dest, expected, url_referers.get(url, "")