mirror of https://github.com/HugeFrog24/jailbirdz-dl.git (synced 2026-05-01 09:02:19 +00:00)
Compare commits: main...a51d2bd237 (4 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | a51d2bd237 |  |
|  | c58814691e |  |
|  | 27f7beb8b7 |  |
|  | 4a5b0a6ee3 |  |
```diff
@@ -1,14 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(jq -r \".\"\"/api/v1/video-channels/{channelHandle}/videos\"\".get.parameters[] | $ref\")",
-      "Bash(grep '$ref')",
-      "Bash(grep -c \"^ ''/\" /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.yaml)",
-      "Bash(grep -c ' \"\"/' /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.json)",
-      "Bash(grep -c \"^ ''/\" /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.yaml)",
-      "Bash(git --no-pager diff HEAD~1 docs/external/peertube/openapi.json)",
-      "Bash(git --no-pager show HEAD~1:docs/external/peertube/openapi.json)",
-      "Bash(git --no-pager diff --name-only HEAD~1)"
-    ]
-  }
-}
```
```diff
@@ -0,0 +1,14 @@
+---
+alwaysApply: true
+always_on: true
+trigger: always_on
+applyTo: "**"
+description: Snyk Security At Inception
+---
+
+# Project security best practices
+
+- Always run snyk_code_scan tool for new first party code that is generated in a Snyk-supported language.
+- If any security issues are found based on newly introduced or modified code or dependencies, attempt to fix the issues using the results context from Snyk.
+- Rescan the code after fixing the issues to ensure that the issues were fixed and that there are no newly introduced issues.
+- Repeat this process until no new issues are found.
```
````diff
@@ -44,7 +44,7 @@ Sites with no credentials are skipped automatically when running `python main.py
 
 ### 1. Scrape
 
-Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V, HLS/M3U8).
+Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V).
 
 ```bash
 python main.py          # scrape all sites you have credentials for
````
```diff
@@ -62,7 +62,7 @@ python download.py [options]
 Options:
   -o, --output DIR    Download directory (default: downloads)
   -t, --titles        Name files by post title
-  --original          Name files by original filename derived from the video URL (default)
+  --original          Name files by original CloudFront filename (default)
   --reorganize        Rename existing files to match current naming mode
   -w, --workers N     Concurrent downloads (default: 4)
   -n, --dry-run       Print what would be downloaded
```
+6 -19

```diff
@@ -121,21 +121,17 @@ def save_video_map(
 
 
 def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
-    """Pure function: return {cdn_video_url: referer} from a flat video map.
+    """Pure function: return {cdn_video_url: site_referer} from a flat video map.
 
-    Bunny.net CDN URLs require https://player.mediadelivery.net/ as referer.
-    All other URLs use the scheme+netloc of the page they were found on.
+    The flat video map has page URLs as keys; the scheme+netloc of each page URL
+    is used as the Referer for all CDN video URLs found in that entry.
     """
     result: dict[str, str] = {}
     for page_url, entry in video_map.items():
         parsed = urlparse(page_url)
-        site_referer = f"{parsed.scheme}://{parsed.netloc}/"
+        referer = f"{parsed.scheme}://{parsed.netloc}/"
         for vid in cast(dict[str, Any], entry).get("videos", []):
-            vid_url = vid["url"]
-            if urlparse(vid_url).netloc.endswith(".b-cdn.net"):
-                result.setdefault(vid_url, "https://player.mediadelivery.net/")
-            else:
-                result.setdefault(vid_url, site_referer)
+            result.setdefault(vid["url"], referer)
     return result
 
 
```
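The reverted helper maps every video URL to its page's origin, with no per-CDN special case. A minimal standalone sketch of the post-revert behavior, using hypothetical data (the `demo` map and its URLs are made up for illustration):

```python
from typing import Any
from urllib.parse import urlparse


def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
    # Mirrors the post-revert logic above: one Referer per page, derived
    # from the page URL's scheme+netloc.
    result: dict[str, str] = {}
    for page_url, entry in video_map.items():
        parsed = urlparse(page_url)
        referer = f"{parsed.scheme}://{parsed.netloc}/"
        for vid in entry.get("videos", []):
            result.setdefault(vid["url"], referer)
    return result


# Hypothetical flat video map: page URL -> entry with a "videos" list.
demo = {
    "https://example-site.com/post/42/": {
        "videos": [{"url": "https://d123.cloudfront.net/clip.mp4"}],
    },
}
print(build_url_referers(demo))
# {'https://d123.cloudfront.net/clip.mp4': 'https://example-site.com/'}
```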
```diff
@@ -151,17 +147,8 @@ def fmt_size(b: float | int) -> str:
     return f"{b:.1f} TB"
 
 
-def is_hls_url(url: str) -> bool:
-    """True if url is an HLS master playlist (.m3u8)."""
-    return urlparse(url).path.endswith(".m3u8")
-
-
 def url_to_filename(url: str) -> str:
-    path = PurePosixPath(urlparse(url).path)
-    # Bunny.net HLS: .../guid/playlist.m3u8 → guid.mp4
-    if path.name == "playlist.m3u8":
-        return unquote(path.parent.name) + ".mp4"
-    return unquote(path.name)
+    return unquote(PurePosixPath(urlparse(url).path).name)
 
 
 def find_clashes(urls: list[str]) -> dict[str, list[str]]:
```
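The simplified `url_to_filename` is now a plain basename extraction. A quick illustration with a hypothetical URL:

```python
from pathlib import PurePosixPath
from urllib.parse import unquote, urlparse


def url_to_filename(url: str) -> str:
    # Post-revert behavior: percent-decoded basename of the URL path.
    return unquote(PurePosixPath(urlparse(url).path).name)


print(url_to_filename("https://d123.cloudfront.net/My%20Clip.mp4"))  # My Clip.mp4
# The removed branch special-cased Bunny.net HLS playlists, turning
# .../<guid>/playlist.m3u8 into <guid>.mp4; those URLs would now just
# yield "playlist.m3u8", consistent with the HLS paths being removed.
```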
Vendored: +857 -1412 (file diff suppressed because one or more lines are too long)
+7 -38

```diff
@@ -14,8 +14,6 @@ import argparse
 from pathlib import Path
 import re
 import shutil
-import subprocess
-import sys
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any
```
```diff
@@ -33,7 +31,6 @@ from check_clashes import (
     load_video_map,
     save_video_map,
     is_valid_url,
-    is_hls_url,
     VIDEO_MAP_FILE,
 )
 from config import SITES
```
```diff
@@ -209,30 +206,6 @@ def reorganize(
 # ── Download ─────────────────────────────────────────────────────────
 
 
-def download_hls(url: str, dest: Path, referer: str = "") -> tuple[str, int]:
-    """Download an HLS stream via yt-dlp. Returns (status, bytes_written)."""
-    dest.parent.mkdir(parents=True, exist_ok=True)
-    if dest.exists():
-        return "ok", 0
-    cmd = [
-        sys.executable, "-m", "yt_dlp",
-        "--quiet", "--no-warnings",
-        "--referer", referer or "https://player.mediadelivery.net/",
-        "-o", str(dest),
-        url,
-    ]
-    try:
-        proc = subprocess.run(cmd, capture_output=True, text=True)
-        if proc.returncode != 0:
-            lines = (proc.stderr or proc.stdout).strip().splitlines()
-            return f"error: {lines[-1] if lines else 'yt-dlp failed'}", 0
-        if not dest.exists():
-            return "error: output file missing after yt-dlp", 0
-        return "ok", dest.stat().st_size
-    except Exception as e:
-        return f"error: {e}", 0
-
-
 def download_one(
     session: requests.Session,
     url: str,
```
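For context, the removed `download_hls` helper was a thin wrapper around yt-dlp invoked as a module. Roughly equivalent standalone call, assuming yt-dlp is installed; the playlist URL and output path are hypothetical:

```python
import subprocess
import sys

# What the removed helper effectively executed for each HLS URL:
proc = subprocess.run(
    [
        sys.executable, "-m", "yt_dlp",
        "--quiet", "--no-warnings",
        "--referer", "https://player.mediadelivery.net/",
        "-o", "downloads/guid.mp4",
        "https://example.b-cdn.net/guid/playlist.m3u8",
    ],
    capture_output=True,
    text=True,
)
print("ok" if proc.returncode == 0 else proc.stderr.strip().splitlines()[-1])
```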
```diff
@@ -411,7 +384,7 @@ def main() -> None:
     saved = read_mode(args.output)
     mode_changed = saved is not None and saved != mode
 
-    print(f"[+] {len(urls)} video URLs from {VIDEO_MAP_FILE}")
+    print(f"[+] {len(urls)} MP4 URLs from {VIDEO_MAP_FILE}")
     print(f"[+] Naming mode: {mode}" + (" (changed!)" if mode_changed else ""))
 
     # Handle reorganize
```
```diff
@@ -472,7 +445,7 @@ def main() -> None:
     }
 
     newly_fetched: dict[str, int | None] = {}
-    uncached_pending = [u for u in pending if u not in cached_sizes and not is_hls_url(u)]
+    uncached_pending = [u for u in pending if u not in cached_sizes]
     session = make_session()
     if uncached_pending:
         print(
```
```diff
@@ -489,19 +462,17 @@ def main() -> None:
         total_bytes = sum(sized.values())
         print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")
 
-    already_sizes: dict[str, int | None] = {}
-    already_to_verify = [u for u in already if not is_hls_url(u)]
-    if already_to_verify:
-        uncached_already = [u for u in already_to_verify if u not in cached_sizes]
+    if already:
+        uncached_already = [u for u in already if u not in cached_sizes]
         if uncached_already:
             print(
-                f"[+] Verifying {len(already_to_verify)} existing files ({len(uncached_already)} uncached)…"
+                f"[+] Verifying {len(already)} existing files ({len(uncached_already)} uncached)…"
             )
             fetched_already = fetch_sizes(uncached_already, workers=20, url_referers=url_referers)
             newly_fetched.update(fetched_already)
-            already_sizes = {**cached_sizes, **fetched_already}
+            already_sizes: dict[str, int | None] = {**cached_sizes, **fetched_already}
         else:
-            print(f"[+] Verifying {len(already_to_verify)} existing files (all sizes cached)…")
+            print(f"[+] Verifying {len(already)} existing files (all sizes cached)…")
             already_sizes = dict(cached_sizes)
 
     mismatched = 0
```
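One detail worth noting in the verification flow above: `{**cached_sizes, **fetched_already}` merges the two size maps with later keys winning, so a freshly fetched size overrides a stale cached one for the same URL. A short illustration with hypothetical values:

```python
cached_sizes = {"https://cdn.example/a.mp4": 1000, "https://cdn.example/b.mp4": 2000}
fetched_already = {"https://cdn.example/b.mp4": 2500}

# Later unpacking wins on key collisions, so the fresh size for b.mp4 is kept.
already_sizes = {**cached_sizes, **fetched_already}
assert already_sizes["https://cdn.example/b.mp4"] == 2500
```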
```diff
@@ -534,8 +505,6 @@ def main() -> None:
 
     def do_download(url: str) -> tuple[str, tuple[str, int]]:
         dest = paths[url]
-        if is_hls_url(url):
-            return url, download_hls(url, dest, url_referers.get(url, ""))
         expected = remote_sizes.get(url)
         return url, download_one(
             session, url, dest, expected, url_referers.get(url, "")
```
```diff
@@ -335,46 +335,6 @@ def extract_title_from_html(html: str) -> str | None:
     return None
 
 
-def _is_bunny_playlist(url: str) -> bool:
-    """True if url is the root Bunny.net HLS playlist (not a sub-playlist)."""
-    parsed = urlparse(url)
-    return (
-        parsed.netloc.endswith(".b-cdn.net")
-        and parsed.path.endswith("/playlist.m3u8")
-    )
-
-
-def _is_bunny_junk(url: str) -> bool:
-    """True if url is a Bunny.net CDN init segment (not a usable video URL)."""
-    parsed = urlparse(url)
-    return parsed.netloc.endswith(".b-cdn.net") and PurePosixPath(
-        parsed.path
-    ).name in {"init.mp4", "init.dmp4"}
-
-
-def extract_bunny_embed_url(html: str) -> str | None:
-    """Return a tokenless Bunny.net embed URL found in an iframe, or None."""
-    m = re.search(
-        r'<iframe[^>]+src="(https://player\.mediadelivery\.net/embed/[^"?]+)',
-        html,
-    )
-    return m.group(1) if m else None
-
-
-def _clear_junk_video_entries(video_map: dict[str, Any], site_key: str) -> int:
-    """Reset entries whose only stored videos are CDN init segments. Returns count fixed."""
-    cleared = 0
-    for entry in video_map.values():
-        videos = entry.get("videos", [])
-        if videos and all(_is_bunny_junk(v["url"]) for v in videos):
-            entry["videos"] = []
-            entry.pop("scraped_at", None)
-            cleared += 1
-    if cleared:
-        save_video_map(video_map, site_key)
-    return cleared
-
-
 MAX_RETRIES = 2
 
 
```
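The removed iframe fallback relied on a regex whose `[^"?]+` character class deliberately stops before any `?token=...` query string, so only the tokenless embed URL was stored. A self-contained demo of that behavior; the markup below is hypothetical:

```python
import re


def extract_bunny_embed_url(html: str) -> str | None:
    """Return a tokenless Bunny.net embed URL found in an iframe, or None."""
    m = re.search(
        r'<iframe[^>]+src="(https://player\.mediadelivery\.net/embed/[^"?]+)',
        html,
    )
    return m.group(1) if m else None


html = (
    '<iframe src="https://player.mediadelivery.net/embed/123/abc-guid'
    '?token=sig&expires=9" allowfullscreen></iframe>'
)
print(extract_bunny_embed_url(html))
# https://player.mediadelivery.net/embed/123/abc-guid  (signed token stripped)
```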
```diff
@@ -401,9 +361,7 @@ async def worker(
 
     page.on(
         "response",
-        lambda resp: video_hits.add(resp.url)
-        if (_is_video_url(resp.url) or _is_bunny_playlist(resp.url))
-        else None,
+        lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
     )
 
     try:
```
```diff
@@ -418,7 +376,7 @@ async def worker(
             print(f"[W{worker_id}] ({idx + 1}/{total}) {url}{label}")
 
             try:
-                await page.goto(url, wait_until="load", timeout=60000)
+                await page.goto(url, wait_until="networkidle", timeout=60000)
             except Exception as e:
                 print(f"[W{worker_id}] Navigation error: {e}")
                 if expects_video(url) and attempt < MAX_RETRIES:
```
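The `wait_until` flip above trades speed for capture reliability: Playwright's "load" state resolves at the window load event, while "networkidle" waits until there have been no network connections for at least 500 ms, which gives lazily initialized players time to issue the media requests the response listener intercepts. A minimal sketch of the post-change navigation, with a hypothetical URL:

```python
import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        page = await browser.new_page()
        hits: set[str] = set()
        page.on("response", lambda resp: hits.add(resp.url))
        # "networkidle" resolves only after ~500 ms of network silence, so
        # late video requests are already in `hits` when goto() returns.
        await page.goto("https://example.com/", wait_until="networkidle", timeout=60000)
        await browser.close()
        print(f"{len(hits)} responses captured")


asyncio.run(main())
```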
```diff
@@ -493,29 +451,18 @@ async def worker(
             found = set(html_videos) | set(video_hits)
             video_hits.clear()
 
-            print(f"[W{worker_id}] network hits raw: {found or '(empty)'}")
-
-            all_videos = [
-                m
-                for m in found
-                if is_valid_url(m)
-                and not _is_bunny_junk(m)
-                and m
-                not in (
-                    f"{base_url}/wp-content/plugins/easy-video-player/lib/blank.mp4",
-                )
-            ]
-
-            if not all_videos:
-                embed_url = extract_bunny_embed_url(html)
-                if embed_url:
-                    print(
-                        f"[W{worker_id}] No network hit — iframe fallback: {embed_url}"
-                    )
-                    all_videos = [embed_url]
-
             async with map_lock:
-                new_found = set(all_videos) - known
+                new_found = found - known
                 if new_found:
                     print(f"[W{worker_id}] Found {len(new_found)} new video URLs")
                     known.update(new_found)
```
```diff
@@ -572,12 +519,6 @@ async def run_for_site(
     urls = load_post_urls(site_key, base_url, wp_api, req_headers)
 
     video_map = load_video_map(site_key)
-    junk_cleared = _clear_junk_video_entries(video_map, site_key)
-    if junk_cleared:
-        print(
-            f"[{site_key}] Cleared {junk_cleared} entries with junk CDN init segments — will re-scrape."
-        )
-
     if any(
         u not in video_map
         or not video_map[u].get("title")
```
```diff
@@ -1,5 +1,3 @@
 playwright==1.58.0
 python-dotenv==1.2.1
 Requests==2.32.5
-yt-dlp>=2026.3.17
-pycryptodomex>=3.23.0
```
```diff
@@ -125,12 +125,11 @@ def get_channel_id(base: str, token: str, channel_name: str) -> int:
 def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
     """Paginate through the channel and return a Counter of video names."""
     counts: Counter[str] = Counter()
-    page_size = 25
     start = 0
     while True:
         r = requests.get(
             f"{base}/api/v1/video-channels/{channel_name}/videos",
-            params={"start": start, "count": page_size},
+            params={"start": start, "count": 100},
             headers=api_headers(token),
             timeout=30,
         )
```
```diff
@@ -138,7 +137,7 @@ def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter
         data = r.json()
         for v in data.get("data", []):
             counts[v["name"]] += 1
-        start += page_size
+        start += 100
         if start >= data.get("total", 0):
             break
     return counts
```
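The two hunks above replace the `page_size` variable with a literal `100` in both the request params and the cursor increment, so the two literals must now be kept in sync by hand. A self-contained sketch of the resulting loop, assuming a standard Bearer-token header in place of the script's `api_headers` helper:

```python
from collections import Counter

import requests


def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
    """Paginate through the channel and return a Counter of video names."""
    counts: Counter[str] = Counter()
    start = 0
    while True:
        r = requests.get(
            f"{base}/api/v1/video-channels/{channel_name}/videos",
            # 100 is PeerTube's documented maximum page size; the same
            # literal must be used in the cursor increment below.
            params={"start": start, "count": 100},
            headers={"Authorization": f"Bearer {token}"},  # stand-in for api_headers()
            timeout=30,
        )
        r.raise_for_status()
        data = r.json()
        for v in data.get("data", []):
            counts[v["name"]] += 1
        start += 100
        if start >= data.get("total", 0):
            break
    return counts
```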
+10 -435 (file diff suppressed because one or more lines are too long)