Compare commits

..

1 Commits

Author SHA1 Message Date
HugeFrog24 4a5b0a6ee3 Multi-gooner support because who knows 2026-03-01 01:36:01 +01:00
15 changed files with 3103 additions and 17277 deletions
-5
View File
@@ -13,11 +13,6 @@ PINKCUFFS_USERNAME=your-email-or-username
PINKCUFFS_PASSWORD=your-password PINKCUFFS_PASSWORD=your-password
PINKCUFFS_LOGIN_COOKIE=wordpress_logged_in_<hash>=<value> PINKCUFFS_LOGIN_COOKIE=wordpress_logged_in_<hash>=<value>
# femuniverse.com credentials (separate membership)
FEMUNIVERSE_USERNAME=your-email-or-username
FEMUNIVERSE_PASSWORD=your-password
FEMUNIVERSE_LOGIN_COOKIE=wordpress_logged_in_<hash>=<value>
# PeerTube upload target # PeerTube upload target
PEERTUBE_URL=https://your-peertube-instance.example PEERTUBE_URL=https://your-peertube-instance.example
PEERTUBE_USER=admin PEERTUBE_USER=admin
@@ -0,0 +1,14 @@
---
alwaysApply: true
always_on: true
trigger: always_on
applyTo: "**"
description: Snyk Security At Inception
---
# Project security best practices
- Always run snyk_code_scan tool for new first party code that is generated in a Snyk-supported language.
- If any security issues are found based on newly introduced or modified code or dependencies, attempt to fix the issues using the results context from Snyk.
- Rescan the code after fixing the issues to ensure that the issues were fixed and that there are no newly introduced issues.
- Repeat this process until no new issues are found.
-2
View File
@@ -38,8 +38,6 @@ jobs:
JAILBIRDZ_PASSWORD: ${{ secrets.JAILBIRDZ_PASSWORD }} JAILBIRDZ_PASSWORD: ${{ secrets.JAILBIRDZ_PASSWORD }}
PINKCUFFS_USERNAME: ${{ secrets.PINKCUFFS_USERNAME }} PINKCUFFS_USERNAME: ${{ secrets.PINKCUFFS_USERNAME }}
PINKCUFFS_PASSWORD: ${{ secrets.PINKCUFFS_PASSWORD }} PINKCUFFS_PASSWORD: ${{ secrets.PINKCUFFS_PASSWORD }}
FEMUNIVERSE_USERNAME: ${{ secrets.FEMUNIVERSE_USERNAME }}
FEMUNIVERSE_PASSWORD: ${{ secrets.FEMUNIVERSE_PASSWORD }}
- name: Commit updated video_map.json - name: Commit updated video_map.json
if: always() # save progress even if main.py crashed or timed out if: always() # save progress even if main.py crashed or timed out
-3
View File
@@ -1,6 +1,3 @@
# Local Claude instructions
.claude/settings.local.json
# Temporary cache # Temporary cache
__pycache__/ __pycache__/
.ruff_cache/ .ruff_cache/
+8 -16
View File
@@ -1,6 +1,6 @@
# 𝒥𝒶𝒾𝓁𝒷𝒾𝓇𝒹𝓏-𝒹𝓁 # 𝒥𝒶𝒾𝓁𝒷𝒾𝓇𝒹𝓏-𝒹𝓁
Jailbirdz.com, Pinkcuffs.com, and Femuniverse.com are Arizona-based subscription video sites publishing arrest and jail roleplay scenarios featuring women. This tool scrapes the member area of any combination of these sites, downloads the videos, and re-hosts them on a self-owned PeerTube instance. Jailbirdz.com and Pinkcuffs.com are Arizona-based subscription video sites publishing arrest and jail roleplay scenarios featuring women. This tool scrapes the member area of one or both sites, downloads the videos, and re-hosts them on a self-owned PeerTube instance.
> [!NOTE] > [!NOTE]
> This tool does not bypass authentication, modify the site, or intercept anything it isn't entitled to. A valid, paid membership is required. The scraper authenticates using your own session cookie and accesses only content your account can already view in a browser. > This tool does not bypass authentication, modify the site, or intercept anything it isn't entitled to. A valid, paid membership is required. The scraper authenticates using your own session cookie and accesses only content your account can already view in a browser.
@@ -23,9 +23,9 @@ cp .env.example .env
Set credentials for whichever sites you have a membership on. You don't need both. Set credentials for whichever sites you have a membership on. You don't need both.
**Option A — credentials (recommended):** set `JAILBIRDZ_USERNAME` + `JAILBIRDZ_PASSWORD` (and/or the `PINKCUFFS_*` / `FEMUNIVERSE_*` equivalents) in `.env`. `main.py` logs in automatically on startup. **Option A — credentials (recommended):** set `JAILBIRDZ_USERNAME` + `JAILBIRDZ_PASSWORD` (and/or the `PINKCUFFS_*` equivalents) in `.env`. `main.py` logs in automatically on startup.
**Option B — manual cookie:** set `JAILBIRDZ_LOGIN_COOKIE` (and/or `PINKCUFFS_LOGIN_COOKIE` / `FEMUNIVERSE_LOGIN_COOKIE`) yourself. Get the value from browser DevTools → Storage → Cookies — copy the full `name=value` of the `wordpress_logged_in_*` cookie. **Option B — manual cookie:** set `JAILBIRDZ_LOGIN_COOKIE` (and/or `PINKCUFFS_LOGIN_COOKIE`) yourself. Get the value from browser DevTools → Storage → Cookies — copy the full `name=value` of the `wordpress_logged_in_*` cookie.
Sites with no credentials are skipped automatically when running `python main.py`. Sites with no credentials are skipped automatically when running `python main.py`.
@@ -35,8 +35,6 @@ Sites with no credentials are skipped automatically when running `python main.py
- `JAILBIRDZ_LOGIN_COOKIE` — jailbirdz.com session cookie (fallback). - `JAILBIRDZ_LOGIN_COOKIE` — jailbirdz.com session cookie (fallback).
- `PINKCUFFS_USERNAME` / `PINKCUFFS_PASSWORD` — pinkcuffs.com login. - `PINKCUFFS_USERNAME` / `PINKCUFFS_PASSWORD` — pinkcuffs.com login.
- `PINKCUFFS_LOGIN_COOKIE` — pinkcuffs.com session cookie (fallback). - `PINKCUFFS_LOGIN_COOKIE` — pinkcuffs.com session cookie (fallback).
- `FEMUNIVERSE_USERNAME` / `FEMUNIVERSE_PASSWORD` — femuniverse.com login.
- `FEMUNIVERSE_LOGIN_COOKIE` — femuniverse.com session cookie (fallback).
- `PEERTUBE_URL` — base URL of your PeerTube instance. - `PEERTUBE_URL` — base URL of your PeerTube instance.
- `PEERTUBE_USER` — PeerTube username. - `PEERTUBE_USER` — PeerTube username.
- `PEERTUBE_CHANNEL` — channel to upload to. - `PEERTUBE_CHANNEL` — channel to upload to.
@@ -46,13 +44,12 @@ Sites with no credentials are skipped automatically when running `python main.py
### 1. Scrape ### 1. Scrape
Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V, HLS/M3U8). Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V).
```bash ```bash
python main.py # scrape all sites you have credentials for python main.py # scrape all sites you have credentials for
python main.py --site jailbirdz # scrape one site only python main.py --site jailbirdz # scrape one site only
python main.py --site pinkcuffs --site jailbirdz # explicit multi-site python main.py --site pinkcuffs --site jailbirdz # explicit multi-site
python main.py --site femuniverse # femuniverse only
``` ```
Results are written to `video_map.json`. Safe to re-run — already-scraped posts are skipped. Results are written to `video_map.json`. Safe to re-run — already-scraped posts are skipped.
@@ -65,11 +62,11 @@ python download.py [options]
Options: Options:
-o, --output DIR Download directory (default: downloads) -o, --output DIR Download directory (default: downloads)
-t, --titles Name files by post title -t, --titles Name files by post title
--original Name files by original filename derived from the video URL (default) --original Name files by original CloudFront filename (default)
--reorganize Rename existing files to match current naming mode --reorganize Rename existing files to match current naming mode
-w, --workers N Concurrent downloads (default: 4) -w, --workers N Concurrent downloads (default: 4)
-n, --dry-run Print what would be downloaded -n, --dry-run Print what would be downloaded
--site SITE Limit to one site (jailbirdz, pinkcuffs, or femuniverse); repeatable --site SITE Limit to one site (jailbirdz or pinkcuffs); repeatable
``` ```
Resumes partial downloads. The chosen naming mode is saved to `.naming_mode` inside the output directory and persists across runs. Filenames that would clash are placed into subfolders. Resumes partial downloads. The chosen naming mode is saved to `.naming_mode` inside the output directory and persists across runs. Filenames that would clash are placed into subfolders.
@@ -108,10 +105,6 @@ gh secret set JAILBIRDZ_PASSWORD
# pinkcuffs (if you have a membership) # pinkcuffs (if you have a membership)
gh secret set PINKCUFFS_USERNAME gh secret set PINKCUFFS_USERNAME
gh secret set PINKCUFFS_PASSWORD gh secret set PINKCUFFS_PASSWORD
# femuniverse (if you have a membership)
gh secret set FEMUNIVERSE_USERNAME
gh secret set FEMUNIVERSE_PASSWORD
``` ```
**Seed CI with your current progress before the first run:** **Seed CI with your current progress before the first run:**
@@ -135,11 +128,10 @@ Lists filenames that map to more than one source URL, with sizes.
### Estimate total download size ### Estimate total download size
```bash ```bash
python total_size.py # read cached sizes and print summary python total_size.py
python total_size.py --write # probe uncached/stale URLs and refresh the cache
``` ```
Reads cached file sizes from `video_map.json` and prints a summary (total, smallest, largest, average). The default mode never hits the network. Use `--write` to probe any missing or stale entries and persist the results. Fetches `Content-Length` for every video URL in `video_map.json` and prints a size summary. Does not download anything.
## Data files ## Data files
+27 -79
View File
@@ -18,7 +18,7 @@ Importable functions:
from collections import defaultdict from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path, PurePosixPath from pathlib import Path, PurePosixPath
from typing import Any, cast from typing import Any, Optional, cast
from collections.abc import Callable from collections.abc import Callable
from urllib.parse import urlparse, unquote from urllib.parse import urlparse, unquote
@@ -121,21 +121,18 @@ def save_video_map(
def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]: def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
"""Pure function: return {cdn_video_url: referer} from a flat video map. """Pure function: return {cdn_video_url: site_referer} from a flat video map.
Bunny.net CDN URLs require https://player.mediadelivery.net/ as referer. The flat video map has page URLs as keys; the scheme+netloc of each page URL
All other URLs use the scheme+netloc of the page they were found on. is used as the Referer for all CDN video URLs found in that entry.
""" """
result: dict[str, str] = {} result: dict[str, str] = {}
for page_url, entry in video_map.items(): for page_url, entry in video_map.items():
parsed = urlparse(page_url) parsed = urlparse(page_url)
site_referer = f"{parsed.scheme}://{parsed.netloc}/" referer = f"{parsed.scheme}://{parsed.netloc}/"
for vid in cast(dict[str, Any], entry).get("videos", []): for vid in cast(dict[str, Any], entry).get("videos", []):
vid_url = vid["url"] if isinstance(vid, str):
if urlparse(vid_url).netloc.endswith(".b-cdn.net"): result.setdefault(vid, referer)
result.setdefault(vid_url, "https://player.mediadelivery.net/")
else:
result.setdefault(vid_url, site_referer)
return result return result
@@ -151,17 +148,8 @@ def fmt_size(b: float | int) -> str:
return f"{b:.1f} TB" return f"{b:.1f} TB"
def is_hls_url(url: str) -> bool:
"""True if url is an HLS master playlist (.m3u8)."""
return urlparse(url).path.endswith(".m3u8")
def url_to_filename(url: str) -> str: def url_to_filename(url: str) -> str:
path = PurePosixPath(urlparse(url).path) return unquote(PurePosixPath(urlparse(url).path).name)
# Bunny.net HLS: .../guid/playlist.m3u8 → guid.mp4
if path.name == "playlist.m3u8":
return unquote(path.parent.name) + ".mp4"
return unquote(path.name)
def find_clashes(urls: list[str]) -> dict[str, list[str]]: def find_clashes(urls: list[str]) -> dict[str, list[str]]:
@@ -178,72 +166,27 @@ def find_clashes(urls: list[str]) -> dict[str, list[str]]:
} }
def _path_folders(url: str) -> list[str]: def _clash_subfolder(url: str) -> str:
"""Decoded URL path segments above the filename (filename excluded).""" """Parent path segment used as disambiguator for clashing filenames."""
parts = [unquote(p) for p in urlparse(url).path.split("/") if p] parts = urlparse(url).path.rstrip("/").split("/")
return parts[:-1] return unquote(parts[-2]) if len(parts) >= 2 else "unknown"
def _disambiguate_group(group: list[str]) -> dict[str, tuple[str, ...]]:
"""Find the smallest depth of trailing folder segments that gives every URL in the group
a unique subfolder path. Returns {url: subfolder_segments}.
Comparison is case-insensitive so the result is safe on NTFS/APFS as well as ext4.
"""
folders = {u: _path_folders(u) for u in group}
max_depth = max((len(f) for f in folders.values()), default=0)
for depth in range(1, max_depth + 1):
keys = {u: tuple(p.lower() for p in folders[u][-depth:]) for u in group}
if len(set(keys.values())) == len(group):
return {u: tuple(folders[u][-depth:]) for u in group}
raise RuntimeError(
f"Cannot disambiguate URL group sharing filename and full parent path: {group}"
)
def build_download_paths( def build_download_paths(
urls: list[str], urls: list[str],
output_dir: str | Path, output_dir: str | Path,
) -> dict[str, Path]: ) -> dict[str, Path]:
"""Map each URL to a unique local file path. """Map each URL to a local file path. Flat layout; clashing names get a subfolder."""
clashes = find_clashes(urls)
clash_lower = {name.lower() for name in clashes}
Unique filenames go directly under output_dir. Filenames that clash paths = {}
(case-insensitively) get the smallest tail of their URL path prepended
that makes every URL in the clashing group unique — e.g. /2018/Daisy/foo.mp4
and /2023/Daisy/foo.mp4 land at 2018/Daisy/foo.mp4 and 2023/Daisy/foo.mp4
rather than colliding at Daisy/foo.mp4.
"""
by_lower: defaultdict[str, list[str]] = defaultdict(list)
for url in urls: for url in urls:
by_lower[url_to_filename(url).lower()].append(url) filename = url_to_filename(url)
if filename.lower() in clash_lower:
base = Path(output_dir) paths[url] = Path(output_dir) / _clash_subfolder(url) / filename
paths: dict[str, Path] = {} else:
paths[url] = Path(output_dir) / filename
for group in by_lower.values():
if len(group) == 1:
url = group[0]
paths[url] = base / url_to_filename(url)
continue
subfolders = _disambiguate_group(group)
for url in group:
paths[url] = base.joinpath(*subfolders[url]) / url_to_filename(url)
# Defensive: every URL must map to a distinct destination path.
# Case-fold the comparison since callers commonly run on NTFS/APFS where
# "Daisy/foo" and "daisy/foo" are the same file on disk.
seen: dict[str, str] = {}
for url, p in paths.items():
key = str(p).lower()
if key in seen:
raise RuntimeError(
f"Path collision after disambiguation: {url!r} and {seen[key]!r} "
f"both map to {p}"
)
seen[key] = url
return paths return paths
@@ -309,7 +252,12 @@ def fetch_sizes(
def main() -> None: def main() -> None:
vm = load_video_map() vm = load_video_map()
urls = [vid["url"] for entry in vm.values() for vid in entry.get("videos", [])] urls = [
u
for entry in vm.values()
for u in entry.get("videos", [])
if u.startswith("http")
]
clashes = find_clashes(urls) clashes = find_clashes(urls)
-8
View File
@@ -1,9 +1,6 @@
# config.py # config.py
from typing import Final from typing import Final
# How long a cached file size stays valid. 0 = always re-probe; large = effectively forever.
SIZE_CACHE_TTL: Final[int] = 9_999_999 # seconds (~115 days)
SITES: Final[dict[str, dict[str, str]]] = { SITES: Final[dict[str, dict[str, str]]] = {
"jailbirdz": { "jailbirdz": {
"base_url": "https://www.jailbirdz.com", "base_url": "https://www.jailbirdz.com",
@@ -15,9 +12,4 @@ SITES: Final[dict[str, dict[str, str]]] = {
"cookie_domain": "pinkcuffs.com", "cookie_domain": "pinkcuffs.com",
"env_prefix": "PINKCUFFS", "env_prefix": "PINKCUFFS",
}, },
"femuniverse": {
"base_url": "https://www.femuniverse.com",
"cookie_domain": "femuniverse.com",
"env_prefix": "FEMUNIVERSE",
},
} }
+1157 -4158
View File
File diff suppressed because one or more lines are too long
+20 -108
View File
@@ -14,13 +14,10 @@ import argparse
from pathlib import Path from pathlib import Path
import re import re
import shutil import shutil
import subprocess
import sys
from collections import defaultdict from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any from typing import Any, Optional
import requests import requests
import time
from check_clashes import ( from check_clashes import (
make_session, make_session,
@@ -31,9 +28,7 @@ from check_clashes import (
build_url_referers, build_url_referers,
fetch_sizes, fetch_sizes,
load_video_map, load_video_map,
save_video_map,
is_valid_url, is_valid_url,
is_hls_url,
VIDEO_MAP_FILE, VIDEO_MAP_FILE,
) )
from config import SITES from config import SITES
@@ -209,30 +204,6 @@ def reorganize(
# ── Download ───────────────────────────────────────────────────────── # ── Download ─────────────────────────────────────────────────────────
def download_hls(url: str, dest: Path, referer: str = "") -> tuple[str, int]:
"""Download an HLS stream via yt-dlp. Returns (status, bytes_written)."""
dest.parent.mkdir(parents=True, exist_ok=True)
if dest.exists():
return "ok", 0
cmd = [
sys.executable, "-m", "yt_dlp",
"--quiet", "--no-warnings",
"--referer", referer or "https://player.mediadelivery.net/",
"-o", str(dest),
url,
]
try:
proc = subprocess.run(cmd, capture_output=True, text=True)
if proc.returncode != 0:
lines = (proc.stderr or proc.stdout).strip().splitlines()
return f"error: {lines[-1] if lines else 'yt-dlp failed'}", 0
if not dest.exists():
return "error: output file missing after yt-dlp", 0
return "ok", dest.stat().st_size
except Exception as e:
return f"error: {e}", 0
def download_one( def download_one(
session: requests.Session, session: requests.Session,
url: str, url: str,
@@ -293,13 +264,12 @@ def download_one(
def collect_urls(video_map: dict[str, Any]) -> list[str]: def collect_urls(video_map: dict[str, Any]) -> list[str]:
urls, seen, skipped = [], set(), 0 urls, seen, skipped = [], set(), 0
for entry in video_map.values(): for entry in video_map.values():
for vid in entry.get("videos", []): for video_url in entry.get("videos", []):
u = vid["url"] if video_url in seen:
if u in seen:
continue continue
seen.add(u) seen.add(video_url)
if is_valid_url(u): if is_valid_url(video_url):
urls.append(u) urls.append(video_url)
else: else:
skipped += 1 skipped += 1
if skipped: if skipped:
@@ -311,41 +281,12 @@ def build_url_title_map(video_map: dict[str, Any]) -> dict[str, str]:
url_title = {} url_title = {}
for entry in video_map.values(): for entry in video_map.values():
title = entry.get("title", "") title = entry.get("title", "")
for vid in entry.get("videos", []): for video_url in entry.get("videos", []):
if vid["url"] not in url_title: if video_url not in url_title:
url_title[vid["url"]] = title url_title[video_url] = title
return url_title return url_title
def _persist_fetched_sizes(newly_fetched: dict[str, int | None]) -> None:
"""Write newly probed sizes back to video_map.json (successful probes only)."""
now = int(time.time())
for site_key in SITES:
vm_site = load_video_map(site_key)
changed = False
for entry in vm_site.values():
for vid in entry.get("videos", []):
if vid["url"] in newly_fetched and vid.get("size") is None and newly_fetched[vid["url"]] is not None:
vid["size"] = newly_fetched[vid["url"]]
vid["size_checked_at"] = now
changed = True
if changed:
save_video_map(vm_site, site_key)
n_saved = sum(1 for s in newly_fetched.values() if s is not None)
if n_saved:
print(f"[+] Cached {n_saved} newly probed size(s).")
def build_url_to_site() -> dict[str, str]:
"""Return {cdn_video_url: site_key} by loading each site's map in turn."""
result: dict[str, str] = {}
for site_key in SITES:
for entry in load_video_map(site_key).values():
for vid in entry.get("videos", []):
result[vid["url"]] = site_key
return result
# ── Main ───────────────────────────────────────────────────────────── # ── Main ─────────────────────────────────────────────────────────────
@@ -400,7 +341,11 @@ def main() -> None:
url_referers = build_url_referers(video_map) url_referers = build_url_referers(video_map)
urls = collect_urls(video_map) urls = collect_urls(video_map)
url_to_site = build_url_to_site() url_to_site: dict[str, str] = {}
for site_key in SITES:
for entry in load_video_map(site_key).values():
for vid_url in entry.get("videos", []):
url_to_site[vid_url] = site_key
if args.sites: if args.sites:
selected = set(args.sites) selected = set(args.sites)
@@ -411,7 +356,7 @@ def main() -> None:
saved = read_mode(args.output) saved = read_mode(args.output)
mode_changed = saved is not None and saved != mode mode_changed = saved is not None and saved != mode
print(f"[+] {len(urls)} video URLs from {VIDEO_MAP_FILE}") print(f"[+] {len(urls)} MP4 URLs from {VIDEO_MAP_FILE}")
print(f"[+] Naming mode: {mode}" + (" (changed!)" if mode_changed else "")) print(f"[+] Naming mode: {mode}" + (" (changed!)" if mode_changed else ""))
# Handle reorganize # Handle reorganize
@@ -464,45 +409,17 @@ def main() -> None:
print(f" … and {len(pending) - 20} more") print(f" … and {len(pending) - 20} more")
return return
cached_sizes: dict[str, int] = { print("\n[+] Fetching remote file sizes…")
vid["url"]: vid["size"]
for entry in video_map.values()
for vid in entry.get("videos", [])
if vid.get("size") is not None
}
newly_fetched: dict[str, int | None] = {}
uncached_pending = [u for u in pending if u not in cached_sizes and not is_hls_url(u)]
session = make_session() session = make_session()
if uncached_pending: remote_sizes = fetch_sizes(pending, workers=20, url_referers=url_referers)
print(
f"\n[+] Fetching remote file sizes ({len(uncached_pending)} uncached, {len(pending) - len(uncached_pending)} cached)…"
)
fetched_pending = fetch_sizes(uncached_pending, workers=20, url_referers=url_referers)
newly_fetched.update(fetched_pending)
remote_sizes: dict[str, int | None] = {**cached_sizes, **fetched_pending}
else:
print(f"\n[+] All {len(pending)} pending sizes cached — skipping probe.")
remote_sizes = dict(cached_sizes)
sized = {u: s for u, s in remote_sizes.items() if s is not None} sized = {u: s for u, s in remote_sizes.items() if s is not None}
total_bytes = sum(sized.values()) total_bytes = sum(sized.values())
print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files") print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")
already_sizes: dict[str, int | None] = {} if already:
already_to_verify = [u for u in already if not is_hls_url(u)] print(f"[+] Verifying {len(already)} existing files…")
if already_to_verify: already_sizes = fetch_sizes(already, workers=20, url_referers=url_referers)
uncached_already = [u for u in already_to_verify if u not in cached_sizes]
if uncached_already:
print(
f"[+] Verifying {len(already_to_verify)} existing files ({len(uncached_already)} uncached)…"
)
fetched_already = fetch_sizes(uncached_already, workers=20, url_referers=url_referers)
newly_fetched.update(fetched_already)
already_sizes = {**cached_sizes, **fetched_already}
else:
print(f"[+] Verifying {len(already_to_verify)} existing files (all sizes cached)…")
already_sizes = dict(cached_sizes)
mismatched = 0 mismatched = 0
for url in already: for url in already:
@@ -521,9 +438,6 @@ def main() -> None:
if mismatched: if mismatched:
print(f"[!] {mismatched} file(s) will be re-downloaded due to size mismatch") print(f"[!] {mismatched} file(s) will be re-downloaded due to size mismatch")
if newly_fetched:
_persist_fetched_sizes(newly_fetched)
print(f"\n[⚡] Downloading with {args.workers} threads…\n") print(f"\n[⚡] Downloading with {args.workers} threads…\n")
completed = 0 completed = 0
@@ -534,8 +448,6 @@ def main() -> None:
def do_download(url: str) -> tuple[str, tuple[str, int]]: def do_download(url: str) -> tuple[str, tuple[str, int]]:
dest = paths[url] dest = paths[url]
if is_hls_url(url):
return url, download_hls(url, dest, url_referers.get(url, ""))
expected = remote_sizes.get(url) expected = remote_sizes.get(url)
return url, download_one( return url, download_one(
session, url, dest, expected, url_referers.get(url, "") session, url, dest, expected, url_referers.get(url, "")
-5
View File
@@ -14,12 +14,10 @@ import os
from pathlib import Path from pathlib import Path
from typing import Literal from typing import Literal
import requests import requests
from dotenv import load_dotenv
from config import SITES from config import SITES
ENV_FILE = Path(".env") ENV_FILE = Path(".env")
COOKIE_PREFIX = "wordpress_logged_in_" COOKIE_PREFIX = "wordpress_logged_in_"
load_dotenv(dotenv_path=ENV_FILE)
def update_env( def update_env(
@@ -74,9 +72,6 @@ def login_and_get_cookie(
"Referer": f"{base_url}/", "Referer": f"{base_url}/",
"Origin": base_url, "Origin": base_url,
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"X-Requested-With": "XMLHttpRequest",
"Accept": "*/*",
}, },
timeout=30, timeout=30,
) )
+8 -73
View File
@@ -6,7 +6,7 @@ import signal
import asyncio import asyncio
import requests import requests
from pathlib import PurePosixPath from pathlib import PurePosixPath
from typing import Any from typing import Any, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
from dotenv import load_dotenv from dotenv import load_dotenv
from playwright.async_api import async_playwright, BrowserContext from playwright.async_api import async_playwright, BrowserContext
@@ -335,46 +335,6 @@ def extract_title_from_html(html: str) -> str | None:
return None return None
def _is_bunny_playlist(url: str) -> bool:
"""True if url is the root Bunny.net HLS playlist (not a sub-playlist)."""
parsed = urlparse(url)
return (
parsed.netloc.endswith(".b-cdn.net")
and parsed.path.endswith("/playlist.m3u8")
)
def _is_bunny_junk(url: str) -> bool:
"""True if url is a Bunny.net CDN init segment (not a usable video URL)."""
parsed = urlparse(url)
return parsed.netloc.endswith(".b-cdn.net") and PurePosixPath(
parsed.path
).name in {"init.mp4", "init.dmp4"}
def extract_bunny_embed_url(html: str) -> str | None:
"""Return a tokenless Bunny.net embed URL found in an iframe, or None."""
m = re.search(
r'<iframe[^>]+src="(https://player\.mediadelivery\.net/embed/[^"?]+)',
html,
)
return m.group(1) if m else None
def _clear_junk_video_entries(video_map: dict[str, Any], site_key: str) -> int:
"""Reset entries whose only stored videos are CDN init segments. Returns count fixed."""
cleared = 0
for entry in video_map.values():
videos = entry.get("videos", [])
if videos and all(_is_bunny_junk(v["url"]) for v in videos):
entry["videos"] = []
entry.pop("scraped_at", None)
cleared += 1
if cleared:
save_video_map(video_map, site_key)
return cleared
MAX_RETRIES = 2 MAX_RETRIES = 2
@@ -401,9 +361,7 @@ async def worker(
page.on( page.on(
"response", "response",
lambda resp: video_hits.add(resp.url) lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
if (_is_video_url(resp.url) or _is_bunny_playlist(resp.url))
else None,
) )
try: try:
@@ -418,7 +376,7 @@ async def worker(
print(f"[W{worker_id}] ({idx + 1}/{total}) {url}{label}") print(f"[W{worker_id}] ({idx + 1}/{total}) {url}{label}")
try: try:
await page.goto(url, wait_until="load", timeout=60000) await page.goto(url, wait_until="networkidle", timeout=60000)
except Exception as e: except Exception as e:
print(f"[W{worker_id}] Navigation error: {e}") print(f"[W{worker_id}] Navigation error: {e}")
if expects_video(url) and attempt < MAX_RETRIES: if expects_video(url) and attempt < MAX_RETRIES:
@@ -493,29 +451,18 @@ async def worker(
found = set(html_videos) | set(video_hits) found = set(html_videos) | set(video_hits)
video_hits.clear() video_hits.clear()
print(f"[W{worker_id}] network hits raw: {found or '(empty)'}")
all_videos = [ all_videos = [
m m
for m in found for m in found
if is_valid_url(m) if is_valid_url(m)
and not _is_bunny_junk(m)
and m and m
not in ( not in (
f"{base_url}/wp-content/plugins/easy-video-player/lib/blank.mp4", f"{base_url}/wp-content/plugins/easy-video-player/lib/blank.mp4",
) )
] ]
if not all_videos:
embed_url = extract_bunny_embed_url(html)
if embed_url:
print(
f"[W{worker_id}] No network hit — iframe fallback: {embed_url}"
)
all_videos = [embed_url]
async with map_lock: async with map_lock:
new_found = set(all_videos) - known new_found = found - known
if new_found: if new_found:
print(f"[W{worker_id}] Found {len(new_found)} new video URLs") print(f"[W{worker_id}] Found {len(new_found)} new video URLs")
known.update(new_found) known.update(new_found)
@@ -529,13 +476,9 @@ async def worker(
entry = video_map.get(url, {}) entry = video_map.get(url, {})
if title: if title:
entry["title"] = title entry["title"] = title
existing_dict: dict[str, Any] = { existing_videos = set(entry.get("videos", []))
vid["url"]: vid for vid in entry.get("videos", []) existing_videos.update(all_videos)
} entry["videos"] = sorted(existing_videos)
for vid_url in all_videos:
if vid_url not in existing_dict:
existing_dict[vid_url] = {"url": vid_url}
entry["videos"] = sorted(existing_dict.values(), key=lambda v: v["url"])
mark_done = bool(all_videos) or not expects_video(url) mark_done = bool(all_videos) or not expects_video(url)
if mark_done: if mark_done:
entry["scraped_at"] = int(time.time()) entry["scraped_at"] = int(time.time())
@@ -572,12 +515,6 @@ async def run_for_site(
urls = load_post_urls(site_key, base_url, wp_api, req_headers) urls = load_post_urls(site_key, base_url, wp_api, req_headers)
video_map = load_video_map(site_key) video_map = load_video_map(site_key)
junk_cleared = _clear_junk_video_entries(video_map, site_key)
if junk_cleared:
print(
f"[{site_key}] Cleared {junk_cleared} entries with junk CDN init segments — will re-scrape."
)
if any( if any(
u not in video_map u not in video_map
or not video_map[u].get("title") or not video_map[u].get("title")
@@ -589,9 +526,7 @@ async def run_for_site(
site_key, base_url, wp_api, video_map, urls, req_headers site_key, base_url, wp_api, video_map, urls, req_headers
) )
known = { known = {u for entry in video_map.values() for u in entry.get("videos", [])}
vid["url"] for entry in video_map.values() for vid in entry.get("videos", [])
}
total = len(urls) total = len(urls)
pending = [] pending = []
-2
View File
@@ -1,5 +1,3 @@
playwright==1.58.0 playwright==1.58.0
python-dotenv==1.2.1 python-dotenv==1.2.1
Requests==2.32.5 Requests==2.32.5
yt-dlp>=2026.3.17
pycryptodomex>=3.23.0
+21 -111
View File
@@ -4,19 +4,15 @@ Importable function:
summarize_sizes(sizes) - return dict with total, smallest, largest, average, failed summarize_sizes(sizes) - return dict with total, smallest, largest, average, failed
""" """
import argparse from typing import Optional, TypedDict
import time
from typing import Any, TypedDict
from check_clashes import ( from check_clashes import (
fmt_size, fmt_size,
fetch_sizes, fetch_sizes,
load_video_map, load_video_map,
save_video_map,
build_url_referers, build_url_referers,
VIDEO_MAP_FILE, VIDEO_MAP_FILE,
) )
from config import SITES, SIZE_CACHE_TTL
class SizeStats(TypedDict): class SizeStats(TypedDict):
@@ -29,7 +25,7 @@ class SizeStats(TypedDict):
failed: list[str] failed: list[str]
def summarize_sizes(sizes: dict[str, int | None]) -> SizeStats: def summarize_sizes(sizes: dict[str, Optional[int]]) -> SizeStats:
"""Given {url: size_or_None}, return a stats dict.""" """Given {url: size_or_None}, return a stats dict."""
known = {u: s for u, s in sizes.items() if s is not None} known = {u: s for u, s in sizes.items() if s is not None}
failed = [u for u, s in sizes.items() if s is None] failed = [u for u, s in sizes.items() if s is None]
@@ -55,13 +51,6 @@ def summarize_sizes(sizes: dict[str, int | None]) -> SizeStats:
} }
def _is_stale(vid: dict[str, Any], now: int) -> bool:
"""True if the cached size is absent or older than SIZE_CACHE_TTL seconds."""
if vid.get("size") is None:
return True
return (now - int(vid.get("size_checked_at", 0))) >= SIZE_CACHE_TTL
# --------------- CLI --------------- # --------------- CLI ---------------
@@ -70,7 +59,24 @@ def _progress(done: int, total: int) -> None:
print(f" {done}/{total}") print(f" {done}/{total}")
def _print_stats(stats: SizeStats) -> None: def main() -> None:
vm = load_video_map()
urls: list[str] = [
u
for entry in vm.values()
for u in entry.get("videos", [])
if u.startswith("http")
]
url_referers = build_url_referers(vm)
print(f"[+] {len(urls)} URLs in {VIDEO_MAP_FILE}")
print("[+] Fetching file sizes (20 threads)…\n")
sizes = fetch_sizes(
urls, workers=20, on_progress=_progress, url_referers=url_referers
)
stats = summarize_sizes(sizes)
print(f"\n{'=' * 45}") print(f"\n{'=' * 45}")
print(f" Sized: {stats['sized']}/{stats['total']} files") print(f" Sized: {stats['sized']}/{stats['total']} files")
print(f" Total: {fmt_size(stats['total_bytes'])}") print(f" Total: {fmt_size(stats['total_bytes'])}")
@@ -78,108 +84,12 @@ def _print_stats(stats: SizeStats) -> None:
print(f" Largest: {fmt_size(stats['largest'])}") print(f" Largest: {fmt_size(stats['largest'])}")
print(f" Average: {fmt_size(stats['average'])}") print(f" Average: {fmt_size(stats['average'])}")
print(f"{'=' * 45}") print(f"{'=' * 45}")
if stats["failed"]: if stats["failed"]:
print(f"\n[!] {len(stats['failed'])} URL(s) could not be sized:") print(f"\n[!] {len(stats['failed'])} URL(s) could not be sized:")
for u in stats["failed"]: for u in stats["failed"]:
print(f" {u}") print(f" {u}")
def _cache_hint(fresh: int, stale: int, missing: int) -> str:
parts = [label for count, label in [(fresh, f"{fresh} fresh"), (stale, f"{stale} stale"), (missing, f"{missing} missing")] if count]
if stale or missing:
suffix = " — run --write to refresh" if stale else " — run --write to probe missing"
else:
suffix = " — all current"
return f"Cache: {', '.join(parts)}{suffix}"
def _run_stats() -> None:
    """Print size statistics using only the sizes cached in the video map.

    Each URL is counted once (first occurrence wins) and classified as
    missing, stale or fresh. Nothing is fetched here.
    """
    video_map = load_video_map()
    checked_at = int(time.time())

    size_by_url: dict[str, int | None] = {}
    tally = {"fresh": 0, "stale": 0, "missing": 0}

    for entry in video_map.values():
        for video in entry.get("videos", []):
            link = video["url"]
            if link not in size_by_url:
                cached = video.get("size")
                size_by_url[link] = cached
                if cached is None:
                    bucket = "missing"
                elif _is_stale(video, checked_at):
                    bucket = "stale"
                else:
                    bucket = "fresh"
                tally[bucket] += 1

    print(f"[+] {len(size_by_url)} URLs in {VIDEO_MAP_FILE}")
    hint = _cache_hint(tally["fresh"], tally["stale"], tally["missing"])
    print(f" {hint}")
    _print_stats(summarize_sizes(size_by_url))
def _apply_fetched(vm: dict[str, Any], fetched: dict[str, int | None], now: int) -> None:
for entry in vm.values():
for vid in entry.get("videos", []):
if vid["url"] in fetched:
vid["size"] = fetched[vid["url"]]
vid["size_checked_at"] = now
def _run_write() -> None:
    """Probe uncached sizes and write them into video_map.json.

    For every site in SITES with a non-empty video map, the stale records are
    probed with fetch_sizes, the results are stored on the records, and the
    map is saved. Statistics over everything fetched are printed at the end
    if anything was fetched.
    """
    stamp = int(time.time())
    combined: dict[str, int | None] = {}

    for site_key in SITES:
        site_map = load_video_map(site_key)
        if not site_map:
            continue

        referers = build_url_referers(site_map)

        # Partition once: every record is either stale (to probe) or cached.
        site_videos = [
            video
            for entry in site_map.values()
            for video in entry.get("videos", [])
        ]
        pending: list[str] = [
            video["url"] for video in site_videos if _is_stale(video, stamp)
        ]
        cached_total = len(site_videos) - len(pending)

        print(f"[{site_key}] {cached_total} cached, {len(pending)} to probe…")

        # NOTE(review): the source lost its indentation; only the fetch call
        # is assumed to be conditional, the write-back is assumed to always
        # run — confirm against the original file.
        results: dict[str, int | None] = {}
        if pending:
            results = fetch_sizes(
                pending, workers=20, on_progress=_progress, url_referers=referers
            )

        _apply_fetched(site_map, results, stamp)
        save_video_map(site_map, site_key)
        combined.update(results)
        print(f"[{site_key}] Written.")

    if combined:
        _print_stats(summarize_sizes(combined))
def main() -> None:
    """Command-line entry point.

    With --write / -w, stale sizes are probed and saved via _run_write;
    otherwise _run_stats reports from the cached values.
    """
    cli = argparse.ArgumentParser(description="Calculate total video download size")
    cli.add_argument(
        "--write",
        "-w",
        action="store_true",
        help="Probe uncached sizes and write them into video_map.json",
    )
    options = cli.parse_args()

    action = _run_write if options.write else _run_stats
    action()
if __name__ == "__main__": if __name__ == "__main__":
main() main()
+15 -14
View File
@@ -32,11 +32,11 @@ import requests
from dotenv import load_dotenv from dotenv import load_dotenv
from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS, load_video_map from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS, load_video_map
from config import SITES
from download import ( from download import (
collect_urls, collect_urls,
get_paths_for_mode, get_paths_for_mode,
read_mode, read_mode,
build_url_to_site,
MODE_ORIGINAL, MODE_ORIGINAL,
DEFAULT_OUTPUT, DEFAULT_OUTPUT,
) )
@@ -49,7 +49,6 @@ DEFAULT_BATCH_SIZE = 1
DEFAULT_POLL = 30 DEFAULT_POLL = 30
UPLOADED_FILE = ".uploaded" UPLOADED_FILE = ".uploaded"
PT_NAME_MAX = 120 PT_NAME_MAX = 120
PT_DESC_MIN = 3 # PeerTube rejects descriptions shorter than this
# ── Text helpers ───────────────────────────────────────────────────── # ── Text helpers ─────────────────────────────────────────────────────
@@ -63,11 +62,6 @@ def clean_description(raw: str) -> str:
text = re.sub(r"<[^>]+>", "", text) text = re.sub(r"<[^>]+>", "", text)
text = html.unescape(text) text = html.unescape(text)
text = re.sub(r"\n{3,}", "\n\n", text).strip() text = re.sub(r"\n{3,}", "\n\n", text).strip()
# PeerTube enforces a 3-char minimum on descriptions; a sub-minimum
# description (e.g. a stray ".") makes the upload-init 400. Drop it so
# it's omitted from the request rather than rejected.
if len(text) < PT_DESC_MIN:
return ""
return text[:10000] return text[:10000]
@@ -131,12 +125,11 @@ def get_channel_id(base: str, token: str, channel_name: str) -> int:
def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]: def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
"""Paginate through the channel and return a Counter of video names.""" """Paginate through the channel and return a Counter of video names."""
counts: Counter[str] = Counter() counts: Counter[str] = Counter()
page_size = 25
start = 0 start = 0
while True: while True:
r = requests.get( r = requests.get(
f"{base}/api/v1/video-channels/{channel_name}/videos", f"{base}/api/v1/video-channels/{channel_name}/videos",
params={"start": start, "count": page_size}, params={"start": start, "count": 100},
headers=api_headers(token), headers=api_headers(token),
timeout=30, timeout=30,
) )
@@ -144,7 +137,7 @@ def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter
data = r.json() data = r.json()
for v in data.get("data", []): for v in data.get("data", []):
counts[v["name"]] += 1 counts[v["name"]] += 1
start += page_size start += 100
if start >= data.get("total", 0): if start >= data.get("total", 0):
break break
return counts return counts
@@ -419,7 +412,11 @@ def build_path_to_meta(
urls = collect_urls(video_map) urls = collect_urls(video_map)
mode = read_mode(input_dir) or MODE_ORIGINAL mode = read_mode(input_dir) or MODE_ORIGINAL
url_to_site = build_url_to_site() url_to_site: dict[str, str] = {}
for site_key in SITES:
for entry in load_video_map(site_key).values():
for vid_url in entry.get("videos", []):
url_to_site[vid_url] = site_key
paths = get_paths_for_mode(mode, urls, video_map, input_dir, url_to_site) paths = get_paths_for_mode(mode, urls, video_map, input_dir, url_to_site)
@@ -432,9 +429,13 @@ def build_path_to_meta(
title = t if isinstance(t, str) else "" title = t if isinstance(t, str) else ""
desc = d if isinstance(d, str) else "" desc = d if isinstance(d, str) else ""
for vid in entry.get("videos", []): videos_any = entry.get("videos", [])
if vid["url"] not in url_meta: if isinstance(videos_any, list):
url_meta[vid["url"]] = {"title": title, "description": desc} for video_url_any in videos_any:
if not isinstance(video_url_any, str):
continue
if video_url_any not in url_meta:
url_meta[video_url_any] = {"title": title, "description": desc}
result: dict[Path, dict[str, str]] = {} result: dict[Path, dict[str, str]] = {}
for url, abs_path in paths.items(): for url, abs_path in paths.items():
+1833 -12693
View File
File diff suppressed because one or more lines are too long