Multi-gooner support because who knows

This commit is contained in:
HugeFrog24
2026-03-01 01:36:01 +01:00
parent 80444405e9
commit 4a5b0a6ee3
15 changed files with 16378 additions and 11575 deletions

View File

@@ -5,42 +5,142 @@ Importable functions:
find_clashes(urls) - {filename: [urls]} for filenames with >1 source
build_download_paths(urls, output_dir) - {url: local_path} with clash resolution
fmt_size(bytes) - human-readable size string
get_remote_size(session, url, referer) - file size via HEAD without downloading
fetch_sizes(urls, workers, on_progress, url_referers, session) - bulk size lookup
make_session() - plain requests.Session (Referer is passed per request)
load_video_map(site, path) - load video_map.json; auto-migrates old flat format
save_video_map(video_map, site_key, path) - atomic write of one site's entries
build_url_referers(video_map) - {cdn_url: referer} derived from page URL keys
is_valid_url(url) - True if url is a plain http(s) URL with no HTML artefacts
expects_video(url) - True if url is a members-only video page
"""
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path, PurePosixPath
from typing import Any, Optional, cast
from collections.abc import Callable
from urllib.parse import urlparse, unquote
import json
import os
import tempfile
import requests
from config import BASE_URL
REFERER = f"{BASE_URL}/"
# Default on-disk location of the nested video map: {site_key: {url: entry}}.
VIDEO_MAP_FILE: str = "video_map.json"
# File extensions considered downloadable videos.
VIDEO_EXTS: set[str] = {".mp4", ".mov", ".m4v", ".webm", ".avi"}
def load_video_map():
if Path(VIDEO_MAP_FILE).exists():
def is_valid_url(url: str) -> bool:
    """True if url is a plain http(s) URL with no HTML artefacts (<, >, href= etc.)."""
    if not url.startswith("http"):
        return False
    # Reject strings that still carry scraped-markup residue.
    return not any(artefact in url for artefact in ("<", ">", " href="))
def expects_video(url: str) -> bool:
    """True if url is a members-only video page that should contain a video."""
    # Membership pages live under this fixed path segment.
    return url.find("/pinkcuffs-videos/") != -1
def _write_video_map_atomic(data: dict[str, Any], path: Path) -> None:
    """Write the full nested video_map dict to disk atomically via a temp file.

    The temp file is created in the destination directory so the final
    ``Path.replace`` is a same-filesystem rename (atomic on POSIX).  On any
    failure the orphaned temp file is removed best-effort and the original
    exception is re-raised.

    Args:
        data: The full nested mapping {site_key: {url: entry}}.
        path: Destination JSON file.
    """
    fd, tmp = tempfile.mkstemp(dir=path.resolve().parent, suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        Path(tmp).replace(path)
    except Exception:
        # Best-effort cleanup; never let a secondary unlink() OSError mask
        # the original write/rename failure.
        try:
            Path(tmp).unlink()
        except OSError:
            pass
        raise
def load_video_map(
    site: str | None = None,
    path: str | Path = VIDEO_MAP_FILE,
) -> dict[str, Any]:
    """Load video_map.json.

    Args:
        site: If given, return only that site's inner dict {url: entry}.
            If None, return a flat-merged dict across all sites.
        path: Path to the JSON file (injectable for tests).
    """
    map_path = Path(path)
    if not map_path.exists():
        return {}
    try:
        with open(map_path, encoding="utf-8") as fh:
            data = cast(dict[str, Any], json.load(fh))
    except (OSError, json.JSONDecodeError):
        return {}
    if site is not None:
        return cast(dict[str, Any], data.get(site, {}))
    # No site requested: flatten every site's inner mapping so legacy
    # callers that expect the old flat {url: entry} shape keep working.
    flat: dict[str, Any] = {}
    for entries in data.values():
        if isinstance(entries, dict):
            flat.update(cast(dict[str, Any], entries))
    return flat
def save_video_map(
    video_map: dict[str, Any],
    site_key: str,
    path: str | Path = VIDEO_MAP_FILE,
) -> None:
    """Atomically update one site's entries in the nested video_map.json.

    Args:
        video_map: The inner {url: entry} dict for site_key.
        site_key: Which top-level key to update (e.g. "jailbirdz").
        path: Path to the JSON file (injectable for tests).

    NOTE(review): if the existing file is unreadable/corrupt it is treated
    as empty, so other sites' entries are dropped on the next write —
    confirm this is the intended recovery behaviour.
    """
    p = Path(path)
    full: dict[str, Any] = {}
    if p.exists():
        try:
            with open(p, encoding="utf-8") as f:
                full = cast(dict[str, Any], json.load(f))
        except (json.JSONDecodeError, OSError):
            # Corrupt/unreadable file: start from an empty map rather than
            # aborting the save (previously a stray `return {}` here skipped
            # the write entirely and returned a dict from a None function).
            full = {}
    full[site_key] = video_map
    _write_video_map_atomic(full, p)
def make_session():
s = requests.Session()
s.headers.update({"Referer": REFERER})
return s
def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
    """Pure function: return {cdn_video_url: site_referer} from a flat video map.

    The flat video map has page URLs as keys; the scheme+netloc of each page URL
    is used as the Referer for all CDN video URLs found in that entry.  When a
    CDN URL appears under several pages, the first page seen wins.
    """
    referers: dict[str, str] = {}
    for page_url, entry in video_map.items():
        page = urlparse(page_url)
        origin = "{}://{}/".format(page.scheme, page.netloc)
        videos = cast(dict[str, Any], entry).get("videos", [])
        for video_url in videos:
            # Ignore non-string junk that may have crept into the JSON.
            if isinstance(video_url, str) and video_url not in referers:
                referers[video_url] = origin
    return referers
def fmt_size(b):
def make_session() -> requests.Session:
    """Create a fresh requests.Session with no default headers set."""
    session = requests.Session()
    return session
def fmt_size(b: float | int) -> str:
for unit in ("B", "KB", "MB", "GB"):
if b < 1024:
return f"{b:.1f} {unit}"
@@ -48,30 +148,34 @@ def fmt_size(b):
return f"{b:.1f} TB"
def url_to_filename(url: str) -> str:
    """Return the percent-decoded final path component of *url*."""
    url_path = urlparse(url).path
    basename = PurePosixPath(url_path).name
    return unquote(basename)
def find_clashes(urls: list[str]) -> dict[str, list[str]]:
    """Return {filename: [urls]} for filenames claimed by more than one source.

    Case-insensitive grouping so that e.g. "DaisyArrest.mp4" and
    "daisyarrest.mp4" are treated as a clash. This is required for
    correctness on case-insensitive filesystems (NTFS, exFAT, macOS HFS+)
    and harmless on case-sensitive ones (ext4) — the actual filenames on
    disk keep their original casing; only the clash *detection* is folded.
    """
    by_lower: defaultdict[str, list[str]] = defaultdict(list)
    for url in urls:
        by_lower[url_to_filename(url).lower()].append(url)
    # Key the result on the first URL's original-cased filename.
    return {
        url_to_filename(srcs[0]): srcs for srcs in by_lower.values() if len(srcs) > 1
    }
def _clash_subfolder(url: str) -> str:
    """Parent path segment used as disambiguator for clashing filenames."""
    segments = urlparse(url).path.rstrip("/").split("/")
    if len(segments) < 2:
        return "unknown"
    return unquote(segments[-2])
def build_download_paths(urls, output_dir):
def build_download_paths(
urls: list[str],
output_dir: str | Path,
) -> dict[str, Path]:
"""Map each URL to a local file path. Flat layout; clashing names get a subfolder."""
clashes = find_clashes(urls)
clash_lower = {name.lower() for name in clashes}
@@ -86,16 +190,25 @@ def build_download_paths(urls, output_dir):
return paths
def get_remote_size(session, url):
def get_remote_size(
session: requests.Session,
url: str,
referer: str = "",
) -> int | None:
extra = {"Referer": referer} if referer else {}
try:
r = session.head(url, allow_redirects=True, timeout=15)
r = session.head(url, headers=extra, allow_redirects=True, timeout=15)
if r.status_code < 400 and "Content-Length" in r.headers:
return int(r.headers["Content-Length"])
except Exception:
pass
try:
r = session.get(
url, headers={"Range": "bytes=0-0"}, stream=True, timeout=15)
url,
headers={"Range": "bytes=0-0", **extra},
stream=True,
timeout=15,
)
r.close()
cr = r.headers.get("Content-Range", "")
if "/" in cr:
@@ -105,19 +218,30 @@ def get_remote_size(session, url):
return None
def fetch_sizes(
    urls: list[str],
    workers: int = 20,
    on_progress: Callable[[int, int], None] | None = None,
    url_referers: dict[str, str] | None = None,
    session: requests.Session | None = None,
) -> dict[str, int | None]:
    """Return {url: size_or_None}. on_progress(done, total) called after each URL.

    Args:
        urls: URLs whose remote sizes should be looked up.
        workers: Thread-pool width for concurrent lookups.
        on_progress: Optional callback invoked as on_progress(done, total)
            after every completed URL.
        url_referers: Optional {url: referer} map forwarded to
            get_remote_size (empty string when a URL has no entry).
        session: Optional shared requests.Session; a fresh one is created
            only when omitted (previously the injected session was always
            overwritten by a stray leftover line).
    """
    if session is None:
        session = make_session()
    referers = url_referers or {}
    sizes: dict[str, int | None] = {}
    total = len(urls)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(get_remote_size, session, u, referers.get(u, "")): u
            for u in urls
        }
        done = 0
        for fut in as_completed(futures):
            sizes[futures[fut]] = fut.result()
            done += 1
            if on_progress is not None:
                on_progress(done, total)
    return sizes
@@ -125,14 +249,20 @@ def fetch_sizes(urls, workers=20, on_progress=None):
# --------------- CLI ---------------
def main():
def main() -> None:
vm = load_video_map()
urls = [u for entry in vm.values() for u in entry.get("videos", []) if u.startswith("http")]
urls = [
u
for entry in vm.values()
for u in entry.get("videos", [])
if u.startswith("http")
]
clashes = find_clashes(urls)
print(f"Total URLs: {len(urls)}")
by_name = defaultdict(list)
by_name: defaultdict[str, list[str]] = defaultdict(list)
for url in urls:
by_name[url_to_filename(url)].append(url)
print(f"Unique filenames: {len(by_name)}")
@@ -142,8 +272,9 @@ def main():
return
clash_urls = [u for srcs in clashes.values() for u in srcs]
url_referers = build_url_referers(vm)
print(f"\n[+] Fetching file sizes for {len(clash_urls)} clashing URLs…")
sizes = fetch_sizes(clash_urls)
sizes = fetch_sizes(clash_urls, url_referers=url_referers)
print(f"\n{len(clashes)} filename clash(es):\n")
for name, srcs in sorted(clashes.items()):