mirror of
https://github.com/HugeFrog24/jailbirdz-dl.git
synced 2026-03-02 01:04:31 +00:00
Multi-gooner support because who knows
This commit is contained in:
205
check_clashes.py
205
check_clashes.py
@@ -5,42 +5,142 @@ Importable functions:
|
||||
find_clashes(urls) - {filename: [urls]} for filenames with >1 source
|
||||
build_download_paths(urls, output_dir) - {url: local_path} with clash resolution
|
||||
fmt_size(bytes) - human-readable size string
|
||||
get_remote_size(session, url) - file size via HEAD without downloading
|
||||
fetch_sizes(urls, workers, on_progress) - bulk size lookup
|
||||
get_remote_size(session, url, referer) - file size via HEAD without downloading
|
||||
fetch_sizes(urls, workers, on_progress, url_referers, session) - bulk size lookup
|
||||
make_session() - requests.Session with required headers
|
||||
load_video_map() - load video_map.json, returns {} on missing/corrupt
|
||||
load_video_map(site, path) - load video_map.json; auto-migrates old flat format
|
||||
save_video_map(video_map, site_key, path) - atomic write of one site's entries
|
||||
build_url_referers(video_map) - {cdn_url: referer} derived from page URL keys
|
||||
is_valid_url(url) - True if url is a plain http(s) URL with no HTML artefacts
|
||||
expects_video(url) - True if url is a members-only video page
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path, PurePosixPath
|
||||
from typing import Any, Optional, cast
|
||||
|
||||
from collections.abc import Callable
|
||||
from urllib.parse import urlparse, unquote
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import requests
|
||||
from config import BASE_URL
|
||||
|
||||
REFERER = f"{BASE_URL}/"
|
||||
VIDEO_MAP_FILE = "video_map.json"
|
||||
VIDEO_EXTS = {".mp4", ".mov", ".m4v", ".webm", ".avi"}
|
||||
VIDEO_MAP_FILE: str = "video_map.json"
|
||||
VIDEO_EXTS: set[str] = {".mp4", ".mov", ".m4v", ".webm", ".avi"}
|
||||
|
||||
|
||||
def load_video_map():
|
||||
if Path(VIDEO_MAP_FILE).exists():
|
||||
def is_valid_url(url: str) -> bool:
    """Return True if *url* is a plain http(s) URL with no HTML artefacts.

    A URL is rejected when it does not start with ``http://`` or ``https://``,
    or when it contains ``<``, ``>`` or the substring ``" href="``.

    Args:
        url: Candidate URL string.

    Returns:
        True when the URL passes every check, False otherwise.
    """
    return (
        # Require the full scheme separator so "httpfoo://..." is not accepted.
        url.startswith(("http://", "https://"))
        and "<" not in url
        and ">" not in url
        and " href=" not in url
    )
|
||||
|
||||
|
||||
def expects_video(url: str) -> bool:
    """Return True if *url* is a members-only video page.

    The check is a plain substring test for the ``/pinkcuffs-videos/`` path
    marker; the URL is not parsed.

    Args:
        url: Page URL to classify.

    Returns:
        True when the marker occurs anywhere in *url*.
    """
    return "/pinkcuffs-videos/" in url
|
||||
|
||||
|
||||
def _write_video_map_atomic(data: dict[str, Any], path: Path) -> None:
    """Write the full nested video_map dict to disk atomically via a temp file.

    The JSON is first written to a temporary file created in the same
    directory as *path*, then moved over *path* with ``Path.replace``.

    Args:
        data: The complete mapping to serialise.
        path: Destination file.

    Raises:
        Whatever ``json.dump`` or the filesystem raises; the temporary file is
        removed (best effort) before the exception propagates.
    """
    # Same directory as the target so the final replace stays on one filesystem.
    fd, tmp = tempfile.mkstemp(dir=path.resolve().parent, suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        Path(tmp).replace(path)
    except BaseException:
        # BaseException so an interrupt (Ctrl-C) mid-write does not leave a
        # stray *.tmp file behind; the original error is always re-raised.
        try:
            Path(tmp).unlink()
        except OSError:
            pass
        raise
|
||||
|
||||
|
||||
def load_video_map(
    site: str | None = None,
    path: str | Path = VIDEO_MAP_FILE,
) -> dict[str, Any]:
    """Load video_map.json.

    The file is expected to hold a JSON object keyed by site, each value being
    that site's ``{url: entry}`` dict. A missing, unreadable, undecodable or
    wrongly-shaped file yields an empty dict rather than an exception.

    Args:
        site: If given, return only that site's inner dict {url: entry}.
              If None, return a flat-merged dict across all sites.
        path: Path to the JSON file (injectable for tests).

    Returns:
        The requested mapping, or ``{}`` when nothing usable could be loaded.
    """
    p = Path(path)
    if not p.exists():
        return {}
    try:
        with open(p, encoding="utf-8") as f:
            raw: Any = json.load(f)
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {}

    # Valid JSON that is not an object (list, string, ...) has no site keys.
    if not isinstance(raw, dict):
        return {}
    data = cast(dict[str, Any], raw)

    if site is not None:
        entries = data.get(site, {})
        return cast(dict[str, Any], entries) if isinstance(entries, dict) else {}

    # Merge all sites into a flat dict for backward-compat callers.
    # On duplicate page URLs, later sites overwrite earlier ones.
    merged: dict[str, Any] = {}
    for site_entries in data.values():
        if isinstance(site_entries, dict):
            merged.update(cast(dict[str, Any], site_entries))
    return merged
|
||||
|
||||
|
||||
def save_video_map(
    video_map: dict[str, Any],
    site_key: str,
    path: str | Path = VIDEO_MAP_FILE,
) -> None:
    """Atomically update one site's entries in the nested video_map.json.

    The existing file (if any) is read, ``site_key`` is replaced with
    *video_map*, and the whole structure is written back through
    ``_write_video_map_atomic``. Other sites' entries are preserved. If the
    existing file cannot be read, decoded, or is not a JSON object, it is
    treated as empty and overwritten.

    Args:
        video_map: The inner {url: entry} dict for site_key.
        site_key: Which top-level key to update (e.g. "jailbirdz").
        path: Path to the JSON file (injectable for tests).
    """
    p = Path(path)
    full: dict[str, Any] = {}
    if p.exists():
        try:
            with open(p, encoding="utf-8") as f:
                raw: Any = json.load(f)
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            raw = None
        # Only a JSON object can carry per-site keys; ignore any other shape.
        if isinstance(raw, dict):
            full = cast(dict[str, Any], raw)

    full[site_key] = video_map
    _write_video_map_atomic(full, p)
|
||||
|
||||
|
||||
def make_session():
|
||||
s = requests.Session()
|
||||
s.headers.update({"Referer": REFERER})
|
||||
return s
|
||||
def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
    """Pure function: return {cdn_video_url: site_referer} from a flat video map.

    The flat video map has page URLs as keys; the scheme+netloc of each page URL
    is used as the Referer for all CDN video URLs found in that entry. When the
    same video URL appears under several pages, the first page seen wins.

    Entries that are not dicts, ``videos`` values that are not lists/tuples, and
    non-string items inside ``videos`` are skipped.

    Args:
        video_map: Mapping of page URL to an entry dict with a ``"videos"`` list.

    Returns:
        Mapping of video URL to ``"<scheme>://<netloc>/"`` of its page.
    """
    result: dict[str, str] = {}
    for page_url, entry in video_map.items():
        if not isinstance(entry, dict):
            continue
        videos = entry.get("videos")
        if not isinstance(videos, (list, tuple)):
            continue
        parsed = urlparse(page_url)
        referer = f"{parsed.scheme}://{parsed.netloc}/"
        for vid in videos:
            if isinstance(vid, str):
                # setdefault keeps the referer of the first page that listed it.
                result.setdefault(vid, referer)
    return result
|
||||
|
||||
|
||||
def fmt_size(b):
|
||||
def make_session() -> requests.Session:
    """Create a new ``requests.Session``.

    No headers are added here; the session carries only the library defaults.

    Returns:
        A fresh, unconfigured session.
    """
    return requests.Session()
|
||||
|
||||
|
||||
def fmt_size(b: float | int) -> str:
|
||||
for unit in ("B", "KB", "MB", "GB"):
|
||||
if b < 1024:
|
||||
return f"{b:.1f} {unit}"
|
||||
@@ -48,30 +148,34 @@ def fmt_size(b):
|
||||
return f"{b:.1f} TB"
|
||||
|
||||
|
||||
def url_to_filename(url):
|
||||
def url_to_filename(url: str) -> str:
    """Return the percent-decoded last path segment of *url*.

    Query string and fragment are ignored because only the parsed path is used.

    Args:
        url: Source URL.

    Returns:
        The decoded final path component, or ``""`` when the path is empty or
        just ``/``.
    """
    # NOTE(review): decoding happens after the segment is isolated, so an
    # encoded "%2F" becomes a literal "/" inside the returned name.
    return unquote(PurePosixPath(urlparse(url).path).name)
|
||||
|
||||
|
||||
def find_clashes(urls):
|
||||
def find_clashes(urls: list[str]) -> dict[str, list[str]]:
    """Group URLs whose filenames collide, ignoring case.

    Args:
        urls: Source URLs to examine.

    Returns:
        ``{filename: [urls]}`` for every filename shared by more than one URL.
        The key is the filename of the first URL in each group, with its
        original casing.
    """
    # Case-insensitive grouping so that e.g. "DaisyArrest.mp4" and
    # "daisyarrest.mp4" are treated as a clash. This is required for
    # correctness on case-insensitive filesystems (NTFS, exFAT, macOS HFS+)
    # and harmless on case-sensitive ones (ext4) — the actual filenames on
    # disk keep their original casing; only the clash *detection* is folded.
    by_lower: defaultdict[str, list[str]] = defaultdict(list)
    for url in urls:
        by_lower[url_to_filename(url).lower()].append(url)
    return {
        url_to_filename(srcs[0]): srcs for srcs in by_lower.values() if len(srcs) > 1
    }
|
||||
|
||||
|
||||
def _clash_subfolder(url):
|
||||
def _clash_subfolder(url: str) -> str:
    """Parent path segment used as disambiguator for clashing filenames.

    Args:
        url: Source URL of a clashing file.

    Returns:
        The percent-decoded segment directly above the filename, or
        ``"unknown"`` when the URL has no such segment.
    """
    parts = urlparse(url).path.rstrip("/").split("/")
    # For a root-level file ("/clip.mp4") the split gives ["", "clip.mp4"], so
    # parts[-2] is an empty string; treat that as "no parent" too.
    if len(parts) >= 2 and parts[-2]:
        return unquote(parts[-2])
    return "unknown"
|
||||
|
||||
|
||||
def build_download_paths(urls, output_dir):
|
||||
def build_download_paths(
|
||||
urls: list[str],
|
||||
output_dir: str | Path,
|
||||
) -> dict[str, Path]:
|
||||
"""Map each URL to a local file path. Flat layout; clashing names get a subfolder."""
|
||||
clashes = find_clashes(urls)
|
||||
clash_lower = {name.lower() for name in clashes}
|
||||
@@ -86,16 +190,25 @@ def build_download_paths(urls, output_dir):
|
||||
return paths
|
||||
|
||||
|
||||
def get_remote_size(session, url):
|
||||
def get_remote_size(
|
||||
session: requests.Session,
|
||||
url: str,
|
||||
referer: str = "",
|
||||
) -> int | None:
|
||||
extra = {"Referer": referer} if referer else {}
|
||||
try:
|
||||
r = session.head(url, allow_redirects=True, timeout=15)
|
||||
r = session.head(url, headers=extra, allow_redirects=True, timeout=15)
|
||||
if r.status_code < 400 and "Content-Length" in r.headers:
|
||||
return int(r.headers["Content-Length"])
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
r = session.get(
|
||||
url, headers={"Range": "bytes=0-0"}, stream=True, timeout=15)
|
||||
url,
|
||||
headers={"Range": "bytes=0-0", **extra},
|
||||
stream=True,
|
||||
timeout=15,
|
||||
)
|
||||
r.close()
|
||||
cr = r.headers.get("Content-Range", "")
|
||||
if "/" in cr:
|
||||
@@ -105,19 +218,30 @@ def get_remote_size(session, url):
|
||||
return None
|
||||
|
||||
|
||||
def fetch_sizes(urls, workers=20, on_progress=None):
|
||||
def fetch_sizes(
    urls: list[str],
    workers: int = 20,
    on_progress: Callable[[int, int], None] | None = None,
    url_referers: dict[str, str] | None = None,
    session: requests.Session | None = None,
) -> dict[str, int | None]:
    """Return {url: size_or_None}. on_progress(done, total) called after each URL.

    Sizes are looked up concurrently with ``get_remote_size`` on a thread pool.

    Args:
        urls: URLs to probe.
        workers: Maximum number of worker threads.
        on_progress: Optional callback invoked as ``on_progress(done, total)``
            each time one lookup finishes.
        url_referers: Optional ``{url: referer}`` map; URLs without an entry
            are probed with an empty referer.
        session: Session to reuse. If None, one is created with
            ``make_session()`` and closed before returning.

    Returns:
        Mapping of each URL to the value returned by ``get_remote_size``.
    """
    # Only close a session this call created; a caller-supplied one is theirs.
    owns_session = session is None
    if session is None:
        session = make_session()
    referers = url_referers or {}
    sizes: dict[str, int | None] = {}
    total = len(urls)

    try:
        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = {
                pool.submit(get_remote_size, session, u, referers.get(u, "")): u
                for u in urls
            }
            done = 0
            for fut in as_completed(futures):
                sizes[futures[fut]] = fut.result()
                done += 1
                if on_progress is not None:
                    on_progress(done, total)
    finally:
        if owns_session:
            session.close()

    return sizes
|
||||
@@ -125,14 +249,20 @@ def fetch_sizes(urls, workers=20, on_progress=None):
|
||||
|
||||
# --------------- CLI ---------------
|
||||
|
||||
def main():
|
||||
|
||||
def main() -> None:
|
||||
vm = load_video_map()
|
||||
urls = [u for entry in vm.values() for u in entry.get("videos", []) if u.startswith("http")]
|
||||
urls = [
|
||||
u
|
||||
for entry in vm.values()
|
||||
for u in entry.get("videos", [])
|
||||
if u.startswith("http")
|
||||
]
|
||||
|
||||
clashes = find_clashes(urls)
|
||||
|
||||
print(f"Total URLs: {len(urls)}")
|
||||
by_name = defaultdict(list)
|
||||
by_name: defaultdict[str, list[str]] = defaultdict(list)
|
||||
for url in urls:
|
||||
by_name[url_to_filename(url)].append(url)
|
||||
print(f"Unique filenames: {len(by_name)}")
|
||||
@@ -142,8 +272,9 @@ def main():
|
||||
return
|
||||
|
||||
clash_urls = [u for srcs in clashes.values() for u in srcs]
|
||||
url_referers = build_url_referers(vm)
|
||||
print(f"\n[+] Fetching file sizes for {len(clash_urls)} clashing URLs…")
|
||||
sizes = fetch_sizes(clash_urls)
|
||||
sizes = fetch_sizes(clash_urls, url_referers=url_referers)
|
||||
|
||||
print(f"\n{len(clashes)} filename clash(es):\n")
|
||||
for name, srcs in sorted(clashes.items()):
|
||||
|
||||
Reference in New Issue
Block a user