mirror of
https://github.com/HugeFrog24/jailbirdz-dl.git
synced 2026-06-30 08:37:13 +00:00
Compare commits
15 Commits
main
..
b02b89c8ca
| Author | SHA1 | Date | |
|---|---|---|---|
| b02b89c8ca | |||
| 2c51f53955 | |||
| 13a4dbc5fc | |||
| 597667d458 | |||
| a1acfaa736 | |||
| dd530093f0 | |||
| 3cc6d9a3b6 | |||
| e6acc96588 | |||
| db685d3575 | |||
| 014df7d116 | |||
| 241cb051f6 | |||
| a51d2bd237 | |||
| c58814691e | |||
| 27f7beb8b7 | |||
| 4a5b0a6ee3 |
@@ -13,11 +13,6 @@ PINKCUFFS_USERNAME=your-email-or-username
|
|||||||
PINKCUFFS_PASSWORD=your-password
|
PINKCUFFS_PASSWORD=your-password
|
||||||
PINKCUFFS_LOGIN_COOKIE=wordpress_logged_in_<hash>=<value>
|
PINKCUFFS_LOGIN_COOKIE=wordpress_logged_in_<hash>=<value>
|
||||||
|
|
||||||
# femuniverse.com credentials (separate membership)
|
|
||||||
FEMUNIVERSE_USERNAME=your-email-or-username
|
|
||||||
FEMUNIVERSE_PASSWORD=your-password
|
|
||||||
FEMUNIVERSE_LOGIN_COOKIE=wordpress_logged_in_<hash>=<value>
|
|
||||||
|
|
||||||
# PeerTube upload target
|
# PeerTube upload target
|
||||||
PEERTUBE_URL=https://your-peertube-instance.example
|
PEERTUBE_URL=https://your-peertube-instance.example
|
||||||
PEERTUBE_USER=admin
|
PEERTUBE_USER=admin
|
||||||
|
|||||||
@@ -0,0 +1,14 @@
|
|||||||
|
---
|
||||||
|
alwaysApply: true
|
||||||
|
always_on: true
|
||||||
|
trigger: always_on
|
||||||
|
applyTo: "**"
|
||||||
|
description: Snyk Security At Inception
|
||||||
|
---
|
||||||
|
|
||||||
|
# Project security best practices
|
||||||
|
|
||||||
|
- Always run snyk_code_scan tool for new first party code that is generated in a Snyk-supported language.
|
||||||
|
- If any security issues are found based on newly introduced or modified code or dependencies, attempt to fix the issues using the results context from Snyk.
|
||||||
|
- Rescan the code after fixing the issues to ensure that the issues were fixed and that there are no newly introduced issues.
|
||||||
|
- Repeat this process until no new issues are found.
|
||||||
@@ -38,8 +38,6 @@ jobs:
|
|||||||
JAILBIRDZ_PASSWORD: ${{ secrets.JAILBIRDZ_PASSWORD }}
|
JAILBIRDZ_PASSWORD: ${{ secrets.JAILBIRDZ_PASSWORD }}
|
||||||
PINKCUFFS_USERNAME: ${{ secrets.PINKCUFFS_USERNAME }}
|
PINKCUFFS_USERNAME: ${{ secrets.PINKCUFFS_USERNAME }}
|
||||||
PINKCUFFS_PASSWORD: ${{ secrets.PINKCUFFS_PASSWORD }}
|
PINKCUFFS_PASSWORD: ${{ secrets.PINKCUFFS_PASSWORD }}
|
||||||
FEMUNIVERSE_USERNAME: ${{ secrets.FEMUNIVERSE_USERNAME }}
|
|
||||||
FEMUNIVERSE_PASSWORD: ${{ secrets.FEMUNIVERSE_PASSWORD }}
|
|
||||||
|
|
||||||
- name: Commit updated video_map.json
|
- name: Commit updated video_map.json
|
||||||
if: always() # save progress even if main.py crashed or timed out
|
if: always() # save progress even if main.py crashed or timed out
|
||||||
|
|||||||
@@ -1,6 +1,3 @@
|
|||||||
# Local Claude instructions
|
|
||||||
.claude/settings.local.json
|
|
||||||
|
|
||||||
# Temporary cache
|
# Temporary cache
|
||||||
__pycache__/
|
__pycache__/
|
||||||
.ruff_cache/
|
.ruff_cache/
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# 𝒥𝒶𝒾𝓁𝒷𝒾𝓇𝒹𝓏-𝒹𝓁
|
# 𝒥𝒶𝒾𝓁𝒷𝒾𝓇𝒹𝓏-𝒹𝓁
|
||||||
|
|
||||||
Jailbirdz.com, Pinkcuffs.com, and Femuniverse.com are Arizona-based subscription video sites publishing arrest and jail roleplay scenarios featuring women. This tool scrapes the member area of any combination of these sites, downloads the videos, and re-hosts them on a self-owned PeerTube instance.
|
Jailbirdz.com and Pinkcuffs.com are Arizona-based subscription video sites publishing arrest and jail roleplay scenarios featuring women. This tool scrapes the member area of one or both sites, downloads the videos, and re-hosts them on a self-owned PeerTube instance.
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> This tool does not bypass authentication, modify the site, or intercept anything it isn't entitled to. A valid, paid membership is required. The scraper authenticates using your own session cookie and accesses only content your account can already view in a browser.
|
> This tool does not bypass authentication, modify the site, or intercept anything it isn't entitled to. A valid, paid membership is required. The scraper authenticates using your own session cookie and accesses only content your account can already view in a browser.
|
||||||
@@ -23,9 +23,9 @@ cp .env.example .env
|
|||||||
|
|
||||||
Set credentials for whichever sites you have a membership on. You don't need all of them.
|
Set credentials for whichever sites you have a membership on. You don't need both.
|
||||||
|
|
||||||
**Option A — credentials (recommended):** set `JAILBIRDZ_USERNAME` + `JAILBIRDZ_PASSWORD` (and/or the `PINKCUFFS_*` / `FEMUNIVERSE_*` equivalents) in `.env`. `main.py` logs in automatically on startup.
|
**Option A — credentials (recommended):** set `JAILBIRDZ_USERNAME` + `JAILBIRDZ_PASSWORD` (and/or the `PINKCUFFS_*` equivalents) in `.env`. `main.py` logs in automatically on startup.
|
||||||
|
|
||||||
**Option B — manual cookie:** set `JAILBIRDZ_LOGIN_COOKIE` (and/or `PINKCUFFS_LOGIN_COOKIE` / `FEMUNIVERSE_LOGIN_COOKIE`) yourself. Get the value from browser DevTools → Storage → Cookies — copy the full `name=value` of the `wordpress_logged_in_*` cookie.
|
**Option B — manual cookie:** set `JAILBIRDZ_LOGIN_COOKIE` (and/or `PINKCUFFS_LOGIN_COOKIE`) yourself. Get the value from browser DevTools → Storage → Cookies — copy the full `name=value` of the `wordpress_logged_in_*` cookie.
|
||||||
|
|
||||||
Sites with no credentials are skipped automatically when running `python main.py`.
|
Sites with no credentials are skipped automatically when running `python main.py`.
|
||||||
|
|
||||||
@@ -35,8 +35,6 @@ Sites with no credentials are skipped automatically when running `python main.py
|
|||||||
- `JAILBIRDZ_LOGIN_COOKIE` — jailbirdz.com session cookie (fallback).
|
- `JAILBIRDZ_LOGIN_COOKIE` — jailbirdz.com session cookie (fallback).
|
||||||
- `PINKCUFFS_USERNAME` / `PINKCUFFS_PASSWORD` — pinkcuffs.com login.
|
- `PINKCUFFS_USERNAME` / `PINKCUFFS_PASSWORD` — pinkcuffs.com login.
|
||||||
- `PINKCUFFS_LOGIN_COOKIE` — pinkcuffs.com session cookie (fallback).
|
- `PINKCUFFS_LOGIN_COOKIE` — pinkcuffs.com session cookie (fallback).
|
||||||
- `FEMUNIVERSE_USERNAME` / `FEMUNIVERSE_PASSWORD` — femuniverse.com login.
|
|
||||||
- `FEMUNIVERSE_LOGIN_COOKIE` — femuniverse.com session cookie (fallback).
|
|
||||||
- `PEERTUBE_URL` — base URL of your PeerTube instance.
|
- `PEERTUBE_URL` — base URL of your PeerTube instance.
|
||||||
- `PEERTUBE_USER` — PeerTube username.
|
- `PEERTUBE_USER` — PeerTube username.
|
||||||
- `PEERTUBE_CHANNEL` — channel to upload to.
|
- `PEERTUBE_CHANNEL` — channel to upload to.
|
||||||
@@ -46,13 +44,12 @@ Sites with no credentials are skipped automatically when running `python main.py
|
|||||||
|
|
||||||
### 1. Scrape
|
### 1. Scrape
|
||||||
|
|
||||||
Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V, HLS/M3U8).
|
Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V).
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python main.py # scrape all sites you have credentials for
|
python main.py # scrape all sites you have credentials for
|
||||||
python main.py --site jailbirdz # scrape one site only
|
python main.py --site jailbirdz # scrape one site only
|
||||||
python main.py --site pinkcuffs --site jailbirdz # explicit multi-site
|
python main.py --site pinkcuffs --site jailbirdz # explicit multi-site
|
||||||
python main.py --site femuniverse # femuniverse only
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Results are written to `video_map.json`. Safe to re-run — already-scraped posts are skipped.
|
Results are written to `video_map.json`. Safe to re-run — already-scraped posts are skipped.
|
||||||
@@ -65,11 +62,11 @@ python download.py [options]
|
|||||||
Options:
|
Options:
|
||||||
-o, --output DIR Download directory (default: downloads)
|
-o, --output DIR Download directory (default: downloads)
|
||||||
-t, --titles Name files by post title
|
-t, --titles Name files by post title
|
||||||
--original Name files by original filename derived from the video URL (default)
|
--original Name files by original CloudFront filename (default)
|
||||||
--reorganize Rename existing files to match current naming mode
|
--reorganize Rename existing files to match current naming mode
|
||||||
-w, --workers N Concurrent downloads (default: 4)
|
-w, --workers N Concurrent downloads (default: 4)
|
||||||
-n, --dry-run Print what would be downloaded
|
-n, --dry-run Print what would be downloaded
|
||||||
--site SITE Limit to one site (jailbirdz, pinkcuffs, or femuniverse); repeatable
|
--site SITE Limit to one site (jailbirdz or pinkcuffs); repeatable
|
||||||
```
|
```
|
||||||
|
|
||||||
Resumes partial downloads. The chosen naming mode is saved to `.naming_mode` inside the output directory and persists across runs. Filenames that would clash are placed into subfolders.
|
Resumes partial downloads. The chosen naming mode is saved to `.naming_mode` inside the output directory and persists across runs. Filenames that would clash are placed into subfolders.
|
||||||
@@ -108,10 +105,6 @@ gh secret set JAILBIRDZ_PASSWORD
|
|||||||
# pinkcuffs (if you have a membership)
|
# pinkcuffs (if you have a membership)
|
||||||
gh secret set PINKCUFFS_USERNAME
|
gh secret set PINKCUFFS_USERNAME
|
||||||
gh secret set PINKCUFFS_PASSWORD
|
gh secret set PINKCUFFS_PASSWORD
|
||||||
|
|
||||||
# femuniverse (if you have a membership)
|
|
||||||
gh secret set FEMUNIVERSE_USERNAME
|
|
||||||
gh secret set FEMUNIVERSE_PASSWORD
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Seed CI with your current progress before the first run:**
|
**Seed CI with your current progress before the first run:**
|
||||||
|
|||||||
+19
-77
@@ -121,21 +121,17 @@ def save_video_map(
|
|||||||
|
|
||||||
|
|
||||||
def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
|
def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
|
||||||
"""Pure function: return {cdn_video_url: referer} from a flat video map.
|
"""Pure function: return {cdn_video_url: site_referer} from a flat video map.
|
||||||
|
|
||||||
Bunny.net CDN URLs require https://player.mediadelivery.net/ as referer.
|
The flat video map has page URLs as keys; the scheme+netloc of each page URL
|
||||||
All other URLs use the scheme+netloc of the page they were found on.
|
is used as the Referer for all CDN video URLs found in that entry.
|
||||||
"""
|
"""
|
||||||
result: dict[str, str] = {}
|
result: dict[str, str] = {}
|
||||||
for page_url, entry in video_map.items():
|
for page_url, entry in video_map.items():
|
||||||
parsed = urlparse(page_url)
|
parsed = urlparse(page_url)
|
||||||
site_referer = f"{parsed.scheme}://{parsed.netloc}/"
|
referer = f"{parsed.scheme}://{parsed.netloc}/"
|
||||||
for vid in cast(dict[str, Any], entry).get("videos", []):
|
for vid in cast(dict[str, Any], entry).get("videos", []):
|
||||||
vid_url = vid["url"]
|
result.setdefault(vid["url"], referer)
|
||||||
if urlparse(vid_url).netloc.endswith(".b-cdn.net"):
|
|
||||||
result.setdefault(vid_url, "https://player.mediadelivery.net/")
|
|
||||||
else:
|
|
||||||
result.setdefault(vid_url, site_referer)
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
@@ -151,17 +147,8 @@ def fmt_size(b: float | int) -> str:
|
|||||||
return f"{b:.1f} TB"
|
return f"{b:.1f} TB"
|
||||||
|
|
||||||
|
|
||||||
def is_hls_url(url: str) -> bool:
    """Return True if *url* points at an HLS playlist (``.m3u8``).

    Only the path component of the URL is inspected, so a query string or
    fragment (for example a signed-URL token) does not affect the result.
    The extension comparison is case-insensitive, so ``PLAYLIST.M3U8`` is
    recognised as well as ``playlist.m3u8``.
    """
    return urlparse(url).path.lower().endswith(".m3u8")
|
|
||||||
|
|
||||||
|
|
||||||
def url_to_filename(url: str) -> str:
|
def url_to_filename(url: str) -> str:
|
||||||
path = PurePosixPath(urlparse(url).path)
|
return unquote(PurePosixPath(urlparse(url).path).name)
|
||||||
# Bunny.net HLS: .../guid/playlist.m3u8 → guid.mp4
|
|
||||||
if path.name == "playlist.m3u8":
|
|
||||||
return unquote(path.parent.name) + ".mp4"
|
|
||||||
return unquote(path.name)
|
|
||||||
|
|
||||||
|
|
||||||
def find_clashes(urls: list[str]) -> dict[str, list[str]]:
|
def find_clashes(urls: list[str]) -> dict[str, list[str]]:
|
||||||
@@ -178,72 +165,27 @@ def find_clashes(urls: list[str]) -> dict[str, list[str]]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _path_folders(url: str) -> list[str]:
|
def _clash_subfolder(url: str) -> str:
|
||||||
"""Decoded URL path segments above the filename (filename excluded)."""
|
"""Parent path segment used as disambiguator for clashing filenames."""
|
||||||
parts = [unquote(p) for p in urlparse(url).path.split("/") if p]
|
parts = urlparse(url).path.rstrip("/").split("/")
|
||||||
return parts[:-1]
|
return unquote(parts[-2]) if len(parts) >= 2 else "unknown"
|
||||||
|
|
||||||
|
|
||||||
def _disambiguate_group(group: list[str]) -> dict[str, tuple[str, ...]]:
    """Choose the shortest tail of parent folders that tells clashing URLs apart.

    For depth 1, 2, ... the last ``depth`` folder segments of every URL (as
    returned by ``_path_folders``) are compared; the first depth at which all
    URLs in *group* have distinct tails wins.

    The uniqueness test lower-cases the segments so the chosen subfolders stay
    distinct on case-insensitive filesystems, but the returned segments keep
    their original case.

    Returns:
        ``{url: subfolder_segments}`` for every URL in *group*.

    Raises:
        RuntimeError: if no depth separates the URLs (they share the filename
            and the entire parent path).
    """
    parents = {url: _path_folders(url) for url in group}
    deepest = max((len(segments) for segments in parents.values()), default=0)

    depth = 1
    while depth <= deepest:
        tails = {url: tuple(parents[url][-depth:]) for url in group}
        # Case-folded view is used only to decide uniqueness.
        folded = {tuple(seg.lower() for seg in tail) for tail in tails.values()}
        if len(folded) == len(group):
            return tails
        depth += 1

    raise RuntimeError(
        f"Cannot disambiguate URL group sharing filename and full parent path: {group}"
    )
|
|
||||||
|
|
||||||
|
|
||||||
def build_download_paths(
|
def build_download_paths(
|
||||||
urls: list[str],
|
urls: list[str],
|
||||||
output_dir: str | Path,
|
output_dir: str | Path,
|
||||||
) -> dict[str, Path]:
|
) -> dict[str, Path]:
|
||||||
"""Map each URL to a unique local file path.
|
"""Map each URL to a local file path. Flat layout; clashing names get a subfolder."""
|
||||||
|
clashes = find_clashes(urls)
|
||||||
|
clash_lower = {name.lower() for name in clashes}
|
||||||
|
|
||||||
Unique filenames go directly under output_dir. Filenames that clash
|
paths = {}
|
||||||
(case-insensitively) get the smallest tail of their URL path prepended
|
|
||||||
that makes every URL in the clashing group unique — e.g. /2018/Daisy/foo.mp4
|
|
||||||
and /2023/Daisy/foo.mp4 land at 2018/Daisy/foo.mp4 and 2023/Daisy/foo.mp4
|
|
||||||
rather than colliding at Daisy/foo.mp4.
|
|
||||||
"""
|
|
||||||
by_lower: defaultdict[str, list[str]] = defaultdict(list)
|
|
||||||
for url in urls:
|
for url in urls:
|
||||||
by_lower[url_to_filename(url).lower()].append(url)
|
filename = url_to_filename(url)
|
||||||
|
if filename.lower() in clash_lower:
|
||||||
base = Path(output_dir)
|
paths[url] = Path(output_dir) / _clash_subfolder(url) / filename
|
||||||
paths: dict[str, Path] = {}
|
else:
|
||||||
|
paths[url] = Path(output_dir) / filename
|
||||||
for group in by_lower.values():
|
|
||||||
if len(group) == 1:
|
|
||||||
url = group[0]
|
|
||||||
paths[url] = base / url_to_filename(url)
|
|
||||||
continue
|
|
||||||
subfolders = _disambiguate_group(group)
|
|
||||||
for url in group:
|
|
||||||
paths[url] = base.joinpath(*subfolders[url]) / url_to_filename(url)
|
|
||||||
|
|
||||||
# Defensive: every URL must map to a distinct destination path.
|
|
||||||
# Case-fold the comparison since callers commonly run on NTFS/APFS where
|
|
||||||
# "Daisy/foo" and "daisy/foo" are the same file on disk.
|
|
||||||
seen: dict[str, str] = {}
|
|
||||||
for url, p in paths.items():
|
|
||||||
key = str(p).lower()
|
|
||||||
if key in seen:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Path collision after disambiguation: {url!r} and {seen[key]!r} "
|
|
||||||
f"both map to {p}"
|
|
||||||
)
|
|
||||||
seen[key] = url
|
|
||||||
|
|
||||||
return paths
|
return paths
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -15,9 +15,4 @@ SITES: Final[dict[str, dict[str, str]]] = {
|
|||||||
"cookie_domain": "pinkcuffs.com",
|
"cookie_domain": "pinkcuffs.com",
|
||||||
"env_prefix": "PINKCUFFS",
|
"env_prefix": "PINKCUFFS",
|
||||||
},
|
},
|
||||||
"femuniverse": {
|
|
||||||
"base_url": "https://www.femuniverse.com",
|
|
||||||
"cookie_domain": "femuniverse.com",
|
|
||||||
"env_prefix": "FEMUNIVERSE",
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|||||||
Vendored
+1157
-4158
File diff suppressed because one or more lines are too long
+7
-38
@@ -14,8 +14,6 @@ import argparse
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@@ -33,7 +31,6 @@ from check_clashes import (
|
|||||||
load_video_map,
|
load_video_map,
|
||||||
save_video_map,
|
save_video_map,
|
||||||
is_valid_url,
|
is_valid_url,
|
||||||
is_hls_url,
|
|
||||||
VIDEO_MAP_FILE,
|
VIDEO_MAP_FILE,
|
||||||
)
|
)
|
||||||
from config import SITES
|
from config import SITES
|
||||||
@@ -209,30 +206,6 @@ def reorganize(
|
|||||||
# ── Download ─────────────────────────────────────────────────────────
|
# ── Download ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def download_hls(url: str, dest: Path, referer: str = "") -> tuple[str, int]:
    """Download an HLS stream to *dest* by running yt-dlp as a subprocess.

    Args:
        url: Playlist URL handed to yt-dlp.
        dest: Target file; its parent directory is created if missing.
        referer: Value for yt-dlp's ``--referer`` option. When empty,
            ``https://player.mediadelivery.net/`` is used.

    Returns:
        ``(status, bytes_written)``. ``status`` is ``"ok"`` on success or a
        string starting with ``"error: "``. ``bytes_written`` is the size of
        *dest* after a fresh download, and ``0`` when *dest* already existed
        or the download failed.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    if dest.exists():
        # An existing file is treated as already downloaded.
        return "ok", 0
    cmd = [
        sys.executable, "-m", "yt_dlp",
        "--quiet", "--no-warnings",
        "--referer", referer or "https://player.mediadelivery.net/",
        "-o", str(dest),
        # End-of-options marker: the URL is external data, so make sure it can
        # never be parsed as a yt-dlp flag even if it begins with "-".
        "--",
        url,
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
        if proc.returncode != 0:
            # Report only the last line of yt-dlp's output to keep the status short.
            lines = (proc.stderr or proc.stdout).strip().splitlines()
            return f"error: {lines[-1] if lines else 'yt-dlp failed'}", 0
        if not dest.exists():
            return "error: output file missing after yt-dlp", 0
        return "ok", dest.stat().st_size
    except Exception as e:
        # Best-effort: callers expect a status tuple rather than an exception.
        return f"error: {e}", 0
|
|
||||||
|
|
||||||
|
|
||||||
def download_one(
|
def download_one(
|
||||||
session: requests.Session,
|
session: requests.Session,
|
||||||
url: str,
|
url: str,
|
||||||
@@ -411,7 +384,7 @@ def main() -> None:
|
|||||||
saved = read_mode(args.output)
|
saved = read_mode(args.output)
|
||||||
mode_changed = saved is not None and saved != mode
|
mode_changed = saved is not None and saved != mode
|
||||||
|
|
||||||
print(f"[+] {len(urls)} video URLs from {VIDEO_MAP_FILE}")
|
print(f"[+] {len(urls)} MP4 URLs from {VIDEO_MAP_FILE}")
|
||||||
print(f"[+] Naming mode: {mode}" + (" (changed!)" if mode_changed else ""))
|
print(f"[+] Naming mode: {mode}" + (" (changed!)" if mode_changed else ""))
|
||||||
|
|
||||||
# Handle reorganize
|
# Handle reorganize
|
||||||
@@ -472,7 +445,7 @@ def main() -> None:
|
|||||||
}
|
}
|
||||||
|
|
||||||
newly_fetched: dict[str, int | None] = {}
|
newly_fetched: dict[str, int | None] = {}
|
||||||
uncached_pending = [u for u in pending if u not in cached_sizes and not is_hls_url(u)]
|
uncached_pending = [u for u in pending if u not in cached_sizes]
|
||||||
session = make_session()
|
session = make_session()
|
||||||
if uncached_pending:
|
if uncached_pending:
|
||||||
print(
|
print(
|
||||||
@@ -489,19 +462,17 @@ def main() -> None:
|
|||||||
total_bytes = sum(sized.values())
|
total_bytes = sum(sized.values())
|
||||||
print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")
|
print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")
|
||||||
|
|
||||||
already_sizes: dict[str, int | None] = {}
|
if already:
|
||||||
already_to_verify = [u for u in already if not is_hls_url(u)]
|
uncached_already = [u for u in already if u not in cached_sizes]
|
||||||
if already_to_verify:
|
|
||||||
uncached_already = [u for u in already_to_verify if u not in cached_sizes]
|
|
||||||
if uncached_already:
|
if uncached_already:
|
||||||
print(
|
print(
|
||||||
f"[+] Verifying {len(already_to_verify)} existing files ({len(uncached_already)} uncached)…"
|
f"[+] Verifying {len(already)} existing files ({len(uncached_already)} uncached)…"
|
||||||
)
|
)
|
||||||
fetched_already = fetch_sizes(uncached_already, workers=20, url_referers=url_referers)
|
fetched_already = fetch_sizes(uncached_already, workers=20, url_referers=url_referers)
|
||||||
newly_fetched.update(fetched_already)
|
newly_fetched.update(fetched_already)
|
||||||
already_sizes = {**cached_sizes, **fetched_already}
|
already_sizes: dict[str, int | None] = {**cached_sizes, **fetched_already}
|
||||||
else:
|
else:
|
||||||
print(f"[+] Verifying {len(already_to_verify)} existing files (all sizes cached)…")
|
print(f"[+] Verifying {len(already)} existing files (all sizes cached)…")
|
||||||
already_sizes = dict(cached_sizes)
|
already_sizes = dict(cached_sizes)
|
||||||
|
|
||||||
mismatched = 0
|
mismatched = 0
|
||||||
@@ -534,8 +505,6 @@ def main() -> None:
|
|||||||
|
|
||||||
def do_download(url: str) -> tuple[str, tuple[str, int]]:
|
def do_download(url: str) -> tuple[str, tuple[str, int]]:
|
||||||
dest = paths[url]
|
dest = paths[url]
|
||||||
if is_hls_url(url):
|
|
||||||
return url, download_hls(url, dest, url_referers.get(url, ""))
|
|
||||||
expected = remote_sizes.get(url)
|
expected = remote_sizes.get(url)
|
||||||
return url, download_one(
|
return url, download_one(
|
||||||
session, url, dest, expected, url_referers.get(url, "")
|
session, url, dest, expected, url_referers.get(url, "")
|
||||||
|
|||||||
@@ -335,46 +335,6 @@ def extract_title_from_html(html: str) -> str | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _is_bunny_playlist(url: str) -> bool:
|
|
||||||
"""True if url is the root Bunny.net HLS playlist (not a sub-playlist)."""
|
|
||||||
parsed = urlparse(url)
|
|
||||||
return (
|
|
||||||
parsed.netloc.endswith(".b-cdn.net")
|
|
||||||
and parsed.path.endswith("/playlist.m3u8")
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _is_bunny_junk(url: str) -> bool:
|
|
||||||
"""True if url is a Bunny.net CDN init segment (not a usable video URL)."""
|
|
||||||
parsed = urlparse(url)
|
|
||||||
return parsed.netloc.endswith(".b-cdn.net") and PurePosixPath(
|
|
||||||
parsed.path
|
|
||||||
).name in {"init.mp4", "init.dmp4"}
|
|
||||||
|
|
||||||
|
|
||||||
def extract_bunny_embed_url(html: str) -> str | None:
|
|
||||||
"""Return a tokenless Bunny.net embed URL found in an iframe, or None."""
|
|
||||||
m = re.search(
|
|
||||||
r'<iframe[^>]+src="(https://player\.mediadelivery\.net/embed/[^"?]+)',
|
|
||||||
html,
|
|
||||||
)
|
|
||||||
return m.group(1) if m else None
|
|
||||||
|
|
||||||
|
|
||||||
def _clear_junk_video_entries(video_map: dict[str, Any], site_key: str) -> int:
    """Reset entries whose stored videos are all CDN init segments.

    An entry is reset when its ``"videos"`` list is non-empty and every item's
    ``"url"`` is classified as junk by ``_is_bunny_junk``. Resetting empties
    the ``"videos"`` list and drops the ``"scraped_at"`` key. The map is
    mutated in place and saved via ``save_video_map`` only if at least one
    entry was reset.

    Returns:
        The number of entries that were reset.
    """
    reset_count = 0
    for record in video_map.values():
        stored = record.get("videos", [])
        if not stored:
            continue
        # Keep the entry as soon as one real (non-junk) video is present.
        if any(not _is_bunny_junk(item["url"]) for item in stored):
            continue
        record["videos"] = []
        record.pop("scraped_at", None)
        reset_count += 1

    if reset_count > 0:
        save_video_map(video_map, site_key)
    return reset_count
|
|
||||||
|
|
||||||
|
|
||||||
MAX_RETRIES = 2
|
MAX_RETRIES = 2
|
||||||
|
|
||||||
|
|
||||||
@@ -401,9 +361,7 @@ async def worker(
|
|||||||
|
|
||||||
page.on(
|
page.on(
|
||||||
"response",
|
"response",
|
||||||
lambda resp: video_hits.add(resp.url)
|
lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
|
||||||
if (_is_video_url(resp.url) or _is_bunny_playlist(resp.url))
|
|
||||||
else None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -418,7 +376,7 @@ async def worker(
|
|||||||
print(f"[W{worker_id}] ({idx + 1}/{total}) {url}{label}")
|
print(f"[W{worker_id}] ({idx + 1}/{total}) {url}{label}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await page.goto(url, wait_until="load", timeout=60000)
|
await page.goto(url, wait_until="networkidle", timeout=60000)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[W{worker_id}] Navigation error: {e}")
|
print(f"[W{worker_id}] Navigation error: {e}")
|
||||||
if expects_video(url) and attempt < MAX_RETRIES:
|
if expects_video(url) and attempt < MAX_RETRIES:
|
||||||
@@ -493,29 +451,18 @@ async def worker(
|
|||||||
found = set(html_videos) | set(video_hits)
|
found = set(html_videos) | set(video_hits)
|
||||||
video_hits.clear()
|
video_hits.clear()
|
||||||
|
|
||||||
print(f"[W{worker_id}] network hits raw: {found or '(empty)'}")
|
|
||||||
|
|
||||||
all_videos = [
|
all_videos = [
|
||||||
m
|
m
|
||||||
for m in found
|
for m in found
|
||||||
if is_valid_url(m)
|
if is_valid_url(m)
|
||||||
and not _is_bunny_junk(m)
|
|
||||||
and m
|
and m
|
||||||
not in (
|
not in (
|
||||||
f"{base_url}/wp-content/plugins/easy-video-player/lib/blank.mp4",
|
f"{base_url}/wp-content/plugins/easy-video-player/lib/blank.mp4",
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
if not all_videos:
|
|
||||||
embed_url = extract_bunny_embed_url(html)
|
|
||||||
if embed_url:
|
|
||||||
print(
|
|
||||||
f"[W{worker_id}] No network hit — iframe fallback: {embed_url}"
|
|
||||||
)
|
|
||||||
all_videos = [embed_url]
|
|
||||||
|
|
||||||
async with map_lock:
|
async with map_lock:
|
||||||
new_found = set(all_videos) - known
|
new_found = found - known
|
||||||
if new_found:
|
if new_found:
|
||||||
print(f"[W{worker_id}] Found {len(new_found)} new video URLs")
|
print(f"[W{worker_id}] Found {len(new_found)} new video URLs")
|
||||||
known.update(new_found)
|
known.update(new_found)
|
||||||
@@ -572,12 +519,6 @@ async def run_for_site(
|
|||||||
urls = load_post_urls(site_key, base_url, wp_api, req_headers)
|
urls = load_post_urls(site_key, base_url, wp_api, req_headers)
|
||||||
|
|
||||||
video_map = load_video_map(site_key)
|
video_map = load_video_map(site_key)
|
||||||
junk_cleared = _clear_junk_video_entries(video_map, site_key)
|
|
||||||
if junk_cleared:
|
|
||||||
print(
|
|
||||||
f"[{site_key}] Cleared {junk_cleared} entries with junk CDN init segments — will re-scrape."
|
|
||||||
)
|
|
||||||
|
|
||||||
if any(
|
if any(
|
||||||
u not in video_map
|
u not in video_map
|
||||||
or not video_map[u].get("title")
|
or not video_map[u].get("title")
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
playwright==1.58.0
|
playwright==1.58.0
|
||||||
python-dotenv==1.2.1
|
python-dotenv==1.2.1
|
||||||
Requests==2.32.5
|
Requests==2.32.5
|
||||||
yt-dlp>=2026.3.17
|
|
||||||
pycryptodomex>=3.23.0
|
|
||||||
|
|||||||
@@ -49,7 +49,6 @@ DEFAULT_BATCH_SIZE = 1
|
|||||||
DEFAULT_POLL = 30
|
DEFAULT_POLL = 30
|
||||||
UPLOADED_FILE = ".uploaded"
|
UPLOADED_FILE = ".uploaded"
|
||||||
PT_NAME_MAX = 120
|
PT_NAME_MAX = 120
|
||||||
PT_DESC_MIN = 3 # PeerTube rejects descriptions shorter than this
|
|
||||||
|
|
||||||
|
|
||||||
# ── Text helpers ─────────────────────────────────────────────────────
|
# ── Text helpers ─────────────────────────────────────────────────────
|
||||||
@@ -63,11 +62,6 @@ def clean_description(raw: str) -> str:
|
|||||||
text = re.sub(r"<[^>]+>", "", text)
|
text = re.sub(r"<[^>]+>", "", text)
|
||||||
text = html.unescape(text)
|
text = html.unescape(text)
|
||||||
text = re.sub(r"\n{3,}", "\n\n", text).strip()
|
text = re.sub(r"\n{3,}", "\n\n", text).strip()
|
||||||
# PeerTube enforces a 3-char minimum on descriptions; a sub-minimum
|
|
||||||
# description (e.g. a stray ".") makes the upload-init 400. Drop it so
|
|
||||||
# it's omitted from the request rather than rejected.
|
|
||||||
if len(text) < PT_DESC_MIN:
|
|
||||||
return ""
|
|
||||||
return text[:10000]
|
return text[:10000]
|
||||||
|
|
||||||
|
|
||||||
@@ -131,12 +125,11 @@ def get_channel_id(base: str, token: str, channel_name: str) -> int:
|
|||||||
def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
|
def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
|
||||||
"""Paginate through the channel and return a Counter of video names."""
|
"""Paginate through the channel and return a Counter of video names."""
|
||||||
counts: Counter[str] = Counter()
|
counts: Counter[str] = Counter()
|
||||||
page_size = 25
|
|
||||||
start = 0
|
start = 0
|
||||||
while True:
|
while True:
|
||||||
r = requests.get(
|
r = requests.get(
|
||||||
f"{base}/api/v1/video-channels/{channel_name}/videos",
|
f"{base}/api/v1/video-channels/{channel_name}/videos",
|
||||||
params={"start": start, "count": page_size},
|
params={"start": start, "count": 100},
|
||||||
headers=api_headers(token),
|
headers=api_headers(token),
|
||||||
timeout=30,
|
timeout=30,
|
||||||
)
|
)
|
||||||
@@ -144,7 +137,7 @@ def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter
|
|||||||
data = r.json()
|
data = r.json()
|
||||||
for v in data.get("data", []):
|
for v in data.get("data", []):
|
||||||
counts[v["name"]] += 1
|
counts[v["name"]] += 1
|
||||||
start += page_size
|
start += 100
|
||||||
if start >= data.get("total", 0):
|
if start >= data.get("total", 0):
|
||||||
break
|
break
|
||||||
return counts
|
return counts
|
||||||
|
|||||||
+18
-3501
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user