mirror of
https://github.com/HugeFrog24/jailbirdz-dl.git
synced 2026-03-02 01:04:31 +00:00
Multi-gooner support because who knows
This commit is contained in:
19
.env.example
19
.env.example
@@ -1,8 +1,17 @@
|
|||||||
# Copy your wordpress_logged_in_... cookie from browser DevTools → Storage → Cookies.
|
# jailbirdz.com credentials
|
||||||
# Paste the full name=value pair below.
|
# These are used to obtain a fresh login cookie via the WooCommerce AJAX endpoint.
|
||||||
# wordpress_sec_... is the wp-admin cookie — irrelevant for read-only viewers.
|
JAILBIRDZ_USERNAME=your-email-or-username
|
||||||
# __cf_bm is a Cloudflare bot-management cookie — also not needed.
|
JAILBIRDZ_PASSWORD=your-password
|
||||||
WP_LOGIN_COOKIE=wordpress_logged_in_<hash>=<value>
|
|
||||||
|
# Alternatively, set the cookie manually (expires in ~2 weeks).
|
||||||
|
# Get it from browser DevTools → Storage → Cookies while on jailbirdz.com.
|
||||||
|
# Copy the full name=value of the wordpress_logged_in_* cookie.
|
||||||
|
JAILBIRDZ_LOGIN_COOKIE=wordpress_logged_in_<hash>=<value>
|
||||||
|
|
||||||
|
# pinkcuffs.com credentials (separate membership — different from jailbirdz)
|
||||||
|
PINKCUFFS_USERNAME=your-email-or-username
|
||||||
|
PINKCUFFS_PASSWORD=your-password
|
||||||
|
PINKCUFFS_LOGIN_COOKIE=wordpress_logged_in_<hash>=<value>
|
||||||
|
|
||||||
# PeerTube upload target
|
# PeerTube upload target
|
||||||
PEERTUBE_URL=https://your-peertube-instance.example
|
PEERTUBE_URL=https://your-peertube-instance.example
|
||||||
|
|||||||
49
.github/workflows/nightly-index.yml
vendored
Normal file
49
.github/workflows/nightly-index.yml
vendored
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
name: Nightly Index
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 3 * * *' # 03:00 UTC daily
|
||||||
|
workflow_dispatch: # manual trigger via GitHub UI
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: write # needed to push video_map.json back
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: nightly-index
|
||||||
|
cancel-in-progress: false # let an in-progress scrape finish; queue the next run
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
index:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
timeout-minutes: 300 # 5 h ceiling; scraper resumes where it left off on next run
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.12'
|
||||||
|
cache: pip
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: pip install -r requirements.txt
|
||||||
|
|
||||||
|
- name: Install Playwright Firefox
|
||||||
|
run: playwright install firefox --with-deps
|
||||||
|
|
||||||
|
- name: Run scraper
|
||||||
|
run: python main.py
|
||||||
|
env:
|
||||||
|
JAILBIRDZ_USERNAME: ${{ secrets.JAILBIRDZ_USERNAME }}
|
||||||
|
JAILBIRDZ_PASSWORD: ${{ secrets.JAILBIRDZ_PASSWORD }}
|
||||||
|
PINKCUFFS_USERNAME: ${{ secrets.PINKCUFFS_USERNAME }}
|
||||||
|
PINKCUFFS_PASSWORD: ${{ secrets.PINKCUFFS_PASSWORD }}
|
||||||
|
|
||||||
|
- name: Commit updated video_map.json
|
||||||
|
if: always() # save progress even if main.py crashed or timed out
|
||||||
|
run: |
|
||||||
|
git config user.name "github-actions[bot]"
|
||||||
|
git config user.email "github-actions[bot]@users.noreply.github.com"
|
||||||
|
git add video_map.json
|
||||||
|
git diff --staged --quiet || git commit -m "chore: nightly index update [skip ci]"
|
||||||
|
git push
|
||||||
9
.gitignore
vendored
9
.gitignore
vendored
@@ -1,5 +1,14 @@
|
|||||||
|
# Temporary cache
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# Local IDE config
|
||||||
|
.vscode
|
||||||
|
|
||||||
|
# Project output & artifacts
|
||||||
downloads/
|
downloads/
|
||||||
*.mp4
|
*.mp4
|
||||||
*.mp4.part
|
*.mp4.part
|
||||||
|
|
||||||
|
# Secrets & sensitive info
|
||||||
.env
|
.env
|
||||||
|
|||||||
4
.vscode/settings.json
vendored
4
.vscode/settings.json
vendored
@@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"snyk.advanced.organization": "512ef4a1-6034-4537-a391-9692d282122a",
|
|
||||||
"snyk.advanced.autoSelectOrganization": true
|
|
||||||
}
|
|
||||||
52
README.md
52
README.md
@@ -1,6 +1,6 @@
|
|||||||
# 𝒥𝒶𝒾𝓁𝒷𝒾𝓇𝒹𝓏-𝒹𝓁
|
# 𝒥𝒶𝒾𝓁𝒷𝒾𝓇𝒹𝓏-𝒹𝓁
|
||||||
|
|
||||||
Jailbirdz.com is an Arizona-based subscription video site publishing arrest and jail roleplay scenarios featuring women. This tool scrapes the member area, downloads the videos, and re-hosts them on a self-owned PeerTube instance.
|
Jailbirdz.com and Pinkcuffs.com are Arizona-based subscription video sites publishing arrest and jail roleplay scenarios featuring women. This tool scrapes the member area of one or both sites, downloads the videos, and re-hosts them on a self-owned PeerTube instance.
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> This tool does not bypass authentication, modify the site, or intercept anything it isn't entitled to. A valid, paid membership is required. The scraper authenticates using your own session cookie and accesses only content your account can already view in a browser.
|
> This tool does not bypass authentication, modify the site, or intercept anything it isn't entitled to. A valid, paid membership is required. The scraper authenticates using your own session cookie and accesses only content your account can already view in a browser.
|
||||||
@@ -19,23 +19,22 @@ Jailbirdz.com is an Arizona-based subscription video site publishing arrest and
|
|||||||
cp .env.example .env
|
cp .env.example .env
|
||||||
```
|
```
|
||||||
|
|
||||||
### WP_LOGIN_COOKIE
|
### Credentials
|
||||||
|
|
||||||
You need to be logged into jailbirdz.com in a browser. Then either:
|
Set credentials for whichever sites you have a membership on. You don't need both.
|
||||||
|
|
||||||
**Option A — auto (recommended):** let `grab_cookie.py` read it from your browser and write it to `.env` automatically:
|
**Option A — credentials (recommended):** set `JAILBIRDZ_USERNAME` + `JAILBIRDZ_PASSWORD` (and/or the `PINKCUFFS_*` equivalents) in `.env`. `main.py` logs in automatically on startup.
|
||||||
|
|
||||||
```bash
|
**Option B — manual cookie:** set `JAILBIRDZ_LOGIN_COOKIE` (and/or `PINKCUFFS_LOGIN_COOKIE`) yourself. Get the value from browser DevTools → Storage → Cookies — copy the full `name=value` of the `wordpress_logged_in_*` cookie.
|
||||||
python grab_cookie.py # tries Firefox, Chrome, Edge, Brave in order
|
|
||||||
python grab_cookie.py -b firefox # or target a specific browser
|
|
||||||
```
|
|
||||||
|
|
||||||
> **Note:** Chrome and Edge on Windows 130+ require the script to run as Administrator due to App-bound Encryption. Firefox works without elevated privileges.
|
Sites with no credentials are skipped automatically when running `python main.py`.
|
||||||
|
|
||||||
**Option B — manual:** open `.env` and set `WP_LOGIN_COOKIE` yourself. Get the value from browser DevTools → Storage → Cookies while on jailbirdz.com — copy the full `name=value` of the `wordpress_logged_in_*` cookie.
|
### `.env` values
|
||||||
|
|
||||||
### Other `.env` values
|
|
||||||
|
|
||||||
|
- `JAILBIRDZ_USERNAME` / `JAILBIRDZ_PASSWORD` — jailbirdz.com login.
|
||||||
|
- `JAILBIRDZ_LOGIN_COOKIE` — jailbirdz.com session cookie (fallback).
|
||||||
|
- `PINKCUFFS_USERNAME` / `PINKCUFFS_PASSWORD` — pinkcuffs.com login.
|
||||||
|
- `PINKCUFFS_LOGIN_COOKIE` — pinkcuffs.com session cookie (fallback).
|
||||||
- `PEERTUBE_URL` — base URL of your PeerTube instance.
|
- `PEERTUBE_URL` — base URL of your PeerTube instance.
|
||||||
- `PEERTUBE_USER` — PeerTube username.
|
- `PEERTUBE_USER` — PeerTube username.
|
||||||
- `PEERTUBE_CHANNEL` — channel to upload to.
|
- `PEERTUBE_CHANNEL` — channel to upload to.
|
||||||
@@ -48,7 +47,9 @@ python grab_cookie.py -b firefox # or target a specific browser
|
|||||||
Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V).
|
Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V).
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python main.py
|
python main.py # scrape all sites you have credentials for
|
||||||
|
python main.py --site jailbirdz # scrape one site only
|
||||||
|
python main.py --site pinkcuffs --site jailbirdz # explicit multi-site
|
||||||
```
|
```
|
||||||
|
|
||||||
Results are written to `video_map.json`. Safe to re-run — already-scraped posts are skipped.
|
Results are written to `video_map.json`. Safe to re-run — already-scraped posts are skipped.
|
||||||
@@ -65,6 +66,7 @@ Options:
|
|||||||
--reorganize Rename existing files to match current naming mode
|
--reorganize Rename existing files to match current naming mode
|
||||||
-w, --workers N Concurrent downloads (default: 4)
|
-w, --workers N Concurrent downloads (default: 4)
|
||||||
-n, --dry-run Print what would be downloaded
|
-n, --dry-run Print what would be downloaded
|
||||||
|
--site SITE Limit to one site (jailbirdz or pinkcuffs); repeatable
|
||||||
```
|
```
|
||||||
|
|
||||||
Resumes partial downloads. The chosen naming mode is saved to `.naming_mode` inside the output directory and persists across runs. Filenames that would clash are placed into subfolders.
|
Resumes partial downloads. The chosen naming mode is saved to `.naming_mode` inside the output directory and persists across runs. Filenames that would clash are placed into subfolders.
|
||||||
@@ -89,6 +91,30 @@ Options:
|
|||||||
|
|
||||||
Uploads in resumable 10 MB chunks. After each batch, waits for transcoding and object storage to complete before uploading the next batch — this prevents disk exhaustion on the PeerTube server. Videos already present on the channel (matched by name) are skipped. Progress is tracked in `.uploaded` inside the input directory.
|
Uploads in resumable 10 MB chunks. After each batch, waits for transcoding and object storage to complete before uploading the next batch — this prevents disk exhaustion on the PeerTube server. Videos already present on the channel (matched by name) are skipped. Progress is tracked in `.uploaded` inside the input directory.
|
||||||
|
|
||||||
|
## CI / Nightly Indexing
|
||||||
|
|
||||||
|
`.github/workflows/nightly-index.yml` runs `main.py` at 03:00 UTC daily and commits any new `video_map.json` entries back to the repo.
|
||||||
|
|
||||||
|
**One-time setup — add repo secrets for each site you have a membership on:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# jailbirdz (if you have a membership)
|
||||||
|
gh secret set JAILBIRDZ_USERNAME
|
||||||
|
gh secret set JAILBIRDZ_PASSWORD
|
||||||
|
|
||||||
|
# pinkcuffs (if you have a membership)
|
||||||
|
gh secret set PINKCUFFS_USERNAME
|
||||||
|
gh secret set PINKCUFFS_PASSWORD
|
||||||
|
```
|
||||||
|
|
||||||
|
**Seed CI with your current progress before the first run:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add video_map.json && git commit -m "chore: seed video_map"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Trigger manually:** Actions → Nightly Index → Run workflow.
|
||||||
|
|
||||||
## Utilities
|
## Utilities
|
||||||
|
|
||||||
### Check for filename clashes
|
### Check for filename clashes
|
||||||
|
|||||||
201
check_clashes.py
201
check_clashes.py
@@ -5,42 +5,142 @@ Importable functions:
|
|||||||
find_clashes(urls) - {filename: [urls]} for filenames with >1 source
|
find_clashes(urls) - {filename: [urls]} for filenames with >1 source
|
||||||
build_download_paths(urls, output_dir) - {url: local_path} with clash resolution
|
build_download_paths(urls, output_dir) - {url: local_path} with clash resolution
|
||||||
fmt_size(bytes) - human-readable size string
|
fmt_size(bytes) - human-readable size string
|
||||||
get_remote_size(session, url) - file size via HEAD without downloading
|
get_remote_size(session, url, referer) - file size via HEAD without downloading
|
||||||
fetch_sizes(urls, workers, on_progress) - bulk size lookup
|
fetch_sizes(urls, workers, on_progress, url_referers, session) - bulk size lookup
|
||||||
make_session() - requests.Session with required headers
|
make_session() - requests.Session with required headers
|
||||||
load_video_map() - load video_map.json, returns {} on missing/corrupt
|
load_video_map(site, path) - load video_map.json; auto-migrates old flat format
|
||||||
|
save_video_map(video_map, site_key, path) - atomic write of one site's entries
|
||||||
|
build_url_referers(video_map) - {cdn_url: referer} derived from page URL keys
|
||||||
|
is_valid_url(url) - True if url is a plain http(s) URL with no HTML artefacts
|
||||||
|
expects_video(url) - True if url is a members-only video page
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from pathlib import Path, PurePosixPath
|
from pathlib import Path, PurePosixPath
|
||||||
|
from typing import Any, Optional, cast
|
||||||
|
|
||||||
|
from collections.abc import Callable
|
||||||
from urllib.parse import urlparse, unquote
|
from urllib.parse import urlparse, unquote
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
import requests
|
import requests
|
||||||
from config import BASE_URL
|
|
||||||
|
|
||||||
REFERER = f"{BASE_URL}/"
|
VIDEO_MAP_FILE: str = "video_map.json"
|
||||||
VIDEO_MAP_FILE = "video_map.json"
|
VIDEO_EXTS: set[str] = {".mp4", ".mov", ".m4v", ".webm", ".avi"}
|
||||||
VIDEO_EXTS = {".mp4", ".mov", ".m4v", ".webm", ".avi"}
|
|
||||||
|
|
||||||
|
|
||||||
def load_video_map():
|
def is_valid_url(url: str) -> bool:
|
||||||
if Path(VIDEO_MAP_FILE).exists():
|
"""True if url is a plain http(s) URL with no HTML artefacts (<, >, href= etc.)."""
|
||||||
|
return (
|
||||||
|
url.startswith("http")
|
||||||
|
and "<" not in url
|
||||||
|
and ">" not in url
|
||||||
|
and " href=" not in url
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def expects_video(url: str) -> bool:
|
||||||
|
"""True if url is a members-only video page that should contain a video."""
|
||||||
|
return "/pinkcuffs-videos/" in url
|
||||||
|
|
||||||
|
|
||||||
|
def _write_video_map_atomic(data: dict[str, Any], path: Path) -> None:
|
||||||
|
"""Write the full nested video_map dict to disk atomically via a temp file."""
|
||||||
|
fd, tmp = tempfile.mkstemp(dir=path.resolve().parent, suffix=".tmp")
|
||||||
try:
|
try:
|
||||||
with open(VIDEO_MAP_FILE, encoding="utf-8") as f:
|
with os.fdopen(fd, "w", encoding="utf-8") as f:
|
||||||
return json.load(f)
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
||||||
|
Path(tmp).replace(path)
|
||||||
|
except Exception:
|
||||||
|
try:
|
||||||
|
Path(tmp).unlink()
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
def load_video_map(
|
||||||
|
site: str | None = None,
|
||||||
|
path: str | Path = VIDEO_MAP_FILE,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""Load video_map.json.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
site: If given, return only that site's inner dict {url: entry}.
|
||||||
|
If None, return a flat-merged dict across all sites.
|
||||||
|
path: Path to the JSON file (injectable for tests).
|
||||||
|
"""
|
||||||
|
p = Path(path)
|
||||||
|
if not p.exists():
|
||||||
|
return {}
|
||||||
|
try:
|
||||||
|
with open(p, encoding="utf-8") as f:
|
||||||
|
raw: Any = json.load(f)
|
||||||
|
data = cast(dict[str, Any], raw)
|
||||||
except (json.JSONDecodeError, OSError):
|
except (json.JSONDecodeError, OSError):
|
||||||
return {}
|
return {}
|
||||||
return {}
|
|
||||||
|
if site is not None:
|
||||||
|
return cast(dict[str, Any], data.get(site, {}))
|
||||||
|
|
||||||
|
# Merge all sites into a flat dict for backward-compat callers
|
||||||
|
merged: dict[str, Any] = {}
|
||||||
|
for site_entries in data.values():
|
||||||
|
if isinstance(site_entries, dict):
|
||||||
|
merged.update(cast(dict[str, Any], site_entries))
|
||||||
|
return merged
|
||||||
|
|
||||||
|
|
||||||
def make_session():
|
def save_video_map(
|
||||||
s = requests.Session()
|
video_map: dict[str, Any],
|
||||||
s.headers.update({"Referer": REFERER})
|
site_key: str,
|
||||||
return s
|
path: str | Path = VIDEO_MAP_FILE,
|
||||||
|
) -> None:
|
||||||
|
"""Atomically update one site's entries in the nested video_map.json.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
video_map: The inner {url: entry} dict for site_key.
|
||||||
|
site_key: Which top-level key to update (e.g. "jailbirdz").
|
||||||
|
path: Path to the JSON file (injectable for tests).
|
||||||
|
"""
|
||||||
|
p = Path(path)
|
||||||
|
if p.exists():
|
||||||
|
try:
|
||||||
|
with open(p, encoding="utf-8") as f:
|
||||||
|
raw: Any = json.load(f)
|
||||||
|
full = cast(dict[str, Any], raw)
|
||||||
|
except (json.JSONDecodeError, OSError):
|
||||||
|
full = {}
|
||||||
|
else:
|
||||||
|
full = {}
|
||||||
|
|
||||||
|
full[site_key] = video_map
|
||||||
|
_write_video_map_atomic(full, p)
|
||||||
|
|
||||||
|
|
||||||
def fmt_size(b):
|
def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
|
||||||
|
"""Pure function: return {cdn_video_url: site_referer} from a flat video map.
|
||||||
|
|
||||||
|
The flat video map has page URLs as keys; the scheme+netloc of each page URL
|
||||||
|
is used as the Referer for all CDN video URLs found in that entry.
|
||||||
|
"""
|
||||||
|
result: dict[str, str] = {}
|
||||||
|
for page_url, entry in video_map.items():
|
||||||
|
parsed = urlparse(page_url)
|
||||||
|
referer = f"{parsed.scheme}://{parsed.netloc}/"
|
||||||
|
for vid in cast(dict[str, Any], entry).get("videos", []):
|
||||||
|
if isinstance(vid, str):
|
||||||
|
result.setdefault(vid, referer)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def make_session() -> requests.Session:
|
||||||
|
return requests.Session()
|
||||||
|
|
||||||
|
|
||||||
|
def fmt_size(b: float | int) -> str:
|
||||||
for unit in ("B", "KB", "MB", "GB"):
|
for unit in ("B", "KB", "MB", "GB"):
|
||||||
if b < 1024:
|
if b < 1024:
|
||||||
return f"{b:.1f} {unit}"
|
return f"{b:.1f} {unit}"
|
||||||
@@ -48,30 +148,34 @@ def fmt_size(b):
|
|||||||
return f"{b:.1f} TB"
|
return f"{b:.1f} TB"
|
||||||
|
|
||||||
|
|
||||||
def url_to_filename(url):
|
def url_to_filename(url: str) -> str:
|
||||||
return unquote(PurePosixPath(urlparse(url).path).name)
|
return unquote(PurePosixPath(urlparse(url).path).name)
|
||||||
|
|
||||||
|
|
||||||
def find_clashes(urls):
|
def find_clashes(urls: list[str]) -> dict[str, list[str]]:
|
||||||
# Case-insensitive grouping so that e.g. "DaisyArrest.mp4" and
|
# Case-insensitive grouping so that e.g. "DaisyArrest.mp4" and
|
||||||
# "daisyarrest.mp4" are treated as a clash. This is required for
|
# "daisyarrest.mp4" are treated as a clash. This is required for
|
||||||
# correctness on case-insensitive filesystems (NTFS, exFAT, macOS HFS+)
|
# correctness on case-insensitive filesystems (NTFS, exFAT, macOS HFS+)
|
||||||
# and harmless on case-sensitive ones (ext4) — the actual filenames on
|
# and harmless on case-sensitive ones (ext4) — the actual filenames on
|
||||||
# disk keep their original casing; only the clash *detection* is folded.
|
# disk keep their original casing; only the clash *detection* is folded.
|
||||||
by_lower = defaultdict(list)
|
by_lower: defaultdict[str, list[str]] = defaultdict(list)
|
||||||
for url in urls:
|
for url in urls:
|
||||||
by_lower[url_to_filename(url).lower()].append(url)
|
by_lower[url_to_filename(url).lower()].append(url)
|
||||||
return {url_to_filename(srcs[0]): srcs
|
return {
|
||||||
for srcs in by_lower.values() if len(srcs) > 1}
|
url_to_filename(srcs[0]): srcs for srcs in by_lower.values() if len(srcs) > 1
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _clash_subfolder(url):
|
def _clash_subfolder(url: str) -> str:
|
||||||
"""Parent path segment used as disambiguator for clashing filenames."""
|
"""Parent path segment used as disambiguator for clashing filenames."""
|
||||||
parts = urlparse(url).path.rstrip("/").split("/")
|
parts = urlparse(url).path.rstrip("/").split("/")
|
||||||
return unquote(parts[-2]) if len(parts) >= 2 else "unknown"
|
return unquote(parts[-2]) if len(parts) >= 2 else "unknown"
|
||||||
|
|
||||||
|
|
||||||
def build_download_paths(urls, output_dir):
|
def build_download_paths(
|
||||||
|
urls: list[str],
|
||||||
|
output_dir: str | Path,
|
||||||
|
) -> dict[str, Path]:
|
||||||
"""Map each URL to a local file path. Flat layout; clashing names get a subfolder."""
|
"""Map each URL to a local file path. Flat layout; clashing names get a subfolder."""
|
||||||
clashes = find_clashes(urls)
|
clashes = find_clashes(urls)
|
||||||
clash_lower = {name.lower() for name in clashes}
|
clash_lower = {name.lower() for name in clashes}
|
||||||
@@ -86,16 +190,25 @@ def build_download_paths(urls, output_dir):
|
|||||||
return paths
|
return paths
|
||||||
|
|
||||||
|
|
||||||
def get_remote_size(session, url):
|
def get_remote_size(
|
||||||
|
session: requests.Session,
|
||||||
|
url: str,
|
||||||
|
referer: str = "",
|
||||||
|
) -> int | None:
|
||||||
|
extra = {"Referer": referer} if referer else {}
|
||||||
try:
|
try:
|
||||||
r = session.head(url, allow_redirects=True, timeout=15)
|
r = session.head(url, headers=extra, allow_redirects=True, timeout=15)
|
||||||
if r.status_code < 400 and "Content-Length" in r.headers:
|
if r.status_code < 400 and "Content-Length" in r.headers:
|
||||||
return int(r.headers["Content-Length"])
|
return int(r.headers["Content-Length"])
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
r = session.get(
|
r = session.get(
|
||||||
url, headers={"Range": "bytes=0-0"}, stream=True, timeout=15)
|
url,
|
||||||
|
headers={"Range": "bytes=0-0", **extra},
|
||||||
|
stream=True,
|
||||||
|
timeout=15,
|
||||||
|
)
|
||||||
r.close()
|
r.close()
|
||||||
cr = r.headers.get("Content-Range", "")
|
cr = r.headers.get("Content-Range", "")
|
||||||
if "/" in cr:
|
if "/" in cr:
|
||||||
@@ -105,19 +218,30 @@ def get_remote_size(session, url):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def fetch_sizes(urls, workers=20, on_progress=None):
|
def fetch_sizes(
|
||||||
|
urls: list[str],
|
||||||
|
workers: int = 20,
|
||||||
|
on_progress: Callable[[int, int], None] | None = None,
|
||||||
|
url_referers: dict[str, str] | None = None,
|
||||||
|
session: requests.Session | None = None,
|
||||||
|
) -> dict[str, int | None]:
|
||||||
"""Return {url: size_or_None}. on_progress(done, total) called after each URL."""
|
"""Return {url: size_or_None}. on_progress(done, total) called after each URL."""
|
||||||
|
if session is None:
|
||||||
session = make_session()
|
session = make_session()
|
||||||
sizes = {}
|
referers = url_referers or {}
|
||||||
|
sizes: dict[str, int | None] = {}
|
||||||
total = len(urls)
|
total = len(urls)
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=workers) as pool:
|
with ThreadPoolExecutor(max_workers=workers) as pool:
|
||||||
futures = {pool.submit(get_remote_size, session, u): u for u in urls}
|
futures = {
|
||||||
|
pool.submit(get_remote_size, session, u, referers.get(u, "")): u
|
||||||
|
for u in urls
|
||||||
|
}
|
||||||
done = 0
|
done = 0
|
||||||
for fut in as_completed(futures):
|
for fut in as_completed(futures):
|
||||||
sizes[futures[fut]] = fut.result()
|
sizes[futures[fut]] = fut.result()
|
||||||
done += 1
|
done += 1
|
||||||
if on_progress:
|
if on_progress is not None:
|
||||||
on_progress(done, total)
|
on_progress(done, total)
|
||||||
|
|
||||||
return sizes
|
return sizes
|
||||||
@@ -125,14 +249,20 @@ def fetch_sizes(urls, workers=20, on_progress=None):
|
|||||||
|
|
||||||
# --------------- CLI ---------------
|
# --------------- CLI ---------------
|
||||||
|
|
||||||
def main():
|
|
||||||
|
def main() -> None:
|
||||||
vm = load_video_map()
|
vm = load_video_map()
|
||||||
urls = [u for entry in vm.values() for u in entry.get("videos", []) if u.startswith("http")]
|
urls = [
|
||||||
|
u
|
||||||
|
for entry in vm.values()
|
||||||
|
for u in entry.get("videos", [])
|
||||||
|
if u.startswith("http")
|
||||||
|
]
|
||||||
|
|
||||||
clashes = find_clashes(urls)
|
clashes = find_clashes(urls)
|
||||||
|
|
||||||
print(f"Total URLs: {len(urls)}")
|
print(f"Total URLs: {len(urls)}")
|
||||||
by_name = defaultdict(list)
|
by_name: defaultdict[str, list[str]] = defaultdict(list)
|
||||||
for url in urls:
|
for url in urls:
|
||||||
by_name[url_to_filename(url)].append(url)
|
by_name[url_to_filename(url)].append(url)
|
||||||
print(f"Unique filenames: {len(by_name)}")
|
print(f"Unique filenames: {len(by_name)}")
|
||||||
@@ -142,8 +272,9 @@ def main():
|
|||||||
return
|
return
|
||||||
|
|
||||||
clash_urls = [u for srcs in clashes.values() for u in srcs]
|
clash_urls = [u for srcs in clashes.values() for u in srcs]
|
||||||
|
url_referers = build_url_referers(vm)
|
||||||
print(f"\n[+] Fetching file sizes for {len(clash_urls)} clashing URLs…")
|
print(f"\n[+] Fetching file sizes for {len(clash_urls)} clashing URLs…")
|
||||||
sizes = fetch_sizes(clash_urls)
|
sizes = fetch_sizes(clash_urls, url_referers=url_referers)
|
||||||
|
|
||||||
print(f"\n{len(clashes)} filename clash(es):\n")
|
print(f"\n{len(clashes)} filename clash(es):\n")
|
||||||
for name, srcs in sorted(clashes.items()):
|
for name, srcs in sorted(clashes.items()):
|
||||||
|
|||||||
17
config.py
17
config.py
@@ -1,2 +1,15 @@
|
|||||||
BASE_URL = "https://www.jailbirdz.com"
|
# config.py
|
||||||
COOKIE_DOMAIN = "jailbirdz.com" # rookiepy domain filter (no www)
|
from typing import Final
|
||||||
|
|
||||||
|
SITES: Final[dict[str, dict[str, str]]] = {
|
||||||
|
"jailbirdz": {
|
||||||
|
"base_url": "https://www.jailbirdz.com",
|
||||||
|
"cookie_domain": "jailbirdz.com",
|
||||||
|
"env_prefix": "JAILBIRDZ",
|
||||||
|
},
|
||||||
|
"pinkcuffs": {
|
||||||
|
"base_url": "https://www.pinkcuffs.com",
|
||||||
|
"cookie_domain": "pinkcuffs.com",
|
||||||
|
"env_prefix": "PINKCUFFS",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|||||||
235
download.py
235
download.py
@@ -11,12 +11,13 @@ Usage:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from typing import Any, Optional
|
||||||
|
import requests
|
||||||
|
|
||||||
from check_clashes import (
|
from check_clashes import (
|
||||||
make_session,
|
make_session,
|
||||||
@@ -24,33 +25,38 @@ from check_clashes import (
|
|||||||
url_to_filename,
|
url_to_filename,
|
||||||
find_clashes,
|
find_clashes,
|
||||||
build_download_paths,
|
build_download_paths,
|
||||||
|
build_url_referers,
|
||||||
fetch_sizes,
|
fetch_sizes,
|
||||||
|
load_video_map,
|
||||||
|
is_valid_url,
|
||||||
|
VIDEO_MAP_FILE,
|
||||||
)
|
)
|
||||||
|
from config import SITES
|
||||||
|
|
||||||
VIDEO_MAP_FILE = "video_map.json"
|
|
||||||
CHUNK_SIZE = 8 * 1024 * 1024
|
CHUNK_SIZE = 8 * 1024 * 1024
|
||||||
DEFAULT_OUTPUT = "downloads"
|
DEFAULT_OUTPUT: str = "downloads"
|
||||||
DEFAULT_WORKERS = 4
|
DEFAULT_WORKERS: int = 4
|
||||||
MODE_FILE = ".naming_mode"
|
MODE_FILE: str = ".naming_mode"
|
||||||
MODE_ORIGINAL = "original"
|
MODE_ORIGINAL: str = "original"
|
||||||
MODE_TITLE = "title"
|
MODE_TITLE: str = "title"
|
||||||
|
|
||||||
|
|
||||||
# ── Naming mode persistence ──────────────────────────────────────────
|
# ── Naming mode persistence ──────────────────────────────────────────
|
||||||
|
|
||||||
def read_mode(output_dir):
|
|
||||||
|
def read_mode(output_dir: str | Path) -> str | None:
|
||||||
p = Path(output_dir) / MODE_FILE
|
p = Path(output_dir) / MODE_FILE
|
||||||
if p.exists():
|
if p.exists():
|
||||||
return p.read_text().strip()
|
return p.read_text().strip()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def write_mode(output_dir, mode):
|
def write_mode(output_dir: str | Path, mode: str) -> None:
|
||||||
Path(output_dir).mkdir(parents=True, exist_ok=True)
|
Path(output_dir).mkdir(parents=True, exist_ok=True)
|
||||||
(Path(output_dir) / MODE_FILE).write_text(mode)
|
(Path(output_dir) / MODE_FILE).write_text(mode)
|
||||||
|
|
||||||
|
|
||||||
def resolve_mode(args):
|
def resolve_mode(args: argparse.Namespace) -> str:
|
||||||
"""Determine naming mode from CLI flags + saved marker. Returns mode string."""
|
"""Determine naming mode from CLI flags + saved marker. Returns mode string."""
|
||||||
saved = read_mode(args.output)
|
saved = read_mode(args.output)
|
||||||
|
|
||||||
@@ -69,13 +75,18 @@ def resolve_mode(args):
|
|||||||
|
|
||||||
# ── Filename helpers ─────────────────────────────────────────────────
|
# ── Filename helpers ─────────────────────────────────────────────────
|
||||||
|
|
||||||
def sanitize_filename(title, max_len=180):
|
|
||||||
name = re.sub(r'[<>:"/\\|?*]', '', title)
|
def sanitize_filename(title: str, max_len: int = 180) -> str:
|
||||||
name = re.sub(r'\s+', ' ', name).strip().rstrip('.')
|
name = re.sub(r'[<>:"/\\|?*]', "", title)
|
||||||
|
name = re.sub(r"\s+", " ", name).strip().rstrip(".")
|
||||||
return name[:max_len].rstrip() if len(name) > max_len else name
|
return name[:max_len].rstrip() if len(name) > max_len else name
|
||||||
|
|
||||||
|
|
||||||
def build_title_paths(urls, url_to_title, output_dir):
|
def build_title_paths(
|
||||||
|
urls: list[str],
|
||||||
|
url_to_title: dict[str, str],
|
||||||
|
output_dir: str | Path,
|
||||||
|
) -> dict[str, Path]:
|
||||||
name_to_urls = defaultdict(list)
|
name_to_urls = defaultdict(list)
|
||||||
url_to_base = {}
|
url_to_base = {}
|
||||||
|
|
||||||
@@ -91,14 +102,33 @@ def build_title_paths(urls, url_to_title, output_dir):
|
|||||||
base, ext = url_to_base[url]
|
base, ext = url_to_base[url]
|
||||||
full = base + ext
|
full = base + ext
|
||||||
if len(name_to_urls[full]) > 1:
|
if len(name_to_urls[full]) > 1:
|
||||||
slug = url_to_filename(url).rsplit('.', 1)[0]
|
slug = url_to_filename(url).rsplit(".", 1)[0]
|
||||||
paths[url] = Path(output_dir) / f"{base} [{slug}]{ext}"
|
paths[url] = Path(output_dir) / f"{base} [{slug}]{ext}"
|
||||||
else:
|
else:
|
||||||
paths[url] = Path(output_dir) / full
|
paths[url] = Path(output_dir) / full
|
||||||
return paths
|
return paths
|
||||||
|
|
||||||
|
|
||||||
def get_paths_for_mode(mode, urls, video_map, output_dir):
|
def get_paths_for_mode(
|
||||||
|
mode: str,
|
||||||
|
urls: list[str],
|
||||||
|
video_map: dict[str, Any],
|
||||||
|
output_dir: str | Path,
|
||||||
|
url_to_site: dict[str, str] | None = None,
|
||||||
|
) -> dict[str, Path]:
|
||||||
|
if url_to_site:
|
||||||
|
by_site: dict[str, list[str]] = defaultdict(list)
|
||||||
|
for u in urls:
|
||||||
|
by_site[url_to_site.get(u, "")].append(u)
|
||||||
|
paths: dict[str, Path] = {}
|
||||||
|
url_title = build_url_title_map(video_map) if mode == MODE_TITLE else {}
|
||||||
|
for site, site_urls in by_site.items():
|
||||||
|
base = Path(output_dir) / site if site else Path(output_dir)
|
||||||
|
if mode == MODE_TITLE:
|
||||||
|
paths.update(build_title_paths(site_urls, url_title, base))
|
||||||
|
else:
|
||||||
|
paths.update(build_download_paths(site_urls, base))
|
||||||
|
return paths
|
||||||
if mode == MODE_TITLE:
|
if mode == MODE_TITLE:
|
||||||
url_title = build_url_title_map(video_map)
|
url_title = build_url_title_map(video_map)
|
||||||
return build_title_paths(urls, url_title, output_dir)
|
return build_title_paths(urls, url_title, output_dir)
|
||||||
@@ -107,11 +137,21 @@ def get_paths_for_mode(mode, urls, video_map, output_dir):
|
|||||||
|
|
||||||
# ── Reorganize ───────────────────────────────────────────────────────
|
# ── Reorganize ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
def reorganize(urls, video_map, output_dir, target_mode, dry_run=False):
|
|
||||||
|
def reorganize(
|
||||||
|
urls: list[str],
|
||||||
|
video_map: dict[str, Any],
|
||||||
|
output_dir: str | Path,
|
||||||
|
target_mode: str,
|
||||||
|
dry_run: bool = False,
|
||||||
|
url_to_site: dict[str, str] | None = None,
|
||||||
|
) -> None:
|
||||||
"""Rename existing files from one naming scheme to another."""
|
"""Rename existing files from one naming scheme to another."""
|
||||||
other_mode = MODE_TITLE if target_mode == MODE_ORIGINAL else MODE_ORIGINAL
|
other_mode = MODE_TITLE if target_mode == MODE_ORIGINAL else MODE_ORIGINAL
|
||||||
old_paths = get_paths_for_mode(other_mode, urls, video_map, output_dir)
|
old_paths = get_paths_for_mode(other_mode, urls, video_map, output_dir, url_to_site)
|
||||||
new_paths = get_paths_for_mode(target_mode, urls, video_map, output_dir)
|
new_paths = get_paths_for_mode(
|
||||||
|
target_mode, urls, video_map, output_dir, url_to_site
|
||||||
|
)
|
||||||
|
|
||||||
moves = []
|
moves = []
|
||||||
for url in urls:
|
for url in urls:
|
||||||
@@ -163,21 +203,30 @@ def reorganize(urls, video_map, output_dir, target_mode, dry_run=False):
|
|||||||
|
|
||||||
# ── Download ─────────────────────────────────────────────────────────
|
# ── Download ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def download_one(session, url, dest, expected_size):
|
|
||||||
|
def download_one(
|
||||||
|
session: requests.Session,
|
||||||
|
url: str,
|
||||||
|
dest: str | Path,
|
||||||
|
expected_size: int | None,
|
||||||
|
referer: str = "",
|
||||||
|
) -> tuple[str, int]:
|
||||||
dest = Path(dest)
|
dest = Path(dest)
|
||||||
part = dest.parent / (dest.name + ".part")
|
part = dest.parent / (dest.name + ".part")
|
||||||
dest.parent.mkdir(parents=True, exist_ok=True)
|
dest.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
if dest.exists():
|
if dest.exists():
|
||||||
local = dest.stat().st_size
|
local = dest.stat().st_size
|
||||||
if expected_size and local == expected_size:
|
if expected_size is not None and local == expected_size:
|
||||||
return "ok", 0
|
return "ok", 0
|
||||||
if expected_size and local != expected_size:
|
if expected_size is not None and local != expected_size:
|
||||||
dest.unlink()
|
dest.unlink()
|
||||||
|
|
||||||
existing = part.stat().st_size if part.exists() else 0
|
existing = part.stat().st_size if part.exists() else 0
|
||||||
headers = {}
|
headers: dict[str, str] = {}
|
||||||
if existing and expected_size and existing < expected_size:
|
if referer:
|
||||||
|
headers["Referer"] = referer
|
||||||
|
if existing and expected_size is not None and existing < expected_size:
|
||||||
headers["Range"] = f"bytes={existing}-"
|
headers["Range"] = f"bytes={existing}-"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -205,33 +254,21 @@ def download_one(session, url, dest, expected_size):
|
|||||||
return f"error: {e}", written
|
return f"error: {e}", written
|
||||||
|
|
||||||
final_size = existing + written
|
final_size = existing + written
|
||||||
if expected_size and final_size != expected_size:
|
if expected_size is not None and final_size != expected_size:
|
||||||
return "size_mismatch", written
|
return "size_mismatch", written
|
||||||
|
|
||||||
part.rename(dest)
|
part.rename(dest)
|
||||||
return "ok", written
|
return "ok", written
|
||||||
|
|
||||||
|
|
||||||
# ── Data loading ─────────────────────────────────────────────────────
|
def collect_urls(video_map: dict[str, Any]) -> list[str]:
|
||||||
|
|
||||||
def load_video_map():
|
|
||||||
with open(VIDEO_MAP_FILE, encoding="utf-8") as f:
|
|
||||||
return json.load(f)
|
|
||||||
|
|
||||||
|
|
||||||
def _is_valid_url(url):
|
|
||||||
return url.startswith(
|
|
||||||
"http") and "<" not in url and ">" not in url and " href=" not in url
|
|
||||||
|
|
||||||
|
|
||||||
def collect_urls(video_map):
|
|
||||||
urls, seen, skipped = [], set(), 0
|
urls, seen, skipped = [], set(), 0
|
||||||
for entry in video_map.values():
|
for entry in video_map.values():
|
||||||
for video_url in entry.get("videos", []):
|
for video_url in entry.get("videos", []):
|
||||||
if video_url in seen:
|
if video_url in seen:
|
||||||
continue
|
continue
|
||||||
seen.add(video_url)
|
seen.add(video_url)
|
||||||
if _is_valid_url(video_url):
|
if is_valid_url(video_url):
|
||||||
urls.append(video_url)
|
urls.append(video_url)
|
||||||
else:
|
else:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
@@ -240,7 +277,7 @@ def collect_urls(video_map):
|
|||||||
return urls
|
return urls
|
||||||
|
|
||||||
|
|
||||||
def build_url_title_map(video_map):
|
def build_url_title_map(video_map: dict[str, Any]) -> dict[str, str]:
|
||||||
url_title = {}
|
url_title = {}
|
||||||
for entry in video_map.values():
|
for entry in video_map.values():
|
||||||
title = entry.get("title", "")
|
title = entry.get("title", "")
|
||||||
@@ -252,28 +289,68 @@ def build_url_title_map(video_map):
|
|||||||
|
|
||||||
# ── Main ─────────────────────────────────────────────────────────────
|
# ── Main ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser(
|
def main() -> None:
|
||||||
description="Download videos from video_map.json")
|
parser = argparse.ArgumentParser(description="Download videos from video_map.json")
|
||||||
parser.add_argument("--output", "-o", default=DEFAULT_OUTPUT,
|
parser.add_argument(
|
||||||
help=f"Download directory (default: {DEFAULT_OUTPUT})")
|
"--output",
|
||||||
|
"-o",
|
||||||
|
default=DEFAULT_OUTPUT,
|
||||||
|
help=f"Download directory (default: {DEFAULT_OUTPUT})",
|
||||||
|
)
|
||||||
|
|
||||||
naming = parser.add_mutually_exclusive_group()
|
naming = parser.add_mutually_exclusive_group()
|
||||||
naming.add_argument("--titles", "-t", action="store_true",
|
naming.add_argument(
|
||||||
help="Use title-based filenames (saved as default for this directory)")
|
"--titles",
|
||||||
naming.add_argument("--original", action="store_true",
|
"-t",
|
||||||
help="Use original CloudFront filenames (saved as default for this directory)")
|
action="store_true",
|
||||||
|
help="Use title-based filenames (saved as default for this directory)",
|
||||||
|
)
|
||||||
|
naming.add_argument(
|
||||||
|
"--original",
|
||||||
|
action="store_true",
|
||||||
|
help="Use original CloudFront filenames (saved as default for this directory)",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument("--reorganize", action="store_true",
|
parser.add_argument(
|
||||||
help="Rename existing files to match the current naming mode")
|
"--reorganize",
|
||||||
parser.add_argument("--dry-run", "-n", action="store_true",
|
action="store_true",
|
||||||
help="Preview without making changes")
|
help="Rename existing files to match the current naming mode",
|
||||||
parser.add_argument("--workers", "-w", type=int, default=DEFAULT_WORKERS,
|
)
|
||||||
help=f"Concurrent downloads (default: {DEFAULT_WORKERS})")
|
parser.add_argument(
|
||||||
|
"--dry-run", "-n", action="store_true", help="Preview without making changes"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--workers",
|
||||||
|
"-w",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_WORKERS,
|
||||||
|
help=f"Concurrent downloads (default: {DEFAULT_WORKERS})",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--site",
|
||||||
|
action="append",
|
||||||
|
choices=list(SITES.keys()),
|
||||||
|
dest="sites",
|
||||||
|
metavar="SITE",
|
||||||
|
help=f"Site(s) to download (default: all). Can be repeated. Choices: {', '.join(SITES)}",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
video_map = load_video_map()
|
video_map = load_video_map()
|
||||||
|
url_referers = build_url_referers(video_map)
|
||||||
urls = collect_urls(video_map)
|
urls = collect_urls(video_map)
|
||||||
|
|
||||||
|
url_to_site: dict[str, str] = {}
|
||||||
|
for site_key in SITES:
|
||||||
|
for entry in load_video_map(site_key).values():
|
||||||
|
for vid_url in entry.get("videos", []):
|
||||||
|
url_to_site[vid_url] = site_key
|
||||||
|
|
||||||
|
if args.sites:
|
||||||
|
selected = set(args.sites)
|
||||||
|
urls = [u for u in urls if url_to_site.get(u) in selected]
|
||||||
|
|
||||||
mode = resolve_mode(args)
|
mode = resolve_mode(args)
|
||||||
|
|
||||||
saved = read_mode(args.output)
|
saved = read_mode(args.output)
|
||||||
@@ -287,10 +364,18 @@ def main():
|
|||||||
if mode_changed and not args.reorganize:
|
if mode_changed and not args.reorganize:
|
||||||
print(f"\n[!] Mode changed from '{saved}' to '{mode}'.")
|
print(f"\n[!] Mode changed from '{saved}' to '{mode}'.")
|
||||||
print(
|
print(
|
||||||
" Use --reorganize to rename existing files, or --dry-run to preview.")
|
" Use --reorganize to rename existing files, or --dry-run to preview."
|
||||||
|
)
|
||||||
print(" Refusing to download until existing files are reorganized.")
|
print(" Refusing to download until existing files are reorganized.")
|
||||||
return
|
return
|
||||||
reorganize(urls, video_map, args.output, mode, dry_run=args.dry_run)
|
reorganize(
|
||||||
|
urls,
|
||||||
|
video_map,
|
||||||
|
args.output,
|
||||||
|
mode,
|
||||||
|
dry_run=args.dry_run,
|
||||||
|
url_to_site=url_to_site,
|
||||||
|
)
|
||||||
if args.dry_run or args.reorganize:
|
if args.dry_run or args.reorganize:
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -298,12 +383,13 @@ def main():
|
|||||||
if not args.dry_run:
|
if not args.dry_run:
|
||||||
write_mode(args.output, mode)
|
write_mode(args.output, mode)
|
||||||
|
|
||||||
paths = get_paths_for_mode(mode, urls, video_map, args.output)
|
paths = get_paths_for_mode(mode, urls, video_map, args.output, url_to_site)
|
||||||
|
|
||||||
clashes = find_clashes(urls)
|
clashes = find_clashes(urls)
|
||||||
if clashes:
|
if clashes:
|
||||||
print(
|
print(
|
||||||
f"[+] {len(clashes)} filename clash(es) resolved with subfolders/suffixes")
|
f"[+] {len(clashes)} filename clash(es) resolved with subfolders/suffixes"
|
||||||
|
)
|
||||||
|
|
||||||
already = [u for u in urls if paths[u].exists()]
|
already = [u for u in urls if paths[u].exists()]
|
||||||
pending = [u for u in urls if not paths[u].exists()]
|
pending = [u for u in urls if not paths[u].exists()]
|
||||||
@@ -316,8 +402,7 @@ def main():
|
|||||||
return
|
return
|
||||||
|
|
||||||
if args.dry_run:
|
if args.dry_run:
|
||||||
print(
|
print(f"\n[dry-run] Would download {len(pending)} files to {args.output}/")
|
||||||
f"\n[dry-run] Would download {len(pending)} files to {args.output}/")
|
|
||||||
for url in pending[:20]:
|
for url in pending[:20]:
|
||||||
print(f" → {paths[url].name}")
|
print(f" → {paths[url].name}")
|
||||||
if len(pending) > 20:
|
if len(pending) > 20:
|
||||||
@@ -326,16 +411,15 @@ def main():
|
|||||||
|
|
||||||
print("\n[+] Fetching remote file sizes…")
|
print("\n[+] Fetching remote file sizes…")
|
||||||
session = make_session()
|
session = make_session()
|
||||||
remote_sizes = fetch_sizes(pending, workers=20)
|
remote_sizes = fetch_sizes(pending, workers=20, url_referers=url_referers)
|
||||||
|
|
||||||
sized = {u: s for u, s in remote_sizes.items() if s is not None}
|
sized = {u: s for u, s in remote_sizes.items() if s is not None}
|
||||||
total_bytes = sum(sized.values())
|
total_bytes = sum(sized.values())
|
||||||
print(
|
print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")
|
||||||
f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")
|
|
||||||
|
|
||||||
if already:
|
if already:
|
||||||
print(f"[+] Verifying {len(already)} existing files…")
|
print(f"[+] Verifying {len(already)} existing files…")
|
||||||
already_sizes = fetch_sizes(already, workers=20)
|
already_sizes = fetch_sizes(already, workers=20, url_referers=url_referers)
|
||||||
|
|
||||||
mismatched = 0
|
mismatched = 0
|
||||||
for url in already:
|
for url in already:
|
||||||
@@ -344,14 +428,15 @@ def main():
|
|||||||
remote = already_sizes.get(url)
|
remote = already_sizes.get(url)
|
||||||
if remote and local != remote:
|
if remote and local != remote:
|
||||||
mismatched += 1
|
mismatched += 1
|
||||||
print(f"[!] Size mismatch: {dest.name} "
|
print(
|
||||||
f"(local {fmt_size(local)} vs remote {fmt_size(remote)})")
|
f"[!] Size mismatch: {dest.name} "
|
||||||
|
f"(local {fmt_size(local)} vs remote {fmt_size(remote)})"
|
||||||
|
)
|
||||||
pending.append(url)
|
pending.append(url)
|
||||||
remote_sizes[url] = remote
|
remote_sizes[url] = remote
|
||||||
|
|
||||||
if mismatched:
|
if mismatched:
|
||||||
print(
|
print(f"[!] {mismatched} file(s) will be re-downloaded due to size mismatch")
|
||||||
f"[!] {mismatched} file(s) will be re-downloaded due to size mismatch")
|
|
||||||
|
|
||||||
print(f"\n[⚡] Downloading with {args.workers} threads…\n")
|
print(f"\n[⚡] Downloading with {args.workers} threads…\n")
|
||||||
|
|
||||||
@@ -361,10 +446,12 @@ def main():
|
|||||||
total = len(pending)
|
total = len(pending)
|
||||||
interrupted = False
|
interrupted = False
|
||||||
|
|
||||||
def do_download(url):
|
def do_download(url: str) -> tuple[str, tuple[str, int]]:
|
||||||
dest = paths[url]
|
dest = paths[url]
|
||||||
expected = remote_sizes.get(url)
|
expected = remote_sizes.get(url)
|
||||||
return url, download_one(session, url, dest, expected)
|
return url, download_one(
|
||||||
|
session, url, dest, expected, url_referers.get(url, "")
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with ThreadPoolExecutor(max_workers=args.workers) as pool:
|
with ThreadPoolExecutor(max_workers=args.workers) as pool:
|
||||||
@@ -376,11 +463,9 @@ def main():
|
|||||||
name = paths[url].name
|
name = paths[url].name
|
||||||
|
|
||||||
if status == "ok" and written > 0:
|
if status == "ok" and written > 0:
|
||||||
print(
|
print(f" [{completed}/{total}] ✓ {name} ({fmt_size(written)})")
|
||||||
f" [{completed}/{total}] ✓ {name} ({fmt_size(written)})")
|
|
||||||
elif status == "ok":
|
elif status == "ok":
|
||||||
print(
|
print(f" [{completed}/{total}] ✓ {name} (already complete)")
|
||||||
f" [{completed}/{total}] ✓ {name} (already complete)")
|
|
||||||
elif status == "size_mismatch":
|
elif status == "size_mismatch":
|
||||||
print(f" [{completed}/{total}] ⚠ {name} (size mismatch)")
|
print(f" [{completed}/{total}] ⚠ {name} (size mismatch)")
|
||||||
failed.append(url)
|
failed.append(url)
|
||||||
|
|||||||
165
grab_cookie.py
165
grab_cookie.py
@@ -1,113 +1,130 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
grab_cookie.py — read the WordPress login cookie from an
|
grab_cookie.py — log in to a site and write the session cookie to .env.
|
||||||
installed browser and write it to .env as WP_LOGIN_COOKIE=name=value.
|
|
||||||
|
Requires {SITE}_USERNAME and {SITE}_PASSWORD to be set in the environment or .env.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python grab_cookie.py # tries Firefox, Chrome, Edge, Brave
|
python grab_cookie.py --site jailbirdz
|
||||||
python grab_cookie.py --browser firefox # explicit browser
|
python grab_cookie.py --site pinkcuffs
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from config import COOKIE_DOMAIN
|
from typing import Literal
|
||||||
|
import requests
|
||||||
|
from config import SITES
|
||||||
|
|
||||||
ENV_FILE = Path(".env")
|
ENV_FILE = Path(".env")
|
||||||
ENV_KEY = "WP_LOGIN_COOKIE"
|
|
||||||
COOKIE_PREFIX = "wordpress_logged_in_"
|
COOKIE_PREFIX = "wordpress_logged_in_"
|
||||||
|
|
||||||
BROWSER_NAMES = ["firefox", "chrome", "edge", "brave"]
|
|
||||||
|
|
||||||
|
def update_env(
|
||||||
|
name: str,
|
||||||
|
value: str,
|
||||||
|
env_key: str = "WP_LOGIN_COOKIE",
|
||||||
|
path: Path = ENV_FILE,
|
||||||
|
) -> Literal["updated", "appended", "created"]:
|
||||||
|
"""Write env_key=name=value into the env file, replacing any existing line."""
|
||||||
|
new_line = f"{env_key}={name}={value}\n"
|
||||||
|
|
||||||
def find_cookie(browser_name):
|
if path.exists():
|
||||||
"""Return (name, value) for the wordpress_logged_in_* cookie, or (None, None)."""
|
text = path.read_text(encoding="utf-8")
|
||||||
try:
|
|
||||||
import rookiepy
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError("rookiepy not installed — run: pip install rookiepy")
|
|
||||||
|
|
||||||
fn = getattr(rookiepy, browser_name, None)
|
|
||||||
if fn is None:
|
|
||||||
raise ValueError(f"rookiepy does not support '{browser_name}'.")
|
|
||||||
|
|
||||||
try:
|
|
||||||
cookies = fn([COOKIE_DOMAIN])
|
|
||||||
except PermissionError:
|
|
||||||
raise PermissionError(
|
|
||||||
f"Permission denied reading {browser_name} cookies.\n"
|
|
||||||
" Close the browser, or on Windows run as Administrator for Chrome/Edge."
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
raise RuntimeError(f"Could not read {browser_name} cookies: {e}")
|
|
||||||
|
|
||||||
for c in cookies:
|
|
||||||
if c.get("name", "").startswith(COOKIE_PREFIX):
|
|
||||||
return c["name"], c["value"]
|
|
||||||
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
|
|
||||||
def update_env(name, value):
|
|
||||||
"""Write WP_LOGIN_COOKIE=name=value into .env, replacing any existing line."""
|
|
||||||
new_line = f"{ENV_KEY}={name}={value}\n"
|
|
||||||
|
|
||||||
if ENV_FILE.exists():
|
|
||||||
text = ENV_FILE.read_text(encoding="utf-8")
|
|
||||||
lines = text.splitlines(keepends=True)
|
lines = text.splitlines(keepends=True)
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines):
|
||||||
if line.startswith(f"{ENV_KEY}=") or line.strip() == ENV_KEY:
|
key, sep, _ = line.partition("=")
|
||||||
|
if key.strip() == env_key and sep:
|
||||||
lines[i] = new_line
|
lines[i] = new_line
|
||||||
ENV_FILE.write_text("".join(lines), encoding="utf-8")
|
path.write_text("".join(lines), encoding="utf-8")
|
||||||
return "updated"
|
return "updated"
|
||||||
# Key not present — append
|
# Key not present — append
|
||||||
if text and not text.endswith("\n"):
|
if text and not text.endswith("\n"):
|
||||||
text += "\n"
|
text += "\n"
|
||||||
ENV_FILE.write_text(text + new_line, encoding="utf-8")
|
path.write_text(text + new_line, encoding="utf-8")
|
||||||
return "appended"
|
return "appended"
|
||||||
else:
|
else:
|
||||||
ENV_FILE.write_text(new_line, encoding="utf-8")
|
path.write_text(new_line, encoding="utf-8")
|
||||||
return "created"
|
return "created"
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def login_and_get_cookie(
    username: str, password: str, base_url: str
) -> tuple[str, str]:
    """Log in through the xootix AJAX form and return the WordPress login cookie.

    POSTs plain form fields to ``{base_url}/wp-admin/admin-ajax.php`` with the
    ``xoo_el_form_action`` action, then looks in the session's cookie jar for a
    cookie whose name starts with ``COOKIE_PREFIX``. No browser is needed.

    Args:
        username: Account e-mail or username, sent as ``xoo-el-username``.
        password: Account password, sent as ``xoo-el-password``.
        base_url: Site root used to build the endpoint, ``Referer`` and
            ``Origin`` values (it is concatenated with ``/wp-admin/...``, so it
            should not end with a slash).

    Returns:
        ``(cookie_name, cookie_value)`` of the first matching login cookie.

    Raises:
        RuntimeError: If the response body is not a JSON object, the server
            reports an error, or no login cookie was set.
        requests.RequestException: On transport failures or a non-2xx status
            (from ``raise_for_status``); these are passed through unchanged.
    """
    # The context manager closes the session's pooled connections on every
    # exit path, including the error raises below.
    with requests.Session() as session:
        r = session.post(
            f"{base_url}/wp-admin/admin-ajax.php",
            data={
                "xoo-el-username": username,
                "xoo-el-password": password,
                "xoo-el-rememberme": "forever",
                "_xoo_el_form": "login",
                "xoo_el_redirect": "/",
                "action": "xoo_el_form_action",
                "display": "popup",
            },
            headers={
                "Referer": f"{base_url}/",
                "Origin": base_url,
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0",
            },
            timeout=30,
        )
        r.raise_for_status()

        # A body that is not JSON (e.g. an HTML challenge or error page) makes
        # r.json() raise a ValueError subclass. Report it as a login failure so
        # callers that handle RuntimeError see a message, not a traceback.
        try:
            result = r.json()
        except ValueError as exc:
            raise RuntimeError(
                f"Login endpoint returned a non-JSON response (HTTP {r.status_code})."
            ) from exc

        # Valid JSON is not necessarily an object: a bare scalar such as 0
        # would make the .get() calls below raise AttributeError.
        if not isinstance(result, dict):
            raise RuntimeError(f"Unexpected login response from server: {result!r}")

        if result.get("error"):
            raise RuntimeError(f"Login rejected by server: {result.get('notice', result)}")

        for name, value in session.cookies.items():
            if name.startswith(COOKIE_PREFIX):
                return name, value

    raise RuntimeError(
        "Server accepted login but no wordpress_logged_in_* cookie was set.\n"
        " Check that username and password are correct."
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _auto_login() -> None:
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description=f"Copy the {COOKIE_DOMAIN} login cookie from your browser into .env."
|
description="Log in and save session cookie to .env"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--browser", "-b",
|
"--site",
|
||||||
choices=BROWSER_NAMES,
|
required=True,
|
||||||
metavar="BROWSER",
|
choices=list(SITES.keys()),
|
||||||
help=f"Browser to read from: {', '.join(BROWSER_NAMES)} (default: try all in order)",
|
help="Which site to authenticate with",
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
order = [args.browser] if args.browser else BROWSER_NAMES
|
site_cfg = SITES[args.site]
|
||||||
|
env_prefix = site_cfg["env_prefix"]
|
||||||
|
base_url = site_cfg["base_url"]
|
||||||
|
env_key = f"{env_prefix}_LOGIN_COOKIE"
|
||||||
|
|
||||||
cookie_name = cookie_value = None
|
username = os.environ.get(f"{env_prefix}_USERNAME", "").strip()
|
||||||
for browser in order:
|
password = os.environ.get(f"{env_prefix}_PASSWORD", "").strip()
|
||||||
print(f"[…] Trying {browser}…")
|
if not username or not password:
|
||||||
try:
|
|
||||||
cookie_name, cookie_value = find_cookie(browser)
|
|
||||||
except ImportError as e:
|
|
||||||
raise SystemExit(f"[!] {e}")
|
|
||||||
except (ValueError, PermissionError, RuntimeError) as e:
|
|
||||||
print(f"[!] {e}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
if cookie_name:
|
|
||||||
print(f"[+] Found in {browser}: {cookie_name}")
|
|
||||||
break
|
|
||||||
print(f" No {COOKIE_PREFIX}* cookie found in {browser}.")
|
|
||||||
|
|
||||||
if not cookie_name:
|
|
||||||
raise SystemExit(
|
raise SystemExit(
|
||||||
f"\n[!] No {COOKIE_PREFIX}* cookie found in any browser.\n"
|
f"[!] {env_prefix}_USERNAME and {env_prefix}_PASSWORD must be set "
|
||||||
f" Make sure you are logged into {COOKIE_DOMAIN}, then re-run.\n"
|
"in the environment or .env — see .env.example."
|
||||||
" Or set WP_LOGIN_COOKIE manually in .env — see .env.example."
|
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
|
cookie_name, cookie_value = login_and_get_cookie(username, password, base_url)
|
||||||
|
except RuntimeError as e:
|
||||||
|
raise SystemExit(f"[!] {e}")
|
||||||
|
print(f"[+] Login succeeded: {cookie_name}")
|
||||||
|
action = update_env(cookie_name, cookie_value, env_key=env_key)
|
||||||
|
print(f"[✓] {env_key} {action} in {ENV_FILE}.")
|
||||||
|
|
||||||
action = update_env(cookie_name, cookie_value)
|
|
||||||
print(f"[✓] {ENV_KEY} {action} in {ENV_FILE}.")
|
def main() -> None:
|
||||||
|
_auto_login()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
558
main.py
558
main.py
@@ -1,60 +1,131 @@
|
|||||||
|
import argparse
|
||||||
import re
|
import re
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import signal
|
import signal
|
||||||
import asyncio
|
import asyncio
|
||||||
import tempfile
|
|
||||||
import requests
|
import requests
|
||||||
from pathlib import Path, PurePosixPath
|
from pathlib import PurePosixPath
|
||||||
|
from typing import Any, Optional
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright, BrowserContext
|
||||||
from check_clashes import VIDEO_EXTS
|
from check_clashes import (
|
||||||
from config import BASE_URL
|
VIDEO_EXTS,
|
||||||
|
load_video_map,
|
||||||
|
save_video_map,
|
||||||
|
is_valid_url,
|
||||||
|
expects_video,
|
||||||
|
)
|
||||||
|
from config import SITES
|
||||||
|
from grab_cookie import login_and_get_cookie, update_env
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
def _is_video_url(url):
|
def _is_video_url(url: str) -> bool:
|
||||||
"""True if `url` ends with a recognised video extension (case-insensitive, path only)."""
|
"""True if `url` ends with a recognised video extension (case-insensitive, path only)."""
|
||||||
return PurePosixPath(urlparse(url).path).suffix.lower() in VIDEO_EXTS
|
return PurePosixPath(urlparse(url).path).suffix.lower() in VIDEO_EXTS
|
||||||
WP_API = f"{BASE_URL}/wp-json/wp/v2"
|
|
||||||
|
|
||||||
SKIP_TYPES = {
|
SKIP_TYPES = {
|
||||||
"attachment", "nav_menu_item", "wp_block", "wp_template",
|
"attachment",
|
||||||
"wp_template_part", "wp_global_styles", "wp_navigation",
|
"nav_menu_item",
|
||||||
"wp_font_family", "wp_font_face",
|
"wp_block",
|
||||||
|
"wp_template",
|
||||||
|
"wp_template_part",
|
||||||
|
"wp_global_styles",
|
||||||
|
"wp_navigation",
|
||||||
|
"wp_font_family",
|
||||||
|
"wp_font_face",
|
||||||
}
|
}
|
||||||
|
|
||||||
VIDEO_MAP_FILE = "video_map.json"
|
|
||||||
MAX_WORKERS = 4
|
MAX_WORKERS = 4
|
||||||
|
|
||||||
API_HEADERS = {
|
_USER_AGENT = (
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0",
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _api_headers(base_url: str, cookie_name: str, cookie_value: str) -> dict[str, str]:
|
||||||
|
return {
|
||||||
|
"User-Agent": _USER_AGENT,
|
||||||
"Accept": "application/json",
|
"Accept": "application/json",
|
||||||
"Referer": f"{BASE_URL}/",
|
"Referer": f"{base_url}/",
|
||||||
|
"Cookie": f"{cookie_name}={cookie_value}; eav-age-verified=1",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _get_login_cookie():
|
def _select_probe_url(video_map: dict[str, Any]) -> str | None:
|
||||||
raw = os.environ.get("WP_LOGIN_COOKIE", "").strip() # strip accidental whitespace
|
"""Pure function: return the first URL in video_map where expects_video() is True."""
|
||||||
if not raw:
|
return next((url for url in video_map if expects_video(url)), None)
|
||||||
raise RuntimeError(
|
|
||||||
"WP_LOGIN_COOKIE not set. Copy it from your browser into .env — see .env.example.")
|
|
||||||
|
def _probe_cookie(name: str, value: str, site_key: str) -> bool:
    """Check whether a stored login cookie is still accepted by the site.

    Loads the video map for ``site_key``, picks a probe URL with
    ``_select_probe_url`` and sends it a HEAD request carrying the cookie.
    Redirects are not followed, so only a direct HTTP 200 counts as valid.

    Args:
        name: Cookie name.
        value: Cookie value.
        site_key: Site identifier passed to ``load_video_map``.

    Returns:
        True if the probe answers with status 200. False if there is no URL to
        probe, the request fails, or any other status comes back.
    """
    video_map = load_video_map(site_key)
    probe_url = _select_probe_url(video_map)
    if probe_url is None:
        return False  # no video URLs yet — can't validate, fall through to re-auth

    try:
        r = requests.head(
            probe_url,
            headers={"Cookie": f"{name}={value}", "User-Agent": _USER_AGENT},
            allow_redirects=False,
            timeout=10,
        )
    except requests.RequestException:
        # A timeout or connection error means the cookie could not be
        # validated. Report that the same way as "nothing to probe" instead of
        # letting one flaky HEAD request abort the whole run.
        return False
    return r.status_code == 200
|
||||||
|
|
||||||
|
|
||||||
|
def _get_login_cookie(site_key: str, site_cfg: dict[str, str]) -> tuple[str, str]:
|
||||||
|
env_prefix = site_cfg["env_prefix"]
|
||||||
|
base_url = site_cfg["base_url"]
|
||||||
|
env_key = f"{env_prefix}_LOGIN_COOKIE"
|
||||||
|
|
||||||
|
username = os.environ.get(f"{env_prefix}_USERNAME", "").strip()
|
||||||
|
password = os.environ.get(f"{env_prefix}_PASSWORD", "").strip()
|
||||||
|
has_credentials = bool(username and password)
|
||||||
|
|
||||||
|
raw = os.environ.get(env_key, "").strip()
|
||||||
|
if raw:
|
||||||
name, _, value = raw.partition("=")
|
name, _, value = raw.partition("=")
|
||||||
if not value:
|
if value and name.startswith("wordpress_logged_in_"):
|
||||||
raise RuntimeError(
|
if not has_credentials:
|
||||||
"WP_LOGIN_COOKIE looks malformed (no '=' found). Expected: name=value")
|
return name, value # cookie-only mode — trust it
|
||||||
if not name.startswith("wordpress_logged_in_"):
|
print(f"[{site_key}] Cookie found — validating…")
|
||||||
raise RuntimeError(
|
if _probe_cookie(name, value, site_key):
|
||||||
"WP_LOGIN_COOKIE doesn't look right — expected a wordpress_logged_in_... cookie.")
|
print(f"[{site_key}] Cookie still valid — skipping login.")
|
||||||
return name, value
|
return name, value
|
||||||
|
print(f"[{site_key}] Cookie expired — re-authenticating…")
|
||||||
|
|
||||||
|
if has_credentials:
|
||||||
|
cookie_name, cookie_value = login_and_get_cookie(username, password, base_url)
|
||||||
|
action = update_env(cookie_name, cookie_value, env_key=env_key)
|
||||||
|
print(f"[{site_key}] Logged in: {cookie_name} ({action} in .env)")
|
||||||
|
return cookie_name, cookie_value
|
||||||
|
|
||||||
|
raise RuntimeError(
|
||||||
|
f"No credentials or cookie found for {site_key}. Set either:\n"
|
||||||
|
f" • {env_prefix}_USERNAME + {env_prefix}_PASSWORD (recommended)\n"
|
||||||
|
f" • {env_prefix}_LOGIN_COOKIE (fallback — may expire)\n"
|
||||||
|
"See .env.example."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def discover_content_types(session):
|
def _has_credentials(site_cfg: dict[str, str]) -> bool:
|
||||||
"""Query /wp-json/wp/v2/types and return a list of (name, rest_base, type_slug) for content types worth scraping."""
|
env_prefix = site_cfg["env_prefix"]
|
||||||
r = session.get(f"{WP_API}/types", timeout=30)
|
has_cookie = bool(os.environ.get(f"{env_prefix}_LOGIN_COOKIE", "").strip())
|
||||||
|
has_creds = bool(
|
||||||
|
os.environ.get(f"{env_prefix}_USERNAME", "").strip()
|
||||||
|
and os.environ.get(f"{env_prefix}_PASSWORD", "").strip()
|
||||||
|
)
|
||||||
|
return has_cookie or has_creds
|
||||||
|
|
||||||
|
|
||||||
|
def discover_content_types(
|
||||||
|
session: requests.Session, wp_api: str
|
||||||
|
) -> list[tuple[str, str, str]]:
|
||||||
|
"""Query /wp-json/wp/v2/types and return a list of (name, rest_base, type_slug)."""
|
||||||
|
r = session.get(f"{wp_api}/types", timeout=30)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
types = r.json()
|
types = r.json()
|
||||||
|
|
||||||
@@ -69,16 +140,22 @@ def discover_content_types(session):
|
|||||||
return targets
|
return targets
|
||||||
|
|
||||||
|
|
||||||
def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
|
def fetch_all_posts_for_type(
|
||||||
"""Paginate one content type and return (url, title, description) tuples.
|
session: requests.Session,
|
||||||
Uses the `link` field when available; falls back to building from slug."""
|
wp_api: str,
|
||||||
|
base_url: str,
|
||||||
|
type_name: str,
|
||||||
|
rest_base: str,
|
||||||
|
type_slug: str,
|
||||||
|
) -> list[tuple[str, str, str]]:
|
||||||
|
"""Paginate one content type and return (url, title, description) tuples."""
|
||||||
url_prefix = type_slug.replace("_", "-")
|
url_prefix = type_slug.replace("_", "-")
|
||||||
results = []
|
results = []
|
||||||
page = 1
|
page = 1
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
r = session.get(
|
r = session.get(
|
||||||
f"{WP_API}/{rest_base}",
|
f"{wp_api}/{rest_base}",
|
||||||
params={"per_page": 100, "page": page},
|
params={"per_page": 100, "page": page},
|
||||||
timeout=30,
|
timeout=30,
|
||||||
)
|
)
|
||||||
@@ -92,15 +169,19 @@ def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
|
|||||||
if not link.startswith("http"):
|
if not link.startswith("http"):
|
||||||
slug = post.get("slug")
|
slug = post.get("slug")
|
||||||
if slug:
|
if slug:
|
||||||
link = f"{BASE_URL}/{url_prefix}/{slug}/"
|
link = f"{base_url}/{url_prefix}/{slug}/"
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
title_obj = post.get("title", {})
|
title_obj = post.get("title", {})
|
||||||
title = title_obj.get("rendered", "") if isinstance(
|
title = (
|
||||||
title_obj, dict) else str(title_obj)
|
title_obj.get("rendered", "")
|
||||||
|
if isinstance(title_obj, dict)
|
||||||
|
else str(title_obj)
|
||||||
|
)
|
||||||
content_obj = post.get("content", {})
|
content_obj = post.get("content", {})
|
||||||
content_html = content_obj.get(
|
content_html = (
|
||||||
"rendered", "") if isinstance(content_obj, dict) else ""
|
content_obj.get("rendered", "") if isinstance(content_obj, dict) else ""
|
||||||
|
)
|
||||||
description = html_to_text(content_html) if content_html else ""
|
description = html_to_text(content_html) if content_html else ""
|
||||||
results.append((link, title, description))
|
results.append((link, title, description))
|
||||||
print(f" {type_name} page {page}: {len(data)} items")
|
print(f" {type_name} page {page}: {len(data)} items")
|
||||||
@@ -109,66 +190,88 @@ def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def fetch_post_urls_from_api(headers):
|
def fetch_post_urls_from_api(
|
||||||
|
site_key: str,
|
||||||
|
base_url: str,
|
||||||
|
wp_api: str,
|
||||||
|
headers: dict[str, str],
|
||||||
|
) -> list[str]:
|
||||||
"""Auto-discover all content types via the WP REST API and collect every post URL.
|
"""Auto-discover all content types via the WP REST API and collect every post URL.
|
||||||
Also builds video_map.json with titles pre-populated."""
|
Also pre-populates video_map.json with titles."""
|
||||||
print("[+] video_map.json empty or missing — discovering content types from REST API…")
|
print(f"[{site_key}] video_map empty — discovering content types from REST API…")
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
session.headers.update(headers)
|
session.headers.update(headers)
|
||||||
|
|
||||||
targets = discover_content_types(session)
|
targets = discover_content_types(session, wp_api)
|
||||||
print(
|
print(
|
||||||
f"[+] Found {len(targets)} content types: {', '.join(name for name, _, _ in targets)}\n")
|
f"[{site_key}] Found {len(targets)} content types: "
|
||||||
|
f"{', '.join(name for name, _, _ in targets)}\n"
|
||||||
|
)
|
||||||
|
|
||||||
all_results = []
|
all_results = []
|
||||||
for type_name, rest_base, type_slug in targets:
|
for type_name, rest_base, type_slug in targets:
|
||||||
type_results = fetch_all_posts_for_type(
|
type_results = fetch_all_posts_for_type(
|
||||||
session, type_name, rest_base, type_slug)
|
session, wp_api, base_url, type_name, rest_base, type_slug
|
||||||
|
)
|
||||||
all_results.extend(type_results)
|
all_results.extend(type_results)
|
||||||
|
|
||||||
seen = set()
|
seen: set[str] = set()
|
||||||
deduped_urls = []
|
deduped_urls = []
|
||||||
video_map = load_video_map()
|
video_map = load_video_map(site_key)
|
||||||
|
|
||||||
for url, title, description in all_results:
|
for url, title, description in all_results:
|
||||||
if url not in seen and url.startswith("http"):
|
if url not in seen and url.startswith("http"):
|
||||||
seen.add(url)
|
seen.add(url)
|
||||||
deduped_urls.append(url)
|
deduped_urls.append(url)
|
||||||
if url not in video_map:
|
if url not in video_map:
|
||||||
video_map[url] = {"title": title,
|
video_map[url] = {
|
||||||
"description": description, "videos": []}
|
"title": title,
|
||||||
|
"description": description,
|
||||||
|
"videos": [],
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
if not video_map[url].get("title"):
|
if not video_map[url].get("title"):
|
||||||
video_map[url]["title"] = title
|
video_map[url]["title"] = title
|
||||||
if not video_map[url].get("description"):
|
if not video_map[url].get("description"):
|
||||||
video_map[url]["description"] = description
|
video_map[url]["description"] = description
|
||||||
|
|
||||||
save_video_map(video_map)
|
save_video_map(video_map, site_key)
|
||||||
print(
|
print(
|
||||||
f"\n[+] Discovered {len(deduped_urls)} unique URLs → saved to {VIDEO_MAP_FILE}")
|
f"\n[{site_key}] Discovered {len(deduped_urls)} unique URLs → saved to video_map.json"
|
||||||
print(
|
)
|
||||||
f"[+] Pre-populated {len(video_map)} entries in {VIDEO_MAP_FILE}")
|
print(f"[{site_key}] Pre-populated {len(video_map)} entries")
|
||||||
return deduped_urls
|
return deduped_urls
|
||||||
|
|
||||||
|
|
||||||
def fetch_metadata_from_api(video_map, urls, headers):
|
def fetch_metadata_from_api(
|
||||||
|
site_key: str,
|
||||||
|
base_url: str,
|
||||||
|
wp_api: str,
|
||||||
|
video_map: dict[str, Any],
|
||||||
|
urls: list[str],
|
||||||
|
headers: dict[str, str],
|
||||||
|
) -> None:
|
||||||
"""Populate missing titles and descriptions in video_map from the REST API."""
|
"""Populate missing titles and descriptions in video_map from the REST API."""
|
||||||
missing = [u for u in urls
|
missing = [
|
||||||
|
u
|
||||||
|
for u in urls
|
||||||
if u not in video_map
|
if u not in video_map
|
||||||
or not video_map[u].get("title")
|
or not video_map[u].get("title")
|
||||||
or not video_map[u].get("description")]
|
or not video_map[u].get("description")
|
||||||
|
]
|
||||||
if not missing:
|
if not missing:
|
||||||
return
|
return
|
||||||
|
|
||||||
print(f"[+] Fetching metadata from REST API for {len(missing)} posts…")
|
print(f"[{site_key}] Fetching metadata from REST API for {len(missing)} posts…")
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
session.headers.update(headers)
|
session.headers.update(headers)
|
||||||
|
|
||||||
targets = discover_content_types(session)
|
targets = discover_content_types(session, wp_api)
|
||||||
|
|
||||||
for type_name, rest_base, type_slug in targets:
|
for type_name, rest_base, type_slug in targets:
|
||||||
type_results = fetch_all_posts_for_type(
|
type_results = fetch_all_posts_for_type(
|
||||||
session, type_name, rest_base, type_slug)
|
session, wp_api, base_url, type_name, rest_base, type_slug
|
||||||
|
)
|
||||||
for url, title, description in type_results:
|
for url, title, description in type_results:
|
||||||
if url in video_map:
|
if url in video_map:
|
||||||
if not video_map[url].get("title"):
|
if not video_map[url].get("title"):
|
||||||
@@ -176,93 +279,90 @@ def fetch_metadata_from_api(video_map, urls, headers):
|
|||||||
if not video_map[url].get("description"):
|
if not video_map[url].get("description"):
|
||||||
video_map[url]["description"] = description
|
video_map[url]["description"] = description
|
||||||
else:
|
else:
|
||||||
video_map[url] = {"title": title,
|
video_map[url] = {
|
||||||
"description": description, "videos": []}
|
"title": title,
|
||||||
|
"description": description,
|
||||||
|
"videos": [],
|
||||||
|
}
|
||||||
|
|
||||||
save_video_map(video_map)
|
save_video_map(video_map, site_key)
|
||||||
populated_t = sum(1 for u in urls if video_map.get(u, {}).get("title"))
|
populated_t = sum(1 for u in urls if video_map.get(u, {}).get("title"))
|
||||||
populated_d = sum(1 for u in urls if video_map.get(
|
populated_d = sum(1 for u in urls if video_map.get(u, {}).get("description"))
|
||||||
u, {}).get("description"))
|
print(f"[{site_key}] Titles populated: {populated_t}/{len(urls)}")
|
||||||
print(f"[+] Titles populated: {populated_t}/{len(urls)}")
|
print(f"[{site_key}] Descriptions populated: {populated_d}/{len(urls)}")
|
||||||
print(f"[+] Descriptions populated: {populated_d}/{len(urls)}")
|
|
||||||
|
|
||||||
|
|
||||||
def load_post_urls(headers):
|
def load_post_urls(
|
||||||
vm = load_video_map()
|
site_key: str,
|
||||||
|
base_url: str,
|
||||||
|
wp_api: str,
|
||||||
|
headers: dict[str, str],
|
||||||
|
) -> list[str]:
|
||||||
|
vm = load_video_map(site_key)
|
||||||
if vm:
|
if vm:
|
||||||
print(f"[+] {VIDEO_MAP_FILE} found — loading {len(vm)} post URLs.")
|
print(f"[{site_key}] video_map found — loading {len(vm)} post URLs.")
|
||||||
return list(vm.keys())
|
return list(vm.keys())
|
||||||
return fetch_post_urls_from_api(headers)
|
return fetch_post_urls_from_api(site_key, base_url, wp_api, headers)
|
||||||
|
|
||||||
|
|
||||||
def html_to_text(html_str):
|
def html_to_text(html_str: str) -> str:
|
||||||
"""Strip HTML tags, decode entities, and collapse whitespace into clean plain text."""
|
"""Strip HTML tags, decode entities, and collapse whitespace into clean plain text."""
|
||||||
import html
|
import html
|
||||||
text = re.sub(r'<br\s*/?>', '\n', html_str)
|
|
||||||
text = text.replace('</p>', '\n\n')
|
text = re.sub(r"<br\s*/?>", "\n", html_str)
|
||||||
text = re.sub(r'<[^>]+>', '', text)
|
text = text.replace("</p>", "\n\n")
|
||||||
|
text = re.sub(r"<[^>]+>", "", text)
|
||||||
text = html.unescape(text)
|
text = html.unescape(text)
|
||||||
lines = [line.strip() for line in text.splitlines()]
|
lines = [line.strip() for line in text.splitlines()]
|
||||||
text = '\n'.join(lines)
|
text = "\n".join(lines)
|
||||||
text = re.sub(r'\n{3,}', '\n\n', text)
|
text = re.sub(r"\n{3,}", "\n\n", text)
|
||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
def extract_mp4_from_html(html):
|
def extract_mp4_from_html(html: str) -> list[str]:
|
||||||
candidates = re.findall(r'https?://[^\s"\'<>]+', html)
|
candidates = re.findall(r'https?://[^\s"\'<>]+', html)
|
||||||
return [u for u in candidates if _is_video_url(u)]
|
return [u for u in candidates if _is_video_url(u)]
|
||||||
|
|
||||||
|
|
||||||
def extract_title_from_html(html):
|
def extract_title_from_html(html: str) -> str | None:
|
||||||
m = re.search(
|
m = re.search(r'<h1[^>]*class="entry-title"[^>]*>(.*?)</h1>', html, re.DOTALL)
|
||||||
r'<h1[^>]*class="entry-title"[^>]*>(.*?)</h1>', html, re.DOTALL)
|
|
||||||
if m:
|
if m:
|
||||||
title = re.sub(r'<[^>]+>', '', m.group(1)).strip()
|
title = re.sub(r"<[^>]+>", "", m.group(1)).strip()
|
||||||
return title
|
return title
|
||||||
m = re.search(r'<title>(.*?)(?:\s*[-–|].*)?</title>', html, re.DOTALL)
|
m = re.search(r"<title>(.*?)(?:\s*[-–|].*)?</title>", html, re.DOTALL)
|
||||||
if m:
|
if m:
|
||||||
return m.group(1).strip()
|
return m.group(1).strip()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def load_video_map():
|
|
||||||
if Path(VIDEO_MAP_FILE).exists():
|
|
||||||
try:
|
|
||||||
with open(VIDEO_MAP_FILE, encoding="utf-8") as f:
|
|
||||||
return json.load(f)
|
|
||||||
except (json.JSONDecodeError, OSError):
|
|
||||||
return {}
|
|
||||||
return {}
|
|
||||||
|
|
||||||
|
|
||||||
def save_video_map(video_map):
|
|
||||||
fd, tmp_path = tempfile.mkstemp(dir=Path(VIDEO_MAP_FILE).resolve().parent, suffix=".tmp")
|
|
||||||
try:
|
|
||||||
with os.fdopen(fd, "w", encoding="utf-8") as f:
|
|
||||||
json.dump(video_map, f, indent=2, ensure_ascii=False)
|
|
||||||
Path(tmp_path).replace(VIDEO_MAP_FILE)
|
|
||||||
except Exception:
|
|
||||||
try:
|
|
||||||
Path(tmp_path).unlink()
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _expects_video(url):
|
|
||||||
return "/pinkcuffs-videos/" in url
|
|
||||||
|
|
||||||
|
|
||||||
MAX_RETRIES = 2
|
MAX_RETRIES = 2
|
||||||
|
|
||||||
|
|
||||||
async def worker(worker_id, queue, context, known,
|
async def worker(
|
||||||
total, retry_counts, video_map, map_lock, shutdown_event):
|
worker_id: int,
|
||||||
|
queue: asyncio.Queue[tuple[int, str]],
|
||||||
|
context: BrowserContext,
|
||||||
|
known: set[str],
|
||||||
|
total: int,
|
||||||
|
retry_counts: dict[int, int],
|
||||||
|
video_map: dict[str, Any],
|
||||||
|
map_lock: asyncio.Lock,
|
||||||
|
shutdown_event: asyncio.Event,
|
||||||
|
reauth_lock: asyncio.Lock,
|
||||||
|
reauth_done: list[bool],
|
||||||
|
site_key: str,
|
||||||
|
site_cfg: dict[str, str],
|
||||||
|
) -> None:
|
||||||
|
base_url = site_cfg["base_url"]
|
||||||
|
cookie_domain = urlparse(base_url).hostname or site_cfg["cookie_domain"]
|
||||||
|
env_prefix = site_cfg["env_prefix"]
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
video_hits = set()
|
video_hits: set[str] = set()
|
||||||
|
|
||||||
page.on("response", lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None)
|
page.on(
|
||||||
|
"response",
|
||||||
|
lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while not shutdown_event.is_set():
|
while not shutdown_event.is_set():
|
||||||
@@ -279,19 +379,69 @@ async def worker(worker_id, queue, context, known,
|
|||||||
await page.goto(url, wait_until="networkidle", timeout=60000)
|
await page.goto(url, wait_until="networkidle", timeout=60000)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[W{worker_id}] Navigation error: {e}")
|
print(f"[W{worker_id}] Navigation error: {e}")
|
||||||
if _expects_video(url) and attempt < MAX_RETRIES:
|
if expects_video(url) and attempt < MAX_RETRIES:
|
||||||
retry_counts[idx] = attempt + 1
|
retry_counts[idx] = attempt + 1
|
||||||
queue.put_nowait((idx, url))
|
queue.put_nowait((idx, url))
|
||||||
print(f"[W{worker_id}] Re-queued for retry.")
|
print(f"[W{worker_id}] Re-queued for retry.")
|
||||||
elif not _expects_video(url):
|
elif not expects_video(url):
|
||||||
async with map_lock:
|
async with map_lock:
|
||||||
entry = video_map.get(url, {})
|
entry = video_map.get(url, {})
|
||||||
entry["scraped_at"] = int(time.time())
|
entry["scraped_at"] = int(time.time())
|
||||||
video_map[url] = entry
|
video_map[url] = entry
|
||||||
save_video_map(video_map)
|
save_video_map(video_map, site_key)
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run.")
|
f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run."
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
if "NoDirectAccessAllowed" in page.url:
|
||||||
|
recovered = False
|
||||||
|
async with reauth_lock:
|
||||||
|
if not reauth_done[0]:
|
||||||
|
username = os.environ.get(f"{env_prefix}_USERNAME", "").strip()
|
||||||
|
password = os.environ.get(f"{env_prefix}_PASSWORD", "").strip()
|
||||||
|
if username and password:
|
||||||
|
print(f"[W{worker_id}] Cookie expired — re-authenticating…")
|
||||||
|
try:
|
||||||
|
new_name, new_value = await asyncio.to_thread(
|
||||||
|
login_and_get_cookie, username, password, base_url
|
||||||
|
)
|
||||||
|
update_env(
|
||||||
|
new_name,
|
||||||
|
new_value,
|
||||||
|
env_key=f"{env_prefix}_LOGIN_COOKIE",
|
||||||
|
)
|
||||||
|
await context.add_cookies(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"name": new_name,
|
||||||
|
"value": new_value,
|
||||||
|
"domain": cookie_domain,
|
||||||
|
"path": "/",
|
||||||
|
"httpOnly": True,
|
||||||
|
"secure": True,
|
||||||
|
"sameSite": "None",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
reauth_done[0] = True
|
||||||
|
recovered = True
|
||||||
|
print(f"[W{worker_id}] Re-auth succeeded — re-queuing.")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[W{worker_id}] Re-auth failed: {e}")
|
||||||
|
shutdown_event.set()
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
f"[W{worker_id}] Cookie expired. "
|
||||||
|
f"Set {env_prefix}_USERNAME + {env_prefix}_PASSWORD "
|
||||||
|
"in .env for auto re-auth."
|
||||||
|
)
|
||||||
|
shutdown_event.set()
|
||||||
|
else:
|
||||||
|
recovered = True # another worker already re-authed
|
||||||
|
if recovered:
|
||||||
|
queue.put_nowait((idx, url))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
await asyncio.sleep(1.5)
|
await asyncio.sleep(1.5)
|
||||||
@@ -301,9 +451,15 @@ async def worker(worker_id, queue, context, known,
|
|||||||
found = set(html_videos) | set(video_hits)
|
found = set(html_videos) | set(video_hits)
|
||||||
video_hits.clear()
|
video_hits.clear()
|
||||||
|
|
||||||
all_videos = [m for m in found if m not in (
|
all_videos = [
|
||||||
f"{BASE_URL}/wp-content/plugins/easy-video-player/lib/blank.mp4",
|
m
|
||||||
)]
|
for m in found
|
||||||
|
if is_valid_url(m)
|
||||||
|
and m
|
||||||
|
not in (
|
||||||
|
f"{base_url}/wp-content/plugins/easy-video-player/lib/blank.mp4",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
async with map_lock:
|
async with map_lock:
|
||||||
new_found = found - known
|
new_found = found - known
|
||||||
@@ -312,7 +468,8 @@ async def worker(worker_id, queue, context, known,
|
|||||||
known.update(new_found)
|
known.update(new_found)
|
||||||
elif all_videos:
|
elif all_videos:
|
||||||
print(
|
print(
|
||||||
f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write.")
|
f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write."
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
print(f"[W{worker_id}] No video found on page.")
|
print(f"[W{worker_id}] No video found on page.")
|
||||||
|
|
||||||
@@ -322,51 +479,52 @@ async def worker(worker_id, queue, context, known,
|
|||||||
existing_videos = set(entry.get("videos", []))
|
existing_videos = set(entry.get("videos", []))
|
||||||
existing_videos.update(all_videos)
|
existing_videos.update(all_videos)
|
||||||
entry["videos"] = sorted(existing_videos)
|
entry["videos"] = sorted(existing_videos)
|
||||||
mark_done = bool(all_videos) or not _expects_video(url)
|
mark_done = bool(all_videos) or not expects_video(url)
|
||||||
if mark_done:
|
if mark_done:
|
||||||
entry["scraped_at"] = int(time.time())
|
entry["scraped_at"] = int(time.time())
|
||||||
video_map[url] = entry
|
video_map[url] = entry
|
||||||
save_video_map(video_map)
|
save_video_map(video_map, site_key)
|
||||||
|
|
||||||
if not mark_done:
|
if not mark_done:
|
||||||
if attempt < MAX_RETRIES:
|
if attempt < MAX_RETRIES:
|
||||||
retry_counts[idx] = attempt + 1
|
retry_counts[idx] = attempt + 1
|
||||||
queue.put_nowait((idx, url))
|
queue.put_nowait((idx, url))
|
||||||
print(
|
print(
|
||||||
f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES}).")
|
f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES})."
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run.")
|
f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run."
|
||||||
|
)
|
||||||
finally:
|
finally:
|
||||||
await page.close()
|
await page.close()
|
||||||
|
|
||||||
|
|
||||||
async def run():
|
async def run_for_site(
|
||||||
shutdown_event = asyncio.Event()
|
site_key: str,
|
||||||
loop = asyncio.get_running_loop()
|
site_cfg: dict[str, str],
|
||||||
|
shutdown_event: asyncio.Event,
|
||||||
|
) -> None:
|
||||||
|
base_url = site_cfg["base_url"]
|
||||||
|
cookie_domain = urlparse(base_url).hostname or site_cfg["cookie_domain"]
|
||||||
|
wp_api = f"{base_url}/wp-json/wp/v2"
|
||||||
|
|
||||||
def _handle_shutdown(signum, _frame):
|
cookie_name, cookie_value = _get_login_cookie(site_key, site_cfg)
|
||||||
print(f"\n[!] Signal {signum} received — finishing active pages then exiting…")
|
req_headers = _api_headers(base_url, cookie_name, cookie_value)
|
||||||
loop.call_soon_threadsafe(shutdown_event.set)
|
|
||||||
|
|
||||||
signal.signal(signal.SIGINT, _handle_shutdown)
|
urls = load_post_urls(site_key, base_url, wp_api, req_headers)
|
||||||
signal.signal(signal.SIGTERM, _handle_shutdown)
|
|
||||||
|
|
||||||
try:
|
video_map = load_video_map(site_key)
|
||||||
cookie_name, cookie_value = _get_login_cookie()
|
if any(
|
||||||
req_headers = {
|
u not in video_map
|
||||||
**API_HEADERS,
|
|
||||||
"Cookie": f"{cookie_name}={cookie_value}; eav-age-verified=1",
|
|
||||||
}
|
|
||||||
|
|
||||||
urls = load_post_urls(req_headers)
|
|
||||||
|
|
||||||
video_map = load_video_map()
|
|
||||||
if any(u not in video_map
|
|
||||||
or not video_map[u].get("title")
|
or not video_map[u].get("title")
|
||||||
or not video_map[u].get("description")
|
or not video_map[u].get("description")
|
||||||
for u in urls if _expects_video(u)):
|
for u in urls
|
||||||
fetch_metadata_from_api(video_map, urls, req_headers)
|
if expects_video(u)
|
||||||
|
):
|
||||||
|
fetch_metadata_from_api(
|
||||||
|
site_key, base_url, wp_api, video_map, urls, req_headers
|
||||||
|
)
|
||||||
|
|
||||||
known = {u for entry in video_map.values() for u in entry.get("videos", [])}
|
known = {u for entry in video_map.values() for u in entry.get("videos", [])}
|
||||||
|
|
||||||
@@ -377,62 +535,79 @@ async def run():
|
|||||||
entry = video_map.get(u, {})
|
entry = video_map.get(u, {})
|
||||||
if not entry.get("scraped_at"):
|
if not entry.get("scraped_at"):
|
||||||
pending.append((i, u))
|
pending.append((i, u))
|
||||||
elif _expects_video(u) and not entry.get("videos"):
|
elif expects_video(u) and not entry.get("videos"):
|
||||||
pending.append((i, u))
|
pending.append((i, u))
|
||||||
needs_map += 1
|
needs_map += 1
|
||||||
|
|
||||||
done_count = sum(1 for v in video_map.values() if v.get("scraped_at"))
|
done_count = sum(1 for v in video_map.values() if v.get("scraped_at"))
|
||||||
print(f"[+] Loaded {total} post URLs.")
|
print(f"[{site_key}] Loaded {total} post URLs.")
|
||||||
print(f"[+] Already have {len(known)} video URLs mapped.")
|
print(f"[{site_key}] Already have {len(known)} video URLs mapped.")
|
||||||
print(f"[+] Video map: {len(video_map)} entries in {VIDEO_MAP_FILE}")
|
print(f"[{site_key}] Video map: {len(video_map)} entries in video_map.json")
|
||||||
if done_count:
|
if done_count:
|
||||||
remaining_new = len(pending) - needs_map
|
remaining_new = len(pending) - needs_map
|
||||||
print(
|
print(
|
||||||
f"[↻] Resuming: {done_count} done, {remaining_new} new + {needs_map} needing map data.")
|
f"[{site_key}] Resuming: {done_count} done, "
|
||||||
|
f"{remaining_new} new + {needs_map} needing map data."
|
||||||
|
)
|
||||||
if not pending:
|
if not pending:
|
||||||
print("[✓] All URLs already processed and mapped.")
|
print(f"[{site_key}] All URLs already processed and mapped.")
|
||||||
return
|
return
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f"[⚡] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n")
|
f"[{site_key}] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n"
|
||||||
|
)
|
||||||
|
|
||||||
queue = asyncio.Queue()
|
queue: asyncio.Queue[tuple[int, str]] = asyncio.Queue()
|
||||||
for item in pending:
|
for item in pending:
|
||||||
queue.put_nowait(item)
|
queue.put_nowait(item)
|
||||||
|
|
||||||
map_lock = asyncio.Lock()
|
map_lock = asyncio.Lock()
|
||||||
retry_counts = {}
|
reauth_lock = asyncio.Lock()
|
||||||
|
reauth_done: list[bool] = [False]
|
||||||
|
retry_counts: dict[int, int] = {}
|
||||||
|
|
||||||
async with async_playwright() as p:
|
async with async_playwright() as p:
|
||||||
browser = await p.firefox.launch(headless=True)
|
browser = await p.firefox.launch(headless=True)
|
||||||
context = await browser.new_context()
|
context = await browser.new_context()
|
||||||
|
|
||||||
_cookie_domain = urlparse(BASE_URL).netloc
|
|
||||||
site_cookies = [
|
site_cookies = [
|
||||||
{
|
{
|
||||||
"name": cookie_name,
|
"name": cookie_name,
|
||||||
"value": cookie_value,
|
"value": cookie_value,
|
||||||
"domain": _cookie_domain,
|
"domain": cookie_domain,
|
||||||
"path": "/",
|
"path": "/",
|
||||||
"httpOnly": True,
|
"httpOnly": True,
|
||||||
"secure": True,
|
"secure": True,
|
||||||
"sameSite": "None"
|
"sameSite": "None",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "eav-age-verified",
|
"name": "eav-age-verified",
|
||||||
"value": "1",
|
"value": "1",
|
||||||
"domain": _cookie_domain,
|
"domain": cookie_domain,
|
||||||
"path": "/"
|
"path": "/",
|
||||||
}
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
await context.add_cookies(site_cookies)
|
await context.add_cookies(site_cookies) # type: ignore[arg-type]
|
||||||
|
|
||||||
num_workers = min(MAX_WORKERS, len(pending))
|
num_workers = min(MAX_WORKERS, len(pending))
|
||||||
workers = [
|
workers = [
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
worker(i, queue, context, known,
|
worker(
|
||||||
total, retry_counts, video_map, map_lock, shutdown_event)
|
i,
|
||||||
|
queue,
|
||||||
|
context,
|
||||||
|
known,
|
||||||
|
total,
|
||||||
|
retry_counts,
|
||||||
|
video_map,
|
||||||
|
map_lock,
|
||||||
|
shutdown_event,
|
||||||
|
reauth_lock,
|
||||||
|
reauth_done,
|
||||||
|
site_key,
|
||||||
|
site_cfg,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
for i in range(num_workers)
|
for i in range(num_workers)
|
||||||
]
|
]
|
||||||
@@ -442,21 +617,64 @@ async def run():
|
|||||||
|
|
||||||
mapped = sum(1 for v in video_map.values() if v.get("videos"))
|
mapped = sum(1 for v in video_map.values() if v.get("videos"))
|
||||||
print(
|
print(
|
||||||
f"\n[+] Video map: {mapped} posts with videos, {len(video_map)} total entries.")
|
f"\n[{site_key}] Video map: {mapped} posts with videos, {len(video_map)} total entries."
|
||||||
|
)
|
||||||
|
|
||||||
if not shutdown_event.is_set():
|
if not shutdown_event.is_set():
|
||||||
print(f"[✓] Completed. Full map in {VIDEO_MAP_FILE}")
|
print(f"[{site_key}] Completed. Full map in video_map.json")
|
||||||
else:
|
else:
|
||||||
done = sum(1 for v in video_map.values() if v.get("scraped_at"))
|
done = sum(1 for v in video_map.values() if v.get("scraped_at"))
|
||||||
print(f"[⏸] Paused — {done}/{total} done. Run again to resume.")
|
print(f"[{site_key}] Paused — {done}/{total} done. Run again to resume.")
|
||||||
|
|
||||||
|
|
||||||
|
async def run(selected_sites: list[str], explicit: bool) -> None:
|
||||||
|
shutdown_event = asyncio.Event()
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
|
||||||
|
def _handle_shutdown(signum: int, _: object) -> None:
|
||||||
|
print(f"\n[!] Signal {signum} received — finishing active pages then exiting…")
|
||||||
|
loop.call_soon_threadsafe(shutdown_event.set)
|
||||||
|
|
||||||
|
signal.signal(signal.SIGINT, _handle_shutdown)
|
||||||
|
signal.signal(signal.SIGTERM, _handle_shutdown)
|
||||||
|
|
||||||
|
try:
|
||||||
|
for site_key in selected_sites:
|
||||||
|
if shutdown_event.is_set():
|
||||||
|
break
|
||||||
|
site_cfg = SITES[site_key]
|
||||||
|
if not _has_credentials(site_cfg):
|
||||||
|
if explicit:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"No credentials or cookie found for {site_key}. See .env.example."
|
||||||
|
)
|
||||||
|
print(f"[{site_key}] No credentials found — skipping.")
|
||||||
|
continue
|
||||||
|
print(f"\n{'=' * 60}")
|
||||||
|
print(f" Site: {site_key} ({site_cfg['base_url']})")
|
||||||
|
print(f"{'=' * 60}\n")
|
||||||
|
await run_for_site(site_key, site_cfg, shutdown_event)
|
||||||
finally:
|
finally:
|
||||||
signal.signal(signal.SIGINT, signal.SIG_DFL)
|
signal.signal(signal.SIGINT, signal.SIG_DFL)
|
||||||
signal.signal(signal.SIGTERM, signal.SIG_DFL)
|
signal.signal(signal.SIGTERM, signal.SIG_DFL)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main() -> None:
|
||||||
|
parser = argparse.ArgumentParser(description="Scrape video URLs from member sites")
|
||||||
|
parser.add_argument(
|
||||||
|
"--site",
|
||||||
|
action="append",
|
||||||
|
choices=list(SITES.keys()),
|
||||||
|
dest="sites",
|
||||||
|
metavar="SITE",
|
||||||
|
help=f"Site(s) to scrape (default: all). Can be repeated. Choices: {', '.join(SITES)}",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
explicit = bool(args.sites)
|
||||||
|
selected = args.sites or list(SITES.keys())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
asyncio.run(run())
|
asyncio.run(run(selected, explicit))
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print("\n[!] Interrupted. Run again to resume.")
|
print("\n[!] Interrupted. Run again to resume.")
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
playwright==1.58.0
|
playwright==1.58.0
|
||||||
python-dotenv==1.2.1
|
python-dotenv==1.2.1
|
||||||
Requests==2.32.5
|
Requests==2.32.5
|
||||||
rookiepy==0.5.6
|
|
||||||
|
|||||||
@@ -4,16 +4,41 @@ Importable function:
|
|||||||
summarize_sizes(sizes) - return dict with total, smallest, largest, average, failed
|
summarize_sizes(sizes) - return dict with total, smallest, largest, average, failed
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from check_clashes import fmt_size, fetch_sizes, load_video_map, VIDEO_MAP_FILE
|
from typing import Optional, TypedDict
|
||||||
|
|
||||||
|
from check_clashes import (
|
||||||
|
fmt_size,
|
||||||
|
fetch_sizes,
|
||||||
|
load_video_map,
|
||||||
|
build_url_referers,
|
||||||
|
VIDEO_MAP_FILE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def summarize_sizes(sizes):
|
class SizeStats(TypedDict):
|
||||||
|
sized: int
|
||||||
|
total: int
|
||||||
|
total_bytes: int
|
||||||
|
smallest: int
|
||||||
|
largest: int
|
||||||
|
average: int
|
||||||
|
failed: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
def summarize_sizes(sizes: dict[str, Optional[int]]) -> SizeStats:
|
||||||
"""Given {url: size_or_None}, return a stats dict."""
|
"""Given {url: size_or_None}, return a stats dict."""
|
||||||
known = {u: s for u, s in sizes.items() if s is not None}
|
known = {u: s for u, s in sizes.items() if s is not None}
|
||||||
failed = [u for u, s in sizes.items() if s is None]
|
failed = [u for u, s in sizes.items() if s is None]
|
||||||
if not known:
|
if not known:
|
||||||
return {"sized": 0, "total": len(sizes), "total_bytes": 0,
|
return {
|
||||||
"smallest": 0, "largest": 0, "average": 0, "failed": failed}
|
"sized": 0,
|
||||||
|
"total": len(sizes),
|
||||||
|
"total_bytes": 0,
|
||||||
|
"smallest": 0,
|
||||||
|
"largest": 0,
|
||||||
|
"average": 0,
|
||||||
|
"failed": failed,
|
||||||
|
}
|
||||||
total_bytes = sum(known.values())
|
total_bytes = sum(known.values())
|
||||||
return {
|
return {
|
||||||
"sized": len(known),
|
"sized": len(known),
|
||||||
@@ -28,19 +53,28 @@ def summarize_sizes(sizes):
|
|||||||
|
|
||||||
# --------------- CLI ---------------
|
# --------------- CLI ---------------
|
||||||
|
|
||||||
def _progress(done, total):
|
|
||||||
|
def _progress(done: int, total: int) -> None:
|
||||||
if done % 200 == 0 or done == total:
|
if done % 200 == 0 or done == total:
|
||||||
print(f" {done}/{total}")
|
print(f" {done}/{total}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main() -> None:
|
||||||
vm = load_video_map()
|
vm = load_video_map()
|
||||||
urls = [u for entry in vm.values() for u in entry.get("videos", []) if u.startswith("http")]
|
urls: list[str] = [
|
||||||
|
u
|
||||||
|
for entry in vm.values()
|
||||||
|
for u in entry.get("videos", [])
|
||||||
|
if u.startswith("http")
|
||||||
|
]
|
||||||
|
|
||||||
|
url_referers = build_url_referers(vm)
|
||||||
print(f"[+] {len(urls)} URLs in {VIDEO_MAP_FILE}")
|
print(f"[+] {len(urls)} URLs in {VIDEO_MAP_FILE}")
|
||||||
print("[+] Fetching file sizes (20 threads)…\n")
|
print("[+] Fetching file sizes (20 threads)…\n")
|
||||||
|
|
||||||
sizes = fetch_sizes(urls, workers=20, on_progress=_progress)
|
sizes = fetch_sizes(
|
||||||
|
urls, workers=20, on_progress=_progress, url_referers=url_referers
|
||||||
|
)
|
||||||
stats = summarize_sizes(sizes)
|
stats = summarize_sizes(sizes)
|
||||||
|
|
||||||
print(f"\n{'=' * 45}")
|
print(f"\n{'=' * 45}")
|
||||||
|
|||||||
292
upload.py
292
upload.py
@@ -26,13 +26,14 @@ from pathlib import Path
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
from typing import Any, cast
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS
|
from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS, load_video_map
|
||||||
|
from config import SITES
|
||||||
from download import (
|
from download import (
|
||||||
load_video_map,
|
|
||||||
collect_urls,
|
collect_urls,
|
||||||
get_paths_for_mode,
|
get_paths_for_mode,
|
||||||
read_mode,
|
read_mode,
|
||||||
@@ -52,21 +53,21 @@ PT_NAME_MAX = 120
|
|||||||
|
|
||||||
# ── Text helpers ─────────────────────────────────────────────────────
|
# ── Text helpers ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
def clean_description(raw):
|
|
||||||
|
def clean_description(raw: str) -> str:
|
||||||
"""Strip WordPress shortcodes and HTML from a description."""
|
"""Strip WordPress shortcodes and HTML from a description."""
|
||||||
if not raw:
|
if not raw:
|
||||||
return ""
|
return ""
|
||||||
text = re.sub(r'\[/?[^\]]+\]', '', raw)
|
text = re.sub(r"\[/?[^\]]+\]", "", raw)
|
||||||
text = re.sub(r'<[^>]+>', '', text)
|
text = re.sub(r"<[^>]+>", "", text)
|
||||||
text = html.unescape(text)
|
text = html.unescape(text)
|
||||||
text = re.sub(r'\n{3,}', '\n\n', text).strip()
|
text = re.sub(r"\n{3,}", "\n\n", text).strip()
|
||||||
return text[:10000]
|
return text[:10000]
|
||||||
|
|
||||||
|
|
||||||
def make_pt_name(title, fallback_filename):
|
def make_pt_name(title: str, fallback_filename: str) -> str:
|
||||||
"""Build a PeerTube-safe video name (3-120 chars)."""
|
"""Build a PeerTube-safe video name (3-120 chars)."""
|
||||||
name = html.unescape(title).strip(
|
name = html.unescape(title).strip() if title else Path(fallback_filename).stem
|
||||||
) if title else Path(fallback_filename).stem
|
|
||||||
if len(name) > PT_NAME_MAX:
|
if len(name) > PT_NAME_MAX:
|
||||||
name = name[: PT_NAME_MAX - 1].rstrip() + "\u2026"
|
name = name[: PT_NAME_MAX - 1].rstrip() + "\u2026"
|
||||||
while len(name) < 3:
|
while len(name) < 3:
|
||||||
@@ -76,7 +77,8 @@ def make_pt_name(title, fallback_filename):
|
|||||||
|
|
||||||
# ── PeerTube API ─────────────────────────────────────────────────────
|
# ── PeerTube API ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
def get_oauth_token(base, username, password):
|
|
||||||
|
def get_oauth_token(base: str, username: str, password: str) -> str:
|
||||||
r = requests.get(f"{base}/api/v1/oauth-clients/local", timeout=15)
|
r = requests.get(f"{base}/api/v1/oauth-clients/local", timeout=15)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
client = r.json()
|
client = r.json()
|
||||||
@@ -93,26 +95,36 @@ def get_oauth_token(base, username, password):
|
|||||||
timeout=15,
|
timeout=15,
|
||||||
)
|
)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.json()["access_token"]
|
data_any: Any = r.json()
|
||||||
|
data = cast(dict[str, Any], data_any)
|
||||||
|
token = data.get("access_token")
|
||||||
|
if not isinstance(token, str) or not token:
|
||||||
|
raise RuntimeError("PeerTube token response missing access_token")
|
||||||
|
return token
|
||||||
|
|
||||||
|
|
||||||
def api_headers(token):
|
def api_headers(token: str) -> dict[str, str]:
|
||||||
return {"Authorization": f"Bearer {token}"}
|
return {"Authorization": f"Bearer {token}"}
|
||||||
|
|
||||||
|
|
||||||
def get_channel_id(base, token, channel_name):
|
def get_channel_id(base: str, token: str, channel_name: str) -> int:
|
||||||
r = requests.get(
|
r = requests.get(
|
||||||
f"{base}/api/v1/video-channels/{channel_name}",
|
f"{base}/api/v1/video-channels/{channel_name}",
|
||||||
headers=api_headers(token),
|
headers=api_headers(token),
|
||||||
timeout=15,
|
timeout=15,
|
||||||
)
|
)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.json()["id"]
|
data_any: Any = r.json()
|
||||||
|
data = cast(dict[str, Any], data_any)
|
||||||
|
cid = data.get("id")
|
||||||
|
if not isinstance(cid, int):
|
||||||
|
raise RuntimeError("PeerTube channel response missing id")
|
||||||
|
return cid
|
||||||
|
|
||||||
|
|
||||||
def get_channel_video_names(base, token, channel_name):
|
def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
|
||||||
"""Paginate through the channel and return a Counter of video names."""
|
"""Paginate through the channel and return a Counter of video names."""
|
||||||
counts = Counter()
|
counts: Counter[str] = Counter()
|
||||||
start = 0
|
start = 0
|
||||||
while True:
|
while True:
|
||||||
r = requests.get(
|
r = requests.get(
|
||||||
@@ -135,8 +147,16 @@ CHUNK_SIZE = 10 * 1024 * 1024 # 10 MB
|
|||||||
MAX_RETRIES = 5
|
MAX_RETRIES = 5
|
||||||
|
|
||||||
|
|
||||||
def _init_resumable(base, token, channel_id, filepath, filename, name,
|
def _init_resumable(
|
||||||
description="", nsfw=False):
|
base: str,
|
||||||
|
token: str,
|
||||||
|
channel_id: int,
|
||||||
|
filepath: Path,
|
||||||
|
filename: str,
|
||||||
|
name: str,
|
||||||
|
description: str = "",
|
||||||
|
nsfw: bool = False,
|
||||||
|
) -> tuple[str, int]:
|
||||||
"""POST to create a resumable upload session. Returns upload URL."""
|
"""POST to create a resumable upload session. Returns upload URL."""
|
||||||
file_size = Path(filepath).stat().st_size
|
file_size = Path(filepath).stat().st_size
|
||||||
metadata = {
|
metadata = {
|
||||||
@@ -171,7 +191,7 @@ def _init_resumable(base, token, channel_id, filepath, filename, name,
|
|||||||
return location, file_size
|
return location, file_size
|
||||||
|
|
||||||
|
|
||||||
def _query_offset(upload_url, token, file_size):
|
def _query_offset(upload_url: str, token: str, file_size: int) -> int:
|
||||||
"""Ask the server how many bytes it has received so far."""
|
"""Ask the server how many bytes it has received so far."""
|
||||||
r = requests.put(
|
r = requests.put(
|
||||||
upload_url,
|
upload_url,
|
||||||
@@ -193,8 +213,15 @@ def _query_offset(upload_url, token, file_size):
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
def upload_video(base, token, channel_id, filepath, name,
|
def upload_video(
|
||||||
description="", nsfw=False):
|
base: str,
|
||||||
|
token: str,
|
||||||
|
channel_id: int,
|
||||||
|
filepath: Path,
|
||||||
|
name: str,
|
||||||
|
description: str = "",
|
||||||
|
nsfw: bool = False,
|
||||||
|
) -> tuple[bool, str | None]:
|
||||||
"""Resumable chunked upload. Returns (ok, uuid)."""
|
"""Resumable chunked upload. Returns (ok, uuid)."""
|
||||||
filepath = Path(filepath)
|
filepath = Path(filepath)
|
||||||
filename = filepath.name
|
filename = filepath.name
|
||||||
@@ -202,8 +229,14 @@ def upload_video(base, token, channel_id, filepath, name,
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
upload_url, _ = _init_resumable(
|
upload_url, _ = _init_resumable(
|
||||||
base, token, channel_id, filepath, filename,
|
base,
|
||||||
name, description, nsfw,
|
token,
|
||||||
|
channel_id,
|
||||||
|
filepath,
|
||||||
|
filename,
|
||||||
|
name,
|
||||||
|
description,
|
||||||
|
nsfw,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" Init failed: {e}")
|
print(f" Init failed: {e}")
|
||||||
@@ -221,8 +254,11 @@ def upload_video(base, token, channel_id, filepath, name,
|
|||||||
chunk = f.read(chunk_len)
|
chunk = f.read(chunk_len)
|
||||||
|
|
||||||
pct = int(100 * (end + 1) / file_size)
|
pct = int(100 * (end + 1) / file_size)
|
||||||
print(f" {fmt_size(offset)}/{fmt_size(file_size)} ({pct}%)",
|
print(
|
||||||
end="\r", flush=True)
|
f" {fmt_size(offset)}/{fmt_size(file_size)} ({pct}%)",
|
||||||
|
end="\r",
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
r = requests.put(
|
r = requests.put(
|
||||||
@@ -239,12 +275,13 @@ def upload_video(base, token, channel_id, filepath, name,
|
|||||||
except (requests.ConnectionError, requests.Timeout) as e:
|
except (requests.ConnectionError, requests.Timeout) as e:
|
||||||
retries += 1
|
retries += 1
|
||||||
if retries > MAX_RETRIES:
|
if retries > MAX_RETRIES:
|
||||||
print(
|
print(f"\n Upload failed after {MAX_RETRIES} retries: {e}")
|
||||||
f"\n Upload failed after {MAX_RETRIES} retries: {e}")
|
|
||||||
return False, None
|
return False, None
|
||||||
wait = min(2**retries, 60)
|
wait = min(2**retries, 60)
|
||||||
print(f"\n Connection error, retry {retries}/{MAX_RETRIES} "
|
print(
|
||||||
f"in {wait}s ...")
|
f"\n Connection error, retry {retries}/{MAX_RETRIES} "
|
||||||
|
f"in {wait}s ..."
|
||||||
|
)
|
||||||
time.sleep(wait)
|
time.sleep(wait)
|
||||||
try:
|
try:
|
||||||
offset = _query_offset(upload_url, token, file_size)
|
offset = _query_offset(upload_url, token, file_size)
|
||||||
@@ -261,8 +298,7 @@ def upload_video(base, token, channel_id, filepath, name,
|
|||||||
retries = 0
|
retries = 0
|
||||||
|
|
||||||
elif r.status_code == 200:
|
elif r.status_code == 200:
|
||||||
print(
|
print(f" {fmt_size(file_size)}/{fmt_size(file_size)} (100%)")
|
||||||
f" {fmt_size(file_size)}/{fmt_size(file_size)} (100%)")
|
|
||||||
uuid = r.json().get("video", {}).get("uuid")
|
uuid = r.json().get("video", {}).get("uuid")
|
||||||
return True, uuid
|
return True, uuid
|
||||||
|
|
||||||
@@ -270,11 +306,9 @@ def upload_video(base, token, channel_id, filepath, name,
|
|||||||
retry_after = int(r.headers.get("Retry-After", 10))
|
retry_after = int(r.headers.get("Retry-After", 10))
|
||||||
retries += 1
|
retries += 1
|
||||||
if retries > MAX_RETRIES:
|
if retries > MAX_RETRIES:
|
||||||
print(
|
print(f"\n Upload failed: server returned {r.status_code}")
|
||||||
f"\n Upload failed: server returned {r.status_code}")
|
|
||||||
return False, None
|
return False, None
|
||||||
print(
|
print(f"\n Server {r.status_code}, retry in {retry_after}s ...")
|
||||||
f"\n Server {r.status_code}, retry in {retry_after}s ...")
|
|
||||||
time.sleep(retry_after)
|
time.sleep(retry_after)
|
||||||
try:
|
try:
|
||||||
offset = _query_offset(upload_url, token, file_size)
|
offset = _query_offset(upload_url, token, file_size)
|
||||||
@@ -301,7 +335,7 @@ _STATE = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_video_state(base, token, uuid):
|
def get_video_state(base: str, token: str, uuid: str) -> tuple[int, str]:
|
||||||
r = requests.get(
|
r = requests.get(
|
||||||
f"{base}/api/v1/videos/{uuid}",
|
f"{base}/api/v1/videos/{uuid}",
|
||||||
headers=api_headers(token),
|
headers=api_headers(token),
|
||||||
@@ -312,7 +346,7 @@ def get_video_state(base, token, uuid):
|
|||||||
return state["id"], state.get("label", "")
|
return state["id"], state.get("label", "")
|
||||||
|
|
||||||
|
|
||||||
def wait_for_published(base, token, uuid, poll_interval):
|
def wait_for_published(base: str, token: str, uuid: str, poll_interval: int) -> int:
|
||||||
"""Block until the video reaches state 1 (Published) or a failure state."""
|
"""Block until the video reaches state 1 (Published) or a failure state."""
|
||||||
started = time.monotonic()
|
started = time.monotonic()
|
||||||
while True:
|
while True:
|
||||||
@@ -329,8 +363,10 @@ def wait_for_published(base, token, uuid, poll_interval):
|
|||||||
try:
|
try:
|
||||||
sid, label = get_video_state(base, token, uuid)
|
sid, label = get_video_state(base, token, uuid)
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
print(f" -> Poll error ({e.__class__.__name__}) "
|
print(
|
||||||
f"after {elapsed_str}, retrying in {poll_interval}s …")
|
f" -> Poll error ({e.__class__.__name__}) "
|
||||||
|
f"after {elapsed_str}, retrying in {poll_interval}s …"
|
||||||
|
)
|
||||||
time.sleep(poll_interval)
|
time.sleep(poll_interval)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -343,13 +379,16 @@ def wait_for_published(base, token, uuid, poll_interval):
|
|||||||
print(f" -> FAILED: {display}")
|
print(f" -> FAILED: {display}")
|
||||||
return sid
|
return sid
|
||||||
|
|
||||||
print(f" -> {display} … {elapsed_str} elapsed (next check in {poll_interval}s)")
|
print(
|
||||||
|
f" -> {display} … {elapsed_str} elapsed (next check in {poll_interval}s)"
|
||||||
|
)
|
||||||
time.sleep(poll_interval)
|
time.sleep(poll_interval)
|
||||||
|
|
||||||
|
|
||||||
# ── State tracker ────────────────────────────────────────────────────
|
# ── State tracker ────────────────────────────────────────────────────
|
||||||
|
|
||||||
def load_uploaded(input_dir):
|
|
||||||
|
def load_uploaded(input_dir: str) -> set[Path]:
|
||||||
path = Path(input_dir) / UPLOADED_FILE
|
path = Path(input_dir) / UPLOADED_FILE
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
return set()
|
return set()
|
||||||
@@ -357,36 +396,60 @@ def load_uploaded(input_dir):
|
|||||||
return {Path(line.strip()) for line in f if line.strip()}
|
return {Path(line.strip()) for line in f if line.strip()}
|
||||||
|
|
||||||
|
|
||||||
def mark_uploaded(input_dir, rel_path):
|
def mark_uploaded(input_dir: str, rel_path: Path) -> None:
|
||||||
with open(Path(input_dir) / UPLOADED_FILE, "a") as f:
|
with open(Path(input_dir) / UPLOADED_FILE, "a") as f:
|
||||||
f.write(f"{rel_path}\n")
|
f.write(f"{rel_path}\n")
|
||||||
|
|
||||||
|
|
||||||
# ── File / metadata helpers ─────────────────────────────────────────
|
# ── File / metadata helpers ─────────────────────────────────────────
|
||||||
|
|
||||||
def build_path_to_meta(video_map, input_dir):
|
|
||||||
"""Map each expected download path (relative) to {title, description}."""
|
def build_path_to_meta(
|
||||||
|
video_map: dict[str, Any],
|
||||||
|
input_dir: str,
|
||||||
|
) -> dict[Path, dict[str, str]]:
|
||||||
|
"""Map each expected download path (relative) to {title, description, original_filename}."""
|
||||||
urls = collect_urls(video_map)
|
urls = collect_urls(video_map)
|
||||||
mode = read_mode(input_dir) or MODE_ORIGINAL
|
mode = read_mode(input_dir) or MODE_ORIGINAL
|
||||||
paths = get_paths_for_mode(mode, urls, video_map, input_dir)
|
|
||||||
|
|
||||||
url_meta = {}
|
url_to_site: dict[str, str] = {}
|
||||||
for entry in video_map.values():
|
for site_key in SITES:
|
||||||
t = entry.get("title", "")
|
for entry in load_video_map(site_key).values():
|
||||||
d = entry.get("description", "")
|
for vid_url in entry.get("videos", []):
|
||||||
for video_url in entry.get("videos", []):
|
url_to_site[vid_url] = site_key
|
||||||
if video_url not in url_meta:
|
|
||||||
url_meta[video_url] = {"title": t, "description": d}
|
|
||||||
|
|
||||||
result = {}
|
paths = get_paths_for_mode(mode, urls, video_map, input_dir, url_to_site)
|
||||||
|
|
||||||
|
url_meta: dict[str, dict[str, str]] = {}
|
||||||
|
for entry_any in video_map.values():
|
||||||
|
entry = cast(dict[str, Any], entry_any)
|
||||||
|
|
||||||
|
t = entry.get("title")
|
||||||
|
d = entry.get("description")
|
||||||
|
title = t if isinstance(t, str) else ""
|
||||||
|
desc = d if isinstance(d, str) else ""
|
||||||
|
|
||||||
|
videos_any = entry.get("videos", [])
|
||||||
|
if isinstance(videos_any, list):
|
||||||
|
for video_url_any in videos_any:
|
||||||
|
if not isinstance(video_url_any, str):
|
||||||
|
continue
|
||||||
|
if video_url_any not in url_meta:
|
||||||
|
url_meta[video_url_any] = {"title": title, "description": desc}
|
||||||
|
|
||||||
|
result: dict[Path, dict[str, str]] = {}
|
||||||
for url, abs_path in paths.items():
|
for url, abs_path in paths.items():
|
||||||
rel = Path(abs_path).relative_to(input_dir)
|
rel = abs_path.relative_to(input_dir)
|
||||||
meta = url_meta.get(url, {"title": "", "description": ""})
|
meta = url_meta.get(url, {"title": "", "description": ""})
|
||||||
result[rel] = {**meta, "original_filename": url_to_filename(url)}
|
result[rel] = {
|
||||||
|
"title": meta.get("title", ""),
|
||||||
|
"description": meta.get("description", ""),
|
||||||
|
"original_filename": url_to_filename(url),
|
||||||
|
}
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def find_videos(input_dir):
|
def find_videos(input_dir: str) -> set[Path]:
|
||||||
"""Walk input_dir and return a set of relative paths for all video files."""
|
"""Walk input_dir and return a set of relative paths for all video files."""
|
||||||
found = set()
|
found = set()
|
||||||
for root, dirs, files in os.walk(input_dir):
|
for root, dirs, files in os.walk(input_dir):
|
||||||
@@ -399,7 +462,12 @@ def find_videos(input_dir):
|
|||||||
|
|
||||||
# ── Channel match helpers ─────────────────────────────────────────────
|
# ── Channel match helpers ─────────────────────────────────────────────
|
||||||
|
|
||||||
def _channel_match(rel, path_meta, existing):
|
|
||||||
|
def _channel_match(
|
||||||
|
rel: Path,
|
||||||
|
path_meta: dict[Path, dict[str, str]],
|
||||||
|
existing: set[str],
|
||||||
|
) -> tuple[bool, str]:
|
||||||
"""Return (matched, name) for a local file against the channel name set.
|
"""Return (matched, name) for a local file against the channel name set.
|
||||||
|
|
||||||
Checks both the title-derived name and the original-filename-derived name
|
Checks both the title-derived name and the original-filename-derived name
|
||||||
@@ -409,38 +477,62 @@ def _channel_match(rel, path_meta, existing):
|
|||||||
"""
|
"""
|
||||||
meta = path_meta.get(rel, {})
|
meta = path_meta.get(rel, {})
|
||||||
name = make_pt_name(meta.get("title", ""), rel.name)
|
name = make_pt_name(meta.get("title", ""), rel.name)
|
||||||
|
|
||||||
orig_fn = meta.get("original_filename", "")
|
orig_fn = meta.get("original_filename", "")
|
||||||
raw_name = make_pt_name("", orig_fn) if orig_fn else None
|
raw_name: str | None = make_pt_name("", orig_fn) if orig_fn else None
|
||||||
matched = name in existing or (raw_name and raw_name != name and raw_name in existing)
|
|
||||||
|
matched = name in existing
|
||||||
|
if not matched and raw_name is not None and raw_name != name:
|
||||||
|
matched = raw_name in existing
|
||||||
|
|
||||||
return matched, name
|
return matched, name
|
||||||
|
|
||||||
|
|
||||||
# ── CLI ──────────────────────────────────────────────────────────────
|
# ── CLI ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def main():
|
|
||||||
|
def main() -> None:
|
||||||
ap = argparse.ArgumentParser(
|
ap = argparse.ArgumentParser(
|
||||||
description="Upload videos to PeerTube with transcoding-aware batching",
|
description="Upload videos to PeerTube with transcoding-aware batching",
|
||||||
)
|
)
|
||||||
ap.add_argument("--input", "-i", default=DEFAULT_OUTPUT,
|
ap.add_argument(
|
||||||
help=f"Directory with downloaded videos (default: {DEFAULT_OUTPUT})")
|
"--input",
|
||||||
ap.add_argument("--url",
|
"-i",
|
||||||
help="PeerTube instance URL (or set PEERTUBE_URL env var)")
|
default=DEFAULT_OUTPUT,
|
||||||
ap.add_argument("--username", "-U",
|
help=f"Directory with downloaded videos (default: {DEFAULT_OUTPUT})",
|
||||||
help="PeerTube username (or set PEERTUBE_USER env var)")
|
)
|
||||||
ap.add_argument("--password", "-p",
|
ap.add_argument("--url", help="PeerTube instance URL (or set PEERTUBE_URL env var)")
|
||||||
help="PeerTube password (or set PEERTUBE_PASSWORD env var)")
|
ap.add_argument(
|
||||||
ap.add_argument("--channel", "-C",
|
"--username", "-U", help="PeerTube username (or set PEERTUBE_USER env var)"
|
||||||
help="Channel to upload to (or set PEERTUBE_CHANNEL env var)")
|
)
|
||||||
ap.add_argument("--batch-size", "-b", type=int, default=DEFAULT_BATCH_SIZE,
|
ap.add_argument(
|
||||||
help="Videos to upload before waiting for transcoding (default: 1)")
|
"--password", "-p", help="PeerTube password (or set PEERTUBE_PASSWORD env var)"
|
||||||
ap.add_argument("--poll-interval", type=int, default=DEFAULT_POLL,
|
)
|
||||||
help=f"Seconds between state polls (default: {DEFAULT_POLL})")
|
ap.add_argument(
|
||||||
ap.add_argument("--skip-wait", action="store_true",
|
"--channel", "-C", help="Channel to upload to (or set PEERTUBE_CHANNEL env var)"
|
||||||
help="Upload everything without waiting for transcoding")
|
)
|
||||||
ap.add_argument("--nsfw", action="store_true",
|
ap.add_argument(
|
||||||
help="Mark videos as NSFW")
|
"--batch-size",
|
||||||
ap.add_argument("--dry-run", "-n", action="store_true",
|
"-b",
|
||||||
help="Preview what would be uploaded")
|
type=int,
|
||||||
|
default=DEFAULT_BATCH_SIZE,
|
||||||
|
help="Videos to upload before waiting for transcoding (default: 1)",
|
||||||
|
)
|
||||||
|
ap.add_argument(
|
||||||
|
"--poll-interval",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_POLL,
|
||||||
|
help=f"Seconds between state polls (default: {DEFAULT_POLL})",
|
||||||
|
)
|
||||||
|
ap.add_argument(
|
||||||
|
"--skip-wait",
|
||||||
|
action="store_true",
|
||||||
|
help="Upload everything without waiting for transcoding",
|
||||||
|
)
|
||||||
|
ap.add_argument("--nsfw", action="store_true", help="Mark videos as NSFW")
|
||||||
|
ap.add_argument(
|
||||||
|
"--dry-run", "-n", action="store_true", help="Preview what would be uploaded"
|
||||||
|
)
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
|
|
||||||
url = args.url or os.environ.get("PEERTUBE_URL")
|
url = args.url or os.environ.get("PEERTUBE_URL")
|
||||||
@@ -449,12 +541,16 @@ def main():
|
|||||||
password = args.password or os.environ.get("PEERTUBE_PASSWORD")
|
password = args.password or os.environ.get("PEERTUBE_PASSWORD")
|
||||||
|
|
||||||
if not args.dry_run:
|
if not args.dry_run:
|
||||||
missing = [label for label, val in [
|
missing = [
|
||||||
|
label
|
||||||
|
for label, val in [
|
||||||
("--url / PEERTUBE_URL", url),
|
("--url / PEERTUBE_URL", url),
|
||||||
("--username / PEERTUBE_USER", username),
|
("--username / PEERTUBE_USER", username),
|
||||||
("--channel / PEERTUBE_CHANNEL", channel),
|
("--channel / PEERTUBE_CHANNEL", channel),
|
||||||
("--password / PEERTUBE_PASSWORD", password),
|
("--password / PEERTUBE_PASSWORD", password),
|
||||||
] if not val]
|
]
|
||||||
|
if not val
|
||||||
|
]
|
||||||
if missing:
|
if missing:
|
||||||
for label in missing:
|
for label in missing:
|
||||||
print(f"[!] Required: {label}")
|
print(f"[!] Required: {label}")
|
||||||
@@ -468,7 +564,8 @@ def main():
|
|||||||
unmatched = on_disk - set(path_meta.keys())
|
unmatched = on_disk - set(path_meta.keys())
|
||||||
if unmatched:
|
if unmatched:
|
||||||
print(
|
print(
|
||||||
f"[!] {len(unmatched)} file(s) on disk not in video_map (will use filename as title)")
|
f"[!] {len(unmatched)} file(s) on disk not in video_map (will use filename as title)"
|
||||||
|
)
|
||||||
for rel in unmatched:
|
for rel in unmatched:
|
||||||
path_meta[rel] = {"title": "", "description": ""}
|
path_meta[rel] = {"title": "", "description": ""}
|
||||||
|
|
||||||
@@ -493,10 +590,14 @@ def main():
|
|||||||
sz = (Path(args.input) / rel).stat().st_size
|
sz = (Path(args.input) / rel).stat().st_size
|
||||||
total_bytes += sz
|
total_bytes += sz
|
||||||
print(f" [{fmt_size(sz):>10}] {name}")
|
print(f" [{fmt_size(sz):>10}] {name}")
|
||||||
print(
|
print(f"\n Total: {fmt_size(total_bytes)} across {len(pending)} videos")
|
||||||
f"\n Total: {fmt_size(total_bytes)} across {len(pending)} videos")
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
assert url is not None
|
||||||
|
assert username is not None
|
||||||
|
assert channel is not None
|
||||||
|
assert password is not None
|
||||||
|
|
||||||
# ── authenticate ──
|
# ── authenticate ──
|
||||||
base = url.rstrip("/")
|
base = url.rstrip("/")
|
||||||
if not base.startswith("http"):
|
if not base.startswith("http"):
|
||||||
@@ -533,7 +634,9 @@ def main():
|
|||||||
if _channel_match(rel, path_meta, existing)[0]:
|
if _channel_match(rel, path_meta, existing)[0]:
|
||||||
pre_matched.append(rel)
|
pre_matched.append(rel)
|
||||||
if pre_matched:
|
if pre_matched:
|
||||||
print(f"\n[+] Pre-sweep: {len(pre_matched)} local file(s) already on channel — marking uploaded")
|
print(
|
||||||
|
f"\n[+] Pre-sweep: {len(pre_matched)} local file(s) already on channel — marking uploaded"
|
||||||
|
)
|
||||||
for rel in pre_matched:
|
for rel in pre_matched:
|
||||||
mark_uploaded(args.input, rel)
|
mark_uploaded(args.input, rel)
|
||||||
pending = [rel for rel in pending if rel not in set(pre_matched)]
|
pending = [rel for rel in pending if rel not in set(pre_matched)]
|
||||||
@@ -548,7 +651,8 @@ def main():
|
|||||||
# ── flush batch if full ──
|
# ── flush batch if full ──
|
||||||
if not args.skip_wait and len(batch) >= args.batch_size:
|
if not args.skip_wait and len(batch) >= args.batch_size:
|
||||||
print(
|
print(
|
||||||
f"\n[+] Waiting for {len(batch)} video(s) to finish processing ...")
|
f"\n[+] Waiting for {len(batch)} video(s) to finish processing ..."
|
||||||
|
)
|
||||||
for uuid, bname in batch:
|
for uuid, bname in batch:
|
||||||
print(f"\n [{bname}]")
|
print(f"\n [{bname}]")
|
||||||
wait_for_published(base, token, uuid, args.poll_interval)
|
wait_for_published(base, token, uuid, args.poll_interval)
|
||||||
@@ -568,18 +672,19 @@ def main():
|
|||||||
print(f"\n[{total_up + 1}/{len(pending)}] {name}")
|
print(f"\n[{total_up + 1}/{len(pending)}] {name}")
|
||||||
print(f" File: {rel} ({fmt_size(sz)})")
|
print(f" File: {rel} ({fmt_size(sz)})")
|
||||||
|
|
||||||
ok, uuid = upload_video(
|
ok, uuid_opt = upload_video(
|
||||||
base, token, channel_id, filepath, name, desc, nsfw)
|
base, token, channel_id, filepath, name, desc, nsfw
|
||||||
|
)
|
||||||
if not ok:
|
if not ok:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print(f" Uploaded uuid={uuid}")
|
print(f" Uploaded uuid={uuid_opt}")
|
||||||
mark_uploaded(args.input, rel)
|
mark_uploaded(args.input, rel)
|
||||||
total_up += 1
|
total_up += 1
|
||||||
existing.add(name)
|
existing.add(name)
|
||||||
|
|
||||||
if uuid:
|
if uuid_opt is not None:
|
||||||
batch.append((uuid, name))
|
batch.append((uuid_opt, name))
|
||||||
|
|
||||||
# ── wait for final batch ──
|
# ── wait for final batch ──
|
||||||
if batch and not args.skip_wait:
|
if batch and not args.skip_wait:
|
||||||
@@ -589,8 +694,7 @@ def main():
|
|||||||
wait_for_published(base, token, uuid, args.poll_interval)
|
wait_for_published(base, token, uuid, args.poll_interval)
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print(
|
print(f"\n\n[!] Interrupted after {total_up} uploads. Re-run to continue.")
|
||||||
f"\n\n[!] Interrupted after {total_up} uploads. Re-run to continue.")
|
|
||||||
sys.exit(130)
|
sys.exit(130)
|
||||||
|
|
||||||
print(f"\n{'=' * 50}")
|
print(f"\n{'=' * 50}")
|
||||||
|
|||||||
4121
video_map.json
4121
video_map.json
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user