Files
jailbirdz-dl/download.py
2026-02-28 21:37:39 +01:00

444 lines
14 KiB
Python

"""Download videos from video_map.json with resume, integrity checks, and naming modes.
Usage:
python download.py # downloads with remembered (or default original) naming
python download.py --output /mnt/nas # custom directory
python download.py --titles # switch to title-based filenames (remembers choice)
python download.py --original # switch back to original filenames (remembers choice)
python download.py --reorganize # rename existing files to match current mode
python download.py --dry-run # preview what would happen
python download.py --workers 6 # override concurrency (default 4)
"""
import argparse
import json
from pathlib import Path
import re
import shutil
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Optional
import requests
from check_clashes import (
make_session,
fmt_size,
url_to_filename,
find_clashes,
build_download_paths,
fetch_sizes,
load_video_map,
is_valid_url,
VIDEO_MAP_FILE,
)
# Chunk size (8 MiB) passed to iter_content when streaming a response to disk.
CHUNK_SIZE: int = 8 * 1024 * 1024
# Download directory used when --output is not given.
DEFAULT_OUTPUT: str = "downloads"
# Number of concurrent downloads used when --workers is not given.
DEFAULT_WORKERS: int = 4
# Marker file inside the output directory that remembers the naming mode.
MODE_FILE: str = ".naming_mode"
# The two naming-mode identifiers stored in MODE_FILE.
MODE_ORIGINAL: str = "original"
MODE_TITLE: str = "title"
# ── Naming mode persistence ──────────────────────────────────────────
def read_mode(output_dir: str | Path) -> Optional[str]:
    """Return the naming mode remembered for *output_dir*.

    The mode is read from the ``MODE_FILE`` marker inside the directory and
    returned with surrounding whitespace stripped. ``None`` is returned when
    the marker does not exist.
    """
    marker = Path(output_dir).joinpath(MODE_FILE)
    if not marker.exists():
        return None
    saved_text = marker.read_text()
    return saved_text.strip()
def write_mode(output_dir: str | Path, mode: str) -> None:
    """Remember *mode* for *output_dir* by writing it to the ``MODE_FILE`` marker.

    The directory (including missing parents) is created first if necessary.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    marker = target_dir / MODE_FILE
    marker.write_text(mode)
def resolve_mode(args: argparse.Namespace) -> str:
    """Pick the naming mode for this run and return it as a string.

    Precedence: an explicit ``--titles`` / ``--original`` flag wins, then the
    mode saved in the output directory, then ``MODE_ORIGINAL``. Supplying both
    flags prints an error and exits with status 1.
    """
    remembered = read_mode(args.output)
    want_titles, want_original = args.titles, args.original
    if want_titles and want_original:
        print("[!] Cannot use --titles and --original together.")
        raise SystemExit(1)
    if want_titles:
        return MODE_TITLE
    if want_original:
        return MODE_ORIGINAL
    # An empty or missing marker falls through to the default.
    return remembered or MODE_ORIGINAL
# ── Filename helpers ─────────────────────────────────────────────────
def sanitize_filename(title: str, max_len: int = 180) -> str:
    """Return *title* reduced to a string usable as a file name stem.

    The characters ``<``, ``>``, ``:``, ``"``, ``/``, backslash, ``|``, ``?``
    and ``*`` are removed, every run of whitespace collapses to a single
    space, and the result is cut to at most *max_len* characters.

    Trailing dots and spaces are stripped *after* the cut, so truncation can
    never leave a name that ends in ``.`` or a space.

    Args:
        title: Raw title text.
        max_len: Maximum length of the returned string.

    Returns:
        The sanitized name; may be empty when *title* holds no usable
        characters.
    """
    name = re.sub(r'[<>:"/\\|?*]', "", title)
    name = re.sub(r"\s+", " ", name).strip()
    # Strip after slicing: the cut itself may expose a new trailing dot/space.
    return name[:max_len].rstrip(" .")
def build_title_paths(
    urls: list[str],
    url_to_title: dict[str, str],
    output_dir: str | Path,
) -> dict[str, Path]:
    """Map each URL to a title-based destination path inside *output_dir*.

    The file name is the sanitized title plus the extension of the URL's
    original file name (``.mp4`` when it has none). The stem of the original
    file name is used instead when a URL has no title, or when its title
    sanitizes to an empty string (which would otherwise yield a bare
    extension such as ``.mp4``).

    When several URLs end up with the same file name, each of them gets the
    original file name (minus extension) appended in square brackets.

    Args:
        urls: URLs to compute paths for.
        url_to_title: Title lookup keyed by URL; missing or empty titles are allowed.
        output_dir: Directory the returned paths are rooted in.

    Returns:
        A dict from URL to destination path.
    """
    out_dir = Path(output_dir)
    name_to_urls = defaultdict(list)
    url_to_parts = {}
    for url in urls:
        original_name = url_to_filename(url)
        ext = Path(original_name).suffix or ".mp4"
        title = url_to_title.get(url)
        base = sanitize_filename(title) if title else ""
        if not base:
            # No title, or nothing left of it after sanitizing.
            base = Path(original_name).stem
        url_to_parts[url] = (base, ext, original_name)
        name_to_urls[base + ext].append(url)

    paths = {}
    for url in urls:
        base, ext, original_name = url_to_parts[url]
        full = base + ext
        if len(name_to_urls[full]) > 1:
            # Disambiguate clashing titles with the original file name.
            slug = original_name.rsplit(".", 1)[0]
            paths[url] = out_dir / f"{base} [{slug}]{ext}"
        else:
            paths[url] = out_dir / full
    return paths
def get_paths_for_mode(
    mode: str,
    urls: list[str],
    video_map: dict[str, Any],
    output_dir: str | Path,
) -> dict[str, Path]:
    """Return the URL -> destination path mapping for the given naming *mode*.

    ``MODE_TITLE`` builds title-based paths from the titles in *video_map*;
    any other value delegates to ``build_download_paths``.
    """
    if mode != MODE_TITLE:
        return build_download_paths(urls, output_dir)
    titles_by_url = build_url_title_map(video_map)
    return build_title_paths(urls, titles_by_url, output_dir)
# ── Reorganize ───────────────────────────────────────────────────────
def reorganize(
    urls: list[str],
    video_map: dict[str, Any],
    output_dir: str | Path,
    target_mode: str,
    dry_run: bool = False,
) -> None:
    """Rename existing files from the other naming scheme to *target_mode*.

    For every URL the path under the opposite mode is compared with the path
    under *target_mode*. A file, and independently its ``.part`` companion,
    is moved when the old path exists and the new one does not. Directories
    left empty by the moves are removed.

    Unless *dry_run* is set, *target_mode* is saved as the directory's naming
    mode, including when there was nothing to move.

    Args:
        urls: URLs whose files should be considered.
        video_map: Video map used to derive title-based names.
        output_dir: Download directory that holds the files.
        target_mode: Naming mode the files should end up in.
        dry_run: When true, only print the planned renames.
    """
    output_path = Path(output_dir)
    other_mode = MODE_TITLE if target_mode == MODE_ORIGINAL else MODE_ORIGINAL
    old_paths = get_paths_for_mode(other_mode, urls, video_map, output_dir)
    new_paths = get_paths_for_mode(target_mode, urls, video_map, output_dir)

    moves = []
    for url in urls:
        old = old_paths[url]
        new = new_paths[url]
        if old == new:
            continue
        if old.exists() and not new.exists():
            moves.append((old, new))
        # Partial downloads follow their final name so a later run can resume them.
        old_part = old.with_name(old.name + ".part")
        new_part = new.with_name(new.name + ".part")
        if old_part.exists() and not new_part.exists():
            moves.append((old_part, new_part))

    if not moves:
        print("[✓] Nothing to reorganize — files already match the target mode.")
        if not dry_run:
            # Record the mode anyway: main() refuses to download while the
            # saved mode differs from the requested one, so skipping this
            # would make a mode switch impossible when no file needs renaming.
            write_mode(output_dir, target_mode)
        return

    print(f"[+] {len(moves)} file(s) to rename ({other_mode} → {target_mode}):\n")
    for old, new in moves:
        old_rel = old.relative_to(output_path)
        new_rel = new.relative_to(output_path)
        if dry_run:
            print(f" [dry-run] {old_rel} → {new_rel}")
        else:
            new.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(old, new)
            print(f" {old_rel} → {new_rel}")

    if dry_run:
        print(f"\n[dry-run] Would rename {len(moves)} files. No changes made.")
        return

    # Clean up directories left empty, walking upwards until the output
    # directory or the first directory that cannot be removed.
    for old, _ in moves:
        d = old.parent
        while d != output_path:
            try:
                d.rmdir()
            except OSError:
                break
            d = d.parent
    write_mode(output_dir, target_mode)
    print(f"\n[✓] Reorganized. Mode saved: {target_mode}")
# ── Download ─────────────────────────────────────────────────────────
def download_one(
    session: requests.Session,
    url: str,
    dest: str | Path,
    expected_size: Optional[int],
) -> tuple[str, int]:
    """Download *url* to *dest*, resuming from ``<dest>.part`` when possible.

    Data is streamed into ``<dest>.part`` and moved to *dest* only once the
    size check passes, so *dest* never holds a partial file.

    Args:
        session: Session used to issue the GET request.
        url: Remote file to fetch.
        dest: Final destination path; parent directories are created.
        expected_size: Remote size in bytes, or ``None`` when unknown (no
            resume and no size verification in that case).

    Returns:
        ``(status, bytes_written)`` where status is ``"ok"``,
        ``"size_mismatch"`` or ``"error: <message>"``, and bytes_written is
        the number of bytes written during this call.
    """
    dest = Path(dest)
    part = dest.with_name(dest.name + ".part")
    dest.parent.mkdir(parents=True, exist_ok=True)

    if dest.exists() and expected_size is not None:
        if dest.stat().st_size == expected_size:
            return "ok", 0
        # Wrong size: discard and fetch again.
        dest.unlink()

    part_exists = part.exists()
    existing = part.stat().st_size if part_exists else 0

    if part_exists and expected_size is not None and existing == expected_size:
        # A previous run finished the transfer but never got to the rename.
        try:
            part.replace(dest)
        except OSError as e:
            return f"error: {e}", 0
        return "ok", 0

    headers = {}
    if existing and expected_size is not None and existing < expected_size:
        headers["Range"] = f"bytes={existing}-"

    try:
        r = session.get(url, headers=headers, stream=True, timeout=60)
    except Exception as e:
        return f"error: {e}", 0

    written = 0
    try:
        if r.status_code == 416:
            # Server says there is nothing beyond what we already have.
            part.replace(dest)
            return "ok", 0
        r.raise_for_status()
        # Append only when the server honoured the Range request (206).
        # A plain 200 carries the whole file again, so start from scratch.
        resumed = "Range" in headers and r.status_code == 206
        if not resumed:
            existing = 0
        with open(part, "ab" if resumed else "wb") as f:
            for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)
                written += len(chunk)
    except Exception as e:
        return f"error: {e}", written
    finally:
        # With stream=True the connection stays checked out until closed.
        r.close()

    final_size = existing + written
    if expected_size is not None and final_size != expected_size:
        return "size_mismatch", written
    try:
        # replace() overwrites an existing dest on every platform.
        part.replace(dest)
    except OSError as e:
        return f"error: {e}", written
    return "ok", written
def collect_urls(video_map: dict[str, Any]) -> list[str]:
    """Return the distinct valid video URLs of *video_map* in first-seen order.

    Each entry's ``"videos"`` list is scanned; repeated URLs are considered
    only once. URLs rejected by ``is_valid_url`` are left out and their count
    is reported on stdout.
    """
    accepted: list[str] = []
    visited: set[str] = set()
    rejected = 0
    for record in video_map.values():
        for candidate in record.get("videos", []):
            if candidate in visited:
                continue
            visited.add(candidate)
            if not is_valid_url(candidate):
                rejected += 1
                continue
            accepted.append(candidate)
    if rejected:
        print(f"[!] Skipped {rejected} malformed URL(s)")
    return accepted
def build_url_title_map(video_map: dict[str, Any]) -> dict[str, str]:
    """Map every video URL to the title of the first entry that lists it.

    Entries without a ``"title"`` key contribute an empty string; entries
    without a ``"videos"`` key contribute nothing.
    """
    titles_by_url: dict[str, str] = {}
    for record in video_map.values():
        record_title = record.get("title", "")
        for link in record.get("videos", []):
            # setdefault keeps the title of the first entry that claimed the URL.
            titles_by_url.setdefault(link, record_title)
    return titles_by_url
# ── Main ─────────────────────────────────────────────────────────────
def main() -> None:
    """Command-line entry point.

    Parses the arguments, resolves the naming mode, optionally reorganizes
    existing files, then downloads every pending URL with a thread pool and
    prints a summary.

    A changed naming mode blocks downloading until ``--reorganize`` has been
    run. Ctrl-C cancels downloads that have not started yet; downloads already
    in progress cannot be interrupted from here and keep running in their
    worker threads.
    """
    parser = argparse.ArgumentParser(description="Download videos from video_map.json")
    parser.add_argument(
        "--output",
        "-o",
        default=DEFAULT_OUTPUT,
        help=f"Download directory (default: {DEFAULT_OUTPUT})",
    )
    naming = parser.add_mutually_exclusive_group()
    naming.add_argument(
        "--titles",
        "-t",
        action="store_true",
        help="Use title-based filenames (saved as default for this directory)",
    )
    naming.add_argument(
        "--original",
        action="store_true",
        help="Use original CloudFront filenames (saved as default for this directory)",
    )
    parser.add_argument(
        "--reorganize",
        action="store_true",
        help="Rename existing files to match the current naming mode",
    )
    parser.add_argument(
        "--dry-run", "-n", action="store_true", help="Preview without making changes"
    )
    parser.add_argument(
        "--workers",
        "-w",
        type=int,
        default=DEFAULT_WORKERS,
        help=f"Concurrent downloads (default: {DEFAULT_WORKERS})",
    )
    args = parser.parse_args()
    if args.workers < 1:
        # ThreadPoolExecutor rejects max_workers <= 0 with a bare ValueError.
        parser.error("--workers must be at least 1")

    video_map = load_video_map()
    urls = collect_urls(video_map)
    mode = resolve_mode(args)
    saved = read_mode(args.output)
    mode_changed = saved is not None and saved != mode
    print(f"[+] {len(urls)} MP4 URLs from {VIDEO_MAP_FILE}")
    print(f"[+] Naming mode: {mode}" + (" (changed!)" if mode_changed else ""))

    # Handle reorganize
    if args.reorganize or mode_changed:
        if mode_changed and not args.reorganize:
            print(f"\n[!] Mode changed from '{saved}' to '{mode}'.")
            print(
                " Use --reorganize to rename existing files, or --dry-run to preview."
            )
            print(" Refusing to download until existing files are reorganized.")
            return
        reorganize(urls, video_map, args.output, mode, dry_run=args.dry_run)
        if args.dry_run or args.reorganize:
            return

    # Save mode
    if not args.dry_run:
        write_mode(args.output, mode)

    paths = get_paths_for_mode(mode, urls, video_map, args.output)
    clashes = find_clashes(urls)
    if clashes:
        print(
            f"[+] {len(clashes)} filename clash(es) resolved with subfolders/suffixes"
        )

    already = [u for u in urls if paths[u].exists()]
    pending = [u for u in urls if not paths[u].exists()]
    print(f"[+] Already downloaded: {len(already)}")
    print(f"[+] To download: {len(pending)}")
    if not pending:
        print("\n[✓] Everything is already downloaded.")
        return

    if args.dry_run:
        print(f"\n[dry-run] Would download {len(pending)} files to {args.output}/")
        for url in pending[:20]:
            print(f"{paths[url].name}")
        if len(pending) > 20:
            print(f" … and {len(pending) - 20} more")
        return

    print("\n[+] Fetching remote file sizes…")
    session = make_session()
    remote_sizes = fetch_sizes(pending, workers=20)
    sized = {u: s for u, s in remote_sizes.items() if s is not None}
    total_bytes = sum(sized.values())
    print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")

    if already:
        print(f"[+] Verifying {len(already)} existing files…")
        already_sizes = fetch_sizes(already, workers=20)
        mismatched = 0
        for url in already:
            dest = paths[url]
            local = dest.stat().st_size
            remote = already_sizes.get(url)
            if remote and local != remote:
                mismatched += 1
                print(
                    f"[!] Size mismatch: {dest.name} "
                    f"(local {fmt_size(local)} vs remote {fmt_size(remote)})"
                )
                # Queue for re-download; download_one removes the bad file.
                pending.append(url)
                remote_sizes[url] = remote
        if mismatched:
            print(f"[!] {mismatched} file(s) will be re-downloaded due to size mismatch")

    print(f"\n[⚡] Downloading with {args.workers} threads…\n")
    completed = 0
    failed: list[str] = []
    total_written = 0
    total = len(pending)
    interrupted = False

    def do_download(url: str) -> tuple[str, tuple[str, int]]:
        dest = paths[url]
        expected = remote_sizes.get(url)
        return url, download_one(session, url, dest, expected)

    # The pool is managed by hand rather than with a `with` block: leaving a
    # `with` block waits for every queued download before an interrupt
    # handler placed after it could run.
    pool = ThreadPoolExecutor(max_workers=args.workers)
    try:
        futures = {pool.submit(do_download, u): u for u in pending}
        for fut in as_completed(futures):
            url = futures[fut]
            try:
                _, (status, written) = fut.result()
            except Exception as e:
                # One unexpected failure must not abort the rest of the run.
                status, written = f"error: {e}", 0
            total_written += written
            completed += 1
            name = paths[url].name
            if status == "ok" and written > 0:
                print(f" [{completed}/{total}] ✓ {name} ({fmt_size(written)})")
            elif status == "ok":
                print(f" [{completed}/{total}] ✓ {name} (already complete)")
            elif status == "size_mismatch":
                print(f" [{completed}/{total}] ⚠ {name} (size mismatch)")
                failed.append(url)
            else:
                print(f" [{completed}/{total}] ✗ {name} ({status})")
                failed.append(url)
    except KeyboardInterrupt:
        interrupted = True
        print("\n\n[⏸] Interrupted! Partial downloads saved as .part files.")
    finally:
        # After an interrupt, drop queued work and do not block on running
        # transfers; otherwise wait so no worker outlives the summary.
        pool.shutdown(wait=not interrupted, cancel_futures=True)

    print(f"\n{'=' * 50}")
    print(f" Downloaded: {fmt_size(total_written)}")
    print(f" Completed: {completed}/{total}")
    if failed:
        print(f" Failed: {len(failed)} (re-run to retry)")
    if interrupted:
        print(" Paused — re-run to resume.")
    elif not failed:
        print(" All done!")
    print(f"{'=' * 50}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()