# jailbirdz-dl/upload.py
# (snapshot metadata: 2026-03-01 01:36:01 +01:00 — 708 lines, 23 KiB, Python)
"""Upload videos to PeerTube with transcoding-aware flow control.
Uploads videos one batch at a time, waits for each batch to be fully transcoded
and moved to object storage before uploading the next — preventing disk
exhaustion on the PeerTube server.
Usage:
python upload.py # upload from ./downloads
python upload.py -i /mnt/vol/dl # custom input dir
python upload.py --batch-size 2 # upload 2, wait, repeat
python upload.py --dry-run # preview without uploading
python upload.py --skip-wait # upload without waiting
Required (CLI flag or env var):
--url / PEERTUBE_URL
--username / PEERTUBE_USER
--channel / PEERTUBE_CHANNEL
--password / PEERTUBE_PASSWORD
"""
import argparse
from collections import Counter
import html
import os
from pathlib import Path
import re
import sys
import time
from typing import Any, cast
import requests
from dotenv import load_dotenv
from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS, load_video_map
from config import SITES
from download import (
collect_urls,
get_paths_for_mode,
read_mode,
MODE_ORIGINAL,
DEFAULT_OUTPUT,
)
load_dotenv()  # allow PEERTUBE_* settings to come from a local .env file
# ── Defaults ─────────────────────────────────────────────────────────
DEFAULT_BATCH_SIZE = 1  # videos uploaded per batch before waiting for transcoding
DEFAULT_POLL = 30  # seconds between transcoding-state polls
UPLOADED_FILE = ".uploaded"  # per-input-dir tracker file of already-uploaded paths
PT_NAME_MAX = 120  # PeerTube's maximum video-name length (chars)
# ── Text helpers ─────────────────────────────────────────────────────
def clean_description(raw: str) -> str:
    """Strip WordPress shortcodes and HTML markup from a description.

    Also collapses runs of three-or-more newlines to a single blank line
    and truncates the result to 10 000 characters.
    """
    if not raw:
        return ""
    without_shortcodes = re.sub(r"\[/?[^\]]+\]", "", raw)
    without_tags = re.sub(r"<[^>]+>", "", without_shortcodes)
    unescaped = html.unescape(without_tags)
    collapsed = re.sub(r"\n{3,}", "\n\n", unescaped).strip()
    return collapsed[:10000]
def make_pt_name(title: str, fallback_filename: str) -> str:
    """Build a PeerTube-safe video name (3-120 chars).

    Prefers the (HTML-unescaped) title; falls back to the filename stem.
    Over-long names are truncated with a trailing ellipsis, too-short
    names padded with underscores.
    """
    if title:
        name = html.unescape(title).strip()
    else:
        name = Path(fallback_filename).stem
    if len(name) > PT_NAME_MAX:
        name = name[: PT_NAME_MAX - 1].rstrip() + "\u2026"
    return name.ljust(3, "_")
# ── PeerTube API ─────────────────────────────────────────────────────
def get_oauth_token(base: str, username: str, password: str) -> str:
    """Authenticate via the OAuth password grant and return a bearer token.

    Raises requests.HTTPError on HTTP failures and RuntimeError when the
    token endpoint responds without an access_token.
    """
    client_resp = requests.get(f"{base}/api/v1/oauth-clients/local", timeout=15)
    client_resp.raise_for_status()
    client = client_resp.json()
    token_resp = requests.post(
        f"{base}/api/v1/users/token",
        data={
            "client_id": client["client_id"],
            "client_secret": client["client_secret"],
            "grant_type": "password",
            "username": username,
            "password": password,
        },
        timeout=15,
    )
    token_resp.raise_for_status()
    payload = cast(dict[str, Any], token_resp.json())
    token = payload.get("access_token")
    if not isinstance(token, str) or not token:
        raise RuntimeError("PeerTube token response missing access_token")
    return token
def api_headers(token: str) -> dict[str, str]:
    """Return the Authorization header dict for a bearer *token*."""
    return {"Authorization": "Bearer " + token}
def get_channel_id(base: str, token: str, channel_name: str) -> int:
    """Resolve *channel_name* to its numeric PeerTube channel id.

    Raises RuntimeError if the response carries no integer id.
    """
    resp = requests.get(
        f"{base}/api/v1/video-channels/{channel_name}",
        headers=api_headers(token),
        timeout=15,
    )
    resp.raise_for_status()
    payload = cast(dict[str, Any], resp.json())
    channel_id = payload.get("id")
    if not isinstance(channel_id, int):
        raise RuntimeError("PeerTube channel response missing id")
    return channel_id
def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
    """Paginate through the channel and return a Counter of video names."""
    names: Counter[str] = Counter()
    page_start = 0
    while True:
        resp = requests.get(
            f"{base}/api/v1/video-channels/{channel_name}/videos",
            params={"start": page_start, "count": 100},
            headers=api_headers(token),
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
        names.update(video["name"] for video in payload.get("data", []))
        page_start += 100
        # Stop once we have paged past the reported total.
        if page_start >= payload.get("total", 0):
            return names
CHUNK_SIZE = 10 * 1024 * 1024  # 10 MB per resumable-upload PUT
MAX_RETRIES = 5  # consecutive failures tolerated before aborting an upload
def _init_resumable(
    base: str,
    token: str,
    channel_id: int,
    filepath: Path,
    filename: str,
    name: str,
    description: str = "",
    nsfw: bool = False,
) -> tuple[str, int]:
    """Create a resumable upload session on the PeerTube server.

    Returns ``(upload_url, file_size)`` — the docstring previously claimed
    only a URL was returned. The Location header may come back
    scheme-relative (``//host/...``) or path-relative (``/...``); both are
    normalised to an absolute URL.

    Raises requests.HTTPError if the server rejects the session request.
    """
    # filepath is already a Path per the signature; no need to re-wrap it.
    file_size = filepath.stat().st_size
    metadata: dict[str, Any] = {
        "name": name,
        "channelId": channel_id,
        "filename": filename,
        "nsfw": nsfw,
        "waitTranscoding": True,
        "privacy": 1,  # 1 = public
    }
    if description:
        metadata["description"] = description
    r = requests.post(
        f"{base}/api/v1/videos/upload-resumable",
        headers={
            **api_headers(token),
            "Content-Type": "application/json",
            "X-Upload-Content-Length": str(file_size),
            "X-Upload-Content-Type": "video/mp4",
        },
        json=metadata,
        timeout=30,
    )
    r.raise_for_status()
    location = r.headers["Location"]
    if location.startswith("//"):  # scheme-relative
        location = "https:" + location
    elif location.startswith("/"):  # path-relative
        location = base + location
    return location, file_size
def _query_offset(upload_url: str, token: str, file_size: int) -> int:
    """Ask the server how many bytes of the upload it has received so far.

    Sends an empty PUT with ``Content-Range: bytes */<size>``; 200 means the
    whole file is there, 308 reports the committed range.
    """
    resp = requests.put(
        upload_url,
        headers={
            **api_headers(token),
            "Content-Range": f"bytes */{file_size}",
            "Content-Length": "0",
        },
        timeout=15,
    )
    if resp.status_code == 308:
        committed = resp.headers.get("Range", "")
        # Range looks like "bytes=0-12345"; the next offset is last byte + 1.
        return int(committed.split("-")[1]) + 1 if committed else 0
    if resp.status_code == 200:
        return file_size
    resp.raise_for_status()
    return 0
def upload_video(
    base: str,
    token: str,
    channel_id: int,
    filepath: Path,
    name: str,
    description: str = "",
    nsfw: bool = False,
) -> tuple[bool, str | None]:
    """Resumable chunked upload. Returns (ok, uuid)."""
    filepath = Path(filepath)
    filename = filepath.name
    file_size = filepath.stat().st_size
    # Open the session first; the server returns a URL that accepts
    # Content-Range PUTs (Google-resumable-style protocol).
    try:
        upload_url, _ = _init_resumable(
            base,
            token,
            channel_id,
            filepath,
            filename,
            name,
            description,
            nsfw,
        )
    except Exception as e:
        print(f" Init failed: {e}")
        return False, None
    offset = 0  # next byte to send
    retries = 0  # consecutive failures at the current position
    with open(filepath, "rb") as f:
        while offset < file_size:
            end = min(offset + CHUNK_SIZE, file_size) - 1  # inclusive last byte
            chunk_len = end - offset + 1
            # Seek every iteration: after a retry, `offset` may have been
            # reset to whatever the server reports as committed.
            f.seek(offset)
            chunk = f.read(chunk_len)
            pct = int(100 * (end + 1) / file_size)
            print(
                f" {fmt_size(offset)}/{fmt_size(file_size)} ({pct}%)",
                end="\r",
                flush=True,
            )
            try:
                r = requests.put(
                    upload_url,
                    headers={
                        **api_headers(token),
                        "Content-Type": "application/octet-stream",
                        "Content-Range": f"bytes {offset}-{end}/{file_size}",
                        "Content-Length": str(chunk_len),
                    },
                    data=chunk,
                    timeout=120,
                )
            except (requests.ConnectionError, requests.Timeout) as e:
                # Network hiccup: exponential backoff (capped at 60s), then
                # ask the server how far it actually got before resending.
                retries += 1
                if retries > MAX_RETRIES:
                    print(f"\n Upload failed after {MAX_RETRIES} retries: {e}")
                    return False, None
                wait = min(2**retries, 60)
                print(
                    f"\n Connection error, retry {retries}/{MAX_RETRIES} "
                    f"in {wait}s ..."
                )
                time.sleep(wait)
                try:
                    offset = _query_offset(upload_url, token, file_size)
                except Exception:
                    # Best effort: if the offset query also fails, resend
                    # from the last known offset.
                    pass
                continue
            if r.status_code == 308:
                # 308: chunk accepted, upload incomplete. Trust the server's
                # Range header for the committed offset when present.
                range_hdr = r.headers.get("Range", "")
                if range_hdr:
                    offset = int(range_hdr.split("-")[1]) + 1
                else:
                    offset = end + 1
                retries = 0
            elif r.status_code == 200:
                # Final chunk accepted; the response carries the video uuid.
                print(f" {fmt_size(file_size)}/{fmt_size(file_size)} (100%)")
                uuid = r.json().get("video", {}).get("uuid")
                return True, uuid
            elif r.status_code in (502, 503, 429):
                # Server overloaded / rate-limited: honour Retry-After, then
                # re-query the committed offset before continuing.
                retry_after = int(r.headers.get("Retry-After", 10))
                retries += 1
                if retries > MAX_RETRIES:
                    print(f"\n Upload failed: server returned {r.status_code}")
                    return False, None
                print(f"\n Server {r.status_code}, retry in {retry_after}s ...")
                time.sleep(retry_after)
                try:
                    offset = _query_offset(upload_url, token, file_size)
                except Exception:
                    pass
            else:
                # Any other status is fatal for this file.
                detail = r.text[:300] if r.text else str(r.status_code)
                print(f"\n Upload failed ({r.status_code}): {detail}")
                return False, None
    print("\n Unexpected: sent all bytes but no 200 response")
    return False, None
# PeerTube video state ids -> display labels; unknown ids fall back to the
# server-supplied label (see wait_for_published).
_STATE = {
    1: "Published",
    2: "To transcode",
    3: "To import",
    6: "Moving to object storage",
    7: "Transcoding failed",
    8: "Storage move failed",
    9: "To edit",
}
def get_video_state(base: str, token: str, uuid: str) -> tuple[int, str]:
    """Fetch a video's processing state as ``(state_id, label)``."""
    resp = requests.get(
        f"{base}/api/v1/videos/{uuid}",
        headers=api_headers(token),
        timeout=15,
    )
    resp.raise_for_status()
    state = resp.json()["state"]
    return state["id"], state.get("label", "")
def wait_for_published(base: str, token: str, uuid: str, poll_interval: int) -> int:
    """Block until the video reaches state 1 (Published) or a failure state.

    Polls the video state every ``poll_interval`` seconds. Transient request
    errors during polling are logged and retried indefinitely.

    Returns the final state id: 1 (Published), 7 (Transcoding failed) or
    8 (Storage move failed).
    """
    started = time.monotonic()
    while True:
        # Human-readable elapsed time since waiting began.
        elapsed = int(time.monotonic() - started)
        hours, rem = divmod(elapsed, 3600)
        mins, secs = divmod(rem, 60)
        if hours:
            elapsed_str = f"{hours}h {mins:02d}m {secs:02d}s"
        elif mins:
            elapsed_str = f"{mins}m {secs:02d}s"
        else:
            elapsed_str = f"{secs}s"
        try:
            sid, label = get_video_state(base, token, uuid)
        except requests.exceptions.RequestException as e:
            # Poll failures are transient (server busy transcoding) — retry.
            print(
                f" -> Poll error ({e.__class__.__name__}) "
                f"after {elapsed_str}, retrying in {poll_interval}s …"
            )
            time.sleep(poll_interval)
            continue
        display = _STATE.get(sid, label or f"state {sid}")
        if sid == 1:
            print(f" -> {display}")
            return sid
        if sid in (7, 8):
            print(f" -> FAILED: {display}")
            return sid
        # Fix: the label and elapsed time previously ran together with no
        # separator ("To transcode3s elapsed").
        print(
            f" -> {display} — {elapsed_str} elapsed (next check in {poll_interval}s)"
        )
        time.sleep(poll_interval)
# ── State tracker ────────────────────────────────────────────────────
def load_uploaded(input_dir: str) -> set[Path]:
    """Return the set of relative paths recorded in the ``.uploaded`` tracker.

    Missing tracker file means nothing has been uploaded yet.
    """
    tracker = Path(input_dir) / UPLOADED_FILE
    if not tracker.exists():
        return set()
    with open(tracker) as fh:
        return {Path(entry) for line in fh if (entry := line.strip())}
def mark_uploaded(input_dir: str, rel_path: Path) -> None:
    """Append *rel_path* to the ``.uploaded`` tracker file in *input_dir*."""
    tracker = Path(input_dir) / UPLOADED_FILE
    with open(tracker, "a") as fh:
        fh.write(f"{rel_path}\n")
# ── File / metadata helpers ─────────────────────────────────────────
def build_path_to_meta(
    video_map: dict[str, Any],
    input_dir: str,
) -> dict[Path, dict[str, str]]:
    """Map each expected download path (relative) to {title, description, original_filename}."""
    urls = collect_urls(video_map)
    mode = read_mode(input_dir) or MODE_ORIGINAL

    # Which site each video URL belongs to (used for path layout).
    url_to_site: dict[str, str] = {}
    for site_key in SITES:
        for entry in load_video_map(site_key).values():
            for vid_url in entry.get("videos", []):
                url_to_site[vid_url] = site_key

    paths = get_paths_for_mode(mode, urls, video_map, input_dir, url_to_site)

    # First-seen title/description per video URL.
    url_meta: dict[str, dict[str, str]] = {}
    for entry_any in video_map.values():
        entry = cast(dict[str, Any], entry_any)
        raw_title = entry.get("title")
        raw_desc = entry.get("description")
        title = raw_title if isinstance(raw_title, str) else ""
        desc = raw_desc if isinstance(raw_desc, str) else ""
        video_urls = entry.get("videos", [])
        if not isinstance(video_urls, list):
            continue
        for video_url in video_urls:
            if isinstance(video_url, str):
                url_meta.setdefault(video_url, {"title": title, "description": desc})

    result: dict[Path, dict[str, str]] = {}
    for url, abs_path in paths.items():
        meta = url_meta.get(url, {"title": "", "description": ""})
        result[abs_path.relative_to(input_dir)] = {
            "title": meta.get("title", ""),
            "description": meta.get("description", ""),
            "original_filename": url_to_filename(url),
        }
    return result
def find_videos(input_dir: str) -> set[Path]:
    """Walk input_dir and return a set of relative paths for all video files."""
    base = Path(input_dir)
    videos: set[Path] = set()
    for root, dirs, files in os.walk(input_dir):
        # Prune hidden directories in place so os.walk never descends them.
        dirs[:] = [d for d in dirs if not d.startswith(".")]
        videos.update(
            (Path(root) / fname).relative_to(base)
            for fname in files
            if Path(fname).suffix.lower() in VIDEO_EXTS
        )
    return videos
# ── Channel match helpers ─────────────────────────────────────────────
def _channel_match(
    rel: Path,
    path_meta: dict[Path, dict[str, str]],
    existing: set[str],
) -> tuple[bool, str]:
    """Return (matched, name) for a local file against the channel name set.

    A file counts as matched when either its title-derived name or its
    original-filename-derived name is already present on the channel, so
    videos uploaded under either form are recognised. Shared by the
    pre-reconcile sweep and the per-file check in the upload loop.
    """
    meta = path_meta.get(rel, {})
    name = make_pt_name(meta.get("title", ""), rel.name)
    if name in existing:
        return True, name
    orig_fn = meta.get("original_filename", "")
    if orig_fn:
        raw_name = make_pt_name("", orig_fn)
        if raw_name != name and raw_name in existing:
            return True, name
    return False, name
# ── CLI ──────────────────────────────────────────────────────────────
def main() -> None:
    """CLI entry point.

    Parses arguments, reconciles local files against the channel's existing
    videos, then uploads pending files batch by batch, waiting for each
    batch to finish processing before the next (unless --skip-wait).
    """
    ap = argparse.ArgumentParser(
        description="Upload videos to PeerTube with transcoding-aware batching",
    )
    ap.add_argument(
        "--input",
        "-i",
        default=DEFAULT_OUTPUT,
        help=f"Directory with downloaded videos (default: {DEFAULT_OUTPUT})",
    )
    ap.add_argument("--url", help="PeerTube instance URL (or set PEERTUBE_URL env var)")
    ap.add_argument(
        "--username", "-U", help="PeerTube username (or set PEERTUBE_USER env var)"
    )
    ap.add_argument(
        "--password", "-p", help="PeerTube password (or set PEERTUBE_PASSWORD env var)"
    )
    ap.add_argument(
        "--channel", "-C", help="Channel to upload to (or set PEERTUBE_CHANNEL env var)"
    )
    ap.add_argument(
        "--batch-size",
        "-b",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        help="Videos to upload before waiting for transcoding (default: 1)",
    )
    ap.add_argument(
        "--poll-interval",
        type=int,
        default=DEFAULT_POLL,
        help=f"Seconds between state polls (default: {DEFAULT_POLL})",
    )
    ap.add_argument(
        "--skip-wait",
        action="store_true",
        help="Upload everything without waiting for transcoding",
    )
    ap.add_argument("--nsfw", action="store_true", help="Mark videos as NSFW")
    ap.add_argument(
        "--dry-run", "-n", action="store_true", help="Preview what would be uploaded"
    )
    args = ap.parse_args()
    # CLI flags take precedence over environment variables.
    url = args.url or os.environ.get("PEERTUBE_URL")
    username = args.username or os.environ.get("PEERTUBE_USER")
    channel = args.channel or os.environ.get("PEERTUBE_CHANNEL")
    password = args.password or os.environ.get("PEERTUBE_PASSWORD")
    # Credentials are only required when actually uploading.
    if not args.dry_run:
        missing = [
            label
            for label, val in [
                ("--url / PEERTUBE_URL", url),
                ("--username / PEERTUBE_USER", username),
                ("--channel / PEERTUBE_CHANNEL", channel),
                ("--password / PEERTUBE_PASSWORD", password),
            ]
            if not val
        ]
        if missing:
            for label in missing:
                print(f"[!] Required: {label}")
            sys.exit(1)
    # ── load metadata & scan disk ──
    video_map = load_video_map()
    path_meta = build_path_to_meta(video_map, args.input)
    on_disk = find_videos(args.input)
    # Files on disk but missing from video_map are still uploaded, using
    # their filename as the title.
    unmatched = on_disk - set(path_meta.keys())
    if unmatched:
        print(
            f"[!] {len(unmatched)} file(s) on disk not in video_map (will use filename as title)"
        )
        for rel in unmatched:
            path_meta[rel] = {"title": "", "description": ""}
    uploaded = load_uploaded(args.input)
    pending = sorted(rel for rel in on_disk if rel not in uploaded)
    print(f"[+] {len(on_disk)} video files in {args.input}/")
    print(f"[+] {len(uploaded)} already uploaded")
    print(f"[+] {len(pending)} pending")
    print(f"[+] Batch size: {args.batch_size}")
    if not pending:
        print("\nAll videos already uploaded.")
        return
    # ── dry run ──
    if args.dry_run:
        total_bytes = 0
        for rel in pending:
            meta = path_meta.get(rel, {})
            name = make_pt_name(meta.get("title", ""), rel.name)
            sz = (Path(args.input) / rel).stat().st_size
            total_bytes += sz
            print(f" [{fmt_size(sz):>10}] {name}")
        print(f"\n Total: {fmt_size(total_bytes)} across {len(pending)} videos")
        return
    # Type narrowing only: the missing-credential check above already
    # exited if any of these were unset.
    assert url is not None
    assert username is not None
    assert channel is not None
    assert password is not None
    # ── authenticate ──
    base = url.rstrip("/")
    if not base.startswith("http"):
        base = "https://" + base
    print(f"\n[+] Authenticating with {base} ...")
    token = get_oauth_token(base, username, password)
    print(f"[+] Authenticated as {username}")
    channel_id = get_channel_id(base, token, channel)
    print(f"[+] Channel: {channel} (id {channel_id})")
    name_counts = get_channel_video_names(base, token, channel)
    existing = set(name_counts)
    total = sum(name_counts.values())
    print(f"[+] Found {total} video(s) on channel ({len(name_counts)} unique name(s))")
    dupes = {name: count for name, count in name_counts.items() if count > 1}
    if dupes:
        print(f"[!] {len(dupes)} duplicate name(s) detected on channel:")
        for name, count in sorted(dupes.items()):
            print(f" x{count} {name}")
    # ── pre-reconcile: sweep all pending against channel names ────────
    # The main upload loop discovers already-uploaded videos lazily as it
    # walks the sorted pending list — meaning on a fresh run (no .uploaded
    # file) you won't know how many files are genuinely new until the loop
    # has processed everything. Doing a full sweep here, before any
    # upload starts, gives an accurate count up-front and pre-populates
    # .uploaded so that interrupted/re-run sessions skip them instantly
    # without re-checking each time.
    pre_matched = []
    for rel in pending:
        if _channel_match(rel, path_meta, existing)[0]:
            pre_matched.append(rel)
    if pre_matched:
        print(
            f"\n[+] Pre-sweep: {len(pre_matched)} local file(s) already on channel — marking uploaded"
        )
        for rel in pre_matched:
            mark_uploaded(args.input, rel)
        pending = [rel for rel in pending if rel not in set(pre_matched)]
        print(f"[+] {len(pending)} left to upload\n")
    nsfw = args.nsfw
    total_up = 0
    batch: list[tuple[str, str]] = []  # [(uuid, name), ...]
    try:
        for rel in pending:
            # ── flush batch if full ──
            if not args.skip_wait and len(batch) >= args.batch_size:
                print(
                    f"\n[+] Waiting for {len(batch)} video(s) to finish processing ..."
                )
                for uuid, bname in batch:
                    print(f"\n [{bname}]")
                    wait_for_published(base, token, uuid, args.poll_interval)
                batch.clear()
            filepath = Path(args.input) / rel
            meta = path_meta.get(rel, {})
            name = make_pt_name(meta.get("title", ""), rel.name)
            desc = clean_description(meta.get("description", ""))
            sz = filepath.stat().st_size
            # Re-check per file: `existing` grows as uploads succeed, so a
            # later file that derives the same name is skipped.
            if _channel_match(rel, path_meta, existing)[0]:
                print(f"\n[skip] already on channel: {name}")
                mark_uploaded(args.input, rel)
                continue
            print(f"\n[{total_up + 1}/{len(pending)}] {name}")
            print(f" File: {rel} ({fmt_size(sz)})")
            ok, uuid_opt = upload_video(
                base, token, channel_id, filepath, name, desc, nsfw
            )
            if not ok:
                # Failed uploads are not marked; they stay pending and are
                # retried on the next run.
                continue
            print(f" Uploaded uuid={uuid_opt}")
            mark_uploaded(args.input, rel)
            total_up += 1
            existing.add(name)
            if uuid_opt is not None:
                batch.append((uuid_opt, name))
        # ── wait for final batch ──
        if batch and not args.skip_wait:
            print(f"\n[+] Waiting for final {len(batch)} video(s) ...")
            for uuid, bname in batch:
                print(f"\n [{bname}]")
                wait_for_published(base, token, uuid, args.poll_interval)
    except KeyboardInterrupt:
        # .uploaded already records every finished file, so re-running
        # resumes where we left off.
        print(f"\n\n[!] Interrupted after {total_up} uploads. Re-run to continue.")
        sys.exit(130)
    print(f"\n{'=' * 50}")
    print(f" Uploaded: {total_up} video(s)")
    print(" Done!")
    print(f"{'=' * 50}")
# Script entry point.
if __name__ == "__main__":
    main()