Garbage commit; added junk code

This commit is contained in:
HugeFrog24
2026-02-20 18:07:02 +01:00
commit 80444405e9
15 changed files with 34271 additions and 0 deletions

603
upload.py Normal file
View File

@@ -0,0 +1,603 @@
"""Upload videos to PeerTube with transcoding-aware flow control.
Uploads videos one batch at a time, waits for each batch to be fully transcoded
and moved to object storage before uploading the next — preventing disk
exhaustion on the PeerTube server.
Usage:
python upload.py # upload from ./downloads
python upload.py -i /mnt/vol/dl # custom input dir
python upload.py --batch-size 2 # upload 2, wait, repeat
python upload.py --dry-run # preview without uploading
python upload.py --skip-wait # upload without waiting
Required (CLI flag or env var):
--url / PEERTUBE_URL
--username / PEERTUBE_USER
--channel / PEERTUBE_CHANNEL
--password / PEERTUBE_PASSWORD
"""
import argparse
import html
import mimetypes
import os
import re
import sys
import time
from collections import Counter
from pathlib import Path

import requests
from dotenv import load_dotenv

from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS
from download import (
    load_video_map,
    collect_urls,
    get_paths_for_mode,
    read_mode,
    MODE_ORIGINAL,
    DEFAULT_OUTPUT,
)
load_dotenv()
# ── Defaults ─────────────────────────────────────────────────────────
DEFAULT_BATCH_SIZE = 1       # videos uploaded before waiting for transcoding
DEFAULT_POLL = 30            # seconds between transcoding-state polls
UPLOADED_FILE = ".uploaded"  # per-input-dir tracker of already-uploaded files
PT_NAME_MAX = 120            # maximum PeerTube video-name length
# ── Text helpers ─────────────────────────────────────────────────────
def clean_description(raw):
    """Return *raw* with WordPress shortcodes and HTML markup removed.

    Also unescapes HTML entities, collapses runs of three or more
    newlines to a single blank line, and truncates to 10 000 characters.
    Empty or falsy input yields an empty string.
    """
    if not raw:
        return ""
    without_shortcodes = re.sub(r'\[/?[^\]]+\]', '', raw)
    without_tags = re.sub(r'<[^>]+>', '', without_shortcodes)
    unescaped = html.unescape(without_tags)
    collapsed = re.sub(r'\n{3,}', '\n\n', unescaped).strip()
    return collapsed[:10000]
def make_pt_name(title, fallback_filename):
    """Build a PeerTube-safe video name (3-120 chars).

    Prefers the (entity-unescaped, stripped) title; falls back to the
    filename stem. Over-long names are trimmed and suffixed with an
    ellipsis; short names are padded with underscores to three chars.
    """
    if title:
        name = html.unescape(title).strip()
    else:
        name = Path(fallback_filename).stem
    if len(name) > PT_NAME_MAX:
        name = name[: PT_NAME_MAX - 1].rstrip() + "\u2026"
    return name + "_" * max(0, 3 - len(name))
# ── PeerTube API ─────────────────────────────────────────────────────
def get_oauth_token(base, username, password):
    """Authenticate against the instance and return an OAuth access token.

    Fetches the local OAuth client credentials first, then performs a
    password-grant token request. Raises on any non-2xx response.
    """
    client_resp = requests.get(f"{base}/api/v1/oauth-clients/local", timeout=15)
    client_resp.raise_for_status()
    client = client_resp.json()
    grant = {
        "client_id": client["client_id"],
        "client_secret": client["client_secret"],
        "grant_type": "password",
        "username": username,
        "password": password,
    }
    token_resp = requests.post(
        f"{base}/api/v1/users/token", data=grant, timeout=15)
    token_resp.raise_for_status()
    return token_resp.json()["access_token"]
def api_headers(token):
    """Return the Authorization header dict for an authenticated request."""
    return {"Authorization": "Bearer " + token}
def get_channel_id(base, token, channel_name):
    """Resolve a channel name to its numeric id via the channels API."""
    endpoint = f"{base}/api/v1/video-channels/{channel_name}"
    resp = requests.get(endpoint, headers=api_headers(token), timeout=15)
    resp.raise_for_status()
    return resp.json()["id"]
def get_channel_video_names(base, token, channel_name):
    """Paginate through the channel and return a Counter of video names.

    A Counter (rather than a set) is returned so callers can also detect
    duplicate names on the channel.
    """
    endpoint = f"{base}/api/v1/video-channels/{channel_name}/videos"
    page_size = 100
    counts = Counter()
    offset = 0
    total = None  # unknown until the first page arrives
    while total is None or offset < total:
        resp = requests.get(
            endpoint,
            params={"start": offset, "count": page_size},
            headers=api_headers(token),
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
        counts.update(v["name"] for v in payload.get("data", []))
        total = payload.get("total", 0)
        offset += page_size
    return counts
CHUNK_SIZE = 10 * 1024 * 1024  # 10 MB per resumable-upload chunk
MAX_RETRIES = 5                # consecutive failures tolerated before giving up
def _init_resumable(base, token, channel_id, filepath, filename, name,
                    description="", nsfw=False):
    """POST to create a resumable upload session. Returns (upload_url, size).

    The server replies with a ``Location`` header pointing at the session
    endpoint; it may be protocol-relative (``//host/...``) or path-relative
    (``/...``), so both forms are normalised to an absolute URL.
    Raises via ``raise_for_status`` on a non-2xx response.
    """
    file_size = Path(filepath).stat().st_size
    # FIX: the content type was hard-coded to video/mp4 for every file,
    # even though non-MP4 extensions are uploaded too. Guess from the
    # filename and keep video/mp4 as the fallback (previous behavior).
    content_type = mimetypes.guess_type(filename)[0] or "video/mp4"
    if not content_type.startswith("video/"):
        content_type = "video/mp4"
    metadata = {
        "name": name,
        "channelId": channel_id,
        "filename": filename,
        "nsfw": nsfw,
        "waitTranscoding": True,
        "privacy": 1,
    }
    if description:
        metadata["description"] = description
    r = requests.post(
        f"{base}/api/v1/videos/upload-resumable",
        headers={
            **api_headers(token),
            "Content-Type": "application/json",
            "X-Upload-Content-Length": str(file_size),
            "X-Upload-Content-Type": content_type,
        },
        json=metadata,
        timeout=30,
    )
    r.raise_for_status()
    location = r.headers["Location"]
    if location.startswith("//"):
        # Protocol-relative URL.
        location = "https:" + location
    elif location.startswith("/"):
        # Path-relative URL.
        location = base + location
    return location, file_size
def _query_offset(upload_url, token, file_size):
    """Ask the server how many bytes it has received so far.

    Sends an empty PUT with ``Content-Range: bytes */<size>``. A 200
    means the whole file is already there; a 308 carries a ``Range``
    header whose upper bound is the last byte received.
    """
    probe = requests.put(
        upload_url,
        headers={
            **api_headers(token),
            "Content-Range": f"bytes */{file_size}",
            "Content-Length": "0",
        },
        timeout=15,
    )
    if probe.status_code == 200:
        return file_size
    if probe.status_code == 308:
        range_hdr = probe.headers.get("Range", "")
        if not range_hdr:
            return 0
        # "bytes=0-N" -> next offset is N + 1.
        return int(range_hdr.rsplit("-", 1)[1]) + 1
    probe.raise_for_status()
    return 0
def upload_video(base, token, channel_id, filepath, name,
                 description="", nsfw=False):
    """Resumable chunked upload. Returns (ok, uuid).

    Streams the file to the session created by _init_resumable in
    CHUNK_SIZE pieces. On connection errors or 502/503/429 responses it
    backs off, re-queries the server's received-byte offset, and resumes
    — up to MAX_RETRIES consecutive failures. On any other error status
    the upload is abandoned and (False, None) is returned.
    """
    filepath = Path(filepath)
    filename = filepath.name
    file_size = filepath.stat().st_size
    try:
        upload_url, _ = _init_resumable(
            base, token, channel_id, filepath, filename,
            name, description, nsfw,
        )
    except Exception as e:
        print(f" Init failed: {e}")
        return False, None
    offset = 0   # next byte index to send
    retries = 0  # consecutive-failure counter; reset whenever progress is made
    with open(filepath, "rb") as f:
        while offset < file_size:
            end = min(offset + CHUNK_SIZE, file_size) - 1  # inclusive last byte
            chunk_len = end - offset + 1
            # Seek explicitly: after a resume, offset may have jumped back.
            f.seek(offset)
            chunk = f.read(chunk_len)
            pct = int(100 * (end + 1) / file_size)
            print(f" {fmt_size(offset)}/{fmt_size(file_size)} ({pct}%)",
                  end="\r", flush=True)
            try:
                r = requests.put(
                    upload_url,
                    headers={
                        **api_headers(token),
                        "Content-Type": "application/octet-stream",
                        "Content-Range": f"bytes {offset}-{end}/{file_size}",
                        "Content-Length": str(chunk_len),
                    },
                    data=chunk,
                    timeout=120,
                )
            except (requests.ConnectionError, requests.Timeout) as e:
                # Network hiccup: exponential backoff (capped at 60s), then
                # re-sync offset with the server so no bytes are lost.
                retries += 1
                if retries > MAX_RETRIES:
                    print(
                        f"\n Upload failed after {MAX_RETRIES} retries: {e}")
                    return False, None
                wait = min(2 ** retries, 60)
                print(f"\n Connection error, retry {retries}/{MAX_RETRIES} "
                      f"in {wait}s ...")
                time.sleep(wait)
                try:
                    offset = _query_offset(upload_url, token, file_size)
                except Exception:
                    # Best effort: if the probe also fails, resend from the
                    # old offset instead of aborting.
                    pass
                continue
            if r.status_code == 308:
                # 308: server acknowledges partial content; Range header
                # (if present) is authoritative for the next offset.
                range_hdr = r.headers.get("Range", "")
                if range_hdr:
                    offset = int(range_hdr.split("-")[1]) + 1
                else:
                    offset = end + 1
                retries = 0
            elif r.status_code == 200:
                # Final chunk accepted; response body carries the video uuid.
                print(
                    f" {fmt_size(file_size)}/{fmt_size(file_size)} (100%)")
                uuid = r.json().get("video", {}).get("uuid")
                return True, uuid
            elif r.status_code in (502, 503, 429):
                # Transient server-side error: honour Retry-After, then
                # re-query the received-byte offset before resuming.
                retry_after = int(r.headers.get("Retry-After", 10))
                retries += 1
                if retries > MAX_RETRIES:
                    print(
                        f"\n Upload failed: server returned {r.status_code}")
                    return False, None
                print(
                    f"\n Server {r.status_code}, retry in {retry_after}s ...")
                time.sleep(retry_after)
                try:
                    offset = _query_offset(upload_url, token, file_size)
                except Exception:
                    pass
            else:
                detail = r.text[:300] if r.text else str(r.status_code)
                print(f"\n Upload failed ({r.status_code}): {detail}")
                return False, None
    print("\n Unexpected: sent all bytes but no 200 response")
    return False, None
# Display labels for PeerTube video state ids, used when polling in
# wait_for_published. Unlisted ids fall back to the server-provided label.
_STATE = {
    1: "Published",
    2: "To transcode",
    3: "To import",
    6: "Moving to object storage",
    7: "Transcoding failed",
    8: "Storage move failed",
    9: "To edit",
}
def get_video_state(base, token, uuid):
    """Fetch one video and return its (state_id, state_label) pair."""
    endpoint = f"{base}/api/v1/videos/{uuid}"
    resp = requests.get(endpoint, headers=api_headers(token), timeout=15)
    resp.raise_for_status()
    info = resp.json()["state"]
    return info["id"], info.get("label", "")
def _fmt_elapsed(seconds):
    """Format an elapsed-seconds count as '1h 02m 03s' / '2m 05s' / '9s'."""
    hours, rem = divmod(seconds, 3600)
    mins, secs = divmod(rem, 60)
    if hours:
        return f"{hours}h {mins:02d}m {secs:02d}s"
    if mins:
        return f"{mins}m {secs:02d}s"
    return f"{secs}s"
def wait_for_published(base, token, uuid, poll_interval):
    """Block until the video reaches state 1 (Published) or a failure state.

    Polls every *poll_interval* seconds; transient request errors are
    logged and retried indefinitely. Returns the terminal state id
    (1 published, 7 transcoding failed, 8 storage move failed).
    """
    started = time.monotonic()
    while True:
        elapsed_str = _fmt_elapsed(int(time.monotonic() - started))
        try:
            sid, label = get_video_state(base, token, uuid)
        except requests.exceptions.RequestException as e:
            print(f" -> Poll error ({e.__class__.__name__}) "
                  f"after {elapsed_str}, retrying in {poll_interval}s …")
            time.sleep(poll_interval)
            continue
        display = _STATE.get(sid, label or f"state {sid}")
        if sid == 1:
            print(f" -> {display}")
            return sid
        if sid in (7, 8):
            print(f" -> FAILED: {display}")
            return sid
        # FIX: the label and elapsed time previously ran together with no
        # separator ("To transcode3m 05s elapsed") — add one.
        print(f" -> {display} — {elapsed_str} elapsed (next check in {poll_interval}s)")
        time.sleep(poll_interval)
# ── State tracker ────────────────────────────────────────────────────
def load_uploaded(input_dir):
    """Return the set of relative Paths recorded in the .uploaded tracker.

    Missing tracker file means nothing has been uploaded yet.
    """
    tracker = Path(input_dir) / UPLOADED_FILE
    if not tracker.exists():
        return set()
    entries = tracker.read_text().splitlines()
    return {Path(line.strip()) for line in entries if line.strip()}
def mark_uploaded(input_dir, rel_path):
    """Append one relative path to the .uploaded tracker file."""
    tracker = Path(input_dir) / UPLOADED_FILE
    with open(tracker, "a") as tracker_file:
        tracker_file.write(f"{rel_path}\n")
# ── File / metadata helpers ─────────────────────────────────────────
def build_path_to_meta(video_map, input_dir):
    """Map each expected download path (relative) to its upload metadata.

    Each value holds ``title``, ``description`` and ``original_filename``.
    When the same video URL appears under several entries, the first
    entry's metadata wins.
    """
    urls = collect_urls(video_map)
    mode = read_mode(input_dir) or MODE_ORIGINAL
    paths = get_paths_for_mode(mode, urls, video_map, input_dir)
    url_meta = {}
    for entry in video_map.values():
        entry_meta = {
            "title": entry.get("title", ""),
            "description": entry.get("description", ""),
        }
        for video_url in entry.get("videos", []):
            # setdefault keeps the first occurrence, matching first-wins.
            url_meta.setdefault(video_url, dict(entry_meta))
    result = {}
    for url, abs_path in paths.items():
        rel = Path(abs_path).relative_to(input_dir)
        meta = url_meta.get(url, {"title": "", "description": ""})
        result[rel] = {**meta, "original_filename": url_to_filename(url)}
    return result
def find_videos(input_dir):
    """Walk input_dir and return a set of relative paths for all video files."""
    base = Path(input_dir)
    found = set()
    for root, dirs, files in os.walk(input_dir):
        # Prune hidden directories in place so os.walk never descends.
        dirs[:] = [d for d in dirs if not d.startswith(".")]
        root_path = Path(root)
        found.update(
            (root_path / fname).relative_to(base)
            for fname in files
            if Path(fname).suffix.lower() in VIDEO_EXTS
        )
    return found
# ── Channel match helpers ─────────────────────────────────────────────
def _channel_match(rel, path_meta, existing):
    """Return (matched, name) for a local file against the channel name set.

    A file counts as matched when either the title-derived name or the
    original-filename-derived name already exists on the channel, so
    videos uploaded under either form are recognised. Shared by the
    pre-reconcile sweep and the per-file check in the upload loop.
    """
    meta = path_meta.get(rel, {})
    name = make_pt_name(meta.get("title", ""), rel.name)
    if name in existing:
        return True, name
    orig_fn = meta.get("original_filename", "")
    if orig_fn:
        raw_name = make_pt_name("", orig_fn)
        if raw_name != name and raw_name in existing:
            return True, name
    return False, name
# ── CLI ──────────────────────────────────────────────────────────────
def main():
    """CLI entry point: parse args, reconcile with the channel, then upload
    pending videos in transcoding-aware batches."""
    ap = argparse.ArgumentParser(
        description="Upload videos to PeerTube with transcoding-aware batching",
    )
    ap.add_argument("--input", "-i", default=DEFAULT_OUTPUT,
                    help=f"Directory with downloaded videos (default: {DEFAULT_OUTPUT})")
    ap.add_argument("--url",
                    help="PeerTube instance URL (or set PEERTUBE_URL env var)")
    ap.add_argument("--username", "-U",
                    help="PeerTube username (or set PEERTUBE_USER env var)")
    ap.add_argument("--password", "-p",
                    help="PeerTube password (or set PEERTUBE_PASSWORD env var)")
    ap.add_argument("--channel", "-C",
                    help="Channel to upload to (or set PEERTUBE_CHANNEL env var)")
    ap.add_argument("--batch-size", "-b", type=int, default=DEFAULT_BATCH_SIZE,
                    help="Videos to upload before waiting for transcoding (default: 1)")
    ap.add_argument("--poll-interval", type=int, default=DEFAULT_POLL,
                    help=f"Seconds between state polls (default: {DEFAULT_POLL})")
    ap.add_argument("--skip-wait", action="store_true",
                    help="Upload everything without waiting for transcoding")
    ap.add_argument("--nsfw", action="store_true",
                    help="Mark videos as NSFW")
    ap.add_argument("--dry-run", "-n", action="store_true",
                    help="Preview what would be uploaded")
    args = ap.parse_args()
    # CLI flags take precedence over environment variables.
    url = args.url or os.environ.get("PEERTUBE_URL")
    username = args.username or os.environ.get("PEERTUBE_USER")
    channel = args.channel or os.environ.get("PEERTUBE_CHANNEL")
    password = args.password or os.environ.get("PEERTUBE_PASSWORD")
    # Credentials are only required when actually uploading.
    if not args.dry_run:
        missing = [label for label, val in [
            ("--url / PEERTUBE_URL", url),
            ("--username / PEERTUBE_USER", username),
            ("--channel / PEERTUBE_CHANNEL", channel),
            ("--password / PEERTUBE_PASSWORD", password),
        ] if not val]
        if missing:
            for label in missing:
                print(f"[!] Required: {label}")
            sys.exit(1)
    # ── load metadata & scan disk ──
    video_map = load_video_map()
    path_meta = build_path_to_meta(video_map, args.input)
    on_disk = find_videos(args.input)
    # Files on disk but absent from video_map are still uploaded, with the
    # filename standing in for the title.
    unmatched = on_disk - set(path_meta.keys())
    if unmatched:
        print(
            f"[!] {len(unmatched)} file(s) on disk not in video_map (will use filename as title)")
        for rel in unmatched:
            path_meta[rel] = {"title": "", "description": ""}
    uploaded = load_uploaded(args.input)
    pending = sorted(rel for rel in on_disk if rel not in uploaded)
    print(f"[+] {len(on_disk)} video files in {args.input}/")
    print(f"[+] {len(uploaded)} already uploaded")
    print(f"[+] {len(pending)} pending")
    print(f"[+] Batch size: {args.batch_size}")
    if not pending:
        print("\nAll videos already uploaded.")
        return
    # ── dry run ──
    if args.dry_run:
        total_bytes = 0
        for rel in pending:
            meta = path_meta.get(rel, {})
            name = make_pt_name(meta.get("title", ""), rel.name)
            sz = (Path(args.input) / rel).stat().st_size
            total_bytes += sz
            print(f" [{fmt_size(sz):>10}] {name}")
        print(
            f"\n Total: {fmt_size(total_bytes)} across {len(pending)} videos")
        return
    # ── authenticate ──
    base = url.rstrip("/")
    if not base.startswith("http"):
        base = "https://" + base
    print(f"\n[+] Authenticating with {base} ...")
    token = get_oauth_token(base, username, password)
    print(f"[+] Authenticated as {username}")
    channel_id = get_channel_id(base, token, channel)
    print(f"[+] Channel: {channel} (id {channel_id})")
    name_counts = get_channel_video_names(base, token, channel)
    existing = set(name_counts)
    total = sum(name_counts.values())
    print(f"[+] Found {total} video(s) on channel ({len(name_counts)} unique name(s))")
    dupes = {name: count for name, count in name_counts.items() if count > 1}
    if dupes:
        print(f"[!] {len(dupes)} duplicate name(s) detected on channel:")
        for name, count in sorted(dupes.items()):
            print(f" x{count} {name}")
    # ── pre-reconcile: sweep all pending against channel names ────────
    # The main upload loop discovers already-uploaded videos lazily as it
    # walks the sorted pending list — meaning on a fresh run (no .uploaded
    # file) you won't know how many files are genuinely new until the loop
    # has processed everything.  Doing a full sweep here, before any
    # upload starts, gives an accurate count up-front and pre-populates
    # .uploaded so that interrupted/re-run sessions skip them instantly
    # without re-checking each time.
    pre_matched = []
    for rel in pending:
        if _channel_match(rel, path_meta, existing)[0]:
            pre_matched.append(rel)
    if pre_matched:
        print(f"\n[+] Pre-sweep: {len(pre_matched)} local file(s) already on channel — marking uploaded")
        for rel in pre_matched:
            mark_uploaded(args.input, rel)
        pending = [rel for rel in pending if rel not in set(pre_matched)]
        print(f"[+] {len(pending)} left to upload\n")
    nsfw = args.nsfw
    total_up = 0
    batch: list[tuple[str, str]] = []  # [(uuid, name), ...] awaiting transcoding
    try:
        for rel in pending:
            # ── flush batch if full ──
            # Wait for the previous batch to publish before uploading more,
            # so the server's disk isn't filled with untranscoded files.
            if not args.skip_wait and len(batch) >= args.batch_size:
                print(
                    f"\n[+] Waiting for {len(batch)} video(s) to finish processing ...")
                for uuid, bname in batch:
                    print(f"\n [{bname}]")
                    wait_for_published(base, token, uuid, args.poll_interval)
                batch.clear()
            filepath = Path(args.input) / rel
            meta = path_meta.get(rel, {})
            name = make_pt_name(meta.get("title", ""), rel.name)
            desc = clean_description(meta.get("description", ""))
            sz = filepath.stat().st_size
            # Re-check against the channel; names uploaded earlier in this
            # run are added to `existing` below.
            if _channel_match(rel, path_meta, existing)[0]:
                print(f"\n[skip] already on channel: {name}")
                mark_uploaded(args.input, rel)
                continue
            print(f"\n[{total_up + 1}/{len(pending)}] {name}")
            print(f" File: {rel} ({fmt_size(sz)})")
            ok, uuid = upload_video(
                base, token, channel_id, filepath, name, desc, nsfw)
            if not ok:
                # Failed uploads stay pending for the next run.
                continue
            print(f" Uploaded uuid={uuid}")
            mark_uploaded(args.input, rel)
            total_up += 1
            existing.add(name)
            if uuid:
                batch.append((uuid, name))
        # ── wait for final batch ──
        if batch and not args.skip_wait:
            print(f"\n[+] Waiting for final {len(batch)} video(s) ...")
            for uuid, bname in batch:
                print(f"\n [{bname}]")
                wait_for_published(base, token, uuid, args.poll_interval)
    except KeyboardInterrupt:
        # .uploaded already records everything finished, so a re-run resumes.
        print(
            f"\n\n[!] Interrupted after {total_up} uploads. Re-run to continue.")
        sys.exit(130)
    print(f"\n{'=' * 50}")
    print(f" Uploaded: {total_up} video(s)")
    print(" Done!")
    print(f"{'=' * 50}")
# Script entry point.
if __name__ == "__main__":
    main()