"""Upload videos to PeerTube with transcoding-aware flow control. Uploads videos one batch at a time, waits for each batch to be fully transcoded and moved to object storage before uploading the next — preventing disk exhaustion on the PeerTube server. Usage: python upload.py # upload from ./downloads python upload.py -i /mnt/vol/dl # custom input dir python upload.py --batch-size 2 # upload 2, wait, repeat python upload.py --dry-run # preview without uploading python upload.py --skip-wait # upload without waiting Required (CLI flag or env var): --url / PEERTUBE_URL --username / PEERTUBE_USER --channel / PEERTUBE_CHANNEL --password / PEERTUBE_PASSWORD """ import argparse from collections import Counter import html import os from pathlib import Path import re import sys import time from typing import Any, Callable, cast import requests from dotenv import load_dotenv from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS, load_video_map from download import ( collect_urls, get_paths_for_mode, read_mode, MODE_ORIGINAL, DEFAULT_OUTPUT, ) load_dotenv() # ── Defaults ───────────────────────────────────────────────────────── DEFAULT_BATCH_SIZE = 1 DEFAULT_POLL = 30 UPLOADED_FILE = ".uploaded" PT_NAME_MAX = 120 # ── Text helpers ───────────────────────────────────────────────────── def clean_description(raw: str) -> str: """Strip WordPress shortcodes and HTML from a description.""" if not raw: return "" text = re.sub(r"\[/?[^\]]+\]", "", raw) text = re.sub(r"<[^>]+>", "", text) text = html.unescape(text) text = re.sub(r"\n{3,}", "\n\n", text).strip() return text[:10000] def make_pt_name(title: str, fallback_filename: str) -> str: """Build a PeerTube-safe video name (3-120 chars).""" name = html.unescape(title).strip() if title else Path(fallback_filename).stem if len(name) > PT_NAME_MAX: name = name[: PT_NAME_MAX - 1].rstrip() + "\u2026" while len(name) < 3: name += "_" return name # ── PeerTube API ───────────────────────────────────────────────────── def get_oauth_token(base: str, username: str, password: str) -> str: r = requests.get(f"{base}/api/v1/oauth-clients/local", timeout=15) r.raise_for_status() client = r.json() r = requests.post( f"{base}/api/v1/users/token", data={ "client_id": client["client_id"], "client_secret": client["client_secret"], "grant_type": "password", "username": username, "password": password, }, timeout=15, ) r.raise_for_status() data_any: Any = r.json() data = cast(dict[str, Any], data_any) token = data.get("access_token") if not isinstance(token, str) or not token: raise RuntimeError("PeerTube token response missing access_token") return token def api_headers(token: str) -> dict[str, str]: return {"Authorization": f"Bearer {token}"} def get_channel_id(base: str, token: str, channel_name: str) -> int: r = requests.get( f"{base}/api/v1/video-channels/{channel_name}", headers=api_headers(token), timeout=15, ) r.raise_for_status() data_any: Any = r.json() data = cast(dict[str, Any], data_any) cid = data.get("id") if not isinstance(cid, int): raise RuntimeError("PeerTube channel response missing id") return cid def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]: """Paginate through the channel and return a Counter of video names.""" counts: Counter[str] = Counter() start = 0 while True: r = requests.get( f"{base}/api/v1/video-channels/{channel_name}/videos", params={"start": start, "count": 100}, headers=api_headers(token), timeout=30, ) r.raise_for_status() data = r.json() for v in data.get("data", []): counts[v["name"]] += 1 start += 100 if start >= data.get("total", 0): break return counts CHUNK_SIZE = 10 * 1024 * 1024 # 10 MB MAX_RETRIES = 5 def _init_resumable( base: str, token: str, channel_id: int, filepath: Path, filename: str, name: str, description: str = "", nsfw: bool = False, ) -> tuple[str, int]: """POST to create a resumable upload session. Returns upload URL.""" file_size = Path(filepath).stat().st_size metadata = { "name": name, "channelId": channel_id, "filename": filename, "nsfw": nsfw, "waitTranscoding": True, "privacy": 1, } if description: metadata["description"] = description r = requests.post( f"{base}/api/v1/videos/upload-resumable", headers={ **api_headers(token), "Content-Type": "application/json", "X-Upload-Content-Length": str(file_size), "X-Upload-Content-Type": "video/mp4", }, json=metadata, timeout=30, ) r.raise_for_status() location = r.headers["Location"] if location.startswith("//"): location = "https:" + location elif location.startswith("/"): location = base + location return location, file_size def _query_offset(upload_url: str, token: str, file_size: int) -> int: """Ask the server how many bytes it has received so far.""" r = requests.put( upload_url, headers={ **api_headers(token), "Content-Range": f"bytes */{file_size}", "Content-Length": "0", }, timeout=15, ) if r.status_code == 308: range_hdr = r.headers.get("Range", "") if range_hdr: return int(range_hdr.split("-")[1]) + 1 return 0 if r.status_code == 200: return file_size r.raise_for_status() return 0 def upload_video( base: str, token: str, channel_id: int, filepath: Path, name: str, description: str = "", nsfw: bool = False, ) -> tuple[bool, str | None]: """Resumable chunked upload. Returns (ok, uuid).""" filepath = Path(filepath) filename = filepath.name file_size = filepath.stat().st_size try: upload_url, _ = _init_resumable( base, token, channel_id, filepath, filename, name, description, nsfw, ) except Exception as e: print(f" Init failed: {e}") return False, None offset = 0 retries = 0 with open(filepath, "rb") as f: while offset < file_size: end = min(offset + CHUNK_SIZE, file_size) - 1 chunk_len = end - offset + 1 f.seek(offset) chunk = f.read(chunk_len) pct = int(100 * (end + 1) / file_size) print( f" {fmt_size(offset)}/{fmt_size(file_size)} ({pct}%)", end="\r", flush=True, ) try: r = requests.put( upload_url, headers={ **api_headers(token), "Content-Type": "application/octet-stream", "Content-Range": f"bytes {offset}-{end}/{file_size}", "Content-Length": str(chunk_len), }, data=chunk, timeout=120, ) except (requests.ConnectionError, requests.Timeout) as e: retries += 1 if retries > MAX_RETRIES: print(f"\n Upload failed after {MAX_RETRIES} retries: {e}") return False, None wait = min(2**retries, 60) print( f"\n Connection error, retry {retries}/{MAX_RETRIES} " f"in {wait}s ..." ) time.sleep(wait) try: offset = _query_offset(upload_url, token, file_size) except Exception: pass continue if r.status_code == 308: range_hdr = r.headers.get("Range", "") if range_hdr: offset = int(range_hdr.split("-")[1]) + 1 else: offset = end + 1 retries = 0 elif r.status_code == 200: print(f" {fmt_size(file_size)}/{fmt_size(file_size)} (100%)") uuid = r.json().get("video", {}).get("uuid") return True, uuid elif r.status_code in (502, 503, 429): retry_after = int(r.headers.get("Retry-After", 10)) retries += 1 if retries > MAX_RETRIES: print(f"\n Upload failed: server returned {r.status_code}") return False, None print(f"\n Server {r.status_code}, retry in {retry_after}s ...") time.sleep(retry_after) try: offset = _query_offset(upload_url, token, file_size) except Exception: pass else: detail = r.text[:300] if r.text else str(r.status_code) print(f"\n Upload failed ({r.status_code}): {detail}") return False, None print("\n Unexpected: sent all bytes but no 200 response") return False, None _STATE = { 1: "Published", 2: "To transcode", 3: "To import", 6: "Moving to object storage", 7: "Transcoding failed", 8: "Storage move failed", 9: "To edit", } def get_video_state(base: str, token: str, uuid: str) -> tuple[int, str]: r = requests.get( f"{base}/api/v1/videos/{uuid}", headers=api_headers(token), timeout=15, ) r.raise_for_status() state = r.json()["state"] return state["id"], state.get("label", "") def wait_for_published(base: str, token: str, uuid: str, poll_interval: int) -> int: """Block until the video reaches state 1 (Published) or a failure state.""" started = time.monotonic() while True: elapsed = int(time.monotonic() - started) hours, rem = divmod(elapsed, 3600) mins, secs = divmod(rem, 60) if hours: elapsed_str = f"{hours}h {mins:02d}m {secs:02d}s" elif mins: elapsed_str = f"{mins}m {secs:02d}s" else: elapsed_str = f"{secs}s" try: sid, label = get_video_state(base, token, uuid) except requests.exceptions.RequestException as e: print( f" -> Poll error ({e.__class__.__name__}) " f"after {elapsed_str}, retrying in {poll_interval}s …" ) time.sleep(poll_interval) continue display = _STATE.get(sid, label or f"state {sid}") if sid == 1: print(f" -> {display}") return sid if sid in (7, 8): print(f" -> FAILED: {display}") return sid print( f" -> {display} … {elapsed_str} elapsed (next check in {poll_interval}s)" ) time.sleep(poll_interval) # ── State tracker ──────────────────────────────────────────────────── def load_uploaded(input_dir: str) -> set[Path]: path = Path(input_dir) / UPLOADED_FILE if not path.exists(): return set() with open(path) as f: return {Path(line.strip()) for line in f if line.strip()} def mark_uploaded(input_dir: str, rel_path: Path) -> None: with open(Path(input_dir) / UPLOADED_FILE, "a") as f: f.write(f"{rel_path}\n") # ── File / metadata helpers ───────────────────────────────────────── def build_path_to_meta( video_map: dict[str, Any], input_dir: str, ) -> dict[Path, dict[str, str]]: """Map each expected download path (relative) to {title, description, original_filename}.""" urls = collect_urls(video_map) mode = read_mode(input_dir) or MODE_ORIGINAL get_paths_typed = cast( Callable[[str, list[str], dict[str, Any], str], dict[str, Path]], get_paths_for_mode, ) paths = get_paths_typed(mode, urls, video_map, input_dir) url_meta: dict[str, dict[str, str]] = {} for entry_any in video_map.values(): entry = cast(dict[str, Any], entry_any) t = entry.get("title") d = entry.get("description") title = t if isinstance(t, str) else "" desc = d if isinstance(d, str) else "" videos_any = entry.get("videos", []) if isinstance(videos_any, list): for video_url_any in videos_any: if not isinstance(video_url_any, str): continue if video_url_any not in url_meta: url_meta[video_url_any] = {"title": title, "description": desc} result: dict[Path, dict[str, str]] = {} for url, abs_path in paths.items(): rel = abs_path.relative_to(input_dir) meta = url_meta.get(url, {"title": "", "description": ""}) result[rel] = { "title": meta.get("title", ""), "description": meta.get("description", ""), "original_filename": url_to_filename(url), } return result def find_videos(input_dir: str) -> set[Path]: """Walk input_dir and return a set of relative paths for all video files.""" found = set() for root, dirs, files in os.walk(input_dir): dirs[:] = [d for d in dirs if not d.startswith(".")] for f in files: if Path(f).suffix.lower() in VIDEO_EXTS: found.add((Path(root) / f).relative_to(input_dir)) return found # ── Channel match helpers ───────────────────────────────────────────── def _channel_match( rel: Path, path_meta: dict[Path, dict[str, str]], existing: set[str], ) -> tuple[bool, str]: """Return (matched, name) for a local file against the channel name set. Checks both the title-derived name and the original-filename-derived name so that videos uploaded under either form are recognised. Extracted to avoid duplicating the logic between the pre-reconcile sweep and the per- file check inside the upload loop. """ meta = path_meta.get(rel, {}) name = make_pt_name(meta.get("title", ""), rel.name) orig_fn = meta.get("original_filename", "") raw_name: str | None = make_pt_name("", orig_fn) if orig_fn else None matched = name in existing if not matched and raw_name is not None and raw_name != name: matched = raw_name in existing return matched, name # ── CLI ────────────────────────────────────────────────────────────── def main() -> None: ap = argparse.ArgumentParser( description="Upload videos to PeerTube with transcoding-aware batching", ) ap.add_argument( "--input", "-i", default=DEFAULT_OUTPUT, help=f"Directory with downloaded videos (default: {DEFAULT_OUTPUT})", ) ap.add_argument("--url", help="PeerTube instance URL (or set PEERTUBE_URL env var)") ap.add_argument( "--username", "-U", help="PeerTube username (or set PEERTUBE_USER env var)" ) ap.add_argument( "--password", "-p", help="PeerTube password (or set PEERTUBE_PASSWORD env var)" ) ap.add_argument( "--channel", "-C", help="Channel to upload to (or set PEERTUBE_CHANNEL env var)" ) ap.add_argument( "--batch-size", "-b", type=int, default=DEFAULT_BATCH_SIZE, help="Videos to upload before waiting for transcoding (default: 1)", ) ap.add_argument( "--poll-interval", type=int, default=DEFAULT_POLL, help=f"Seconds between state polls (default: {DEFAULT_POLL})", ) ap.add_argument( "--skip-wait", action="store_true", help="Upload everything without waiting for transcoding", ) ap.add_argument("--nsfw", action="store_true", help="Mark videos as NSFW") ap.add_argument( "--dry-run", "-n", action="store_true", help="Preview what would be uploaded" ) args = ap.parse_args() url = args.url or os.environ.get("PEERTUBE_URL") username = args.username or os.environ.get("PEERTUBE_USER") channel = args.channel or os.environ.get("PEERTUBE_CHANNEL") password = args.password or os.environ.get("PEERTUBE_PASSWORD") if not args.dry_run: missing = [ label for label, val in [ ("--url / PEERTUBE_URL", url), ("--username / PEERTUBE_USER", username), ("--channel / PEERTUBE_CHANNEL", channel), ("--password / PEERTUBE_PASSWORD", password), ] if not val ] if missing: for label in missing: print(f"[!] Required: {label}") sys.exit(1) # ── load metadata & scan disk ── video_map = load_video_map() path_meta = build_path_to_meta(video_map, args.input) on_disk = find_videos(args.input) unmatched = on_disk - set(path_meta.keys()) if unmatched: print( f"[!] {len(unmatched)} file(s) on disk not in video_map (will use filename as title)" ) for rel in unmatched: path_meta[rel] = {"title": "", "description": ""} uploaded = load_uploaded(args.input) pending = sorted(rel for rel in on_disk if rel not in uploaded) print(f"[+] {len(on_disk)} video files in {args.input}/") print(f"[+] {len(uploaded)} already uploaded") print(f"[+] {len(pending)} pending") print(f"[+] Batch size: {args.batch_size}") if not pending: print("\nAll videos already uploaded.") return # ── dry run ── if args.dry_run: total_bytes = 0 for rel in pending: meta = path_meta.get(rel, {}) name = make_pt_name(meta.get("title", ""), rel.name) sz = (Path(args.input) / rel).stat().st_size total_bytes += sz print(f" [{fmt_size(sz):>10}] {name}") print(f"\n Total: {fmt_size(total_bytes)} across {len(pending)} videos") return assert url is not None assert username is not None assert channel is not None assert password is not None # ── authenticate ── base = url.rstrip("/") if not base.startswith("http"): base = "https://" + base print(f"\n[+] Authenticating with {base} ...") token = get_oauth_token(base, username, password) print(f"[+] Authenticated as {username}") channel_id = get_channel_id(base, token, channel) print(f"[+] Channel: {channel} (id {channel_id})") name_counts = get_channel_video_names(base, token, channel) existing = set(name_counts) total = sum(name_counts.values()) print(f"[+] Found {total} video(s) on channel ({len(name_counts)} unique name(s))") dupes = {name: count for name, count in name_counts.items() if count > 1} if dupes: print(f"[!] {len(dupes)} duplicate name(s) detected on channel:") for name, count in sorted(dupes.items()): print(f" x{count} {name}") # ── pre-reconcile: sweep all pending against channel names ──────── # The main upload loop discovers already-uploaded videos lazily as it # walks the sorted pending list — meaning on a fresh run (no .uploaded # file) you won't know how many files are genuinely new until the loop # has processed everything. Doing a full sweep here, before any # upload starts, gives an accurate count up-front and pre-populates # .uploaded so that interrupted/re-run sessions skip them instantly # without re-checking each time. pre_matched = [] for rel in pending: if _channel_match(rel, path_meta, existing)[0]: pre_matched.append(rel) if pre_matched: print( f"\n[+] Pre-sweep: {len(pre_matched)} local file(s) already on channel — marking uploaded" ) for rel in pre_matched: mark_uploaded(args.input, rel) pending = [rel for rel in pending if rel not in set(pre_matched)] print(f"[+] {len(pending)} left to upload\n") nsfw = args.nsfw total_up = 0 batch: list[tuple[str, str]] = [] # [(uuid, name), ...] try: for rel in pending: # ── flush batch if full ── if not args.skip_wait and len(batch) >= args.batch_size: print( f"\n[+] Waiting for {len(batch)} video(s) to finish processing ..." ) for uuid, bname in batch: print(f"\n [{bname}]") wait_for_published(base, token, uuid, args.poll_interval) batch.clear() filepath = Path(args.input) / rel meta = path_meta.get(rel, {}) name = make_pt_name(meta.get("title", ""), rel.name) desc = clean_description(meta.get("description", "")) sz = filepath.stat().st_size if _channel_match(rel, path_meta, existing)[0]: print(f"\n[skip] already on channel: {name}") mark_uploaded(args.input, rel) continue print(f"\n[{total_up + 1}/{len(pending)}] {name}") print(f" File: {rel} ({fmt_size(sz)})") ok, uuid_opt = upload_video(base, token, channel_id, filepath, name, desc, nsfw) if not ok: continue print(f" Uploaded uuid={uuid_opt}") mark_uploaded(args.input, rel) total_up += 1 existing.add(name) if uuid_opt is not None: batch.append((uuid_opt, name)) # ── wait for final batch ── if batch and not args.skip_wait: print(f"\n[+] Waiting for final {len(batch)} video(s) ...") for uuid, bname in batch: print(f"\n [{bname}]") wait_for_published(base, token, uuid, args.poll_interval) except KeyboardInterrupt: print(f"\n\n[!] Interrupted after {total_up} uploads. Re-run to continue.") sys.exit(130) print(f"\n{'=' * 50}") print(f" Uploaded: {total_up} video(s)") print(" Done!") print(f"{'=' * 50}") if __name__ == "__main__": main()