"""Upload videos to PeerTube with transcoding-aware flow control. Uploads videos one batch at a time, waits for each batch to be fully transcoded and moved to object storage before uploading the next — preventing disk exhaustion on the PeerTube server. Usage: python upload.py # upload from ./downloads python upload.py -i /mnt/vol/dl # custom input dir python upload.py --batch-size 2 # upload 2, wait, repeat python upload.py --dry-run # preview without uploading python upload.py --skip-wait # upload without waiting Required (CLI flag or env var): --url / PEERTUBE_URL --username / PEERTUBE_USER --channel / PEERTUBE_CHANNEL --password / PEERTUBE_PASSWORD """ import argparse from collections import Counter import html import os from pathlib import Path import re import sys import time import requests from dotenv import load_dotenv from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS from download import ( load_video_map, collect_urls, get_paths_for_mode, read_mode, MODE_ORIGINAL, DEFAULT_OUTPUT, ) load_dotenv() # ── Defaults ───────────────────────────────────────────────────────── DEFAULT_BATCH_SIZE = 1 DEFAULT_POLL = 30 UPLOADED_FILE = ".uploaded" PT_NAME_MAX = 120 # ── Text helpers ───────────────────────────────────────────────────── def clean_description(raw): """Strip WordPress shortcodes and HTML from a description.""" if not raw: return "" text = re.sub(r'\[/?[^\]]+\]', '', raw) text = re.sub(r'<[^>]+>', '', text) text = html.unescape(text) text = re.sub(r'\n{3,}', '\n\n', text).strip() return text[:10000] def make_pt_name(title, fallback_filename): """Build a PeerTube-safe video name (3-120 chars).""" name = html.unescape(title).strip( ) if title else Path(fallback_filename).stem if len(name) > PT_NAME_MAX: name = name[: PT_NAME_MAX - 1].rstrip() + "\u2026" while len(name) < 3: name += "_" return name # ── PeerTube API ───────────────────────────────────────────────────── def get_oauth_token(base, username, password): r = requests.get(f"{base}/api/v1/oauth-clients/local", timeout=15) r.raise_for_status() client = r.json() r = requests.post( f"{base}/api/v1/users/token", data={ "client_id": client["client_id"], "client_secret": client["client_secret"], "grant_type": "password", "username": username, "password": password, }, timeout=15, ) r.raise_for_status() return r.json()["access_token"] def api_headers(token): return {"Authorization": f"Bearer {token}"} def get_channel_id(base, token, channel_name): r = requests.get( f"{base}/api/v1/video-channels/{channel_name}", headers=api_headers(token), timeout=15, ) r.raise_for_status() return r.json()["id"] def get_channel_video_names(base, token, channel_name): """Paginate through the channel and return a Counter of video names.""" counts = Counter() start = 0 while True: r = requests.get( f"{base}/api/v1/video-channels/{channel_name}/videos", params={"start": start, "count": 100}, headers=api_headers(token), timeout=30, ) r.raise_for_status() data = r.json() for v in data.get("data", []): counts[v["name"]] += 1 start += 100 if start >= data.get("total", 0): break return counts CHUNK_SIZE = 10 * 1024 * 1024 # 10 MB MAX_RETRIES = 5 def _init_resumable(base, token, channel_id, filepath, filename, name, description="", nsfw=False): """POST to create a resumable upload session. Returns upload URL.""" file_size = Path(filepath).stat().st_size metadata = { "name": name, "channelId": channel_id, "filename": filename, "nsfw": nsfw, "waitTranscoding": True, "privacy": 1, } if description: metadata["description"] = description r = requests.post( f"{base}/api/v1/videos/upload-resumable", headers={ **api_headers(token), "Content-Type": "application/json", "X-Upload-Content-Length": str(file_size), "X-Upload-Content-Type": "video/mp4", }, json=metadata, timeout=30, ) r.raise_for_status() location = r.headers["Location"] if location.startswith("//"): location = "https:" + location elif location.startswith("/"): location = base + location return location, file_size def _query_offset(upload_url, token, file_size): """Ask the server how many bytes it has received so far.""" r = requests.put( upload_url, headers={ **api_headers(token), "Content-Range": f"bytes */{file_size}", "Content-Length": "0", }, timeout=15, ) if r.status_code == 308: range_hdr = r.headers.get("Range", "") if range_hdr: return int(range_hdr.split("-")[1]) + 1 return 0 if r.status_code == 200: return file_size r.raise_for_status() return 0 def upload_video(base, token, channel_id, filepath, name, description="", nsfw=False): """Resumable chunked upload. Returns (ok, uuid).""" filepath = Path(filepath) filename = filepath.name file_size = filepath.stat().st_size try: upload_url, _ = _init_resumable( base, token, channel_id, filepath, filename, name, description, nsfw, ) except Exception as e: print(f" Init failed: {e}") return False, None offset = 0 retries = 0 with open(filepath, "rb") as f: while offset < file_size: end = min(offset + CHUNK_SIZE, file_size) - 1 chunk_len = end - offset + 1 f.seek(offset) chunk = f.read(chunk_len) pct = int(100 * (end + 1) / file_size) print(f" {fmt_size(offset)}/{fmt_size(file_size)} ({pct}%)", end="\r", flush=True) try: r = requests.put( upload_url, headers={ **api_headers(token), "Content-Type": "application/octet-stream", "Content-Range": f"bytes {offset}-{end}/{file_size}", "Content-Length": str(chunk_len), }, data=chunk, timeout=120, ) except (requests.ConnectionError, requests.Timeout) as e: retries += 1 if retries > MAX_RETRIES: print( f"\n Upload failed after {MAX_RETRIES} retries: {e}") return False, None wait = min(2 ** retries, 60) print(f"\n Connection error, retry {retries}/{MAX_RETRIES} " f"in {wait}s ...") time.sleep(wait) try: offset = _query_offset(upload_url, token, file_size) except Exception: pass continue if r.status_code == 308: range_hdr = r.headers.get("Range", "") if range_hdr: offset = int(range_hdr.split("-")[1]) + 1 else: offset = end + 1 retries = 0 elif r.status_code == 200: print( f" {fmt_size(file_size)}/{fmt_size(file_size)} (100%)") uuid = r.json().get("video", {}).get("uuid") return True, uuid elif r.status_code in (502, 503, 429): retry_after = int(r.headers.get("Retry-After", 10)) retries += 1 if retries > MAX_RETRIES: print( f"\n Upload failed: server returned {r.status_code}") return False, None print( f"\n Server {r.status_code}, retry in {retry_after}s ...") time.sleep(retry_after) try: offset = _query_offset(upload_url, token, file_size) except Exception: pass else: detail = r.text[:300] if r.text else str(r.status_code) print(f"\n Upload failed ({r.status_code}): {detail}") return False, None print("\n Unexpected: sent all bytes but no 200 response") return False, None _STATE = { 1: "Published", 2: "To transcode", 3: "To import", 6: "Moving to object storage", 7: "Transcoding failed", 8: "Storage move failed", 9: "To edit", } def get_video_state(base, token, uuid): r = requests.get( f"{base}/api/v1/videos/{uuid}", headers=api_headers(token), timeout=15, ) r.raise_for_status() state = r.json()["state"] return state["id"], state.get("label", "") def wait_for_published(base, token, uuid, poll_interval): """Block until the video reaches state 1 (Published) or a failure state.""" started = time.monotonic() while True: elapsed = int(time.monotonic() - started) hours, rem = divmod(elapsed, 3600) mins, secs = divmod(rem, 60) if hours: elapsed_str = f"{hours}h {mins:02d}m {secs:02d}s" elif mins: elapsed_str = f"{mins}m {secs:02d}s" else: elapsed_str = f"{secs}s" try: sid, label = get_video_state(base, token, uuid) except requests.exceptions.RequestException as e: print(f" -> Poll error ({e.__class__.__name__}) " f"after {elapsed_str}, retrying in {poll_interval}s …") time.sleep(poll_interval) continue display = _STATE.get(sid, label or f"state {sid}") if sid == 1: print(f" -> {display}") return sid if sid in (7, 8): print(f" -> FAILED: {display}") return sid print(f" -> {display} … {elapsed_str} elapsed (next check in {poll_interval}s)") time.sleep(poll_interval) # ── State tracker ──────────────────────────────────────────────────── def load_uploaded(input_dir): path = Path(input_dir) / UPLOADED_FILE if not path.exists(): return set() with open(path) as f: return {Path(line.strip()) for line in f if line.strip()} def mark_uploaded(input_dir, rel_path): with open(Path(input_dir) / UPLOADED_FILE, "a") as f: f.write(f"{rel_path}\n") # ── File / metadata helpers ───────────────────────────────────────── def build_path_to_meta(video_map, input_dir): """Map each expected download path (relative) to {title, description}.""" urls = collect_urls(video_map) mode = read_mode(input_dir) or MODE_ORIGINAL paths = get_paths_for_mode(mode, urls, video_map, input_dir) url_meta = {} for entry in video_map.values(): t = entry.get("title", "") d = entry.get("description", "") for video_url in entry.get("videos", []): if video_url not in url_meta: url_meta[video_url] = {"title": t, "description": d} result = {} for url, abs_path in paths.items(): rel = Path(abs_path).relative_to(input_dir) meta = url_meta.get(url, {"title": "", "description": ""}) result[rel] = {**meta, "original_filename": url_to_filename(url)} return result def find_videos(input_dir): """Walk input_dir and return a set of relative paths for all video files.""" found = set() for root, dirs, files in os.walk(input_dir): dirs[:] = [d for d in dirs if not d.startswith(".")] for f in files: if Path(f).suffix.lower() in VIDEO_EXTS: found.add((Path(root) / f).relative_to(input_dir)) return found # ── Channel match helpers ───────────────────────────────────────────── def _channel_match(rel, path_meta, existing): """Return (matched, name) for a local file against the channel name set. Checks both the title-derived name and the original-filename-derived name so that videos uploaded under either form are recognised. Extracted to avoid duplicating the logic between the pre-reconcile sweep and the per- file check inside the upload loop. """ meta = path_meta.get(rel, {}) name = make_pt_name(meta.get("title", ""), rel.name) orig_fn = meta.get("original_filename", "") raw_name = make_pt_name("", orig_fn) if orig_fn else None matched = name in existing or (raw_name and raw_name != name and raw_name in existing) return matched, name # ── CLI ────────────────────────────────────────────────────────────── def main(): ap = argparse.ArgumentParser( description="Upload videos to PeerTube with transcoding-aware batching", ) ap.add_argument("--input", "-i", default=DEFAULT_OUTPUT, help=f"Directory with downloaded videos (default: {DEFAULT_OUTPUT})") ap.add_argument("--url", help="PeerTube instance URL (or set PEERTUBE_URL env var)") ap.add_argument("--username", "-U", help="PeerTube username (or set PEERTUBE_USER env var)") ap.add_argument("--password", "-p", help="PeerTube password (or set PEERTUBE_PASSWORD env var)") ap.add_argument("--channel", "-C", help="Channel to upload to (or set PEERTUBE_CHANNEL env var)") ap.add_argument("--batch-size", "-b", type=int, default=DEFAULT_BATCH_SIZE, help="Videos to upload before waiting for transcoding (default: 1)") ap.add_argument("--poll-interval", type=int, default=DEFAULT_POLL, help=f"Seconds between state polls (default: {DEFAULT_POLL})") ap.add_argument("--skip-wait", action="store_true", help="Upload everything without waiting for transcoding") ap.add_argument("--nsfw", action="store_true", help="Mark videos as NSFW") ap.add_argument("--dry-run", "-n", action="store_true", help="Preview what would be uploaded") args = ap.parse_args() url = args.url or os.environ.get("PEERTUBE_URL") username = args.username or os.environ.get("PEERTUBE_USER") channel = args.channel or os.environ.get("PEERTUBE_CHANNEL") password = args.password or os.environ.get("PEERTUBE_PASSWORD") if not args.dry_run: missing = [label for label, val in [ ("--url / PEERTUBE_URL", url), ("--username / PEERTUBE_USER", username), ("--channel / PEERTUBE_CHANNEL", channel), ("--password / PEERTUBE_PASSWORD", password), ] if not val] if missing: for label in missing: print(f"[!] Required: {label}") sys.exit(1) # ── load metadata & scan disk ── video_map = load_video_map() path_meta = build_path_to_meta(video_map, args.input) on_disk = find_videos(args.input) unmatched = on_disk - set(path_meta.keys()) if unmatched: print( f"[!] {len(unmatched)} file(s) on disk not in video_map (will use filename as title)") for rel in unmatched: path_meta[rel] = {"title": "", "description": ""} uploaded = load_uploaded(args.input) pending = sorted(rel for rel in on_disk if rel not in uploaded) print(f"[+] {len(on_disk)} video files in {args.input}/") print(f"[+] {len(uploaded)} already uploaded") print(f"[+] {len(pending)} pending") print(f"[+] Batch size: {args.batch_size}") if not pending: print("\nAll videos already uploaded.") return # ── dry run ── if args.dry_run: total_bytes = 0 for rel in pending: meta = path_meta.get(rel, {}) name = make_pt_name(meta.get("title", ""), rel.name) sz = (Path(args.input) / rel).stat().st_size total_bytes += sz print(f" [{fmt_size(sz):>10}] {name}") print( f"\n Total: {fmt_size(total_bytes)} across {len(pending)} videos") return # ── authenticate ── base = url.rstrip("/") if not base.startswith("http"): base = "https://" + base print(f"\n[+] Authenticating with {base} ...") token = get_oauth_token(base, username, password) print(f"[+] Authenticated as {username}") channel_id = get_channel_id(base, token, channel) print(f"[+] Channel: {channel} (id {channel_id})") name_counts = get_channel_video_names(base, token, channel) existing = set(name_counts) total = sum(name_counts.values()) print(f"[+] Found {total} video(s) on channel ({len(name_counts)} unique name(s))") dupes = {name: count for name, count in name_counts.items() if count > 1} if dupes: print(f"[!] {len(dupes)} duplicate name(s) detected on channel:") for name, count in sorted(dupes.items()): print(f" x{count} {name}") # ── pre-reconcile: sweep all pending against channel names ──────── # The main upload loop discovers already-uploaded videos lazily as it # walks the sorted pending list — meaning on a fresh run (no .uploaded # file) you won't know how many files are genuinely new until the loop # has processed everything. Doing a full sweep here, before any # upload starts, gives an accurate count up-front and pre-populates # .uploaded so that interrupted/re-run sessions skip them instantly # without re-checking each time. pre_matched = [] for rel in pending: if _channel_match(rel, path_meta, existing)[0]: pre_matched.append(rel) if pre_matched: print(f"\n[+] Pre-sweep: {len(pre_matched)} local file(s) already on channel — marking uploaded") for rel in pre_matched: mark_uploaded(args.input, rel) pending = [rel for rel in pending if rel not in set(pre_matched)] print(f"[+] {len(pending)} left to upload\n") nsfw = args.nsfw total_up = 0 batch: list[tuple[str, str]] = [] # [(uuid, name), ...] try: for rel in pending: # ── flush batch if full ── if not args.skip_wait and len(batch) >= args.batch_size: print( f"\n[+] Waiting for {len(batch)} video(s) to finish processing ...") for uuid, bname in batch: print(f"\n [{bname}]") wait_for_published(base, token, uuid, args.poll_interval) batch.clear() filepath = Path(args.input) / rel meta = path_meta.get(rel, {}) name = make_pt_name(meta.get("title", ""), rel.name) desc = clean_description(meta.get("description", "")) sz = filepath.stat().st_size if _channel_match(rel, path_meta, existing)[0]: print(f"\n[skip] already on channel: {name}") mark_uploaded(args.input, rel) continue print(f"\n[{total_up + 1}/{len(pending)}] {name}") print(f" File: {rel} ({fmt_size(sz)})") ok, uuid = upload_video( base, token, channel_id, filepath, name, desc, nsfw) if not ok: continue print(f" Uploaded uuid={uuid}") mark_uploaded(args.input, rel) total_up += 1 existing.add(name) if uuid: batch.append((uuid, name)) # ── wait for final batch ── if batch and not args.skip_wait: print(f"\n[+] Waiting for final {len(batch)} video(s) ...") for uuid, bname in batch: print(f"\n [{bname}]") wait_for_published(base, token, uuid, args.poll_interval) except KeyboardInterrupt: print( f"\n\n[!] Interrupted after {total_up} uploads. Re-run to continue.") sys.exit(130) print(f"\n{'=' * 50}") print(f" Uploaded: {total_up} video(s)") print(" Done!") print(f"{'=' * 50}") if __name__ == "__main__": main()