# jailbirdz-dl/upload.py
# (snapshot metadata: 2026-03-01 01:36:01 +01:00 — 708 lines, 23 KiB, Python)
"""Upload videos to PeerTube with transcoding-aware flow control.
Uploads videos one batch at a time, waits for each batch to be fully transcoded
and moved to object storage before uploading the next — preventing disk
exhaustion on the PeerTube server.
Usage:
python upload.py # upload from ./downloads
python upload.py -i /mnt/vol/dl # custom input dir
python upload.py --batch-size 2 # upload 2, wait, repeat
python upload.py --dry-run # preview without uploading
python upload.py --skip-wait # upload without waiting
Required (CLI flag or env var):
--url / PEERTUBE_URL
--username / PEERTUBE_USER
--channel / PEERTUBE_CHANNEL
--password / PEERTUBE_PASSWORD
"""
import argparse
from collections import Counter
import html
import os
from pathlib import Path
import re
import sys
import time
from typing import Any, cast
import requests
from dotenv import load_dotenv
from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS, load_video_map
from config import SITES
from download import (
collect_urls,
get_paths_for_mode,
read_mode,
MODE_ORIGINAL,
DEFAULT_OUTPUT,
)
load_dotenv()  # allow PEERTUBE_* settings to come from a local .env file
# ── Defaults ─────────────────────────────────────────────────────────
DEFAULT_BATCH_SIZE = 1  # videos uploaded per batch before waiting for transcoding
DEFAULT_POLL = 30  # seconds between transcoding-state polls
UPLOADED_FILE = ".uploaded"  # per-input-dir tracker file of already-uploaded paths
PT_NAME_MAX = 120  # PeerTube's maximum video-name length (chars)
# ── Text helpers ─────────────────────────────────────────────────────
def clean_description(raw: str) -> str:
    """Strip WordPress shortcodes and HTML markup from a description.

    Also collapses runs of three-or-more newlines to a single blank line
    and truncates the result to 10 000 characters.
    """
    if not raw:
        return ""
    without_shortcodes = re.sub(r"\[/?[^\]]+\]", "", raw)
    without_tags = re.sub(r"<[^>]+>", "", without_shortcodes)
    unescaped = html.unescape(without_tags)
    collapsed = re.sub(r"\n{3,}", "\n\n", unescaped).strip()
    return collapsed[:10000]
def make_pt_name(title: str, fallback_filename: str) -> str:
    """Build a PeerTube-safe video name (3-120 chars).

    Prefers the (HTML-unescaped) title; falls back to the filename stem.
    Over-long names are truncated with a trailing ellipsis, too-short
    names padded with underscores.
    """
    if title:
        name = html.unescape(title).strip()
    else:
        name = Path(fallback_filename).stem
    if len(name) > PT_NAME_MAX:
        name = name[: PT_NAME_MAX - 1].rstrip() + "\u2026"
    return name.ljust(3, "_")
# ── PeerTube API ─────────────────────────────────────────────────────
def get_oauth_token(base: str, username: str, password: str) -> str:
    """Authenticate via the OAuth password grant and return a bearer token.

    Raises requests.HTTPError on HTTP failures and RuntimeError when the
    token endpoint responds without an access_token.
    """
    client_resp = requests.get(f"{base}/api/v1/oauth-clients/local", timeout=15)
    client_resp.raise_for_status()
    client = client_resp.json()
    token_resp = requests.post(
        f"{base}/api/v1/users/token",
        data={
            "client_id": client["client_id"],
            "client_secret": client["client_secret"],
            "grant_type": "password",
            "username": username,
            "password": password,
        },
        timeout=15,
    )
    token_resp.raise_for_status()
    payload = cast(dict[str, Any], token_resp.json())
    token = payload.get("access_token")
    if not isinstance(token, str) or not token:
        raise RuntimeError("PeerTube token response missing access_token")
    return token
def api_headers(token: str) -> dict[str, str]:
    """Return the Authorization header dict for a bearer *token*."""
    return {"Authorization": "Bearer " + token}
def get_channel_id(base: str, token: str, channel_name: str) -> int:
    """Resolve *channel_name* to its numeric PeerTube channel id.

    Raises RuntimeError if the response carries no integer id.
    """
    resp = requests.get(
        f"{base}/api/v1/video-channels/{channel_name}",
        headers=api_headers(token),
        timeout=15,
    )
    resp.raise_for_status()
    payload = cast(dict[str, Any], resp.json())
    channel_id = payload.get("id")
    if not isinstance(channel_id, int):
        raise RuntimeError("PeerTube channel response missing id")
    return channel_id
def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
    """Paginate through the channel and return a Counter of video names."""
    names: Counter[str] = Counter()
    page_start = 0
    while True:
        resp = requests.get(
            f"{base}/api/v1/video-channels/{channel_name}/videos",
            params={"start": page_start, "count": 100},
            headers=api_headers(token),
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
        names.update(video["name"] for video in payload.get("data", []))
        page_start += 100
        # Stop once we have paged past the reported total.
        if page_start >= payload.get("total", 0):
            return names
CHUNK_SIZE = 10 * 1024 * 1024  # 10 MB per resumable-upload PUT
MAX_RETRIES = 5  # consecutive failures tolerated before aborting an upload
def _init_resumable(
    base: str,
    token: str,
    channel_id: int,
    filepath: Path,
    filename: str,
    name: str,
    description: str = "",
    nsfw: bool = False,
) -> tuple[str, int]:
    """Create a resumable upload session on the PeerTube server.

    Returns ``(upload_url, file_size)`` — the docstring previously claimed
    only a URL was returned. The Location header may come back
    scheme-relative (``//host/...``) or path-relative (``/...``); both are
    normalised to an absolute URL.

    Raises requests.HTTPError if the server rejects the session request.
    """
    # filepath is already a Path per the signature; no need to re-wrap it.
    file_size = filepath.stat().st_size
    metadata: dict[str, Any] = {
        "name": name,
        "channelId": channel_id,
        "filename": filename,
        "nsfw": nsfw,
        "waitTranscoding": True,
        "privacy": 1,  # 1 = public
    }
    if description:
        metadata["description"] = description
    r = requests.post(
        f"{base}/api/v1/videos/upload-resumable",
        headers={
            **api_headers(token),
            "Content-Type": "application/json",
            "X-Upload-Content-Length": str(file_size),
            "X-Upload-Content-Type": "video/mp4",
        },
        json=metadata,
        timeout=30,
    )
    r.raise_for_status()
    location = r.headers["Location"]
    if location.startswith("//"):  # scheme-relative
        location = "https:" + location
    elif location.startswith("/"):  # path-relative
        location = base + location
    return location, file_size
def _query_offset(upload_url: str, token: str, file_size: int) -> int:
    """Ask the server how many bytes of the upload it has received so far.

    Sends an empty PUT with ``Content-Range: bytes */<size>``; 200 means the
    whole file is there, 308 reports the committed range.
    """
    resp = requests.put(
        upload_url,
        headers={
            **api_headers(token),
            "Content-Range": f"bytes */{file_size}",
            "Content-Length": "0",
        },
        timeout=15,
    )
    if resp.status_code == 308:
        committed = resp.headers.get("Range", "")
        # Range looks like "bytes=0-12345"; the next offset is last byte + 1.
        return int(committed.split("-")[1]) + 1 if committed else 0
    if resp.status_code == 200:
        return file_size
    resp.raise_for_status()
    return 0
def upload_video(
    base: str,
    token: str,
    channel_id: int,
    filepath: Path,
    name: str,
    description: str = "",
    nsfw: bool = False,
) -> tuple[bool, str | None]:
    """Resumable chunked upload. Returns (ok, uuid)."""
    filepath = Path(filepath)
    filename = filepath.name
    file_size = filepath.stat().st_size
    # Open the session first; the server returns a URL that accepts
    # Content-Range PUTs (Google-resumable-style protocol).
    try:
        upload_url, _ = _init_resumable(
            base,
            token,
            channel_id,
            filepath,
            filename,
            name,
            description,
            nsfw,
        )
    except Exception as e:
        print(f" Init failed: {e}")
        return False, None
    offset = 0  # next byte to send
    retries = 0  # consecutive failures at the current position
    with open(filepath, "rb") as f:
        while offset < file_size:
            end = min(offset + CHUNK_SIZE, file_size) - 1  # inclusive last byte
            chunk_len = end - offset + 1
            # Seek every iteration: after a retry, `offset` may have been
            # reset to whatever the server reports as committed.
            f.seek(offset)
            chunk = f.read(chunk_len)
            pct = int(100 * (end + 1) / file_size)
            print(
                f" {fmt_size(offset)}/{fmt_size(file_size)} ({pct}%)",
                end="\r",
                flush=True,
            )
            try:
                r = requests.put(
                    upload_url,
                    headers={
                        **api_headers(token),
                        "Content-Type": "application/octet-stream",
                        "Content-Range": f"bytes {offset}-{end}/{file_size}",
                        "Content-Length": str(chunk_len),
                    },
                    data=chunk,
                    timeout=120,
                )
            except (requests.ConnectionError, requests.Timeout) as e:
                # Network hiccup: exponential backoff (capped at 60s), then
                # ask the server how far it actually got before resending.
                retries += 1
                if retries > MAX_RETRIES:
                    print(f"\n Upload failed after {MAX_RETRIES} retries: {e}")
                    return False, None
                wait = min(2**retries, 60)
                print(
                    f"\n Connection error, retry {retries}/{MAX_RETRIES} "
                    f"in {wait}s ..."
                )
                time.sleep(wait)
                try:
                    offset = _query_offset(upload_url, token, file_size)
                except Exception:
                    # Best effort: if the offset query also fails, resend
                    # from the last known offset.
                    pass
                continue
            if r.status_code == 308:
                # 308: chunk accepted, upload incomplete. Trust the server's
                # Range header for the committed offset when present.
                range_hdr = r.headers.get("Range", "")
                if range_hdr:
                    offset = int(range_hdr.split("-")[1]) + 1
                else:
                    offset = end + 1
                retries = 0
            elif r.status_code == 200:
                # Final chunk accepted; the response carries the video uuid.
                print(f" {fmt_size(file_size)}/{fmt_size(file_size)} (100%)")
                uuid = r.json().get("video", {}).get("uuid")
                return True, uuid
            elif r.status_code in (502, 503, 429):
                # Server overloaded / rate-limited: honour Retry-After, then
                # re-query the committed offset before continuing.
                retry_after = int(r.headers.get("Retry-After", 10))
                retries += 1
                if retries > MAX_RETRIES:
                    print(f"\n Upload failed: server returned {r.status_code}")
                    return False, None
                print(f"\n Server {r.status_code}, retry in {retry_after}s ...")
                time.sleep(retry_after)
                try:
                    offset = _query_offset(upload_url, token, file_size)
                except Exception:
                    pass
            else:
                # Any other status is fatal for this file.
                detail = r.text[:300] if r.text else str(r.status_code)
                print(f"\n Upload failed ({r.status_code}): {detail}")
                return False, None
    print("\n Unexpected: sent all bytes but no 200 response")
    return False, None
# PeerTube video state ids -> display labels; unknown ids fall back to the
# server-supplied label (see wait_for_published).
_STATE = {
    1: "Published",
    2: "To transcode",
    3: "To import",
    6: "Moving to object storage",
    7: "Transcoding failed",
    8: "Storage move failed",
    9: "To edit",
}
def get_video_state(base: str, token: str, uuid: str) -> tuple[int, str]:
    """Fetch a video's processing state as ``(state_id, label)``."""
    resp = requests.get(
        f"{base}/api/v1/videos/{uuid}",
        headers=api_headers(token),
        timeout=15,
    )
    resp.raise_for_status()
    state = resp.json()["state"]
    return state["id"], state.get("label", "")
def wait_for_published(base: str, token: str, uuid: str, poll_interval: int) -> int:
    """Block until the video reaches state 1 (Published) or a failure state.

    Polls the video state every ``poll_interval`` seconds. Transient request
    errors during polling are logged and retried indefinitely.

    Returns the final state id: 1 (Published), 7 (Transcoding failed) or
    8 (Storage move failed).
    """
    started = time.monotonic()
    while True:
        # Human-readable elapsed time since waiting began.
        elapsed = int(time.monotonic() - started)
        hours, rem = divmod(elapsed, 3600)
        mins, secs = divmod(rem, 60)
        if hours:
            elapsed_str = f"{hours}h {mins:02d}m {secs:02d}s"
        elif mins:
            elapsed_str = f"{mins}m {secs:02d}s"
        else:
            elapsed_str = f"{secs}s"
        try:
            sid, label = get_video_state(base, token, uuid)
        except requests.exceptions.RequestException as e:
            # Poll failures are transient (server busy transcoding) — retry.
            print(
                f" -> Poll error ({e.__class__.__name__}) "
                f"after {elapsed_str}, retrying in {poll_interval}s …"
            )
            time.sleep(poll_interval)
            continue
        display = _STATE.get(sid, label or f"state {sid}")
        if sid == 1:
            print(f" -> {display}")
            return sid
        if sid in (7, 8):
            print(f" -> FAILED: {display}")
            return sid
        # Fix: the label and elapsed time previously ran together with no
        # separator ("To transcode3s elapsed").
        print(
            f" -> {display} — {elapsed_str} elapsed (next check in {poll_interval}s)"
        )
        time.sleep(poll_interval)
# ── State tracker ────────────────────────────────────────────────────
def load_uploaded(input_dir: str) -> set[Path]:
    """Return the set of relative paths recorded in the ``.uploaded`` tracker.

    Missing tracker file means nothing has been uploaded yet.
    """
    tracker = Path(input_dir) / UPLOADED_FILE
    if not tracker.exists():
        return set()
    with open(tracker) as fh:
        return {Path(entry) for line in fh if (entry := line.strip())}
def mark_uploaded(input_dir: str, rel_path: Path) -> None:
    """Append *rel_path* to the ``.uploaded`` tracker file in *input_dir*."""
    tracker = Path(input_dir) / UPLOADED_FILE
    with open(tracker, "a") as fh:
        fh.write(f"{rel_path}\n")
# ── File / metadata helpers ─────────────────────────────────────────
def build_path_to_meta(
    video_map: dict[str, Any],
    input_dir: str,
) -> dict[Path, dict[str, str]]:
    """Map each expected download path (relative) to {title, description, original_filename}."""
    urls = collect_urls(video_map)
    mode = read_mode(input_dir) or MODE_ORIGINAL

    # Which site each video URL belongs to (used for path layout).
    url_to_site: dict[str, str] = {}
    for site_key in SITES:
        for entry in load_video_map(site_key).values():
            for vid_url in entry.get("videos", []):
                url_to_site[vid_url] = site_key

    paths = get_paths_for_mode(mode, urls, video_map, input_dir, url_to_site)

    # First-seen title/description per video URL.
    url_meta: dict[str, dict[str, str]] = {}
    for entry_any in video_map.values():
        entry = cast(dict[str, Any], entry_any)
        raw_title = entry.get("title")
        raw_desc = entry.get("description")
        title = raw_title if isinstance(raw_title, str) else ""
        desc = raw_desc if isinstance(raw_desc, str) else ""
        video_urls = entry.get("videos", [])
        if not isinstance(video_urls, list):
            continue
        for video_url in video_urls:
            if isinstance(video_url, str):
                url_meta.setdefault(video_url, {"title": title, "description": desc})

    result: dict[Path, dict[str, str]] = {}
    for url, abs_path in paths.items():
        meta = url_meta.get(url, {"title": "", "description": ""})
        result[abs_path.relative_to(input_dir)] = {
            "title": meta.get("title", ""),
            "description": meta.get("description", ""),
            "original_filename": url_to_filename(url),
        }
    return result
def find_videos(input_dir: str) -> set[Path]:
    """Walk input_dir and return a set of relative paths for all video files."""
    base = Path(input_dir)
    videos: set[Path] = set()
    for root, dirs, files in os.walk(input_dir):
        # Prune hidden directories in place so os.walk never descends them.
        dirs[:] = [d for d in dirs if not d.startswith(".")]
        videos.update(
            (Path(root) / fname).relative_to(base)
            for fname in files
            if Path(fname).suffix.lower() in VIDEO_EXTS
        )
    return videos
# ── Channel match helpers ─────────────────────────────────────────────
def _channel_match(
    rel: Path,
    path_meta: dict[Path, dict[str, str]],
    existing: set[str],
) -> tuple[bool, str]:
    """Return (matched, name) for a local file against the channel name set.

    A file counts as matched when either its title-derived name or its
    original-filename-derived name is already present on the channel, so
    videos uploaded under either form are recognised. Shared by the
    pre-reconcile sweep and the per-file check in the upload loop.
    """
    meta = path_meta.get(rel, {})
    name = make_pt_name(meta.get("title", ""), rel.name)
    if name in existing:
        return True, name
    orig_fn = meta.get("original_filename", "")
    if orig_fn:
        raw_name = make_pt_name("", orig_fn)
        if raw_name != name and raw_name in existing:
            return True, name
    return False, name
# ── CLI ──────────────────────────────────────────────────────────────
def main() -> None:
    """CLI entry point.

    Parses arguments, reconciles local files against the channel's existing
    videos, then uploads pending files batch by batch, waiting for each
    batch to finish processing before the next (unless --skip-wait).
    """
    ap = argparse.ArgumentParser(
        description="Upload videos to PeerTube with transcoding-aware batching",
    )
    ap.add_argument(
        "--input",
        "-i",
        default=DEFAULT_OUTPUT,
        help=f"Directory with downloaded videos (default: {DEFAULT_OUTPUT})",
    )
    ap.add_argument("--url", help="PeerTube instance URL (or set PEERTUBE_URL env var)")
    ap.add_argument(
        "--username", "-U", help="PeerTube username (or set PEERTUBE_USER env var)"
    )
    ap.add_argument(
        "--password", "-p", help="PeerTube password (or set PEERTUBE_PASSWORD env var)"
    )
    ap.add_argument(
        "--channel", "-C", help="Channel to upload to (or set PEERTUBE_CHANNEL env var)"
    )
    ap.add_argument(
        "--batch-size",
        "-b",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        help="Videos to upload before waiting for transcoding (default: 1)",
    )
    ap.add_argument(
        "--poll-interval",
        type=int,
        default=DEFAULT_POLL,
        help=f"Seconds between state polls (default: {DEFAULT_POLL})",
    )
    ap.add_argument(
        "--skip-wait",
        action="store_true",
        help="Upload everything without waiting for transcoding",
    )
    ap.add_argument("--nsfw", action="store_true", help="Mark videos as NSFW")
    ap.add_argument(
        "--dry-run", "-n", action="store_true", help="Preview what would be uploaded"
    )
    args = ap.parse_args()
    # CLI flags take precedence over environment variables.
    url = args.url or os.environ.get("PEERTUBE_URL")
    username = args.username or os.environ.get("PEERTUBE_USER")
    channel = args.channel or os.environ.get("PEERTUBE_CHANNEL")
    password = args.password or os.environ.get("PEERTUBE_PASSWORD")
    # Credentials are only required when actually uploading.
    if not args.dry_run:
        missing = [
            label
            for label, val in [
                ("--url / PEERTUBE_URL", url),
                ("--username / PEERTUBE_USER", username),
                ("--channel / PEERTUBE_CHANNEL", channel),
                ("--password / PEERTUBE_PASSWORD", password),
            ]
            if not val
        ]
        if missing:
            for label in missing:
                print(f"[!] Required: {label}")
            sys.exit(1)
    # ── load metadata & scan disk ──
    video_map = load_video_map()
    path_meta = build_path_to_meta(video_map, args.input)
    on_disk = find_videos(args.input)
    # Files on disk but missing from video_map are still uploaded, using
    # their filename as the title.
    unmatched = on_disk - set(path_meta.keys())
    if unmatched:
        print(
            f"[!] {len(unmatched)} file(s) on disk not in video_map (will use filename as title)"
        )
        for rel in unmatched:
            path_meta[rel] = {"title": "", "description": ""}
    uploaded = load_uploaded(args.input)
    pending = sorted(rel for rel in on_disk if rel not in uploaded)
    print(f"[+] {len(on_disk)} video files in {args.input}/")
    print(f"[+] {len(uploaded)} already uploaded")
    print(f"[+] {len(pending)} pending")
    print(f"[+] Batch size: {args.batch_size}")
    if not pending:
        print("\nAll videos already uploaded.")
        return
    # ── dry run ──
    if args.dry_run:
        total_bytes = 0
        for rel in pending:
            meta = path_meta.get(rel, {})
            name = make_pt_name(meta.get("title", ""), rel.name)
            sz = (Path(args.input) / rel).stat().st_size
            total_bytes += sz
            print(f" [{fmt_size(sz):>10}] {name}")
        print(f"\n Total: {fmt_size(total_bytes)} across {len(pending)} videos")
        return
    # Type narrowing only: the missing-credential check above already
    # exited if any of these were unset.
    assert url is not None
    assert username is not None
    assert channel is not None
    assert password is not None
    # ── authenticate ──
    base = url.rstrip("/")
    if not base.startswith("http"):
        base = "https://" + base
    print(f"\n[+] Authenticating with {base} ...")
    token = get_oauth_token(base, username, password)
    print(f"[+] Authenticated as {username}")
    channel_id = get_channel_id(base, token, channel)
    print(f"[+] Channel: {channel} (id {channel_id})")
    name_counts = get_channel_video_names(base, token, channel)
    existing = set(name_counts)
    total = sum(name_counts.values())
    print(f"[+] Found {total} video(s) on channel ({len(name_counts)} unique name(s))")
    dupes = {name: count for name, count in name_counts.items() if count > 1}
    if dupes:
        print(f"[!] {len(dupes)} duplicate name(s) detected on channel:")
        for name, count in sorted(dupes.items()):
            print(f" x{count} {name}")
    # ── pre-reconcile: sweep all pending against channel names ────────
    # The main upload loop discovers already-uploaded videos lazily as it
    # walks the sorted pending list — meaning on a fresh run (no .uploaded
    # file) you won't know how many files are genuinely new until the loop
    # has processed everything. Doing a full sweep here, before any
    # upload starts, gives an accurate count up-front and pre-populates
    # .uploaded so that interrupted/re-run sessions skip them instantly
    # without re-checking each time.
    pre_matched = []
    for rel in pending:
        if _channel_match(rel, path_meta, existing)[0]:
            pre_matched.append(rel)
    if pre_matched:
        print(
            f"\n[+] Pre-sweep: {len(pre_matched)} local file(s) already on channel — marking uploaded"
        )
        for rel in pre_matched:
            mark_uploaded(args.input, rel)
        pending = [rel for rel in pending if rel not in set(pre_matched)]
        print(f"[+] {len(pending)} left to upload\n")
    nsfw = args.nsfw
    total_up = 0
    batch: list[tuple[str, str]] = []  # [(uuid, name), ...]
    try:
        for rel in pending:
            # ── flush batch if full ──
            if not args.skip_wait and len(batch) >= args.batch_size:
                print(
                    f"\n[+] Waiting for {len(batch)} video(s) to finish processing ..."
                )
                for uuid, bname in batch:
                    print(f"\n [{bname}]")
                    wait_for_published(base, token, uuid, args.poll_interval)
                batch.clear()
            filepath = Path(args.input) / rel
            meta = path_meta.get(rel, {})
            name = make_pt_name(meta.get("title", ""), rel.name)
            desc = clean_description(meta.get("description", ""))
            sz = filepath.stat().st_size
            # Re-check per file: `existing` grows as uploads succeed, so a
            # later file that derives the same name is skipped.
            if _channel_match(rel, path_meta, existing)[0]:
                print(f"\n[skip] already on channel: {name}")
                mark_uploaded(args.input, rel)
                continue
            print(f"\n[{total_up + 1}/{len(pending)}] {name}")
            print(f" File: {rel} ({fmt_size(sz)})")
            ok, uuid_opt = upload_video(
                base, token, channel_id, filepath, name, desc, nsfw
            )
            if not ok:
                # Failed uploads are not marked; they stay pending and are
                # retried on the next run.
                continue
            print(f" Uploaded uuid={uuid_opt}")
            mark_uploaded(args.input, rel)
            total_up += 1
            existing.add(name)
            if uuid_opt is not None:
                batch.append((uuid_opt, name))
        # ── wait for final batch ──
        if batch and not args.skip_wait:
            print(f"\n[+] Waiting for final {len(batch)} video(s) ...")
            for uuid, bname in batch:
                print(f"\n [{bname}]")
                wait_for_published(base, token, uuid, args.poll_interval)
    except KeyboardInterrupt:
        # .uploaded already records every finished file, so re-running
        # resumes where we left off.
        print(f"\n\n[!] Interrupted after {total_up} uploads. Re-run to continue.")
        sys.exit(130)
    print(f"\n{'=' * 50}")
    print(f" Uploaded: {total_up} video(s)")
    print(" Done!")
    print(f"{'=' * 50}")
# Script entry point.
if __name__ == "__main__":
    main()