Garbage commit; added junk code

This commit is contained in:
HugeFrog24
2026-02-20 18:07:02 +01:00
commit 80444405e9
15 changed files with 34271 additions and 0 deletions

603
upload.py Normal file
View File

@@ -0,0 +1,603 @@
"""Upload videos to PeerTube with transcoding-aware flow control.
Uploads videos one batch at a time, waits for each batch to be fully transcoded
and moved to object storage before uploading the next — preventing disk
exhaustion on the PeerTube server.
Usage:
python upload.py # upload from ./downloads
python upload.py -i /mnt/vol/dl # custom input dir
python upload.py --batch-size 2 # upload 2, wait, repeat
python upload.py --dry-run # preview without uploading
python upload.py --skip-wait # upload without waiting
Required (CLI flag or env var):
--url / PEERTUBE_URL
--username / PEERTUBE_USER
--channel / PEERTUBE_CHANNEL
--password / PEERTUBE_PASSWORD
"""
import argparse
import html
import mimetypes
import os
import re
import sys
import time
from collections import Counter
from pathlib import Path

import requests
from dotenv import load_dotenv

from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS
from download import (
    load_video_map,
    collect_urls,
    get_paths_for_mode,
    read_mode,
    MODE_ORIGINAL,
    DEFAULT_OUTPUT,
)
load_dotenv()
# ── Defaults ─────────────────────────────────────────────────────────
DEFAULT_BATCH_SIZE = 1       # videos uploaded before waiting for transcoding
DEFAULT_POLL = 30            # seconds between transcoding-state polls
UPLOADED_FILE = ".uploaded"  # per-input-dir tracker of already-uploaded files
PT_NAME_MAX = 120            # maximum PeerTube video-name length
# ── Text helpers ─────────────────────────────────────────────────────
def clean_description(raw):
    """Return *raw* with WordPress shortcodes and HTML markup removed.

    Also unescapes HTML entities, collapses runs of three or more
    newlines to a single blank line, and truncates to 10 000 characters.
    Empty or falsy input yields an empty string.
    """
    if not raw:
        return ""
    without_shortcodes = re.sub(r'\[/?[^\]]+\]', '', raw)
    without_tags = re.sub(r'<[^>]+>', '', without_shortcodes)
    unescaped = html.unescape(without_tags)
    collapsed = re.sub(r'\n{3,}', '\n\n', unescaped).strip()
    return collapsed[:10000]
def make_pt_name(title, fallback_filename):
    """Build a PeerTube-safe video name (3-120 chars).

    Prefers the (entity-unescaped, stripped) title; falls back to the
    filename stem. Over-long names are trimmed and suffixed with an
    ellipsis; short names are padded with underscores to three chars.
    """
    if title:
        name = html.unescape(title).strip()
    else:
        name = Path(fallback_filename).stem
    if len(name) > PT_NAME_MAX:
        name = name[: PT_NAME_MAX - 1].rstrip() + "\u2026"
    return name + "_" * max(0, 3 - len(name))
# ── PeerTube API ─────────────────────────────────────────────────────
def get_oauth_token(base, username, password):
    """Authenticate against the instance and return an OAuth access token.

    Fetches the local OAuth client credentials first, then performs a
    password-grant token request. Raises on any non-2xx response.
    """
    client_resp = requests.get(f"{base}/api/v1/oauth-clients/local", timeout=15)
    client_resp.raise_for_status()
    client = client_resp.json()
    grant = {
        "client_id": client["client_id"],
        "client_secret": client["client_secret"],
        "grant_type": "password",
        "username": username,
        "password": password,
    }
    token_resp = requests.post(
        f"{base}/api/v1/users/token", data=grant, timeout=15)
    token_resp.raise_for_status()
    return token_resp.json()["access_token"]
def api_headers(token):
    """Return the Authorization header dict for an authenticated request."""
    return {"Authorization": "Bearer " + token}
def get_channel_id(base, token, channel_name):
    """Resolve a channel name to its numeric id via the channels API."""
    endpoint = f"{base}/api/v1/video-channels/{channel_name}"
    resp = requests.get(endpoint, headers=api_headers(token), timeout=15)
    resp.raise_for_status()
    return resp.json()["id"]
def get_channel_video_names(base, token, channel_name):
    """Paginate through the channel and return a Counter of video names.

    A Counter (rather than a set) is returned so callers can also detect
    duplicate names on the channel.
    """
    endpoint = f"{base}/api/v1/video-channels/{channel_name}/videos"
    page_size = 100
    counts = Counter()
    offset = 0
    total = None  # unknown until the first page arrives
    while total is None or offset < total:
        resp = requests.get(
            endpoint,
            params={"start": offset, "count": page_size},
            headers=api_headers(token),
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
        counts.update(v["name"] for v in payload.get("data", []))
        total = payload.get("total", 0)
        offset += page_size
    return counts
CHUNK_SIZE = 10 * 1024 * 1024  # 10 MB per resumable-upload chunk
MAX_RETRIES = 5                # consecutive failures tolerated before giving up
def _init_resumable(base, token, channel_id, filepath, filename, name,
                    description="", nsfw=False):
    """POST to create a resumable upload session. Returns (upload_url, size).

    The server replies with a ``Location`` header pointing at the session
    endpoint; it may be protocol-relative (``//host/...``) or path-relative
    (``/...``), so both forms are normalised to an absolute URL.
    Raises via ``raise_for_status`` on a non-2xx response.
    """
    file_size = Path(filepath).stat().st_size
    # FIX: the content type was hard-coded to video/mp4 for every file,
    # even though non-MP4 extensions are uploaded too. Guess from the
    # filename and keep video/mp4 as the fallback (previous behavior).
    content_type = mimetypes.guess_type(filename)[0] or "video/mp4"
    if not content_type.startswith("video/"):
        content_type = "video/mp4"
    metadata = {
        "name": name,
        "channelId": channel_id,
        "filename": filename,
        "nsfw": nsfw,
        "waitTranscoding": True,
        "privacy": 1,
    }
    if description:
        metadata["description"] = description
    r = requests.post(
        f"{base}/api/v1/videos/upload-resumable",
        headers={
            **api_headers(token),
            "Content-Type": "application/json",
            "X-Upload-Content-Length": str(file_size),
            "X-Upload-Content-Type": content_type,
        },
        json=metadata,
        timeout=30,
    )
    r.raise_for_status()
    location = r.headers["Location"]
    if location.startswith("//"):
        # Protocol-relative URL.
        location = "https:" + location
    elif location.startswith("/"):
        # Path-relative URL.
        location = base + location
    return location, file_size
def _query_offset(upload_url, token, file_size):
    """Ask the server how many bytes it has received so far.

    Sends an empty PUT with ``Content-Range: bytes */<size>``. A 200
    means the whole file is already there; a 308 carries a ``Range``
    header whose upper bound is the last byte received.
    """
    probe = requests.put(
        upload_url,
        headers={
            **api_headers(token),
            "Content-Range": f"bytes */{file_size}",
            "Content-Length": "0",
        },
        timeout=15,
    )
    if probe.status_code == 200:
        return file_size
    if probe.status_code == 308:
        range_hdr = probe.headers.get("Range", "")
        if not range_hdr:
            return 0
        # "bytes=0-N" -> next offset is N + 1.
        return int(range_hdr.rsplit("-", 1)[1]) + 1
    probe.raise_for_status()
    return 0
def upload_video(base, token, channel_id, filepath, name,
                 description="", nsfw=False):
    """Resumable chunked upload. Returns (ok, uuid).

    Streams the file to the session created by _init_resumable in
    CHUNK_SIZE pieces. On connection errors or 502/503/429 responses it
    backs off, re-queries the server's received-byte offset, and resumes
    — up to MAX_RETRIES consecutive failures. On any other error status
    the upload is abandoned and (False, None) is returned.
    """
    filepath = Path(filepath)
    filename = filepath.name
    file_size = filepath.stat().st_size
    try:
        upload_url, _ = _init_resumable(
            base, token, channel_id, filepath, filename,
            name, description, nsfw,
        )
    except Exception as e:
        print(f" Init failed: {e}")
        return False, None
    offset = 0   # next byte index to send
    retries = 0  # consecutive-failure counter; reset whenever progress is made
    with open(filepath, "rb") as f:
        while offset < file_size:
            end = min(offset + CHUNK_SIZE, file_size) - 1  # inclusive last byte
            chunk_len = end - offset + 1
            # Seek explicitly: after a resume, offset may have jumped back.
            f.seek(offset)
            chunk = f.read(chunk_len)
            pct = int(100 * (end + 1) / file_size)
            print(f" {fmt_size(offset)}/{fmt_size(file_size)} ({pct}%)",
                  end="\r", flush=True)
            try:
                r = requests.put(
                    upload_url,
                    headers={
                        **api_headers(token),
                        "Content-Type": "application/octet-stream",
                        "Content-Range": f"bytes {offset}-{end}/{file_size}",
                        "Content-Length": str(chunk_len),
                    },
                    data=chunk,
                    timeout=120,
                )
            except (requests.ConnectionError, requests.Timeout) as e:
                # Network hiccup: exponential backoff (capped at 60s), then
                # re-sync offset with the server so no bytes are lost.
                retries += 1
                if retries > MAX_RETRIES:
                    print(
                        f"\n Upload failed after {MAX_RETRIES} retries: {e}")
                    return False, None
                wait = min(2 ** retries, 60)
                print(f"\n Connection error, retry {retries}/{MAX_RETRIES} "
                      f"in {wait}s ...")
                time.sleep(wait)
                try:
                    offset = _query_offset(upload_url, token, file_size)
                except Exception:
                    # Best effort: if the probe also fails, resend from the
                    # old offset instead of aborting.
                    pass
                continue
            if r.status_code == 308:
                # 308: server acknowledges partial content; Range header
                # (if present) is authoritative for the next offset.
                range_hdr = r.headers.get("Range", "")
                if range_hdr:
                    offset = int(range_hdr.split("-")[1]) + 1
                else:
                    offset = end + 1
                retries = 0
            elif r.status_code == 200:
                # Final chunk accepted; response body carries the video uuid.
                print(
                    f" {fmt_size(file_size)}/{fmt_size(file_size)} (100%)")
                uuid = r.json().get("video", {}).get("uuid")
                return True, uuid
            elif r.status_code in (502, 503, 429):
                # Transient server-side error: honour Retry-After, then
                # re-query the received-byte offset before resuming.
                retry_after = int(r.headers.get("Retry-After", 10))
                retries += 1
                if retries > MAX_RETRIES:
                    print(
                        f"\n Upload failed: server returned {r.status_code}")
                    return False, None
                print(
                    f"\n Server {r.status_code}, retry in {retry_after}s ...")
                time.sleep(retry_after)
                try:
                    offset = _query_offset(upload_url, token, file_size)
                except Exception:
                    pass
            else:
                detail = r.text[:300] if r.text else str(r.status_code)
                print(f"\n Upload failed ({r.status_code}): {detail}")
                return False, None
    print("\n Unexpected: sent all bytes but no 200 response")
    return False, None
# Display labels for PeerTube video state ids, used when polling in
# wait_for_published. Unlisted ids fall back to the server-provided label.
_STATE = {
    1: "Published",
    2: "To transcode",
    3: "To import",
    6: "Moving to object storage",
    7: "Transcoding failed",
    8: "Storage move failed",
    9: "To edit",
}
def get_video_state(base, token, uuid):
    """Fetch one video and return its (state_id, state_label) pair."""
    endpoint = f"{base}/api/v1/videos/{uuid}"
    resp = requests.get(endpoint, headers=api_headers(token), timeout=15)
    resp.raise_for_status()
    info = resp.json()["state"]
    return info["id"], info.get("label", "")
def _fmt_elapsed(seconds):
    """Format an elapsed-seconds count as '1h 02m 03s' / '2m 05s' / '9s'."""
    hours, rem = divmod(seconds, 3600)
    mins, secs = divmod(rem, 60)
    if hours:
        return f"{hours}h {mins:02d}m {secs:02d}s"
    if mins:
        return f"{mins}m {secs:02d}s"
    return f"{secs}s"
def wait_for_published(base, token, uuid, poll_interval):
    """Block until the video reaches state 1 (Published) or a failure state.

    Polls every *poll_interval* seconds; transient request errors are
    logged and retried indefinitely. Returns the terminal state id
    (1 published, 7 transcoding failed, 8 storage move failed).
    """
    started = time.monotonic()
    while True:
        elapsed_str = _fmt_elapsed(int(time.monotonic() - started))
        try:
            sid, label = get_video_state(base, token, uuid)
        except requests.exceptions.RequestException as e:
            print(f" -> Poll error ({e.__class__.__name__}) "
                  f"after {elapsed_str}, retrying in {poll_interval}s …")
            time.sleep(poll_interval)
            continue
        display = _STATE.get(sid, label or f"state {sid}")
        if sid == 1:
            print(f" -> {display}")
            return sid
        if sid in (7, 8):
            print(f" -> FAILED: {display}")
            return sid
        # FIX: the label and elapsed time previously ran together with no
        # separator ("To transcode3m 05s elapsed") — add one.
        print(f" -> {display} — {elapsed_str} elapsed (next check in {poll_interval}s)")
        time.sleep(poll_interval)
# ── State tracker ────────────────────────────────────────────────────
def load_uploaded(input_dir):
    """Return the set of relative Paths recorded in the .uploaded tracker.

    Missing tracker file means nothing has been uploaded yet.
    """
    tracker = Path(input_dir) / UPLOADED_FILE
    if not tracker.exists():
        return set()
    entries = tracker.read_text().splitlines()
    return {Path(line.strip()) for line in entries if line.strip()}
def mark_uploaded(input_dir, rel_path):
    """Append one relative path to the .uploaded tracker file."""
    tracker = Path(input_dir) / UPLOADED_FILE
    with open(tracker, "a") as tracker_file:
        tracker_file.write(f"{rel_path}\n")
# ── File / metadata helpers ─────────────────────────────────────────
def build_path_to_meta(video_map, input_dir):
    """Map each expected download path (relative) to its upload metadata.

    Each value holds ``title``, ``description`` and ``original_filename``.
    When the same video URL appears under several entries, the first
    entry's metadata wins.
    """
    urls = collect_urls(video_map)
    mode = read_mode(input_dir) or MODE_ORIGINAL
    paths = get_paths_for_mode(mode, urls, video_map, input_dir)
    url_meta = {}
    for entry in video_map.values():
        entry_meta = {
            "title": entry.get("title", ""),
            "description": entry.get("description", ""),
        }
        for video_url in entry.get("videos", []):
            # setdefault keeps the first occurrence, matching first-wins.
            url_meta.setdefault(video_url, dict(entry_meta))
    result = {}
    for url, abs_path in paths.items():
        rel = Path(abs_path).relative_to(input_dir)
        meta = url_meta.get(url, {"title": "", "description": ""})
        result[rel] = {**meta, "original_filename": url_to_filename(url)}
    return result
def find_videos(input_dir):
    """Walk input_dir and return a set of relative paths for all video files."""
    base = Path(input_dir)
    found = set()
    for root, dirs, files in os.walk(input_dir):
        # Prune hidden directories in place so os.walk never descends.
        dirs[:] = [d for d in dirs if not d.startswith(".")]
        root_path = Path(root)
        found.update(
            (root_path / fname).relative_to(base)
            for fname in files
            if Path(fname).suffix.lower() in VIDEO_EXTS
        )
    return found
# ── Channel match helpers ─────────────────────────────────────────────
def _channel_match(rel, path_meta, existing):
    """Return (matched, name) for a local file against the channel name set.

    A file counts as matched when either the title-derived name or the
    original-filename-derived name already exists on the channel, so
    videos uploaded under either form are recognised. Shared by the
    pre-reconcile sweep and the per-file check in the upload loop.
    """
    meta = path_meta.get(rel, {})
    name = make_pt_name(meta.get("title", ""), rel.name)
    if name in existing:
        return True, name
    orig_fn = meta.get("original_filename", "")
    if orig_fn:
        raw_name = make_pt_name("", orig_fn)
        if raw_name != name and raw_name in existing:
            return True, name
    return False, name
# ── CLI ──────────────────────────────────────────────────────────────
def main():
    """CLI entry point: parse args, reconcile with the channel, then upload
    pending videos in transcoding-aware batches."""
    ap = argparse.ArgumentParser(
        description="Upload videos to PeerTube with transcoding-aware batching",
    )
    ap.add_argument("--input", "-i", default=DEFAULT_OUTPUT,
                    help=f"Directory with downloaded videos (default: {DEFAULT_OUTPUT})")
    ap.add_argument("--url",
                    help="PeerTube instance URL (or set PEERTUBE_URL env var)")
    ap.add_argument("--username", "-U",
                    help="PeerTube username (or set PEERTUBE_USER env var)")
    ap.add_argument("--password", "-p",
                    help="PeerTube password (or set PEERTUBE_PASSWORD env var)")
    ap.add_argument("--channel", "-C",
                    help="Channel to upload to (or set PEERTUBE_CHANNEL env var)")
    ap.add_argument("--batch-size", "-b", type=int, default=DEFAULT_BATCH_SIZE,
                    help="Videos to upload before waiting for transcoding (default: 1)")
    ap.add_argument("--poll-interval", type=int, default=DEFAULT_POLL,
                    help=f"Seconds between state polls (default: {DEFAULT_POLL})")
    ap.add_argument("--skip-wait", action="store_true",
                    help="Upload everything without waiting for transcoding")
    ap.add_argument("--nsfw", action="store_true",
                    help="Mark videos as NSFW")
    ap.add_argument("--dry-run", "-n", action="store_true",
                    help="Preview what would be uploaded")
    args = ap.parse_args()
    # CLI flags take precedence over environment variables.
    url = args.url or os.environ.get("PEERTUBE_URL")
    username = args.username or os.environ.get("PEERTUBE_USER")
    channel = args.channel or os.environ.get("PEERTUBE_CHANNEL")
    password = args.password or os.environ.get("PEERTUBE_PASSWORD")
    # Credentials are only required when actually uploading.
    if not args.dry_run:
        missing = [label for label, val in [
            ("--url / PEERTUBE_URL", url),
            ("--username / PEERTUBE_USER", username),
            ("--channel / PEERTUBE_CHANNEL", channel),
            ("--password / PEERTUBE_PASSWORD", password),
        ] if not val]
        if missing:
            for label in missing:
                print(f"[!] Required: {label}")
            sys.exit(1)
    # ── load metadata & scan disk ──
    video_map = load_video_map()
    path_meta = build_path_to_meta(video_map, args.input)
    on_disk = find_videos(args.input)
    # Files on disk but absent from video_map are still uploaded, with the
    # filename standing in for the title.
    unmatched = on_disk - set(path_meta.keys())
    if unmatched:
        print(
            f"[!] {len(unmatched)} file(s) on disk not in video_map (will use filename as title)")
        for rel in unmatched:
            path_meta[rel] = {"title": "", "description": ""}
    uploaded = load_uploaded(args.input)
    pending = sorted(rel for rel in on_disk if rel not in uploaded)
    print(f"[+] {len(on_disk)} video files in {args.input}/")
    print(f"[+] {len(uploaded)} already uploaded")
    print(f"[+] {len(pending)} pending")
    print(f"[+] Batch size: {args.batch_size}")
    if not pending:
        print("\nAll videos already uploaded.")
        return
    # ── dry run ──
    if args.dry_run:
        total_bytes = 0
        for rel in pending:
            meta = path_meta.get(rel, {})
            name = make_pt_name(meta.get("title", ""), rel.name)
            sz = (Path(args.input) / rel).stat().st_size
            total_bytes += sz
            print(f" [{fmt_size(sz):>10}] {name}")
        print(
            f"\n Total: {fmt_size(total_bytes)} across {len(pending)} videos")
        return
    # ── authenticate ──
    base = url.rstrip("/")
    if not base.startswith("http"):
        base = "https://" + base
    print(f"\n[+] Authenticating with {base} ...")
    token = get_oauth_token(base, username, password)
    print(f"[+] Authenticated as {username}")
    channel_id = get_channel_id(base, token, channel)
    print(f"[+] Channel: {channel} (id {channel_id})")
    name_counts = get_channel_video_names(base, token, channel)
    existing = set(name_counts)
    total = sum(name_counts.values())
    print(f"[+] Found {total} video(s) on channel ({len(name_counts)} unique name(s))")
    dupes = {name: count for name, count in name_counts.items() if count > 1}
    if dupes:
        print(f"[!] {len(dupes)} duplicate name(s) detected on channel:")
        for name, count in sorted(dupes.items()):
            print(f" x{count} {name}")
    # ── pre-reconcile: sweep all pending against channel names ────────
    # The main upload loop discovers already-uploaded videos lazily as it
    # walks the sorted pending list — meaning on a fresh run (no .uploaded
    # file) you won't know how many files are genuinely new until the loop
    # has processed everything.  Doing a full sweep here, before any
    # upload starts, gives an accurate count up-front and pre-populates
    # .uploaded so that interrupted/re-run sessions skip them instantly
    # without re-checking each time.
    pre_matched = []
    for rel in pending:
        if _channel_match(rel, path_meta, existing)[0]:
            pre_matched.append(rel)
    if pre_matched:
        print(f"\n[+] Pre-sweep: {len(pre_matched)} local file(s) already on channel — marking uploaded")
        for rel in pre_matched:
            mark_uploaded(args.input, rel)
        pending = [rel for rel in pending if rel not in set(pre_matched)]
        print(f"[+] {len(pending)} left to upload\n")
    nsfw = args.nsfw
    total_up = 0
    batch: list[tuple[str, str]] = []  # [(uuid, name), ...] awaiting transcoding
    try:
        for rel in pending:
            # ── flush batch if full ──
            # Wait for the previous batch to publish before uploading more,
            # so the server's disk isn't filled with untranscoded files.
            if not args.skip_wait and len(batch) >= args.batch_size:
                print(
                    f"\n[+] Waiting for {len(batch)} video(s) to finish processing ...")
                for uuid, bname in batch:
                    print(f"\n [{bname}]")
                    wait_for_published(base, token, uuid, args.poll_interval)
                batch.clear()
            filepath = Path(args.input) / rel
            meta = path_meta.get(rel, {})
            name = make_pt_name(meta.get("title", ""), rel.name)
            desc = clean_description(meta.get("description", ""))
            sz = filepath.stat().st_size
            # Re-check against the channel; names uploaded earlier in this
            # run are added to `existing` below.
            if _channel_match(rel, path_meta, existing)[0]:
                print(f"\n[skip] already on channel: {name}")
                mark_uploaded(args.input, rel)
                continue
            print(f"\n[{total_up + 1}/{len(pending)}] {name}")
            print(f" File: {rel} ({fmt_size(sz)})")
            ok, uuid = upload_video(
                base, token, channel_id, filepath, name, desc, nsfw)
            if not ok:
                # Failed uploads stay pending for the next run.
                continue
            print(f" Uploaded uuid={uuid}")
            mark_uploaded(args.input, rel)
            total_up += 1
            existing.add(name)
            if uuid:
                batch.append((uuid, name))
        # ── wait for final batch ──
        if batch and not args.skip_wait:
            print(f"\n[+] Waiting for final {len(batch)} video(s) ...")
            for uuid, bname in batch:
                print(f"\n [{bname}]")
                wait_for_published(base, token, uuid, args.poll_interval)
    except KeyboardInterrupt:
        # .uploaded already records everything finished, so a re-run resumes.
        print(
            f"\n\n[!] Interrupted after {total_up} uploads. Re-run to continue.")
        sys.exit(130)
    print(f"\n{'=' * 50}")
    print(f" Uploaded: {total_up} video(s)")
    print(" Done!")
    print(f"{'=' * 50}")
# Script entry point.
if __name__ == "__main__":
    main()