Compare commits


6 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| github-actions[bot] | 014df7d116 | chore: nightly index update [skip ci] | 2026-03-04 04:30:33 +00:00 |
| github-actions[bot] | 241cb051f6 | chore: nightly index update [skip ci] | 2026-03-02 04:49:26 +00:00 |
| github-actions[bot] | a51d2bd237 | chore: nightly index update [skip ci] | 2026-03-02 00:59:12 +00:00 |
| HugeFrog24 | c58814691e | OK | 2026-03-02 01:56:54 +01:00 |
| HugeFrog24 | 27f7beb8b7 | Caching added | 2026-03-01 13:06:24 +01:00 |
| HugeFrog24 | 4a5b0a6ee3 | Multi-gooner support because who knows | 2026-03-01 01:36:01 +01:00 |
10 changed files with 901 additions and 1977 deletions
-14

```diff
@@ -1,14 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(jq -r \".\"\"/api/v1/video-channels/{channelHandle}/videos\"\".get.parameters[] | $ref\")",
-      "Bash(grep '$ref')",
-      "Bash(grep -c \"^ ''/\" /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.yaml)",
-      "Bash(grep -c ' \"\"/' /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.json)",
-      "Bash(grep -c \"^ ''/\" /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.yaml)",
-      "Bash(git --no-pager diff HEAD~1 docs/external/peertube/openapi.json)",
-      "Bash(git --no-pager show HEAD~1:docs/external/peertube/openapi.json)",
-      "Bash(git --no-pager diff --name-only HEAD~1)"
-    ]
-  }
-}
```
+14

```diff
@@ -0,0 +1,14 @@
+---
+alwaysApply: true
+always_on: true
+trigger: always_on
+applyTo: "**"
+description: Snyk Security At Inception
+---
+
+# Project security best practices
+
+- Always run snyk_code_scan tool for new first party code that is generated in a Snyk-supported language.
+- If any security issues are found based on newly introduced or modified code or dependencies, attempt to fix the issues using the results context from Snyk.
+- Rescan the code after fixing the issues to ensure that the issues were fixed and that there are no newly introduced issues.
+- Repeat this process until no new issues are found.
```
+2 -2

````diff
@@ -44,7 +44,7 @@ Sites with no credentials are skipped automatically when running `python main.py
 ### 1. Scrape
-Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V, HLS/M3U8).
+Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V).
 ```bash
 python main.py   # scrape all sites you have credentials for
@@ -62,7 +62,7 @@ python download.py [options]
 Options:
   -o, --output DIR   Download directory (default: downloads)
   -t, --titles       Name files by post title
-  --original         Name files by original filename derived from the video URL (default)
+  --original         Name files by original CloudFront filename (default)
   --reorganize       Rename existing files to match current naming mode
   -w, --workers N    Concurrent downloads (default: 4)
   -n, --dry-run      Print what would be downloaded
````
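
The interception approach the README describes is straightforward to prototype. Below is a minimal sketch using Playwright's async API; the helper names `looks_like_video` and `collect_video_urls` are hypothetical (the real scraper's predicate, `_is_video_url`, appears in the diff further down but its body is not part of this compare):

```python
import asyncio
from urllib.parse import urlparse

from playwright.async_api import async_playwright

# Extensions listed in the README; the real predicate may differ.
VIDEO_EXTS = (".mp4", ".mov", ".webm", ".avi", ".m4v")


def looks_like_video(url: str) -> bool:
    """Match on the URL path only, so query strings don't hide the extension."""
    return urlparse(url).path.lower().endswith(VIDEO_EXTS)


async def collect_video_urls(page_url: str) -> set[str]:
    """Load a page in headless Firefox and record video-looking responses."""
    hits: set[str] = set()
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        page = await browser.new_page()
        # Every network response the page triggers passes through this callback.
        page.on("response", lambda r: hits.add(r.url) if looks_like_video(r.url) else None)
        await page.goto(page_url, wait_until="networkidle", timeout=60000)
        await browser.close()
    return hits


if __name__ == "__main__":
    print(asyncio.run(collect_video_urls("https://example.com/some-post/")))
```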
+6 -19

```diff
@@ -121,21 +121,17 @@ def save_video_map(

 def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
-    """Pure function: return {cdn_video_url: referer} from a flat video map.
+    """Pure function: return {cdn_video_url: site_referer} from a flat video map.

-    Bunny.net CDN URLs require https://player.mediadelivery.net/ as referer.
-    All other URLs use the scheme+netloc of the page they were found on.
+    The flat video map has page URLs as keys; the scheme+netloc of each page URL
+    is used as the Referer for all CDN video URLs found in that entry.
     """
     result: dict[str, str] = {}
     for page_url, entry in video_map.items():
         parsed = urlparse(page_url)
-        site_referer = f"{parsed.scheme}://{parsed.netloc}/"
+        referer = f"{parsed.scheme}://{parsed.netloc}/"
         for vid in cast(dict[str, Any], entry).get("videos", []):
-            vid_url = vid["url"]
-            if urlparse(vid_url).netloc.endswith(".b-cdn.net"):
-                result.setdefault(vid_url, "https://player.mediadelivery.net/")
-            else:
-                result.setdefault(vid_url, site_referer)
+            result.setdefault(vid["url"], referer)
     return result

@@ -151,17 +147,8 @@ def fmt_size(b: float | int) -> str:
     return f"{b:.1f} TB"

-def is_hls_url(url: str) -> bool:
-    """True if url is an HLS master playlist (.m3u8)."""
-    return urlparse(url).path.endswith(".m3u8")
-
 def url_to_filename(url: str) -> str:
-    path = PurePosixPath(urlparse(url).path)
-    # Bunny.net HLS: .../guid/playlist.m3u8 → guid.mp4
-    if path.name == "playlist.m3u8":
-        return unquote(path.parent.name) + ".mp4"
-    return unquote(path.name)
+    return unquote(PurePosixPath(urlparse(url).path).name)

 def find_clashes(urls: list[str]) -> dict[str, list[str]]:
```
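
For reference, the post-change `build_url_referers` maps every CDN video URL to the origin of the page it was scraped from. A small worked example, with a hypothetical map entry (page URL and CDN host are made up; the entry shape is inferred from the diff):

```python
# Hypothetical flat video map entry.
video_map = {
    "https://example.com/2026/02/some-post/": {
        "title": "Some post",
        "videos": [{"url": "https://d1234example.cloudfront.net/clip.mp4"}],
    },
}

# build_url_referers(video_map) would now return:
# {"https://d1234example.cloudfront.net/clip.mp4": "https://example.com/"}
```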
+857 -1412

File diff suppressed because one or more lines are too long
+7 -38

```diff
@@ -14,8 +14,6 @@ import argparse
 from pathlib import Path
 import re
 import shutil
-import subprocess
-import sys
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any
@@ -33,7 +31,6 @@ from check_clashes import (
     load_video_map,
     save_video_map,
     is_valid_url,
-    is_hls_url,
     VIDEO_MAP_FILE,
 )
 from config import SITES
@@ -209,30 +206,6 @@ def reorganize(

 # ── Download ─────────────────────────────────────────────────────────

-def download_hls(url: str, dest: Path, referer: str = "") -> tuple[str, int]:
-    """Download an HLS stream via yt-dlp. Returns (status, bytes_written)."""
-    dest.parent.mkdir(parents=True, exist_ok=True)
-    if dest.exists():
-        return "ok", 0
-    cmd = [
-        sys.executable, "-m", "yt_dlp",
-        "--quiet", "--no-warnings",
-        "--referer", referer or "https://player.mediadelivery.net/",
-        "-o", str(dest),
-        url,
-    ]
-    try:
-        proc = subprocess.run(cmd, capture_output=True, text=True)
-        if proc.returncode != 0:
-            lines = (proc.stderr or proc.stdout).strip().splitlines()
-            return f"error: {lines[-1] if lines else 'yt-dlp failed'}", 0
-        if not dest.exists():
-            return "error: output file missing after yt-dlp", 0
-        return "ok", dest.stat().st_size
-    except Exception as e:
-        return f"error: {e}", 0

 def download_one(
     session: requests.Session,
     url: str,
@@ -411,7 +384,7 @@ def main() -> None:
     saved = read_mode(args.output)
     mode_changed = saved is not None and saved != mode

-    print(f"[+] {len(urls)} video URLs from {VIDEO_MAP_FILE}")
+    print(f"[+] {len(urls)} MP4 URLs from {VIDEO_MAP_FILE}")
     print(f"[+] Naming mode: {mode}" + (" (changed!)" if mode_changed else ""))

     # Handle reorganize
@@ -472,7 +445,7 @@
     }
     newly_fetched: dict[str, int | None] = {}

-    uncached_pending = [u for u in pending if u not in cached_sizes and not is_hls_url(u)]
+    uncached_pending = [u for u in pending if u not in cached_sizes]
     session = make_session()
     if uncached_pending:
         print(
@@ -489,19 +462,17 @@
     total_bytes = sum(sized.values())
     print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")

-    already_sizes: dict[str, int | None] = {}
-    already_to_verify = [u for u in already if not is_hls_url(u)]
-    if already_to_verify:
-        uncached_already = [u for u in already_to_verify if u not in cached_sizes]
+    if already:
+        uncached_already = [u for u in already if u not in cached_sizes]
         if uncached_already:
             print(
-                f"[+] Verifying {len(already_to_verify)} existing files ({len(uncached_already)} uncached)…"
+                f"[+] Verifying {len(already)} existing files ({len(uncached_already)} uncached)…"
             )
             fetched_already = fetch_sizes(uncached_already, workers=20, url_referers=url_referers)
             newly_fetched.update(fetched_already)
-            already_sizes = {**cached_sizes, **fetched_already}
+            already_sizes: dict[str, int | None] = {**cached_sizes, **fetched_already}
         else:
-            print(f"[+] Verifying {len(already_to_verify)} existing files (all sizes cached)…")
+            print(f"[+] Verifying {len(already)} existing files (all sizes cached)…")
             already_sizes = dict(cached_sizes)

     mismatched = 0
@@ -534,8 +505,6 @@
     def do_download(url: str) -> tuple[str, tuple[str, int]]:
         dest = paths[url]
-        if is_hls_url(url):
-            return url, download_hls(url, dest, url_referers.get(url, ""))
         expected = remote_sizes.get(url)
         return url, download_one(
             session, url, dest, expected, url_referers.get(url, "")
```
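
`fetch_sizes` itself is not part of this compare, but the verification step above implies one size probe per uncached URL, with the per-URL referer applied. A minimal sketch of such a probe, assuming a plain `requests.Session`; the function name and signature are illustrative, not the project's actual API:

```python
import requests


def head_size(session: requests.Session, url: str, referer: str = "") -> int | None:
    """Return the remote Content-Length in bytes, or None if the server omits it."""
    headers = {"Referer": referer} if referer else {}
    resp = session.head(url, headers=headers, allow_redirects=True, timeout=30)
    length = resp.headers.get("Content-Length")
    return int(length) if length and length.isdigit() else None
```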
+3 -62

```diff
@@ -335,46 +335,6 @@ def extract_title_from_html(html: str) -> str | None:
     return None

-def _is_bunny_playlist(url: str) -> bool:
-    """True if url is the root Bunny.net HLS playlist (not a sub-playlist)."""
-    parsed = urlparse(url)
-    return (
-        parsed.netloc.endswith(".b-cdn.net")
-        and parsed.path.endswith("/playlist.m3u8")
-    )
-
-def _is_bunny_junk(url: str) -> bool:
-    """True if url is a Bunny.net CDN init segment (not a usable video URL)."""
-    parsed = urlparse(url)
-    return parsed.netloc.endswith(".b-cdn.net") and PurePosixPath(
-        parsed.path
-    ).name in {"init.mp4", "init.dmp4"}
-
-def extract_bunny_embed_url(html: str) -> str | None:
-    """Return a tokenless Bunny.net embed URL found in an iframe, or None."""
-    m = re.search(
-        r'<iframe[^>]+src="(https://player\.mediadelivery\.net/embed/[^"?]+)',
-        html,
-    )
-    return m.group(1) if m else None
-
-def _clear_junk_video_entries(video_map: dict[str, Any], site_key: str) -> int:
-    """Reset entries whose only stored videos are CDN init segments. Returns count fixed."""
-    cleared = 0
-    for entry in video_map.values():
-        videos = entry.get("videos", [])
-        if videos and all(_is_bunny_junk(v["url"]) for v in videos):
-            entry["videos"] = []
-            entry.pop("scraped_at", None)
-            cleared += 1
-    if cleared:
-        save_video_map(video_map, site_key)
-    return cleared

 MAX_RETRIES = 2
@@ -401,9 +361,7 @@ async def worker(
     page.on(
         "response",
-        lambda resp: video_hits.add(resp.url)
-        if (_is_video_url(resp.url) or _is_bunny_playlist(resp.url))
-        else None,
+        lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
     )

     try:
@@ -418,7 +376,7 @@
             print(f"[W{worker_id}] ({idx + 1}/{total}) {url}{label}")
             try:
-                await page.goto(url, wait_until="load", timeout=60000)
+                await page.goto(url, wait_until="networkidle", timeout=60000)
             except Exception as e:
                 print(f"[W{worker_id}] Navigation error: {e}")
                 if expects_video(url) and attempt < MAX_RETRIES:
@@ -493,29 +451,18 @@
             found = set(html_videos) | set(video_hits)
             video_hits.clear()
-            print(f"[W{worker_id}] network hits raw: {found or '(empty)'}")

             all_videos = [
                 m
                 for m in found
                 if is_valid_url(m)
-                and not _is_bunny_junk(m)
                 and m
                 not in (
                     f"{base_url}/wp-content/plugins/easy-video-player/lib/blank.mp4",
                 )
             ]

-            if not all_videos:
-                embed_url = extract_bunny_embed_url(html)
-                if embed_url:
-                    print(
-                        f"[W{worker_id}] No network hit — iframe fallback: {embed_url}"
-                    )
-                    all_videos = [embed_url]

             async with map_lock:
-                new_found = set(all_videos) - known
+                new_found = found - known
                 if new_found:
                     print(f"[W{worker_id}] Found {len(new_found)} new video URLs")
                     known.update(new_found)
@@ -572,12 +519,6 @@
     urls = load_post_urls(site_key, base_url, wp_api, req_headers)
     video_map = load_video_map(site_key)

-    junk_cleared = _clear_junk_video_entries(video_map, site_key)
-    if junk_cleared:
-        print(
-            f"[{site_key}] Cleared {junk_cleared} entries with junk CDN init segments — will re-scrape."
-        )
-
     if any(
         u not in video_map
         or not video_map[u].get("title")
```
-2

```diff
@@ -1,5 +1,3 @@
 playwright==1.58.0
 python-dotenv==1.2.1
 Requests==2.32.5
-yt-dlp>=2026.3.17
-pycryptodomex>=3.23.0
```
+2 -3

```diff
@@ -125,12 +125,11 @@ def get_channel_id(base: str, token: str, channel_name: str) -> int:

 def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
     """Paginate through the channel and return a Counter of video names."""
     counts: Counter[str] = Counter()
-    page_size = 25
     start = 0
     while True:
         r = requests.get(
             f"{base}/api/v1/video-channels/{channel_name}/videos",
-            params={"start": start, "count": page_size},
+            params={"start": start, "count": 100},
             headers=api_headers(token),
             timeout=30,
         )
@@ -138,7 +137,7 @@ def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter
         data = r.json()
         for v in data.get("data", []):
             counts[v["name"]] += 1
-        start += page_size
+        start += 100
         if start >= data.get("total", 0):
             break
     return counts
```
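
Since `get_channel_video_names` returns a `Counter`, spotting duplicate uploads is a one-liner. A usage sketch with made-up instance URL, token, and channel handle; the jump to a page size of 100 is plausibly because, as far as I know, 100 is the largest `count` PeerTube's REST API accepts per page:

```python
# Hypothetical values; substitute a real instance, OAuth token, and channel.
counts = get_channel_video_names("https://peertube.example.com", "TOKEN", "my_channel")
dupes = {name: n for name, n in counts.items() if n > 1}
print(f"{sum(counts.values())} videos total, {len(dupes)} duplicated names")
```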
+10 -425

File diff suppressed because one or more lines are too long