Compare commits


34 Commits

Author SHA1 Message Date
github-actions[bot] 1360ef781c chore: nightly index update [skip ci] 2026-05-01 06:17:34 +00:00
github-actions[bot] 8db1c640dd chore: nightly index update [skip ci] 2026-04-30 06:00:15 +00:00
github-actions[bot] bf0efef146 chore: nightly index update [skip ci] 2026-04-29 05:55:43 +00:00
github-actions[bot] 75727100df chore: nightly index update [skip ci] 2026-04-28 06:08:26 +00:00
github-actions[bot] e5ddda0181 chore: nightly index update [skip ci] 2026-04-27 05:55:45 +00:00
github-actions[bot] 04c64810dc chore: nightly index update [skip ci] 2026-04-25 12:50:53 +00:00
github-actions[bot] 78c3cf73b1 chore: nightly index update [skip ci] 2026-04-25 05:13:07 +00:00
github-actions[bot] cadecd8900 chore: nightly index update [skip ci] 2026-04-24 05:36:52 +00:00
github-actions[bot] 6ae4eb9e40 chore: nightly index update [skip ci] 2026-04-23 05:31:46 +00:00
github-actions[bot] e2627df1a3 chore: nightly index update [skip ci] 2026-04-22 05:27:39 +00:00
github-actions[bot] b9bea64471 chore: nightly index update [skip ci] 2026-04-21 05:28:56 +00:00
github-actions[bot] a9e38d8ba9 chore: nightly index update [skip ci] 2026-04-20 05:43:15 +00:00
github-actions[bot] 52e4c8c043 chore: nightly index update [skip ci] 2026-04-19 05:30:50 +00:00
github-actions[bot] ca7ac31e8c chore: nightly index update [skip ci] 2026-04-18 05:08:21 +00:00
github-actions[bot] cf8e6ec2d7 chore: nightly index update [skip ci] 2026-04-17 05:29:40 +00:00
github-actions[bot] cb7cf6e63e chore: nightly index update [skip ci] 2026-04-16 05:32:45 +00:00
github-actions[bot] 01d7006094 chore: nightly index update [skip ci] 2026-04-15 05:25:41 +00:00
github-actions[bot] a9610253c6 chore: nightly index update [skip ci] 2026-04-14 05:25:10 +00:00
github-actions[bot] b92d1ac858 chore: nightly index update [skip ci] 2026-04-13 05:43:41 +00:00
github-actions[bot] bf4518981b chore: nightly index update [skip ci] 2026-04-11 04:56:50 +00:00
github-actions[bot] 97167e5510 chore: nightly index update [skip ci] 2026-04-09 05:11:04 +00:00
github-actions[bot] 8621c7565d chore: nightly index update [skip ci] 2026-04-07 05:11:07 +00:00
github-actions[bot] 8ed96b7791 chore: nightly index update [skip ci] 2026-04-04 04:51:21 +00:00
github-actions[bot] 3433b91f37 chore: nightly index update [skip ci] 2026-04-02 05:04:06 +00:00
github-actions[bot] 5b2b9d2ed8 chore: nightly index update [skip ci] 2026-04-01 05:25:40 +00:00
github-actions[bot] 9aba91274b chore: nightly index update [skip ci] 2026-03-30 05:25:52 +00:00
github-actions[bot] c93cc5ff4f chore: nightly index update [skip ci] 2026-03-28 04:53:41 +00:00
github-actions[bot] 9f6e56725b chore: nightly index update [skip ci] 2026-03-26 05:07:11 +00:00
github-actions[bot] 7f0692bdb5 chore: nightly index update [skip ci] 2026-03-23 05:04:00 +00:00
github-actions[bot] 57b91cc708 chore: nightly index update [skip ci] 2026-03-22 04:53:11 +00:00
github-actions[bot] 42ece92669 chore: nightly index update [skip ci] 2026-03-21 04:27:27 +00:00
HugeFrog24 0525bd3139 feat: add Bunny.net HLS support to scraper and downloader (yt-dlp + pycryptodomex) 2026-03-19 08:39:11 +01:00
github-actions[bot] 904d79eb35 chore: nightly index update [skip ci] 2026-03-16 05:21:32 +00:00
HugeFrog24 112ea70c00 Fixed pagination bug 2026-03-14 06:49:05 +01:00
10 changed files with 1938 additions and 892 deletions
+14
@@ -0,0 +1,14 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(jq -r \".\"\"/api/v1/video-channels/{channelHandle}/videos\"\".get.parameters[] | $ref\")",
+      "Bash(grep '$ref')",
+      "Bash(grep -c \"^ ''/\" /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.yaml)",
+      "Bash(grep -c ' \"\"/' /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.json)",
+      "Bash(grep -c \"^ ''/\" /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.yaml)",
+      "Bash(git --no-pager diff HEAD~1 docs/external/peertube/openapi.json)",
+      "Bash(git --no-pager show HEAD~1:docs/external/peertube/openapi.json)",
+      "Bash(git --no-pager diff --name-only HEAD~1)"
+    ]
+  }
+}
-14
@@ -1,14 +0,0 @@
----
-alwaysApply: true
-always_on: true
-trigger: always_on
-applyTo: "**"
-description: Snyk Security At Inception
----
-
-# Project security best practices
-
-- Always run snyk_code_scan tool for new first party code that is generated in a Snyk-supported language.
-- If any security issues are found based on newly introduced or modified code or dependencies, attempt to fix the issues using the results context from Snyk.
-- Rescan the code after fixing the issues to ensure that the issues were fixed and that there are no newly introduced issues.
-- Repeat this process until no new issues are found.
+2 -2
@@ -44,7 +44,7 @@ Sites with no credentials are skipped automatically when running `python main.py
 ### 1. Scrape
 
-Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V).
+Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V, HLS/M3U8).
 
 ```bash
 python main.py        # scrape all sites you have credentials for
@@ -62,7 +62,7 @@ python download.py [options]
 Options:
   -o, --output DIR   Download directory (default: downloads)
   -t, --titles       Name files by post title
-  --original         Name files by original CloudFront filename (default)
+  --original         Name files by original filename derived from the video URL (default)
   --reorganize       Rename existing files to match current naming mode
   -w, --workers N    Concurrent downloads (default: 4)
  -n, --dry-run      Print what would be downloaded
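
The scrape hunk above says post URLs come from the WordPress REST API before the browser pass. A minimal sketch of that discovery loop, assuming the stock `/wp-json/wp/v2/posts` endpoint; `fetch_post_urls` and the example site are illustrative names, not the repo's actual code:

```python
# Hypothetical sketch of WordPress REST post discovery (not the repo's code).
import requests

def fetch_post_urls(base_url: str, per_page: int = 100) -> list[str]:
    """Page through /wp-json/wp/v2/posts and collect every post link."""
    urls: list[str] = []
    page = 1
    while True:
        r = requests.get(
            f"{base_url}/wp-json/wp/v2/posts",
            params={"per_page": per_page, "page": page, "_fields": "link"},
            timeout=30,
        )
        if r.status_code == 400:  # WordPress returns 400 once `page` runs past the end
            break
        r.raise_for_status()
        batch = r.json()
        if not batch:
            break
        urls.extend(p["link"] for p in batch)
        page += 1
    return urls
```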
+19 -6
@@ -121,17 +121,21 @@ def save_video_map(
 def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
-    """Pure function: return {cdn_video_url: site_referer} from a flat video map.
+    """Pure function: return {cdn_video_url: referer} from a flat video map.
 
-    The flat video map has page URLs as keys; the scheme+netloc of each page URL
-    is used as the Referer for all CDN video URLs found in that entry.
+    Bunny.net CDN URLs require https://player.mediadelivery.net/ as referer.
+    All other URLs use the scheme+netloc of the page they were found on.
     """
     result: dict[str, str] = {}
     for page_url, entry in video_map.items():
         parsed = urlparse(page_url)
-        referer = f"{parsed.scheme}://{parsed.netloc}/"
+        site_referer = f"{parsed.scheme}://{parsed.netloc}/"
         for vid in cast(dict[str, Any], entry).get("videos", []):
-            result.setdefault(vid["url"], referer)
+            vid_url = vid["url"]
+            if urlparse(vid_url).netloc.endswith(".b-cdn.net"):
+                result.setdefault(vid_url, "https://player.mediadelivery.net/")
+            else:
+                result.setdefault(vid_url, site_referer)
     return result
@@ -147,8 +151,17 @@ def fmt_size(b: float | int) -> str:
     return f"{b:.1f} TB"
 
 
+def is_hls_url(url: str) -> bool:
+    """True if url is an HLS master playlist (.m3u8)."""
+    return urlparse(url).path.endswith(".m3u8")
+
+
 def url_to_filename(url: str) -> str:
-    return unquote(PurePosixPath(urlparse(url).path).name)
+    path = PurePosixPath(urlparse(url).path)
+    # Bunny.net HLS: .../guid/playlist.m3u8 → guid.mp4
+    if path.name == "playlist.m3u8":
+        return unquote(path.parent.name) + ".mp4"
+    return unquote(path.name)
 
 
 def find_clashes(urls: list[str]) -> dict[str, list[str]]:
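
Taken together, the two hunks above determine both the Referer sent for each video URL and the filename it is saved under. A quick sanity sketch of the intended behavior, assuming `check_clashes` is importable from the repo root; every URL below is made up:

```python
# Illustrative check of the new Bunny.net handling (all URLs are fabricated).
from check_clashes import build_url_referers, is_hls_url, url_to_filename

bunny = "https://vz-example.b-cdn.net/0f3a9cde-1111/playlist.m3u8"
mp4 = "https://cdn.example.com/videos/clip%20one.mp4"

assert is_hls_url(bunny) and not is_hls_url(mp4)
assert url_to_filename(bunny) == "0f3a9cde-1111.mp4"  # guid.mp4 per the new rule
assert url_to_filename(mp4) == "clip one.mp4"         # unquoted basename, as before

video_map = {"https://site.example/post/1": {"videos": [{"url": bunny}, {"url": mp4}]}}
refs = build_url_referers(video_map)
assert refs[bunny] == "https://player.mediadelivery.net/"  # fixed player referer
assert refs[mp4] == "https://site.example/"                # page origin, as before
```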
+1399 -844
File diff suppressed because one or more lines are too long
+38 -7
@@ -14,6 +14,8 @@ import argparse
 from pathlib import Path
 import re
 import shutil
+import subprocess
+import sys
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any
@@ -31,6 +33,7 @@ from check_clashes import (
     load_video_map,
     save_video_map,
     is_valid_url,
+    is_hls_url,
     VIDEO_MAP_FILE,
 )
 from config import SITES
@@ -206,6 +209,30 @@ def reorganize(
 # ── Download ─────────────────────────────────────────────────────────
 
 
+def download_hls(url: str, dest: Path, referer: str = "") -> tuple[str, int]:
+    """Download an HLS stream via yt-dlp. Returns (status, bytes_written)."""
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    if dest.exists():
+        return "ok", 0
+    cmd = [
+        sys.executable, "-m", "yt_dlp",
+        "--quiet", "--no-warnings",
+        "--referer", referer or "https://player.mediadelivery.net/",
+        "-o", str(dest),
+        url,
+    ]
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True)
+        if proc.returncode != 0:
+            lines = (proc.stderr or proc.stdout).strip().splitlines()
+            return f"error: {lines[-1] if lines else 'yt-dlp failed'}", 0
+        if not dest.exists():
+            return "error: output file missing after yt-dlp", 0
+        return "ok", dest.stat().st_size
+    except Exception as e:
+        return f"error: {e}", 0
+
+
 def download_one(
     session: requests.Session,
     url: str,
@@ -384,7 +411,7 @@ def main() -> None:
     saved = read_mode(args.output)
     mode_changed = saved is not None and saved != mode
 
-    print(f"[+] {len(urls)} MP4 URLs from {VIDEO_MAP_FILE}")
+    print(f"[+] {len(urls)} video URLs from {VIDEO_MAP_FILE}")
     print(f"[+] Naming mode: {mode}" + (" (changed!)" if mode_changed else ""))
 
     # Handle reorganize
@@ -445,7 +472,7 @@
     }
     newly_fetched: dict[str, int | None] = {}
 
-    uncached_pending = [u for u in pending if u not in cached_sizes]
+    uncached_pending = [u for u in pending if u not in cached_sizes and not is_hls_url(u)]
     session = make_session()
     if uncached_pending:
         print(
@@ -462,17 +489,19 @@
     total_bytes = sum(sized.values())
     print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")
 
     if already:
-        uncached_already = [u for u in already if u not in cached_sizes]
+        already_sizes: dict[str, int | None] = {}
+        already_to_verify = [u for u in already if not is_hls_url(u)]
+        if already_to_verify:
+            uncached_already = [u for u in already_to_verify if u not in cached_sizes]
         if uncached_already:
             print(
-                f"[+] Verifying {len(already)} existing files ({len(uncached_already)} uncached)…"
+                f"[+] Verifying {len(already_to_verify)} existing files ({len(uncached_already)} uncached)…"
             )
             fetched_already = fetch_sizes(uncached_already, workers=20, url_referers=url_referers)
             newly_fetched.update(fetched_already)
-            already_sizes: dict[str, int | None] = {**cached_sizes, **fetched_already}
+            already_sizes = {**cached_sizes, **fetched_already}
         else:
-            print(f"[+] Verifying {len(already)} existing files (all sizes cached)…")
+            print(f"[+] Verifying {len(already_to_verify)} existing files (all sizes cached)…")
             already_sizes = dict(cached_sizes)
 
         mismatched = 0
@@ -505,6 +534,8 @@
 
     def do_download(url: str) -> tuple[str, tuple[str, int]]:
         dest = paths[url]
+        if is_hls_url(url):
+            return url, download_hls(url, dest, url_referers.get(url, ""))
         expected = remote_sizes.get(url)
         return url, download_one(
             session, url, dest, expected, url_referers.get(url, "")
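
For reference, the subprocess that `download_hls` assembles is equivalent to the standalone call below; the playlist URL and output path are placeholders. Invoking yt-dlp as `sys.executable -m yt_dlp` keeps it inside the same environment that installed the new requirements.

```python
# Standalone equivalent of the yt-dlp call download_hls builds (URL is made up).
import subprocess
import sys
from pathlib import Path

dest = Path("downloads/0f3a9cde-1111.mp4")
dest.parent.mkdir(parents=True, exist_ok=True)
subprocess.run(
    [
        sys.executable, "-m", "yt_dlp",
        "--quiet", "--no-warnings",
        "--referer", "https://player.mediadelivery.net/",  # Bunny player referer
        "-o", str(dest),
        "https://vz-example.b-cdn.net/0f3a9cde-1111/playlist.m3u8",
    ],
    check=True,
)
```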
+62 -3
@@ -335,6 +335,46 @@ def extract_title_from_html(html: str) -> str | None:
     return None
 
 
+def _is_bunny_playlist(url: str) -> bool:
+    """True if url is the root Bunny.net HLS playlist (not a sub-playlist)."""
+    parsed = urlparse(url)
+    return (
+        parsed.netloc.endswith(".b-cdn.net")
+        and parsed.path.endswith("/playlist.m3u8")
+    )
+
+
+def _is_bunny_junk(url: str) -> bool:
+    """True if url is a Bunny.net CDN init segment (not a usable video URL)."""
+    parsed = urlparse(url)
+    return parsed.netloc.endswith(".b-cdn.net") and PurePosixPath(
+        parsed.path
+    ).name in {"init.mp4", "init.dmp4"}
+
+
+def extract_bunny_embed_url(html: str) -> str | None:
+    """Return a tokenless Bunny.net embed URL found in an iframe, or None."""
+    m = re.search(
+        r'<iframe[^>]+src="(https://player\.mediadelivery\.net/embed/[^"?]+)',
+        html,
+    )
+    return m.group(1) if m else None
+
+
+def _clear_junk_video_entries(video_map: dict[str, Any], site_key: str) -> int:
+    """Reset entries whose only stored videos are CDN init segments. Returns count fixed."""
+    cleared = 0
+    for entry in video_map.values():
+        videos = entry.get("videos", [])
+        if videos and all(_is_bunny_junk(v["url"]) for v in videos):
+            entry["videos"] = []
+            entry.pop("scraped_at", None)
+            cleared += 1
+    if cleared:
+        save_video_map(video_map, site_key)
+    return cleared
+
+
 MAX_RETRIES = 2
@@ -361,7 +401,9 @@
     page.on(
         "response",
-        lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
+        lambda resp: video_hits.add(resp.url)
+        if (_is_video_url(resp.url) or _is_bunny_playlist(resp.url))
+        else None,
     )
 
     try:
@@ -376,7 +418,7 @@
             print(f"[W{worker_id}] ({idx + 1}/{total}) {url}{label}")
 
             try:
-                await page.goto(url, wait_until="networkidle", timeout=60000)
+                await page.goto(url, wait_until="load", timeout=60000)
             except Exception as e:
                 print(f"[W{worker_id}] Navigation error: {e}")
                 if expects_video(url) and attempt < MAX_RETRIES:
@@ -451,18 +493,29 @@
             found = set(html_videos) | set(video_hits)
             video_hits.clear()
             print(f"[W{worker_id}] network hits raw: {found or '(empty)'}")
 
             all_videos = [
                 m
                 for m in found
                 if is_valid_url(m)
+                and not _is_bunny_junk(m)
                 and m
                 not in (
                     f"{base_url}/wp-content/plugins/easy-video-player/lib/blank.mp4",
                 )
             ]
 
+            if not all_videos:
+                embed_url = extract_bunny_embed_url(html)
+                if embed_url:
+                    print(
+                        f"[W{worker_id}] No network hit — iframe fallback: {embed_url}"
+                    )
+                    all_videos = [embed_url]
+
             async with map_lock:
-                new_found = found - known
+                new_found = set(all_videos) - known
                 if new_found:
                     print(f"[W{worker_id}] Found {len(new_found)} new video URLs")
                     known.update(new_found)
@@ -519,6 +572,12 @@
     urls = load_post_urls(site_key, base_url, wp_api, req_headers)
     video_map = load_video_map(site_key)
 
+    junk_cleared = _clear_junk_video_entries(video_map, site_key)
+    if junk_cleared:
+        print(
+            f"[{site_key}] Cleared {junk_cleared} entries with junk CDN init segments — will re-scrape."
+        )
+
     if any(
         u not in video_map
         or not video_map[u].get("title")
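
A self-contained illustration of the iframe fallback added above: when no video request is intercepted, the tokenless embed URL is pulled straight out of the page HTML. The regex is the one from `extract_bunny_embed_url`; the HTML snippet is fabricated.

```python
# Demo of the iframe-fallback regex on fabricated HTML.
import re

html = '''
<iframe src="https://player.mediadelivery.net/embed/123/0f3a9cde-1111?autoplay=false"
        allowfullscreen></iframe>
'''

m = re.search(
    r'<iframe[^>]+src="(https://player\.mediadelivery\.net/embed/[^"?]+)',
    html,
)
print(m.group(1) if m else None)
# -> https://player.mediadelivery.net/embed/123/0f3a9cde-1111 (token query dropped)
```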
+2
@@ -1,3 +1,5 @@
 playwright==1.58.0
 python-dotenv==1.2.1
 Requests==2.32.5
+yt-dlp>=2026.3.17
+pycryptodomex>=3.23.0
+3 -2
@@ -125,11 +125,12 @@ def get_channel_id(base: str, token: str, channel_name: str) -> int:
 def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
     """Paginate through the channel and return a Counter of video names."""
     counts: Counter[str] = Counter()
+    page_size = 25
     start = 0
     while True:
         r = requests.get(
             f"{base}/api/v1/video-channels/{channel_name}/videos",
-            params={"start": start, "count": 100},
+            params={"start": start, "count": page_size},
             headers=api_headers(token),
             timeout=30,
         )
@@ -137,7 +138,7 @@ def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter
         data = r.json()
         for v in data.get("data", []):
             counts[v["name"]] += 1
-        start += 100
+        start += page_size
         if start >= data.get("total", 0):
             break
     return counts
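
The bug being fixed: the old loop requested `count=100`, but a server that caps the page size (the fix suggests 25 rows here) returns fewer items, and advancing `start` by the requested 100 then skips everything past the cap in each window. Tying the request size and the stride to one `page_size` removes the mismatch. A more defensive variant, sketched below under the assumption that `api_headers` builds a bearer-token header, strides by the number of rows actually returned:

```python
# Defensive pagination sketch (not the repo's code): advance by len(rows) so a
# server-side cap on `count` can never skip items.
from collections import Counter
import requests

def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
    counts: Counter[str] = Counter()
    start = 0
    while True:
        r = requests.get(
            f"{base}/api/v1/video-channels/{channel_name}/videos",
            params={"start": start, "count": 25},
            headers={"Authorization": f"Bearer {token}"},  # assumed api_headers() shape
            timeout=30,
        )
        r.raise_for_status()
        rows = r.json().get("data", [])
        if not rows:  # empty page ends the loop even if `total` is stale
            break
        for v in rows:
            counts[v["name"]] += 1
        start += len(rows)  # stride by what actually arrived
    return counts
```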
+399 -14
File diff suppressed because one or more lines are too long