mirror of https://github.com/HugeFrog24/jailbirdz-dl.git (synced 2026-05-01 09:02:19 +00:00)
Compare commits: main...a51d2bd237 (4 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | a51d2bd237 |  |
|  | c58814691e |  |
|  | 27f7beb8b7 |  |
|  | 4a5b0a6ee3 |  |
```diff
@@ -1,14 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(jq -r \".\"\"/api/v1/video-channels/{channelHandle}/videos\"\".get.parameters[] | $ref\")",
-      "Bash(grep '$ref')",
-      "Bash(grep -c \"^ ''/\" /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.yaml)",
-      "Bash(grep -c ' \"\"/' /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.json)",
-      "Bash(grep -c \"^ ''/\" /c/Users/admin/Desktop/jailbirdz-dl/docs/external/peertube/openapi.yaml)",
-      "Bash(git --no-pager diff HEAD~1 docs/external/peertube/openapi.json)",
-      "Bash(git --no-pager show HEAD~1:docs/external/peertube/openapi.json)",
-      "Bash(git --no-pager diff --name-only HEAD~1)"
-    ]
-  }
-}
```
```diff
@@ -0,0 +1,14 @@
+---
+alwaysApply: true
+always_on: true
+trigger: always_on
+applyTo: "**"
+description: Snyk Security At Inception
+---
+
+# Project security best practices
+
+- Always run snyk_code_scan tool for new first party code that is generated in a Snyk-supported language.
+- If any security issues are found based on newly introduced or modified code or dependencies, attempt to fix the issues using the results context from Snyk.
+- Rescan the code after fixing the issues to ensure that the issues were fixed and that there are no newly introduced issues.
+- Repeat this process until no new issues are found.
```
````diff
@@ -44,7 +44,7 @@ Sites with no credentials are skipped automatically when running `python main.py
 
 ### 1. Scrape
 
-Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V, HLS/M3U8).
+Discovers all post URLs via the WordPress REST API, then visits each page with a headless Firefox browser to intercept video network requests (MP4, MOV, WebM, AVI, M4V).
 
 ```bash
 python main.py          # scrape all sites you have credentials for
````
```diff
@@ -62,7 +62,7 @@ python download.py [options]
 Options:
   -o, --output DIR    Download directory (default: downloads)
   -t, --titles        Name files by post title
-  --original          Name files by original filename derived from the video URL (default)
+  --original          Name files by original CloudFront filename (default)
   --reorganize        Rename existing files to match current naming mode
   -w, --workers N     Concurrent downloads (default: 4)
   -n, --dry-run       Print what would be downloaded
```
+6 -19

```diff
@@ -121,21 +121,17 @@ def save_video_map(
 
 
 def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
-    """Pure function: return {cdn_video_url: referer} from a flat video map.
+    """Pure function: return {cdn_video_url: site_referer} from a flat video map.
 
-    Bunny.net CDN URLs require https://player.mediadelivery.net/ as referer.
-    All other URLs use the scheme+netloc of the page they were found on.
+    The flat video map has page URLs as keys; the scheme+netloc of each page URL
+    is used as the Referer for all CDN video URLs found in that entry.
     """
     result: dict[str, str] = {}
     for page_url, entry in video_map.items():
         parsed = urlparse(page_url)
-        site_referer = f"{parsed.scheme}://{parsed.netloc}/"
+        referer = f"{parsed.scheme}://{parsed.netloc}/"
         for vid in cast(dict[str, Any], entry).get("videos", []):
-            vid_url = vid["url"]
-            if urlparse(vid_url).netloc.endswith(".b-cdn.net"):
-                result.setdefault(vid_url, "https://player.mediadelivery.net/")
-            else:
-                result.setdefault(vid_url, site_referer)
+            result.setdefault(vid["url"], referer)
     return result
 
 
```
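The reverted helper maps every video URL to its page's origin, with no per-CDN special case. A minimal standalone sketch of the post-revert behavior, using hypothetical data (the `demo` map and its URLs are made up for illustration):

```python
from typing import Any
from urllib.parse import urlparse


def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
    # Mirrors the post-revert logic above: one Referer per page, derived
    # from the page URL's scheme+netloc.
    result: dict[str, str] = {}
    for page_url, entry in video_map.items():
        parsed = urlparse(page_url)
        referer = f"{parsed.scheme}://{parsed.netloc}/"
        for vid in entry.get("videos", []):
            result.setdefault(vid["url"], referer)
    return result


# Hypothetical flat video map: page URL -> entry with a "videos" list.
demo = {
    "https://example-site.com/post/42/": {
        "videos": [{"url": "https://d123.cloudfront.net/clip.mp4"}],
    },
}
print(build_url_referers(demo))
# {'https://d123.cloudfront.net/clip.mp4': 'https://example-site.com/'}
```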
```diff
@@ -151,17 +147,8 @@ def fmt_size(b: float | int) -> str:
     return f"{b:.1f} TB"
 
 
-def is_hls_url(url: str) -> bool:
-    """True if url is an HLS master playlist (.m3u8)."""
-    return urlparse(url).path.endswith(".m3u8")
-
-
 def url_to_filename(url: str) -> str:
-    path = PurePosixPath(urlparse(url).path)
-    # Bunny.net HLS: .../guid/playlist.m3u8 → guid.mp4
-    if path.name == "playlist.m3u8":
-        return unquote(path.parent.name) + ".mp4"
-    return unquote(path.name)
+    return unquote(PurePosixPath(urlparse(url).path).name)
 
 
 def find_clashes(urls: list[str]) -> dict[str, list[str]]:
```
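The simplified `url_to_filename` is now a plain basename extraction. A quick illustration with a hypothetical URL:

```python
from pathlib import PurePosixPath
from urllib.parse import unquote, urlparse


def url_to_filename(url: str) -> str:
    # Post-revert behavior: percent-decoded basename of the URL path.
    return unquote(PurePosixPath(urlparse(url).path).name)


print(url_to_filename("https://d123.cloudfront.net/My%20Clip.mp4"))  # My Clip.mp4
# The removed branch special-cased Bunny.net HLS playlists, turning
# .../<guid>/playlist.m3u8 into <guid>.mp4; those URLs would now just
# yield "playlist.m3u8", consistent with the HLS paths being removed.
```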
Vendored: +857 -1412 (file diff suppressed because one or more lines are too long)
+7 -38

```diff
@@ -14,8 +14,6 @@ import argparse
 from pathlib import Path
 import re
 import shutil
-import subprocess
-import sys
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any
```
```diff
@@ -33,7 +31,6 @@ from check_clashes import (
     load_video_map,
     save_video_map,
     is_valid_url,
-    is_hls_url,
     VIDEO_MAP_FILE,
 )
 from config import SITES
```
```diff
@@ -209,30 +206,6 @@ def reorganize(
 # ── Download ─────────────────────────────────────────────────────────
 
 
-def download_hls(url: str, dest: Path, referer: str = "") -> tuple[str, int]:
-    """Download an HLS stream via yt-dlp. Returns (status, bytes_written)."""
-    dest.parent.mkdir(parents=True, exist_ok=True)
-    if dest.exists():
-        return "ok", 0
-    cmd = [
-        sys.executable, "-m", "yt_dlp",
-        "--quiet", "--no-warnings",
-        "--referer", referer or "https://player.mediadelivery.net/",
-        "-o", str(dest),
-        url,
-    ]
-    try:
-        proc = subprocess.run(cmd, capture_output=True, text=True)
-        if proc.returncode != 0:
-            lines = (proc.stderr or proc.stdout).strip().splitlines()
-            return f"error: {lines[-1] if lines else 'yt-dlp failed'}", 0
-        if not dest.exists():
-            return "error: output file missing after yt-dlp", 0
-        return "ok", dest.stat().st_size
-    except Exception as e:
-        return f"error: {e}", 0
-
-
 def download_one(
     session: requests.Session,
     url: str,
```
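For context, the removed `download_hls` helper was a thin wrapper around yt-dlp invoked as a module. Roughly equivalent standalone call, assuming yt-dlp is installed; the playlist URL and output path are hypothetical:

```python
import subprocess
import sys

# What the removed helper effectively executed for each HLS URL:
proc = subprocess.run(
    [
        sys.executable, "-m", "yt_dlp",
        "--quiet", "--no-warnings",
        "--referer", "https://player.mediadelivery.net/",
        "-o", "downloads/guid.mp4",
        "https://example.b-cdn.net/guid/playlist.m3u8",
    ],
    capture_output=True,
    text=True,
)
print("ok" if proc.returncode == 0 else proc.stderr.strip().splitlines()[-1])
```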
```diff
@@ -411,7 +384,7 @@ def main() -> None:
     saved = read_mode(args.output)
     mode_changed = saved is not None and saved != mode
 
-    print(f"[+] {len(urls)} video URLs from {VIDEO_MAP_FILE}")
+    print(f"[+] {len(urls)} MP4 URLs from {VIDEO_MAP_FILE}")
     print(f"[+] Naming mode: {mode}" + (" (changed!)" if mode_changed else ""))
 
     # Handle reorganize
```
```diff
@@ -472,7 +445,7 @@ def main() -> None:
     }
 
     newly_fetched: dict[str, int | None] = {}
-    uncached_pending = [u for u in pending if u not in cached_sizes and not is_hls_url(u)]
+    uncached_pending = [u for u in pending if u not in cached_sizes]
     session = make_session()
     if uncached_pending:
         print(
```
```diff
@@ -489,19 +462,17 @@ def main() -> None:
         total_bytes = sum(sized.values())
         print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")
 
-    already_sizes: dict[str, int | None] = {}
-    already_to_verify = [u for u in already if not is_hls_url(u)]
-    if already_to_verify:
-        uncached_already = [u for u in already_to_verify if u not in cached_sizes]
+    if already:
+        uncached_already = [u for u in already if u not in cached_sizes]
         if uncached_already:
             print(
-                f"[+] Verifying {len(already_to_verify)} existing files ({len(uncached_already)} uncached)…"
+                f"[+] Verifying {len(already)} existing files ({len(uncached_already)} uncached)…"
             )
             fetched_already = fetch_sizes(uncached_already, workers=20, url_referers=url_referers)
             newly_fetched.update(fetched_already)
-            already_sizes = {**cached_sizes, **fetched_already}
+            already_sizes: dict[str, int | None] = {**cached_sizes, **fetched_already}
         else:
-            print(f"[+] Verifying {len(already_to_verify)} existing files (all sizes cached)…")
+            print(f"[+] Verifying {len(already)} existing files (all sizes cached)…")
             already_sizes = dict(cached_sizes)
 
     mismatched = 0
```
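One detail worth noting in the verification flow above: `{**cached_sizes, **fetched_already}` merges the two size maps with later keys winning, so a freshly fetched size overrides a stale cached one for the same URL. A short illustration with hypothetical values:

```python
cached_sizes = {"https://cdn.example/a.mp4": 1000, "https://cdn.example/b.mp4": 2000}
fetched_already = {"https://cdn.example/b.mp4": 2500}

# Later unpacking wins on key collisions, so the fresh size for b.mp4 is kept.
already_sizes = {**cached_sizes, **fetched_already}
assert already_sizes["https://cdn.example/b.mp4"] == 2500
```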
```diff
@@ -534,8 +505,6 @@ def main() -> None:
 
     def do_download(url: str) -> tuple[str, tuple[str, int]]:
         dest = paths[url]
-        if is_hls_url(url):
-            return url, download_hls(url, dest, url_referers.get(url, ""))
         expected = remote_sizes.get(url)
         return url, download_one(
             session, url, dest, expected, url_referers.get(url, "")
```
```diff
@@ -335,46 +335,6 @@ def extract_title_from_html(html: str) -> str | None:
     return None
 
 
-def _is_bunny_playlist(url: str) -> bool:
-    """True if url is the root Bunny.net HLS playlist (not a sub-playlist)."""
-    parsed = urlparse(url)
-    return (
-        parsed.netloc.endswith(".b-cdn.net")
-        and parsed.path.endswith("/playlist.m3u8")
-    )
-
-
-def _is_bunny_junk(url: str) -> bool:
-    """True if url is a Bunny.net CDN init segment (not a usable video URL)."""
-    parsed = urlparse(url)
-    return parsed.netloc.endswith(".b-cdn.net") and PurePosixPath(
-        parsed.path
-    ).name in {"init.mp4", "init.dmp4"}
-
-
-def extract_bunny_embed_url(html: str) -> str | None:
-    """Return a tokenless Bunny.net embed URL found in an iframe, or None."""
-    m = re.search(
-        r'<iframe[^>]+src="(https://player\.mediadelivery\.net/embed/[^"?]+)',
-        html,
-    )
-    return m.group(1) if m else None
-
-
-def _clear_junk_video_entries(video_map: dict[str, Any], site_key: str) -> int:
-    """Reset entries whose only stored videos are CDN init segments. Returns count fixed."""
-    cleared = 0
-    for entry in video_map.values():
-        videos = entry.get("videos", [])
-        if videos and all(_is_bunny_junk(v["url"]) for v in videos):
-            entry["videos"] = []
-            entry.pop("scraped_at", None)
-            cleared += 1
-    if cleared:
-        save_video_map(video_map, site_key)
-    return cleared
-
-
 MAX_RETRIES = 2
 
 
```
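The removed iframe fallback relied on a regex whose `[^"?]+` character class deliberately stops before any `?token=...` query string, so only the tokenless embed URL was stored. A self-contained demo of that behavior; the markup below is hypothetical:

```python
import re


def extract_bunny_embed_url(html: str) -> str | None:
    """Return a tokenless Bunny.net embed URL found in an iframe, or None."""
    m = re.search(
        r'<iframe[^>]+src="(https://player\.mediadelivery\.net/embed/[^"?]+)',
        html,
    )
    return m.group(1) if m else None


html = (
    '<iframe src="https://player.mediadelivery.net/embed/123/abc-guid'
    '?token=sig&expires=9" allowfullscreen></iframe>'
)
print(extract_bunny_embed_url(html))
# https://player.mediadelivery.net/embed/123/abc-guid  (signed token stripped)
```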
```diff
@@ -401,9 +361,7 @@ async def worker(
 
     page.on(
         "response",
-        lambda resp: video_hits.add(resp.url)
-        if (_is_video_url(resp.url) or _is_bunny_playlist(resp.url))
-        else None,
+        lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
     )
 
     try:
```
```diff
@@ -418,7 +376,7 @@ async def worker(
             print(f"[W{worker_id}] ({idx + 1}/{total}) {url}{label}")
 
             try:
-                await page.goto(url, wait_until="load", timeout=60000)
+                await page.goto(url, wait_until="networkidle", timeout=60000)
             except Exception as e:
                 print(f"[W{worker_id}] Navigation error: {e}")
                 if expects_video(url) and attempt < MAX_RETRIES:
```
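The `wait_until` flip above trades speed for capture reliability: Playwright's "load" state resolves at the window load event, while "networkidle" waits until there have been no network connections for at least 500 ms, which gives lazily initialized players time to issue the media requests the response listener intercepts. A minimal sketch of the post-change navigation, with a hypothetical URL:

```python
import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        page = await browser.new_page()
        hits: set[str] = set()
        page.on("response", lambda resp: hits.add(resp.url))
        # "networkidle" resolves only after ~500 ms of network silence, so
        # late video requests are already in `hits` when goto() returns.
        await page.goto("https://example.com/", wait_until="networkidle", timeout=60000)
        await browser.close()
        print(f"{len(hits)} responses captured")


asyncio.run(main())
```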
```diff
@@ -493,29 +451,18 @@ async def worker(
             found = set(html_videos) | set(video_hits)
             video_hits.clear()
 
-            print(f"[W{worker_id}] network hits raw: {found or '(empty)'}")
-
-            all_videos = [
-                m
-                for m in found
-                if is_valid_url(m)
-                and not _is_bunny_junk(m)
-                and m
-                not in (
-                    f"{base_url}/wp-content/plugins/easy-video-player/lib/blank.mp4",
-                )
-            ]
-
-            if not all_videos:
-                embed_url = extract_bunny_embed_url(html)
-                if embed_url:
-                    print(
-                        f"[W{worker_id}] No network hit — iframe fallback: {embed_url}"
-                    )
-                    all_videos = [embed_url]
-
             async with map_lock:
-                new_found = set(all_videos) - known
+                new_found = found - known
                 if new_found:
                     print(f"[W{worker_id}] Found {len(new_found)} new video URLs")
                     known.update(new_found)
```
```diff
@@ -572,12 +519,6 @@ async def run_for_site(
     urls = load_post_urls(site_key, base_url, wp_api, req_headers)
 
     video_map = load_video_map(site_key)
-    junk_cleared = _clear_junk_video_entries(video_map, site_key)
-    if junk_cleared:
-        print(
-            f"[{site_key}] Cleared {junk_cleared} entries with junk CDN init segments — will re-scrape."
-        )
-
     if any(
         u not in video_map
         or not video_map[u].get("title")
```
```diff
@@ -1,5 +1,3 @@
 playwright==1.58.0
 python-dotenv==1.2.1
 Requests==2.32.5
-yt-dlp>=2026.3.17
-pycryptodomex>=3.23.0
```
```diff
@@ -125,12 +125,11 @@ def get_channel_id(base: str, token: str, channel_name: str) -> int:
 def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
     """Paginate through the channel and return a Counter of video names."""
     counts: Counter[str] = Counter()
-    page_size = 25
     start = 0
     while True:
         r = requests.get(
             f"{base}/api/v1/video-channels/{channel_name}/videos",
-            params={"start": start, "count": page_size},
+            params={"start": start, "count": 100},
             headers=api_headers(token),
             timeout=30,
         )
```
```diff
@@ -138,7 +137,7 @@ def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter
         data = r.json()
         for v in data.get("data", []):
             counts[v["name"]] += 1
-        start += page_size
+        start += 100
         if start >= data.get("total", 0):
             break
     return counts
```
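The two hunks above replace the `page_size` variable with a literal `100` in both the request params and the cursor increment, so the two literals must now be kept in sync by hand. A self-contained sketch of the resulting loop, assuming a standard Bearer-token header in place of the script's `api_headers` helper:

```python
from collections import Counter

import requests


def get_channel_video_names(base: str, token: str, channel_name: str) -> Counter[str]:
    """Paginate through the channel and return a Counter of video names."""
    counts: Counter[str] = Counter()
    start = 0
    while True:
        r = requests.get(
            f"{base}/api/v1/video-channels/{channel_name}/videos",
            # 100 is PeerTube's documented maximum page size; the same
            # literal must be used in the cursor increment below.
            params={"start": start, "count": 100},
            headers={"Authorization": f"Bearer {token}"},  # stand-in for api_headers()
            timeout=30,
        )
        r.raise_for_status()
        data = r.json()
        for v in data.get("data", []):
            counts[v["name"]] += 1
        start += 100
        if start >= data.get("total", 0):
            break
    return counts
```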
+10 -435 (file diff suppressed because one or more lines are too long)