mirror of
https://github.com/HugeFrog24/jailbirdz-dl.git
synced 2026-03-02 01:04:31 +00:00
Caching added
This commit is contained in:
@@ -18,7 +18,7 @@ Importable functions:
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path, PurePosixPath
|
||||
from typing import Any, Optional, cast
|
||||
from typing import Any, cast
|
||||
|
||||
from collections.abc import Callable
|
||||
from urllib.parse import urlparse, unquote
|
||||
@@ -131,8 +131,7 @@ def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
|
||||
parsed = urlparse(page_url)
|
||||
referer = f"{parsed.scheme}://{parsed.netloc}/"
|
||||
for vid in cast(dict[str, Any], entry).get("videos", []):
|
||||
if isinstance(vid, str):
|
||||
result.setdefault(vid, referer)
|
||||
result.setdefault(vid["url"], referer)
|
||||
return result
|
||||
|
||||
|
||||
@@ -252,12 +251,7 @@ def fetch_sizes(
|
||||
|
||||
def main() -> None:
|
||||
vm = load_video_map()
|
||||
urls = [
|
||||
u
|
||||
for entry in vm.values()
|
||||
for u in entry.get("videos", [])
|
||||
if u.startswith("http")
|
||||
]
|
||||
urls = [vid["url"] for entry in vm.values() for vid in entry.get("videos", [])]
|
||||
|
||||
clashes = find_clashes(urls)
|
||||
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
# config.py
|
||||
from typing import Final
|
||||
|
||||
# How long a cached file size stays valid. 0 = always re-probe; large = effectively forever.
|
||||
SIZE_CACHE_TTL: Final[int] = 9_999_999 # seconds (~115 days)
|
||||
|
||||
SITES: Final[dict[str, dict[str, str]]] = {
|
||||
"jailbirdz": {
|
||||
"base_url": "https://www.jailbirdz.com",
|
||||
|
||||
70
download.py
70
download.py
@@ -16,7 +16,7 @@ import re
|
||||
import shutil
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
import requests
|
||||
|
||||
from check_clashes import (
|
||||
@@ -264,12 +264,13 @@ def download_one(
|
||||
def collect_urls(video_map: dict[str, Any]) -> list[str]:
|
||||
urls, seen, skipped = [], set(), 0
|
||||
for entry in video_map.values():
|
||||
for video_url in entry.get("videos", []):
|
||||
if video_url in seen:
|
||||
for vid in entry.get("videos", []):
|
||||
u = vid["url"]
|
||||
if u in seen:
|
||||
continue
|
||||
seen.add(video_url)
|
||||
if is_valid_url(video_url):
|
||||
urls.append(video_url)
|
||||
seen.add(u)
|
||||
if is_valid_url(u):
|
||||
urls.append(u)
|
||||
else:
|
||||
skipped += 1
|
||||
if skipped:
|
||||
@@ -281,12 +282,22 @@ def build_url_title_map(video_map: dict[str, Any]) -> dict[str, str]:
|
||||
url_title = {}
|
||||
for entry in video_map.values():
|
||||
title = entry.get("title", "")
|
||||
for video_url in entry.get("videos", []):
|
||||
if video_url not in url_title:
|
||||
url_title[video_url] = title
|
||||
for vid in entry.get("videos", []):
|
||||
if vid["url"] not in url_title:
|
||||
url_title[vid["url"]] = title
|
||||
return url_title
|
||||
|
||||
|
||||
def build_url_to_site() -> dict[str, str]:
    """Map every known video URL to the key of the site whose map lists it.

    Each key in ``SITES`` is loaded through ``load_video_map`` in iteration
    order. If the same URL shows up under more than one site, the site
    visited last is the one recorded.
    """
    return {
        video["url"]: key
        for key in SITES
        for page in load_video_map(key).values()
        for video in page.get("videos", [])
    }
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -341,11 +352,7 @@ def main() -> None:
|
||||
url_referers = build_url_referers(video_map)
|
||||
urls = collect_urls(video_map)
|
||||
|
||||
url_to_site: dict[str, str] = {}
|
||||
for site_key in SITES:
|
||||
for entry in load_video_map(site_key).values():
|
||||
for vid_url in entry.get("videos", []):
|
||||
url_to_site[vid_url] = site_key
|
||||
url_to_site = build_url_to_site()
|
||||
|
||||
if args.sites:
|
||||
selected = set(args.sites)
|
||||
@@ -409,17 +416,44 @@ def main() -> None:
|
||||
print(f" … and {len(pending) - 20} more")
|
||||
return
|
||||
|
||||
print("\n[+] Fetching remote file sizes…")
|
||||
cached_sizes: dict[str, int] = {
|
||||
vid["url"]: vid["size"]
|
||||
for entry in video_map.values()
|
||||
for vid in entry.get("videos", [])
|
||||
if vid.get("size") is not None
|
||||
}
|
||||
|
||||
uncached_pending = [u for u in pending if u not in cached_sizes]
|
||||
session = make_session()
|
||||
remote_sizes = fetch_sizes(pending, workers=20, url_referers=url_referers)
|
||||
if uncached_pending:
|
||||
print(
|
||||
f"\n[+] Fetching remote file sizes ({len(uncached_pending)} uncached, {len(pending) - len(uncached_pending)} cached)…"
|
||||
)
|
||||
remote_sizes: dict[str, int | None] = {
|
||||
**cached_sizes,
|
||||
**fetch_sizes(uncached_pending, workers=20, url_referers=url_referers),
|
||||
}
|
||||
else:
|
||||
print(f"\n[+] All {len(pending)} pending sizes cached — skipping probe.")
|
||||
remote_sizes = dict(cached_sizes)
|
||||
|
||||
sized = {u: s for u, s in remote_sizes.items() if s is not None}
|
||||
total_bytes = sum(sized.values())
|
||||
print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")
|
||||
|
||||
if already:
|
||||
print(f"[+] Verifying {len(already)} existing files…")
|
||||
already_sizes = fetch_sizes(already, workers=20, url_referers=url_referers)
|
||||
uncached_already = [u for u in already if u not in cached_sizes]
|
||||
if uncached_already:
|
||||
print(
|
||||
f"[+] Verifying {len(already)} existing files ({len(uncached_already)} uncached)…"
|
||||
)
|
||||
already_sizes: dict[str, int | None] = {
|
||||
**cached_sizes,
|
||||
**fetch_sizes(uncached_already, workers=20, url_referers=url_referers),
|
||||
}
|
||||
else:
|
||||
print(f"[+] Verifying {len(already)} existing files (all sizes cached)…")
|
||||
already_sizes = dict(cached_sizes)
|
||||
|
||||
mismatched = 0
|
||||
for url in already:
|
||||
|
||||
16
main.py
16
main.py
@@ -6,7 +6,7 @@ import signal
|
||||
import asyncio
|
||||
import requests
|
||||
from pathlib import PurePosixPath
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
from dotenv import load_dotenv
|
||||
from playwright.async_api import async_playwright, BrowserContext
|
||||
@@ -476,9 +476,13 @@ async def worker(
|
||||
entry = video_map.get(url, {})
|
||||
if title:
|
||||
entry["title"] = title
|
||||
existing_videos = set(entry.get("videos", []))
|
||||
existing_videos.update(all_videos)
|
||||
entry["videos"] = sorted(existing_videos)
|
||||
existing_dict: dict[str, Any] = {
|
||||
vid["url"]: vid for vid in entry.get("videos", [])
|
||||
}
|
||||
for vid_url in all_videos:
|
||||
if vid_url not in existing_dict:
|
||||
existing_dict[vid_url] = {"url": vid_url}
|
||||
entry["videos"] = sorted(existing_dict.values(), key=lambda v: v["url"])
|
||||
mark_done = bool(all_videos) or not expects_video(url)
|
||||
if mark_done:
|
||||
entry["scraped_at"] = int(time.time())
|
||||
@@ -526,7 +530,9 @@ async def run_for_site(
|
||||
site_key, base_url, wp_api, video_map, urls, req_headers
|
||||
)
|
||||
|
||||
known = {u for entry in video_map.values() for u in entry.get("videos", [])}
|
||||
known = {
|
||||
vid["url"] for entry in video_map.values() for vid in entry.get("videos", [])
|
||||
}
|
||||
|
||||
total = len(urls)
|
||||
pending = []
|
||||
|
||||
132
total_size.py
132
total_size.py
@@ -4,15 +4,19 @@ Importable function:
|
||||
summarize_sizes(sizes) - return dict with total, smallest, largest, average, failed
|
||||
"""
|
||||
|
||||
from typing import Optional, TypedDict
|
||||
import argparse
|
||||
import time
|
||||
from typing import Any, TypedDict
|
||||
|
||||
from check_clashes import (
|
||||
fmt_size,
|
||||
fetch_sizes,
|
||||
load_video_map,
|
||||
save_video_map,
|
||||
build_url_referers,
|
||||
VIDEO_MAP_FILE,
|
||||
)
|
||||
from config import SITES, SIZE_CACHE_TTL
|
||||
|
||||
|
||||
class SizeStats(TypedDict):
|
||||
@@ -25,7 +29,7 @@ class SizeStats(TypedDict):
|
||||
failed: list[str]
|
||||
|
||||
|
||||
def summarize_sizes(sizes: dict[str, Optional[int]]) -> SizeStats:
|
||||
def summarize_sizes(sizes: dict[str, int | None]) -> SizeStats:
|
||||
"""Given {url: size_or_None}, return a stats dict."""
|
||||
known = {u: s for u, s in sizes.items() if s is not None}
|
||||
failed = [u for u, s in sizes.items() if s is None]
|
||||
@@ -51,6 +55,13 @@ def summarize_sizes(sizes: dict[str, Optional[int]]) -> SizeStats:
|
||||
}
|
||||
|
||||
|
||||
def _is_stale(vid: dict[str, Any], now: int) -> bool:
|
||||
"""True if the cached size is absent or older than SIZE_CACHE_TTL seconds."""
|
||||
if vid.get("size") is None:
|
||||
return True
|
||||
return (now - vid.get("size_checked_at", 0)) >= SIZE_CACHE_TTL
|
||||
|
||||
|
||||
# --------------- CLI ---------------
|
||||
|
||||
|
||||
@@ -59,24 +70,7 @@ def _progress(done: int, total: int) -> None:
|
||||
print(f" {done}/{total}")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
vm = load_video_map()
|
||||
urls: list[str] = [
|
||||
u
|
||||
for entry in vm.values()
|
||||
for u in entry.get("videos", [])
|
||||
if u.startswith("http")
|
||||
]
|
||||
|
||||
url_referers = build_url_referers(vm)
|
||||
print(f"[+] {len(urls)} URLs in {VIDEO_MAP_FILE}")
|
||||
print("[+] Fetching file sizes (20 threads)…\n")
|
||||
|
||||
sizes = fetch_sizes(
|
||||
urls, workers=20, on_progress=_progress, url_referers=url_referers
|
||||
)
|
||||
stats = summarize_sizes(sizes)
|
||||
|
||||
def _print_stats(stats: SizeStats) -> None:
|
||||
print(f"\n{'=' * 45}")
|
||||
print(f" Sized: {stats['sized']}/{stats['total']} files")
|
||||
print(f" Total: {fmt_size(stats['total_bytes'])}")
|
||||
@@ -84,12 +78,108 @@ def main() -> None:
|
||||
print(f" Largest: {fmt_size(stats['largest'])}")
|
||||
print(f" Average: {fmt_size(stats['average'])}")
|
||||
print(f"{'=' * 45}")
|
||||
|
||||
if stats["failed"]:
|
||||
print(f"\n[!] {len(stats['failed'])} URL(s) could not be sized:")
|
||||
for u in stats["failed"]:
|
||||
print(f" {u}")
|
||||
|
||||
|
||||
def _cache_hint(fresh: int, stale: int, missing: int) -> str:
|
||||
parts = [label for count, label in [(fresh, f"{fresh} fresh"), (stale, f"{stale} stale"), (missing, f"{missing} missing")] if count]
|
||||
if stale or missing:
|
||||
suffix = " — run --write to refresh" if stale else " — run --write to probe missing"
|
||||
else:
|
||||
suffix = " — all current"
|
||||
return f"Cache: {', '.join(parts)}{suffix}"
|
||||
|
||||
|
||||
def _run_stats() -> None:
    """Print cache freshness and size statistics from the stored sizes.

    Only values already present in the video map are used; nothing is
    fetched here. A URL that appears under several entries is counted once,
    using its first occurrence.
    """
    video_map = load_video_map()
    checked_at = int(time.time())
    tally = {"fresh": 0, "stale": 0, "missing": 0}
    sizes: dict[str, int | None] = {}

    videos = (v for page in video_map.values() for v in page.get("videos", []))
    for video in videos:
        link = video["url"]
        if link in sizes:
            continue  # first occurrence wins
        cached = video.get("size")
        sizes[link] = cached
        if cached is None:
            bucket = "missing"
        elif _is_stale(video, checked_at):
            bucket = "stale"
        else:
            bucket = "fresh"
        tally[bucket] += 1

    print(f"[+] {len(sizes)} URLs in {VIDEO_MAP_FILE}")
    print(f" {_cache_hint(tally['fresh'], tally['stale'], tally['missing'])}")
    _print_stats(summarize_sizes(sizes))
|
||||
|
||||
|
||||
def _apply_fetched(vm: dict[str, Any], fetched: dict[str, int | None], now: int) -> None:
|
||||
for entry in vm.values():
|
||||
for vid in entry.get("videos", []):
|
||||
if vid["url"] in fetched:
|
||||
vid["size"] = fetched[vid["url"]]
|
||||
vid["size_checked_at"] = now
|
||||
|
||||
|
||||
def _run_write() -> None:
    """Probe missing or expired sizes for each site and save them to its map.

    For every key in ``SITES`` the site's video map is loaded, the URLs that
    ``_is_stale`` flags are probed with ``fetch_sizes``, and the results are
    applied and saved. Sites with an empty map are skipped. A combined
    summary of everything probed is printed at the end, if anything was.
    """
    timestamp = int(time.time())
    combined: dict[str, int | None] = {}

    for site_key in SITES:
        site_map = load_video_map(site_key)
        if not site_map:
            continue

        referers = build_url_referers(site_map)

        # Partition the site's videos once: stale ones get probed, the rest
        # are reported as cached.
        videos = [v for page in site_map.values() for v in page.get("videos", [])]
        stale_urls: list[str] = [v["url"] for v in videos if _is_stale(v, timestamp)]
        fresh_total = len(videos) - len(stale_urls)
        print(f"[{site_key}] {fresh_total} cached, {len(stale_urls)} to probe…")

        results: dict[str, int | None] = (
            fetch_sizes(
                stale_urls, workers=20, on_progress=_progress, url_referers=referers
            )
            if stale_urls
            else {}
        )

        _apply_fetched(site_map, results, timestamp)
        save_video_map(site_map, site_key)
        combined.update(results)
        print(f"[{site_key}] Written.")

    if combined:
        _print_stats(summarize_sizes(combined))
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point.

    Without flags, prints statistics from the sizes already stored in the
    video map. With ``--write`` / ``-w``, runs the probe-and-save path
    instead.
    """
    cli = argparse.ArgumentParser(description="Calculate total video download size")
    cli.add_argument(
        "--write",
        "-w",
        action="store_true",
        help="Probe uncached sizes and write them into video_map.json",
    )
    options = cli.parse_args()

    action = _run_write if options.write else _run_stats
    action()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
18
upload.py
18
upload.py
@@ -32,11 +32,11 @@ import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS, load_video_map
|
||||
from config import SITES
|
||||
from download import (
|
||||
collect_urls,
|
||||
get_paths_for_mode,
|
||||
read_mode,
|
||||
build_url_to_site,
|
||||
MODE_ORIGINAL,
|
||||
DEFAULT_OUTPUT,
|
||||
)
|
||||
@@ -412,11 +412,7 @@ def build_path_to_meta(
|
||||
urls = collect_urls(video_map)
|
||||
mode = read_mode(input_dir) or MODE_ORIGINAL
|
||||
|
||||
url_to_site: dict[str, str] = {}
|
||||
for site_key in SITES:
|
||||
for entry in load_video_map(site_key).values():
|
||||
for vid_url in entry.get("videos", []):
|
||||
url_to_site[vid_url] = site_key
|
||||
url_to_site = build_url_to_site()
|
||||
|
||||
paths = get_paths_for_mode(mode, urls, video_map, input_dir, url_to_site)
|
||||
|
||||
@@ -429,13 +425,9 @@ def build_path_to_meta(
|
||||
title = t if isinstance(t, str) else ""
|
||||
desc = d if isinstance(d, str) else ""
|
||||
|
||||
videos_any = entry.get("videos", [])
|
||||
if isinstance(videos_any, list):
|
||||
for video_url_any in videos_any:
|
||||
if not isinstance(video_url_any, str):
|
||||
continue
|
||||
if video_url_any not in url_meta:
|
||||
url_meta[video_url_any] = {"title": title, "description": desc}
|
||||
for vid in entry.get("videos", []):
|
||||
if vid["url"] not in url_meta:
|
||||
url_meta[vid["url"]] = {"title": title, "description": desc}
|
||||
|
||||
result: dict[Path, dict[str, str]] = {}
|
||||
for url, abs_path in paths.items():
|
||||
|
||||
10950
video_map.json
10950
video_map.json
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user