Caching added

This commit is contained in:
HugeFrog24
2026-03-01 13:06:24 +01:00
parent 4a5b0a6ee3
commit 27f7beb8b7
7 changed files with 9310 additions and 1891 deletions

View File

@@ -18,7 +18,7 @@ Importable functions:
from collections import defaultdict from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path, PurePosixPath from pathlib import Path, PurePosixPath
from typing import Any, Optional, cast from typing import Any, cast
from collections.abc import Callable from collections.abc import Callable
from urllib.parse import urlparse, unquote from urllib.parse import urlparse, unquote
@@ -131,8 +131,7 @@ def build_url_referers(video_map: dict[str, Any]) -> dict[str, str]:
parsed = urlparse(page_url) parsed = urlparse(page_url)
referer = f"{parsed.scheme}://{parsed.netloc}/" referer = f"{parsed.scheme}://{parsed.netloc}/"
for vid in cast(dict[str, Any], entry).get("videos", []): for vid in cast(dict[str, Any], entry).get("videos", []):
if isinstance(vid, str): result.setdefault(vid["url"], referer)
result.setdefault(vid, referer)
return result return result
@@ -252,12 +251,7 @@ def fetch_sizes(
def main() -> None: def main() -> None:
vm = load_video_map() vm = load_video_map()
urls = [ urls = [vid["url"] for entry in vm.values() for vid in entry.get("videos", [])]
u
for entry in vm.values()
for u in entry.get("videos", [])
if u.startswith("http")
]
clashes = find_clashes(urls) clashes = find_clashes(urls)

View File

@@ -1,6 +1,9 @@
# config.py # config.py
from typing import Final from typing import Final
# How long a cached file size stays valid. 0 = always re-probe; large = effectively forever.
SIZE_CACHE_TTL: Final[int] = 9_999_999 # seconds (~115 days)
SITES: Final[dict[str, dict[str, str]]] = { SITES: Final[dict[str, dict[str, str]]] = {
"jailbirdz": { "jailbirdz": {
"base_url": "https://www.jailbirdz.com", "base_url": "https://www.jailbirdz.com",

View File

@@ -16,7 +16,7 @@ import re
import shutil import shutil
from collections import defaultdict from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Optional from typing import Any
import requests import requests
from check_clashes import ( from check_clashes import (
@@ -264,12 +264,13 @@ def download_one(
def collect_urls(video_map: dict[str, Any]) -> list[str]: def collect_urls(video_map: dict[str, Any]) -> list[str]:
urls, seen, skipped = [], set(), 0 urls, seen, skipped = [], set(), 0
for entry in video_map.values(): for entry in video_map.values():
for video_url in entry.get("videos", []): for vid in entry.get("videos", []):
if video_url in seen: u = vid["url"]
if u in seen:
continue continue
seen.add(video_url) seen.add(u)
if is_valid_url(video_url): if is_valid_url(u):
urls.append(video_url) urls.append(u)
else: else:
skipped += 1 skipped += 1
if skipped: if skipped:
@@ -281,12 +282,22 @@ def build_url_title_map(video_map: dict[str, Any]) -> dict[str, str]:
url_title = {} url_title = {}
for entry in video_map.values(): for entry in video_map.values():
title = entry.get("title", "") title = entry.get("title", "")
for video_url in entry.get("videos", []): for vid in entry.get("videos", []):
if video_url not in url_title: if vid["url"] not in url_title:
url_title[video_url] = title url_title[vid["url"]] = title
return url_title return url_title
def build_url_to_site() -> dict[str, str]:
    """Map every video URL to the key of the site whose map lists it.

    Each site in SITES has its video map loaded in turn. When the same URL
    is listed under more than one site, the site that comes later in SITES
    wins, because later assignments overwrite earlier ones.
    """
    return {
        vid["url"]: site_key
        for site_key in SITES
        for entry in load_video_map(site_key).values()
        for vid in entry.get("videos", [])
    }
# ── Main ───────────────────────────────────────────────────────────── # ── Main ─────────────────────────────────────────────────────────────
@@ -341,11 +352,7 @@ def main() -> None:
url_referers = build_url_referers(video_map) url_referers = build_url_referers(video_map)
urls = collect_urls(video_map) urls = collect_urls(video_map)
url_to_site: dict[str, str] = {} url_to_site = build_url_to_site()
for site_key in SITES:
for entry in load_video_map(site_key).values():
for vid_url in entry.get("videos", []):
url_to_site[vid_url] = site_key
if args.sites: if args.sites:
selected = set(args.sites) selected = set(args.sites)
@@ -409,17 +416,44 @@ def main() -> None:
print(f" … and {len(pending) - 20} more") print(f" … and {len(pending) - 20} more")
return return
print("\n[+] Fetching remote file sizes…") cached_sizes: dict[str, int] = {
vid["url"]: vid["size"]
for entry in video_map.values()
for vid in entry.get("videos", [])
if vid.get("size") is not None
}
uncached_pending = [u for u in pending if u not in cached_sizes]
session = make_session() session = make_session()
remote_sizes = fetch_sizes(pending, workers=20, url_referers=url_referers) if uncached_pending:
print(
f"\n[+] Fetching remote file sizes ({len(uncached_pending)} uncached, {len(pending) - len(uncached_pending)} cached)…"
)
remote_sizes: dict[str, int | None] = {
**cached_sizes,
**fetch_sizes(uncached_pending, workers=20, url_referers=url_referers),
}
else:
print(f"\n[+] All {len(pending)} pending sizes cached — skipping probe.")
remote_sizes = dict(cached_sizes)
sized = {u: s for u, s in remote_sizes.items() if s is not None} sized = {u: s for u, s in remote_sizes.items() if s is not None}
total_bytes = sum(sized.values()) total_bytes = sum(sized.values())
print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files") print(f"[+] Download size: {fmt_size(total_bytes)} across {len(pending)} files")
if already: if already:
print(f"[+] Verifying {len(already)} existing files…") uncached_already = [u for u in already if u not in cached_sizes]
already_sizes = fetch_sizes(already, workers=20, url_referers=url_referers) if uncached_already:
print(
f"[+] Verifying {len(already)} existing files ({len(uncached_already)} uncached)…"
)
already_sizes: dict[str, int | None] = {
**cached_sizes,
**fetch_sizes(uncached_already, workers=20, url_referers=url_referers),
}
else:
print(f"[+] Verifying {len(already)} existing files (all sizes cached)…")
already_sizes = dict(cached_sizes)
mismatched = 0 mismatched = 0
for url in already: for url in already:

16
main.py
View File

@@ -6,7 +6,7 @@ import signal
import asyncio import asyncio
import requests import requests
from pathlib import PurePosixPath from pathlib import PurePosixPath
from typing import Any, Optional from typing import Any
from urllib.parse import urlparse from urllib.parse import urlparse
from dotenv import load_dotenv from dotenv import load_dotenv
from playwright.async_api import async_playwright, BrowserContext from playwright.async_api import async_playwright, BrowserContext
@@ -476,9 +476,13 @@ async def worker(
entry = video_map.get(url, {}) entry = video_map.get(url, {})
if title: if title:
entry["title"] = title entry["title"] = title
existing_videos = set(entry.get("videos", [])) existing_dict: dict[str, Any] = {
existing_videos.update(all_videos) vid["url"]: vid for vid in entry.get("videos", [])
entry["videos"] = sorted(existing_videos) }
for vid_url in all_videos:
if vid_url not in existing_dict:
existing_dict[vid_url] = {"url": vid_url}
entry["videos"] = sorted(existing_dict.values(), key=lambda v: v["url"])
mark_done = bool(all_videos) or not expects_video(url) mark_done = bool(all_videos) or not expects_video(url)
if mark_done: if mark_done:
entry["scraped_at"] = int(time.time()) entry["scraped_at"] = int(time.time())
@@ -526,7 +530,9 @@ async def run_for_site(
site_key, base_url, wp_api, video_map, urls, req_headers site_key, base_url, wp_api, video_map, urls, req_headers
) )
known = {u for entry in video_map.values() for u in entry.get("videos", [])} known = {
vid["url"] for entry in video_map.values() for vid in entry.get("videos", [])
}
total = len(urls) total = len(urls)
pending = [] pending = []

View File

@@ -4,15 +4,19 @@ Importable function:
summarize_sizes(sizes) - return dict with total, smallest, largest, average, failed summarize_sizes(sizes) - return dict with total, smallest, largest, average, failed
""" """
from typing import Optional, TypedDict import argparse
import time
from typing import Any, TypedDict
from check_clashes import ( from check_clashes import (
fmt_size, fmt_size,
fetch_sizes, fetch_sizes,
load_video_map, load_video_map,
save_video_map,
build_url_referers, build_url_referers,
VIDEO_MAP_FILE, VIDEO_MAP_FILE,
) )
from config import SITES, SIZE_CACHE_TTL
class SizeStats(TypedDict): class SizeStats(TypedDict):
@@ -25,7 +29,7 @@ class SizeStats(TypedDict):
failed: list[str] failed: list[str]
def summarize_sizes(sizes: dict[str, Optional[int]]) -> SizeStats: def summarize_sizes(sizes: dict[str, int | None]) -> SizeStats:
"""Given {url: size_or_None}, return a stats dict.""" """Given {url: size_or_None}, return a stats dict."""
known = {u: s for u, s in sizes.items() if s is not None} known = {u: s for u, s in sizes.items() if s is not None}
failed = [u for u, s in sizes.items() if s is None] failed = [u for u, s in sizes.items() if s is None]
@@ -51,6 +55,13 @@ def summarize_sizes(sizes: dict[str, Optional[int]]) -> SizeStats:
} }
def _is_stale(vid: dict[str, Any], now: int) -> bool:
"""True if the cached size is absent or older than SIZE_CACHE_TTL seconds."""
if vid.get("size") is None:
return True
return (now - vid.get("size_checked_at", 0)) >= SIZE_CACHE_TTL
# --------------- CLI --------------- # --------------- CLI ---------------
@@ -59,24 +70,7 @@ def _progress(done: int, total: int) -> None:
print(f" {done}/{total}") print(f" {done}/{total}")
def main() -> None: def _print_stats(stats: SizeStats) -> None:
vm = load_video_map()
urls: list[str] = [
u
for entry in vm.values()
for u in entry.get("videos", [])
if u.startswith("http")
]
url_referers = build_url_referers(vm)
print(f"[+] {len(urls)} URLs in {VIDEO_MAP_FILE}")
print("[+] Fetching file sizes (20 threads)…\n")
sizes = fetch_sizes(
urls, workers=20, on_progress=_progress, url_referers=url_referers
)
stats = summarize_sizes(sizes)
print(f"\n{'=' * 45}") print(f"\n{'=' * 45}")
print(f" Sized: {stats['sized']}/{stats['total']} files") print(f" Sized: {stats['sized']}/{stats['total']} files")
print(f" Total: {fmt_size(stats['total_bytes'])}") print(f" Total: {fmt_size(stats['total_bytes'])}")
@@ -84,12 +78,108 @@ def main() -> None:
print(f" Largest: {fmt_size(stats['largest'])}") print(f" Largest: {fmt_size(stats['largest'])}")
print(f" Average: {fmt_size(stats['average'])}") print(f" Average: {fmt_size(stats['average'])}")
print(f"{'=' * 45}") print(f"{'=' * 45}")
if stats["failed"]: if stats["failed"]:
print(f"\n[!] {len(stats['failed'])} URL(s) could not be sized:") print(f"\n[!] {len(stats['failed'])} URL(s) could not be sized:")
for u in stats["failed"]: for u in stats["failed"]:
print(f" {u}") print(f" {u}")
def _cache_hint(fresh: int, stale: int, missing: int) -> str:
parts = [label for count, label in [(fresh, f"{fresh} fresh"), (stale, f"{stale} stale"), (missing, f"{missing} missing")] if count]
if stale or missing:
suffix = " — run --write to refresh" if stale else " — run --write to probe missing"
else:
suffix = " — all current"
return f"Cache: {', '.join(parts)}{suffix}"
def _run_stats() -> None:
    """Print size statistics using only the sizes stored in the video map.

    Each distinct URL is counted once; the first record seen for a URL
    decides both its size and its cache bucket (fresh, stale or missing).
    fetch_sizes is not called here.
    """
    video_map = load_video_map()
    now = int(time.time())

    sizes: dict[str, int | None] = {}
    tally = {"fresh": 0, "stale": 0, "missing": 0}
    for entry in video_map.values():
        for vid in entry.get("videos", []):
            url = vid["url"]
            if url in sizes:
                continue
            size = vid.get("size")
            sizes[url] = size
            # Check "missing" first: _is_stale is also True for a missing size.
            if size is None:
                bucket = "missing"
            elif _is_stale(vid, now):
                bucket = "stale"
            else:
                bucket = "fresh"
            tally[bucket] += 1

    hint = _cache_hint(tally["fresh"], tally["stale"], tally["missing"])
    print(f"[+] {len(sizes)} URLs in {VIDEO_MAP_FILE}")
    print(f" {hint}")
    _print_stats(summarize_sizes(sizes))
def _apply_fetched(vm: dict[str, Any], fetched: dict[str, int | None], now: int) -> None:
for entry in vm.values():
for vid in entry.get("videos", []):
if vid["url"] in fetched:
vid["size"] = fetched[vid["url"]]
vid["size_checked_at"] = now
def _run_write() -> None:
"""Probe every size that _is_stale flags, site by site, and save the results via save_video_map."""
# One timestamp for the whole run: used for the staleness test and handed to _apply_fetched.
now = int(time.time())
# Accumulates probe results across all sites for the final summary.
all_fetched: dict[str, int | None] = {}
for site_key in SITES:
vm = load_video_map(site_key)
# Sites whose video map is empty (falsy) are skipped entirely.
if not vm:
continue
url_referers = build_url_referers(vm)
# URLs of records that _is_stale flags; a URL listed under several entries appears once per record.
to_probe: list[str] = [
vid["url"]
for entry in vm.values()
for vid in entry.get("videos", [])
if _is_stale(vid, now)
]
# Number of records (not distinct URLs) that are not stale.
cached_count = sum(
1
for entry in vm.values()
for vid in entry.get("videos", [])
if not _is_stale(vid, now)
)
print(f"[{site_key}] {cached_count} cached, {len(to_probe)} to probe…")
# Starts empty so the later update() is valid even if nothing is probed.
fetched: dict[str, int | None] = {}
# NOTE(review): indentation is not visible in this view, so it is unclear which of the
# statements below (apply / save / update / print) are nested under `if to_probe:` — confirm.
if to_probe:
fetched = fetch_sizes(
to_probe, workers=20, on_progress=_progress, url_referers=url_referers
)
_apply_fetched(vm, fetched, now)
save_video_map(vm, site_key)
all_fetched.update(fetched)
print(f"[{site_key}] Written.")
# Summary covers only URLs probed in this run, and is skipped when none were.
if all_fetched:
_print_stats(summarize_sizes(all_fetched))
def main() -> None:
    """CLI entry point: run _run_write with --write/-w, otherwise _run_stats."""
    cli = argparse.ArgumentParser(description="Calculate total video download size")
    cli.add_argument(
        "--write",
        "-w",
        action="store_true",
        help="Probe uncached sizes and write them into video_map.json",
    )
    options = cli.parse_args()

    # Pick the mode first, then invoke it once.
    action = _run_write if options.write else _run_stats
    action()
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -32,11 +32,11 @@ import requests
from dotenv import load_dotenv from dotenv import load_dotenv
from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS, load_video_map from check_clashes import fmt_size, url_to_filename, VIDEO_EXTS, load_video_map
from config import SITES
from download import ( from download import (
collect_urls, collect_urls,
get_paths_for_mode, get_paths_for_mode,
read_mode, read_mode,
build_url_to_site,
MODE_ORIGINAL, MODE_ORIGINAL,
DEFAULT_OUTPUT, DEFAULT_OUTPUT,
) )
@@ -412,11 +412,7 @@ def build_path_to_meta(
urls = collect_urls(video_map) urls = collect_urls(video_map)
mode = read_mode(input_dir) or MODE_ORIGINAL mode = read_mode(input_dir) or MODE_ORIGINAL
url_to_site: dict[str, str] = {} url_to_site = build_url_to_site()
for site_key in SITES:
for entry in load_video_map(site_key).values():
for vid_url in entry.get("videos", []):
url_to_site[vid_url] = site_key
paths = get_paths_for_mode(mode, urls, video_map, input_dir, url_to_site) paths = get_paths_for_mode(mode, urls, video_map, input_dir, url_to_site)
@@ -429,13 +425,9 @@ def build_path_to_meta(
title = t if isinstance(t, str) else "" title = t if isinstance(t, str) else ""
desc = d if isinstance(d, str) else "" desc = d if isinstance(d, str) else ""
videos_any = entry.get("videos", []) for vid in entry.get("videos", []):
if isinstance(videos_any, list): if vid["url"] not in url_meta:
for video_url_any in videos_any: url_meta[vid["url"]] = {"title": title, "description": desc}
if not isinstance(video_url_any, str):
continue
if video_url_any not in url_meta:
url_meta[video_url_any] = {"title": title, "description": desc}
result: dict[Path, dict[str, str]] = {} result: dict[Path, dict[str, str]] = {}
for url, abs_path in paths.items(): for url, abs_path in paths.items():

File diff suppressed because it is too large Load Diff