From c58814691e5f674a7cd3678f2fb0f9eb31198130 Mon Sep 17 00:00:00 2001 From: HugeFrog24 <62775760+HugeFrog24@users.noreply.github.com> Date: Mon, 2 Mar 2026 01:56:54 +0100 Subject: [PATCH] OK --- README.md | 5 +++-- download.py | 39 +++++++++++++++++++++++++++++++-------- grab_cookie.py | 5 +++++ total_size.py | 2 +- 4 files changed, 40 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 9b5c598..2c5e509 100644 --- a/README.md +++ b/README.md @@ -128,10 +128,11 @@ Lists filenames that map to more than one source URL, with sizes. ### Estimate total download size ```bash -python total_size.py +python total_size.py # read cached sizes and print summary +python total_size.py --write # probe uncached/stale URLs and refresh the cache ``` -Fetches `Content-Length` for every video URL in `video_map.json` and prints a size summary. Does not download anything. +Reads cached file sizes from `video_map.json` and prints a summary (total, smallest, largest, average). The default mode never hits the network. Use `--write` to probe any missing or stale entries and persist the results. ## Data files diff --git a/download.py b/download.py index c34c269..67bfa09 100644 --- a/download.py +++ b/download.py @@ -18,6 +18,7 @@ from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Any import requests +import time from check_clashes import ( make_session, @@ -28,6 +29,7 @@ from check_clashes import ( build_url_referers, fetch_sizes, load_video_map, + save_video_map, is_valid_url, VIDEO_MAP_FILE, ) @@ -288,6 +290,25 @@ def build_url_title_map(video_map: dict[str, Any]) -> dict[str, str]: return url_title +def _persist_fetched_sizes(newly_fetched: dict[str, int | None]) -> None: + """Write newly probed sizes back to video_map.json (successful probes only).""" + now = int(time.time()) + for site_key in SITES: + vm_site = load_video_map(site_key) + changed = False + for entry in vm_site.values(): + for vid in entry.get("videos", []): + if vid["url"] in newly_fetched and vid.get("size") is None and newly_fetched[vid["url"]] is not None: + vid["size"] = newly_fetched[vid["url"]] + vid["size_checked_at"] = now + changed = True + if changed: + save_video_map(vm_site, site_key) + n_saved = sum(1 for s in newly_fetched.values() if s is not None) + if n_saved: + print(f"[+] Cached {n_saved} newly probed size(s).") + + def build_url_to_site() -> dict[str, str]: """Return {cdn_video_url: site_key} by loading each site's map in turn.""" result: dict[str, str] = {} @@ -423,16 +444,16 @@ def main() -> None: if vid.get("size") is not None } + newly_fetched: dict[str, int | None] = {} uncached_pending = [u for u in pending if u not in cached_sizes] session = make_session() if uncached_pending: print( f"\n[+] Fetching remote file sizes ({len(uncached_pending)} uncached, {len(pending) - len(uncached_pending)} cached)…" ) - remote_sizes: dict[str, int | None] = { - **cached_sizes, - **fetch_sizes(uncached_pending, workers=20, url_referers=url_referers), - } + fetched_pending = fetch_sizes(uncached_pending, workers=20, url_referers=url_referers) + newly_fetched.update(fetched_pending) + remote_sizes: dict[str, int | None] = {**cached_sizes, **fetched_pending} else: print(f"\n[+] All {len(pending)} pending sizes cached — skipping probe.") remote_sizes = dict(cached_sizes) @@ -447,10 +468,9 @@ def main() -> None: print( f"[+] Verifying {len(already)} existing files ({len(uncached_already)} uncached)…" ) - already_sizes: dict[str, int | None] = { - **cached_sizes, - **fetch_sizes(uncached_already, workers=20, url_referers=url_referers), - } + fetched_already = fetch_sizes(uncached_already, workers=20, url_referers=url_referers) + newly_fetched.update(fetched_already) + already_sizes: dict[str, int | None] = {**cached_sizes, **fetched_already} else: print(f"[+] Verifying {len(already)} existing files (all sizes cached)…") already_sizes = dict(cached_sizes) @@ -472,6 +492,9 @@ def main() -> None: if mismatched: print(f"[!] {mismatched} file(s) will be re-downloaded due to size mismatch") + if newly_fetched: + _persist_fetched_sizes(newly_fetched) + print(f"\n[⚡] Downloading with {args.workers} threads…\n") completed = 0 diff --git a/grab_cookie.py b/grab_cookie.py index 2a4ac10..8f98560 100644 --- a/grab_cookie.py +++ b/grab_cookie.py @@ -14,10 +14,12 @@ import os from pathlib import Path from typing import Literal import requests +from dotenv import load_dotenv from config import SITES ENV_FILE = Path(".env") COOKIE_PREFIX = "wordpress_logged_in_" +load_dotenv(dotenv_path=ENV_FILE) def update_env( @@ -72,6 +74,9 @@ def login_and_get_cookie( "Referer": f"{base_url}/", "Origin": base_url, "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + "X-Requested-With": "XMLHttpRequest", + "Accept": "*/*", }, timeout=30, ) diff --git a/total_size.py b/total_size.py index 3aa57c4..0feb96a 100644 --- a/total_size.py +++ b/total_size.py @@ -59,7 +59,7 @@ def _is_stale(vid: dict[str, Any], now: int) -> bool: """True if the cached size is absent or older than SIZE_CACHE_TTL seconds.""" if vid.get("size") is None: return True - return (now - vid.get("size_checked_at", 0)) >= SIZE_CACHE_TTL + return (now - int(vid.get("size_checked_at", 0))) >= SIZE_CACHE_TTL # --------------- CLI ---------------