Multi-gooner support because who knows

2026-05-01 09:02:19 +00:00 · 2026-03-01 01:36:01 +01:00
parent 80444405e9
commit 4a5b0a6ee3
15 changed files with 16378 additions and 11575 deletions
@@ -1,60 +1,131 @@
+import argparse
 import re
-import json
 import os
 import time
 import signal
 import asyncio
-import tempfile
 import requests
-from pathlib import Path, PurePosixPath
+from pathlib import PurePosixPath
+from typing import Any, Optional
 from urllib.parse import urlparse
 from dotenv import load_dotenv
-from playwright.async_api import async_playwright
-from check_clashes import VIDEO_EXTS
-from config import BASE_URL
+from playwright.async_api import async_playwright, BrowserContext
+from check_clashes import (
+    VIDEO_EXTS,
+    load_video_map,
+    save_video_map,
+    is_valid_url,
+    expects_video,
+)
+from config import SITES
+from grab_cookie import login_and_get_cookie, update_env

 load_dotenv()


-def _is_video_url(url):
+def _is_video_url(url: str) -> bool:
    """True if `url` ends with a recognised video extension (case-insensitive, path only)."""
    return PurePosixPath(urlparse(url).path).suffix.lower() in VIDEO_EXTS
-WP_API = f"{BASE_URL}/wp-json/wp/v2"
+

 SKIP_TYPES = {
-    "attachment", "nav_menu_item", "wp_block", "wp_template",
-    "wp_template_part", "wp_global_styles", "wp_navigation",
-    "wp_font_family", "wp_font_face",
+    "attachment",
+    "nav_menu_item",
+    "wp_block",
+    "wp_template",
+    "wp_template_part",
+    "wp_global_styles",
+    "wp_navigation",
+    "wp_font_family",
+    "wp_font_face",
 }

-VIDEO_MAP_FILE = "video_map.json"
 MAX_WORKERS = 4

-API_HEADERS = {
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0",
-    "Accept": "application/json",
-    "Referer": f"{BASE_URL}/",
-}
+_USER_AGENT = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0"
+)


-def _get_login_cookie():
-    raw = os.environ.get("WP_LOGIN_COOKIE", "").strip()  # strip accidental whitespace
-    if not raw:
-        raise RuntimeError(
-            "WP_LOGIN_COOKIE not set. Copy it from your browser into .env — see .env.example.")
-    name, _, value = raw.partition("=")
-    if not value:
-        raise RuntimeError(
-            "WP_LOGIN_COOKIE looks malformed (no '=' found). Expected: name=value")
-    if not name.startswith("wordpress_logged_in_"):
-        raise RuntimeError(
-            "WP_LOGIN_COOKIE doesn't look right — expected a wordpress_logged_in_... cookie.")
-    return name, value
+def _api_headers(base_url: str, cookie_name: str, cookie_value: str) -> dict[str, str]:
+    return {
+        "User-Agent": _USER_AGENT,
+        "Accept": "application/json",
+        "Referer": f"{base_url}/",
+        "Cookie": f"{cookie_name}={cookie_value}; eav-age-verified=1",
+    }


-def discover_content_types(session):
-    """Query /wp-json/wp/v2/types and return a list of (name, rest_base, type_slug) for content types worth scraping."""
-    r = session.get(f"{WP_API}/types", timeout=30)
+def _select_probe_url(video_map: dict[str, Any]) -> str | None:
+    """Pure function: return the first URL in video_map where expects_video() is True."""
+    return next((url for url in video_map if expects_video(url)), None)
+
+
+def _probe_cookie(name: str, value: str, site_key: str) -> bool:
+    """HEAD request to a members-only video page. Returns True if the cookie is still valid."""
+    video_map = load_video_map(site_key)
+    probe_url = _select_probe_url(video_map)
+    if probe_url is None:
+        return False  # no video URLs yet — can't validate, fall through to re-auth
+    r = requests.head(
+        probe_url,
+        headers={"Cookie": f"{name}={value}", "User-Agent": _USER_AGENT},
+        allow_redirects=False,
+        timeout=10,
+    )
+    return r.status_code == 200
+
+
+def _get_login_cookie(site_key: str, site_cfg: dict[str, str]) -> tuple[str, str]:
+    env_prefix = site_cfg["env_prefix"]
+    base_url = site_cfg["base_url"]
+    env_key = f"{env_prefix}_LOGIN_COOKIE"
+
+    username = os.environ.get(f"{env_prefix}_USERNAME", "").strip()
+    password = os.environ.get(f"{env_prefix}_PASSWORD", "").strip()
+    has_credentials = bool(username and password)
+
+    raw = os.environ.get(env_key, "").strip()
+    if raw:
+        name, _, value = raw.partition("=")
+        if value and name.startswith("wordpress_logged_in_"):
+            if not has_credentials:
+                return name, value  # cookie-only mode — trust it
+            print(f"[{site_key}] Cookie found — validating…")
+            if _probe_cookie(name, value, site_key):
+                print(f"[{site_key}] Cookie still valid — skipping login.")
+                return name, value
+            print(f"[{site_key}] Cookie expired — re-authenticating…")
+
+    if has_credentials:
+        cookie_name, cookie_value = login_and_get_cookie(username, password, base_url)
+        action = update_env(cookie_name, cookie_value, env_key=env_key)
+        print(f"[{site_key}] Logged in: {cookie_name} ({action} in .env)")
+        return cookie_name, cookie_value
+
+    raise RuntimeError(
+        f"No credentials or cookie found for {site_key}. Set either:\n"
+        f"  • {env_prefix}_USERNAME + {env_prefix}_PASSWORD  (recommended)\n"
+        f"  • {env_prefix}_LOGIN_COOKIE                      (fallback — may expire)\n"
+        "See .env.example."
+    )
+
+
+def _has_credentials(site_cfg: dict[str, str]) -> bool:
+    env_prefix = site_cfg["env_prefix"]
+    has_cookie = bool(os.environ.get(f"{env_prefix}_LOGIN_COOKIE", "").strip())
+    has_creds = bool(
+        os.environ.get(f"{env_prefix}_USERNAME", "").strip()
+        and os.environ.get(f"{env_prefix}_PASSWORD", "").strip()
+    )
+    return has_cookie or has_creds
+
+
+def discover_content_types(
+    session: requests.Session, wp_api: str
+) -> list[tuple[str, str, str]]:
+    """Query /wp-json/wp/v2/types and return a list of (name, rest_base, type_slug)."""
+    r = session.get(f"{wp_api}/types", timeout=30)
    r.raise_for_status()
    types = r.json()

@@ -69,16 +140,22 @@ def discover_content_types(session):
    return targets


-def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
-    """Paginate one content type and return (url, title, description) tuples.
-    Uses the `link` field when available; falls back to building from slug."""
+def fetch_all_posts_for_type(
+    session: requests.Session,
+    wp_api: str,
+    base_url: str,
+    type_name: str,
+    rest_base: str,
+    type_slug: str,
+) -> list[tuple[str, str, str]]:
+    """Paginate one content type and return (url, title, description) tuples."""
    url_prefix = type_slug.replace("_", "-")
    results = []
    page = 1

    while True:
        r = session.get(
-            f"{WP_API}/{rest_base}",
+            f"{wp_api}/{rest_base}",
            params={"per_page": 100, "page": page},
            timeout=30,
        )
@@ -92,15 +169,19 @@ def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
            if not link.startswith("http"):
                slug = post.get("slug")
                if slug:
-                    link = f"{BASE_URL}/{url_prefix}/{slug}/"
+                    link = f"{base_url}/{url_prefix}/{slug}/"
                else:
                    continue
            title_obj = post.get("title", {})
-            title = title_obj.get("rendered", "") if isinstance(
-                title_obj, dict) else str(title_obj)
+            title = (
+                title_obj.get("rendered", "")
+                if isinstance(title_obj, dict)
+                else str(title_obj)
+            )
            content_obj = post.get("content", {})
-            content_html = content_obj.get(
-                "rendered", "") if isinstance(content_obj, dict) else ""
+            content_html = (
+                content_obj.get("rendered", "") if isinstance(content_obj, dict) else ""
+            )
            description = html_to_text(content_html) if content_html else ""
            results.append((link, title, description))
        print(f"    {type_name} page {page}: {len(data)} items")
@@ -109,66 +190,88 @@ def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
    return results


-def fetch_post_urls_from_api(headers):
+def fetch_post_urls_from_api(
+    site_key: str,
+    base_url: str,
+    wp_api: str,
+    headers: dict[str, str],
+) -> list[str]:
    """Auto-discover all content types via the WP REST API and collect every post URL.
-    Also builds video_map.json with titles pre-populated."""
-    print("[+] video_map.json empty or missing — discovering content types from REST API…")
+    Also pre-populates video_map.json with titles."""
+    print(f"[{site_key}] video_map empty — discovering content types from REST API…")
    session = requests.Session()
    session.headers.update(headers)

-    targets = discover_content_types(session)
+    targets = discover_content_types(session, wp_api)
    print(
-        f"[+] Found {len(targets)} content types: {', '.join(name for name, _, _ in targets)}\n")
+        f"[{site_key}] Found {len(targets)} content types: "
+        f"{', '.join(name for name, _, _ in targets)}\n"
+    )

    all_results = []
    for type_name, rest_base, type_slug in targets:
        type_results = fetch_all_posts_for_type(
-            session, type_name, rest_base, type_slug)
+            session, wp_api, base_url, type_name, rest_base, type_slug
+        )
        all_results.extend(type_results)

-    seen = set()
+    seen: set[str] = set()
    deduped_urls = []
-    video_map = load_video_map()
+    video_map = load_video_map(site_key)

    for url, title, description in all_results:
        if url not in seen and url.startswith("http"):
            seen.add(url)
            deduped_urls.append(url)
            if url not in video_map:
-                video_map[url] = {"title": title,
-                                  "description": description, "videos": []}
+                video_map[url] = {
+                    "title": title,
+                    "description": description,
+                    "videos": [],
+                }
            else:
                if not video_map[url].get("title"):
                    video_map[url]["title"] = title
                if not video_map[url].get("description"):
                    video_map[url]["description"] = description

-    save_video_map(video_map)
+    save_video_map(video_map, site_key)
    print(
-        f"\n[+] Discovered {len(deduped_urls)} unique URLs → saved to {VIDEO_MAP_FILE}")
-    print(
-        f"[+] Pre-populated {len(video_map)} entries in {VIDEO_MAP_FILE}")
+        f"\n[{site_key}] Discovered {len(deduped_urls)} unique URLs → saved to video_map.json"
+    )
+    print(f"[{site_key}] Pre-populated {len(video_map)} entries")
    return deduped_urls


-def fetch_metadata_from_api(video_map, urls, headers):
+def fetch_metadata_from_api(
+    site_key: str,
+    base_url: str,
+    wp_api: str,
+    video_map: dict[str, Any],
+    urls: list[str],
+    headers: dict[str, str],
+) -> None:
    """Populate missing titles and descriptions in video_map from the REST API."""
-    missing = [u for u in urls
-               if u not in video_map
-               or not video_map[u].get("title")
-               or not video_map[u].get("description")]
+    missing = [
+        u
+        for u in urls
+        if u not in video_map
+        or not video_map[u].get("title")
+        or not video_map[u].get("description")
+    ]
    if not missing:
        return

-    print(f"[+] Fetching metadata from REST API for {len(missing)} posts…")
+    print(f"[{site_key}] Fetching metadata from REST API for {len(missing)} posts…")
    session = requests.Session()
    session.headers.update(headers)

-    targets = discover_content_types(session)
+    targets = discover_content_types(session, wp_api)

    for type_name, rest_base, type_slug in targets:
        type_results = fetch_all_posts_for_type(
-            session, type_name, rest_base, type_slug)
+            session, wp_api, base_url, type_name, rest_base, type_slug
+        )
        for url, title, description in type_results:
            if url in video_map:
                if not video_map[url].get("title"):
@@ -176,93 +279,90 @@ def fetch_metadata_from_api(video_map, urls, headers):
                if not video_map[url].get("description"):
                    video_map[url]["description"] = description
            else:
-                video_map[url] = {"title": title,
-                                  "description": description, "videos": []}
+                video_map[url] = {
+                    "title": title,
+                    "description": description,
+                    "videos": [],
+                }

-    save_video_map(video_map)
+    save_video_map(video_map, site_key)
    populated_t = sum(1 for u in urls if video_map.get(u, {}).get("title"))
-    populated_d = sum(1 for u in urls if video_map.get(
-        u, {}).get("description"))
-    print(f"[+] Titles populated: {populated_t}/{len(urls)}")
-    print(f"[+] Descriptions populated: {populated_d}/{len(urls)}")
+    populated_d = sum(1 for u in urls if video_map.get(u, {}).get("description"))
+    print(f"[{site_key}] Titles populated: {populated_t}/{len(urls)}")
+    print(f"[{site_key}] Descriptions populated: {populated_d}/{len(urls)}")


-def load_post_urls(headers):
-    vm = load_video_map()
+def load_post_urls(
+    site_key: str,
+    base_url: str,
+    wp_api: str,
+    headers: dict[str, str],
+) -> list[str]:
+    vm = load_video_map(site_key)
    if vm:
-        print(f"[+] {VIDEO_MAP_FILE} found — loading {len(vm)} post URLs.")
+        print(f"[{site_key}] video_map found — loading {len(vm)} post URLs.")
        return list(vm.keys())
-    return fetch_post_urls_from_api(headers)
+    return fetch_post_urls_from_api(site_key, base_url, wp_api, headers)


-def html_to_text(html_str):
+def html_to_text(html_str: str) -> str:
    """Strip HTML tags, decode entities, and collapse whitespace into clean plain text."""
    import html
-    text = re.sub(r'<br\s*/?>', '\n', html_str)
-    text = text.replace('</p>', '\n\n')
-    text = re.sub(r'<[^>]+>', '', text)
+
+    text = re.sub(r"<br\s*/?>", "\n", html_str)
+    text = text.replace("</p>", "\n\n")
+    text = re.sub(r"<[^>]+>", "", text)
    text = html.unescape(text)
    lines = [line.strip() for line in text.splitlines()]
-    text = '\n'.join(lines)
-    text = re.sub(r'\n{3,}', '\n\n', text)
+    text = "\n".join(lines)
+    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


-def extract_mp4_from_html(html):
+def extract_mp4_from_html(html: str) -> list[str]:
    candidates = re.findall(r'https?://[^\s"\'<>]+', html)
    return [u for u in candidates if _is_video_url(u)]


-def extract_title_from_html(html):
-    m = re.search(
-        r'<h1[^>]*class="entry-title"[^>]*>(.*?)</h1>', html, re.DOTALL)
+def extract_title_from_html(html: str) -> str | None:
+    m = re.search(r'<h1[^>]*class="entry-title"[^>]*>(.*?)</h1>', html, re.DOTALL)
    if m:
-        title = re.sub(r'<[^>]+>', '', m.group(1)).strip()
+        title = re.sub(r"<[^>]+>", "", m.group(1)).strip()
        return title
-    m = re.search(r'<title>(.*?)(?:\s*[-–|].*)?</title>', html, re.DOTALL)
+    m = re.search(r"<title>(.*?)(?:\s*[-–|].*)?</title>", html, re.DOTALL)
    if m:
        return m.group(1).strip()
    return None


-def load_video_map():
-    if Path(VIDEO_MAP_FILE).exists():
-        try:
-            with open(VIDEO_MAP_FILE, encoding="utf-8") as f:
-                return json.load(f)
-        except (json.JSONDecodeError, OSError):
-            return {}
-    return {}
-
-
-def save_video_map(video_map):
-    fd, tmp_path = tempfile.mkstemp(dir=Path(VIDEO_MAP_FILE).resolve().parent, suffix=".tmp")
-    try:
-        with os.fdopen(fd, "w", encoding="utf-8") as f:
-            json.dump(video_map, f, indent=2, ensure_ascii=False)
-        Path(tmp_path).replace(VIDEO_MAP_FILE)
-    except Exception:
-        try:
-            Path(tmp_path).unlink()
-        except OSError:
-            pass
-        raise
-
-
-
-def _expects_video(url):
-    return "/pinkcuffs-videos/" in url
-
-
 MAX_RETRIES = 2


-async def worker(worker_id, queue, context, known,
-                 total, retry_counts, video_map, map_lock, shutdown_event):
+async def worker(
+    worker_id: int,
+    queue: asyncio.Queue[tuple[int, str]],
+    context: BrowserContext,
+    known: set[str],
+    total: int,
+    retry_counts: dict[int, int],
+    video_map: dict[str, Any],
+    map_lock: asyncio.Lock,
+    shutdown_event: asyncio.Event,
+    reauth_lock: asyncio.Lock,
+    reauth_done: list[bool],
+    site_key: str,
+    site_cfg: dict[str, str],
+) -> None:
+    base_url = site_cfg["base_url"]
+    cookie_domain = urlparse(base_url).hostname or site_cfg["cookie_domain"]
+    env_prefix = site_cfg["env_prefix"]
    page = await context.new_page()
-    video_hits = set()
+    video_hits: set[str] = set()

-    page.on("response", lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None)
+    page.on(
+        "response",
+        lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
+    )

    try:
        while not shutdown_event.is_set():
@@ -279,19 +379,69 @@ async def worker(worker_id, queue, context, known,
                await page.goto(url, wait_until="networkidle", timeout=60000)
            except Exception as e:
                print(f"[W{worker_id}] Navigation error: {e}")
-                if _expects_video(url) and attempt < MAX_RETRIES:
+                if expects_video(url) and attempt < MAX_RETRIES:
                    retry_counts[idx] = attempt + 1
                    queue.put_nowait((idx, url))
                    print(f"[W{worker_id}] Re-queued for retry.")
-                elif not _expects_video(url):
+                elif not expects_video(url):
                    async with map_lock:
                        entry = video_map.get(url, {})
                        entry["scraped_at"] = int(time.time())
                        video_map[url] = entry
-                        save_video_map(video_map)
+                        save_video_map(video_map, site_key)
                else:
                    print(
-                        f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run.")
+                        f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run."
+                    )
+                continue
+
+            if "NoDirectAccessAllowed" in page.url:
+                recovered = False
+                async with reauth_lock:
+                    if not reauth_done[0]:
+                        username = os.environ.get(f"{env_prefix}_USERNAME", "").strip()
+                        password = os.environ.get(f"{env_prefix}_PASSWORD", "").strip()
+                        if username and password:
+                            print(f"[W{worker_id}] Cookie expired — re-authenticating…")
+                            try:
+                                new_name, new_value = await asyncio.to_thread(
+                                    login_and_get_cookie, username, password, base_url
+                                )
+                                update_env(
+                                    new_name,
+                                    new_value,
+                                    env_key=f"{env_prefix}_LOGIN_COOKIE",
+                                )
+                                await context.add_cookies(
+                                    [
+                                        {
+                                            "name": new_name,
+                                            "value": new_value,
+                                            "domain": cookie_domain,
+                                            "path": "/",
+                                            "httpOnly": True,
+                                            "secure": True,
+                                            "sameSite": "None",
+                                        }
+                                    ]
+                                )
+                                reauth_done[0] = True
+                                recovered = True
+                                print(f"[W{worker_id}] Re-auth succeeded — re-queuing.")
+                            except Exception as e:
+                                print(f"[W{worker_id}] Re-auth failed: {e}")
+                                shutdown_event.set()
+                        else:
+                            print(
+                                f"[W{worker_id}] Cookie expired. "
+                                f"Set {env_prefix}_USERNAME + {env_prefix}_PASSWORD "
+                                "in .env for auto re-auth."
+                            )
+                            shutdown_event.set()
+                    else:
+                        recovered = True  # another worker already re-authed
+                if recovered:
+                    queue.put_nowait((idx, url))
                continue

            await asyncio.sleep(1.5)
@@ -301,9 +451,15 @@ async def worker(worker_id, queue, context, known,
            found = set(html_videos) | set(video_hits)
            video_hits.clear()

-            all_videos = [m for m in found if m not in (
-                f"{BASE_URL}/wp-content/plugins/easy-video-player/lib/blank.mp4",
-            )]
+            all_videos = [
+                m
+                for m in found
+                if is_valid_url(m)
+                and m
+                not in (
+                    f"{base_url}/wp-content/plugins/easy-video-player/lib/blank.mp4",
+                )
+            ]

            async with map_lock:
                new_found = found - known
@@ -312,7 +468,8 @@ async def worker(worker_id, queue, context, known,
                    known.update(new_found)
                elif all_videos:
                    print(
-                        f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write.")
+                        f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write."
+                    )
                else:
                    print(f"[W{worker_id}] No video found on page.")

@@ -322,30 +479,159 @@ async def worker(worker_id, queue, context, known,
                existing_videos = set(entry.get("videos", []))
                existing_videos.update(all_videos)
                entry["videos"] = sorted(existing_videos)
-                mark_done = bool(all_videos) or not _expects_video(url)
+                mark_done = bool(all_videos) or not expects_video(url)
                if mark_done:
                    entry["scraped_at"] = int(time.time())
                video_map[url] = entry
-                save_video_map(video_map)
+                save_video_map(video_map, site_key)

            if not mark_done:
                if attempt < MAX_RETRIES:
                    retry_counts[idx] = attempt + 1
                    queue.put_nowait((idx, url))
                    print(
-                        f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES}).")
+                        f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES})."
+                    )
                else:
                    print(
-                        f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run.")
+                        f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run."
+                    )
    finally:
        await page.close()


-async def run():
+async def run_for_site(
+    site_key: str,
+    site_cfg: dict[str, str],
+    shutdown_event: asyncio.Event,
+) -> None:
+    base_url = site_cfg["base_url"]
+    cookie_domain = urlparse(base_url).hostname or site_cfg["cookie_domain"]
+    wp_api = f"{base_url}/wp-json/wp/v2"
+
+    cookie_name, cookie_value = _get_login_cookie(site_key, site_cfg)
+    req_headers = _api_headers(base_url, cookie_name, cookie_value)
+
+    urls = load_post_urls(site_key, base_url, wp_api, req_headers)
+
+    video_map = load_video_map(site_key)
+    if any(
+        u not in video_map
+        or not video_map[u].get("title")
+        or not video_map[u].get("description")
+        for u in urls
+        if expects_video(u)
+    ):
+        fetch_metadata_from_api(
+            site_key, base_url, wp_api, video_map, urls, req_headers
+        )
+
+    known = {u for entry in video_map.values() for u in entry.get("videos", [])}
+
+    total = len(urls)
+    pending = []
+    needs_map = 0
+    for i, u in enumerate(urls):
+        entry = video_map.get(u, {})
+        if not entry.get("scraped_at"):
+            pending.append((i, u))
+        elif expects_video(u) and not entry.get("videos"):
+            pending.append((i, u))
+            needs_map += 1
+
+    done_count = sum(1 for v in video_map.values() if v.get("scraped_at"))
+    print(f"[{site_key}] Loaded {total} post URLs.")
+    print(f"[{site_key}] Already have {len(known)} video URLs mapped.")
+    print(f"[{site_key}] Video map: {len(video_map)} entries in video_map.json")
+    if done_count:
+        remaining_new = len(pending) - needs_map
+        print(
+            f"[{site_key}] Resuming: {done_count} done, "
+            f"{remaining_new} new + {needs_map} needing map data."
+        )
+    if not pending:
+        print(f"[{site_key}] All URLs already processed and mapped.")
+        return
+
+    print(
+        f"[{site_key}] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n"
+    )
+
+    queue: asyncio.Queue[tuple[int, str]] = asyncio.Queue()
+    for item in pending:
+        queue.put_nowait(item)
+
+    map_lock = asyncio.Lock()
+    reauth_lock = asyncio.Lock()
+    reauth_done: list[bool] = [False]
+    retry_counts: dict[int, int] = {}
+
+    async with async_playwright() as p:
+        browser = await p.firefox.launch(headless=True)
+        context = await browser.new_context()
+
+        site_cookies = [
+            {
+                "name": cookie_name,
+                "value": cookie_value,
+                "domain": cookie_domain,
+                "path": "/",
+                "httpOnly": True,
+                "secure": True,
+                "sameSite": "None",
+            },
+            {
+                "name": "eav-age-verified",
+                "value": "1",
+                "domain": cookie_domain,
+                "path": "/",
+            },
+        ]
+
+        await context.add_cookies(site_cookies)  # type: ignore[arg-type]
+
+        num_workers = min(MAX_WORKERS, len(pending))
+        workers = [
+            asyncio.create_task(
+                worker(
+                    i,
+                    queue,
+                    context,
+                    known,
+                    total,
+                    retry_counts,
+                    video_map,
+                    map_lock,
+                    shutdown_event,
+                    reauth_lock,
+                    reauth_done,
+                    site_key,
+                    site_cfg,
+                )
+            )
+            for i in range(num_workers)
+        ]
+
+        await asyncio.gather(*workers)
+        await browser.close()
+
+    mapped = sum(1 for v in video_map.values() if v.get("videos"))
+    print(
+        f"\n[{site_key}] Video map: {mapped} posts with videos, {len(video_map)} total entries."
+    )
+
+    if not shutdown_event.is_set():
+        print(f"[{site_key}] Completed. Full map in video_map.json")
+    else:
+        done = sum(1 for v in video_map.values() if v.get("scraped_at"))
+        print(f"[{site_key}] Paused — {done}/{total} done. Run again to resume.")
+
+
+async def run(selected_sites: list[str], explicit: bool) -> None:
    shutdown_event = asyncio.Event()
    loop = asyncio.get_running_loop()

-    def _handle_shutdown(signum, _frame):
+    def _handle_shutdown(signum: int, _: object) -> None:
        print(f"\n[!] Signal {signum} received — finishing active pages then exiting…")
        loop.call_soon_threadsafe(shutdown_event.set)

@@ -353,110 +639,42 @@ async def run():
    signal.signal(signal.SIGTERM, _handle_shutdown)

    try:
-        cookie_name, cookie_value = _get_login_cookie()
-        req_headers = {
-            **API_HEADERS,
-            "Cookie": f"{cookie_name}={cookie_value}; eav-age-verified=1",
-        }
-
-        urls = load_post_urls(req_headers)
-
-        video_map = load_video_map()
-        if any(u not in video_map
-               or not video_map[u].get("title")
-               or not video_map[u].get("description")
-               for u in urls if _expects_video(u)):
-            fetch_metadata_from_api(video_map, urls, req_headers)
-
-        known = {u for entry in video_map.values() for u in entry.get("videos", [])}
-
-        total = len(urls)
-        pending = []
-        needs_map = 0
-        for i, u in enumerate(urls):
-            entry = video_map.get(u, {})
-            if not entry.get("scraped_at"):
-                pending.append((i, u))
-            elif _expects_video(u) and not entry.get("videos"):
-                pending.append((i, u))
-                needs_map += 1
-
-        done_count = sum(1 for v in video_map.values() if v.get("scraped_at"))
-        print(f"[+] Loaded {total} post URLs.")
-        print(f"[+] Already have {len(known)} video URLs mapped.")
-        print(f"[+] Video map: {len(video_map)} entries in {VIDEO_MAP_FILE}")
-        if done_count:
-            remaining_new = len(pending) - needs_map
-            print(
-                f"[↻] Resuming: {done_count} done, {remaining_new} new + {needs_map} needing map data.")
-        if not pending:
-            print("[✓] All URLs already processed and mapped.")
-            return
-
-        print(
-            f"[⚡] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n")
-
-        queue = asyncio.Queue()
-        for item in pending:
-            queue.put_nowait(item)
-
-        map_lock = asyncio.Lock()
-        retry_counts = {}
-
-        async with async_playwright() as p:
-            browser = await p.firefox.launch(headless=True)
-            context = await browser.new_context()
-
-            _cookie_domain = urlparse(BASE_URL).netloc
-            site_cookies = [
-                {
-                    "name": cookie_name,
-                    "value": cookie_value,
-                    "domain": _cookie_domain,
-                    "path": "/",
-                    "httpOnly": True,
-                    "secure": True,
-                    "sameSite": "None"
-                },
-                {
-                    "name": "eav-age-verified",
-                    "value": "1",
-                    "domain": _cookie_domain,
-                    "path": "/"
-                }
-            ]
-
-            await context.add_cookies(site_cookies)
-
-            num_workers = min(MAX_WORKERS, len(pending))
-            workers = [
-                asyncio.create_task(
-                    worker(i, queue, context, known,
-                           total, retry_counts, video_map, map_lock, shutdown_event)
-                )
-                for i in range(num_workers)
-            ]
-
-            await asyncio.gather(*workers)
-            await browser.close()
-
-        mapped = sum(1 for v in video_map.values() if v.get("videos"))
-        print(
-            f"\n[+] Video map: {mapped} posts with videos, {len(video_map)} total entries.")
-
-        if not shutdown_event.is_set():
-            print(f"[✓] Completed. Full map in {VIDEO_MAP_FILE}")
-        else:
-            done = sum(1 for v in video_map.values() if v.get("scraped_at"))
-            print(f"[⏸] Paused — {done}/{total} done. Run again to resume.")
+        for site_key in selected_sites:
+            if shutdown_event.is_set():
+                break
+            site_cfg = SITES[site_key]
+            if not _has_credentials(site_cfg):
+                if explicit:
+                    raise RuntimeError(
+                        f"No credentials or cookie found for {site_key}. See .env.example."
+                    )
+                print(f"[{site_key}] No credentials found — skipping.")
+                continue
+            print(f"\n{'=' * 60}")
+            print(f"  Site: {site_key}  ({site_cfg['base_url']})")
+            print(f"{'=' * 60}\n")
+            await run_for_site(site_key, site_cfg, shutdown_event)
    finally:
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)


-def main():
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Scrape video URLs from member sites")
+    parser.add_argument(
+        "--site",
+        action="append",
+        choices=list(SITES.keys()),
+        dest="sites",
+        metavar="SITE",
+        help=f"Site(s) to scrape (default: all). Can be repeated. Choices: {', '.join(SITES)}",
+    )
+    args = parser.parse_args()
+    explicit = bool(args.sites)
+    selected = args.sites or list(SITES.keys())
+
    try:
-        asyncio.run(run())
+        asyncio.run(run(selected, explicit))
    except KeyboardInterrupt:
        print("\n[!] Interrupted. Run again to resume.")
    except RuntimeError as e: