Cookie validation logic

This commit is contained in:
HugeFrog24
2026-02-28 21:37:39 +01:00
parent 80444405e9
commit e3e14293cd
15 changed files with 802 additions and 398 deletions

354
main.py
View File

@@ -7,27 +7,36 @@ import asyncio
import tempfile
import requests
from pathlib import Path, PurePosixPath
from typing import Any, Optional
from urllib.parse import urlparse
from dotenv import load_dotenv
from playwright.async_api import async_playwright
from check_clashes import VIDEO_EXTS
from playwright.async_api import async_playwright, BrowserContext
from check_clashes import VIDEO_EXTS, load_video_map, is_valid_url, VIDEO_MAP_FILE, expects_video
from config import BASE_URL
from grab_cookie import login_and_get_cookie, update_env
load_dotenv()
def _is_video_url(url: str) -> bool:
    """True if `url` ends with a recognised video extension (case-insensitive, path only).

    Only the URL path is inspected, so query strings and fragments never
    produce false positives.
    """
    path_suffix = PurePosixPath(urlparse(url).path).suffix
    return path_suffix.lower() in VIDEO_EXTS
WP_API = f"{BASE_URL}/wp-json/wp/v2"
SKIP_TYPES = {
"attachment", "nav_menu_item", "wp_block", "wp_template",
"wp_template_part", "wp_global_styles", "wp_navigation",
"wp_font_family", "wp_font_face",
"attachment",
"nav_menu_item",
"wp_block",
"wp_template",
"wp_template_part",
"wp_global_styles",
"wp_navigation",
"wp_font_family",
"wp_font_face",
}
VIDEO_MAP_FILE = "video_map.json"
MAX_WORKERS = 4
API_HEADERS = {
@@ -37,22 +46,53 @@ API_HEADERS = {
}
def _probe_cookie(name: str, value: str) -> bool:
    """HEAD request to a members-only video page. Returns True if the cookie is still valid.

    Returns False when no video URL is known yet, when the probe request fails
    at the network level, or when the server does not answer 200 — in every
    case the caller falls through to re-authentication instead of crashing.
    """
    video_map = load_video_map()
    probe_url = next((url for url in video_map if expects_video(url)), None)
    if probe_url is None:
        return False  # no video URLs yet — can't validate, fall through to re-auth
    try:
        r = requests.head(
            probe_url,
            headers={"Cookie": f"{name}={value}", "User-Agent": API_HEADERS["User-Agent"]},
            allow_redirects=False,
            timeout=10,
        )
    except requests.RequestException:
        # Network hiccup (timeout, DNS, reset) — treat the cookie as invalid
        # rather than aborting the whole run before re-auth gets a chance.
        return False
    return r.status_code == 200


def _get_login_cookie() -> tuple[str, str]:
    """Resolve a valid wordpress_logged_in_* cookie as a (name, value) pair.

    Preference order:
      1. WP_LOGIN_COOKIE from the environment — trusted as-is in cookie-only
         mode, validated via _probe_cookie when credentials are also present.
      2. Fresh login via WP_USERNAME/WP_PASSWORD; the new cookie is persisted
         to .env through update_env().

    Raises:
        RuntimeError: when neither a usable cookie nor credentials are set.
    """
    username = os.environ.get("WP_USERNAME", "").strip()
    password = os.environ.get("WP_PASSWORD", "").strip()
    has_credentials = bool(username and password)

    raw = os.environ.get("WP_LOGIN_COOKIE", "").strip()  # strip accidental whitespace
    if raw:
        name, _, value = raw.partition("=")
        if value and name.startswith("wordpress_logged_in_"):
            if not has_credentials:
                return name, value  # cookie-only mode — trust it
            print("[+] Cookie found — validating…")
            if _probe_cookie(name, value):
                print("[✓] Cookie still valid — skipping login.")
                return name, value
            print("[!] Cookie expired — re-authenticating…")
        else:
            # A malformed cookie used to be a hard error; warn instead of
            # silently ignoring it so a typo in .env is still noticed.
            print(
                "[!] WP_LOGIN_COOKIE looks malformed "
                "(expected wordpress_logged_in_...=value) — ignoring it."
            )

    if has_credentials:
        cookie_name, cookie_value = login_and_get_cookie(username, password)
        action = update_env(cookie_name, cookie_value)
        print(f"[✓] Logged in: {cookie_name} ({action} in .env)")
        return cookie_name, cookie_value

    raise RuntimeError(
        "No credentials or cookie found. Set either:\n"
        " • WP_USERNAME + WP_PASSWORD (recommended — always gets a fresh cookie)\n"
        " • WP_LOGIN_COOKIE (fallback — may expire mid-run)\n"
        "See .env.example."
    )
def discover_content_types(session: requests.Session) -> list[tuple[str, str, str]]:
"""Query /wp-json/wp/v2/types and return a list of (name, rest_base, type_slug) for content types worth scraping."""
r = session.get(f"{WP_API}/types", timeout=30)
r.raise_for_status()
@@ -69,7 +109,12 @@ def discover_content_types(session):
return targets
def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
def fetch_all_posts_for_type(
session: requests.Session,
type_name: str,
rest_base: str,
type_slug: str,
) -> list[tuple[str, str, str]]:
"""Paginate one content type and return (url, title, description) tuples.
Uses the `link` field when available; falls back to building from slug."""
url_prefix = type_slug.replace("_", "-")
@@ -96,11 +141,15 @@ def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
else:
continue
title_obj = post.get("title", {})
title = title_obj.get("rendered", "") if isinstance(
title_obj, dict) else str(title_obj)
title = (
title_obj.get("rendered", "")
if isinstance(title_obj, dict)
else str(title_obj)
)
content_obj = post.get("content", {})
content_html = content_obj.get(
"rendered", "") if isinstance(content_obj, dict) else ""
content_html = (
content_obj.get("rendered", "") if isinstance(content_obj, dict) else ""
)
description = html_to_text(content_html) if content_html else ""
results.append((link, title, description))
print(f" {type_name} page {page}: {len(data)} items")
@@ -109,21 +158,25 @@ def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
return results
def fetch_post_urls_from_api(headers):
def fetch_post_urls_from_api(headers: dict[str, str]) -> list[str]:
"""Auto-discover all content types via the WP REST API and collect every post URL.
Also builds video_map.json with titles pre-populated."""
print("[+] video_map.json empty or missing — discovering content types from REST API…")
print(
"[+] video_map.json empty or missing — discovering content types from REST API…"
)
session = requests.Session()
session.headers.update(headers)
targets = discover_content_types(session)
print(
f"[+] Found {len(targets)} content types: {', '.join(name for name, _, _ in targets)}\n")
f"[+] Found {len(targets)} content types: {', '.join(name for name, _, _ in targets)}\n"
)
all_results = []
for type_name, rest_base, type_slug in targets:
type_results = fetch_all_posts_for_type(
session, type_name, rest_base, type_slug)
session, type_name, rest_base, type_slug
)
all_results.extend(type_results)
seen = set()
@@ -135,8 +188,11 @@ def fetch_post_urls_from_api(headers):
seen.add(url)
deduped_urls.append(url)
if url not in video_map:
video_map[url] = {"title": title,
"description": description, "videos": []}
video_map[url] = {
"title": title,
"description": description,
"videos": [],
}
else:
if not video_map[url].get("title"):
video_map[url]["title"] = title
@@ -145,18 +201,25 @@ def fetch_post_urls_from_api(headers):
save_video_map(video_map)
print(
f"\n[+] Discovered {len(deduped_urls)} unique URLs → saved to {VIDEO_MAP_FILE}")
print(
f"[+] Pre-populated {len(video_map)} entries in {VIDEO_MAP_FILE}")
f"\n[+] Discovered {len(deduped_urls)} unique URLs → saved to {VIDEO_MAP_FILE}"
)
print(f"[+] Pre-populated {len(video_map)} entries in {VIDEO_MAP_FILE}")
return deduped_urls
def fetch_metadata_from_api(video_map, urls, headers):
def fetch_metadata_from_api(
video_map: dict[str, Any],
urls: list[str],
headers: dict[str, str],
) -> None:
"""Populate missing titles and descriptions in video_map from the REST API."""
missing = [u for u in urls
if u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")]
missing = [
u
for u in urls
if u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")
]
if not missing:
return
@@ -168,7 +231,8 @@ def fetch_metadata_from_api(video_map, urls, headers):
for type_name, rest_base, type_slug in targets:
type_results = fetch_all_posts_for_type(
session, type_name, rest_base, type_slug)
session, type_name, rest_base, type_slug
)
for url, title, description in type_results:
if url in video_map:
if not video_map[url].get("title"):
@@ -176,18 +240,20 @@ def fetch_metadata_from_api(video_map, urls, headers):
if not video_map[url].get("description"):
video_map[url]["description"] = description
else:
video_map[url] = {"title": title,
"description": description, "videos": []}
video_map[url] = {
"title": title,
"description": description,
"videos": [],
}
save_video_map(video_map)
populated_t = sum(1 for u in urls if video_map.get(u, {}).get("title"))
populated_d = sum(1 for u in urls if video_map.get(
u, {}).get("description"))
populated_d = sum(1 for u in urls if video_map.get(u, {}).get("description"))
print(f"[+] Titles populated: {populated_t}/{len(urls)}")
print(f"[+] Descriptions populated: {populated_d}/{len(urls)}")
def load_post_urls(headers):
def load_post_urls(headers: dict[str, str]) -> list[str]:
vm = load_video_map()
if vm:
print(f"[+] {VIDEO_MAP_FILE} found — loading {len(vm)} post URLs.")
@@ -195,48 +261,40 @@ def load_post_urls(headers):
return fetch_post_urls_from_api(headers)
def html_to_text(html_str: str) -> str:
    """Strip HTML tags, decode entities, and collapse whitespace into clean plain text."""
    import html

    # Turn structural tags into newlines before any markup is removed.
    text = re.sub(r"<br\s*/?>", "\n", html_str).replace("</p>", "\n\n")
    # Drop remaining tags, then decode entities (&amp; → &, etc.).
    text = html.unescape(re.sub(r"<[^>]+>", "", text))
    # Trim every line, then squeeze runs of 3+ newlines down to one blank line.
    text = "\n".join(line.strip() for line in text.splitlines())
    return re.sub(r"\n{3,}", "\n\n", text).strip()
def extract_mp4_from_html(html: str) -> list[str]:
    """Return every absolute http(s) URL in `html` whose path has a known video extension."""
    url_pattern = r'https?://[^\s"\'<>]+'
    return [candidate for candidate in re.findall(url_pattern, html) if _is_video_url(candidate)]
def extract_title_from_html(html: str) -> Optional[str]:
    """Best-effort page title: prefer the entry-title <h1>, fall back to <title>.

    The <title> fallback trims a trailing ` - …` / ` | …` site-name suffix.
    Returns None when neither element is present.
    """
    h1_match = re.search(r'<h1[^>]*class="entry-title"[^>]*>(.*?)</h1>', html, re.DOTALL)
    if h1_match is not None:
        # The heading may contain inline markup — strip it before returning.
        return re.sub(r"<[^>]+>", "", h1_match.group(1)).strip()
    title_match = re.search(r"<title>(.*?)(?:\s*[-|].*)?</title>", html, re.DOTALL)
    return title_match.group(1).strip() if title_match else None
def load_video_map():
    """Read the video map JSON file; return {} when it is absent, unreadable, or corrupt."""
    if not Path(VIDEO_MAP_FILE).exists():
        return {}
    try:
        with open(VIDEO_MAP_FILE, encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError):
        # A damaged map is treated as empty rather than aborting the run.
        return {}
def save_video_map(video_map):
fd, tmp_path = tempfile.mkstemp(dir=Path(VIDEO_MAP_FILE).resolve().parent, suffix=".tmp")
def save_video_map(video_map: dict[str, Any]) -> None:
fd, tmp_path = tempfile.mkstemp(
dir=Path(VIDEO_MAP_FILE).resolve().parent, suffix=".tmp"
)
try:
with os.fdopen(fd, "w", encoding="utf-8") as f:
json.dump(video_map, f, indent=2, ensure_ascii=False)
@@ -250,19 +308,30 @@ def save_video_map(video_map):
def _expects_video(url):
    """True when `url` lies under the members-only /pinkcuffs-videos/ section."""
    return url.find("/pinkcuffs-videos/") != -1


# How many times a page that should have a video is re-queued before giving up.
MAX_RETRIES = 2
async def worker(worker_id, queue, context, known,
total, retry_counts, video_map, map_lock, shutdown_event):
async def worker(
worker_id: int,
queue: asyncio.Queue[tuple[int, str]],
context: BrowserContext,
known: set[str],
total: int,
retry_counts: dict[int, int],
video_map: dict[str, Any],
map_lock: asyncio.Lock,
shutdown_event: asyncio.Event,
reauth_lock: asyncio.Lock,
reauth_done: list[bool],
cookie_domain: str,
) -> None:
page = await context.new_page()
video_hits = set()
page.on("response", lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None)
page.on(
"response",
lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
)
try:
while not shutdown_event.is_set():
@@ -279,11 +348,11 @@ async def worker(worker_id, queue, context, known,
await page.goto(url, wait_until="networkidle", timeout=60000)
except Exception as e:
print(f"[W{worker_id}] Navigation error: {e}")
if _expects_video(url) and attempt < MAX_RETRIES:
if expects_video(url) and attempt < MAX_RETRIES:
retry_counts[idx] = attempt + 1
queue.put_nowait((idx, url))
print(f"[W{worker_id}] Re-queued for retry.")
elif not _expects_video(url):
elif not expects_video(url):
async with map_lock:
entry = video_map.get(url, {})
entry["scraped_at"] = int(time.time())
@@ -291,7 +360,48 @@ async def worker(worker_id, queue, context, known,
save_video_map(video_map)
else:
print(
f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run.")
f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run."
)
continue
if "NoDirectAccessAllowed" in page.url:
recovered = False
async with reauth_lock:
if not reauth_done[0]:
username = os.environ.get("WP_USERNAME", "").strip()
password = os.environ.get("WP_PASSWORD", "").strip()
if username and password:
print(f"[W{worker_id}] Cookie expired — re-authenticating…")
try:
new_name, new_value = await asyncio.to_thread(
login_and_get_cookie, username, password
)
update_env(new_name, new_value)
await context.add_cookies([{
"name": new_name,
"value": new_value,
"domain": cookie_domain,
"path": "/",
"httpOnly": True,
"secure": True,
"sameSite": "None",
}])
reauth_done[0] = True
recovered = True
print(f"[W{worker_id}] Re-auth succeeded — re-queuing.")
except Exception as e:
print(f"[W{worker_id}] Re-auth failed: {e}")
shutdown_event.set()
else:
print(
f"[W{worker_id}] Cookie expired. "
"Set WP_USERNAME + WP_PASSWORD in .env for auto re-auth."
)
shutdown_event.set()
else:
recovered = True # another worker already re-authed
if recovered:
queue.put_nowait((idx, url))
continue
await asyncio.sleep(1.5)
@@ -301,9 +411,15 @@ async def worker(worker_id, queue, context, known,
found = set(html_videos) | set(video_hits)
video_hits.clear()
all_videos = [m for m in found if m not in (
f"{BASE_URL}/wp-content/plugins/easy-video-player/lib/blank.mp4",
)]
all_videos = [
m
for m in found
if is_valid_url(m)
and m
not in (
f"{BASE_URL}/wp-content/plugins/easy-video-player/lib/blank.mp4",
)
]
async with map_lock:
new_found = found - known
@@ -312,7 +428,8 @@ async def worker(worker_id, queue, context, known,
known.update(new_found)
elif all_videos:
print(
f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write.")
f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write."
)
else:
print(f"[W{worker_id}] No video found on page.")
@@ -322,7 +439,7 @@ async def worker(worker_id, queue, context, known,
existing_videos = set(entry.get("videos", []))
existing_videos.update(all_videos)
entry["videos"] = sorted(existing_videos)
mark_done = bool(all_videos) or not _expects_video(url)
mark_done = bool(all_videos) or not expects_video(url)
if mark_done:
entry["scraped_at"] = int(time.time())
video_map[url] = entry
@@ -333,19 +450,21 @@ async def worker(worker_id, queue, context, known,
retry_counts[idx] = attempt + 1
queue.put_nowait((idx, url))
print(
f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES}).")
f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES})."
)
else:
print(
f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run.")
f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run."
)
finally:
await page.close()
async def run():
async def run() -> None:
shutdown_event = asyncio.Event()
loop = asyncio.get_running_loop()
def _handle_shutdown(signum, _frame):
def _handle_shutdown(signum: int, _frame: object) -> None:
print(f"\n[!] Signal {signum} received — finishing active pages then exiting…")
loop.call_soon_threadsafe(shutdown_event.set)
@@ -362,10 +481,13 @@ async def run():
urls = load_post_urls(req_headers)
video_map = load_video_map()
if any(u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")
for u in urls if _expects_video(u)):
if any(
u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")
for u in urls
if expects_video(u)
):
fetch_metadata_from_api(video_map, urls, req_headers)
known = {u for entry in video_map.values() for u in entry.get("videos", [])}
@@ -377,7 +499,7 @@ async def run():
entry = video_map.get(u, {})
if not entry.get("scraped_at"):
pending.append((i, u))
elif _expects_video(u) and not entry.get("videos"):
elif expects_video(u) and not entry.get("videos"):
pending.append((i, u))
needs_map += 1
@@ -388,26 +510,31 @@ async def run():
if done_count:
remaining_new = len(pending) - needs_map
print(
f"[↻] Resuming: {done_count} done, {remaining_new} new + {needs_map} needing map data.")
f"[↻] Resuming: {done_count} done, {remaining_new} new + {needs_map} needing map data."
)
if not pending:
print("[✓] All URLs already processed and mapped.")
return
print(
f"[⚡] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n")
f"[⚡] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n"
)
queue = asyncio.Queue()
queue: asyncio.Queue[tuple[int, str]] = asyncio.Queue()
for item in pending:
queue.put_nowait(item)
map_lock = asyncio.Lock()
retry_counts = {}
reauth_lock = asyncio.Lock()
reauth_done: list[bool] = [False]
retry_counts: dict[int, int] = {}
async with async_playwright() as p:
browser = await p.firefox.launch(headless=True)
context = await browser.new_context()
_cookie_domain = urlparse(BASE_URL).netloc
_parsed = urlparse(BASE_URL)
_cookie_domain = _parsed.hostname or _parsed.netloc
site_cookies = [
{
"name": cookie_name,
@@ -416,23 +543,35 @@ async def run():
"path": "/",
"httpOnly": True,
"secure": True,
"sameSite": "None"
"sameSite": "None",
},
{
"name": "eav-age-verified",
"value": "1",
"domain": _cookie_domain,
"path": "/"
}
"path": "/",
},
]
await context.add_cookies(site_cookies)
await context.add_cookies(site_cookies) # type: ignore[arg-type]
num_workers = min(MAX_WORKERS, len(pending))
workers = [
asyncio.create_task(
worker(i, queue, context, known,
total, retry_counts, video_map, map_lock, shutdown_event)
worker(
i,
queue,
context,
known,
total,
retry_counts,
video_map,
map_lock,
shutdown_event,
reauth_lock,
reauth_done,
_cookie_domain,
)
)
for i in range(num_workers)
]
@@ -442,7 +581,8 @@ async def run():
mapped = sum(1 for v in video_map.values() if v.get("videos"))
print(
f"\n[+] Video map: {mapped} posts with videos, {len(video_map)} total entries.")
f"\n[+] Video map: {mapped} posts with videos, {len(video_map)} total entries."
)
if not shutdown_event.is_set():
print(f"[✓] Completed. Full map in {VIDEO_MAP_FILE}")
@@ -454,7 +594,7 @@ async def run():
signal.signal(signal.SIGTERM, signal.SIG_DFL)
def main():
def main() -> None:
try:
asyncio.run(run())
except KeyboardInterrupt: