# jailbirdz-dl/main.py
import re
import json
import os
import time
import signal
import asyncio
import tempfile

import requests
from pathlib import Path, PurePosixPath
from typing import Any, Optional
from urllib.parse import urlparse
from dotenv import load_dotenv
from playwright.async_api import async_playwright, BrowserContext

from check_clashes import (
    VIDEO_EXTS, load_video_map, is_valid_url, VIDEO_MAP_FILE, expects_video,
)
from config import BASE_URL
from grab_cookie import login_and_get_cookie, update_env

load_dotenv()


def _is_video_url(url: str) -> bool:
    """True if `url` ends with a recognised video extension (case-insensitive, path only)."""
    return PurePosixPath(urlparse(url).path).suffix.lower() in VIDEO_EXTS
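
# Illustrative behaviour, assuming ".mp4" is among the extensions in VIDEO_EXTS
# (the URLs below are made-up examples):
#   _is_video_url("https://example.com/clips/intro.MP4?t=1")  -> True  (query ignored, case-folded)
#   _is_video_url("https://example.com/clips/intro")          -> False (no video suffix)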


WP_API = f"{BASE_URL}/wp-json/wp/v2"

SKIP_TYPES = {
    "attachment",
    "nav_menu_item",
    "wp_block",
    "wp_template",
    "wp_template_part",
    "wp_global_styles",
    "wp_navigation",
    "wp_font_family",
    "wp_font_face",
}

MAX_WORKERS = 4

API_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0",
    "Accept": "application/json",
    "Referer": f"{BASE_URL}/",
}


def _probe_cookie(name: str, value: str) -> bool:
    """HEAD request to a members-only video page. Returns True if the cookie is still valid."""
    video_map = load_video_map()
    probe_url = next((url for url in video_map if expects_video(url)), None)
    if probe_url is None:
        return False  # no video URLs yet — can't validate, fall through to re-auth
    try:
        r = requests.head(
            probe_url,
            headers={"Cookie": f"{name}={value}", "User-Agent": API_HEADERS["User-Agent"]},
            allow_redirects=False,
            timeout=10,
        )
    except requests.RequestException:
        return False  # network hiccup; treat as invalid and re-authenticate
    return r.status_code == 200
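
# The probe's assumption, spelled out: with allow_redirects=False a live cookie
# should get HTTP 200 straight from the members-only page, while an expired one
# typically gets a 3xx redirect to the login screen, so anything but 200 is
# treated as a stale cookie.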


def _get_login_cookie() -> tuple[str, str]:
    username = os.environ.get("WP_USERNAME", "").strip()
    password = os.environ.get("WP_PASSWORD", "").strip()
    has_credentials = bool(username and password)
    raw = os.environ.get("WP_LOGIN_COOKIE", "").strip()
    if raw:
        name, _, value = raw.partition("=")
        if value and name.startswith("wordpress_logged_in_"):
            if not has_credentials:
                return name, value  # cookie-only mode — trust it
            print("[+] Cookie found — validating…")
            if _probe_cookie(name, value):
                print("[✓] Cookie still valid — skipping login.")
                return name, value
            print("[!] Cookie expired — re-authenticating…")
    if has_credentials:
        cookie_name, cookie_value = login_and_get_cookie(username, password)
        action = update_env(cookie_name, cookie_value)
        print(f"[✓] Logged in: {cookie_name} ({action} in .env)")
        return cookie_name, cookie_value
    raise RuntimeError(
        "No credentials or cookie found. Set either:\n"
        " • WP_USERNAME + WP_PASSWORD (recommended — always gets a fresh cookie)\n"
        " • WP_LOGIN_COOKIE (fallback — may expire mid-run)\n"
        "See .env.example."
    )
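
# The .env entries this function reads (illustrative values; the real template
# lives in .env.example):
#   WP_USERNAME=someuser
#   WP_PASSWORD=s3cret
#   WP_LOGIN_COOKIE=wordpress_logged_in_<hash>=<cookie-value>   # optional fallback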


def discover_content_types(session: requests.Session) -> list[tuple[str, str, str]]:
    """Query /wp-json/wp/v2/types and return a list of (name, rest_base, type_slug) for content types worth scraping."""
    r = session.get(f"{WP_API}/types", timeout=30)
    r.raise_for_status()
    types = r.json()
    targets = []
    for type_slug, info in types.items():
        if type_slug in SKIP_TYPES:
            continue
        rest_base = info.get("rest_base")
        name = info.get("name", type_slug)
        if rest_base:
            targets.append((name, rest_base, type_slug))
    return targets
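
# For orientation, the /types payload this parses is keyed by type slug and
# looks roughly like (abridged; real responses carry many more fields):
#   {"post": {"name": "Posts", "rest_base": "posts", ...},
#    "page": {"name": "Pages", "rest_base": "pages", ...}}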


def fetch_all_posts_for_type(
    session: requests.Session,
    type_name: str,
    rest_base: str,
    type_slug: str,
) -> list[tuple[str, str, str]]:
    """Paginate one content type and return (url, title, description) tuples.
    Uses the `link` field when available; falls back to building from slug."""
    url_prefix = type_slug.replace("_", "-")
    results = []
    page = 1
    while True:
        r = session.get(
            f"{WP_API}/{rest_base}",
            params={"per_page": 100, "page": page},
            timeout=30,
        )
        if r.status_code == 400 or not r.ok:
            break
        data = r.json()
        if not data:
            break
        for post in data:
            link = post.get("link", "")
            if not link.startswith("http"):
                slug = post.get("slug")
                if slug:
                    link = f"{BASE_URL}/{url_prefix}/{slug}/"
                else:
                    continue
            title_obj = post.get("title", {})
            title = (
                title_obj.get("rendered", "")
                if isinstance(title_obj, dict)
                else str(title_obj)
            )
            content_obj = post.get("content", {})
            content_html = (
                content_obj.get("rendered", "") if isinstance(content_obj, dict) else ""
            )
            description = html_to_text(content_html) if content_html else ""
            results.append((link, title, description))
        print(f" {type_name} page {page}: {len(data)} items")
        page += 1
    return results
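
# Pagination contract the loop above relies on: WordPress caps per_page at 100
# and answers a page number past the last one with HTTP 400
# (rest_post_invalid_page_number), so the status check doubles as the
# end-of-results signal.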


def fetch_post_urls_from_api(headers: dict[str, str]) -> list[str]:
    """Auto-discover all content types via the WP REST API and collect every post URL.
    Also builds video_map.json with titles pre-populated."""
    print(
        "[+] video_map.json empty or missing — discovering content types from REST API…"
    )
    session = requests.Session()
    session.headers.update(headers)
    targets = discover_content_types(session)
    print(
        f"[+] Found {len(targets)} content types: {', '.join(name for name, _, _ in targets)}\n"
    )
    all_results = []
    for type_name, rest_base, type_slug in targets:
        type_results = fetch_all_posts_for_type(
            session, type_name, rest_base, type_slug
        )
        all_results.extend(type_results)
    seen = set()
    deduped_urls = []
    video_map = load_video_map()
    for url, title, description in all_results:
        if url not in seen and url.startswith("http"):
            seen.add(url)
            deduped_urls.append(url)
            if url not in video_map:
                video_map[url] = {
                    "title": title,
                    "description": description,
                    "videos": [],
                }
            else:
                if not video_map[url].get("title"):
                    video_map[url]["title"] = title
                if not video_map[url].get("description"):
                    video_map[url]["description"] = description
    save_video_map(video_map)
    print(
        f"\n[+] Discovered {len(deduped_urls)} unique URLs → saved to {VIDEO_MAP_FILE}"
    )
    print(f"[+] Pre-populated {len(video_map)} entries in {VIDEO_MAP_FILE}")
    return deduped_urls


def fetch_metadata_from_api(
    video_map: dict[str, Any],
    urls: list[str],
    headers: dict[str, str],
) -> None:
    """Populate missing titles and descriptions in video_map from the REST API."""
    missing = [
        u
        for u in urls
        if u not in video_map
        or not video_map[u].get("title")
        or not video_map[u].get("description")
    ]
    if not missing:
        return
    print(f"[+] Fetching metadata from REST API for {len(missing)} posts…")
    session = requests.Session()
    session.headers.update(headers)
    targets = discover_content_types(session)
    for type_name, rest_base, type_slug in targets:
        type_results = fetch_all_posts_for_type(
            session, type_name, rest_base, type_slug
        )
        for url, title, description in type_results:
            if url in video_map:
                if not video_map[url].get("title"):
                    video_map[url]["title"] = title
                if not video_map[url].get("description"):
                    video_map[url]["description"] = description
            else:
                video_map[url] = {
                    "title": title,
                    "description": description,
                    "videos": [],
                }
    save_video_map(video_map)
    populated_t = sum(1 for u in urls if video_map.get(u, {}).get("title"))
    populated_d = sum(1 for u in urls if video_map.get(u, {}).get("description"))
    print(f"[+] Titles populated: {populated_t}/{len(urls)}")
    print(f"[+] Descriptions populated: {populated_d}/{len(urls)}")


def load_post_urls(headers: dict[str, str]) -> list[str]:
    vm = load_video_map()
    if vm:
        print(f"[+] {VIDEO_MAP_FILE} found — loading {len(vm)} post URLs.")
        return list(vm.keys())
    return fetch_post_urls_from_api(headers)


def html_to_text(html_str: str) -> str:
    """Strip HTML tags, decode entities, and collapse whitespace into clean plain text."""
    import html

    text = re.sub(r"<br\s*/?>", "\n", html_str)
    text = text.replace("</p>", "\n\n")
    text = re.sub(r"<[^>]+>", "", text)
    text = html.unescape(text)
    lines = [line.strip() for line in text.splitlines()]
    text = "\n".join(lines)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
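
# Sketch of the conversion (doctest-style, hand-checked against the rules above):
#   html_to_text("<p>Tom &amp; Jerry</p><p>line one<br>line two</p>")
#   -> "Tom & Jerry\n\nline one\nline two"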


def extract_mp4_from_html(html: str) -> list[str]:
    candidates = re.findall(r'https?://[^\s"\'<>]+', html)
    return [u for u in candidates if _is_video_url(u)]
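
# e.g. extract_mp4_from_html('<source src="https://example.com/a.mp4">')
#   -> ["https://example.com/a.mp4"]
# Any absolute URL in the markup is a candidate; _is_video_url does the filtering.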


def extract_title_from_html(html: str) -> Optional[str]:
    m = re.search(r'<h1[^>]*class="entry-title"[^>]*>(.*?)</h1>', html, re.DOTALL)
    if m:
        title = re.sub(r"<[^>]+>", "", m.group(1)).strip()
        return title
    m = re.search(r"<title>(.*?)(?:\s*[-|].*)?</title>", html, re.DOTALL)
    if m:
        return m.group(1).strip()
    return None
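
# Precedence sketch: an <h1 class="entry-title"> wins; otherwise the <title> tag
# is used with any trailing " - Site" / " | Site" suffix trimmed, e.g.
#   "<title>My Post - Example Site</title>"  -> "My Post"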


def save_video_map(video_map: dict[str, Any]) -> None:
    fd, tmp_path = tempfile.mkstemp(
        dir=Path(VIDEO_MAP_FILE).resolve().parent, suffix=".tmp"
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(video_map, f, indent=2, ensure_ascii=False)
        Path(tmp_path).replace(VIDEO_MAP_FILE)
    except Exception:
        try:
            Path(tmp_path).unlink()
        except OSError:
            pass
        raise
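
# On-disk shape of video_map.json as written above (field set inferred from this
# file; entries may carry more keys):
#   {
#     "https://…/some-post/": {
#       "title": "Some post",
#       "description": "plain-text body",
#       "videos": ["https://…/clip.mp4"],
#       "scraped_at": 1700000000
#     }
#   }
# Writing to a temp file in the same directory and then Path.replace()-ing it in
# makes the update atomic: readers see the old file or the new one, never a
# half-written JSON.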

MAX_RETRIES = 2


async def worker(
    worker_id: int,
    queue: asyncio.Queue[tuple[int, str]],
    context: BrowserContext,
    known: set[str],
    total: int,
    retry_counts: dict[int, int],
    video_map: dict[str, Any],
    map_lock: asyncio.Lock,
    shutdown_event: asyncio.Event,
    reauth_lock: asyncio.Lock,
    reauth_done: list[bool],
    cookie_domain: str,
) -> None:
    page = await context.new_page()
    video_hits = set()
    page.on(
        "response",
        lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
    )
    try:
        while not shutdown_event.is_set():
            try:
                idx, url = queue.get_nowait()
            except asyncio.QueueEmpty:
                break
            attempt = retry_counts.get(idx, 0)
            label = f" (retry {attempt}/{MAX_RETRIES})" if attempt else ""
            print(f"[W{worker_id}] ({idx + 1}/{total}) {url}{label}")
            try:
                await page.goto(url, wait_until="networkidle", timeout=60000)
            except Exception as e:
                print(f"[W{worker_id}] Navigation error: {e}")
                if expects_video(url) and attempt < MAX_RETRIES:
                    retry_counts[idx] = attempt + 1
                    queue.put_nowait((idx, url))
                    print(f"[W{worker_id}] Re-queued for retry.")
                elif not expects_video(url):
                    async with map_lock:
                        entry = video_map.get(url, {})
                        entry["scraped_at"] = int(time.time())
                        video_map[url] = entry
                        save_video_map(video_map)
                else:
                    print(
                        f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run."
                    )
                continue
            if "NoDirectAccessAllowed" in page.url:
                recovered = False
                async with reauth_lock:
                    if not reauth_done[0]:
                        username = os.environ.get("WP_USERNAME", "").strip()
                        password = os.environ.get("WP_PASSWORD", "").strip()
                        if username and password:
                            print(f"[W{worker_id}] Cookie expired — re-authenticating…")
                            try:
                                new_name, new_value = await asyncio.to_thread(
                                    login_and_get_cookie, username, password
                                )
                                update_env(new_name, new_value)
                                await context.add_cookies([{
                                    "name": new_name,
                                    "value": new_value,
                                    "domain": cookie_domain,
                                    "path": "/",
                                    "httpOnly": True,
                                    "secure": True,
                                    "sameSite": "None",
                                }])
                                reauth_done[0] = True
                                recovered = True
                                print(f"[W{worker_id}] Re-auth succeeded — re-queuing.")
                            except Exception as e:
                                print(f"[W{worker_id}] Re-auth failed: {e}")
                                shutdown_event.set()
                        else:
                            print(
                                f"[W{worker_id}] Cookie expired. "
                                "Set WP_USERNAME + WP_PASSWORD in .env for auto re-auth."
                            )
                            shutdown_event.set()
                    else:
                        recovered = True  # another worker already re-authed
                if recovered:
                    queue.put_nowait((idx, url))
                continue
            await asyncio.sleep(1.5)
            html = await page.content()
            title = extract_title_from_html(html)
            html_videos = extract_mp4_from_html(html)
            found = set(html_videos) | set(video_hits)
            video_hits.clear()
            all_videos = [
                m
                for m in found
                if is_valid_url(m)
                and m
                not in (
                    f"{BASE_URL}/wp-content/plugins/easy-video-player/lib/blank.mp4",
                )
            ]
            async with map_lock:
                # Compare against the filtered list so the player's blank.mp4 and
                # invalid URLs are never reported (or remembered) as new finds.
                new_found = set(all_videos) - known
                if new_found:
                    print(f"[W{worker_id}] Found {len(new_found)} new video URLs")
                    known.update(new_found)
                elif all_videos:
                    print(
                        f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write."
                    )
                else:
                    print(f"[W{worker_id}] No video found on page.")
                entry = video_map.get(url, {})
                if title:
                    entry["title"] = title
                existing_videos = set(entry.get("videos", []))
                existing_videos.update(all_videos)
                entry["videos"] = sorted(existing_videos)
                mark_done = bool(all_videos) or not expects_video(url)
                if mark_done:
                    entry["scraped_at"] = int(time.time())
                video_map[url] = entry
                save_video_map(video_map)
            if not mark_done:
                if attempt < MAX_RETRIES:
                    retry_counts[idx] = attempt + 1
                    queue.put_nowait((idx, url))
                    print(
                        f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES})."
                    )
                else:
                    print(
                        f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run."
                    )
    finally:
        await page.close()
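
# Resume contract: a post counts as finished once `scraped_at` is stamped. Pages
# that should have a video (expects_video) but yielded none are retried up to
# MAX_RETRIES within a run and otherwise left unstamped, so the next run's
# pending scan in run() picks them up again.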


async def run() -> None:
    shutdown_event = asyncio.Event()
    loop = asyncio.get_running_loop()

    def _handle_shutdown(signum: int, _frame: object) -> None:
        print(f"\n[!] Signal {signum} received — finishing active pages then exiting…")
        loop.call_soon_threadsafe(shutdown_event.set)

    signal.signal(signal.SIGINT, _handle_shutdown)
    signal.signal(signal.SIGTERM, _handle_shutdown)
    try:
        cookie_name, cookie_value = _get_login_cookie()
        req_headers = {
            **API_HEADERS,
            "Cookie": f"{cookie_name}={cookie_value}; eav-age-verified=1",
        }
        urls = load_post_urls(req_headers)
        video_map = load_video_map()
        if any(
            u not in video_map
            or not video_map[u].get("title")
            or not video_map[u].get("description")
            for u in urls
            if expects_video(u)
        ):
            fetch_metadata_from_api(video_map, urls, req_headers)
        known = {u for entry in video_map.values() for u in entry.get("videos", [])}
        total = len(urls)
        pending = []
        needs_map = 0
        for i, u in enumerate(urls):
            entry = video_map.get(u, {})
            if not entry.get("scraped_at"):
                pending.append((i, u))
            elif expects_video(u) and not entry.get("videos"):
                pending.append((i, u))
                needs_map += 1
        done_count = sum(1 for v in video_map.values() if v.get("scraped_at"))
        print(f"[+] Loaded {total} post URLs.")
        print(f"[+] Already have {len(known)} video URLs mapped.")
        print(f"[+] Video map: {len(video_map)} entries in {VIDEO_MAP_FILE}")
        if done_count:
            remaining_new = len(pending) - needs_map
            print(
                f"[↻] Resuming: {done_count} done, {remaining_new} new + {needs_map} needing map data."
            )
        if not pending:
            print("[✓] All URLs already processed and mapped.")
            return
        print(
            f"[⚡] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n"
        )
        queue: asyncio.Queue[tuple[int, str]] = asyncio.Queue()
        for item in pending:
            queue.put_nowait(item)
        map_lock = asyncio.Lock()
        reauth_lock = asyncio.Lock()
        reauth_done: list[bool] = [False]
        retry_counts: dict[int, int] = {}
        async with async_playwright() as p:
            browser = await p.firefox.launch(headless=True)
            context = await browser.new_context()
            _parsed = urlparse(BASE_URL)
            _cookie_domain = _parsed.hostname or _parsed.netloc
            site_cookies = [
                {
                    "name": cookie_name,
                    "value": cookie_value,
                    "domain": _cookie_domain,
                    "path": "/",
                    "httpOnly": True,
                    "secure": True,
                    "sameSite": "None",
                },
                {
                    "name": "eav-age-verified",
                    "value": "1",
                    "domain": _cookie_domain,
                    "path": "/",
                },
            ]
            await context.add_cookies(site_cookies)  # type: ignore[arg-type]
            num_workers = min(MAX_WORKERS, len(pending))
            workers = [
                asyncio.create_task(
                    worker(
                        i,
                        queue,
                        context,
                        known,
                        total,
                        retry_counts,
                        video_map,
                        map_lock,
                        shutdown_event,
                        reauth_lock,
                        reauth_done,
                        _cookie_domain,
                    )
                )
                for i in range(num_workers)
            ]
            await asyncio.gather(*workers)
            await browser.close()
        mapped = sum(1 for v in video_map.values() if v.get("videos"))
        print(
            f"\n[+] Video map: {mapped} posts with videos, {len(video_map)} total entries."
        )
        if not shutdown_event.is_set():
            print(f"[✓] Completed. Full map in {VIDEO_MAP_FILE}")
        else:
            done = sum(1 for v in video_map.values() if v.get("scraped_at"))
            print(f"[⏸] Paused — {done}/{total} done. Run again to resume.")
    finally:
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)


def main() -> None:
    try:
        asyncio.run(run())
    except KeyboardInterrupt:
        print("\n[!] Interrupted. Run again to resume.")
    except RuntimeError as e:
        raise SystemExit(f"[!] {e}")


if __name__ == "__main__":
    main()