import re
import json
import os
import time
import signal
import asyncio
import tempfile
from pathlib import Path, PurePosixPath
from urllib.parse import urlparse

import requests
from dotenv import load_dotenv
from playwright.async_api import async_playwright

from check_clashes import VIDEO_EXTS
from config import BASE_URL

load_dotenv()


def _is_video_url(url):
    """True if `url` ends with a recognised video extension (case-insensitive, path only)."""
    return PurePosixPath(urlparse(url).path).suffix.lower() in VIDEO_EXTS

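# Illustrative behaviour (assuming ".mp4" is among the extensions in VIDEO_EXTS):
#   _is_video_url("https://example.com/clips/a.MP4?token=1")  -> True   (query string ignored)
#   _is_video_url("https://example.com/clips/a.mp4.html")     -> False  (suffix is ".html")
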
WP_API = f"{BASE_URL}/wp-json/wp/v2"

SKIP_TYPES = {
    "attachment", "nav_menu_item", "wp_block", "wp_template",
    "wp_template_part", "wp_global_styles", "wp_navigation",
    "wp_font_family", "wp_font_face",
}

VIDEO_MAP_FILE = "video_map.json"
MAX_WORKERS = 4

API_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0",
    "Accept": "application/json",
    "Referer": f"{BASE_URL}/",
}


def _get_login_cookie():
    raw = os.environ.get("WP_LOGIN_COOKIE", "").strip()  # strip accidental whitespace
    if not raw:
        raise RuntimeError(
            "WP_LOGIN_COOKIE not set. Copy it from your browser into .env — see .env.example.")
    name, _, value = raw.partition("=")
    if not value:
        raise RuntimeError(
            "WP_LOGIN_COOKIE looks malformed (no '=' found). Expected: name=value")
    if not name.startswith("wordpress_logged_in_"):
        raise RuntimeError(
            "WP_LOGIN_COOKIE doesn't look right — expected a wordpress_logged_in_... cookie.")
    return name, value

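# Example .env entry (the cookie value below is purely hypothetical; copy the real one
# from your browser's dev tools):
#   WP_LOGIN_COOKIE=wordpress_logged_in_0123abcd=someuser%7C1700000000%7C...
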
def discover_content_types(session):
    """Query /wp-json/wp/v2/types and return a list of (name, rest_base, type_slug) for content types worth scraping."""
    r = session.get(f"{WP_API}/types", timeout=30)
    r.raise_for_status()
    types = r.json()

    targets = []
    for type_slug, info in types.items():
        if type_slug in SKIP_TYPES:
            continue
        rest_base = info.get("rest_base")
        name = info.get("name", type_slug)
        if rest_base:
            targets.append((name, rest_base, type_slug))
    return targets

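# Sketch of a trimmed /wp-json/wp/v2/types response and the resulting targets list
# (illustrative shape only; real responses carry many more fields):
#   {"post": {"name": "Posts", "rest_base": "posts", ...},
#    "page": {"name": "Pages", "rest_base": "pages", ...},
#    "attachment": {...}}
#   -> [("Posts", "posts", "post"), ("Pages", "pages", "page")]   # "attachment" is in SKIP_TYPES
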
def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
    """Paginate one content type and return (url, title, description) tuples.

    Uses the `link` field when available; falls back to building from slug."""
    url_prefix = type_slug.replace("_", "-")
    results = []
    page = 1

    while True:
        r = session.get(
            f"{WP_API}/{rest_base}",
            params={"per_page": 100, "page": page},
            timeout=30,
        )
        # WordPress answers 400 once the page number runs past the last page.
        if r.status_code == 400 or not r.ok:
            break
        data = r.json()
        if not data:
            break
        for post in data:
            link = post.get("link", "")
            if not link.startswith("http"):
                slug = post.get("slug")
                if slug:
                    link = f"{BASE_URL}/{url_prefix}/{slug}/"
                else:
                    continue
            title_obj = post.get("title", {})
            title = title_obj.get("rendered", "") if isinstance(title_obj, dict) else str(title_obj)
            content_obj = post.get("content", {})
            content_html = content_obj.get("rendered", "") if isinstance(content_obj, dict) else ""
            description = html_to_text(content_html) if content_html else ""
            results.append((link, title, description))
        print(f" {type_name} page {page}: {len(data)} items")
        page += 1

    return results

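# Each returned tuple looks like (illustrative values):
#   ("{BASE_URL}/pinkcuffs-videos/example-post/", "Example post", "Plain-text body…")
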
def fetch_post_urls_from_api(headers):
    """Auto-discover all content types via the WP REST API and collect every post URL.

    Also builds video_map.json with titles pre-populated."""
    print("[+] video_map.json empty or missing — discovering content types from REST API…")
    session = requests.Session()
    session.headers.update(headers)

    targets = discover_content_types(session)
    print(f"[+] Found {len(targets)} content types: {', '.join(name for name, _, _ in targets)}\n")

    all_results = []
    for type_name, rest_base, type_slug in targets:
        type_results = fetch_all_posts_for_type(session, type_name, rest_base, type_slug)
        all_results.extend(type_results)

    seen = set()
    deduped_urls = []
    video_map = load_video_map()

    for url, title, description in all_results:
        if url not in seen and url.startswith("http"):
            seen.add(url)
            deduped_urls.append(url)
            if url not in video_map:
                video_map[url] = {"title": title, "description": description, "videos": []}
            else:
                if not video_map[url].get("title"):
                    video_map[url]["title"] = title
                if not video_map[url].get("description"):
                    video_map[url]["description"] = description

    save_video_map(video_map)
    print(f"\n[+] Discovered {len(deduped_urls)} unique URLs → saved to {VIDEO_MAP_FILE}")
    print(f"[+] Pre-populated {len(video_map)} entries in {VIDEO_MAP_FILE}")
    return deduped_urls

def fetch_metadata_from_api(video_map, urls, headers):
    """Populate missing titles and descriptions in video_map from the REST API."""
    missing = [u for u in urls
               if u not in video_map
               or not video_map[u].get("title")
               or not video_map[u].get("description")]
    if not missing:
        return

    print(f"[+] Fetching metadata from REST API for {len(missing)} posts…")
    session = requests.Session()
    session.headers.update(headers)

    targets = discover_content_types(session)

    for type_name, rest_base, type_slug in targets:
        type_results = fetch_all_posts_for_type(session, type_name, rest_base, type_slug)
        for url, title, description in type_results:
            if url in video_map:
                if not video_map[url].get("title"):
                    video_map[url]["title"] = title
                if not video_map[url].get("description"):
                    video_map[url]["description"] = description
            else:
                video_map[url] = {"title": title, "description": description, "videos": []}

    save_video_map(video_map)
    populated_t = sum(1 for u in urls if video_map.get(u, {}).get("title"))
    populated_d = sum(1 for u in urls if video_map.get(u, {}).get("description"))
    print(f"[+] Titles populated: {populated_t}/{len(urls)}")
    print(f"[+] Descriptions populated: {populated_d}/{len(urls)}")

def load_post_urls(headers):
    vm = load_video_map()
    if vm:
        print(f"[+] {VIDEO_MAP_FILE} found — loading {len(vm)} post URLs.")
        return list(vm.keys())
    return fetch_post_urls_from_api(headers)

def html_to_text(html_str):
    """Strip HTML tags, decode entities, and collapse whitespace into clean plain text."""
    import html
    text = re.sub(r'<br\s*/?>', '\n', html_str)
    text = text.replace('</p>', '\n\n')
    text = re.sub(r'<[^>]+>', '', text)
    text = html.unescape(text)
    lines = [line.strip() for line in text.splitlines()]
    text = '\n'.join(lines)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()

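# Illustrative round trip:
#   html_to_text("<p>Hello <b>world</b></p><p>Line<br>break</p>")
#   -> "Hello world\n\nLine\nbreak"
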
def extract_mp4_from_html(html):
    candidates = re.findall(r'https?://[^\s"\'<>]+', html)
    return [u for u in candidates if _is_video_url(u)]

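# Illustrative (assuming ".mp4" is in VIDEO_EXTS):
#   extract_mp4_from_html('<video src="https://cdn.example.com/v/clip.mp4"></video>')
#   -> ["https://cdn.example.com/v/clip.mp4"]
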
def extract_title_from_html(html):
    m = re.search(r'<h1[^>]*class="entry-title"[^>]*>(.*?)</h1>', html, re.DOTALL)
    if m:
        title = re.sub(r'<[^>]+>', '', m.group(1)).strip()
        return title
    m = re.search(r'<title>(.*?)(?:\s*[-–|].*)?</title>', html, re.DOTALL)
    if m:
        return m.group(1).strip()
    return None

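# Fallback behaviour when there is no entry-title <h1> (illustrative; the site-name
# suffix after the dash is trimmed by the second regex):
#   extract_title_from_html("<html><title>My Post - Example Site</title></html>") -> "My Post"
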
def load_video_map():
    if Path(VIDEO_MAP_FILE).exists():
        try:
            with open(VIDEO_MAP_FILE, encoding="utf-8") as f:
                return json.load(f)
        except (json.JSONDecodeError, OSError):
            return {}
    return {}

def save_video_map(video_map):
    # Write to a temp file in the same directory, then atomically replace the target.
    fd, tmp_path = tempfile.mkstemp(dir=Path(VIDEO_MAP_FILE).resolve().parent, suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(video_map, f, indent=2, ensure_ascii=False)
        Path(tmp_path).replace(VIDEO_MAP_FILE)
    except Exception:
        try:
            Path(tmp_path).unlink()
        except OSError:
            pass
        raise

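# Shape of the data this script persists via save_video_map() (illustrative entry;
# URL and values are made up, the keys match what the code writes):
#   {
#     "https://<BASE_URL>/pinkcuffs-videos/example-post/": {
#       "title": "Example post",
#       "description": "Plain-text body...",
#       "videos": ["https://<BASE_URL>/wp-content/uploads/example.mp4"],
#       "scraped_at": 1700000000
#     }
#   }
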
def _expects_video(url):
    return "/pinkcuffs-videos/" in url

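# The check is purely path-based (illustrative):
#   _expects_video(f"{BASE_URL}/pinkcuffs-videos/some-clip/") -> True
#   _expects_video(f"{BASE_URL}/about/")                      -> False
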
MAX_RETRIES = 2

async def worker(worker_id, queue, context, known,
                 total, retry_counts, video_map, map_lock, shutdown_event):
    page = await context.new_page()
    video_hits = set()

    # Record every network response whose URL looks like a video file.
    page.on("response", lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None)

    try:
        while not shutdown_event.is_set():
            try:
                idx, url = queue.get_nowait()
            except asyncio.QueueEmpty:
                break

            attempt = retry_counts.get(idx, 0)
            label = f" (retry {attempt}/{MAX_RETRIES})" if attempt else ""
            print(f"[W{worker_id}] ({idx + 1}/{total}) {url}{label}")

            try:
                await page.goto(url, wait_until="networkidle", timeout=60000)
            except Exception as e:
                print(f"[W{worker_id}] Navigation error: {e}")
                if _expects_video(url) and attempt < MAX_RETRIES:
                    retry_counts[idx] = attempt + 1
                    queue.put_nowait((idx, url))
                    print(f"[W{worker_id}] Re-queued for retry.")
                elif not _expects_video(url):
                    # Non-video pages aren't worth retrying; mark them as scraped.
                    async with map_lock:
                        entry = video_map.get(url, {})
                        entry["scraped_at"] = int(time.time())
                        video_map[url] = entry
                        save_video_map(video_map)
                else:
                    print(f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run.")
                continue

            await asyncio.sleep(1.5)
            html = await page.content()
            title = extract_title_from_html(html)
            html_videos = extract_mp4_from_html(html)
            found = set(html_videos) | set(video_hits)
            video_hits.clear()

            # Ignore the video player's placeholder clip.
            all_videos = [m for m in found if m not in (
                f"{BASE_URL}/wp-content/plugins/easy-video-player/lib/blank.mp4",
            )]

            async with map_lock:
                new_found = found - known
                if new_found:
                    print(f"[W{worker_id}] Found {len(new_found)} new video URLs")
                    known.update(new_found)
                elif all_videos:
                    print(f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write.")
                else:
                    print(f"[W{worker_id}] No video found on page.")

                entry = video_map.get(url, {})
                if title:
                    entry["title"] = title
                existing_videos = set(entry.get("videos", []))
                existing_videos.update(all_videos)
                entry["videos"] = sorted(existing_videos)
                mark_done = bool(all_videos) or not _expects_video(url)
                if mark_done:
                    entry["scraped_at"] = int(time.time())
                video_map[url] = entry
                save_video_map(video_map)

            if not mark_done:
                if attempt < MAX_RETRIES:
                    retry_counts[idx] = attempt + 1
                    queue.put_nowait((idx, url))
                    print(f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES}).")
                else:
                    print(f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run.")
    finally:
        await page.close()

async def run():
    shutdown_event = asyncio.Event()
    loop = asyncio.get_running_loop()

    def _handle_shutdown(signum, _frame):
        print(f"\n[!] Signal {signum} received — finishing active pages then exiting…")
        loop.call_soon_threadsafe(shutdown_event.set)

    signal.signal(signal.SIGINT, _handle_shutdown)
    signal.signal(signal.SIGTERM, _handle_shutdown)

    try:
        cookie_name, cookie_value = _get_login_cookie()
        req_headers = {
            **API_HEADERS,
            "Cookie": f"{cookie_name}={cookie_value}; eav-age-verified=1",
        }

        urls = load_post_urls(req_headers)

        video_map = load_video_map()
        if any(u not in video_map
               or not video_map[u].get("title")
               or not video_map[u].get("description")
               for u in urls if _expects_video(u)):
            fetch_metadata_from_api(video_map, urls, req_headers)

        known = {u for entry in video_map.values() for u in entry.get("videos", [])}

        total = len(urls)
        pending = []
        needs_map = 0
        # A URL is pending if it was never scraped, or if it is a video post with no videos mapped yet.
        for i, u in enumerate(urls):
            entry = video_map.get(u, {})
            if not entry.get("scraped_at"):
                pending.append((i, u))
            elif _expects_video(u) and not entry.get("videos"):
                pending.append((i, u))
                needs_map += 1

        done_count = sum(1 for v in video_map.values() if v.get("scraped_at"))
        print(f"[+] Loaded {total} post URLs.")
        print(f"[+] Already have {len(known)} video URLs mapped.")
        print(f"[+] Video map: {len(video_map)} entries in {VIDEO_MAP_FILE}")
        if done_count:
            remaining_new = len(pending) - needs_map
            print(f"[↻] Resuming: {done_count} done, {remaining_new} new + {needs_map} needing map data.")
        if not pending:
            print("[✓] All URLs already processed and mapped.")
            return

        print(f"[⚡] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n")

        queue = asyncio.Queue()
        for item in pending:
            queue.put_nowait(item)

        map_lock = asyncio.Lock()
        retry_counts = {}

        async with async_playwright() as p:
            browser = await p.firefox.launch(headless=True)
            context = await browser.new_context()

            # Give the browser context the login cookie plus the age-verification cookie.
            _cookie_domain = urlparse(BASE_URL).netloc
            site_cookies = [
                {
                    "name": cookie_name,
                    "value": cookie_value,
                    "domain": _cookie_domain,
                    "path": "/",
                    "httpOnly": True,
                    "secure": True,
                    "sameSite": "None"
                },
                {
                    "name": "eav-age-verified",
                    "value": "1",
                    "domain": _cookie_domain,
                    "path": "/"
                }
            ]

            await context.add_cookies(site_cookies)

            num_workers = min(MAX_WORKERS, len(pending))
            workers = [
                asyncio.create_task(
                    worker(i, queue, context, known,
                           total, retry_counts, video_map, map_lock, shutdown_event)
                )
                for i in range(num_workers)
            ]

            await asyncio.gather(*workers)
            await browser.close()

        mapped = sum(1 for v in video_map.values() if v.get("videos"))
        print(f"\n[+] Video map: {mapped} posts with videos, {len(video_map)} total entries.")

        if not shutdown_event.is_set():
            print(f"[✓] Completed. Full map in {VIDEO_MAP_FILE}")
        else:
            done = sum(1 for v in video_map.values() if v.get("scraped_at"))
            print(f"[⏸] Paused — {done}/{total} done. Run again to resume.")
    finally:
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)

def main():
    try:
        asyncio.run(run())
    except KeyboardInterrupt:
        print("\n[!] Interrupted. Run again to resume.")
    except RuntimeError as e:
        raise SystemExit(f"[!] {e}")


if __name__ == "__main__":
    main()