Garbage commit; added junk code

HugeFrog24
2026-02-20 18:07:02 +01:00
commit 80444405e9
15 changed files with 34271 additions and 0 deletions

main.py · 467 lines · Normal file

@@ -0,0 +1,467 @@
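"""Scrape video URLs from a WordPress site.

Post URLs are discovered through the WP REST API and pre-populated into
video_map.json with titles and descriptions; each post is then visited with an
authenticated Playwright session and every video URL found is recorded.
"""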
import re
import json
import os
import time
import signal
import asyncio
import tempfile
import requests
from pathlib import Path, PurePosixPath
from urllib.parse import urlparse
from dotenv import load_dotenv
from playwright.async_api import async_playwright
from check_clashes import VIDEO_EXTS
from config import BASE_URL
load_dotenv()
def _is_video_url(url):
"""True if `url` ends with a recognised video extension (case-insensitive, path only)."""
return PurePosixPath(urlparse(url).path).suffix.lower() in VIDEO_EXTS
WP_API = f"{BASE_URL}/wp-json/wp/v2"
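# Built-in WordPress types that carry no scrapeable post content.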
SKIP_TYPES = {
"attachment", "nav_menu_item", "wp_block", "wp_template",
"wp_template_part", "wp_global_styles", "wp_navigation",
"wp_font_family", "wp_font_face",
}
VIDEO_MAP_FILE = "video_map.json"
MAX_WORKERS = 4
API_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0",
"Accept": "application/json",
"Referer": f"{BASE_URL}/",
}
def _get_login_cookie():
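    """Read WP_LOGIN_COOKIE from the environment, validate its shape, and return (name, value)."""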
raw = os.environ.get("WP_LOGIN_COOKIE", "").strip() # strip accidental whitespace
if not raw:
raise RuntimeError(
"WP_LOGIN_COOKIE not set. Copy it from your browser into .env — see .env.example.")
name, _, value = raw.partition("=")
if not value:
raise RuntimeError(
"WP_LOGIN_COOKIE looks malformed (no '=' found). Expected: name=value")
if not name.startswith("wordpress_logged_in_"):
raise RuntimeError(
"WP_LOGIN_COOKIE doesn't look right — expected a wordpress_logged_in_... cookie.")
return name, value
def discover_content_types(session):
"""Query /wp-json/wp/v2/types and return a list of (name, rest_base, type_slug) for content types worth scraping."""
r = session.get(f"{WP_API}/types", timeout=30)
r.raise_for_status()
types = r.json()
targets = []
for type_slug, info in types.items():
if type_slug in SKIP_TYPES:
continue
rest_base = info.get("rest_base")
name = info.get("name", type_slug)
if rest_base:
targets.append((name, rest_base, type_slug))
return targets
def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
"""Paginate one content type and return (url, title, description) tuples.
Uses the `link` field when available; falls back to building from slug."""
url_prefix = type_slug.replace("_", "-")
results = []
page = 1
while True:
r = session.get(
f"{WP_API}/{rest_base}",
params={"per_page": 100, "page": page},
timeout=30,
)
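        # The WP REST API answers 400 once the requested page is past the last one.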
if r.status_code == 400 or not r.ok:
break
data = r.json()
if not data:
break
for post in data:
link = post.get("link", "")
if not link.startswith("http"):
slug = post.get("slug")
if slug:
link = f"{BASE_URL}/{url_prefix}/{slug}/"
else:
continue
title_obj = post.get("title", {})
title = title_obj.get("rendered", "") if isinstance(
title_obj, dict) else str(title_obj)
content_obj = post.get("content", {})
content_html = content_obj.get(
"rendered", "") if isinstance(content_obj, dict) else ""
description = html_to_text(content_html) if content_html else ""
results.append((link, title, description))
print(f" {type_name} page {page}: {len(data)} items")
page += 1
return results
def fetch_post_urls_from_api(headers):
"""Auto-discover all content types via the WP REST API and collect every post URL.
Also builds video_map.json with titles pre-populated."""
print("[+] video_map.json empty or missing — discovering content types from REST API…")
session = requests.Session()
session.headers.update(headers)
targets = discover_content_types(session)
print(
f"[+] Found {len(targets)} content types: {', '.join(name for name, _, _ in targets)}\n")
all_results = []
for type_name, rest_base, type_slug in targets:
type_results = fetch_all_posts_for_type(
session, type_name, rest_base, type_slug)
all_results.extend(type_results)
seen = set()
deduped_urls = []
video_map = load_video_map()
for url, title, description in all_results:
if url not in seen and url.startswith("http"):
seen.add(url)
deduped_urls.append(url)
if url not in video_map:
video_map[url] = {"title": title,
"description": description, "videos": []}
else:
if not video_map[url].get("title"):
video_map[url]["title"] = title
if not video_map[url].get("description"):
video_map[url]["description"] = description
save_video_map(video_map)
print(
f"\n[+] Discovered {len(deduped_urls)} unique URLs → saved to {VIDEO_MAP_FILE}")
print(
f"[+] Pre-populated {len(video_map)} entries in {VIDEO_MAP_FILE}")
return deduped_urls
def fetch_metadata_from_api(video_map, urls, headers):
"""Populate missing titles and descriptions in video_map from the REST API."""
missing = [u for u in urls
if u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")]
if not missing:
return
print(f"[+] Fetching metadata from REST API for {len(missing)} posts…")
session = requests.Session()
session.headers.update(headers)
targets = discover_content_types(session)
for type_name, rest_base, type_slug in targets:
type_results = fetch_all_posts_for_type(
session, type_name, rest_base, type_slug)
for url, title, description in type_results:
if url in video_map:
if not video_map[url].get("title"):
video_map[url]["title"] = title
if not video_map[url].get("description"):
video_map[url]["description"] = description
else:
video_map[url] = {"title": title,
"description": description, "videos": []}
save_video_map(video_map)
populated_t = sum(1 for u in urls if video_map.get(u, {}).get("title"))
populated_d = sum(1 for u in urls if video_map.get(
u, {}).get("description"))
print(f"[+] Titles populated: {populated_t}/{len(urls)}")
print(f"[+] Descriptions populated: {populated_d}/{len(urls)}")
def load_post_urls(headers):
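    """Return post URLs from an existing video map, or discover them via the REST API if the map is empty."""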
vm = load_video_map()
if vm:
print(f"[+] {VIDEO_MAP_FILE} found — loading {len(vm)} post URLs.")
return list(vm.keys())
return fetch_post_urls_from_api(headers)
def html_to_text(html_str):
"""Strip HTML tags, decode entities, and collapse whitespace into clean plain text."""
import html
text = re.sub(r'<br\s*/?>', '\n', html_str)
text = text.replace('</p>', '\n\n')
text = re.sub(r'<[^>]+>', '', text)
text = html.unescape(text)
lines = [line.strip() for line in text.splitlines()]
text = '\n'.join(lines)
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()
def extract_mp4_from_html(html):
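    """Return every absolute URL in the raw HTML whose path ends in a known video extension."""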
candidates = re.findall(r'https?://[^\s"\'<>]+', html)
return [u for u in candidates if _is_video_url(u)]
def extract_title_from_html(html):
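    """Extract the post title from the entry-title heading, falling back to the <title> tag."""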
m = re.search(
r'<h1[^>]*class="entry-title"[^>]*>(.*?)</h1>', html, re.DOTALL)
if m:
title = re.sub(r'<[^>]+>', '', m.group(1)).strip()
return title
m = re.search(r'<title>(.*?)(?:\s*[-|].*)?</title>', html, re.DOTALL)
if m:
return m.group(1).strip()
return None
def load_video_map():
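    """Load the video map from disk, returning an empty dict if it is missing or unreadable."""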
if Path(VIDEO_MAP_FILE).exists():
try:
with open(VIDEO_MAP_FILE, encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, OSError):
return {}
return {}
def save_video_map(video_map):
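    """Write the video map atomically: dump to a temp file in the same directory, then replace the target."""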
fd, tmp_path = tempfile.mkstemp(dir=Path(VIDEO_MAP_FILE).resolve().parent, suffix=".tmp")
try:
with os.fdopen(fd, "w", encoding="utf-8") as f:
json.dump(video_map, f, indent=2, ensure_ascii=False)
Path(tmp_path).replace(VIDEO_MAP_FILE)
except Exception:
try:
Path(tmp_path).unlink()
except OSError:
pass
raise
def _expects_video(url):
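    """True for post URLs that are expected to embed a video."""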
return "/pinkcuffs-videos/" in url
MAX_RETRIES = 2
async def worker(worker_id, queue, context, known,
total, retry_counts, video_map, map_lock, shutdown_event):
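    """Pull (index, url) items off the queue, scrape each page for video URLs, and record results in video_map."""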
page = await context.new_page()
video_hits = set()
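    # Capture video responses requested while the page loads; players often fetch media over the network.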
page.on("response", lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None)
try:
while not shutdown_event.is_set():
try:
idx, url = queue.get_nowait()
except asyncio.QueueEmpty:
break
attempt = retry_counts.get(idx, 0)
label = f" (retry {attempt}/{MAX_RETRIES})" if attempt else ""
print(f"[W{worker_id}] ({idx + 1}/{total}) {url}{label}")
try:
await page.goto(url, wait_until="networkidle", timeout=60000)
except Exception as e:
print(f"[W{worker_id}] Navigation error: {e}")
if _expects_video(url) and attempt < MAX_RETRIES:
retry_counts[idx] = attempt + 1
queue.put_nowait((idx, url))
print(f"[W{worker_id}] Re-queued for retry.")
elif not _expects_video(url):
async with map_lock:
entry = video_map.get(url, {})
entry["scraped_at"] = int(time.time())
video_map[url] = entry
save_video_map(video_map)
else:
print(
f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run.")
continue
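            # Brief pause so late-firing player requests are still caught by the response listener.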
await asyncio.sleep(1.5)
html = await page.content()
title = extract_title_from_html(html)
html_videos = extract_mp4_from_html(html)
            found = set(html_videos) | set(video_hits)
            video_hits.clear()
            # Ignore the placeholder clip bundled with the easy-video-player plugin.
            placeholder = f"{BASE_URL}/wp-content/plugins/easy-video-player/lib/blank.mp4"
            all_videos = [m for m in found if m != placeholder]
            async with map_lock:
                new_found = set(all_videos) - known
if new_found:
print(f"[W{worker_id}] Found {len(new_found)} new video URLs")
known.update(new_found)
elif all_videos:
print(
f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write.")
else:
print(f"[W{worker_id}] No video found on page.")
entry = video_map.get(url, {})
if title:
entry["title"] = title
existing_videos = set(entry.get("videos", []))
existing_videos.update(all_videos)
entry["videos"] = sorted(existing_videos)
mark_done = bool(all_videos) or not _expects_video(url)
if mark_done:
entry["scraped_at"] = int(time.time())
video_map[url] = entry
save_video_map(video_map)
if not mark_done:
if attempt < MAX_RETRIES:
retry_counts[idx] = attempt + 1
queue.put_nowait((idx, url))
print(
f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES}).")
else:
print(
f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run.")
finally:
await page.close()
async def run():
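    """Load post URLs, top up metadata, then scrape pending pages with a pool of Playwright workers."""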
shutdown_event = asyncio.Event()
loop = asyncio.get_running_loop()
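    # Translate SIGINT/SIGTERM into a shutdown event so workers finish their current page before exiting.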
def _handle_shutdown(signum, _frame):
print(f"\n[!] Signal {signum} received — finishing active pages then exiting…")
loop.call_soon_threadsafe(shutdown_event.set)
signal.signal(signal.SIGINT, _handle_shutdown)
signal.signal(signal.SIGTERM, _handle_shutdown)
try:
cookie_name, cookie_value = _get_login_cookie()
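        # Every request carries the WordPress login cookie plus the site's age-verification cookie.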
req_headers = {
**API_HEADERS,
"Cookie": f"{cookie_name}={cookie_value}; eav-age-verified=1",
}
urls = load_post_urls(req_headers)
video_map = load_video_map()
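        # Top up titles and descriptions from the REST API when any video post is missing them.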
if any(u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")
for u in urls if _expects_video(u)):
fetch_metadata_from_api(video_map, urls, req_headers)
known = {u for entry in video_map.values() for u in entry.get("videos", [])}
total = len(urls)
pending = []
needs_map = 0
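        # Pending work: URLs never scraped, plus video posts scraped without finding a video.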
for i, u in enumerate(urls):
entry = video_map.get(u, {})
if not entry.get("scraped_at"):
pending.append((i, u))
elif _expects_video(u) and not entry.get("videos"):
pending.append((i, u))
needs_map += 1
done_count = sum(1 for v in video_map.values() if v.get("scraped_at"))
print(f"[+] Loaded {total} post URLs.")
print(f"[+] Already have {len(known)} video URLs mapped.")
print(f"[+] Video map: {len(video_map)} entries in {VIDEO_MAP_FILE}")
if done_count:
remaining_new = len(pending) - needs_map
print(
f"[↻] Resuming: {done_count} done, {remaining_new} new + {needs_map} needing map data.")
if not pending:
print("[✓] All URLs already processed and mapped.")
return
print(
f"[⚡] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n")
queue = asyncio.Queue()
for item in pending:
queue.put_nowait(item)
map_lock = asyncio.Lock()
retry_counts = {}
async with async_playwright() as p:
browser = await p.firefox.launch(headless=True)
context = await browser.new_context()
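            # Authenticate the browser context with the same login and age-verification cookies.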
_cookie_domain = urlparse(BASE_URL).netloc
site_cookies = [
{
"name": cookie_name,
"value": cookie_value,
"domain": _cookie_domain,
"path": "/",
"httpOnly": True,
"secure": True,
"sameSite": "None"
},
{
"name": "eav-age-verified",
"value": "1",
"domain": _cookie_domain,
"path": "/"
}
]
await context.add_cookies(site_cookies)
num_workers = min(MAX_WORKERS, len(pending))
workers = [
asyncio.create_task(
worker(i, queue, context, known,
total, retry_counts, video_map, map_lock, shutdown_event)
)
for i in range(num_workers)
]
await asyncio.gather(*workers)
await browser.close()
mapped = sum(1 for v in video_map.values() if v.get("videos"))
print(
f"\n[+] Video map: {mapped} posts with videos, {len(video_map)} total entries.")
if not shutdown_event.is_set():
print(f"[✓] Completed. Full map in {VIDEO_MAP_FILE}")
else:
done = sum(1 for v in video_map.values() if v.get("scraped_at"))
print(f"[⏸] Paused — {done}/{total} done. Run again to resume.")
finally:
signal.signal(signal.SIGINT, signal.SIG_DFL)
signal.signal(signal.SIGTERM, signal.SIG_DFL)
def main():
try:
asyncio.run(run())
except KeyboardInterrupt:
print("\n[!] Interrupted. Run again to resume.")
except RuntimeError as e:
raise SystemExit(f"[!] {e}")
if __name__ == "__main__":
main()