Multi-site support: scrape multiple member sites per run

HugeFrog24 committed 2026-03-01 01:36:01 +01:00
parent 80444405e9
commit 4a5b0a6ee3
15 changed files with 16378 additions and 11575 deletions

main.py

@@ -1,60 +1,131 @@
import argparse
import re
import json
import os
import time
import signal
import asyncio
import tempfile
import requests
from pathlib import Path, PurePosixPath
from pathlib import PurePosixPath
from typing import Any, Optional
from urllib.parse import urlparse
from dotenv import load_dotenv
from playwright.async_api import async_playwright
from check_clashes import VIDEO_EXTS
from config import BASE_URL
from playwright.async_api import async_playwright, BrowserContext
from check_clashes import (
VIDEO_EXTS,
load_video_map,
save_video_map,
is_valid_url,
expects_video,
)
from config import SITES
from grab_cookie import login_and_get_cookie, update_env
load_dotenv()
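For reference, a minimal sketch of the per-site config this commit assumes in config.py — the three keys (base_url, env_prefix, cookie_domain) are inferred from how site_cfg is indexed below; the site names and URLs are hypothetical:
# config.py — hypothetical shape; only the keys main.py actually reads are shown
SITES = {
    "sitea": {
        "base_url": "https://members.example-a.com",  # no trailing slash — code appends paths
        "env_prefix": "SITEA",                        # prefix for .env variables
        "cookie_domain": "members.example-a.com",     # fallback if base_url lacks a hostname
    },
    "siteb": {
        "base_url": "https://members.example-b.com",
        "env_prefix": "SITEB",
        "cookie_domain": "members.example-b.com",
    },
}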
def _is_video_url(url):
def _is_video_url(url: str) -> bool:
"""True if `url` ends with a recognised video extension (case-insensitive, path only)."""
return PurePosixPath(urlparse(url).path).suffix.lower() in VIDEO_EXTS
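A quick sanity check of the suffix logic — only the URL path is inspected, so query strings can't fool it (assuming VIDEO_EXTS, defined in check_clashes and not shown in this diff, includes ".mp4"):
_is_video_url("https://example.com/clip.MP4")          # True — extension match is case-insensitive
_is_video_url("https://example.com/clip.mp4?token=x")  # True — query string is ignored
_is_video_url("https://example.com/page?file=a.mp4")   # False — ".mp4" sits in the query, not the path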
WP_API = f"{BASE_URL}/wp-json/wp/v2"
SKIP_TYPES = {
"attachment", "nav_menu_item", "wp_block", "wp_template",
"wp_template_part", "wp_global_styles", "wp_navigation",
"wp_font_family", "wp_font_face",
"attachment",
"nav_menu_item",
"wp_block",
"wp_template",
"wp_template_part",
"wp_global_styles",
"wp_navigation",
"wp_font_family",
"wp_font_face",
}
VIDEO_MAP_FILE = "video_map.json"
MAX_WORKERS = 4
API_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0",
"Accept": "application/json",
"Referer": f"{BASE_URL}/",
}
_USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0"
)
def _get_login_cookie():
raw = os.environ.get("WP_LOGIN_COOKIE", "").strip() # strip accidental whitespace
if not raw:
raise RuntimeError(
"WP_LOGIN_COOKIE not set. Copy it from your browser into .env — see .env.example.")
name, _, value = raw.partition("=")
if not value:
raise RuntimeError(
"WP_LOGIN_COOKIE looks malformed (no '=' found). Expected: name=value")
if not name.startswith("wordpress_logged_in_"):
raise RuntimeError(
"WP_LOGIN_COOKIE doesn't look right — expected a wordpress_logged_in_... cookie.")
return name, value
def _api_headers(base_url: str, cookie_name: str, cookie_value: str) -> dict[str, str]:
return {
"User-Agent": _USER_AGENT,
"Accept": "application/json",
"Referer": f"{base_url}/",
"Cookie": f"{cookie_name}={cookie_value}; eav-age-verified=1",
}
def discover_content_types(session):
"""Query /wp-json/wp/v2/types and return a list of (name, rest_base, type_slug) for content types worth scraping."""
r = session.get(f"{WP_API}/types", timeout=30)
def _select_probe_url(video_map: dict[str, Any]) -> str | None:
"""Pure function: return the first URL in video_map where expects_video() is True."""
return next((url for url in video_map if expects_video(url)), None)
def _probe_cookie(name: str, value: str, site_key: str) -> bool:
"""HEAD request to a members-only video page. Returns True if the cookie is still valid."""
video_map = load_video_map(site_key)
probe_url = _select_probe_url(video_map)
if probe_url is None:
return False # no video URLs yet — can't validate, fall through to re-auth
r = requests.head(
probe_url,
headers={"Cookie": f"{name}={value}", "User-Agent": _USER_AGENT},
allow_redirects=False,
timeout=10,
)
return r.status_code == 200
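With allow_redirects=False, a stale cookie on a members-only page typically surfaces as a 30x redirect to the login form rather than a 200 — that status check is the entire validity signal, and returning False when no probe URL exists deliberately pushes callers toward re-auth. A usage sketch (site key and env prefix hypothetical):
raw = os.environ["SITEA_LOGIN_COOKIE"]  # hypothetical prefix
name, _, value = raw.partition("=")
if not _probe_cookie(name, value, "sitea"):
    # stale or unverifiable — fall back to a fresh login
    name, value = login_and_get_cookie(username, password, base_url)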
def _get_login_cookie(site_key: str, site_cfg: dict[str, str]) -> tuple[str, str]:
env_prefix = site_cfg["env_prefix"]
base_url = site_cfg["base_url"]
env_key = f"{env_prefix}_LOGIN_COOKIE"
username = os.environ.get(f"{env_prefix}_USERNAME", "").strip()
password = os.environ.get(f"{env_prefix}_PASSWORD", "").strip()
has_credentials = bool(username and password)
raw = os.environ.get(env_key, "").strip()
if raw:
name, _, value = raw.partition("=")
if value and name.startswith("wordpress_logged_in_"):
if not has_credentials:
return name, value # cookie-only mode — trust it
print(f"[{site_key}] Cookie found — validating…")
if _probe_cookie(name, value, site_key):
print(f"[{site_key}] Cookie still valid — skipping login.")
return name, value
print(f"[{site_key}] Cookie expired — re-authenticating…")
if has_credentials:
cookie_name, cookie_value = login_and_get_cookie(username, password, base_url)
action = update_env(cookie_name, cookie_value, env_key=env_key)
print(f"[{site_key}] Logged in: {cookie_name} ({action} in .env)")
return cookie_name, cookie_value
raise RuntimeError(
f"No credentials or cookie found for {site_key}. Set either:\n"
f"{env_prefix}_USERNAME + {env_prefix}_PASSWORD (recommended)\n"
f"{env_prefix}_LOGIN_COOKIE (fallback — may expire)\n"
"See .env.example."
)
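A sketch of the matching .env entries under the f"{env_prefix}_..." naming used above (the SITEA prefix and all values are placeholders):
# Preferred: credentials, so the scraper can re-auth itself when the cookie expires
SITEA_USERNAME=<username>
SITEA_PASSWORD=<password>
# Fallback: a raw cookie copied from the browser (works until it expires)
SITEA_LOGIN_COOKIE=wordpress_logged_in_<hash>=<cookie-value>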
def _has_credentials(site_cfg: dict[str, str]) -> bool:
env_prefix = site_cfg["env_prefix"]
has_cookie = bool(os.environ.get(f"{env_prefix}_LOGIN_COOKIE", "").strip())
has_creds = bool(
os.environ.get(f"{env_prefix}_USERNAME", "").strip()
and os.environ.get(f"{env_prefix}_PASSWORD", "").strip()
)
return has_cookie or has_creds
def discover_content_types(
session: requests.Session, wp_api: str
) -> list[tuple[str, str, str]]:
"""Query /wp-json/wp/v2/types and return a list of (name, rest_base, type_slug)."""
r = session.get(f"{wp_api}/types", timeout=30)
r.raise_for_status()
types = r.json()
@@ -69,16 +140,22 @@ def discover_content_types(session):
return targets
def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
"""Paginate one content type and return (url, title, description) tuples.
Uses the `link` field when available; falls back to building from slug."""
def fetch_all_posts_for_type(
session: requests.Session,
wp_api: str,
base_url: str,
type_name: str,
rest_base: str,
type_slug: str,
) -> list[tuple[str, str, str]]:
"""Paginate one content type and return (url, title, description) tuples."""
url_prefix = type_slug.replace("_", "-")
results = []
page = 1
while True:
r = session.get(
f"{WP_API}/{rest_base}",
f"{wp_api}/{rest_base}",
params={"per_page": 100, "page": page},
timeout=30,
)
@@ -92,15 +169,19 @@ def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
if not link.startswith("http"):
slug = post.get("slug")
if slug:
link = f"{BASE_URL}/{url_prefix}/{slug}/"
link = f"{base_url}/{url_prefix}/{slug}/"
else:
continue
title_obj = post.get("title", {})
title = title_obj.get("rendered", "") if isinstance(
title_obj, dict) else str(title_obj)
title = (
title_obj.get("rendered", "")
if isinstance(title_obj, dict)
else str(title_obj)
)
content_obj = post.get("content", {})
content_html = content_obj.get(
"rendered", "") if isinstance(content_obj, dict) else ""
content_html = (
content_obj.get("rendered", "") if isinstance(content_obj, dict) else ""
)
description = html_to_text(content_html) if content_html else ""
results.append((link, title, description))
print(f" {type_name} page {page}: {len(data)} items")
@@ -109,66 +190,88 @@ def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
return results
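The loop's exit condition lives in the elided hunk above; for context, the WordPress REST API advertises the page count in the X-WP-TotalPages response header, so a typical termination check — a sketch, not necessarily what the elided code does — looks like:
total_pages = int(r.headers.get("X-WP-TotalPages", "1"))
if not data or page >= total_pages:
    break  # past the last page, WP answers 400 (rest_post_invalid_page_number)
page += 1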
def fetch_post_urls_from_api(headers):
def fetch_post_urls_from_api(
site_key: str,
base_url: str,
wp_api: str,
headers: dict[str, str],
) -> list[str]:
"""Auto-discover all content types via the WP REST API and collect every post URL.
Also builds video_map.json with titles pre-populated."""
print("[+] video_map.json empty or missing — discovering content types from REST API…")
Also pre-populates video_map.json with titles."""
print(f"[{site_key}] video_map empty — discovering content types from REST API…")
session = requests.Session()
session.headers.update(headers)
targets = discover_content_types(session)
targets = discover_content_types(session, wp_api)
print(
f"[+] Found {len(targets)} content types: {', '.join(name for name, _, _ in targets)}\n")
f"[{site_key}] Found {len(targets)} content types: "
f"{', '.join(name for name, _, _ in targets)}\n"
)
all_results = []
for type_name, rest_base, type_slug in targets:
type_results = fetch_all_posts_for_type(
session, type_name, rest_base, type_slug)
session, wp_api, base_url, type_name, rest_base, type_slug
)
all_results.extend(type_results)
seen = set()
seen: set[str] = set()
deduped_urls = []
video_map = load_video_map()
video_map = load_video_map(site_key)
for url, title, description in all_results:
if url not in seen and url.startswith("http"):
seen.add(url)
deduped_urls.append(url)
if url not in video_map:
video_map[url] = {"title": title,
"description": description, "videos": []}
video_map[url] = {
"title": title,
"description": description,
"videos": [],
}
else:
if not video_map[url].get("title"):
video_map[url]["title"] = title
if not video_map[url].get("description"):
video_map[url]["description"] = description
save_video_map(video_map)
save_video_map(video_map, site_key)
print(
f"\n[+] Discovered {len(deduped_urls)} unique URLs → saved to {VIDEO_MAP_FILE}")
print(
f"[+] Pre-populated {len(video_map)} entries in {VIDEO_MAP_FILE}")
f"\n[{site_key}] Discovered {len(deduped_urls)} unique URLs → saved to video_map.json"
)
print(f"[{site_key}] Pre-populated {len(video_map)} entries")
return deduped_urls
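For orientation, each video_map entry ends up with this shape — field names are taken from the reads and writes in this diff; the URL and values are illustrative:
{
    "https://members.example-a.com/pinkcuffs-videos/some-post/": {
        "title": "Some Post",
        "description": "Plain-text body produced by html_to_text()",
        "videos": ["https://members.example-a.com/wp-content/uploads/clip.mp4"],
        "scraped_at": 1740786961   # unix timestamp, set once the page counts as done
    }
}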
def fetch_metadata_from_api(video_map, urls, headers):
def fetch_metadata_from_api(
site_key: str,
base_url: str,
wp_api: str,
video_map: dict[str, Any],
urls: list[str],
headers: dict[str, str],
) -> None:
"""Populate missing titles and descriptions in video_map from the REST API."""
missing = [u for u in urls
if u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")]
missing = [
u
for u in urls
if u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")
]
if not missing:
return
print(f"[+] Fetching metadata from REST API for {len(missing)} posts…")
print(f"[{site_key}] Fetching metadata from REST API for {len(missing)} posts…")
session = requests.Session()
session.headers.update(headers)
targets = discover_content_types(session)
targets = discover_content_types(session, wp_api)
for type_name, rest_base, type_slug in targets:
type_results = fetch_all_posts_for_type(
session, type_name, rest_base, type_slug)
session, wp_api, base_url, type_name, rest_base, type_slug
)
for url, title, description in type_results:
if url in video_map:
if not video_map[url].get("title"):
@@ -176,93 +279,90 @@ def fetch_metadata_from_api(video_map, urls, headers):
if not video_map[url].get("description"):
video_map[url]["description"] = description
else:
video_map[url] = {"title": title,
"description": description, "videos": []}
video_map[url] = {
"title": title,
"description": description,
"videos": [],
}
save_video_map(video_map)
save_video_map(video_map, site_key)
populated_t = sum(1 for u in urls if video_map.get(u, {}).get("title"))
populated_d = sum(1 for u in urls if video_map.get(
u, {}).get("description"))
print(f"[+] Titles populated: {populated_t}/{len(urls)}")
print(f"[+] Descriptions populated: {populated_d}/{len(urls)}")
populated_d = sum(1 for u in urls if video_map.get(u, {}).get("description"))
print(f"[{site_key}] Titles populated: {populated_t}/{len(urls)}")
print(f"[{site_key}] Descriptions populated: {populated_d}/{len(urls)}")
def load_post_urls(headers):
vm = load_video_map()
def load_post_urls(
site_key: str,
base_url: str,
wp_api: str,
headers: dict[str, str],
) -> list[str]:
vm = load_video_map(site_key)
if vm:
print(f"[+] {VIDEO_MAP_FILE} found — loading {len(vm)} post URLs.")
print(f"[{site_key}] video_map found — loading {len(vm)} post URLs.")
return list(vm.keys())
return fetch_post_urls_from_api(headers)
return fetch_post_urls_from_api(site_key, base_url, wp_api, headers)
def html_to_text(html_str):
def html_to_text(html_str: str) -> str:
"""Strip HTML tags, decode entities, and collapse whitespace into clean plain text."""
import html
text = re.sub(r'<br\s*/?>', '\n', html_str)
text = text.replace('</p>', '\n\n')
text = re.sub(r'<[^>]+>', '', text)
text = re.sub(r"<br\s*/?>", "\n", html_str)
text = text.replace("</p>", "\n\n")
text = re.sub(r"<[^>]+>", "", text)
text = html.unescape(text)
lines = [line.strip() for line in text.splitlines()]
text = '\n'.join(lines)
text = re.sub(r'\n{3,}', '\n\n', text)
text = "\n".join(lines)
text = re.sub(r"\n{3,}", "\n\n", text)
return text.strip()
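A worked example of the pipeline above (line breaks from <br>/</p>, tag stripping, entity decoding, whitespace collapse):
html_to_text("<p>Hello &amp; welcome<br/>to the <b>show</b></p>")
# -> "Hello & welcome\nto the show"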
def extract_mp4_from_html(html):
def extract_mp4_from_html(html: str) -> list[str]:
candidates = re.findall(r'https?://[^\s"\'<>]+', html)
return [u for u in candidates if _is_video_url(u)]
def extract_title_from_html(html):
m = re.search(
r'<h1[^>]*class="entry-title"[^>]*>(.*?)</h1>', html, re.DOTALL)
def extract_title_from_html(html: str) -> str | None:
m = re.search(r'<h1[^>]*class="entry-title"[^>]*>(.*?)</h1>', html, re.DOTALL)
if m:
title = re.sub(r'<[^>]+>', '', m.group(1)).strip()
title = re.sub(r"<[^>]+>", "", m.group(1)).strip()
return title
m = re.search(r'<title>(.*?)(?:\s*[-|].*)?</title>', html, re.DOTALL)
m = re.search(r"<title>(.*?)(?:\s*[-|].*)?</title>", html, re.DOTALL)
if m:
return m.group(1).strip()
return None
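The <title> fallback deliberately drops a trailing "- Site Name" / "| Site Name" suffix, while an entry-title <h1> wins whenever present. For example (markup hypothetical):
extract_title_from_html("<title>My Post - Members Area</title>")
# -> "My Post"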
def load_video_map():
if Path(VIDEO_MAP_FILE).exists():
try:
with open(VIDEO_MAP_FILE, encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, OSError):
return {}
return {}
def save_video_map(video_map):
fd, tmp_path = tempfile.mkstemp(dir=Path(VIDEO_MAP_FILE).resolve().parent, suffix=".tmp")
try:
with os.fdopen(fd, "w", encoding="utf-8") as f:
json.dump(video_map, f, indent=2, ensure_ascii=False)
Path(tmp_path).replace(VIDEO_MAP_FILE)
except Exception:
try:
Path(tmp_path).unlink()
except OSError:
pass
raise
def _expects_video(url):
return "/pinkcuffs-videos/" in url
MAX_RETRIES = 2
async def worker(worker_id, queue, context, known,
total, retry_counts, video_map, map_lock, shutdown_event):
async def worker(
worker_id: int,
queue: asyncio.Queue[tuple[int, str]],
context: BrowserContext,
known: set[str],
total: int,
retry_counts: dict[int, int],
video_map: dict[str, Any],
map_lock: asyncio.Lock,
shutdown_event: asyncio.Event,
reauth_lock: asyncio.Lock,
reauth_done: list[bool],
site_key: str,
site_cfg: dict[str, str],
) -> None:
base_url = site_cfg["base_url"]
cookie_domain = urlparse(base_url).hostname or site_cfg["cookie_domain"]
env_prefix = site_cfg["env_prefix"]
page = await context.new_page()
video_hits = set()
video_hits: set[str] = set()
page.on("response", lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None)
page.on(
"response",
lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
)
try:
while not shutdown_event.is_set():
@@ -279,19 +379,69 @@ async def worker(worker_id, queue, context, known,
await page.goto(url, wait_until="networkidle", timeout=60000)
except Exception as e:
print(f"[W{worker_id}] Navigation error: {e}")
if _expects_video(url) and attempt < MAX_RETRIES:
if expects_video(url) and attempt < MAX_RETRIES:
retry_counts[idx] = attempt + 1
queue.put_nowait((idx, url))
print(f"[W{worker_id}] Re-queued for retry.")
elif not _expects_video(url):
elif not expects_video(url):
async with map_lock:
entry = video_map.get(url, {})
entry["scraped_at"] = int(time.time())
video_map[url] = entry
save_video_map(video_map)
save_video_map(video_map, site_key)
else:
print(
f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run.")
f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run."
)
continue
if "NoDirectAccessAllowed" in page.url:
recovered = False
async with reauth_lock:
if not reauth_done[0]:
username = os.environ.get(f"{env_prefix}_USERNAME", "").strip()
password = os.environ.get(f"{env_prefix}_PASSWORD", "").strip()
if username and password:
print(f"[W{worker_id}] Cookie expired — re-authenticating…")
try:
new_name, new_value = await asyncio.to_thread(
login_and_get_cookie, username, password, base_url
)
update_env(
new_name,
new_value,
env_key=f"{env_prefix}_LOGIN_COOKIE",
)
await context.add_cookies(
[
{
"name": new_name,
"value": new_value,
"domain": cookie_domain,
"path": "/",
"httpOnly": True,
"secure": True,
"sameSite": "None",
}
]
)
reauth_done[0] = True
recovered = True
print(f"[W{worker_id}] Re-auth succeeded — re-queuing.")
except Exception as e:
print(f"[W{worker_id}] Re-auth failed: {e}")
shutdown_event.set()
else:
print(
f"[W{worker_id}] Cookie expired. "
f"Set {env_prefix}_USERNAME + {env_prefix}_PASSWORD "
"in .env for auto re-auth."
)
shutdown_event.set()
else:
recovered = True # another worker already re-authed
if recovered:
queue.put_nowait((idx, url))
continue
await asyncio.sleep(1.5)
@@ -301,9 +451,15 @@ async def worker(worker_id, queue, context, known,
found = set(html_videos) | set(video_hits)
video_hits.clear()
all_videos = [m for m in found if m not in (
f"{BASE_URL}/wp-content/plugins/easy-video-player/lib/blank.mp4",
)]
all_videos = [
m
for m in found
if is_valid_url(m)
and m
not in (
f"{base_url}/wp-content/plugins/easy-video-player/lib/blank.mp4",
)
]
async with map_lock:
new_found = found - known
@@ -312,7 +468,8 @@ async def worker(worker_id, queue, context, known,
known.update(new_found)
elif all_videos:
print(
f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write.")
f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write."
)
else:
print(f"[W{worker_id}] No video found on page.")
@@ -322,30 +479,159 @@ async def worker(worker_id, queue, context, known,
existing_videos = set(entry.get("videos", []))
existing_videos.update(all_videos)
entry["videos"] = sorted(existing_videos)
mark_done = bool(all_videos) or not _expects_video(url)
mark_done = bool(all_videos) or not expects_video(url)
if mark_done:
entry["scraped_at"] = int(time.time())
video_map[url] = entry
save_video_map(video_map)
save_video_map(video_map, site_key)
if not mark_done:
if attempt < MAX_RETRIES:
retry_counts[idx] = attempt + 1
queue.put_nowait((idx, url))
print(
f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES}).")
f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES})."
)
else:
print(
f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run.")
f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run."
)
finally:
await page.close()
async def run():
async def run_for_site(
site_key: str,
site_cfg: dict[str, str],
shutdown_event: asyncio.Event,
) -> None:
base_url = site_cfg["base_url"]
cookie_domain = urlparse(base_url).hostname or site_cfg["cookie_domain"]
wp_api = f"{base_url}/wp-json/wp/v2"
cookie_name, cookie_value = _get_login_cookie(site_key, site_cfg)
req_headers = _api_headers(base_url, cookie_name, cookie_value)
urls = load_post_urls(site_key, base_url, wp_api, req_headers)
video_map = load_video_map(site_key)
if any(
u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")
for u in urls
if expects_video(u)
):
fetch_metadata_from_api(
site_key, base_url, wp_api, video_map, urls, req_headers
)
known = {u for entry in video_map.values() for u in entry.get("videos", [])}
total = len(urls)
pending = []
needs_map = 0
for i, u in enumerate(urls):
entry = video_map.get(u, {})
if not entry.get("scraped_at"):
pending.append((i, u))
elif expects_video(u) and not entry.get("videos"):
pending.append((i, u))
needs_map += 1
done_count = sum(1 for v in video_map.values() if v.get("scraped_at"))
print(f"[{site_key}] Loaded {total} post URLs.")
print(f"[{site_key}] Already have {len(known)} video URLs mapped.")
print(f"[{site_key}] Video map: {len(video_map)} entries in video_map.json")
if done_count:
remaining_new = len(pending) - needs_map
print(
f"[{site_key}] Resuming: {done_count} done, "
f"{remaining_new} new + {needs_map} needing map data."
)
if not pending:
print(f"[{site_key}] All URLs already processed and mapped.")
return
print(
f"[{site_key}] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n"
)
queue: asyncio.Queue[tuple[int, str]] = asyncio.Queue()
for item in pending:
queue.put_nowait(item)
map_lock = asyncio.Lock()
reauth_lock = asyncio.Lock()
reauth_done: list[bool] = [False]
retry_counts: dict[int, int] = {}
async with async_playwright() as p:
browser = await p.firefox.launch(headless=True)
context = await browser.new_context()
site_cookies = [
{
"name": cookie_name,
"value": cookie_value,
"domain": cookie_domain,
"path": "/",
"httpOnly": True,
"secure": True,
"sameSite": "None",
},
{
"name": "eav-age-verified",
"value": "1",
"domain": cookie_domain,
"path": "/",
},
]
await context.add_cookies(site_cookies) # type: ignore[arg-type]
num_workers = min(MAX_WORKERS, len(pending))
workers = [
asyncio.create_task(
worker(
i,
queue,
context,
known,
total,
retry_counts,
video_map,
map_lock,
shutdown_event,
reauth_lock,
reauth_done,
site_key,
site_cfg,
)
)
for i in range(num_workers)
]
await asyncio.gather(*workers)
await browser.close()
mapped = sum(1 for v in video_map.values() if v.get("videos"))
print(
f"\n[{site_key}] Video map: {mapped} posts with videos, {len(video_map)} total entries."
)
if not shutdown_event.is_set():
print(f"[{site_key}] Completed. Full map in video_map.json")
else:
done = sum(1 for v in video_map.values() if v.get("scraped_at"))
print(f"[{site_key}] Paused — {done}/{total} done. Run again to resume.")
async def run(selected_sites: list[str], explicit: bool) -> None:
shutdown_event = asyncio.Event()
loop = asyncio.get_running_loop()
def _handle_shutdown(signum, _frame):
def _handle_shutdown(signum: int, _: object) -> None:
print(f"\n[!] Signal {signum} received — finishing active pages then exiting…")
loop.call_soon_threadsafe(shutdown_event.set)
@@ -353,110 +639,42 @@ async def run():
signal.signal(signal.SIGTERM, _handle_shutdown)
try:
cookie_name, cookie_value = _get_login_cookie()
req_headers = {
**API_HEADERS,
"Cookie": f"{cookie_name}={cookie_value}; eav-age-verified=1",
}
urls = load_post_urls(req_headers)
video_map = load_video_map()
if any(u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")
for u in urls if _expects_video(u)):
fetch_metadata_from_api(video_map, urls, req_headers)
known = {u for entry in video_map.values() for u in entry.get("videos", [])}
total = len(urls)
pending = []
needs_map = 0
for i, u in enumerate(urls):
entry = video_map.get(u, {})
if not entry.get("scraped_at"):
pending.append((i, u))
elif _expects_video(u) and not entry.get("videos"):
pending.append((i, u))
needs_map += 1
done_count = sum(1 for v in video_map.values() if v.get("scraped_at"))
print(f"[+] Loaded {total} post URLs.")
print(f"[+] Already have {len(known)} video URLs mapped.")
print(f"[+] Video map: {len(video_map)} entries in {VIDEO_MAP_FILE}")
if done_count:
remaining_new = len(pending) - needs_map
print(
f"[↻] Resuming: {done_count} done, {remaining_new} new + {needs_map} needing map data.")
if not pending:
print("[✓] All URLs already processed and mapped.")
return
print(
f"[⚡] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n")
queue = asyncio.Queue()
for item in pending:
queue.put_nowait(item)
map_lock = asyncio.Lock()
retry_counts = {}
async with async_playwright() as p:
browser = await p.firefox.launch(headless=True)
context = await browser.new_context()
_cookie_domain = urlparse(BASE_URL).netloc
site_cookies = [
{
"name": cookie_name,
"value": cookie_value,
"domain": _cookie_domain,
"path": "/",
"httpOnly": True,
"secure": True,
"sameSite": "None"
},
{
"name": "eav-age-verified",
"value": "1",
"domain": _cookie_domain,
"path": "/"
}
]
await context.add_cookies(site_cookies)
num_workers = min(MAX_WORKERS, len(pending))
workers = [
asyncio.create_task(
worker(i, queue, context, known,
total, retry_counts, video_map, map_lock, shutdown_event)
)
for i in range(num_workers)
]
await asyncio.gather(*workers)
await browser.close()
mapped = sum(1 for v in video_map.values() if v.get("videos"))
print(
f"\n[+] Video map: {mapped} posts with videos, {len(video_map)} total entries.")
if not shutdown_event.is_set():
print(f"[✓] Completed. Full map in {VIDEO_MAP_FILE}")
else:
done = sum(1 for v in video_map.values() if v.get("scraped_at"))
print(f"[⏸] Paused — {done}/{total} done. Run again to resume.")
for site_key in selected_sites:
if shutdown_event.is_set():
break
site_cfg = SITES[site_key]
if not _has_credentials(site_cfg):
if explicit:
raise RuntimeError(
f"No credentials or cookie found for {site_key}. See .env.example."
)
print(f"[{site_key}] No credentials found — skipping.")
continue
print(f"\n{'=' * 60}")
print(f" Site: {site_key} ({site_cfg['base_url']})")
print(f"{'=' * 60}\n")
await run_for_site(site_key, site_cfg, shutdown_event)
finally:
signal.signal(signal.SIGINT, signal.SIG_DFL)
signal.signal(signal.SIGTERM, signal.SIG_DFL)
def main():
def main() -> None:
parser = argparse.ArgumentParser(description="Scrape video URLs from member sites")
parser.add_argument(
"--site",
action="append",
choices=list(SITES.keys()),
dest="sites",
metavar="SITE",
help=f"Site(s) to scrape (default: all). Can be repeated. Choices: {', '.join(SITES)}",
)
args = parser.parse_args()
explicit = bool(args.sites)
selected = args.sites or list(SITES.keys())
try:
asyncio.run(run())
asyncio.run(run(selected, explicit))
except KeyboardInterrupt:
print("\n[!] Interrupted. Run again to resume.")
except RuntimeError as e: