Cookie validation logic

This commit is contained in:
HugeFrog24
2026-02-28 21:37:39 +01:00
parent 80444405e9
commit e3e14293cd
15 changed files with 802 additions and 398 deletions

354
main.py
View File

@@ -7,27 +7,36 @@ import asyncio
import tempfile
import requests
from pathlib import Path, PurePosixPath
from typing import Any, Optional
from urllib.parse import urlparse
from dotenv import load_dotenv
from playwright.async_api import async_playwright
from check_clashes import VIDEO_EXTS
from playwright.async_api import async_playwright, BrowserContext
from check_clashes import VIDEO_EXTS, load_video_map, is_valid_url, VIDEO_MAP_FILE, expects_video
from config import BASE_URL
from grab_cookie import login_and_get_cookie, update_env
load_dotenv()
def _is_video_url(url: str) -> bool:
    """True if `url` ends with a recognised video extension (case-insensitive, path only).

    Only the URL path is inspected, so query strings and fragments never
    produce false positives.
    """
    path_suffix = PurePosixPath(urlparse(url).path).suffix
    return path_suffix.lower() in VIDEO_EXTS
WP_API = f"{BASE_URL}/wp-json/wp/v2"
SKIP_TYPES = {
"attachment", "nav_menu_item", "wp_block", "wp_template",
"wp_template_part", "wp_global_styles", "wp_navigation",
"wp_font_family", "wp_font_face",
"attachment",
"nav_menu_item",
"wp_block",
"wp_template",
"wp_template_part",
"wp_global_styles",
"wp_navigation",
"wp_font_family",
"wp_font_face",
}
VIDEO_MAP_FILE = "video_map.json"
MAX_WORKERS = 4
API_HEADERS = {
@@ -37,22 +46,53 @@ API_HEADERS = {
}
def _probe_cookie(name: str, value: str) -> bool:
    """HEAD request to a members-only video page. Returns True if the cookie is still valid.

    Returns False when no video URL is known yet, when the probe request fails
    at the network level, or when the server does not answer 200 — in every
    case the caller falls through to re-authentication instead of crashing.
    """
    video_map = load_video_map()
    probe_url = next((url for url in video_map if expects_video(url)), None)
    if probe_url is None:
        return False  # no video URLs yet — can't validate, fall through to re-auth
    try:
        r = requests.head(
            probe_url,
            headers={"Cookie": f"{name}={value}", "User-Agent": API_HEADERS["User-Agent"]},
            allow_redirects=False,
            timeout=10,
        )
    except requests.RequestException:
        # Network hiccup (timeout, DNS, reset) — treat the cookie as invalid
        # rather than aborting the whole run before re-auth gets a chance.
        return False
    return r.status_code == 200


def _get_login_cookie() -> tuple[str, str]:
    """Resolve a valid wordpress_logged_in_* cookie as a (name, value) pair.

    Preference order:
      1. WP_LOGIN_COOKIE from the environment — trusted as-is in cookie-only
         mode, validated via _probe_cookie when credentials are also present.
      2. Fresh login via WP_USERNAME/WP_PASSWORD; the new cookie is persisted
         to .env through update_env().

    Raises:
        RuntimeError: when neither a usable cookie nor credentials are set.
    """
    username = os.environ.get("WP_USERNAME", "").strip()
    password = os.environ.get("WP_PASSWORD", "").strip()
    has_credentials = bool(username and password)

    raw = os.environ.get("WP_LOGIN_COOKIE", "").strip()  # strip accidental whitespace
    if raw:
        name, _, value = raw.partition("=")
        if value and name.startswith("wordpress_logged_in_"):
            if not has_credentials:
                return name, value  # cookie-only mode — trust it
            print("[+] Cookie found — validating…")
            if _probe_cookie(name, value):
                print("[✓] Cookie still valid — skipping login.")
                return name, value
            print("[!] Cookie expired — re-authenticating…")
        else:
            # A malformed cookie used to be a hard error; warn instead of
            # silently ignoring it so a typo in .env is still noticed.
            print(
                "[!] WP_LOGIN_COOKIE looks malformed "
                "(expected wordpress_logged_in_...=value) — ignoring it."
            )

    if has_credentials:
        cookie_name, cookie_value = login_and_get_cookie(username, password)
        action = update_env(cookie_name, cookie_value)
        print(f"[✓] Logged in: {cookie_name} ({action} in .env)")
        return cookie_name, cookie_value

    raise RuntimeError(
        "No credentials or cookie found. Set either:\n"
        " • WP_USERNAME + WP_PASSWORD (recommended — always gets a fresh cookie)\n"
        " • WP_LOGIN_COOKIE (fallback — may expire mid-run)\n"
        "See .env.example."
    )
def discover_content_types(session: requests.Session) -> list[tuple[str, str, str]]:
"""Query /wp-json/wp/v2/types and return a list of (name, rest_base, type_slug) for content types worth scraping."""
r = session.get(f"{WP_API}/types", timeout=30)
r.raise_for_status()
@@ -69,7 +109,12 @@ def discover_content_types(session):
return targets
def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
def fetch_all_posts_for_type(
session: requests.Session,
type_name: str,
rest_base: str,
type_slug: str,
) -> list[tuple[str, str, str]]:
"""Paginate one content type and return (url, title, description) tuples.
Uses the `link` field when available; falls back to building from slug."""
url_prefix = type_slug.replace("_", "-")
@@ -96,11 +141,15 @@ def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
else:
continue
title_obj = post.get("title", {})
title = title_obj.get("rendered", "") if isinstance(
title_obj, dict) else str(title_obj)
title = (
title_obj.get("rendered", "")
if isinstance(title_obj, dict)
else str(title_obj)
)
content_obj = post.get("content", {})
content_html = content_obj.get(
"rendered", "") if isinstance(content_obj, dict) else ""
content_html = (
content_obj.get("rendered", "") if isinstance(content_obj, dict) else ""
)
description = html_to_text(content_html) if content_html else ""
results.append((link, title, description))
print(f" {type_name} page {page}: {len(data)} items")
@@ -109,21 +158,25 @@ def fetch_all_posts_for_type(session, type_name, rest_base, type_slug):
return results
def fetch_post_urls_from_api(headers):
def fetch_post_urls_from_api(headers: dict[str, str]) -> list[str]:
"""Auto-discover all content types via the WP REST API and collect every post URL.
Also builds video_map.json with titles pre-populated."""
print("[+] video_map.json empty or missing — discovering content types from REST API…")
print(
"[+] video_map.json empty or missing — discovering content types from REST API…"
)
session = requests.Session()
session.headers.update(headers)
targets = discover_content_types(session)
print(
f"[+] Found {len(targets)} content types: {', '.join(name for name, _, _ in targets)}\n")
f"[+] Found {len(targets)} content types: {', '.join(name for name, _, _ in targets)}\n"
)
all_results = []
for type_name, rest_base, type_slug in targets:
type_results = fetch_all_posts_for_type(
session, type_name, rest_base, type_slug)
session, type_name, rest_base, type_slug
)
all_results.extend(type_results)
seen = set()
@@ -135,8 +188,11 @@ def fetch_post_urls_from_api(headers):
seen.add(url)
deduped_urls.append(url)
if url not in video_map:
video_map[url] = {"title": title,
"description": description, "videos": []}
video_map[url] = {
"title": title,
"description": description,
"videos": [],
}
else:
if not video_map[url].get("title"):
video_map[url]["title"] = title
@@ -145,18 +201,25 @@ def fetch_post_urls_from_api(headers):
save_video_map(video_map)
print(
f"\n[+] Discovered {len(deduped_urls)} unique URLs → saved to {VIDEO_MAP_FILE}")
print(
f"[+] Pre-populated {len(video_map)} entries in {VIDEO_MAP_FILE}")
f"\n[+] Discovered {len(deduped_urls)} unique URLs → saved to {VIDEO_MAP_FILE}"
)
print(f"[+] Pre-populated {len(video_map)} entries in {VIDEO_MAP_FILE}")
return deduped_urls
def fetch_metadata_from_api(video_map, urls, headers):
def fetch_metadata_from_api(
video_map: dict[str, Any],
urls: list[str],
headers: dict[str, str],
) -> None:
"""Populate missing titles and descriptions in video_map from the REST API."""
missing = [u for u in urls
if u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")]
missing = [
u
for u in urls
if u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")
]
if not missing:
return
@@ -168,7 +231,8 @@ def fetch_metadata_from_api(video_map, urls, headers):
for type_name, rest_base, type_slug in targets:
type_results = fetch_all_posts_for_type(
session, type_name, rest_base, type_slug)
session, type_name, rest_base, type_slug
)
for url, title, description in type_results:
if url in video_map:
if not video_map[url].get("title"):
@@ -176,18 +240,20 @@ def fetch_metadata_from_api(video_map, urls, headers):
if not video_map[url].get("description"):
video_map[url]["description"] = description
else:
video_map[url] = {"title": title,
"description": description, "videos": []}
video_map[url] = {
"title": title,
"description": description,
"videos": [],
}
save_video_map(video_map)
populated_t = sum(1 for u in urls if video_map.get(u, {}).get("title"))
populated_d = sum(1 for u in urls if video_map.get(
u, {}).get("description"))
populated_d = sum(1 for u in urls if video_map.get(u, {}).get("description"))
print(f"[+] Titles populated: {populated_t}/{len(urls)}")
print(f"[+] Descriptions populated: {populated_d}/{len(urls)}")
def load_post_urls(headers):
def load_post_urls(headers: dict[str, str]) -> list[str]:
vm = load_video_map()
if vm:
print(f"[+] {VIDEO_MAP_FILE} found — loading {len(vm)} post URLs.")
@@ -195,48 +261,40 @@ def load_post_urls(headers):
return fetch_post_urls_from_api(headers)
def html_to_text(html_str: str) -> str:
    """Strip HTML tags, decode entities, and collapse whitespace into clean plain text."""
    import html

    # Turn structural tags into newlines before any markup is removed.
    text = re.sub(r"<br\s*/?>", "\n", html_str).replace("</p>", "\n\n")
    # Drop remaining tags, then decode entities (&amp; → &, etc.).
    text = html.unescape(re.sub(r"<[^>]+>", "", text))
    # Trim every line, then squeeze runs of 3+ newlines down to one blank line.
    text = "\n".join(line.strip() for line in text.splitlines())
    return re.sub(r"\n{3,}", "\n\n", text).strip()
def extract_mp4_from_html(html: str) -> list[str]:
    """Return every absolute http(s) URL in `html` whose path has a known video extension."""
    url_pattern = r'https?://[^\s"\'<>]+'
    return [candidate for candidate in re.findall(url_pattern, html) if _is_video_url(candidate)]
def extract_title_from_html(html: str) -> Optional[str]:
    """Best-effort page title: prefer the entry-title <h1>, fall back to <title>.

    The <title> fallback trims a trailing ` - …` / ` | …` site-name suffix.
    Returns None when neither element is present.
    """
    h1_match = re.search(r'<h1[^>]*class="entry-title"[^>]*>(.*?)</h1>', html, re.DOTALL)
    if h1_match is not None:
        # The heading may contain inline markup — strip it before returning.
        return re.sub(r"<[^>]+>", "", h1_match.group(1)).strip()
    title_match = re.search(r"<title>(.*?)(?:\s*[-|].*)?</title>", html, re.DOTALL)
    return title_match.group(1).strip() if title_match else None
def load_video_map():
    """Read the video map JSON file; return {} when it is absent, unreadable, or corrupt."""
    if not Path(VIDEO_MAP_FILE).exists():
        return {}
    try:
        with open(VIDEO_MAP_FILE, encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError):
        # A damaged map is treated as empty rather than aborting the run.
        return {}
def save_video_map(video_map):
fd, tmp_path = tempfile.mkstemp(dir=Path(VIDEO_MAP_FILE).resolve().parent, suffix=".tmp")
def save_video_map(video_map: dict[str, Any]) -> None:
fd, tmp_path = tempfile.mkstemp(
dir=Path(VIDEO_MAP_FILE).resolve().parent, suffix=".tmp"
)
try:
with os.fdopen(fd, "w", encoding="utf-8") as f:
json.dump(video_map, f, indent=2, ensure_ascii=False)
@@ -250,19 +308,30 @@ def save_video_map(video_map):
def _expects_video(url):
    """True when `url` lies under the members-only /pinkcuffs-videos/ section."""
    return url.find("/pinkcuffs-videos/") != -1


# How many times a page that should have a video is re-queued before giving up.
MAX_RETRIES = 2
async def worker(worker_id, queue, context, known,
total, retry_counts, video_map, map_lock, shutdown_event):
async def worker(
worker_id: int,
queue: asyncio.Queue[tuple[int, str]],
context: BrowserContext,
known: set[str],
total: int,
retry_counts: dict[int, int],
video_map: dict[str, Any],
map_lock: asyncio.Lock,
shutdown_event: asyncio.Event,
reauth_lock: asyncio.Lock,
reauth_done: list[bool],
cookie_domain: str,
) -> None:
page = await context.new_page()
video_hits = set()
page.on("response", lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None)
page.on(
"response",
lambda resp: video_hits.add(resp.url) if _is_video_url(resp.url) else None,
)
try:
while not shutdown_event.is_set():
@@ -279,11 +348,11 @@ async def worker(worker_id, queue, context, known,
await page.goto(url, wait_until="networkidle", timeout=60000)
except Exception as e:
print(f"[W{worker_id}] Navigation error: {e}")
if _expects_video(url) and attempt < MAX_RETRIES:
if expects_video(url) and attempt < MAX_RETRIES:
retry_counts[idx] = attempt + 1
queue.put_nowait((idx, url))
print(f"[W{worker_id}] Re-queued for retry.")
elif not _expects_video(url):
elif not expects_video(url):
async with map_lock:
entry = video_map.get(url, {})
entry["scraped_at"] = int(time.time())
@@ -291,7 +360,48 @@ async def worker(worker_id, queue, context, known,
save_video_map(video_map)
else:
print(
f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run.")
f"[W{worker_id}] Still failing after {MAX_RETRIES} retries — will retry next run."
)
continue
if "NoDirectAccessAllowed" in page.url:
recovered = False
async with reauth_lock:
if not reauth_done[0]:
username = os.environ.get("WP_USERNAME", "").strip()
password = os.environ.get("WP_PASSWORD", "").strip()
if username and password:
print(f"[W{worker_id}] Cookie expired — re-authenticating…")
try:
new_name, new_value = await asyncio.to_thread(
login_and_get_cookie, username, password
)
update_env(new_name, new_value)
await context.add_cookies([{
"name": new_name,
"value": new_value,
"domain": cookie_domain,
"path": "/",
"httpOnly": True,
"secure": True,
"sameSite": "None",
}])
reauth_done[0] = True
recovered = True
print(f"[W{worker_id}] Re-auth succeeded — re-queuing.")
except Exception as e:
print(f"[W{worker_id}] Re-auth failed: {e}")
shutdown_event.set()
else:
print(
f"[W{worker_id}] Cookie expired. "
"Set WP_USERNAME + WP_PASSWORD in .env for auto re-auth."
)
shutdown_event.set()
else:
recovered = True # another worker already re-authed
if recovered:
queue.put_nowait((idx, url))
continue
await asyncio.sleep(1.5)
@@ -301,9 +411,15 @@ async def worker(worker_id, queue, context, known,
found = set(html_videos) | set(video_hits)
video_hits.clear()
all_videos = [m for m in found if m not in (
f"{BASE_URL}/wp-content/plugins/easy-video-player/lib/blank.mp4",
)]
all_videos = [
m
for m in found
if is_valid_url(m)
and m
not in (
f"{BASE_URL}/wp-content/plugins/easy-video-player/lib/blank.mp4",
)
]
async with map_lock:
new_found = found - known
@@ -312,7 +428,8 @@ async def worker(worker_id, queue, context, known,
known.update(new_found)
elif all_videos:
print(
f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write.")
f"[W{worker_id}] {len(all_videos)} video(s) already known — skipping write."
)
else:
print(f"[W{worker_id}] No video found on page.")
@@ -322,7 +439,7 @@ async def worker(worker_id, queue, context, known,
existing_videos = set(entry.get("videos", []))
existing_videos.update(all_videos)
entry["videos"] = sorted(existing_videos)
mark_done = bool(all_videos) or not _expects_video(url)
mark_done = bool(all_videos) or not expects_video(url)
if mark_done:
entry["scraped_at"] = int(time.time())
video_map[url] = entry
@@ -333,19 +450,21 @@ async def worker(worker_id, queue, context, known,
retry_counts[idx] = attempt + 1
queue.put_nowait((idx, url))
print(
f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES}).")
f"[W{worker_id}] Re-queued for retry ({attempt + 1}/{MAX_RETRIES})."
)
else:
print(
f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run.")
f"[W{worker_id}] No video after {MAX_RETRIES} retries — will retry next run."
)
finally:
await page.close()
async def run():
async def run() -> None:
shutdown_event = asyncio.Event()
loop = asyncio.get_running_loop()
def _handle_shutdown(signum, _frame):
def _handle_shutdown(signum: int, _frame: object) -> None:
print(f"\n[!] Signal {signum} received — finishing active pages then exiting…")
loop.call_soon_threadsafe(shutdown_event.set)
@@ -362,10 +481,13 @@ async def run():
urls = load_post_urls(req_headers)
video_map = load_video_map()
if any(u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")
for u in urls if _expects_video(u)):
if any(
u not in video_map
or not video_map[u].get("title")
or not video_map[u].get("description")
for u in urls
if expects_video(u)
):
fetch_metadata_from_api(video_map, urls, req_headers)
known = {u for entry in video_map.values() for u in entry.get("videos", [])}
@@ -377,7 +499,7 @@ async def run():
entry = video_map.get(u, {})
if not entry.get("scraped_at"):
pending.append((i, u))
elif _expects_video(u) and not entry.get("videos"):
elif expects_video(u) and not entry.get("videos"):
pending.append((i, u))
needs_map += 1
@@ -388,26 +510,31 @@ async def run():
if done_count:
remaining_new = len(pending) - needs_map
print(
f"[↻] Resuming: {done_count} done, {remaining_new} new + {needs_map} needing map data.")
f"[↻] Resuming: {done_count} done, {remaining_new} new + {needs_map} needing map data."
)
if not pending:
print("[✓] All URLs already processed and mapped.")
return
print(
f"[⚡] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n")
f"[⚡] Running with {min(MAX_WORKERS, len(pending))} concurrent workers.\n"
)
queue = asyncio.Queue()
queue: asyncio.Queue[tuple[int, str]] = asyncio.Queue()
for item in pending:
queue.put_nowait(item)
map_lock = asyncio.Lock()
retry_counts = {}
reauth_lock = asyncio.Lock()
reauth_done: list[bool] = [False]
retry_counts: dict[int, int] = {}
async with async_playwright() as p:
browser = await p.firefox.launch(headless=True)
context = await browser.new_context()
_cookie_domain = urlparse(BASE_URL).netloc
_parsed = urlparse(BASE_URL)
_cookie_domain = _parsed.hostname or _parsed.netloc
site_cookies = [
{
"name": cookie_name,
@@ -416,23 +543,35 @@ async def run():
"path": "/",
"httpOnly": True,
"secure": True,
"sameSite": "None"
"sameSite": "None",
},
{
"name": "eav-age-verified",
"value": "1",
"domain": _cookie_domain,
"path": "/"
}
"path": "/",
},
]
await context.add_cookies(site_cookies)
await context.add_cookies(site_cookies) # type: ignore[arg-type]
num_workers = min(MAX_WORKERS, len(pending))
workers = [
asyncio.create_task(
worker(i, queue, context, known,
total, retry_counts, video_map, map_lock, shutdown_event)
worker(
i,
queue,
context,
known,
total,
retry_counts,
video_map,
map_lock,
shutdown_event,
reauth_lock,
reauth_done,
_cookie_domain,
)
)
for i in range(num_workers)
]
@@ -442,7 +581,8 @@ async def run():
mapped = sum(1 for v in video_map.values() if v.get("videos"))
print(
f"\n[+] Video map: {mapped} posts with videos, {len(video_map)} total entries.")
f"\n[+] Video map: {mapped} posts with videos, {len(video_map)} total entries."
)
if not shutdown_event.is_set():
print(f"[✓] Completed. Full map in {VIDEO_MAP_FILE}")
@@ -454,7 +594,7 @@ async def run():
signal.signal(signal.SIGTERM, signal.SIG_DFL)
def main():
def main() -> None:
try:
asyncio.run(run())
except KeyboardInterrupt: