feat(scraping): Playwright proxy support for protected sources
Some checks failed
CI / Frontend Lint & Type Check (push) Has been cancelled
CI / Frontend Build (push) Has been cancelled
CI / Backend Lint (push) Has been cancelled
CI / Backend Tests (push) Has been cancelled
CI / Docker Build (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
Deploy / Build & Push Images (push) Has been cancelled
Deploy / Deploy to Server (push) Has been cancelled
Deploy / Notify (push) Has been cancelled

- Add proxy configuration via SCRAPER_PLAYWRIGHT_PROXY / SCRAPER_PROXY_URL / SCRAPER_HTTP_PROXY (first one set wins)
- Allow headless toggle via PLAYWRIGHT_HEADLESS (defaults to "true"; "1", "true" or "yes" enable headless mode, anything else disables it)
- Use wait_until="domcontentloaded" (instead of "networkidle") with an explicit 60 s timeout for GoDaddy/NameJet navigation
This commit is contained in:
2025-12-11 21:52:28 +01:00
parent 31f27123db
commit b5485cf13c

View File

@ -27,10 +27,12 @@ Usage:
import asyncio
import json
import logging
import os
import random
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from pathlib import Path
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
@ -87,9 +89,29 @@ class PlaywrightScraperService:
try:
self.playwright = await async_playwright().start()
proxy_url = (
os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
or os.getenv("SCRAPER_PROXY_URL")
or os.getenv("SCRAPER_HTTP_PROXY")
)
proxy_config = None
if proxy_url:
parsed = urlparse(proxy_url)
if parsed.scheme and parsed.hostname and parsed.port:
proxy_config = {
"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
}
if parsed.username:
proxy_config["username"] = parsed.username
if parsed.password:
proxy_config["password"] = parsed.password
headless = os.getenv("PLAYWRIGHT_HEADLESS", "true").lower() in ("1", "true", "yes")
# Launch with stealth settings
self.browser = await self.playwright.chromium.launch(
headless=True,
headless=headless,
proxy=proxy_config,
args=[
"--disable-blink-features=AutomationControlled",
"--disable-dev-shm-usage",
@ -227,7 +249,7 @@ class PlaywrightScraperService:
# Navigate to GoDaddy Auctions
logger.info("Navigating to GoDaddy Auctions...")
await page.goto("https://auctions.godaddy.com/beta", wait_until="networkidle")
await page.goto("https://auctions.godaddy.com/beta", wait_until="domcontentloaded", timeout=60_000)
# Wait for Cloudflare
await self._wait_for_cloudflare(page)
@ -389,7 +411,7 @@ class PlaywrightScraperService:
# Navigate to NameJet auctions page
logger.info("Navigating to NameJet...")
await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="networkidle")
await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="domcontentloaded", timeout=60_000)
# Wait for Cloudflare
await self._wait_for_cloudflare(page)