feat(scraping): Playwright proxy support for protected sources

- Add proxy configuration via SCRAPER_PLAYWRIGHT_PROXY, SCRAPER_PROXY_URL, or SCRAPER_HTTP_PROXY (checked in that order)
- Allow headless toggle via PLAYWRIGHT_HEADLESS (defaults to "true"; "1", "true", or "yes" enable it, case-insensitively)
- Use wait_until="domcontentloaded" instead of "networkidle", with an explicit 60-second timeout, for GoDaddy/NameJet navigation
This commit is contained in:
2025-12-11 21:52:28 +01:00
parent 5e0d4c6590
commit 3323f33d7c

View File

@ -27,10 +27,12 @@ Usage:
import asyncio
import json
import logging
import os
import random
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from pathlib import Path
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
@ -87,9 +89,29 @@ class PlaywrightScraperService:
try:
self.playwright = await async_playwright().start()
proxy_url = (
os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
or os.getenv("SCRAPER_PROXY_URL")
or os.getenv("SCRAPER_HTTP_PROXY")
)
proxy_config = None
if proxy_url:
parsed = urlparse(proxy_url)
if parsed.scheme and parsed.hostname and parsed.port:
proxy_config = {
"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
}
if parsed.username:
proxy_config["username"] = parsed.username
if parsed.password:
proxy_config["password"] = parsed.password
headless = os.getenv("PLAYWRIGHT_HEADLESS", "true").lower() in ("1", "true", "yes")
# Launch with stealth settings
self.browser = await self.playwright.chromium.launch(
headless=True,
headless=headless,
proxy=proxy_config,
args=[
"--disable-blink-features=AutomationControlled",
"--disable-dev-shm-usage",
@ -227,7 +249,7 @@ class PlaywrightScraperService:
# Navigate to GoDaddy Auctions
logger.info("Navigating to GoDaddy Auctions...")
await page.goto("https://auctions.godaddy.com/beta", wait_until="networkidle")
await page.goto("https://auctions.godaddy.com/beta", wait_until="domcontentloaded", timeout=60_000)
# Wait for Cloudflare
await self._wait_for_cloudflare(page)
@ -389,7 +411,7 @@ class PlaywrightScraperService:
# Navigate to NameJet auctions page
logger.info("Navigating to NameJet...")
await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="networkidle")
await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="domcontentloaded", timeout=60_000)
# Wait for Cloudflare
await self._wait_for_cloudflare(page)