feat(scraping): Playwright proxy support for protected sources
- Add proxy configuration via SCRAPER_PLAYWRIGHT_PROXY / SCRAPER_PROXY_URL / SCRAPER_HTTP_PROXY (checked in that order)
- Allow headless toggle via PLAYWRIGHT_HEADLESS (defaults to true)
- Use domcontentloaded + longer (60 s) timeouts for GoDaddy/NameJet navigation
This commit is contained in:
@@ -27,10 +27,12 @@ Usage:
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -87,9 +89,29 @@ class PlaywrightScraperService:
|
||||
try:
|
||||
self.playwright = await async_playwright().start()
|
||||
|
||||
proxy_url = (
|
||||
os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
|
||||
or os.getenv("SCRAPER_PROXY_URL")
|
||||
or os.getenv("SCRAPER_HTTP_PROXY")
|
||||
)
|
||||
proxy_config = None
|
||||
if proxy_url:
|
||||
parsed = urlparse(proxy_url)
|
||||
if parsed.scheme and parsed.hostname and parsed.port:
|
||||
proxy_config = {
|
||||
"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
|
||||
}
|
||||
if parsed.username:
|
||||
proxy_config["username"] = parsed.username
|
||||
if parsed.password:
|
||||
proxy_config["password"] = parsed.password
|
||||
|
||||
headless = os.getenv("PLAYWRIGHT_HEADLESS", "true").lower() in ("1", "true", "yes")
|
||||
|
||||
# Launch with stealth settings
|
||||
self.browser = await self.playwright.chromium.launch(
|
||||
headless=True,
|
||||
headless=headless,
|
||||
proxy=proxy_config,
|
||||
args=[
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--disable-dev-shm-usage",
|
||||
@@ -227,7 +249,7 @@ class PlaywrightScraperService:
|
||||
|
||||
# Navigate to GoDaddy Auctions
|
||||
logger.info("Navigating to GoDaddy Auctions...")
|
||||
await page.goto("https://auctions.godaddy.com/beta", wait_until="networkidle")
|
||||
await page.goto("https://auctions.godaddy.com/beta", wait_until="domcontentloaded", timeout=60_000)
|
||||
|
||||
# Wait for Cloudflare
|
||||
await self._wait_for_cloudflare(page)
|
||||
@@ -389,7 +411,7 @@ class PlaywrightScraperService:
|
||||
|
||||
# Navigate to NameJet auctions page
|
||||
logger.info("Navigating to NameJet...")
|
||||
await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="networkidle")
|
||||
await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="domcontentloaded", timeout=60_000)
|
||||
|
||||
# Wait for Cloudflare
|
||||
await self._wait_for_cloudflare(page)
|
||||
|
||||
Reference in New Issue
Block a user