diff --git a/backend/app/services/playwright_scraper.py b/backend/app/services/playwright_scraper.py index 71b5c1d..292de52 100644 --- a/backend/app/services/playwright_scraper.py +++ b/backend/app/services/playwright_scraper.py @@ -27,10 +27,12 @@ Usage: import asyncio import json import logging +import os import random from datetime import datetime, timedelta from typing import Any, Dict, List, Optional from pathlib import Path +from urllib.parse import urlparse logger = logging.getLogger(__name__) @@ -87,9 +89,29 @@ class PlaywrightScraperService: try: self.playwright = await async_playwright().start() + proxy_url = ( + os.getenv("SCRAPER_PLAYWRIGHT_PROXY") + or os.getenv("SCRAPER_PROXY_URL") + or os.getenv("SCRAPER_HTTP_PROXY") + ) + proxy_config = None + if proxy_url: + parsed = urlparse(proxy_url) + if parsed.scheme and parsed.hostname and parsed.port: + proxy_config = { + "server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}", + } + if parsed.username: + proxy_config["username"] = parsed.username + if parsed.password: + proxy_config["password"] = parsed.password + + headless = os.getenv("PLAYWRIGHT_HEADLESS", "true").lower() in ("1", "true", "yes") + # Launch with stealth settings self.browser = await self.playwright.chromium.launch( - headless=True, + headless=headless, + proxy=proxy_config, args=[ "--disable-blink-features=AutomationControlled", "--disable-dev-shm-usage", @@ -227,7 +249,7 @@ class PlaywrightScraperService: # Navigate to GoDaddy Auctions logger.info("Navigating to GoDaddy Auctions...") - await page.goto("https://auctions.godaddy.com/beta", wait_until="networkidle") + await page.goto("https://auctions.godaddy.com/beta", wait_until="domcontentloaded", timeout=60_000) # Wait for Cloudflare await self._wait_for_cloudflare(page) @@ -389,7 +411,7 @@ class PlaywrightScraperService: # Navigate to NameJet auctions page logger.info("Navigating to NameJet...") - await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="networkidle") + await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="domcontentloaded", timeout=60_000) # Wait for Cloudflare await self._wait_for_cloudflare(page)