feat(scraping): Playwright proxy support for protected sources
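- Add proxy configuration via SCRAPER_PLAYWRIGHT_PROXY / SCRAPER_PROXY_URL / SCRAPER_HTTP_PROXY (checked in that order; the URL is only used when it includes a scheme, host, and port)
- Allow headless toggle via PLAYWRIGHT_HEADLESS (defaults to "true"; "1", "true", and "yes" enable headless mode)
- Use wait_until="domcontentloaded" with an explicit 60-second timeout instead of "networkidle" for GoDaddy/NameJet navigation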
This commit is contained in:
@ -27,10 +27,12 @@ Usage:
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import random
|
import random
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -87,9 +89,29 @@ class PlaywrightScraperService:
|
|||||||
try:
|
try:
|
||||||
self.playwright = await async_playwright().start()
|
self.playwright = await async_playwright().start()
|
||||||
|
|
||||||
|
proxy_url = (
|
||||||
|
os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
|
||||||
|
or os.getenv("SCRAPER_PROXY_URL")
|
||||||
|
or os.getenv("SCRAPER_HTTP_PROXY")
|
||||||
|
)
|
||||||
|
proxy_config = None
|
||||||
|
if proxy_url:
|
||||||
|
parsed = urlparse(proxy_url)
|
||||||
|
if parsed.scheme and parsed.hostname and parsed.port:
|
||||||
|
proxy_config = {
|
||||||
|
"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
|
||||||
|
}
|
||||||
|
if parsed.username:
|
||||||
|
proxy_config["username"] = parsed.username
|
||||||
|
if parsed.password:
|
||||||
|
proxy_config["password"] = parsed.password
|
||||||
|
|
||||||
|
headless = os.getenv("PLAYWRIGHT_HEADLESS", "true").lower() in ("1", "true", "yes")
|
||||||
|
|
||||||
# Launch with stealth settings
|
# Launch with stealth settings
|
||||||
self.browser = await self.playwright.chromium.launch(
|
self.browser = await self.playwright.chromium.launch(
|
||||||
headless=True,
|
headless=headless,
|
||||||
|
proxy=proxy_config,
|
||||||
args=[
|
args=[
|
||||||
"--disable-blink-features=AutomationControlled",
|
"--disable-blink-features=AutomationControlled",
|
||||||
"--disable-dev-shm-usage",
|
"--disable-dev-shm-usage",
|
||||||
@ -227,7 +249,7 @@ class PlaywrightScraperService:
|
|||||||
|
|
||||||
# Navigate to GoDaddy Auctions
|
# Navigate to GoDaddy Auctions
|
||||||
logger.info("Navigating to GoDaddy Auctions...")
|
logger.info("Navigating to GoDaddy Auctions...")
|
||||||
await page.goto("https://auctions.godaddy.com/beta", wait_until="networkidle")
|
await page.goto("https://auctions.godaddy.com/beta", wait_until="domcontentloaded", timeout=60_000)
|
||||||
|
|
||||||
# Wait for Cloudflare
|
# Wait for Cloudflare
|
||||||
await self._wait_for_cloudflare(page)
|
await self._wait_for_cloudflare(page)
|
||||||
@ -389,7 +411,7 @@ class PlaywrightScraperService:
|
|||||||
|
|
||||||
# Navigate to NameJet auctions page
|
# Navigate to NameJet auctions page
|
||||||
logger.info("Navigating to NameJet...")
|
logger.info("Navigating to NameJet...")
|
||||||
await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="networkidle")
|
await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="domcontentloaded", timeout=60_000)
|
||||||
|
|
||||||
# Wait for Cloudflare
|
# Wait for Cloudflare
|
||||||
await self._wait_for_cloudflare(page)
|
await self._wait_for_cloudflare(page)
|
||||||
|
|||||||
Reference in New Issue
Block a user