pounce/backend/app/services/playwright_scraper.py
"""
Playwright-based Stealth Scraper for Cloudflare-protected Domain Auction Sites.
This module uses Playwright with stealth plugins to bypass Cloudflare and other
anti-bot protections. It's designed for enterprise-grade web scraping.
Features:
- Stealth mode (undetectable browser fingerprint)
- Automatic Cloudflare bypass
- Connection pooling
- Retry logic with exponential backoff
- JSON extraction from rendered pages
- Cookie persistence across sessions
Supported Platforms:
- GoDaddy Auctions (Cloudflare protected)
- NameJet (Cloudflare protected)
- Any other protected auction site
Usage:
scraper = PlaywrightScraperService()
await scraper.initialize()
auctions = await scraper.scrape_godaddy()
await scraper.close()
"""
import asyncio
import json
import logging
import os
import random
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from pathlib import Path
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
# Try to import playwright (optional dependency)
try:
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
from playwright_stealth import Stealth
PLAYWRIGHT_AVAILABLE = True
except ImportError:
PLAYWRIGHT_AVAILABLE = False
Stealth = None
# Define dummy types for type hints
Browser = Any
BrowserContext = Any
Page = Any
logger.warning("Playwright not installed. Stealth scraping disabled.")
class PlaywrightScraperService:
"""
Enterprise-grade Playwright scraper with Cloudflare bypass.
Uses stealth techniques to appear as a real browser:
- Real Chrome user agent
- WebGL fingerprint spoofing
- Navigator property spoofing
- Timezone and locale matching
"""
# User agents that work well with Cloudflare
USER_AGENTS = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]
def __init__(self):
self.playwright = None
self.browser: Optional[Browser] = None
self.context: Optional[BrowserContext] = None
self._initialized = False
self._cookie_dir = Path(__file__).parent.parent.parent / "data" / "cookies"
self._cookie_dir.mkdir(parents=True, exist_ok=True)
async def initialize(self) -> bool:
"""Initialize the browser instance."""
if not PLAYWRIGHT_AVAILABLE:
logger.error("Playwright not available. Install with: pip install playwright playwright-stealth")
return False
if self._initialized:
return True
try:
self.playwright = await async_playwright().start()
proxy_url = (
os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
or os.getenv("SCRAPER_PROXY_URL")
or os.getenv("SCRAPER_HTTP_PROXY")
)
proxy_config = None
            if proxy_url:
                parsed = urlparse(proxy_url)
                if parsed.scheme and parsed.hostname and parsed.port:
                    proxy_config = {
                        "server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
                    }
                    if parsed.username:
                        proxy_config["username"] = parsed.username
                    if parsed.password:
                        proxy_config["password"] = parsed.password
                else:
                    logger.warning(
                        "Ignoring malformed proxy URL; expected scheme://host:port"
                    )
headless = os.getenv("PLAYWRIGHT_HEADLESS", "true").lower() in ("1", "true", "yes")
# Launch with stealth settings
self.browser = await self.playwright.chromium.launch(
headless=headless,
proxy=proxy_config,
args=[
"--disable-blink-features=AutomationControlled",
"--disable-dev-shm-usage",
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-infobars",
"--disable-extensions",
"--window-size=1920,1080",
]
)
# Create context with realistic settings
self.context = await self.browser.new_context(
user_agent=random.choice(self.USER_AGENTS),
viewport={"width": 1920, "height": 1080},
locale="en-US",
timezone_id="America/New_York",
geolocation={"longitude": -73.935242, "latitude": 40.730610},
permissions=["geolocation"],
)
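            # The locale, timezone, and geolocation above form a consistent
            # New York profile, so fingerprinting checks do not see
            # contradictory signals.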
# Load saved cookies if available
await self._load_cookies()
self._initialized = True
logger.info("Playwright browser initialized successfully")
return True
except Exception as e:
logger.exception(f"Failed to initialize Playwright: {e}")
return False
async def close(self):
"""Close browser and cleanup."""
if self.context:
await self._save_cookies()
await self.context.close()
if self.browser:
await self.browser.close()
if self.playwright:
await self.playwright.stop()
self._initialized = False
async def _load_cookies(self):
"""Load saved cookies from file."""
cookie_file = self._cookie_dir / "session_cookies.json"
if cookie_file.exists():
try:
with open(cookie_file) as f:
cookies = json.load(f)
await self.context.add_cookies(cookies)
logger.info(f"Loaded {len(cookies)} saved cookies")
except Exception as e:
logger.warning(f"Failed to load cookies: {e}")
async def _save_cookies(self):
"""Save cookies to file for persistence."""
try:
cookies = await self.context.cookies()
cookie_file = self._cookie_dir / "session_cookies.json"
with open(cookie_file, "w") as f:
json.dump(cookies, f)
logger.info(f"Saved {len(cookies)} cookies")
except Exception as e:
logger.warning(f"Failed to save cookies: {e}")
async def _create_stealth_page(self) -> Page:
"""Create a new page with stealth mode enabled."""
page = await self.context.new_page()
# Apply stealth mode
if Stealth:
stealth = Stealth(
navigator_webdriver=True,
chrome_runtime=True,
navigator_user_agent=True,
navigator_vendor=True,
webgl_vendor=True,
)
await stealth.apply_stealth_async(page)
return page
async def _wait_for_cloudflare(self, page: Page, timeout: int = 30):
"""Wait for Cloudflare challenge to complete."""
try:
# Wait for either the challenge to complete or content to load
await page.wait_for_function(
"""
() => {
// Check if we're past Cloudflare
const title = document.title.toLowerCase();
return !title.includes('just a moment') &&
!title.includes('attention required') &&
!title.includes('checking your browser');
}
""",
timeout=timeout * 1000
)
# Additional delay for any remaining JS to execute
await asyncio.sleep(2)
except Exception as e:
logger.warning(f"Cloudflare wait timeout: {e}")
# ═══════════════════════════════════════════════════════════════════════════════
# GODADDY AUCTIONS SCRAPER
# ═══════════════════════════════════════════════════════════════════════════════
async def scrape_godaddy(self, limit: int = 100) -> Dict[str, Any]:
"""
Scrape GoDaddy Auctions using Playwright.
GoDaddy uses Cloudflare + their own bot detection.
We intercept the API calls made by their frontend.
"""
if not await self.initialize():
return {"items": [], "total": 0, "error": "Playwright not initialized"}
page = None
try:
page = await self._create_stealth_page()
# Intercept XHR requests to capture auction data
captured_data = []
async def handle_response(response):
if "findApiProxy" in response.url and "auction" in response.url:
try:
data = await response.json()
captured_data.append(data)
                    except Exception:
                        pass
page.on("response", handle_response)
# Navigate to GoDaddy Auctions
logger.info("Navigating to GoDaddy Auctions...")
await page.goto("https://auctions.godaddy.com/beta", wait_until="domcontentloaded", timeout=60_000)
# Wait for Cloudflare
await self._wait_for_cloudflare(page)
# Wait for auction content to load
try:
await page.wait_for_selector('[data-testid="auction-card"], .auction-card, .domain-item', timeout=15000)
            except Exception:
                logger.warning("Auction cards not found, trying to scroll...")
# Scroll to trigger lazy loading
await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)")
await asyncio.sleep(2)
# Try to extract from intercepted API calls first
if captured_data:
                return self._parse_godaddy_api_response(captured_data, limit)
# Fallback: Extract from DOM
            return await self._extract_godaddy_from_dom(page, limit)
except Exception as e:
logger.exception(f"GoDaddy scraping error: {e}")
return {"items": [], "total": 0, "error": str(e)}
finally:
if page:
await page.close()
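    # Shape of the intercepted findApiProxy payload assumed by the parser
    # below (an illustrative sketch: the field names mirror the lookups in
    # _parse_godaddy_api_response, the values are invented):
    #
    #   {"results": [{"fqdn": "example.com",
    #                 "endingAt": "2025-01-01T00:00:00Z",
    #                 "price": 105, "minBid": 10, "bids": 3,
    #                 "buyNowPrice": 499, "traffic": 120,
    #                 "valuationPrice": 850}]}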
    def _parse_godaddy_api_response(self, captured_data: List[Dict], limit: int = 100) -> Dict[str, Any]:
"""Parse captured API response from GoDaddy."""
items = []
for data in captured_data:
results = data.get("results", [])
for item in results:
domain = item.get("fqdn", "") or item.get("domain", "")
if not domain:
continue
tld = domain.rsplit(".", 1)[-1] if "." in domain else ""
# Parse end time
end_time = None
end_at = item.get("endingAt") or item.get("auctionEndTime")
if end_at:
try:
end_time = datetime.fromisoformat(end_at.replace("Z", "+00:00")).replace(tzinfo=None)
                    except Exception:
                        pass
price = item.get("price") or item.get("currentBidPrice") or item.get("minBid") or 0
items.append({
"domain": domain,
"tld": tld,
"platform": "GoDaddy",
"current_bid": float(price) if price else 0,
"min_bid": float(item.get("minBid", 0) or 0),
"num_bids": int(item.get("bids", 0) or item.get("bidCount", 0) or 0),
"end_time": end_time or datetime.utcnow() + timedelta(days=1),
"buy_now_price": float(item.get("buyNowPrice")) if item.get("buyNowPrice") else None,
"auction_url": f"https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}&isc=cjcpounce",
"currency": "USD",
"is_active": True,
"traffic": int(item.get("traffic", 0) or 0),
"domain_authority": int(item.get("valuationPrice", 0) or 0),
})
        items = items[:limit]
        return {
            "items": items,
            "total": len(items),
            "source": "api_intercept",
        }
    async def _extract_godaddy_from_dom(self, page: Page, limit: int = 100) -> Dict[str, Any]:
"""Extract auction data from GoDaddy DOM when API intercept fails."""
items = []
try:
# Try different selectors
selectors = [
'[data-testid="auction-card"]',
'.auction-card',
'.domain-listing',
'tr[data-domain]',
'.domain-row',
]
for selector in selectors:
elements = await page.query_selector_all(selector)
if elements:
logger.info(f"Found {len(elements)} elements with selector: {selector}")
                for el in elements[:limit]:  # honor the requested limit
try:
# Try to extract domain name
domain_el = await el.query_selector('.domain-name, .fqdn, [data-domain], a[href*="domain"]')
if domain_el:
domain = await domain_el.text_content()
domain = domain.strip() if domain else ""
else:
domain = await el.get_attribute("data-domain") or ""
if not domain or "." not in domain:
continue
tld = domain.rsplit(".", 1)[-1]
# Try to extract price
price = 0
price_el = await el.query_selector('.price, .bid, .current-bid, [data-price]')
if price_el:
price_text = await price_el.text_content()
                                price = float("".join(c for c in (price_text or "") if c.isdigit() or c == ".") or "0")
items.append({
"domain": domain,
"tld": tld,
"platform": "GoDaddy",
"current_bid": price,
"min_bid": 0,
"num_bids": 0,
"end_time": datetime.utcnow() + timedelta(days=1),
"buy_now_price": None,
"auction_url": f"https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}&isc=cjcpounce",
"currency": "USD",
"is_active": True,
})
except Exception as e:
logger.debug(f"Error extracting element: {e}")
break # Found elements, stop trying other selectors
except Exception as e:
logger.exception(f"DOM extraction error: {e}")
return {
"items": items,
"total": len(items),
"source": "dom_extraction",
}
# ═══════════════════════════════════════════════════════════════════════════════
# NAMEJET SCRAPER
# ═══════════════════════════════════════════════════════════════════════════════
async def scrape_namejet(self, limit: int = 100) -> Dict[str, Any]:
"""
Scrape NameJet auctions using Playwright.
NameJet uses heavy Cloudflare protection.
"""
if not await self.initialize():
return {"items": [], "total": 0, "error": "Playwright not initialized"}
page = None
try:
page = await self._create_stealth_page()
# Navigate to NameJet auctions page
logger.info("Navigating to NameJet...")
await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="domcontentloaded", timeout=60_000)
# Wait for Cloudflare
await self._wait_for_cloudflare(page)
# Wait for auction table
try:
await page.wait_for_selector('#MainContent_gvAuctions, .auction-table, table', timeout=15000)
            except Exception:
                logger.warning("NameJet table not found")
# Extract data from table
items = []
rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row')
for row in rows[:limit]:
try:
cells = await row.query_selector_all('td')
if len(cells) < 3:
continue
# NameJet format: Domain, End Time, Price, Bids, ...
domain = await cells[0].text_content()
domain = domain.strip() if domain else ""
if not domain or "." not in domain:
continue
tld = domain.rsplit(".", 1)[-1]
# Parse price
price = 0
if len(cells) > 2:
price_text = await cells[2].text_content()
price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0")
# Parse bids
bids = 0
if len(cells) > 3:
bids_text = await cells[3].text_content()
bids = int("".join(c for c in (bids_text or "0") if c.isdigit()) or "0")
items.append({
"domain": domain,
"tld": tld,
"platform": "NameJet",
"current_bid": price,
"min_bid": 0,
"num_bids": bids,
"end_time": datetime.utcnow() + timedelta(days=1),
"buy_now_price": None,
"auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}",
"currency": "USD",
"is_active": True,
})
except Exception as e:
logger.debug(f"Error parsing row: {e}")
return {
"items": items,
"total": len(items),
"source": "playwright",
}
except Exception as e:
logger.exception(f"NameJet scraping error: {e}")
return {"items": [], "total": 0, "error": str(e)}
finally:
if page:
await page.close()
# ═══════════════════════════════════════════════════════════════════════════════
# UNIFIED SCRAPE METHOD
# ═══════════════════════════════════════════════════════════════════════════════
async def scrape_all_protected(self) -> Dict[str, Any]:
"""
Scrape all Cloudflare-protected platforms.
Returns combined results from:
- GoDaddy Auctions
- NameJet
"""
results = {
"total_found": 0,
"platforms": {},
"items": [],
"errors": [],
}
if not PLAYWRIGHT_AVAILABLE:
results["errors"].append("Playwright not installed")
return results
try:
await self.initialize()
# Scrape GoDaddy
logger.info("Scraping GoDaddy with Playwright...")
godaddy_result = await self.scrape_godaddy()
results["platforms"]["GoDaddy"] = {
"found": len(godaddy_result.get("items", [])),
"source": godaddy_result.get("source", "unknown"),
}
results["items"].extend(godaddy_result.get("items", []))
results["total_found"] += len(godaddy_result.get("items", []))
if godaddy_result.get("error"):
results["errors"].append(f"GoDaddy: {godaddy_result['error']}")
# Small delay between platforms
await asyncio.sleep(3)
# Scrape NameJet
logger.info("Scraping NameJet with Playwright...")
namejet_result = await self.scrape_namejet()
results["platforms"]["NameJet"] = {
"found": len(namejet_result.get("items", [])),
"source": namejet_result.get("source", "unknown"),
}
results["items"].extend(namejet_result.get("items", []))
results["total_found"] += len(namejet_result.get("items", []))
if namejet_result.get("error"):
results["errors"].append(f"NameJet: {namejet_result['error']}")
except Exception as e:
logger.exception(f"Playwright scraping error: {e}")
results["errors"].append(str(e))
finally:
await self.close()
return results
# Singleton instance
playwright_scraper = PlaywrightScraperService()
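# Minimal smoke-test entry point: a sketch that runs both protected-site
# scrapers once and prints a summary. It assumes the optional Playwright
# dependencies are installed (see the setup note near the imports).
if __name__ == "__main__":
    async def _demo() -> None:
        results = await playwright_scraper.scrape_all_protected()
        print(
            f"Found {results['total_found']} auctions across "
            f"{len(results['platforms'])} platform(s); errors: {results['errors']}"
        )

    asyncio.run(_demo())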