""" Playwright-based Stealth Scraper for Cloudflare-protected Domain Auction Sites. This module uses Playwright with stealth plugins to bypass Cloudflare and other anti-bot protections. It's designed for enterprise-grade web scraping. Features: - Stealth mode (undetectable browser fingerprint) - Automatic Cloudflare bypass - Connection pooling - Retry logic with exponential backoff - JSON extraction from rendered pages - Cookie persistence across sessions Supported Platforms: - GoDaddy Auctions (Cloudflare protected) - NameJet (Cloudflare protected) - Any other protected auction site Usage: scraper = PlaywrightScraperService() await scraper.initialize() auctions = await scraper.scrape_godaddy() await scraper.close() """ import asyncio import json import logging import os import random import re from datetime import datetime, timedelta from typing import Any, Dict, List, Optional from pathlib import Path from urllib.parse import urlparse from zoneinfo import ZoneInfo logger = logging.getLogger(__name__) # Try to import playwright (optional dependency) try: from playwright.async_api import async_playwright, Browser, BrowserContext, Page from playwright_stealth import Stealth PLAYWRIGHT_AVAILABLE = True except ImportError: PLAYWRIGHT_AVAILABLE = False Stealth = None # Define dummy types for type hints Browser = Any BrowserContext = Any Page = Any logger.warning("Playwright not installed. Stealth scraping disabled.") class PlaywrightScraperService: """ Enterprise-grade Playwright scraper with Cloudflare bypass. Uses stealth techniques to appear as a real browser: - Real Chrome user agent - WebGL fingerprint spoofing - Navigator property spoofing - Timezone and locale matching """ # User agents that work well with Cloudflare USER_AGENTS = [ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15", ] def __init__(self): self.playwright = None self.browser: Optional[Browser] = None self.context: Optional[BrowserContext] = None self._initialized = False self._cookie_dir = Path(__file__).parent.parent.parent / "data" / "cookies" self._cookie_dir.mkdir(parents=True, exist_ok=True) async def initialize(self) -> bool: """Initialize the browser instance.""" if not PLAYWRIGHT_AVAILABLE: logger.error("Playwright not available. 
    async def initialize(self) -> bool:
        """Initialize the browser instance."""
        if not PLAYWRIGHT_AVAILABLE:
            logger.error("Playwright not available. Install with: pip install playwright playwright-stealth")
            return False
        if self._initialized:
            return True
        try:
            self.playwright = await async_playwright().start()

            # Proxy selection:
            # - SCRAPER_PLAYWRIGHT_PROXY: single proxy URL
            # - SCRAPER_PLAYWRIGHT_PROXY_POOL: comma-separated proxy URLs, one is
            #   chosen randomly per browser start
            proxy_pool_raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
            proxy_pool = [p.strip() for p in proxy_pool_raw.split(",") if p.strip()] if proxy_pool_raw else []
            proxy_url = (
                random.choice(proxy_pool)
                if proxy_pool
                else (
                    os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
                    or os.getenv("SCRAPER_PROXY_URL")
                    or os.getenv("SCRAPER_HTTP_PROXY")
                )
            )
            proxy_config = None
            if proxy_url:
                parsed = urlparse(proxy_url)
                if parsed.scheme and parsed.hostname and parsed.port:
                    proxy_config = {
                        "server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
                    }
                    if parsed.username:
                        proxy_config["username"] = parsed.username
                    if parsed.password:
                        proxy_config["password"] = parsed.password

            headless = os.getenv("PLAYWRIGHT_HEADLESS", "true").lower() in ("1", "true", "yes")

            # Launch with stealth settings
            self.browser = await self.playwright.chromium.launch(
                headless=headless,
                proxy=proxy_config,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--disable-dev-shm-usage",
                    "--no-sandbox",
                    "--disable-setuid-sandbox",
                    "--disable-infobars",
                    "--disable-extensions",
                    "--window-size=1920,1080",
                ],
            )

            # Create context with realistic settings
            self.context = await self.browser.new_context(
                user_agent=random.choice(self.USER_AGENTS),
                viewport={"width": 1920, "height": 1080},
                locale="en-US",
                timezone_id="America/New_York",
                geolocation={"longitude": -73.935242, "latitude": 40.730610},
                permissions=["geolocation"],
            )

            # Load saved cookies if available
            await self._load_cookies()

            self._initialized = True
            logger.info("Playwright browser initialized successfully")
            return True
        except Exception as e:
            logger.exception(f"Failed to initialize Playwright: {e}")
            return False

    async def close(self):
        """Close browser and cleanup."""
        if self.context:
            await self._save_cookies()
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
        self._initialized = False

    async def _load_cookies(self):
        """Load saved cookies from file."""
        cookie_file = self._cookie_dir / "session_cookies.json"
        if cookie_file.exists():
            try:
                with open(cookie_file) as f:
                    cookies = json.load(f)
                await self.context.add_cookies(cookies)
                logger.info(f"Loaded {len(cookies)} saved cookies")
            except Exception as e:
                logger.warning(f"Failed to load cookies: {e}")

    async def _save_cookies(self):
        """Save cookies to file for persistence."""
        try:
            cookies = await self.context.cookies()
            cookie_file = self._cookie_dir / "session_cookies.json"
            with open(cookie_file, "w") as f:
                json.dump(cookies, f)
            logger.info(f"Saved {len(cookies)} cookies")
        except Exception as e:
            logger.warning(f"Failed to save cookies: {e}")

    async def _create_stealth_page(self) -> Page:
        """Create a new page with stealth mode enabled."""
        page = await self.context.new_page()

        # Apply stealth mode
        if Stealth:
            stealth = Stealth(
                navigator_webdriver=True,
                chrome_runtime=True,
                navigator_user_agent=True,
                navigator_vendor=True,
                webgl_vendor=True,
            )
            await stealth.apply_stealth_async(page)

        return page
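
    # Hypothetical hardening sketch: a retry wrapper with exponential backoff
    # around page.goto(). The name `_goto_with_retries` is an assumption and
    # nothing in this class calls it; it shows one way to make the navigations
    # below more resilient to transient failures.
    async def _goto_with_retries(self, page: Page, url: str, attempts: int = 3) -> bool:
        """Navigate to a URL, retrying with exponential backoff on failure."""
        for attempt in range(attempts):
            try:
                await page.goto(url, wait_until="domcontentloaded", timeout=60_000)
                return True
            except Exception as e:
                if attempt == attempts - 1:
                    logger.error(f"Navigation to {url} failed after {attempts} attempts: {e}")
                    return False
                # Back off 2s, 4s, 8s, ... with jitter to avoid a fixed retry cadence
                delay = (2 ** (attempt + 1)) + random.uniform(0, 1)
                logger.warning(f"goto failed (attempt {attempt + 1}/{attempts}); retrying in {delay:.1f}s")
                await asyncio.sleep(delay)
        return False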

    async def _wait_for_cloudflare(self, page: Page, timeout: int = 30):
        """Wait for a Cloudflare challenge to complete."""
        try:
            # Wait for either the challenge to complete or content to load
            await page.wait_for_function(
                """
                () => {
                    // Check if we're past Cloudflare
                    const title = document.title.toLowerCase();
                    return !title.includes('just a moment') &&
                           !title.includes('attention required') &&
                           !title.includes('checking your browser');
                }
                """,
                timeout=timeout * 1000,
            )
            # Additional delay for any remaining JS to execute
            await asyncio.sleep(2)
        except Exception as e:
            logger.warning(f"Cloudflare wait timeout: {e}")

    # ═══════════════════════════════════════════════════════════════════════════════
    # GODADDY AUCTIONS SCRAPER
    # ═══════════════════════════════════════════════════════════════════════════════

    async def scrape_godaddy(self, limit: int = 100) -> Dict[str, Any]:
        """
        Scrape GoDaddy Auctions using Playwright.

        GoDaddy uses Cloudflare plus its own bot detection, so rather than
        parsing HTML directly we intercept the API calls made by its frontend.
        """
        if not await self.initialize():
            return {"items": [], "total": 0, "error": "Playwright not initialized"}

        page = None
        try:
            page = await self._create_stealth_page()

            # Intercept XHR requests to capture auction data
            captured_data = []

            async def handle_response(response):
                if "findApiProxy" in response.url and "auction" in response.url:
                    try:
                        data = await response.json()
                        captured_data.append(data)
                    except Exception:
                        pass

            page.on("response", handle_response)

            # Navigate to GoDaddy Auctions
            logger.info("Navigating to GoDaddy Auctions...")
            await page.goto("https://auctions.godaddy.com/beta", wait_until="domcontentloaded", timeout=60_000)

            # Wait for Cloudflare
            await self._wait_for_cloudflare(page)

            # Wait for auction content to load
            try:
                await page.wait_for_selector('[data-testid="auction-card"], .auction-card, .domain-item', timeout=15000)
            except Exception:
                logger.warning("Auction cards not found, trying to scroll...")
                # Scroll to trigger lazy loading
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)")
                await asyncio.sleep(2)

            # Try to extract from intercepted API calls first
            if captured_data:
                return self._parse_godaddy_api_response(captured_data)

            # Fallback: extract from the DOM
            return await self._extract_godaddy_from_dom(page)
        except Exception as e:
            logger.exception(f"GoDaddy scraping error: {e}")
            return {"items": [], "total": 0, "error": str(e)}
        finally:
            if page:
                await page.close()
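
    # The parser below assumes the intercepted payloads roughly follow this
    # shape. The field names come from the accessors in the code; the overall
    # structure is an assumption, not a documented GoDaddy schema:
    #
    #   {
    #       "results": [
    #           {
    #               "fqdn": "example.com",
    #               "endingAt": "2024-01-01T12:00:00Z",
    #               "price": 150,
    #               "minBid": 10,
    #               "bids": 4,
    #               "buyNowPrice": 500,
    #               "traffic": 120,
    #               "valuationPrice": 900
    #           }
    #       ]
    #   }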
in domain else "" # Parse end time end_time = None end_at = item.get("endingAt") or item.get("auctionEndTime") if end_at: try: end_time = datetime.fromisoformat(end_at.replace("Z", "+00:00")).replace(tzinfo=None) except: pass price = item.get("price") or item.get("currentBidPrice") or item.get("minBid") or 0 items.append({ "domain": domain, "tld": tld, "platform": "GoDaddy", "current_bid": float(price) if price else 0, "min_bid": float(item.get("minBid", 0) or 0), "num_bids": int(item.get("bids", 0) or item.get("bidCount", 0) or 0), "end_time": end_time or datetime.utcnow() + timedelta(days=1), "buy_now_price": float(item.get("buyNowPrice")) if item.get("buyNowPrice") else None, "auction_url": f"https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}&isc=cjcpounce", "currency": "USD", "is_active": True, "traffic": int(item.get("traffic", 0) or 0), "domain_authority": int(item.get("valuationPrice", 0) or 0), }) return { "items": items, "total": len(items), "source": "api_intercept", } async def _extract_godaddy_from_dom(self, page: Page) -> Dict[str, Any]: """Extract auction data from GoDaddy DOM when API intercept fails.""" items = [] try: # Try different selectors selectors = [ '[data-testid="auction-card"]', '.auction-card', '.domain-listing', 'tr[data-domain]', '.domain-row', ] for selector in selectors: elements = await page.query_selector_all(selector) if elements: logger.info(f"Found {len(elements)} elements with selector: {selector}") for el in elements[:100]: # Max 100 items try: # Try to extract domain name domain_el = await el.query_selector('.domain-name, .fqdn, [data-domain], a[href*="domain"]') if domain_el: domain = await domain_el.text_content() domain = domain.strip() if domain else "" else: domain = await el.get_attribute("data-domain") or "" if not domain or "." not in domain: continue tld = domain.rsplit(".", 1)[-1] # Try to extract price price = 0 price_el = await el.query_selector('.price, .bid, .current-bid, [data-price]') if price_el: price_text = await price_el.text_content() price = float("".join(c for c in price_text if c.isdigit() or c == ".") or "0") items.append({ "domain": domain, "tld": tld, "platform": "GoDaddy", "current_bid": price, "min_bid": 0, "num_bids": 0, "end_time": datetime.utcnow() + timedelta(days=1), "buy_now_price": None, "auction_url": f"https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}&isc=cjcpounce", "currency": "USD", "is_active": True, }) except Exception as e: logger.debug(f"Error extracting element: {e}") break # Found elements, stop trying other selectors except Exception as e: logger.exception(f"DOM extraction error: {e}") return { "items": items, "total": len(items), "source": "dom_extraction", } # ═══════════════════════════════════════════════════════════════════════════════ # NAMEJET SCRAPER # ═══════════════════════════════════════════════════════════════════════════════ async def scrape_namejet(self, limit: int = 100) -> Dict[str, Any]: """ Scrape NameJet auctions using Playwright. NameJet uses heavy Cloudflare protection. 
""" if not await self.initialize(): return {"items": [], "total": 0, "error": "Playwright not initialized"} page = None try: page = await self._create_stealth_page() # Navigate to NameJet auctions page logger.info("Navigating to NameJet...") await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="domcontentloaded", timeout=60_000) # Wait for Cloudflare await self._wait_for_cloudflare(page) # Wait for auction table try: await page.wait_for_selector('#MainContent_gvAuctions, .auction-table, table', timeout=15000) except: logger.warning("NameJet table not found") # Extract data from table items = [] rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row') namejet_tz = os.getenv("NAMEJET_TIMEZONE", "America/Los_Angeles") def parse_end_time(text: str) -> Optional[datetime]: raw = (text or "").strip() if not raw: return None # Sometimes they include timezone abbreviation raw = raw.replace("PST", "").replace("PDT", "").replace("EST", "").replace("EDT", "").replace("UTC", "").strip() # Relative format like "1d 2h 3m" (rare) m = re.findall(r"(\\d+)\\s*([dhms])", raw.lower()) if m: secs = 0 for n, u in m: n_i = int(n) if u == "d": secs += n_i * 86400 elif u == "h": secs += n_i * 3600 elif u == "m": secs += n_i * 60 elif u == "s": secs += n_i if secs > 0: return (datetime.now(ZoneInfo("UTC")) + timedelta(seconds=secs)).replace(tzinfo=None) # Absolute formats (common) fmts = [ "%m/%d/%Y %I:%M %p", "%m/%d/%Y %H:%M", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", ] for fmt in fmts: try: local = datetime.strptime(raw, fmt).replace(tzinfo=ZoneInfo(namejet_tz)) return local.astimezone(ZoneInfo("UTC")).replace(tzinfo=None) except Exception: continue return None for row in rows[:limit]: try: cells = await row.query_selector_all('td') if len(cells) < 4: continue # NameJet format: Domain, End Time, Price, Bids, ... domain = await cells[0].text_content() domain = domain.strip() if domain else "" if not domain or "." not in domain: continue tld = domain.rsplit(".", 1)[-1] # Parse end time from column 1 end_text = await cells[1].text_content() end_time = parse_end_time(end_text or "") if end_time is None: continue if end_time <= datetime.utcnow(): continue # Parse price price = 0 if len(cells) > 2: price_text = await cells[2].text_content() price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0") if price <= 0: continue # Parse bids bids = 0 if len(cells) > 3: bids_text = await cells[3].text_content() bids = int("".join(c for c in (bids_text or "0") if c.isdigit()) or "0") items.append({ "domain": domain, "tld": tld, "platform": "NameJet", "current_bid": price, "num_bids": bids, "end_time": end_time, "auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}", "currency": "USD", }) except Exception as e: logger.debug(f"Error parsing row: {e}") return { "items": items, "total": len(items), "source": "playwright", } except Exception as e: logger.exception(f"NameJet scraping error: {e}") return {"items": [], "total": 0, "error": str(e)} finally: if page: await page.close() # ═══════════════════════════════════════════════════════════════════════════════ # UNIFIED SCRAPE METHOD # ═══════════════════════════════════════════════════════════════════════════════ async def scrape_all_protected(self) -> Dict[str, Any]: """ Scrape all Cloudflare-protected platforms. 

    # ═══════════════════════════════════════════════════════════════════════════════
    # UNIFIED SCRAPE METHOD
    # ═══════════════════════════════════════════════════════════════════════════════

    async def scrape_all_protected(self) -> Dict[str, Any]:
        """
        Scrape all Cloudflare-protected platforms.

        Returns combined results from:
        - GoDaddy Auctions
        - NameJet
        """
        results = {
            "total_found": 0,
            "platforms": {},
            "items": [],
            "errors": [],
        }

        if not PLAYWRIGHT_AVAILABLE:
            results["errors"].append("Playwright not installed")
            return results

        try:
            await self.initialize()

            # Scrape GoDaddy Auctions (Cloudflare protected)
            logger.info("Scraping GoDaddy Auctions with Playwright...")
            godaddy_result = await self.scrape_godaddy()
            results["platforms"]["GoDaddy"] = {
                "found": len(godaddy_result.get("items", [])),
                "source": godaddy_result.get("source", "unknown"),
            }
            results["items"].extend(godaddy_result.get("items", []))
            results["total_found"] += len(godaddy_result.get("items", []))
            if godaddy_result.get("error"):
                results["errors"].append(f"GoDaddy: {godaddy_result['error']}")

            # Scrape NameJet (Cloudflare protected)
            logger.info("Scraping NameJet with Playwright...")
            namejet_result = await self.scrape_namejet()
            results["platforms"]["NameJet"] = {
                "found": len(namejet_result.get("items", [])),
                "source": namejet_result.get("source", "unknown"),
            }
            results["items"].extend(namejet_result.get("items", []))
            results["total_found"] += len(namejet_result.get("items", []))
            if namejet_result.get("error"):
                results["errors"].append(f"NameJet: {namejet_result['error']}")
        except Exception as e:
            logger.exception(f"Playwright scraping error: {e}")
            results["errors"].append(str(e))
        finally:
            await self.close()

        return results


# Singleton instance
playwright_scraper = PlaywrightScraperService()
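

# Minimal runnable sketch of the usage pattern described in the module
# docstring; the __main__ guard and demo coroutine are illustrative additions,
# not part of the service's API:
if __name__ == "__main__":
    async def _demo():
        results = await playwright_scraper.scrape_all_protected()
        print(f"Found {results['total_found']} auctions; errors: {results['errors']}")

    asyncio.run(_demo())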