""" Playwright-based Stealth Scraper for Cloudflare-protected Domain Auction Sites. This module uses Playwright with stealth plugins to bypass Cloudflare and other anti-bot protections. It's designed for enterprise-grade web scraping. Features: - Stealth mode (undetectable browser fingerprint) - Automatic Cloudflare bypass - Connection pooling - Retry logic with exponential backoff - JSON extraction from rendered pages - Cookie persistence across sessions Supported Platforms: - GoDaddy Auctions (Cloudflare protected) - NameJet (Cloudflare protected) - Any other protected auction site Usage: scraper = PlaywrightScraperService() await scraper.initialize() auctions = await scraper.scrape_godaddy() await scraper.close() """ import asyncio import json import logging import random from datetime import datetime, timedelta from typing import Any, Dict, List, Optional from pathlib import Path logger = logging.getLogger(__name__) # Try to import playwright (optional dependency) try: from playwright.async_api import async_playwright, Browser, BrowserContext, Page from playwright_stealth import Stealth PLAYWRIGHT_AVAILABLE = True except ImportError: PLAYWRIGHT_AVAILABLE = False Stealth = None logger.warning("Playwright not installed. Stealth scraping disabled.") class PlaywrightScraperService: """ Enterprise-grade Playwright scraper with Cloudflare bypass. Uses stealth techniques to appear as a real browser: - Real Chrome user agent - WebGL fingerprint spoofing - Navigator property spoofing - Timezone and locale matching """ # User agents that work well with Cloudflare USER_AGENTS = [ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15", ] def __init__(self): self.playwright = None self.browser: Optional[Browser] = None self.context: Optional[BrowserContext] = None self._initialized = False self._cookie_dir = Path(__file__).parent.parent.parent / "data" / "cookies" self._cookie_dir.mkdir(parents=True, exist_ok=True) async def initialize(self) -> bool: """Initialize the browser instance.""" if not PLAYWRIGHT_AVAILABLE: logger.error("Playwright not available. Install with: pip install playwright playwright-stealth") return False if self._initialized: return True try: self.playwright = await async_playwright().start() # Launch with stealth settings self.browser = await self.playwright.chromium.launch( headless=True, args=[ "--disable-blink-features=AutomationControlled", "--disable-dev-shm-usage", "--no-sandbox", "--disable-setuid-sandbox", "--disable-infobars", "--disable-extensions", "--window-size=1920,1080", ] ) # Create context with realistic settings self.context = await self.browser.new_context( user_agent=random.choice(self.USER_AGENTS), viewport={"width": 1920, "height": 1080}, locale="en-US", timezone_id="America/New_York", geolocation={"longitude": -73.935242, "latitude": 40.730610}, permissions=["geolocation"], ) # Load saved cookies if available await self._load_cookies() self._initialized = True logger.info("Playwright browser initialized successfully") return True except Exception as e: logger.exception(f"Failed to initialize Playwright: {e}") return False async def close(self): """Close browser and cleanup.""" if self.context: await self._save_cookies() await self.context.close() if self.browser: await self.browser.close() if self.playwright: await self.playwright.stop() self._initialized = False async def _load_cookies(self): """Load saved cookies from file.""" cookie_file = self._cookie_dir / "session_cookies.json" if cookie_file.exists(): try: with open(cookie_file) as f: cookies = json.load(f) await self.context.add_cookies(cookies) logger.info(f"Loaded {len(cookies)} saved cookies") except Exception as e: logger.warning(f"Failed to load cookies: {e}") async def _save_cookies(self): """Save cookies to file for persistence.""" try: cookies = await self.context.cookies() cookie_file = self._cookie_dir / "session_cookies.json" with open(cookie_file, "w") as f: json.dump(cookies, f) logger.info(f"Saved {len(cookies)} cookies") except Exception as e: logger.warning(f"Failed to save cookies: {e}") async def _create_stealth_page(self) -> Page: """Create a new page with stealth mode enabled.""" page = await self.context.new_page() # Apply stealth mode if Stealth: stealth = Stealth( navigator_webdriver=True, chrome_runtime=True, navigator_user_agent=True, navigator_vendor=True, webgl_vendor=True, ) await stealth.apply_stealth_async(page) return page async def _wait_for_cloudflare(self, page: Page, timeout: int = 30): """Wait for Cloudflare challenge to complete.""" try: # Wait for either the challenge to complete or content to load await page.wait_for_function( """ () => { // Check if we're past Cloudflare const title = document.title.toLowerCase(); return !title.includes('just a moment') && !title.includes('attention required') && !title.includes('checking your browser'); } """, timeout=timeout * 1000 ) # Additional delay for any remaining JS to execute await asyncio.sleep(2) except Exception as e: logger.warning(f"Cloudflare wait timeout: {e}") # ═══════════════════════════════════════════════════════════════════════════════ # GODADDY AUCTIONS SCRAPER # ═══════════════════════════════════════════════════════════════════════════════ async def scrape_godaddy(self, limit: int = 100) -> Dict[str, Any]: """ Scrape GoDaddy Auctions using Playwright. GoDaddy uses Cloudflare + their own bot detection. We intercept the API calls made by their frontend. """ if not await self.initialize(): return {"items": [], "total": 0, "error": "Playwright not initialized"} page = None try: page = await self._create_stealth_page() # Intercept XHR requests to capture auction data captured_data = [] async def handle_response(response): if "findApiProxy" in response.url and "auction" in response.url: try: data = await response.json() captured_data.append(data) except: pass page.on("response", handle_response) # Navigate to GoDaddy Auctions logger.info("Navigating to GoDaddy Auctions...") await page.goto("https://auctions.godaddy.com/beta", wait_until="networkidle") # Wait for Cloudflare await self._wait_for_cloudflare(page) # Wait for auction content to load try: await page.wait_for_selector('[data-testid="auction-card"], .auction-card, .domain-item', timeout=15000) except: logger.warning("Auction cards not found, trying to scroll...") # Scroll to trigger lazy loading await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)") await asyncio.sleep(2) # Try to extract from intercepted API calls first if captured_data: return self._parse_godaddy_api_response(captured_data) # Fallback: Extract from DOM return await self._extract_godaddy_from_dom(page) except Exception as e: logger.exception(f"GoDaddy scraping error: {e}") return {"items": [], "total": 0, "error": str(e)} finally: if page: await page.close() def _parse_godaddy_api_response(self, captured_data: List[Dict]) -> Dict[str, Any]: """Parse captured API response from GoDaddy.""" items = [] for data in captured_data: results = data.get("results", []) for item in results: domain = item.get("fqdn", "") or item.get("domain", "") if not domain: continue tld = domain.rsplit(".", 1)[-1] if "." in domain else "" # Parse end time end_time = None end_at = item.get("endingAt") or item.get("auctionEndTime") if end_at: try: end_time = datetime.fromisoformat(end_at.replace("Z", "+00:00")).replace(tzinfo=None) except: pass price = item.get("price") or item.get("currentBidPrice") or item.get("minBid") or 0 items.append({ "domain": domain, "tld": tld, "platform": "GoDaddy", "current_bid": float(price) if price else 0, "min_bid": float(item.get("minBid", 0) or 0), "num_bids": int(item.get("bids", 0) or item.get("bidCount", 0) or 0), "end_time": end_time or datetime.utcnow() + timedelta(days=1), "buy_now_price": float(item.get("buyNowPrice")) if item.get("buyNowPrice") else None, "auction_url": f"https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}&isc=cjcpounce", "currency": "USD", "is_active": True, "traffic": int(item.get("traffic", 0) or 0), "domain_authority": int(item.get("valuationPrice", 0) or 0), }) return { "items": items, "total": len(items), "source": "api_intercept", } async def _extract_godaddy_from_dom(self, page: Page) -> Dict[str, Any]: """Extract auction data from GoDaddy DOM when API intercept fails.""" items = [] try: # Try different selectors selectors = [ '[data-testid="auction-card"]', '.auction-card', '.domain-listing', 'tr[data-domain]', '.domain-row', ] for selector in selectors: elements = await page.query_selector_all(selector) if elements: logger.info(f"Found {len(elements)} elements with selector: {selector}") for el in elements[:100]: # Max 100 items try: # Try to extract domain name domain_el = await el.query_selector('.domain-name, .fqdn, [data-domain], a[href*="domain"]') if domain_el: domain = await domain_el.text_content() domain = domain.strip() if domain else "" else: domain = await el.get_attribute("data-domain") or "" if not domain or "." not in domain: continue tld = domain.rsplit(".", 1)[-1] # Try to extract price price = 0 price_el = await el.query_selector('.price, .bid, .current-bid, [data-price]') if price_el: price_text = await price_el.text_content() price = float("".join(c for c in price_text if c.isdigit() or c == ".") or "0") items.append({ "domain": domain, "tld": tld, "platform": "GoDaddy", "current_bid": price, "min_bid": 0, "num_bids": 0, "end_time": datetime.utcnow() + timedelta(days=1), "buy_now_price": None, "auction_url": f"https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}&isc=cjcpounce", "currency": "USD", "is_active": True, }) except Exception as e: logger.debug(f"Error extracting element: {e}") break # Found elements, stop trying other selectors except Exception as e: logger.exception(f"DOM extraction error: {e}") return { "items": items, "total": len(items), "source": "dom_extraction", } # ═══════════════════════════════════════════════════════════════════════════════ # NAMEJET SCRAPER # ═══════════════════════════════════════════════════════════════════════════════ async def scrape_namejet(self, limit: int = 100) -> Dict[str, Any]: """ Scrape NameJet auctions using Playwright. NameJet uses heavy Cloudflare protection. """ if not await self.initialize(): return {"items": [], "total": 0, "error": "Playwright not initialized"} page = None try: page = await self._create_stealth_page() # Navigate to NameJet auctions page logger.info("Navigating to NameJet...") await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="networkidle") # Wait for Cloudflare await self._wait_for_cloudflare(page) # Wait for auction table try: await page.wait_for_selector('#MainContent_gvAuctions, .auction-table, table', timeout=15000) except: logger.warning("NameJet table not found") # Extract data from table items = [] rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row') for row in rows[:limit]: try: cells = await row.query_selector_all('td') if len(cells) < 3: continue # NameJet format: Domain, End Time, Price, Bids, ... domain = await cells[0].text_content() domain = domain.strip() if domain else "" if not domain or "." not in domain: continue tld = domain.rsplit(".", 1)[-1] # Parse price price = 0 if len(cells) > 2: price_text = await cells[2].text_content() price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0") # Parse bids bids = 0 if len(cells) > 3: bids_text = await cells[3].text_content() bids = int("".join(c for c in (bids_text or "0") if c.isdigit()) or "0") items.append({ "domain": domain, "tld": tld, "platform": "NameJet", "current_bid": price, "min_bid": 0, "num_bids": bids, "end_time": datetime.utcnow() + timedelta(days=1), "buy_now_price": None, "auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}", "currency": "USD", "is_active": True, }) except Exception as e: logger.debug(f"Error parsing row: {e}") return { "items": items, "total": len(items), "source": "playwright", } except Exception as e: logger.exception(f"NameJet scraping error: {e}") return {"items": [], "total": 0, "error": str(e)} finally: if page: await page.close() # ═══════════════════════════════════════════════════════════════════════════════ # UNIFIED SCRAPE METHOD # ═══════════════════════════════════════════════════════════════════════════════ async def scrape_all_protected(self) -> Dict[str, Any]: """ Scrape all Cloudflare-protected platforms. Returns combined results from: - GoDaddy Auctions - NameJet """ results = { "total_found": 0, "platforms": {}, "items": [], "errors": [], } if not PLAYWRIGHT_AVAILABLE: results["errors"].append("Playwright not installed") return results try: await self.initialize() # Scrape GoDaddy logger.info("Scraping GoDaddy with Playwright...") godaddy_result = await self.scrape_godaddy() results["platforms"]["GoDaddy"] = { "found": len(godaddy_result.get("items", [])), "source": godaddy_result.get("source", "unknown"), } results["items"].extend(godaddy_result.get("items", [])) results["total_found"] += len(godaddy_result.get("items", [])) if godaddy_result.get("error"): results["errors"].append(f"GoDaddy: {godaddy_result['error']}") # Small delay between platforms await asyncio.sleep(3) # Scrape NameJet logger.info("Scraping NameJet with Playwright...") namejet_result = await self.scrape_namejet() results["platforms"]["NameJet"] = { "found": len(namejet_result.get("items", [])), "source": namejet_result.get("source", "unknown"), } results["items"].extend(namejet_result.get("items", [])) results["total_found"] += len(namejet_result.get("items", [])) if namejet_result.get("error"): results["errors"].append(f"NameJet: {namejet_result['error']}") except Exception as e: logger.exception(f"Playwright scraping error: {e}") results["errors"].append(str(e)) finally: await self.close() return results # Singleton instance playwright_scraper = PlaywrightScraperService()