pounce/backend/app/services/playwright_scraper.py
"""
Playwright-based Stealth Scraper for Cloudflare-protected Domain Auction Sites.
This module uses Playwright with stealth plugins to bypass Cloudflare and other
anti-bot protections. It's designed for enterprise-grade web scraping.
Features:
- Stealth mode (undetectable browser fingerprint)
- Automatic Cloudflare bypass
- Connection pooling
- Retry logic with exponential backoff
- JSON extraction from rendered pages
- Cookie persistence across sessions
Supported Platforms:
- GoDaddy Auctions (Cloudflare protected)
- NameJet (Cloudflare protected)
- Any other protected auction site
Usage:
scraper = PlaywrightScraperService()
await scraper.initialize()
auctions = await scraper.scrape_godaddy()
await scraper.close()
"""
import asyncio
import json
import logging
import os
import random
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from pathlib import Path
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
# Try to import playwright (optional dependency)
try:
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
from playwright_stealth import Stealth
PLAYWRIGHT_AVAILABLE = True
except ImportError:
PLAYWRIGHT_AVAILABLE = False
Stealth = None
# Define dummy types for type hints
Browser = Any
BrowserContext = Any
Page = Any
logger.warning("Playwright not installed. Stealth scraping disabled.")
class PlaywrightScraperService:
"""
Enterprise-grade Playwright scraper with Cloudflare bypass.
Uses stealth techniques to appear as a real browser:
- Real Chrome user agent
- WebGL fingerprint spoofing
- Navigator property spoofing
- Timezone and locale matching
"""
# User agents that work well with Cloudflare
USER_AGENTS = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]
def __init__(self):
self.playwright = None
self.browser: Optional[Browser] = None
self.context: Optional[BrowserContext] = None
self._initialized = False
self._cookie_dir = Path(__file__).parent.parent.parent / "data" / "cookies"
self._cookie_dir.mkdir(parents=True, exist_ok=True)
async def initialize(self) -> bool:
"""Initialize the browser instance."""
if not PLAYWRIGHT_AVAILABLE:
logger.error("Playwright not available. Install with: pip install playwright playwright-stealth")
return False
if self._initialized:
return True
try:
self.playwright = await async_playwright().start()
proxy_url = (
os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
or os.getenv("SCRAPER_PROXY_URL")
or os.getenv("SCRAPER_HTTP_PROXY")
)
proxy_config = None
            if proxy_url:
                parsed = urlparse(proxy_url)
                if parsed.scheme and parsed.hostname and parsed.port:
                    proxy_config = {
                        "server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
                    }
                    if parsed.username:
                        proxy_config["username"] = parsed.username
                    if parsed.password:
                        proxy_config["password"] = parsed.password
                else:
                    logger.warning(
                        "Ignoring malformed proxy URL; expected scheme://host:port"
                    )
headless = os.getenv("PLAYWRIGHT_HEADLESS", "true").lower() in ("1", "true", "yes")
# Launch with stealth settings
self.browser = await self.playwright.chromium.launch(
headless=headless,
proxy=proxy_config,
args=[
"--disable-blink-features=AutomationControlled",
"--disable-dev-shm-usage",
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-infobars",
"--disable-extensions",
"--window-size=1920,1080",
]
)
# Create context with realistic settings
self.context = await self.browser.new_context(
user_agent=random.choice(self.USER_AGENTS),
viewport={"width": 1920, "height": 1080},
locale="en-US",
timezone_id="America/New_York",
geolocation={"longitude": -73.935242, "latitude": 40.730610},
permissions=["geolocation"],
)
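            # The locale, timezone, and geolocation above form a consistent
            # New York profile, so fingerprinting checks do not see
            # contradictory signals.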
# Load saved cookies if available
await self._load_cookies()
self._initialized = True
logger.info("Playwright browser initialized successfully")
return True
except Exception as e:
logger.exception(f"Failed to initialize Playwright: {e}")
return False
async def close(self):
"""Close browser and cleanup."""
if self.context:
await self._save_cookies()
await self.context.close()
if self.browser:
await self.browser.close()
if self.playwright:
await self.playwright.stop()
self._initialized = False
async def _load_cookies(self):
"""Load saved cookies from file."""
cookie_file = self._cookie_dir / "session_cookies.json"
if cookie_file.exists():
try:
with open(cookie_file) as f:
cookies = json.load(f)
await self.context.add_cookies(cookies)
logger.info(f"Loaded {len(cookies)} saved cookies")
except Exception as e:
logger.warning(f"Failed to load cookies: {e}")
async def _save_cookies(self):
"""Save cookies to file for persistence."""
try:
cookies = await self.context.cookies()
cookie_file = self._cookie_dir / "session_cookies.json"
with open(cookie_file, "w") as f:
json.dump(cookies, f)
logger.info(f"Saved {len(cookies)} cookies")
except Exception as e:
logger.warning(f"Failed to save cookies: {e}")
async def _create_stealth_page(self) -> Page:
"""Create a new page with stealth mode enabled."""
page = await self.context.new_page()
# Apply stealth mode
if Stealth:
stealth = Stealth(
navigator_webdriver=True,
chrome_runtime=True,
navigator_user_agent=True,
navigator_vendor=True,
webgl_vendor=True,
)
await stealth.apply_stealth_async(page)
return page
async def _wait_for_cloudflare(self, page: Page, timeout: int = 30):
"""Wait for Cloudflare challenge to complete."""
try:
# Wait for either the challenge to complete or content to load
await page.wait_for_function(
"""
() => {
// Check if we're past Cloudflare
const title = document.title.toLowerCase();
return !title.includes('just a moment') &&
!title.includes('attention required') &&
!title.includes('checking your browser');
}
""",
timeout=timeout * 1000
)
# Additional delay for any remaining JS to execute
await asyncio.sleep(2)
except Exception as e:
logger.warning(f"Cloudflare wait timeout: {e}")
# ═══════════════════════════════════════════════════════════════════════════════
# GODADDY AUCTIONS SCRAPER
# ═══════════════════════════════════════════════════════════════════════════════
async def scrape_godaddy(self, limit: int = 100) -> Dict[str, Any]:
"""
Scrape GoDaddy Auctions using Playwright.
GoDaddy uses Cloudflare + their own bot detection.
We intercept the API calls made by their frontend.
"""
if not await self.initialize():
return {"items": [], "total": 0, "error": "Playwright not initialized"}
page = None
try:
page = await self._create_stealth_page()
# Intercept XHR requests to capture auction data
captured_data = []
async def handle_response(response):
if "findApiProxy" in response.url and "auction" in response.url:
try:
data = await response.json()
captured_data.append(data)
                    except Exception:
                        pass
page.on("response", handle_response)
# Navigate to GoDaddy Auctions
logger.info("Navigating to GoDaddy Auctions...")
await page.goto("https://auctions.godaddy.com/beta", wait_until="domcontentloaded", timeout=60_000)
# Wait for Cloudflare
await self._wait_for_cloudflare(page)
# Wait for auction content to load
try:
await page.wait_for_selector('[data-testid="auction-card"], .auction-card, .domain-item', timeout=15000)
            except Exception:
                logger.warning("Auction cards not found, trying to scroll...")
# Scroll to trigger lazy loading
await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)")
await asyncio.sleep(2)
# Try to extract from intercepted API calls first
if captured_data:
                return self._parse_godaddy_api_response(captured_data, limit)
# Fallback: Extract from DOM
            return await self._extract_godaddy_from_dom(page, limit)
except Exception as e:
logger.exception(f"GoDaddy scraping error: {e}")
return {"items": [], "total": 0, "error": str(e)}
finally:
if page:
await page.close()
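    # Shape of the intercepted findApiProxy payload assumed by the parser
    # below (an illustrative sketch: the field names mirror the lookups in
    # _parse_godaddy_api_response, the values are invented):
    #
    #   {"results": [{"fqdn": "example.com",
    #                 "endingAt": "2025-01-01T00:00:00Z",
    #                 "price": 105, "minBid": 10, "bids": 3,
    #                 "buyNowPrice": 499, "traffic": 120,
    #                 "valuationPrice": 850}]}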
    def _parse_godaddy_api_response(self, captured_data: List[Dict], limit: int = 100) -> Dict[str, Any]:
"""Parse captured API response from GoDaddy."""
items = []
for data in captured_data:
results = data.get("results", [])
for item in results:
domain = item.get("fqdn", "") or item.get("domain", "")
if not domain:
continue
tld = domain.rsplit(".", 1)[-1] if "." in domain else ""
# Parse end time
end_time = None
end_at = item.get("endingAt") or item.get("auctionEndTime")
if end_at:
try:
end_time = datetime.fromisoformat(end_at.replace("Z", "+00:00")).replace(tzinfo=None)
                    except Exception:
                        pass
price = item.get("price") or item.get("currentBidPrice") or item.get("minBid") or 0
items.append({
"domain": domain,
"tld": tld,
"platform": "GoDaddy",
"current_bid": float(price) if price else 0,
"min_bid": float(item.get("minBid", 0) or 0),
"num_bids": int(item.get("bids", 0) or item.get("bidCount", 0) or 0),
"end_time": end_time or datetime.utcnow() + timedelta(days=1),
"buy_now_price": float(item.get("buyNowPrice")) if item.get("buyNowPrice") else None,
"auction_url": f"https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}&isc=cjcpounce",
"currency": "USD",
"is_active": True,
"traffic": int(item.get("traffic", 0) or 0),
"domain_authority": int(item.get("valuationPrice", 0) or 0),
})
        items = items[:limit]
        return {
            "items": items,
            "total": len(items),
            "source": "api_intercept",
        }
    async def _extract_godaddy_from_dom(self, page: Page, limit: int = 100) -> Dict[str, Any]:
"""Extract auction data from GoDaddy DOM when API intercept fails."""
items = []
try:
# Try different selectors
selectors = [
'[data-testid="auction-card"]',
'.auction-card',
'.domain-listing',
'tr[data-domain]',
'.domain-row',
]
for selector in selectors:
elements = await page.query_selector_all(selector)
if elements:
logger.info(f"Found {len(elements)} elements with selector: {selector}")
                for el in elements[:limit]:  # honor the requested limit
try:
# Try to extract domain name
domain_el = await el.query_selector('.domain-name, .fqdn, [data-domain], a[href*="domain"]')
if domain_el:
domain = await domain_el.text_content()
domain = domain.strip() if domain else ""
else:
domain = await el.get_attribute("data-domain") or ""
if not domain or "." not in domain:
continue
tld = domain.rsplit(".", 1)[-1]
# Try to extract price
price = 0
price_el = await el.query_selector('.price, .bid, .current-bid, [data-price]')
if price_el:
price_text = await price_el.text_content()
                                price = float("".join(c for c in (price_text or "") if c.isdigit() or c == ".") or "0")
items.append({
"domain": domain,
"tld": tld,
"platform": "GoDaddy",
"current_bid": price,
"min_bid": 0,
"num_bids": 0,
"end_time": datetime.utcnow() + timedelta(days=1),
"buy_now_price": None,
"auction_url": f"https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}&isc=cjcpounce",
"currency": "USD",
"is_active": True,
})
except Exception as e:
logger.debug(f"Error extracting element: {e}")
break # Found elements, stop trying other selectors
except Exception as e:
logger.exception(f"DOM extraction error: {e}")
return {
"items": items,
"total": len(items),
"source": "dom_extraction",
}
# ═══════════════════════════════════════════════════════════════════════════════
# NAMEJET SCRAPER
# ═══════════════════════════════════════════════════════════════════════════════
async def scrape_namejet(self, limit: int = 100) -> Dict[str, Any]:
"""
Scrape NameJet auctions using Playwright.
NameJet uses heavy Cloudflare protection.
"""
if not await self.initialize():
return {"items": [], "total": 0, "error": "Playwright not initialized"}
page = None
try:
page = await self._create_stealth_page()
# Navigate to NameJet auctions page
logger.info("Navigating to NameJet...")
await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="domcontentloaded", timeout=60_000)
# Wait for Cloudflare
await self._wait_for_cloudflare(page)
# Wait for auction table
try:
await page.wait_for_selector('#MainContent_gvAuctions, .auction-table, table', timeout=15000)
            except Exception:
                logger.warning("NameJet table not found")
# Extract data from table
items = []
rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row')
for row in rows[:limit]:
try:
cells = await row.query_selector_all('td')
if len(cells) < 3:
continue
# NameJet format: Domain, End Time, Price, Bids, ...
domain = await cells[0].text_content()
domain = domain.strip() if domain else ""
if not domain or "." not in domain:
continue
tld = domain.rsplit(".", 1)[-1]
# Parse price
price = 0
if len(cells) > 2:
price_text = await cells[2].text_content()
price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0")
# Parse bids
bids = 0
if len(cells) > 3:
bids_text = await cells[3].text_content()
bids = int("".join(c for c in (bids_text or "0") if c.isdigit()) or "0")
items.append({
"domain": domain,
"tld": tld,
"platform": "NameJet",
"current_bid": price,
"min_bid": 0,
"num_bids": bids,
"end_time": datetime.utcnow() + timedelta(days=1),
"buy_now_price": None,
"auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}",
"currency": "USD",
"is_active": True,
})
except Exception as e:
logger.debug(f"Error parsing row: {e}")
return {
"items": items,
"total": len(items),
"source": "playwright",
}
except Exception as e:
logger.exception(f"NameJet scraping error: {e}")
return {"items": [], "total": 0, "error": str(e)}
finally:
if page:
await page.close()
# ═══════════════════════════════════════════════════════════════════════════════
# UNIFIED SCRAPE METHOD
# ═══════════════════════════════════════════════════════════════════════════════
async def scrape_all_protected(self) -> Dict[str, Any]:
"""
Scrape all Cloudflare-protected platforms.
Returns combined results from:
- GoDaddy Auctions
- NameJet
"""
results = {
"total_found": 0,
"platforms": {},
"items": [],
"errors": [],
}
if not PLAYWRIGHT_AVAILABLE:
results["errors"].append("Playwright not installed")
return results
try:
await self.initialize()
# Scrape GoDaddy
logger.info("Scraping GoDaddy with Playwright...")
godaddy_result = await self.scrape_godaddy()
results["platforms"]["GoDaddy"] = {
"found": len(godaddy_result.get("items", [])),
"source": godaddy_result.get("source", "unknown"),
}
results["items"].extend(godaddy_result.get("items", []))
results["total_found"] += len(godaddy_result.get("items", []))
if godaddy_result.get("error"):
results["errors"].append(f"GoDaddy: {godaddy_result['error']}")
# Small delay between platforms
await asyncio.sleep(3)
# Scrape NameJet
logger.info("Scraping NameJet with Playwright...")
namejet_result = await self.scrape_namejet()
results["platforms"]["NameJet"] = {
"found": len(namejet_result.get("items", [])),
"source": namejet_result.get("source", "unknown"),
}
results["items"].extend(namejet_result.get("items", []))
results["total_found"] += len(namejet_result.get("items", []))
if namejet_result.get("error"):
results["errors"].append(f"NameJet: {namejet_result['error']}")
except Exception as e:
logger.exception(f"Playwright scraping error: {e}")
results["errors"].append(str(e))
finally:
await self.close()
return results
# Singleton instance
playwright_scraper = PlaywrightScraperService()
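# Minimal smoke-test entry point: a sketch that runs both protected-site
# scrapers once and prints a summary. It assumes the optional Playwright
# dependencies are installed (see the setup note near the imports).
if __name__ == "__main__":
    async def _demo() -> None:
        results = await playwright_scraper.scrape_all_protected()
        print(
            f"Found {results['total_found']} auctions across "
            f"{len(results['platforms'])} platform(s); errors: {results['errors']}"
        )

    asyncio.run(_demo())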