pounce/backend/app/services/tld_scraper/base.py

"""Base class for TLD price scrapers."""
import logging
import random
import asyncio
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional

import httpx

logger = logging.getLogger(__name__)


@dataclass
class TLDPriceData:
    """Data structure for TLD pricing information."""
    tld: str
    registrar: str
    registration_price: float
    renewal_price: Optional[float] = None
    transfer_price: Optional[float] = None
    currency: str = "USD"
    source: str = "scrape"
    confidence: float = 1.0
    scraped_at: datetime = field(default_factory=datetime.utcnow)
    promo_price: Optional[float] = None
    notes: Optional[str] = None

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        return {
            "tld": self.tld,
            "registrar": self.registrar,
            "registration_price": self.registration_price,
            "renewal_price": self.renewal_price,
            "transfer_price": self.transfer_price,
            "currency": self.currency,
            "source": self.source,
            "confidence": self.confidence,
            "scraped_at": self.scraped_at.isoformat(),
            "promo_price": self.promo_price,
            "notes": self.notes,
        }


class ScraperError(Exception):
    """Base exception for scraper errors."""
    pass


class HTMLStructureChanged(ScraperError):
    """Website structure has changed - scraper needs update."""
    pass


class RateLimitDetected(ScraperError):
    """Too many requests - wait and retry."""
    retry_after: int = 300


class BaseTLDScraper(ABC):
    """
    Base class for TLD price scrapers.

    Implements common functionality like HTTP requests, rate limiting,
    user-agent rotation, and error handling.
    """

    name: str = "base"
    base_url: str = ""

    # User agents for rotation
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ]

    def __init__(self, timeout: float = 30.0, delay_range: tuple[float, float] = (1.0, 3.0)):
        """
        Initialize the scraper.

        Args:
            timeout: HTTP request timeout in seconds
            delay_range: Min and max delay between requests (seconds)
        """
        self.timeout = timeout
        self.delay_range = delay_range
        self._request_count = 0

    def get_user_agent(self) -> str:
        """Get a random user agent."""
        return random.choice(self.USER_AGENTS)

    def get_headers(self) -> dict:
        """Get HTTP headers for requests."""
        return {
            "User-Agent": self.get_user_agent(),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    async def delay(self):
        """Add random delay between requests."""
        delay = random.uniform(*self.delay_range)
        await asyncio.sleep(delay)

    async def fetch_page(self, url: str) -> str:
        """
        Fetch a webpage with proper headers and error handling.

        Args:
            url: URL to fetch

        Returns:
            HTML content as string
        """
        self._request_count += 1

        # Add delay after first request
        if self._request_count > 1:
            await self.delay()

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.get(
                    url,
                    headers=self.get_headers(),
                    follow_redirects=True,
                )

                if response.status_code == 429:
                    raise RateLimitDetected(f"Rate limited by {url}")

                if response.status_code != 200:
                    raise ScraperError(f"HTTP {response.status_code} for {url}")

                return response.text

        except httpx.TimeoutException:
            raise ScraperError(f"Timeout fetching {url}")
        except httpx.RequestError as e:
            raise ScraperError(f"Request error for {url}: {e}")

    @staticmethod
    def parse_price(text: str) -> Optional[float]:
        """
        Parse a price from text.

        Handles formats like:
        - $9.99
        - €8.50
        - £7.99
        - 9.99 USD
        - $9,999.99

        Args:
            text: Text containing a price

        Returns:
            Parsed price as float, or None if not parseable
        """
        import re

        if not text:
            return None

        # Clean the text
        text = text.strip()

        # Remove currency symbols and extract number
        # Match patterns like $9.99, €8,50, £7.99, 9.99
        match = re.search(r'[\$€£]?\s*([\d,]+\.?\d*)', text.replace(',', ''))

        if match:
            try:
                price = float(match.group(1))
                # Sanity check - prices should be between $0.50 and $500
                if 0.50 <= price <= 500:
                    return round(price, 2)
            except ValueError:
                pass

        return None

    @abstractmethod
    async def scrape(self) -> list[TLDPriceData]:
        """
        Scrape TLD prices from the source.

        Returns:
            List of TLDPriceData objects
        """
        pass

    async def health_check(self) -> bool:
        """
        Check if the source is accessible.

        Returns:
            True if source is accessible, False otherwise
        """
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(
                    self.base_url,
                    headers=self.get_headers(),
                    follow_redirects=True,
                )
                return response.status_code == 200
        except Exception:
            return False