"""Base class for TLD price scrapers.""" import logging import random import asyncio from abc import ABC, abstractmethod from dataclasses import dataclass, field from datetime import datetime from typing import Optional import httpx logger = logging.getLogger(__name__) @dataclass class TLDPriceData: """Data structure for TLD pricing information.""" tld: str registrar: str registration_price: float renewal_price: Optional[float] = None transfer_price: Optional[float] = None currency: str = "USD" source: str = "scrape" confidence: float = 1.0 scraped_at: datetime = field(default_factory=datetime.utcnow) promo_price: Optional[float] = None notes: Optional[str] = None def to_dict(self) -> dict: """Convert to dictionary.""" return { "tld": self.tld, "registrar": self.registrar, "registration_price": self.registration_price, "renewal_price": self.renewal_price, "transfer_price": self.transfer_price, "currency": self.currency, "source": self.source, "confidence": self.confidence, "scraped_at": self.scraped_at.isoformat(), "promo_price": self.promo_price, "notes": self.notes, } class ScraperError(Exception): """Base exception for scraper errors.""" pass class HTMLStructureChanged(ScraperError): """Website structure has changed - scraper needs update.""" pass class RateLimitDetected(ScraperError): """Too many requests - wait and retry.""" retry_after: int = 300 class BaseTLDScraper(ABC): """ Base class for TLD price scrapers. Implements common functionality like HTTP requests, rate limiting, user-agent rotation, and error handling. """ name: str = "base" base_url: str = "" # User agents for rotation USER_AGENTS = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", ] def __init__(self, timeout: float = 30.0, delay_range: tuple[float, float] = (1.0, 3.0)): """ Initialize the scraper. Args: timeout: HTTP request timeout in seconds delay_range: Min and max delay between requests (seconds) """ self.timeout = timeout self.delay_range = delay_range self._request_count = 0 def get_user_agent(self) -> str: """Get a random user agent.""" return random.choice(self.USER_AGENTS) def get_headers(self) -> dict: """Get HTTP headers for requests.""" return { "User-Agent": self.get_user_agent(), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate, br", "DNT": "1", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", } async def delay(self): """Add random delay between requests.""" delay = random.uniform(*self.delay_range) await asyncio.sleep(delay) async def fetch_page(self, url: str) -> str: """ Fetch a webpage with proper headers and error handling. Args: url: URL to fetch Returns: HTML content as string """ self._request_count += 1 # Add delay after first request if self._request_count > 1: await self.delay() try: async with httpx.AsyncClient(timeout=self.timeout) as client: response = await client.get( url, headers=self.get_headers(), follow_redirects=True, ) if response.status_code == 429: raise RateLimitDetected(f"Rate limited by {url}") if response.status_code != 200: raise ScraperError(f"HTTP {response.status_code} for {url}") return response.text except httpx.TimeoutException: raise ScraperError(f"Timeout fetching {url}") except httpx.RequestError as e: raise ScraperError(f"Request error for {url}: {e}") @staticmethod def parse_price(text: str) -> Optional[float]: """ Parse a price from text. Handles formats like: - $9.99 - €8.50 - £7.99 - 9.99 USD - $9,999.99 Args: text: Text containing a price Returns: Parsed price as float, or None if not parseable """ import re if not text: return None # Clean the text text = text.strip() # Remove currency symbols and extract number # Match patterns like $9.99, €8,50, £7.99, 9.99 match = re.search(r'[\$€£]?\s*([\d,]+\.?\d*)', text.replace(',', '')) if match: try: price = float(match.group(1)) # Sanity check - prices should be between $0.50 and $500 if 0.50 <= price <= 500: return round(price, 2) except ValueError: pass return None @abstractmethod async def scrape(self) -> list[TLDPriceData]: """ Scrape TLD prices from the source. Returns: List of TLDPriceData objects """ pass async def health_check(self) -> bool: """ Check if the source is accessible. Returns: True if source is accessible, False otherwise """ try: async with httpx.AsyncClient(timeout=10.0) as client: response = await client.get( self.base_url, headers=self.get_headers(), follow_redirects=True, ) return response.status_code == 200 except Exception: return False