yves.gugger f0cc69ac95 feat: TLD price scraper, .ch domain fix, DB integration
Major changes:
- Add TLD price scraper with Porkbun API (886+ TLDs, no API key needed)
- Fix .ch domain checker using rdap.nic.ch custom RDAP
- Integrate database for TLD price history tracking
- Add admin endpoints for manual scrape and stats
- Extend scheduler with daily TLD price scrape job (03:00 UTC)
- Update API to use DB data with static fallback
- Update README with complete documentation

New files:
- backend/app/services/tld_scraper/ (scraper package)
- TLD_TRACKING_PLAN.md (implementation plan)

API changes:
- POST /admin/scrape-tld-prices - trigger manual scrape
- GET /admin/tld-prices/stats - database statistics
- GET /tld-prices/overview now uses DB data
2025-12-08 09:12:44 +01:00
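
The admin endpoints can be exercised directly; a quick httpx sketch (the localhost URL and the absence of auth are assumptions - adjust for the actual deployment):

import httpx

httpx.post("http://localhost:8000/admin/scrape-tld-prices")        # trigger a manual scrape
stats = httpx.get("http://localhost:8000/admin/tld-prices/stats")  # database statistics
print(stats.json())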


"""Base class for TLD price scrapers."""
import logging
import random
import asyncio
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import httpx
logger = logging.getLogger(__name__)


@dataclass
class TLDPriceData:
    """Data structure for TLD pricing information."""

    tld: str
    registrar: str
    registration_price: float
    renewal_price: Optional[float] = None
    transfer_price: Optional[float] = None
    currency: str = "USD"
    source: str = "scrape"
    confidence: float = 1.0
    scraped_at: datetime = field(default_factory=datetime.utcnow)
    promo_price: Optional[float] = None
    notes: Optional[str] = None

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        return {
            "tld": self.tld,
            "registrar": self.registrar,
            "registration_price": self.registration_price,
            "renewal_price": self.renewal_price,
            "transfer_price": self.transfer_price,
            "currency": self.currency,
            "source": self.source,
            "confidence": self.confidence,
            "scraped_at": self.scraped_at.isoformat(),
            "promo_price": self.promo_price,
            "notes": self.notes,
        }
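
# Usage sketch (illustrative values, not real registrar prices):
#     price = TLDPriceData(tld="com", registrar="porkbun", registration_price=9.73)
#     price.to_dict()["scraped_at"]  # -> ISO 8601 string, e.g. "2025-12-08T08:12:44"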


class ScraperError(Exception):
    """Base exception for scraper errors."""
    pass


class HTMLStructureChanged(ScraperError):
    """Website structure has changed - scraper needs an update."""
    pass


class RateLimitDetected(ScraperError):
    """Too many requests - wait and retry."""
    # Suggested back-off, in seconds, before the caller retries.
    retry_after: int = 300


class BaseTLDScraper(ABC):
    """
    Base class for TLD price scrapers.

    Implements common functionality like HTTP requests, rate limiting,
    user-agent rotation, and error handling.
    """

    name: str = "base"
    base_url: str = ""

    # User agents for rotation
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ]

    def __init__(self, timeout: float = 30.0, delay_range: tuple[float, float] = (1.0, 3.0)):
        """
        Initialize the scraper.

        Args:
            timeout: HTTP request timeout in seconds
            delay_range: Min and max delay between requests (seconds)
        """
        self.timeout = timeout
        self.delay_range = delay_range
        self._request_count = 0

    def get_user_agent(self) -> str:
        """Get a random user agent."""
        return random.choice(self.USER_AGENTS)

    def get_headers(self) -> dict:
        """Get HTTP headers for requests."""
        return {
            "User-Agent": self.get_user_agent(),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    async def delay(self):
        """Add random delay between requests."""
        delay = random.uniform(*self.delay_range)
        await asyncio.sleep(delay)

    async def fetch_page(self, url: str) -> str:
        """
        Fetch a webpage with proper headers and error handling.

        Args:
            url: URL to fetch

        Returns:
            HTML content as string

        Raises:
            RateLimitDetected: on HTTP 429
            ScraperError: on any other non-200 status, timeout, or request error
        """
        self._request_count += 1
        # Throttle every request after the first
        if self._request_count > 1:
            await self.delay()
        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.get(
                    url,
                    headers=self.get_headers(),
                    follow_redirects=True,
                )
                if response.status_code == 429:
                    raise RateLimitDetected(f"Rate limited by {url}")
                if response.status_code != 200:
                    raise ScraperError(f"HTTP {response.status_code} for {url}")
                return response.text
        except httpx.TimeoutException as e:
            raise ScraperError(f"Timeout fetching {url}") from e
        except httpx.RequestError as e:
            raise ScraperError(f"Request error for {url}: {e}") from e

    @staticmethod
    def parse_price(text: str) -> Optional[float]:
        """
        Parse a price from text.

        Handles formats like:
        - $9.99
        - €8.50
        - £7.99
        - 9.99 USD

        Commas are stripped as thousands separators, so "$9,999.99"
        parses as 9999.99 - but values outside the sanity range below
        still return None.

        Args:
            text: Text containing a price

        Returns:
            Parsed price as float, or None if not parseable
        """
        if not text:
            return None
        # Clean the text
        text = text.strip()
        # Match an optional currency symbol followed by a number,
        # e.g. $9.99, £7.99, 9.99 (commas removed beforehand)
        match = re.search(r'[\$€£]?\s*([\d,]+\.?\d*)', text.replace(',', ''))
        if match:
            try:
                price = float(match.group(1))
                # Sanity check - plausible TLD prices fall between $0.50 and $500
                if 0.50 <= price <= 500:
                    return round(price, 2)
            except ValueError:
                pass
        return None
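
    # parse_price behavior at a glance (derived from the rules above):
    #     parse_price("$9.99")     -> 9.99
    #     parse_price("9.99 USD")  -> 9.99
    #     parse_price("$1,299.00") -> None (1299.0 exceeds the $500 cap)
    #     parse_price("free")      -> None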

    @abstractmethod
    async def scrape(self) -> list[TLDPriceData]:
        """
        Scrape TLD prices from the source.

        Returns:
            List of TLDPriceData objects
        """
        pass

    async def health_check(self) -> bool:
        """
        Check if the source is accessible.

        Returns:
            True if source is accessible, False otherwise
        """
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(
                    self.base_url,
                    headers=self.get_headers(),
                    follow_redirects=True,
                )
                return response.status_code == 200
        except Exception:
            return False
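
For reference, a minimal subclass sketch showing how the abstract contract is meant to be filled in - the URL, TLD, and price below are placeholders, not a real registrar integration:

class ExampleScraper(BaseTLDScraper):
    """Hypothetical scraper illustrating the scrape() contract."""

    name = "example"
    base_url = "https://registrar.example/pricing"  # placeholder, not a real source

    async def scrape(self) -> list[TLDPriceData]:
        # A real subclass would parse the fetched HTML (e.g. with BeautifulSoup);
        # this sketch emits a single hard-coded row instead.
        html = await self.fetch_page(self.base_url)  # noqa: F841 (unused in sketch)
        results: list[TLDPriceData] = []
        price = self.parse_price("$9.99")
        if price is not None:
            results.append(
                TLDPriceData(tld="com", registrar=self.name, registration_price=price)
            )
        return results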