Major changes:
- Add TLD price scraper using the Porkbun API (886+ TLDs, no API key needed)
- Fix .ch domain checker using the custom RDAP service at rdap.nic.ch
- Integrate database for TLD price history tracking
- Add admin endpoints for manual scrape and stats
- Extend scheduler with a daily TLD price scrape job (03:00 UTC)
- Update API to use DB data with static fallback
- Update README with complete documentation

New files:
- backend/app/services/tld_scraper/ (scraper package)
- TLD_TRACKING_PLAN.md (implementation plan)

API changes:
- POST /admin/scrape-tld-prices - trigger a manual scrape
- GET /admin/tld-prices/stats - database statistics
- GET /tld-prices/overview - now uses DB data
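
Example calls against the new and updated endpoints (a minimal sketch: the
base URL, port, and absence of auth headers are assumptions about a local
dev setup, adjust for your deployment):

    import httpx

    BASE = "http://localhost:8000"  # assumed local dev address

    # Trigger a manual TLD price scrape (may take a while for 886+ TLDs)
    print(httpx.post(f"{BASE}/admin/scrape-tld-prices", timeout=300.0).json())

    # Database statistics for the stored price history
    print(httpx.get(f"{BASE}/admin/tld-prices/stats").json())

    # Public overview, now served from DB data with static fallback
    print(httpx.get(f"{BASE}/tld-prices/overview").json())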
"""Base class for TLD price scrapers."""
|
|
import logging
|
|
import random
|
|
import asyncio
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|


@dataclass
class TLDPriceData:
    """Data structure for TLD pricing information."""

    tld: str
    registrar: str
    registration_price: float
    renewal_price: Optional[float] = None
    transfer_price: Optional[float] = None
    currency: str = "USD"
    source: str = "scrape"
    confidence: float = 1.0
    scraped_at: datetime = field(default_factory=datetime.utcnow)
    promo_price: Optional[float] = None
    notes: Optional[str] = None

    def to_dict(self) -> dict:
        """Convert to a dictionary."""
        return {
            "tld": self.tld,
            "registrar": self.registrar,
            "registration_price": self.registration_price,
            "renewal_price": self.renewal_price,
            "transfer_price": self.transfer_price,
            "currency": self.currency,
            "source": self.source,
            "confidence": self.confidence,
            "scraped_at": self.scraped_at.isoformat(),
            "promo_price": self.promo_price,
            "notes": self.notes,
        }
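
    # Illustrative usage (values made up, not from a real scrape):
    #
    #     price = TLDPriceData(tld="com", registrar="porkbun", registration_price=9.68)
    #     price.to_dict()["registration_price"]  # 9.68
    #     price.to_dict()["renewal_price"]       # None until a renewal price is known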


class ScraperError(Exception):
    """Base exception for scraper errors."""


class HTMLStructureChanged(ScraperError):
    """Website structure has changed - scraper needs update."""


class RateLimitDetected(ScraperError):
    """Too many requests - wait and retry."""

    retry_after: int = 300


class BaseTLDScraper(ABC):
    """
    Base class for TLD price scrapers.

    Implements common functionality like HTTP requests, rate limiting,
    user-agent rotation, and error handling.
    """

    name: str = "base"
    base_url: str = ""

    # User agents for rotation
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ]

    def __init__(self, timeout: float = 30.0, delay_range: tuple[float, float] = (1.0, 3.0)):
        """
        Initialize the scraper.

        Args:
            timeout: HTTP request timeout in seconds
            delay_range: Min and max delay between requests (seconds)
        """
        self.timeout = timeout
        self.delay_range = delay_range
        self._request_count = 0

    def get_user_agent(self) -> str:
        """Get a random user agent."""
        return random.choice(self.USER_AGENTS)

    def get_headers(self) -> dict:
        """Get HTTP headers for requests."""
        return {
            "User-Agent": self.get_user_agent(),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    async def delay(self):
        """Add a random delay between requests."""
        delay = random.uniform(*self.delay_range)
        await asyncio.sleep(delay)

    async def fetch_page(self, url: str) -> str:
        """
        Fetch a webpage with proper headers and error handling.

        Args:
            url: URL to fetch

        Returns:
            HTML content as a string

        Raises:
            RateLimitDetected: if the server responds with HTTP 429
            ScraperError: on any other non-200 status, timeout, or request error
        """
        self._request_count += 1

        # Add delay after the first request
        if self._request_count > 1:
            await self.delay()

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.get(
                    url,
                    headers=self.get_headers(),
                    follow_redirects=True,
                )

                if response.status_code == 429:
                    raise RateLimitDetected(f"Rate limited by {url}")

                if response.status_code != 200:
                    raise ScraperError(f"HTTP {response.status_code} for {url}")

                return response.text

        except httpx.TimeoutException:
            raise ScraperError(f"Timeout fetching {url}")
        except httpx.RequestError as e:
            raise ScraperError(f"Request error for {url}: {e}")
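
    # Caller-side sketch (illustrative; `scraper` and `url` are assumed to be
    # defined by the caller): honor RateLimitDetected.retry_after with one retry.
    #
    #     try:
    #         html = await scraper.fetch_page(url)
    #     except RateLimitDetected as exc:
    #         await asyncio.sleep(exc.retry_after)
    #         html = await scraper.fetch_page(url)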

    @staticmethod
    def parse_price(text: str) -> Optional[float]:
        """
        Parse a price from text.

        Handles formats like:
        - $9.99
        - €8.50
        - £7.99
        - 9.99 USD
        - $9,999.99 (commas are stripped; values above the sanity cap return None)

        Args:
            text: Text containing a price

        Returns:
            Parsed price as a float rounded to 2 decimals, or None if no price
            is found or the value falls outside the $0.50-$500 sanity range
        """
        if not text:
            return None

        # Clean the text
        text = text.strip()

        # Strip thousands separators, then match an optional currency symbol
        # followed by a number, e.g. $9.99, £7.99, 9.99
        match = re.search(r'[\$€£]?\s*([\d,]+\.?\d*)', text.replace(',', ''))

        if match:
            try:
                price = float(match.group(1))
                # Sanity check - TLD prices should fall between $0.50 and $500
                if 0.50 <= price <= 500:
                    return round(price, 2)
            except ValueError:
                pass

        return None
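
    # Expected behavior (checked against the regex and sanity range above):
    #
    #     BaseTLDScraper.parse_price("$9.99")      # 9.99
    #     BaseTLDScraper.parse_price("9.99 USD")   # 9.99
    #     BaseTLDScraper.parse_price("$9,999.99")  # None, outside the 0.50-500 range
    #     BaseTLDScraper.parse_price("free")       # None, no number found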

    @abstractmethod
    async def scrape(self) -> list[TLDPriceData]:
        """
        Scrape TLD prices from the source.

        Returns:
            List of TLDPriceData objects
        """

    async def health_check(self) -> bool:
        """
        Check if the source is accessible.

        Returns:
            True if source is accessible, False otherwise
        """
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(
                    self.base_url,
                    headers=self.get_headers(),
                    follow_redirects=True,
                )
                return response.status_code == 200
        except Exception:
            return False
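

# Minimal concrete subclass, shown as an illustrative sketch rather than the
# project's actual Porkbun scraper: the endpoint URL, the empty POST body, and
# the {"pricing": {tld: {"registration": ..., "renewal": ..., "transfer": ...}}}
# response shape are assumptions to be verified against Porkbun's pricing API.
class PorkbunScraper(BaseTLDScraper):
    name = "porkbun"
    base_url = "https://porkbun.com"

    async def scrape(self) -> list[TLDPriceData]:
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                "https://api.porkbun.com/api/json/v3/pricing/get"  # assumed URL
            )
            resp.raise_for_status()
            pricing = resp.json().get("pricing", {})  # assumed response shape

        results = []
        for tld, prices in pricing.items():
            registration = self.parse_price(str(prices.get("registration", "")))
            if registration is None:
                continue  # skip TLDs with missing or out-of-range prices
            results.append(
                TLDPriceData(
                    tld=tld,
                    registrar=self.name,
                    registration_price=registration,
                    renewal_price=self.parse_price(str(prices.get("renewal", ""))),
                    transfer_price=self.parse_price(str(prices.get("transfer", ""))),
                    source="api",
                )
            )
        return results


if __name__ == "__main__":
    # Ad-hoc smoke test for the sketch above; performs a live HTTP request.
    prices = asyncio.run(PorkbunScraper().scrape())
    print(f"Scraped {len(prices)} TLD prices; first: {prices[:1]}")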