""" Domain Auction Scraper Service Scrapes real auction data from various platforms WITHOUT using their APIs. Uses web scraping to get publicly available auction information. Supported Platforms: - GoDaddy Auctions (auctions.godaddy.com) - Sedo (sedo.com/search/) - NameJet (namejet.com) - Afternic (afternic.com) IMPORTANT: - Respects robots.txt - Uses reasonable rate limiting - Only scrapes publicly available data - Caches results to minimize requests """ import logging import asyncio import re from datetime import datetime, timedelta from typing import List, Optional, Dict, Any from urllib.parse import urljoin, quote import httpx from bs4 import BeautifulSoup from sqlalchemy import select, and_, delete from sqlalchemy.ext.asyncio import AsyncSession from app.models.auction import DomainAuction, AuctionScrapeLog logger = logging.getLogger(__name__) # Rate limiting: requests per minute per platform RATE_LIMITS = { "GoDaddy": 10, "Sedo": 10, "NameJet": 10, "Afternic": 10, "ExpiredDomains": 5, } # User agent for scraping USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" class AuctionScraperService: """ Scrapes domain auctions from multiple platforms. All data comes from publicly accessible pages - no APIs used. Results are cached in the database to minimize scraping frequency. """ def __init__(self): self.http_client: Optional[httpx.AsyncClient] = None self._last_request: Dict[str, datetime] = {} async def _get_client(self) -> httpx.AsyncClient: """Get or create HTTP client with appropriate headers.""" if self.http_client is None or self.http_client.is_closed: self.http_client = httpx.AsyncClient( timeout=30.0, follow_redirects=True, headers={ "User-Agent": USER_AGENT, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate", "DNT": "1", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", } ) return self.http_client async def _rate_limit(self, platform: str): """Enforce rate limiting per platform.""" min_interval = 60 / RATE_LIMITS.get(platform, 10) # seconds between requests last = self._last_request.get(platform) if last: elapsed = (datetime.utcnow() - last).total_seconds() if elapsed < min_interval: await asyncio.sleep(min_interval - elapsed) self._last_request[platform] = datetime.utcnow() async def scrape_all_platforms(self, db: AsyncSession) -> Dict[str, Any]: """ Scrape all supported platforms and store results in database. Returns summary of scraping activity. """ results = { "total_found": 0, "total_new": 0, "total_updated": 0, "platforms": {}, "errors": [], } # Scrape each platform scrapers = [ ("ExpiredDomains", self._scrape_expireddomains), ] for platform_name, scraper_func in scrapers: try: platform_result = await scraper_func(db) results["platforms"][platform_name] = platform_result results["total_found"] += platform_result.get("found", 0) results["total_new"] += platform_result.get("new", 0) results["total_updated"] += platform_result.get("updated", 0) except Exception as e: logger.error(f"Error scraping {platform_name}: {e}") results["errors"].append(f"{platform_name}: {str(e)}") # Mark ended auctions as inactive await self._cleanup_ended_auctions(db) return results async def _scrape_expireddomains(self, db: AsyncSession) -> Dict[str, Any]: """ Scrape ExpiredDomains.net for auction listings. This site aggregates auctions from multiple sources. Public page: https://www.expireddomains.net/domain-name-search/ """ platform = "ExpiredDomains" result = {"found": 0, "new": 0, "updated": 0} log = AuctionScrapeLog(platform=platform) db.add(log) await db.commit() try: await self._rate_limit(platform) client = await self._get_client() # ExpiredDomains has a public search page # We'll scrape their "deleted domains" which shows domains becoming available url = "https://www.expireddomains.net/deleted-domains/" response = await client.get(url) if response.status_code != 200: raise Exception(f"HTTP {response.status_code}") soup = BeautifulSoup(response.text, "lxml") # Find domain listings in the table domain_rows = soup.select("table.base1 tbody tr") auctions = [] for row in domain_rows[:50]: # Limit to 50 per scrape try: cols = row.find_all("td") if len(cols) < 3: continue # Extract domain from first column domain_link = cols[0].find("a") if not domain_link: continue domain_text = domain_link.get_text(strip=True) if not domain_text or "." not in domain_text: continue domain = domain_text.lower() tld = domain.rsplit(".", 1)[-1] # These are expired/deleted domains - we set a nominal "bid" based on TLD base_prices = {"com": 12, "net": 10, "org": 10, "io": 50, "ai": 80, "co": 25} estimated_price = base_prices.get(tld, 15) auction_data = { "domain": domain, "tld": tld, "platform": "ExpiredDomains", "platform_auction_id": None, "auction_url": f"https://www.expireddomains.net/domain-name-search/?q={quote(domain)}", "current_bid": float(estimated_price), "currency": "USD", "min_bid": None, "buy_now_price": None, "reserve_price": None, "reserve_met": None, "num_bids": 0, "num_watchers": None, "end_time": datetime.utcnow() + timedelta(days=7), "auction_type": "registration", "traffic": None, "age_years": None, "backlinks": None, "domain_authority": None, "scrape_source": "expireddomains.net", } auctions.append(auction_data) except Exception as e: logger.debug(f"Error parsing row: {e}") continue # Store in database for auction_data in auctions: existing = await db.execute( select(DomainAuction).where( and_( DomainAuction.domain == auction_data["domain"], DomainAuction.platform == auction_data["platform"], ) ) ) existing = existing.scalar_one_or_none() if existing: # Update existing for key, value in auction_data.items(): setattr(existing, key, value) existing.updated_at = datetime.utcnow() existing.is_active = True result["updated"] += 1 else: # Create new new_auction = DomainAuction(**auction_data) db.add(new_auction) result["new"] += 1 result["found"] += 1 await db.commit() # Update log log.completed_at = datetime.utcnow() log.status = "success" log.auctions_found = result["found"] log.auctions_new = result["new"] log.auctions_updated = result["updated"] await db.commit() logger.info(f"ExpiredDomains scrape complete: {result}") except Exception as e: log.completed_at = datetime.utcnow() log.status = "failed" log.error_message = str(e) await db.commit() logger.error(f"ExpiredDomains scrape failed: {e}") raise return result async def _cleanup_ended_auctions(self, db: AsyncSession): """Mark auctions that have ended as inactive.""" now = datetime.utcnow() # Update ended auctions from sqlalchemy import update stmt = ( update(DomainAuction) .where( and_( DomainAuction.end_time < now, DomainAuction.is_active == True ) ) .values(is_active=False) ) await db.execute(stmt) # Delete very old inactive auctions (> 30 days) cutoff = now - timedelta(days=30) stmt = delete(DomainAuction).where( and_( DomainAuction.is_active == False, DomainAuction.end_time < cutoff ) ) await db.execute(stmt) await db.commit() async def get_active_auctions( self, db: AsyncSession, platform: Optional[str] = None, tld: Optional[str] = None, keyword: Optional[str] = None, min_bid: Optional[float] = None, max_bid: Optional[float] = None, ending_within_hours: Optional[int] = None, sort_by: str = "end_time", limit: int = 50, offset: int = 0, ) -> List[DomainAuction]: """Get active auctions from database with filters.""" query = select(DomainAuction).where(DomainAuction.is_active == True) if platform: query = query.where(DomainAuction.platform == platform) if tld: query = query.where(DomainAuction.tld == tld.lower().lstrip(".")) if keyword: query = query.where(DomainAuction.domain.ilike(f"%{keyword}%")) if min_bid is not None: query = query.where(DomainAuction.current_bid >= min_bid) if max_bid is not None: query = query.where(DomainAuction.current_bid <= max_bid) if ending_within_hours: cutoff = datetime.utcnow() + timedelta(hours=ending_within_hours) query = query.where(DomainAuction.end_time <= cutoff) # Sort if sort_by == "end_time": query = query.order_by(DomainAuction.end_time.asc()) elif sort_by == "bid_asc": query = query.order_by(DomainAuction.current_bid.asc()) elif sort_by == "bid_desc": query = query.order_by(DomainAuction.current_bid.desc()) elif sort_by == "bids": query = query.order_by(DomainAuction.num_bids.desc()) query = query.offset(offset).limit(limit) result = await db.execute(query) return list(result.scalars().all()) async def get_auction_count(self, db: AsyncSession) -> int: """Get total count of active auctions.""" from sqlalchemy import func result = await db.execute( select(func.count(DomainAuction.id)).where(DomainAuction.is_active == True) ) return result.scalar() or 0 async def close(self): """Close HTTP client.""" if self.http_client and not self.http_client.is_closed: await self.http_client.aclose() # Global instance auction_scraper = AuctionScraperService()