diff --git a/DATA_INDEPENDENCE_REPORT.md b/DATA_INDEPENDENCE_REPORT.md
index a05c08d..a8b31c9 100644
--- a/DATA_INDEPENDENCE_REPORT.md
+++ b/DATA_INDEPENDENCE_REPORT.md
@@ -197,48 +197,29 @@ Mit diesen Verbesserungen wird Pounce ein **echtes Premium-Tool**, das keine ext
 ---

-## ⚠️ CRITICAL PROBLEM: Sample data vs. real data
+## ✅ RESOLVED: No sample/fake data in the auction feed

-### Current state of the auction data:
+### New state of the auction data (as of 2025-12)

-**Scraping is implemented, BUT:**
+**Scraping now delivers real auction data only** (no estimated prices, no random fallback, no seed/demo data):

-1. **ExpiredDomains.net**: Works, but:
-   - Prices are **estimated** (not real): `estimated_price = base_prices.get(tld, 15)`
-   - These are registration prices, NOT auction prices
+1. **GoDaddy / Namecheap / Sedo** (robust, no Cloudflare issues):
+   - Ingestion via the ExpiredDomains provider pages with **Price / Bids / Endtime**
+   - Benefit: we do not have to scrape the Cloudflare-protected providers directly, yet still get real live data.

-2. **GoDaddy/Sedo/NameJet/DropCatch**: Scraping exists, but:
-   - The sites use anti-bot measures
-   - Layouts change regularly
-   - **Sample data is currently often used as a fallback**
+2. **Park.io**
+   - Scraping of the public auctions table (incl. **Price / Bids / Close Date**)

-3. **In practice the page often shows:**
-   ```python
-   # backend/app/services/auction_scraper.py:689-780
-   async def seed_sample_auctions(self, db: AsyncSession):
-       # THIS DATA IS FAKE (demo data)!
-       sample_auctions = [
-           {"domain": "techflow.io", "platform": "GoDaddy", "current_bid": 250, ...},
-           ...
-       ]
-   ```
+3. **Sav**
+   - Scraping of the table endpoint `load_domains_ajax/*` (incl. **Price / Bids / Time left** → deterministic `end_time` derivation)

-### 🚨 Required for premium quality:
+4. **Dynadot**
+   - Hidden JSON API (frontend API) with real price and end-time fields

-1. **No estimated prices** - only show real auction prices
-2. **Clear labelling** - communicate transparently when data is uncertain
-3. **Fallback strategy** - if scraping fails, do not show fake data
+### Data quality rules

-### Recommended changes:
-
-```python
-# Instead of estimated prices:
-"current_bid": float(estimated_price),  # ❌ WRONG
-
-# Better:
-"current_bid": None,  # No price = no false information
-"price_type": "registration_estimate",  # Labelling
-```
+- **`current_bid > 0` and `end_time` must be present**, otherwise the record is discarded.
+- There is **no longer** an `/api/v1/auctions/seed` endpoint and **no** seed/demo scripts.
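The rule above is enforced by `_sanitize_auction_payload()` in `backend/app/services/auction_scraper.py`. As an illustration only (the real check also normalizes TLD, currency, bid counts and URL), the core condition boils down to:

```python
# Condensed illustration of the storage rule enforced by _sanitize_auction_payload():
# auctions without a real price or a parseable end time are skipped, never estimated.
def is_storable(auction: dict) -> bool:
    current_bid = auction.get("current_bid")
    end_time = auction.get("end_time")
    return (
        isinstance(current_bid, (int, float))
        and current_bid > 0
        and end_time is not None
    )
```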
--- diff --git a/DEPLOYMENT_INSTRUCTIONS.md b/DEPLOYMENT_INSTRUCTIONS.md index 88fbd25..7ea997a 100644 --- a/DEPLOYMENT_INSTRUCTIONS.md +++ b/DEPLOYMENT_INSTRUCTIONS.md @@ -48,8 +48,8 @@ python init_db.py # TLD Preise seeden python seed_tld_prices.py -# Auctions seeden (optional für Demo-Daten) -python seed_auctions.py +# Auctions initial scrapen (echte Daten, keine Demo-Daten) +python scripts/scrape_auctions.py # Stripe Produkte erstellen python -c " diff --git a/backend/app/api/auctions.py b/backend/app/api/auctions.py index 33fe52d..304518f 100644 --- a/backend/app/api/auctions.py +++ b/backend/app/api/auctions.py @@ -599,27 +599,6 @@ async def trigger_scrape( raise HTTPException(status_code=500, detail=f"Scrape failed: {str(e)}") -@router.post("/seed") -async def seed_auctions( - current_user: User = Depends(get_current_user), - db: AsyncSession = Depends(get_db), -): - """ - Seed the database with realistic sample auction data. - Useful for development and demo purposes. - """ - try: - result = await auction_scraper.seed_sample_auctions(db) - return { - "status": "success", - "message": "Sample auctions seeded", - "result": result, - } - except Exception as e: - logger.error(f"Seeding failed: {e}") - raise HTTPException(status_code=500, detail=f"Seeding failed: {str(e)}") - - @router.get("/opportunities") async def get_smart_opportunities( current_user: User = Depends(get_current_user), diff --git a/backend/app/models/auction.py b/backend/app/models/auction.py index 96b49ce..39aad53 100644 --- a/backend/app/models/auction.py +++ b/backend/app/models/auction.py @@ -62,7 +62,8 @@ class DomainAuction(Base): # Indexes for common queries __table_args__ = ( - Index('ix_auctions_platform_domain', 'platform', 'domain'), + # Enforce de-duplication at the database level. + Index('ux_auctions_platform_domain', 'platform', 'domain', unique=True), Index('ix_auctions_end_time_active', 'end_time', 'is_active'), Index('ix_auctions_tld_bid', 'tld', 'current_bid'), ) diff --git a/backend/app/services/auction_scraper.py b/backend/app/services/auction_scraper.py index e5c2852..dcbd1b4 100644 --- a/backend/app/services/auction_scraper.py +++ b/backend/app/services/auction_scraper.py @@ -1,95 +1,85 @@ """ -Domain Auction Scraper Service +Domain Auction Scraper Service (Pounce) -Data Acquisition Strategy (from MARKET_CONCEPT.md): +Hard rules (project requirement): +- No mock/demo data. +- No estimated / placeholder auction prices. +- Store auctions only when we have real `current_bid` and a real `end_time` + (or a provider-provided time-left that can be converted deterministically). -TIER 0: HIDDEN JSON APIs (Most Reliable, Fastest) -- Namecheap GraphQL API (aftermarketapi.namecheap.com) -- Dynadot REST API (dynadot-vue-api) -- Sav.com AJAX API +Current data sources (works without scraping Cloudflare-protected providers): +- Dynadot: hidden JSON API (via `hidden_api_scraper`) +- ExpiredDomains provider auction pages (GoDaddy / Namecheap / Sedo): + include Price, Bids, Endtime +- Park.io: public auctions table includes Price, Bids, Close Date +- Sav: auctions table endpoint includes Price, Bids, Time left -TIER 1: OFFICIAL APIs -- DropCatch API (Official Partner) -- Sedo Partner API (wenn konfiguriert) - -TIER 2: WEB SCRAPING (Fallback) -- ExpiredDomains.net (aggregator for deleted domains) -- GoDaddy Auctions (public listings via RSS/public pages) -- NameJet (public auctions) - -The scraper tries Tier 0 first, then Tier 1, then Tier 2. - -ALL URLs include AFFILIATE TRACKING for monetization! 
- -IMPORTANT: -- Respects robots.txt -- Uses reasonable rate limiting -- Only scrapes publicly available data -- Caches results to minimize requests +Optional sources: +- DropCatch Partner API (if configured) +- Sedo Partner API (if configured) +- Playwright (opt-in) for Cloudflare-protected providers like NameJet """ -import logging + import asyncio +import logging +import os import re -import random from datetime import datetime, timedelta -from typing import List, Optional, Dict, Any -from urllib.parse import urljoin, quote +from typing import Any, Dict, List, Optional import httpx from bs4 import BeautifulSoup -from sqlalchemy import select, and_, delete +from sqlalchemy import and_, delete, select +from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession -from app.models.auction import DomainAuction, AuctionScrapeLog +from app.models.auction import AuctionScrapeLog, DomainAuction from app.services.dropcatch_api import dropcatch_client +from app.services.hidden_api_scrapers import build_affiliate_url, hidden_api_scraper from app.services.sedo_api import sedo_client -from app.services.hidden_api_scrapers import ( - hidden_api_scraper, - build_affiliate_url, - AFFILIATE_CONFIG, -) -# Optional: Playwright for Cloudflare-protected sites try: from app.services.playwright_scraper import playwright_scraper PLAYWRIGHT_AVAILABLE = True except ImportError: - PLAYWRIGHT_AVAILABLE = False playwright_scraper = None + PLAYWRIGHT_AVAILABLE = False logger = logging.getLogger(__name__) # Rate limiting: requests per minute per platform -RATE_LIMITS = { - "GoDaddy": 10, - "Sedo": 10, - "NameJet": 10, - "DropCatch": 10, +RATE_LIMITS: Dict[str, int] = { "ExpiredDomains": 5, + "Park.io": 10, + "Sav": 10, + "DropCatch": 10, + "Sedo": 10, + "NameJet": 5, } -# User agent for scraping -USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" +USER_AGENT = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" +) class AuctionScraperService: """ - Scrapes domain auctions from multiple platforms. - - All data comes from publicly accessible pages - no APIs used. - Results are cached in the database to minimize scraping frequency. + Orchestrates scraping across multiple sources and stores results in DB. 
""" - + def __init__(self): self.http_client: Optional[httpx.AsyncClient] = None self._last_request: Dict[str, datetime] = {} - + async def _get_client(self) -> httpx.AsyncClient: - """Get or create HTTP client with appropriate headers.""" + """Get or create HTTP client with appropriate headers (and optional proxy).""" if self.http_client is None or self.http_client.is_closed: + proxy = os.getenv("SCRAPER_HTTP_PROXY") or os.getenv("SCRAPER_PROXY_URL") self.http_client = httpx.AsyncClient( timeout=30.0, follow_redirects=True, + proxy=proxy, headers={ "User-Agent": USER_AGENT, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", @@ -98,34 +88,597 @@ class AuctionScraperService: "DNT": "1", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", - } + }, ) return self.http_client - + async def _rate_limit(self, platform: str): """Enforce rate limiting per platform.""" - min_interval = 60 / RATE_LIMITS.get(platform, 10) # seconds between requests + min_interval = 60 / RATE_LIMITS.get(platform, 10) last = self._last_request.get(platform) - if last: elapsed = (datetime.utcnow() - last).total_seconds() if elapsed < min_interval: await asyncio.sleep(min_interval - elapsed) - self._last_request[platform] = datetime.utcnow() - + + # ---------------------------- + # Parsing & validation helpers + # ---------------------------- + + def _parse_datetime(self, value: Any) -> Optional[datetime]: + """Parse datetime from common API formats (ISO strings, timestamps).""" + if value is None: + return None + if isinstance(value, datetime): + return value.replace(tzinfo=None) + if isinstance(value, (int, float)): + try: + return datetime.utcfromtimestamp(float(value)).replace(tzinfo=None) + except Exception: + return None + if isinstance(value, str): + raw = value.strip() + if not raw: + return None + try: + return datetime.fromisoformat(raw.replace("Z", "+00:00")).replace(tzinfo=None) + except Exception: + return None + return None + + def _to_float(self, value: Any) -> Optional[float]: + """Parse float from strings like '$1,234.56'.""" + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + cleaned = value.strip().replace(",", "") + cleaned = cleaned.replace("$", "").replace("€", "").replace("£", "") + if not cleaned: + return None + try: + return float(cleaned) + except Exception: + return None + return None + + def _parse_price_currency(self, text: str) -> Optional[tuple[float, str]]: + """Parse price strings like '7,100 USD' or '$530.00' into (price, currency).""" + if not text: + return None + raw = text.strip() + if not raw or raw.lower() in {"-", "n/a", "na"}: + return None + + currency = "USD" + m_amount = re.search(r"([0-9][0-9,]*(?:\.[0-9]+)?)", raw) + if not m_amount: + return None + amount = self._to_float(m_amount.group(1)) + if amount is None: + return None + + m_cur = re.search(r"\b([A-Z]{3})\b", raw) + if m_cur: + currency = m_cur.group(1).upper() + elif "$" in raw: + currency = "USD" + elif "€" in raw: + currency = "EUR" + elif "£" in raw: + currency = "GBP" + + return float(amount), currency + + def _parse_timeleft(self, text: str) -> Optional[timedelta]: + """ + Parse relative time strings into a timedelta. 
+ + Supported examples: + - ExpiredDomains: '4d 20h 39m', '6m 48s', '23h 46m' + - Sav: '6D 2H' + """ + if not text: + return None + raw = text.strip().lower() + if not raw or raw in {"-", "n/a", "na", "ended"}: + return None + + matches = re.findall(r"(\d+)\s*([dhms])", raw) + if not matches: + return None + + total_seconds = 0 + for amount_str, unit in matches: + try: + amount = int(amount_str) + except Exception: + return None + if unit == "d": + total_seconds += amount * 86400 + elif unit == "h": + total_seconds += amount * 3600 + elif unit == "m": + total_seconds += amount * 60 + elif unit == "s": + total_seconds += amount + + if total_seconds <= 0: + return None + return timedelta(seconds=total_seconds) + + def _sanitize_auction_payload(self, auction_data: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """ + Ensure we only store real, complete auctions. + + Rules (strict): + - domain/platform/auction_url must be present + - current_bid must be > 0 + - end_time must be parseable + - drop unknown keys (prevents accidental schema drift) + """ + if not isinstance(auction_data, dict): + return None + + domain = str(auction_data.get("domain", "")).strip().lower() + platform = str(auction_data.get("platform", "")).strip() + auction_url = str(auction_data.get("auction_url", "")).strip() + + if not domain or "." not in domain: + return None + if not platform: + return None + if not auction_url: + return None + + tld = auction_data.get("tld") or domain.rsplit(".", 1)[-1] + tld = str(tld).strip().lower().lstrip(".") + if not tld: + return None + + current_bid = self._to_float(auction_data.get("current_bid")) + if current_bid is None or current_bid <= 0: + return None + + end_time = self._parse_datetime(auction_data.get("end_time")) + if end_time is None: + return None + + try: + num_bids = int(auction_data.get("num_bids", 0) or 0) + except Exception: + return None + if num_bids < 0: + return None + + # Normalize optional floats + min_bid = self._to_float(auction_data.get("min_bid")) + buy_now_price = self._to_float(auction_data.get("buy_now_price")) + reserve_price = self._to_float(auction_data.get("reserve_price")) + + # Normalize watchers (optional) + try: + num_watchers = auction_data.get("num_watchers") + num_watchers = int(num_watchers) if num_watchers is not None else None + except Exception: + num_watchers = None + + allowed = {c.name for c in DomainAuction.__table__.columns} + cleaned = {k: v for k, v in auction_data.items() if k in allowed} + + cleaned.update( + { + "domain": domain, + "tld": tld, + "platform": platform, + "auction_url": auction_url, + "current_bid": float(current_bid), + "min_bid": float(min_bid) if min_bid is not None else None, + "buy_now_price": float(buy_now_price) if buy_now_price is not None else None, + "reserve_price": float(reserve_price) if reserve_price is not None else None, + "num_bids": num_bids, + "num_watchers": num_watchers, + "end_time": end_time, + "is_active": True, + } + ) + + currency = cleaned.get("currency") or "USD" + cleaned["currency"] = str(currency).strip().upper() + + return cleaned + + async def _store_auction(self, db: AsyncSession, auction_data: Dict[str, Any]) -> str: + """Store or update an auction in the database. Returns 'new', 'updated' or 'skipped'.""" + cleaned = self._sanitize_auction_payload(auction_data) + if cleaned is None: + return "skipped" + + # AsyncSessionLocal is configured with autoflush=False. + # Flush pending inserts/updates so the existence check can see them and we don't create duplicates. 
+ await db.flush() + + existing = await db.execute( + select(DomainAuction).where( + and_( + DomainAuction.domain == cleaned["domain"], + DomainAuction.platform == cleaned["platform"], + ) + ) + ) + existing = existing.scalar_one_or_none() + + if existing: + for key, value in cleaned.items(): + setattr(existing, key, value) + existing.updated_at = datetime.utcnow() + existing.is_active = True + return "updated" + + try: + # Protect against concurrent inserts (e.g. cron overlap) when a unique index exists. + async with db.begin_nested(): + db.add(DomainAuction(**cleaned)) + await db.flush() + return "new" + except IntegrityError: + # Another transaction inserted the same (platform, domain) in the meantime. + existing = await db.execute( + select(DomainAuction).where( + and_( + DomainAuction.domain == cleaned["domain"], + DomainAuction.platform == cleaned["platform"], + ) + ) + ) + existing = existing.scalar_one_or_none() + if not existing: + return "skipped" + + for key, value in cleaned.items(): + setattr(existing, key, value) + existing.updated_at = datetime.utcnow() + existing.is_active = True + return "updated" + + # ---------------------------- + # Source scrapers + # ---------------------------- + + async def _scrape_expireddomains_auction_page( + self, + db: AsyncSession, + platform: str, + url: str, + limit: int = 200, + ) -> Dict[str, Any]: + """Scrape ExpiredDomains provider-specific auction pages (real Price/Bids/Endtime).""" + result = {"found": 0, "new": 0, "updated": 0} + + log = AuctionScrapeLog(platform=platform) + db.add(log) + await db.commit() + + try: + await self._rate_limit("ExpiredDomains") + client = await self._get_client() + + resp = await client.get(url, timeout=20.0) + if resp.status_code != 200: + raise Exception(f"HTTP {resp.status_code}") + + soup = BeautifulSoup(resp.text, "lxml") + table = soup.select_one("table.base1") + if not table: + raise Exception("ExpiredDomains table not found") + + headers = [th.get_text(" ", strip=True) for th in table.select("thead th")] + header_index = {h: i for i, h in enumerate(headers)} + + required = ["Domain", "Price", "Bids", "Endtime"] + if not all(k in header_index for k in required): + raise Exception(f"Missing required columns: {required} in {headers}") + + rows = table.select("tbody tr") + now = datetime.utcnow() + + for row in rows[:limit]: + cols = row.find_all("td") + if len(cols) < len(headers): + continue + + domain = cols[header_index["Domain"]].get_text(" ", strip=True).lower() + if not domain or "." 
not in domain: + continue + + tld = domain.rsplit(".", 1)[-1].lower() + + parsed_price = self._parse_price_currency(cols[header_index["Price"]].get_text(" ", strip=True)) + if not parsed_price: + continue + current_bid, currency = parsed_price + if current_bid <= 0: + continue + + bids_raw = cols[header_index["Bids"]].get_text(" ", strip=True) + try: + num_bids = int(re.sub(r"[^0-9]", "", bids_raw) or "0") + except Exception: + continue + + end_raw = cols[header_index["Endtime"]].get_text(" ", strip=True) + delta = self._parse_timeleft(end_raw) + if not delta: + continue + end_time = now + delta + + domain_link = cols[header_index["Domain"]].find("a") + href = domain_link.get("href") if domain_link else None + if href and href.startswith("/"): + href = f"https://www.expireddomains.net{href}" + + auction_data = { + "domain": domain, + "tld": tld, + "platform": platform, + "platform_auction_id": None, + "auction_url": href or build_affiliate_url(platform, domain), + "current_bid": current_bid, + "currency": currency, + "num_bids": num_bids, + "end_time": end_time, + "scrape_source": f"expireddomains:{url}", + } + + status = await self._store_auction(db, auction_data) + if status == "skipped": + continue + result["found"] += 1 + result[status] += 1 + + await db.commit() + + log.completed_at = datetime.utcnow() + log.status = "success" + log.auctions_found = result["found"] + log.auctions_new = result["new"] + log.auctions_updated = result["updated"] + await db.commit() + + except Exception as e: + log.completed_at = datetime.utcnow() + log.status = "failed" + log.error_message = str(e)[:500] + await db.commit() + logger.error(f"ExpiredDomains({platform}) scrape failed: {e}") + + return result + + async def _scrape_expireddomains_godaddy(self, db: AsyncSession) -> Dict[str, Any]: + return await self._scrape_expireddomains_auction_page( + db=db, + platform="GoDaddy", + url="https://www.expireddomains.net/godaddy-domain-auctions-with-bids/", + ) + + async def _scrape_expireddomains_namecheap(self, db: AsyncSession) -> Dict[str, Any]: + return await self._scrape_expireddomains_auction_page( + db=db, + platform="Namecheap", + url="https://www.expireddomains.net/namecheap-auction-domains/", + ) + + async def _scrape_expireddomains_sedo(self, db: AsyncSession) -> Dict[str, Any]: + return await self._scrape_expireddomains_auction_page( + db=db, + platform="Sedo", + url="https://www.expireddomains.net/sedo-auction-domains/", + ) + + async def _scrape_parkio_public(self, db: AsyncSession) -> Dict[str, Any]: + """Scrape Park.io public auctions page (includes price + close date).""" + platform = "Park.io" + result = {"found": 0, "new": 0, "updated": 0} + + log = AuctionScrapeLog(platform=platform) + db.add(log) + await db.commit() + + try: + await self._rate_limit(platform) + client = await self._get_client() + + resp = await client.get("https://park.io/auctions", timeout=20.0) + if resp.status_code != 200: + raise Exception(f"HTTP {resp.status_code}") + + soup = BeautifulSoup(resp.text, "lxml") + table = soup.select_one("table.table") + if not table: + raise Exception("Park.io table not found") + + rows = table.select("tbody tr") + for row in rows[:200]: + cols = row.find_all("td") + if len(cols) < 5: + continue + + domain = cols[1].get_text(" ", strip=True).lower() + if not domain or "." 
not in domain: + continue + + tld = domain.rsplit(".", 1)[-1].lower() + + parsed_price = self._parse_price_currency(cols[2].get_text(" ", strip=True)) + if not parsed_price: + continue + current_bid, currency = parsed_price + if current_bid <= 0: + continue + + bids_raw = cols[3].get_text(" ", strip=True) + try: + num_bids = int(re.sub(r"[^0-9]", "", bids_raw) or "0") + except Exception: + continue + + close_raw = cols[4].get_text(" ", strip=True) + try: + # Park.io displays a naive timestamp. We treat it as UTC. + end_time = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S") + except Exception: + continue + + link_el = cols[1].find("a", href=True) + href = link_el["href"] if link_el else None + if href and href.startswith("/"): + href = f"https://park.io{href}" + + auction_data = { + "domain": domain, + "tld": tld, + "platform": platform, + "auction_url": href or "https://park.io/auctions", + "current_bid": current_bid, + "currency": currency, + "num_bids": num_bids, + "end_time": end_time, + "scrape_source": "park.io:auctions", + } + + status = await self._store_auction(db, auction_data) + if status == "skipped": + continue + result["found"] += 1 + result[status] += 1 + + await db.commit() + + log.completed_at = datetime.utcnow() + log.status = "success" + log.auctions_found = result["found"] + log.auctions_new = result["new"] + log.auctions_updated = result["updated"] + await db.commit() + + except Exception as e: + log.completed_at = datetime.utcnow() + log.status = "failed" + log.error_message = str(e)[:500] + await db.commit() + logger.error(f"Park.io scrape failed: {e}") + + return result + + async def _scrape_sav_public(self, db: AsyncSession) -> Dict[str, Any]: + """Scrape Sav auctions from their HTML table endpoint.""" + platform = "Sav" + result = {"found": 0, "new": 0, "updated": 0} + + log = AuctionScrapeLog(platform=platform) + db.add(log) + await db.commit() + + try: + await self._rate_limit(platform) + client = await self._get_client() + + now = datetime.utcnow() + for page in range(0, 3): + resp = await client.post( + f"https://www.sav.com/auctions/load_domains_ajax/{page}", + headers={"X-Requested-With": "XMLHttpRequest"}, + timeout=20.0, + ) + if resp.status_code != 200: + continue + + soup = BeautifulSoup(resp.text, "html.parser") + rows = soup.select("tr") + if not rows: + continue + + for row in rows[:200]: + cells = row.find_all("td") + if len(cells) < 7: + continue + + domain_link = cells[1].find("a") + domain = domain_link.get_text(" ", strip=True).lower() if domain_link else "" + if not domain or "." 
not in domain: + continue + + tld = domain.rsplit(".", 1)[-1].lower() + + parsed_price = self._parse_price_currency(cells[2].get_text(" ", strip=True)) + if not parsed_price: + continue + current_bid, currency = parsed_price + if current_bid <= 0: + continue + + bids_raw = cells[3].get_text(" ", strip=True) + try: + num_bids = int(re.sub(r"[^0-9]", "", bids_raw) or "0") + except Exception: + continue + + time_left_raw = cells[6].get_text(" ", strip=True) + delta = self._parse_timeleft(time_left_raw) + if not delta: + continue + end_time = now + delta + + href = domain_link.get("href") if domain_link else None + if href and href.startswith("/"): + href = f"https://www.sav.com{href}" + + auction_data = { + "domain": domain, + "tld": tld, + "platform": platform, + "auction_url": href or "https://www.sav.com/domains/auctions", + "current_bid": current_bid, + "currency": currency, + "num_bids": num_bids, + "end_time": end_time, + "scrape_source": f"sav:load_domains_ajax:{page}", + } + + status = await self._store_auction(db, auction_data) + if status == "skipped": + continue + result["found"] += 1 + result[status] += 1 + + await asyncio.sleep(1) + + await db.commit() + + log.completed_at = datetime.utcnow() + log.status = "success" + log.auctions_found = result["found"] + log.auctions_new = result["new"] + log.auctions_updated = result["updated"] + await db.commit() + + except Exception as e: + log.completed_at = datetime.utcnow() + log.status = "failed" + log.error_message = str(e)[:500] + await db.commit() + logger.error(f"Sav scrape failed: {e}") + + return result + + # ---------------------------- + # Orchestration + # ---------------------------- + async def scrape_all_platforms(self, db: AsyncSession) -> Dict[str, Any]: - """ - Scrape all supported platforms and store results in database. - Returns summary of scraping activity. - - Data Acquisition Priority: - - TIER 0: Hidden JSON APIs (Namecheap, Dynadot, Sav) - Most reliable! - - TIER 1: Official Partner APIs (DropCatch, Sedo) - - TIER 2: Web Scraping (ExpiredDomains, GoDaddy, NameJet) - - All URLs include affiliate tracking for monetization. - """ + """Scrape all configured sources and store results in DB.""" results = { "total_found": 0, "total_new": 0, @@ -133,57 +686,39 @@ class AuctionScraperService: "platforms": {}, "errors": [], } - - # ═══════════════════════════════════════════════════════════════ - # TIER 0: Hidden JSON APIs (Most Reliable!) - # These are undocumented but public APIs used by platform frontends - # ═══════════════════════════════════════════════════════════════ - logger.info("🚀 Starting TIER 0: Hidden JSON APIs (Namecheap, Dynadot, Sav)") + + def _touch_platform(platform: str): + if platform not in results["platforms"]: + results["platforms"][platform] = {"found": 0, "new": 0, "updated": 0} + + # TIER 0: Hidden APIs (Dynadot, etc.) 
try: hidden_api_result = await hidden_api_scraper.scrape_all(limit_per_platform=100) - for item in hidden_api_result.get("items", []): action = await self._store_auction(db, item) + if action == "skipped": + continue platform = item.get("platform", "Unknown") - - if platform not in results["platforms"]: - results["platforms"][platform] = {"found": 0, "new": 0, "updated": 0} - + _touch_platform(platform) results["platforms"][platform]["found"] += 1 + results["total_found"] += 1 if action == "new": results["platforms"][platform]["new"] += 1 results["total_new"] += 1 elif action == "updated": results["platforms"][platform]["updated"] += 1 results["total_updated"] += 1 - - results["total_found"] += 1 - - # Log platform summaries - for platform, data in hidden_api_result.get("platforms", {}).items(): - logger.info(f"✅ {platform} Hidden API: {data.get('found', 0)} auctions") - + if hidden_api_result.get("errors"): for error in hidden_api_result["errors"]: - logger.warning(f"⚠️ Hidden API: {error}") results["errors"].append(f"Hidden API: {error}") - except Exception as e: - logger.error(f"❌ TIER 0 Hidden APIs failed: {e}") results["errors"].append(f"Hidden APIs: {str(e)}") - + await db.commit() - - # ═══════════════════════════════════════════════════════════════ - # TIER 1: Official Partner APIs (Best data quality) - # ═══════════════════════════════════════════════════════════════ - logger.info("🔌 Starting TIER 1: Official Partner APIs (DropCatch, Sedo)") - tier1_apis = [ - ("DropCatch", self._fetch_dropcatch_api), - ("Sedo", self._fetch_sedo_api), - ] - - for platform_name, api_func in tier1_apis: + + # TIER 1: Official Partner APIs (if configured) + for platform_name, api_func in [("DropCatch", self._fetch_dropcatch_api), ("Sedo", self._fetch_sedo_api)]: try: api_result = await api_func(db) if api_result.get("found", 0) > 0: @@ -191,997 +726,167 @@ class AuctionScraperService: results["total_found"] += api_result.get("found", 0) results["total_new"] += api_result.get("new", 0) results["total_updated"] += api_result.get("updated", 0) - logger.info(f"✅ {platform_name} API: {api_result['found']} auctions") except Exception as e: - logger.warning(f"⚠️ {platform_name} API failed, will try scraping: {e}") - - # ═══════════════════════════════════════════════════════════════ - # TIER 2: Web Scraping (Fallback for platforms without API access) - # ═══════════════════════════════════════════════════════════════ - logger.info("📦 Starting TIER 2: Web Scraping (ExpiredDomains, GoDaddy, NameJet)") + results["errors"].append(f"{platform_name} API: {str(e)}") + + # TIER 2: Web scraping (non-Cloudflare, or via ExpiredDomains provider pages) scrapers = [ - ("ExpiredDomains", self._scrape_expireddomains), - ("GoDaddy", self._scrape_godaddy_public), - ("NameJet", self._scrape_namejet_public), + ("GoDaddy", self._scrape_expireddomains_godaddy), + ("Namecheap", self._scrape_expireddomains_namecheap), + ("Sedo", self._scrape_expireddomains_sedo), + ("Park.io", self._scrape_parkio_public), + ("Sav", self._scrape_sav_public), ] - - # Add fallbacks only if APIs failed - if "DropCatch" not in results["platforms"]: - scrapers.append(("DropCatch", self._scrape_dropcatch_public)) - if "Sedo" not in results["platforms"]: - scrapers.append(("Sedo", self._scrape_sedo_public)) - - for platform_name, scraper_func in scrapers: + + for platform_name, fn in scrapers: try: - platform_result = await scraper_func(db) - results["platforms"][platform_name] = platform_result - results["total_found"] += platform_result.get("found", 0) 
- results["total_new"] += platform_result.get("new", 0) - results["total_updated"] += platform_result.get("updated", 0) + r = await fn(db) + results["platforms"][platform_name] = r + results["total_found"] += r.get("found", 0) + results["total_new"] += r.get("new", 0) + results["total_updated"] += r.get("updated", 0) except Exception as e: - logger.error(f"Error scraping {platform_name}: {e}") results["errors"].append(f"{platform_name}: {str(e)}") - - # ═══════════════════════════════════════════════════════════════ - # TIER 3: Playwright Stealth (Cloudflare-protected sites) - # Uses headless browser with stealth mode to bypass protection - # ═══════════════════════════════════════════════════════════════ - if PLAYWRIGHT_AVAILABLE and playwright_scraper: - # Only run Playwright if we didn't get enough data from other sources - godaddy_count = results["platforms"].get("GoDaddy", {}).get("found", 0) - namejet_count = results["platforms"].get("NameJet", {}).get("found", 0) - - if godaddy_count < 10 or namejet_count < 5: - logger.info("🎭 Starting TIER 3: Playwright Stealth (GoDaddy, NameJet)") - try: - playwright_result = await playwright_scraper.scrape_all_protected() - - for item in playwright_result.get("items", []): - action = await self._store_auction(db, item) - platform = item.get("platform", "Unknown") - - if platform not in results["platforms"]: - results["platforms"][platform] = {"found": 0, "new": 0, "updated": 0} - - results["platforms"][platform]["found"] += 1 - results["platforms"][platform]["source"] = "playwright" - if action == "new": - results["platforms"][platform]["new"] += 1 - results["total_new"] += 1 - elif action == "updated": - results["platforms"][platform]["updated"] += 1 - results["total_updated"] += 1 - - results["total_found"] += 1 - - for platform, data in playwright_result.get("platforms", {}).items(): - logger.info(f"🎭 {platform} Playwright: {data.get('found', 0)} auctions") - - if playwright_result.get("errors"): - for error in playwright_result["errors"]: - logger.warning(f"⚠️ Playwright: {error}") - results["errors"].append(f"Playwright: {error}") - - except Exception as e: - logger.error(f"❌ Playwright scraping failed: {e}") - results["errors"].append(f"Playwright: {str(e)}") - - await db.commit() - - # Mark ended auctions as inactive + + # TIER 3: Playwright (opt-in) + playwright_enabled = os.getenv("POUNCE_ENABLE_PROTECTED_SCRAPERS", "false").lower() in ("1", "true", "yes") + if PLAYWRIGHT_AVAILABLE and playwright_scraper and playwright_enabled: + try: + playwright_result = await playwright_scraper.scrape_all_protected() + for item in playwright_result.get("items", []): + action = await self._store_auction(db, item) + if action == "skipped": + continue + platform = item.get("platform", "Unknown") + _touch_platform(platform) + results["platforms"][platform]["found"] += 1 + results["total_found"] += 1 + if action == "new": + results["platforms"][platform]["new"] += 1 + results["total_new"] += 1 + elif action == "updated": + results["platforms"][platform]["updated"] += 1 + results["total_updated"] += 1 + if playwright_result.get("errors"): + for error in playwright_result["errors"]: + results["errors"].append(f"Playwright: {error}") + except Exception as e: + results["errors"].append(f"Playwright: {str(e)}") + + await db.commit() await self._cleanup_ended_auctions(db) - return results - - async def _store_auction(self, db: AsyncSession, auction_data: Dict[str, Any]) -> str: - """Store or update an auction in the database. 
Returns 'new' or 'updated'.""" - existing = await db.execute( - select(DomainAuction).where( - and_( - DomainAuction.domain == auction_data["domain"], - DomainAuction.platform == auction_data["platform"], - ) - ) - ) - existing = existing.scalar_one_or_none() - - if existing: - # Update existing - for key, value in auction_data.items(): - setattr(existing, key, value) - existing.updated_at = datetime.utcnow() - existing.is_active = True - return "updated" - else: - # Create new - new_auction = DomainAuction(**auction_data) - db.add(new_auction) - return "new" - - async def _scrape_expireddomains(self, db: AsyncSession) -> Dict[str, Any]: - """ - Scrape ExpiredDomains.net for auction listings. - This site aggregates expired/deleted domains from various TLDs. - - Enhanced to scrape multiple pages and categories: - - Deleted domains (multiple TLDs) - - Pending delete domains - - Expired auction domains - """ - platform = "ExpiredDomains" - result = {"found": 0, "new": 0, "updated": 0} - - log = AuctionScrapeLog(platform=platform) - db.add(log) - await db.commit() - - try: - await self._rate_limit(platform) - client = await self._get_client() - - # TLD-based pricing - base_prices = { - "com": 12, "net": 10, "org": 10, "io": 50, "ai": 80, - "co": 25, "de": 8, "nl": 10, "fr": 10, "app": 15, - "xyz": 5, "info": 8, "tech": 15, "dev": 12, "me": 15, - "tv": 35, "gg": 60, "sh": 40, "cc": 25, "biz": 8, - } - - # Enhanced: Multiple pages to scrape - pages_to_scrape = [ - # Deleted domains (different sorting/pages) - "https://www.expireddomains.net/deleted-domains/", - "https://www.expireddomains.net/deleted-domains/?start=25", - "https://www.expireddomains.net/deleted-domains/?start=50", - # Pending delete - "https://www.expireddomains.net/pending-delete-domains/", - # By TLD - "https://www.expireddomains.net/deleted-com-domains/", - "https://www.expireddomains.net/deleted-net-domains/", - "https://www.expireddomains.net/deleted-io-domains/", - "https://www.expireddomains.net/deleted-ai-domains/", - # Backorder auctions - "https://www.expireddomains.net/backorder-domain-auctions/", - ] - - seen_domains = set() - - for url in pages_to_scrape: - try: - await asyncio.sleep(1) # Rate limit between pages - response = await client.get(url, timeout=15.0) - - if response.status_code != 200: - logger.debug(f"ExpiredDomains {url}: HTTP {response.status_code}") - continue - - soup = BeautifulSoup(response.text, "lxml") - domain_rows = soup.select("table.base1 tbody tr") - - for row in domain_rows[:50]: # 50 per page - try: - cols = row.find_all("td") - if len(cols) < 3: - continue - - domain_link = cols[0].find("a") - if not domain_link: - continue - - domain_text = domain_link.get_text(strip=True) - if not domain_text or "." 
not in domain_text: - continue - - domain = domain_text.lower() - - # Skip if already seen - if domain in seen_domains: - continue - seen_domains.add(domain) - - tld = domain.rsplit(".", 1)[-1] - estimated_price = base_prices.get(tld, 15) - - # Try to extract age/backlinks from other columns - age_years = None - backlinks = None - domain_authority = None - - if len(cols) >= 5: - try: - # BL column (backlinks) - bl_text = cols[3].get_text(strip=True) - if bl_text and bl_text.isdigit(): - backlinks = int(bl_text) - except: - pass - try: - # ABY column (archive.org age) - age_text = cols[4].get_text(strip=True) - if age_text and age_text.isdigit(): - age_years = int(age_text) - except: - pass - - auction_data = { - "domain": domain, - "tld": tld, - "platform": platform, - "platform_auction_id": None, - "auction_url": f"https://www.expireddomains.net/domain-name-search/?q={quote(domain)}", - "current_bid": float(estimated_price), - "currency": "USD", - "min_bid": None, - "buy_now_price": None, - "reserve_price": None, - "reserve_met": None, - "num_bids": 0, - "num_watchers": None, - "end_time": datetime.utcnow() + timedelta(days=7), - "auction_type": "registration", - "traffic": None, - "age_years": age_years, - "backlinks": backlinks, - "domain_authority": domain_authority, - "scrape_source": "expireddomains.net", - } - - status = await self._store_auction(db, auction_data) - result["found"] += 1 - result[status] += 1 - - except Exception as e: - logger.debug(f"Error parsing row: {e}") - continue - - except Exception as e: - logger.debug(f"Error fetching {url}: {e}") - continue - - await db.commit() - log.completed_at = datetime.utcnow() - log.status = "success" - log.auctions_found = result["found"] - log.auctions_new = result["new"] - log.auctions_updated = result["updated"] - await db.commit() - - logger.info(f"✅ ExpiredDomains: {result['found']} domains found") - - except Exception as e: - log.completed_at = datetime.utcnow() - log.status = "failed" - log.error_message = str(e) - await db.commit() - logger.error(f"ExpiredDomains scrape failed: {e}") - - return result - - async def _scrape_godaddy_public(self, db: AsyncSession) -> Dict[str, Any]: - """ - Scrape GoDaddy Auctions public RSS feed. - GoDaddy provides a public RSS feed of their auctions. - """ - platform = "GoDaddy" - result = {"found": 0, "new": 0, "updated": 0} - - log = AuctionScrapeLog(platform=platform) - db.add(log) - await db.commit() - - try: - await self._rate_limit(platform) - client = await self._get_client() - - # GoDaddy public auction feeds - these are publicly accessible - urls = [ - "https://auctions.godaddy.com/trpItemListingRSS.aspx?ci=2", # Expiring auctions - "https://auctions.godaddy.com/trpItemListingRSS.aspx?ci=3", # Closeout - ] - - for url in urls: - try: - response = await client.get(url, timeout=15.0) - if response.status_code != 200: - continue - - soup = BeautifulSoup(response.text, "xml") - items = soup.find_all("item") - - for item in items[:15]: - try: - title = item.find("title") - link = item.find("link") - description = item.find("description") - - if not title or not link: - continue - - domain = title.get_text(strip=True).lower() - if not domain or "." 
not in domain: - continue - - tld = domain.rsplit(".", 1)[-1] - - # Parse price from description - price = 12.0 - if description: - desc_text = description.get_text() - price_match = re.search(r'\$(\d+(?:,\d+)?(?:\.\d+)?)', desc_text) - if price_match: - price = float(price_match.group(1).replace(',', '')) - - # Parse bids from description - num_bids = 0 - if description: - bids_match = re.search(r'(\d+)\s*bid', description.get_text(), re.I) - if bids_match: - num_bids = int(bids_match.group(1)) - - auction_data = { - "domain": domain, - "tld": tld, - "platform": platform, - "platform_auction_id": None, - "auction_url": link.get_text(strip=True) if link else f"https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}", - "current_bid": price, - "currency": "USD", - "min_bid": None, - "buy_now_price": None, - "reserve_price": None, - "reserve_met": None, - "num_bids": num_bids, - "num_watchers": None, - "end_time": datetime.utcnow() + timedelta(days=random.randint(1, 5)), - "auction_type": "auction", - "traffic": None, - "age_years": None, - "backlinks": None, - "domain_authority": None, - "scrape_source": "godaddy_rss", - } - - status = await self._store_auction(db, auction_data) - result["found"] += 1 - result[status] += 1 - - except Exception as e: - logger.debug(f"Error parsing GoDaddy item: {e}") - continue - - except Exception as e: - logger.debug(f"Error fetching GoDaddy feed {url}: {e}") - continue - - await db.commit() - log.completed_at = datetime.utcnow() - log.status = "success" - log.auctions_found = result["found"] - log.auctions_new = result["new"] - log.auctions_updated = result["updated"] - await db.commit() - - except Exception as e: - log.completed_at = datetime.utcnow() - log.status = "failed" - log.error_message = str(e) - await db.commit() - logger.error(f"GoDaddy scrape failed: {e}") - - return result - - async def _scrape_sedo_public(self, db: AsyncSession) -> Dict[str, Any]: - """ - Scrape Sedo public marketplace listings. - Sedo has a public search that we can query. - """ - platform = "Sedo" - result = {"found": 0, "new": 0, "updated": 0} - - log = AuctionScrapeLog(platform=platform) - db.add(log) - await db.commit() - - try: - await self._rate_limit(platform) - client = await self._get_client() - - # Sedo public search pages for different TLDs - tlds_to_search = ["com", "io", "ai", "net", "org"] - - for tld in tlds_to_search: - try: - url = f"https://sedo.com/search/?keyword=.{tld}&price_min=1&price_max=500" - response = await client.get(url, timeout=15.0) - - if response.status_code != 200: - continue - - soup = BeautifulSoup(response.text, "lxml") - - # Find domain listings - listings = soup.select(".listing-item, .searchresult, .domain-item") - - for listing in listings[:10]: - try: - # Try multiple selectors for domain name - domain_elem = listing.select_one(".domain-name, .listing-title, a[href*='sedo.com']") - if not domain_elem: - continue - - domain = domain_elem.get_text(strip=True).lower() - if not domain or "." 
not in domain: - continue - - domain_tld = domain.rsplit(".", 1)[-1] - - # Try to find price - price = 100.0 - price_elem = listing.select_one(".price, .listing-price, .amount") - if price_elem: - price_text = price_elem.get_text() - price_match = re.search(r'[\$€]?\s*(\d+(?:,\d+)?(?:\.\d+)?)', price_text) - if price_match: - price = float(price_match.group(1).replace(',', '')) - - auction_data = { - "domain": domain, - "tld": domain_tld, - "platform": platform, - "platform_auction_id": None, - "auction_url": f"https://sedo.com/search/?keyword={domain}", - "current_bid": price, - "currency": "USD", - "min_bid": None, - "buy_now_price": price, - "reserve_price": None, - "reserve_met": None, - "num_bids": random.randint(0, 5), - "num_watchers": random.randint(0, 20), - "end_time": datetime.utcnow() + timedelta(days=random.randint(3, 14)), - "auction_type": "buy_now", - "traffic": None, - "age_years": None, - "backlinks": None, - "domain_authority": None, - "scrape_source": "sedo_search", - } - - status = await self._store_auction(db, auction_data) - result["found"] += 1 - result[status] += 1 - - except Exception as e: - logger.debug(f"Error parsing Sedo listing: {e}") - continue - - except Exception as e: - logger.debug(f"Error searching Sedo for .{tld}: {e}") - continue - - await db.commit() - log.completed_at = datetime.utcnow() - log.status = "success" - log.auctions_found = result["found"] - log.auctions_new = result["new"] - log.auctions_updated = result["updated"] - await db.commit() - - except Exception as e: - log.completed_at = datetime.utcnow() - log.status = "failed" - log.error_message = str(e) - await db.commit() - logger.error(f"Sedo scrape failed: {e}") - - return result - - async def _scrape_namejet_public(self, db: AsyncSession) -> Dict[str, Any]: - """ - Scrape NameJet public auction listings. - NameJet has public pages showing current auctions. - """ - platform = "NameJet" - result = {"found": 0, "new": 0, "updated": 0} - - log = AuctionScrapeLog(platform=platform) - db.add(log) - await db.commit() - - try: - await self._rate_limit(platform) - client = await self._get_client() - - # NameJet public auction page - url = "https://www.namejet.com/Pages/Auctions/BackorderSearch.aspx" - response = await client.get(url, timeout=15.0) - - if response.status_code == 200: - soup = BeautifulSoup(response.text, "lxml") - - # Find auction listings - auction_rows = soup.select(".auction-row, .domain-listing, tr[data-domain]") - - for row in auction_rows[:15]: - try: - domain_elem = row.select_one(".domain, .domain-name, td:first-child a") - if not domain_elem: - continue - - domain = domain_elem.get_text(strip=True).lower() - if not domain or "." 
not in domain: - continue - - tld = domain.rsplit(".", 1)[-1] - - # Try to find price - price = 69.0 # NameJet typical starting price - price_elem = row.select_one(".price, .bid, td:nth-child(2)") - if price_elem: - price_text = price_elem.get_text() - price_match = re.search(r'\$(\d+(?:,\d+)?(?:\.\d+)?)', price_text) - if price_match: - price = float(price_match.group(1).replace(',', '')) - - auction_data = { - "domain": domain, - "tld": tld, - "platform": platform, - "platform_auction_id": None, - "auction_url": f"https://www.namejet.com/Pages/Auctions/BackorderSearch.aspx?q={domain}", - "current_bid": price, - "currency": "USD", - "min_bid": None, - "buy_now_price": None, - "reserve_price": None, - "reserve_met": None, - "num_bids": random.randint(1, 15), - "num_watchers": None, - "end_time": datetime.utcnow() + timedelta(days=random.randint(1, 7)), - "auction_type": "auction", - "traffic": None, - "age_years": None, - "backlinks": None, - "domain_authority": None, - "scrape_source": "namejet_search", - } - - status = await self._store_auction(db, auction_data) - result["found"] += 1 - result[status] += 1 - - except Exception as e: - logger.debug(f"Error parsing NameJet row: {e}") - continue - - await db.commit() - log.completed_at = datetime.utcnow() - log.status = "success" - log.auctions_found = result["found"] - log.auctions_new = result["new"] - log.auctions_updated = result["updated"] - await db.commit() - - except Exception as e: - log.completed_at = datetime.utcnow() - log.status = "failed" - log.error_message = str(e) - await db.commit() - logger.error(f"NameJet scrape failed: {e}") - - return result - + + # ---------------------------- + # Tier 1 helpers (official APIs) + # ---------------------------- + async def _fetch_dropcatch_api(self, db: AsyncSession) -> Dict[str, Any]: - """ - 🚀 TIER 1: Fetch DropCatch auctions via OFFICIAL API - - This is our preferred method - faster, more reliable, more data. - Uses the official DropCatch Partner API. 
- """ platform = "DropCatch" result = {"found": 0, "new": 0, "updated": 0, "source": "api"} - + if not dropcatch_client.is_configured: - logger.info("DropCatch API not configured, skipping") return result - + log = AuctionScrapeLog(platform=platform) db.add(log) await db.commit() - + try: - # Fetch auctions from official API api_result = await dropcatch_client.search_auctions(page_size=100) - auctions = api_result.get("auctions") or api_result.get("items") or [] result["found"] = len(auctions) - + for dc_auction in auctions: - try: - # Transform to our format - auction_data = dropcatch_client.transform_to_pounce_format(dc_auction) - - if not auction_data["domain"]: - continue - - # Check if exists - existing = await db.execute( - select(DomainAuction).where( - and_( - DomainAuction.domain == auction_data["domain"], - DomainAuction.platform == platform - ) - ) - ) - existing_auction = existing.scalar_one_or_none() - - if existing_auction: - # Update existing - existing_auction.current_bid = auction_data["current_bid"] - existing_auction.num_bids = auction_data["num_bids"] - existing_auction.end_time = auction_data["end_time"] - existing_auction.is_active = True - existing_auction.updated_at = datetime.utcnow() - result["updated"] += 1 - else: - # Create new - new_auction = DomainAuction( - domain=auction_data["domain"], - tld=auction_data["tld"], - platform=platform, - current_bid=auction_data["current_bid"], - currency=auction_data["currency"], - num_bids=auction_data["num_bids"], - end_time=auction_data["end_time"], - auction_url=auction_data["auction_url"], - age_years=auction_data.get("age_years"), - buy_now_price=auction_data.get("buy_now_price"), - reserve_met=auction_data.get("reserve_met"), - traffic=auction_data.get("traffic"), - is_active=True, - ) - db.add(new_auction) - result["new"] += 1 - - except Exception as e: - logger.warning(f"Error processing DropCatch auction: {e}") + auction_data = dropcatch_client.transform_to_pounce_format(dc_auction) + status = await self._store_auction(db, auction_data) + if status == "skipped": continue - + result[status] += 1 + await db.commit() - + log.status = "success" log.auctions_found = result["found"] log.auctions_new = result["new"] log.auctions_updated = result["updated"] log.completed_at = datetime.utcnow() await db.commit() - - logger.info(f"DropCatch API: Found {result['found']}, New {result['new']}, Updated {result['updated']}") - return result - + except Exception as e: - logger.error(f"DropCatch API error: {e}") log.status = "failed" log.error_message = str(e)[:500] log.completed_at = datetime.utcnow() await db.commit() - return result - + + return result + async def _fetch_sedo_api(self, db: AsyncSession) -> Dict[str, Any]: - """ - 🚀 TIER 1: Fetch Sedo auctions via OFFICIAL API - - This is our preferred method for Sedo data. - Uses the official Sedo Partner API. 
- """ platform = "Sedo" result = {"found": 0, "new": 0, "updated": 0, "source": "api"} - + if not sedo_client.is_configured: - logger.info("Sedo API not configured, skipping") return result - + log = AuctionScrapeLog(platform=platform) db.add(log) await db.commit() - + try: - # Fetch auctions from official API api_result = await sedo_client.search_auctions(page_size=100) - - # Sedo response structure may vary listings = api_result.get("domains") or api_result.get("items") or api_result.get("result") or [] if isinstance(listings, dict): listings = list(listings.values()) if listings else [] - + result["found"] = len(listings) - + for sedo_listing in listings: - try: - # Transform to our format - auction_data = sedo_client.transform_to_pounce_format(sedo_listing) - - if not auction_data["domain"]: - continue - - # Check if exists - existing = await db.execute( - select(DomainAuction).where( - and_( - DomainAuction.domain == auction_data["domain"], - DomainAuction.platform == platform - ) - ) - ) - existing_auction = existing.scalar_one_or_none() - - if existing_auction: - # Update existing - existing_auction.current_bid = auction_data["current_bid"] - existing_auction.num_bids = auction_data["num_bids"] - existing_auction.end_time = auction_data["end_time"] - existing_auction.is_active = True - existing_auction.updated_at = datetime.utcnow() - result["updated"] += 1 - else: - # Create new - new_auction = DomainAuction( - domain=auction_data["domain"], - tld=auction_data["tld"], - platform=platform, - current_bid=auction_data["current_bid"], - currency=auction_data["currency"], - num_bids=auction_data["num_bids"], - end_time=auction_data["end_time"], - auction_url=auction_data["auction_url"], - buy_now_price=auction_data.get("buy_now_price"), - is_active=True, - ) - db.add(new_auction) - result["new"] += 1 - - except Exception as e: - logger.warning(f"Error processing Sedo listing: {e}") + auction_data = sedo_client.transform_to_pounce_format(sedo_listing) + status = await self._store_auction(db, auction_data) + if status == "skipped": continue - + result[status] += 1 + await db.commit() - + log.status = "success" log.auctions_found = result["found"] log.auctions_new = result["new"] log.auctions_updated = result["updated"] log.completed_at = datetime.utcnow() await db.commit() - - logger.info(f"Sedo API: Found {result['found']}, New {result['new']}, Updated {result['updated']}") - return result - + except Exception as e: - logger.error(f"Sedo API error: {e}") log.status = "failed" log.error_message = str(e)[:500] log.completed_at = datetime.utcnow() await db.commit() - return result - - async def _scrape_dropcatch_public(self, db: AsyncSession) -> Dict[str, Any]: - """ - 📦 TIER 2 FALLBACK: Scrape DropCatch public auction listings. - Only used if the API is not configured or fails. 
- """ - platform = "DropCatch" - result = {"found": 0, "new": 0, "updated": 0, "source": "scrape"} - - log = AuctionScrapeLog(platform=platform) - db.add(log) - await db.commit() - - try: - await self._rate_limit(platform) - client = await self._get_client() - - # DropCatch public search - url = "https://www.dropcatch.com/domain/search" - response = await client.get(url, timeout=15.0) - - if response.status_code == 200: - soup = BeautifulSoup(response.text, "lxml") - - # Find auction listings - auction_items = soup.select(".domain-item, .auction-listing, .search-result") - - for item in auction_items[:15]: - try: - domain_elem = item.select_one(".domain-name, .name, a[href*='domain']") - if not domain_elem: - continue - - domain = domain_elem.get_text(strip=True).lower() - if not domain or "." not in domain: - continue - - tld = domain.rsplit(".", 1)[-1] - - # Try to find price - price = 59.0 # DropCatch typical starting price - price_elem = item.select_one(".price, .bid-amount") - if price_elem: - price_text = price_elem.get_text() - price_match = re.search(r'\$(\d+(?:,\d+)?(?:\.\d+)?)', price_text) - if price_match: - price = float(price_match.group(1).replace(',', '')) - - auction_data = { - "domain": domain, - "tld": tld, - "platform": platform, - "platform_auction_id": None, - "auction_url": f"https://www.dropcatch.com/domain/{domain}", - "current_bid": price, - "currency": "USD", - "min_bid": None, - "buy_now_price": None, - "reserve_price": None, - "reserve_met": None, - "num_bids": random.randint(1, 10), - "num_watchers": None, - "end_time": datetime.utcnow() + timedelta(hours=random.randint(12, 72)), - "auction_type": "auction", - "traffic": None, - "age_years": None, - "backlinks": None, - "domain_authority": None, - "scrape_source": "dropcatch_search", - } - - status = await self._store_auction(db, auction_data) - result["found"] += 1 - result[status] += 1 - - except Exception as e: - logger.debug(f"Error parsing DropCatch item: {e}") - continue - - await db.commit() - log.completed_at = datetime.utcnow() - log.status = "success" - log.auctions_found = result["found"] - log.auctions_new = result["new"] - log.auctions_updated = result["updated"] - await db.commit() - - except Exception as e: - log.completed_at = datetime.utcnow() - log.status = "failed" - log.error_message = str(e) - await db.commit() - logger.error(f"DropCatch scrape failed: {e}") - + return result - + + # ---------------------------- + # DB cleanup / queries + # ---------------------------- + async def _cleanup_ended_auctions(self, db: AsyncSession): - """Mark auctions that have ended as inactive.""" + """Mark auctions that have ended as inactive and delete very old inactive auctions.""" now = datetime.utcnow() - - # Update ended auctions + from sqlalchemy import update - stmt = ( + + await db.execute( update(DomainAuction) - .where( - and_( - DomainAuction.end_time < now, - DomainAuction.is_active == True - ) - ) + .where(and_(DomainAuction.end_time < now, DomainAuction.is_active == True)) .values(is_active=False) ) - await db.execute(stmt) - - # Delete very old inactive auctions (> 30 days) + cutoff = now - timedelta(days=30) - stmt = delete(DomainAuction).where( - and_( - DomainAuction.is_active == False, - DomainAuction.end_time < cutoff - ) + await db.execute( + delete(DomainAuction).where(and_(DomainAuction.is_active == False, DomainAuction.end_time < cutoff)) ) - await db.execute(stmt) - + await db.commit() - - async def seed_sample_auctions(self, db: AsyncSession) -> Dict[str, Any]: - """ - Seed the 
database with realistic sample auction data. - This provides good demo data while real scraping is being developed. - """ - result = {"found": 0, "new": 0, "updated": 0} - - # Realistic sample auctions from different platforms - sample_auctions = [ - # GoDaddy Auctions - typically have more competitive bidding - {"domain": "techflow.io", "platform": "GoDaddy", "current_bid": 250, "num_bids": 12, "end_hours": 6, "tld": "io"}, - {"domain": "cryptovault.co", "platform": "GoDaddy", "current_bid": 180, "num_bids": 8, "end_hours": 18, "tld": "co"}, - {"domain": "aitools.dev", "platform": "GoDaddy", "current_bid": 420, "num_bids": 15, "end_hours": 3, "tld": "dev"}, - {"domain": "startupkit.com", "platform": "GoDaddy", "current_bid": 850, "num_bids": 23, "end_hours": 12, "tld": "com"}, - {"domain": "datastream.io", "platform": "GoDaddy", "current_bid": 175, "num_bids": 6, "end_hours": 48, "tld": "io"}, - {"domain": "nftmarket.xyz", "platform": "GoDaddy", "current_bid": 95, "num_bids": 4, "end_hours": 72, "tld": "xyz"}, - {"domain": "cloudbase.ai", "platform": "GoDaddy", "current_bid": 1200, "num_bids": 28, "end_hours": 2, "tld": "ai"}, - {"domain": "blockvest.co", "platform": "GoDaddy", "current_bid": 320, "num_bids": 11, "end_hours": 24, "tld": "co"}, - - # Sedo - marketplace listings, often buy-now prices - {"domain": "fintech.io", "platform": "Sedo", "current_bid": 5500, "num_bids": 0, "end_hours": 168, "tld": "io", "buy_now": 5500}, - {"domain": "healthtech.ai", "platform": "Sedo", "current_bid": 8900, "num_bids": 0, "end_hours": 168, "tld": "ai", "buy_now": 8900}, - {"domain": "metaverse.xyz", "platform": "Sedo", "current_bid": 2400, "num_bids": 2, "end_hours": 96, "tld": "xyz"}, - {"domain": "greentech.co", "platform": "Sedo", "current_bid": 1800, "num_bids": 0, "end_hours": 168, "tld": "co", "buy_now": 1800}, - {"domain": "webtools.dev", "platform": "Sedo", "current_bid": 950, "num_bids": 1, "end_hours": 120, "tld": "dev"}, - {"domain": "saasify.io", "platform": "Sedo", "current_bid": 3200, "num_bids": 0, "end_hours": 168, "tld": "io", "buy_now": 3200}, - - # NameJet - backorder auctions, often expired premium domains - {"domain": "pixel.com", "platform": "NameJet", "current_bid": 15000, "num_bids": 45, "end_hours": 1, "tld": "com"}, - {"domain": "swift.io", "platform": "NameJet", "current_bid": 4200, "num_bids": 18, "end_hours": 4, "tld": "io"}, - {"domain": "venture.co", "platform": "NameJet", "current_bid": 2100, "num_bids": 9, "end_hours": 8, "tld": "co"}, - {"domain": "quantum.ai", "platform": "NameJet", "current_bid": 8500, "num_bids": 32, "end_hours": 2, "tld": "ai"}, - {"domain": "nexus.dev", "platform": "NameJet", "current_bid": 890, "num_bids": 7, "end_hours": 36, "tld": "dev"}, - {"domain": "cyber.net", "platform": "NameJet", "current_bid": 1450, "num_bids": 11, "end_hours": 12, "tld": "net"}, - - # DropCatch - pending delete auctions - {"domain": "fusion.io", "platform": "DropCatch", "current_bid": 520, "num_bids": 14, "end_hours": 3, "tld": "io"}, - {"domain": "stellar.co", "platform": "DropCatch", "current_bid": 380, "num_bids": 8, "end_hours": 6, "tld": "co"}, - {"domain": "apex.dev", "platform": "DropCatch", "current_bid": 290, "num_bids": 5, "end_hours": 12, "tld": "dev"}, - {"domain": "nova.xyz", "platform": "DropCatch", "current_bid": 145, "num_bids": 3, "end_hours": 24, "tld": "xyz"}, - {"domain": "prime.ai", "platform": "DropCatch", "current_bid": 2800, "num_bids": 22, "end_hours": 1, "tld": "ai"}, - {"domain": "orbit.io", "platform": "DropCatch", "current_bid": 440, 
"num_bids": 9, "end_hours": 8, "tld": "io"}, - - # More variety for different price ranges - {"domain": "budget.app", "platform": "GoDaddy", "current_bid": 45, "num_bids": 2, "end_hours": 96, "tld": "app"}, - {"domain": "quick.site", "platform": "GoDaddy", "current_bid": 28, "num_bids": 1, "end_hours": 120, "tld": "site"}, - {"domain": "tiny.link", "platform": "Sedo", "current_bid": 890, "num_bids": 0, "end_hours": 168, "tld": "link", "buy_now": 890}, - {"domain": "mega.shop", "platform": "DropCatch", "current_bid": 125, "num_bids": 4, "end_hours": 18, "tld": "shop"}, - ] - - platform_urls = { - "GoDaddy": "https://auctions.godaddy.com/trpItemListing.aspx?domain=", - "Sedo": "https://sedo.com/search/?keyword=", - "NameJet": "https://www.namejet.com/Pages/Auctions/BackorderSearch.aspx?q=", - "DropCatch": "https://www.dropcatch.com/domain/", - } - - for sample in sample_auctions: - try: - auction_data = { - "domain": sample["domain"], - "tld": sample["tld"], - "platform": sample["platform"], - "platform_auction_id": None, - "auction_url": platform_urls[sample["platform"]] + sample["domain"], - "current_bid": float(sample["current_bid"]), - "currency": "USD", - "min_bid": None, - "buy_now_price": float(sample.get("buy_now")) if sample.get("buy_now") else None, - "reserve_price": None, - "reserve_met": True if sample["num_bids"] > 5 else None, - "num_bids": sample["num_bids"], - "num_watchers": random.randint(5, 50), - "end_time": datetime.utcnow() + timedelta(hours=sample["end_hours"]), - "auction_type": "buy_now" if sample.get("buy_now") else "auction", - "traffic": random.randint(0, 5000) if random.random() > 0.5 else None, - "age_years": random.randint(1, 15) if random.random() > 0.3 else None, - "backlinks": random.randint(0, 500) if random.random() > 0.6 else None, - "domain_authority": random.randint(5, 50) if random.random() > 0.7 else None, - "scrape_source": "seed_data", - } - - status = await self._store_auction(db, auction_data) - result["found"] += 1 - result[status] += 1 - - except Exception as e: - logger.error(f"Error seeding auction {sample['domain']}: {e}") - continue - - await db.commit() - return result - + async def get_active_auctions( self, db: AsyncSession, @@ -1197,27 +902,21 @@ class AuctionScraperService: ) -> List[DomainAuction]: """Get active auctions from database with filters.""" query = select(DomainAuction).where(DomainAuction.is_active == True) - + if platform: query = query.where(DomainAuction.platform == platform) - if tld: query = query.where(DomainAuction.tld == tld.lower().lstrip(".")) - if keyword: query = query.where(DomainAuction.domain.ilike(f"%{keyword}%")) - if min_bid is not None: query = query.where(DomainAuction.current_bid >= min_bid) - if max_bid is not None: query = query.where(DomainAuction.current_bid <= max_bid) - if ending_within_hours: cutoff = datetime.utcnow() + timedelta(hours=ending_within_hours) query = query.where(DomainAuction.end_time <= cutoff) - - # Sort + if sort_by == "end_time": query = query.order_by(DomainAuction.end_time.asc()) elif sort_by == "bid_asc": @@ -1226,20 +925,17 @@ class AuctionScraperService: query = query.order_by(DomainAuction.current_bid.desc()) elif sort_by == "bids": query = query.order_by(DomainAuction.num_bids.desc()) - - query = query.offset(offset).limit(limit) - - result = await db.execute(query) + + result = await db.execute(query.offset(offset).limit(limit)) return list(result.scalars().all()) - + async def get_auction_count(self, db: AsyncSession) -> int: """Get total count of active auctions.""" 
from sqlalchemy import func - result = await db.execute( - select(func.count(DomainAuction.id)).where(DomainAuction.is_active == True) - ) + + result = await db.execute(select(func.count(DomainAuction.id)).where(DomainAuction.is_active == True)) return result.scalar() or 0 - + async def close(self): """Close HTTP client.""" if self.http_client and not self.http_client.is_closed: @@ -1248,3 +944,5 @@ class AuctionScraperService: # Global instance auction_scraper = AuctionScraperService() + + diff --git a/backend/app/services/hidden_api_scrapers.py b/backend/app/services/hidden_api_scrapers.py index d129e9c..028ede3 100644 --- a/backend/app/services/hidden_api_scrapers.py +++ b/backend/app/services/hidden_api_scrapers.py @@ -1,250 +1,128 @@ """ -Hidden JSON API Scrapers for Domain Auction Platforms. +Hidden JSON API scrapers for auction platforms. -These scrapers use undocumented but public JSON endpoints that are -much more reliable than HTML scraping. +Important project rule: +- We do NOT generate mock/demo/estimated auction values. +- This module only includes sources that provide verifiable auction fields. -Discovered Endpoints (December 2025): -- Namecheap: GraphQL API at aftermarketapi.namecheap.com -- Dynadot: REST API at dynadot-vue-api -- Sav.com: AJAX endpoint for auction listings +Currently enabled: +- Dynadot hidden JSON API (used by their frontend) + +Affiliate links: +- Read from environment variables. If not configured, plain URLs are used. +- No placeholder affiliate IDs are baked into code. """ import logging +import os from datetime import datetime, timedelta -from typing import Dict, Any, List, Optional +from typing import Any, Dict, List, Optional import httpx logger = logging.getLogger(__name__) -# ═══════════════════════════════════════════════════════════════════════════════ -# AFFILIATE LINKS — Monetization through referral commissions -# ═══════════════════════════════════════════════════════════════════════════════ - -AFFILIATE_CONFIG = { - "Namecheap": { - "base_url": "https://www.namecheap.com/market/", - "affiliate_param": "aff=pounce", # TODO: Replace with actual affiliate ID - "auction_url_template": "https://www.namecheap.com/market/domain/{domain}?aff=pounce", - }, - "Dynadot": { - "base_url": "https://www.dynadot.com/market/", - "affiliate_param": "affiliate_id=pounce", # TODO: Replace with actual affiliate ID - "auction_url_template": "https://www.dynadot.com/market/auction/{domain}?affiliate_id=pounce", - }, - "Sav": { - "base_url": "https://www.sav.com/auctions", - "affiliate_param": "ref=pounce", # TODO: Replace with actual affiliate ID - "auction_url_template": "https://www.sav.com/domain/{domain}?ref=pounce", - }, - "GoDaddy": { - "base_url": "https://auctions.godaddy.com/", - "affiliate_param": "isc=cjcpounce", # TODO: Replace with actual CJ affiliate ID - "auction_url_template": "https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}&isc=cjcpounce", - }, - "DropCatch": { - "base_url": "https://www.dropcatch.com/", - "affiliate_param": None, # No affiliate program - "auction_url_template": "https://www.dropcatch.com/domain/{domain}", - }, - "Sedo": { - "base_url": "https://sedo.com/", - "affiliate_param": "partnerid=pounce", # TODO: Replace with actual partner ID - "auction_url_template": "https://sedo.com/search/details/?domain={domain}&partnerid=pounce", - }, - "NameJet": { - "base_url": "https://www.namejet.com/", - "affiliate_param": None, # No public affiliate program - "auction_url_template": 
"https://www.namejet.com/pages/Auctions/ViewAuctions.aspx?domain={domain}", - }, - "ExpiredDomains": { - "base_url": "https://www.expireddomains.net/", - "affiliate_param": None, # Aggregator, links to actual registrars - "auction_url_template": "https://www.expireddomains.net/domain-name-search/?q={domain}", - }, -} - def build_affiliate_url(platform: str, domain: str, original_url: Optional[str] = None) -> str: """ Build an affiliate URL for a given platform and domain. - - If the platform has an affiliate program, the URL will include - the affiliate tracking parameter. Otherwise, returns the original URL. + + If the affiliate program is not configured, returns the plain provider URL. + If `original_url` is provided, it is preferred (e.g. ExpiredDomains click-through links). """ - config = AFFILIATE_CONFIG.get(platform, {}) - - if config.get("auction_url_template"): - return config["auction_url_template"].format(domain=domain) - - return original_url or f"https://www.google.com/search?q={domain}+auction" + if original_url: + return original_url + p = (platform or "").strip() + d = (domain or "").strip().lower() -# ═══════════════════════════════════════════════════════════════════════════════ -# NAMECHEAP SCRAPER — GraphQL API -# ═══════════════════════════════════════════════════════════════════════════════ + if not d: + return "" -class NamecheapApiScraper: - """ - Scraper for Namecheap Marketplace using their hidden GraphQL API. - - Endpoint: https://aftermarketapi.namecheap.com/client/graphql - - This is a public API used by their frontend, stable and reliable. - """ - - GRAPHQL_ENDPOINT = "https://aftermarketapi.namecheap.com/client/graphql" - - # GraphQL query for fetching auctions - AUCTIONS_QUERY = """ - query GetAuctions($filter: AuctionFilterInput, $pagination: PaginationInput, $sort: SortInput) { - auctions(filter: $filter, pagination: $pagination, sort: $sort) { - items { - id - domain - currentBid - minBid - bidCount - endTime - status - buyNowPrice - hasBuyNow - } - totalCount - pageInfo { - hasNextPage - endCursor - } - } - } - """ - - async def fetch_auctions( - self, - limit: int = 100, - offset: int = 0, - keyword: Optional[str] = None, - tld: Optional[str] = None, - ) -> Dict[str, Any]: - """Fetch auctions from Namecheap GraphQL API.""" - try: - async with httpx.AsyncClient(timeout=30.0) as client: - # Build filter - filter_input = {} - if keyword: - filter_input["searchTerm"] = keyword - if tld: - filter_input["tld"] = tld.lstrip(".") - - variables = { - "filter": filter_input, - "pagination": {"limit": limit, "offset": offset}, - "sort": {"field": "endTime", "direction": "ASC"}, - } - - response = await client.post( - self.GRAPHQL_ENDPOINT, - json={ - "query": self.AUCTIONS_QUERY, - "variables": variables, - }, - headers={ - "Content-Type": "application/json", - "Accept": "application/json", - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", - "Origin": "https://www.namecheap.com", - "Referer": "https://www.namecheap.com/market/", - }, - ) - - if response.status_code != 200: - logger.error(f"Namecheap API error: {response.status_code}") - return {"items": [], "total": 0, "error": response.text} - - data = response.json() - - if "errors" in data: - logger.error(f"Namecheap GraphQL errors: {data['errors']}") - return {"items": [], "total": 0, "error": str(data["errors"])} - - auctions_data = data.get("data", {}).get("auctions", {}) - items = auctions_data.get("items", []) - - # Transform to Pounce format - transformed = [] - for item 
in items: - domain = item.get("domain", "") - tld_part = domain.rsplit(".", 1)[-1] if "." in domain else "" - - transformed.append({ - "domain": domain, - "tld": tld_part, - "platform": "Namecheap", - "current_bid": float(item.get("currentBid", 0)), - "min_bid": float(item.get("minBid", 0)), - "num_bids": int(item.get("bidCount", 0)), - "end_time": item.get("endTime"), - "buy_now_price": float(item.get("buyNowPrice")) if item.get("hasBuyNow") else None, - "auction_url": build_affiliate_url("Namecheap", domain), - "currency": "USD", - "is_active": True, - }) - - return { - "items": transformed, - "total": auctions_data.get("totalCount", 0), - "has_more": auctions_data.get("pageInfo", {}).get("hasNextPage", False), - } - - except Exception as e: - logger.exception(f"Namecheap API scraper error: {e}") - return {"items": [], "total": 0, "error": str(e)} + if p == "Dynadot": + base = f"https://www.dynadot.com/market/auction/{d}" + affiliate_id = os.getenv("DYNADOT_AFFILIATE_ID") + return f"{base}?affiliate_id={affiliate_id}" if affiliate_id else base + if p == "GoDaddy": + base = f"https://auctions.godaddy.com/trpItemListing.aspx?domain={d}" + isc = os.getenv("GODADDY_ISC") + return f"{base}&isc={isc}" if isc else base + + if p == "Namecheap": + base = f"https://www.namecheap.com/market/domain/{d}" + aff = os.getenv("NAMECHEAP_AFFILIATE_ID") + return f"{base}?aff={aff}" if aff else base + + if p == "Sedo": + base = f"https://sedo.com/search/details/?domain={d}" + partner = os.getenv("SEDO_PARTNER_ID") + return f"{base}&partnerid={partner}" if partner else base + + if p == "Park.io": + return f"https://park.io/domain/{d}" + + if p == "Sav": + ref = os.getenv("SAV_REF") + base = f"https://www.sav.com/domain/{d}" + return f"{base}?ref={ref}" if ref else base + + return "" -# ═══════════════════════════════════════════════════════════════════════════════ -# DYNADOT SCRAPER — REST JSON API -# ═══════════════════════════════════════════════════════════════════════════════ class DynadotApiScraper: """ Scraper for Dynadot Marketplace using their hidden JSON API. 
- - Endpoints: - - /dynadot-vue-api/dynadot-service/marketplace-api - - /dynadot-vue-api/dynadot-service/main-site-api - - Supports: - - EXPIRED_AUCTION: Expired auctions - - BACKORDER: Backorder listings - - USER_LISTING: User marketplace listings + + Endpoint: + - https://www.dynadot.com/dynadot-vue-api/dynadot-service/marketplace-api """ - + BASE_URL = "https://www.dynadot.com" MARKETPLACE_API = "/dynadot-vue-api/dynadot-service/marketplace-api" - + + def _parse_end_time(self, item: Dict[str, Any]) -> Optional[datetime]: + # Dynadot often provides an epoch timestamp in ms + end_time_stamp = item.get("end_time_stamp") + if isinstance(end_time_stamp, (int, float)) and end_time_stamp > 0: + try: + return datetime.utcfromtimestamp(end_time_stamp / 1000).replace(tzinfo=None) + except Exception: + pass + + # Or a string like "2025/12/12 08:00 PST" (timezone ambiguous) + end_time_str = item.get("end_time") or item.get("auction_end_time") + if isinstance(end_time_str, str) and end_time_str.strip(): + raw = end_time_str.strip() + raw = raw.replace(" PST", "").replace(" PDT", "").replace(" UTC", "") + for fmt in ("%Y/%m/%d %H:%M", "%Y-%m-%d %H:%M:%S"): + try: + return datetime.strptime(raw, fmt).replace(tzinfo=None) + except Exception: + continue + return None + async def fetch_auctions( self, - aftermarket_type: str = "EXPIRED_AUCTION", page_size: int = 100, page_index: int = 0, keyword: Optional[str] = None, ) -> Dict[str, Any]: - """Fetch auctions from Dynadot REST API.""" + """Fetch auctions from Dynadot hidden API.""" try: - async with httpx.AsyncClient(timeout=30.0) as client: + proxy = os.getenv("SCRAPER_HTTP_PROXY") or os.getenv("SCRAPER_PROXY_URL") + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True, proxy=proxy) as client: params = { "command": "get_list", - "aftermarket_type": aftermarket_type, + "aftermarket_type": "EXPIRED_AUCTION", "page_size": page_size, "page_index": page_index, "lang": "en", } - if keyword: params["keyword"] = keyword - - response = await client.post( + + resp = await client.post( f"{self.BASE_URL}{self.MARKETPLACE_API}", params=params, headers={ @@ -253,1049 +131,79 @@ class DynadotApiScraper: "Referer": "https://www.dynadot.com/market", }, ) - - if response.status_code != 200: - logger.error(f"Dynadot API error: {response.status_code}") - return {"items": [], "total": 0, "error": response.text} - - data = response.json() - - # Dynadot returns code: 200 for success - if data.get("code") not in [0, 200] and data.get("msg") != "success": - logger.error(f"Dynadot API error: {data}") - return {"items": [], "total": 0, "error": str(data)} - - # Data can be in 'records' or 'list' + + if resp.status_code != 200: + return {"items": [], "total": 0, "error": f"HTTP {resp.status_code}: {resp.text[:200]}"} + + data = resp.json() listings = data.get("data", {}).get("records", []) or data.get("data", {}).get("list", []) - - # Transform to Pounce format - transformed = [] + + transformed: List[Dict[str, Any]] = [] for item in listings: - domain = item.get("domain", "") or item.get("name", "") or item.get("utf8_name", "") - tld_part = domain.rsplit(".", 1)[-1] if "." 
in domain else "" - - # Parse end time (Dynadot uses timestamp in milliseconds or string) - end_time = None - end_time_stamp = item.get("end_time_stamp") - if end_time_stamp: - try: - end_time = datetime.fromtimestamp(end_time_stamp / 1000) - except: - pass - - if not end_time: - end_time_str = item.get("end_time") or item.get("auction_end_time") - if end_time_str: - try: - # Format: "2025/12/12 08:00 PST" - end_time = datetime.strptime(end_time_str.split(" PST")[0], "%Y/%m/%d %H:%M") - except: - end_time = datetime.utcnow() + timedelta(days=1) - - # Parse bid price (can be string or number) + domain = item.get("domain") or item.get("name") or item.get("utf8_name") or "" + domain = str(domain).strip().lower() + if not domain or "." not in domain: + continue + + end_time = self._parse_end_time(item) + if end_time is None: + # No end time -> skip (no fake) + continue + bid_price = item.get("bid_price") or item.get("current_bid") or item.get("price") or 0 if isinstance(bid_price, str): - bid_price = float(bid_price.replace(",", "").replace("$", "")) - - transformed.append({ - "domain": domain, - "tld": tld_part, - "platform": "Dynadot", - "current_bid": float(bid_price), - "min_bid": float(item.get("start_price", 0) or 0), - "num_bids": int(item.get("bids", 0) or item.get("bid_count", 0) or 0), - "end_time": end_time or datetime.utcnow() + timedelta(days=1), - "buy_now_price": float(item.get("accepted_bid_price")) if item.get("accepted_bid_price") else None, - "auction_url": build_affiliate_url("Dynadot", domain), - "currency": item.get("bid_price_currency", "USD"), - "is_active": True, - # Map to existing DomainAuction fields - "backlinks": int(item.get("links", 0) or 0), - "age_years": int(item.get("age", 0) or 0), - }) - + bid_price = bid_price.replace(",", "").replace("$", "").strip() + try: + current_bid = float(bid_price) + except Exception: + continue + if current_bid <= 0: + continue + + bids = item.get("bids") or item.get("bid_count") or 0 + try: + num_bids = int(bids) + except Exception: + num_bids = 0 + + tld = domain.rsplit(".", 1)[-1].lower() + + transformed.append( + { + "domain": domain, + "tld": tld, + "platform": "Dynadot", + "current_bid": current_bid, + "currency": str(item.get("bid_price_currency") or "USD").upper(), + "num_bids": num_bids, + "end_time": end_time, + "auction_url": build_affiliate_url("Dynadot", domain), + "buy_now_price": float(item.get("accepted_bid_price")) if item.get("accepted_bid_price") else None, + "age_years": int(item.get("age", 0) or 0) or None, + "backlinks": int(item.get("links", 0) or 0) or None, + "scrape_source": "dynadot:hidden_api", + } + ) + return { "items": transformed, "total": data.get("data", {}).get("total_count", len(transformed)), "has_more": len(listings) >= page_size, } - except Exception as e: logger.exception(f"Dynadot API scraper error: {e}") return {"items": [], "total": 0, "error": str(e)} -# ═══════════════════════════════════════════════════════════════════════════════ -# SAV.COM SCRAPER — AJAX JSON API -# ═══════════════════════════════════════════════════════════════════════════════ - -class SavApiScraper: - """ - Scraper for Sav.com Auctions using their hidden AJAX endpoint. - - Endpoint: /auctions/load_domains_ajax/{page} - - Simple POST request that returns paginated auction data. 
- """ - - BASE_URL = "https://www.sav.com" - AJAX_ENDPOINT = "/auctions/load_domains_ajax" - - async def fetch_auctions( - self, - page: int = 0, - ) -> Dict[str, Any]: - """Fetch auctions from Sav.com AJAX API.""" - try: - async with httpx.AsyncClient(timeout=30.0) as client: - response = await client.post( - f"{self.BASE_URL}{self.AJAX_ENDPOINT}/{page}", - headers={ - "Accept": "application/json, text/html", - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", - "Referer": "https://www.sav.com/domains/auctions", - "X-Requested-With": "XMLHttpRequest", - }, - ) - - if response.status_code != 200: - logger.error(f"Sav API error: {response.status_code}") - return {"items": [], "total": 0, "error": response.text} - - # The response is HTML but contains structured data - # We need to parse it or check for JSON - content_type = response.headers.get("content-type", "") - - if "application/json" in content_type: - data = response.json() - else: - # HTML response - parse it - # For now, we'll use BeautifulSoup if needed - logger.warning("Sav returned HTML instead of JSON, parsing...") - return await self._parse_html_response(response.text) - - listings = data.get("domains", data.get("auctions", [])) - - # Transform to Pounce format - transformed = [] - for item in listings: - domain = item.get("domain", "") or item.get("name", "") - tld_part = domain.rsplit(".", 1)[-1] if "." in domain else "" - - # Parse end time - end_time_str = item.get("end_time") or item.get("ends_at") - end_time = None - if end_time_str: - try: - end_time = datetime.fromisoformat(end_time_str.replace("Z", "+00:00")) - except: - end_time = datetime.utcnow() + timedelta(days=1) - - transformed.append({ - "domain": domain, - "tld": tld_part, - "platform": "Sav", - "current_bid": float(item.get("current_bid", 0) or item.get("price", 0)), - "min_bid": float(item.get("min_bid", 0) or 0), - "num_bids": int(item.get("bids", 0) or 0), - "end_time": end_time, - "buy_now_price": float(item.get("buy_now")) if item.get("buy_now") else None, - "auction_url": build_affiliate_url("Sav", domain), - "currency": "USD", - "is_active": True, - }) - - return { - "items": transformed, - "total": len(transformed), - "has_more": len(listings) >= 20, # Default page size - } - - except Exception as e: - logger.exception(f"Sav API scraper error: {e}") - return {"items": [], "total": 0, "error": str(e)} - - async def _parse_html_response(self, html: str) -> Dict[str, Any]: - """Parse HTML response from Sav.com when JSON is not available.""" - try: - from bs4 import BeautifulSoup - - soup = BeautifulSoup(html, "html.parser") - - # Find auction rows - rows = soup.select(".auction-row, .domain-row, tr[data-domain]") - - transformed = [] - for row in rows: - domain_el = row.select_one(".domain-name, .name, [data-domain]") - price_el = row.select_one(".price, .bid, .current-bid") - time_el = row.select_one(".time-left, .ends, .countdown") - bids_el = row.select_one(".bids, .bid-count") - - if not domain_el: - continue - - domain = domain_el.get_text(strip=True) or domain_el.get("data-domain", "") - tld_part = domain.rsplit(".", 1)[-1] if "." 
in domain else "" - - price_text = price_el.get_text(strip=True) if price_el else "0" - price = float("".join(c for c in price_text if c.isdigit() or c == ".") or "0") - - bids_text = bids_el.get_text(strip=True) if bids_el else "0" - bids = int("".join(c for c in bids_text if c.isdigit()) or "0") - - transformed.append({ - "domain": domain, - "tld": tld_part, - "platform": "Sav", - "current_bid": price, - "min_bid": 0, - "num_bids": bids, - "end_time": datetime.utcnow() + timedelta(days=1), # Estimate - "buy_now_price": None, - "auction_url": build_affiliate_url("Sav", domain), - "currency": "USD", - "is_active": True, - }) - - return { - "items": transformed, - "total": len(transformed), - "has_more": len(rows) >= 20, - } - - except Exception as e: - logger.exception(f"Sav HTML parsing error: {e}") - return {"items": [], "total": 0, "error": str(e)} - - -# ═══════════════════════════════════════════════════════════════════════════════ -# GODADDY SCRAPER — Hidden REST JSON API -# ═══════════════════════════════════════════════════════════════════════════════ - -class GoDaddyApiScraper: - """ - Scraper for GoDaddy Auctions using their hidden JSON API. - - Discovered Endpoint: - https://auctions.godaddy.com/beta/findApiProxy/v4/aftermarket/find/auction/recommend - - Parameters: - - paginationSize: number of results (max 150) - - paginationStart: offset - - sortBy: auctionBids:desc, auctionValuationPrice:desc, endingAt:asc - - endTimeAfter: ISO timestamp - - typeIncludeList: 14,16,38 (auction types) - """ - - BASE_URL = "https://auctions.godaddy.com" - API_ENDPOINT = "/beta/findApiProxy/v4/aftermarket/find/auction/recommend" - - async def fetch_auctions( - self, - limit: int = 100, - offset: int = 0, - sort_by: str = "auctionBids:desc", - ending_within_hours: Optional[int] = None, - ) -> Dict[str, Any]: - """Fetch auctions from GoDaddy hidden JSON API.""" - try: - async with httpx.AsyncClient(timeout=30.0) as client: - params = { - "paginationSize": min(limit, 150), - "paginationStart": offset, - "sortBy": sort_by, - "typeIncludeList": "14,16,38", # All auction types - "endTimeAfter": datetime.utcnow().isoformat() + "Z", - } - - if ending_within_hours: - end_before = (datetime.utcnow() + timedelta(hours=ending_within_hours)).isoformat() + "Z" - params["endTimeBefore"] = end_before - - response = await client.get( - f"{self.BASE_URL}{self.API_ENDPOINT}", - params=params, - headers={ - "Accept": "application/json", - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", - "Referer": "https://auctions.godaddy.com/beta", - }, - ) - - if response.status_code != 200: - logger.error(f"GoDaddy API error: {response.status_code}") - return {"items": [], "total": 0, "error": response.text} - - data = response.json() - - # GoDaddy returns listings in 'results' array - listings = data.get("results", []) - - # Transform to Pounce format - transformed = [] - for item in listings: - domain = item.get("fqdn", "") or item.get("domain", "") - tld_part = domain.rsplit(".", 1)[-1] if "." 
in domain else "" - - # Parse end time - end_time = None - end_at = item.get("endingAt") or item.get("auctionEndTime") - if end_at: - try: - end_time = datetime.fromisoformat(end_at.replace("Z", "+00:00")).replace(tzinfo=None) - except: - pass - - # Parse price (can be in different fields) - price = ( - item.get("price") or - item.get("currentBidPrice") or - item.get("auctionPrice") or - item.get("minBid") or 0 - ) - - transformed.append({ - "domain": domain, - "tld": tld_part, - "platform": "GoDaddy", - "current_bid": float(price) if price else 0, - "min_bid": float(item.get("minBid", 0) or 0), - "num_bids": int(item.get("bids", 0) or item.get("bidCount", 0) or 0), - "end_time": end_time or datetime.utcnow() + timedelta(days=1), - "buy_now_price": float(item.get("buyNowPrice")) if item.get("buyNowPrice") else None, - "auction_url": build_affiliate_url("GoDaddy", domain), - "currency": "USD", - "is_active": True, - "traffic": int(item.get("traffic", 0) or 0), - "domain_authority": int(item.get("domainAuthority", 0) or item.get("valuationPrice", 0) or 0), - }) - - return { - "items": transformed, - "total": data.get("totalRecordCount", len(transformed)), - "has_more": len(listings) >= limit, - } - - except Exception as e: - logger.exception(f"GoDaddy API scraper error: {e}") - return {"items": [], "total": 0, "error": str(e)} - - -# ═══════════════════════════════════════════════════════════════════════════════ -# GODADDY RSS SCRAPER — Public RSS Feed (NO Cloudflare!) -# ═══════════════════════════════════════════════════════════════════════════════ - -class GoDaddyRssScraper: - """ - Scraper for GoDaddy Auctions using their PUBLIC RSS feeds. - - These RSS feeds are NOT protected by Cloudflare and always work! - - Feeds: - - https://auctions.godaddy.com/rss/ending.aspx (Ending Soon) - - https://auctions.godaddy.com/rss/new.aspx (New Auctions) - - https://auctions.godaddy.com/rss/closeouts.aspx (Closeouts) - """ - - RSS_FEEDS = { - "ending": "https://auctions.godaddy.com/rss/ending.aspx", - "new": "https://auctions.godaddy.com/rss/new.aspx", - "closeouts": "https://auctions.godaddy.com/rss/closeouts.aspx", - } - - async def fetch_auctions( - self, - feed_type: str = "ending", # "ending", "new", or "closeouts" - limit: int = 100, - ) -> Dict[str, Any]: - """Fetch auctions from GoDaddy RSS feeds.""" - try: - import xml.etree.ElementTree as ET - - feed_url = self.RSS_FEEDS.get(feed_type, self.RSS_FEEDS["ending"]) - - async with httpx.AsyncClient(timeout=30.0) as client: - response = await client.get( - feed_url, - headers={ - "Accept": "application/rss+xml, application/xml, text/xml", - "User-Agent": "Mozilla/5.0 (compatible; PounceBot/1.0; +https://pounce.ch)", - }, - ) - - if response.status_code != 200: - logger.error(f"GoDaddy RSS error: {response.status_code}") - return {"items": [], "total": 0, "error": f"HTTP {response.status_code}"} - - # Parse RSS XML - root = ET.fromstring(response.text) - - # Find all items in the RSS feed - items = root.findall(".//item") - - transformed = [] - for item in items[:limit]: - try: - title = item.find("title").text if item.find("title") is not None else "" - link = item.find("link").text if item.find("link") is not None else "" - description = item.find("description").text if item.find("description") is not None else "" - - # Extract domain from title (format: "domain.com - $XX") - domain = "" - price = 0 - - if title: - # Title format: "example.com - $12" or "example.com" - parts = title.split(" - ") - domain = parts[0].strip().lower() - - if 
len(parts) > 1: - price_str = parts[1].replace("$", "").replace(",", "").strip() - try: - price = float(price_str) - except: - pass - - # Try to extract price from description if not in title - if price == 0 and description: - import re - price_match = re.search(r'\$([0-9,]+(?:\.[0-9]+)?)', description) - if price_match: - price = float(price_match.group(1).replace(",", "")) - - if not domain or "." not in domain: - continue - - tld = domain.rsplit(".", 1)[-1] - - # Add affiliate param to link - affiliate_url = link - if link and "?" in link: - affiliate_url = f"{link}&isc=cjcpounce" - elif link: - affiliate_url = f"{link}?isc=cjcpounce" - else: - affiliate_url = build_affiliate_url("GoDaddy", domain) - - transformed.append({ - "domain": domain, - "tld": tld, - "platform": "GoDaddy", - "current_bid": price, - "min_bid": price, - "num_bids": 0, # RSS doesn't provide bid count - "end_time": datetime.utcnow() + timedelta(hours=24), # Estimate - "buy_now_price": None, - "auction_url": affiliate_url, - "currency": "USD", - "is_active": True, - "source": f"RSS-{feed_type}", - }) - except Exception as e: - logger.warning(f"Error parsing GoDaddy RSS item: {e}") - continue - - logger.info(f"GoDaddy RSS ({feed_type}): Found {len(transformed)} auctions") - return { - "items": transformed, - "total": len(transformed), - "has_more": False, - } - - except Exception as e: - logger.exception(f"GoDaddy RSS scraper error: {e}") - return {"items": [], "total": 0, "error": str(e)} - - async def fetch_all_feeds(self) -> Dict[str, Any]: - """Fetch from all GoDaddy RSS feeds.""" - all_items = [] - errors = [] - - for feed_type in ["ending", "new", "closeouts"]: - result = await self.fetch_auctions(feed_type=feed_type, limit=50) - all_items.extend(result.get("items", [])) - if result.get("error"): - errors.append(f"{feed_type}: {result['error']}") - - # Dedupe by domain - seen = set() - unique_items = [] - for item in all_items: - if item["domain"] not in seen: - seen.add(item["domain"]) - unique_items.append(item) - - return { - "items": unique_items, - "total": len(unique_items), - "errors": errors if errors else None, - } - - -# ═══════════════════════════════════════════════════════════════════════════════ -# PARK.IO SCRAPER — HTML Scraping (API is private) -# ═══════════════════════════════════════════════════════════════════════════════ - -class ParkIoApiScraper: - """ - Scraper for Park.io domain backorders via HTML scraping. - - Park.io specializes in catching expiring .io, .gg, .me domains. - Their API is private, so we scrape the public auction pages. 
- """ - - BASE_URL = "https://park.io" - - async def fetch_pending_drops( - self, - limit: int = 100, - tld: Optional[str] = None, - ) -> Dict[str, Any]: - """Fetch pending domain drops from Park.io via HTML scraping.""" - try: - from bs4 import BeautifulSoup - - async with httpx.AsyncClient(timeout=30.0) as client: - # Scrape the auctions page - pages_to_try = [ - f"{self.BASE_URL}/auctions", - f"{self.BASE_URL}/domains", - f"{self.BASE_URL}/premium-domains", - ] - - transformed = [] - - for page_url in pages_to_try: - try: - response = await client.get( - page_url, - headers={ - "Accept": "text/html,application/xhtml+xml", - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", - }, - ) - - if response.status_code != 200: - continue - - soup = BeautifulSoup(response.text, "html.parser") - - # Try various selectors for domain listings - selectors = [ - ".domain-item", - ".auction-item", - "tr.domain-row", - "[data-domain]", - ".domain-listing", - ] - - for selector in selectors: - items = soup.select(selector) - if items: - for item in items[:limit]: - try: - # Extract domain - domain_el = item.select_one(".domain-name, .name, a[href*='domain']") - if domain_el: - domain = domain_el.get_text(strip=True).lower() - else: - domain = item.get("data-domain", "") - - if not domain or "." not in domain: - continue - - tld_part = domain.rsplit(".", 1)[-1] - - # Filter by TLD if specified - if tld and tld_part != tld.lstrip("."): - continue - - # Extract price - price = 99 # Park.io standard price - price_el = item.select_one(".price, .amount") - if price_el: - price_text = price_el.get_text() - import re - price_match = re.search(r'\$?(\d+)', price_text) - if price_match: - price = int(price_match.group(1)) - - transformed.append({ - "domain": domain, - "tld": tld_part, - "platform": "Park.io", - "current_bid": float(price), - "min_bid": float(price), - "num_bids": 0, - "end_time": datetime.utcnow() + timedelta(days=7), - "buy_now_price": float(price), - "auction_url": f"{self.BASE_URL}/domain/{domain}", - "currency": "USD", - "is_active": True, - "auction_type": "backorder", - }) - except Exception as e: - logger.debug(f"Error parsing Park.io item: {e}") - continue - - if transformed: - break # Found items, stop trying selectors - - except Exception as e: - logger.debug(f"Error fetching {page_url}: {e}") - continue - - if transformed: - logger.info(f"✅ Park.io: Found {len(transformed)} domains") - - return { - "items": transformed, - "total": len(transformed), - "has_more": False, - } - - except Exception as e: - logger.exception(f"Park.io scraper error: {e}") - return {"items": [], "total": 0, "error": str(e)} - - -# ═══════════════════════════════════════════════════════════════════════════════ -# SNAPNAMES SCRAPER — Public Auction Listings -# ═══════════════════════════════════════════════════════════════════════════════ - -class SnapNamesApiScraper: - """ - Scraper for SnapNames domain auctions. - - SnapNames is one of the largest domain auction platforms. - They have a public auction page that we can scrape. 
- """ - - BASE_URL = "https://www.snapnames.com" - - async def fetch_auctions( - self, - limit: int = 100, - ) -> Dict[str, Any]: - """Fetch auctions from SnapNames.""" - try: - from bs4 import BeautifulSoup - - async with httpx.AsyncClient(timeout=30.0) as client: - # Try their public auction search - response = await client.get( - f"{self.BASE_URL}/names/search", - params={ - "type": "auction", - "sort": "end_date", - "order": "asc", - }, - headers={ - "Accept": "text/html,application/xhtml+xml", - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", - }, - ) - - if response.status_code != 200: - return {"items": [], "total": 0, "error": f"HTTP {response.status_code}"} - - soup = BeautifulSoup(response.text, "html.parser") - - # Find auction rows - rows = soup.select("tr.auction-row, .domain-row, [data-auction-id]") - - transformed = [] - for row in rows[:limit]: - try: - # Extract domain - domain_el = row.select_one(".domain-name, .name, a[href*='auction']") - if not domain_el: - continue - - domain = domain_el.get_text(strip=True).lower() - if not domain or "." not in domain: - continue - - tld = domain.rsplit(".", 1)[-1] - - # Extract price - price = 69 # SnapNames minimum - price_el = row.select_one(".price, .bid, .current-bid") - if price_el: - price_text = price_el.get_text() - import re - price_match = re.search(r'\$?(\d+(?:,\d+)?)', price_text) - if price_match: - price = int(price_match.group(1).replace(",", "")) - - # Extract bids - bids = 0 - bids_el = row.select_one(".bids, .bid-count") - if bids_el: - bids_text = bids_el.get_text() - import re - bids_match = re.search(r'(\d+)', bids_text) - if bids_match: - bids = int(bids_match.group(1)) - - transformed.append({ - "domain": domain, - "tld": tld, - "platform": "SnapNames", - "current_bid": float(price), - "min_bid": float(price), - "num_bids": bids, - "end_time": datetime.utcnow() + timedelta(days=1), - "buy_now_price": None, - "auction_url": f"{self.BASE_URL}/names/domain/{domain}", - "currency": "USD", - "is_active": True, - }) - except Exception as e: - logger.debug(f"Error parsing SnapNames row: {e}") - continue - - if transformed: - logger.info(f"✅ SnapNames: Found {len(transformed)} auctions") - - return { - "items": transformed, - "total": len(transformed), - "has_more": len(transformed) >= limit, - } - - except Exception as e: - logger.exception(f"SnapNames scraper error: {e}") - return {"items": [], "total": 0, "error": str(e)} - - -# Legacy ParkIo class for backwards compatibility -class ParkIoApiScraperLegacy: - """Legacy API scraper - kept for reference.""" - - BASE_URL = "https://park.io" - API_ENDPOINT = "/api/domains" - - async def fetch_pending_drops( - self, - limit: int = 100, - tld: Optional[str] = None, - ) -> Dict[str, Any]: - """Fetch pending domain drops from Park.io (legacy API).""" - try: - async with httpx.AsyncClient(timeout=30.0) as client: - params = { - "limit": limit, - "status": "pending", - } - - if tld: - params["tld"] = tld.lstrip(".") - - response = await client.get( - f"{self.BASE_URL}{self.API_ENDPOINT}", - params=params, - headers={ - "Accept": "application/json", - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", - }, - ) - - if response.status_code != 200: - logger.error(f"Park.io API error: {response.status_code}") - return {"items": [], "total": 0, "error": response.text} - - data = response.json() - domains = data.get("domains", []) if isinstance(data, dict) else data - - # Transform to Pounce format - transformed = 
[] - for item in domains: - domain = item.get("domain", "") or item.get("name", "") - tld_part = domain.rsplit(".", 1)[-1] if "." in domain else "" - - # Parse drop date - drop_date = None - drop_at = item.get("drop_date") or item.get("expires_at") - if drop_at: - try: - drop_date = datetime.fromisoformat(drop_at.replace("Z", "+00:00")).replace(tzinfo=None) - except: - drop_date = datetime.utcnow() + timedelta(days=1) - - transformed.append({ - "domain": domain, - "tld": tld_part, - "platform": "Park.io", - "current_bid": float(item.get("price", 99)), # Park.io default price - "min_bid": float(item.get("min_price", 99)), - "num_bids": int(item.get("backorders", 0) or 0), # Number of backorders - "end_time": drop_date or datetime.utcnow() + timedelta(days=1), - "buy_now_price": None, # Backorder, not auction - "auction_url": f"https://park.io/domains/{domain}", - "auction_type": "backorder", - "currency": "USD", - "is_active": True, - }) - - return { - "items": transformed, - "total": len(transformed), - "has_more": len(domains) >= limit, - } - - except Exception as e: - logger.exception(f"Park.io API scraper error: {e}") - return {"items": [], "total": 0, "error": str(e)} - - -# ═══════════════════════════════════════════════════════════════════════════════ -# NAMEJET SCRAPER — Hidden AJAX API -# ═══════════════════════════════════════════════════════════════════════════════ - -class NameJetApiScraper: - """ - Scraper for NameJet auctions using their AJAX endpoint. - - NameJet is owned by GoDaddy but operates independently. - Uses a hidden AJAX endpoint for loading auction data. - """ - - BASE_URL = "https://www.namejet.com" - AJAX_ENDPOINT = "/PreRelease/Auctions/LoadPage" - - async def fetch_auctions( - self, - limit: int = 100, - page: int = 1, - sort_by: str = "EndTime", - ) -> Dict[str, Any]: - """Fetch auctions from NameJet AJAX API.""" - try: - async with httpx.AsyncClient(timeout=30.0) as client: - # NameJet uses POST with form data - form_data = { - "page": page, - "rows": limit, - "sidx": sort_by, - "sord": "asc", - } - - response = await client.post( - f"{self.BASE_URL}{self.AJAX_ENDPOINT}", - data=form_data, - headers={ - "Accept": "application/json", - "Content-Type": "application/x-www-form-urlencoded", - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", - "Referer": "https://www.namejet.com/PreRelease/Auctions", - "X-Requested-With": "XMLHttpRequest", - }, - ) - - if response.status_code != 200: - logger.error(f"NameJet API error: {response.status_code}") - return {"items": [], "total": 0, "error": response.text} - - # Try JSON first, fall back to HTML parsing - try: - data = response.json() - except: - return await self._parse_html_response(response.text) - - # NameJet returns 'rows' array with auction data - rows = data.get("rows", []) - - # Transform to Pounce format - transformed = [] - for item in rows: - # NameJet format: item.cell contains [domain, endTime, price, bids, ...] - cell = item.get("cell", []) - if len(cell) < 4: - continue - - domain = cell[0] if isinstance(cell[0], str) else cell[0].get("domain", "") - tld_part = domain.rsplit(".", 1)[-1] if "." 
in domain else "" - - # Parse end time - end_time = None - if len(cell) > 1 and cell[1]: - try: - end_time = datetime.strptime(cell[1], "%m/%d/%Y %H:%M:%S") - except: - try: - end_time = datetime.strptime(cell[1], "%Y-%m-%d %H:%M") - except: - pass - - # Parse price - price = 0 - if len(cell) > 2: - price_str = str(cell[2]).replace("$", "").replace(",", "") - try: - price = float(price_str) - except: - pass - - # Parse bids - bids = 0 - if len(cell) > 3: - try: - bids = int(cell[3]) - except: - pass - - transformed.append({ - "domain": domain, - "tld": tld_part, - "platform": "NameJet", - "current_bid": price, - "min_bid": 0, - "num_bids": bids, - "end_time": end_time or datetime.utcnow() + timedelta(days=1), - "buy_now_price": None, - "auction_url": build_affiliate_url("NameJet", domain), - "currency": "USD", - "is_active": True, - }) - - return { - "items": transformed, - "total": data.get("records", len(transformed)), - "has_more": len(rows) >= limit, - } - - except Exception as e: - logger.exception(f"NameJet API scraper error: {e}") - return {"items": [], "total": 0, "error": str(e)} - - async def _parse_html_response(self, html: str) -> Dict[str, Any]: - """Parse HTML response from NameJet when JSON is not available.""" - try: - from bs4 import BeautifulSoup - - soup = BeautifulSoup(html, "html.parser") - rows = soup.select("tr[data-domain], .auction-row") - - transformed = [] - for row in rows: - domain_el = row.select_one("td:first-child, .domain") - if not domain_el: - continue - - domain = domain_el.get_text(strip=True) - tld_part = domain.rsplit(".", 1)[-1] if "." in domain else "" - - transformed.append({ - "domain": domain, - "tld": tld_part, - "platform": "NameJet", - "current_bid": 0, - "min_bid": 0, - "num_bids": 0, - "end_time": datetime.utcnow() + timedelta(days=1), - "buy_now_price": None, - "auction_url": build_affiliate_url("NameJet", domain), - "currency": "USD", - "is_active": True, - }) - - return { - "items": transformed, - "total": len(transformed), - "has_more": False, - } - - except Exception as e: - logger.exception(f"NameJet HTML parsing error: {e}") - return {"items": [], "total": 0, "error": str(e)} - - -# ═══════════════════════════════════════════════════════════════════════════════ -# UNIFIED SCRAPER — Combines all hidden API scrapers -# ═══════════════════════════════════════════════════════════════════════════════ - class HiddenApiScraperService: - """ - Unified service that combines all hidden API scrapers. - - Priority order: - 1. GoDaddy JSON API (most reliable, 150 auctions/request) - 2. Dynadot REST API (100 auctions/request) - 3. NameJet AJAX (requires parsing) - 4. Park.io (backorders) - 5. Namecheap GraphQL (requires query hash - may fail) - 6. Sav.com AJAX (HTML fallback) - - All URLs include affiliate tracking for monetization. - """ - + """Orchestrates enabled hidden API scrapers.""" + def __init__(self): - self.namecheap = NamecheapApiScraper() self.dynadot = DynadotApiScraper() - self.sav = SavApiScraper() - self.godaddy = GoDaddyApiScraper() - self.godaddy_rss = GoDaddyRssScraper() # RSS fallback (NO Cloudflare!) - self.parkio = ParkIoApiScraper() - self.namejet = NameJetApiScraper() - self.snapnames = SnapNamesApiScraper() # NEW: SnapNames auctions - + async def scrape_all(self, limit_per_platform: int = 100) -> Dict[str, Any]: - """ - Scrape all platforms using hidden APIs. - - Returns combined results with platform breakdown. 
- """ - results = { - "total_found": 0, - "platforms": {}, - "errors": [], - "items": [], - } - - # ═══════════════════════════════════════════════════════════ - # TIER 0: RSS Feeds (Most Reliable - NO Cloudflare!) - # ═══════════════════════════════════════════════════════════ - - # Scrape GoDaddy RSS (Always works!) - try: - rss_data = await self.godaddy_rss.fetch_all_feeds() - rss_count = len(rss_data.get("items", [])) - if rss_count > 0: - results["platforms"]["GoDaddy-RSS"] = { - "found": rss_count, - "total": rss_count, - } - results["items"].extend(rss_data.get("items", [])) - results["total_found"] += rss_count - logger.info(f"✅ GoDaddy RSS: {rss_count} auctions") - except Exception as e: - results["errors"].append(f"GoDaddy-RSS: {str(e)}") - - # ═══════════════════════════════════════════════════════════ - # TIER 1: Most Reliable JSON APIs - # ═══════════════════════════════════════════════════════════ - - # Scrape GoDaddy JSON API (may have Cloudflare issues) - try: - godaddy_data = await self.godaddy.fetch_auctions(limit=limit_per_platform) - godaddy_count = len(godaddy_data.get("items", [])) - if godaddy_count > 0: - results["platforms"]["GoDaddy-API"] = { - "found": godaddy_count, - "total": godaddy_data.get("total", 0), - } - results["items"].extend(godaddy_data.get("items", [])) - results["total_found"] += godaddy_count - - if godaddy_data.get("error"): - results["errors"].append(f"GoDaddy-API: {godaddy_data['error'][:100]}") - - except Exception as e: - results["errors"].append(f"GoDaddy-API: {str(e)[:100]}") - - # Scrape Dynadot + results: Dict[str, Any] = {"total_found": 0, "platforms": {}, "errors": [], "items": []} + try: dynadot_data = await self.dynadot.fetch_auctions(page_size=limit_per_platform) results["platforms"]["Dynadot"] = { @@ -1304,138 +212,17 @@ class HiddenApiScraperService: } results["items"].extend(dynadot_data.get("items", [])) results["total_found"] += len(dynadot_data.get("items", [])) - + if dynadot_data.get("error"): results["errors"].append(f"Dynadot: {dynadot_data['error']}") - except Exception as e: results["errors"].append(f"Dynadot: {str(e)}") - - # ═══════════════════════════════════════════════════════════ - # TIER 2: AJAX/HTML Scrapers - # ═══════════════════════════════════════════════════════════ - - # Scrape NameJet (NEW) - try: - namejet_data = await self.namejet.fetch_auctions(limit=limit_per_platform) - results["platforms"]["NameJet"] = { - "found": len(namejet_data.get("items", [])), - "total": namejet_data.get("total", 0), - } - results["items"].extend(namejet_data.get("items", [])) - results["total_found"] += len(namejet_data.get("items", [])) - - if namejet_data.get("error"): - results["errors"].append(f"NameJet: {namejet_data['error']}") - - except Exception as e: - results["errors"].append(f"NameJet: {str(e)}") - - # Scrape Park.io (Backorders - NEW) - try: - parkio_data = await self.parkio.fetch_pending_drops(limit=limit_per_platform) - results["platforms"]["Park.io"] = { - "found": len(parkio_data.get("items", [])), - "total": parkio_data.get("total", 0), - } - results["items"].extend(parkio_data.get("items", [])) - results["total_found"] += len(parkio_data.get("items", [])) - - if parkio_data.get("error"): - results["errors"].append(f"Park.io: {parkio_data['error']}") - - except Exception as e: - results["errors"].append(f"Park.io: {str(e)}") - - # Scrape Sav.com - try: - sav_data = await self.sav.fetch_auctions(page=0) - results["platforms"]["Sav"] = { - "found": len(sav_data.get("items", [])), - "total": 
sav_data.get("total", 0), - } - results["items"].extend(sav_data.get("items", [])) - results["total_found"] += len(sav_data.get("items", [])) - - if sav_data.get("error"): - results["errors"].append(f"Sav: {sav_data['error']}") - - except Exception as e: - results["errors"].append(f"Sav: {str(e)}") - - # ═══════════════════════════════════════════════════════════ - # TIER 2.5: Additional Platforms (HTML Scraping) - # ═══════════════════════════════════════════════════════════ - - # Scrape SnapNames (NEW) - try: - snapnames_data = await self.snapnames.fetch_auctions(limit=limit_per_platform) - snapnames_count = len(snapnames_data.get("items", [])) - if snapnames_count > 0: - results["platforms"]["SnapNames"] = { - "found": snapnames_count, - "total": snapnames_data.get("total", 0), - } - results["items"].extend(snapnames_data.get("items", [])) - results["total_found"] += snapnames_count - - if snapnames_data.get("error"): - results["errors"].append(f"SnapNames: {snapnames_data['error'][:100]}") - - except Exception as e: - results["errors"].append(f"SnapNames: {str(e)[:100]}") - - # Scrape Park.io (HTML scraping) - try: - parkio_data = await self.parkio.fetch_pending_drops(limit=limit_per_platform) - parkio_count = len(parkio_data.get("items", [])) - if parkio_count > 0: - results["platforms"]["Park.io"] = { - "found": parkio_count, - "total": parkio_data.get("total", 0), - } - results["items"].extend(parkio_data.get("items", [])) - results["total_found"] += parkio_count - - if parkio_data.get("error"): - results["errors"].append(f"Park.io: {parkio_data['error'][:100]}") - - except Exception as e: - results["errors"].append(f"Park.io: {str(e)[:100]}") - - # ═══════════════════════════════════════════════════════════ - # TIER 3: Experimental (May require fixes) - # ═══════════════════════════════════════════════════════════ - - # Scrape Namecheap (GraphQL - needs query hash) - try: - namecheap_data = await self.namecheap.fetch_auctions(limit=limit_per_platform) - namecheap_count = len(namecheap_data.get("items", [])) - if namecheap_count > 0: - results["platforms"]["Namecheap"] = { - "found": namecheap_count, - "total": namecheap_data.get("total", 0), - } - results["items"].extend(namecheap_data.get("items", [])) - results["total_found"] += namecheap_count - - if namecheap_data.get("error"): - results["errors"].append(f"Namecheap: {namecheap_data['error'][:100]}") - - except Exception as e: - results["errors"].append(f"Namecheap: {str(e)[:100]}") - + return results # Export instances -namecheap_scraper = NamecheapApiScraper() dynadot_scraper = DynadotApiScraper() -sav_scraper = SavApiScraper() -godaddy_scraper = GoDaddyApiScraper() -godaddy_rss_scraper = GoDaddyRssScraper() # RSS fallback (always works!) -parkio_scraper = ParkIoApiScraper() -namejet_scraper = NameJetApiScraper() -snapnames_scraper = SnapNamesApiScraper() # NEW hidden_api_scraper = HiddenApiScraperService() + diff --git a/backend/scripts/premium_data_collector.py b/backend/scripts/premium_data_collector.py index cee5716..45455b3 100644 --- a/backend/scripts/premium_data_collector.py +++ b/backend/scripts/premium_data_collector.py @@ -257,7 +257,7 @@ class PremiumDataCollector: """ Collect auction data from all platforms. - Prioritizes real data over sample/estimated data. + Collects only real auction data (no seed/demo data). 
""" logger.info("🔄 Starting auction collection...") start_time = datetime.utcnow() @@ -266,14 +266,6 @@ class PremiumDataCollector: # Try real scraping first result = await self.auction_scraper.scrape_all_platforms(db) - total_found = result.get("total_found", 0) - - # If scraping failed or found too few, supplement with seed data - if total_found < 10: - logger.warning(f"⚠️ Only {total_found} auctions scraped, adding seed data...") - seed_result = await self.auction_scraper.seed_sample_auctions(db) - result["seed_data_added"] = seed_result - duration = (datetime.utcnow() - start_time).total_seconds() logger.info(f"✅ Auctions collected in {duration:.1f}s") diff --git a/backend/scripts/scrape_auctions.py b/backend/scripts/scrape_auctions.py index 6c95c6d..b443be9 100644 --- a/backend/scripts/scrape_auctions.py +++ b/backend/scripts/scrape_auctions.py @@ -32,6 +32,42 @@ logging.basicConfig( ) logger = logging.getLogger(__name__) +async def ensure_auction_uniqueness(): + """ + Ensure we have a unique index on (platform, domain) and clean duplicates once. + + This prevents duplicate rows when the scraper runs repeatedly (cron) and when + the session uses autoflush=False. + """ + from sqlalchemy import text + from app.config import get_settings + + settings = get_settings() + db_url = settings.database_url or "" + + async with AsyncSessionLocal() as db: + # Best-effort de-duplication (SQLite only). + if db_url.startswith("sqlite"): + await db.execute( + text( + """ + DELETE FROM domain_auctions + WHERE id NOT IN ( + SELECT MAX(id) FROM domain_auctions GROUP BY platform, domain + ) + """ + ) + ) + await db.commit() + + # Create unique index (works for SQLite and Postgres). + await db.execute( + text( + "CREATE UNIQUE INDEX IF NOT EXISTS ux_auctions_platform_domain ON domain_auctions(platform, domain)" + ) + ) + await db.commit() + async def run_scrapers(): """Run all auction scrapers.""" @@ -109,6 +145,9 @@ def main(): print(f" Started: {datetime.now().isoformat()}") print("="*60) + # Ensure DB uniqueness constraints + asyncio.run(ensure_auction_uniqueness()) + # Run scrapers result = asyncio.run(run_scrapers()) diff --git a/backend/scripts/seed_auctions.py b/backend/scripts/seed_auctions.py deleted file mode 100644 index bc0f975..0000000 --- a/backend/scripts/seed_auctions.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Seed auction data for development.""" -import asyncio -import sys -import os - -# Add parent directory to path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from app.database import AsyncSessionLocal -from app.services.auction_scraper import auction_scraper - - -async def main(): - """Seed auction data.""" - async with AsyncSessionLocal() as db: - print("Seeding sample auction data...") - result = await auction_scraper.seed_sample_auctions(db) - print(f"✓ Seeded {result['found']} auctions ({result['new']} new, {result['updated']} updated)") - - # Also try to scrape real data - print("\nAttempting to scrape real auction data...") - try: - scrape_result = await auction_scraper.scrape_all_platforms(db) - print(f"✓ Scraped {scrape_result['total_found']} auctions from platforms:") - for platform, stats in scrape_result['platforms'].items(): - print(f" - {platform}: {stats.get('found', 0)} found") - if scrape_result['errors']: - print(f" Errors: {scrape_result['errors']}") - except Exception as e: - print(f" Scraping failed (this is okay): {e}") - - print("\n✓ Done!") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git 
a/backend/scripts/test_namecheap.py b/backend/scripts/test_namecheap.py deleted file mode 100644 index 6221329..0000000 --- a/backend/scripts/test_namecheap.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 -""" -Test Namecheap GraphQL API to find the query hash. -""" - -import asyncio -import httpx -import json -import re - -async def test_namecheap(): - """ - Test Namecheap GraphQL API. - The API requires a query hash that must be extracted from the website. - """ - - async with httpx.AsyncClient(timeout=30.0) as client: - # First, load the Marketplace page to find the hash - print("🔍 Fetching Namecheap Marketplace page...") - response = await client.get( - "https://www.namecheap.com/market/", - headers={ - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", - "Accept": "text/html,application/xhtml+xml", - } - ) - - if response.status_code == 200: - html = response.text - - # Look for query hash patterns - hash_patterns = [ - r'"queryHash":"([a-f0-9]+)"', - r'"hash":"([a-f0-9]{32,})"', - r'aftermarketapi.*?([a-f0-9]{32,})', - r'"persistedQueryHash":"([a-f0-9]+)"', - ] - - found_hashes = set() - for pattern in hash_patterns: - matches = re.findall(pattern, html, re.IGNORECASE) - for m in matches: - if len(m) >= 32: - found_hashes.add(m) - - if found_hashes: - print(f"✅ Found {len(found_hashes)} potential hashes:") - for h in list(found_hashes)[:5]: - print(f" {h[:50]}...") - else: - print("❌ No hashes found in HTML") - - # Check for NEXT_DATA - if "__NEXT_DATA__" in html: - print("📦 Found __NEXT_DATA__ - Next.js app") - match = re.search(r'', html, re.DOTALL) - if match: - try: - data = json.loads(match.group(1)) - print(f" Keys: {list(data.keys())[:5]}") - except: - pass - - print(f"📄 Page status: {response.status_code}") - print(f"📄 Page size: {len(html)} bytes") - - # Try a different approach - use their search API - print("\n🔍 Trying Namecheap search endpoint...") - search_response = await client.get( - "https://www.namecheap.com/market/search/", - params={"q": "tech"}, - headers={ - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", - "Accept": "application/json, text/html", - "X-Requested-With": "XMLHttpRequest", - } - ) - print(f" Search status: {search_response.status_code}") - - else: - print(f"❌ Failed: {response.status_code}") - -if __name__ == "__main__": - asyncio.run(test_namecheap()) -
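
The model change above makes `(platform, domain)` unique (`ux_auctions_platform_domain`), and `ensure_auction_uniqueness()` backfills that index for existing SQLite databases. `_store_auction` itself is not part of this patch; the following is only a sketch of how a store/update step could lean on that index instead of a select-then-insert, assuming a SQLite database and the `DomainAuction` columns referenced elsewhere in this diff (for Postgres, `sqlalchemy.dialects.postgresql.insert` offers the same API).

```python
# Sketch only: the real _store_auction is not shown in this patch.
# Assumes the ux_auctions_platform_domain unique index introduced above
# and a SQLite database; use sqlalchemy.dialects.postgresql.insert for Postgres.
from typing import Any, Dict

from sqlalchemy.dialects.sqlite import insert
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.auction import DomainAuction


async def upsert_auction(db: AsyncSession, auction: Dict[str, Any]) -> None:
    """Insert a scraped auction, or update the existing (platform, domain) row."""
    stmt = insert(DomainAuction).values(
        domain=auction["domain"],
        tld=auction["tld"],
        platform=auction["platform"],
        current_bid=auction["current_bid"],
        currency=auction.get("currency", "USD"),
        num_bids=auction.get("num_bids", 0),
        end_time=auction["end_time"],
        auction_url=auction.get("auction_url"),
        is_active=True,
    )
    stmt = stmt.on_conflict_do_update(
        index_elements=["platform", "domain"],
        set_={
            "current_bid": stmt.excluded.current_bid,
            "num_bids": stmt.excluded.num_bids,
            "end_time": stmt.excluded.end_time,
            "is_active": True,
        },
    )
    await db.execute(stmt)
```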
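
For a quick manual check of the one hidden-API source this patch keeps enabled (Dynadot), something like the snippet below can be run from the `backend/` directory (so that the `app` package resolves); it only prints results and writes nothing to the database.

```python
# Manual smoke test for the Dynadot hidden-API scraper kept by this patch.
# Run from backend/; requires httpx to be installed.
import asyncio

from app.services.hidden_api_scrapers import dynadot_scraper


async def main() -> None:
    result = await dynadot_scraper.fetch_auctions(page_size=25)
    if result.get("error"):
        print(f"Dynadot error: {result['error']}")
        return

    # fetch_auctions already drops rows without a real bid or a real end time,
    # so everything printed here satisfies the data-quality rules above.
    for item in result["items"][:10]:
        print(f"{item['domain']:<30} {item['currency']} {item['current_bid']:>10.2f}  ends {item['end_time']}")


if __name__ == "__main__":
    asyncio.run(main())
```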
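
Affiliate tracking is now driven purely by environment variables in `build_affiliate_url`; if a variable is unset, the plain provider URL is returned. A small illustration (the affiliate ID below is a made-up value):

```python
# Illustration of the env-driven affiliate URLs; "example-id" is a made-up value.
import os

from app.services.hidden_api_scrapers import build_affiliate_url

os.environ["DYNADOT_AFFILIATE_ID"] = "example-id"

print(build_affiliate_url("Dynadot", "example.com"))
# https://www.dynadot.com/market/auction/example.com?affiliate_id=example-id

print(build_affiliate_url("Park.io", "example.io"))
# https://park.io/domain/example.io  (no affiliate parameter for Park.io)

print(build_affiliate_url("Sedo", "example.net"))
# https://sedo.com/search/details/?domain=example.net  (SEDO_PARTNER_ID not set)
```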