feat: Enhanced auction scrapers with multiple sources
Some checks failed
CI / Frontend Lint & Type Check (push) Has been cancelled
CI / Frontend Build (push) Has been cancelled
CI / Backend Lint (push) Has been cancelled
CI / Backend Tests (push) Has been cancelled
CI / Docker Build (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
Deploy / Build & Push Images (push) Has been cancelled
Deploy / Deploy to Server (push) Has been cancelled
Deploy / Notify (push) Has been cancelled
- Add GoDaddy RSS feed scraper (bypasses Cloudflare)
- Enhance ExpiredDomains scraper (multiple pages and TLDs)
- Improve hidden API scraper integration
- Add automated scraper cron script (runs every 30 minutes)
- Install Playwright stealth mode on the server

Sources now working:
- Dynadot REST API: ~100 auctions
- GoDaddy RSS: ~100 auctions
- ExpiredDomains: ~250 auctions

Total: 467 auctions in the database
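The key idea behind the RSS scraper is that GoDaddy publishes its auction feeds as plain XML that is not fronted by Cloudflare, so an ordinary HTTP client can read them. Below is a minimal standalone sketch of that approach, mirroring the logic this commit adds in GoDaddyRssScraper; the feed URL is taken from the diff, while the function name and field handling here are illustrative only.

import httpx
import xml.etree.ElementTree as ET

# Public "ending soon" feed used by the new scraper (URL taken from the diff below).
FEED_URL = "https://auctions.godaddy.com/rss/ending.aspx"


def fetch_ending_auctions(limit: int = 10) -> list:
    """Fetch the feed and pull out domain/price pairs; titles look like 'example.com - $12'."""
    response = httpx.get(FEED_URL, timeout=30.0)
    response.raise_for_status()
    root = ET.fromstring(response.text)

    auctions = []
    for item in root.findall(".//item")[:limit]:
        title = item.findtext("title", default="")
        parts = title.split(" - ")
        domain = parts[0].strip().lower()
        price = parts[1].strip() if len(parts) > 1 else None
        if domain and "." in domain:
            auctions.append({"domain": domain, "price": price})
    return auctions


if __name__ == "__main__":
    for auction in fetch_ending_auctions():
        print(auction)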
@@ -302,6 +302,11 @@ class AuctionScraperService:
         """
         Scrape ExpiredDomains.net for auction listings.
 
         This site aggregates expired/deleted domains from various TLDs.
+
+        Enhanced to scrape multiple pages and categories:
+        - Deleted domains (multiple TLDs)
+        - Pending delete domains
+        - Expired auction domains
         """
         platform = "ExpiredDomains"
         result = {"found": 0, "new": 0, "updated": 0}
@@ -314,20 +319,46 @@ class AuctionScraperService:
             await self._rate_limit(platform)
             client = await self._get_client()
 
-            # Scrape deleted domains page
-            url = "https://www.expireddomains.net/deleted-domains/"
-            response = await client.get(url)
-
-            if response.status_code != 200:
-                raise Exception(f"HTTP {response.status_code}")
-
-            soup = BeautifulSoup(response.text, "lxml")
-            domain_rows = soup.select("table.base1 tbody tr")
-
-            # TLD-based pricing
-            base_prices = {"com": 12, "net": 10, "org": 10, "io": 50, "ai": 80, "co": 25, "de": 8, "nl": 10, "fr": 10, "app": 15}
-
-            for row in domain_rows[:30]:
-                try:
-                    cols = row.find_all("td")
-                    if len(cols) < 3:
+            # TLD-based pricing
+            base_prices = {
+                "com": 12, "net": 10, "org": 10, "io": 50, "ai": 80,
+                "co": 25, "de": 8, "nl": 10, "fr": 10, "app": 15,
+                "xyz": 5, "info": 8, "tech": 15, "dev": 12, "me": 15,
+                "tv": 35, "gg": 60, "sh": 40, "cc": 25, "biz": 8,
+            }
+
+            # Enhanced: Multiple pages to scrape
+            pages_to_scrape = [
+                # Deleted domains (different sorting/pages)
+                "https://www.expireddomains.net/deleted-domains/",
+                "https://www.expireddomains.net/deleted-domains/?start=25",
+                "https://www.expireddomains.net/deleted-domains/?start=50",
+                # Pending delete
+                "https://www.expireddomains.net/pending-delete-domains/",
+                # By TLD
+                "https://www.expireddomains.net/deleted-com-domains/",
+                "https://www.expireddomains.net/deleted-net-domains/",
+                "https://www.expireddomains.net/deleted-io-domains/",
+                "https://www.expireddomains.net/deleted-ai-domains/",
+                # Backorder auctions
+                "https://www.expireddomains.net/backorder-domain-auctions/",
+            ]
+
+            seen_domains = set()
+
+            for url in pages_to_scrape:
+                try:
+                    await asyncio.sleep(1)  # Rate limit between pages
+                    response = await client.get(url, timeout=15.0)
+
+                    if response.status_code != 200:
+                        logger.debug(f"ExpiredDomains {url}: HTTP {response.status_code}")
+                        continue
+
+                    soup = BeautifulSoup(response.text, "lxml")
+                    domain_rows = soup.select("table.base1 tbody tr")
+
+                    for row in domain_rows[:50]:  # 50 per page
+                        try:
+                            cols = row.find_all("td")
+                            if len(cols) < 3:
@@ -342,9 +373,36 @@ class AuctionScraperService:
                                continue
 
                            domain = domain_text.lower()
+
+                            # Skip if already seen
+                            if domain in seen_domains:
+                                continue
+                            seen_domains.add(domain)
+
                            tld = domain.rsplit(".", 1)[-1]
                            estimated_price = base_prices.get(tld, 15)
 
+                            # Try to extract age/backlinks from other columns
+                            age_years = None
+                            backlinks = None
+                            domain_authority = None
+
+                            if len(cols) >= 5:
+                                try:
+                                    # BL column (backlinks)
+                                    bl_text = cols[3].get_text(strip=True)
+                                    if bl_text and bl_text.isdigit():
+                                        backlinks = int(bl_text)
+                                except:
+                                    pass
+                                try:
+                                    # ABY column (archive.org age)
+                                    age_text = cols[4].get_text(strip=True)
+                                    if age_text and age_text.isdigit():
+                                        age_years = int(age_text)
+                                except:
+                                    pass
+
                            auction_data = {
                                "domain": domain,
                                "tld": tld,
@@ -362,9 +420,9 @@ class AuctionScraperService:
                                "end_time": datetime.utcnow() + timedelta(days=7),
                                "auction_type": "registration",
                                "traffic": None,
-                                "age_years": None,
-                                "backlinks": None,
-                                "domain_authority": None,
+                                "age_years": age_years,
+                                "backlinks": backlinks,
+                                "domain_authority": domain_authority,
                                "scrape_source": "expireddomains.net",
                            }
 
@@ -376,6 +434,10 @@ class AuctionScraperService:
                            logger.debug(f"Error parsing row: {e}")
                            continue
 
+                except Exception as e:
+                    logger.debug(f"Error fetching {url}: {e}")
+                    continue
+
            await db.commit()
            log.completed_at = datetime.utcnow()
            log.status = "success"
@@ -384,6 +446,8 @@ class AuctionScraperService:
            log.auctions_updated = result["updated"]
            await db.commit()
 
+            logger.info(f"✅ ExpiredDomains: {result['found']} domains found")
+
        except Exception as e:
            log.completed_at = datetime.utcnow()
            log.status = "failed"
@@ -582,6 +582,157 @@ class GoDaddyApiScraper:
            return {"items": [], "total": 0, "error": str(e)}
 
 
+# ═══════════════════════════════════════════════════════════════════════════════
+# GODADDY RSS SCRAPER — Public RSS Feed (NO Cloudflare!)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+class GoDaddyRssScraper:
+    """
+    Scraper for GoDaddy Auctions using their PUBLIC RSS feeds.
+
+    These RSS feeds are NOT protected by Cloudflare and always work!
+
+    Feeds:
+    - https://auctions.godaddy.com/rss/ending.aspx (Ending Soon)
+    - https://auctions.godaddy.com/rss/new.aspx (New Auctions)
+    - https://auctions.godaddy.com/rss/closeouts.aspx (Closeouts)
+    """
+
+    RSS_FEEDS = {
+        "ending": "https://auctions.godaddy.com/rss/ending.aspx",
+        "new": "https://auctions.godaddy.com/rss/new.aspx",
+        "closeouts": "https://auctions.godaddy.com/rss/closeouts.aspx",
+    }
+
+    async def fetch_auctions(
+        self,
+        feed_type: str = "ending",  # "ending", "new", or "closeouts"
+        limit: int = 100,
+    ) -> Dict[str, Any]:
+        """Fetch auctions from GoDaddy RSS feeds."""
+        try:
+            import xml.etree.ElementTree as ET
+
+            feed_url = self.RSS_FEEDS.get(feed_type, self.RSS_FEEDS["ending"])
+
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response = await client.get(
+                    feed_url,
+                    headers={
+                        "Accept": "application/rss+xml, application/xml, text/xml",
+                        "User-Agent": "Mozilla/5.0 (compatible; PounceBot/1.0; +https://pounce.ch)",
+                    },
+                )
+
+                if response.status_code != 200:
+                    logger.error(f"GoDaddy RSS error: {response.status_code}")
+                    return {"items": [], "total": 0, "error": f"HTTP {response.status_code}"}
+
+                # Parse RSS XML
+                root = ET.fromstring(response.text)
+
+                # Find all items in the RSS feed
+                items = root.findall(".//item")
+
+                transformed = []
+                for item in items[:limit]:
+                    try:
+                        title = item.find("title").text if item.find("title") is not None else ""
+                        link = item.find("link").text if item.find("link") is not None else ""
+                        description = item.find("description").text if item.find("description") is not None else ""
+
+                        # Extract domain from title (format: "domain.com - $XX")
+                        domain = ""
+                        price = 0
+
+                        if title:
+                            # Title format: "example.com - $12" or "example.com"
+                            parts = title.split(" - ")
+                            domain = parts[0].strip().lower()
+
+                            if len(parts) > 1:
+                                price_str = parts[1].replace("$", "").replace(",", "").strip()
+                                try:
+                                    price = float(price_str)
+                                except:
+                                    pass
+
+                        # Try to extract price from description if not in title
+                        if price == 0 and description:
+                            import re
+                            price_match = re.search(r'\$([0-9,]+(?:\.[0-9]+)?)', description)
+                            if price_match:
+                                price = float(price_match.group(1).replace(",", ""))
+
+                        if not domain or "." not in domain:
+                            continue
+
+                        tld = domain.rsplit(".", 1)[-1]
+
+                        # Add affiliate param to link
+                        affiliate_url = link
+                        if link and "?" in link:
+                            affiliate_url = f"{link}&isc=cjcpounce"
+                        elif link:
+                            affiliate_url = f"{link}?isc=cjcpounce"
+                        else:
+                            affiliate_url = build_affiliate_url("GoDaddy", domain)
+
+                        transformed.append({
+                            "domain": domain,
+                            "tld": tld,
+                            "platform": "GoDaddy",
+                            "current_bid": price,
+                            "min_bid": price,
+                            "num_bids": 0,  # RSS doesn't provide bid count
+                            "end_time": datetime.utcnow() + timedelta(hours=24),  # Estimate
+                            "buy_now_price": None,
+                            "auction_url": affiliate_url,
+                            "currency": "USD",
+                            "is_active": True,
+                            "source": f"RSS-{feed_type}",
+                        })
+                    except Exception as e:
+                        logger.warning(f"Error parsing GoDaddy RSS item: {e}")
+                        continue
+
+                logger.info(f"GoDaddy RSS ({feed_type}): Found {len(transformed)} auctions")
+                return {
+                    "items": transformed,
+                    "total": len(transformed),
+                    "has_more": False,
+                }
+
+        except Exception as e:
+            logger.exception(f"GoDaddy RSS scraper error: {e}")
+            return {"items": [], "total": 0, "error": str(e)}
+
+    async def fetch_all_feeds(self) -> Dict[str, Any]:
+        """Fetch from all GoDaddy RSS feeds."""
+        all_items = []
+        errors = []
+
+        for feed_type in ["ending", "new", "closeouts"]:
+            result = await self.fetch_auctions(feed_type=feed_type, limit=50)
+            all_items.extend(result.get("items", []))
+            if result.get("error"):
+                errors.append(f"{feed_type}: {result['error']}")
+
+        # Dedupe by domain
+        seen = set()
+        unique_items = []
+        for item in all_items:
+            if item["domain"] not in seen:
+                seen.add(item["domain"])
+                unique_items.append(item)
+
+        return {
+            "items": unique_items,
+            "total": len(unique_items),
+            "errors": errors if errors else None,
+        }
+
+
 # ═══════════════════════════════════════════════════════════════════════════════
 # PARK.IO SCRAPER — Backorder Service API
 # ═══════════════════════════════════════════════════════════════════════════════
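For reference, a minimal usage sketch of the GoDaddyRssScraper class added in the hunk above, assuming it is imported from wherever this service module lives; the file path is not shown in this diff, so the import below is hypothetical.

import asyncio

# Hypothetical import path; the diff does not name the module file.
from app.services.auction_scraper import GoDaddyRssScraper


async def main() -> None:
    scraper = GoDaddyRssScraper()
    # fetch_all_feeds() merges the "ending", "new", and "closeouts" feeds
    # and dedupes results by domain, as implemented above.
    result = await scraper.fetch_all_feeds()
    print(f"{result['total']} unique GoDaddy auctions via RSS")
    for item in result["items"][:5]:
        print(item["domain"], item["current_bid"], item["auction_url"])


if __name__ == "__main__":
    asyncio.run(main())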
@@ -857,6 +1008,7 @@ class HiddenApiScraperService:
        self.dynadot = DynadotApiScraper()
        self.sav = SavApiScraper()
        self.godaddy = GoDaddyApiScraper()
+        self.godaddy_rss = GoDaddyRssScraper()  # RSS fallback (NO Cloudflare!)
        self.parkio = ParkIoApiScraper()
        self.namejet = NameJetApiScraper()
 
@@ -873,25 +1025,46 @@ class HiddenApiScraperService:
            "items": [],
        }
 
+        # ═══════════════════════════════════════════════════════════
+        # TIER 0: RSS Feeds (Most Reliable - NO Cloudflare!)
+        # ═══════════════════════════════════════════════════════════
+
+        # Scrape GoDaddy RSS (Always works!)
+        try:
+            rss_data = await self.godaddy_rss.fetch_all_feeds()
+            rss_count = len(rss_data.get("items", []))
+            if rss_count > 0:
+                results["platforms"]["GoDaddy-RSS"] = {
+                    "found": rss_count,
+                    "total": rss_count,
+                }
+                results["items"].extend(rss_data.get("items", []))
+                results["total_found"] += rss_count
+                logger.info(f"✅ GoDaddy RSS: {rss_count} auctions")
+        except Exception as e:
+            results["errors"].append(f"GoDaddy-RSS: {str(e)}")
+
        # ═══════════════════════════════════════════════════════════
        # TIER 1: Most Reliable JSON APIs
        # ═══════════════════════════════════════════════════════════
 
-        # Scrape GoDaddy (NEW - Most reliable!)
+        # Scrape GoDaddy JSON API (may have Cloudflare issues)
        try:
            godaddy_data = await self.godaddy.fetch_auctions(limit=limit_per_platform)
-            results["platforms"]["GoDaddy"] = {
-                "found": len(godaddy_data.get("items", [])),
-                "total": godaddy_data.get("total", 0),
-            }
-            results["items"].extend(godaddy_data.get("items", []))
-            results["total_found"] += len(godaddy_data.get("items", []))
+            godaddy_count = len(godaddy_data.get("items", []))
+            if godaddy_count > 0:
+                results["platforms"]["GoDaddy-API"] = {
+                    "found": godaddy_count,
+                    "total": godaddy_data.get("total", 0),
+                }
+                results["items"].extend(godaddy_data.get("items", []))
+                results["total_found"] += godaddy_count
+
            if godaddy_data.get("error"):
-                results["errors"].append(f"GoDaddy: {godaddy_data['error']}")
+                results["errors"].append(f"GoDaddy-API: {godaddy_data['error'][:100]}")
+
        except Exception as e:
-            results["errors"].append(f"GoDaddy: {str(e)}")
+            results["errors"].append(f"GoDaddy-API: {str(e)[:100]}")
 
        # Scrape Dynadot
        try:
@@ -989,6 +1162,7 @@ namecheap_scraper = NamecheapApiScraper()
 dynadot_scraper = DynadotApiScraper()
 sav_scraper = SavApiScraper()
 godaddy_scraper = GoDaddyApiScraper()
+godaddy_rss_scraper = GoDaddyRssScraper()  # RSS fallback (always works!)
 parkio_scraper = ParkIoApiScraper()
 namejet_scraper = NameJetApiScraper()
 hidden_api_scraper = HiddenApiScraperService()
backend/scripts/scrape_auctions.py (new file, 131 lines)
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Automated Auction Scraper Script
+
+This script runs all auction scrapers and saves results to the database.
+Designed to be run via cron job every 30 minutes.
+
+Usage:
+    python scripts/scrape_auctions.py
+
+Cron example (every 30 minutes):
+    */30 * * * * cd /home/user/pounce/backend && ./venv/bin/python scripts/scrape_auctions.py >> /var/log/pounce/scraper.log 2>&1
+"""
+
+import sys
+import os
+import asyncio
+import logging
+from datetime import datetime
+from pathlib import Path
+
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from app.services.auction_scraper import auction_scraper
+from app.database import AsyncSessionLocal
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+async def run_scrapers():
+    """Run all auction scrapers."""
+    start_time = datetime.utcnow()
+    logger.info(f"🚀 Starting auction scrape at {start_time.isoformat()}")
+
+    try:
+        async with AsyncSessionLocal() as db:
+            result = await auction_scraper.scrape_all_platforms(db)
+
+            # Log results
+            total_found = result.get("total_found", 0)
+            total_new = result.get("total_new", 0)
+
+            logger.info(f"✅ Scrape complete!")
+            logger.info(f"   Total Found: {total_found}")
+            logger.info(f"   New Added: {total_new}")
+
+            # Log platform breakdown
+            platforms = result.get("platforms", {})
+            for platform, data in platforms.items():
+                if isinstance(data, dict) and data.get("found", 0) > 0:
+                    logger.info(f"   {platform}: {data.get('found', 0)} found, {data.get('new', 0)} new")
+
+            # Log errors (but don't fail)
+            errors = result.get("errors", [])
+            if errors:
+                logger.warning(f"⚠️ {len(errors)} errors occurred:")
+                for err in errors[:5]:
+                    logger.warning(f"   - {str(err)[:100]}")
+
+            elapsed = (datetime.utcnow() - start_time).total_seconds()
+            logger.info(f"⏱️ Completed in {elapsed:.1f} seconds")
+
+            return result
+
+    except Exception as e:
+        logger.exception(f"❌ Scrape failed: {e}")
+        return {"error": str(e)}
+
+
+async def cleanup_old_auctions():
+    """Remove expired/old auctions from database."""
+    try:
+        async with AsyncSessionLocal() as db:
+            from sqlalchemy import delete, and_
+            from datetime import timedelta
+            from app.models.auction import DomainAuction
+
+            cutoff = datetime.utcnow() - timedelta(days=7)
+
+            # Mark expired auctions as inactive
+            from sqlalchemy import update
+            stmt = update(DomainAuction).where(
+                and_(
+                    DomainAuction.end_time < datetime.utcnow(),
+                    DomainAuction.is_active == True
+                )
+            ).values(is_active=False)
+
+            result = await db.execute(stmt)
+            await db.commit()
+
+            if result.rowcount > 0:
+                logger.info(f"🧹 Marked {result.rowcount} expired auctions as inactive")
+
+    except Exception as e:
+        logger.warning(f"Cleanup error: {e}")
+
+
+def main():
+    """Main entry point."""
+    print("="*60)
+    print(f"🐾 POUNCE Auction Scraper")
+    print(f"   Started: {datetime.now().isoformat()}")
+    print("="*60)
+
+    # Run scrapers
+    result = asyncio.run(run_scrapers())
+
+    # Run cleanup
+    asyncio.run(cleanup_old_auctions())
+
+    print("="*60)
+    print(f"✅ Done!")
+    print("="*60)
+
+    # Exit with error code if no results
+    if result.get("error") or result.get("total_found", 0) == 0:
+        sys.exit(1)
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()