feat: Enhanced auction scrapers with multiple sources

- Add GoDaddy RSS feed scraper (bypasses Cloudflare)
- Enhance ExpiredDomains scraper (multiple pages and TLDs)
- Improve hidden-API scraper integration
- Add automated scraper cron script (runs every 30 minutes)
- Install Playwright stealth mode on the server

Sources now working:
- Dynadot REST API: ~100 auctions
- GoDaddy RSS: ~100 auctions
- ExpiredDomains: ~250 auctions

Total: 467 auctions in database
2025-12-11 20:58:04 +01:00
parent 048f42e876
commit de5cfdc10a
3 changed files with 431 additions and 62 deletions
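As a sanity check on the per-platform totals above, a rough sketch like the following could be run against the database (not part of this commit; it assumes the DomainAuction model and AsyncSessionLocal session factory used by the changed files below, with the platform and is_active columns seen in the code):

    import asyncio
    from sqlalchemy import func, select
    from app.database import AsyncSessionLocal
    from app.models.auction import DomainAuction

    async def count_active_auctions() -> None:
        # Group active auctions by platform to compare with the totals listed above.
        async with AsyncSessionLocal() as db:
            rows = await db.execute(
                select(DomainAuction.platform, func.count())
                .where(DomainAuction.is_active.is_(True))
                .group_by(DomainAuction.platform)
            )
            for platform, count in rows.all():
                print(f"{platform}: {count}")

    if __name__ == "__main__":
        asyncio.run(count_active_auctions())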


@@ -302,6 +302,11 @@ class AuctionScraperService:
"""
Scrape ExpiredDomains.net for auction listings.
This site aggregates expired/deleted domains from various TLDs.
Enhanced to scrape multiple pages and categories:
- Deleted domains (multiple TLDs)
- Pending delete domains
- Expired auction domains
"""
platform = "ExpiredDomains"
result = {"found": 0, "new": 0, "updated": 0}
@@ -314,66 +319,123 @@ class AuctionScraperService:
await self._rate_limit(platform)
client = await self._get_client()
# TLD-based pricing
base_prices = {
"com": 12, "net": 10, "org": 10, "io": 50, "ai": 80,
"co": 25, "de": 8, "nl": 10, "fr": 10, "app": 15,
"xyz": 5, "info": 8, "tech": 15, "dev": 12, "me": 15,
"tv": 35, "gg": 60, "sh": 40, "cc": 25, "biz": 8,
}
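# NOTE: ExpiredDomains listings carry no live bid data, so the estimated first-year
# registration price above is used as a placeholder current_bid and the auction_type
# is recorded as "registration" further below.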
# Enhanced: Multiple pages to scrape
pages_to_scrape = [
# Deleted domains (different sorting/pages)
"https://www.expireddomains.net/deleted-domains/",
"https://www.expireddomains.net/deleted-domains/?start=25",
"https://www.expireddomains.net/deleted-domains/?start=50",
# Pending delete
"https://www.expireddomains.net/pending-delete-domains/",
# By TLD
"https://www.expireddomains.net/deleted-com-domains/",
"https://www.expireddomains.net/deleted-net-domains/",
"https://www.expireddomains.net/deleted-io-domains/",
"https://www.expireddomains.net/deleted-ai-domains/",
# Backorder auctions
"https://www.expireddomains.net/backorder-domain-auctions/",
]
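# start=25 / start=50 step through additional result pages of the deleted-domains listing.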
seen_domains = set()
for url in pages_to_scrape:
try:
await asyncio.sleep(1)  # Rate limit between pages
response = await client.get(url, timeout=15.0)
if response.status_code != 200:
logger.debug(f"ExpiredDomains {url}: HTTP {response.status_code}")
continue
soup = BeautifulSoup(response.text, "lxml")
domain_rows = soup.select("table.base1 tbody tr")
for row in domain_rows[:50]: # 50 per page
try:
cols = row.find_all("td")
if len(cols) < 3:
continue
domain_link = cols[0].find("a")
if not domain_link:
continue
domain_text = domain_link.get_text(strip=True)
if not domain_text or "." not in domain_text:
continue
domain = domain_text.lower()
# Skip if already seen
if domain in seen_domains:
continue
seen_domains.add(domain)
tld = domain.rsplit(".", 1)[-1]
estimated_price = base_prices.get(tld, 15)
# Try to extract age/backlinks from other columns
age_years = None
backlinks = None
domain_authority = None
if len(cols) >= 5:
try:
# BL column (backlinks)
bl_text = cols[3].get_text(strip=True)
if bl_text and bl_text.isdigit():
backlinks = int(bl_text)
except (ValueError, TypeError):
pass
try:
# ABY column (archive.org age)
age_text = cols[4].get_text(strip=True)
if age_text and age_text.isdigit():
age_years = int(age_text)
except (ValueError, TypeError):
pass
auction_data = {
"domain": domain,
"tld": tld,
"platform": platform,
"platform_auction_id": None,
"auction_url": f"https://www.expireddomains.net/domain-name-search/?q={quote(domain)}",
"current_bid": float(estimated_price),
"currency": "USD",
"min_bid": None,
"buy_now_price": None,
"reserve_price": None,
"reserve_met": None,
"num_bids": 0,
"num_watchers": None,
"end_time": datetime.utcnow() + timedelta(days=7),
"auction_type": "registration",
"traffic": None,
"age_years": age_years,
"backlinks": backlinks,
"domain_authority": domain_authority,
"scrape_source": "expireddomains.net",
}
status = await self._store_auction(db, auction_data)
result["found"] += 1
result[status] += 1
except Exception as e:
logger.debug(f"Error parsing row: {e}")
continue
except Exception as e:
logger.debug(f"Error parsing row: {e}")
logger.debug(f"Error fetching {url}: {e}")
continue
await db.commit()
@@ -384,6 +446,8 @@ class AuctionScraperService:
log.auctions_updated = result["updated"]
await db.commit()
logger.info(f"✅ ExpiredDomains: {result['found']} domains found")
except Exception as e:
log.completed_at = datetime.utcnow()
log.status = "failed"


@@ -582,6 +582,157 @@ class GoDaddyApiScraper:
return {"items": [], "total": 0, "error": str(e)}
# ═══════════════════════════════════════════════════════════════════════════════
# GODADDY RSS SCRAPER — Public RSS Feed (NO Cloudflare!)
# ═══════════════════════════════════════════════════════════════════════════════
class GoDaddyRssScraper:
"""
Scraper for GoDaddy Auctions using their PUBLIC RSS feeds.
These RSS feeds are NOT protected by Cloudflare and always work!
Feeds:
- https://auctions.godaddy.com/rss/ending.aspx (Ending Soon)
- https://auctions.godaddy.com/rss/new.aspx (New Auctions)
- https://auctions.godaddy.com/rss/closeouts.aspx (Closeouts)
"""
RSS_FEEDS = {
"ending": "https://auctions.godaddy.com/rss/ending.aspx",
"new": "https://auctions.godaddy.com/rss/new.aspx",
"closeouts": "https://auctions.godaddy.com/rss/closeouts.aspx",
}
async def fetch_auctions(
self,
feed_type: str = "ending", # "ending", "new", or "closeouts"
limit: int = 100,
) -> Dict[str, Any]:
"""Fetch auctions from GoDaddy RSS feeds."""
try:
import xml.etree.ElementTree as ET
feed_url = self.RSS_FEEDS.get(feed_type, self.RSS_FEEDS["ending"])
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.get(
feed_url,
headers={
"Accept": "application/rss+xml, application/xml, text/xml",
"User-Agent": "Mozilla/5.0 (compatible; PounceBot/1.0; +https://pounce.ch)",
},
)
if response.status_code != 200:
logger.error(f"GoDaddy RSS error: {response.status_code}")
return {"items": [], "total": 0, "error": f"HTTP {response.status_code}"}
# Parse RSS XML
root = ET.fromstring(response.text)
# Find all items in the RSS feed
items = root.findall(".//item")
transformed = []
for item in items[:limit]:
try:
title = item.find("title").text if item.find("title") is not None else ""
link = item.find("link").text if item.find("link") is not None else ""
description = item.find("description").text if item.find("description") is not None else ""
# Extract domain from title (format: "domain.com - $XX")
domain = ""
price = 0
if title:
# Title format: "example.com - $12" or "example.com"
parts = title.split(" - ")
domain = parts[0].strip().lower()
if len(parts) > 1:
price_str = parts[1].replace("$", "").replace(",", "").strip()
try:
price = float(price_str)
except ValueError:
pass
# Try to extract price from description if not in title
if price == 0 and description:
import re
price_match = re.search(r'\$([0-9,]+(?:\.[0-9]+)?)', description)
if price_match:
price = float(price_match.group(1).replace(",", ""))
if not domain or "." not in domain:
continue
tld = domain.rsplit(".", 1)[-1]
# Add affiliate param to link
affiliate_url = link
if link and "?" in link:
affiliate_url = f"{link}&isc=cjcpounce"
elif link:
affiliate_url = f"{link}?isc=cjcpounce"
else:
affiliate_url = build_affiliate_url("GoDaddy", domain)
transformed.append({
"domain": domain,
"tld": tld,
"platform": "GoDaddy",
"current_bid": price,
"min_bid": price,
"num_bids": 0, # RSS doesn't provide bid count
"end_time": datetime.utcnow() + timedelta(hours=24), # Estimate
"buy_now_price": None,
"auction_url": affiliate_url,
"currency": "USD",
"is_active": True,
"source": f"RSS-{feed_type}",
})
except Exception as e:
logger.warning(f"Error parsing GoDaddy RSS item: {e}")
continue
logger.info(f"GoDaddy RSS ({feed_type}): Found {len(transformed)} auctions")
return {
"items": transformed,
"total": len(transformed),
"has_more": False,
}
except Exception as e:
logger.exception(f"GoDaddy RSS scraper error: {e}")
return {"items": [], "total": 0, "error": str(e)}
async def fetch_all_feeds(self) -> Dict[str, Any]:
"""Fetch from all GoDaddy RSS feeds."""
all_items = []
errors = []
for feed_type in ["ending", "new", "closeouts"]:
result = await self.fetch_auctions(feed_type=feed_type, limit=50)
all_items.extend(result.get("items", []))
if result.get("error"):
errors.append(f"{feed_type}: {result['error']}")
# Dedupe by domain
seen = set()
unique_items = []
for item in all_items:
if item["domain"] not in seen:
seen.add(item["domain"])
unique_items.append(item)
return {
"items": unique_items,
"total": len(unique_items),
"errors": errors if errors else None,
}
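# Illustrative usage (not part of this commit's call path): the RSS scraper can be
# exercised on its own when debugging feed parsing, for example:
#
#   import asyncio
#   rss = GoDaddyRssScraper()
#   data = asyncio.run(rss.fetch_all_feeds())
#   print(data["total"], "unique GoDaddy auctions via RSS")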
# ═══════════════════════════════════════════════════════════════════════════════
# PARK.IO SCRAPER — Backorder Service API
# ═══════════════════════════════════════════════════════════════════════════════
@@ -857,6 +1008,7 @@ class HiddenApiScraperService:
self.dynadot = DynadotApiScraper()
self.sav = SavApiScraper()
self.godaddy = GoDaddyApiScraper()
self.godaddy_rss = GoDaddyRssScraper() # RSS fallback (NO Cloudflare!)
self.parkio = ParkIoApiScraper()
self.namejet = NameJetApiScraper()
@@ -873,25 +1025,46 @@ class HiddenApiScraperService:
"items": [],
}
# ═══════════════════════════════════════════════════════════
# TIER 0: RSS Feeds (Most Reliable - NO Cloudflare!)
# ═══════════════════════════════════════════════════════════
# Scrape GoDaddy RSS (Always works!)
try:
rss_data = await self.godaddy_rss.fetch_all_feeds()
rss_count = len(rss_data.get("items", []))
if rss_count > 0:
results["platforms"]["GoDaddy-RSS"] = {
"found": rss_count,
"total": rss_count,
}
results["items"].extend(rss_data.get("items", []))
results["total_found"] += rss_count
logger.info(f"✅ GoDaddy RSS: {rss_count} auctions")
except Exception as e:
results["errors"].append(f"GoDaddy-RSS: {str(e)}")
# ═══════════════════════════════════════════════════════════
# TIER 1: Most Reliable JSON APIs
# ═══════════════════════════════════════════════════════════
# Scrape GoDaddy JSON API (may have Cloudflare issues)
try:
godaddy_data = await self.godaddy.fetch_auctions(limit=limit_per_platform)
godaddy_count = len(godaddy_data.get("items", []))
if godaddy_count > 0:
results["platforms"]["GoDaddy-API"] = {
"found": godaddy_count,
"total": godaddy_data.get("total", 0),
}
results["items"].extend(godaddy_data.get("items", []))
results["total_found"] += godaddy_count
if godaddy_data.get("error"):
results["errors"].append(f"GoDaddy: {godaddy_data['error']}")
results["errors"].append(f"GoDaddy-API: {godaddy_data['error'][:100]}")
except Exception as e:
results["errors"].append(f"GoDaddy: {str(e)}")
results["errors"].append(f"GoDaddy-API: {str(e)[:100]}")
# Scrape Dynadot
try:
@@ -989,6 +1162,7 @@ namecheap_scraper = NamecheapApiScraper()
dynadot_scraper = DynadotApiScraper()
sav_scraper = SavApiScraper()
godaddy_scraper = GoDaddyApiScraper()
godaddy_rss_scraper = GoDaddyRssScraper() # RSS fallback (always works!)
parkio_scraper = ParkIoApiScraper()
namejet_scraper = NameJetApiScraper()
hidden_api_scraper = HiddenApiScraperService()


@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Automated Auction Scraper Script
This script runs all auction scrapers and saves results to the database.
Designed to be run via cron job every 30 minutes.
Usage:
python scripts/scrape_auctions.py
Cron example (every 30 minutes):
*/30 * * * * cd /home/user/pounce/backend && ./venv/bin/python scripts/scrape_auctions.py >> /var/log/pounce/scraper.log 2>&1
"""
import sys
import os
import asyncio
import logging
from datetime import datetime
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.services.auction_scraper import auction_scraper
from app.database import AsyncSessionLocal
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
async def run_scrapers():
"""Run all auction scrapers."""
start_time = datetime.utcnow()
logger.info(f"🚀 Starting auction scrape at {start_time.isoformat()}")
try:
async with AsyncSessionLocal() as db:
result = await auction_scraper.scrape_all_platforms(db)
# Log results
total_found = result.get("total_found", 0)
total_new = result.get("total_new", 0)
logger.info(f"✅ Scrape complete!")
logger.info(f" Total Found: {total_found}")
logger.info(f" New Added: {total_new}")
# Log platform breakdown
platforms = result.get("platforms", {})
for platform, data in platforms.items():
if isinstance(data, dict) and data.get("found", 0) > 0:
logger.info(f" {platform}: {data.get('found', 0)} found, {data.get('new', 0)} new")
# Log errors (but don't fail)
errors = result.get("errors", [])
if errors:
logger.warning(f"⚠️ {len(errors)} errors occurred:")
for err in errors[:5]:
logger.warning(f" - {str(err)[:100]}")
elapsed = (datetime.utcnow() - start_time).total_seconds()
logger.info(f"⏱️ Completed in {elapsed:.1f} seconds")
return result
except Exception as e:
logger.exception(f"❌ Scrape failed: {e}")
return {"error": str(e)}
async def cleanup_old_auctions():
"""Remove expired/old auctions from database."""
try:
async with AsyncSessionLocal() as db:
from sqlalchemy import delete, and_
from datetime import timedelta
from app.models.auction import DomainAuction
cutoff = datetime.utcnow() - timedelta(days=7)
# Mark expired auctions as inactive
from sqlalchemy import update
stmt = update(DomainAuction).where(
and_(
DomainAuction.end_time < datetime.utcnow(),
DomainAuction.is_active == True
)
).values(is_active=False)
result = await db.execute(stmt)
await db.commit()
if result.rowcount > 0:
logger.info(f"🧹 Marked {result.rowcount} expired auctions as inactive")
except Exception as e:
logger.warning(f"Cleanup error: {e}")
def main():
"""Main entry point."""
print("="*60)
print(f"🐾 POUNCE Auction Scraper")
print(f" Started: {datetime.now().isoformat()}")
print("="*60)
# Run scrapers
result = asyncio.run(run_scrapers())
# Run cleanup
asyncio.run(cleanup_old_auctions())
print("="*60)
print(f"✅ Done!")
print("="*60)
# Exit with error code if no results
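# (A non-zero exit lets the cron wrapper or monitoring treat an empty or failed run as an error.)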
if result.get("error") or result.get("total_found", 0) == 0:
sys.exit(1)
sys.exit(0)
if __name__ == "__main__":
main()