From 42fc4fec5222c1e1ac7683f5e14265a17ca1a5e7 Mon Sep 17 00:00:00 2001
From: Yves Gugger
Date: Thu, 11 Dec 2025 20:58:04 +0100
Subject: [PATCH] feat: Enhanced auction scrapers with multiple sources

- Add GoDaddy RSS Feed scraper (bypasses Cloudflare)
- Enhanced ExpiredDomains scraper (multiple pages, TLDs)
- Improved hidden API scrapers integration
- Add automated scraper cron script (runs every 30 min)
- Playwright stealth mode installed on server

Sources now working:
- Dynadot REST API: ~100 auctions
- GoDaddy RSS: ~100 auctions
- ExpiredDomains: ~250 auctions

Total: 467 auctions in database
---
 backend/app/services/auction_scraper.py     | 170 +++++++++++------
 backend/app/services/hidden_api_scrapers.py | 192 +++++++++++++++++++-
 backend/scripts/scrape_auctions.py          | 131 +++++++++++++
 3 files changed, 431 insertions(+), 62 deletions(-)
 create mode 100644 backend/scripts/scrape_auctions.py

diff --git a/backend/app/services/auction_scraper.py b/backend/app/services/auction_scraper.py
index 73d0c86..e5c2852 100644
--- a/backend/app/services/auction_scraper.py
+++ b/backend/app/services/auction_scraper.py
@@ -302,6 +302,11 @@ class AuctionScraperService:
         """
         Scrape ExpiredDomains.net for auction listings.
         This site aggregates expired/deleted domains from various TLDs.
+
+        Enhanced to scrape multiple pages and categories:
+        - Deleted domains (multiple TLDs)
+        - Pending delete domains
+        - Expired auction domains
         """
         platform = "ExpiredDomains"
         result = {"found": 0, "new": 0, "updated": 0}
@@ -314,66 +319,123 @@ class AuctionScraperService:
             await self._rate_limit(platform)
             client = await self._get_client()

-            # Scrape deleted domains page
-            url = "https://www.expireddomains.net/deleted-domains/"
-            response = await client.get(url)
-
-            if response.status_code != 200:
-                raise Exception(f"HTTP {response.status_code}")
-
-            soup = BeautifulSoup(response.text, "lxml")
-            domain_rows = soup.select("table.base1 tbody tr")
-
             # TLD-based pricing
-            base_prices = {"com": 12, "net": 10, "org": 10, "io": 50, "ai": 80, "co": 25, "de": 8, "nl": 10, "fr": 10, "app": 15}
+            base_prices = {
+                "com": 12, "net": 10, "org": 10, "io": 50, "ai": 80,
+                "co": 25, "de": 8, "nl": 10, "fr": 10, "app": 15,
+                "xyz": 5, "info": 8, "tech": 15, "dev": 12, "me": 15,
+                "tv": 35, "gg": 60, "sh": 40, "cc": 25, "biz": 8,
+            }

-            for row in domain_rows[:30]:
+            # Enhanced: Multiple pages to scrape
+            pages_to_scrape = [
+                # Deleted domains (different sorting/pages)
+                "https://www.expireddomains.net/deleted-domains/",
+                "https://www.expireddomains.net/deleted-domains/?start=25",
+                "https://www.expireddomains.net/deleted-domains/?start=50",
+                # Pending delete
+                "https://www.expireddomains.net/pending-delete-domains/",
+                # By TLD
+                "https://www.expireddomains.net/deleted-com-domains/",
+                "https://www.expireddomains.net/deleted-net-domains/",
+                "https://www.expireddomains.net/deleted-io-domains/",
+                "https://www.expireddomains.net/deleted-ai-domains/",
+                # Backorder auctions
+                "https://www.expireddomains.net/backorder-domain-auctions/",
+            ]
+
+            seen_domains = set()
+
+            for url in pages_to_scrape:
                 try:
-                    cols = row.find_all("td")
-                    if len(cols) < 3:
+                    await asyncio.sleep(1)  # Rate limit between pages
+                    response = await client.get(url, timeout=15.0)
+
+                    if response.status_code != 200:
+                        logger.debug(f"ExpiredDomains {url}: HTTP {response.status_code}")
                         continue

-                    domain_link = cols[0].find("a")
-                    if not domain_link:
-                        continue
-
-                    domain_text = domain_link.get_text(strip=True)
-                    if not domain_text or "." not in domain_text:
-                        continue
-
-                    domain = domain_text.lower()
-                    tld = domain.rsplit(".", 1)[-1]
-                    estimated_price = base_prices.get(tld, 15)
-
-                    auction_data = {
-                        "domain": domain,
-                        "tld": tld,
-                        "platform": platform,
-                        "platform_auction_id": None,
-                        "auction_url": f"https://www.expireddomains.net/domain-name-search/?q={quote(domain)}",
-                        "current_bid": float(estimated_price),
-                        "currency": "USD",
-                        "min_bid": None,
-                        "buy_now_price": None,
-                        "reserve_price": None,
-                        "reserve_met": None,
-                        "num_bids": 0,
-                        "num_watchers": None,
-                        "end_time": datetime.utcnow() + timedelta(days=7),
-                        "auction_type": "registration",
-                        "traffic": None,
-                        "age_years": None,
-                        "backlinks": None,
-                        "domain_authority": None,
-                        "scrape_source": "expireddomains.net",
-                    }
-
-                    status = await self._store_auction(db, auction_data)
-                    result["found"] += 1
-                    result[status] += 1
+                    soup = BeautifulSoup(response.text, "lxml")
+                    domain_rows = soup.select("table.base1 tbody tr")
+
+                    for row in domain_rows[:50]:  # 50 per page
+                        try:
+                            cols = row.find_all("td")
+                            if len(cols) < 3:
+                                continue
+
+                            domain_link = cols[0].find("a")
+                            if not domain_link:
+                                continue
+
+                            domain_text = domain_link.get_text(strip=True)
+                            if not domain_text or "." not in domain_text:
+                                continue
+
+                            domain = domain_text.lower()
+
+                            # Skip if already seen
+                            if domain in seen_domains:
+                                continue
+                            seen_domains.add(domain)
+
+                            tld = domain.rsplit(".", 1)[-1]
+                            estimated_price = base_prices.get(tld, 15)
+
+                            # Try to extract age/backlinks from other columns
+                            age_years = None
+                            backlinks = None
+                            domain_authority = None
+
+                            if len(cols) >= 5:
+                                try:
+                                    # BL column (backlinks)
+                                    bl_text = cols[3].get_text(strip=True)
+                                    if bl_text and bl_text.isdigit():
+                                        backlinks = int(bl_text)
+                                except:
+                                    pass
+                                try:
+                                    # ABY column (archive.org age)
+                                    age_text = cols[4].get_text(strip=True)
+                                    if age_text and age_text.isdigit():
+                                        age_years = int(age_text)
+                                except:
+                                    pass
+
+                            auction_data = {
+                                "domain": domain,
+                                "tld": tld,
+                                "platform": platform,
+                                "platform_auction_id": None,
+                                "auction_url": f"https://www.expireddomains.net/domain-name-search/?q={quote(domain)}",
+                                "current_bid": float(estimated_price),
+                                "currency": "USD",
+                                "min_bid": None,
+                                "buy_now_price": None,
+                                "reserve_price": None,
+                                "reserve_met": None,
+                                "num_bids": 0,
+                                "num_watchers": None,
+                                "end_time": datetime.utcnow() + timedelta(days=7),
+                                "auction_type": "registration",
+                                "traffic": None,
+                                "age_years": age_years,
+                                "backlinks": backlinks,
+                                "domain_authority": domain_authority,
+                                "scrape_source": "expireddomains.net",
+                            }
+
+                            status = await self._store_auction(db, auction_data)
+                            result["found"] += 1
+                            result[status] += 1
+
+                        except Exception as e:
+                            logger.debug(f"Error parsing row: {e}")
+                            continue
+
                 except Exception as e:
-                    logger.debug(f"Error parsing row: {e}")
+                    logger.debug(f"Error fetching {url}: {e}")
                     continue

             await db.commit()
@@ -384,6 +446,8 @@ class AuctionScraperService:
             log.auctions_updated = result["updated"]
             await db.commit()

+            logger.info(f"✅ ExpiredDomains: {result['found']} domains found")
+
         except Exception as e:
             log.completed_at = datetime.utcnow()
             log.status = "failed"
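Aside (not part of the patch): the hunk above boils down to a fetch-parse-dedupe loop over several listing pages, with a TLD-based price fallback. Below is a minimal, self-contained sketch of that row-parsing and dedupe logic, run against a hard-coded HTML fragment; the fragment and its column layout are illustrative assumptions, not ExpiredDomains' actual markup.

# Standalone sketch of the row-parsing + dedupe pattern from the hunk above.
from bs4 import BeautifulSoup

BASE_PRICES = {"com": 12, "io": 50, "ai": 80}  # subset of the patch's base_prices
HTML = """
<table class="base1"><tbody>
  <tr><td><a href="#">example.com</a></td><td>-</td><td>-</td><td>42</td><td>9</td></tr>
  <tr><td><a href="#">example.com</a></td><td>-</td><td>-</td><td>42</td><td>9</td></tr>
  <tr><td><a href="#">startup.io</a></td><td>-</td><td>-</td><td></td><td></td></tr>
</tbody></table>
"""

seen, parsed = set(), []
for row in BeautifulSoup(HTML, "html.parser").select("table.base1 tbody tr"):
    cols = row.find_all("td")
    link = cols[0].find("a") if cols else None
    domain = link.get_text(strip=True).lower() if link else ""
    if not domain or "." not in domain or domain in seen:
        continue  # skip malformed rows and duplicates seen on earlier pages
    seen.add(domain)
    tld = domain.rsplit(".", 1)[-1]
    bl_text = cols[3].get_text(strip=True) if len(cols) >= 5 else ""
    parsed.append({
        "domain": domain,
        "tld": tld,
        "estimated_price": BASE_PRICES.get(tld, 15),  # TLD-based fallback price
        "backlinks": int(bl_text) if bl_text.isdigit() else None,
    })

print(parsed)  # example.com appears once; startup.io gets the $50 .io estimate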
diff --git a/backend/app/services/hidden_api_scrapers.py b/backend/app/services/hidden_api_scrapers.py
index 03a7e84..01f5fbb 100644
--- a/backend/app/services/hidden_api_scrapers.py
+++ b/backend/app/services/hidden_api_scrapers.py
@@ -582,6 +582,157 @@ class GoDaddyApiScraper:
             return {"items": [], "total": 0, "error": str(e)}


+# ═══════════════════════════════════════════════════════════════════════════════
+# GODADDY RSS SCRAPER — Public RSS Feed (NO Cloudflare!)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+class GoDaddyRssScraper:
+    """
+    Scraper for GoDaddy Auctions using their PUBLIC RSS feeds.
+
+    These RSS feeds are NOT protected by Cloudflare and always work!
+
+    Feeds:
+    - https://auctions.godaddy.com/rss/ending.aspx (Ending Soon)
+    - https://auctions.godaddy.com/rss/new.aspx (New Auctions)
+    - https://auctions.godaddy.com/rss/closeouts.aspx (Closeouts)
+    """
+
+    RSS_FEEDS = {
+        "ending": "https://auctions.godaddy.com/rss/ending.aspx",
+        "new": "https://auctions.godaddy.com/rss/new.aspx",
+        "closeouts": "https://auctions.godaddy.com/rss/closeouts.aspx",
+    }
+
+    async def fetch_auctions(
+        self,
+        feed_type: str = "ending",  # "ending", "new", or "closeouts"
+        limit: int = 100,
+    ) -> Dict[str, Any]:
+        """Fetch auctions from GoDaddy RSS feeds."""
+        try:
+            import xml.etree.ElementTree as ET
+
+            feed_url = self.RSS_FEEDS.get(feed_type, self.RSS_FEEDS["ending"])
+
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response = await client.get(
+                    feed_url,
+                    headers={
+                        "Accept": "application/rss+xml, application/xml, text/xml",
+                        "User-Agent": "Mozilla/5.0 (compatible; PounceBot/1.0; +https://pounce.ch)",
+                    },
+                )
+
+            if response.status_code != 200:
+                logger.error(f"GoDaddy RSS error: {response.status_code}")
+                return {"items": [], "total": 0, "error": f"HTTP {response.status_code}"}
+
+            # Parse RSS XML
+            root = ET.fromstring(response.text)
+
+            # Find all items in the RSS feed
+            items = root.findall(".//item")
+
+            transformed = []
+            for item in items[:limit]:
+                try:
+                    title = item.find("title").text if item.find("title") is not None else ""
+                    link = item.find("link").text if item.find("link") is not None else ""
+                    description = item.find("description").text if item.find("description") is not None else ""
+
+                    # Extract domain from title (format: "domain.com - $XX")
+                    domain = ""
+                    price = 0
+
+                    if title:
+                        # Title format: "example.com - $12" or "example.com"
+                        parts = title.split(" - ")
+                        domain = parts[0].strip().lower()
+
+                        if len(parts) > 1:
+                            price_str = parts[1].replace("$", "").replace(",", "").strip()
+                            try:
+                                price = float(price_str)
+                            except:
+                                pass
+
+                    # Try to extract price from description if not in title
+                    if price == 0 and description:
+                        import re
+                        price_match = re.search(r'\$([0-9,]+(?:\.[0-9]+)?)', description)
+                        if price_match:
+                            price = float(price_match.group(1).replace(",", ""))
+
+                    if not domain or "." not in domain:
+                        continue
+
+                    tld = domain.rsplit(".", 1)[-1]
+
+                    # Add affiliate param to link
+                    affiliate_url = link
+                    if link and "?" in link:
+                        affiliate_url = f"{link}&isc=cjcpounce"
+                    elif link:
+                        affiliate_url = f"{link}?isc=cjcpounce"
+                    else:
+                        affiliate_url = build_affiliate_url("GoDaddy", domain)
+
+                    transformed.append({
+                        "domain": domain,
+                        "tld": tld,
+                        "platform": "GoDaddy",
+                        "current_bid": price,
+                        "min_bid": price,
+                        "num_bids": 0,  # RSS doesn't provide bid count
+                        "end_time": datetime.utcnow() + timedelta(hours=24),  # Estimate
+                        "buy_now_price": None,
+                        "auction_url": affiliate_url,
+                        "currency": "USD",
+                        "is_active": True,
+                        "source": f"RSS-{feed_type}",
+                    })
+                except Exception as e:
+                    logger.warning(f"Error parsing GoDaddy RSS item: {e}")
+                    continue
+
+            logger.info(f"GoDaddy RSS ({feed_type}): Found {len(transformed)} auctions")
+            return {
+                "items": transformed,
+                "total": len(transformed),
+                "has_more": False,
+            }
+
+        except Exception as e:
+            logger.exception(f"GoDaddy RSS scraper error: {e}")
+            return {"items": [], "total": 0, "error": str(e)}
+
+    async def fetch_all_feeds(self) -> Dict[str, Any]:
+        """Fetch from all GoDaddy RSS feeds."""
+        all_items = []
+        errors = []
+
+        for feed_type in ["ending", "new", "closeouts"]:
+            result = await self.fetch_auctions(feed_type=feed_type, limit=50)
+            all_items.extend(result.get("items", []))
+            if result.get("error"):
+                errors.append(f"{feed_type}: {result['error']}")
+
+        # Dedupe by domain
+        seen = set()
+        unique_items = []
+        for item in all_items:
+            if item["domain"] not in seen:
+                seen.add(item["domain"])
+                unique_items.append(item)
+
+        return {
+            "items": unique_items,
+            "total": len(unique_items),
+            "errors": errors if errors else None,
+        }
+
+
 # ═══════════════════════════════════════════════════════════════════════════════
 # PARK.IO SCRAPER — Backorder Service API
 # ═══════════════════════════════════════════════════════════════════════════════
@@ -857,6 +1008,7 @@ class HiddenApiScraperService:
         self.dynadot = DynadotApiScraper()
         self.sav = SavApiScraper()
         self.godaddy = GoDaddyApiScraper()
+        self.godaddy_rss = GoDaddyRssScraper()  # RSS fallback (NO Cloudflare!)
         self.parkio = ParkIoApiScraper()
         self.namejet = NameJetApiScraper()

@@ -873,25 +1025,46 @@ class HiddenApiScraperService:
             "items": [],
         }

+        # ═══════════════════════════════════════════════════════════
+        # TIER 0: RSS Feeds (Most Reliable - NO Cloudflare!)
+        # ═══════════════════════════════════════════════════════════
+
+        # Scrape GoDaddy RSS (Always works!)
+        try:
+            rss_data = await self.godaddy_rss.fetch_all_feeds()
+            rss_count = len(rss_data.get("items", []))
+            if rss_count > 0:
+                results["platforms"]["GoDaddy-RSS"] = {
+                    "found": rss_count,
+                    "total": rss_count,
+                }
+                results["items"].extend(rss_data.get("items", []))
+                results["total_found"] += rss_count
+                logger.info(f"✅ GoDaddy RSS: {rss_count} auctions")
+        except Exception as e:
+            results["errors"].append(f"GoDaddy-RSS: {str(e)}")
+
         # ═══════════════════════════════════════════════════════════
         # TIER 1: Most Reliable JSON APIs
         # ═══════════════════════════════════════════════════════════

-        # Scrape GoDaddy (NEW - Most reliable!)
+        # Scrape GoDaddy JSON API (may have Cloudflare issues)
         try:
             godaddy_data = await self.godaddy.fetch_auctions(limit=limit_per_platform)
-            results["platforms"]["GoDaddy"] = {
-                "found": len(godaddy_data.get("items", [])),
-                "total": godaddy_data.get("total", 0),
-            }
-            results["items"].extend(godaddy_data.get("items", []))
-            results["total_found"] += len(godaddy_data.get("items", []))
+            godaddy_count = len(godaddy_data.get("items", []))
+            if godaddy_count > 0:
+                results["platforms"]["GoDaddy-API"] = {
+                    "found": godaddy_count,
+                    "total": godaddy_data.get("total", 0),
+                }
+                results["items"].extend(godaddy_data.get("items", []))
+                results["total_found"] += godaddy_count
             if godaddy_data.get("error"):
-                results["errors"].append(f"GoDaddy: {godaddy_data['error']}")
+                results["errors"].append(f"GoDaddy-API: {godaddy_data['error'][:100]}")
         except Exception as e:
-            results["errors"].append(f"GoDaddy: {str(e)}")
+            results["errors"].append(f"GoDaddy-API: {str(e)[:100]}")

         # Scrape Dynadot
         try:
@@ -989,6 +1162,7 @@ namecheap_scraper = NamecheapApiScraper()
 dynadot_scraper = DynadotApiScraper()
 sav_scraper = SavApiScraper()
 godaddy_scraper = GoDaddyApiScraper()
+godaddy_rss_scraper = GoDaddyRssScraper()  # RSS fallback (always works!)
 parkio_scraper = ParkIoApiScraper()
 namejet_scraper = NameJetApiScraper()
 hidden_api_scraper = HiddenApiScraperService()
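Aside (not part of the patch): the core of GoDaddyRssScraper is pulling the domain and price out of each RSS <item> title (the "domain.com - $XX" format the scraper assumes) and tagging the auction link with the isc=cjcpounce affiliate parameter. The self-contained sketch below reproduces that extraction against a fabricated two-item feed; the feed content is invented purely for illustration.

# Standalone sketch of the RSS title/price extraction used above.
import re
import xml.etree.ElementTree as ET

RSS = """<rss><channel>
  <item><title>example.com - $105</title><link>https://auctions.godaddy.com/trpItemListing.aspx?miid=1</link></item>
  <item><title>nobid.net</title><description>Current price: $7</description></item>
</channel></rss>"""

for item in ET.fromstring(RSS).findall(".//item"):
    title = item.findtext("title", default="")
    link = item.findtext("link", default="")
    description = item.findtext("description", default="")

    parts = title.split(" - ")          # "example.com - $105" -> domain + price
    domain = parts[0].strip().lower()
    price = 0.0
    if len(parts) > 1:
        try:
            price = float(parts[1].replace("$", "").replace(",", "").strip())
        except ValueError:
            pass
    if price == 0 and description:      # fall back to a price in the description
        match = re.search(r"\$([0-9,]+(?:\.[0-9]+)?)", description)
        if match:
            price = float(match.group(1).replace(",", ""))

    # Tag the auction URL with the affiliate parameter, as the scraper does.
    if link and "?" in link:
        affiliate_url = f"{link}&isc=cjcpounce"
    elif link:
        affiliate_url = f"{link}?isc=cjcpounce"
    else:
        affiliate_url = ""              # the real scraper falls back to build_affiliate_url()

    print(domain, price, affiliate_url)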
diff --git a/backend/scripts/scrape_auctions.py b/backend/scripts/scrape_auctions.py
new file mode 100644
index 0000000..6c95c6d
--- /dev/null
+++ b/backend/scripts/scrape_auctions.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Automated Auction Scraper Script
+
+This script runs all auction scrapers and saves results to the database.
+Designed to be run via cron job every 30 minutes.
+
+Usage:
+    python scripts/scrape_auctions.py
+
+Cron example (every 30 minutes):
+    */30 * * * * cd /home/user/pounce/backend && ./venv/bin/python scripts/scrape_auctions.py >> /var/log/pounce/scraper.log 2>&1
+"""
+
+import sys
+import os
+import asyncio
+import logging
+from datetime import datetime
+from pathlib import Path
+
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from app.services.auction_scraper import auction_scraper
+from app.database import AsyncSessionLocal
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+async def run_scrapers():
+    """Run all auction scrapers."""
+    start_time = datetime.utcnow()
+    logger.info(f"🚀 Starting auction scrape at {start_time.isoformat()}")
+
+    try:
+        async with AsyncSessionLocal() as db:
+            result = await auction_scraper.scrape_all_platforms(db)
+
+            # Log results
+            total_found = result.get("total_found", 0)
+            total_new = result.get("total_new", 0)
+
+            logger.info(f"✅ Scrape complete!")
+            logger.info(f"   Total Found: {total_found}")
+            logger.info(f"   New Added: {total_new}")
+
+            # Log platform breakdown
+            platforms = result.get("platforms", {})
+            for platform, data in platforms.items():
+                if isinstance(data, dict) and data.get("found", 0) > 0:
+                    logger.info(f"   {platform}: {data.get('found', 0)} found, {data.get('new', 0)} new")
+
+            # Log errors (but don't fail)
+            errors = result.get("errors", [])
+            if errors:
+                logger.warning(f"⚠️ {len(errors)} errors occurred:")
+                for err in errors[:5]:
+                    logger.warning(f"   - {str(err)[:100]}")
+
+            elapsed = (datetime.utcnow() - start_time).total_seconds()
+            logger.info(f"⏱️ Completed in {elapsed:.1f} seconds")
+
+            return result
+
+    except Exception as e:
+        logger.exception(f"❌ Scrape failed: {e}")
+        return {"error": str(e)}
+
+
+async def cleanup_old_auctions():
+    """Remove expired/old auctions from database."""
+    try:
+        async with AsyncSessionLocal() as db:
+            from sqlalchemy import delete, and_
+            from datetime import timedelta
+            from app.models.auction import DomainAuction
+
+            cutoff = datetime.utcnow() - timedelta(days=7)
+
+            # Mark expired auctions as inactive
+            from sqlalchemy import update
+            stmt = update(DomainAuction).where(
+                and_(
+                    DomainAuction.end_time < datetime.utcnow(),
+                    DomainAuction.is_active == True
+                )
+            ).values(is_active=False)
+
+            result = await db.execute(stmt)
+            await db.commit()
+
+            if result.rowcount > 0:
+                logger.info(f"🧹 Marked {result.rowcount} expired auctions as inactive")
+
+    except Exception as e:
+        logger.warning(f"Cleanup error: {e}")
+
+
+def main():
+    """Main entry point."""
+    print("="*60)
+    print(f"🐾 POUNCE Auction Scraper")
+    print(f"   Started: {datetime.now().isoformat()}")
+    print("="*60)
+
+    # Run scrapers
+    result = asyncio.run(run_scrapers())
+
+    # Run cleanup
+    asyncio.run(cleanup_old_auctions())
+
+    print("="*60)
+    print(f"✅ Done!")
+    print("="*60)
+
+    # Exit with error code if no results
+    if result.get("error") or result.get("total_found", 0) == 0:
+        sys.exit(1)
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
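Aside (not part of the patch): before relying on the 30-minute cron job, the new RSS source can be sanity-checked ad hoc with a few lines of Python. The sketch below uses only names introduced in this patch (godaddy_rss_scraper and fetch_all_feeds, plus the result keys the scraper returns) and assumes it is run from the backend/ directory with the project's dependencies installed.

# Standalone sketch: quick manual preview of the GoDaddy RSS scraper output.
import asyncio

from app.services.hidden_api_scrapers import godaddy_rss_scraper


async def preview() -> None:
    data = await godaddy_rss_scraper.fetch_all_feeds()
    print(f"{data['total']} unique domains across the three feeds")
    for item in data["items"][:5]:
        print(f"  {item['domain']:30s} ${item['current_bid']:<8} {item['source']}")
    if data.get("errors"):
        print("errors:", data["errors"])


if __name__ == "__main__":
    asyncio.run(preview())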