pounce/backend/scripts/scrape_auctions.py
fix(scraping): real auctions only + cleanup
- Remove seed/demo auction endpoint + scripts (no mock data)
- Rebuild AuctionScraper: strict validation (no -- bids, requires end_time)
- Add robust sources:
  - ExpiredDomains provider auction pages (GoDaddy/Namecheap/Sedo)
  - Park.io auctions table
  - Sav load_domains_ajax table
- Simplify hidden API scrapers to Dynadot only
- Add unique index on (platform, domain) + safe upsert
- Update deployment/docs to reflect real scraping
2025-12-11 21:50:33 +01:00
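
The "safe upsert" mentioned above is presumably implemented in the scraper service (app/services/auction_scraper.py), not in this script. A minimal sketch of an upsert keyed on (platform, domain), using SQLAlchemy's dialect-specific insert and assuming the DomainAuction model referenced below (current_bid is an illustrative column name):

    # Hypothetical sketch only; the actual upsert in auction_scraper.py may differ.
    from sqlalchemy.dialects.sqlite import insert  # use sqlalchemy.dialects.postgresql on Postgres

    from app.models.auction import DomainAuction


    async def upsert_auction(db, row: dict) -> None:
        # Insert a scraped auction, or refresh the existing (platform, domain) row
        # instead of creating a duplicate (relies on ux_auctions_platform_domain).
        stmt = insert(DomainAuction).values(**row)
        stmt = stmt.on_conflict_do_update(
            index_elements=["platform", "domain"],
            set_={
                "current_bid": stmt.excluded.current_bid,  # assumed column name
                "end_time": stmt.excluded.end_time,
                "is_active": True,
            },
        )
        await db.execute(stmt)
        await db.commit()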


#!/usr/bin/env python3
"""
Automated Auction Scraper Script

This script runs all auction scrapers and saves results to the database.
Designed to be run via cron job every 30 minutes.

Usage:
    python scripts/scrape_auctions.py

Cron example (every 30 minutes):
    */30 * * * * cd /home/user/pounce/backend && ./venv/bin/python scripts/scrape_auctions.py >> /var/log/pounce/scraper.log 2>&1
"""
import sys
import os
import asyncio
import logging
from datetime import datetime
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from app.services.auction_scraper import auction_scraper
from app.database import AsyncSessionLocal

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


async def ensure_auction_uniqueness():
    """
    Ensure we have a unique index on (platform, domain) and clean duplicates once.

    This prevents duplicate rows when the scraper runs repeatedly (cron) and when
    the session uses autoflush=False.
    """
    from sqlalchemy import text
    from app.config import get_settings

    settings = get_settings()
    db_url = settings.database_url or ""

    async with AsyncSessionLocal() as db:
        # Best-effort de-duplication (SQLite only).
        if db_url.startswith("sqlite"):
            await db.execute(
                text(
                    """
                    DELETE FROM domain_auctions
                    WHERE id NOT IN (
                        SELECT MAX(id) FROM domain_auctions GROUP BY platform, domain
                    )
                    """
                )
            )
            await db.commit()

        # Create unique index (works for SQLite and Postgres).
        await db.execute(
            text(
                "CREATE UNIQUE INDEX IF NOT EXISTS ux_auctions_platform_domain ON domain_auctions(platform, domain)"
            )
        )
        await db.commit()


async def run_scrapers():
    """Run all auction scrapers."""
    start_time = datetime.utcnow()
    logger.info(f"🚀 Starting auction scrape at {start_time.isoformat()}")

    try:
        async with AsyncSessionLocal() as db:
            result = await auction_scraper.scrape_all_platforms(db)

        # Log results
        total_found = result.get("total_found", 0)
        total_new = result.get("total_new", 0)
        logger.info("✅ Scrape complete!")
        logger.info(f"   Total Found: {total_found}")
        logger.info(f"   New Added: {total_new}")

        # Log platform breakdown
        platforms = result.get("platforms", {})
        for platform, data in platforms.items():
            if isinstance(data, dict) and data.get("found", 0) > 0:
                logger.info(f"   {platform}: {data.get('found', 0)} found, {data.get('new', 0)} new")

        # Log errors (but don't fail)
        errors = result.get("errors", [])
        if errors:
            logger.warning(f"⚠️ {len(errors)} errors occurred:")
            for err in errors[:5]:
                logger.warning(f"   - {str(err)[:100]}")

        elapsed = (datetime.utcnow() - start_time).total_seconds()
        logger.info(f"⏱️ Completed in {elapsed:.1f} seconds")
        return result

    except Exception as e:
        logger.exception(f"❌ Scrape failed: {e}")
        return {"error": str(e)}


async def cleanup_old_auctions():
    """Remove expired/old auctions from database."""
    try:
        async with AsyncSessionLocal() as db:
            from sqlalchemy import delete, and_
            from datetime import timedelta
            from app.models.auction import DomainAuction

            cutoff = datetime.utcnow() - timedelta(days=7)

            # Mark expired auctions as inactive
            from sqlalchemy import update
            stmt = update(DomainAuction).where(
                and_(
                    DomainAuction.end_time < datetime.utcnow(),
                    DomainAuction.is_active == True
                )
            ).values(is_active=False)
            result = await db.execute(stmt)
            await db.commit()

            if result.rowcount > 0:
                logger.info(f"🧹 Marked {result.rowcount} expired auctions as inactive")
    except Exception as e:
        logger.warning(f"Cleanup error: {e}")


def main():
    """Main entry point."""
    print("="*60)
    print(f"🐾 POUNCE Auction Scraper")
    print(f"   Started: {datetime.now().isoformat()}")
    print("="*60)

    # Ensure DB uniqueness constraints
    asyncio.run(ensure_auction_uniqueness())

    # Run scrapers
    result = asyncio.run(run_scrapers())

    # Run cleanup
    asyncio.run(cleanup_old_auctions())

    print("="*60)
    print(f"✅ Done!")
    print("="*60)

    # Exit with error code if no results
    if result.get("error") or result.get("total_found", 0) == 0:
        sys.exit(1)
    sys.exit(0)


if __name__ == "__main__":
    main()
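
For reference, the shape of the result dict this script consumes from scrape_all_platforms() can be inferred from the .get() calls above; the platform names and numbers here are purely illustrative:

    example_result = {
        "total_found": 42,
        "total_new": 7,
        "platforms": {
            "dynadot": {"found": 30, "new": 5},
            "park_io": {"found": 12, "new": 2},
        },
        "errors": ["Sav: request timed out"],
    }

Note that main() exits with status 1 when total_found is 0 (or when an error is returned), so an empty scrape surfaces as a failed cron run in the logs.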