- Add GoDaddy RSS Feed scraper (bypasses Cloudflare) - Enhanced ExpiredDomains scraper (multiple pages, TLDs) - Improved hidden API scrapers integration - Add automated scraper cron script (runs every 30 min) - Playwright stealth mode installed on server Sources now working: - Dynadot REST API: ~100 auctions - GoDaddy RSS: ~100 auctions - ExpiredDomains: ~250 auctions Total: 467 auctions in database
132 lines
3.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Automated Auction Scraper Script
|
|
|
|
This script runs all auction scrapers and saves results to the database.
|
|
Designed to be run via cron job every 30 minutes.
|
|
|
|
Usage:
|
|
python scripts/scrape_auctions.py
|
|
|
|
Cron example (every 30 minutes):
|
|
*/30 * * * * cd /home/user/pounce/backend && ./venv/bin/python scripts/scrape_auctions.py >> /var/log/pounce/scraper.log 2>&1
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import asyncio
|
|
import logging
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from app.services.auction_scraper import auction_scraper
|
|
from app.database import AsyncSessionLocal
|
|
|
|
# Configure logging: timestamped, module-tagged lines so the cron output
# (redirected to /var/log/pounce/scraper.log) stays greppable.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger, named after this script's module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def run_scrapers():
    """Run all auction scrapers and log a summary of the results.

    Opens one database session, delegates to
    ``auction_scraper.scrape_all_platforms`` and logs totals, a
    per-platform breakdown, and up to five errors.

    Returns:
        dict: The scraper's result payload on success, or
        ``{"error": str(e)}`` if the scrape raised.
    """
    start_time = datetime.utcnow()
    # Lazy %-style args (not f-strings) so logging formats only when emitted.
    logger.info("🚀 Starting auction scrape at %s", start_time.isoformat())

    try:
        async with AsyncSessionLocal() as db:
            result = await auction_scraper.scrape_all_platforms(db)

            # Missing keys default to 0 so a partial payload never raises here.
            total_found = result.get("total_found", 0)
            total_new = result.get("total_new", 0)

            logger.info("✅ Scrape complete!")
            logger.info("   Total Found: %s", total_found)
            logger.info("   New Added: %s", total_new)

            # Per-platform breakdown: report only platforms that found auctions.
            platforms = result.get("platforms", {})
            for platform, data in platforms.items():
                if isinstance(data, dict) and data.get("found", 0) > 0:
                    logger.info(
                        "   %s: %s found, %s new",
                        platform,
                        data.get("found", 0),
                        data.get("new", 0),
                    )

            # Errors are logged but never fail the run — a partial scrape
            # is still useful.
            errors = result.get("errors", [])
            if errors:
                logger.warning("⚠️ %d errors occurred:", len(errors))
                for err in errors[:5]:  # cap the noise at five entries
                    logger.warning("   - %s", str(err)[:100])

            elapsed = (datetime.utcnow() - start_time).total_seconds()
            logger.info("⏱️ Completed in %.1f seconds", elapsed)

            return result

    except Exception as e:
        # Broad catch is deliberate: this is the top-level boundary of a
        # cron job; log the traceback and return an error payload instead
        # of crashing the whole run.
        logger.exception("❌ Scrape failed: %s", e)
        return {"error": str(e)}
|
|
|
|
|
|
async def cleanup_old_auctions():
    """Mark auctions whose end time has passed as inactive.

    Issues a single bulk UPDATE; any failure is logged as a warning and
    swallowed so a failed cleanup never breaks the scrape run.

    NOTE(review): the original computed a 7-day ``cutoff`` and imported
    ``delete`` but never used either — the hard delete of week-old rows
    was apparently never implemented; only the inactive-flagging remains.
    """
    try:
        # Local imports: only needed here, and they pull in the ORM model.
        from sqlalchemy import and_, update
        from app.models.auction import DomainAuction

        async with AsyncSessionLocal() as db:
            stmt = (
                update(DomainAuction)
                .where(
                    and_(
                        DomainAuction.end_time < datetime.utcnow(),
                        # .is_(True) is the SQLAlchemy-idiomatic spelling
                        # of the original `== True` comparison.
                        DomainAuction.is_active.is_(True),
                    )
                )
                .values(is_active=False)
            )

            result = await db.execute(stmt)
            await db.commit()

            if result.rowcount > 0:
                logger.info(
                    "🧹 Marked %d expired auctions as inactive", result.rowcount
                )

    except Exception as e:
        # Best-effort by design: cleanup failures are non-fatal.
        logger.warning("Cleanup error: %s", e)
|
|
|
|
|
|
def main():
    """CLI entry point: run all scrapers, clean up, then exit.

    Exit status:
        0 — scrape succeeded and found at least one auction.
        1 — scrape raised, or found nothing (lets cron monitoring alert).
    """
    banner = "=" * 60
    print(banner)
    print("🐾 POUNCE Auction Scraper")
    print(f"   Started: {datetime.now().isoformat()}")
    print(banner)

    # Run scrapers (each asyncio.run gets its own short-lived event loop).
    result = asyncio.run(run_scrapers())

    # Flag expired auctions as inactive; best-effort, never fatal.
    asyncio.run(cleanup_old_auctions())

    print(banner)
    print("✅ Done!")
    print(banner)

    # Exit non-zero when nothing was scraped so cron monitoring notices.
    if result.get("error") or result.get("total_found", 0) == 0:
        sys.exit(1)

    sys.exit(0)
|
|
|
|
|
|
# Script entry point: run only when executed directly, so the module can
# also be imported without side effects.
if __name__ == "__main__":
    main()
|
|
|