fix(scraping): correct timezones + no ended auctions in API
- Treat Park.io close_date as America/New_York (configurable) and convert to UTC (sketched below)
- Ensure /stats, /scrape-status, /opportunities only count not-ended auctions
- Make NameJet Playwright scraper strict: requires real end_time + price
- Add Playwright proxy pool support (SCRAPER_PLAYWRIGHT_PROXY_POOL)
- Simplify protected scraping to NameJet only
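The headline fix is the first bullet: Park.io renders close times as naive wall-clock strings, which the old code read as UTC. A minimal sketch of the corrected conversion, using only the PARKIO_TIMEZONE variable and timestamp format that appear in the diff below; the helper name and sample values are illustrative, not part of the commit:

import os
from datetime import datetime
from zoneinfo import ZoneInfo

def parkio_close_to_utc(close_raw: str) -> datetime:
    # Interpret the naive timestamp in the configured zone, then store naive UTC.
    tz = ZoneInfo(os.getenv("PARKIO_TIMEZONE", "America/New_York"))
    local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
    return local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)

# The same wall-clock string maps to different UTC instants across DST,
# which is why a zone name beats a fixed offset:
assert parkio_close_to_utc("2025-01-15 15:00:00") == datetime(2025, 1, 15, 20, 0)  # EST, UTC-5
assert parkio_close_to_utc("2025-07-15 15:00:00") == datetime(2025, 7, 15, 19, 0)  # EDT, UTC-4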
@@ -497,6 +497,7 @@ async def get_platform_stats(
 
     Data is scraped from public auction sites - no mock data.
     """
+    now = datetime.utcnow()
     # Get stats per platform
     stats_query = (
         select(
@@ -504,7 +505,12 @@ async def get_platform_stats(
             func.count(DomainAuction.id).label("count"),
             func.avg(DomainAuction.current_bid).label("avg_bid"),
         )
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .group_by(DomainAuction.platform)
     )
 
@@ -512,7 +518,7 @@ async def get_platform_stats(
     platform_data = result.all()
 
     # Get ending soon counts
-    cutoff = datetime.utcnow() + timedelta(hours=1)
+    cutoff = now + timedelta(hours=1)
     ending_query = (
         select(
             DomainAuction.platform,
@@ -521,6 +527,7 @@ async def get_platform_stats(
         .where(
             and_(
                 DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
                 DomainAuction.end_time <= cutoff,
             )
         )
@@ -548,6 +555,7 @@ async def get_scrape_status(
     db: AsyncSession = Depends(get_db),
 ):
     """Get status of auction scraping."""
+    now = datetime.utcnow()
     # Get last successful scrape
     last_scrape_query = (
         select(AuctionScrapeLog)
@@ -559,7 +567,12 @@ async def get_scrape_status(
     last_log = result.scalar_one_or_none()
 
     # Get total auctions
-    total_query = select(func.count(DomainAuction.id)).where(DomainAuction.is_active == True)
+    total_query = select(func.count(DomainAuction.id)).where(
+        and_(
+            DomainAuction.is_active == True,
+            DomainAuction.end_time > now,
+        )
+    )
     total_result = await db.execute(total_query)
     total = total_result.scalar() or 0
 
@@ -615,9 +628,15 @@ async def get_smart_opportunities(
     Opportunity Score = time_urgency × competition_factor × price_factor
     """
    # Get active auctions
+    now = datetime.utcnow()
     query = (
         select(DomainAuction)
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .order_by(DomainAuction.end_time.asc())
         .limit(100)
     )
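The three endpoint hunks above add the same two-condition predicate. A self-contained sketch of the pattern; DomainAuction here is a trimmed stand-in for the real model in app.models.auction, and the not_ended() helper is illustrative, not in the commit:

from datetime import datetime
from sqlalchemy import Boolean, DateTime, Integer, String, and_, select
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

class Base(DeclarativeBase):
    pass

class DomainAuction(Base):
    __tablename__ = "domain_auctions"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    platform: Mapped[str] = mapped_column(String)
    is_active: Mapped[bool] = mapped_column(Boolean)
    end_time: Mapped[datetime] = mapped_column(DateTime)

def not_ended(now: datetime):
    # end_time is stored as naive UTC, so comparing against utcnow() stays consistent.
    return and_(DomainAuction.is_active == True, DomainAuction.end_time > now)  # noqa: E712

now = datetime.utcnow()
query = select(DomainAuction).where(not_ended(now)).order_by(DomainAuction.end_time.asc())
print(query)  # compiled SELECT contains both conditions

Capturing utcnow() once and reusing it, as the hunks do with `now`, also keeps the "not ended" and "ending soon" windows consistent within a single request.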
@@ -32,6 +32,7 @@ from bs4 import BeautifulSoup
 from sqlalchemy import and_, delete, select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.asyncio import AsyncSession
+from zoneinfo import ZoneInfo
 
 from app.models.auction import AuctionScrapeLog, DomainAuction
 from app.services.dropcatch_api import dropcatch_client
@@ -526,8 +527,12 @@ class AuctionScraperService:
 
             close_raw = cols[4].get_text(" ", strip=True)
             try:
-                # Park.io displays a naive timestamp. We treat it as UTC.
-                end_time = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S")
+                # Park.io displays a naive timestamp in their platform timezone.
+                # Default timezone is America/New_York (configurable).
+                tz_name = os.getenv("PARKIO_TIMEZONE", "America/New_York")
+                tz = ZoneInfo(tz_name)
+                local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
+                end_time = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
             except Exception:
                 continue
 
@@ -29,10 +29,12 @@ import json
 import logging
 import os
 import random
+import re
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
 from pathlib import Path
 from urllib.parse import urlparse
+from zoneinfo import ZoneInfo
 
 logger = logging.getLogger(__name__)
 
@@ -89,10 +91,20 @@ class PlaywrightScraperService:
         try:
             self.playwright = await async_playwright().start()
 
+            # Proxy selection:
+            # - SCRAPER_PLAYWRIGHT_PROXY: single proxy URL
+            # - SCRAPER_PLAYWRIGHT_PROXY_POOL: comma-separated proxy URLs, one is chosen randomly per browser start
+            proxy_pool_raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
+            proxy_pool = [p.strip() for p in proxy_pool_raw.split(",") if p.strip()] if proxy_pool_raw else []
+
             proxy_url = (
-                os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
-                or os.getenv("SCRAPER_PROXY_URL")
-                or os.getenv("SCRAPER_HTTP_PROXY")
+                random.choice(proxy_pool)
+                if proxy_pool
+                else (
+                    os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
+                    or os.getenv("SCRAPER_PROXY_URL")
+                    or os.getenv("SCRAPER_HTTP_PROXY")
+                )
             )
             proxy_config = None
             if proxy_url:
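The hunk above only chooses proxy_url; turning it into Playwright's launch argument happens below the excerpt, in the `if proxy_url:` branch. A plausible standalone sketch, assuming Playwright's documented proxy dict with "server"/"username"/"password" keys; the helper name is illustrative:

import os
import random
from typing import Optional
from urllib.parse import urlparse

def pick_proxy_config() -> Optional[dict]:
    # Pool takes priority; fall back to the single-proxy variable.
    pool_raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
    pool = [p.strip() for p in pool_raw.split(",") if p.strip()]
    url = random.choice(pool) if pool else os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
    if not url:
        return None
    parsed = urlparse(url)
    config = {"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}"}
    if parsed.username:
        config["username"] = parsed.username
    if parsed.password:
        config["password"] = parsed.password
    return config

# SCRAPER_PLAYWRIGHT_PROXY_POOL="http://u:p@proxy1.example:8080,http://proxy2.example:8080"
# yields e.g. {"server": "http://proxy1.example:8080", "username": "u", "password": "p"}

Choosing one proxy per browser start matches a launch-level Playwright proxy, which stays fixed for the lifetime of the browser instance.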
@@ -426,10 +438,51 @@ class PlaywrightScraperService:
         items = []
         rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row')
 
+        namejet_tz = os.getenv("NAMEJET_TIMEZONE", "America/Los_Angeles")
+
+        def parse_end_time(text: str) -> Optional[datetime]:
+            raw = (text or "").strip()
+            if not raw:
+                return None
+            # Sometimes they include timezone abbreviation
+            raw = raw.replace("PST", "").replace("PDT", "").replace("EST", "").replace("EDT", "").replace("UTC", "").strip()
+
+            # Relative format like "1d 2h 3m" (rare)
+            m = re.findall(r"(\d+)\s*([dhms])", raw.lower())
+            if m:
+                secs = 0
+                for n, u in m:
+                    n_i = int(n)
+                    if u == "d":
+                        secs += n_i * 86400
+                    elif u == "h":
+                        secs += n_i * 3600
+                    elif u == "m":
+                        secs += n_i * 60
+                    elif u == "s":
+                        secs += n_i
+                if secs > 0:
+                    return (datetime.now(ZoneInfo("UTC")) + timedelta(seconds=secs)).replace(tzinfo=None)
+
+            # Absolute formats (common)
+            fmts = [
+                "%m/%d/%Y %I:%M %p",
+                "%m/%d/%Y %H:%M",
+                "%Y-%m-%d %H:%M:%S",
+                "%Y-%m-%d %H:%M",
+            ]
+            for fmt in fmts:
+                try:
+                    local = datetime.strptime(raw, fmt).replace(tzinfo=ZoneInfo(namejet_tz))
+                    return local.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
+                except Exception:
+                    continue
+            return None
+
         for row in rows[:limit]:
             try:
                 cells = await row.query_selector_all('td')
-                if len(cells) < 3:
+                if len(cells) < 4:
                     continue
 
                 # NameJet format: Domain, End Time, Price, Bids, ...
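A quick worked check of the relative-format branch above: "1d 2h 3m" is 1×86400 + 2×3600 + 3×60 = 93780 seconds. A standalone rewrite for illustration only; the scraper's nested version also falls through to the absolute formats and returns naive UTC to match the rest of the pipeline:

import re
from datetime import datetime, timedelta, timezone

UNIT_SECONDS = {"d": 86400, "h": 3600, "m": 60, "s": 1}

def relative_to_naive_utc(raw: str) -> datetime:
    # Sum every "<number><unit>" group, then offset from the current UTC time.
    secs = sum(int(n) * UNIT_SECONDS[u] for n, u in re.findall(r"(\d+)\s*([dhms])", raw.lower()))
    return (datetime.now(timezone.utc) + timedelta(seconds=secs)).replace(tzinfo=None)

secs = sum(int(n) * UNIT_SECONDS[u] for n, u in re.findall(r"(\d+)\s*([dhms])", "1d 2h 3m"))
assert secs == 93780
eta = relative_to_naive_utc("1d 2h 3m")  # ~93780 seconds ahead of now, as naive UTC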
@@ -441,11 +494,21 @@ class PlaywrightScraperService:
 
                 tld = domain.rsplit(".", 1)[-1]
 
+                # Parse end time from column 1
+                end_text = await cells[1].text_content()
+                end_time = parse_end_time(end_text or "")
+                if end_time is None:
+                    continue
+                if end_time <= datetime.utcnow():
+                    continue
+
                 # Parse price
                 price = 0
                 if len(cells) > 2:
                     price_text = await cells[2].text_content()
                     price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0")
+                if price <= 0:
+                    continue
 
                 # Parse bids
                 bids = 0
@@ -458,13 +521,10 @@ class PlaywrightScraperService:
                     "tld": tld,
                     "platform": "NameJet",
                     "current_bid": price,
-                    "min_bid": 0,
                     "num_bids": bids,
-                    "end_time": datetime.utcnow() + timedelta(days=1),
-                    "buy_now_price": None,
+                    "end_time": end_time,
                     "auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}",
                     "currency": "USD",
-                    "is_active": True,
                 })
             except Exception as e:
                 logger.debug(f"Error parsing row: {e}")
@@ -508,23 +568,7 @@ class PlaywrightScraperService:
         try:
             await self.initialize()
 
-            # Scrape GoDaddy
-            logger.info("Scraping GoDaddy with Playwright...")
-            godaddy_result = await self.scrape_godaddy()
-            results["platforms"]["GoDaddy"] = {
-                "found": len(godaddy_result.get("items", [])),
-                "source": godaddy_result.get("source", "unknown"),
-            }
-            results["items"].extend(godaddy_result.get("items", []))
-            results["total_found"] += len(godaddy_result.get("items", []))
-
-            if godaddy_result.get("error"):
-                results["errors"].append(f"GoDaddy: {godaddy_result['error']}")
-
-            # Small delay between platforms
-            await asyncio.sleep(3)
-
-            # Scrape NameJet
+            # Scrape NameJet (Cloudflare protected)
             logger.info("Scraping NameJet with Playwright...")
             namejet_result = await self.scrape_namejet()
             results["platforms"]["NameJet"] = {