fix(scraping): correct timezones + exclude ended auctions from API

- Treat Park.io close_date as America/New_York (configurable) and convert to UTC
- Ensure /stats, /scrape-status, /opportunities count only auctions that have not ended
- Make NameJet Playwright scraper strict: require a real end_time and price
- Add Playwright proxy pool support (SCRAPER_PLAYWRIGHT_PROXY_POOL)
- Simplify protected scraping to NameJet only
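
For reference, the configuration knobs touched by this change can be set like so (values are illustrative; only the variable names come from the diffs below):

    PARKIO_TIMEZONE=America/New_York
    NAMEJET_TIMEZONE=America/Los_Angeles
    SCRAPER_PLAYWRIGHT_PROXY_POOL=http://user:pass@proxy1.example:8080,http://user:pass@proxy2.example:8080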
2025-12-11 22:00:06 +01:00
parent 3323f33d7c
commit 325a684809
3 changed files with 99 additions and 31 deletions

View File

@@ -497,6 +497,7 @@ async def get_platform_stats(
     Data is scraped from public auction sites - no mock data.
     """
+    now = datetime.utcnow()
     # Get stats per platform
     stats_query = (
         select(
@@ -504,7 +505,12 @@ async def get_platform_stats(
             func.count(DomainAuction.id).label("count"),
             func.avg(DomainAuction.current_bid).label("avg_bid"),
         )
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .group_by(DomainAuction.platform)
     )
@@ -512,7 +518,7 @@ async def get_platform_stats(
     platform_data = result.all()

     # Get ending soon counts
-    cutoff = datetime.utcnow() + timedelta(hours=1)
+    cutoff = now + timedelta(hours=1)
     ending_query = (
         select(
             DomainAuction.platform,
@@ -521,6 +527,7 @@ async def get_platform_stats(
         .where(
             and_(
                 DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
                 DomainAuction.end_time <= cutoff,
             )
         )
@@ -548,6 +555,7 @@ async def get_scrape_status(
     db: AsyncSession = Depends(get_db),
 ):
     """Get status of auction scraping."""
+    now = datetime.utcnow()
     # Get last successful scrape
     last_scrape_query = (
         select(AuctionScrapeLog)
@@ -559,7 +567,12 @@ async def get_scrape_status(
     last_log = result.scalar_one_or_none()

     # Get total auctions
-    total_query = select(func.count(DomainAuction.id)).where(DomainAuction.is_active == True)
+    total_query = select(func.count(DomainAuction.id)).where(
+        and_(
+            DomainAuction.is_active == True,
+            DomainAuction.end_time > now,
+        )
+    )
     total_result = await db.execute(total_query)
     total = total_result.scalar() or 0
@@ -615,9 +628,15 @@ async def get_smart_opportunities(
     Opportunity Score = time_urgency × competition_factor × price_factor
     """
     # Get active auctions
+    now = datetime.utcnow()
     query = (
         select(DomainAuction)
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .order_by(DomainAuction.end_time.asc())
         .limit(100)
     )
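
The same two-condition filter now appears in all three endpoints. A minimal sketch of how it could be factored out, reusing the DomainAuction model imported elsewhere in this repo (the helper name active_auction_filter is hypothetical):

    from datetime import datetime
    from typing import Optional

    from sqlalchemy import and_

    from app.models.auction import DomainAuction


    def active_auction_filter(now: Optional[datetime] = None):
        """Predicate for auctions that are active and have not yet ended."""
        now = now or datetime.utcnow()
        return and_(
            DomainAuction.is_active == True,  # noqa: E712 -- SQLAlchemy needs ==
            DomainAuction.end_time > now,
        )

    # Usage: select(DomainAuction).where(active_auction_filter())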

View File

@@ -32,6 +32,7 @@ from bs4 import BeautifulSoup
 from sqlalchemy import and_, delete, select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.asyncio import AsyncSession
+from zoneinfo import ZoneInfo

 from app.models.auction import AuctionScrapeLog, DomainAuction
 from app.services.dropcatch_api import dropcatch_client
@@ -526,8 +527,12 @@ class AuctionScraperService:
                 close_raw = cols[4].get_text(" ", strip=True)
                 try:
-                    # Park.io displays a naive timestamp. We treat it as UTC.
-                    end_time = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S")
+                    # Park.io displays a naive timestamp in their platform timezone.
+                    # Default timezone is America/New_York (configurable).
+                    tz_name = os.getenv("PARKIO_TIMEZONE", "America/New_York")
+                    tz = ZoneInfo(tz_name)
+                    local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
+                    end_time = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
                 except Exception:
                     continue
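
For reference, the conversion behaves like this standalone sketch (the sample close_date is illustrative; ZoneInfo picks the correct EST/EDT offset for the given date):

    from datetime import datetime
    from zoneinfo import ZoneInfo

    close_raw = "2025-12-11 15:00:00"  # naive, Park.io platform-local
    local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(
        tzinfo=ZoneInfo("America/New_York")
    )
    end_time = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
    print(end_time)  # 2025-12-11 20:00:00 -- December is EST (UTC-5); under EDT it would be UTC-4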

View File

@@ -29,10 +29,12 @@ import json
 import logging
 import os
 import random
+import re
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
 from pathlib import Path
 from urllib.parse import urlparse
+from zoneinfo import ZoneInfo

 logger = logging.getLogger(__name__)
@@ -89,10 +91,20 @@ class PlaywrightScraperService:
         try:
             self.playwright = await async_playwright().start()

+            # Proxy selection:
+            # - SCRAPER_PLAYWRIGHT_PROXY: single proxy URL
+            # - SCRAPER_PLAYWRIGHT_PROXY_POOL: comma-separated proxy URLs; one is chosen randomly per browser start
+            proxy_pool_raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
+            proxy_pool = [p.strip() for p in proxy_pool_raw.split(",") if p.strip()] if proxy_pool_raw else []
             proxy_url = (
-                os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
-                or os.getenv("SCRAPER_PROXY_URL")
-                or os.getenv("SCRAPER_HTTP_PROXY")
+                random.choice(proxy_pool)
+                if proxy_pool
+                else (
+                    os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
+                    or os.getenv("SCRAPER_PROXY_URL")
+                    or os.getenv("SCRAPER_HTTP_PROXY")
+                )
             )

             proxy_config = None
             if proxy_url:
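
Playwright's launch() takes the proxy as a dict with server/username/password keys rather than a bare URL, so the proxy_config built just below this hunk has to split the chosen URL apart. A sketch of that conversion, assuming credentials are embedded in the URL (the helper name is hypothetical):

    from urllib.parse import urlparse

    def to_playwright_proxy(proxy_url: str) -> dict:
        """Split a proxy URL into Playwright's proxy config dict."""
        u = urlparse(proxy_url)
        server = f"{u.scheme}://{u.hostname}"
        if u.port:
            server += f":{u.port}"
        cfg = {"server": server}
        if u.username:
            cfg["username"] = u.username
        if u.password:
            cfg["password"] = u.password
        return cfg

    # to_playwright_proxy("http://user:pass@10.0.0.1:8080")
    # -> {"server": "http://10.0.0.1:8080", "username": "user", "password": "pass"}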
@@ -426,10 +438,51 @@ class PlaywrightScraperService:
         items = []
         rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row')
+        namejet_tz = os.getenv("NAMEJET_TIMEZONE", "America/Los_Angeles")
+
+        def parse_end_time(text: str) -> Optional[datetime]:
+            raw = (text or "").strip()
+            if not raw:
+                return None
+            # Sometimes they include a timezone abbreviation
+            raw = raw.replace("PST", "").replace("PDT", "").replace("EST", "").replace("EDT", "").replace("UTC", "").strip()
+            # Relative format like "1d 2h 3m" (rare)
+            m = re.findall(r"(\d+)\s*([dhms])", raw.lower())
+            if m:
+                secs = 0
+                for n, u in m:
+                    n_i = int(n)
+                    if u == "d":
+                        secs += n_i * 86400
+                    elif u == "h":
+                        secs += n_i * 3600
+                    elif u == "m":
+                        secs += n_i * 60
+                    elif u == "s":
+                        secs += n_i
+                if secs > 0:
+                    return (datetime.now(ZoneInfo("UTC")) + timedelta(seconds=secs)).replace(tzinfo=None)
+            # Absolute formats (common)
+            fmts = [
+                "%m/%d/%Y %I:%M %p",
+                "%m/%d/%Y %H:%M",
+                "%Y-%m-%d %H:%M:%S",
+                "%Y-%m-%d %H:%M",
+            ]
+            for fmt in fmts:
+                try:
+                    local = datetime.strptime(raw, fmt).replace(tzinfo=ZoneInfo(namejet_tz))
+                    return local.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
+                except Exception:
+                    continue
+            return None
+
         for row in rows[:limit]:
             try:
                 cells = await row.query_selector_all('td')
-                if len(cells) < 3:
+                if len(cells) < 4:
                     continue
+                # NameJet format: Domain, End Time, Price, Bids, ...
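
Illustrative inputs and outputs for parse_end_time, assuming NAMEJET_TIMEZONE is left at its America/Los_Angeles default (the sample strings are hypothetical):

    parse_end_time("12/15/2025 05:00 PM PST")  # -> datetime(2025, 12, 16, 1, 0), naive UTC (PST = UTC-8)
    parse_end_time("2025-12-15 17:00")         # -> datetime(2025, 12, 16, 1, 0)
    parse_end_time("1d 2h")                    # -> utcnow + 93,600 seconds
    parse_end_time("Ends soon")                # -> None, so the row is skipped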
@@ -441,11 +494,21 @@ class PlaywrightScraperService:
                 tld = domain.rsplit(".", 1)[-1]

                 # Parse end time from column 1
                 end_text = await cells[1].text_content()
+                end_time = parse_end_time(end_text or "")
+                if end_time is None:
+                    continue
+                if end_time <= datetime.utcnow():
+                    continue

                 # Parse price
                 price = 0
                 if len(cells) > 2:
                     price_text = await cells[2].text_content()
                     price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0")
+                if price <= 0:
+                    continue

                 # Parse bids
                 bids = 0
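
The digit-and-dot filter tolerates common currency formatting; a quick illustration with hypothetical values:

    text = "$1,234.56"
    price = float("".join(c for c in text if c.isdigit() or c == ".") or "0")
    assert price == 1234.56
    # "USD 70" -> 70.0; "" -> 0.0, which the new price <= 0 guard then skips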
@@ -458,13 +521,10 @@ class PlaywrightScraperService:
                     "tld": tld,
                     "platform": "NameJet",
                     "current_bid": price,
-                    "min_bid": 0,
                     "num_bids": bids,
-                    "end_time": datetime.utcnow() + timedelta(days=1),
-                    "buy_now_price": None,
+                    "end_time": end_time,
                     "auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}",
                     "currency": "USD",
                     "is_active": True,
                 })
             except Exception as e:
                 logger.debug(f"Error parsing row: {e}")
@@ -508,23 +568,7 @@ class PlaywrightScraperService:
         try:
             await self.initialize()

-            # Scrape GoDaddy
-            logger.info("Scraping GoDaddy with Playwright...")
-            godaddy_result = await self.scrape_godaddy()
-            results["platforms"]["GoDaddy"] = {
-                "found": len(godaddy_result.get("items", [])),
-                "source": godaddy_result.get("source", "unknown"),
-            }
-            results["items"].extend(godaddy_result.get("items", []))
-            results["total_found"] += len(godaddy_result.get("items", []))
-            if godaddy_result.get("error"):
-                results["errors"].append(f"GoDaddy: {godaddy_result['error']}")
-
-            # Small delay between platforms
-            await asyncio.sleep(3)
-
-            # Scrape NameJet
+            # Scrape NameJet (Cloudflare protected)
             logger.info("Scraping NameJet with Playwright...")
             namejet_result = await self.scrape_namejet()
             results["platforms"]["NameJet"] = {