From 325a684809e521c528b37d8721e1739fc93527cc Mon Sep 17 00:00:00 2001
From: Yves Gugger
Date: Thu, 11 Dec 2025 22:00:06 +0100
Subject: [PATCH] fix(scraping): correct timezones + no ended auctions in API

- Treat Park.io close_date as America/New_York (configurable) and convert to UTC
- Ensure /stats, /scrape-status, /opportunities only count not-ended auctions
- Make NameJet Playwright scraper strict: requires real end_time + price
- Add Playwright proxy pool support (SCRAPER_PLAYWRIGHT_PROXY_POOL)
- Simplify protected scraping to NameJet only
---
 backend/app/api/auctions.py                |  27 ++++++-
 backend/app/services/auction_scraper.py    |   9 ++-
 backend/app/services/playwright_scraper.py |  94 ++++++++++++++++------
 3 files changed, 99 insertions(+), 31 deletions(-)

diff --git a/backend/app/api/auctions.py b/backend/app/api/auctions.py
index 304518f..79dc008 100644
--- a/backend/app/api/auctions.py
+++ b/backend/app/api/auctions.py
@@ -497,6 +497,7 @@ async def get_platform_stats(
     Data is scraped from public auction sites - no mock data.
     """
+    now = datetime.utcnow()
     # Get stats per platform
     stats_query = (
         select(
@@ -504,7 +505,12 @@ async def get_platform_stats(
             func.count(DomainAuction.id).label("count"),
             func.avg(DomainAuction.current_bid).label("avg_bid"),
         )
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .group_by(DomainAuction.platform)
     )

@@ -512,7 +518,7 @@ async def get_platform_stats(
     platform_data = result.all()

     # Get ending soon counts
-    cutoff = datetime.utcnow() + timedelta(hours=1)
+    cutoff = now + timedelta(hours=1)
     ending_query = (
         select(
             DomainAuction.platform,
@@ -521,6 +527,7 @@ async def get_platform_stats(
         .where(
             and_(
                 DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
                 DomainAuction.end_time <= cutoff,
             )
         )
@@ -548,6 +555,7 @@ async def get_scrape_status(
     db: AsyncSession = Depends(get_db),
 ):
     """Get status of auction scraping."""
+    now = datetime.utcnow()
     # Get last successful scrape
     last_scrape_query = (
         select(AuctionScrapeLog)
@@ -559,7 +567,12 @@ async def get_scrape_status(
     last_log = result.scalar_one_or_none()

     # Get total auctions
-    total_query = select(func.count(DomainAuction.id)).where(DomainAuction.is_active == True)
+    total_query = select(func.count(DomainAuction.id)).where(
+        and_(
+            DomainAuction.is_active == True,
+            DomainAuction.end_time > now,
+        )
+    )
     total_result = await db.execute(total_query)
     total = total_result.scalar() or 0
@@ -615,9 +628,15 @@ async def get_smart_opportunities(
     Opportunity Score = time_urgency × competition_factor × price_factor
     """
     # Get active auctions
+    now = datetime.utcnow()
     query = (
         select(DomainAuction)
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .order_by(DomainAuction.end_time.asc())
         .limit(100)
     )
diff --git a/backend/app/services/auction_scraper.py b/backend/app/services/auction_scraper.py
index dcbd1b4..190c337 100644
--- a/backend/app/services/auction_scraper.py
+++ b/backend/app/services/auction_scraper.py
@@ -32,6 +32,7 @@ from bs4 import BeautifulSoup
 from sqlalchemy import and_, delete, select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.asyncio import AsyncSession
+from zoneinfo import ZoneInfo

 from app.models.auction import AuctionScrapeLog, DomainAuction
 from app.services.dropcatch_api import dropcatch_client
@@ -526,8 +527,12 @@ class AuctionScraperService:
                     close_raw = cols[4].get_text(" ", strip=True)

                     try:
-                        # Park.io displays a naive timestamp. We treat it as UTC.
-                        end_time = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S")
+                        # Park.io displays a naive timestamp in its platform timezone.
+                        # Default is America/New_York (override with PARKIO_TIMEZONE).
+                        tz_name = os.getenv("PARKIO_TIMEZONE", "America/New_York")
+                        tz = ZoneInfo(tz_name)
+                        local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
+                        end_time = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
                     except Exception:
                         continue

diff --git a/backend/app/services/playwright_scraper.py b/backend/app/services/playwright_scraper.py
index 292de52..28e19c3 100644
--- a/backend/app/services/playwright_scraper.py
+++ b/backend/app/services/playwright_scraper.py
@@ -29,10 +29,12 @@
 import json
 import logging
 import os
 import random
+import re
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
 from pathlib import Path
 from urllib.parse import urlparse
+from zoneinfo import ZoneInfo

 logger = logging.getLogger(__name__)
@@ -89,10 +91,20 @@
         try:
             self.playwright = await async_playwright().start()

+            # Proxy selection:
+            #   - SCRAPER_PLAYWRIGHT_PROXY: single proxy URL
+            #   - SCRAPER_PLAYWRIGHT_PROXY_POOL: comma-separated proxy URLs, one is chosen randomly per browser start
+            proxy_pool_raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
+            proxy_pool = [p.strip() for p in proxy_pool_raw.split(",") if p.strip()] if proxy_pool_raw else []
+
             proxy_url = (
-                os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
-                or os.getenv("SCRAPER_PROXY_URL")
-                or os.getenv("SCRAPER_HTTP_PROXY")
+                random.choice(proxy_pool)
+                if proxy_pool
+                else (
+                    os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
+                    or os.getenv("SCRAPER_PROXY_URL")
+                    or os.getenv("SCRAPER_HTTP_PROXY")
+                )
             )
             proxy_config = None
             if proxy_url:
@@ -426,10 +438,51 @@
         items = []
         rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row')

+        namejet_tz = os.getenv("NAMEJET_TIMEZONE", "America/Los_Angeles")
+
+        def parse_end_time(text: str) -> Optional[datetime]:
+            raw = (text or "").strip()
+            if not raw:
+                return None
+            # Some rows include a timezone abbreviation; strip it
+            raw = raw.replace("PST", "").replace("PDT", "").replace("EST", "").replace("EDT", "").replace("UTC", "").strip()
+
+            # Relative format like "1d 2h 3m" (rare)
+            m = re.findall(r"(\d+)\s*([dhms])", raw.lower())
+            if m:
+                secs = 0
+                for n, u in m:
+                    n_i = int(n)
+                    if u == "d":
+                        secs += n_i * 86400
+                    elif u == "h":
+                        secs += n_i * 3600
+                    elif u == "m":
+                        secs += n_i * 60
+                    elif u == "s":
+                        secs += n_i
+                if secs > 0:
+                    return (datetime.now(ZoneInfo("UTC")) + timedelta(seconds=secs)).replace(tzinfo=None)
+
+            # Absolute formats (common)
+            fmts = [
+                "%m/%d/%Y %I:%M %p",
+                "%m/%d/%Y %H:%M",
+                "%Y-%m-%d %H:%M:%S",
+                "%Y-%m-%d %H:%M",
+            ]
+            for fmt in fmts:
+                try:
+                    local = datetime.strptime(raw, fmt).replace(tzinfo=ZoneInfo(namejet_tz))
+                    return local.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
+                except Exception:
+                    continue
+            return None
+
         for row in rows[:limit]:
             try:
                 cells = await row.query_selector_all('td')
-                if len(cells) < 3:
+                if len(cells) < 4:
                     continue

                 # NameJet format: Domain, End Time, Price, Bids, ...
@@ -441,11 +494,21 @@ class PlaywrightScraperService:
                 tld = domain.rsplit(".", 1)[-1]

+                # Parse end time from column 1
+                end_text = await cells[1].text_content()
+                end_time = parse_end_time(end_text or "")
+                if end_time is None:
+                    continue
+                if end_time <= datetime.utcnow():
+                    continue
+
                 # Parse price
                 price = 0
                 if len(cells) > 2:
                     price_text = await cells[2].text_content()
                     price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0")
+                if price <= 0:
+                    continue

                 # Parse bids
                 bids = 0

@@ -458,13 +521,10 @@
                     "tld": tld,
                     "platform": "NameJet",
                     "current_bid": price,
-                    "min_bid": 0,
                     "num_bids": bids,
-                    "end_time": datetime.utcnow() + timedelta(days=1),
-                    "buy_now_price": None,
+                    "end_time": end_time,
                     "auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}",
                     "currency": "USD",
-                    "is_active": True,
                 })
             except Exception as e:
                 logger.debug(f"Error parsing row: {e}")
@@ -508,23 +568,7 @@
         try:
             await self.initialize()

-            # Scrape GoDaddy
-            logger.info("Scraping GoDaddy with Playwright...")
-            godaddy_result = await self.scrape_godaddy()
-            results["platforms"]["GoDaddy"] = {
-                "found": len(godaddy_result.get("items", [])),
-                "source": godaddy_result.get("source", "unknown"),
-            }
-            results["items"].extend(godaddy_result.get("items", []))
-            results["total_found"] += len(godaddy_result.get("items", []))
-
-            if godaddy_result.get("error"):
-                results["errors"].append(f"GoDaddy: {godaddy_result['error']}")
-
-            # Small delay between platforms
-            await asyncio.sleep(3)
-
-            # Scrape NameJet
+            # Scrape NameJet (Cloudflare protected)
             logger.info("Scraping NameJet with Playwright...")
             namejet_result = await self.scrape_namejet()
             results["platforms"]["NameJet"] = {
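
--
Usage sketch: the close_date conversion this patch applies in auction_scraper.py
can be checked in isolation. A minimal sketch, stdlib only; the sample timestamp
and the proxy hosts/credentials are placeholders, not values from the codebase:

    import os
    from datetime import datetime
    from zoneinfo import ZoneInfo

    # Park.io renders close_date naive in its platform timezone; the patch
    # attaches that zone, converts to UTC, then strips tzinfo for storage.
    tz = ZoneInfo(os.getenv("PARKIO_TIMEZONE", "America/New_York"))
    local_dt = datetime.strptime("2025-12-11 16:00:00", "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
    end_time = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
    print(end_time)  # 2025-12-11 21:00:00 (EST is UTC-5 in December)

    # Proxy pool: comma-separated URLs, one chosen at random per browser start.
    # Placeholder hosts, for illustration only:
    os.environ["SCRAPER_PLAYWRIGHT_PROXY_POOL"] = (
        "http://user:pass@proxy1:8080,http://user:pass@proxy2:8080"
    )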