From 325a684809e521c528b37d8721e1739fc93527cc Mon Sep 17 00:00:00 2001
From: Yves Gugger
Date: Thu, 11 Dec 2025 22:00:06 +0100
Subject: [PATCH] fix(scraping): correct timezones + no ended auctions in API

- Treat Park.io close_date as America/New_York (configurable) and convert to UTC
- Ensure /stats, /scrape-status, /opportunities only count not-ended auctions
- Make NameJet Playwright scraper strict: requires real end_time + price
- Add Playwright proxy pool support (SCRAPER_PLAYWRIGHT_PROXY_POOL)
- Simplify protected scraping to NameJet only
---
 backend/app/api/auctions.py                |  27 ++++++-
 backend/app/services/auction_scraper.py    |   9 ++-
 backend/app/services/playwright_scraper.py |  94 ++++++++++++++++------
 3 files changed, 99 insertions(+), 31 deletions(-)

diff --git a/backend/app/api/auctions.py b/backend/app/api/auctions.py
index 304518f..79dc008 100644
--- a/backend/app/api/auctions.py
+++ b/backend/app/api/auctions.py
@@ -497,6 +497,7 @@ async def get_platform_stats(
     Data is scraped from public auction sites - no mock data.
     """
+    now = datetime.utcnow()
     # Get stats per platform
     stats_query = (
         select(
@@ -504,7 +505,12 @@ async def get_platform_stats(
             func.count(DomainAuction.id).label("count"),
             func.avg(DomainAuction.current_bid).label("avg_bid"),
         )
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .group_by(DomainAuction.platform)
     )

@@ -512,7 +518,7 @@ async def get_platform_stats(
     platform_data = result.all()

     # Get ending soon counts
-    cutoff = datetime.utcnow() + timedelta(hours=1)
+    cutoff = now + timedelta(hours=1)
     ending_query = (
         select(
             DomainAuction.platform,
@@ -521,6 +527,7 @@ async def get_platform_stats(
         .where(
             and_(
                 DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
                 DomainAuction.end_time <= cutoff,
             )
         )
@@ -548,6 +555,7 @@ async def get_scrape_status(
     db: AsyncSession = Depends(get_db),
 ):
     """Get status of auction scraping."""
+    now = datetime.utcnow()
     # Get last successful scrape
     last_scrape_query = (
         select(AuctionScrapeLog)
@@ -559,7 +567,12 @@ async def get_scrape_status(
     last_log = result.scalar_one_or_none()

     # Get total auctions
-    total_query = select(func.count(DomainAuction.id)).where(DomainAuction.is_active == True)
+    total_query = select(func.count(DomainAuction.id)).where(
+        and_(
+            DomainAuction.is_active == True,
+            DomainAuction.end_time > now,
+        )
+    )
     total_result = await db.execute(total_query)
     total = total_result.scalar() or 0
@@ -615,9 +628,15 @@ async def get_smart_opportunities(
     Opportunity Score = time_urgency × competition_factor × price_factor
     """
     # Get active auctions
+    now = datetime.utcnow()
     query = (
         select(DomainAuction)
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .order_by(DomainAuction.end_time.asc())
         .limit(100)
     )
diff --git a/backend/app/services/auction_scraper.py b/backend/app/services/auction_scraper.py
index dcbd1b4..190c337 100644
--- a/backend/app/services/auction_scraper.py
+++ b/backend/app/services/auction_scraper.py
@@ -32,6 +32,7 @@ from bs4 import BeautifulSoup
 from sqlalchemy import and_, delete, select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.asyncio import AsyncSession
+from zoneinfo import ZoneInfo

 from app.models.auction import AuctionScrapeLog, DomainAuction
 from app.services.dropcatch_api import dropcatch_client
@@ -526,8 +527,12 @@ class AuctionScraperService:
                     close_raw = cols[4].get_text(" ", strip=True)

                     try:
-                        # Park.io displays a naive timestamp. We treat it as UTC.
-                        end_time = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S")
+                        # Park.io displays a naive timestamp in its platform timezone.
+                        # Default is America/New_York (override with PARKIO_TIMEZONE).
+                        tz_name = os.getenv("PARKIO_TIMEZONE", "America/New_York")
+                        tz = ZoneInfo(tz_name)
+                        local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
+                        end_time = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
                     except Exception:
                         continue

diff --git a/backend/app/services/playwright_scraper.py b/backend/app/services/playwright_scraper.py
index 292de52..28e19c3 100644
--- a/backend/app/services/playwright_scraper.py
+++ b/backend/app/services/playwright_scraper.py
@@ -29,10 +29,12 @@
 import json
 import logging
 import os
 import random
+import re
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
 from pathlib import Path
 from urllib.parse import urlparse
+from zoneinfo import ZoneInfo

 logger = logging.getLogger(__name__)
@@ -89,10 +91,20 @@
         try:
             self.playwright = await async_playwright().start()

+            # Proxy selection:
+            #   - SCRAPER_PLAYWRIGHT_PROXY: single proxy URL
+            #   - SCRAPER_PLAYWRIGHT_PROXY_POOL: comma-separated proxy URLs, one is chosen randomly per browser start
+            proxy_pool_raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
+            proxy_pool = [p.strip() for p in proxy_pool_raw.split(",") if p.strip()] if proxy_pool_raw else []
+
             proxy_url = (
-                os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
-                or os.getenv("SCRAPER_PROXY_URL")
-                or os.getenv("SCRAPER_HTTP_PROXY")
+                random.choice(proxy_pool)
+                if proxy_pool
+                else (
+                    os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
+                    or os.getenv("SCRAPER_PROXY_URL")
+                    or os.getenv("SCRAPER_HTTP_PROXY")
+                )
             )
             proxy_config = None
             if proxy_url:
@@ -426,10 +438,51 @@
         items = []
         rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row')

+        namejet_tz = os.getenv("NAMEJET_TIMEZONE", "America/Los_Angeles")
+
+        def parse_end_time(text: str) -> Optional[datetime]:
+            raw = (text or "").strip()
+            if not raw:
+                return None
+            # Some rows include a timezone abbreviation; strip it
+            raw = raw.replace("PST", "").replace("PDT", "").replace("EST", "").replace("EDT", "").replace("UTC", "").strip()
+
+            # Relative format like "1d 2h 3m" (rare)
+            m = re.findall(r"(\d+)\s*([dhms])", raw.lower())
+            if m:
+                secs = 0
+                for n, u in m:
+                    n_i = int(n)
+                    if u == "d":
+                        secs += n_i * 86400
+                    elif u == "h":
+                        secs += n_i * 3600
+                    elif u == "m":
+                        secs += n_i * 60
+                    elif u == "s":
+                        secs += n_i
+                if secs > 0:
+                    return (datetime.now(ZoneInfo("UTC")) + timedelta(seconds=secs)).replace(tzinfo=None)
+
+            # Absolute formats (common)
+            fmts = [
+                "%m/%d/%Y %I:%M %p",
+                "%m/%d/%Y %H:%M",
+                "%Y-%m-%d %H:%M:%S",
+                "%Y-%m-%d %H:%M",
+            ]
+            for fmt in fmts:
+                try:
+                    local = datetime.strptime(raw, fmt).replace(tzinfo=ZoneInfo(namejet_tz))
+                    return local.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
+                except Exception:
+                    continue
+            return None
+
         for row in rows[:limit]:
             try:
                 cells = await row.query_selector_all('td')
-                if len(cells) < 3:
+                if len(cells) < 4:
                     continue

                 # NameJet format: Domain, End Time, Price, Bids, ...
@@ -441,11 +494,21 @@ class PlaywrightScraperService:
                 tld = domain.rsplit(".", 1)[-1]

+                # Parse end time from column 1
+                end_text = await cells[1].text_content()
+                end_time = parse_end_time(end_text or "")
+                if end_time is None:
+                    continue
+                if end_time <= datetime.utcnow():
+                    continue
+
                 # Parse price
                 price = 0
                 if len(cells) > 2:
                     price_text = await cells[2].text_content()
                     price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0")
+                if price <= 0:
+                    continue

                 # Parse bids
                 bids = 0

@@ -458,13 +521,10 @@
                     "tld": tld,
                     "platform": "NameJet",
                     "current_bid": price,
-                    "min_bid": 0,
                     "num_bids": bids,
-                    "end_time": datetime.utcnow() + timedelta(days=1),
-                    "buy_now_price": None,
+                    "end_time": end_time,
                     "auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}",
                     "currency": "USD",
-                    "is_active": True,
                 })
             except Exception as e:
                 logger.debug(f"Error parsing row: {e}")
@@ -508,23 +568,7 @@
         try:
             await self.initialize()

-            # Scrape GoDaddy
-            logger.info("Scraping GoDaddy with Playwright...")
-            godaddy_result = await self.scrape_godaddy()
-            results["platforms"]["GoDaddy"] = {
-                "found": len(godaddy_result.get("items", [])),
-                "source": godaddy_result.get("source", "unknown"),
-            }
-            results["items"].extend(godaddy_result.get("items", []))
-            results["total_found"] += len(godaddy_result.get("items", []))
-
-            if godaddy_result.get("error"):
-                results["errors"].append(f"GoDaddy: {godaddy_result['error']}")
-
-            # Small delay between platforms
-            await asyncio.sleep(3)
-
-            # Scrape NameJet
+            # Scrape NameJet (Cloudflare protected)
             logger.info("Scraping NameJet with Playwright...")
             namejet_result = await self.scrape_namejet()
             results["platforms"]["NameJet"] = {
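
--
Usage sketch: the close_date conversion this patch applies in auction_scraper.py
can be checked in isolation. A minimal sketch, stdlib only; the sample timestamp
and the proxy hosts/credentials are placeholders, not values from the codebase:

    import os
    from datetime import datetime
    from zoneinfo import ZoneInfo

    # Park.io renders close_date naive in its platform timezone; the patch
    # attaches that zone, converts to UTC, then strips tzinfo for storage.
    tz = ZoneInfo(os.getenv("PARKIO_TIMEZONE", "America/New_York"))
    local_dt = datetime.strptime("2025-12-11 16:00:00", "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
    end_time = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
    print(end_time)  # 2025-12-11 21:00:00 (EST is UTC-5 in December)

    # Proxy pool: comma-separated URLs, one chosen at random per browser start.
    # Placeholder hosts, for illustration only:
    os.environ["SCRAPER_PLAYWRIGHT_PROXY_POOL"] = (
        "http://user:pass@proxy1:8080,http://user:pass@proxy2:8080"
    )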