fix(scraping): correct timezones + exclude ended auctions from API

- Treat Park.io close_date as America/New_York (configurable) and convert to UTC
- Ensure /stats, /scrape-status, /opportunities count only auctions that have not ended
- Make NameJet Playwright scraper strict: require a real end_time and price
- Add Playwright proxy pool support (SCRAPER_PLAYWRIGHT_PROXY_POOL)
- Simplify protected scraping to NameJet only
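
For reference, the configuration knobs touched by this change can be set like so (values are illustrative; only the variable names come from the diffs below):

    PARKIO_TIMEZONE=America/New_York
    NAMEJET_TIMEZONE=America/Los_Angeles
    SCRAPER_PLAYWRIGHT_PROXY_POOL=http://user:pass@proxy1.example:8080,http://user:pass@proxy2.example:8080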
2025-12-11 22:00:06 +01:00
parent 3323f33d7c
commit 325a684809
3 changed files with 99 additions and 31 deletions

View File

@@ -497,6 +497,7 @@ async def get_platform_stats(
     Data is scraped from public auction sites - no mock data.
     """
+    now = datetime.utcnow()
     # Get stats per platform
     stats_query = (
         select(
@@ -504,7 +505,12 @@ async def get_platform_stats(
             func.count(DomainAuction.id).label("count"),
             func.avg(DomainAuction.current_bid).label("avg_bid"),
         )
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .group_by(DomainAuction.platform)
     )
@@ -512,7 +518,7 @@ async def get_platform_stats(
     platform_data = result.all()

     # Get ending soon counts
-    cutoff = datetime.utcnow() + timedelta(hours=1)
+    cutoff = now + timedelta(hours=1)
     ending_query = (
         select(
             DomainAuction.platform,
@@ -521,6 +527,7 @@ async def get_platform_stats(
         .where(
             and_(
                 DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
                 DomainAuction.end_time <= cutoff,
             )
         )
@@ -548,6 +555,7 @@ async def get_scrape_status(
     db: AsyncSession = Depends(get_db),
 ):
     """Get status of auction scraping."""
+    now = datetime.utcnow()
     # Get last successful scrape
     last_scrape_query = (
         select(AuctionScrapeLog)
@@ -559,7 +567,12 @@ async def get_scrape_status(
     last_log = result.scalar_one_or_none()

     # Get total auctions
-    total_query = select(func.count(DomainAuction.id)).where(DomainAuction.is_active == True)
+    total_query = select(func.count(DomainAuction.id)).where(
+        and_(
+            DomainAuction.is_active == True,
+            DomainAuction.end_time > now,
+        )
+    )
     total_result = await db.execute(total_query)
     total = total_result.scalar() or 0
@@ -615,9 +628,15 @@ async def get_smart_opportunities(
     Opportunity Score = time_urgency × competition_factor × price_factor
     """
     # Get active auctions
+    now = datetime.utcnow()
     query = (
         select(DomainAuction)
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .order_by(DomainAuction.end_time.asc())
         .limit(100)
     )
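
The same two-condition filter now appears in all three endpoints. A minimal sketch of how it could be factored out, reusing the DomainAuction model imported elsewhere in this repo (the helper name active_auction_filter is hypothetical):

    from datetime import datetime
    from typing import Optional

    from sqlalchemy import and_

    from app.models.auction import DomainAuction


    def active_auction_filter(now: Optional[datetime] = None):
        """Predicate for auctions that are active and have not yet ended."""
        now = now or datetime.utcnow()
        return and_(
            DomainAuction.is_active == True,  # noqa: E712 -- SQLAlchemy needs ==
            DomainAuction.end_time > now,
        )

    # Usage: select(DomainAuction).where(active_auction_filter())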

View File

@@ -32,6 +32,7 @@ from bs4 import BeautifulSoup
 from sqlalchemy import and_, delete, select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.asyncio import AsyncSession
+from zoneinfo import ZoneInfo

 from app.models.auction import AuctionScrapeLog, DomainAuction
 from app.services.dropcatch_api import dropcatch_client
@@ -526,8 +527,12 @@ class AuctionScraperService:
                 close_raw = cols[4].get_text(" ", strip=True)
                 try:
-                    # Park.io displays a naive timestamp. We treat it as UTC.
-                    end_time = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S")
+                    # Park.io displays a naive timestamp in their platform timezone.
+                    # Default timezone is America/New_York (configurable).
+                    tz_name = os.getenv("PARKIO_TIMEZONE", "America/New_York")
+                    tz = ZoneInfo(tz_name)
+                    local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
+                    end_time = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
                 except Exception:
                     continue
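
For reference, the conversion behaves like this standalone sketch (the sample close_date is illustrative; ZoneInfo picks the correct EST/EDT offset for the given date):

    from datetime import datetime
    from zoneinfo import ZoneInfo

    close_raw = "2025-12-11 15:00:00"  # naive, Park.io platform-local
    local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(
        tzinfo=ZoneInfo("America/New_York")
    )
    end_time = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
    print(end_time)  # 2025-12-11 20:00:00 -- December is EST (UTC-5); under EDT it would be UTC-4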

View File

@@ -29,10 +29,12 @@ import json
 import logging
 import os
 import random
+import re
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
 from pathlib import Path
 from urllib.parse import urlparse
+from zoneinfo import ZoneInfo

 logger = logging.getLogger(__name__)
@@ -89,10 +91,20 @@ class PlaywrightScraperService:
         try:
             self.playwright = await async_playwright().start()

+            # Proxy selection:
+            # - SCRAPER_PLAYWRIGHT_PROXY: single proxy URL
+            # - SCRAPER_PLAYWRIGHT_PROXY_POOL: comma-separated proxy URLs; one is chosen randomly per browser start
+            proxy_pool_raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
+            proxy_pool = [p.strip() for p in proxy_pool_raw.split(",") if p.strip()] if proxy_pool_raw else []
             proxy_url = (
-                os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
-                or os.getenv("SCRAPER_PROXY_URL")
-                or os.getenv("SCRAPER_HTTP_PROXY")
+                random.choice(proxy_pool)
+                if proxy_pool
+                else (
+                    os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
+                    or os.getenv("SCRAPER_PROXY_URL")
+                    or os.getenv("SCRAPER_HTTP_PROXY")
+                )
             )

             proxy_config = None
             if proxy_url:
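
Playwright's launch() takes the proxy as a dict with server/username/password keys rather than a bare URL, so the proxy_config built just below this hunk has to split the chosen URL apart. A sketch of that conversion, assuming credentials are embedded in the URL (the helper name is hypothetical):

    from urllib.parse import urlparse

    def to_playwright_proxy(proxy_url: str) -> dict:
        """Split a proxy URL into Playwright's proxy config dict."""
        u = urlparse(proxy_url)
        server = f"{u.scheme}://{u.hostname}"
        if u.port:
            server += f":{u.port}"
        cfg = {"server": server}
        if u.username:
            cfg["username"] = u.username
        if u.password:
            cfg["password"] = u.password
        return cfg

    # to_playwright_proxy("http://user:pass@10.0.0.1:8080")
    # -> {"server": "http://10.0.0.1:8080", "username": "user", "password": "pass"}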
@@ -426,10 +438,51 @@ class PlaywrightScraperService:
         items = []
         rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row')
+        namejet_tz = os.getenv("NAMEJET_TIMEZONE", "America/Los_Angeles")
+
+        def parse_end_time(text: str) -> Optional[datetime]:
+            raw = (text or "").strip()
+            if not raw:
+                return None
+            # Sometimes they include a timezone abbreviation
+            raw = raw.replace("PST", "").replace("PDT", "").replace("EST", "").replace("EDT", "").replace("UTC", "").strip()
+            # Relative format like "1d 2h 3m" (rare)
+            m = re.findall(r"(\d+)\s*([dhms])", raw.lower())
+            if m:
+                secs = 0
+                for n, u in m:
+                    n_i = int(n)
+                    if u == "d":
+                        secs += n_i * 86400
+                    elif u == "h":
+                        secs += n_i * 3600
+                    elif u == "m":
+                        secs += n_i * 60
+                    elif u == "s":
+                        secs += n_i
+                if secs > 0:
+                    return (datetime.now(ZoneInfo("UTC")) + timedelta(seconds=secs)).replace(tzinfo=None)
+            # Absolute formats (common)
+            fmts = [
+                "%m/%d/%Y %I:%M %p",
+                "%m/%d/%Y %H:%M",
+                "%Y-%m-%d %H:%M:%S",
+                "%Y-%m-%d %H:%M",
+            ]
+            for fmt in fmts:
+                try:
+                    local = datetime.strptime(raw, fmt).replace(tzinfo=ZoneInfo(namejet_tz))
+                    return local.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
+                except Exception:
+                    continue
+            return None
+
         for row in rows[:limit]:
             try:
                 cells = await row.query_selector_all('td')
-                if len(cells) < 3:
+                if len(cells) < 4:
                     continue
+                # NameJet format: Domain, End Time, Price, Bids, ...
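
Illustrative inputs and outputs for parse_end_time, assuming NAMEJET_TIMEZONE is left at its America/Los_Angeles default (the sample strings are hypothetical):

    parse_end_time("12/15/2025 05:00 PM PST")  # -> datetime(2025, 12, 16, 1, 0), naive UTC (PST = UTC-8)
    parse_end_time("2025-12-15 17:00")         # -> datetime(2025, 12, 16, 1, 0)
    parse_end_time("1d 2h")                    # -> utcnow + 93,600 seconds
    parse_end_time("Ends soon")                # -> None, so the row is skipped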
@@ -441,11 +494,21 @@ class PlaywrightScraperService:
                 tld = domain.rsplit(".", 1)[-1]

                 # Parse end time from column 1
                 end_text = await cells[1].text_content()
+                end_time = parse_end_time(end_text or "")
+                if end_time is None:
+                    continue
+                if end_time <= datetime.utcnow():
+                    continue

                 # Parse price
                 price = 0
                 if len(cells) > 2:
                     price_text = await cells[2].text_content()
                     price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0")
+                if price <= 0:
+                    continue

                 # Parse bids
                 bids = 0
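
The digit-and-dot filter tolerates common currency formatting; a quick illustration with hypothetical values:

    text = "$1,234.56"
    price = float("".join(c for c in text if c.isdigit() or c == ".") or "0")
    assert price == 1234.56
    # "USD 70" -> 70.0; "" -> 0.0, which the new price <= 0 guard then skips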
@@ -458,13 +521,10 @@ class PlaywrightScraperService:
                     "tld": tld,
                     "platform": "NameJet",
                     "current_bid": price,
-                    "min_bid": 0,
                     "num_bids": bids,
-                    "end_time": datetime.utcnow() + timedelta(days=1),
-                    "buy_now_price": None,
+                    "end_time": end_time,
                     "auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}",
                     "currency": "USD",
                     "is_active": True,
                 })
             except Exception as e:
                 logger.debug(f"Error parsing row: {e}")
@@ -508,23 +568,7 @@ class PlaywrightScraperService:
         try:
             await self.initialize()

-            # Scrape GoDaddy
-            logger.info("Scraping GoDaddy with Playwright...")
-            godaddy_result = await self.scrape_godaddy()
-            results["platforms"]["GoDaddy"] = {
-                "found": len(godaddy_result.get("items", [])),
-                "source": godaddy_result.get("source", "unknown"),
-            }
-            results["items"].extend(godaddy_result.get("items", []))
-            results["total_found"] += len(godaddy_result.get("items", []))
-            if godaddy_result.get("error"):
-                results["errors"].append(f"GoDaddy: {godaddy_result['error']}")
-
-            # Small delay between platforms
-            await asyncio.sleep(3)
-
-            # Scrape NameJet
+            # Scrape NameJet (Cloudflare protected)
             logger.info("Scraping NameJet with Playwright...")
             namejet_result = await self.scrape_namejet()
             results["platforms"]["NameJet"] = {