fix(scraping): correct timezones + no ended auctions in API
- Treat Park.io close_date as America/New_York (configurable) and convert to UTC
- Ensure /stats, /scrape-status, /opportunities only count not-ended auctions
- Make NameJet Playwright scraper strict: requires real end_time + price
- Add Playwright proxy pool support (SCRAPER_PLAYWRIGHT_PROXY_POOL)
- Simplify protected scraping to NameJet only
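Before the diff, a minimal sketch of the timezone fix from the first bullet: a naive Park.io timestamp is attached to a configurable zone, then converted to UTC. The sample timestamp is invented for illustration.

    import os
    from datetime import datetime
    from zoneinfo import ZoneInfo

    close_raw = "2024-06-01 16:00:00"  # invented naive close_date string, as Park.io displays it

    # Same approach as the diff below: attach the platform timezone, then convert to UTC
    tz = ZoneInfo(os.getenv("PARKIO_TIMEZONE", "America/New_York"))
    local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
    end_time_utc = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)

    print(end_time_utc)  # 2024-06-01 20:00:00 (America/New_York is UTC-4 on this date)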
@@ -497,6 +497,7 @@ async def get_platform_stats(
 
     Data is scraped from public auction sites - no mock data.
     """
+    now = datetime.utcnow()
     # Get stats per platform
     stats_query = (
         select(
@@ -504,7 +505,12 @@ async def get_platform_stats(
             func.count(DomainAuction.id).label("count"),
             func.avg(DomainAuction.current_bid).label("avg_bid"),
         )
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .group_by(DomainAuction.platform)
     )
@@ -512,7 +518,7 @@ async def get_platform_stats(
     platform_data = result.all()
 
     # Get ending soon counts
-    cutoff = datetime.utcnow() + timedelta(hours=1)
+    cutoff = now + timedelta(hours=1)
     ending_query = (
         select(
             DomainAuction.platform,
@@ -521,6 +527,7 @@ async def get_platform_stats(
         .where(
             and_(
                 DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
                 DomainAuction.end_time <= cutoff,
             )
         )
@@ -548,6 +555,7 @@ async def get_scrape_status(
     db: AsyncSession = Depends(get_db),
 ):
     """Get status of auction scraping."""
+    now = datetime.utcnow()
     # Get last successful scrape
     last_scrape_query = (
         select(AuctionScrapeLog)
@@ -559,7 +567,12 @@ async def get_scrape_status(
     last_log = result.scalar_one_or_none()
 
     # Get total auctions
-    total_query = select(func.count(DomainAuction.id)).where(DomainAuction.is_active == True)
+    total_query = select(func.count(DomainAuction.id)).where(
+        and_(
+            DomainAuction.is_active == True,
+            DomainAuction.end_time > now,
+        )
+    )
     total_result = await db.execute(total_query)
     total = total_result.scalar() or 0
 
@@ -615,9 +628,15 @@ async def get_smart_opportunities(
     Opportunity Score = time_urgency × competition_factor × price_factor
     """
     # Get active auctions
+    now = datetime.utcnow()
    query = (
         select(DomainAuction)
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .order_by(DomainAuction.end_time.asc())
         .limit(100)
     )
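The /opportunities docstring above states Opportunity Score = time_urgency × competition_factor × price_factor, but the factor definitions sit outside this diff. The sketch below is one hypothetical reading of that formula, not the repository's implementation: it assumes sooner-ending, less-bid-on, cheaper auctions score higher.

    from datetime import datetime, timedelta

    def opportunity_score(end_time: datetime, num_bids: int, current_bid: float) -> float:
        # Hypothetical factors -- the real definitions are not shown in this commit
        hours_left = max((end_time - datetime.utcnow()).total_seconds() / 3600, 0.1)
        time_urgency = 1.0 / hours_left            # assumed: ending sooner scores higher
        competition_factor = 1.0 / (1 + num_bids)  # assumed: fewer bids scores higher
        price_factor = 1.0 / (1.0 + current_bid)   # assumed: cheaper scores higher
        return time_urgency * competition_factor * price_factor

    print(opportunity_score(datetime.utcnow() + timedelta(hours=2), num_bids=3, current_bid=50.0))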
@@ -32,6 +32,7 @@ from bs4 import BeautifulSoup
 from sqlalchemy import and_, delete, select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.asyncio import AsyncSession
+from zoneinfo import ZoneInfo
 
 from app.models.auction import AuctionScrapeLog, DomainAuction
 from app.services.dropcatch_api import dropcatch_client
@@ -526,8 +527,12 @@ class AuctionScraperService:
 
                 close_raw = cols[4].get_text(" ", strip=True)
                 try:
-                    # Park.io displays a naive timestamp. We treat it as UTC.
-                    end_time = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S")
+                    # Park.io displays a naive timestamp in their platform timezone.
+                    # Default timezone is America/New_York (configurable).
+                    tz_name = os.getenv("PARKIO_TIMEZONE", "America/New_York")
+                    tz = ZoneInfo(tz_name)
+                    local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
+                    end_time = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
                 except Exception:
                     continue
 
@@ -29,10 +29,12 @@ import json
 import logging
 import os
+import random
 import re
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
 from pathlib import Path
 from urllib.parse import urlparse
+from zoneinfo import ZoneInfo
 
 logger = logging.getLogger(__name__)
 
@@ -89,10 +91,20 @@ class PlaywrightScraperService:
         try:
             self.playwright = await async_playwright().start()
 
+            # Proxy selection:
+            # - SCRAPER_PLAYWRIGHT_PROXY: single proxy URL
+            # - SCRAPER_PLAYWRIGHT_PROXY_POOL: comma-separated proxy URLs, one is chosen randomly per browser start
+            proxy_pool_raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
+            proxy_pool = [p.strip() for p in proxy_pool_raw.split(",") if p.strip()] if proxy_pool_raw else []
+
             proxy_url = (
-                os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
-                or os.getenv("SCRAPER_PROXY_URL")
-                or os.getenv("SCRAPER_HTTP_PROXY")
+                random.choice(proxy_pool)
+                if proxy_pool
+                else (
+                    os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
+                    or os.getenv("SCRAPER_PROXY_URL")
+                    or os.getenv("SCRAPER_HTTP_PROXY")
+                )
             )
             proxy_config = None
             if proxy_url:
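For reference, a standalone sketch of how the pool added in the hunk above behaves; the proxy URLs are placeholders. One entry is chosen at random per browser start, and the single-proxy variables are consulted only when the pool is empty.

    import os
    import random

    os.environ["SCRAPER_PLAYWRIGHT_PROXY_POOL"] = "http://proxy-a:8080, http://proxy-b:8080"  # placeholder URLs

    raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
    pool = [p.strip() for p in raw.split(",") if p.strip()]

    # Falls back to the single-proxy variable only when no pool is configured
    proxy_url = random.choice(pool) if pool else os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
    print(proxy_url)  # prints one of the two placeholder proxies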
@@ -426,10 +438,51 @@ class PlaywrightScraperService:
         items = []
         rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row')
 
+        namejet_tz = os.getenv("NAMEJET_TIMEZONE", "America/Los_Angeles")
+
+        def parse_end_time(text: str) -> Optional[datetime]:
+            raw = (text or "").strip()
+            if not raw:
+                return None
+            # Sometimes they include a timezone abbreviation
+            raw = raw.replace("PST", "").replace("PDT", "").replace("EST", "").replace("EDT", "").replace("UTC", "").strip()
+
+            # Relative format like "1d 2h 3m" (rare)
+            m = re.findall(r"(\d+)\s*([dhms])", raw.lower())
+            if m:
+                secs = 0
+                for n, u in m:
+                    n_i = int(n)
+                    if u == "d":
+                        secs += n_i * 86400
+                    elif u == "h":
+                        secs += n_i * 3600
+                    elif u == "m":
+                        secs += n_i * 60
+                    elif u == "s":
+                        secs += n_i
+                if secs > 0:
+                    return (datetime.now(ZoneInfo("UTC")) + timedelta(seconds=secs)).replace(tzinfo=None)
+
+            # Absolute formats (common)
+            fmts = [
+                "%m/%d/%Y %I:%M %p",
+                "%m/%d/%Y %H:%M",
+                "%Y-%m-%d %H:%M:%S",
+                "%Y-%m-%d %H:%M",
+            ]
+            for fmt in fmts:
+                try:
+                    local = datetime.strptime(raw, fmt).replace(tzinfo=ZoneInfo(namejet_tz))
+                    return local.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
+                except Exception:
+                    continue
+            return None
+
         for row in rows[:limit]:
             try:
                 cells = await row.query_selector_all('td')
-                if len(cells) < 3:
+                if len(cells) < 4:
                     continue
 
                 # NameJet format: Domain, End Time, Price, Bids, ...
@@ -441,11 +494,21 @@ class PlaywrightScraperService:
 
                 tld = domain.rsplit(".", 1)[-1]
 
+                # Parse end time from column 1
+                end_text = await cells[1].text_content()
+                end_time = parse_end_time(end_text or "")
+                if end_time is None:
+                    continue
+                if end_time <= datetime.utcnow():
+                    continue
+
                 # Parse price
                 price = 0
                 if len(cells) > 2:
                     price_text = await cells[2].text_content()
                     price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0")
+                if price <= 0:
+                    continue
 
                 # Parse bids
                 bids = 0
@@ -458,13 +521,10 @@ class PlaywrightScraperService:
                     "tld": tld,
                     "platform": "NameJet",
                     "current_bid": price,
                     "min_bid": 0,
                     "num_bids": bids,
-                    "end_time": datetime.utcnow() + timedelta(days=1),
-                    "buy_now_price": None,
+                    "end_time": end_time,
                     "auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}",
                     "currency": "USD",
                     "is_active": True,
                 })
             except Exception as e:
                 logger.debug(f"Error parsing row: {e}")
@@ -508,23 +568,7 @@ class PlaywrightScraperService:
         try:
             await self.initialize()
 
-            # Scrape GoDaddy
-            logger.info("Scraping GoDaddy with Playwright...")
-            godaddy_result = await self.scrape_godaddy()
-            results["platforms"]["GoDaddy"] = {
-                "found": len(godaddy_result.get("items", [])),
-                "source": godaddy_result.get("source", "unknown"),
-            }
-            results["items"].extend(godaddy_result.get("items", []))
-            results["total_found"] += len(godaddy_result.get("items", []))
-
-            if godaddy_result.get("error"):
-                results["errors"].append(f"GoDaddy: {godaddy_result['error']}")
-
-            # Small delay between platforms
-            await asyncio.sleep(3)
-
-            # Scrape NameJet
+            # Scrape NameJet (Cloudflare protected)
             logger.info("Scraping NameJet with Playwright...")
             namejet_result = await self.scrape_namejet()
             results["platforms"]["NameJet"] = {