fix(scraping): correct timezones + no ended auctions in API
- Treat Park.io close_date as America/New_York (configurable) and convert to UTC (sketched below)
- Ensure /stats, /scrape-status, /opportunities only count not-ended auctions
- Make NameJet Playwright scraper strict: requires real end_time + price
- Add Playwright proxy pool support (SCRAPER_PLAYWRIGHT_PROXY_POOL)
- Simplify protected scraping to NameJet only
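The headline fix is the first bullet: Park.io renders close times as naive wall-clock strings, which the old code read as UTC. A minimal sketch of the corrected conversion, using only the PARKIO_TIMEZONE variable and timestamp format that appear in the diff below; the helper name and sample values are illustrative, not part of the commit:

import os
from datetime import datetime
from zoneinfo import ZoneInfo

def parkio_close_to_utc(close_raw: str) -> datetime:
    # Interpret the naive timestamp in the configured zone, then store naive UTC.
    tz = ZoneInfo(os.getenv("PARKIO_TIMEZONE", "America/New_York"))
    local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
    return local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)

# The same wall-clock string maps to different UTC instants across DST,
# which is why a zone name beats a fixed offset:
assert parkio_close_to_utc("2025-01-15 15:00:00") == datetime(2025, 1, 15, 20, 0)  # EST, UTC-5
assert parkio_close_to_utc("2025-07-15 15:00:00") == datetime(2025, 7, 15, 19, 0)  # EDT, UTC-4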
@@ -497,6 +497,7 @@ async def get_platform_stats(
 
     Data is scraped from public auction sites - no mock data.
     """
+    now = datetime.utcnow()
     # Get stats per platform
     stats_query = (
         select(
@@ -504,7 +505,12 @@ async def get_platform_stats(
             func.count(DomainAuction.id).label("count"),
             func.avg(DomainAuction.current_bid).label("avg_bid"),
         )
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .group_by(DomainAuction.platform)
     )
 
@@ -512,7 +518,7 @@ async def get_platform_stats(
     platform_data = result.all()
 
     # Get ending soon counts
-    cutoff = datetime.utcnow() + timedelta(hours=1)
+    cutoff = now + timedelta(hours=1)
     ending_query = (
         select(
             DomainAuction.platform,
@@ -521,6 +527,7 @@ async def get_platform_stats(
         .where(
             and_(
                 DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
                 DomainAuction.end_time <= cutoff,
             )
         )
@@ -548,6 +555,7 @@ async def get_scrape_status(
     db: AsyncSession = Depends(get_db),
 ):
     """Get status of auction scraping."""
+    now = datetime.utcnow()
     # Get last successful scrape
     last_scrape_query = (
         select(AuctionScrapeLog)
@@ -559,7 +567,12 @@ async def get_scrape_status(
     last_log = result.scalar_one_or_none()
 
     # Get total auctions
-    total_query = select(func.count(DomainAuction.id)).where(DomainAuction.is_active == True)
+    total_query = select(func.count(DomainAuction.id)).where(
+        and_(
+            DomainAuction.is_active == True,
+            DomainAuction.end_time > now,
+        )
+    )
     total_result = await db.execute(total_query)
     total = total_result.scalar() or 0
 
@@ -615,9 +628,15 @@ async def get_smart_opportunities(
     Opportunity Score = time_urgency × competition_factor × price_factor
     """
    # Get active auctions
+    now = datetime.utcnow()
     query = (
         select(DomainAuction)
-        .where(DomainAuction.is_active == True)
+        .where(
+            and_(
+                DomainAuction.is_active == True,
+                DomainAuction.end_time > now,
+            )
+        )
         .order_by(DomainAuction.end_time.asc())
         .limit(100)
     )
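The three endpoint hunks above add the same two-condition predicate. A self-contained sketch of the pattern; DomainAuction here is a trimmed stand-in for the real model in app.models.auction, and the not_ended() helper is illustrative, not in the commit:

from datetime import datetime
from sqlalchemy import Boolean, DateTime, Integer, String, and_, select
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

class Base(DeclarativeBase):
    pass

class DomainAuction(Base):
    __tablename__ = "domain_auctions"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    platform: Mapped[str] = mapped_column(String)
    is_active: Mapped[bool] = mapped_column(Boolean)
    end_time: Mapped[datetime] = mapped_column(DateTime)

def not_ended(now: datetime):
    # end_time is stored as naive UTC, so comparing against utcnow() stays consistent.
    return and_(DomainAuction.is_active == True, DomainAuction.end_time > now)  # noqa: E712

now = datetime.utcnow()
query = select(DomainAuction).where(not_ended(now)).order_by(DomainAuction.end_time.asc())
print(query)  # compiled SELECT contains both conditions

Capturing utcnow() once and reusing it, as the hunks do with `now`, also keeps the "not ended" and "ending soon" windows consistent within a single request.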
@@ -32,6 +32,7 @@ from bs4 import BeautifulSoup
 from sqlalchemy import and_, delete, select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.asyncio import AsyncSession
+from zoneinfo import ZoneInfo
 
 from app.models.auction import AuctionScrapeLog, DomainAuction
 from app.services.dropcatch_api import dropcatch_client
@@ -526,8 +527,12 @@ class AuctionScraperService:
 
             close_raw = cols[4].get_text(" ", strip=True)
             try:
-                # Park.io displays a naive timestamp. We treat it as UTC.
-                end_time = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S")
+                # Park.io displays a naive timestamp in their platform timezone.
+                # Default timezone is America/New_York (configurable).
+                tz_name = os.getenv("PARKIO_TIMEZONE", "America/New_York")
+                tz = ZoneInfo(tz_name)
+                local_dt = datetime.strptime(close_raw, "%Y-%m-%d %H:%M:%S").replace(tzinfo=tz)
+                end_time = local_dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
             except Exception:
                 continue
 
@@ -29,10 +29,12 @@ import json
 import logging
 import os
 import random
+import re
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
 from pathlib import Path
 from urllib.parse import urlparse
+from zoneinfo import ZoneInfo
 
 logger = logging.getLogger(__name__)
 
@@ -89,10 +91,20 @@ class PlaywrightScraperService:
         try:
             self.playwright = await async_playwright().start()
 
+            # Proxy selection:
+            # - SCRAPER_PLAYWRIGHT_PROXY: single proxy URL
+            # - SCRAPER_PLAYWRIGHT_PROXY_POOL: comma-separated proxy URLs, one is chosen randomly per browser start
+            proxy_pool_raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
+            proxy_pool = [p.strip() for p in proxy_pool_raw.split(",") if p.strip()] if proxy_pool_raw else []
+
             proxy_url = (
-                os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
-                or os.getenv("SCRAPER_PROXY_URL")
-                or os.getenv("SCRAPER_HTTP_PROXY")
+                random.choice(proxy_pool)
+                if proxy_pool
+                else (
+                    os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
+                    or os.getenv("SCRAPER_PROXY_URL")
+                    or os.getenv("SCRAPER_HTTP_PROXY")
+                )
             )
             proxy_config = None
             if proxy_url:
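The hunk above only chooses proxy_url; turning it into Playwright's launch argument happens below the excerpt, in the `if proxy_url:` branch. A plausible standalone sketch, assuming Playwright's documented proxy dict with "server"/"username"/"password" keys; the helper name is illustrative:

import os
import random
from typing import Optional
from urllib.parse import urlparse

def pick_proxy_config() -> Optional[dict]:
    # Pool takes priority; fall back to the single-proxy variable.
    pool_raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
    pool = [p.strip() for p in pool_raw.split(",") if p.strip()]
    url = random.choice(pool) if pool else os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
    if not url:
        return None
    parsed = urlparse(url)
    config = {"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}"}
    if parsed.username:
        config["username"] = parsed.username
    if parsed.password:
        config["password"] = parsed.password
    return config

# SCRAPER_PLAYWRIGHT_PROXY_POOL="http://u:p@proxy1.example:8080,http://proxy2.example:8080"
# yields e.g. {"server": "http://proxy1.example:8080", "username": "u", "password": "p"}

Choosing one proxy per browser start matches a launch-level Playwright proxy, which stays fixed for the lifetime of the browser instance.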
@@ -426,10 +438,51 @@ class PlaywrightScraperService:
         items = []
         rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row')
 
+        namejet_tz = os.getenv("NAMEJET_TIMEZONE", "America/Los_Angeles")
+
+        def parse_end_time(text: str) -> Optional[datetime]:
+            raw = (text or "").strip()
+            if not raw:
+                return None
+            # Sometimes they include timezone abbreviation
+            raw = raw.replace("PST", "").replace("PDT", "").replace("EST", "").replace("EDT", "").replace("UTC", "").strip()
+
+            # Relative format like "1d 2h 3m" (rare)
+            m = re.findall(r"(\d+)\s*([dhms])", raw.lower())
+            if m:
+                secs = 0
+                for n, u in m:
+                    n_i = int(n)
+                    if u == "d":
+                        secs += n_i * 86400
+                    elif u == "h":
+                        secs += n_i * 3600
+                    elif u == "m":
+                        secs += n_i * 60
+                    elif u == "s":
+                        secs += n_i
+                if secs > 0:
+                    return (datetime.now(ZoneInfo("UTC")) + timedelta(seconds=secs)).replace(tzinfo=None)
+
+            # Absolute formats (common)
+            fmts = [
+                "%m/%d/%Y %I:%M %p",
+                "%m/%d/%Y %H:%M",
+                "%Y-%m-%d %H:%M:%S",
+                "%Y-%m-%d %H:%M",
+            ]
+            for fmt in fmts:
+                try:
+                    local = datetime.strptime(raw, fmt).replace(tzinfo=ZoneInfo(namejet_tz))
+                    return local.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
+                except Exception:
+                    continue
+            return None
+
         for row in rows[:limit]:
             try:
                 cells = await row.query_selector_all('td')
-                if len(cells) < 3:
+                if len(cells) < 4:
                     continue
 
                 # NameJet format: Domain, End Time, Price, Bids, ...
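A quick worked check of the relative-format branch above: "1d 2h 3m" is 1×86400 + 2×3600 + 3×60 = 93780 seconds. A standalone rewrite for illustration only; the scraper's nested version also falls through to the absolute formats and returns naive UTC to match the rest of the pipeline:

import re
from datetime import datetime, timedelta, timezone

UNIT_SECONDS = {"d": 86400, "h": 3600, "m": 60, "s": 1}

def relative_to_naive_utc(raw: str) -> datetime:
    # Sum every "<number><unit>" group, then offset from the current UTC time.
    secs = sum(int(n) * UNIT_SECONDS[u] for n, u in re.findall(r"(\d+)\s*([dhms])", raw.lower()))
    return (datetime.now(timezone.utc) + timedelta(seconds=secs)).replace(tzinfo=None)

secs = sum(int(n) * UNIT_SECONDS[u] for n, u in re.findall(r"(\d+)\s*([dhms])", "1d 2h 3m"))
assert secs == 93780
eta = relative_to_naive_utc("1d 2h 3m")  # ~93780 seconds ahead of now, as naive UTC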
@@ -441,11 +494,21 @@ class PlaywrightScraperService:
 
                 tld = domain.rsplit(".", 1)[-1]
 
+                # Parse end time from column 1
+                end_text = await cells[1].text_content()
+                end_time = parse_end_time(end_text or "")
+                if end_time is None:
+                    continue
+                if end_time <= datetime.utcnow():
+                    continue
+
                 # Parse price
                 price = 0
                 if len(cells) > 2:
                     price_text = await cells[2].text_content()
                     price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0")
+                if price <= 0:
+                    continue
 
                 # Parse bids
                 bids = 0
@@ -458,13 +521,10 @@ class PlaywrightScraperService:
                     "tld": tld,
                     "platform": "NameJet",
                     "current_bid": price,
-                    "min_bid": 0,
                     "num_bids": bids,
-                    "end_time": datetime.utcnow() + timedelta(days=1),
-                    "buy_now_price": None,
+                    "end_time": end_time,
                     "auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}",
                     "currency": "USD",
-                    "is_active": True,
                 })
             except Exception as e:
                 logger.debug(f"Error parsing row: {e}")
@@ -508,23 +568,7 @@ class PlaywrightScraperService:
         try:
             await self.initialize()
 
-            # Scrape GoDaddy
-            logger.info("Scraping GoDaddy with Playwright...")
-            godaddy_result = await self.scrape_godaddy()
-            results["platforms"]["GoDaddy"] = {
-                "found": len(godaddy_result.get("items", [])),
-                "source": godaddy_result.get("source", "unknown"),
-            }
-            results["items"].extend(godaddy_result.get("items", []))
-            results["total_found"] += len(godaddy_result.get("items", []))
-
-            if godaddy_result.get("error"):
-                results["errors"].append(f"GoDaddy: {godaddy_result['error']}")
-
-            # Small delay between platforms
-            await asyncio.sleep(3)
-
-            # Scrape NameJet
+            # Scrape NameJet (Cloudflare protected)
             logger.info("Scraping NameJet with Playwright...")
             namejet_result = await self.scrape_namejet()
             results["platforms"]["NameJet"] = {