- Treat Park.io close_date as America/New_York (configurable) and convert to UTC
- Ensure /stats, /scrape-status, /opportunities only count not-ended auctions
- Make NameJet Playwright scraper strict: requires real end_time + price
- Add Playwright proxy pool support (SCRAPER_PLAYWRIGHT_PROXY_POOL)
- Simplify protected scraping to NameJet only
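
Example pool value (hypothetical endpoints; comma-separated proxy URLs, one picked at random per browser start):

    SCRAPER_PLAYWRIGHT_PROXY_POOL=http://user:pass@proxy-a.example.com:8080,http://user:pass@proxy-b.example.com:8080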
"""
|
|
Playwright-based Stealth Scraper for Cloudflare-protected Domain Auction Sites.
|
|
|
|
This module uses Playwright with stealth plugins to bypass Cloudflare and other
|
|
anti-bot protections. It's designed for enterprise-grade web scraping.
|
|
|
|
Features:
|
|
- Stealth mode (undetectable browser fingerprint)
|
|
- Automatic Cloudflare bypass
|
|
- Connection pooling
|
|
- Retry logic with exponential backoff
|
|
- JSON extraction from rendered pages
|
|
- Cookie persistence across sessions
|
|
|
|
Supported Platforms:
|
|
- GoDaddy Auctions (Cloudflare protected)
|
|
- NameJet (Cloudflare protected)
|
|
- Any other protected auction site
|
|
|
|
Usage:
|
|
scraper = PlaywrightScraperService()
|
|
await scraper.initialize()
|
|
auctions = await scraper.scrape_godaddy()
|
|
await scraper.close()
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import random
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from typing import Any, Dict, List, Optional
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
from zoneinfo import ZoneInfo
|
|
|
|
logger = logging.getLogger(__name__)

# Try to import playwright (optional dependency)
try:
    from playwright.async_api import async_playwright, Browser, BrowserContext, Page
    from playwright_stealth import Stealth

    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False
    Stealth = None
    # Define dummy types for type hints
    Browser = Any
    BrowserContext = Any
    Page = Any
    logger.warning("Playwright not installed. Stealth scraping disabled.")


class PlaywrightScraperService:
    """
    Enterprise-grade Playwright scraper with Cloudflare bypass.

    Uses stealth techniques to appear as a real browser:
    - Real Chrome user agent
    - WebGL fingerprint spoofing
    - Navigator property spoofing
    - Timezone and locale matching
    """

    # User agents that work well with Cloudflare
    USER_AGENTS = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    ]

    def __init__(self):
        self.playwright = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self._initialized = False
        self._cookie_dir = Path(__file__).parent.parent.parent / "data" / "cookies"
        self._cookie_dir.mkdir(parents=True, exist_ok=True)

    async def initialize(self) -> bool:
        """Initialize the browser instance."""
        if not PLAYWRIGHT_AVAILABLE:
            logger.error("Playwright not available. Install with: pip install playwright playwright-stealth")
            return False

        if self._initialized:
            return True

        try:
            self.playwright = await async_playwright().start()

            # Proxy selection:
            # - SCRAPER_PLAYWRIGHT_PROXY: single proxy URL
            # - SCRAPER_PLAYWRIGHT_PROXY_POOL: comma-separated proxy URLs, one is chosen randomly per browser start
            proxy_pool_raw = os.getenv("SCRAPER_PLAYWRIGHT_PROXY_POOL", "").strip()
            proxy_pool = [p.strip() for p in proxy_pool_raw.split(",") if p.strip()] if proxy_pool_raw else []
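            # Example (hypothetical endpoints):
            #   SCRAPER_PLAYWRIGHT_PROXY_POOL="http://u:p@proxy-a:8080, http://u:p@proxy-b:8080"
            # parses to two candidates; whitespace around commas is stripped and empty entries are dropped.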

            proxy_url = (
                random.choice(proxy_pool)
                if proxy_pool
                else (
                    os.getenv("SCRAPER_PLAYWRIGHT_PROXY")
                    or os.getenv("SCRAPER_PROXY_URL")
                    or os.getenv("SCRAPER_HTTP_PROXY")
                )
            )
            proxy_config = None
            if proxy_url:
                parsed = urlparse(proxy_url)
                if parsed.scheme and parsed.hostname and parsed.port:
                    proxy_config = {
                        "server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
                    }
                    if parsed.username:
                        proxy_config["username"] = parsed.username
                    if parsed.password:
                        proxy_config["password"] = parsed.password
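            # e.g. "http://user:pass@proxy.example.com:8080" (hypothetical) yields
            # {"server": "http://proxy.example.com:8080", "username": "user", "password": "pass"};
            # URLs missing a scheme, host, or port are ignored and no proxy is used.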

            headless = os.getenv("PLAYWRIGHT_HEADLESS", "true").lower() in ("1", "true", "yes")

            # Launch with stealth settings
            self.browser = await self.playwright.chromium.launch(
                headless=headless,
                proxy=proxy_config,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--disable-dev-shm-usage",
                    "--no-sandbox",
                    "--disable-setuid-sandbox",
                    "--disable-infobars",
                    "--disable-extensions",
                    "--window-size=1920,1080",
                ],
            )

            # Create context with realistic settings
            self.context = await self.browser.new_context(
                user_agent=random.choice(self.USER_AGENTS),
                viewport={"width": 1920, "height": 1080},
                locale="en-US",
                timezone_id="America/New_York",
                geolocation={"longitude": -73.935242, "latitude": 40.730610},
                permissions=["geolocation"],
            )

            # Load saved cookies if available
            await self._load_cookies()

            self._initialized = True
            logger.info("Playwright browser initialized successfully")
            return True

        except Exception as e:
            logger.exception(f"Failed to initialize Playwright: {e}")
            return False

    async def close(self):
        """Close browser and clean up."""
        if self.context:
            await self._save_cookies()
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
        self._initialized = False

    async def _load_cookies(self):
        """Load saved cookies from file."""
        cookie_file = self._cookie_dir / "session_cookies.json"
        if cookie_file.exists():
            try:
                with open(cookie_file) as f:
                    cookies = json.load(f)
                await self.context.add_cookies(cookies)
                logger.info(f"Loaded {len(cookies)} saved cookies")
            except Exception as e:
                logger.warning(f"Failed to load cookies: {e}")

    async def _save_cookies(self):
        """Save cookies to file for persistence."""
        try:
            cookies = await self.context.cookies()
            cookie_file = self._cookie_dir / "session_cookies.json"
            with open(cookie_file, "w") as f:
                json.dump(cookies, f)
            logger.info(f"Saved {len(cookies)} cookies")
        except Exception as e:
            logger.warning(f"Failed to save cookies: {e}")

    async def _create_stealth_page(self) -> Page:
        """Create a new page with stealth mode enabled."""
        page = await self.context.new_page()

        # Apply stealth mode
        if Stealth:
            stealth = Stealth(
                navigator_webdriver=True,
                chrome_runtime=True,
                navigator_user_agent=True,
                navigator_vendor=True,
                webgl_vendor=True,
            )
            await stealth.apply_stealth_async(page)

        return page

    async def _wait_for_cloudflare(self, page: Page, timeout: int = 30):
        """Wait for the Cloudflare challenge to complete."""
        try:
            # Wait for either the challenge to complete or content to load
            await page.wait_for_function(
                """
                () => {
                    // Check if we're past Cloudflare
                    const title = document.title.toLowerCase();
                    return !title.includes('just a moment') &&
                           !title.includes('attention required') &&
                           !title.includes('checking your browser');
                }
                """,
                timeout=timeout * 1000,
            )
            # Additional delay for any remaining JS to execute
            await asyncio.sleep(2)
        except Exception as e:
            logger.warning(f"Cloudflare wait timeout: {e}")

    # ═══════════════════════════════════════════════════════════════════════════
    # GODADDY AUCTIONS SCRAPER
    # ═══════════════════════════════════════════════════════════════════════════

    async def scrape_godaddy(self, limit: int = 100) -> Dict[str, Any]:
        """
        Scrape GoDaddy Auctions using Playwright.

        GoDaddy uses Cloudflare + their own bot detection.
        We intercept the API calls made by their frontend.
        """
        if not await self.initialize():
            return {"items": [], "total": 0, "error": "Playwright not initialized"}

        page = None
        try:
            page = await self._create_stealth_page()

            # Intercept XHR responses to capture auction data
            captured_data = []

            async def handle_response(response):
                if "findApiProxy" in response.url and "auction" in response.url:
                    try:
                        data = await response.json()
                        captured_data.append(data)
                    except Exception:
                        pass  # Non-JSON or truncated response; ignore

            page.on("response", handle_response)

            # Navigate to GoDaddy Auctions
            logger.info("Navigating to GoDaddy Auctions...")
            await page.goto("https://auctions.godaddy.com/beta", wait_until="domcontentloaded", timeout=60_000)

            # Wait for Cloudflare
            await self._wait_for_cloudflare(page)

            # Wait for auction content to load
            try:
                await page.wait_for_selector('[data-testid="auction-card"], .auction-card, .domain-item', timeout=15000)
            except Exception:
                logger.warning("Auction cards not found, trying to scroll...")

            # Scroll to trigger lazy loading
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)")
            await asyncio.sleep(2)

            # Prefer data from intercepted API calls
            if captured_data:
                return self._parse_godaddy_api_response(captured_data)

            # Fallback: extract from the DOM
            return await self._extract_godaddy_from_dom(page)

        except Exception as e:
            logger.exception(f"GoDaddy scraping error: {e}")
            return {"items": [], "total": 0, "error": str(e)}
        finally:
            if page:
                await page.close()
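
    # The parser below expects captured payloads roughly shaped like:
    #   {"results": [{"fqdn": "example.com", "price": 250, "endingAt": "2025-01-01T17:00:00Z", "bids": 3}]}
    # (hypothetical example; field names inferred from the accessor calls below, not from official GoDaddy docs)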
    def _parse_godaddy_api_response(self, captured_data: List[Dict]) -> Dict[str, Any]:
        """Parse captured API responses from GoDaddy."""
        items = []

        for data in captured_data:
            results = data.get("results", [])
            for item in results:
                domain = item.get("fqdn", "") or item.get("domain", "")
                if not domain:
                    continue

                tld = domain.rsplit(".", 1)[-1] if "." in domain else ""

                # Parse end time and normalize to naive UTC
                end_time = None
                end_at = item.get("endingAt") or item.get("auctionEndTime")
                if end_at:
                    try:
                        dt = datetime.fromisoformat(end_at.replace("Z", "+00:00"))
                        if dt.tzinfo is not None:
                            # Convert any offset to UTC, then drop tzinfo (naive UTC)
                            dt = dt.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
                        end_time = dt
                    except Exception:
                        pass

                price = item.get("price") or item.get("currentBidPrice") or item.get("minBid") or 0

                items.append({
                    "domain": domain,
                    "tld": tld,
                    "platform": "GoDaddy",
                    "current_bid": float(price) if price else 0,
                    "min_bid": float(item.get("minBid", 0) or 0),
                    "num_bids": int(item.get("bids", 0) or item.get("bidCount", 0) or 0),
                    "end_time": end_time or datetime.utcnow() + timedelta(days=1),
                    "buy_now_price": float(item.get("buyNowPrice")) if item.get("buyNowPrice") else None,
                    "auction_url": f"https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}&isc=cjcpounce",
                    "currency": "USD",
                    "is_active": True,
                    "traffic": int(item.get("traffic", 0) or 0),
                    # Note: GoDaddy's estimated valuation is stored in the domain_authority field
                    "domain_authority": int(item.get("valuationPrice", 0) or 0),
                })

        return {
            "items": items,
            "total": len(items),
            "source": "api_intercept",
        }

    async def _extract_godaddy_from_dom(self, page: Page) -> Dict[str, Any]:
        """Extract auction data from the GoDaddy DOM when API interception fails."""
        items = []

        try:
            # Try different selectors
            selectors = [
                '[data-testid="auction-card"]',
                '.auction-card',
                '.domain-listing',
                'tr[data-domain]',
                '.domain-row',
            ]

            for selector in selectors:
                elements = await page.query_selector_all(selector)
                if elements:
                    logger.info(f"Found {len(elements)} elements with selector: {selector}")

                    for el in elements[:100]:  # Max 100 items
                        try:
                            # Try to extract the domain name
                            domain_el = await el.query_selector('.domain-name, .fqdn, [data-domain], a[href*="domain"]')
                            if domain_el:
                                domain = await domain_el.text_content()
                                domain = domain.strip() if domain else ""
                            else:
                                domain = await el.get_attribute("data-domain") or ""

                            if not domain or "." not in domain:
                                continue

                            tld = domain.rsplit(".", 1)[-1]

                            # Try to extract the price
                            price = 0
                            price_el = await el.query_selector('.price, .bid, .current-bid, [data-price]')
                            if price_el:
                                price_text = await price_el.text_content()
                                price = float("".join(c for c in (price_text or "") if c.isdigit() or c == ".") or "0")

                            items.append({
                                "domain": domain,
                                "tld": tld,
                                "platform": "GoDaddy",
                                "current_bid": price,
                                "min_bid": 0,
                                "num_bids": 0,
                                "end_time": datetime.utcnow() + timedelta(days=1),
                                "buy_now_price": None,
                                "auction_url": f"https://auctions.godaddy.com/trpItemListing.aspx?domain={domain}&isc=cjcpounce",
                                "currency": "USD",
                                "is_active": True,
                            })
                        except Exception as e:
                            logger.debug(f"Error extracting element: {e}")

                    break  # Found elements, stop trying other selectors

        except Exception as e:
            logger.exception(f"DOM extraction error: {e}")

        return {
            "items": items,
            "total": len(items),
            "source": "dom_extraction",
        }

    # ═══════════════════════════════════════════════════════════════════════════
    # NAMEJET SCRAPER
    # ═══════════════════════════════════════════════════════════════════════════

    async def scrape_namejet(self, limit: int = 100) -> Dict[str, Any]:
        """
        Scrape NameJet auctions using Playwright.

        NameJet uses heavy Cloudflare protection.
        """
        if not await self.initialize():
            return {"items": [], "total": 0, "error": "Playwright not initialized"}

        page = None
        try:
            page = await self._create_stealth_page()

            # Navigate to the NameJet auctions page
            logger.info("Navigating to NameJet...")
            await page.goto("https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx", wait_until="domcontentloaded", timeout=60_000)

            # Wait for Cloudflare
            await self._wait_for_cloudflare(page)

            # Wait for the auction table
            try:
                await page.wait_for_selector('#MainContent_gvAuctions, .auction-table, table', timeout=15000)
            except Exception:
                logger.warning("NameJet table not found")

            # Extract data from the table
            items = []
            rows = await page.query_selector_all('tr[data-id], #MainContent_gvAuctions tr, .auction-row')

            namejet_tz = os.getenv("NAMEJET_TIMEZONE", "America/Los_Angeles")

            def parse_end_time(text: str) -> Optional[datetime]:
                raw = (text or "").strip()
                if not raw:
                    return None
                # Sometimes a timezone abbreviation is included
                raw = raw.replace("PST", "").replace("PDT", "").replace("EST", "").replace("EDT", "").replace("UTC", "").strip()

                # Relative format like "1d 2h 3m" (rare)
                m = re.findall(r"(\d+)\s*([dhms])", raw.lower())
                if m:
                    secs = 0
                    for n, u in m:
                        n_i = int(n)
                        if u == "d":
                            secs += n_i * 86400
                        elif u == "h":
                            secs += n_i * 3600
                        elif u == "m":
                            secs += n_i * 60
                        elif u == "s":
                            secs += n_i
                    if secs > 0:
                        return (datetime.now(ZoneInfo("UTC")) + timedelta(seconds=secs)).replace(tzinfo=None)

                # Absolute formats (common); interpreted in the configured NameJet timezone
                fmts = [
                    "%m/%d/%Y %I:%M %p",
                    "%m/%d/%Y %H:%M",
                    "%Y-%m-%d %H:%M:%S",
                    "%Y-%m-%d %H:%M",
                ]
                for fmt in fmts:
                    try:
                        local = datetime.strptime(raw, fmt).replace(tzinfo=ZoneInfo(namejet_tz))
                        return local.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)
                    except Exception:
                        continue
                return None
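
            # parse_end_time examples (hypothetical inputs, NAMEJET_TIMEZONE=America/Los_Angeles):
            #   "12/31/2025 03:15 PM PST" -> 2025-12-31 23:15 (naive UTC)
            #   "1d 2h"                   -> now (UTC) + 26 hours
            #   "Ends soon"               -> None (row skipped by the strict checks below)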

            for row in rows[:limit]:
                try:
                    cells = await row.query_selector_all('td')
                    if len(cells) < 4:
                        continue

                    # NameJet column order: Domain, End Time, Price, Bids, ...
                    domain = await cells[0].text_content()
                    domain = domain.strip() if domain else ""

                    if not domain or "." not in domain:
                        continue

                    tld = domain.rsplit(".", 1)[-1]

                    # Parse the end time from column 1; skip rows without a real,
                    # still-future end time (strict mode)
                    end_text = await cells[1].text_content()
                    end_time = parse_end_time(end_text or "")
                    if end_time is None:
                        continue
                    if end_time <= datetime.utcnow():
                        continue

                    # Parse the price; skip rows without a real price (strict mode)
                    price = 0
                    if len(cells) > 2:
                        price_text = await cells[2].text_content()
                        price = float("".join(c for c in (price_text or "0") if c.isdigit() or c == ".") or "0")
                    if price <= 0:
                        continue

                    # Parse the bid count
                    bids = 0
                    if len(cells) > 3:
                        bids_text = await cells[3].text_content()
                        bids = int("".join(c for c in (bids_text or "0") if c.isdigit()) or "0")

                    items.append({
                        "domain": domain,
                        "tld": tld,
                        "platform": "NameJet",
                        "current_bid": price,
                        "num_bids": bids,
                        "end_time": end_time,
                        "auction_url": f"https://www.namejet.com/Pages/Auctions/ViewAuctions.aspx?domain={domain}",
                        "currency": "USD",
                    })
                except Exception as e:
                    logger.debug(f"Error parsing row: {e}")

            return {
                "items": items,
                "total": len(items),
                "source": "playwright",
            }

        except Exception as e:
            logger.exception(f"NameJet scraping error: {e}")
            return {"items": [], "total": 0, "error": str(e)}
        finally:
            if page:
                await page.close()

    # ═══════════════════════════════════════════════════════════════════════════
    # UNIFIED SCRAPE METHOD
    # ═══════════════════════════════════════════════════════════════════════════

    async def scrape_all_protected(self) -> Dict[str, Any]:
        """
        Scrape all Cloudflare-protected platforms.

        Simplified to NameJet only; scrape_godaddy() remains available
        for direct calls but is not invoked here.
        """
        results = {
            "total_found": 0,
            "platforms": {},
            "items": [],
            "errors": [],
        }

        if not PLAYWRIGHT_AVAILABLE:
            results["errors"].append("Playwright not installed")
            return results

        try:
            await self.initialize()

            # Scrape NameJet (Cloudflare protected)
            logger.info("Scraping NameJet with Playwright...")
            namejet_result = await self.scrape_namejet()
            results["platforms"]["NameJet"] = {
                "found": len(namejet_result.get("items", [])),
                "source": namejet_result.get("source", "unknown"),
            }
            results["items"].extend(namejet_result.get("items", []))
            results["total_found"] += len(namejet_result.get("items", []))

            if namejet_result.get("error"):
                results["errors"].append(f"NameJet: {namejet_result['error']}")

        except Exception as e:
            logger.exception(f"Playwright scraping error: {e}")
            results["errors"].append(str(e))
        finally:
            await self.close()

        return results


# Singleton instance
playwright_scraper = PlaywrightScraperService()
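

# Minimal usage sketch (assumes Playwright and its Chromium build are installed:
# pip install playwright playwright-stealth && playwright install chromium).
if __name__ == "__main__":
    async def _demo():
        result = await playwright_scraper.scrape_all_protected()
        print(f"Found {result['total_found']} auctions; errors: {result['errors']}")
        for item in result["items"][:5]:
            print(item["domain"], item["current_bid"], item["end_time"])

    asyncio.run(_demo())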