fix(scraping): real auctions only + cleanup

- Remove seed/demo auction endpoint + scripts (no mock data)
- Rebuild AuctionScraper: strict validation (rejects `--`/missing bids, requires end_time)
- Add robust sources:
  - ExpiredDomains provider auction pages (GoDaddy/Namecheap/Sedo)
  - Park.io auctions table
  - Sav load_domains_ajax table
- Simplify hidden API scrapers to Dynadot only
- Add unique index on (platform, domain) + safe upsert
- Update deployment/docs to reflect real scraping
2025-12-11 21:50:33 +01:00
parent fce87b6550
commit 5e0d4c6590
10 changed files with 933 additions and 2577 deletions


@ -197,48 +197,29 @@ With these improvements, Pounce becomes a **real premium tool** that needs no ext
---
## ⚠️ CRITICAL PROBLEM: Sample data vs. real data
### Current state of the auction data:
**The scraping is implemented, BUT:**
1. **ExpiredDomains.net**: works, but:
   - Prices are **estimated** (not real): `estimated_price = base_prices.get(tld, 15)`
   - These are registration prices, NOT auction prices
2. **GoDaddy/Sedo/NameJet/DropCatch**: scraping exists, but:
   - The sites have anti-bot measures
   - Layouts change regularly
   - **Sample data is currently often used as a fallback**
3. **In practice, the page often shows:**
```python
# backend/app/services/auction_scraper.py:689-780
async def seed_sample_auctions(self, db: AsyncSession):
    # THIS DATA IS FAKE (demo data)!
    sample_auctions = [
        {"domain": "techflow.io", "platform": "GoDaddy", "current_bid": 250, ...},
        ...
    ]
```
### 🚨 Required for premium quality:
1. **No estimated prices** - show only real auction prices
2. **Clear labeling** - if data is uncertain, communicate it transparently
3. **Fallback strategy** - if scraping fails, show no fake data
### Recommended changes:
```python
# Instead of estimated prices:
"current_bid": float(estimated_price),  # ❌ WRONG
# Better:
"current_bid": None,  # no price = no false information
"price_type": "registration_estimate",  # labeling
```
## ✅ RESOLVED: No sample/fake data in the auction feed
### New state of the auction data (as of 2025-12)
**The scraping now delivers exclusively real auction data** (no estimated prices, no random fallback, no seed/demo data):
1. **GoDaddy / Namecheap / Sedo** (robust, no Cloudflare issues):
   - Ingestion via the ExpiredDomains provider pages with **Price / Bids / Endtime**
   - Advantage: we don't have to scrape the Cloudflare-protected providers directly, but still get real live data.
2. **Park.io**
   - Scrapes the public auctions table (incl. **Price / Bids / Close Date**)
3. **Sav**
   - Scrapes the `load_domains_ajax/*` table endpoint (incl. **Price / Bids / Time left** → deterministic `end_time` derivation)
4. **Dynadot**
   - Hidden JSON API (frontend API) with real price and end-time fields
### Data quality rules
- **`current_bid > 0` and `end_time` must be present**, otherwise the record is discarded.
- There is **no longer** a `/api/v1/auctions/seed` endpoint and **no** seed/demo scripts.
---
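The two rules above are the heart of the new strictness: rows without a real bid or end time are dropped, and sources that only expose a "time left" value (like Sav) get a deterministic `end_time` derived at scrape time. A minimal sketch of such normalization, using illustrative field and helper names rather than the actual scraper API:

```python
import re
from datetime import datetime, timedelta
from typing import Optional

def parse_time_left(value: str) -> Optional[timedelta]:
    """Turn strings like '2d 4h' or '12h 30m' into a timedelta (illustrative parser)."""
    days = re.search(r"(\d+)\s*d", value)
    hours = re.search(r"(\d+)\s*h", value)
    minutes = re.search(r"(\d+)\s*m", value)
    if not (days or hours or minutes):
        return None
    return timedelta(
        days=int(days.group(1)) if days else 0,
        hours=int(hours.group(1)) if hours else 0,
        minutes=int(minutes.group(1)) if minutes else 0,
    )

def normalize_row(row: dict) -> Optional[dict]:
    """Apply the data-quality rules: no real bid or no end time -> drop the row."""
    bid_raw = str(row.get("current_bid", "")).replace("$", "").replace(",", "").strip()
    if bid_raw in ("", "--"):
        return None
    try:
        bid = float(bid_raw)
    except ValueError:
        return None
    if bid <= 0:
        return None
    end_time = row.get("end_time")
    if end_time is None and row.get("time_left"):
        delta = parse_time_left(str(row["time_left"]))
        end_time = datetime.utcnow() + delta if delta else None
    if end_time is None:
        return None
    return {**row, "current_bid": bid, "end_time": end_time}
```

Rows that fail either check never reach the database, so the feed can be empty but never fabricated.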


@ -48,8 +48,8 @@ python init_db.py
# Seed TLD prices
python seed_tld_prices.py
# Seed auctions (optional, for demo data)
python seed_auctions.py
# Initial auction scrape (real data, no demo data)
python scripts/scrape_auctions.py
# Create Stripe products
python -c "


@ -599,27 +599,6 @@ async def trigger_scrape(
        raise HTTPException(status_code=500, detail=f"Scrape failed: {str(e)}")
@router.post("/seed")
async def seed_auctions(
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """
    Seed the database with realistic sample auction data.
    Useful for development and demo purposes.
    """
    try:
        result = await auction_scraper.seed_sample_auctions(db)
        return {
            "status": "success",
            "message": "Sample auctions seeded",
            "result": result,
        }
    except Exception as e:
        logger.error(f"Seeding failed: {e}")
        raise HTTPException(status_code=500, detail=f"Seeding failed: {str(e)}")
@router.get("/opportunities")
async def get_smart_opportunities(
    current_user: User = Depends(get_current_user),


@ -62,7 +62,8 @@ class DomainAuction(Base):
    # Indexes for common queries
    __table_args__ = (
        Index('ix_auctions_platform_domain', 'platform', 'domain'),
        # Enforce de-duplication at the database level.
        Index('ux_auctions_platform_domain', 'platform', 'domain', unique=True),
        Index('ix_auctions_end_time_active', 'end_time', 'is_active'),
        Index('ix_auctions_tld_bid', 'tld', 'current_bid'),
    )

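The "safe upsert" that pairs with this unique index lives in the rebuilt `AuctionScraper`, whose diff is suppressed below; the following is only a sketch of how an upsert keyed on `(platform, domain)` might look with SQLAlchemy's SQLite dialect (the Postgres dialect exposes the same `on_conflict_do_update` API):

```python
from sqlalchemy.dialects.sqlite import insert as sqlite_insert
from sqlalchemy.ext.asyncio import AsyncSession

async def upsert_auction(db: AsyncSession, data: dict) -> None:
    """Insert a scraped auction, or refresh the existing (platform, domain) row."""
    # DomainAuction is the model declared above; its import path is omitted here.
    stmt = sqlite_insert(DomainAuction).values(**data)
    stmt = stmt.on_conflict_do_update(
        index_elements=["platform", "domain"],
        set_={
            "current_bid": stmt.excluded.current_bid,
            "end_time": stmt.excluded.end_time,
            "is_active": stmt.excluded.is_active,
        },
    )
    await db.execute(stmt)
```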
File diff suppressed because it is too large

File diff suppressed because it is too large


@ -257,7 +257,7 @@ class PremiumDataCollector:
        """
        Collect auction data from all platforms.
        Prioritizes real data over sample/estimated data.
        Collects only real auction data (no seed/demo data).
        """
        logger.info("🔄 Starting auction collection...")
        start_time = datetime.utcnow()
@ -266,14 +266,6 @@ class PremiumDataCollector:
        # Try real scraping first
        result = await self.auction_scraper.scrape_all_platforms(db)
        total_found = result.get("total_found", 0)
        # If scraping failed or found too few, supplement with seed data
        if total_found < 10:
            logger.warning(f"⚠️ Only {total_found} auctions scraped, adding seed data...")
            seed_result = await self.auction_scraper.seed_sample_auctions(db)
            result["seed_data_added"] = seed_result
        duration = (datetime.utcnow() - start_time).total_seconds()
        logger.info(f"✅ Auctions collected in {duration:.1f}s")

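With the seed fallback removed, a thin or empty scrape run should be surfaced rather than papered over; one possible way to do that with the surrounding `logger` and `total_found` (hypothetical, not part of this diff):

```python
# Hypothetical: warn on an empty run instead of backfilling demo data.
if total_found == 0:
    logger.warning("⚠️ No auctions scraped in this run - check the source scrapers.")
```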

@ -32,6 +32,42 @@ logging.basicConfig(
)
logger = logging.getLogger(__name__)
async def ensure_auction_uniqueness():
    """
    Ensure we have a unique index on (platform, domain) and clean duplicates once.
    This prevents duplicate rows when the scraper runs repeatedly (cron) and when
    the session uses autoflush=False.
    """
    from sqlalchemy import text
    from app.config import get_settings
    settings = get_settings()
    db_url = settings.database_url or ""
    async with AsyncSessionLocal() as db:
        # Best-effort de-duplication (SQLite only).
        if db_url.startswith("sqlite"):
            await db.execute(
                text(
                    """
                    DELETE FROM domain_auctions
                    WHERE id NOT IN (
                        SELECT MAX(id) FROM domain_auctions GROUP BY platform, domain
                    )
                    """
                )
            )
            await db.commit()
        # Create unique index (works for SQLite and Postgres).
        await db.execute(
            text(
                "CREATE UNIQUE INDEX IF NOT EXISTS ux_auctions_platform_domain ON domain_auctions(platform, domain)"
            )
        )
        await db.commit()
async def run_scrapers():
    """Run all auction scrapers."""
@ -109,6 +145,9 @@ def main():
    print(f" Started: {datetime.now().isoformat()}")
    print("="*60)
    # Ensure DB uniqueness constraints
    asyncio.run(ensure_auction_uniqueness())
    # Run scrapers
    result = asyncio.run(run_scrapers())

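The one-off duplicate cleanup above only runs on SQLite; a comparable best-effort cleanup for Postgres (illustrative only, not part of this commit) could delete older duplicate rows before the unique index is created. The `CREATE UNIQUE INDEX IF NOT EXISTS` statement itself already works on both engines.

```python
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

async def dedupe_auctions_postgres(db: AsyncSession) -> None:
    """Hypothetical Postgres counterpart to the SQLite-only cleanup above."""
    await db.execute(
        text(
            """
            DELETE FROM domain_auctions a
            USING domain_auctions b
            WHERE a.platform = b.platform
              AND a.domain = b.domain
              AND a.id < b.id
            """
        )
    )
    await db.commit()
```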

@ -1,36 +0,0 @@
"""Seed auction data for development."""
import asyncio
import sys
import os
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from app.database import AsyncSessionLocal
from app.services.auction_scraper import auction_scraper
async def main():
    """Seed auction data."""
    async with AsyncSessionLocal() as db:
        print("Seeding sample auction data...")
        result = await auction_scraper.seed_sample_auctions(db)
        print(f"✓ Seeded {result['found']} auctions ({result['new']} new, {result['updated']} updated)")
        # Also try to scrape real data
        print("\nAttempting to scrape real auction data...")
        try:
            scrape_result = await auction_scraper.scrape_all_platforms(db)
            print(f"✓ Scraped {scrape_result['total_found']} auctions from platforms:")
            for platform, stats in scrape_result['platforms'].items():
                print(f" - {platform}: {stats.get('found', 0)} found")
            if scrape_result['errors']:
                print(f" Errors: {scrape_result['errors']}")
        except Exception as e:
            print(f" Scraping failed (this is okay): {e}")
        print("\n✓ Done!")
if __name__ == "__main__":
    asyncio.run(main())


@ -1,85 +0,0 @@
#!/usr/bin/env python3
"""
Test Namecheap GraphQL API to find the query hash.
"""
import asyncio
import httpx
import json
import re
async def test_namecheap():
    """
    Test Namecheap GraphQL API.
    The API requires a query hash that must be extracted from the website.
    """
    async with httpx.AsyncClient(timeout=30.0) as client:
        # First, load the Marketplace page to find the hash
        print("🔍 Fetching Namecheap Marketplace page...")
        response = await client.get(
            "https://www.namecheap.com/market/",
            headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                "Accept": "text/html,application/xhtml+xml",
            }
        )
        if response.status_code == 200:
            html = response.text
            # Look for query hash patterns
            hash_patterns = [
                r'"queryHash":"([a-f0-9]+)"',
                r'"hash":"([a-f0-9]{32,})"',
                r'aftermarketapi.*?([a-f0-9]{32,})',
                r'"persistedQueryHash":"([a-f0-9]+)"',
            ]
            found_hashes = set()
            for pattern in hash_patterns:
                matches = re.findall(pattern, html, re.IGNORECASE)
                for m in matches:
                    if len(m) >= 32:
                        found_hashes.add(m)
            if found_hashes:
                print(f"✅ Found {len(found_hashes)} potential hashes:")
                for h in list(found_hashes)[:5]:
                    print(f" {h[:50]}...")
            else:
                print("❌ No hashes found in HTML")
            # Check for NEXT_DATA
            if "__NEXT_DATA__" in html:
                print("📦 Found __NEXT_DATA__ - Next.js app")
                match = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
                if match:
                    try:
                        data = json.loads(match.group(1))
                        print(f" Keys: {list(data.keys())[:5]}")
                    except:
                        pass
            print(f"📄 Page status: {response.status_code}")
            print(f"📄 Page size: {len(html)} bytes")
            # Try a different approach - use their search API
            print("\n🔍 Trying Namecheap search endpoint...")
            search_response = await client.get(
                "https://www.namecheap.com/market/search/",
                params={"q": "tech"},
                headers={
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                    "Accept": "application/json, text/html",
                    "X-Requested-With": "XMLHttpRequest",
                }
            )
            print(f" Search status: {search_response.status_code}")
        else:
            print(f"❌ Failed: {response.status_code}")
if __name__ == "__main__":
    asyncio.run(test_namecheap())