pounce/backend/app/services/seo_analyzer.py
yves.gugger a33d57ccb4 feat: Add SEO Juice Detector (Tycoon feature)
From analysis_3.md - Strategy 3: SEO Data & Backlinks:
'SEO agencies look for domains because of their power (backlinks).
Such domains are worth €100-500 to SEOs, even if the name is ugly.'

BACKEND:
- Model: DomainSEOData for caching SEO metrics
- Service: seo_analyzer.py with Moz API integration
  - Falls back to estimation if no API keys
  - Detects notable links (Wikipedia, .gov, .edu, news)
  - Calculates SEO value estimate
- API: /seo endpoints (Tycoon-only access; see the route sketch below)

FRONTEND:
- /command/seo page with full SEO analysis
- Upgrade prompt for non-Tycoon users
- Notable links display (Wikipedia, .gov, .edu, news)
- Top backlinks with authority scores
- Recent searches saved locally

SIDEBAR:
- Added 'SEO Juice' nav item with 'Tycoon' badge

DOCS:
- Updated DATABASE_MIGRATIONS.md with domain_seo_data table
- Added SEO API endpoints documentation
- Added Moz API environment variables info
2025-12-10 11:58:05 +01:00
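
The commit's /seo endpoints themselves are not shown on this page. Below is a
minimal, hypothetical sketch of how such a Tycoon-gated route could call this
service; the router path, `get_db`, and `require_tycoon` are assumptions for
illustration, not code from this commit.

from fastapi import APIRouter, Depends
from sqlalchemy.ext.asyncio import AsyncSession

from app.api.deps import get_db, require_tycoon  # assumed app dependencies
from app.services.seo_analyzer import seo_analyzer

router = APIRouter(prefix="/seo", tags=["seo"])


@router.get("/{domain}")
async def get_seo_analysis(
    domain: str,
    force_refresh: bool = False,
    db: AsyncSession = Depends(get_db),
    _user=Depends(require_tycoon),  # assumed gate rejecting non-Tycoon users
):
    return await seo_analyzer.analyze_domain(domain, db, force_refresh=force_refresh)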


"""
SEO Analyzer Service - "SEO Juice Detector"
This implements Strategie 3 from analysis_3.md:
"SEO-Agenturen suchen Domains wegen der Power (Backlinks).
Solche Domains sind für SEOs 100€ - 500€ wert, auch wenn der Name hässlich ist."
Data Sources (in priority order):
1. Moz API (if MOZ_ACCESS_ID and MOZ_SECRET_KEY are set)
2. CommonCrawl Index (free, but limited)
3. Estimation based on domain characteristics
This is a TYCOON-ONLY feature.
"""
import base64
import hashlib
import hmac
import logging
import os
import time
from datetime import datetime, timedelta
from typing import Optional, Dict, Any, List

import httpx
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.seo_data import DomainSEOData

logger = logging.getLogger(__name__)

class SEOAnalyzerService:
    """
    Analyzes domains for SEO value (backlinks, authority, etc.).

    From analysis_3.md:
    "Domain `alte-bäckerei-münchen.de` is available.
    It has links from `sueddeutsche.de` and `wikipedia.org`."
    """
    # Moz API configuration
    MOZ_API_URL = "https://lsapi.seomoz.com/v2/url_metrics"
    MOZ_LINKS_URL = "https://lsapi.seomoz.com/v2/links"

    # Cache duration (7 days for SEO data)
    CACHE_DURATION_DAYS = 7

    # Known high-authority domains for notable link detection
    NOTABLE_DOMAINS = {
        'wikipedia': ['wikipedia.org', 'wikimedia.org'],
        'gov': ['.gov', '.gov.uk', '.admin.ch', '.bund.de'],
        'edu': ['.edu', '.ac.uk', '.ethz.ch', '.uzh.ch'],
        'news': [
            'nytimes.com', 'theguardian.com', 'bbc.com', 'cnn.com',
            'forbes.com', 'bloomberg.com', 'reuters.com', 'techcrunch.com',
            'spiegel.de', 'faz.net', 'nzz.ch', 'tagesanzeiger.ch'
        ]
    }

    def __init__(self):
        self.moz_access_id = os.getenv('MOZ_ACCESS_ID')
        self.moz_secret_key = os.getenv('MOZ_SECRET_KEY')
        self.has_moz = bool(self.moz_access_id and self.moz_secret_key)
        if self.has_moz:
            logger.info("SEO Analyzer: Moz API configured")
        else:
            logger.warning("SEO Analyzer: No Moz API keys - using estimation mode")
    async def analyze_domain(
        self,
        domain: str,
        db: AsyncSession,
        force_refresh: bool = False
    ) -> Dict[str, Any]:
        """
        Analyze a domain for SEO value.

        Returns:
            Dict with SEO metrics, backlinks, and value estimate.
        """
        domain = domain.lower().strip()

        # Check cache first
        if not force_refresh:
            cached = await self._get_cached(domain, db)
            if cached and not cached.is_expired:
                return self._format_response(cached)

        # Fetch fresh data
        if self.has_moz:
            seo_data = await self._fetch_moz_data(domain)
        else:
            seo_data = await self._estimate_seo_data(domain)

        # Save to cache
        cached = await self._save_to_cache(domain, seo_data, db)
        return self._format_response(cached)
    async def _get_cached(self, domain: str, db: AsyncSession) -> Optional[DomainSEOData]:
        """Get cached SEO data for a domain."""
        result = await db.execute(
            select(DomainSEOData).where(DomainSEOData.domain == domain)
        )
        return result.scalar_one_or_none()

    async def _save_to_cache(
        self,
        domain: str,
        data: Dict[str, Any],
        db: AsyncSession
    ) -> DomainSEOData:
        """Save SEO data to cache (update-or-insert keyed on domain)."""
        # Check if a row already exists for this domain
        result = await db.execute(
            select(DomainSEOData).where(DomainSEOData.domain == domain)
        )
        cached = result.scalar_one_or_none()
        if cached:
            # Update existing row in place
            for key, value in data.items():
                if hasattr(cached, key):
                    setattr(cached, key, value)
            cached.last_updated = datetime.utcnow()
            cached.expires_at = datetime.utcnow() + timedelta(days=self.CACHE_DURATION_DAYS)
            cached.fetch_count += 1
        else:
            # Create new row
            cached = DomainSEOData(
                domain=domain,
                expires_at=datetime.utcnow() + timedelta(days=self.CACHE_DURATION_DAYS),
                **data
            )
            db.add(cached)
        await db.commit()
        await db.refresh(cached)
        return cached
    async def _fetch_moz_data(self, domain: str) -> Dict[str, Any]:
        """Fetch SEO data from the Moz API, falling back to estimation on failure."""
        try:
            # Generate expiring HMAC-SHA1 signature for Moz authentication
            expires = int(time.time()) + 300
            string_to_sign = f"{self.moz_access_id}\n{expires}"
            signature = base64.b64encode(
                hmac.new(
                    self.moz_secret_key.encode('utf-8'),
                    string_to_sign.encode('utf-8'),
                    hashlib.sha1
                ).digest()
            ).decode('utf-8')
            auth_params = {
                'AccessID': self.moz_access_id,
                'Expires': expires,
                'Signature': signature
            }
            async with httpx.AsyncClient(timeout=30) as client:
                # Get URL metrics
                response = await client.post(
                    self.MOZ_API_URL,
                    params=auth_params,
                    json={
                        'targets': [f'http://{domain}/'],
                    }
                )
                if response.status_code == 200:
                    metrics = response.json()
                    if metrics and 'results' in metrics and metrics['results']:
                        result = metrics['results'][0]
                        # Extract notable backlinks
                        top_backlinks = await self._fetch_top_backlinks(
                            domain, auth_params, client
                        )
                        return {
                            'domain_authority': result.get('domain_authority', 0),
                            'page_authority': result.get('page_authority', 0),
                            'spam_score': result.get('spam_score', 0),
                            'total_backlinks': result.get('external_links_to_root_domain', 0),
                            'referring_domains': result.get('root_domains_to_root_domain', 0),
                            'top_backlinks': top_backlinks,
                            'notable_backlinks': self._extract_notable(top_backlinks),
                            'has_wikipedia_link': self._has_notable_link(top_backlinks, 'wikipedia'),
                            'has_gov_link': self._has_notable_link(top_backlinks, 'gov'),
                            'has_edu_link': self._has_notable_link(top_backlinks, 'edu'),
                            'has_news_link': self._has_notable_link(top_backlinks, 'news'),
                            'seo_value_estimate': self._calculate_seo_value(result),
                            'data_source': 'moz',
                        }
                    logger.warning(f"Moz API returned no results for {domain}")
                else:
                    logger.warning(f"Moz API returned {response.status_code} for {domain}")
        except Exception as e:
            logger.error(f"Moz API error for {domain}: {e}")
        # Fallback to estimation
        return await self._estimate_seo_data(domain)
    async def _fetch_top_backlinks(
        self,
        domain: str,
        auth_params: dict,
        client: httpx.AsyncClient
    ) -> List[Dict[str, Any]]:
        """Fetch top backlinks from Moz."""
        try:
            response = await client.post(
                self.MOZ_LINKS_URL,
                params=auth_params,
                json={
                    'target': f'http://{domain}/',
                    'target_scope': 'root_domain',
                    'filter': 'external+nofollow',
                    'sort': 'domain_authority',
                    'limit': 20
                }
            )
            if response.status_code == 200:
                data = response.json()
                if 'results' in data:
                    return [
                        {
                            'domain': link.get('source', {}).get('root_domain', ''),
                            'authority': link.get('source', {}).get('domain_authority', 0),
                            'page': link.get('source', {}).get('page', ''),
                        }
                        for link in data['results'][:10]
                    ]
        except Exception as e:
            logger.error(f"Error fetching backlinks: {e}")
        return []
    async def _estimate_seo_data(self, domain: str) -> Dict[str, Any]:
        """
        Estimate SEO data when no API is available.

        Uses heuristics based on domain characteristics.
        """
        # Extract domain parts
        parts = domain.split('.')
        name = parts[0] if parts else domain
        tld = parts[-1] if len(parts) > 1 else ''

        # Estimate domain authority based on characteristics
        estimated_da = 10  # Base

        # Short domains tend to have more backlinks
        if len(name) <= 4:
            estimated_da += 15
        elif len(name) <= 6:
            estimated_da += 10
        elif len(name) <= 8:
            estimated_da += 5

        # Premium TLDs
        premium_tlds = {'com': 10, 'org': 8, 'net': 5, 'io': 7, 'ai': 8, 'co': 6}
        estimated_da += premium_tlds.get(tld, 0)

        # Dictionary words get a boost
        common_words = ['tech', 'app', 'data', 'cloud', 'web', 'net', 'hub', 'lab', 'dev']
        if any(word in name.lower() for word in common_words):
            estimated_da += 5

        # Cap at a reasonable estimate
        estimated_da = min(40, estimated_da)

        # Estimate backlinks based on DA
        estimated_backlinks = estimated_da * 50
        estimated_referring = estimated_da * 5

        return {
            'domain_authority': estimated_da,
            'page_authority': max(0, estimated_da - 5),
            'spam_score': 5,  # Assume low spam for estimates
            'total_backlinks': estimated_backlinks,
            'referring_domains': estimated_referring,
            'top_backlinks': [],
            'notable_backlinks': None,
            'has_wikipedia_link': False,
            'has_gov_link': False,
            'has_edu_link': False,
            'has_news_link': False,
            'seo_value_estimate': self._estimate_value(estimated_da),
            'data_source': 'estimated',
        }
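
    # Worked example (illustrative, not in the original source): for "dev.io",
    # name "dev" (len 3 -> +15), tld "io" -> +7, "dev" in common_words -> +5,
    # so estimated_da = 10 + 15 + 7 + 5 = 37 (under the cap of 40), giving
    # 1850 estimated backlinks and 185 referring domains.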
    def _has_notable_link(self, backlinks: List[Dict], category: str) -> bool:
        """Check if backlinks contain notable sources (substring match)."""
        domains_to_check = self.NOTABLE_DOMAINS.get(category, [])
        for link in backlinks:
            link_domain = link.get('domain', '').lower()
            for notable in domains_to_check:
                if notable in link_domain:
                    return True
        return False

    def _extract_notable(self, backlinks: List[Dict]) -> Optional[str]:
        """Extract notable backlink domains as a comma-separated string."""
        notable = []
        for link in backlinks:
            domain = link.get('domain', '')
            authority = link.get('authority', 0)
            # Include high-authority links
            if authority >= 50:
                notable.append(domain)
        return ','.join(notable[:10]) if notable else None
    def _calculate_seo_value(self, metrics: Dict) -> float:
        """Calculate estimated SEO value in USD."""
        da = metrics.get('domain_authority', 0)
        backlinks = metrics.get('external_links_to_root_domain', 0)

        # Base value from DA
        if da >= 60:
            base_value = 500
        elif da >= 40:
            base_value = 200
        elif da >= 20:
            base_value = 50
        else:
            base_value = 10

        # Boost for backlinks
        link_boost = min(backlinks / 100, 10) * 20
        return round(base_value + link_boost, 2)
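
    # Worked example (illustrative, not in the original source): DA 62 with
    # 1500 external links -> base_value 500, link_boost = min(1500 / 100, 10)
    # * 20 = 200, so the method returns 700.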
    def _estimate_value(self, da: int) -> float:
        """Estimate value based on estimated DA."""
        if da >= 40:
            return 200
        elif da >= 30:
            return 100
        elif da >= 20:
            return 50
        return 20
    def _format_response(self, data: DomainSEOData) -> Dict[str, Any]:
        """Format SEO data for an API response."""
        return {
            'domain': data.domain,
            'seo_score': data.seo_score,
            'value_category': data.value_category,
            'metrics': {
                'domain_authority': data.domain_authority,
                'page_authority': data.page_authority,
                'spam_score': data.spam_score,
                'total_backlinks': data.total_backlinks,
                'referring_domains': data.referring_domains,
            },
            'notable_links': {
                'has_wikipedia': data.has_wikipedia_link,
                'has_gov': data.has_gov_link,
                'has_edu': data.has_edu_link,
                'has_news': data.has_news_link,
                'notable_domains': data.notable_backlinks.split(',') if data.notable_backlinks else [],
            },
            'top_backlinks': data.top_backlinks or [],
            'estimated_value': data.seo_value_estimate,
            'data_source': data.data_source,
            'last_updated': data.last_updated.isoformat() if data.last_updated else None,
            'is_estimated': data.data_source == 'estimated',
        }

# Singleton instance
seo_analyzer = SEOAnalyzerService()
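# Design note: the singleton reads MOZ_ACCESS_ID / MOZ_SECRET_KEY once at
# import time, so changed keys only take effect after an app restart.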