""" SEO Analyzer Service - "SEO Juice Detector" This implements Strategie 3 from analysis_3.md: "SEO-Agenturen suchen Domains wegen der Power (Backlinks). Solche Domains sind für SEOs 100€ - 500€ wert, auch wenn der Name hässlich ist." Data Sources (in priority order): 1. Moz API (if MOZ_ACCESS_ID and MOZ_SECRET_KEY are set) 2. CommonCrawl Index (free, but limited) 3. Estimation based on domain characteristics This is a TYCOON-ONLY feature. """ import os import logging import base64 import hashlib import hmac import time import httpx from datetime import datetime, timedelta from typing import Optional, Dict, Any, List from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from app.models.seo_data import DomainSEOData logger = logging.getLogger(__name__) class SEOAnalyzerService: """ Analyzes domains for SEO value (backlinks, authority, etc.) From analysis_3.md: "Domain `alte-bäckerei-münchen.de` ist frei. Hat Links von `sueddeutsche.de` und `wikipedia.org`." """ # Moz API configuration MOZ_API_URL = "https://lsapi.seomoz.com/v2/url_metrics" MOZ_LINKS_URL = "https://lsapi.seomoz.com/v2/links" # Cache duration (7 days for SEO data) CACHE_DURATION_DAYS = 7 # Known high-authority domains for notable link detection NOTABLE_DOMAINS = { 'wikipedia': ['wikipedia.org', 'wikimedia.org'], 'gov': ['.gov', '.gov.uk', '.admin.ch', '.bund.de'], 'edu': ['.edu', '.ac.uk', '.ethz.ch', '.uzh.ch'], 'news': [ 'nytimes.com', 'theguardian.com', 'bbc.com', 'cnn.com', 'forbes.com', 'bloomberg.com', 'reuters.com', 'techcrunch.com', 'spiegel.de', 'faz.net', 'nzz.ch', 'tagesanzeiger.ch' ] } def __init__(self): self.moz_access_id = os.getenv('MOZ_ACCESS_ID') self.moz_secret_key = os.getenv('MOZ_SECRET_KEY') self.has_moz = bool(self.moz_access_id and self.moz_secret_key) if self.has_moz: logger.info("SEO Analyzer: Moz API configured") else: logger.warning("SEO Analyzer: No Moz API keys - using estimation mode") async def analyze_domain( self, domain: str, db: AsyncSession, force_refresh: bool = False ) -> Dict[str, Any]: """ Analyze a domain for SEO value. 

    async def analyze_domain(
        self,
        domain: str,
        db: AsyncSession,
        force_refresh: bool = False
    ) -> Dict[str, Any]:
        """
        Analyze a domain for SEO value.

        Returns:
            Dict with SEO metrics, backlinks, and a value estimate.
        """
        domain = domain.lower().strip()

        # Check the cache first
        if not force_refresh:
            cached = await self._get_cached(domain, db)
            if cached and not cached.is_expired:
                return self._format_response(cached)

        # Fetch fresh data
        if self.has_moz:
            seo_data = await self._fetch_moz_data(domain)
        else:
            seo_data = await self._estimate_seo_data(domain)

        # Save to cache
        cached = await self._save_to_cache(domain, seo_data, db)
        return self._format_response(cached)

    async def _get_cached(self, domain: str, db: AsyncSession) -> Optional[DomainSEOData]:
        """Get cached SEO data for a domain."""
        result = await db.execute(
            select(DomainSEOData).where(DomainSEOData.domain == domain)
        )
        return result.scalar_one_or_none()

    async def _save_to_cache(
        self,
        domain: str,
        data: Dict[str, Any],
        db: AsyncSession
    ) -> DomainSEOData:
        """Save SEO data to the cache (insert or update)."""
        # Check if a row already exists
        result = await db.execute(
            select(DomainSEOData).where(DomainSEOData.domain == domain)
        )
        cached = result.scalar_one_or_none()

        if cached:
            # Update the existing row
            for key, value in data.items():
                if hasattr(cached, key):
                    setattr(cached, key, value)
            cached.last_updated = datetime.utcnow()
            cached.expires_at = datetime.utcnow() + timedelta(days=self.CACHE_DURATION_DAYS)
            cached.fetch_count += 1
        else:
            # Create a new row
            cached = DomainSEOData(
                domain=domain,
                expires_at=datetime.utcnow() + timedelta(days=self.CACHE_DURATION_DAYS),
                **data
            )
            db.add(cached)

        await db.commit()
        await db.refresh(cached)
        return cached

    async def _fetch_moz_data(self, domain: str) -> Dict[str, Any]:
        """Fetch SEO data from the Moz API."""
        try:
            # Signed authentication: sign "{AccessID}\n{Expires}" with
            # HMAC-SHA1; the signature is valid for 5 minutes.
            expires = int(time.time()) + 300
            string_to_sign = f"{self.moz_access_id}\n{expires}"
            signature = base64.b64encode(
                hmac.new(
                    self.moz_secret_key.encode('utf-8'),
                    string_to_sign.encode('utf-8'),
                    hashlib.sha1
                ).digest()
            ).decode('utf-8')

            auth_params = {
                'AccessID': self.moz_access_id,
                'Expires': expires,
                'Signature': signature
            }

            async with httpx.AsyncClient(timeout=30) as client:
                # Get URL metrics
                response = await client.post(
                    self.MOZ_API_URL,
                    params=auth_params,
                    json={
                        'targets': [f'http://{domain}/'],
                    }
                )

                if response.status_code == 200:
                    metrics = response.json()
                    if metrics and 'results' in metrics and metrics['results']:
                        result = metrics['results'][0]

                        # Extract notable backlinks
                        top_backlinks = await self._fetch_top_backlinks(
                            domain, auth_params, client
                        )

                        return {
                            'domain_authority': result.get('domain_authority', 0),
                            'page_authority': result.get('page_authority', 0),
                            'spam_score': result.get('spam_score', 0),
                            'total_backlinks': result.get('external_links_to_root_domain', 0),
                            'referring_domains': result.get('root_domains_to_root_domain', 0),
                            'top_backlinks': top_backlinks,
                            'notable_backlinks': self._extract_notable(top_backlinks),
                            'has_wikipedia_link': self._has_notable_link(top_backlinks, 'wikipedia'),
                            'has_gov_link': self._has_notable_link(top_backlinks, 'gov'),
                            'has_edu_link': self._has_notable_link(top_backlinks, 'edu'),
                            'has_news_link': self._has_notable_link(top_backlinks, 'news'),
                            'seo_value_estimate': self._calculate_seo_value(result),
                            'data_source': 'moz',
                        }
                    logger.warning(f"Moz API returned an empty result set for {domain}")
                else:
                    logger.warning(f"Moz API returned {response.status_code} for {domain}")

        except Exception as e:
            logger.error(f"Moz API error for {domain}: {e}")

        # Fall back to estimation
        return await self._estimate_seo_data(domain)
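
    # A minimal sketch of the signing step above in isolation, with
    # placeholder credentials (the printed signature is whatever HMAC-SHA1
    # actually produces for your key, elided here):
    #
    #   >>> import base64, hashlib, hmac
    #   >>> string_to_sign = "member-123\n1700000000"   # "{AccessID}\n{Expires}"
    #   >>> base64.b64encode(
    #   ...     hmac.new(b"secret", string_to_sign.encode(), hashlib.sha1).digest()
    #   ... ).decode()
    #   '<base64 signature>'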

    async def _fetch_top_backlinks(
        self,
        domain: str,
        auth_params: dict,
        client: httpx.AsyncClient
    ) -> List[Dict[str, Any]]:
        """Fetch top backlinks from Moz."""
        try:
            response = await client.post(
                self.MOZ_LINKS_URL,
                params=auth_params,
                json={
                    'target': f'http://{domain}/',
                    'target_scope': 'root_domain',
                    'filter': 'external+nofollow',
                    'sort': 'domain_authority',
                    'limit': 20
                }
            )

            if response.status_code == 200:
                data = response.json()
                if 'results' in data:
                    return [
                        {
                            'domain': link.get('source', {}).get('root_domain', ''),
                            'authority': link.get('source', {}).get('domain_authority', 0),
                            'page': link.get('source', {}).get('page', ''),
                        }
                        for link in data['results'][:10]
                    ]

        except Exception as e:
            logger.error(f"Error fetching backlinks: {e}")

        return []

    async def _estimate_seo_data(self, domain: str) -> Dict[str, Any]:
        """
        Estimate SEO data when no API is available.

        Uses heuristics based on domain characteristics.
        """
        # Extract domain parts
        parts = domain.split('.')
        name = parts[0] if parts else domain
        tld = parts[-1] if len(parts) > 1 else ''

        # Estimate domain authority based on characteristics
        estimated_da = 10  # Base

        # Short domains tend to have more backlinks
        if len(name) <= 4:
            estimated_da += 15
        elif len(name) <= 6:
            estimated_da += 10
        elif len(name) <= 8:
            estimated_da += 5

        # Premium TLDs
        premium_tlds = {'com': 10, 'org': 8, 'net': 5, 'io': 7, 'ai': 8, 'co': 6}
        estimated_da += premium_tlds.get(tld, 0)

        # Dictionary words get a boost
        common_words = ['tech', 'app', 'data', 'cloud', 'web', 'net', 'hub', 'lab', 'dev']
        if any(word in name.lower() for word in common_words):
            estimated_da += 5

        # Cap at a reasonable estimate
        estimated_da = min(40, estimated_da)

        # Estimate backlinks based on DA
        estimated_backlinks = estimated_da * 50
        estimated_referring = estimated_da * 5

        return {
            'domain_authority': estimated_da,
            'page_authority': max(0, estimated_da - 5),
            'spam_score': 5,  # Assume low spam for estimates
            'total_backlinks': estimated_backlinks,
            'referring_domains': estimated_referring,
            'top_backlinks': [],
            'notable_backlinks': None,
            'has_wikipedia_link': False,
            'has_gov_link': False,
            'has_edu_link': False,
            'has_news_link': False,
            'seo_value_estimate': self._estimate_value(estimated_da),
            'data_source': 'estimated',
        }

    def _has_notable_link(self, backlinks: List[Dict], category: str) -> bool:
        """Check whether the backlinks contain notable sources."""
        domains_to_check = self.NOTABLE_DOMAINS.get(category, [])
        for link in backlinks:
            link_domain = link.get('domain', '').lower()
            for notable in domains_to_check:
                if notable in link_domain:
                    return True
        return False

    def _extract_notable(self, backlinks: List[Dict]) -> Optional[str]:
        """Extract notable backlink domains as a comma-separated string."""
        notable = []
        for link in backlinks:
            domain = link.get('domain', '')
            authority = link.get('authority', 0)
            # Include high-authority links
            if authority >= 50:
                notable.append(domain)
        return ','.join(notable[:10]) if notable else None

    def _calculate_seo_value(self, metrics: Dict) -> float:
        """Calculate the estimated SEO value in USD."""
        da = metrics.get('domain_authority', 0)
        backlinks = metrics.get('external_links_to_root_domain', 0)

        # Base value from DA
        if da >= 60:
            base_value = 500
        elif da >= 40:
            base_value = 200
        elif da >= 20:
            base_value = 50
        else:
            base_value = 10

        # Boost for backlinks
        link_boost = min(backlinks / 100, 10) * 20

        return round(base_value + link_boost, 2)

    def _estimate_value(self, da: int) -> float:
        """Estimate value based on estimated DA."""
        if da >= 40:
            return 200
        elif da >= 30:
            return 100
        elif da >= 20:
            return 50
        return 20
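
    # Worked example of the two heuristics above (illustrative numbers only):
    # a Moz-fetched domain with DA 45 and 2,000 external backlinks gets
    #   base_value = 200          (40 <= DA < 60)
    #   link_boost = min(2000 / 100, 10) * 20 = 200
    #   _calculate_seo_value(...) -> 400.0
    # while an estimated 6-letter .com such as the hypothetical "dataly.com" gets
    #   estimated_da = 10 (base) + 10 (length <= 6) + 10 (.com) + 5 ("data") = 35
    #   _estimate_value(35) -> 100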

    def _format_response(self, data: DomainSEOData) -> Dict[str, Any]:
        """Format SEO data for an API response."""
        return {
            'domain': data.domain,
            'seo_score': data.seo_score,
            'value_category': data.value_category,
            'metrics': {
                'domain_authority': data.domain_authority,
                'page_authority': data.page_authority,
                'spam_score': data.spam_score,
                'total_backlinks': data.total_backlinks,
                'referring_domains': data.referring_domains,
            },
            'notable_links': {
                'has_wikipedia': data.has_wikipedia_link,
                'has_gov': data.has_gov_link,
                'has_edu': data.has_edu_link,
                'has_news': data.has_news_link,
                'notable_domains': data.notable_backlinks.split(',') if data.notable_backlinks else [],
            },
            'top_backlinks': data.top_backlinks or [],
            'estimated_value': data.seo_value_estimate,
            'data_source': data.data_source,
            'last_updated': data.last_updated.isoformat() if data.last_updated else None,
            'is_estimated': data.data_source == 'estimated',
        }


# Singleton instance
seo_analyzer = SEOAnalyzerService()
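

# Minimal usage sketch (illustrative, not wired into the app): the caller
# supplies whatever async session factory the app already uses, e.g. an
# async_sessionmaker. The function name and `session_factory` parameter
# are hypothetical.
async def example_seo_report(domain: str, session_factory) -> Dict[str, Any]:
    """Run a one-off SEO analysis using the module-level singleton."""
    async with session_factory() as db:
        return await seo_analyzer.analyze_domain(domain, db)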