pounce/backend/app/services/seo_analyzer.py
yves.gugger a33d57ccb4 feat: Add SEO Juice Detector (Tycoon feature)
From analysis_3.md - Strategy 3: SEO Data & Backlinks:
'SEO agencies look for domains because of their power (backlinks).
Such domains are worth €100-500 to SEOs, even if the name is ugly.'

BACKEND:
- Model: DomainSEOData for caching SEO metrics
- Service: seo_analyzer.py with Moz API integration
  - Falls back to estimation if no API keys
  - Detects notable links (Wikipedia, .gov, .edu, news)
  - Calculates SEO value estimate
- API: /seo endpoints (Tycoon-only access; see the route sketch below)

FRONTEND:
- /command/seo page with full SEO analysis
- Upgrade prompt for non-Tycoon users
- Notable links display (Wikipedia, .gov, .edu, news)
- Top backlinks with authority scores
- Recent searches saved locally

SIDEBAR:
- Added 'SEO Juice' nav item with 'Tycoon' badge

DOCS:
- Updated DATABASE_MIGRATIONS.md with domain_seo_data table
- Added SEO API endpoints documentation
- Added Moz API environment variables info
2025-12-10 11:58:05 +01:00
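
The commit's /seo endpoints themselves are not shown on this page. Below is a
minimal, hypothetical sketch of how such a Tycoon-gated route could call this
service; the router path, `get_db`, and `require_tycoon` are assumptions for
illustration, not code from this commit.

from fastapi import APIRouter, Depends
from sqlalchemy.ext.asyncio import AsyncSession

from app.api.deps import get_db, require_tycoon  # assumed app dependencies
from app.services.seo_analyzer import seo_analyzer

router = APIRouter(prefix="/seo", tags=["seo"])


@router.get("/{domain}")
async def get_seo_analysis(
    domain: str,
    force_refresh: bool = False,
    db: AsyncSession = Depends(get_db),
    _user=Depends(require_tycoon),  # assumed gate rejecting non-Tycoon users
):
    return await seo_analyzer.analyze_domain(domain, db, force_refresh=force_refresh)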


"""
SEO Analyzer Service - "SEO Juice Detector"
This implements Strategie 3 from analysis_3.md:
"SEO-Agenturen suchen Domains wegen der Power (Backlinks).
Solche Domains sind für SEOs 100€ - 500€ wert, auch wenn der Name hässlich ist."
Data Sources (in priority order):
1. Moz API (if MOZ_ACCESS_ID and MOZ_SECRET_KEY are set)
2. CommonCrawl Index (free, but limited)
3. Estimation based on domain characteristics
This is a TYCOON-ONLY feature.
"""
import base64
import hashlib
import hmac
import logging
import os
import time
from datetime import datetime, timedelta
from typing import Optional, Dict, Any, List

import httpx
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.seo_data import DomainSEOData

logger = logging.getLogger(__name__)

class SEOAnalyzerService:
    """
    Analyzes domains for SEO value (backlinks, authority, etc.).

    From analysis_3.md:
    "Domain `alte-bäckerei-münchen.de` is available.
    It has links from `sueddeutsche.de` and `wikipedia.org`."
    """
    # Moz API configuration
    MOZ_API_URL = "https://lsapi.seomoz.com/v2/url_metrics"
    MOZ_LINKS_URL = "https://lsapi.seomoz.com/v2/links"

    # Cache duration (7 days for SEO data)
    CACHE_DURATION_DAYS = 7

    # Known high-authority domains for notable link detection
    NOTABLE_DOMAINS = {
        'wikipedia': ['wikipedia.org', 'wikimedia.org'],
        'gov': ['.gov', '.gov.uk', '.admin.ch', '.bund.de'],
        'edu': ['.edu', '.ac.uk', '.ethz.ch', '.uzh.ch'],
        'news': [
            'nytimes.com', 'theguardian.com', 'bbc.com', 'cnn.com',
            'forbes.com', 'bloomberg.com', 'reuters.com', 'techcrunch.com',
            'spiegel.de', 'faz.net', 'nzz.ch', 'tagesanzeiger.ch'
        ]
    }

    def __init__(self):
        self.moz_access_id = os.getenv('MOZ_ACCESS_ID')
        self.moz_secret_key = os.getenv('MOZ_SECRET_KEY')
        self.has_moz = bool(self.moz_access_id and self.moz_secret_key)
        if self.has_moz:
            logger.info("SEO Analyzer: Moz API configured")
        else:
            logger.warning("SEO Analyzer: No Moz API keys - using estimation mode")
    async def analyze_domain(
        self,
        domain: str,
        db: AsyncSession,
        force_refresh: bool = False
    ) -> Dict[str, Any]:
        """
        Analyze a domain for SEO value.

        Returns:
            Dict with SEO metrics, backlinks, and value estimate.
        """
        domain = domain.lower().strip()

        # Check cache first
        if not force_refresh:
            cached = await self._get_cached(domain, db)
            if cached and not cached.is_expired:
                return self._format_response(cached)

        # Fetch fresh data
        if self.has_moz:
            seo_data = await self._fetch_moz_data(domain)
        else:
            seo_data = await self._estimate_seo_data(domain)

        # Save to cache
        cached = await self._save_to_cache(domain, seo_data, db)
        return self._format_response(cached)
    async def _get_cached(self, domain: str, db: AsyncSession) -> Optional[DomainSEOData]:
        """Get cached SEO data for a domain."""
        result = await db.execute(
            select(DomainSEOData).where(DomainSEOData.domain == domain)
        )
        return result.scalar_one_or_none()

    async def _save_to_cache(
        self,
        domain: str,
        data: Dict[str, Any],
        db: AsyncSession
    ) -> DomainSEOData:
        """Save SEO data to cache (update-or-insert keyed on domain)."""
        # Check if a row already exists for this domain
        result = await db.execute(
            select(DomainSEOData).where(DomainSEOData.domain == domain)
        )
        cached = result.scalar_one_or_none()
        if cached:
            # Update existing row in place
            for key, value in data.items():
                if hasattr(cached, key):
                    setattr(cached, key, value)
            cached.last_updated = datetime.utcnow()
            cached.expires_at = datetime.utcnow() + timedelta(days=self.CACHE_DURATION_DAYS)
            cached.fetch_count += 1
        else:
            # Create new row
            cached = DomainSEOData(
                domain=domain,
                expires_at=datetime.utcnow() + timedelta(days=self.CACHE_DURATION_DAYS),
                **data
            )
            db.add(cached)
        await db.commit()
        await db.refresh(cached)
        return cached
    async def _fetch_moz_data(self, domain: str) -> Dict[str, Any]:
        """Fetch SEO data from the Moz API, falling back to estimation on failure."""
        try:
            # Generate expiring HMAC-SHA1 signature for Moz authentication
            expires = int(time.time()) + 300
            string_to_sign = f"{self.moz_access_id}\n{expires}"
            signature = base64.b64encode(
                hmac.new(
                    self.moz_secret_key.encode('utf-8'),
                    string_to_sign.encode('utf-8'),
                    hashlib.sha1
                ).digest()
            ).decode('utf-8')
            auth_params = {
                'AccessID': self.moz_access_id,
                'Expires': expires,
                'Signature': signature
            }
            async with httpx.AsyncClient(timeout=30) as client:
                # Get URL metrics
                response = await client.post(
                    self.MOZ_API_URL,
                    params=auth_params,
                    json={
                        'targets': [f'http://{domain}/'],
                    }
                )
                if response.status_code == 200:
                    metrics = response.json()
                    if metrics and 'results' in metrics and metrics['results']:
                        result = metrics['results'][0]
                        # Extract notable backlinks
                        top_backlinks = await self._fetch_top_backlinks(
                            domain, auth_params, client
                        )
                        return {
                            'domain_authority': result.get('domain_authority', 0),
                            'page_authority': result.get('page_authority', 0),
                            'spam_score': result.get('spam_score', 0),
                            'total_backlinks': result.get('external_links_to_root_domain', 0),
                            'referring_domains': result.get('root_domains_to_root_domain', 0),
                            'top_backlinks': top_backlinks,
                            'notable_backlinks': self._extract_notable(top_backlinks),
                            'has_wikipedia_link': self._has_notable_link(top_backlinks, 'wikipedia'),
                            'has_gov_link': self._has_notable_link(top_backlinks, 'gov'),
                            'has_edu_link': self._has_notable_link(top_backlinks, 'edu'),
                            'has_news_link': self._has_notable_link(top_backlinks, 'news'),
                            'seo_value_estimate': self._calculate_seo_value(result),
                            'data_source': 'moz',
                        }
                    logger.warning(f"Moz API returned no results for {domain}")
                else:
                    logger.warning(f"Moz API returned {response.status_code} for {domain}")
        except Exception as e:
            logger.error(f"Moz API error for {domain}: {e}")
        # Fallback to estimation
        return await self._estimate_seo_data(domain)
    async def _fetch_top_backlinks(
        self,
        domain: str,
        auth_params: dict,
        client: httpx.AsyncClient
    ) -> List[Dict[str, Any]]:
        """Fetch top backlinks from Moz."""
        try:
            response = await client.post(
                self.MOZ_LINKS_URL,
                params=auth_params,
                json={
                    'target': f'http://{domain}/',
                    'target_scope': 'root_domain',
                    'filter': 'external+nofollow',
                    'sort': 'domain_authority',
                    'limit': 20
                }
            )
            if response.status_code == 200:
                data = response.json()
                if 'results' in data:
                    return [
                        {
                            'domain': link.get('source', {}).get('root_domain', ''),
                            'authority': link.get('source', {}).get('domain_authority', 0),
                            'page': link.get('source', {}).get('page', ''),
                        }
                        for link in data['results'][:10]
                    ]
        except Exception as e:
            logger.error(f"Error fetching backlinks: {e}")
        return []
    async def _estimate_seo_data(self, domain: str) -> Dict[str, Any]:
        """
        Estimate SEO data when no API is available.

        Uses heuristics based on domain characteristics.
        """
        # Extract domain parts
        parts = domain.split('.')
        name = parts[0] if parts else domain
        tld = parts[-1] if len(parts) > 1 else ''

        # Estimate domain authority based on characteristics
        estimated_da = 10  # Base

        # Short domains tend to have more backlinks
        if len(name) <= 4:
            estimated_da += 15
        elif len(name) <= 6:
            estimated_da += 10
        elif len(name) <= 8:
            estimated_da += 5

        # Premium TLDs
        premium_tlds = {'com': 10, 'org': 8, 'net': 5, 'io': 7, 'ai': 8, 'co': 6}
        estimated_da += premium_tlds.get(tld, 0)

        # Dictionary words get a boost
        common_words = ['tech', 'app', 'data', 'cloud', 'web', 'net', 'hub', 'lab', 'dev']
        if any(word in name.lower() for word in common_words):
            estimated_da += 5

        # Cap at a reasonable estimate
        estimated_da = min(40, estimated_da)

        # Estimate backlinks based on DA
        estimated_backlinks = estimated_da * 50
        estimated_referring = estimated_da * 5

        return {
            'domain_authority': estimated_da,
            'page_authority': max(0, estimated_da - 5),
            'spam_score': 5,  # Assume low spam for estimates
            'total_backlinks': estimated_backlinks,
            'referring_domains': estimated_referring,
            'top_backlinks': [],
            'notable_backlinks': None,
            'has_wikipedia_link': False,
            'has_gov_link': False,
            'has_edu_link': False,
            'has_news_link': False,
            'seo_value_estimate': self._estimate_value(estimated_da),
            'data_source': 'estimated',
        }
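
    # Worked example (illustrative, not in the original source): for "dev.io",
    # name "dev" (len 3 -> +15), tld "io" -> +7, "dev" in common_words -> +5,
    # so estimated_da = 10 + 15 + 7 + 5 = 37 (under the cap of 40), giving
    # 1850 estimated backlinks and 185 referring domains.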
    def _has_notable_link(self, backlinks: List[Dict], category: str) -> bool:
        """Check if backlinks contain notable sources (substring match)."""
        domains_to_check = self.NOTABLE_DOMAINS.get(category, [])
        for link in backlinks:
            link_domain = link.get('domain', '').lower()
            for notable in domains_to_check:
                if notable in link_domain:
                    return True
        return False

    def _extract_notable(self, backlinks: List[Dict]) -> Optional[str]:
        """Extract notable backlink domains as a comma-separated string."""
        notable = []
        for link in backlinks:
            domain = link.get('domain', '')
            authority = link.get('authority', 0)
            # Include high-authority links
            if authority >= 50:
                notable.append(domain)
        return ','.join(notable[:10]) if notable else None
    def _calculate_seo_value(self, metrics: Dict) -> float:
        """Calculate estimated SEO value in USD."""
        da = metrics.get('domain_authority', 0)
        backlinks = metrics.get('external_links_to_root_domain', 0)

        # Base value from DA
        if da >= 60:
            base_value = 500
        elif da >= 40:
            base_value = 200
        elif da >= 20:
            base_value = 50
        else:
            base_value = 10

        # Boost for backlinks
        link_boost = min(backlinks / 100, 10) * 20
        return round(base_value + link_boost, 2)
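
    # Worked example (illustrative, not in the original source): DA 62 with
    # 1500 external links -> base_value 500, link_boost = min(1500 / 100, 10)
    # * 20 = 200, so the method returns 700.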
    def _estimate_value(self, da: int) -> float:
        """Estimate value based on estimated DA."""
        if da >= 40:
            return 200
        elif da >= 30:
            return 100
        elif da >= 20:
            return 50
        return 20
    def _format_response(self, data: DomainSEOData) -> Dict[str, Any]:
        """Format SEO data for an API response."""
        return {
            'domain': data.domain,
            'seo_score': data.seo_score,
            'value_category': data.value_category,
            'metrics': {
                'domain_authority': data.domain_authority,
                'page_authority': data.page_authority,
                'spam_score': data.spam_score,
                'total_backlinks': data.total_backlinks,
                'referring_domains': data.referring_domains,
            },
            'notable_links': {
                'has_wikipedia': data.has_wikipedia_link,
                'has_gov': data.has_gov_link,
                'has_edu': data.has_edu_link,
                'has_news': data.has_news_link,
                'notable_domains': data.notable_backlinks.split(',') if data.notable_backlinks else [],
            },
            'top_backlinks': data.top_backlinks or [],
            'estimated_value': data.seo_value_estimate,
            'data_source': data.data_source,
            'last_updated': data.last_updated.isoformat() if data.last_updated else None,
            'is_estimated': data.data_source == 'estimated',
        }

# Singleton instance
seo_analyzer = SEOAnalyzerService()
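# Design note: the singleton reads MOZ_ACCESS_ID / MOZ_SECRET_KEY once at
# import time, so changed keys only take effect after an app restart.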