"""
Intent Detection Engine for Yield Domains.

Analyzes domain names to detect user intent and match with affiliate partners.
Uses keyword matching, pattern detection, and NLP-lite techniques.
"""
import re
from dataclasses import dataclass
from typing import Optional


@dataclass
class IntentResult:
    """Result of intent detection for a domain."""

    category: str  # Primary intent category
    subcategory: Optional[str]  # More specific subcategory
    confidence: float  # 0.0 - 1.0
    keywords_matched: list[str]  # Which keywords triggered the match
    suggested_partners: list[str]  # Affiliate partner slugs
    monetization_potential: str  # "high", "medium", "low"


# Intent categories with keywords (Swiss/German/English focus).
# Each key has the form "<category>" or "<category>_<subcategory>"; each value
# holds the literal keywords, the regex patterns (applied with re.match, i.e.
# anchored at the start of a domain part), the monetization potential, and the
# affiliate partner slugs for that category.
INTENT_PATTERNS = {
    # Medical / Health
    "medical_dental": {
        "keywords": [
            "zahnarzt", "dental", "dentist", "zahn", "zähne",
            "kieferorthopäde", "implantate", "zahnklinik", "prothese",
            "bleaching", "zahnpflege", "dentalhygiene", "mundgesundheit",
            "braces", "orthodontist",
        ],
        "patterns": [r"zahn\w*", r"dent\w*"],
        "potential": "high",
        "partners": ["comparis_dental", "swisssmile", "dentaldeal"],
    },
    "medical_general": {
        "keywords": [
            "arzt", "doctor", "klinik", "clinic", "hospital", "spital",
            "praxis", "gesundheit", "health", "medizin", "medicine",
            "therapie", "therapy", "behandlung", "treatment",
        ],
        "patterns": [r"med\w+", r"gesund\w*", r"health\w*"],
        "potential": "high",
        "partners": ["comparis_health", "sanitas", "helsana"],
    },
    "medical_beauty": {
        "keywords": [
            "schönheit", "beauty", "kosmetik", "cosmetic", "botox", "filler",
            "laser", "aesthetic", "ästhetik", "haut", "skin", "anti-aging",
            "wellness", "spa", "massage",
        ],
        "patterns": [r"beauty\w*", r"kosm\w*"],
        "potential": "high",
        "partners": ["swissesthetic", "beautyfinder"],
    },
    # Finance / Insurance
    "finance_insurance": {
        "keywords": [
            "versicherung", "insurance", "krankenkasse", "autoversicherung",
            "hausrat", "haftpflicht", "lebensversicherung", "police",
        ],
        "patterns": [r"versicher\w*", r"insur\w*"],
        "potential": "high",
        "partners": ["comparis_insurance", "bonus_ch", "financescout"],
    },
    "finance_mortgage": {
        "keywords": [
            "hypothek", "mortgage", "kredit", "credit", "darlehen", "loan",
            "finanzierung", "financing", "immobilien", "eigenheim",
        ],
        "patterns": [r"hypo\w*", r"kredit\w*", r"mortg\w*"],
        "potential": "high",
        "partners": ["comparis_hypo", "moneypark", "hypocenter"],
    },
    "finance_banking": {
        "keywords": [
            "bank", "banking", "konto", "account", "sparen", "savings",
            "anlegen", "invest", "geld", "money", "zinsen", "interest",
        ],
        "patterns": [r"bank\w*", r"finanz\w*"],
        "potential": "medium",
        "partners": ["neon_bank", "yuh_ch"],
    },
    # Legal
    "legal_general": {
        "keywords": [
            "anwalt", "lawyer", "rechtsanwalt", "attorney", "rechtshilfe",
            "legal", "recht", "law", "kanzlei", "advokat", "jurist",
        ],
        "patterns": [r"anwalt\w*", r"recht\w*", r"law\w*"],
        "potential": "high",
        "partners": ["legal_ch", "anwalt24"],
    },
    # Real Estate
    "realestate_buy": {
        "keywords": [
            "immobilien", "realestate", "wohnung", "apartment", "haus",
            "house", "kaufen", "buy", "villa", "eigentum", "property",
            "liegenschaft",
        ],
        "patterns": [r"immobil\w*", r"wohn\w*"],
        "potential": "high",
        "partners": ["homegate", "immoscout", "comparis_immo"],
    },
    "realestate_rent": {
        "keywords": [
            "mieten", "rent", "miete", "mietwohnung", "rental", "wg",
            "studio", "loft", "untermiete",
        ],
        "patterns": [r"miet\w*", r"rent\w*"],
        "potential": "medium",
        "partners": ["homegate", "flatfox"],
    },
    # Travel
    "travel_flights": {
        "keywords": [
            "flug", "flight", "fliegen", "fly", "airline", "flughafen",
            "airport", "billigflug", "cheapflight", "reise", "travel",
        ],
        "patterns": [r"fl[uy]g\w*", r"travel\w*"],
        "potential": "medium",
        "partners": ["skyscanner", "kayak", "booking"],
    },
    "travel_hotels": {
        "keywords": [
            "hotel", "unterkunft", "accommodation", "hostel", "pension",
            "resort", "übernachtung", "booking", "airbnb",
        ],
        "patterns": [r"hotel\w*"],
        "potential": "medium",
        "partners": ["booking_com", "trivago", "hotels_com"],
    },
    # E-Commerce / Shopping
    "shopping_general": {
        "keywords": [
            "shop", "store", "kaufen", "buy", "einkaufen", "shopping",
            "deals", "rabatt", "discount", "sale", "angebot", "offer",
        ],
        "patterns": [r"shop\w*", r"deal\w*"],
        "potential": "medium",
        "partners": ["amazon_ch", "galaxus", "digitec"],
    },
    "shopping_fashion": {
        "keywords": [
            "mode", "fashion", "kleider", "clothes", "schuhe", "shoes",
            "outfit", "style", "bekleidung", "garderobe",
        ],
        "patterns": [r"mode\w*", r"fash\w*"],
        "potential": "medium",
        "partners": ["zalando", "about_you"],
    },
    # Automotive
    "auto_buy": {
        "keywords": [
            "auto", "car", "fahrzeug", "vehicle", "wagen", "neuwagen",
            "gebrauchtwagen", "occasion", "carmarket", "autohaus",
        ],
        "patterns": [r"auto\w*", r"car\w*"],
        "potential": "high",
        "partners": ["autoscout", "comparis_auto", "carforyou"],
    },
    "auto_service": {
        "keywords": [
            "garage", "werkstatt", "reparatur", "repair", "service",
            "reifenwechsel", "inspektion", "tuning",
        ],
        "patterns": [r"garag\w*"],
        "potential": "medium",
        "partners": ["autobutler"],
    },
    # Jobs / Career
    "jobs": {
        "keywords": [
            "job", "jobs", "karriere", "career", "arbeit", "work", "stelle",
            "stellenangebot", "vacancy", "hiring", "bewerbung",
        ],
        "patterns": [r"job\w*", r"karrier\w*"],
        "potential": "medium",
        "partners": ["jobs_ch", "indeed", "linkedin"],
    },
    # Education
    "education": {
        "keywords": [
            "schule", "school", "uni", "university", "bildung", "education",
            "kurs", "course", "lernen", "learn", "ausbildung", "training",
            "weiterbildung", "studium", "studieren",
        ],
        "patterns": [r"schul\w*", r"edu\w*", r"learn\w*"],
        "potential": "medium",
        "partners": ["udemy", "coursera", "edx"],
    },
    # Technology
    "tech_hosting": {
        "keywords": [
            "hosting", "server", "cloud", "domain", "website", "webhosting",
            "vps", "dedicated", "webspace",
        ],
        "patterns": [r"host\w*", r"server\w*"],
        "potential": "medium",
        "partners": ["hostpoint", "infomaniak", "cyon"],
    },
    "tech_software": {
        "keywords": [
            "software", "app", "tool", "saas", "crm", "erp", "programm",
            "application", "platform",
        ],
        "patterns": [r"soft\w*", r"app\w*"],
        "potential": "medium",
        "partners": ["capterra", "g2"],
    },
    # Food / Restaurant
    "food_restaurant": {
        "keywords": [
            "restaurant", "essen", "food", "pizza", "sushi", "burger",
            "cafe", "bistro", "gastronomie", "dining",
        ],
        "patterns": [r"food\w*", r"pizza\w*"],
        "potential": "low",
        "partners": ["eatme", "uber_eats"],
    },
    "food_delivery": {
        "keywords": [
            "lieferung", "delivery", "liefern", "bestellen", "order",
            "takeaway", "takeout",
        ],
        "patterns": [r"deliver\w*", r"liefer\w*"],
        "potential": "medium",
        "partners": ["uber_eats", "just_eat"],
    },
}

# Swiss city names for geo-targeting
SWISS_CITIES = {
    "zürich", "zurich", "zuerich", "zri", "zh", "bern", "genf", "geneva",
    "geneve", "basel", "lausanne", "luzern", "lucerne", "winterthur",
    "stgallen", "st-gallen", "lugano", "biel", "bienne", "thun", "köniz",
    "chur", "schaffhausen", "fribourg", "freiburg", "neuchatel", "neuenburg",
    "uster", "sion", "sitten", "zug", "aarau", "baden", "wil", "davos",
    "interlaken",
}

# German cities
GERMAN_CITIES = {
    "berlin", "münchen", "munich", "muenchen", "hamburg", "frankfurt",
    "köln", "koeln", "düsseldorf", "duesseldorf", "stuttgart", "dortmund",
    "essen", "leipzig", "bremen",
}


class IntentDetector:
    """
    Detects user intent from domain names.

    Uses keyword matching and pattern detection to categorize domains
    and suggest appropriate affiliate partners for monetization.
    """

    def __init__(self):
        self.patterns = INTENT_PATTERNS
        self.swiss_cities = SWISS_CITIES
        self.german_cities = GERMAN_CITIES

    def detect(self, domain: str) -> IntentResult:
        """
        Analyze a domain name and detect its intent category.

        Args:
            domain: The domain name (e.g., "zahnarzt-zuerich.ch")

        Returns:
            IntentResult with category, confidence, and partner suggestions.
            When no category scores at least 0.5, a "generic" result with a
            fixed confidence of 0.2 is returned.
        """
        domain_parts = self._split_domain_parts(self._normalize_domain(domain))

        # Find the best matching category; on ties the category declared
        # first in the pattern table wins (strict ">" comparison).
        best_match = None
        best_score = 0.0
        best_keywords: list[str] = []
        for category, config in self.patterns.items():
            score, matched_keywords = self._score_category(domain_parts, config)
            if score > best_score:
                best_score = score
                best_match = category
                best_keywords = matched_keywords

        # If no strong match, return generic
        if best_match is None or best_score < 0.5:
            return IntentResult(
                category="generic",
                subcategory=None,
                confidence=0.2,
                keywords_matched=[],
                suggested_partners=["generic_affiliate"],
                monetization_potential="low",
            )

        config = self.patterns[best_match]

        # "medical_dental" -> ("medical", "dental"); "jobs" -> ("jobs", None)
        main_category, _, remainder = best_match.partition("_")
        subcategory = remainder or None

        return IntentResult(
            category=main_category,
            subcategory=subcategory,
            # A raw score of 3.0 or more maps to full confidence.
            confidence=min(best_score / 3.0, 1.0),
            keywords_matched=best_keywords,
            suggested_partners=config.get("partners", []),
            monetization_potential=config.get("potential", "medium"),
        )

    def detect_geo(self, domain: str) -> Optional[str]:
        """
        Detect geographic targeting from domain name.

        The TLD is checked first (case-insensitively), then city names
        found among the domain parts.

        Returns:
            ISO country code if detected (e.g., "CH", "DE"), None otherwise
        """
        # Compare TLDs on a trimmed, lower-cased host so "EXAMPLE.CH",
        # " example.ch " and the FQDN form "example.ch." are all recognized.
        host = domain.strip().lower().rstrip(".")

        # Check TLD first
        if host.endswith((".ch", ".swiss")):
            return "CH"
        if host.endswith(".de"):
            return "DE"
        if host.endswith(".at"):
            return "AT"

        # Check city names
        normalized = self._normalize_domain(host)
        tokens = set(self._split_domain_parts(normalized))
        if self._mentions_city(tokens, normalized, self.swiss_cities):
            return "CH"
        if self._mentions_city(tokens, normalized, self.german_cities):
            return "DE"

        return None

    @staticmethod
    def _mentions_city(tokens: set, normalized: str, cities: set) -> bool:
        """Return True if any city name occurs in the normalized domain.

        Single-word cities are matched against the token set. Hyphenated
        city names (e.g. "st-gallen") can never equal a single token because
        normalization turns hyphens into spaces, so they are matched as a
        run of consecutive space-separated tokens instead.
        """
        if tokens & cities:
            return True
        padded = f" {normalized} "
        return any(
            f" {city.replace('-', ' ')} " in padded
            for city in cities
            if "-" in city
        )

    def estimate_value(
        self, domain: str, intent: Optional[IntentResult] = None
    ) -> dict:
        """
        Estimate the monetization value of a domain.

        Args:
            domain: The domain name to evaluate.
            intent: Optional, previously computed result of ``detect(domain)``.
                When omitted, intent detection is run here.

        Returns:
            dict with value estimates based on intent potential, detected
            geo, and detection confidence.
        """
        if intent is None:
            intent = self.detect(domain)
        geo = self.detect_geo(domain)

        # Base value by potential
        base_values = {
            "high": {"min": 50, "max": 500},
            "medium": {"min": 20, "max": 100},
            "low": {"min": 5, "max": 30},
        }
        potential = intent.monetization_potential
        base = base_values.get(potential, base_values["low"])

        # Adjust for geo (Swiss = premium)
        multiplier = 1.5 if geo == "CH" else 1.0

        # Adjust for confidence: scales linearly from 0.5x to 1.0x
        confidence_mult = 0.5 + (intent.confidence * 0.5)

        return {
            "estimated_monthly_min": int(base["min"] * multiplier * confidence_mult),
            "estimated_monthly_max": int(base["max"] * multiplier * confidence_mult),
            "currency": "CHF" if geo == "CH" else "EUR",
            "potential": potential,
            "confidence": intent.confidence,
            "geo": geo,
        }

    def _normalize_domain(self, domain: str) -> str:
        """Remove TLD and normalize domain string.

        Lower-cases the input, trims surrounding whitespace and a trailing
        root dot, strips one known TLD suffix, and replaces the separators
        "-", "_" and "." with spaces.
        """
        host = domain.strip().lower().rstrip(".")
        # Remove common TLDs
        host = re.sub(r'\.(com|net|org|ch|de|at|io|co|info|swiss)$', '', host)
        # Replace common separators with space
        return re.sub(r'[-_.]', ' ', host).strip()

    def _split_domain_parts(self, domain_clean: str) -> list[str]:
        """Split a normalized domain into parts.

        Each space-separated part is kept; parts longer than six characters
        are additionally followed by any known keywords they contain.
        """
        expanded: list[str] = []
        for part in domain_clean.split():
            expanded.append(part)
            # Add any sub-matches for longer (likely compound) words
            if len(part) > 6:
                expanded.extend(self._find_subwords(part))
        return expanded

    def _find_subwords(self, word: str) -> list[str]:
        """Find known keywords contained in a compound word.

        Returns each contained keyword once, in first-seen order. Several
        keywords are listed under more than one category; without
        de-duplication the same substring would be returned (and later
        scored) once per category that lists it.
        """
        found: dict[str, None] = {}
        for config in self.patterns.values():
            for keyword in config["keywords"]:
                if keyword != word and keyword in word:
                    found.setdefault(keyword)
        return list(found)

    def _score_category(
        self, parts: list[str], config: dict
    ) -> tuple[float, list[str]]:
        """
        Score how well domain parts match a category.

        Per part: an exact keyword match adds 1.0 and ends scoring for that
        part; otherwise the first partial keyword match (substring in either
        direction) adds 0.5 and, independently, the first regex pattern that
        matches at the start of the part adds 0.7.

        Returns (score, matched_keywords)
        """
        keyword_list = config.get("keywords", [])
        keyword_set = set(keyword_list)  # O(1) exact lookups
        patterns = config.get("patterns", [])

        score = 0.0
        matched: list[str] = []
        for part in parts:
            # Exact keyword match
            if part in keyword_set:
                score += 1.0
                matched.append(part)
                continue

            # Partial keyword match. Iterate the list, not the set, so the
            # reported keyword is the first in declared order and does not
            # depend on per-process string hash randomization.
            partial = next(
                (kw for kw in keyword_list if kw in part or part in kw), None
            )
            if partial is not None:
                score += 0.5
                matched.append(f"{part}~{partial}")

            # Regex pattern match
            hit = next((p for p in patterns if re.match(p, part)), None)
            if hit is not None:
                score += 0.7
                matched.append(f"{part}@{hit}")

        return score, matched


# Singleton instance
_detector = None


def get_intent_detector() -> IntentDetector:
    """Get singleton IntentDetector instance, creating it on first use."""
    global _detector
    if _detector is None:
        _detector = IntentDetector()
    return _detector


def detect_domain_intent(domain: str) -> IntentResult:
    """Convenience function to detect intent for a domain."""
    return get_intent_detector().detect(domain)


def estimate_domain_yield(domain: str) -> dict:
    """Convenience function to estimate yield value for a domain.

    Returns a dict with the domain, its detected intent, the value estimate,
    the suggested partners and the monetization potential.
    """
    detector = get_intent_detector()
    intent = detector.detect(domain)
    # Reuse the detected intent so the category scan runs only once.
    value = detector.estimate_value(domain, intent=intent)

    return {
        "domain": domain,
        "intent": {
            "category": intent.category,
            "subcategory": intent.subcategory,
            "confidence": intent.confidence,
            "keywords": intent.keywords_matched,
        },
        "value": value,
        "partners": intent.suggested_partners,
        "monetization_potential": intent.monetization_potential,
    }