"""
Zone File Service for .ch and .li domains
==========================================

Uses DNS AXFR zone transfer to fetch domain lists from Switch.ch.
Compares daily snapshots to find freshly dropped domains.

Storage: the database only receives the diff (dropped domains) plus
per-snapshot metadata. The full domain list of the most recent snapshot is
kept in a plain-text cache file per TLD inside the service's data directory.
"""

import asyncio
import hashlib
import logging
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

from sqlalchemy import delete, func, select
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.zone_file import ZoneSnapshot, DroppedDomain

logger = logging.getLogger(__name__)

# ============================================================================
# TSIG KEYS (from Switch.ch documentation)
# ============================================================================

TSIG_KEYS = {
    "ch": {
        "name": "tsig-zonedata-ch-public-21-01",
        "algorithm": "hmac-sha512",
        "secret": "stZwEGApYumtXkh73qMLPqfbIDozWKZLkqRvcjKSpRnsor6A6MxixRL6C2HeSVBQNfMW4wer+qjS0ZSfiWiJ3Q=="
    },
    "li": {
        "name": "tsig-zonedata-li-public-21-01",
        "algorithm": "hmac-sha512",
        "secret": "t8GgeCn+fhPaj+cRy1epox2Vj4hZ45ax6v3rQCkkfIQNg5fsxuU23QM5mzz+BxJ4kgF/jiQyBDBvL+XWPE6oCQ=="
    }
}

ZONE_SERVER = "zonedata.switch.ch"

# Upper bound (seconds) for a single AXFR run; large zones can take minutes.
ZONE_TRANSFER_TIMEOUT_SECONDS = 600


# ============================================================================
# ZONE FILE SERVICE
# ============================================================================

class ZoneFileService:
    """Service for fetching and analyzing zone files.

    All on-disk artefacts (TSIG key files, per-TLD snapshot cache files) are
    kept below ``data_dir``.
    """

    def __init__(self, data_dir: Optional[Path] = None):
        """Create the service and make sure the data directory exists.

        Args:
            data_dir: Directory for key and cache files. Falls back to
                ``/tmp/pounce_zones`` when omitted.
        """
        self.data_dir = data_dir or Path("/tmp/pounce_zones")
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def _get_key_file_path(self, tld: str) -> Path:
        """Write the TSIG key file for ``dig -k`` and return its path.

        The file is (re)written on every call in BIND ``key { ... };`` format.

        Raises:
            ValueError: If no TSIG key is configured for ``tld``.
        """
        key_info = TSIG_KEYS.get(tld)
        if not key_info:
            raise ValueError(f"Unknown TLD: {tld}")

        key_path = self.data_dir / f"{tld}_zonedata.key"

        # Write TSIG key file in BIND format
        key_content = f"""key "{key_info['name']}" {{
    algorithm {key_info['algorithm']};
    secret "{key_info['secret']}";
}};
"""
        key_path.write_text(key_content)
        return key_path

    @staticmethod
    def _parse_zone_output(output: str, tld: str) -> set[str]:
        """Extract lower-cased names (without the TLD suffix) from dig output.

        The first whitespace-separated token of every line is taken as the
        record owner. Owners without a dot, and the TLD apex itself, are
        ignored; for everything else the last label is stripped off.
        """
        domains: set[str] = set()
        for line in output.splitlines():
            parts = line.split()
            if not parts:
                continue
            # dig prints diagnostics as ';'-prefixed comment lines.
            if parts[0].startswith(";"):
                continue
            owner = parts[0].rstrip(".")
            # Only include actual domain names (not the TLD itself)
            if not owner or owner == tld or "." not in owner:
                continue
            # Extract just the name part (before the TLD)
            name = owner.rsplit(".", 1)[0]
            if name:
                domains.add(name.lower())
        return domains

    async def fetch_zone_file(self, tld: str) -> set[str]:
        """
        Fetch zone file via DNS AXFR transfer.

        Runs ``dig`` as a subprocess against ``ZONE_SERVER`` using the TSIG
        key for ``tld`` and parses the answer section.

        Returns:
            Set of domain names (without TLD suffix).

        Raises:
            ValueError: If ``tld`` has no configured TSIG key.
            RuntimeError: If ``dig`` is missing, exits non-zero, times out,
                or the transfer yields no domain at all.
        """
        if tld not in TSIG_KEYS:
            raise ValueError(f"Unsupported TLD: {tld}. Only 'ch' and 'li' are supported.")

        logger.info("Starting zone transfer for .%s", tld)

        key_file = self._get_key_file_path(tld)

        # Build dig command
        cmd = [
            "dig",
            "-k", str(key_file),
            f"@{ZONE_SERVER}",
            "+noall",
            "+answer",
            "+noidnout",
            "+onesoa",
            "AXFR",
            f"{tld}."
        ]

        try:
            # Run dig command (this can take a while for large zones)
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
        except FileNotFoundError as exc:
            logger.error("dig command not found. Please install bind-utils or dnsutils.")
            raise RuntimeError("dig command not available") from exc

        try:
            stdout, stderr = await asyncio.wait_for(
                process.communicate(),
                timeout=ZONE_TRANSFER_TIMEOUT_SECONDS
            )
        except asyncio.TimeoutError as exc:
            # wait_for() only cancels communicate(); the child keeps running
            # unless it is killed and reaped explicitly.
            try:
                process.kill()
            except ProcessLookupError:
                pass  # already exited between the timeout and the kill
            await process.wait()
            logger.error("Zone transfer timed out for .%s", tld)
            raise RuntimeError("Zone transfer timed out") from exc

        if process.returncode != 0:
            error_msg = stderr.decode(errors="replace") if stderr else "Unknown error"
            logger.error("Zone transfer failed for .%s: %s", tld, error_msg)
            raise RuntimeError(f"Zone transfer failed: {error_msg}")

        # Parse output to extract domain names
        domains = self._parse_zone_output(stdout.decode(errors="replace"), tld)

        if not domains:
            # An empty result would make the daily sync report every known
            # domain as dropped and overwrite the cache with nothing, so it
            # is treated as a failed transfer.
            logger.error("Zone transfer for .%s returned no domains", tld)
            raise RuntimeError("Zone transfer returned no domains")

        logger.info("Zone transfer complete for .%s: %d domains", tld, len(domains))
        return domains

    def compute_checksum(self, domains: set[str]) -> str:
        """Compute SHA256 checksum of sorted domain list"""
        sorted_domains = "\n".join(sorted(domains))
        return hashlib.sha256(sorted_domains.encode()).hexdigest()

    async def get_previous_snapshot(self, db: AsyncSession, tld: str) -> Optional[set[str]]:
        """Load the previously saved domain set from the cache file.

        ``db`` is accepted for interface symmetry but not used here.

        Returns:
            The cached names, or ``None`` when the cache file is missing or
            cannot be read.
        """
        cache_file = self.data_dir / f"{tld}_domains.txt"
        try:
            content = cache_file.read_text()
        except FileNotFoundError:
            return None
        except (OSError, UnicodeError) as e:
            logger.warning("Failed to load cache for .%s: %s", tld, e)
            return None
        return {line.strip() for line in content.splitlines() if line.strip()}

    async def save_snapshot(self, db: AsyncSession, tld: str, domains: set[str]):
        """Save current snapshot to cache and database.

        The cache file receives the sorted names, one per line; the database
        receives a ``ZoneSnapshot`` row with count and checksum.
        """
        # Save to cache file. Write to a sibling temp file first and rename
        # it into place so an interrupted write never leaves a truncated
        # snapshot behind for the next diff.
        cache_file = self.data_dir / f"{tld}_domains.txt"
        tmp_file = self.data_dir / f"{tld}_domains.txt.tmp"
        tmp_file.write_text("\n".join(sorted(domains)))
        tmp_file.replace(cache_file)

        # Save metadata to database
        checksum = self.compute_checksum(domains)
        snapshot = ZoneSnapshot(
            tld=tld,
            snapshot_date=datetime.utcnow(),
            domain_count=len(domains),
            checksum=checksum
        )
        db.add(snapshot)
        await db.commit()

        logger.info("Saved snapshot for .%s: %d domains", tld, len(domains))

    async def process_drops(
        self,
        db: AsyncSession,
        tld: str,
        previous: set[str],
        current: set[str]
    ) -> list[dict]:
        """Find and store dropped domains.

        A name counts as dropped when it is in ``previous`` but not in
        ``current``. One ``DroppedDomain`` row is added per name, dated at
        today's UTC midnight, and the session is committed.

        Returns:
            One dict per dropped name with ``domain``, ``length``,
            ``is_numeric`` and ``has_hyphen`` keys; empty when nothing
            dropped.
        """
        dropped = previous - current

        if not dropped:
            logger.info("No dropped domains found for .%s", tld)
            return []

        logger.info("Found %d dropped domains for .%s", len(dropped), tld)

        today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

        # Store dropped domains (sorted so the returned list is deterministic)
        dropped_records = []
        for name in sorted(dropped):
            full_domain = f"{name}.{tld}"
            length = len(name)
            is_numeric = name.isdigit()
            has_hyphen = "-" in name

            db.add(DroppedDomain(
                domain=full_domain,
                tld=tld,
                dropped_date=today,
                length=length,
                is_numeric=is_numeric,
                has_hyphen=has_hyphen
            ))
            dropped_records.append({
                "domain": full_domain,
                "length": length,
                "is_numeric": is_numeric,
                "has_hyphen": has_hyphen
            })

        await db.commit()

        return dropped_records

    async def run_daily_sync(self, db: AsyncSession, tld: str) -> dict:
        """
        Run daily zone file sync:
        1. Load previous snapshot from the cache
        2. Fetch current zone file
        3. Store dropped domains
        4. Save new snapshot

        Returns:
            Dict with ``tld``, ``current_count``, ``previous_count``,
            ``dropped`` (list of dicts) and ``new_count``. Without a usable
            previous snapshot, ``dropped`` is empty and ``new_count`` is 0.
        """
        logger.info("Starting daily sync for .%s", tld)

        # Get previous snapshot
        previous = await self.get_previous_snapshot(db, tld)

        # Fetch current zone
        current = await self.fetch_zone_file(tld)

        result = {
            "tld": tld,
            "current_count": len(current),
            "previous_count": len(previous) if previous else 0,
            "dropped": [],
            "new_count": 0
        }

        if previous:
            # Find dropped domains
            result["dropped"] = await self.process_drops(db, tld, previous, current)
            result["new_count"] = len(current - previous)

        # Save current snapshot
        await self.save_snapshot(db, tld, current)

        logger.info(
            "Daily sync complete for .%s: %d dropped, %d new",
            tld, len(result["dropped"]), result["new_count"]
        )

        return result


# ============================================================================
# API FUNCTIONS
# ============================================================================

async def get_dropped_domains(
    db: AsyncSession,
    tld: Optional[str] = None,
    hours: int = 24,
    min_length: Optional[int] = None,
    max_length: Optional[int] = None,
    exclude_numeric: bool = False,
    exclude_hyphen: bool = False,
    keyword: Optional[str] = None,
    limit: int = 100,
    offset: int = 0
) -> dict:
    """
    Get recently dropped domains with filters.

    Rows are restricted to those whose ``dropped_date`` is within the last
    ``hours`` hours, ordered by drop date (newest first) and then by length.

    Args:
        db: Async database session.
        tld: Restrict to one TLD when given.
        hours: Look-back window in hours.
        min_length: Minimum name length (inclusive); ``None`` disables it.
        max_length: Maximum name length (inclusive); ``None`` disables it.
        exclude_numeric: Skip rows flagged ``is_numeric``.
        exclude_hyphen: Skip rows flagged ``has_hyphen``.
        keyword: Case-insensitive substring match on the full domain.
        limit: Page size.
        offset: Page offset.

    Returns:
        Dict with ``total`` (count of all matching rows, ignoring
        pagination) and ``items`` (the requested page as plain dicts).
    """
    cutoff = datetime.utcnow() - timedelta(hours=hours)

    # Collect the filters once so the page query and the count query can
    # never drift apart.
    conditions = [DroppedDomain.dropped_date >= cutoff]

    if tld:
        conditions.append(DroppedDomain.tld == tld)

    # Compare against None so an explicit bound of 0 is still honoured.
    if min_length is not None:
        conditions.append(DroppedDomain.length >= min_length)

    if max_length is not None:
        conditions.append(DroppedDomain.length <= max_length)

    # `== False` is intentional: it builds a SQL expression, which `not`
    # cannot do on a column.
    if exclude_numeric:
        conditions.append(DroppedDomain.is_numeric == False)  # noqa: E712

    if exclude_hyphen:
        conditions.append(DroppedDomain.has_hyphen == False)  # noqa: E712

    if keyword:
        conditions.append(DroppedDomain.domain.ilike(f"%{keyword}%"))

    # Get total count
    count_query = select(func.count(DroppedDomain.id)).where(*conditions)
    total_result = await db.execute(count_query)
    total = total_result.scalar() or 0

    # Get items with pagination
    query = (
        select(DroppedDomain)
        .where(*conditions)
        .order_by(DroppedDomain.dropped_date.desc(), DroppedDomain.length)
        .offset(offset)
        .limit(limit)
    )
    result = await db.execute(query)
    items = result.scalars().all()

    return {
        "total": total,
        "items": [
            {
                "domain": item.domain,
                "tld": item.tld,
                "dropped_date": item.dropped_date.isoformat(),
                "length": item.length,
                "is_numeric": item.is_numeric,
                "has_hyphen": item.has_hyphen
            }
            for item in items
        ]
    }


async def get_zone_stats(db: AsyncSession) -> dict:
    """Get zone file statistics.

    Returns:
        Dict with one entry per TLD (``"ch"``, ``"li"``) holding the latest
        snapshot's ``domain_count`` and ``last_sync`` (ISO string, or
        ``None`` when no snapshot exists), plus ``daily_drops``: the number
        of dropped-domain rows dated within the last 24 hours.
    """
    stats: dict = {}

    # Get latest snapshot per TLD
    for tld in ("ch", "li"):
        latest_query = (
            select(ZoneSnapshot)
            .where(ZoneSnapshot.tld == tld)
            .order_by(ZoneSnapshot.snapshot_date.desc())
            .limit(1)
        )
        snapshot = (await db.execute(latest_query)).scalar_one_or_none()
        stats[tld] = {
            "domain_count": snapshot.domain_count if snapshot else 0,
            "last_sync": snapshot.snapshot_date.isoformat() if snapshot else None
        }

    # Count drops from last 24h only
    cutoff_24h = datetime.utcnow() - timedelta(hours=24)
    drops_query = select(func.count(DroppedDomain.id)).where(
        DroppedDomain.dropped_date >= cutoff_24h
    )
    drops_result = await db.execute(drops_query)
    stats["daily_drops"] = drops_result.scalar() or 0

    return stats


async def cleanup_old_drops(db: AsyncSession, hours: int = 48) -> int:
    """
    Delete dropped domains older than specified hours.

    Default: Keep only last 48h for safety margin (24h display + 24h buffer).
    Returns number of deleted records (as reported by the driver's rowcount).
    """
    cutoff = datetime.utcnow() - timedelta(hours=hours)

    # Delete old drops
    stmt = delete(DroppedDomain).where(DroppedDomain.dropped_date < cutoff)
    result = await db.execute(stmt)
    await db.commit()

    deleted = result.rowcount
    if deleted > 0:
        logger.info("Cleaned up %d old dropped domains (older than %dh)", deleted, hours)

    return deleted


async def cleanup_old_snapshots(db: AsyncSession, keep_days: int = 7) -> int:
    """
    Delete zone snapshots older than specified days.

    Keep at least 7 days of metadata for debugging.
    Returns number of deleted records (as reported by the driver's rowcount).
    """
    cutoff = datetime.utcnow() - timedelta(days=keep_days)

    stmt = delete(ZoneSnapshot).where(ZoneSnapshot.snapshot_date < cutoff)
    result = await db.execute(stmt)
    await db.commit()

    deleted = result.rowcount
    if deleted > 0:
        logger.info("Cleaned up %d old zone snapshots (older than %dd)", deleted, keep_days)

    return deleted