perf: Harden zone sync + scheduler concurrency

2025-12-21 16:07:35 +01:00
parent 54fcfd80cb
commit 93bd23c1cd
5 changed files with 137 additions and 61 deletions

View File

@@ -50,6 +50,8 @@ jobs:
GH_OAUTH_SECRET: ${{ secrets.GH_OAUTH_SECRET }}
CZDS_USERNAME: ${{ secrets.CZDS_USERNAME }}
CZDS_PASSWORD: ${{ secrets.CZDS_PASSWORD }}
SWITCH_TSIG_CH_SECRET: ${{ secrets.SWITCH_TSIG_CH_SECRET }}
SWITCH_TSIG_LI_SECRET: ${{ secrets.SWITCH_TSIG_LI_SECRET }}
run: |
python3 - <<'PY'
import os
@@ -110,6 +112,10 @@ jobs:
# CZDS
"CZDS_USERNAME": os.environ["CZDS_USERNAME"],
"CZDS_PASSWORD": os.environ["CZDS_PASSWORD"],
# Switch TSIG (AXFR)
"SWITCH_TSIG_CH_SECRET": os.environ["SWITCH_TSIG_CH_SECRET"],
"SWITCH_TSIG_LI_SECRET": os.environ["SWITCH_TSIG_LI_SECRET"],
}
lines = []
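The hunk stops at `lines = []`, presumably just before the mapping is serialized into env-file lines. A minimal sketch of that pattern — the mapping name `env_map` and the output path are hypothetical, not taken from the workflow:

```python
from pathlib import Path

# Hypothetical continuation of the truncated script: `env_map` stands in for
# the dict built above; the output filename is illustrative only.
env_map = {"SWITCH_TSIG_CH_SECRET": "...", "SWITCH_TSIG_LI_SECRET": "..."}
lines = [f"{key}={value}" for key, value in env_map.items()]
Path(".env").write_text("\n".join(lines) + "\n")
```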

View File

@@ -331,3 +331,5 @@ Empfehlungen:

View File

@@ -134,9 +134,23 @@ class Settings(BaseSettings):
# Switch.ch Zone Files (.ch, .li)
switch_data_dir: str = "/data/switch" # Persistent storage
# Switch.ch TSIG (DNS AXFR) credentials
# These should be provided via environment variables in production.
switch_tsig_ch_name: str = "tsig-zonedata-ch-public-21-01"
switch_tsig_ch_algorithm: str = "hmac-sha512"
switch_tsig_ch_secret: str = ""
switch_tsig_li_name: str = "tsig-zonedata-li-public-21-01"
switch_tsig_li_algorithm: str = "hmac-sha512"
switch_tsig_li_secret: str = ""
# Zone File Retention (days to keep historical snapshots)
zone_retention_days: int = 3
# Domain check scheduler tuning (external I/O heavy; keep conservative defaults)
domain_check_max_concurrent: int = 3
domain_check_delay_seconds: float = 0.3
class Config:
env_file = ".env"
env_file_encoding = "utf-8"
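A quick sanity check that the workflow's new env vars actually reach these fields: pydantic's `BaseSettings` matches field names to environment variables case-insensitively, so `SWITCH_TSIG_CH_SECRET` populates `switch_tsig_ch_secret` with no extra wiring. A sketch, assuming the remaining `Settings` fields have defaults or are supplied via `.env`:

```python
import os

os.environ["SWITCH_TSIG_CH_SECRET"] = "dGVzdA=="  # dummy base64, not a real key

from app.config import Settings  # the class shown above

s = Settings()
assert s.switch_tsig_ch_secret == "dGVzdA=="
assert s.domain_check_max_concurrent == 3   # conservative default
assert s.domain_check_delay_seconds == 0.3
```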

View File

@@ -88,7 +88,6 @@ async def check_domains_by_frequency(frequency: str):
tiers_for_frequency.append(tier)
# Get domains from users with matching subscription tier
from sqlalchemy.orm import joinedload
result = await db.execute(
select(Domain)
.join(User, Domain.user_id == User.id)
@@ -108,10 +107,32 @@ async def check_domains_by_frequency(frequency: str):
newly_taken = [] # Track domains that became taken
status_changes = [] # All status changes for logging
for domain in domains:
try:
# Check domain availability
check_result = await domain_checker.check_domain(domain.name)
# Concurrency control + polite pacing (reduces the risk of RDAP/WHOIS rate-limit bans)
max_concurrent = max(1, int(getattr(settings, "domain_check_max_concurrent", 3) or 3))
delay = float(getattr(settings, "domain_check_delay_seconds", 0.3) or 0.3)
semaphore = asyncio.Semaphore(max_concurrent)
async def _check_one(d: Domain) -> tuple[Domain, object | None, Exception | None]:
async with semaphore:
try:
res = await domain_checker.check_domain(d.name)
# Small delay after each external request (still holding the slot, for pacing)
await asyncio.sleep(delay)
return d, res, None
except Exception as e:
return d, None, e
# Process in chunks to avoid huge gather lists
chunk_size = 200
for i in range(0, len(domains), chunk_size):
chunk = domains[i : i + chunk_size]
results = await asyncio.gather(*[_check_one(d) for d in chunk])
for domain, check_result, err in results:
if err is not None or check_result is None:
logger.error(f"Error checking domain {domain.name}: {err}")
errors += 1
continue
# Track status transitions
was_available = domain.is_available
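Because the post-request sleep happens while the semaphore slot is still held, the delay directly bounds throughput. A back-of-envelope estimate with the defaults from the hunk above (the 1.0 s average check time is an assumption, not from the diff):

```python
max_concurrent = 3     # domain_check_max_concurrent default
delay = 0.3            # slept while still holding the semaphore slot
t_check = 1.0          # assumed average RDAP/WHOIS round trip
rate = max_concurrent / (t_check + delay)   # ≈ 2.3 checks/second sustained
print(f"~{rate:.1f} checks/s, ~{rate * 3600:.0f} checks/hour")
```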
@@ -119,27 +140,28 @@ async def check_domains_by_frequency(frequency: str):
# Detect transition: taken -> available (domain dropped!)
if not was_available and is_now_available:
status_changes.append({
'domain': domain.name,
'change': 'became_available',
'old_registrar': domain.registrar,
})
status_changes.append(
{
"domain": domain.name,
"change": "became_available",
"old_registrar": domain.registrar,
}
)
if domain.notify_on_available:
newly_available.append(domain)
logger.info(f"🎯 Domain AVAILABLE: {domain.name} (was registered by {domain.registrar})")
# Detect transition: available -> taken (someone registered it!)
elif was_available and not is_now_available:
status_changes.append({
'domain': domain.name,
'change': 'became_taken',
'new_registrar': check_result.registrar,
})
status_changes.append(
{
"domain": domain.name,
"change": "became_taken",
"new_registrar": check_result.registrar,
}
)
if domain.notify_on_available: # Notify if alerts are on
newly_taken.append({
'domain': domain,
'registrar': check_result.registrar,
})
newly_taken.append({"domain": domain, "registrar": check_result.registrar})
logger.info(f"⚠️ Domain TAKEN: {domain.name} (now registered by {check_result.registrar})")
# Update domain with fresh data
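For reference, the loop above distinguishes four `(was_available, is_now_available)` combinations, and only the two transitions produce a `status_changes` entry. A standalone sketch of that decision (illustrative helper, not part of the diff):

```python
def classify(was_available: bool, is_now_available: bool) -> str | None:
    """Mirror of the transition logic above, as a hypothetical helper."""
    if not was_available and is_now_available:
        return "became_available"   # domain dropped -> candidate for notification
    if was_available and not is_now_available:
        return "became_taken"       # someone registered it
    return None                     # steady state: nothing recorded
```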
@@ -158,16 +180,8 @@ async def check_domains_by_frequency(frequency: str):
checked_at=datetime.utcnow(),
)
db.add(check)
checked += 1
# Small delay to avoid rate limiting
await asyncio.sleep(0.5)
except Exception as e:
logger.error(f"Error checking domain {domain.name}: {e}")
errors += 1
await db.commit()
elapsed = (datetime.utcnow() - start_time).total_seconds()

View File

@@ -17,28 +17,12 @@ from typing import Optional
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import get_settings
from app.models.zone_file import ZoneSnapshot, DroppedDomain
logger = logging.getLogger(__name__)
# ============================================================================
# TSIG KEYS (from Switch.ch documentation)
# ============================================================================
TSIG_KEYS = {
"ch": {
"name": "tsig-zonedata-ch-public-21-01",
"algorithm": "hmac-sha512",
"secret": "stZwEGApYumtXkh73qMLPqfbIDozWKZLkqRvcjKSpRnsor6A6MxixRL6C2HeSVBQNfMW4wer+qjS0ZSfiWiJ3Q=="
},
"li": {
"name": "tsig-zonedata-li-public-21-01",
"algorithm": "hmac-sha512",
"secret": "t8GgeCn+fhPaj+cRy1epox2Vj4hZ45ax6v3rQCkkfIQNg5fsxuU23QM5mzz+BxJ4kgF/jiQyBDBvL+XWPE6oCQ=="
}
}
ZONE_SERVER = "zonedata.switch.ch"
# ============================================================================
@@ -49,18 +33,36 @@ class ZoneFileService:
"""Service for fetching and analyzing zone files"""
def __init__(self, data_dir: Optional[Path] = None):
from app.config import get_settings
settings = get_settings()
self.data_dir = data_dir or Path(settings.switch_data_dir)
self.data_dir.mkdir(parents=True, exist_ok=True)
self._settings = settings
# Store daily snapshots for the retention window (see zone_retention_days)
self.snapshots_dir = self.data_dir / "snapshots"
self.snapshots_dir.mkdir(parents=True, exist_ok=True)
def _get_tsig_config(self, tld: str) -> dict:
"""Resolve TSIG config from settings/env (no secrets in git)."""
if tld == "ch":
return {
"name": self._settings.switch_tsig_ch_name,
"algorithm": self._settings.switch_tsig_ch_algorithm,
"secret": self._settings.switch_tsig_ch_secret,
}
if tld == "li":
return {
"name": self._settings.switch_tsig_li_name,
"algorithm": self._settings.switch_tsig_li_algorithm,
"secret": self._settings.switch_tsig_li_secret,
}
raise ValueError(f"Unknown TLD: {tld}")
def _get_key_file_path(self, tld: str) -> Path:
"""Generate TSIG key file for dig command"""
key_path = self.data_dir / f"{tld}_zonedata.key"
key_info = TSIG_KEYS.get(tld)
if not key_info:
raise ValueError(f"Unknown TLD: {tld}")
key_info = self._get_tsig_config(tld)
if not (key_info.get("secret") or "").strip():
raise RuntimeError(f"Missing Switch TSIG secret for .{tld} (set SWITCH_TSIG_{tld.upper()}_SECRET)")
# Write TSIG key file in BIND format
key_content = f"""key "{key_info['name']}" {{
@@ -76,7 +78,7 @@ class ZoneFileService:
Fetch zone file via DNS AXFR transfer.
Returns set of domain names (without TLD suffix).
"""
if tld not in TSIG_KEYS:
if tld not in ("ch", "li"):
raise ValueError(f"Unsupported TLD: {tld}. Only 'ch' and 'li' are supported.")
logger.info(f"Starting zone transfer for .{tld}")
@@ -143,23 +145,61 @@ class ZoneFileService:
async def get_previous_snapshot(self, db: AsyncSession, tld: str) -> Optional[set[str]]:
"""Load previous day's domain set from cache file"""
cache_file = self.data_dir / f"{tld}_domains.txt"
# Prefer the most recent snapshot file, including today's if one exists (supports N-day retention)
tld_dir = self.snapshots_dir / tld
if tld_dir.exists():
candidates = sorted([p for p in tld_dir.glob("*.domains.txt") if p.is_file()])
if candidates:
# Pick the latest snapshot file (by name sort = date sort)
latest = candidates[-1]
try:
content = latest.read_text()
return set(line.strip() for line in content.splitlines() if line.strip())
except Exception as e:
logger.warning(f"Failed to load snapshot for .{tld} from {latest.name}: {e}")
# Fallback: legacy cache file
cache_file = self.data_dir / f"{tld}_domains.txt"
if cache_file.exists():
try:
content = cache_file.read_text()
return set(line.strip() for line in content.splitlines() if line.strip())
except Exception as e:
logger.warning(f"Failed to load cache for .{tld}: {e}")
return None
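Putting the pieces together, the intended use of the snapshot pair is a set diff. A sketch of the assumed calling code — the orchestration is not part of this diff, and a `fetch_zone(tld)` signature returning a set is assumed from the docstring above:

```python
async def sync_zone(service: ZoneFileService, db, tld: str) -> set[str]:
    # Illustrative orchestration: names mirror the service methods above.
    previous = await service.get_previous_snapshot(db, tld)
    current = await service.fetch_zone(tld)
    dropped = (previous - current) if previous is not None else set()
    await service.save_snapshot(db, tld, current)
    return dropped  # domains that left the zone -> DroppedDomain candidates
```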
def _cleanup_snapshot_files(self, tld: str) -> None:
"""Delete snapshot files older than retention window (best-effort)."""
keep_days = int(self._settings.zone_retention_days or 3)
cutoff = datetime.utcnow().date() - timedelta(days=keep_days)
tld_dir = self.snapshots_dir / tld
if not tld_dir.exists():
return
for p in tld_dir.glob("*.domains.txt"):
try:
# filename: YYYY-MM-DD.domains.txt
date_part = p.name.split(".")[0]
snap_date = datetime.fromisoformat(date_part).date()
if snap_date < cutoff:
p.unlink(missing_ok=True)
except Exception:
# Don't let cleanup break sync
continue
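Note the retention boundary: only files strictly older than `cutoff` are unlinked, so with the default `zone_retention_days = 3` the window actually keeps four dated files (today plus three prior days). A small worked check:

```python
from datetime import date, timedelta

keep_days = 3
today = date(2025, 12, 21)                   # this commit's date, for illustration
cutoff = today - timedelta(days=keep_days)   # 2025-12-18
# snap_date < cutoff gets unlinked, so 12-18 through 12-21 all survive:
kept = [today - timedelta(days=n) for n in range(10)
        if today - timedelta(days=n) >= cutoff]
assert len(kept) == keep_days + 1            # the window includes today
```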
async def save_snapshot(self, db: AsyncSession, tld: str, domains: set[str]):
"""Save current snapshot to cache and database"""
# Save to cache file
# Save to legacy cache file (fast path)
cache_file = self.data_dir / f"{tld}_domains.txt"
cache_file.write_text("\n".join(sorted(domains)))
# Save a daily snapshot file for retention/debugging
tld_dir = self.snapshots_dir / tld
tld_dir.mkdir(parents=True, exist_ok=True)
today_str = datetime.utcnow().date().isoformat()
snapshot_file = tld_dir / f"{today_str}.domains.txt"
snapshot_file.write_text("\n".join(sorted(domains)))
self._cleanup_snapshot_files(tld)
# Save metadata to database
checksum = self.compute_checksum(domains)
snapshot = ZoneSnapshot(