# comunidadhll/backend/app/historical_runner.py

"""Local development loop for periodic historical CRCON refreshes."""

from __future__ import annotations

import argparse
import json
import time
import traceback
from datetime import datetime, timezone
from typing import Any

from .config import (
    DEFAULT_DB_MAINTENANCE_INTERVAL_SECONDS,
    get_db_maintenance_enabled,
    get_db_maintenance_interval_seconds,
    get_historical_full_snapshot_every_runs,
    get_historical_elo_mmr_min_new_samples,
    get_historical_elo_mmr_rebuild_interval_minutes,
    get_historical_refresh_interval_seconds,
    get_historical_refresh_max_retries,
    get_historical_refresh_retry_delay_seconds,
    get_historical_data_source_kind,
)
from .database_maintenance import run_database_maintenance_cleanup
from .elo_mmr_engine import rebuild_elo_mmr_models
from .elo_mmr_storage import get_latest_elo_mmr_generated_at
from .historical_ingestion import run_incremental_refresh
from .historical_snapshots import (
    generate_and_persist_historical_snapshots,
    generate_and_persist_priority_historical_snapshots,
)
from .rcon_historical_storage import count_rcon_historical_samples_since
from .rcon_historical_worker import run_rcon_historical_capture
from .writer_lock import backend_writer_lock, build_writer_lock_holder

# Interval, in seconds, applied when the CLI is invoked with --hourly.
HOURLY_INTERVAL_SECONDS = 3600
# Server slugs reported as the refresh/snapshot scope in log payloads when no
# explicit server slug is given.
DEFAULT_HISTORICAL_SERVER_SCOPE = (
    "comunidad-hispana-01",
    "comunidad-hispana-02",
)
# Timestamp of the last database maintenance run that reported status "ok".
# It lives only in this process, so it is None again after every restart.
_LAST_DATABASE_MAINTENANCE_RUN_AT: datetime | None = None


def run_periodic_historical_refresh(
    *,
    interval_seconds: int,
    max_retries: int,
    retry_delay_seconds: float,
    server_slug: str | None = None,
    max_pages: int | None = None,
    page_size: int | None = None,
    max_runs: int | None = None,
) -> None:
    """Run refresh cycles in a loop, sleeping between them, until stopped.

    A startup banner is printed first. Each cycle is delegated to
    ``_run_refresh_with_retries`` and its payload is logged together with the
    1-based run number. The loop ends once ``max_runs`` cycles have completed
    (without sleeping after the last one) or when the user presses Ctrl+C.

    Args:
        interval_seconds: Seconds to sleep between two consecutive cycles.
        max_retries: Retry budget forwarded to every cycle.
        retry_delay_seconds: Delay between failed attempts inside a cycle.
        server_slug: Optional single-server scope forwarded to every cycle.
        max_pages: Optional page cap forwarded to every cycle.
        page_size: Optional page size forwarded to every cycle.
        max_runs: Optional cap on the number of cycles; ``None`` loops forever.
    """
    startup_banner = {
        "event": "historical-refresh-loop-started",
        "interval_seconds": interval_seconds,
        "max_retries": max_retries,
        "retry_delay_seconds": retry_delay_seconds,
        "server_scope": _describe_refresh_scope(server_slug),
        "snapshot_scope": _describe_snapshot_scope(server_slug),
    }
    print(json.dumps(startup_banner, indent=2))
    print("Press Ctrl+C to stop.")

    def _run_budget_spent(runs_done: int) -> bool:
        # An unset cap never exhausts the budget.
        return max_runs is not None and runs_done >= max_runs

    runs_done = 0
    try:
        while not _run_budget_spent(runs_done):
            runs_done += 1
            cycle_outcome = _run_refresh_with_retries(
                max_retries=max_retries,
                retry_delay_seconds=retry_delay_seconds,
                server_slug=server_slug,
                max_pages=max_pages,
                page_size=page_size,
                run_number=runs_done,
            )
            _emit_json_log({"run": runs_done, **cycle_outcome})

            # Leave right after the final cycle instead of sleeping once more.
            if _run_budget_spent(runs_done):
                break

            time.sleep(interval_seconds)
    except KeyboardInterrupt:
        print("\nHistorical refresh loop stopped by user.")


def _run_refresh_with_retries(
    *,
    max_retries: int,
    retry_delay_seconds: float,
    server_slug: str | None,
    max_pages: int | None,
    page_size: int | None,
    run_number: int,
) -> dict[str, Any]:
    """Execute one refresh cycle under the writer lock, retrying on failure.

    Each attempt runs the primary RCON capture, asks the fallback policy
    whether the classic incremental refresh is needed, then produces the
    snapshot and Elo/MMR results for the chosen path and finally gives the
    database maintenance scheduler a chance to run.

    Returns:
        A payload with ``status`` ``"ok"`` and the per-step results, or a
        payload with ``status`` ``"error"`` once an attempt fails after the
        retry budget (``max_retries`` retries beyond the first attempt) is
        used up. Every failed attempt is also logged on its own.
    """
    lock_label = f"app.historical_runner refresh:{server_slug or 'all-servers'}"
    tries = 0
    while True:
        tries += 1
        try:
            lock_holder = build_writer_lock_holder(lock_label)
            with backend_writer_lock(holder=lock_holder):
                capture = _run_primary_rcon_capture()
                use_classic, classic_reason = _resolve_classic_fallback_policy(
                    server_slug=server_slug,
                    run_number=run_number,
                    rcon_capture_result=capture,
                )
                if not use_classic:
                    has_fresh_data = _rcon_capture_has_new_useful_data(capture)
                    refresh_outcome = {
                        "status": "skipped",
                        "reason": "rcon-primary-cycle-no-classic-fallback-needed",
                    }
                    if not has_fresh_data:
                        # Nothing new was captured: skip both rebuild steps
                        # but still report the Elo/MMR policy state.
                        snapshot_outcome = {
                            "status": "skipped",
                            "reason": "rcon-primary-cycle-had-no-new-useful-data",
                            "generation_policy": "rcon-primary-no-new-useful-data",
                        }
                        idle_policy = _build_elo_mmr_rebuild_policy(
                            rcon_capture_result=capture
                        )
                        elo_outcome = {
                            "status": "skipped",
                            "reason": "rcon-primary-cycle-had-no-new-useful-data",
                            "generation_policy": "rcon-primary-no-new-useful-data",
                            **idle_policy,
                        }
                    else:
                        generated = generate_historical_snapshots(
                            server_slug=server_slug,
                            run_number=run_number,
                        )
                        snapshot_outcome = {
                            **generated,
                            "generation_policy": "rcon-primary-useful-cycle",
                            "reason": "rcon-primary-cycle-produced-new-useful-coverage",
                        }
                        # The policy is evaluated before any rebuild so it
                        # reflects the state left by the previous rebuild.
                        throttle = _build_elo_mmr_rebuild_policy(
                            rcon_capture_result=capture
                        )
                        if throttle["due"]:
                            elo_outcome = {
                                **rebuild_elo_mmr_models(),
                                "generation_policy": "rcon-primary-useful-cycle-elo-rebuild-due",
                                "reason": "rcon-primary-useful-cycle-met-elo-rebuild-threshold",
                                **throttle,
                            }
                        else:
                            elo_outcome = {
                                "status": "skipped",
                                "reason": "rcon-primary-useful-cycle-elo-rebuild-throttled",
                                "generation_policy": "rcon-primary-useful-cycle-elo-rebuild-throttled",
                                **throttle,
                            }
                else:
                    # Classic path: refresh, then rebuild snapshots and
                    # Elo/MMR models unconditionally.
                    refresh_outcome = run_incremental_refresh(
                        server_slug=server_slug,
                        max_pages=max_pages,
                        page_size=page_size,
                        rebuild_snapshots=False,
                    )
                    snapshot_outcome = generate_historical_snapshots(
                        server_slug=server_slug,
                        run_number=run_number,
                    )
                    elo_outcome = rebuild_elo_mmr_models()
                maintenance_outcome = _maybe_run_database_maintenance()
            return {
                "status": "ok",
                "attempts_used": tries,
                "max_retries": max_retries,
                "rcon_capture_result": capture,
                "classic_fallback_used": use_classic,
                "classic_fallback_reason": classic_reason,
                "refresh_result": refresh_outcome,
                "snapshot_result": snapshot_outcome,
                "elo_mmr_result": elo_outcome,
                "database_maintenance_result": maintenance_outcome,
            }
        except Exception as exc:
            failure = {
                "event": "historical-refresh-attempt-failed",
                "attempt": tries,
                "max_retries": max_retries,
                "server_scope": _describe_refresh_scope(server_slug),
                "snapshot_scope": _describe_snapshot_scope(server_slug),
                "error_type": type(exc).__name__,
                "error": str(exc),
                "traceback": traceback.format_exc(),
            }
            _emit_json_log(failure)
            if tries > max_retries:
                # Retry budget exhausted: hand the last failure to the caller.
                return {
                    "status": "error",
                    "attempts_used": tries,
                    "max_retries": max_retries,
                    "error_type": type(exc).__name__,
                    "error": str(exc),
                    "traceback": failure["traceback"],
                }
            if retry_delay_seconds > 0:
                time.sleep(retry_delay_seconds)


def generate_historical_snapshots(
    *,
    server_slug: str | None = None,
    run_number: int = 1,
) -> dict[str, Any]:
    """Persist historical snapshots, choosing full matrix or priority prewarm.

    The full matrix is generated when a server slug is given or when
    ``run_number`` is a multiple of the configured full-snapshot cadence;
    every other run only generates the priority snapshots.

    Returns:
        The generator's result extended with the run number, the cadence,
        a ``prewarm_only`` flag, the configured refresh interval and the
        ``includes_monthly_mvp_v2`` marker.
    """
    generated_at = datetime.now(timezone.utc)
    cadence = get_historical_full_snapshot_every_runs()
    if server_slug:
        # An explicit server scope always gets the full matrix.
        full_matrix = True
    else:
        full_matrix = run_number % cadence == 0
    step_name = "full-matrix" if full_matrix else "priority-prewarm"

    _emit_json_log(
        {
            "event": "historical-snapshot-refresh-started",
            "run_number": run_number,
            "snapshot_step": step_name,
            "server_slug": server_slug,
            "snapshot_scope": _describe_snapshot_scope(server_slug),
        }
    )

    if not full_matrix:
        result = generate_and_persist_priority_historical_snapshots(
            generated_at=generated_at,
        )
    else:
        result = generate_and_persist_historical_snapshots(
            server_key=server_slug,
            generated_at=generated_at,
        )

    return {
        **result,
        "run_number": run_number,
        "full_snapshot_every_runs": cadence,
        "prewarm_only": not full_matrix,
        "refresh_interval_seconds": get_historical_refresh_interval_seconds(),
        "includes_monthly_mvp_v2": True,
    }


def _emit_json_log(payload: dict[str, Any]) -> None:
    """Write ``payload`` to stdout as one flushed, ASCII-only JSON line.

    Values that ``json`` cannot encode natively are rendered with ``str``.
    """
    serialized = json.dumps(payload, default=str, ensure_ascii=True)
    print(serialized, flush=True)


def _maybe_run_database_maintenance(*, now: datetime | None = None) -> dict[str, Any]:
    """Run the database cleanup when it is enabled and due, and report the outcome.

    The cleanup is skipped when maintenance is disabled or when less than the
    resolved interval has passed since the last run that reported ``"ok"``.
    An exception raised by the cleanup is converted into an error payload, so
    it does not propagate to the caller. Only an ``"ok"`` result updates the
    last-run marker, so failed runs are attempted again on the next call.

    Args:
        now: Optional reference time; the current UTC time is used otherwise.

    Returns:
        A skip payload, an error payload, or the cleanup's own result.
    """
    global _LAST_DATABASE_MAINTENANCE_RUN_AT

    def _zulu(moment: datetime) -> str:
        # Render a UTC offset as a trailing "Z".
        return moment.isoformat().replace("+00:00", "Z")

    if now:
        anchor = now.astimezone(timezone.utc)
    else:
        anchor = datetime.now(timezone.utc)

    if not get_db_maintenance_enabled():
        disabled = {"status": "skipped", "reason": "disabled", "enabled": False}
        _emit_json_log({"event": "database-maintenance-scheduler-skipped-disabled", **disabled})
        return disabled

    interval_seconds, interval_source = _resolve_db_maintenance_interval_seconds()
    # Fields shared by every payload emitted from here on, in emission order.
    schedule = {
        "enabled": True,
        "interval_seconds": interval_seconds,
        "interval_source": interval_source,
    }

    previous_run = _LAST_DATABASE_MAINTENANCE_RUN_AT
    if previous_run is not None:
        # Clamp at zero so an anchor earlier than the last run counts as "not due".
        elapsed_seconds = max(0, int((anchor - previous_run).total_seconds()))
        if elapsed_seconds < interval_seconds:
            not_due = {
                "status": "skipped",
                "reason": "not-due",
                **schedule,
                "elapsed_seconds": elapsed_seconds,
                "last_run_at": _zulu(previous_run),
            }
            _emit_json_log({"event": "database-maintenance-scheduler-skipped-not-due", **not_due})
            return not_due

    _emit_json_log(
        {
            "event": "database-maintenance-scheduler-started",
            **schedule,
            "scheduled_at": _zulu(anchor),
        }
    )

    try:
        outcome = run_database_maintenance_cleanup(apply=True, now=anchor)
    except Exception as exc:  # noqa: BLE001 - scheduler must not crash the runner
        crashed = {
            "status": "error",
            "error_type": type(exc).__name__,
            "error": str(exc),
            **schedule,
        }
        _emit_json_log({"event": "database-maintenance-scheduler-failed", **crashed})
        return crashed

    if outcome.get("status") != "ok":
        # Leave the last-run marker untouched so the next call tries again.
        _emit_json_log(
            {"event": "database-maintenance-scheduler-failed", **schedule, "result": outcome}
        )
        return outcome

    _LAST_DATABASE_MAINTENANCE_RUN_AT = anchor
    _emit_json_log(
        {"event": "database-maintenance-scheduler-completed", **schedule, "result": outcome}
    )
    return outcome


def _resolve_db_maintenance_interval_seconds() -> tuple[int, str]:
    """Resolve the maintenance interval together with a label for its origin.

    Returns:
        ``(interval, "env")`` when the configured value loads cleanly, or the
        default interval tagged ``"default-invalid-env-fallback"`` when the
        configuration getter raises ``ValueError``.
    """
    try:
        configured = get_db_maintenance_interval_seconds()
    except ValueError:
        return DEFAULT_DB_MAINTENANCE_INTERVAL_SECONDS, "default-invalid-env-fallback"
    return configured, "env"


def _describe_refresh_scope(server_slug: str | None) -> list[str]:
    """Return the server slugs a refresh is reported to cover.

    A non-empty ``server_slug`` yields a one-element list; otherwise a fresh
    list copy of the default server scope is returned.
    """
    return [server_slug] if server_slug else [*DEFAULT_HISTORICAL_SERVER_SCOPE]


def _describe_snapshot_scope(server_slug: str | None) -> list[str]:
    """Return the snapshot scopes reported for a run.

    The list holds either the given ``server_slug`` or every default server
    slug, always followed by the ``"all-servers"`` entry.
    """
    scope = [server_slug] if server_slug else list(DEFAULT_HISTORICAL_SERVER_SCOPE)
    scope.append("all-servers")
    return scope


def _run_primary_rcon_capture() -> dict[str, Any]:
    """Run the RCON historical capture when RCON is the configured data source.

    Returns:
        The capture result, or a ``"skipped"`` payload when the configured
        historical data source kind is anything other than ``"rcon"``.
    """
    if get_historical_data_source_kind() == "rcon":
        return run_rcon_historical_capture()
    return {
        "status": "skipped",
        "reason": "historical-data-source-configured-without-rcon-primary",
    }


def _resolve_classic_fallback_policy(
    *,
    server_slug: str | None,
    run_number: int,
    rcon_capture_result: dict[str, Any],
) -> tuple[bool, str]:
    """Decide whether the classic incremental refresh must run this cycle.

    The first matching rule wins, in this order: RCON is not the configured
    source, the capture produced no usable results, a server slug was given,
    or the run number is a multiple of the full-snapshot cadence. If none
    applies, the fallback is not needed.

    Returns:
        ``(should_run, reason)`` where ``reason`` names the rule that decided.
    """
    if get_historical_data_source_kind() != "rcon":
        needed = True
        why = "public-scoreboard-configured-as-primary-historical-source"
    elif not _rcon_capture_has_usable_results(rcon_capture_result):
        needed = True
        why = "rcon-historical-capture-failed-or-returned-no-usable-targets"
    elif server_slug:
        needed = True
        why = "manual-server-scope-still-needs-classic-historical-fallback"
    elif run_number % get_historical_full_snapshot_every_runs() == 0:
        needed = True
        why = "periodic-classic-fallback-for-competitive-historical-coverage"
    else:
        needed = False
        why = "rcon-primary-cycle-succeeded-without-needing-classic-fallback"
    return needed, why


def _rcon_capture_has_usable_results(rcon_capture_result: dict[str, Any]) -> bool:
    """Tell whether a capture result is ``"ok"`` and lists at least one target.

    ``targets`` only counts when it is a non-empty ``list``.
    """
    if rcon_capture_result.get("status") == "ok":
        candidate_targets = rcon_capture_result.get("targets")
        if isinstance(candidate_targets, list):
            return len(candidate_targets) > 0
    return False


def _rcon_capture_has_new_useful_data(rcon_capture_result: dict[str, Any]) -> bool:
    """Tell whether an ``"ok"`` capture result reports anything newly inserted.

    A positive value in any of the inspected ``totals`` counters is enough.
    Otherwise the result counts as useful when at least one dict entry in
    ``targets`` has a truthy ``sample_inserted`` flag.
    """
    if rcon_capture_result.get("status") != "ok":
        return False

    totals = rcon_capture_result.get("totals")
    if isinstance(totals, dict):
        counter_keys = (
            "samples_inserted",
            "admin_log_events_inserted",
            "materialized_matches_inserted",
        )
        # Missing or falsy counters are treated as zero.
        if any(int(totals.get(key) or 0) > 0 for key in counter_keys):
            return True

    targets = rcon_capture_result.get("targets")
    if not isinstance(targets, list):
        return False
    for target in targets:
        if isinstance(target, dict) and target.get("sample_inserted"):
            return True
    return False


def _build_elo_mmr_rebuild_policy(
    *,
    rcon_capture_result: dict[str, Any],
) -> dict[str, Any]:
    """Describe whether an Elo/MMR rebuild is due for this capture result.

    A rebuild is due when the capture brought new useful data, enough samples
    have accumulated since the last rebuild, and either no rebuild exists yet
    or the configured number of minutes has passed since it.

    Returns:
        A dict with the ``due`` flag and the inputs behind the decision: last
        rebuild timestamp, samples and minutes since it, and both thresholds.
    """
    rebuild_interval = get_historical_elo_mmr_rebuild_interval_minutes()
    sample_threshold = get_historical_elo_mmr_min_new_samples()
    previous_rebuild = get_latest_elo_mmr_generated_at()

    if previous_rebuild is None:
        previous_iso = None
        age_minutes = None
    else:
        previous_iso = (
            previous_rebuild.astimezone(timezone.utc).isoformat().replace("+00:00", "Z")
        )
        age_seconds = (
            datetime.now(timezone.utc) - previous_rebuild.astimezone(timezone.utc)
        ).total_seconds()
        # Whole minutes, never negative.
        age_minutes = int(max(0, age_seconds // 60))

    fresh_samples = count_rcon_historical_samples_since(previous_iso)

    due = False
    if _rcon_capture_has_new_useful_data(rcon_capture_result):
        if fresh_samples >= sample_threshold:
            # With no previous rebuild there is no age to wait for.
            due = age_minutes is None or age_minutes >= rebuild_interval

    return {
        "policy": "min-new-rcon-samples-and-minutes-since-last-successful-rebuild",
        "due": due,
        "last_generated_at": previous_iso,
        "samples_since_last_rebuild": fresh_samples,
        "minutes_since_last_rebuild": age_minutes,
        "rebuild_interval_minutes": rebuild_interval,
        "min_new_samples": sample_threshold,
    }


def main() -> None:
    """Parse the command line, validate it and start the refresh loop.

    Raises:
        ValueError: If the interval is not positive, the retry count or retry
            delay is negative, or ``--max-runs`` is given but not positive.
    """
    cli = argparse.ArgumentParser(
        description="Run periodic historical refreshes and regenerate snapshots for HLL Vietnam.",
    )
    cli.add_argument(
        "--interval",
        type=int,
        default=get_historical_refresh_interval_seconds(),
        help="Seconds to wait between refresh-plus-snapshot runs.",
    )
    cli.add_argument(
        "--hourly",
        action="store_true",
        help="Shortcut for running the refresh loop every 3600 seconds.",
    )
    cli.add_argument(
        "--retries",
        type=int,
        default=get_historical_refresh_max_retries(),
        help="Retry attempts after a failed incremental refresh.",
    )
    cli.add_argument(
        "--retry-delay",
        type=float,
        default=get_historical_refresh_retry_delay_seconds(),
        help="Seconds to wait between failed attempts.",
    )
    cli.add_argument(
        "--server",
        dest="server_slug",
        help="Optional historical server slug.",
    )
    # The remaining options share the same shape: optional integers that
    # default to None.
    optional_int_flags = (
        ("--max-pages", "Optional page cap for local validation."),
        ("--page-size", "Optional override for CRCON page size."),
        (
            "--max-runs",
            "Optional safety limit for the number of refresh cycles to execute.",
        ),
    )
    for flag, help_text in optional_int_flags:
        cli.add_argument(flag, type=int, default=None, help=help_text)

    options = cli.parse_args()

    # --hourly takes precedence over any --interval value.
    interval = HOURLY_INTERVAL_SECONDS if options.hourly else options.interval

    if interval <= 0:
        raise ValueError("--interval must be a positive integer.")
    if options.retries < 0:
        raise ValueError("--retries must be zero or positive.")
    if options.retry_delay < 0:
        raise ValueError("--retry-delay must be zero or positive.")
    if options.max_runs is not None and options.max_runs <= 0:
        raise ValueError("--max-runs must be positive when provided.")

    run_periodic_historical_refresh(
        interval_seconds=interval,
        max_retries=options.retries,
        retry_delay_seconds=options.retry_delay,
        server_slug=options.server_slug,
        max_pages=options.max_pages,
        page_size=options.page_size,
        max_runs=options.max_runs,
    )


# Start the CLI only when this file is executed as the entry point, not when
# it is imported.
# NOTE(review): the relative imports at the top of this module presumably
# require launching it as a module (python -m <package>.historical_runner)
# rather than by file path — confirm against the deployment command.
if __name__ == "__main__":
    main()