Fix
This commit is contained in:
255
backend/app/writer_lock.py
Normal file
255
backend/app/writer_lock.py
Normal file
@@ -0,0 +1,255 @@
|
||||
"""Shared single-writer lock coordination for backend automation jobs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from uuid import uuid4
|
||||
|
||||
from .config import (
|
||||
get_storage_path,
|
||||
get_writer_lock_poll_interval_seconds,
|
||||
get_writer_lock_timeout_seconds,
|
||||
)
|
||||
|
||||
|
||||
class BackendWriterLockTimeoutError(RuntimeError):
|
||||
"""Raised when the shared backend writer lock cannot be acquired in time."""
|
||||
|
||||
|
||||
_ACTIVE_LOCK_DEPTH_BY_PATH: dict[Path, int] = {}
|
||||
_ACTIVE_LOCK_TOKEN_BY_PATH: dict[Path, str] = {}
|
||||
CONTAINER_STALE_LOCK_GRACE_SECONDS = 300
|
||||
|
||||
|
||||
def resolve_backend_writer_lock_path(*, storage_path: Path | None = None) -> Path:
|
||||
"""Return the shared lock path derived from the configured SQLite storage path."""
|
||||
resolved_storage_path = storage_path or get_storage_path()
|
||||
return resolved_storage_path.parent / f"{resolved_storage_path.stem}.writer.lock"
|
||||
|
||||
|
||||
@contextmanager
|
||||
def backend_writer_lock(
|
||||
*,
|
||||
holder: str,
|
||||
storage_path: Path | None = None,
|
||||
timeout_seconds: float | None = None,
|
||||
poll_interval_seconds: float | None = None,
|
||||
):
|
||||
"""Acquire the shared backend writer lock with reentrant safety per process."""
|
||||
lock_path = resolve_backend_writer_lock_path(storage_path=storage_path).resolve()
|
||||
if lock_path in _ACTIVE_LOCK_DEPTH_BY_PATH:
|
||||
_ACTIVE_LOCK_DEPTH_BY_PATH[lock_path] += 1
|
||||
try:
|
||||
yield _read_lock_metadata(lock_path)
|
||||
finally:
|
||||
_ACTIVE_LOCK_DEPTH_BY_PATH[lock_path] -= 1
|
||||
if _ACTIVE_LOCK_DEPTH_BY_PATH[lock_path] <= 0:
|
||||
_ACTIVE_LOCK_DEPTH_BY_PATH.pop(lock_path, None)
|
||||
_ACTIVE_LOCK_TOKEN_BY_PATH.pop(lock_path, None)
|
||||
return
|
||||
|
||||
metadata = _acquire_backend_writer_lock(
|
||||
lock_path=lock_path,
|
||||
holder=holder,
|
||||
timeout_seconds=get_writer_lock_timeout_seconds()
|
||||
if timeout_seconds is None
|
||||
else timeout_seconds,
|
||||
poll_interval_seconds=get_writer_lock_poll_interval_seconds()
|
||||
if poll_interval_seconds is None
|
||||
else poll_interval_seconds,
|
||||
)
|
||||
_ACTIVE_LOCK_DEPTH_BY_PATH[lock_path] = 1
|
||||
_ACTIVE_LOCK_TOKEN_BY_PATH[lock_path] = str(metadata["lock_token"])
|
||||
try:
|
||||
yield metadata
|
||||
finally:
|
||||
_release_backend_writer_lock(lock_path)
|
||||
_ACTIVE_LOCK_DEPTH_BY_PATH.pop(lock_path, None)
|
||||
_ACTIVE_LOCK_TOKEN_BY_PATH.pop(lock_path, None)
|
||||
|
||||
|
||||
def build_writer_lock_holder(label: str) -> str:
|
||||
"""Build one readable holder label from the current command line."""
|
||||
argv = " ".join(sys.argv).strip()
|
||||
if argv:
|
||||
return f"{label} [{argv}]"
|
||||
return label
|
||||
|
||||
|
||||
def _acquire_backend_writer_lock(
|
||||
*,
|
||||
lock_path: Path,
|
||||
holder: str,
|
||||
timeout_seconds: float,
|
||||
poll_interval_seconds: float,
|
||||
) -> dict[str, object]:
|
||||
if timeout_seconds < 0:
|
||||
raise ValueError("Writer lock timeout must be zero or positive.")
|
||||
if poll_interval_seconds <= 0:
|
||||
raise ValueError("Writer lock poll interval must be positive.")
|
||||
|
||||
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
deadline = time.monotonic() + timeout_seconds
|
||||
metadata = _build_lock_metadata(holder=holder)
|
||||
|
||||
while True:
|
||||
try:
|
||||
file_descriptor = os.open(
|
||||
lock_path,
|
||||
os.O_CREAT | os.O_EXCL | os.O_WRONLY,
|
||||
)
|
||||
except FileExistsError:
|
||||
existing_metadata = _read_lock_metadata(lock_path)
|
||||
if _can_clear_stale_lock(existing_metadata):
|
||||
_remove_lock_file(lock_path)
|
||||
continue
|
||||
if time.monotonic() >= deadline:
|
||||
raise BackendWriterLockTimeoutError(
|
||||
_build_lock_timeout_message(
|
||||
lock_path=lock_path,
|
||||
holder=holder,
|
||||
timeout_seconds=timeout_seconds,
|
||||
existing_metadata=existing_metadata,
|
||||
)
|
||||
)
|
||||
time.sleep(poll_interval_seconds)
|
||||
continue
|
||||
|
||||
try:
|
||||
with os.fdopen(file_descriptor, "w", encoding="utf-8") as handle:
|
||||
json.dump(metadata, handle, ensure_ascii=True, indent=2)
|
||||
handle.write("\n")
|
||||
return metadata
|
||||
except Exception:
|
||||
_remove_lock_file(lock_path)
|
||||
raise
|
||||
|
||||
|
||||
def _release_backend_writer_lock(lock_path: Path) -> None:
|
||||
expected_token = _ACTIVE_LOCK_TOKEN_BY_PATH.get(lock_path)
|
||||
existing_metadata = _read_lock_metadata(lock_path)
|
||||
if existing_metadata and expected_token and existing_metadata.get("lock_token") != expected_token:
|
||||
return
|
||||
_remove_lock_file(lock_path)
|
||||
|
||||
|
||||
def _remove_lock_file(lock_path: Path) -> None:
|
||||
try:
|
||||
lock_path.unlink()
|
||||
except FileNotFoundError:
|
||||
return
|
||||
|
||||
|
||||
def _build_lock_metadata(*, holder: str) -> dict[str, object]:
|
||||
return {
|
||||
"lock_token": uuid4().hex,
|
||||
"holder": holder,
|
||||
"started_at": _utc_now_iso(),
|
||||
"hostname": socket.gethostname(),
|
||||
"pid": os.getpid(),
|
||||
"cwd": str(Path.cwd()),
|
||||
}
|
||||
|
||||
|
||||
def _read_lock_metadata(lock_path: Path) -> dict[str, object] | None:
|
||||
try:
|
||||
return json.loads(lock_path.read_text(encoding="utf-8"))
|
||||
except (FileNotFoundError, OSError, json.JSONDecodeError):
|
||||
return None
|
||||
|
||||
|
||||
def _can_clear_stale_lock(existing_metadata: dict[str, object] | None) -> bool:
|
||||
if not existing_metadata:
|
||||
return False
|
||||
try:
|
||||
holder_pid = int(existing_metadata.get("pid"))
|
||||
except (TypeError, ValueError):
|
||||
return False
|
||||
if holder_pid <= 0:
|
||||
return False
|
||||
|
||||
holder_hostname = str(existing_metadata.get("hostname") or "").strip()
|
||||
current_hostname = socket.gethostname()
|
||||
if holder_hostname == current_hostname:
|
||||
if _is_process_alive(holder_pid):
|
||||
return False
|
||||
return True
|
||||
if not _looks_like_containerized_holder(existing_metadata):
|
||||
return False
|
||||
lock_age_seconds = _calculate_lock_age_seconds(existing_metadata)
|
||||
if lock_age_seconds is None:
|
||||
return False
|
||||
if lock_age_seconds < CONTAINER_STALE_LOCK_GRACE_SECONDS:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _is_process_alive(pid: int) -> bool:
|
||||
if pid == os.getpid():
|
||||
return True
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
return True
|
||||
except OSError as exc:
|
||||
winerror = getattr(exc, "winerror", None)
|
||||
if winerror in {3, 87} or exc.errno in {3}:
|
||||
return False
|
||||
return True
|
||||
return True
|
||||
|
||||
|
||||
def _build_lock_timeout_message(
|
||||
*,
|
||||
lock_path: Path,
|
||||
holder: str,
|
||||
timeout_seconds: float,
|
||||
existing_metadata: dict[str, object] | None,
|
||||
) -> str:
|
||||
if not existing_metadata:
|
||||
return (
|
||||
f"Writer lock is busy at {lock_path} and could not be acquired within "
|
||||
f"{timeout_seconds:.1f}s for {holder}."
|
||||
)
|
||||
|
||||
existing_holder = existing_metadata.get("holder") or "unknown-holder"
|
||||
started_at = existing_metadata.get("started_at") or "unknown-started-at"
|
||||
hostname = existing_metadata.get("hostname") or "unknown-host"
|
||||
pid = existing_metadata.get("pid") or "unknown-pid"
|
||||
return (
|
||||
f"Writer lock is busy at {lock_path}. Held by {existing_holder} "
|
||||
f"since {started_at} on {hostname} (pid {pid}). "
|
||||
f"Timed out after waiting {timeout_seconds:.1f}s for {holder}."
|
||||
)
|
||||
|
||||
|
||||
def _looks_like_containerized_holder(existing_metadata: dict[str, object]) -> bool:
|
||||
holder_cwd = str(existing_metadata.get("cwd") or "").strip().lower()
|
||||
return holder_cwd.startswith("/app")
|
||||
|
||||
|
||||
def _calculate_lock_age_seconds(existing_metadata: dict[str, object]) -> float | None:
|
||||
started_at_raw = str(existing_metadata.get("started_at") or "").strip()
|
||||
if not started_at_raw:
|
||||
return None
|
||||
try:
|
||||
started_at = datetime.fromisoformat(started_at_raw.replace("Z", "+00:00"))
|
||||
except ValueError:
|
||||
return None
|
||||
if started_at.tzinfo is None:
|
||||
started_at = started_at.replace(tzinfo=timezone.utc)
|
||||
delta = datetime.now(timezone.utc) - started_at.astimezone(timezone.utc)
|
||||
return max(0.0, delta.total_seconds())
|
||||
|
||||
|
||||
def _utc_now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||
Reference in New Issue
Block a user