spawner/container_manager.py
XPS\Micro 24afef32e4 fix: Add container readiness wait after spawn
After spawning a new container, wait until it's fully started and services
are ready before returning to the frontend. This prevents race conditions
where the frontend opens the container URL before the service is ready.

Solution: Wait loop (max 30s) for container.status == 'running',
then additional 2s for service startup (code-server, PlatformIO, etc).

This fixes the 404 error when opening a freshly spawned container.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2026-03-19 13:39:52 +01:00

323 lines
14 KiB
Python

import requests_unixsocket
import docker
from config import Config
class ContainerManager:
    """Create, start, stop and remove per-user Docker containers behind Traefik.

    Every spawned container is labelled so that Traefik routes
    ``<SPAWNER_SUBDOMAIN>.<BASE_DOMAIN>/<path prefix>`` to port 8080 of the
    container, stripping the path prefix before the request is forwarded.
    The Docker client is created lazily on first use.
    """

    # Port Traefik forwards to inside user containers; also the port value
    # returned to callers.
    _CONTAINER_PORT = 8080

    def __init__(self):
        """Create a manager without connecting to Docker yet."""
        self.client = None

    def _get_client(self):
        """Return the Docker client, creating it on first use.

        Raises:
            Exception: if the Docker client cannot be created.
        """
        if self.client is None:
            try:
                # from_env() takes DOCKER_HOST from the environment.
                self.client = docker.from_env()
            except Exception as e:
                raise Exception(f"Docker connection failed: {str(e)}") from e
        return self.client

    def _build_labels(self, router, path_prefix, base_host, metadata):
        """Build the Traefik routing labels plus spawner metadata labels.

        Args:
            router: Name used for the Traefik router, service and (with a
                ``-strip`` suffix) the StripPrefix middleware.
            path_prefix: URL path prefix (with leading slash) to route on.
            base_host: Host name matched by the router rule.
            metadata: Additional ``spawner.*`` labels appended unchanged.

        Returns:
            dict mapping label name to label value.
        """
        router_key = f'traefik.http.routers.{router}'
        labels = {
            'traefik.enable': 'true',
            'traefik.docker.network': Config.TRAEFIK_NETWORK,
            # HTTPS router with PathPrefix
            f'{router_key}.rule':
                f'Host(`{base_host}`) && PathPrefix(`{path_prefix}`)',
            f'{router_key}.entrypoints': Config.TRAEFIK_ENTRYPOINT,
            f'{router_key}.priority': '100',
            # The router must point at the service
            f'{router_key}.service': router,
            # StripPrefix middleware removes the prefix before the container
            # receives the request
            f'{router_key}.middlewares': f'{router}-strip',
            f'traefik.http.middlewares.{router}-strip.stripprefix.prefixes': path_prefix,
            # TLS for HTTPS
            f'{router_key}.tls': 'true',
            f'{router_key}.tls.certresolver': Config.TRAEFIK_CERTRESOLVER,
            # Service with port configuration
            f'traefik.http.services.{router}.loadbalancer.server.port':
                str(self._CONTAINER_PORT),
        }
        labels.update(metadata)
        return labels

    @staticmethod
    def _log_traefik_labels(labels):
        """Print every Traefik-related label of a container about to start."""
        print(f"[SPAWNER] Traefik Labels:")
        for key, value in labels.items():
            if 'traefik' in key:
                print(f"[SPAWNER] {key}: {value}")

    def _connect_to_traefik_network(self, container):
        """Attach a freshly started container to the Traefik network.

        On failure the container is removed again (best effort) so no
        unroutable container is left behind.

        Raises:
            Exception: if the network cannot be found or connected.
        """
        try:
            network = self._get_client().networks.get(Config.TRAEFIK_NETWORK)
            network.connect(container)
            print(f"[SPAWNER] Container an Netzwerk '{Config.TRAEFIK_NETWORK}' verbunden")
        except Exception as e:
            print(f"[SPAWNER] WARNUNG: Container konnte nicht an Netzwerk verbunden werden: {str(e)}")
            try:
                container.remove(force=True)
            except Exception as cleanup_error:
                # A failed cleanup must not hide the original network error.
                print(f"[SPAWNER] WARNUNG: Container konnte nicht entfernt werden: {str(cleanup_error)}")
            raise Exception(
                f"Konnte Container nicht an Netzwerk '{Config.TRAEFIK_NETWORK}' verbinden: {str(e)}"
            ) from e

    def _remove_old_containers(self, user_id, container_type):
        """Stop and remove all containers with the same user id and type.

        Several containers carrying identical Traefik router labels would
        conflict, so old ones are deleted before a new one is created.
        Failures are only logged; this step is best effort.
        """
        try:
            filters = {
                'label': [
                    f'spawner.user_id={user_id}',
                    f'spawner.container_type={container_type}',
                ]
            }
            old_containers = self._get_client().containers.list(all=True, filters=filters)
            for old_container in old_containers:
                if old_container.status == 'running':
                    try:
                        old_container.stop(timeout=5)
                        print(f"[SPAWNER] Alter Container {old_container.name} gestoppt")
                    except Exception as e:
                        print(f"[SPAWNER] WARNUNG: Kann alten Container nicht stoppen: {str(e)}")
                try:
                    old_container.remove(force=True)
                    print(f"[SPAWNER] Alter Container {old_container.name} gelöscht (Traefik-Konflikt-Prävention)")
                except Exception as e:
                    print(f"[SPAWNER] WARNUNG: Kann alten Container nicht löschen: {str(e)}")
        except Exception as e:
            print(f"[SPAWNER] WARNUNG: Fehler beim Löschen alter Container: {str(e)}")

    def _wait_until_ready(self, container, max_retries=30, settle_seconds=2, poll_interval=1):
        """Poll a container until Docker reports it as running.

        Args:
            container: Container object offering ``reload()`` and ``status``.
            max_retries: Number of status polls before giving up.
            settle_seconds: Extra pause once the container is running, giving
                the services inside it time to start.
            poll_interval: Pause in seconds between two polls.

        Returns:
            True if the container reached 'running', otherwise False.
        """
        import time

        for _ in range(max_retries):
            try:
                container.reload()  # refresh the cached container status
                if container.status == 'running':
                    print(f"[SPAWNER] Container läuft, warte 2 Sekunden auf Service-Startup...")
                    time.sleep(settle_seconds)
                    print(f"[SPAWNER] Container bereit!")
                    return True
            except Exception as e:
                # Status polling is best effort; keep trying until retries run out.
                print(f"[SPAWNER] Fehler beim Status-Check: {str(e)}")
            time.sleep(poll_interval)
        print(f"[SPAWNER] WARNUNG: Container wurde nicht bereit nach {max_retries}s")
        return False

    def spawn_container(self, user_id, slug):
        """Spawn (or reuse) the container of a user.

        A running container labelled with the slug is returned as is. A
        stopped container that already owns the target name is started
        again, because creating a second container with the same name
        would be rejected by Docker.

        Args:
            user_id: User ID, used in the container and router names.
            slug: User slug, used as URL path prefix.

        Returns:
            (container_id, container_port)

        Raises:
            Exception: if the image is missing, the Docker API fails or the
                container cannot be attached to the Traefik network.
        """
        container_name = f"user-{slug}-{user_id}"
        try:
            existing = self._get_user_container(slug)
            if existing is not None:
                if existing.status == 'running':
                    return existing.id, self._get_container_port(existing)
                if existing.name == container_name:
                    existing.start()
                    print(f"[SPAWNER] Container {existing.id[:12]} gestartet")
                    return existing.id, self._get_container_port(existing)

            # Path-based routing: the user lives under <base_host>/<slug>
            base_host = f"{Config.SPAWNER_SUBDOMAIN}.{Config.BASE_DOMAIN}"
            labels = self._build_labels(
                f'user{user_id}',
                f'/{slug}',
                base_host,
                {
                    'spawner.user_id': str(user_id),
                    'spawner.slug': slug,
                    'spawner.managed': 'true',
                },
            )

            print(f"[SPAWNER] Creating container user-{slug}-{user_id}")
            self._log_traefik_labels(labels)

            container = self._get_client().containers.run(
                Config.USER_TEMPLATE_IMAGE,
                name=container_name,
                detach=True,
                labels=labels,
                environment={
                    'USER_ID': str(user_id),
                    'USER_SLUG': slug,
                    # Used for token validation inside the container.
                    # NOTE(review): this exposes the spawner's SECRET_KEY to
                    # the user container - confirm this is intended.
                    'JWT_SECRET': Config.SECRET_KEY,
                },
                restart_policy={'Name': 'unless-stopped'},
                mem_limit=Config.DEFAULT_MEMORY_LIMIT,
                cpu_quota=Config.DEFAULT_CPU_QUOTA,
            )
            self._connect_to_traefik_network(container)

            print(f"[SPAWNER] Container created: {container.id[:12]}")
            print(f"[SPAWNER] URL: https://{base_host}/{slug}")
            return container.id, self._CONTAINER_PORT
        except docker.errors.ImageNotFound as e:
            error_msg = f"Template-Image '{Config.USER_TEMPLATE_IMAGE}' nicht gefunden"
            print(f"[SPAWNER] ERROR: {error_msg}")
            raise Exception(error_msg) from e
        except docker.errors.APIError as e:
            error_msg = f"Docker API Fehler: {str(e)}"
            print(f"[SPAWNER] ERROR: {error_msg}")
            raise Exception(error_msg) from e
        except Exception as e:
            print(f"[SPAWNER] ERROR: {str(e)}")
            raise

    def start_container(self, container_id):
        """Start a stopped user container.

        Returns:
            True if the container exists (and is running afterwards),
            False if Docker does not know the container.
        """
        try:
            container = self._get_client().containers.get(container_id)
            if container.status != 'running':
                container.start()
                print(f"[SPAWNER] Container {container_id[:12]} gestartet")
            return True
        except docker.errors.NotFound:
            return False

    def stop_container(self, container_id):
        """Stop a user container; return False if it does not exist."""
        try:
            container = self._get_client().containers.get(container_id)
            container.stop(timeout=10)
            return True
        except docker.errors.NotFound:
            return False

    def remove_container(self, container_id):
        """Force-remove a user container; return False if it does not exist."""
        try:
            container = self._get_client().containers.get(container_id)
            container.remove(force=True)
            return True
        except docker.errors.NotFound:
            return False

    def get_container_status(self, container_id):
        """Return the Docker status of a container, or 'not_found'."""
        try:
            container = self._get_client().containers.get(container_id)
            return container.status
        except docker.errors.NotFound:
            return 'not_found'

    def _get_user_container(self, slug):
        """Return the first container labelled with the slug, or None.

        NOTE(review): typed containers from spawn_multi_container carry the
        same 'spawner.slug' label, so they match this filter as well.
        """
        filters = {'label': f'spawner.slug={slug}'}
        containers = self._get_client().containers.list(all=True, filters=filters)
        return containers[0] if containers else None

    def _get_container_port(self, container):
        """Return the port of a container (currently fixed for all containers)."""
        return self._CONTAINER_PORT

    def spawn_multi_container(self, user_id: int, slug: str, container_type: str) -> tuple:
        """
        Spawn a container of a given type for a user.

        Existing containers with the same user id and type are removed first,
        then the new container is started, attached to the Traefik network
        and polled until Docker reports it as running.

        Args:
            user_id: User ID
            slug: User slug (for the URL)
            container_type: Key of Config.CONTAINER_TEMPLATES (for example
                'dev' or 'prod'); 'vcoder' additionally gets persistent
                volumes and a BASE_PATH environment variable.

        Returns:
            (container_id, container_port)

        Raises:
            ValueError: if container_type has no template.
            Exception: if the image is missing, the Docker API fails or the
                container cannot be attached to the Traefik network.
        """
        try:
            # Fetch the template configuration
            template = Config.CONTAINER_TEMPLATES.get(container_type)
            if not template:
                raise ValueError(f"Ungültiger Container-Typ: {container_type}")
            image = template['image']
            container_name = f"user-{slug}-{container_type}-{user_id}"

            # Traefik labels with the type as suffix
            slug_with_suffix = f"{slug}-{container_type}"
            base_host = f"{Config.SPAWNER_SUBDOMAIN}.{Config.BASE_DOMAIN}"
            labels = self._build_labels(
                f'user{user_id}-{container_type}',
                f'/{slug_with_suffix}',
                base_host,
                {
                    'spawner.user_id': str(user_id),
                    'spawner.slug': slug,
                    'spawner.container_type': container_type,
                    'spawner.managed': 'true',
                },
            )

            # Delete ALL old containers with the same user_id and type to
            # avoid Traefik router conflicts between identical configs.
            self._remove_old_containers(user_id, container_type)

            print(f"[SPAWNER] Creating {container_type} container user-{slug}-{container_type}-{user_id}")
            print(f"[SPAWNER] Image: {image}")
            self._log_traefik_labels(labels)

            # Persistent volumes for specific container types
            volumes = {}
            if container_type == 'vcoder':
                data_path = f"/data/users/{user_id}/vcoder"
                volumes = {
                    f"{data_path}/workspace": {'bind': '/home/coder/project', 'mode': 'rw'},
                    f"{data_path}/platformio": {'bind': '/home/coder/.platformio', 'mode': 'rw'},
                }
                print(f"[SPAWNER] Volumes für vcoder:")
                for host, config in volumes.items():
                    print(f"[SPAWNER] {host} -> {config['bind']}")

            # Environment variables for the container
            env_vars = {
                'USER_ID': str(user_id),
                'USER_SLUG': slug,
                'CONTAINER_TYPE': container_type,
                # Used for token validation inside the container.
                # NOTE(review): this exposes the spawner's SECRET_KEY to the
                # user container - confirm this is intended.
                'JWT_SECRET': Config.SECRET_KEY,
            }
            # vcoder needs BASE_PATH for code-server subpath routing
            if container_type == 'vcoder':
                env_vars['BASE_PATH'] = f'/{slug_with_suffix}'

            container = self._get_client().containers.run(
                image=image,
                name=container_name,
                detach=True,
                labels=labels,
                environment=env_vars,
                restart_policy={'Name': 'unless-stopped'},
                mem_limit=Config.DEFAULT_MEMORY_LIMIT,
                cpu_quota=Config.DEFAULT_CPU_QUOTA,
                volumes=volumes or None,
            )
            self._connect_to_traefik_network(container)

            # Wait until the container is up; a timeout is only logged so the
            # caller still receives the container id.
            print(f"[SPAWNER] Warte auf Container-Startup...")
            self._wait_until_ready(container)

            print(f"[SPAWNER] {container_type.upper()} container created: {container.id[:12]}")
            print(f"[SPAWNER] URL: {Config.PREFERRED_URL_SCHEME}://{base_host}/{slug_with_suffix}")
            return container.id, self._CONTAINER_PORT
        except docker.errors.ImageNotFound as e:
            error_msg = f"Template-Image '{image}' für Typ '{container_type}' nicht gefunden"
            print(f"[SPAWNER] ERROR: {error_msg}")
            raise Exception(error_msg) from e
        except docker.errors.APIError as e:
            error_msg = f"Docker API Fehler: {str(e)}"
            print(f"[SPAWNER] ERROR: {error_msg}")
            raise Exception(error_msg) from e
        except Exception as e:
            print(f"[SPAWNER] ERROR: {str(e)}")
            raise