// OPC/ControlPlane.Worker/Services/ClarityContainerService.cs
using System.Diagnostics;
using System.Text;
using ControlPlane.Core.Config;
using ControlPlane.Core.Messages;
using ControlPlane.Core.Models;
using Docker.DotNet;
using Docker.DotNet.Models;
using MassTransit;
using Microsoft.Extensions.Options;
namespace ControlPlane.Worker.Services;
/// <summary>
/// Manages Clarity.Server Docker containers for provisioned tenants.
/// Container naming convention: {env}-app-clarity-{siteCode}
/// e.g. fdev-app-clarity-01000014
/// </summary>
public class ClarityContainerService(
IConfiguration config,
IOptions<ClarityInfraOptions> infraOptions,
IPublishEndpoint bus,
ILogger<ClarityContainerService> logger)
{
private ClarityInfraOptions Infra => infraOptions.Value;
// The image to run - override via config for prod registries
private string ImageName => config["Docker:ClarityServerImage"] ?? "clarity-server:latest";
private DockerClient CreateClient()
{
var uri = config["Docker:Socket"] ?? "npipe://./pipe/docker_engine";
return new DockerClientConfiguration(new Uri(uri)).CreateClient();
}
/// <summary>
/// Derives the container name from environment + siteCode.
/// Convention: {env}-app-clarity-{siteCode}
/// </summary>
public static string ContainerName(string environment, string siteCode) =>
$"{environment.ToLowerInvariant()}-app-clarity-{siteCode.ToLowerInvariant()}";
/// <summary>
/// Pulls the image (if not present locally), starts the container on the managed network,
/// and writes an nginx conf.d snippet so traffic routes in.
/// No host port binding — nginx reaches the container via Docker DNS on the shared network.
/// </summary>
public async Task<string> StartTenantContainerAsync(
string environment,
string siteCode,
string subdomain,
string keycloakRealm,
string? postgresConnectionString,
string? vaultToken,
Guid jobId,
CancellationToken cancellationToken)
{
using var docker = CreateClient();
var name = ContainerName(environment, siteCode);
// Stop and remove any existing container with this name (idempotent reprovision)
await TryRemoveExistingAsync(docker, name, cancellationToken);
// Pull image if not already local
await EnsureImageAsync(docker, cancellationToken);
// All service URLs use stable Docker DNS names on the managed network — no host ports involved.
var container = await docker.Containers.CreateContainerAsync(new CreateContainerParameters
{
Name = name,
Image = ImageName,
Env =
[
"ASPNETCORE_ENVIRONMENT=Production",
"ASPNETCORE_URLS=http://+:8080",
$"TenantSubdomain={subdomain}",
$"Keycloak__BaseUrl={Infra.KeycloakPublicUrl}",
$"Keycloak__InternalUrl={Infra.KeycloakInternalUrl}",
$"Keycloak__Realm={keycloakRealm}",
$"Vault__Address={Infra.VaultInternalUrl}",
            .. (vaultToken is not null
                ? (string[])[$"Vault__Token={vaultToken}"]
                : (string[])[]),
            .. (postgresConnectionString is not null
                ? (string[])[$"ConnectionStrings__postgresdb={postgresConnectionString}"]
                : (string[])[]),
],
HostConfig = new HostConfig
{
NetworkMode = Infra.Network,
RestartPolicy = new RestartPolicy { Name = RestartPolicyKind.UnlessStopped },
// Map *.clarity.test domains to the Docker host gateway so that Clarity.Server,
// running inside a container, can reach nginx (which routes *.clarity.test).
// This is required for Keycloak OIDC discovery and JWT iss-claim validation —
// Keycloak issues tokens with iss=https://keycloak.clarity.test/realms/...
// and Clarity.Server must be able to reach that URL for OIDC metadata.
ExtraHosts =
[
$"keycloak.{Infra.Domain}:host-gateway",
$"{subdomain}.{Infra.Domain}:host-gateway",
],
},
Labels = new Dictionary<string, string>
{
["clarity.managed"] = "true",
["clarity.subdomain"] = subdomain,
["clarity.siteCode"] = siteCode,
["clarity.env"] = environment,
},
}, cancellationToken);
// Ensure Keycloak and Vault are reachable on the managed network via their Docker DNS aliases.
// Aspire places them on its own bridge; tenant containers on clarity-net need them aliased here.
await EnsureContainerOnNetworkAsync(docker, "clarity-keycloak", Infra.Network, "keycloak", cancellationToken);
await EnsureContainerOnNetworkAsync(docker, "clarity-vault", Infra.Network, "vault", cancellationToken);
var started = await docker.Containers.StartContainerAsync(container.ID, null, cancellationToken);
if (!started)
throw new InvalidOperationException($"Docker failed to start container {name} (id={container.ID}).");
logger.LogInformation("Started container {Name} on {Network} (image: {Image})", name, Infra.Network, ImageName);
await WriteNginxConfigAsync(subdomain, name, jobId, cancellationToken);
await WriteComposeArtifactAsync(environment, subdomain, keycloakRealm, name, cancellationToken);
return name;
}
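    // Hypothetical caller sketch (illustrative names — the actual call site lives in
    // the provisioning saga, not shown here):
    //
    //   var name = await containerService.StartTenantContainerAsync(
    //       environment: "fdev",
    //       siteCode: "01000014",
    //       subdomain: "acme",                    // tenant subdomain under Infra.Domain
    //       keycloakRealm: "acme-realm",
    //       postgresConnectionString: connString, // null → env var omitted
    //       vaultToken: token,                    // null → env var omitted
    //       jobId: job.Id,
    //       cancellationToken: ct);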
/// <summary>
/// Stops and removes a tenant container. Called from InfrastructureStep.CompensateAsync.
/// </summary>
public async Task StopAndRemoveAsync(string containerName, CancellationToken cancellationToken)
{
using var docker = CreateClient();
await TryRemoveExistingAsync(docker, containerName, cancellationToken);
logger.LogInformation("Removed container {Name}", containerName);
}
// -- helpers --
private async Task EnsureImageAsync(DockerClient docker, CancellationToken cancellationToken)
{
var images = await docker.Images.ListImagesAsync(new ImagesListParameters
{
Filters = new Dictionary<string, IDictionary<string, bool>>
{
["reference"] = new Dictionary<string, bool> { [ImageName] = true }
}
}, cancellationToken);
if (images.Count > 0)
{
logger.LogInformation("Image {Image} already present locally.", ImageName);
return;
}
        // No registry host in the image name (or localhost-only) — this is a locally
        // built image, so a registry pull cannot succeed. It must be built manually
        // before provisioning.
        var isLocalOnly = !ImageName.Contains('/') || ImageName.StartsWith("localhost/");
if (isLocalOnly)
{
throw new InvalidOperationException(
$"Image '{ImageName}' was not found locally and cannot be pulled from a registry. " +
$"Build it first from the repo root:{Environment.NewLine}" +
$" docker build -f Clarity.Server/Dockerfile -t {ImageName} ." +
$"{Environment.NewLine}Then retry provisioning.");
}
// Registry image — attempt pull
logger.LogInformation("Pulling image {Image} from registry...", ImageName);
var (repo, tag) = SplitImageTag(ImageName);
await docker.Images.CreateImageAsync(
new ImagesCreateParameters { FromImage = repo, Tag = tag },
null,
new Progress<JSONMessage>(m =>
{
if (!string.IsNullOrWhiteSpace(m.Status))
logger.LogDebug("[docker pull] {Status} {Progress}", m.Status, m.ProgressMessage);
}),
cancellationToken);
}
// -- nginx conf.d helpers --
    /// <summary>
    /// Writes /NginxConfig/conf.d/{subdomain}.conf so nginx routes
    /// {subdomain}.clarity.test → the container, then signals nginx to
    /// reload its config without dropping connections.
    /// </summary>
private async Task WriteNginxConfigAsync(string subdomain, string containerName, Guid jobId, CancellationToken ct)
{
var confDPath = config["Nginx:ConfDPath"];
if (string.IsNullOrWhiteSpace(confDPath))
{
logger.LogWarning("Nginx:ConfDPath is not configured — skipping nginx conf write for {Subdomain}.", subdomain);
return;
}
        var confContent = $$$"""
            # Auto-generated by ControlPlane.Worker — do not edit manually.
            # Tenant: {{{subdomain}}}
            server {
                listen 443 ssl;
                server_name {{{subdomain}}}.{{{Infra.Domain}}};
                ssl_certificate {{{Infra.NginxCertPath}}};
                ssl_certificate_key {{{Infra.NginxCertKeyPath}}};

                location / {
                    # Docker DNS resolves the container name on the managed network.
                    # Because proxy_pass uses a variable, nginx resolves the name at
                    # request time and needs an explicit resolver — 127.0.0.11 is
                    # Docker's embedded DNS on user-defined networks.
                    resolver 127.0.0.11 valid=10s;
                    set $upstream http://{{{containerName}}}:8080;
                    proxy_pass $upstream;
                    proxy_set_header Host $host;
                    proxy_set_header X-Real-IP $remote_addr;
                    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
                    proxy_set_header X-Forwarded-Proto $scheme;
                }
            }
            """;
var confFile = Path.Combine(confDPath, $"{subdomain}.conf");
await File.WriteAllTextAsync(confFile, confContent, ct);
logger.LogInformation("Wrote nginx config for {Subdomain} → {Container}", subdomain, containerName);
await ReloadNginxAsync(jobId, subdomain, ct);
}
public async Task RemoveNginxConfigAsync(string subdomain, CancellationToken ct)
{
var confDPath = config["Nginx:ConfDPath"];
if (string.IsNullOrWhiteSpace(confDPath)) return;
var confFile = Path.Combine(confDPath, $"{subdomain}.conf");
if (File.Exists(confFile))
{
File.Delete(confFile);
logger.LogInformation("Removed nginx config for {Subdomain}", subdomain);
await ReloadNginxAsync(Guid.Empty, subdomain, ct);
}
}
    /// <summary>
    /// Sends SIGHUP to the nginx container, which triggers a graceful config reload.
    /// </summary>
private async Task ReloadNginxAsync(Guid jobId, string subdomain, CancellationToken ct)
{
try
{
using var docker = CreateClient();
// Find the nginx container by name — platform infra always uses "clarity-nginx".
var containers = await docker.Containers.ListContainersAsync(
new ContainersListParameters
{
Filters = new Dictionary<string, IDictionary<string, bool>>
{
["name"] = new Dictionary<string, bool> { ["clarity-nginx"] = true }
}
}, ct);
var nginx = containers.FirstOrDefault();
if (nginx is null)
{
logger.LogWarning("nginx container not found — skipping reload.");
return;
}
await docker.Containers.KillContainerAsync(nginx.ID, new ContainerKillParameters { Signal = "HUP" }, ct);
            // Docker API container names carry a leading '/' — trim it for readable logs.
            var containerName = nginx.Names.FirstOrDefault()?.TrimStart('/') ?? nginx.ID;
logger.LogInformation("nginx reloaded (container: {Name}).", containerName);
if (jobId != Guid.Empty)
{
await bus.Publish(new ProvisioningProgressEvent
{
JobId = jobId,
Type = "nginx_reloaded",
Step = "Container Launch",
Message = $"nginx reloaded — route for {subdomain}.{Infra.Domain} is live.",
}, ct);
}
}
catch (Exception ex)
{
logger.LogWarning(ex, "Failed to reload nginx — new tenant route may not be active until next nginx restart.");
if (jobId != Guid.Empty)
{
await bus.Publish(new ProvisioningProgressEvent
{
JobId = jobId,
Type = "diagnostic",
Step = "Container Launch",
Message = "nginx reload failed — route may not be active.",
Detail = ex.ToString(),
}, ct);
}
}
}
// -- docker helpers --
private static async Task TryRemoveExistingAsync(DockerClient docker, string name, CancellationToken cancellationToken)
{
try
{
await docker.Containers.StopContainerAsync(name,
new ContainerStopParameters { WaitBeforeKillSeconds = 5 }, cancellationToken);
await docker.Containers.RemoveContainerAsync(name,
new ContainerRemoveParameters { Force = true }, cancellationToken);
}
catch (DockerContainerNotFoundException) { /* already gone - fine */ }
catch (DockerApiException ex) when (ex.StatusCode == System.Net.HttpStatusCode.NotFound) { /* same */ }
}
    private static (string repo, string tag) SplitImageTag(string image)
    {
        // Only treat the last ':' as a tag separator when it follows the final '/',
        // otherwise a registry port ("registry.example.com:5000/app") would be
        // mistaken for a tag.
        var colon = image.LastIndexOf(':');
        if (colon < 0 || image.IndexOf('/', colon) >= 0)
            return (image, "latest");
        return (image[..colon], image[(colon + 1)..]);
    }
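    // Examples (registry names illustrative):
    //   SplitImageTag("clarity-server")                  => ("clarity-server", "latest")
    //   SplitImageTag("ghcr.io/acme/clarity-server:1.2") => ("ghcr.io/acme/clarity-server", "1.2")
    //   SplitImageTag("registry.example.com:5000/app")   => ("registry.example.com:5000/app", "latest")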
/// <summary>
/// Connects <paramref name="containerName"/> to <paramref name="network"/> with the given
/// <paramref name="alias"/> if it isn't already connected.
/// Silently no-ops if the container isn't found (it may not be running in all environments).
/// </summary>
private async Task EnsureContainerOnNetworkAsync(
DockerClient docker,
string containerName,
string network,
string alias,
CancellationToken cancellationToken)
{
try
{
var inspect = await docker.Containers.InspectContainerAsync(containerName, cancellationToken);
if (inspect.NetworkSettings.Networks.TryGetValue(network, out var existing))
{
// Already connected — check whether our alias is present.
var hasAlias = existing.Aliases?.Contains(alias, StringComparer.OrdinalIgnoreCase) == true;
if (hasAlias) return;
// Connected but without the alias — disconnect so we can reconnect with it.
await docker.Networks.DisconnectNetworkAsync(network, new NetworkDisconnectParameters
{
Container = inspect.ID,
Force = true,
}, cancellationToken);
}
await docker.Networks.ConnectNetworkAsync(network, new NetworkConnectParameters
{
Container = inspect.ID,
EndpointConfig = new EndpointSettings
{
Aliases = [alias],
},
}, cancellationToken);
logger.LogInformation("Connected container '{Container}' to network '{Network}' with alias '{Alias}'.", containerName, network, alias);
}
catch (DockerContainerNotFoundException)
{
logger.LogWarning("Container '{Container}' not found — skipping network connect.", containerName);
}
catch (DockerApiException ex) when (ex.StatusCode == System.Net.HttpStatusCode.NotFound)
{
logger.LogWarning("Container '{Container}' not found — skipping network connect.", containerName);
}
catch (Exception ex)
{
logger.LogWarning(ex, "Could not connect '{Container}' to '{Network}' — tenant JWT validation may fail.", containerName, network);
}
}
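    // CLI equivalent of the connect path above (for manual debugging; assumes
    // Infra.Network is "clarity-net" as elsewhere in this file):
    //   docker network disconnect --force clarity-net clarity-keycloak
    //   docker network connect --alias keycloak clarity-net clarity-keycloak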
// ── ClientAssets / compose artifact helpers ──────────────────────────────
private string ClientAssetsFolder(string subdomain)
{
        // Checked in both key forms defensively: some providers may surface the raw
        // "__" key, while appsettings and normalized env vars use ":".
        var root = config["ClientAssets__Folder"] ?? config["ClientAssets:Folder"]
            ?? Path.GetFullPath(Path.Combine(AppContext.BaseDirectory, "..", "ClientAssets"));
return Path.Combine(root, subdomain);
}
/// <summary>
/// Writes a docker-compose.yml to ClientAssets/{subdomain}/ documenting the SharedPlatform
/// clarity-server deployment. The file is an audit artifact — it is NOT executed by the Worker.
/// </summary>
private async Task WriteComposeArtifactAsync(
string environment,
string subdomain,
string keycloakRealm,
string containerName,
CancellationToken ct)
{
var folder = ClientAssetsFolder(subdomain);
try
{
Directory.CreateDirectory(folder);
var content = $$$"""
# Auto-generated by ControlPlane.Worker — do not edit manually.
# Tenant: {{{subdomain}}}
# Tier: SharedPlatform
# Generated: {{{DateTimeOffset.UtcNow:O}}}
name: clarity-{{{subdomain}}}
services:
app-{{{subdomain}}}:
image: {{{ImageName}}}
restart: unless-stopped
container_name: {{{containerName}}}
environment:
ASPNETCORE_ENVIRONMENT: Production
ASPNETCORE_URLS: http://+:8080
TenantSubdomain: {{{subdomain}}}
Keycloak__BaseUrl: {{{Infra.KeycloakPublicUrl}}}
Keycloak__InternalUrl: {{{Infra.KeycloakInternalUrl}}}
Keycloak__Realm: {{{keycloakRealm}}}
Vault__Address: {{{Infra.VaultInternalUrl}}}
# ConnectionStrings__postgresdb: (persisted in TenantRecord)
networks:
- clarity-net
extra_hosts:
- "keycloak.{{{Infra.Domain}}}:host-gateway"
- "{{{subdomain}}}.{{{Infra.Domain}}}:host-gateway"
labels:
clarity.managed: "true"
clarity.subdomain: {{{subdomain}}}
clarity.env: {{{environment}}}
networks:
clarity-net:
external: true
""";
var composePath = Path.Combine(folder, "docker-compose.yml");
await File.WriteAllTextAsync(composePath, content, ct);
logger.LogInformation("Wrote compose artifact for {Subdomain} → {Path}", subdomain, composePath);
}
catch (Exception ex)
{
// Non-fatal — the container is already running; the artifact is an audit record.
logger.LogWarning(ex, "Could not write compose artifact for {Subdomain}.", subdomain);
}
}
// ── OwnContainer — sidecar lifecycle ─────────────────────────────────────
/// <summary>
/// OwnContainer tier — generates a per-tenant docker-compose.yml for sidecar services
/// (Keycloak, Vault, Postgres, MinIO as elected by StackConfig), writes it to
/// ClientAssets/{subdomain}/docker-compose.yml, and runs <c>docker compose up -d</c>.
/// Returns the absolute path to the compose file.
/// </summary>
public async Task<string> GenerateAndRunSidecarsAsync(
ProvisioningJob job,
Dictionary<string, ResolvedEndpoint> topology,
CancellationToken ct)
{
var folder = ClientAssetsFolder(job.Subdomain);
Directory.CreateDirectory(folder);
var content = BuildSidecarCompose(job);
var composePath = Path.Combine(folder, "docker-compose.yml");
await File.WriteAllTextAsync(composePath, content, ct);
logger.LogInformation("[{JobId}] Wrote sidecar compose → {Path}", job.Id, composePath);
await RunDockerComposeAsync(composePath, "up -d", job.Id, ct);
logger.LogInformation("[{JobId}] Sidecar containers started.", job.Id);
return composePath;
}
/// <summary>
/// After sidecars are started, inspects each OwnContainer component's Docker container
/// to resolve its ephemeral host port, then rewrites the topology AdminUrl to
/// <c>http://localhost:{hostPort}</c> so downstream saga steps can call admin APIs.
/// </summary>
public async Task UpdateTopologyWithHostPortsAsync(
Dictionary<string, ResolvedEndpoint> topology,
CancellationToken ct)
{
using var docker = CreateClient();
foreach (var (component, endpoint) in topology.ToList())
{
if (endpoint.Mode != ComponentMode.OwnContainer) continue;
if (string.IsNullOrWhiteSpace(endpoint.ContainerName)) continue;
try
{
var inspect = await docker.Containers.InspectContainerAsync(endpoint.ContainerName, ct);
var firstBinding = inspect.NetworkSettings.Ports
.SelectMany(p => p.Value ?? [])
.FirstOrDefault(b => !string.IsNullOrWhiteSpace(b.HostPort));
if (firstBinding is not null)
{
topology[component] = endpoint with { AdminUrl = $"http://localhost:{firstBinding.HostPort}" };
logger.LogInformation("Resolved {Component} host port → {Url}", component, topology[component].AdminUrl);
}
else
{
logger.LogWarning("No host port binding found for {Component} container {Name}.", component, endpoint.ContainerName);
}
}
catch (Exception ex)
{
logger.LogWarning(ex, "Could not resolve host port for {Component} container {Name}.", component, endpoint.ContainerName);
}
}
}
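    // Example: an OwnContainer "postgres" endpoint whose container publishes
    // 127.0.0.1:49213->5432/tcp (host port 49213 illustrative) comes out as
    //   topology["postgres"] = endpoint with { AdminUrl = "http://localhost:49213" }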
/// <summary>
/// Tears down all sidecar containers for a tenant by running
/// <c>docker compose down -v</c> against the stored compose file.
/// Called from InfrastructureProvisioningStep.CompensateAsync.
/// </summary>
public async Task TearDownComposeProjectAsync(string subdomain, CancellationToken ct)
{
var composePath = Path.Combine(ClientAssetsFolder(subdomain), "docker-compose.yml");
if (!File.Exists(composePath))
{
logger.LogWarning("No compose file found for {Subdomain} — nothing to tear down.", subdomain);
return;
}
await RunDockerComposeAsync(composePath, "down -v", Guid.Empty, ct);
logger.LogInformation("Tore down sidecar containers for {Subdomain}.", subdomain);
}
/// <summary>
/// Builds the docker-compose YAML content for OwnContainer sidecar services.
/// Services are included conditionally based on StackConfig. clarity-net is
/// declared as an external network so all sidecars join the shared platform network.
///
/// All services include <c>extra_hosts: host-gateway</c> entries for *.clarity.test so that
/// intra-container calls that go through nginx (e.g. OIDC discovery) route correctly.
/// </summary>
private string BuildSidecarCompose(ProvisioningJob job)
{
var s = job.Subdomain;
var stack = job.StackConfig;
var sb = new StringBuilder();
sb.AppendLine($"""
# Auto-generated by ControlPlane.Worker — do not edit manually.
# Tenant: {s} | Tier: {job.Tier}
# Generated: {DateTimeOffset.UtcNow:O}
name: clarity-{s}
services:
""");
// ── Postgres ──────────────────────────────────────────────────────────
if (stack.Postgres == ComponentMode.OwnContainer)
{
sb.AppendLine($$"""
pg-{{s}}:
image: postgres:16
restart: unless-stopped
environment:
POSTGRES_USER: clarity
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-clarity-dev}
POSTGRES_DB: clarity
expose:
- "5432"
ports:
- "127.0.0.1::5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U clarity"]
interval: 10s
timeout: 5s
retries: 5
networks:
- clarity-net
labels:
clarity.managed: "true"
clarity.subdomain: {{s}}
clarity.component: postgres
""");
}
// ── Keycloak ──────────────────────────────────────────────────────────
if (stack.Keycloak == ComponentMode.OwnContainer)
{
var kcHostname = $"kc.{s}.{Infra.Domain}";
var dependsBlock = stack.Postgres == ComponentMode.OwnContainer
? $"""
depends_on:
pg-{s}:
condition: service_healthy
"""
: string.Empty;
sb.AppendLine($$"""
kc-{{s}}:
image: quay.io/keycloak/keycloak:latest
restart: unless-stopped
command: start-dev
environment:
KEYCLOAK_ADMIN: admin
KEYCLOAK_ADMIN_PASSWORD: ${KEYCLOAK_ADMIN_PASSWORD:-admin}
KC_DB: postgres
KC_DB_URL_HOST: pg-{{s}}
KC_DB_URL_DATABASE: keycloak
KC_DB_USERNAME: clarity
KC_DB_PASSWORD: ${POSTGRES_PASSWORD:-clarity-dev}
KC_HOSTNAME: {{kcHostname}}
KC_HOSTNAME_STRICT: "false"
KC_HTTP_ENABLED: "true"
expose:
- "8080"
ports:
- "127.0.0.1::8080"
networks:
- clarity-net
extra_hosts:
- "{{kcHostname}}:host-gateway"
{{dependsBlock}}
labels:
clarity.managed: "true"
clarity.subdomain: {{s}}
clarity.component: keycloak
""");
}
// ── Vault ─────────────────────────────────────────────────────────────
if (stack.Vault == ComponentMode.OwnContainer)
{
sb.AppendLine($$"""
vault-{{s}}:
image: hashicorp/vault:latest
restart: unless-stopped
cap_add:
- IPC_LOCK
environment:
VAULT_DEV_ROOT_TOKEN_ID: ${VAULT_TOKEN:-vault-dev-root}
VAULT_DEV_LISTEN_ADDRESS: "0.0.0.0:8200"
expose:
- "8200"
ports:
- "127.0.0.1::8200"
networks:
- clarity-net
labels:
clarity.managed: "true"
clarity.subdomain: {{s}}
clarity.component: vault
""");
}
// ── MinIO ─────────────────────────────────────────────────────────────
if (stack.Minio == ComponentMode.OwnContainer)
{
sb.AppendLine($$"""
minio-{{s}}:
image: minio/minio:latest
restart: unless-stopped
command: server /data --console-address ":9001"
environment:
MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minio}
MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minio-dev}
expose:
- "9000"
- "9001"
ports:
- "127.0.0.1::9000"
- "127.0.0.1::9001"
networks:
- clarity-net
labels:
clarity.managed: "true"
clarity.subdomain: {{s}}
clarity.component: minio
""");
}
sb.AppendLine("""
networks:
clarity-net:
external: true
""");
return sb.ToString();
}
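    // Sketch of the output for a job with Subdomain "acme" where only Postgres is
    // OwnContainer: the file contains the generated header, a single pg-acme service,
    // and the external clarity-net network block — the other service stanzas are skipped.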
/// <summary>
/// Runs <c>docker compose -f {composePath} {args}</c> as a child process.
/// Streams stdout/stderr to the logger and throws on non-zero exit.
/// </summary>
private async Task RunDockerComposeAsync(string composePath, string args, Guid jobId, CancellationToken ct)
{
var psi = new ProcessStartInfo("docker")
{
Arguments = $"compose -f \"{composePath}\" {args}",
WorkingDirectory = Path.GetDirectoryName(composePath)!,
RedirectStandardOutput = true,
RedirectStandardError = true,
UseShellExecute = false,
};
using var process = Process.Start(psi)
?? throw new InvalidOperationException("Failed to start docker compose process.");
var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
var stderrTask = process.StandardError.ReadToEndAsync(ct);
await process.WaitForExitAsync(ct);
var stdout = await stdoutTask;
var stderr = await stderrTask;
if (!string.IsNullOrWhiteSpace(stdout))
logger.LogInformation("[docker compose] {Output}", stdout.Trim());
if (!string.IsNullOrWhiteSpace(stderr))
logger.LogInformation("[docker compose stderr] {Output}", stderr.Trim());
if (process.ExitCode != 0)
throw new InvalidOperationException(
$"'docker compose {args}' exited with code {process.ExitCode}. See logs for details.");
}
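    // Example: RunDockerComposeAsync(path, "up -d", job.Id, ct) runs the child process
    //   docker compose -f "<path>" up -d
    // with stdout/stderr captured into the logger.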
}