{ config, pkgs, lib, strix-halo-pkgs, ... }:

{
  # llama.cpp server running GLM-4.7-Flash as a native systemd unit
  # (replaces Calvin's Docker-based setup).
  systemd.services.llama-server = {
    description = "llama.cpp server (GLM-4.7-Flash)";
    after = [ "network.target" ];
    wantedBy = [ "multi-user.target" ];

    # ROCm reports gfx1151 (Strix Halo) as version 11.5.1.
    environment.HSA_OVERRIDE_GFX_VERSION = "11.5.1";

    serviceConfig = {
      # Source-built llama.cpp with ROCm for gfx1151; tracks the flake's
      # llama-cpp input (b7984). Trailing backslashes are systemd line
      # continuations inside the generated unit file.
      ExecStart = ''
        ${strix-halo-pkgs.llamacpp-rocm-gfx1151}/bin/llama-server \
          -m /srv/llama/models/GLM-4.7-Flash-Q4_K_S.gguf \
          --fa \
          -c 16384 \
          --port 25566 \
          --host 0.0.0.0 \
          --jinja \
          --chat-template-file /srv/llama/templates/glminstruct.template
      '';
      Restart = "on-failure";
      RestartSec = 5;

      # Run under a transient dedicated user with its own state directory.
      DynamicUser = true;
      StateDirectory = "llama-server";

      # Model and template files are exposed read-only to the service.
      ReadOnlyPaths = [ "/srv/llama" ];
    };
  };

  # Ensure the model/template directory tree exists before the service starts.
  systemd.tmpfiles.rules = [
    "d /srv/llama 0755 root root -"
    "d /srv/llama/models 0755 root root -"
    "d /srv/llama/templates 0755 root root -"
  ];

  networking.firewall.allowedTCPPorts = [ 25566 ];
}