{ config, pkgs, lib, strix-halo-pkgs, ... }:

{
  # llama.cpp server running GLM-4.7-Flash as a native systemd unit
  # (replaces Calvin's Docker-based setup).
  systemd.services.llama-server = {
    description = "llama.cpp server (GLM-4.7-Flash)";
    after = [ "network.target" ];
    wantedBy = [ "multi-user.target" ];

    # ROCm reports gfx1151 (Strix Halo) as version 11.5.1.
    environment.HSA_OVERRIDE_GFX_VERSION = "11.5.1";

    serviceConfig = {
      # Source-built llama.cpp with ROCm for gfx1151; tracks the flake's
      # llama-cpp input (b7984). Trailing backslashes are systemd line
      # continuations inside the generated unit file.
      ExecStart = ''
        ${strix-halo-pkgs.llamacpp-rocm-gfx1151}/bin/llama-server \
          -m /srv/llama/models/GLM-4.7-Flash-Q4_K_S.gguf \
          --fa \
          -c 16384 \
          --port 25566 \
          --host 0.0.0.0 \
          --jinja \
          --chat-template-file /srv/llama/templates/glminstruct.template
      '';
      Restart = "on-failure";
      RestartSec = 5;

      # Run under a transient dedicated user with its own state directory.
      DynamicUser = true;
      StateDirectory = "llama-server";

      # Model and template files are exposed read-only to the service.
      ReadOnlyPaths = [ "/srv/llama" ];
    };
  };

  # Ensure the model/template directory tree exists before the service starts.
  systemd.tmpfiles.rules = [
    "d /srv/llama 0755 root root -"
    "d /srv/llama/models 0755 root root -"
    "d /srv/llama/templates 0755 root root -"
  ];

  networking.firewall.allowedTCPPorts = [ 25566 ];
}