{ config, pkgs, lib, strix-halo-pkgs, ... }:

{
  # Systemd service for llama-server with GLM-4.7-Flash
  # Replaces Calvin's Docker-based setup
  systemd.services.llama-server = {
    description = "llama.cpp server (GLM-4.7-Flash)";
    after = [ "network.target" ];
    wantedBy = [ "multi-user.target" ];

    environment = {
      # Make the ROCm HSA runtime treat the GPU as gfx version 11.5.1
      # (matches the gfx1151 target the package below is built for).
      HSA_OVERRIDE_GFX_VERSION = "11.5.1";
    };

    serviceConfig = {
      # Source-built llamacpp with ROCm for gfx1151, tracks flake's llama-cpp input (b7984)
      # NOTE: systemd unit files accept trailing-backslash line continuation,
      # so this multi-line ExecStart is written verbatim into the unit.
      ExecStart = ''
        ${strix-halo-pkgs.llamacpp-rocm-gfx1151}/bin/llama-server \
          -m /srv/llama/models/GLM-4.7-Flash-Q4_K_S.gguf \
          --fa \
          -c 16384 \
          --port 25566 \
          --host 0.0.0.0 \
          --jinja \
          --chat-template-file /srv/llama/templates/glminstruct.template
      '';
      Restart = "on-failure";
      RestartSec = 5;

      # Run as a dedicated user
      DynamicUser = true;
      StateDirectory = "llama-server";

      # Read-only access to model and template files
      # (directories are created below; the model/template files themselves
      # are expected to be provisioned out-of-band under /srv/llama)
      ReadOnlyPaths = [ "/srv/llama" ];
    };
  };

  # Ensure directories exist
  systemd.tmpfiles.rules = [
    "d /srv/llama 0755 root root -"
    "d /srv/llama/models 0755 root root -"
    "d /srv/llama/templates 0755 root root -"
  ];

  # Keep in sync with --port in ExecStart above.
  networking.firewall.allowedTCPPorts = [ 25566 ];
}