# ollama: <https://github.com/ollama/ollama>
# - <https://wiki.nixos.org/wiki/Ollama>
#
# use: `ollama run llama3.2`
# - list available: `ollama list`
# - use a remote session: <https://github.com/ggozad/oterm>
#
# models have to be explicitly downloaded. see `ollamaPackages` for examples.
# should ollamaPackages not suffice, `ollama pull llama3.2` should fetch a model,
# but the service will need modification to allow net access first.
#
# models are defined e.g. here: <https://ollama.com/library/llama3.2:3b/blobs/dde5aa3fc5ff>
#
### to confirm GPU acceleration
# grep `journalctl -u ollama` for:
# - `looking for compatible GPUs`
# and see if it says anything bad afterward like:
# - `no compatible GPUs were discovered`
# then run a model and check for:
# - `offloading <n> repeating layers to GPU`
{ config, lib, pkgs, ... }:
let
  cfg = config.sane.services.ollama;

  # join all pre-fetched model packages into one store path that the ollama
  # service can use as its (read-only) model directory.
  modelSources = pkgs.symlinkJoin {
    name = "ollama-models";
    paths = with pkgs.ollamaPackages; [
      athene-v2-72b-q2_K  # very knowledgeable; fairly compliant (briefly lets you know if something's wrong, but still answers)
      # aya-8b  # it avoids generating code, only text
      # codegeex4-9b  # it's okaaay, seems to not give wrong code, just incomplete code.
      # codegemma-7b  # it generates invalid nix code
      codestral-22b
      # deepseek-coder-7b  # subpar to deepseek-coder-v2 in nearly every way
      deepseek-coder-v2-16b  # GREAT balance between speed and code quality. code is superior to qwen2_5 in some ways, and inferior in others
      # deepseek-coder-v2-16b-lite-instruct-q5_1  # higher-res version of default 16b (but in practice, is more rambly and less correct)
      deepseek-r1-1_5b
      deepseek-r1-7b
      deepseek-r1-14b
      # deepseek-r1-32b  # redundant with abliterated deepseek-r1
      # deepseek-r1-671b  # requires 443 GB of RAM
      deepseek-r1-abliterated-14b
      deepseek-r1-abliterated-32b
      deepseek-r1-abliterated-70b
      devstral-24b
      dolphin3-8b  # gives incorrect RDMA RoCEv2 UDP port
      # dolphin-mistral-7b  # UNCENSORED mistral; compliant
      # dolphin-mixtral-8x7b  # about as fast as a 14b model, similar quality results. uncensored, but still preachy
      # falcon2-11b  # code examples are lacking
      # gemma2-9b  # fast, but not great for code
      # gemma2-27b  # generates at 1word/sec, but decent coding results if you can wrangle it
      gemma3-12b
      gemma3-27b  # gives incorrect RDMA RoCEv2 UDP port
      gemma3n-e2b
      gemma3n-e4b
      # glm4-9b  # it generates invalid code
      # hermes3-8b  # FAST, but unwieldy
      # kimi-k2-1026b  # MoE with 32B activated parameters, 384 experts (requires 384 GiB RAM)
      # llama3-chatqa-8b  # it gets stuck
      # llama3_1-70b  # generates like 1 word/sec, decent output (comparable to qwen2_5-32b)
      # llama3_2-3b  # redundant with uncensored llama
      llama3_2-uncensored-3b
      # llama3_3-70b  # non-compliant; dodges iffy questions
      llama3_3-abliterated-70b  # compliant, but slower and not as helpful as deepseek-r1-abliterated-70b
      llama4-16x17b  # gives incorrect RDMA RoCEv2 UDP port
      magicoder-7b  # it generates valid, if sparse, code
      magistral-24b
      marco-o1-7b  # untested
      # mistral-7b  # it generates invalid code
      # mistral-nemo-12b  # it generates invalid code
      # mistral-small-22b  # quality comparable to qwen2_5
      mistral-small3_2-24b
      # mistral-large-123b  # times out launch on desko
      # mixtral-8x7b  # generates valid, if sparse, code; only for the most popular languages
      olmo2-13b
      openthinker-7b
      openthinker-32b
      orca-mini-7b
      # phi3_5-3b  # generates invalid code
      phi4-14b
      # qwen2_5-7b  # notably less quality than 32b (i.e. generates invalid code)
      # qwen2_5-14b  # *almost* same quality to 32b variant, but faster
      qwen3-8b
      qwen3-14b  # gives correct RDMA RoCEv2 UDP port
      qwen3-30b
      # qwen2_5-32b-instruct-q2_K  # lower-res version of default 32b (so, slightly faster, but generates invalid code where the full res generates valid code)
      qwen2_5-32b  # generates 3~5 words/sec, but notably more accurate than coder-7b
      qwen2_5-abliterate-7b
      qwen2_5-abliterate-14b
      qwen2_5-abliterate-32b
      # qwen2_5-coder-7b  # fast, and concise, but generates invalid code
      # qwq-32b  # heavily restricted
      qwq-abliterated-32b
      # solar-pro-22b  # generates invalid code
      # starcoder2-15b-instruct  # it gets stuck
      # wizardlm2-7b  # generates invalid code
      # yi-coder-9b  # subpar to qwen2-14b, but it's still useful
    ];
  };

  # ollama expects its model dir at this sub-path of the joined package.
  models = "${modelSources}/share/ollama/models";
in
{
  options.sane.services.ollama = with lib; {
    enable = mkEnableOption "ollama Large Language Model";
  };

  # the models are large, so only deploy on hosts that can afford the build cost.
  config = lib.mkIf (cfg.enable && config.sane.maxBuildCost >= 3) {
    services.ollama.enable = true;
    services.ollama.user = "ollama";
    services.ollama.group = "ollama";
    services.ollama.models = models;
    services.ollama.host = "0.0.0.0";  # TODO: specify specifically 127.0.0.1 and 10.0.10.22

    # these acceleration settings are relevant to `desko`.
    services.ollama.acceleration = lib.mkIf config.hardware.amdgpu.opencl.enable "rocm";  # AMD GPU acceleration (achieves the same as `nixpkgs.config.rocmSupport = true` but just for ollama (the global toggle rebuilds the world))
    services.ollama.rocmOverrideGfx = "10.1.0";  #< `nix-shell -p "rocmPackages.rocminfo" --run "rocminfo" | grep "gfx"` (e.g. gfx1010)
    services.ollama.environmentVariables.HCC_AMDGPU_TARGET = "gfx1010";

    # static user/group so the model store has stable ownership.
    users.groups.ollama = {};

    users.users.ollama = {
      group = "ollama";
      isSystemUser = true;
    };

    systemd.services.ollama.serviceConfig.DynamicUser = lib.mkForce false;  #< not required, but DynamicUser is confusing
    # `ollama run` connects to the ollama service over IP,
    # but other than that networking isn't required for anything but downloading models.
    systemd.services.ollama.serviceConfig.IPAddressDeny = "any";
    systemd.services.ollama.serviceConfig.IPAddressAllow = [
      "10.0.10.0/24"
      "127.0.0.1"
    ];

    sane.ports.ports."11434" = {
      protocol = [ "tcp" ];
      visibleTo.lan = true;  #< TODO: restrict to just wireguard clients
      description = "colin-ollama";
    };
  };
}