Compare commits

..

22 Commits

Author SHA1 Message Date
Georgi Gerganov afb3929279 Merge branch 'master' into llama-refactor 2023-10-31 20:35:31 +02:00
Tungsten842 07178c98e1 flake.nix: fix for rocm 5.7 (#3853) 2023-10-31 19:24:03 +02:00
Georgi Gerganov 5baefef497 llama : add llm_build helper functions (#3848)
* llama : add llm_build_norm helper function

ggml-ci

* llama : add llm_build_ffn helper function (#3849)

ggml-ci

* llama : add llm_build_k_shift helper

ggml-ci

* llama : fix offloading after recent changes

* llama : add llm_build_kv_store helper

ggml-ci

* llama : remove obsolete offload names

* llama : fix llm_build_k_shift to use n_head_kv instead of n_head

* llama : simplify falcon Q, K, V computation

* llama : remove obsolete comments in build graphs

* llama : add llm_build_kqv helper

ggml-ci

* llama : minor

* llama : add LLAMA_OFFLOAD_DEBUG + fix starcoder offloading

* llama : fix input allocation logic

* llama : update offload functions for KQ tensors

* llama : normalize tensor names

ggml-ci

* llama : enable warning about not offloaded tensors

* llama : remove extra ; + deduplicate gate_b logic

* llama : add llm_build_inp_embd helper
2023-10-31 19:23:12 +02:00
Georgi Gerganov 210e6e5d02 llama : remove obsolete map for layer counting 2023-10-29 13:39:04 +02:00
Georgi Gerganov 79ad734417 llama : comment
ggml-ci
2023-10-29 13:27:53 +02:00
Georgi Gerganov 761087932b llama : add functional header 2023-10-29 13:26:32 +02:00
Georgi Gerganov 8925cf9ef8 llama : add layer index to all tensor names 2023-10-29 13:22:15 +02:00
Georgi Gerganov 1e9c5443c2 llama : refactor tensor offloading as callback 2023-10-29 13:05:10 +02:00
Georgi Gerganov da936188d8 llama : move refact in correct place + optimize graph input 2023-10-29 11:48:58 +02:00
Georgi Gerganov 739b85c985 llama : try to fix build 2023-10-29 11:25:32 +02:00
Georgi Gerganov 25cfbf6776 llama : fix non-CUDA build 2023-10-29 11:12:03 +02:00
Georgi Gerganov b4ad03b3a7 llama : try to optimize offloading code 2023-10-29 10:33:11 +02:00
Georgi Gerganov 79617902ea llama : fix res_norm offloading 2023-10-29 09:20:35 +02:00
Georgi Gerganov e14aa46151 llama : do tensor offload only with CUDA 2023-10-29 08:03:46 +02:00
Georgi Gerganov 0dc05b8433 llama : factor graph input into a function 2023-10-29 07:52:43 +02:00
Georgi Gerganov 4e98897ede llama : support offloading result_norm + comments 2023-10-29 07:36:07 +02:00
Georgi Gerganov 51c4f9ee9f llama : comments 2023-10-28 22:50:08 +03:00
Georgi Gerganov 3af8771389 llama : update offload log messages to print node index 2023-10-28 22:36:44 +03:00
Georgi Gerganov 83d2c43791 llama : offload rest of the models
ggml-ci
2023-10-28 22:30:54 +03:00
Georgi Gerganov 38aca9e1ab llama : factor out tensor offloading outside the build call (wip)
ggml-ci
2023-10-28 21:22:31 +03:00
Georgi Gerganov 5946d98fc8 metal : disable kernel load log 2023-10-28 21:22:01 +03:00
Georgi Gerganov 8b2420d249 llama : factor out ggml-alloc from graph graph build functions
ggml-ci
2023-10-28 19:54:28 +03:00
6 changed files with 1489 additions and 2592 deletions
Generated
+6 -6
View File
@@ -5,11 +5,11 @@
"systems": "systems"
},
"locked": {
"lastModified": 1692799911,
"narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
"lastModified": 1694529238,
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
"type": "github"
},
"original": {
@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1698134075,
"narHash": "sha256-foCD+nuKzfh49bIoiCBur4+Fx1nozo+4C/6k8BYk4sg=",
"lastModified": 1698318101,
"narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "8efd5d1e283604f75a808a20e6cde0ef313d07d4",
"rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
"type": "github"
},
"original": {
+6 -4
View File
@@ -11,8 +11,7 @@
meta.mainProgram = "llama";
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
buildInputs = with pkgs; [ openmpi ];
osSpecific = with pkgs; buildInputs ++
(
osSpecific = with pkgs; buildInputs ++ (
if isAarch64 && isDarwin then
with pkgs.darwin.apple_sdk_11_0.frameworks; [
Accelerate
@@ -96,12 +95,15 @@
};
packages.rocm = pkgs.stdenv.mkDerivation {
inherit name src meta postPatch nativeBuildInputs postInstall;
buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
cmakeFlags = cmakeFlags ++ [
"-DLLAMA_HIPBLAS=1"
"-DCMAKE_C_COMPILER=hipcc"
"-DCMAKE_CXX_COMPILER=hipcc"
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
# in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
# and select the line that matches the current nixpkgs version of rocBLAS.
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
];
};
apps.llama-server = {
+7 -4
View File
@@ -238,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
// load kernels
{
NSError * error = nil;
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
/*
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
(int) ctx->pipeline_##name.threadExecutionWidth); \
*/
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
if (error) { \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \
}
+1 -1
View File
@@ -709,7 +709,7 @@ extern "C" {
// Context tensor enumeration and lookup
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+1469 -2186
View File
File diff suppressed because it is too large Load Diff
-391
View File
@@ -1,391 +0,0 @@
#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal, OpenCL
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
#
# --port: port number, default is 8888
# --repo: path to a repo containing GGUF model files
# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
# --backend: cpu, cuda, metal, opencl, depends on the OS
# --gpu-id: gpu id, default is 0
# --n-parallel: number of parallel requests, default is 8
# --n-kv: KV cache size, default is 4096
# --verbose: verbose output
#
# Example:
#
# bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#
set -e
# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
printf "[-] curl not found\n"
exit 1
fi
if ! command -v git &> /dev/null; then
printf "[-] git not found\n"
exit 1
fi
if ! command -v make &> /dev/null; then
printf "[-] make not found\n"
exit 1
fi
# parse arguments
port=8888
repo=""
wtype=""
backend="cpu"
# if macOS, use metal backend by default
if [[ "$OSTYPE" == "darwin"* ]]; then
backend="metal"
elif command -v nvcc &> /dev/null; then
backend="cuda"
fi
gpu_id=0
n_parallel=8
n_kv=4096
verbose=0
function print_usage {
printf "Usage:\n"
printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
printf " --port: port number, default is 8888\n"
printf " --repo: path to a repo containing GGUF model files\n"
printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n"
printf " --gpu-id: gpu id, default is 0\n"
printf " --n-parallel: number of parallel requests, default is 8\n"
printf " --n-kv: KV cache size, default is 4096\n"
printf " --verbose: verbose output\n\n"
printf "Example:\n\n"
printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
--port)
port="$2"
shift
shift
;;
--repo)
repo="$2"
shift
shift
;;
--wtype)
wtype="$2"
shift
shift
;;
--backend)
backend="$2"
shift
shift
;;
--gpu-id)
gpu_id="$2"
shift
shift
;;
--n-parallel)
n_parallel="$2"
shift
shift
;;
--n-kv)
n_kv="$2"
shift
shift
;;
--verbose)
verbose=1
shift
;;
--help)
print_usage
exit 0
;;
*)
echo "Unknown argument: $key"
print_usage
exit 1
;;
esac
done
# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
wfiles=()
for wt in "${wtypes[@]}"; do
wfiles+=("")
done
# sample repos
repos=(
"https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
"https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
"https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
"https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
"https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
"https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
"https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)
printf "\n"
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
printf " Based on the options that follow, the script might download a model file\n"
printf " from the internet, which can be a few GBs in size. The script will also\n"
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
printf "\n"
printf " Upon success, an HTTP server will be started and it will serve the selected\n"
printf " model using llama.cpp for demonstration purposes.\n"
printf "\n"
printf " Please note:\n"
printf "\n"
printf " - All new data will be stored in the current folder\n"
printf " - The server will be listening on all network interfaces\n"
printf " - The server will run with default settings which are not always optimal\n"
printf " - Do not judge the quality of a model based on the results from this script\n"
printf " - Do not use this script to benchmark llama.cpp\n"
printf " - Do not use this script in production\n"
printf " - This script is only for demonstration purposes\n"
printf "\n"
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
printf "\n"
printf " Press Enter to continue ...\n\n"
read
if [[ -z "$repo" ]]; then
printf "[+] No repo provided from the command line\n"
printf " Please select a number from the list below or enter an URL:\n\n"
is=0
for r in "${repos[@]}"; do
printf " %2d) %s\n" $is "$r"
is=$((is+1))
done
# ask for repo until index of sample repo is provided or an URL
while [[ -z "$repo" ]]; do
printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
read -p "[+] Select repo: " repo
# check if the input is a number
if [[ "$repo" =~ ^[0-9]+$ ]]; then
if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
repo="${repos[$repo]}"
else
printf "[-] Invalid repo index: %s\n" "$repo"
repo=""
fi
elif [[ "$repo" =~ ^https?:// ]]; then
repo="$repo"
else
printf "[-] Invalid repo URL: %s\n" "$repo"
repo=""
fi
done
fi
# remove suffix
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
printf "[+] Checking for GGUF model files in %s\n" "$repo"
# find GGUF files in the source
# TODO: better logic
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
# list all files in the provided git repo
printf "[+] Model files:\n\n"
for file in $model_files; do
# determine iw by grepping the filename with wtypes
iw=-1
is=0
for wt in "${wtypes[@]}"; do
# uppercase
ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
if [[ "$ufile" =~ "$wt" ]]; then
iw=$is
break
fi
is=$((is+1))
done
if [[ $iw -eq -1 ]]; then
continue
fi
wfiles[$iw]="$file"
have=" "
if [[ -f "$file" ]]; then
have="*"
fi
printf " %2d) %s %s\n" $iw "$have" "$file"
done
# ask for weights type until provided and available
while [[ -z "$wtype" ]]; do
printf "\n"
read -p "[+] Select weight type: " wtype
wfile="${wfiles[$wtype]}"
if [[ -z "$wfile" ]]; then
printf "[-] Invalid weight type: %s\n" "$wtype"
wtype=""
fi
done
printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
url="${repo%/}/resolve/main/$wfile"
# check file if the model has been downloaded before
chk="$wfile.chk"
# check if we should download the file
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info
do_download=0
if [[ ! -f "$wfile" ]]; then
do_download=1
elif [[ ! -f "$chk" ]]; then
do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
do_download=1
fi
if [[ $do_download -eq 1 ]]; then
printf "[+] Downloading weights from %s\n" "$url"
# download the weights file
curl -o "$wfile" -# -L "$url"
# create a check file if successful
if [[ $? -eq 0 ]]; then
printf "[+] Creating check file %s\n" "$chk"
touch "$chk"
fi
else
printf "[+] Using cached weights %s\n" "$wfile"
fi
# get latest llama.cpp and build
printf "[+] Downloading latest llama.cpp\n"
llama_cpp_dir="__llama_cpp_port_${port}__"
if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
# if the dir exists and there isn't a file "__ggml_script__" in it, abort
printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
printf "[-] Please remove it and try again\n"
exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
printf "[+] Using cached llama.cpp\n"
cd "$llama_cpp_dir"
git reset --hard
git fetch
git checkout origin/master
cd ..
else
printf "[+] Cloning llama.cpp\n"
git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi
# mark that that the directory is made by this script
touch "$llama_cpp_dir/__ggml_script__"
if [[ $verbose -eq 1 ]]; then
set -x
fi
# build
cd "$llama_cpp_dir"
make clean
log="--silent"
if [[ $verbose -eq 1 ]]; then
log=""
fi
if [[ "$backend" == "cuda" ]]; then
printf "[+] Building with CUDA backend\n"
LLAMA_CUBLAS=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then
printf "[+] Building with CPU backend\n"
make -j server $log
elif [[ "$backend" == "metal" ]]; then
printf "[+] Building with Metal backend\n"
make -j server $log
elif [[ "$backend" == "opencl" ]]; then
printf "[+] Building with OpenCL backend\n"
LLAMA_CLBLAST=1 make -j server $log
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
# run the server
printf "[+] Running server\n"
args=""
if [[ "$backend" == "cuda" ]]; then
export CUDA_VISIBLE_DEVICES=$gpu_id
args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
args="-ngl 999"
elif [[ "$backend" == "opencl" ]]; then
args="-ngl 999"
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
if [[ $verbose -eq 1 ]]; then
args="$args --verbose"
fi
./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
exit 0