Compare commits
27 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 12aa74ba7d | |||
| 2605c139a6 | |||
| 3e9d3dbff9 | |||
| 6014a63125 | |||
| 927be9b58e | |||
| 284800b1e3 | |||
| c4d7f81786 | |||
| e849078c6e | |||
| 67fd33132f | |||
| 4804215cb8 | |||
| 8a533f0d90 | |||
| 269de86ba0 | |||
| c393733988 | |||
| e3965cf35a | |||
| 8b350356b2 | |||
| bf08e00643 | |||
| f7625019c5 | |||
| abbabc5e51 | |||
| f1a98c5254 | |||
| 7d548a1827 | |||
| 930b178026 | |||
| d52d7819b8 | |||
| 1289408817 | |||
| ab336a9d5e | |||
| 69917dfa55 | |||
| 9e359a4f47 | |||
| 4c4cb30736 |
@@ -76,6 +76,17 @@ jobs:
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -669,8 +680,7 @@ jobs:
|
||||
run: |
|
||||
cd examples/llama.android
|
||||
|
||||
# Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
|
||||
./gradlew build --no-daemon -Pskip-armeabi-v7a
|
||||
./gradlew build --no-daemon
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
|
||||
@@ -6,11 +6,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- test/server-add-ci-test # FIXME remove
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
|
||||
|
||||
jobs:
|
||||
server:
|
||||
@@ -18,45 +17,21 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [noavx, avx2, avx, avx512, cublas, clblast, openblas, kompute, vulkan]
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug, Release]
|
||||
include:
|
||||
- build: 'noavx'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
|
||||
image: ubuntu:latest
|
||||
- build: 'avx2'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
|
||||
image: ubuntu:latest
|
||||
- build: 'avx'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
|
||||
image: ubuntu:latest
|
||||
- build: 'avx512'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON'
|
||||
image: ubuntu:latest
|
||||
experimental: true
|
||||
- build: 'cublas'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON'
|
||||
image: nvidia/cuda:12.3.1-devel-ubuntu22.04
|
||||
arch_not_available: true # require nvidia docker engine
|
||||
- build: 'clblast'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON'
|
||||
image: ubuntu:latest
|
||||
arch_not_available: true
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS'
|
||||
image: ubuntu:latest
|
||||
- build: 'kompute'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
|
||||
image: ubuntu:latest
|
||||
arch_not_available: true
|
||||
- build: 'vulkan'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON'
|
||||
image: ubuntu:latest
|
||||
arch_not_available: true
|
||||
- build_type: Release
|
||||
sanitizer: ""
|
||||
exclude:
|
||||
- build_type: Release
|
||||
sanitizer: ADDRESS
|
||||
- build_type: Release
|
||||
sanitizer: THREAD
|
||||
- build_type: Release
|
||||
sanitizer: UNDEFINED
|
||||
|
||||
container:
|
||||
image: ${{ matrix.image }}
|
||||
image: ubuntu:latest
|
||||
ports:
|
||||
- 8888
|
||||
options: --cpus 4
|
||||
@@ -72,40 +47,22 @@ jobs:
|
||||
apt-get update
|
||||
apt-get -y install \
|
||||
build-essential \
|
||||
pkg-config \
|
||||
git \
|
||||
cmake \
|
||||
python3-pip \
|
||||
wget \
|
||||
psmisc
|
||||
|
||||
- name: Download CLBlast
|
||||
id: get_clblast
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
run: |
|
||||
apt install -y libclblast-dev
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas' }}
|
||||
run: |
|
||||
apt-get -y install libopenblas-dev
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
|
||||
run: |
|
||||
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | tee /etc/apt/trusted.gpg.d/lunarg.asc
|
||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
apt-get update
|
||||
apt-get -y install vulkan-sdk
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.defines }}
|
||||
cmake .. \
|
||||
-DLLAMA_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
|
||||
|
||||
- name: Tests dependencies
|
||||
@@ -121,7 +78,6 @@ jobs:
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_test
|
||||
continue-on-error: ${{ matrix.experimental || matrix.arch_not_available }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
PORT=8888 ./tests.sh
|
||||
|
||||
+8
-2
@@ -936,10 +936,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
||||
# Raspberry Pi 2
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
|
||||
# Android armeabi-v7a
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
else()
|
||||
# Raspberry Pi 2
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
endif()
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
||||
# Android arm64-v8a
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||
endif()
|
||||
|
||||
@@ -597,7 +597,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
|
||||
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
||||
ifdef LLAMA_CUBLAS
|
||||
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
||||
CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
||||
ifndef CUDA_DOCKER_ARCH
|
||||
ifndef CUDA_POWER_ARCH
|
||||
|
||||
@@ -114,6 +114,9 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
|
||||
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
|
||||
|
||||
**HTTP server**
|
||||
|
||||
[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
|
||||
|
||||
**Bindings:**
|
||||
|
||||
@@ -155,6 +158,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
- [semperai/amica](https://github.com/semperai/amica)
|
||||
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
|
||||
- [Msty](https://msty.app) (proprietary)
|
||||
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
|
||||
|
||||
---
|
||||
|
||||
|
||||
+9
-9
@@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
|
||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
|
||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
||||
else { invalid_param = true; break; }
|
||||
} else if (arg == "--rope-scale") {
|
||||
if (++i >= argc) {
|
||||
@@ -630,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
std::string arg_next = argv[i];
|
||||
if (arg_next == "none") {
|
||||
params.split_mode = LLAMA_SPLIT_NONE;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
||||
} else if (arg_next == "layer") {
|
||||
params.split_mode = LLAMA_SPLIT_LAYER;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
} else if (arg_next == "row") {
|
||||
params.split_mode = LLAMA_SPLIT_ROW;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
||||
} else {
|
||||
invalid_param = true;
|
||||
break;
|
||||
@@ -837,15 +837,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_INT;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.int_value = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.float_value = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.bool_value = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
|
||||
+2
-2
@@ -61,7 +61,7 @@ struct gpt_params {
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
|
||||
llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
@@ -75,7 +75,7 @@ struct gpt_params {
|
||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
|
||||
// // sampling parameters
|
||||
|
||||
+1
-1
@@ -266,7 +266,7 @@ static llama_token llama_sampling_sample_impl(
|
||||
// }
|
||||
//}
|
||||
|
||||
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||
//LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+5
-5
@@ -31,7 +31,7 @@ struct train_state * init_train_state() {
|
||||
|
||||
state->opt = new struct ggml_opt_context;
|
||||
state->opt->ctx = NULL;
|
||||
state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
|
||||
state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
|
||||
state->opt->loss_after = 0.0f;
|
||||
|
||||
@@ -556,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
|
||||
std::string opt_type;
|
||||
GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
|
||||
if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
|
||||
opt->params.type = GGML_OPT_ADAM;
|
||||
opt->params.type = GGML_OPT_TYPE_ADAM;
|
||||
|
||||
GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
|
||||
GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
|
||||
@@ -568,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
|
||||
copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
|
||||
copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
|
||||
} else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
|
||||
opt->params.type = GGML_OPT_LBFGS;
|
||||
opt->params.type = GGML_OPT_TYPE_LBFGS;
|
||||
|
||||
GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
|
||||
GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
|
||||
@@ -603,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
|
||||
gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);
|
||||
|
||||
switch (opt->params.type) {
|
||||
case GGML_OPT_ADAM:
|
||||
case GGML_OPT_TYPE_ADAM:
|
||||
{
|
||||
gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
|
||||
gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best);
|
||||
@@ -622,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
|
||||
gguf_add_tensor(fctx, opt->adam.pf);
|
||||
}
|
||||
} break;
|
||||
case GGML_OPT_LBFGS:
|
||||
case GGML_OPT_TYPE_LBFGS:
|
||||
{
|
||||
gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
|
||||
gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);
|
||||
|
||||
@@ -192,7 +192,7 @@ class Model:
|
||||
return RefactModel
|
||||
if model_architecture == "PersimmonForCausalLM":
|
||||
return PersimmonModel
|
||||
if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||
if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||
return StableLMModel
|
||||
if model_architecture == "QWenLMHeadModel":
|
||||
return QwenModel
|
||||
@@ -253,7 +253,7 @@ class Model:
|
||||
return gguf.MODEL_ARCH.REFACT
|
||||
if arch == "PersimmonForCausalLM":
|
||||
return gguf.MODEL_ARCH.PERSIMMON
|
||||
if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||
if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||
return gguf.MODEL_ARCH.STABLELM
|
||||
if arch == "QWenLMHeadModel":
|
||||
return gguf.MODEL_ARCH.QWEN
|
||||
@@ -1074,10 +1074,11 @@ class StableLMModel(Model):
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
|
||||
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
||||
self.gguf_writer.add_layer_norm_eps(1e-5)
|
||||
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
|
||||
|
||||
|
||||
class MixtralModel(Model):
|
||||
|
||||
@@ -1547,7 +1547,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
float error_before_opt = ggml_get_f32_1d(e, 0);
|
||||
|
||||
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
|
||||
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
|
||||
opt_params_lbfgs.print_forward_graph = false;
|
||||
opt_params_lbfgs.print_backward_graph = false;
|
||||
opt_params_lbfgs.lbfgs.n_iter = 16;
|
||||
|
||||
@@ -21,6 +21,8 @@ An example command using a model from [karpathy/tinyllamas](https://huggingface.
|
||||
|
||||
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
|
||||
|
||||
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
|
||||
|
||||
Now you can use the model with a command like:
|
||||
|
||||
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
@@ -78,111 +79,101 @@ typedef struct {
|
||||
|
||||
struct TransformerWeights {
|
||||
// token embedding table
|
||||
float* token_embedding_table; // (vocab_size, dim)
|
||||
std::vector<float> token_embedding_table; // (vocab_size, dim)
|
||||
// weights for rmsnorms
|
||||
float* rms_att_weight; // (layer, dim) rmsnorm weights
|
||||
float* rms_ffn_weight; // (layer, dim)
|
||||
std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
|
||||
std::vector<float> rms_ffn_weight; // (layer, dim)
|
||||
// weights for matmuls
|
||||
float* wq; // (layer, dim, dim)
|
||||
float* wk; // (layer, dim, dim)
|
||||
float* wv; // (layer, dim, dim)
|
||||
float* wo; // (layer, dim, dim)
|
||||
std::vector<float> wq; // (layer, dim, dim)
|
||||
std::vector<float> wk; // (layer, dim, dim)
|
||||
std::vector<float> wv; // (layer, dim, dim)
|
||||
std::vector<float> wo; // (layer, dim, dim)
|
||||
// weights for ffn
|
||||
float* w1; // (layer, hidden_dim, dim)
|
||||
float* w2; // (layer, dim, hidden_dim)
|
||||
float* w3; // (layer, hidden_dim, dim)
|
||||
std::vector<float> w1; // (layer, hidden_dim, dim)
|
||||
std::vector<float> w2; // (layer, dim, hidden_dim)
|
||||
std::vector<float> w3; // (layer, hidden_dim, dim)
|
||||
// final rmsnorm
|
||||
float* rms_final_weight; // (dim,)
|
||||
std::vector<float> rms_final_weight; // (dim,)
|
||||
// freq_cis for RoPE relatively positional embeddings
|
||||
// float* freq_cis_real; // (seq_len, dim/2)
|
||||
// float* freq_cis_imag; // (seq_len, dim/2)
|
||||
// std::vector<float> freq_cis_real; // (seq_len, dim/2)
|
||||
// std::vector<float> freq_cis_imag; // (seq_len, dim/2)
|
||||
// (optional) classifier weights for the logits, on the last layer
|
||||
float* wcls;
|
||||
|
||||
~TransformerWeights() {
|
||||
delete[] token_embedding_table;
|
||||
delete[] rms_att_weight;
|
||||
delete[] rms_ffn_weight;
|
||||
delete[] wq;
|
||||
delete[] wk;
|
||||
delete[] wv;
|
||||
delete[] wo;
|
||||
delete[] w1;
|
||||
delete[] w2;
|
||||
delete[] w3;
|
||||
delete[] rms_final_weight;
|
||||
delete[] wcls;
|
||||
}
|
||||
std::vector<float> wcls;
|
||||
};
|
||||
|
||||
static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
// we calloc instead of malloc to keep valgrind happy
|
||||
w->token_embedding_table = new float[p->vocab_size * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
|
||||
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
|
||||
try {
|
||||
w->token_embedding_table.resize(p->vocab_size * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
|
||||
w->rms_att_weight = new float[p->n_layers * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
|
||||
w->rms_att_weight.resize(p->n_layers * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
|
||||
|
||||
w->rms_ffn_weight = new float[p->n_layers * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
|
||||
w->rms_ffn_weight.resize(p->n_layers * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
|
||||
|
||||
w->wq = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
w->wq.resize(p->n_layers * p->dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->wk = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
|
||||
w->wv = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
|
||||
w->wo = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
w->wo.resize(p->n_layers * p->dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->rms_final_weight = new float[p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
|
||||
w->rms_final_weight.resize(p->dim);
|
||||
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
|
||||
|
||||
if (shared_weights) {
|
||||
w->wcls = NULL;
|
||||
} else {
|
||||
w->wcls = new float[p->vocab_size * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
if (shared_weights) {
|
||||
w->wcls = {};
|
||||
} else {
|
||||
w->wcls.resize(p->vocab_size * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
}
|
||||
}
|
||||
catch (std::length_error &) {
|
||||
die("Invalid configuration. Failed to allocate memory for weights");
|
||||
}
|
||||
}
|
||||
|
||||
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
||||
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
||||
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
||||
if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
|
||||
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
|
||||
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
|
||||
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
|
||||
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
|
||||
if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
|
||||
if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
|
||||
if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
|
||||
if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
|
||||
if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
|
||||
if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
|
||||
if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
|
||||
if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
|
||||
if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
|
||||
if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
|
||||
if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;
|
||||
|
||||
// Skip freq_cis_real & freq_cis_imag
|
||||
int head_size = p->dim / p->n_heads;
|
||||
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
|
||||
|
||||
if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||
if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;
|
||||
|
||||
// Check we didn't forget to read anything
|
||||
auto curr = ftell(f);
|
||||
fseek(f, 0, SEEK_END);
|
||||
auto end = ftell(f);
|
||||
if (curr != end) {
|
||||
printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
|
||||
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -190,20 +181,20 @@ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bo
|
||||
}
|
||||
|
||||
static void print_sample_weights(TransformerWeights *w){
|
||||
printf("----- Quick print of first of the weight vales of all the variables\n");
|
||||
printf("%f\n", w->token_embedding_table[0]);
|
||||
printf("%f\n", w->rms_att_weight[0]);
|
||||
printf("%f\n", w->rms_ffn_weight[0]);
|
||||
LOG("----- Quick print of first of the weight vales of all the variables\n");
|
||||
LOG("%f\n", w->token_embedding_table[0]);
|
||||
LOG("%f\n", w->rms_att_weight[0]);
|
||||
LOG("%f\n", w->rms_ffn_weight[0]);
|
||||
|
||||
printf("%f\n", w->wq[0]);
|
||||
printf("%f\n", w->wk[0]);
|
||||
printf("%f\n", w->wv[0]);
|
||||
printf("%f\n", w->wo[0]);
|
||||
printf("%f\n", w->w1[0]);
|
||||
printf("%f\n", w->w2[0]);
|
||||
printf("%f\n", w->w3[0]);
|
||||
printf("%f\n", w->rms_att_weight[0]);
|
||||
if (w->wcls) printf("%f\n", w->wcls[0]);
|
||||
LOG("%f\n", w->wq[0]);
|
||||
LOG("%f\n", w->wk[0]);
|
||||
LOG("%f\n", w->wv[0]);
|
||||
LOG("%f\n", w->wo[0]);
|
||||
LOG("%f\n", w->w1[0]);
|
||||
LOG("%f\n", w->w2[0]);
|
||||
LOG("%f\n", w->w3[0]);
|
||||
LOG("%f\n", w->rms_att_weight[0]);
|
||||
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -225,14 +216,16 @@ struct llama_vocab {
|
||||
};
|
||||
|
||||
struct my_llama_hparams {
|
||||
uint32_t n_vocab = 32000;
|
||||
uint32_t n_ctx = 512; // this is provided as user input?
|
||||
uint32_t n_embd = 4096;
|
||||
uint32_t n_ff = 11008;
|
||||
uint32_t n_mult = 4;
|
||||
uint32_t n_head = 32;
|
||||
uint32_t n_layer = 32;
|
||||
uint32_t n_rot = 64;
|
||||
uint32_t n_vocab = 32000;
|
||||
uint32_t n_ctx = 512; // this is provided as user input?
|
||||
uint32_t n_embd = 4096;
|
||||
uint32_t n_ff = 11008;
|
||||
uint32_t n_mult = 4;
|
||||
uint32_t n_head = 32;
|
||||
uint32_t n_head_kv = 32;
|
||||
uint32_t n_layer = 32;
|
||||
uint32_t n_rot = 64;
|
||||
|
||||
bool operator!=(const my_llama_hparams& other) const {
|
||||
return memcmp(this, &other, sizeof(my_llama_hparams));
|
||||
}
|
||||
@@ -325,14 +318,30 @@ struct train_params {
|
||||
};
|
||||
|
||||
static void print_params(struct my_llama_hparams * params) {
|
||||
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||
printf("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||
printf("%s: n_mult: %u\n", __func__, params->n_mult);
|
||||
printf("%s: n_head: %u\n", __func__, params->n_head);
|
||||
printf("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||
printf("%s: n_layer: %u\n", __func__, params->n_layer);
|
||||
printf("%s: n_rot: %u\n", __func__, params->n_rot);
|
||||
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||
LOG("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||
LOG("%s: n_mult: %u\n", __func__, params->n_mult);
|
||||
LOG("%s: n_head: %u\n", __func__, params->n_head);
|
||||
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
|
||||
LOG("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||
LOG("%s: n_layer: %u\n", __func__, params->n_layer);
|
||||
LOG("%s: n_rot: %u\n", __func__, params->n_rot);
|
||||
}
|
||||
|
||||
static void print_tensor_info(const struct ggml_context * ctx) {
|
||||
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
LOG("%s: Allocating ", __func__);
|
||||
int64_t total = 1;
|
||||
int i = 0;
|
||||
for (; i < ggml_n_dims(t); ++i) {
|
||||
if (i > 0) LOG("x ");
|
||||
LOG("[%" PRId64 "] ", t->ne[i]);
|
||||
total *= t->ne[i];
|
||||
}
|
||||
if (i > 1) LOG("= [%" PRId64 "] ", total);
|
||||
LOG("float space for %s\n", ggml_get_name(t));
|
||||
}
|
||||
}
|
||||
|
||||
static void init_model(struct my_llama_model * model) {
|
||||
@@ -342,6 +351,8 @@ static void init_model(struct my_llama_model * model) {
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_vocab = hparams.n_vocab;
|
||||
|
||||
const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
struct ggml_context * ctx = model->ctx;
|
||||
|
||||
@@ -350,25 +361,8 @@ static void init_model(struct my_llama_model * model) {
|
||||
model->train_tokens = 0;
|
||||
|
||||
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||
printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
|
||||
|
||||
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
|
||||
|
||||
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
|
||||
|
||||
// printing the per-layer allocations here so we dont print in the for loop.
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
|
||||
printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
|
||||
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
|
||||
|
||||
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
|
||||
ggml_set_name(model->norm, "norm.weight");
|
||||
@@ -383,8 +377,8 @@ static void init_model(struct my_llama_model * model) {
|
||||
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
|
||||
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
|
||||
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||
|
||||
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
@@ -406,6 +400,8 @@ static void init_model(struct my_llama_model * model) {
|
||||
ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
|
||||
ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
|
||||
}
|
||||
|
||||
print_tensor_info(ctx);
|
||||
}
|
||||
|
||||
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
@@ -421,9 +417,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
static void print_row(struct ggml_tensor * probs, int i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
float p = get_f32_2d(probs, k, i);
|
||||
printf(" %f", p);
|
||||
LOG(" %f", p);
|
||||
}
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
static void print_matrix(struct ggml_tensor * probs) {
|
||||
@@ -431,33 +427,12 @@ static void print_matrix(struct ggml_tensor * probs) {
|
||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
float p = get_f32_2d(probs, k, i);
|
||||
printf(" %.2f", p);
|
||||
LOG(" %.2f", p);
|
||||
}
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
__attribute__((format(gnu_printf, 1, 2)))
|
||||
#else
|
||||
__attribute__((format(printf, 1, 2)))
|
||||
#endif
|
||||
#endif
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap, ap2;
|
||||
va_start(ap, fmt);
|
||||
va_copy(ap2, ap);
|
||||
int size = vsnprintf(NULL, 0, fmt, ap);
|
||||
GGML_ASSERT(size >= 0 && size < INT_MAX);
|
||||
std::vector<char> buf(size + 1);
|
||||
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
||||
GGML_ASSERT(size2 == size);
|
||||
va_end(ap2);
|
||||
va_end(ap);
|
||||
return std::string(buf.data(), size);
|
||||
}
|
||||
|
||||
struct llama_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
@@ -549,8 +524,9 @@ static std::string llama_escape_whitespaces(const std::string & text) {
|
||||
return out.str();
|
||||
}
|
||||
|
||||
static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
|
||||
if (is_ggml_file(filename)) {
|
||||
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
@@ -578,6 +554,9 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
|
||||
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
||||
|
||||
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
|
||||
if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
|
||||
die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
|
||||
}
|
||||
|
||||
vocab->id_to_token.resize(n_vocab);
|
||||
|
||||
@@ -595,7 +574,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
|
||||
gguf_free(ctx);
|
||||
} else {
|
||||
// assume llama2.c vocabulary
|
||||
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
|
||||
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
|
||||
llama_file file(filename, "rb");
|
||||
if (!file.fp) {
|
||||
die_fmt("%s: %s", strerror(errno), filename);
|
||||
@@ -638,38 +617,15 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
|
||||
}
|
||||
|
||||
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
||||
int ct;
|
||||
switch (ggml_n_dims(gg_weights)) {
|
||||
case 1:
|
||||
ct = 0;
|
||||
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
|
||||
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
|
||||
*ptr = karpathy_weights[ct];
|
||||
ct++;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
ct = 0;
|
||||
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
|
||||
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
|
||||
*ptr = karpathy_weights[ct];
|
||||
ct++;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
ct = 0;
|
||||
for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
|
||||
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
|
||||
*ptr = karpathy_weights[ct];
|
||||
ct++;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
int size = 1;
|
||||
for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
|
||||
size *= gg_weights->ne[dim];
|
||||
}
|
||||
for (int ct = 0; ct < size; ++ct) {
|
||||
int64_t i0 = 0; int64_t i1 = 0;
|
||||
int64_t i2 = 0; int64_t i3 = 0;
|
||||
ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
|
||||
ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -679,16 +635,18 @@ static void save_as_llama_model(
|
||||
// convert AK weights into GG weights one by one.
|
||||
// w->token_embedding_table -> model->tok_embeddings
|
||||
// float* -> struct ggml_tensor
|
||||
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
|
||||
convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
|
||||
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
|
||||
convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());
|
||||
|
||||
convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
|
||||
convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
|
||||
//print_row(model->norm, 0);
|
||||
|
||||
// for rms-att-weight
|
||||
int row_length = model->hparams.n_embd;
|
||||
int n_ff = model->hparams.n_ff;
|
||||
|
||||
const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;
|
||||
|
||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
|
||||
auto & layer = model->layers[i];
|
||||
// 1d
|
||||
@@ -697,9 +655,10 @@ static void save_as_llama_model(
|
||||
|
||||
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
|
||||
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
|
||||
// from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
|
||||
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]);
|
||||
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]);
|
||||
|
||||
convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
|
||||
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||
@@ -736,8 +695,8 @@ static void save_as_llama_model(
|
||||
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
|
||||
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
|
||||
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
|
||||
// n_head_kv is optional, default to n_head
|
||||
// gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
|
||||
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
|
||||
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
|
||||
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
|
||||
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
|
||||
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
|
||||
@@ -789,12 +748,12 @@ static void save_as_llama_model(
|
||||
|
||||
static struct train_params get_default_train_params() {
|
||||
struct train_params params;
|
||||
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
|
||||
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
|
||||
params.fn_llama2c_output_model = "ak_llama_model.bin";
|
||||
params.fn_train_data = "shakespeare.txt";
|
||||
params.fn_checkpoint_in = "checkpoint.bin";
|
||||
params.fn_checkpoint_out = "checkpoint.bin";
|
||||
params.fn_model_out = "ggml-checkpoint-f32.bin";
|
||||
params.fn_train_data = "shakespeare.txt";
|
||||
params.fn_checkpoint_in = "checkpoint.bin";
|
||||
params.fn_checkpoint_out = "checkpoint.bin";
|
||||
params.fn_model_out = "ggml-checkpoint-f32.bin";
|
||||
|
||||
params.seed = -1;
|
||||
|
||||
@@ -829,8 +788,8 @@ static struct train_params get_default_train_params() {
|
||||
params.adam_alpha = 1e-3f;
|
||||
params.adam_decay = 1e-3f;
|
||||
|
||||
params.mem_model_gb = 2;
|
||||
params.mem_compute_gb = 24;
|
||||
params.mem_model_gb = 2;
|
||||
params.mem_compute_gb = 24;
|
||||
params.mem_compute0_gb = 8;
|
||||
params.mem_compute1_gb = 2;
|
||||
|
||||
@@ -916,19 +875,30 @@ int main(int argc, char ** argv) {
|
||||
if (!params_parse(argc, argv, ¶ms)) {
|
||||
return 1;
|
||||
}
|
||||
log_set_target(stdout);
|
||||
Config config;
|
||||
TransformerWeights weights = {};
|
||||
{
|
||||
FILE *file = fopen(params.fn_llama2c_model, "rb");
|
||||
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
|
||||
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
|
||||
FILE *file = fopen(params.fn_llama2c_model, "r");
|
||||
if (!file) {
|
||||
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
|
||||
return 1;
|
||||
}
|
||||
// read in the config header
|
||||
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
|
||||
if (fread(&config, sizeof(Config), 1, file) != 1) {
|
||||
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
|
||||
return 1;
|
||||
}
|
||||
auto shared_weights = config.vocab_size > 0;
|
||||
config.vocab_size = abs(config.vocab_size);
|
||||
|
||||
// read in the Transformer weights
|
||||
malloc_weights(&weights, &config, shared_weights);
|
||||
if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
|
||||
alloc_weights(&weights, &config, shared_weights);
|
||||
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
|
||||
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
|
||||
return 1;
|
||||
}
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
@@ -936,15 +906,18 @@ int main(int argc, char ** argv) {
|
||||
load_vocab(params.fn_vocab_model, &config, &vocab);
|
||||
|
||||
struct my_llama_model model;
|
||||
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
|
||||
model.hparams.n_ctx = params.n_ctx;
|
||||
model.hparams.n_embd = config.dim; //params.n_embd;
|
||||
model.hparams.n_ff = config.hidden_dim;
|
||||
model.hparams.n_mult = 32;//params.n_mult;
|
||||
model.hparams.n_head = config.n_heads; //params.n_head;
|
||||
model.hparams.n_layer = config.n_layers; //params.n_layer;
|
||||
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
|
||||
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
|
||||
model.hparams.n_ctx = params.n_ctx;
|
||||
model.hparams.n_embd = config.dim; //params.n_embd;
|
||||
model.hparams.n_ff = config.hidden_dim;
|
||||
model.hparams.n_mult = 32;//params.n_mult;
|
||||
model.hparams.n_head = config.n_heads; //params.n_head;
|
||||
model.hparams.n_head_kv = config.n_kv_heads;
|
||||
model.hparams.n_layer = config.n_layers; //params.n_layer;
|
||||
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
|
||||
|
||||
print_params(&model.hparams);
|
||||
|
||||
struct ggml_init_params lcparams;
|
||||
lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
|
||||
lcparams.mem_buffer = NULL;
|
||||
@@ -956,7 +929,7 @@ int main(int argc, char ** argv) {
|
||||
model.name = basename(params.fn_llama2c_model);
|
||||
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
|
||||
|
||||
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
|
||||
LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
|
||||
|
||||
ggml_free(model.ctx);
|
||||
return 0;
|
||||
|
||||
@@ -1531,7 +1531,7 @@ int main(int argc, char ** argv) {
|
||||
lora.hparams.n_rank_output = n_rank_output;
|
||||
|
||||
// set opt params from command line
|
||||
opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
|
||||
opt->params.print_forward_graph = false;
|
||||
opt->params.print_backward_graph = false;
|
||||
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
|
||||
|
||||
@@ -447,8 +447,8 @@ int main(int argc, char ** argv) {
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
|
||||
@@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) {
|
||||
|
||||
static const char * split_mode_str(llama_split_mode mode) {
|
||||
switch (mode) {
|
||||
case LLAMA_SPLIT_NONE: return "none";
|
||||
case LLAMA_SPLIT_LAYER: return "layer";
|
||||
case LLAMA_SPLIT_ROW: return "row";
|
||||
case LLAMA_SPLIT_MODE_NONE: return "none";
|
||||
case LLAMA_SPLIT_MODE_LAYER: return "layer";
|
||||
case LLAMA_SPLIT_MODE_ROW: return "row";
|
||||
default: GGML_ASSERT(!"invalid split mode");
|
||||
}
|
||||
}
|
||||
@@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {get_num_physical_cores()},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* split_mode */ {LLAMA_SPLIT_LAYER},
|
||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||
/* main_gpu */ {0},
|
||||
/* no_kv_offload */ {false},
|
||||
/* mul_mat_q */ {true},
|
||||
@@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
for (const auto & m : p) {
|
||||
llama_split_mode mode;
|
||||
if (m == "none") {
|
||||
mode = LLAMA_SPLIT_NONE;
|
||||
mode = LLAMA_SPLIT_MODE_NONE;
|
||||
} else if (m == "layer") {
|
||||
mode = LLAMA_SPLIT_LAYER;
|
||||
mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
} else if (m == "row") {
|
||||
mode = LLAMA_SPLIT_ROW;
|
||||
mode = LLAMA_SPLIT_MODE_ROW;
|
||||
} else {
|
||||
invalid_param = true;
|
||||
break;
|
||||
|
||||
@@ -21,12 +21,8 @@ android {
|
||||
useSupportLibrary = true
|
||||
}
|
||||
ndk {
|
||||
// Workaround for https://github.com/llvm/llvm-project/issues/65820
|
||||
// affecting armeabi-v7a. Skip armeabi-v7a when invoked with
|
||||
// -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
|
||||
if (project.hasProperty("skip-armeabi-v7a")) {
|
||||
abiFilters += listOf("arm64-v8a", "x86_64", "x86")
|
||||
}
|
||||
// Add NDK properties if wanted, e.g.
|
||||
// abiFilters += listOf("arm64-v8a")
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
|
||||
@@ -152,7 +152,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
|
||||
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
|
||||
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
|
||||
if (newline_tmp->backend != GGML_BACKEND_CPU) {
|
||||
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
|
||||
if (newline_tmp->buffer == NULL) {
|
||||
printf("newline_tmp tensor buffer is NULL\n");
|
||||
}
|
||||
|
||||
@@ -548,8 +548,8 @@ int main(int argc, char ** argv) {
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
@@ -576,9 +576,9 @@ int main(int argc, char ** argv) {
|
||||
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
|
||||
n_past -= bd;
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
|
||||
const int n_batch = ctx_params.n_batch;
|
||||
const int n_batch_grp = ctx_params.n_batch/n_grp;
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
|
||||
|
||||
// print the prompt token-by-token
|
||||
|
||||
@@ -146,10 +146,11 @@ int main(int argc, char ** argv) {
|
||||
const int ib = i/n_batch - 1;
|
||||
const int bd = n_batch_grp*(n_grp - 1);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
|
||||
llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= bd;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
@@ -179,10 +180,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_defrag (ctx);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= n_discard;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
@@ -208,10 +211,12 @@ int main(int argc, char ** argv) {
|
||||
if (n_discard > 0) {
|
||||
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_defrag (ctx);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= n_discard;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -27,6 +27,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
|
||||
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
|
||||
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
|
||||
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
|
||||
|
||||
@@ -1,8 +1,20 @@
|
||||
# llama.cpp/example/server
|
||||
# LLaMA.cpp HTTP Server
|
||||
|
||||
This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.
|
||||
Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/yhirose/cpp-httplib), [nlohmann::json](https://github.com/nlohmann/json) and **llama.cpp**.
|
||||
|
||||
Command line options:
|
||||
Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
|
||||
**Features:**
|
||||
* LLM inference of F16 and quantum models on GPU and CPU
|
||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
||||
* Parallel decoding with multi-user support
|
||||
* Continuous batching
|
||||
* Multimodal (wip)
|
||||
* Monitoring endpoints
|
||||
|
||||
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
|
||||
|
||||
**Command line options:**
|
||||
|
||||
- `--threads N`, `-t N`: Set the number of threads to use during generation.
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
|
||||
@@ -39,9 +51,12 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
|
||||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
||||
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
|
||||
- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1)
|
||||
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
|
||||
- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
|
||||
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||
- `--log-disable`: Output logs to stdout only, default: enabled.
|
||||
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)
|
||||
|
||||
## Build
|
||||
|
||||
@@ -457,6 +472,18 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
]
|
||||
```
|
||||
|
||||
- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
|
||||
|
||||
Available metrics:
|
||||
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
|
||||
- `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
|
||||
- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
|
||||
- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
|
||||
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
|
||||
- `llamacpp:kv_cache_tokens`: KV-cache tokens.
|
||||
- `llamacpp:requests_processing`: Number of request processing.
|
||||
- `llamacpp:requests_deferred`: Number of request deferred.
|
||||
|
||||
## More examples
|
||||
|
||||
### Change system prompt on runtime
|
||||
|
||||
+352
-72
@@ -43,9 +43,11 @@ struct server_params
|
||||
int32_t read_timeout = 600;
|
||||
int32_t write_timeout = 600;
|
||||
bool slots_endpoint = true;
|
||||
bool metrics_endpoint = false;
|
||||
};
|
||||
|
||||
bool server_verbose = false;
|
||||
bool server_log_json = true;
|
||||
|
||||
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
|
||||
{
|
||||
@@ -301,12 +303,76 @@ struct llama_client_slot
|
||||
}
|
||||
|
||||
void print_timings() const {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
|
||||
LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
|
||||
LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
|
||||
char buffer[512];
|
||||
double t_token = t_prompt_processing / num_prompt_tokens_processed;
|
||||
double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
|
||||
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
|
||||
t_prompt_processing, num_prompt_tokens_processed,
|
||||
t_token, n_tokens_second);
|
||||
LOG_INFO(buffer, {
|
||||
{"slot_id", id},
|
||||
{"task_id", task_id},
|
||||
{"t_prompt_processing", t_prompt_processing},
|
||||
{"num_prompt_tokens_processed", num_prompt_tokens_processed},
|
||||
{"t_token", t_token},
|
||||
{"n_tokens_second", n_tokens_second},
|
||||
});
|
||||
|
||||
t_token = t_token_generation / n_decoded;
|
||||
n_tokens_second = 1e3 / t_token_generation * n_decoded;
|
||||
sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
|
||||
t_token_generation, n_decoded,
|
||||
t_token, n_tokens_second);
|
||||
LOG_INFO(buffer, {
|
||||
{"slot_id", id},
|
||||
{"task_id", task_id},
|
||||
{"t_token_generation", t_token_generation},
|
||||
{"n_decoded", n_decoded},
|
||||
{"t_token", t_token},
|
||||
{"n_tokens_second", n_tokens_second},
|
||||
});
|
||||
|
||||
sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
|
||||
LOG_INFO(buffer, {
|
||||
{"slot_id", id},
|
||||
{"task_id", task_id},
|
||||
{"t_prompt_processing", t_prompt_processing},
|
||||
{"t_token_generation", t_token_generation},
|
||||
{"t_total", t_prompt_processing + t_token_generation},
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_metrics {
|
||||
uint64_t n_prompt_tokens_processed_total = 0;
|
||||
uint64_t n_tokens_predicted_total = 0;
|
||||
|
||||
uint64_t n_prompt_tokens_processed = 0;
|
||||
uint64_t t_prompt_processing = 0;
|
||||
|
||||
uint64_t n_tokens_predicted = 0;
|
||||
uint64_t t_tokens_generation = 0;
|
||||
|
||||
|
||||
void on_prompt_eval(const llama_client_slot &slot) {
|
||||
n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
|
||||
|
||||
n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
|
||||
t_prompt_processing += slot.t_prompt_processing;
|
||||
}
|
||||
|
||||
void on_prediction(const llama_client_slot &slot) {
|
||||
n_tokens_predicted_total += slot.n_decoded;
|
||||
|
||||
n_tokens_predicted += slot.n_decoded;
|
||||
t_tokens_generation += slot.t_token_generation;
|
||||
}
|
||||
|
||||
void reset_bucket() {
|
||||
n_prompt_tokens_processed = 0;
|
||||
t_prompt_processing = 0;
|
||||
n_tokens_predicted = 0;
|
||||
t_tokens_generation = 0;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -344,6 +410,8 @@ struct llama_server_context
|
||||
llama_server_queue queue_tasks;
|
||||
llama_server_response queue_results;
|
||||
|
||||
llama_metrics metrics;
|
||||
|
||||
~llama_server_context()
|
||||
{
|
||||
if (ctx)
|
||||
@@ -363,7 +431,7 @@ struct llama_server_context
|
||||
params = params_;
|
||||
if (!params.mmproj.empty()) {
|
||||
multimodal = true;
|
||||
LOG_TEE("Multi Modal Mode Enabled");
|
||||
LOG_INFO("Multi Modal Mode Enabled", {});
|
||||
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
|
||||
if(clp_ctx == nullptr) {
|
||||
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
|
||||
@@ -416,7 +484,7 @@ struct llama_server_context
|
||||
|
||||
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
|
||||
|
||||
LOG_TEE("Available slots:\n");
|
||||
LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
|
||||
for (int i = 0; i < params.n_parallel; i++)
|
||||
{
|
||||
llama_client_slot slot;
|
||||
@@ -425,7 +493,10 @@ struct llama_server_context
|
||||
slot.n_ctx = n_ctx_slot;
|
||||
slot.n_predict = params.n_predict;
|
||||
|
||||
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
|
||||
LOG_INFO("new slot", {
|
||||
{"slot_id", slot.id},
|
||||
{"n_ctx_slot", slot.n_ctx}
|
||||
});
|
||||
|
||||
const int ga_n = params.grp_attn_n;
|
||||
const int ga_w = params.grp_attn_w;
|
||||
@@ -435,7 +506,12 @@ struct llama_server_context
|
||||
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
|
||||
LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
|
||||
|
||||
LOG_INFO("slot self-extend", {
|
||||
{"slot_id", slot.id},
|
||||
{"ga_n", ga_n},
|
||||
{"ga_w", ga_w}
|
||||
});
|
||||
}
|
||||
|
||||
slot.ga_i = 0;
|
||||
@@ -729,10 +805,16 @@ struct llama_server_context
|
||||
img_sl.img_data = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
||||
{
|
||||
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
|
||||
LOG_ERROR("failed to load image", {
|
||||
{"slot_id", slot->id},
|
||||
{"img_sl_id", img_sl.id}
|
||||
});
|
||||
return false;
|
||||
}
|
||||
LOG_TEE("slot %i - loaded image\n", slot->id);
|
||||
LOG_VERBOSE("image loaded", {
|
||||
{"slot_id", slot->id},
|
||||
{"img_sl_id", img_sl.id}
|
||||
});
|
||||
img_sl.request_encode_image = true;
|
||||
slot->images.push_back(img_sl);
|
||||
}
|
||||
@@ -792,7 +874,10 @@ struct llama_server_context
|
||||
|
||||
all_slots_are_idle = false;
|
||||
|
||||
LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
|
||||
LOG_INFO("slot is processing task", {
|
||||
{"slot_id", slot->id},
|
||||
{"task_id", slot->task_id},
|
||||
});
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -817,10 +902,24 @@ struct llama_server_context
|
||||
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0)
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
|
||||
{
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
if (llama_decode(ctx, batch_view) != 0)
|
||||
{
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// assign the system KV cache to all parallel sequences
|
||||
@@ -1355,7 +1454,7 @@ struct llama_server_context
|
||||
if (slot == nullptr)
|
||||
{
|
||||
// if no slot is available, we defer this task for processing later
|
||||
LOG_VERBOSE("no slot is available", {});
|
||||
LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
|
||||
queue_tasks.defer(task);
|
||||
break;
|
||||
}
|
||||
@@ -1404,7 +1503,7 @@ struct llama_server_context
|
||||
case TASK_TYPE_NEXT_RESPONSE: {
|
||||
// do nothing
|
||||
} break;
|
||||
case TASK_TYPE_SLOTS_DATA: {
|
||||
case TASK_TYPE_METRICS: {
|
||||
json slots_data = json::array();
|
||||
int n_idle_slots = 0;
|
||||
int n_processing_slots = 0;
|
||||
@@ -1431,17 +1530,41 @@ struct llama_server_context
|
||||
}
|
||||
slots_data.push_back(slot_data);
|
||||
}
|
||||
LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
|
||||
LOG_INFO("slot data", {
|
||||
{"task_id", task.id},
|
||||
{"n_idle_slots", n_idle_slots},
|
||||
{"n_processing_slots", n_processing_slots}
|
||||
});
|
||||
LOG_VERBOSE("slot data", {
|
||||
{"task_id", task.id},
|
||||
{"n_idle_slots", n_idle_slots},
|
||||
{"n_processing_slots", n_processing_slots},
|
||||
{"slots", slots_data}
|
||||
});
|
||||
task_result res;
|
||||
res.id = task.id;
|
||||
res.multitask_id = task.multitask_id;
|
||||
res.stop = true;
|
||||
res.error = false;
|
||||
res.result_json = {
|
||||
{ "idle", n_idle_slots },
|
||||
{ "processing", n_processing_slots },
|
||||
{ "slots", slots_data }
|
||||
{ "idle", n_idle_slots },
|
||||
{ "processing", n_processing_slots },
|
||||
{ "deferred", queue_tasks.queue_tasks_deferred.size() },
|
||||
|
||||
{ "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
|
||||
{ "n_tokens_predicted_total", metrics.n_tokens_predicted_total},
|
||||
|
||||
{ "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed},
|
||||
{ "t_prompt_processing", metrics.t_prompt_processing},
|
||||
{ "n_tokens_predicted", metrics.n_tokens_predicted},
|
||||
{ "t_tokens_generation", metrics.t_tokens_generation},
|
||||
|
||||
{ "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
|
||||
{ "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
|
||||
|
||||
{ "slots", slots_data },
|
||||
};
|
||||
metrics.reset_bucket();
|
||||
queue_results.send(res);
|
||||
} break;
|
||||
}
|
||||
@@ -1469,7 +1592,7 @@ struct llama_server_context
|
||||
bool update_slots() {
|
||||
if (system_need_update)
|
||||
{
|
||||
LOG_TEE("updating system prompt\n");
|
||||
LOG_INFO("updating system prompt", {});
|
||||
update_system_prompt();
|
||||
}
|
||||
|
||||
@@ -1479,12 +1602,13 @@ struct llama_server_context
|
||||
{
|
||||
if (system_prompt.empty() && clean_kv_cache)
|
||||
{
|
||||
LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
|
||||
LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
|
||||
kv_cache_clear();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
LOG_VERBOSE("posting NEXT_RESPONSE", {});
|
||||
task_server task;
|
||||
task.type = TASK_TYPE_NEXT_RESPONSE;
|
||||
task.target_id = -1;
|
||||
@@ -1498,12 +1622,22 @@ struct llama_server_context
|
||||
{
|
||||
// Shift context
|
||||
const int n_keep = slot.params.n_keep + add_bos_token;
|
||||
const int n_left = system_tokens.size() + slot.n_past - n_keep;
|
||||
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
|
||||
const int n_discard = n_left / 2;
|
||||
|
||||
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
LOG_INFO("slot context shift", {
|
||||
{"slot_id", slot.id},
|
||||
{"task_id", slot.task_id},
|
||||
{"n_keep", n_keep},
|
||||
{"n_left", n_left},
|
||||
{"n_discard", n_discard},
|
||||
{"n_ctx", n_ctx},
|
||||
{"n_past", slot.n_past},
|
||||
{"n_system_tokens", system_tokens.size()},
|
||||
{"n_cache_tokens", slot.cache_tokens.size()}
|
||||
});
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
|
||||
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
|
||||
{
|
||||
@@ -1515,17 +1649,12 @@ struct llama_server_context
|
||||
slot.n_past -= n_discard;
|
||||
|
||||
slot.truncated = true;
|
||||
|
||||
LOG_VERBOSE("context shift", {
|
||||
{ "n_ctx", n_ctx },
|
||||
{ "n_keep", n_keep },
|
||||
{ "n_left", n_left },
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// decode any currently ongoing sequences
|
||||
LOG_VERBOSE("decoding ongoing sequences", {});
|
||||
for (auto & slot : slots)
|
||||
{
|
||||
// release the slot
|
||||
@@ -1535,7 +1664,15 @@ struct llama_server_context
|
||||
slot.command = NONE;
|
||||
slot.t_last_used = ggml_time_us();
|
||||
|
||||
LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
||||
LOG_INFO("slot released", {
|
||||
{"slot_id", slot.id},
|
||||
{"task_id", slot.task_id},
|
||||
{"n_ctx", n_ctx},
|
||||
{"n_past", slot.n_past},
|
||||
{"n_system_tokens", system_tokens.size()},
|
||||
{"n_cache_tokens", slot.cache_tokens.size()},
|
||||
{"truncated", slot.truncated}
|
||||
});
|
||||
queue_tasks.notify_slot_changed();
|
||||
|
||||
continue;
|
||||
@@ -1662,6 +1799,14 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
||||
|
||||
// the last token of the cache is not in the KV cache until the next call to llama_decode
|
||||
// (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
|
||||
if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
|
||||
{
|
||||
slot.n_past -= 1;
|
||||
}
|
||||
|
||||
slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
|
||||
|
||||
if (slot.ga_n != 1)
|
||||
@@ -1683,7 +1828,12 @@ struct llama_server_context
|
||||
slot.ga_i = ga_i;
|
||||
}
|
||||
|
||||
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
|
||||
LOG_INFO("slot progression", {
|
||||
{ "slot_id", slot.id },
|
||||
{ "task_id", slot.task_id },
|
||||
{ "n_past", slot.n_past },
|
||||
{ "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
|
||||
});
|
||||
}
|
||||
|
||||
slot.cache_tokens = prompt_tokens;
|
||||
@@ -1691,7 +1841,10 @@ struct llama_server_context
|
||||
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
|
||||
{
|
||||
// we have to evaluate at least 1 token to generate logits.
|
||||
LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
|
||||
LOG_INFO("we have to evaluate at least 1 token to generate logits", {
|
||||
{ "slot_id", slot.id },
|
||||
{ "task_id", slot.task_id }
|
||||
});
|
||||
slot.n_past--;
|
||||
if (slot.ga_i > 0)
|
||||
{
|
||||
@@ -1699,9 +1852,13 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
|
||||
int p0 = (int) system_tokens.size() + slot.n_past;
|
||||
LOG_INFO("kv cache rm [p0, end)", {
|
||||
{ "slot_id", slot.id },
|
||||
{ "task_id", slot.task_id },
|
||||
{ "p0", p0 }
|
||||
});
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
|
||||
|
||||
LOG_VERBOSE("prompt ingested", {
|
||||
{"n_past", slot.n_past},
|
||||
@@ -1736,7 +1893,13 @@ struct llama_server_context
|
||||
|
||||
if (has_images && !ingest_images(slot, n_batch))
|
||||
{
|
||||
LOG_TEE("failed processing images\n");
|
||||
LOG_ERROR("failed processing images", {
|
||||
"slot_id", slot.id,
|
||||
"task_id", slot.task_id,
|
||||
});
|
||||
// FIXME @phymbert: to be properly tested
|
||||
// early returning without changing the slot state will block the slot for ever
|
||||
// no one at the moment is checking the return value
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1778,9 +1941,9 @@ struct llama_server_context
|
||||
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
||||
|
||||
slot.n_past_se -= bd;
|
||||
|
||||
@@ -1836,7 +1999,7 @@ struct llama_server_context
|
||||
send_embedding(slot);
|
||||
slot.release();
|
||||
slot.i_batch = -1;
|
||||
return true;
|
||||
continue;
|
||||
}
|
||||
|
||||
completion_token_output result;
|
||||
@@ -1849,6 +2012,7 @@ struct llama_server_context
|
||||
{
|
||||
slot.t_start_genereration = ggml_time_us();
|
||||
slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
|
||||
metrics.on_prompt_eval(slot);
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
|
||||
@@ -1871,11 +2035,14 @@ struct llama_server_context
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
metrics.on_prediction(slot);
|
||||
}
|
||||
|
||||
slot.i_batch = -1;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_VERBOSE("slots updated", {});
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1953,8 +2120,10 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" -ctv TYPE, --cache-type-v TYPE\n");
|
||||
printf(" KV cache data type for V (default: f16)\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||
printf(" --log-format log output format: json or text (default: json)\n");
|
||||
printf(" --log-disable disables logging to a file.\n");
|
||||
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
|
||||
printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
|
||||
printf("\n");
|
||||
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
@@ -2086,9 +2255,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
break;
|
||||
}
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
|
||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
|
||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
else if (arg == "--rope-freq-base")
|
||||
@@ -2212,15 +2381,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
std::string arg_next = argv[i];
|
||||
if (arg_next == "none")
|
||||
{
|
||||
params.split_mode = LLAMA_SPLIT_NONE;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
||||
}
|
||||
else if (arg_next == "layer")
|
||||
{
|
||||
params.split_mode = LLAMA_SPLIT_LAYER;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
}
|
||||
else if (arg_next == "row")
|
||||
{
|
||||
params.split_mode = LLAMA_SPLIT_ROW;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
||||
}
|
||||
else {
|
||||
invalid_param = true;
|
||||
@@ -2405,6 +2574,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
params.mmproj = argv[i];
|
||||
}
|
||||
else if (arg == "--log-format")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if (std::strcmp(argv[i], "json") == 0)
|
||||
{
|
||||
server_log_json = true;
|
||||
}
|
||||
else if (std::strcmp(argv[i], "text") == 0)
|
||||
{
|
||||
server_log_json = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (arg == "--log-disable")
|
||||
{
|
||||
log_set_target(stdout);
|
||||
@@ -2414,6 +2604,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
{
|
||||
sparams.slots_endpoint = false;
|
||||
}
|
||||
else if (arg == "--metrics")
|
||||
{
|
||||
sparams.metrics_endpoint = true;
|
||||
}
|
||||
else if (arg == "--chat-template")
|
||||
{
|
||||
if (++i >= argc)
|
||||
@@ -2447,15 +2641,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_INT;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.int_value = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.float_value = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.bool_value = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
@@ -2514,32 +2708,40 @@ static json format_partial_response(
|
||||
|
||||
static json format_tokenizer_response(const std::vector<llama_token> &tokens)
|
||||
{
|
||||
return json{
|
||||
{"tokens", tokens}};
|
||||
return json {
|
||||
{"tokens", tokens}
|
||||
};
|
||||
}
|
||||
|
||||
static json format_detokenized_response(std::string content)
|
||||
{
|
||||
return json{
|
||||
{"content", content}};
|
||||
return json {
|
||||
{"content", content}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
static void log_server_request(const httplib::Request &req, const httplib::Response &res)
|
||||
{
|
||||
// skip GH copilot requests when using default port
|
||||
if (req.path == "/v1/health" || req.path == "/v1/completions")
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
LOG_INFO("request", {
|
||||
{"remote_addr", req.remote_addr},
|
||||
{"remote_port", req.remote_port},
|
||||
{"status", res.status},
|
||||
{"method", req.method},
|
||||
{"path", req.path},
|
||||
{"params", req.params},
|
||||
});
|
||||
{"remote_addr", req.remote_addr},
|
||||
{"remote_port", req.remote_port},
|
||||
{"status", res.status},
|
||||
{"method", req.method},
|
||||
{"path", req.path},
|
||||
{"params", req.params},
|
||||
});
|
||||
|
||||
LOG_VERBOSE("request", {
|
||||
{"request", req.body},
|
||||
{"response", res.body},
|
||||
});
|
||||
{"request", req.body},
|
||||
{"response", res.body},
|
||||
});
|
||||
}
|
||||
|
||||
struct token_translator
|
||||
@@ -2621,7 +2823,7 @@ int main(int argc, char **argv)
|
||||
// request slots data using task queue
|
||||
task_server task;
|
||||
task.id = llama.queue_tasks.get_new_id();
|
||||
task.type = TASK_TYPE_SLOTS_DATA;
|
||||
task.type = TASK_TYPE_METRICS;
|
||||
task.target_id = -1;
|
||||
|
||||
llama.queue_results.add_waiting_task_id(task.id);
|
||||
@@ -2668,7 +2870,7 @@ int main(int argc, char **argv)
|
||||
// request slots data using task queue
|
||||
task_server task;
|
||||
task.id = llama.queue_tasks.get_new_id();
|
||||
task.type = TASK_TYPE_SLOTS_DATA;
|
||||
task.type = TASK_TYPE_METRICS;
|
||||
task.target_id = -1;
|
||||
|
||||
llama.queue_results.add_waiting_task_id(task.id);
|
||||
@@ -2683,6 +2885,87 @@ int main(int argc, char **argv)
|
||||
});
|
||||
}
|
||||
|
||||
if (sparams.metrics_endpoint) {
|
||||
svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) {
|
||||
// request slots data using task queue
|
||||
task_server task;
|
||||
task.id = llama.queue_tasks.get_new_id();
|
||||
task.type = TASK_TYPE_METRICS;
|
||||
task.target_id = -1;
|
||||
|
||||
llama.queue_results.add_waiting_task_id(task.id);
|
||||
llama.queue_tasks.post(task);
|
||||
|
||||
// get the result
|
||||
task_result result = llama.queue_results.recv(task.id);
|
||||
llama.queue_results.remove_waiting_task_id(task.id);
|
||||
|
||||
json data = result.result_json;
|
||||
|
||||
uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
|
||||
uint64_t t_prompt_processing = data["t_prompt_processing"];
|
||||
|
||||
uint64_t n_tokens_predicted = data["n_tokens_predicted"];
|
||||
uint64_t t_tokens_generation = data["t_tokens_generation"];
|
||||
|
||||
int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
|
||||
|
||||
// metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
|
||||
json all_metrics_def = json {
|
||||
{"counter", {{
|
||||
{"name", "prompt_tokens_total"},
|
||||
{"help", "Number of prompt tokens processed."},
|
||||
{"value", data["n_prompt_tokens_processed_total"]}
|
||||
}, {
|
||||
{"name", "tokens_predicted_total"},
|
||||
{"help", "Number of generation tokens processed."},
|
||||
{"value", data["n_tokens_predicted_total"]}
|
||||
}}},
|
||||
{"gauge", {{
|
||||
{"name", "prompt_tokens_seconds"},
|
||||
{"help", "Average prompt throughput in tokens/s."},
|
||||
{"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
|
||||
},{
|
||||
{"name", "predicted_tokens_seconds"},
|
||||
{"help", "Average generation throughput in tokens/s."},
|
||||
{"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
|
||||
},{
|
||||
{"name", "kv_cache_usage_ratio"},
|
||||
{"help", "KV-cache usage. 1 means 100 percent usage."},
|
||||
{"value", 1. * kv_cache_used_cells / params.n_ctx}
|
||||
},{
|
||||
{"name", "kv_cache_tokens"},
|
||||
{"help", "KV-cache tokens."},
|
||||
{"value", data["kv_cache_tokens_count"]}
|
||||
},{
|
||||
{"name", "requests_processing"},
|
||||
{"help", "Number of request processing."},
|
||||
{"value", data["processing"]}
|
||||
},{
|
||||
{"name", "requests_deferred"},
|
||||
{"help", "Number of request deferred."},
|
||||
{"value", data["deferred"]}
|
||||
}}}
|
||||
};
|
||||
|
||||
std::stringstream prometheus;
|
||||
for (const auto& el : all_metrics_def.items()) {
|
||||
const auto& type = el.key();
|
||||
const auto& metrics_def = el.value();
|
||||
for (const auto& metric_def : metrics_def) {
|
||||
std::string name = metric_def["name"];
|
||||
std::string help = metric_def["help"];
|
||||
prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
|
||||
<< "# TYPE llamacpp:" << name << " " << type << "\n"
|
||||
<< "llamacpp:" << name << " " << metric_def["value"] << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
res.set_content(prometheus.str(), "text/plain; version=0.0.4");
|
||||
res.status = 200; // HTTP OK
|
||||
});
|
||||
}
|
||||
|
||||
svr.set_logger(log_server_request);
|
||||
|
||||
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
|
||||
@@ -2735,9 +3018,6 @@ int main(int argc, char **argv)
|
||||
// Set the base directory for serving static files
|
||||
svr.set_base_dir(sparams.public_path);
|
||||
|
||||
// to make it ctrl+clickable:
|
||||
LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
|
||||
|
||||
std::unordered_map<std::string, std::string> log_data;
|
||||
log_data["hostname"] = sparams.hostname;
|
||||
log_data["port"] = std::to_string(sparams.port);
|
||||
|
||||
@@ -32,6 +32,7 @@ It's possible to override some scenario steps values with environment variables:
|
||||
- `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080`
|
||||
- `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server`
|
||||
- `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose`
|
||||
- `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format
|
||||
|
||||
### Run @bug, @wip or @wrong_usage annotated scenario
|
||||
|
||||
|
||||
@@ -16,6 +16,8 @@ def before_scenario(context, scenario):
|
||||
|
||||
|
||||
def after_scenario(context, scenario):
|
||||
if context.server_process is None:
|
||||
return
|
||||
if scenario.status == "failed":
|
||||
if 'GITHUB_ACTIONS' in os.environ:
|
||||
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
|
||||
|
||||
@@ -1,36 +1,4 @@
|
||||
# List of ongoing issues
|
||||
@bug
|
||||
Feature: Issues
|
||||
# Issue #5655
|
||||
Scenario: Multi users embeddings
|
||||
Given a server listening on localhost:8080
|
||||
And a model file stories260K.gguf
|
||||
And a model alias tinyllama-2
|
||||
And 42 as server seed
|
||||
And 64 KV cache size
|
||||
And 2 slots
|
||||
And continuous batching
|
||||
And embeddings extraction
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another very long music lyrics.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long poem.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long joke.
|
||||
"""
|
||||
Given concurrent embedding requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all embeddings are generated
|
||||
# No confirmed issue at the moment
|
||||
|
||||
@@ -8,6 +8,7 @@ Feature: Parallel
|
||||
And 42 as server seed
|
||||
And 64 KV cache size
|
||||
And 2 slots
|
||||
And embeddings extraction
|
||||
And continuous batching
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
@@ -75,3 +76,48 @@ Feature: Parallel
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all prompts are predicted
|
||||
|
||||
Scenario: Multi users embeddings
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another very long music lyrics.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long poem.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long joke.
|
||||
"""
|
||||
Given concurrent embedding requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all embeddings are generated
|
||||
|
||||
Scenario: Multi users OAI compatibility embeddings
|
||||
Given a prompt:
|
||||
"""
|
||||
In which country Paris is located ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Is Madrid the capital of Spain ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
What is the biggest US city ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
What is the capital of Bulgaria ?
|
||||
"""
|
||||
And a model tinyllama-2
|
||||
Given concurrent OAI embedding requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all embeddings are generated
|
||||
|
||||
@@ -13,6 +13,7 @@ Feature: llama.cpp server
|
||||
And 1 slots
|
||||
And embeddings extraction
|
||||
And 32 server max tokens to predict
|
||||
And prometheus compatible metrics exposed
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
@@ -25,11 +26,12 @@ Feature: llama.cpp server
|
||||
And <n_predict> max tokens to predict
|
||||
And a completion request with no api error
|
||||
Then <n_predicted> tokens are predicted matching <re_content>
|
||||
And prometheus metrics are exposed
|
||||
|
||||
Examples: Prompts
|
||||
| prompt | n_predict | re_content | n_predicted |
|
||||
| I believe the meaning of life is | 8 | read | 8 |
|
||||
| Write a joke about AI | 64 | (park<or>friends<or>scared)+ | 32 |
|
||||
| prompt | n_predict | re_content | n_predicted |
|
||||
| I believe the meaning of life is | 8 | (read<or>going)+ | 8 |
|
||||
| Write a joke about AI | 64 | (park<or>friends<or>scared<or>always)+ | 32 |
|
||||
|
||||
Scenario Outline: OAI Compatibility
|
||||
Given a model <model>
|
||||
@@ -60,6 +62,19 @@ Feature: llama.cpp server
|
||||
"""
|
||||
Then embeddings are generated
|
||||
|
||||
Scenario: OAI Embeddings compatibility with multiple inputs
|
||||
Given a model tinyllama-2
|
||||
Given a prompt:
|
||||
"""
|
||||
In which country Paris is located ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Is Madrid the capital of Spain ?
|
||||
"""
|
||||
When an OAI compatible embeddings computation request for multiple inputs
|
||||
Then embeddings are generated
|
||||
|
||||
|
||||
Scenario: Tokenize / Detokenize
|
||||
When tokenizing:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
import collections
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -12,6 +13,7 @@ import aiohttp
|
||||
import openai
|
||||
from behave import step
|
||||
from behave.api.async_step import async_run_until_complete
|
||||
from prometheus_client import parser
|
||||
|
||||
|
||||
@step(u"a server listening on {server_fqdn}:{server_port}")
|
||||
@@ -33,6 +35,8 @@ def step_server_config(context, server_fqdn, server_port):
|
||||
context.server_api_key = None
|
||||
context.server_continuous_batching = False
|
||||
context.server_embeddings = False
|
||||
context.server_metrics = False
|
||||
context.server_process = None
|
||||
context.server_seed = None
|
||||
context.user_api_key = None
|
||||
|
||||
@@ -81,6 +85,11 @@ def step_server_embeddings(context):
|
||||
context.server_embeddings = True
|
||||
|
||||
|
||||
@step(u'prometheus compatible metrics exposed')
|
||||
def step_server_metrics(context):
|
||||
context.server_metrics = True
|
||||
|
||||
|
||||
@step(u"the server is starting")
|
||||
def step_start_server(context):
|
||||
start_server_background(context)
|
||||
@@ -261,35 +270,35 @@ def step_a_prompt_prompt(context, prompt):
|
||||
@step(u'concurrent completion requests')
|
||||
@async_run_until_complete()
|
||||
async def step_concurrent_completion_requests(context):
|
||||
await concurrent_completion_requests(context,
|
||||
request_completion,
|
||||
# prompt is inserted automatically
|
||||
context.base_url,
|
||||
debug=context.debug,
|
||||
n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
|
||||
server_seed=context.server_seed if hasattr(context, 'server_seed') else None,
|
||||
user_api_key=context.user_api_key if hasattr(context,
|
||||
'user_api_key') else None)
|
||||
await concurrent_requests(context,
|
||||
request_completion,
|
||||
# prompt is inserted automatically
|
||||
context.base_url,
|
||||
debug=context.debug,
|
||||
n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
|
||||
server_seed=context.server_seed if hasattr(context, 'server_seed') else None,
|
||||
user_api_key=context.user_api_key if hasattr(context,
|
||||
'user_api_key') else None)
|
||||
|
||||
|
||||
@step(u'concurrent OAI completions requests')
|
||||
@async_run_until_complete
|
||||
async def step_oai_chat_completions(context):
|
||||
await concurrent_completion_requests(context, oai_chat_completions,
|
||||
# user_prompt is inserted automatically
|
||||
context.system_prompt,
|
||||
context.base_url,
|
||||
True, # async_client
|
||||
model=context.model
|
||||
if hasattr(context, 'model') else None,
|
||||
n_predict=context.n_predict
|
||||
if hasattr(context, 'n_predict') else None,
|
||||
enable_streaming=context.enable_streaming
|
||||
if hasattr(context, 'enable_streaming') else None,
|
||||
server_seed=context.server_seed
|
||||
if hasattr(context, 'server_seed') else None,
|
||||
user_api_key=context.user_api_key
|
||||
if hasattr(context, 'user_api_key') else None)
|
||||
await concurrent_requests(context, oai_chat_completions,
|
||||
# user_prompt is inserted automatically
|
||||
context.system_prompt,
|
||||
context.base_url,
|
||||
True, # async_client
|
||||
model=context.model
|
||||
if hasattr(context, 'model') else None,
|
||||
n_predict=context.n_predict
|
||||
if hasattr(context, 'n_predict') else None,
|
||||
enable_streaming=context.enable_streaming
|
||||
if hasattr(context, 'enable_streaming') else None,
|
||||
server_seed=context.server_seed
|
||||
if hasattr(context, 'server_seed') else None,
|
||||
user_api_key=context.user_api_key
|
||||
if hasattr(context, 'user_api_key') else None)
|
||||
|
||||
|
||||
@step(u'all prompts are predicted')
|
||||
@@ -316,36 +325,58 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
|
||||
@step(u'embeddings are computed for')
|
||||
@async_run_until_complete
|
||||
async def step_compute_embedding(context):
|
||||
content = context.text
|
||||
base_url = context.base_url
|
||||
context.embeddings = await request_embedding(content, base_url)
|
||||
context.embeddings = await request_embedding(context.text, base_url=context.base_url)
|
||||
|
||||
|
||||
@step(u'embeddings are generated')
|
||||
def step_assert_embeddings(context):
|
||||
assert_embeddings(context.embeddings)
|
||||
if len(context.prompts) == 0:
|
||||
assert_embeddings(context.embeddings)
|
||||
else:
|
||||
assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
|
||||
f"context.prompts={context.prompts}\n"
|
||||
f"context.embeddings={context.embeddings}")
|
||||
for embedding in context.embeddings:
|
||||
context.prompts.pop()
|
||||
assert_embeddings(embedding)
|
||||
|
||||
|
||||
@step(u'an OAI compatible embeddings computation request for')
|
||||
def step_oai_compute_embedding(context):
|
||||
openai.api_key = 'nope' # openai client always expects an api_keu
|
||||
if context.user_api_key is not None:
|
||||
openai.api_key = context.user_api_key
|
||||
openai.api_base = f'{context.base_url}/v1'
|
||||
embeddings = openai.Embedding.create(
|
||||
model=context.model,
|
||||
input=context.text,
|
||||
)
|
||||
context.embeddings = embeddings
|
||||
@async_run_until_complete
|
||||
async def step_oai_compute_embeddings(context):
|
||||
context.embeddings = await request_oai_embeddings(context.text,
|
||||
base_url=context.base_url,
|
||||
user_api_key=context.user_api_key,
|
||||
model=context.model)
|
||||
|
||||
|
||||
@step(u'an OAI compatible embeddings computation request for multiple inputs')
|
||||
@async_run_until_complete
|
||||
async def step_oai_compute_embeddings_multiple_inputs(context):
|
||||
context.embeddings = await request_oai_embeddings(context.prompts,
|
||||
base_url=context.base_url,
|
||||
user_api_key=context.user_api_key,
|
||||
model=context.model)
|
||||
|
||||
|
||||
@step(u'concurrent embedding requests')
|
||||
@async_run_until_complete()
|
||||
async def step_concurrent_embedding_requests(context):
|
||||
await concurrent_completion_requests(context,
|
||||
request_embedding,
|
||||
# prompt is inserted automatically
|
||||
context.base_url)
|
||||
await concurrent_requests(context,
|
||||
request_embedding,
|
||||
# prompt is inserted automatically
|
||||
base_url=context.base_url)
|
||||
|
||||
|
||||
@step(u'concurrent OAI embedding requests')
|
||||
@async_run_until_complete()
|
||||
async def step_concurrent_oai_embedding_requests(context):
|
||||
await concurrent_requests(context,
|
||||
request_oai_embeddings,
|
||||
# prompt is inserted automatically
|
||||
base_url=context.base_url,
|
||||
async_client=True,
|
||||
model=context.model)
|
||||
|
||||
|
||||
@step(u'all embeddings are generated')
|
||||
@@ -401,7 +432,24 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
|
||||
assert context.options_response.headers[cors_header] == cors_header_value
|
||||
|
||||
|
||||
async def concurrent_completion_requests(context, f_completion, *args, **kwargs):
|
||||
@step(u'prometheus metrics are exposed')
|
||||
@async_run_until_complete
|
||||
async def step_prometheus_metrics_exported(context):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with await session.get(f'{context.base_url}/metrics') as metrics_response:
|
||||
assert metrics_response.status == 200
|
||||
assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
|
||||
metrics_raw = await metrics_response.text()
|
||||
metric_exported = False
|
||||
for metric in parser.text_string_to_metric_families(metrics_raw):
|
||||
match metric.name:
|
||||
case "llamacpp:kv_cache_usage_ratio":
|
||||
assert len(metric.samples) > 0
|
||||
metric_exported = True
|
||||
assert metric_exported, "No metrics exported"
|
||||
|
||||
|
||||
async def concurrent_requests(context, f_completion, *args, **kwargs):
|
||||
n_prompts = len(context.prompts)
|
||||
if context.debug:
|
||||
print(f"starting {n_prompts} concurrent completion requests...")
|
||||
@@ -565,7 +613,7 @@ async def oai_chat_completions(user_prompt,
|
||||
return completion_response
|
||||
|
||||
|
||||
async def request_embedding(content, base_url):
|
||||
async def request_embedding(content, base_url=None):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(f'{base_url}/embedding',
|
||||
json={
|
||||
@@ -576,6 +624,46 @@ async def request_embedding(content, base_url):
|
||||
return response_json['embedding']
|
||||
|
||||
|
||||
async def request_oai_embeddings(input,
|
||||
base_url=None, user_api_key=None,
|
||||
model=None, async_client=False):
|
||||
# openai client always expects an api_key
|
||||
user_api_key = user_api_key if user_api_key is not None else 'nope'
|
||||
if async_client:
|
||||
origin = 'llama.cpp'
|
||||
if user_api_key is not None:
|
||||
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(f'{base_url}/v1/embeddings',
|
||||
json={
|
||||
"input": input,
|
||||
"model": model,
|
||||
},
|
||||
headers=headers) as response:
|
||||
assert response.status == 200, f"received status code not expected: {response.status}"
|
||||
assert response.headers['Access-Control-Allow-Origin'] == origin
|
||||
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
|
||||
response_json = await response.json()
|
||||
assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
|
||||
assert response_json['object'] == 'list'
|
||||
return response_json['data']
|
||||
else:
|
||||
openai.api_key = user_api_key
|
||||
openai.api_base = f'{base_url}/v1'
|
||||
oai_embeddings = openai.Embedding.create(
|
||||
model=model,
|
||||
input=input,
|
||||
)
|
||||
|
||||
if isinstance(input, collections.abc.Sequence):
|
||||
embeddings = []
|
||||
for an_oai_embeddings in oai_embeddings.data:
|
||||
embeddings.append(an_oai_embeddings.embedding)
|
||||
else:
|
||||
embeddings = oai_embeddings.data.embedding
|
||||
return embeddings
|
||||
|
||||
|
||||
def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
|
||||
content = completion_response['content']
|
||||
n_predicted = completion_response['timings']['predicted_n']
|
||||
@@ -611,6 +699,8 @@ async def wait_for_health_status(context,
|
||||
if context.debug:
|
||||
print(f"Starting checking for health for expected_health_status={expected_health_status}")
|
||||
timeout = 3 # seconds
|
||||
if expected_health_status == 'ok':
|
||||
timeout = 10 # CI slow inference
|
||||
interval = 0.5
|
||||
counter = 0
|
||||
async with aiohttp.ClientSession() as session:
|
||||
@@ -648,7 +738,7 @@ async def wait_for_health_status(context,
|
||||
if n_completions > 0:
|
||||
return
|
||||
|
||||
assert False, 'timeout exceeded'
|
||||
assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
|
||||
|
||||
|
||||
def assert_embeddings(embeddings):
|
||||
@@ -690,6 +780,8 @@ def start_server_background(context):
|
||||
server_args.append('--cont-batching')
|
||||
if context.server_embeddings:
|
||||
server_args.append('--embedding')
|
||||
if context.server_metrics:
|
||||
server_args.append('--metrics')
|
||||
if context.model_alias is not None:
|
||||
server_args.extend(['--alias', context.model_alias])
|
||||
if context.n_ctx is not None:
|
||||
@@ -702,6 +794,8 @@ def start_server_background(context):
|
||||
server_args.extend(['--api-key', context.server_api_key])
|
||||
if context.debug:
|
||||
server_args.append('--verbose')
|
||||
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
|
||||
server_args.extend(['--log-format', "text"])
|
||||
print(f"starting server with: {context.server_path}", *server_args)
|
||||
context.server_process = subprocess.Popen(
|
||||
[str(arg) for arg in [context.server_path, *server_args]],
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
aiohttp~=3.9.3
|
||||
behave~=1.2.6
|
||||
openai~=0.25.0
|
||||
prometheus-client~=0.20.0
|
||||
|
||||
+55
-29
@@ -14,6 +14,7 @@
|
||||
using json = nlohmann::json;
|
||||
|
||||
extern bool server_verbose;
|
||||
extern bool server_log_json;
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
@@ -27,14 +28,14 @@ extern bool server_verbose;
|
||||
{ \
|
||||
if (server_verbose) \
|
||||
{ \
|
||||
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
|
||||
server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
|
||||
//
|
||||
// parallel
|
||||
@@ -50,7 +51,7 @@ enum task_type {
|
||||
TASK_TYPE_COMPLETION,
|
||||
TASK_TYPE_CANCEL,
|
||||
TASK_TYPE_NEXT_RESPONSE,
|
||||
TASK_TYPE_SLOTS_DATA
|
||||
TASK_TYPE_METRICS
|
||||
};
|
||||
|
||||
struct task_server {
|
||||
@@ -133,26 +134,48 @@ struct completion_token_output
|
||||
std::string text_to_send;
|
||||
};
|
||||
|
||||
static inline void server_log(const char *level, const char *function, int line,
|
||||
const char *message, const nlohmann::ordered_json &extra)
|
||||
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra)
|
||||
{
|
||||
nlohmann::ordered_json log
|
||||
{
|
||||
std::stringstream ss_tid;
|
||||
ss_tid << std::this_thread::get_id();
|
||||
json log = nlohmann::ordered_json{
|
||||
{"tid", ss_tid.str()},
|
||||
{"timestamp", time(nullptr)},
|
||||
{"level", level},
|
||||
{"function", function},
|
||||
{"line", line},
|
||||
{"message", message},
|
||||
};
|
||||
|
||||
if (!extra.empty())
|
||||
{
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
if (server_log_json) {
|
||||
log.merge_patch(
|
||||
{
|
||||
{"level", level},
|
||||
{"function", function},
|
||||
{"line", line},
|
||||
{"msg", message},
|
||||
});
|
||||
if (!extra.empty()) {
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
|
||||
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
printf("%.*s\n", (int)str.size(), str.data());
|
||||
fflush(stdout);
|
||||
std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
|
||||
} else {
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
|
||||
|
||||
if (!extra.empty()) {
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
std::stringstream ss;
|
||||
ss << buf << " |";
|
||||
for (const auto& el : log.items())
|
||||
{
|
||||
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str());
|
||||
ss << buf;
|
||||
}
|
||||
|
||||
const std::string str = ss.str();
|
||||
printf("%.*s\n", (int)str.size(), str.data());
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
@@ -234,6 +257,7 @@ struct llama_server_queue {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (task.id == -1) {
|
||||
task.id = id++;
|
||||
LOG_VERBOSE("new task id", {{"new_id", task.id}});
|
||||
}
|
||||
queue_tasks.push_back(std::move(task));
|
||||
condition_tasks.notify_one();
|
||||
@@ -249,7 +273,9 @@ struct llama_server_queue {
|
||||
// Get the next id for creating anew task
|
||||
int get_new_id() {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
return id++;
|
||||
int new_id = id++;
|
||||
LOG_VERBOSE("new task id", {{"new_id", new_id}});
|
||||
return new_id;
|
||||
}
|
||||
|
||||
// Register function to process a new task
|
||||
@@ -290,8 +316,7 @@ struct llama_server_queue {
|
||||
void start_loop() {
|
||||
running = true;
|
||||
while (true) {
|
||||
// new task arrived
|
||||
LOG_VERBOSE("have new task", {});
|
||||
LOG_VERBOSE("new task may arrive", {});
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
@@ -303,7 +328,7 @@ struct llama_server_queue {
|
||||
task_server task = queue_tasks.front();
|
||||
queue_tasks.erase(queue_tasks.begin());
|
||||
lock.unlock();
|
||||
LOG_VERBOSE("callback_new_task", {});
|
||||
LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
|
||||
callback_new_task(task);
|
||||
}
|
||||
LOG_VERBOSE("callback_all_task_finished", {});
|
||||
@@ -384,11 +409,13 @@ struct llama_server_response {
|
||||
std::condition_variable condition_results;
|
||||
|
||||
void add_waiting_task_id(int task_id) {
|
||||
LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.insert(task_id);
|
||||
}
|
||||
|
||||
void remove_waiting_task_id(int task_id) {
|
||||
LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.erase(task_id);
|
||||
}
|
||||
@@ -401,7 +428,6 @@ struct llama_server_response {
|
||||
condition_results.wait(lock, [&]{
|
||||
return !queue_results.empty();
|
||||
});
|
||||
LOG_VERBOSE("condition_results unblock", {});
|
||||
|
||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||
{
|
||||
@@ -426,22 +452,22 @@ struct llama_server_response {
|
||||
// Send a new result to a waiting task_id
|
||||
void send(task_result result) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
LOG_VERBOSE("send new result", {});
|
||||
LOG_VERBOSE("send new result", {{"task_id", result.id}});
|
||||
for (auto& task_id : waiting_task_ids) {
|
||||
// LOG_TEE("waiting task id %i \n", task_id);
|
||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
||||
if (result.multitask_id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("callback_update_multitask", {});
|
||||
LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
|
||||
callback_update_multitask(task_id, result.id, result);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (result.id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("queue_results.push_back", {});
|
||||
LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
|
||||
queue_results.push_back(result);
|
||||
condition_results.notify_one();
|
||||
condition_results.notify_all();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -960,7 +960,7 @@ int main(int argc, char ** argv) {
|
||||
struct ggml_opt_context * opt = train->opt;
|
||||
|
||||
// set opt params from command line
|
||||
opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
|
||||
opt->params.print_forward_graph = false;
|
||||
opt->params.print_backward_graph = false;
|
||||
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
|
||||
|
||||
Generated
+3
-3
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1708118438,
|
||||
"narHash": "sha256-kk9/0nuVgA220FcqH/D2xaN6uGyHp/zoxPNUmPCMmEE=",
|
||||
"lastModified": 1708655239,
|
||||
"narHash": "sha256-ZrP/yACUvDB+zbqYJsln4iwotbH6CTZiTkANJ0AgDv4=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "5863c27340ba4de8f83e7e3c023b9599c3cb3c80",
|
||||
"rev": "cbc4211f0afffe6dfd2478a62615dd5175a13f9a",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
+239
-70
@@ -172,6 +172,7 @@
|
||||
#endif
|
||||
|
||||
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||
@@ -196,6 +197,18 @@ static __device__ __forceinline__ int __vsub4(const int a, const int b) {
|
||||
return __vsubss4(a, b);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
@@ -518,6 +531,17 @@ typedef struct {
|
||||
} block_iq3_xxs;
|
||||
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
||||
|
||||
#define QR3_XS 8
|
||||
#define QI3_XS (QK_K / (4*QR3_XS))
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK_K/4];
|
||||
uint8_t qh[QK_K/32];
|
||||
uint8_t signs[QK_K/8];
|
||||
uint8_t scales[QK_K/64];
|
||||
} block_iq3_s;
|
||||
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 27*(QK_K/64), "wrong iq3_s block size/padding");
|
||||
|
||||
#define QR1_S 8
|
||||
#define QI1_S (QK_K / (4*QR1_S))
|
||||
typedef struct {
|
||||
@@ -1700,6 +1724,74 @@ static const __device__ uint32_t iq3xxs_grid[256] = {
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
static const __device__ uint32_t iq3xs_grid[512] = {
|
||||
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
|
||||
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
|
||||
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
|
||||
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
|
||||
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
|
||||
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
|
||||
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
|
||||
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
|
||||
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
|
||||
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
|
||||
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
|
||||
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
|
||||
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
|
||||
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
|
||||
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
|
||||
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
|
||||
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
|
||||
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
|
||||
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
|
||||
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
|
||||
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
|
||||
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
|
||||
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
|
||||
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
|
||||
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
|
||||
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
|
||||
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
|
||||
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
|
||||
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
|
||||
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
|
||||
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
|
||||
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
|
||||
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
|
||||
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
|
||||
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
|
||||
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
|
||||
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
|
||||
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
|
||||
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
|
||||
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
|
||||
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
|
||||
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
|
||||
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
|
||||
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
|
||||
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
|
||||
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
|
||||
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
|
||||
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
|
||||
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
|
||||
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
|
||||
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
|
||||
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
|
||||
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
|
||||
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
|
||||
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
|
||||
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
|
||||
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
|
||||
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
|
||||
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
|
||||
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
|
||||
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
|
||||
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
|
||||
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
|
||||
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
|
||||
};
|
||||
|
||||
|
||||
static const __device__ uint64_t iq1s_grid[512] = {
|
||||
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
||||
0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
|
||||
@@ -1973,6 +2065,32 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const block_iq3_s * x = (const block_iq3_s *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * qs = x[i].qs + 8*ib;
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
|
||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
|
||||
const uint8_t signs = x[i].signs[4*ib + il];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
@@ -4717,6 +4835,41 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
||||
#endif
|
||||
}
|
||||
|
||||
// TODO: don't use lookup table for signs
|
||||
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
#if QK_K == 256
|
||||
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
||||
|
||||
const int ib32 = iqs;
|
||||
const uint8_t * qs = bq2->qs + 8*ib32;
|
||||
const int8_t * q8 = bq8_1[ib32].qs;
|
||||
int sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
|
||||
const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
|
||||
uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
|
||||
uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
|
||||
const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
|
||||
const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
|
||||
sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
|
||||
sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
|
||||
q8 += 8;
|
||||
}
|
||||
const float d = (float)bq2->d * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f;
|
||||
return d * sumi;
|
||||
#else
|
||||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
#else
|
||||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if QK_K == 256
|
||||
@@ -6216,11 +6369,11 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
|
||||
int ixj = col ^ j;
|
||||
if (ixj > col) {
|
||||
if ((col & k) == 0) {
|
||||
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
|
||||
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
|
||||
swap(dst_row[col], dst_row[ixj]);
|
||||
}
|
||||
} else {
|
||||
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
|
||||
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
|
||||
swap(dst_row[col], dst_row[ixj]);
|
||||
}
|
||||
}
|
||||
@@ -6849,6 +7002,12 @@ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k,
|
||||
dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
@@ -6904,6 +7063,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
return dequantize_row_iq1_s_cuda;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_cuda;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
default:
|
||||
@@ -6943,6 +7104,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
return dequantize_row_iq1_s_cuda;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_cuda;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half>;
|
||||
default:
|
||||
@@ -7764,10 +7927,10 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
|
||||
|
||||
const dim3 block_dims(ncols, 1, 1);
|
||||
const dim3 block_nums(1, nrows, 1);
|
||||
if (order == GGML_SORT_ASC) {
|
||||
k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
} else if (order == GGML_SORT_DESC) {
|
||||
k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
} else if (order == GGML_SORT_ORDER_DESC) {
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -8199,11 +8362,11 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
||||
|
||||
cudaMemcpyKind kind;
|
||||
char * src_ptr;
|
||||
if (src->backend == GGML_BACKEND_CPU) {
|
||||
if (src->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
kind = cudaMemcpyHostToDevice;
|
||||
src_ptr = (char *) src->data;
|
||||
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
|
||||
} else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||
GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
|
||||
kind = cudaMemcpyDeviceToDevice;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
int id;
|
||||
@@ -8608,7 +8771,7 @@ static void ggml_cuda_op_mul_mat_q(
|
||||
|
||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||
// nrows_dst == nrows of the matrix that the kernel writes into
|
||||
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
||||
const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
@@ -8688,6 +8851,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
@@ -8713,6 +8877,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 64;
|
||||
@@ -8755,7 +8920,7 @@ static void ggml_cuda_op_mul_mat_vec_q(
|
||||
|
||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||
// nrows_dst == nrows of the matrix that the kernel writes into
|
||||
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
||||
const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
@@ -8818,6 +8983,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
|
||||
mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
|
||||
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
|
||||
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
@@ -8927,7 +9096,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
|
||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||
// ldc == nrows of the matrix that cuBLAS writes into
|
||||
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
||||
int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
|
||||
|
||||
const int compute_capability = g_device_caps[id].cc;
|
||||
|
||||
@@ -9275,7 +9444,7 @@ static void ggml_cuda_op_soft_max(
|
||||
const bool use_src2 = src2 != nullptr;
|
||||
|
||||
if (use_src2) {
|
||||
const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
|
||||
const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
if (src2_on_device) {
|
||||
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
|
||||
@@ -9333,16 +9502,16 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
||||
const bool use_src1 = src1 != nullptr;
|
||||
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
|
||||
|
||||
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
// dd = data device
|
||||
float * src0_ddf = nullptr;
|
||||
@@ -9386,7 +9555,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
|
||||
}
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
}
|
||||
@@ -9467,8 +9636,8 @@ static void ggml_cuda_op_mul_mat(
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
|
||||
|
||||
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
||||
@@ -9484,20 +9653,20 @@ static void ggml_cuda_op_mul_mat(
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
||||
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
||||
|
||||
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
|
||||
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
GGML_ASSERT(!(split && ne02 > 1));
|
||||
GGML_ASSERT(!(split && ne03 > 1));
|
||||
GGML_ASSERT(!(split && ne02 < ne12));
|
||||
|
||||
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
||||
if (split) {
|
||||
// TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
|
||||
// TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check
|
||||
// GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
||||
tensor_split = buft_ctx->tensor_split;
|
||||
@@ -9555,8 +9724,8 @@ static void ggml_cuda_op_mul_mat(
|
||||
|
||||
used_devices++;
|
||||
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
|
||||
|
||||
ggml_cuda_set_device(id);
|
||||
cudaStream_t stream = g_cudaStreams[id][0];
|
||||
@@ -9607,8 +9776,8 @@ static void ggml_cuda_op_mul_mat(
|
||||
continue;
|
||||
}
|
||||
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
|
||||
const int64_t row_diff = dev[id].row_high - dev[id].row_low;
|
||||
|
||||
ggml_cuda_set_device(id);
|
||||
@@ -9633,12 +9802,12 @@ static void ggml_cuda_op_mul_mat(
|
||||
|
||||
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
||||
// in that case an offset on dst_ddf_i is needed
|
||||
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
|
||||
dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
|
||||
}
|
||||
|
||||
// copy src0, src1 to device if necessary
|
||||
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
||||
if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
|
||||
if (id != g_main_device) {
|
||||
if (convert_src1_to_q8_1) {
|
||||
char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
|
||||
@@ -9651,14 +9820,14 @@ static void ggml_cuda_op_mul_mat(
|
||||
src1_ncols*ne10*sizeof(float), stream));
|
||||
}
|
||||
}
|
||||
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
||||
} else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
|
||||
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
|
||||
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
|
||||
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
|
||||
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
@@ -9676,10 +9845,10 @@ static void ggml_cuda_op_mul_mat(
|
||||
if (!dst_on_device) {
|
||||
void * dst_off_device;
|
||||
cudaMemcpyKind kind;
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
dst_off_device = dst->data;
|
||||
kind = cudaMemcpyDeviceToHost;
|
||||
} else if (dst->backend == GGML_BACKEND_GPU) {
|
||||
} else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
dst_off_device = dst_extra->data_device[g_main_device];
|
||||
kind = cudaMemcpyDeviceToDevice;
|
||||
} else {
|
||||
@@ -9744,7 +9913,7 @@ static void ggml_cuda_op_mul_mat(
|
||||
}
|
||||
}
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
@@ -9850,7 +10019,7 @@ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const stru
|
||||
|
||||
static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
||||
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
||||
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
@@ -9881,7 +10050,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_is_permuted(src0));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -9940,7 +10109,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
@@ -10086,11 +10255,11 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
|
||||
|
||||
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool all_on_device =
|
||||
(src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
||||
(src1->backend == GGML_BACKEND_GPU) &&
|
||||
( dst->backend == GGML_BACKEND_GPU);
|
||||
(src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
|
||||
(src1->backend == GGML_BACKEND_TYPE_GPU) &&
|
||||
( dst->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
|
||||
int64_t min_compute_capability = INT_MAX;
|
||||
|
||||
@@ -10240,7 +10409,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
|
||||
GGML_ASSERT(!ggml_is_transposed(src00));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
|
||||
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
|
||||
@@ -10384,7 +10553,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
||||
|
||||
cudaStream_t stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
if (ids->backend == GGML_BACKEND_GPU) {
|
||||
if (ids->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
|
||||
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
@@ -10401,20 +10570,20 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
||||
ggml_tensor src1_row = *src1;
|
||||
ggml_tensor dst_row = *dst;
|
||||
|
||||
src1_row.backend = GGML_BACKEND_GPU;
|
||||
dst_row.backend = GGML_BACKEND_GPU;
|
||||
src1_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||
dst_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
src1_row.extra = &src1_row_extra;
|
||||
dst_row.extra = &dst_row_extra;
|
||||
|
||||
char * src1_original = src1->backend == GGML_BACKEND_CPU ?
|
||||
char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
|
||||
(char *) src1->data : (char *) src1_extra->data_device[g_main_device];
|
||||
char * dst_original = dst->backend == GGML_BACKEND_CPU ?
|
||||
char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
|
||||
(char *) dst->data : (char *) dst_extra->data_device[g_main_device];
|
||||
|
||||
if (src1->ne[1] == 1) {
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||
//int32_t row_id;
|
||||
@@ -10442,9 +10611,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
||||
src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
|
||||
dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
|
||||
|
||||
const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
|
||||
const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ?
|
||||
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
|
||||
const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
|
||||
const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ?
|
||||
cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
|
||||
|
||||
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
|
||||
@@ -10499,7 +10668,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
||||
}
|
||||
}
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
}
|
||||
@@ -10516,8 +10685,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
||||
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
||||
@@ -10648,9 +10817,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
|
||||
if (!g_cublas_loaded) return false;
|
||||
|
||||
ggml_cuda_func_t func;
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
|
||||
return false;
|
||||
@@ -10797,14 +10966,14 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
|
||||
return false;
|
||||
}
|
||||
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||
ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
|
||||
}
|
||||
|
||||
if (params->ith != 0) {
|
||||
return true;
|
||||
}
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return true;
|
||||
}
|
||||
func(tensor->src[0], tensor->src[1], tensor);
|
||||
@@ -10903,7 +11072,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
|
||||
|
||||
extra->data_device[ctx->device] = tensor->data;
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
||||
tensor->extra = extra;
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
@@ -10918,7 +11087,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
|
||||
@@ -10929,7 +11098,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
|
||||
@@ -11164,7 +11333,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
||||
}
|
||||
}
|
||||
tensor->backend = GGML_BACKEND_GPU_SPLIT;
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
tensor->extra = extra;
|
||||
}
|
||||
|
||||
@@ -11436,7 +11605,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
||||
}
|
||||
@@ -11445,7 +11614,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
||||
}
|
||||
@@ -11475,7 +11644,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
|
||||
ggml_cuda_set_main_device(cuda_ctx->device);
|
||||
|
||||
ggml_compute_params params = {};
|
||||
params.type = GGML_TASK_COMPUTE;
|
||||
params.type = GGML_TASK_TYPE_COMPUTE;
|
||||
params.ith = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
@@ -11485,13 +11654,13 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
|
||||
assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
||||
assert(node->extra != nullptr);
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
if (node->src[j] != nullptr) {
|
||||
assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
|
||||
assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
|
||||
assert(node->src[j]->extra != nullptr);
|
||||
}
|
||||
@@ -11541,7 +11710,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
}
|
||||
ggml_type a_type = a->type;
|
||||
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
|
||||
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) {
|
||||
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S) {
|
||||
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
+31
-6
@@ -61,6 +61,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
|
||||
@@ -85,6 +86,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
|
||||
@@ -105,6 +107,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
|
||||
@@ -122,6 +125,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
|
||||
@@ -139,6 +143,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F32,
|
||||
@@ -452,6 +457,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
|
||||
@@ -476,6 +482,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
|
||||
@@ -496,6 +503,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
|
||||
@@ -513,6 +521,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
|
||||
@@ -530,6 +539,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
|
||||
@@ -1347,6 +1357,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
||||
@@ -1483,6 +1494,12 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
@@ -1537,8 +1554,8 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ3_XXS) {
|
||||
const int mem_size = 256*4+128;
|
||||
else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
|
||||
const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
@@ -1640,6 +1657,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
||||
@@ -1779,6 +1797,12 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
@@ -1849,8 +1873,8 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_IQ3_XXS) {
|
||||
const int mem_size = 256*4+128;
|
||||
else if (src2t == GGML_TYPE_IQ3_XXS || src2t == GGML_TYPE_IQ3_S) {
|
||||
const int mem_size = src2t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
@@ -1900,6 +1924,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
|
||||
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
|
||||
@@ -2237,8 +2262,8 @@ static bool ggml_metal_graph_compute(
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
switch (order) {
|
||||
case GGML_SORT_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
|
||||
case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
|
||||
case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
|
||||
case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
|
||||
default: GGML_ASSERT(false);
|
||||
};
|
||||
|
||||
|
||||
@@ -2525,6 +2525,20 @@ typedef struct {
|
||||
} block_iq3_xxs;
|
||||
// 98 bytes / block for QK_K = 256, so 3.0625 bpw
|
||||
|
||||
// 3.4375 bpw
|
||||
#if QK_K == 64
|
||||
#define IQ3S_N_SCALE 2
|
||||
#else
|
||||
#define IQ3S_N_SCALE QK_K/64
|
||||
#endif
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK_K/4];
|
||||
uint8_t qh[QK_K/32];
|
||||
uint8_t signs[QK_K/8];
|
||||
uint8_t scales[IQ3S_N_SCALE];
|
||||
} block_iq3_s;
|
||||
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK_K/8];
|
||||
@@ -3795,6 +3809,73 @@ constexpr constant static uint32_t iq3xxs_grid[256] = {
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
constexpr constant static uint32_t iq3xs_grid[512] = {
|
||||
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
|
||||
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
|
||||
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
|
||||
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
|
||||
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
|
||||
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
|
||||
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
|
||||
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
|
||||
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
|
||||
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
|
||||
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
|
||||
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
|
||||
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
|
||||
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
|
||||
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
|
||||
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
|
||||
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
|
||||
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
|
||||
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
|
||||
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
|
||||
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
|
||||
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
|
||||
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
|
||||
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
|
||||
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
|
||||
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
|
||||
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
|
||||
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
|
||||
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
|
||||
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
|
||||
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
|
||||
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
|
||||
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
|
||||
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
|
||||
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
|
||||
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
|
||||
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
|
||||
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
|
||||
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
|
||||
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
|
||||
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
|
||||
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
|
||||
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
|
||||
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
|
||||
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
|
||||
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
|
||||
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
|
||||
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
|
||||
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
|
||||
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
|
||||
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
|
||||
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
|
||||
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
|
||||
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
|
||||
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
|
||||
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
|
||||
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
|
||||
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
|
||||
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
|
||||
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
|
||||
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
|
||||
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
|
||||
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
|
||||
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
|
||||
};
|
||||
|
||||
#define NGRID_IQ1S 512
|
||||
constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = {
|
||||
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
||||
@@ -4361,6 +4442,136 @@ kernel void kernel_mul_mv_iq3_xxs_f32(
|
||||
kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
void kernel_mul_mv_iq3_s_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
||||
const int ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_iq3_s * x = (device const block_iq3_s *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float yl[32];
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
|
||||
const int nb32 = nb * (QK_K / 32);
|
||||
|
||||
threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
|
||||
{
|
||||
int nval = 8;
|
||||
int pos = (32*sgitg + tiisg)*nval;
|
||||
for (int i = 0; i < nval; ++i) values[pos + i] = iq3xs_grid[pos + i];
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
const int ix = tiisg;
|
||||
|
||||
device const float * y4 = y + 32 * ix;
|
||||
|
||||
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
||||
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
yl[i] = y4[i];
|
||||
}
|
||||
|
||||
const int ibl = ib32 / (QK_K / 32);
|
||||
const int ib = ib32 % (QK_K / 32);
|
||||
|
||||
device const block_iq3_s * xr = x + ibl;
|
||||
device const uint8_t * qs = xr->qs + 8 * ib;
|
||||
device const uint8_t * qh = xr->qh + ib;
|
||||
device const uint8_t * sc = xr->scales + (ib/2);
|
||||
device const uint8_t * signs = xr->signs + 4 * ib;
|
||||
device const half * dh = &xr->d;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
const float db = dh[0];
|
||||
const float d = db * (0.5f + ((sc[0] >> 4*(ib%2)) & 0xf));
|
||||
|
||||
float2 sum = {0};
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
|
||||
const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]);
|
||||
sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]);
|
||||
}
|
||||
}
|
||||
sumf[row] += d * (sum[0] + sum[1]);
|
||||
|
||||
dh += nb*sizeof(block_iq3_s)/2;
|
||||
qs += nb*sizeof(block_iq3_s);
|
||||
qh += nb*sizeof(block_iq3_s);
|
||||
sc += nb*sizeof(block_iq3_s);
|
||||
signs += nb*sizeof(block_iq3_s);
|
||||
}
|
||||
|
||||
y4 += 32 * 32;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_iq3_s_f32")]]
|
||||
kernel void kernel_mul_mv_iq3_s_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
void kernel_mul_mv_iq1_s_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -4952,6 +5163,31 @@ void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * qs = xb->qs + 8*ib32;
|
||||
device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
|
||||
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
||||
const float dl = d * (0.5f + ((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * 0.5f;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+0] | ((qh << 8) & 256)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+1] | ((qh << 7) & 256)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
|
||||
reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
|
||||
}
|
||||
grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+2] | ((qh << 6) & 256)));
|
||||
grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+3] | ((qh << 5) & 256)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
|
||||
reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
@@ -5525,6 +5761,7 @@ template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows
|
||||
template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
@@ -5566,6 +5803,7 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
@@ -5619,6 +5857,7 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
@@ -6589,6 +6828,71 @@ kernel void kernel_mul_mv_id_iq3_xxs_f32(
|
||||
sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_id_iq3_s_f32")]]
|
||||
kernel void kernel_mul_mv_id_iq3_s_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
device const char * src00,
|
||||
device const char * src01,
|
||||
device const char * src02,
|
||||
device const char * src03,
|
||||
device const char * src04,
|
||||
device const char * src05,
|
||||
device const char * src06,
|
||||
device const char * src07,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
|
||||
|
||||
const int64_t bid = tgpig.z/(ne12*ne13);
|
||||
|
||||
tgpig.z = tgpig.z%(ne12*ne13);
|
||||
|
||||
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
||||
|
||||
kernel_mul_mv_iq3_s_f32_impl(
|
||||
src0[id],
|
||||
(device const float *) (src1 + bid*nb11),
|
||||
dst + bid*ne0,
|
||||
ne00,
|
||||
ne01,
|
||||
ne02,
|
||||
ne10,
|
||||
ne12,
|
||||
ne0,
|
||||
ne1,
|
||||
r2,
|
||||
r3,
|
||||
shared_values,
|
||||
tgpig,
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_id_iq1_s_f32")]]
|
||||
kernel void kernel_mul_mv_id_iq1_s_f32(
|
||||
device const char * ids,
|
||||
|
||||
+25
-25
@@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
|
||||
}
|
||||
|
||||
void ggml_cl_free_data(const struct ggml_tensor* tensor) {
|
||||
if (tensor->backend != GGML_BACKEND_GPU) {
|
||||
if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
@@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
|
||||
}
|
||||
|
||||
static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
@@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
size_t y_size;
|
||||
size_t d_size;
|
||||
cl_mem d_X;
|
||||
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
|
||||
if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
|
||||
d_X = (cl_mem) src0->extra;
|
||||
} else {
|
||||
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
|
||||
}
|
||||
cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
|
||||
size_t x_offset = 0;
|
||||
|
||||
@@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
// TODO: copy src0 here when r3>1
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else {
|
||||
// copy src0 to device
|
||||
@@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
|
||||
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
||||
// copy src1 to device
|
||||
if (src1->backend == GGML_BACKEND_CPU) {
|
||||
if (src1->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
}
|
||||
|
||||
@@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
||||
}
|
||||
@@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
}
|
||||
}
|
||||
|
||||
if (src0->backend != GGML_BACKEND_GPU) {
|
||||
if (src0->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
}
|
||||
if (src1->backend != GGML_BACKEND_GPU) {
|
||||
if (src1->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
}
|
||||
if (dst->backend != GGML_BACKEND_GPU) {
|
||||
if (dst->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
}
|
||||
}
|
||||
@@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
size_t y_size;
|
||||
size_t d_size;
|
||||
cl_mem d_X;
|
||||
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
|
||||
if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
|
||||
d_X = (cl_mem) src0->extra;
|
||||
} else {
|
||||
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
|
||||
@@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
// TODO: copy src0 here when r3>1
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else {
|
||||
// copy src0 to device
|
||||
@@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
}
|
||||
|
||||
// copy dst to host, then convert to float
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
@@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
}
|
||||
}
|
||||
|
||||
if (src0->backend != GGML_BACKEND_GPU) {
|
||||
if (src0->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
}
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
@@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
cl_mem d_Q;
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
|
||||
}
|
||||
|
||||
@@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
// copy src0 to device if necessary
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
} else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
d_Q = (cl_mem) src0->extra;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
@@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
if (!mul_mat_vec) {
|
||||
// convert src0 to fp32 on device
|
||||
const size_t global = x_ne / global_denom;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||
@@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
|
||||
// compute
|
||||
const size_t global = ne01 * local;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
const cl_int ncols = ne00;
|
||||
events.emplace_back();
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
|
||||
@@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
}
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
ggml_cl_pool_free(d_Q, q_size);
|
||||
}
|
||||
}
|
||||
@@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
|
||||
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
||||
src1->type == GGML_TYPE_F32 &&
|
||||
dst->type == GGML_TYPE_F32 &&
|
||||
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
|
||||
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
tensor->extra = dst;
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
}
|
||||
|
||||
// ggml-backend
|
||||
@@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ctx->sub_buffers.push_back(sub_buffer);
|
||||
tensor->extra = sub_buffer;
|
||||
}
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
|
||||
+635
-64
@@ -462,6 +462,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_int16x8x2_t int16x8x2_t
|
||||
@@ -476,6 +500,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
||||
|
||||
#endif
|
||||
|
||||
@@ -3505,6 +3530,73 @@ static const uint32_t iq3xxs_grid[256] = {
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
static const uint32_t iq3xs_grid[512] = {
|
||||
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
|
||||
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
|
||||
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
|
||||
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
|
||||
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
|
||||
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
|
||||
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
|
||||
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
|
||||
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
|
||||
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
|
||||
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
|
||||
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
|
||||
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
|
||||
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
|
||||
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
|
||||
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
|
||||
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
|
||||
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
|
||||
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
|
||||
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
|
||||
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
|
||||
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
|
||||
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
|
||||
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
|
||||
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
|
||||
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
|
||||
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
|
||||
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
|
||||
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
|
||||
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
|
||||
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
|
||||
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
|
||||
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
|
||||
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
|
||||
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
|
||||
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
|
||||
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
|
||||
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
|
||||
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
|
||||
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
|
||||
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
|
||||
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
|
||||
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
|
||||
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
|
||||
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
|
||||
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
|
||||
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
|
||||
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
|
||||
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
|
||||
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
|
||||
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
|
||||
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
|
||||
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
|
||||
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
|
||||
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
|
||||
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
|
||||
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
|
||||
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
|
||||
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
|
||||
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
|
||||
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
|
||||
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
|
||||
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
|
||||
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
|
||||
};
|
||||
|
||||
#define NGRID_IQ2XXS 512
|
||||
static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
|
||||
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
||||
@@ -3736,6 +3828,49 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== 3.3125 bpw (de)-quantization
|
||||
|
||||
void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * qh = x[i].qh;
|
||||
const uint8_t * signs = x[i].signs;
|
||||
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f;
|
||||
const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
y += 8;
|
||||
}
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
y += 8;
|
||||
}
|
||||
qh += 2;
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== 1.5625 bpw (de)-quantization
|
||||
|
||||
void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
|
||||
@@ -8806,6 +8941,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||
|
||||
#endif
|
||||
|
||||
#if defined (__AVX2__) || defined (__ARM_NEON)
|
||||
static const int8_t keven_signs_q2xs[1024] = {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
||||
@@ -8840,6 +8976,7 @@ static const int8_t keven_signs_q2xs[1024] = {
|
||||
1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
|
||||
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
};
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
@@ -9327,6 +9464,202 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_iq3_s * restrict x = vx;
|
||||
const block_q8_K * restrict y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
||||
};
|
||||
|
||||
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
||||
|
||||
const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
|
||||
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
||||
|
||||
uint8x16x2_t vs;
|
||||
ggml_int8x16x4_t q3s;
|
||||
ggml_int8x16x4_t q8b;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * restrict qs = x[i].qs;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
||||
const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
|
||||
iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]};
|
||||
const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)],
|
||||
iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]};
|
||||
const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)],
|
||||
iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]};
|
||||
const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)],
|
||||
iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]};
|
||||
qs += 16;
|
||||
|
||||
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
||||
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
||||
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
||||
|
||||
q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0]));
|
||||
q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));
|
||||
|
||||
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
||||
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
||||
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
||||
|
||||
signs += 4;
|
||||
|
||||
q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0]));
|
||||
q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1]));
|
||||
|
||||
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
|
||||
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
|
||||
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
|
||||
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
|
||||
}
|
||||
sumf += d*(sumi1 + sumi2);
|
||||
}
|
||||
*s = 0.25f * sumf;
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
||||
};
|
||||
|
||||
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||
};
|
||||
|
||||
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
|
||||
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * restrict qs = x[i].qs;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
__m256i sumi1 = _mm256_setzero_si256();
|
||||
__m256i sumi2 = _mm256_setzero_si256();
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)],
|
||||
iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)],
|
||||
iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)],
|
||||
iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)],
|
||||
iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)],
|
||||
iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)],
|
||||
iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)],
|
||||
iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]);
|
||||
qs += 8;
|
||||
const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)],
|
||||
iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)],
|
||||
iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)],
|
||||
iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)],
|
||||
iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)],
|
||||
iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)],
|
||||
iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)],
|
||||
iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]);
|
||||
qs += 8;
|
||||
|
||||
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
||||
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
||||
const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
|
||||
const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
|
||||
|
||||
aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
|
||||
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
||||
const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
|
||||
const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
|
||||
|
||||
signs += 4;
|
||||
|
||||
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
||||
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
||||
const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
|
||||
const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
|
||||
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
|
||||
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
|
||||
sumi1 = _mm256_add_epi32(sumi1, p1);
|
||||
sumi2 = _mm256_add_epi32(sumi2, p2);
|
||||
}
|
||||
|
||||
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
||||
|
||||
}
|
||||
|
||||
*s = 0.25f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * restrict qs = x[i].qs;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const uint8_t * restrict signs = x[i].signs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
||||
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
bsum += sumi * ls1;
|
||||
sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
bsum += sumi * ls2;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.25f * sumf;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#ifdef __AVX2__
|
||||
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
||||
const __m256i ax = _mm256_sign_epi8(x, x);
|
||||
@@ -9523,6 +9856,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
||||
float sumf = 0;
|
||||
|
||||
for (int ib = 0; ib < nb; ib += 2) {
|
||||
|
||||
q4bits.val[0] = vld1q_u8(x[ib+0].qs);
|
||||
q4bits.val[1] = vld1q_u8(x[ib+1].qs);
|
||||
q8b.val[0] = vld1q_s8(y[ib+0].qs);
|
||||
@@ -10239,14 +10573,15 @@ typedef struct {
|
||||
uint16_t * neighbours;
|
||||
} iq3_entry_t;
|
||||
|
||||
static iq3_entry_t iq3_data[1] = {
|
||||
static iq3_entry_t iq3_data[2] = {
|
||||
{NULL, NULL, NULL},
|
||||
{NULL, NULL, NULL},
|
||||
};
|
||||
|
||||
static inline int iq3_data_index(int grid_size) {
|
||||
(void)grid_size;
|
||||
GGML_ASSERT(grid_size == 256);
|
||||
return 0;
|
||||
GGML_ASSERT(grid_size == 256 || grid_size == 512);
|
||||
return grid_size == 256 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int iq3_compare_func(const void * left, const void * right) {
|
||||
@@ -10278,9 +10613,44 @@ void iq3xs_init_impl(int grid_size) {
|
||||
3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
|
||||
3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
|
||||
};
|
||||
static const uint16_t kgrid_512[512] = {
|
||||
0, 1, 2, 5, 7, 8, 9, 10, 12, 14, 16, 17, 21, 27, 32, 34,
|
||||
37, 39, 41, 43, 48, 50, 57, 60, 63, 64, 65, 66, 68, 72, 73, 77,
|
||||
80, 83, 87, 89, 93, 100, 113, 117, 122, 128, 129, 133, 135, 136, 139, 142,
|
||||
145, 149, 152, 156, 162, 165, 167, 169, 171, 184, 187, 195, 201, 205, 208, 210,
|
||||
217, 219, 222, 228, 232, 234, 247, 249, 253, 256, 267, 271, 273, 276, 282, 288,
|
||||
291, 297, 312, 322, 324, 336, 338, 342, 347, 353, 357, 359, 374, 379, 390, 393,
|
||||
395, 409, 426, 441, 448, 450, 452, 464, 466, 470, 475, 488, 492, 512, 513, 514,
|
||||
516, 520, 521, 523, 525, 527, 528, 530, 537, 540, 542, 556, 558, 561, 570, 576,
|
||||
577, 579, 582, 584, 588, 593, 600, 603, 609, 616, 618, 632, 638, 640, 650, 653,
|
||||
655, 656, 660, 666, 672, 675, 685, 688, 698, 705, 708, 711, 712, 715, 721, 727,
|
||||
728, 732, 737, 754, 760, 771, 773, 778, 780, 793, 795, 802, 806, 808, 812, 833,
|
||||
840, 843, 849, 856, 858, 873, 912, 916, 919, 932, 934, 961, 963, 968, 970, 977,
|
||||
989, 993, 1010, 1016, 1024, 1025, 1027, 1029, 1031, 1032, 1034, 1036, 1038, 1041, 1043, 1047,
|
||||
1048, 1050, 1057, 1059, 1061, 1064, 1066, 1079, 1080, 1083, 1085, 1088, 1090, 1096, 1099, 1103,
|
||||
1106, 1109, 1113, 1116, 1122, 1129, 1153, 1156, 1159, 1169, 1171, 1176, 1183, 1185, 1195, 1199,
|
||||
1209, 1212, 1216, 1218, 1221, 1225, 1234, 1236, 1241, 1243, 1250, 1256, 1270, 1281, 1287, 1296,
|
||||
1299, 1306, 1309, 1313, 1338, 1341, 1348, 1353, 1362, 1375, 1376, 1387, 1400, 1408, 1410, 1415,
|
||||
1425, 1453, 1457, 1477, 1481, 1494, 1496, 1507, 1512, 1538, 1545, 1547, 1549, 1551, 1554, 1561,
|
||||
1563, 1565, 1570, 1572, 1575, 1577, 1587, 1593, 1601, 1603, 1605, 1612, 1617, 1619, 1632, 1648,
|
||||
1658, 1662, 1664, 1674, 1680, 1690, 1692, 1704, 1729, 1736, 1740, 1745, 1747, 1751, 1752, 1761,
|
||||
1763, 1767, 1773, 1787, 1795, 1801, 1806, 1810, 1817, 1834, 1840, 1844, 1857, 1864, 1866, 1877,
|
||||
1882, 1892, 1902, 1915, 1934, 1953, 1985, 1987, 2000, 2002, 2013, 2048, 2052, 2058, 2064, 2068,
|
||||
2071, 2074, 2081, 2088, 2104, 2114, 2119, 2121, 2123, 2130, 2136, 2141, 2147, 2153, 2157, 2177,
|
||||
2179, 2184, 2189, 2193, 2203, 2208, 2223, 2226, 2232, 2244, 2249, 2251, 2256, 2258, 2265, 2269,
|
||||
2304, 2306, 2324, 2335, 2336, 2361, 2373, 2375, 2385, 2418, 2443, 2460, 2480, 2504, 2509, 2520,
|
||||
2531, 2537, 2562, 2568, 2572, 2578, 2592, 2596, 2599, 2602, 2614, 2620, 2625, 2627, 2629, 2634,
|
||||
2641, 2650, 2682, 2688, 2697, 2707, 2712, 2718, 2731, 2754, 2759, 2760, 2775, 2788, 2793, 2805,
|
||||
2811, 2817, 2820, 2832, 2842, 2854, 2890, 2902, 2921, 2923, 2978, 3010, 3012, 3026, 3081, 3083,
|
||||
3085, 3097, 3099, 3120, 3136, 3152, 3159, 3188, 3210, 3228, 3234, 3245, 3250, 3256, 3264, 3276,
|
||||
3281, 3296, 3349, 3363, 3378, 3392, 3395, 3420, 3440, 3461, 3488, 3529, 3531, 3584, 3588, 3591,
|
||||
3600, 3602, 3614, 3616, 3628, 3634, 3650, 3657, 3668, 3683, 3685, 3713, 3716, 3720, 3726, 3729,
|
||||
3736, 3753, 3778, 3802, 3805, 3819, 3841, 3845, 3851, 3856, 3880, 3922, 3938, 3970, 3993, 4032,
|
||||
};
|
||||
|
||||
const int kmap_size = 4096;
|
||||
const int nwant = 2;
|
||||
const uint16_t * kgrid = kgrid_256;
|
||||
const int nwant = grid_size == 256 ? 2 : 3;
|
||||
const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
|
||||
uint32_t * kgrid_q3xs;
|
||||
int * kmap_q3xs;
|
||||
uint16_t * kneighbors_q3xs;
|
||||
@@ -10377,7 +10747,7 @@ void iq3xs_init_impl(int grid_size) {
|
||||
}
|
||||
|
||||
void iq3xs_free_impl(int grid_size) {
|
||||
GGML_ASSERT(grid_size == 256);
|
||||
GGML_ASSERT(grid_size == 256 || grid_size == 512);
|
||||
const int gindex = iq3_data_index(grid_size);
|
||||
if (iq3_data[gindex].grid) {
|
||||
free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
|
||||
@@ -10410,9 +10780,10 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
||||
return grid_index;
|
||||
}
|
||||
|
||||
static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
||||
static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
|
||||
const float * restrict quant_weights) {
|
||||
|
||||
const int gindex = iq3_data_index(256);
|
||||
const int gindex = iq3_data_index(grid_size);
|
||||
|
||||
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
||||
const int * kmap_q3xs = iq3_data[gindex].map;
|
||||
@@ -10426,9 +10797,23 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
||||
|
||||
const int kMaxQ = 8;
|
||||
|
||||
const int nbl = n/256;
|
||||
const int nbl = n/QK_K;
|
||||
|
||||
block_iq3_xxs * y = vy;
|
||||
ggml_fp16_t * dh;
|
||||
uint8_t * qs;
|
||||
int block_size;
|
||||
if (grid_size == 256) {
|
||||
block_iq3_xxs * y = vy;
|
||||
dh = &y->d;
|
||||
qs = y->qs;
|
||||
block_size = sizeof(block_iq3_xxs);
|
||||
} else {
|
||||
block_iq3_s * y = vy;
|
||||
dh = &y->d;
|
||||
qs = y->qs;
|
||||
block_size = sizeof(block_iq3_s);
|
||||
}
|
||||
int quant_size = block_size - sizeof(ggml_fp16_t);
|
||||
|
||||
float scales[QK_K/32];
|
||||
float weight[32];
|
||||
@@ -10439,20 +10824,21 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
||||
bool is_on_grid[8];
|
||||
bool is_on_grid_aux[8];
|
||||
uint8_t block_signs[8];
|
||||
uint8_t q3[3*(QK_K/8)];
|
||||
uint8_t q3[3*(QK_K/8)+QK_K/32];
|
||||
uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
|
||||
uint8_t * qh = q3 + 3*(QK_K/8);
|
||||
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) {
|
||||
|
||||
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
||||
memset(q3, 0, 3*QK_K/8);
|
||||
dh[0] = GGML_FP32_TO_FP16(0.f);
|
||||
memset(q3, 0, 3*QK_K/8+QK_K/32);
|
||||
|
||||
float max_scale = 0;
|
||||
|
||||
const float * xbl = x + QK_K*ibl;
|
||||
float sumx2 = 0;
|
||||
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
||||
float sigma2 = sumx2/QK_K;
|
||||
float sigma2 = 2*sumx2/QK_K;
|
||||
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
const float * xb = xbl + 32*ib;
|
||||
@@ -10570,7 +10956,13 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
||||
printf("\n");
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
q3[8*ib+k] = grid_index;
|
||||
if (grid_size == 256) {
|
||||
q3[8*ib+k] = grid_index;
|
||||
} else {
|
||||
q3[8*ib+k] = grid_index & 255;
|
||||
qh[ib] |= ((grid_index >> 8) << k);
|
||||
}
|
||||
|
||||
}
|
||||
scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
|
||||
GGML_ASSERT(scale >= 0);
|
||||
@@ -10579,63 +10971,25 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
||||
}
|
||||
|
||||
if (!max_scale) {
|
||||
memset(y[ibl].qs, 0, 3*QK_K/8);
|
||||
memset(qs, 0, quant_size);
|
||||
dh += block_size/sizeof(ggml_fp16_t);
|
||||
qs += block_size;
|
||||
continue;
|
||||
}
|
||||
|
||||
float d = max_scale/31;
|
||||
y[ibl].d = GGML_FP32_TO_FP16(d);
|
||||
dh[0] = GGML_FP32_TO_FP16(d * 1.0125f); // small improvement via this fudge factor
|
||||
float id = 1/d;
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
||||
l = MAX(0, MIN(15, l));
|
||||
scales_and_signs[ib] |= ((uint32_t)l << 28);
|
||||
if (false) {
|
||||
const float * xb = xbl + 32*ib;
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
||||
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
||||
} else {
|
||||
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
||||
}
|
||||
const float db = 0.25f * d * (1 + 2*l);
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2);
|
||||
const float * xk = xb + 4*k;
|
||||
const float * wk = weight + 4*k;
|
||||
//const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]);
|
||||
const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]);
|
||||
float best_mse = 0; int best_index = q3[8*ib+k];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
float diff = db * grid[j] * signs[j] - xk[j];
|
||||
best_mse += wk[j] * diff * diff;
|
||||
}
|
||||
for (int idx = 0; idx < 256; ++idx) {
|
||||
//grid = (const uint8_t *)(kgrid_q3xs + idx);
|
||||
grid = (const uint8_t *)(iq3xxs_grid + idx);
|
||||
float mse = 0;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
float diff = db * grid[j] * signs[j] - xk[j];
|
||||
mse += wk[j] * diff * diff;
|
||||
}
|
||||
if (mse < best_mse) {
|
||||
best_mse = mse; best_index = idx;
|
||||
}
|
||||
}
|
||||
q3[8*ib+k] = best_index;
|
||||
//grid = (const uint8_t *)(kgrid_q3xs + best_index);
|
||||
grid = (const uint8_t *)(iq3xxs_grid + best_index);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
float q = db * grid[j] * signs[j];
|
||||
sumqx += wk[j] * q * xk[j];
|
||||
sumq2 += wk[j] * q * q;
|
||||
}
|
||||
}
|
||||
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
|
||||
}
|
||||
}
|
||||
memcpy(y[ibl].qs, q3, 3*QK_K/8);
|
||||
memcpy(qs, q3, quant_size);
|
||||
|
||||
dh += block_size/sizeof(ggml_fp16_t);
|
||||
qs += block_size;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10645,7 +10999,7 @@ size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row,
|
||||
int nblock = n_per_row/QK_K;
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_iq3_xxs_impl(src, qrow, n_per_row, quant_weights);
|
||||
quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
|
||||
src += n_per_row;
|
||||
qrow += nblock*sizeof(block_iq3_xxs);
|
||||
}
|
||||
@@ -10660,9 +11014,226 @@ void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
|
||||
|
||||
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_row_iq3_xxs_impl(x, y, k, NULL);
|
||||
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
||||
}
|
||||
|
||||
static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
|
||||
const float * restrict quant_weights,
|
||||
float * scales,
|
||||
float * weight,
|
||||
float * xval,
|
||||
int8_t * L,
|
||||
int8_t * Laux,
|
||||
float * waux,
|
||||
bool * is_on_grid,
|
||||
bool * is_on_grid_aux,
|
||||
uint8_t * block_signs) {
|
||||
|
||||
const int gindex = iq3_data_index(512);
|
||||
|
||||
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
||||
const int * kmap_q3xs = iq3_data[gindex].map;
|
||||
const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
|
||||
|
||||
//GGML_ASSERT(quant_weights && "missing quantization weights");
|
||||
GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
|
||||
const int kMaxQ = 8;
|
||||
|
||||
const int nbl = n/QK_K;
|
||||
|
||||
block_iq3_s * y = vy;
|
||||
|
||||
const int bs4 = block_size/4;
|
||||
const int bs8 = block_size/8;
|
||||
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) {
|
||||
|
||||
memset(&y[ibl], 0, sizeof(block_iq3_s));
|
||||
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
||||
|
||||
uint8_t * qs = y[ibl].qs;
|
||||
uint8_t * qh = y[ibl].qh;
|
||||
uint8_t * signs = y[ibl].signs;
|
||||
|
||||
float max_scale = 0;
|
||||
|
||||
const float * xbl = x + QK_K*ibl;
|
||||
float sumx2 = 0;
|
||||
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
||||
float sigma2 = 2*sumx2/QK_K;
|
||||
|
||||
for (int ib = 0; ib < QK_K/block_size; ++ib) {
|
||||
const float * xb = xbl + block_size*ib;
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights + QK_K*ibl + block_size*ib;
|
||||
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
||||
} else {
|
||||
for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
|
||||
}
|
||||
for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]);
|
||||
for (int k = 0; k < bs8; ++k) {
|
||||
uint8_t s = 0;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
||||
else {
|
||||
xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
|
||||
}
|
||||
}
|
||||
block_signs[k] = s;
|
||||
}
|
||||
float max = xval[0];
|
||||
for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]);
|
||||
if (!max) {
|
||||
scales[ib] = 0;
|
||||
continue;
|
||||
}
|
||||
float best = 0;
|
||||
float scale = max/(2*kMaxQ-1);
|
||||
for (int is = -15; is <= 15; ++is) {
|
||||
float id = (2*kMaxQ-1+is*0.2f)/max;
|
||||
float this_scale = 1/id;
|
||||
for (int k = 0; k < bs4; ++k) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
||||
Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
||||
}
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
|
||||
int grid_index = kmap_q3xs[u];
|
||||
is_on_grid_aux[k] = true;
|
||||
if (grid_index < 0) {
|
||||
is_on_grid_aux[k] = false;
|
||||
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
||||
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
|
||||
}
|
||||
}
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int i = 0; i < block_size; ++i) {
|
||||
float w = weight[i];
|
||||
float q = 2*Laux[i] + 1;
|
||||
sumqx += w*xval[i]*q;
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
||||
scale = sumqx/sumq2; best = scale*sumqx;
|
||||
for (int i = 0; i < block_size; ++i) L[i] = Laux[i];
|
||||
for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
||||
}
|
||||
}
|
||||
int n_not_ongrid = 0;
|
||||
for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
||||
if (n_not_ongrid > 0 && scale > 0) {
|
||||
float id = 1/scale;
|
||||
for (int k = 0; k < bs4; ++k) {
|
||||
if (is_on_grid[k]) continue;
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
||||
l = MAX(0, MIN(kMaxQ-1, l));
|
||||
u |= (l << 3*i);
|
||||
}
|
||||
int grid_index = kmap_q3xs[u];
|
||||
if (grid_index < 0) {
|
||||
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
||||
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
|
||||
}
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
|
||||
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
||||
}
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int i = 0; i < block_size; ++i) {
|
||||
float w = weight[i];
|
||||
float q = 2*L[i] + 1;
|
||||
sumqx += w*xval[i]*q;
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
if (sumq2 > 0) scale = sumqx/sumq2;
|
||||
}
|
||||
if (scale < 0) {
|
||||
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
||||
// and correspondingly flip quant signs.
|
||||
scale = -scale;
|
||||
for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k];
|
||||
}
|
||||
for (int k = 0; k < bs4; ++k) {
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
|
||||
int grid_index = kmap_q3xs[u];
|
||||
if (grid_index < 0) {
|
||||
printf("Oops: found point %u not on grid:", u);
|
||||
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
|
||||
printf("\n");
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
qs[k] = grid_index & 255;
|
||||
qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
|
||||
}
|
||||
qs += bs4;
|
||||
for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k];
|
||||
signs += bs8;
|
||||
GGML_ASSERT(scale >= 0);
|
||||
scales[ib] = scale;
|
||||
max_scale = MAX(max_scale, scale);
|
||||
}
|
||||
|
||||
if (!max_scale) {
|
||||
continue;
|
||||
}
|
||||
|
||||
float d = max_scale/31;
|
||||
y[ibl].d = GGML_FP32_TO_FP16(d);
|
||||
float id = 1/d;
|
||||
for (int ib = 0; ib < QK_K/block_size; ib += 2) {
|
||||
int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
|
||||
l1 = MAX(0, MIN(15, l1));
|
||||
int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
|
||||
l2 = MAX(0, MIN(15, l2));
|
||||
y[ibl].scales[ib/2] = l1 | (l2 << 4);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#define IQ3S_BLOCK_SIZE 32
|
||||
size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int nblock = n_per_row/QK_K;
|
||||
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
||||
float weight[IQ3S_BLOCK_SIZE];
|
||||
float xval[IQ3S_BLOCK_SIZE];
|
||||
int8_t L[IQ3S_BLOCK_SIZE];
|
||||
int8_t Laux[IQ3S_BLOCK_SIZE];
|
||||
float waux[IQ3S_BLOCK_SIZE];
|
||||
bool is_on_grid[IQ3S_BLOCK_SIZE/4];
|
||||
bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
|
||||
uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
|
||||
scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
|
||||
src += n_per_row;
|
||||
qrow += nblock*sizeof(block_iq3_s);
|
||||
}
|
||||
return nrow * nblock * sizeof(block_iq3_s);
|
||||
}
|
||||
|
||||
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq3_s * restrict y = vy;
|
||||
quantize_row_iq3_s_reference(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_iq3_s(x, y, 1, k, NULL, NULL);
|
||||
}
|
||||
|
||||
|
||||
// =================================== 1.5 bpw ===================================================
|
||||
|
||||
static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
||||
|
||||
@@ -191,6 +191,21 @@ typedef struct {
|
||||
} block_iq3_xxs;
|
||||
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
||||
|
||||
// 3.4375 bpw
|
||||
#if QK_K == 64
|
||||
#define IQ3S_N_SCALE 2
|
||||
#else
|
||||
#define IQ3S_N_SCALE QK_K/64
|
||||
#endif
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
uint8_t qs[QK_K/4];
|
||||
uint8_t qh[QK_K/32];
|
||||
uint8_t signs[QK_K/8];
|
||||
uint8_t scales[IQ3S_N_SCALE];
|
||||
} block_iq3_s;
|
||||
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
uint8_t qs[QK_K/8];
|
||||
@@ -226,6 +241,7 @@ void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGM
|
||||
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
|
||||
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
@@ -242,6 +258,7 @@ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
@@ -262,6 +279,7 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
@@ -280,6 +298,7 @@ void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
//
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
@@ -289,6 +308,7 @@ size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row,
|
||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
|
||||
+242
-158
@@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){
|
||||
|
||||
size_t total_elements = ggml_nelements(src);
|
||||
|
||||
const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
float *src_data =NULL;
|
||||
if(src_on_device) {
|
||||
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
@@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
|
||||
int ixj = col ^ j;
|
||||
if (ixj > col) {
|
||||
if ((col & k) == 0) {
|
||||
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
|
||||
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
|
||||
swap(dst_row[col], dst_row[ixj]);
|
||||
}
|
||||
} else {
|
||||
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
|
||||
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
|
||||
swap(dst_row[col], dst_row[ixj]);
|
||||
}
|
||||
}
|
||||
@@ -8126,23 +8126,51 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
|
||||
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
|
||||
}
|
||||
|
||||
static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
|
||||
const sycl::nd_item<3> &item_ct1, float *buf) {
|
||||
|
||||
template <bool vals_smem, int ncols_template, int block_size_template>
|
||||
static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
|
||||
const int nrows_y, const float scale, const float max_bias, const float m0,
|
||||
const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
|
||||
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int rowx = item_ct1.get_group(2);
|
||||
const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
|
||||
|
||||
const int block_size = item_ct1.get_local_range(2);
|
||||
const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;
|
||||
|
||||
const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
|
||||
float slope = 0.0f;
|
||||
|
||||
// ALiBi
|
||||
if (max_bias > 0.0f) {
|
||||
const uint32_t h = rowx/nrows_y; // head index
|
||||
|
||||
const float base = h < n_head_log2 ? m0 : m1;
|
||||
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||
|
||||
slope = sycl::pow(base, float(exp));
|
||||
}
|
||||
|
||||
float * vals = vals_smem ? buf + WARP_SIZE : dst + rowx*ncols;
|
||||
float max_val = -INFINITY;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
||||
const int col = col0 + tid;
|
||||
|
||||
if (ncols_template == 0 && col >= ncols) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int ix = rowx*ncols + col;
|
||||
const int iy = rowy*ncols + col;
|
||||
max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
|
||||
|
||||
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
|
||||
|
||||
vals[col] = val;
|
||||
max_val = sycl::max(max_val, val);
|
||||
}
|
||||
|
||||
// find the max value in the block
|
||||
@@ -8151,30 +8179,12 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
||||
if (warp_id == 0) {
|
||||
buf[lane_id] = -INFINITY;
|
||||
}
|
||||
/*
|
||||
DPCT1118:12: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
if (lane_id == 0) {
|
||||
buf[warp_id] = max_val;
|
||||
}
|
||||
/*
|
||||
DPCT1118:13: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
max_val = buf[lane_id];
|
||||
max_val = warp_reduce_max(max_val, item_ct1);
|
||||
@@ -8182,13 +8192,16 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
||||
|
||||
float tmp = 0.f;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const int ix = rowx*ncols + col;
|
||||
const int iy = rowy*ncols + col;
|
||||
const float val =
|
||||
sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
|
||||
#pragma unroll
|
||||
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
||||
const int col = col0 + tid;
|
||||
if (ncols_template == 0 && col >= ncols) {
|
||||
break;
|
||||
}
|
||||
|
||||
const float val = sycl::native::exp(vals[col] - max_val);
|
||||
tmp += val;
|
||||
dst[ix] = val;
|
||||
vals[col] = val;
|
||||
}
|
||||
|
||||
// find the sum of exps in the block
|
||||
@@ -8197,40 +8210,29 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
||||
if (warp_id == 0) {
|
||||
buf[lane_id] = 0.f;
|
||||
}
|
||||
/*
|
||||
DPCT1118:14: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
if (lane_id == 0) {
|
||||
buf[warp_id] = tmp;
|
||||
}
|
||||
/*
|
||||
DPCT1118:15: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
tmp = buf[lane_id];
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
}
|
||||
|
||||
const float inv_tmp = 1.f / tmp;
|
||||
const float inv_sum = 1.f / tmp;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const int i = rowx*ncols + col;
|
||||
dst[i] *= inv_tmp;
|
||||
#pragma unroll
|
||||
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
||||
const int col = col0 + tid;
|
||||
|
||||
if (ncols_template == 0 && col >= ncols) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int idst = rowx*ncols + col;
|
||||
dst[idst] = vals[col] * inv_sum;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10825,7 +10827,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
|
||||
const sycl::range<3> block_dims(1, 1, ncols);
|
||||
const sycl::range<3> block_nums(1, nrows, 1);
|
||||
if (order == GGML_SORT_ASC) {
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
/*
|
||||
DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
@@ -10834,9 +10836,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
|
||||
});
|
||||
} else if (order == GGML_SORT_DESC) {
|
||||
} else if (order == GGML_SORT_ORDER_DESC) {
|
||||
/*
|
||||
DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
@@ -10845,7 +10847,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
|
||||
});
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
@@ -10867,35 +10869,96 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
|
||||
});
|
||||
}
|
||||
|
||||
static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
|
||||
const int ncols_x, const int nrows_x,
|
||||
const int nrows_y, const float scale,
|
||||
template <bool vals_smem, int ncols_template, int block_size_template>
|
||||
static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
|
||||
const int nrows_y, const float scale, const float max_bias, const float m0,
|
||||
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
|
||||
const size_t n_local_scratch, dpct::queue_ptr stream) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
|
||||
nrows_y, scale, max_bias, m0,
|
||||
m1, n_head_log2, item_ct1,
|
||||
local_buf_acc.get_pointer());
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
|
||||
float * dst, const int ncols_x, const int nrows_x,
|
||||
const int nrows_y, const float scale, const float max_bias,
|
||||
dpct::queue_ptr stream) {
|
||||
int nth = WARP_SIZE;
|
||||
while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
||||
const sycl::range<3> block_dims(1, 1, nth);
|
||||
const sycl::range<3> block_nums(1, 1, nrows_x);
|
||||
/*
|
||||
DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
|
||||
limit. To get the device limit, query info::device::max_work_group_size.
|
||||
Adjust the work-group size if needed.
|
||||
*/
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
/*
|
||||
DPCT1101:96: 'SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
|
||||
replaced with a value. Modify the code to use the original expression,
|
||||
provided in comments, if it is correct.
|
||||
*/
|
||||
sycl::local_accessor<float, 1> buf_acc_ct1(
|
||||
sycl::range<1>(32 /*SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
|
||||
const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
|
||||
static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
|
||||
buf_acc_ct1.get_pointer());
|
||||
});
|
||||
});
|
||||
const uint32_t n_head_kv = nrows_x/nrows_y;
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
|
||||
if (n_local_scratch*sizeof(float) < local_mem_size) {
|
||||
switch (ncols_x) {
|
||||
case 32:
|
||||
soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 64:
|
||||
soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 128:
|
||||
soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 256:
|
||||
soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 512:
|
||||
soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 1024:
|
||||
soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 2048:
|
||||
soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 4096:
|
||||
soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
default:
|
||||
soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, WARP_SIZE, stream);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -11407,12 +11470,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
|
||||
|
||||
dpct::memcpy_direction kind;
|
||||
char * src_ptr;
|
||||
if (src->backend == GGML_BACKEND_CPU) {
|
||||
if (src->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
kind = dpct::host_to_device;
|
||||
src_ptr = (char *) src->data;
|
||||
// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
|
||||
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
|
||||
// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
|
||||
} else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||
GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
|
||||
kind = dpct::device_to_device;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
int id;
|
||||
@@ -11846,7 +11909,7 @@ inline void ggml_sycl_op_mul_mat_q(
|
||||
|
||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
|
||||
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
|
||||
const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
@@ -12119,7 +12182,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
|
||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||
// ldc == nrows of the matrix that cuBLAS writes into
|
||||
int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
|
||||
int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
|
||||
|
||||
#ifdef GGML_SYCL_F16
|
||||
bool use_fp16 = true; // TODO(Yu) SYCL capability check
|
||||
@@ -12435,14 +12498,35 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows_x = ggml_nrows(src0);
|
||||
const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
|
||||
const int64_t nrows_y = src0->ne[1];
|
||||
|
||||
float scale = 1.0f;
|
||||
memcpy(&scale, dst->op_params, sizeof(float));
|
||||
float max_bias = 0.0f;
|
||||
|
||||
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|
||||
memcpy(&scale, dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
|
||||
|
||||
(void) dst;
|
||||
// positions tensor
|
||||
float * src2_dd = nullptr;
|
||||
sycl_pool_alloc<float> src2_f;
|
||||
|
||||
ggml_tensor * src2 = dst->src[2];
|
||||
const bool use_src2 = src2 != nullptr;
|
||||
|
||||
if (use_src2) {
|
||||
const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
if (src2_on_device) {
|
||||
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
|
||||
src2_dd = (float *) src2_extra->data_device[g_main_device];
|
||||
} else {
|
||||
src2_dd = src2_f.alloc(ggml_nelements(src2));
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
|
||||
}
|
||||
}
|
||||
|
||||
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
|
||||
nrows_x, nrows_y, scale, max_bias, main_stream);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||
@@ -12501,16 +12585,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
|
||||
const bool use_src1 = src1 != nullptr;
|
||||
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
|
||||
|
||||
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
// dd = data device
|
||||
float * src0_ddf = nullptr;
|
||||
@@ -12565,7 +12649,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
|
||||
main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
|
||||
}
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
dpct::get_current_device().queues_wait_and_throw()));
|
||||
}
|
||||
@@ -12640,8 +12724,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
|
||||
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
||||
|
||||
@@ -12656,13 +12740,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
||||
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
||||
|
||||
int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
|
||||
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
GGML_ASSERT(!(split && ne02 > 1));
|
||||
GGML_ASSERT(!(split && ne03 > 1));
|
||||
GGML_ASSERT(!(split && ne02 < ne12));
|
||||
@@ -12717,8 +12801,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
|
||||
used_devices++;
|
||||
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
||||
|
||||
ggml_sycl_set_device(get_device_id_by_index(id));
|
||||
const dpct::queue_ptr stream = g_syclStreams[id][0];
|
||||
@@ -12782,8 +12866,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
continue;
|
||||
}
|
||||
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
||||
const int64_t row_diff = row_high[id] - row_low[id];
|
||||
|
||||
ggml_sycl_set_device(get_device_id_by_index(id));
|
||||
@@ -12809,12 +12893,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
|
||||
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
||||
// in that case an offset on dst_ddf_i is needed
|
||||
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
|
||||
dst_dd_i += row_low[id]; // offset is 0 if no tensor split
|
||||
}
|
||||
|
||||
// copy src0, src1 to device if necessary
|
||||
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
||||
if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
|
||||
if (id != g_main_device_index) {
|
||||
if (convert_src1_to_q8_1) {
|
||||
char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
|
||||
@@ -12830,14 +12914,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
src1_ncols * ne10 * sizeof(float))));
|
||||
}
|
||||
}
|
||||
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
||||
} else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
|
||||
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
|
||||
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
|
||||
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
||||
/*
|
||||
DPCT1010:92: SYCL uses exceptions to report errors and does
|
||||
@@ -12867,10 +12951,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
if (!dst_on_device) {
|
||||
void * dst_off_device;
|
||||
dpct::memcpy_direction kind;
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
dst_off_device = dst->data;
|
||||
kind = dpct::device_to_host;
|
||||
} else if (dst->backend == GGML_BACKEND_GPU) {
|
||||
} else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
dst_off_device = dst_extra->data_device[g_main_device_index];
|
||||
kind = dpct::device_to_device;
|
||||
} else {
|
||||
@@ -12954,7 +13038,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
}
|
||||
}
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
dpct::get_current_device().queues_wait_and_throw()));
|
||||
@@ -13091,7 +13175,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
|
||||
const ggml_tensor *src1,
|
||||
ggml_tensor *dst) try {
|
||||
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
||||
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
@@ -13129,7 +13213,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_is_permuted(src0));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -13196,7 +13280,7 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -13372,11 +13456,11 @@ catch (sycl::exception const &exc) {
|
||||
|
||||
static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool all_on_device =
|
||||
(src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
||||
(src1->backend == GGML_BACKEND_GPU) &&
|
||||
( dst->backend == GGML_BACKEND_GPU);
|
||||
(src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
|
||||
(src1->backend == GGML_BACKEND_TYPE_GPU) &&
|
||||
( dst->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
|
||||
int64_t min_compute_capability = INT_MAX;
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
@@ -13505,7 +13589,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
|
||||
GGML_ASSERT(!ggml_is_transposed(src00));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
|
||||
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
|
||||
@@ -13643,7 +13727,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
||||
|
||||
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
|
||||
|
||||
if (ids->backend == GGML_BACKEND_GPU) {
|
||||
if (ids->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
|
||||
@@ -13661,20 +13745,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
||||
ggml_tensor src1_row = *src1;
|
||||
ggml_tensor dst_row = *dst;
|
||||
|
||||
src1_row.backend = GGML_BACKEND_GPU;
|
||||
dst_row.backend = GGML_BACKEND_GPU;
|
||||
src1_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||
dst_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
src1_row.extra = &src1_row_extra;
|
||||
dst_row.extra = &dst_row_extra;
|
||||
|
||||
char * src1_original = src1->backend == GGML_BACKEND_CPU ?
|
||||
char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
|
||||
(char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
|
||||
char * dst_original = dst->backend == GGML_BACKEND_CPU ?
|
||||
char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
|
||||
(char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];
|
||||
|
||||
if (src1->ne[1] == 1) {
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||
//int32_t row_id;
|
||||
@@ -13756,7 +13840,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
||||
}
|
||||
}
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
|
||||
}
|
||||
}
|
||||
@@ -13779,8 +13863,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
||||
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
||||
@@ -13887,17 +13971,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
if (backend == GGML_BACKEND_GPU && id != g_main_device_index) {
|
||||
if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
|
||||
continue;
|
||||
}
|
||||
ggml_sycl_set_device(get_device_id_by_index(id));
|
||||
const dpct::queue_ptr stream = g_syclStreams[id][0];
|
||||
|
||||
int64_t row_low, row_high;
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
if (backend == GGML_BACKEND_TYPE_GPU) {
|
||||
row_low = 0;
|
||||
row_high = nrows;
|
||||
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
} else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||
const int64_t rounding = get_row_rounding(tensor->type);
|
||||
|
||||
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
||||
@@ -13946,7 +14030,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
|
||||
|
||||
extra->data_device[id] = buf;
|
||||
|
||||
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
|
||||
new sycl::event()));
|
||||
@@ -13963,7 +14047,7 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
|
||||
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
|
||||
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -14016,15 +14100,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
|
||||
return;
|
||||
}
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
const ggml_op src0_op = tensor->src[0]->op;
|
||||
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
||||
ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
|
||||
}
|
||||
}
|
||||
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
|
||||
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
|
||||
}
|
||||
|
||||
@@ -14042,7 +14126,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
|
||||
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
||||
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
|
||||
|
||||
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
||||
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
|
||||
size_t offset = 0;
|
||||
@@ -14111,7 +14195,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
|
||||
|
||||
const bool inplace = tensor->view_src != nullptr;
|
||||
|
||||
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
|
||||
if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
|
||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
|
||||
size_t view_offset = 0;
|
||||
@@ -14132,7 +14216,7 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor));
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
@@ -14219,9 +14303,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
if (!g_sycl_loaded) return false;
|
||||
|
||||
ggml_sycl_func_t func;
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
|
||||
return false;
|
||||
@@ -14359,14 +14443,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
return false;
|
||||
}
|
||||
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||
ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
|
||||
}
|
||||
|
||||
if (params->ith != 0) {
|
||||
return true;
|
||||
}
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return true;
|
||||
}
|
||||
func(tensor->src[0], tensor->src[1], tensor);
|
||||
@@ -14517,7 +14601,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
|
||||
extra->data_device[ctx->device] = tensor->data;
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
||||
tensor->extra = extra;
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
@@ -14548,7 +14632,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor,
|
||||
const void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
@@ -14573,7 +14657,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
const ggml_tensor *tensor,
|
||||
void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
@@ -14809,7 +14893,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
|
||||
(char *)tensor->data + offset, data, size)));
|
||||
@@ -14827,7 +14911,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
|
||||
data, (const char *)tensor->data + offset, size)));
|
||||
@@ -14880,7 +14964,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
|
||||
ggml_sycl_set_main_device(sycl_ctx->device);
|
||||
|
||||
ggml_compute_params params = {};
|
||||
params.type = GGML_TASK_COMPUTE;
|
||||
params.type = GGML_TASK_TYPE_COMPUTE;
|
||||
params.ith = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
@@ -14888,13 +14972,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
|
||||
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
|
||||
continue;
|
||||
|
||||
assert(node->backend == GGML_BACKEND_GPU);
|
||||
assert(node->backend == GGML_BACKEND_TYPE_GPU);
|
||||
assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
||||
assert(node->extra != nullptr);
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
if (node->src[j] != nullptr) {
|
||||
assert(node->src[j]->backend == GGML_BACKEND_GPU);
|
||||
assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
|
||||
assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
||||
assert(node->src[j]->extra != nullptr);
|
||||
}
|
||||
|
||||
+51
-51
@@ -2320,8 +2320,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
||||
|
||||
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
||||
@@ -2453,7 +2453,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
||||
// compute
|
||||
ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data);
|
||||
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
|
||||
@@ -2506,8 +2506,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
||||
|
||||
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
||||
@@ -2630,7 +2630,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
@@ -2647,7 +2647,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
||||
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
|
||||
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
@@ -2679,7 +2679,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
||||
|
||||
const uint64_t x_ne = ne00 * ne01 * ne02;
|
||||
const uint64_t y_ne = ne10 * ne11 * ne12;
|
||||
@@ -2721,7 +2721,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
@@ -2738,7 +2738,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_is_permuted(src0));
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -2771,7 +2771,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
||||
|
||||
const uint64_t d_ne = ne01 * ne11 * ne12;
|
||||
|
||||
@@ -2814,7 +2814,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
@@ -2832,7 +2832,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
||||
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
|
||||
dst->type == GGML_TYPE_F32 &&
|
||||
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU);
|
||||
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||
}
|
||||
|
||||
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
@@ -2880,8 +2880,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
|
||||
// TODO: support for transposed / permuted tensors
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
@@ -3110,8 +3110,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
}
|
||||
}
|
||||
|
||||
const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
|
||||
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
||||
|
||||
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
|
||||
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||
@@ -3120,7 +3120,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
vk_buffer d_D = extra->buffer_gpu.lock();
|
||||
|
||||
// Workaround for tiny tensor inputs on ROPE
|
||||
if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) {
|
||||
if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
|
||||
y_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
|
||||
@@ -3209,9 +3209,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
}
|
||||
if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
|
||||
ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
|
||||
} else if(dst->backend == GGML_BACKEND_CPU) {
|
||||
} else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
|
||||
@@ -3253,7 +3253,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
}
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
|
||||
}
|
||||
@@ -3359,7 +3359,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
||||
|
||||
static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
// If backend is CPU, data from src0 has to be copied off the device
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
vk_buffer d_D = extra_src0->buffer_gpu.lock();
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
@@ -3994,9 +3994,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
|
||||
#endif
|
||||
const bool any_on_device = node->backend == GGML_BACKEND_GPU
|
||||
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||
|| (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU));
|
||||
const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
|
||||
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
||||
|| (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
|
||||
|
||||
if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) {
|
||||
return;
|
||||
@@ -4215,9 +4215,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
||||
}
|
||||
|
||||
static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
|
||||
const bool any_on_device = node->backend == GGML_BACKEND_GPU
|
||||
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||
|| (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU);
|
||||
const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
|
||||
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
||||
|| (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
|
||||
return;
|
||||
@@ -4371,7 +4371,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
last_node = true;
|
||||
#endif
|
||||
|
||||
if (node->backend == GGML_BACKEND_CPU || last_node) {
|
||||
if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
|
||||
ggml_vk_ctx_end(ctx->compute_ctx);
|
||||
ctx->compute_ctx->exit_tensor = node;
|
||||
ctx->compute_ctx = nullptr;
|
||||
@@ -4379,9 +4379,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
}
|
||||
|
||||
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) {
|
||||
return false;
|
||||
@@ -4442,7 +4442,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
if (params->ith != 0) {
|
||||
return true;
|
||||
}
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -4745,7 +4745,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
|
||||
extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
|
||||
}
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
||||
tensor->extra = extra;
|
||||
}
|
||||
|
||||
@@ -4753,7 +4753,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
|
||||
@@ -4768,7 +4768,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
|
||||
@@ -4999,7 +4999,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
|
||||
#endif
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
||||
@@ -5020,7 +5020,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
|
||||
#endif
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
||||
@@ -5097,7 +5097,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
||||
int last_node = cgraph->n_nodes - 1;
|
||||
|
||||
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
|
||||
while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
|
||||
while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
last_node -= 1;
|
||||
}
|
||||
|
||||
@@ -5106,7 +5106,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
||||
}
|
||||
|
||||
ggml_compute_params params = {};
|
||||
params.type = GGML_TASK_COMPUTE;
|
||||
params.type = GGML_TASK_TYPE_COMPUTE;
|
||||
params.ith = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
@@ -5410,7 +5410,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
|
||||
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
|
||||
void * tensor_data = tensor->data;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
const size_t tensor_size = ggml_nbytes(tensor);
|
||||
tensor_data = malloc(tensor_size);
|
||||
|
||||
@@ -5436,14 +5436,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
||||
std::vector<const ggml_tensor *> done;
|
||||
ggml_vk_print_graph_origin(tensor, done);
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
free(tensor_data);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
||||
return;
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
return;
|
||||
}
|
||||
@@ -5481,7 +5481,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
|
||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -5518,10 +5518,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
src0_buffer = malloc(src0_size);
|
||||
src0_clone->data = src0_buffer;
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
memcpy(src0_clone->data, src0->data, src0_size);
|
||||
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
} else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
uint64_t offset = extra->offset;
|
||||
if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
|
||||
@@ -5561,10 +5561,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
src1_buffer = malloc(src1_size);
|
||||
src1_clone->data = src1_buffer;
|
||||
if (src1->backend == GGML_BACKEND_CPU) {
|
||||
if (src1->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
memcpy(src1_clone->data, src1->data, src1_size);
|
||||
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
} else if (src1->backend == GGML_BACKEND_GPU) {
|
||||
} else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
uint64_t offset = extra->offset;
|
||||
if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
|
||||
@@ -5723,7 +5723,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
|
||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
|
||||
return;
|
||||
}
|
||||
if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
|
||||
@@ -5735,7 +5735,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
void * tensor_data = tensor->data;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
size_t tensor_size = ggml_nbytes(tensor);
|
||||
tensor_data = malloc(tensor_size);
|
||||
|
||||
@@ -5868,7 +5868,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
comp_result = nullptr;
|
||||
comp_size = 0;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
free(tensor_data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -350,6 +350,7 @@ extern "C" {
|
||||
GGML_TYPE_IQ3_XXS = 18,
|
||||
GGML_TYPE_IQ1_S = 19,
|
||||
GGML_TYPE_IQ4_NL = 20,
|
||||
GGML_TYPE_IQ3_S = 21,
|
||||
GGML_TYPE_I8,
|
||||
GGML_TYPE_I16,
|
||||
GGML_TYPE_I32,
|
||||
@@ -363,9 +364,9 @@ extern "C" {
|
||||
};
|
||||
|
||||
enum ggml_backend_type {
|
||||
GGML_BACKEND_CPU = 0,
|
||||
GGML_BACKEND_GPU = 10,
|
||||
GGML_BACKEND_GPU_SPLIT = 20,
|
||||
GGML_BACKEND_TYPE_CPU = 0,
|
||||
GGML_BACKEND_TYPE_GPU = 10,
|
||||
GGML_BACKEND_TYPE_GPU_SPLIT = 20,
|
||||
};
|
||||
|
||||
// model file types
|
||||
@@ -389,6 +390,7 @@ extern "C" {
|
||||
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
@@ -496,9 +498,9 @@ extern "C" {
|
||||
};
|
||||
|
||||
enum ggml_object_type {
|
||||
GGML_OBJECT_TENSOR,
|
||||
GGML_OBJECT_GRAPH,
|
||||
GGML_OBJECT_WORK_BUFFER
|
||||
GGML_OBJECT_TYPE_TENSOR,
|
||||
GGML_OBJECT_TYPE_GRAPH,
|
||||
GGML_OBJECT_TYPE_WORK_BUFFER
|
||||
};
|
||||
|
||||
enum ggml_log_level {
|
||||
@@ -640,9 +642,9 @@ extern "C" {
|
||||
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
|
||||
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
|
||||
enum ggml_task_type {
|
||||
GGML_TASK_INIT = 0,
|
||||
GGML_TASK_COMPUTE,
|
||||
GGML_TASK_FINALIZE,
|
||||
GGML_TASK_TYPE_INIT = 0,
|
||||
GGML_TASK_TYPE_COMPUTE,
|
||||
GGML_TASK_TYPE_FINALIZE,
|
||||
};
|
||||
|
||||
struct ggml_compute_params {
|
||||
@@ -1647,8 +1649,8 @@ extern "C" {
|
||||
|
||||
// sort rows
|
||||
enum ggml_sort_order {
|
||||
GGML_SORT_ASC,
|
||||
GGML_SORT_DESC,
|
||||
GGML_SORT_ORDER_ASC,
|
||||
GGML_SORT_ORDER_DESC,
|
||||
};
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_argsort(
|
||||
@@ -1941,8 +1943,8 @@ extern "C" {
|
||||
|
||||
// optimization methods
|
||||
enum ggml_opt_type {
|
||||
GGML_OPT_ADAM,
|
||||
GGML_OPT_LBFGS,
|
||||
GGML_OPT_TYPE_ADAM,
|
||||
GGML_OPT_TYPE_LBFGS,
|
||||
};
|
||||
|
||||
// linesearch methods
|
||||
@@ -1956,12 +1958,12 @@ extern "C" {
|
||||
|
||||
// optimization return values
|
||||
enum ggml_opt_result {
|
||||
GGML_OPT_OK = 0,
|
||||
GGML_OPT_DID_NOT_CONVERGE,
|
||||
GGML_OPT_NO_CONTEXT,
|
||||
GGML_OPT_INVALID_WOLFE,
|
||||
GGML_OPT_FAIL,
|
||||
GGML_OPT_CANCEL,
|
||||
GGML_OPT_RESULT_OK = 0,
|
||||
GGML_OPT_RESULT_DID_NOT_CONVERGE,
|
||||
GGML_OPT_RESULT_NO_CONTEXT,
|
||||
GGML_OPT_RESULT_INVALID_WOLFE,
|
||||
GGML_OPT_RESULT_FAIL,
|
||||
GGML_OPT_RESULT_CANCEL,
|
||||
|
||||
GGML_LINESEARCH_FAIL = -128,
|
||||
GGML_LINESEARCH_MINIMUM_STEP,
|
||||
|
||||
@@ -64,6 +64,15 @@ extern "C" {
|
||||
LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
|
||||
enum llama_rope_type {
|
||||
LLAMA_ROPE_TYPE_NONE = -1,
|
||||
LLAMA_ROPE_TYPE_NORM = 0,
|
||||
LLAMA_ROPE_TYPE_NEOX = 2,
|
||||
LLAMA_ROPE_TYPE_GLM = 4,
|
||||
};
|
||||
|
||||
enum llama_token_type {
|
||||
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
|
||||
LLAMA_TOKEN_TYPE_NORMAL = 1,
|
||||
@@ -102,28 +111,30 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
||||
enum llama_rope_scaling_type {
|
||||
LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
|
||||
LLAMA_ROPE_SCALING_NONE = 0,
|
||||
LLAMA_ROPE_SCALING_LINEAR = 1,
|
||||
LLAMA_ROPE_SCALING_YARN = 2,
|
||||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
|
||||
LLAMA_ROPE_SCALING_TYPE_NONE = 0,
|
||||
LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
|
||||
LLAMA_ROPE_SCALING_TYPE_YARN = 2,
|
||||
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
|
||||
};
|
||||
|
||||
enum llama_pooling_type {
|
||||
LLAMA_POOLING_NONE = 0,
|
||||
LLAMA_POOLING_MEAN = 1,
|
||||
LLAMA_POOLING_CLS = 2,
|
||||
LLAMA_POOLING_TYPE_NONE = 0,
|
||||
LLAMA_POOLING_TYPE_MEAN = 1,
|
||||
LLAMA_POOLING_TYPE_CLS = 2,
|
||||
};
|
||||
|
||||
enum llama_split_mode {
|
||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||
LLAMA_SPLIT_ROW = 2, // split rows across GPUs
|
||||
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
||||
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
|
||||
};
|
||||
|
||||
typedef struct llama_token_data {
|
||||
@@ -171,9 +182,9 @@ extern "C" {
|
||||
} llama_batch;
|
||||
|
||||
enum llama_model_kv_override_type {
|
||||
LLAMA_KV_OVERRIDE_INT,
|
||||
LLAMA_KV_OVERRIDE_FLOAT,
|
||||
LLAMA_KV_OVERRIDE_BOOL,
|
||||
LLAMA_KV_OVERRIDE_TYPE_INT,
|
||||
LLAMA_KV_OVERRIDE_TYPE_FLOAT,
|
||||
LLAMA_KV_OVERRIDE_TYPE_BOOL,
|
||||
};
|
||||
|
||||
struct llama_model_kv_override {
|
||||
@@ -358,6 +369,7 @@ extern "C" {
|
||||
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
||||
|
||||
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
||||
@@ -512,10 +524,12 @@ extern "C" {
|
||||
llama_seq_id seq_id);
|
||||
|
||||
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_shift(
|
||||
LLAMA_API void llama_kv_cache_seq_add(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_pos p0,
|
||||
@@ -523,7 +537,9 @@ extern "C" {
|
||||
llama_pos delta);
|
||||
|
||||
// Integer division of the positions by factor of `d > 1`
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_div(
|
||||
@@ -533,6 +549,20 @@ extern "C" {
|
||||
llama_pos p1,
|
||||
int d);
|
||||
|
||||
// Returns the largest position present in the KV cache for the specified sequence
|
||||
LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id);
|
||||
|
||||
// Defragment the KV cache
|
||||
// This will be applied:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
|
||||
|
||||
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
||||
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// State / sessions
|
||||
//
|
||||
|
||||
@@ -1264,7 +1264,7 @@ struct test_argsort : public test_case {
|
||||
|
||||
test_argsort(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {16, 10, 10, 10},
|
||||
ggml_sort_order order = GGML_SORT_ASC)
|
||||
ggml_sort_order order = GGML_SORT_ORDER_ASC)
|
||||
: type(type), ne(ne), order(order) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
@@ -1918,7 +1918,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
GGML_TYPE_Q6_K,
|
||||
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
|
||||
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S,
|
||||
GGML_TYPE_IQ4_NL,
|
||||
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S,
|
||||
};
|
||||
|
||||
// unary ops
|
||||
@@ -2116,7 +2116,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
|
||||
|
||||
for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
|
||||
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
|
||||
}
|
||||
|
||||
+1
-1
@@ -118,7 +118,7 @@ int main(void) {
|
||||
const float fe = ggml_get_f32_1d(e, 0);
|
||||
printf("%s: e = %.4f\n", __func__, fe);
|
||||
|
||||
struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
|
||||
|
||||
ggml_opt(ctx, opt_params, e);
|
||||
|
||||
|
||||
@@ -151,6 +151,7 @@ int main(int argc, char * argv[]) {
|
||||
const float max_quantization_error =
|
||||
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR;
|
||||
failed = !(total_error < max_quantization_error);
|
||||
num_failed += failed;
|
||||
@@ -167,7 +168,8 @@ int main(int argc, char * argv[]) {
|
||||
|
||||
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
|
||||
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
||||
type == GGML_TYPE_IQ3_XXS ? MAX_DOT_PRODUCT_ERROR_LOWBIT : MAX_DOT_PRODUCT_ERROR;
|
||||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S ? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
||||
: MAX_DOT_PRODUCT_ERROR;
|
||||
failed = !(vec_dot_error < max_allowed_error);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
|
||||
@@ -404,7 +404,8 @@ static std::unordered_map<uint32_t, int> codepoint_type_map() {
|
||||
|
||||
static int codepoint_type(uint32_t cp) {
|
||||
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
|
||||
return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
|
||||
const auto it = codepoint_types.find(cp);
|
||||
return it == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
|
||||
}
|
||||
|
||||
static int codepoint_type(const std::string & utf8) {
|
||||
|
||||
Reference in New Issue
Block a user