Compare commits
56 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 117f7adbd9 | |||
| 91deef4606 | |||
| 902de8826b | |||
| 3e3cc7102f | |||
| c172b322c2 | |||
| d8f2da6b9f | |||
| 39a41a53b0 | |||
| d7fd29fff1 | |||
| 6f63d646c1 | |||
| 51d2ebadbb | |||
| 1e920018d3 | |||
| 01a5f06550 | |||
| 07786a61a2 | |||
| de14e2ea2b | |||
| 821922916f | |||
| b1c3f26e5e | |||
| b0a46993df | |||
| 807b0c49ff | |||
| f8c4c0738d | |||
| 402d6feffa | |||
| 20fc3804bf | |||
| f619024764 | |||
| d23287f122 | |||
| 5f2d4e60e2 | |||
| 916248af1f | |||
| f8d6a23804 | |||
| fadde67135 | |||
| a27152b602 | |||
| 3e2618bc7b | |||
| 07a3fc0608 | |||
| 968967376d | |||
| 023b8807e1 | |||
| 0e0590adab | |||
| a9f3b10215 | |||
| d08c20edde | |||
| 5fac350b9c | |||
| cb5fad4c6c | |||
| dae57a1ebc | |||
| 49122a873f | |||
| 0ddeff1023 | |||
| 3840b6f593 | |||
| 257f8e41e2 | |||
| 694c59cb42 | |||
| 197fe6c1d7 | |||
| d0a7145ba9 | |||
| 9ef0780062 | |||
| 1c5eba6f8e | |||
| 72272b83a3 | |||
| 8748d8ac6f | |||
| 26a39bbd6b | |||
| 38373cfbab | |||
| b851b3fba0 | |||
| 139cc621e9 | |||
| e57dc62057 | |||
| a27aa50ab7 | |||
| cb0b06a8a6 |
+7
-10
@@ -17,19 +17,18 @@
|
||||
rocmPackages,
|
||||
vulkan-headers,
|
||||
vulkan-loader,
|
||||
clblast,
|
||||
curl,
|
||||
useBlas ? builtins.all (x: !x) [
|
||||
useCuda
|
||||
useMetalKit
|
||||
useOpenCL
|
||||
useRocm
|
||||
useVulkan
|
||||
] && blas.meta.available,
|
||||
useCuda ? config.cudaSupport,
|
||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
|
||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
|
||||
useMpi ? false, # Increases the runtime closure size by ~700M
|
||||
useOpenCL ? false,
|
||||
useRocm ? config.rocmSupport,
|
||||
enableCurl ? true,
|
||||
useVulkan ? false,
|
||||
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
|
||||
|
||||
@@ -56,7 +55,6 @@ let
|
||||
++ lib.optionals useCuda [ "CUDA" ]
|
||||
++ lib.optionals useMetalKit [ "MetalKit" ]
|
||||
++ lib.optionals useMpi [ "MPI" ]
|
||||
++ lib.optionals useOpenCL [ "OpenCL" ]
|
||||
++ lib.optionals useRocm [ "ROCm" ]
|
||||
++ lib.optionals useVulkan [ "Vulkan" ];
|
||||
|
||||
@@ -198,19 +196,19 @@ effectiveStdenv.mkDerivation (
|
||||
optionals effectiveStdenv.isDarwin darwinBuildInputs
|
||||
++ optionals useCuda cudaBuildInputs
|
||||
++ optionals useMpi [ mpi ]
|
||||
++ optionals useOpenCL [ clblast ]
|
||||
++ optionals useRocm rocmBuildInputs
|
||||
++ optionals useBlas [ blas ]
|
||||
++ optionals useVulkan vulkanBuildInputs;
|
||||
++ optionals useVulkan vulkanBuildInputs
|
||||
++ optionals enableCurl [ curl ];
|
||||
|
||||
cmakeFlags =
|
||||
[
|
||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
|
||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||
(cmakeBool "LLAMA_CURL" enableCurl)
|
||||
(cmakeBool "GGML_NATIVE" false)
|
||||
(cmakeBool "GGML_BLAS" useBlas)
|
||||
(cmakeBool "GGML_CLBLAST" useOpenCL)
|
||||
(cmakeBool "GGML_CUDA" useCuda)
|
||||
(cmakeBool "GGML_HIPBLAS" useRocm)
|
||||
(cmakeBool "GGML_METAL" useMetalKit)
|
||||
@@ -254,7 +252,6 @@ effectiveStdenv.mkDerivation (
|
||||
useCuda
|
||||
useMetalKit
|
||||
useMpi
|
||||
useOpenCL
|
||||
useRocm
|
||||
useVulkan
|
||||
;
|
||||
@@ -281,7 +278,7 @@ effectiveStdenv.mkDerivation (
|
||||
# Configurations we don't want even the CI to evaluate. Results in the
|
||||
# "unsupported platform" messages. This is mostly a no-op, because
|
||||
# cudaPackages would've refused to evaluate anyway.
|
||||
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
|
||||
badPlatforms = optionals useCuda lib.platforms.darwin;
|
||||
|
||||
# Configurations that are known to result in build failures. Can be
|
||||
# overridden by importing Nixpkgs with `allowBroken = true`.
|
||||
|
||||
@@ -9,5 +9,3 @@ contact_links:
|
||||
- name: Want to contribute?
|
||||
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
|
||||
about: Head to the contribution guide page of the wiki for areas you can help with
|
||||
|
||||
|
||||
|
||||
@@ -799,6 +799,7 @@ jobs:
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
cd build
|
||||
$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
|
||||
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
|
||||
+6
-5
@@ -98,13 +98,14 @@ examples/server/*.mjs.hpp
|
||||
|
||||
# Python
|
||||
|
||||
__pycache__
|
||||
.venv
|
||||
/Pipfile
|
||||
dist
|
||||
poetry.lock
|
||||
/.venv
|
||||
__pycache__/
|
||||
*/poetry.lock
|
||||
poetry.toml
|
||||
|
||||
# Nix
|
||||
/result
|
||||
|
||||
# Test binaries
|
||||
/tests/test-backend-ops
|
||||
/tests/test-double-float
|
||||
|
||||
+14
-3
@@ -42,6 +42,10 @@ endif()
|
||||
|
||||
option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
|
||||
|
||||
if (WIN32)
|
||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
||||
endif()
|
||||
|
||||
#
|
||||
# option list
|
||||
#
|
||||
@@ -79,8 +83,15 @@ set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
|
||||
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
|
||||
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
||||
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
||||
set(GGML_LLAMAFILE ON)
|
||||
set(GGML_CUDA_USE_GRAPHS ON)
|
||||
|
||||
# change the default for these ggml options
|
||||
if (NOT DEFINED GGML_LLAMAFILE)
|
||||
set(GGML_LLAMAFILE ON)
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
|
||||
set(GGML_CUDA_USE_GRAPHS ON)
|
||||
endif()
|
||||
|
||||
# transition helpers
|
||||
function (llama_option_depr TYPE OLD NEW)
|
||||
@@ -145,7 +156,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
|
||||
|
||||
install(
|
||||
FILES convert-hf-to-gguf.py
|
||||
FILES convert_hf_to_gguf.py
|
||||
PERMISSIONS
|
||||
OWNER_READ
|
||||
OWNER_WRITE
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
"cacheVariables": {
|
||||
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
||||
"CMAKE_CXX_COMPILER": "icx",
|
||||
"CMAKE_C_COMPILER": "cl",
|
||||
"GGML_SYCL": "ON",
|
||||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||
}
|
||||
|
||||
@@ -45,6 +45,7 @@ BUILD_TARGETS = \
|
||||
TEST_TARGETS = \
|
||||
tests/test-autorelease \
|
||||
tests/test-backend-ops \
|
||||
tests/test-chat-template \
|
||||
tests/test-double-float \
|
||||
tests/test-grad0 \
|
||||
tests/test-grammar-integration \
|
||||
@@ -61,6 +62,11 @@ TEST_TARGETS = \
|
||||
tests/test-tokenizer-1-bpe \
|
||||
tests/test-tokenizer-1-spm
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
|
||||
LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
||||
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
|
||||
|
||||
# Deprecation aliases
|
||||
ifdef LLAMA_CUBLAS
|
||||
$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
|
||||
@@ -634,12 +640,6 @@ ifdef GGML_CUDA_DMMV_F16
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_F16
|
||||
endif # GGML_CUDA_DMMV_F16
|
||||
|
||||
ifdef GGML_CUDA_KQUANTS_ITER
|
||||
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
|
||||
else
|
||||
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
||||
endif
|
||||
|
||||
ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
|
||||
else
|
||||
@@ -728,7 +728,6 @@ ifdef GGML_HIPBLAS
|
||||
|
||||
GGML_CUDA_DMMV_X ?= 32
|
||||
GGML_CUDA_MMV_Y ?= 1
|
||||
GGML_CUDA_KQUANTS_ITER ?= 2
|
||||
|
||||
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
|
||||
|
||||
@@ -745,7 +744,6 @@ endif # GGML_HIP_UMA
|
||||
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
|
||||
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
|
||||
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
|
||||
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
|
||||
|
||||
ifdef GGML_CUDA_FORCE_DMMV
|
||||
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||
@@ -1070,6 +1068,7 @@ clean:
|
||||
rm -rvf src/*.o
|
||||
rm -rvf tests/*.o
|
||||
rm -rvf examples/*.o
|
||||
rm -rvf common/*.o
|
||||
rm -rvf *.a
|
||||
rm -rvf *.dll
|
||||
rm -rvf *.so
|
||||
@@ -1084,6 +1083,7 @@ clean:
|
||||
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
|
||||
rm -rvf $(BUILD_TARGETS)
|
||||
rm -rvf $(TEST_TARGETS)
|
||||
rm -rvf $(LEGACY_TARGETS)
|
||||
find examples pocs -type f -name "*.o" -delete
|
||||
|
||||
#
|
||||
|
||||
@@ -26,7 +26,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
### Hot topics
|
||||
|
||||
- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
|
||||
- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
|
||||
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
|
||||
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
|
||||
@@ -108,6 +108,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
|
||||
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
|
||||
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
|
||||
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
|
||||
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
|
||||
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
|
||||
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
|
||||
@@ -217,6 +218,11 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
**Tools:**
|
||||
|
||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
||||
|
||||
**Infrastructure:**
|
||||
|
||||
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
|
||||
|
||||
---
|
||||
|
||||
@@ -515,7 +521,6 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
||||
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
|
||||
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
||||
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
|
||||
|
||||
@@ -576,7 +581,6 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||
|
||||
- #### Vulkan
|
||||
|
||||
@@ -630,8 +634,8 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
|
||||
|
||||
Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
|
||||
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
|
||||
Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
|
||||
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
|
||||
|
||||
```bash
|
||||
# obtain the official LLaMA model weights and place them in ./models
|
||||
@@ -648,7 +652,7 @@ ls ./models
|
||||
python3 -m pip install -r requirements.txt
|
||||
|
||||
# convert the model to ggml FP16 format
|
||||
python3 convert-hf-to-gguf.py models/mymodel/
|
||||
python3 convert_hf_to_gguf.py models/mymodel/
|
||||
|
||||
# quantize the model to 4-bits (using Q4_K_M method)
|
||||
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
|
||||
@@ -287,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
@@ -421,7 +421,7 @@ function gg_run_pythia_1_4b {
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
@@ -553,7 +553,7 @@ function gg_run_pythia_2_8b {
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
@@ -688,7 +688,7 @@ function gg_run_embd_bge_small {
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
|
||||
+38
-5
@@ -757,7 +757,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
params.cache_type_v = argv[++i];
|
||||
return true;
|
||||
}
|
||||
if (arg == "--multiline-input") {
|
||||
if (arg == "-mli" || arg == "--multiline-input") {
|
||||
params.multiline_input = true;
|
||||
return true;
|
||||
}
|
||||
@@ -1014,16 +1014,23 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
}
|
||||
if (arg == "--in-prefix-bos") {
|
||||
params.input_prefix_bos = true;
|
||||
params.enable_chat_template = false;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--in-prefix") {
|
||||
CHECK_ARG
|
||||
params.input_prefix = argv[i];
|
||||
params.enable_chat_template = false;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--in-suffix") {
|
||||
CHECK_ARG
|
||||
params.input_suffix = argv[i];
|
||||
params.enable_chat_template = false;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--spm-infill") {
|
||||
params.spm_infill = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--grammar") {
|
||||
@@ -1402,13 +1409,15 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
"halt generation at PROMPT, return control in interactive mode\n"
|
||||
"can be specified more than once for multiple prompts" });
|
||||
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
|
||||
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
|
||||
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: %s)", params.conversation ? "true" : "false" });
|
||||
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
|
||||
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
|
||||
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
|
||||
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
|
||||
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
|
||||
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
|
||||
options.push_back({ "server infill",
|
||||
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
|
||||
|
||||
options.push_back({ "sampling" });
|
||||
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
|
||||
@@ -2061,7 +2070,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
if (params.warmup) {
|
||||
LOG("warming up the model with an empty run\n");
|
||||
|
||||
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
|
||||
std::vector<llama_token> tmp;
|
||||
llama_token bos = llama_token_bos(model);
|
||||
llama_token eos = llama_token_eos(model);
|
||||
// some models (e.g. T5) don't have a BOS token
|
||||
if (bos != -1) {
|
||||
tmp.push_back(bos);
|
||||
}
|
||||
tmp.push_back(eos);
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == -1) {
|
||||
decoder_start_token_id = bos;
|
||||
}
|
||||
tmp.clear();
|
||||
tmp.push_back(decoder_start_token_id);
|
||||
}
|
||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
|
||||
llama_kv_cache_clear(lctx);
|
||||
llama_synchronize(lctx);
|
||||
@@ -2662,12 +2688,19 @@ std::string llama_chat_format_single(const struct llama_model * model,
|
||||
const std::vector<llama_chat_msg> & past_msg,
|
||||
const llama_chat_msg & new_msg,
|
||||
bool add_ass) {
|
||||
std::ostringstream ss;
|
||||
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
|
||||
std::vector<llama_chat_msg> chat_new(past_msg);
|
||||
// if the past_msg ends with a newline, we must preserve it in the formatted version
|
||||
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
|
||||
ss << "\n";
|
||||
};
|
||||
// format chat with new_msg
|
||||
chat_new.push_back(new_msg);
|
||||
auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
|
||||
auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
|
||||
return formatted;
|
||||
// get the diff part
|
||||
ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
std::string llama_chat_format_example(const struct llama_model * model,
|
||||
|
||||
+3
-1
@@ -200,6 +200,7 @@ struct gpt_params {
|
||||
std::string public_path = "";
|
||||
std::string chat_template = "";
|
||||
std::string system_prompt = "";
|
||||
bool enable_chat_template = true;
|
||||
|
||||
std::vector<std::string> api_keys;
|
||||
|
||||
@@ -250,6 +251,8 @@ struct gpt_params {
|
||||
std::string cvector_outfile = "control_vector.gguf";
|
||||
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||
|
||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||
};
|
||||
|
||||
void gpt_params_handle_model_default(gpt_params & params);
|
||||
@@ -456,4 +459,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
||||
void yaml_dump_non_result_info(
|
||||
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
||||
|
||||
|
||||
@@ -316,7 +316,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
|
||||
};
|
||||
|
||||
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
|
||||
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
|
||||
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
|
||||
|
||||
template <typename Iterator>
|
||||
std::string join(Iterator begin, Iterator end, const std::string & separator) {
|
||||
@@ -720,7 +720,7 @@ private:
|
||||
}
|
||||
prop_names.push_back(prop_name);
|
||||
}
|
||||
if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
|
||||
if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
|
||||
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
|
||||
std::string value_rule =
|
||||
additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
|
||||
|
||||
@@ -13,7 +13,7 @@ import sys
|
||||
from enum import IntEnum
|
||||
from pathlib import Path
|
||||
from hashlib import sha256
|
||||
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
|
||||
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
@@ -404,7 +404,7 @@ class Model:
|
||||
|
||||
return tokens, toktypes, tokpre
|
||||
|
||||
# NOTE: this function is generated by convert-hf-to-gguf-update.py
|
||||
# NOTE: this function is generated by convert_hf_to_gguf_update.py
|
||||
# do not modify it manually!
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
# Marker: Start get_vocab_base_pre
|
||||
@@ -424,7 +424,7 @@ class Model:
|
||||
|
||||
res = None
|
||||
|
||||
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
|
||||
# NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
|
||||
# or pull the latest version of the model from Huggingface
|
||||
# don't edit the hashes manually!
|
||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||
@@ -490,15 +490,18 @@ class Model:
|
||||
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
|
||||
# ref: https://huggingface.co/LumiOpen/Viking-7B
|
||||
res = "viking"
|
||||
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
|
||||
# ref: https://huggingface.co/core42/jais-13b
|
||||
res = "jais"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
logger.warning("**************************************************************************************")
|
||||
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
|
||||
logger.warning("** There are 2 possible reasons for this:")
|
||||
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
|
||||
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
|
||||
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
logger.warning("**")
|
||||
logger.warning(f"** chkhsh: {chkhsh}")
|
||||
@@ -576,7 +579,19 @@ class Model:
|
||||
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_sentencepiece(self):
|
||||
def _set_vocab_sentencepiece(self, add_to_gguf=True):
|
||||
tokens, scores, toktypes = self._create_vocab_sentencepiece()
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _create_vocab_sentencepiece(self):
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
tokenizer_path = self.dir_model / 'tokenizer.model'
|
||||
@@ -638,14 +653,7 @@ class Model:
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
return tokens, scores, toktypes
|
||||
|
||||
def _set_vocab_llama_hf(self):
|
||||
vocab = gguf.LlamaHfVocab(self.dir_model)
|
||||
@@ -669,6 +677,51 @@ class Model:
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
|
||||
tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
|
||||
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
|
||||
vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
|
||||
|
||||
default_pre = "mpt" if model_name == "gpt-neox" else "default"
|
||||
|
||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
|
||||
assert field # tokenizer model
|
||||
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
|
||||
|
||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
|
||||
self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
|
||||
|
||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
|
||||
assert field # token list
|
||||
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
|
||||
|
||||
if model_name == "llama-spm":
|
||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
|
||||
assert field # token scores
|
||||
self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
||||
|
||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
|
||||
assert field # token types
|
||||
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
||||
|
||||
if model_name != "llama-spm":
|
||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
|
||||
assert field # token merges
|
||||
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
|
||||
|
||||
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
|
||||
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
|
||||
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
|
||||
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
|
||||
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
|
||||
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
|
||||
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
|
||||
self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
|
||||
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
|
||||
self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
|
||||
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
|
||||
self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
|
||||
|
||||
|
||||
@Model.register("GPTNeoXForCausalLM")
|
||||
class GPTNeoXModel(Model):
|
||||
@@ -1934,7 +1987,7 @@ class Phi3MiniModel(Model):
|
||||
if len(rope_scaling_type) == 0:
|
||||
raise KeyError('Missing the required key rope_scaling.type')
|
||||
|
||||
if rope_scaling_type == 'su':
|
||||
if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
|
||||
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
|
||||
elif rope_scaling_type == 'yarn':
|
||||
attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
|
||||
@@ -2308,6 +2361,8 @@ class GemmaModel(Model):
|
||||
special_vocab._set_special_token("eot", 107)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
@@ -2340,6 +2395,71 @@ class GemmaModel(Model):
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("Gemma2ForCausalLM")
|
||||
class Gemma2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA2
|
||||
|
||||
def set_vocab(self):
|
||||
tokens, scores, toktypes = self._create_vocab_sentencepiece()
|
||||
# hack: This is required so that we can properly use start/end-of-turn for chat template
|
||||
for i in range(108):
|
||||
# including <unusedX>, <start_of_turn>, <end_of_turn>
|
||||
toktypes[i] = SentencePieceTokenTypes.CONTROL
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_key_length(hparams["head_dim"])
|
||||
self.gguf_writer.add_value_length(hparams["head_dim"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_attn_logit_softcapping(
|
||||
self.hparams["attn_logit_softcapping"]
|
||||
)
|
||||
self.gguf_writer.add_final_logit_softcapping(
|
||||
self.hparams["final_logit_softcapping"]
|
||||
)
|
||||
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
|
||||
|
||||
# sanity check
|
||||
attn_scalar = self.hparams["query_pre_attn_scalar"]
|
||||
if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
|
||||
raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unusem
|
||||
|
||||
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
||||
# To prevent errors, skip loading lm_head.weight.
|
||||
if name == "lm_head.weight":
|
||||
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||
return []
|
||||
|
||||
# ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
|
||||
if name.endswith("norm.weight"):
|
||||
data_torch = data_torch + 1
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("Starcoder2ForCausalLM")
|
||||
class StarCoder2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.STARCODER2
|
||||
@@ -2364,39 +2484,7 @@ class MambaModel(Model):
|
||||
self._set_vocab_sentencepiece()
|
||||
else:
|
||||
# Use the GPT-NeoX tokenizer when no tokenizer files are present
|
||||
tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
|
||||
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
|
||||
neox_reader = gguf.GGUFReader(tokenizer_path, "r")
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
|
||||
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
|
||||
self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
|
||||
assert field
|
||||
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
|
||||
assert field
|
||||
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
|
||||
assert field
|
||||
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
|
||||
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
|
||||
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
|
||||
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
|
||||
self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
|
||||
self._set_vocab_builtin("gpt-neox", vocab_size)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
d_model = self.find_hparam(["hidden_size", "d_model"])
|
||||
@@ -2548,6 +2636,82 @@ class JinaBertV2Model(BertModel):
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
|
||||
@Model.register("OpenELMForCausalLM")
|
||||
class OpenELMModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.OPENELM
|
||||
|
||||
@staticmethod
|
||||
def _make_divisible(v: float | int, divisor: int) -> int:
|
||||
# ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
|
||||
new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
|
||||
# Make sure that round down does not go down by more than 10%.
|
||||
if new_v < 0.9 * v:
|
||||
new_v += divisor
|
||||
return new_v
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
|
||||
ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
|
||||
self._n_embd: int = self.hparams["model_dim"]
|
||||
self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
|
||||
self._num_query_heads: list[int] = self.hparams["num_query_heads"]
|
||||
self._ffn_dims: list[int] = [
|
||||
OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
|
||||
for multiplier in ffn_multipliers
|
||||
]
|
||||
assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
|
||||
assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
|
||||
|
||||
# Uses the tokenizer from meta-llama/Llama-2-7b-hf
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
n_embd = self._n_embd
|
||||
head_dim = self.hparams["head_dim"]
|
||||
rot_pct = 1.0
|
||||
assert self.block_count == len(self._num_kv_heads)
|
||||
assert self.block_count == len(self._num_query_heads)
|
||||
assert self.block_count == len(self._ffn_dims)
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
self.gguf_writer.add_context_length(self.hparams["max_context_length"])
|
||||
self.gguf_writer.add_embedding_length(n_embd)
|
||||
self.gguf_writer.add_feed_forward_length(self._ffn_dims)
|
||||
self.gguf_writer.add_head_count(self._num_query_heads)
|
||||
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
|
||||
# https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
|
||||
self.gguf_writer.add_layer_norm_rms_eps(1e-6)
|
||||
self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
|
||||
self.gguf_writer.add_key_length(head_dim)
|
||||
self.gguf_writer.add_value_length(head_dim)
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
|
||||
if "n_layers" in keys:
|
||||
return self.hparams["num_transformer_layers"]
|
||||
|
||||
return super().find_hparam(keys, optional)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
|
||||
# split ff
|
||||
if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
|
||||
ff_dim = self._ffn_dims[bid]
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
|
||||
return
|
||||
|
||||
yield (self.map_tensor_name(name), data_torch)
|
||||
|
||||
|
||||
@Model.register("ArcticForCausalLM")
|
||||
class ArcticModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.ARCTIC
|
||||
@@ -2778,11 +2942,17 @@ class DeepseekV2Model(Model):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("T5ForConditionalGeneration")
|
||||
@Model.register("T5WithLMHeadModel")
|
||||
@Model.register("T5ForConditionalGeneration")
|
||||
@Model.register("MT5ForConditionalGeneration")
|
||||
@Model.register("UMT5ForConditionalGeneration")
|
||||
class T5Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.T5
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.shared_token_embeddings_found = False
|
||||
|
||||
def set_vocab(self):
|
||||
# to avoid TypeError: Descriptors cannot be created directly
|
||||
# exception when importing sentencepiece_model_pb2
|
||||
@@ -2790,17 +2960,29 @@ class T5Model(Model):
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
from sentencepiece import sentencepiece_model_pb2 as model
|
||||
|
||||
tokenizer_path = self.dir_model / 'spiece.model'
|
||||
tokenizer_path = self.dir_model / 'tokenizer.model'
|
||||
|
||||
# many older models use spiece.model tokenizer model filename
|
||||
if not tokenizer_path.is_file():
|
||||
tokenizer_path = self.dir_model / 'spiece.model'
|
||||
|
||||
if not tokenizer_path.is_file():
|
||||
raise FileNotFoundError(f"File not found: {tokenizer_path}")
|
||||
|
||||
sentencepiece_model = model.ModelProto()
|
||||
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
||||
|
||||
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
|
||||
if sentencepiece_model.trainer_spec.model_type == 2: # BPE
|
||||
# assure the tokenizer model file name is correct
|
||||
assert tokenizer_path.name == 'tokenizer.model'
|
||||
return self._set_vocab_sentencepiece()
|
||||
else:
|
||||
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
|
||||
|
||||
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
||||
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
|
||||
precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
|
||||
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
|
||||
|
||||
tokenizer = SentencePieceProcessor()
|
||||
tokenizer.LoadFromFile(str(tokenizer_path))
|
||||
@@ -2870,7 +3052,10 @@ class T5Model(Model):
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name("T5")
|
||||
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
|
||||
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
|
||||
n_ctx = 512
|
||||
self.gguf_writer.add_context_length(n_ctx)
|
||||
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
|
||||
self.gguf_writer.add_block_count(self.hparams["num_layers"])
|
||||
@@ -2886,16 +3071,111 @@ class T5Model(Model):
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
# Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
|
||||
# "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
|
||||
# To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
|
||||
if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
|
||||
logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
|
||||
return []
|
||||
# T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
|
||||
# "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
|
||||
# in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
|
||||
# and decoder and ignore the remaining ones.
|
||||
if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
|
||||
if not self.shared_token_embeddings_found:
|
||||
name = "shared.weight"
|
||||
self.shared_token_embeddings_found = True
|
||||
else:
|
||||
logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("JAISLMHeadModel")
|
||||
class JaisModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.JAIS
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# SwigLU activation
|
||||
assert self.hparams["activation_function"] == "swiglu"
|
||||
# ALiBi position embedding
|
||||
assert self.hparams["position_embedding_type"] == "alibi"
|
||||
|
||||
# Embeddings scale
|
||||
self.embeddings_scale = 1.0
|
||||
# note: For some JAIS flavors, output is tied to (same as) wte in original model
|
||||
self.output_is_wte = False
|
||||
if 'mup_embeddings_scale' in self.hparams:
|
||||
self.output_is_wte = True # Hack (?)
|
||||
self.embeddings_scale = self.hparams['mup_embeddings_scale']
|
||||
elif 'embeddings_scale' in self.hparams:
|
||||
self.embeddings_scale = self.hparams['embeddings_scale']
|
||||
else:
|
||||
assert False
|
||||
|
||||
self.width_scale = 1.0
|
||||
if 'mup_output_alpha' in self.hparams:
|
||||
assert 'mup_width_scale' in self.hparams
|
||||
self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
|
||||
elif 'width_scale' in self.hparams:
|
||||
self.width_scale = self.hparams['width_scale']
|
||||
else:
|
||||
assert False
|
||||
|
||||
self.max_alibi_bias = 8.0
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
||||
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
|
||||
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# we don't need these
|
||||
if name.endswith((".attn.bias")):
|
||||
return tensors
|
||||
|
||||
if name.endswith(("relative_pe.slopes")):
|
||||
# Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
|
||||
# Some other models has max_alibi_bias spelled out explicitly in the hyperparams,
|
||||
# but Jais's PyTorch model simply precalculates the slope values and places them
|
||||
# in relative_pes.slopes
|
||||
n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
|
||||
first_val = float(data_torch._data[0])
|
||||
self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
|
||||
|
||||
return tensors
|
||||
|
||||
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
|
||||
data_torch = data_torch.transpose(1, 0)
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
|
||||
tensors.append((new_name, data_torch * self.embeddings_scale))
|
||||
if self.output_is_wte:
|
||||
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
|
||||
elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
|
||||
assert not self.output_is_wte
|
||||
tensors.append((new_name, data_torch * self.width_scale))
|
||||
else:
|
||||
tensors.append((new_name, data_torch))
|
||||
|
||||
return tensors
|
||||
|
||||
def write_tensors(self):
|
||||
super().write_tensors()
|
||||
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
@@ -3051,7 +3331,8 @@ def main() -> None:
|
||||
"auto": gguf.LlamaFileType.GUESSED,
|
||||
}
|
||||
|
||||
if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
|
||||
is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
|
||||
if args.use_temp_file and is_split:
|
||||
logger.error("Error: Cannot use temp file when splitting")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -3088,11 +3369,12 @@ def main() -> None:
|
||||
if args.vocab_only:
|
||||
logger.info("Exporting model vocab...")
|
||||
model_instance.write_vocab()
|
||||
logger.info("Model vocab successfully exported.")
|
||||
logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
|
||||
else:
|
||||
logger.info("Exporting model...")
|
||||
model_instance.write()
|
||||
logger.info("Model successfully exported.")
|
||||
out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
|
||||
logger.info(f"Model successfully exported to {out_path}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
@@ -2,7 +2,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This script downloads the tokenizer models of the specified models from Huggingface and
|
||||
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
|
||||
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
|
||||
#
|
||||
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
|
||||
# provide the necessary information to llama.cpp via the GGUF header in order to implement
|
||||
@@ -15,9 +15,9 @@
|
||||
# - Add a new model to the "models" list
|
||||
# - Run the script with your huggingface token:
|
||||
#
|
||||
# python3 convert-hf-to-gguf-update.py <huggingface_token>
|
||||
# python3 convert_hf_to_gguf_update.py <huggingface_token>
|
||||
#
|
||||
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
|
||||
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
|
||||
# - Update llama.cpp with the new pre-tokenizer if necessary
|
||||
#
|
||||
# TODO: generate tokenizer tests for llama.cpp
|
||||
@@ -37,7 +37,7 @@ from enum import IntEnum, auto
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logger = logging.getLogger("convert-hf-to-gguf-update")
|
||||
logger = logging.getLogger("convert_hf_to_gguf_update")
|
||||
sess = requests.Session()
|
||||
|
||||
|
||||
@@ -45,6 +45,7 @@ class TOKENIZER_TYPE(IntEnum):
|
||||
SPM = auto()
|
||||
BPE = auto()
|
||||
WPM = auto()
|
||||
UGM = auto()
|
||||
|
||||
|
||||
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
||||
@@ -55,10 +56,10 @@ if len(sys.argv) == 2:
|
||||
token = sys.argv[1]
|
||||
if not token.startswith("hf_"):
|
||||
logger.info("Huggingface token seems invalid")
|
||||
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
|
||||
logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
|
||||
logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
|
||||
sys.exit(1)
|
||||
|
||||
# TODO: add models here, base models preferred
|
||||
@@ -86,6 +87,10 @@ models = [
|
||||
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
||||
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
||||
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
|
||||
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
|
||||
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
|
||||
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
|
||||
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
|
||||
]
|
||||
|
||||
|
||||
@@ -107,9 +112,13 @@ def download_model(model):
|
||||
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
|
||||
|
||||
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
|
||||
|
||||
if tokt == TOKENIZER_TYPE.SPM:
|
||||
files.append("tokenizer.model")
|
||||
|
||||
if tokt == TOKENIZER_TYPE.UGM:
|
||||
files.append("spiece.model")
|
||||
|
||||
for file in files:
|
||||
save_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(save_path):
|
||||
@@ -125,14 +134,14 @@ for model in models:
|
||||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||
|
||||
|
||||
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
|
||||
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
|
||||
|
||||
src_ifs = ""
|
||||
for model in models:
|
||||
name = model["name"]
|
||||
tokt = model["tokt"]
|
||||
|
||||
if tokt == TOKENIZER_TYPE.SPM:
|
||||
if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
|
||||
continue
|
||||
|
||||
# Skip if the tokenizer folder does not exist or there are other download issues previously
|
||||
@@ -142,7 +151,10 @@ for model in models:
|
||||
|
||||
# create the tokenizer
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
|
||||
if name == "t5":
|
||||
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
|
||||
else:
|
||||
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
|
||||
except OSError as e:
|
||||
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
|
||||
continue # Skip to the next model if the tokenizer can't be loaded
|
||||
@@ -189,7 +201,7 @@ src_func = f"""
|
||||
|
||||
res = None
|
||||
|
||||
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
|
||||
# NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
|
||||
# or pull the latest version of the model from Huggingface
|
||||
# don't edit the hashes manually!
|
||||
{src_ifs}
|
||||
@@ -198,9 +210,9 @@ src_func = f"""
|
||||
logger.warning("**************************************************************************************")
|
||||
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
|
||||
logger.warning("** There are 2 possible reasons for this:")
|
||||
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
|
||||
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
|
||||
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
logger.warning("**")
|
||||
logger.warning(f"** chkhsh: {{chkhsh}}")
|
||||
@@ -214,7 +226,7 @@ src_func = f"""
|
||||
return res
|
||||
"""
|
||||
|
||||
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
||||
convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
|
||||
convert_py = convert_py_pth.read_text(encoding="utf-8")
|
||||
convert_py = re.sub(
|
||||
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
||||
@@ -225,7 +237,7 @@ convert_py = re.sub(
|
||||
|
||||
convert_py_pth.write_text(convert_py, encoding="utf-8")
|
||||
|
||||
logger.info("+++ convert-hf-to-gguf.py was updated")
|
||||
logger.info("+++ convert_hf_to_gguf.py was updated")
|
||||
|
||||
# generate tests for each tokenizer model
|
||||
|
||||
@@ -263,6 +275,7 @@ tests = [
|
||||
"\n =",
|
||||
"' era",
|
||||
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
|
||||
"!!!!!!",
|
||||
"3",
|
||||
"33",
|
||||
"333",
|
||||
@@ -272,7 +285,8 @@ tests = [
|
||||
"3333333",
|
||||
"33333333",
|
||||
"333333333",
|
||||
# "Cửa Việt", # llama-bpe fails on this
|
||||
"Cửa Việt", # llama-bpe fails on this
|
||||
" discards",
|
||||
chktxt,
|
||||
]
|
||||
|
||||
@@ -300,7 +314,10 @@ for model in models:
|
||||
|
||||
# create the tokenizer
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
|
||||
if name == "t5":
|
||||
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
|
||||
else:
|
||||
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
|
||||
except OSError as e:
|
||||
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
|
||||
continue # Skip this model and continue with the next one in the loop
|
||||
@@ -326,6 +343,6 @@ logger.info("\nRun the following commands to generate the vocab files for testin
|
||||
for model in models:
|
||||
name = model["name"]
|
||||
|
||||
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
|
||||
print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
|
||||
|
||||
logger.info("\n")
|
||||
@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
|
||||
### 1. Convert the model to GGUF
|
||||
|
||||
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
|
||||
Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
|
||||
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
|
||||
|
||||
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
|
||||
|
||||
|
||||
@@ -93,14 +93,34 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// create a llama_batch
|
||||
// we use this object to submit token data for decoding
|
||||
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
|
||||
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
|
||||
|
||||
std::vector<llama_seq_id> seq_ids(n_parallel, 0);
|
||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||
seq_ids[i] = i;
|
||||
}
|
||||
|
||||
// evaluate the initial prompt
|
||||
for (size_t i = 0; i < tokens_list.size(); ++i) {
|
||||
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
|
||||
llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
|
||||
}
|
||||
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
if (llama_encode(ctx, batch)) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == -1) {
|
||||
decoder_start_token_id = llama_token_bos(model);
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
@@ -109,11 +129,11 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// assign the system KV cache to all parallel sequences
|
||||
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
|
||||
for (int32_t i = 1; i < n_parallel; ++i) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||
}
|
||||
//// assign the system KV cache to all parallel sequences
|
||||
//// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
|
||||
//for (int32_t i = 1; i < n_parallel; ++i) {
|
||||
// llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||
//}
|
||||
|
||||
if (n_parallel > 1) {
|
||||
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
|
||||
|
||||
@@ -58,4 +58,3 @@ The above command will output space-separated float values.
|
||||
```powershell
|
||||
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
|
||||
```
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
|
||||
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
|
||||
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
|
||||
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
|
||||
|
||||
## Input Prompts
|
||||
|
||||
|
||||
+13
-12
@@ -210,6 +210,7 @@ int main(int argc, char ** argv) {
|
||||
suff_rm_leading_spc = false;
|
||||
}
|
||||
std::vector<llama_token> embd_inp;
|
||||
std::vector<llama_token> embd_end;
|
||||
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
|
||||
const int space_token = 29871;
|
||||
@@ -217,12 +218,13 @@ int main(int argc, char ** argv) {
|
||||
inp_sfx.erase(inp_sfx.begin());
|
||||
}
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
||||
if (add_bos) {
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
|
||||
}
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
embd_inp = inp_pfx;
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
const llama_token middle_token = llama_token_middle(model);
|
||||
if (middle_token >= 0) {
|
||||
@@ -526,14 +528,14 @@ int main(int argc, char ** argv) {
|
||||
inp_sfx.erase(inp_sfx.begin());
|
||||
}
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
||||
if (add_bos) {
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
|
||||
}
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
embd_inp = inp_pfx;
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
const llama_token middle_token = llama_token_middle(model);
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
@@ -657,4 +659,3 @@ int main(int argc, char ** argv) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Usage:
|
||||
#! ./llama-server -m some-model.gguf &
|
||||
#! pip install pydantic
|
||||
#! python json-schema-pydantic-example.py
|
||||
#! python json_schema_pydantic_example.py
|
||||
|
||||
from pydantic import BaseModel, Extra, TypeAdapter
|
||||
from annotated_types import MinLen
|
||||
@@ -231,7 +231,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
|
||||
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
|
||||
|
||||
NON_LITERAL_SET = set('|.()[]{}*+?')
|
||||
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
|
||||
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
|
||||
|
||||
|
||||
class SchemaConverter:
|
||||
@@ -602,7 +602,7 @@ class SchemaConverter:
|
||||
else:
|
||||
add_component(t, is_required=True)
|
||||
|
||||
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[]))
|
||||
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
|
||||
|
||||
elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
|
||||
items = schema.get('items') or schema['prefixItems']
|
||||
@@ -691,7 +691,7 @@ class SchemaConverter:
|
||||
required_props = [k for k in sorted_props if k in required]
|
||||
optional_props = [k for k in sorted_props if k not in required]
|
||||
|
||||
if additional_properties != False:
|
||||
if additional_properties is not None and additional_properties != False:
|
||||
sub_name = f'{name}{"-" if name else ""}additional'
|
||||
value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
|
||||
self._add_primitive('value', PRIMITIVE_RULES['value'])
|
||||
|
||||
@@ -30,16 +30,16 @@ git clone https://huggingface.co/mtgv/MobileVLM-1.7B
|
||||
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
```
|
||||
|
||||
2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
|
||||
python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert-image-encoder-to-gguf \
|
||||
python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B \
|
||||
@@ -47,17 +47,17 @@ python ./examples/llava/convert-image-encoder-to-gguf \
|
||||
```
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert-image-encoder-to-gguf \
|
||||
python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B_V2 \
|
||||
--projector-type ldpv2
|
||||
```
|
||||
|
||||
4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
|
||||
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
|
||||
|
||||
+10
-10
@@ -38,22 +38,22 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
pip install -r examples/llava/requirements.txt
|
||||
```
|
||||
|
||||
3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
|
||||
python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
|
||||
```
|
||||
|
||||
4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
|
||||
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
||||
```
|
||||
|
||||
5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
|
||||
python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
|
||||
```
|
||||
|
||||
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
|
||||
@@ -70,9 +70,9 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
|
||||
pip install -r examples/llava/requirements.txt
|
||||
```
|
||||
|
||||
3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
|
||||
3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
|
||||
```console
|
||||
python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
|
||||
python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
|
||||
```
|
||||
- you will find a llava.projector and a llava.clip file in your model directory
|
||||
|
||||
@@ -86,13 +86,13 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso
|
||||
|
||||
5) Create the visual gguf model:
|
||||
```console
|
||||
python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
||||
```
|
||||
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
|
||||
|
||||
6) Then convert the model to gguf format:
|
||||
```console
|
||||
python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
|
||||
python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
|
||||
```
|
||||
|
||||
7) And finally we can run the llava cli using the 1.6 model version:
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
-r ../../requirements/requirements-convert-legacy-llama.txt
|
||||
-r ../../requirements/requirements-convert_legacy_llama.txt
|
||||
pillow~=10.2.0
|
||||
torch~=2.1.1
|
||||
torch~=2.2.1
|
||||
|
||||
@@ -10,4 +10,3 @@ More info:
|
||||
|
||||
https://github.com/ggerganov/llama.cpp/pull/4484
|
||||
https://github.com/ggerganov/llama.cpp/issues/4226
|
||||
|
||||
|
||||
@@ -48,4 +48,3 @@
|
||||
build*/
|
||||
out/
|
||||
tmp/
|
||||
|
||||
|
||||
@@ -30,4 +30,3 @@ target_include_directories(${TARGET} PRIVATE ${_common_path})
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
|
||||
+28
-5
@@ -255,13 +255,15 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
if (!llama_model_has_encoder(model)) {
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
}
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
{
|
||||
auto prompt = params.conversation
|
||||
auto prompt = (params.conversation && params.enable_chat_template)
|
||||
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
|
||||
: params.prompt;
|
||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||
@@ -517,6 +519,24 @@ int main(int argc, char ** argv) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
int enc_input_size = embd_inp.size();
|
||||
llama_token * enc_input_buf = embd_inp.data();
|
||||
|
||||
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == -1) {
|
||||
decoder_start_token_id = llama_token_bos(model);
|
||||
}
|
||||
|
||||
embd_inp.clear();
|
||||
embd_inp.push_back(decoder_start_token_id);
|
||||
}
|
||||
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
@@ -810,7 +830,9 @@ int main(int argc, char ** argv) {
|
||||
is_antiprompt = true;
|
||||
}
|
||||
|
||||
chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
|
||||
if (params.enable_chat_template) {
|
||||
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
|
||||
}
|
||||
is_interacting = true;
|
||||
printf("\n");
|
||||
}
|
||||
@@ -872,12 +894,13 @@ int main(int argc, char ** argv) {
|
||||
string_process_escapes(buffer);
|
||||
}
|
||||
|
||||
std::string user_inp = params.conversation
|
||||
bool format_chat = params.conversation && params.enable_chat_template;
|
||||
std::string user_inp = format_chat
|
||||
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
|
||||
: std::move(buffer);
|
||||
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
|
||||
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
||||
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation);
|
||||
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
|
||||
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||
|
||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
||||
|
||||
@@ -1991,6 +1991,12 @@ int main(int argc, char ** argv) {
|
||||
params.n_batch = std::min(params.n_batch, n_kv);
|
||||
} else {
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
if (params.kl_divergence) {
|
||||
params.n_parallel = 1;
|
||||
} else {
|
||||
// ensure there's at least enough seq_ids for HellaSwag
|
||||
params.n_parallel = std::max(4, params.n_parallel);
|
||||
}
|
||||
}
|
||||
|
||||
if (params.ppl_stride > 0) {
|
||||
@@ -2015,9 +2021,6 @@ int main(int argc, char ** argv) {
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
// ensure there's at least enough seq_ids for HellaSwag
|
||||
params.n_parallel = std::max(4, params.n_parallel);
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (model == NULL) {
|
||||
|
||||
@@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
|
||||
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
|
||||
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
|
||||
- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
|
||||
|
||||
**If compiled with `LLAMA_SERVER_SSL=ON`**
|
||||
- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
|
||||
|
||||
@@ -259,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
|
||||
const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
|
||||
|
||||
const NON_LITERAL_SET = new Set('|.()[]{}*+?');
|
||||
const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');
|
||||
const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?');
|
||||
|
||||
export class SchemaConverter {
|
||||
constructor(options) {
|
||||
@@ -751,7 +751,7 @@ export class SchemaConverter {
|
||||
const requiredProps = sortedProps.filter(k => required.has(k));
|
||||
const optionalProps = sortedProps.filter(k => !required.has(k));
|
||||
|
||||
if (additionalProperties !== false) {
|
||||
if (additionalProperties) {
|
||||
const subName = `${name ?? ''}${name ? '-' : ''}additional`;
|
||||
const valueRule =
|
||||
additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
|
||||
|
||||
@@ -2020,6 +2020,7 @@ struct server_context {
|
||||
slot.t_start_generation = 0;
|
||||
|
||||
if (slot.infill) {
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
bool suff_rm_leading_spc = true;
|
||||
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
||||
params.input_suffix.erase(0, 1);
|
||||
@@ -2035,16 +2036,21 @@ struct server_context {
|
||||
}
|
||||
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
|
||||
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
|
||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||
suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
|
||||
|
||||
auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
|
||||
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
const llama_token middle_token = llama_token_middle(model);
|
||||
if (middle_token >= 0) {
|
||||
prefix_tokens.push_back(middle_token);
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
prompt_tokens = prefix_tokens;
|
||||
prompt_tokens = embd_inp;
|
||||
} else {
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
||||
}
|
||||
|
||||
@@ -52,4 +52,3 @@ Feature: Passkey / Self-extend with context shift
|
||||
#| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
|
||||
#| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0
|
||||
# 987 |
|
||||
|
||||
|
||||
@@ -1054,4 +1054,3 @@
|
||||
</body>
|
||||
|
||||
</html>
|
||||
|
||||
|
||||
@@ -1058,4 +1058,3 @@
|
||||
</body>
|
||||
|
||||
</html>
|
||||
|
||||
|
||||
@@ -31,4 +31,3 @@ for i in range(n-1):
|
||||
embedding2 = np.array(result[j])
|
||||
similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
|
||||
print(f"Similarity between {i} and {j}: {similarity:.2f}")
|
||||
|
||||
@@ -34,4 +34,3 @@ fi
|
||||
|
||||
#use multiple GPUs with same max compute units
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
|
||||
|
||||
@@ -31,4 +31,3 @@ exit /B 0
|
||||
:ERROR
|
||||
echo comomand error: %errorlevel%
|
||||
exit /B %errorlevel%
|
||||
|
||||
|
||||
@@ -7,5 +7,3 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
|
||||
|
||||
.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
|
||||
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
|
||||
fprintf(stream, " --stdin read prompt from standard input.\n");
|
||||
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||
fprintf(stream, " --show-count print the total number of tokens.\n");
|
||||
}
|
||||
|
||||
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
|
||||
@@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
|
||||
bool printing_ids = false;
|
||||
bool no_bos = false;
|
||||
bool disable_logging = false;
|
||||
bool show_token_count = false;
|
||||
const char * model_path = NULL;
|
||||
const char * prompt_path = NULL;
|
||||
const char * prompt_arg = NULL;
|
||||
@@ -249,6 +251,9 @@ int main(int raw_argc, char ** raw_argv) {
|
||||
else if (arg == "--log-disable") {
|
||||
disable_logging = true;
|
||||
}
|
||||
else if (arg == "--show-count") {
|
||||
show_token_count = true;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
|
||||
return 1;
|
||||
@@ -384,6 +389,9 @@ int main(int raw_argc, char ** raw_argv) {
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
if (show_token_count) {
|
||||
printf("Total number of tokens: %ld\n", tokens.size());
|
||||
}
|
||||
// silence valgrind
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
Generated
+3
-3
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1718895438,
|
||||
"narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=",
|
||||
"lastModified": 1719506693,
|
||||
"narHash": "sha256-C8e9S7RzshSdHB7L+v9I51af1gDM5unhJ2xO1ywxNH8=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3",
|
||||
"rev": "b2852eb9365c6de48ffb0dc2c9562591f652242a",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -113,8 +113,6 @@ option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of
|
||||
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
|
||||
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
|
||||
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
|
||||
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
|
||||
"ggml: iters./thread per block for Q2_K/Q6_K")
|
||||
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||
"ggml: max. batch size for using peer access")
|
||||
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
|
||||
|
||||
@@ -63,4 +63,3 @@ GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -297,7 +297,6 @@ if (GGML_CUDA)
|
||||
|
||||
add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
|
||||
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
|
||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
if (GGML_CUDA_USE_GRAPHS)
|
||||
@@ -426,7 +425,6 @@ if (GGML_HIPBLAS)
|
||||
add_compile_definitions(GGML_USE_HIPBLAS)
|
||||
add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
|
||||
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
|
||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
|
||||
|
||||
if (GGML_HIP_UMA)
|
||||
add_compile_definitions(GGML_HIP_UMA)
|
||||
@@ -486,9 +484,11 @@ if (GGML_SYCL)
|
||||
add_compile_options(-I./) #include DPCT
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||
else()
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||
endif()
|
||||
|
||||
file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
|
||||
@@ -1166,7 +1166,9 @@ target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})
|
||||
|
||||
find_library(MATH_LIBRARY m)
|
||||
if (MATH_LIBRARY)
|
||||
target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
|
||||
if (NOT WIN32 OR NOT GGML_SYCL)
|
||||
target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
|
||||
@@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2;
|
||||
#define QR6_K 2
|
||||
|
||||
#define QI2_XXS (QK_K / (4*QR2_XXS))
|
||||
#define QR2_XXS 8
|
||||
#define QR2_XXS 4
|
||||
|
||||
#define QI2_XS (QK_K / (4*QR2_XS))
|
||||
#define QR2_XS 8
|
||||
#define QR2_XS 4
|
||||
|
||||
#define QI2_S (QK_K / (4*QR2_S))
|
||||
#define QR2_S 8
|
||||
#define QR2_S 4
|
||||
|
||||
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
||||
#define QR3_XXS 8
|
||||
#define QR3_XXS 4
|
||||
|
||||
#define QI3_XS (QK_K / (4*QR3_XS))
|
||||
#define QR3_XS 8
|
||||
#define QR3_XS 4
|
||||
|
||||
#define QI1_S (QK_K / (4*QR1_S))
|
||||
#define QR1_S 8
|
||||
@@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2;
|
||||
#define QR4_NL 2
|
||||
|
||||
#define QI4_XS (QK_K / (4*QR4_XS))
|
||||
#define QR4_XS 8
|
||||
#define QR4_XS 2
|
||||
|
||||
#define QI3_S (QK_K / (4*QR3_S))
|
||||
#define QR3_S 8
|
||||
#define QR3_S 4
|
||||
|
||||
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
||||
|
||||
|
||||
+35
-24
@@ -1882,6 +1882,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
bool use_mul_mat_q = ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
|
||||
// if mmvq is available it's a better choice than dmmv:
|
||||
#ifndef GGML_CUDA_FORCE_DMMV
|
||||
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
||||
#endif // GGML_CUDA_FORCE_DMMV
|
||||
|
||||
bool any_gpus_with_slow_fp16 = false;
|
||||
|
||||
if (split) {
|
||||
@@ -1894,22 +1899,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
}
|
||||
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||
}
|
||||
} else {
|
||||
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
||||
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||
}
|
||||
|
||||
// if mmvq is available it's a better choice than dmmv:
|
||||
#ifndef GGML_CUDA_FORCE_DMMV
|
||||
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
||||
#endif // GGML_CUDA_FORCE_DMMV
|
||||
|
||||
// debug helpers
|
||||
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
||||
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
|
||||
@@ -2713,27 +2711,40 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
struct ggml_tensor * a;
|
||||
struct ggml_tensor * b;
|
||||
struct ggml_tensor * a = op->src[0];
|
||||
if (op->op == GGML_OP_MUL_MAT) {
|
||||
a = op->src[0];
|
||||
b = op->src[1];
|
||||
} else {
|
||||
a = op->src[2];
|
||||
b = op->src[1];
|
||||
}
|
||||
if (a->ne[3] != b->ne[3]) {
|
||||
return false;
|
||||
}
|
||||
ggml_type a_type = a->type;
|
||||
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
|
||||
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
|
||||
a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
|
||||
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
||||
struct ggml_tensor * b = op->src[1];
|
||||
if (a->ne[3] != b->ne[3]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
switch (a->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q8_K:
|
||||
case GGML_TYPE_IQ1_M:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-cuda.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
@@ -226,6 +227,10 @@ typedef float2 dfloat2;
|
||||
#define RDNA2
|
||||
#endif
|
||||
|
||||
#if defined(__gfx1010__) || defined(__gfx1012__)
|
||||
#define RDNA1
|
||||
#endif
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
@@ -268,30 +273,15 @@ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigne
|
||||
return c;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
#elif defined(RDNA3)
|
||||
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
||||
#elif defined(__gfx1010__) || defined(__gfx900__)
|
||||
int tmp1;
|
||||
int tmp2;
|
||||
asm("\n \
|
||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
|
||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
|
||||
v_add3_u32 %0, %1, %2, %0 \n \
|
||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
|
||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
|
||||
v_add3_u32 %0, %1, %2, %0 \n \
|
||||
"
|
||||
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
|
||||
: "v"(a), "v"(b)
|
||||
);
|
||||
#else
|
||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
|
||||
#endif
|
||||
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
@@ -467,8 +457,48 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
|
||||
}
|
||||
#endif // CUDART_VERSION < 12000
|
||||
|
||||
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
#elif defined(RDNA3)
|
||||
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
||||
#elif defined(__gfx1010__) || defined(__gfx900__)
|
||||
int tmp1;
|
||||
int tmp2;
|
||||
asm("\n \
|
||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
|
||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
|
||||
v_add3_u32 %0, %1, %2, %0 \n \
|
||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
|
||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
|
||||
v_add3_u32 %0, %1, %2, %0 \n \
|
||||
"
|
||||
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
|
||||
: "v"(a), "v"(b)
|
||||
);
|
||||
#else
|
||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
|
||||
#endif
|
||||
return c;
|
||||
|
||||
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
return __dp4a(a, b, c);
|
||||
#else // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
const int8_t * a8 = (const int8_t *) &a;
|
||||
const int8_t * b8 = (const int8_t *) &b;
|
||||
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
// TODO: move to ggml-common.h
|
||||
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
|
||||
|
||||
|
||||
@@ -487,4 +487,3 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+33
-87
@@ -2,16 +2,7 @@
|
||||
#include "dequantize.cuh"
|
||||
#include "convert.cuh"
|
||||
|
||||
#ifndef K_QUANTS_PER_ITERATION
|
||||
#define K_QUANTS_PER_ITERATION 2
|
||||
#else
|
||||
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
|
||||
#endif
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
|
||||
@@ -22,15 +13,15 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||
const int tid = threadIdx.x/2; // 0...15
|
||||
const int ix = threadIdx.x%2; // 0,1
|
||||
|
||||
const int step = 16/K_QUANTS_PER_ITERATION;
|
||||
const int step = 8;
|
||||
|
||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int in = tid - step*im; // 0...15 or 0...7
|
||||
|
||||
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
|
||||
const int l0 = 2*in; // 0...14 in steps of 2
|
||||
const int q_offset = 32*im + l0;
|
||||
const int s_offset = 8*im;
|
||||
const int y_offset = 128*im + l0;
|
||||
@@ -39,7 +30,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
const uint8_t * d = (const uint8_t *)aux;
|
||||
const uint8_t * m = (const uint8_t *)(aux + 2);
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
||||
|
||||
const float * y = yy + i * QK_K + y_offset;
|
||||
const uint8_t * q = x[i].qs + q_offset;
|
||||
@@ -54,7 +45,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
|
||||
|
||||
float sum1 = 0, sum2 = 0;
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
|
||||
+ y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
|
||||
+ y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
|
||||
@@ -94,17 +85,17 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
||||
const uint16_t kmask1 = 0x0303;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||
const int tid = threadIdx.x/2; // 0...16
|
||||
const int ix = threadIdx.x%2; // 0,1
|
||||
|
||||
const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
|
||||
const int step = 16/K_QUANTS_PER_ITERATION;
|
||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int in = tid - step*im; // 0....15 or 0...7
|
||||
const int n = 2; // iterations in the inner loop
|
||||
const int step = 8;
|
||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int in = tid - step*im; // 0....15 or 0...7
|
||||
|
||||
const uint8_t m = 1 << (4*im);
|
||||
|
||||
const int l0 = n*in; // 0...15 or 0...14 in steps of 2
|
||||
const int l0 = n*in; // 0...15 or 0...14 in steps of 2
|
||||
const int q_offset = 32*im + l0;
|
||||
const int y_offset = 128*im + l0;
|
||||
|
||||
@@ -113,7 +104,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
||||
|
||||
const uint16_t s_shift = 4*im;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
||||
|
||||
const float * y = yy + i * QK_K + y_offset;
|
||||
const uint8_t * q = x[i].qs + q_offset;
|
||||
@@ -163,14 +154,14 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||
const int tid = threadIdx.x/2; // 0...16
|
||||
const int ix = threadIdx.x%2; // 0,1
|
||||
|
||||
const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
|
||||
const int step = 4;
|
||||
|
||||
const int il = tid/step; // 0...3
|
||||
const int ir = tid - step*il; // 0...7 or 0...3
|
||||
const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
|
||||
const int il = tid/step; // 0...3
|
||||
const int ir = tid - step*il; // 0...7 or 0...3
|
||||
const int n = 4;
|
||||
|
||||
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
const int in = il%2;
|
||||
@@ -182,17 +173,12 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||
uint16_t aux[4];
|
||||
const uint8_t * sc = (const uint8_t *)aux;
|
||||
|
||||
#if K_QUANTS_PER_ITERATION == 2
|
||||
uint32_t q32[4];
|
||||
const uint8_t * q4 = (const uint8_t *)q32;
|
||||
#else
|
||||
uint16_t q16[4];
|
||||
const uint8_t * q4 = (const uint8_t *)q16;
|
||||
#endif
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
||||
|
||||
const float * y1 = yy + i*QK_K + y_offset;
|
||||
const float * y2 = y1 + 128;
|
||||
@@ -206,7 +192,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
||||
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
||||
|
||||
#if K_QUANTS_PER_ITERATION == 2
|
||||
const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
|
||||
const uint32_t * q2 = q1 + 16;
|
||||
|
||||
@@ -223,25 +208,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
|
||||
}
|
||||
tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
|
||||
#else
|
||||
const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
|
||||
const uint16_t * q2 = q1 + 32;
|
||||
|
||||
q16[0] = q1[0] & 0x0f0f;
|
||||
q16[1] = q1[0] & 0xf0f0;
|
||||
q16[2] = q2[0] & 0x0f0f;
|
||||
q16[3] = q2[0] & 0xf0f0;
|
||||
|
||||
float4 s = {0.f, 0.f, 0.f, 0.f};
|
||||
float smin = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
|
||||
s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
|
||||
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
|
||||
}
|
||||
tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
@@ -341,9 +307,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
|
||||
}
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
|
||||
@@ -352,21 +315,17 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||
|
||||
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
||||
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||
const int tid = threadIdx.x/2; // 0...16
|
||||
const int ix = threadIdx.x%2; // 0, 1
|
||||
|
||||
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
||||
const int step = 8;
|
||||
|
||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int in = tid - step*im; // 0...15 or 0...7
|
||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int in = tid - step*im; // 0...15 or 0...7
|
||||
|
||||
#if K_QUANTS_PER_ITERATION == 1
|
||||
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
||||
const int is = 0;
|
||||
#else
|
||||
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
||||
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
||||
const int is = in / 4;
|
||||
#endif
|
||||
|
||||
const int ql_offset = 64*im + l0;
|
||||
const int qh_offset = 32*im + l0;
|
||||
const int s_offset = 8*im + is;
|
||||
@@ -374,7 +333,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
||||
|
||||
const float * y = yy + i * QK_K + y_offset;
|
||||
const uint8_t * ql = x[i].ql + ql_offset;
|
||||
@@ -383,17 +342,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
#if K_QUANTS_PER_ITERATION == 1
|
||||
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
||||
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
||||
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
||||
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
|
||||
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
|
||||
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
|
||||
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
||||
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
||||
tmp += sum;
|
||||
#else
|
||||
float sum = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
||||
@@ -402,8 +350,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
||||
}
|
||||
tmp += sum;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
@@ -547,7 +493,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y,
|
||||
|
||||
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
|
||||
const int ny = 2;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
@@ -556,7 +502,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
|
||||
|
||||
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int ny = 1;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
@@ -565,7 +511,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int ny = 1;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
@@ -580,7 +526,7 @@ static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, f
|
||||
|
||||
static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int ny = 1;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
|
||||
@@ -54,12 +54,11 @@ typedef float (*vec_dot_KQ_f32_t)(
|
||||
template<typename T, int D>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
|
||||
half sum = 0.0f;
|
||||
T sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
|
||||
@@ -72,7 +71,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
|
||||
const int sumi = __dp4a(v, u, 0);
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
@@ -90,19 +89,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||
}
|
||||
|
||||
return sum;
|
||||
#else
|
||||
GGML_UNUSED(K_c);
|
||||
GGML_UNUSED(Q_v);
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
template<typename T, int D>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
@@ -120,7 +111,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
|
||||
const int sumi = __dp4a(v, u, 0);
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
@@ -142,19 +133,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||
}
|
||||
|
||||
return sum;
|
||||
#else
|
||||
GGML_UNUSED(K_c);
|
||||
GGML_UNUSED(Q_v);
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
template<typename T, int D>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
@@ -179,7 +162,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
|
||||
const int sumi = __dp4a(v, u, 0);
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
@@ -197,19 +180,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||
}
|
||||
|
||||
return sum;
|
||||
#else
|
||||
GGML_UNUSED(K_c);
|
||||
GGML_UNUSED(Q_v);
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
template<typename T, int D>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
@@ -234,7 +209,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
|
||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||
|
||||
const int sumi = __dp4a(v, u, 0);
|
||||
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if (std::is_same<T, half>::value) {
|
||||
@@ -256,19 +231,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||
}
|
||||
|
||||
return sum;
|
||||
#else
|
||||
GGML_UNUSED(K_c);
|
||||
GGML_UNUSED(Q_v);
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
template <typename T, int D>
|
||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
|
||||
GGML_UNUSED(Q_v);
|
||||
@@ -297,13 +264,6 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
||||
}
|
||||
|
||||
return sum;
|
||||
#else
|
||||
GGML_UNUSED(K_c);
|
||||
GGML_UNUSED(Q_v);
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
template <typename T, int D>
|
||||
|
||||
@@ -60,12 +60,16 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||
}
|
||||
|
||||
static constexpr int get_mmq_y_host(const int cc) {
|
||||
return int8_mma_available(cc) || cc >= CC_VOLTA ? 128 : 64;
|
||||
return cc >= CC_OFFSET_AMD ? (cc == CC_RDNA1 ? 64 : 128) : (cc >= CC_VOLTA ? 128 : 64);
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_mmq_y_device() {
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(RDNA1)
|
||||
return 64;
|
||||
#else
|
||||
return 128;
|
||||
#endif // defined RDNA1
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
return 128;
|
||||
@@ -2259,9 +2263,9 @@ static __device__ void mul_mat_q_process_tile(
|
||||
|
||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(RDNA3) || defined(RDNA2)
|
||||
#if defined(RDNA3) || defined(RDNA2) || defined(RDNA1)
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2) || defined(RDNA1)
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 1)
|
||||
|
||||
+16
-10
@@ -28,16 +28,22 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
|
||||
|
||||
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
|
||||
type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
|
||||
1;
|
||||
}
|
||||
|
||||
|
||||
+316
-366
File diff suppressed because it is too large
Load Diff
@@ -6537,4 +6537,3 @@ template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t
|
||||
template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
|
||||
|
||||
|
||||
@@ -130,4 +130,3 @@ void iq3xs_free_impl(int grid_size);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
+44
-733
@@ -74,51 +74,6 @@ typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const gg
|
||||
const float *src1_dd, float *dst_dd,
|
||||
const queue_ptr &main_stream);
|
||||
|
||||
static __dpct_inline__ float warp_reduce_sum(float x,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
/*
|
||||
DPCT1096:98: The right-most dimension of the work-group used in the SYCL
|
||||
kernel that calls this function may be less than "32". The function
|
||||
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
||||
CPU device. Modify the size of the work-group to ensure that the value
|
||||
of the right-most dimension is a multiple of "32".
|
||||
*/
|
||||
x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
static __dpct_inline__ sycl::float2
|
||||
warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
|
||||
mask);
|
||||
a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
|
||||
mask);
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float warp_reduce_max(float x,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
/*
|
||||
DPCT1096:97: The right-most dimension of the work-group used in the SYCL
|
||||
kernel that calls this function may be less than "32". The function
|
||||
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
||||
CPU device. Modify the size of the work-group to ensure that the value
|
||||
of the right-most dimension is a multiple of "32".
|
||||
*/
|
||||
x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
|
||||
item_ct1.get_sub_group(), x, mask));
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_repeat(const float a, const float b) {
|
||||
return b;
|
||||
GGML_UNUSED(a);
|
||||
@@ -336,47 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
|
||||
dst[i] = x[i] * x[i];
|
||||
}
|
||||
|
||||
static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
|
||||
const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum, int block_size) {
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
|
||||
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[row*ncols + col];
|
||||
mean_var.x() += xi;
|
||||
mean_var.y() += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
||||
if (block_size > WARP_SIZE) {
|
||||
|
||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = mean_var;
|
||||
}
|
||||
/*
|
||||
DPCT1118:0: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
mean_var = s_sum[lane_id];
|
||||
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
||||
}
|
||||
|
||||
const float mean = mean_var.x() / ncols;
|
||||
const float var = mean_var.y() / ncols - mean * mean;
|
||||
const float inv_std = sycl::rsqrt(var + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int nidx = item_ct1.get_local_id(2) +
|
||||
@@ -444,126 +358,11 @@ static void pad_f32(const float *x, float *dst, const int ne0, const int ne00,
|
||||
}
|
||||
}
|
||||
|
||||
static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
|
||||
const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
|
||||
int start = item_ct1.get_group(2) * group_size;
|
||||
int end = start + group_size;
|
||||
|
||||
start += item_ct1.get_local_id(2);
|
||||
|
||||
if (end >= ne_elements) {
|
||||
end = ne_elements;
|
||||
}
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int j = start; j < end; j += block_size) {
|
||||
tmp += x[j];
|
||||
}
|
||||
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
if (block_size > WARP_SIZE) {
|
||||
|
||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
/*
|
||||
DPCT1118:1: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
tmp = s_sum[lane_id];
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
}
|
||||
|
||||
float mean = tmp / group_size;
|
||||
tmp = 0.0f;
|
||||
|
||||
for (int j = start; j < end; j += block_size) {
|
||||
float xi = x[j] - mean;
|
||||
dst[j] = xi;
|
||||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
if (block_size > WARP_SIZE) {
|
||||
|
||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
/*
|
||||
DPCT1118:2: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
tmp = s_sum[lane_id];
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
}
|
||||
|
||||
float variance = tmp / group_size;
|
||||
float scale = sycl::rsqrt(variance + eps);
|
||||
for (int j = start; j < end; j += block_size) {
|
||||
dst[j] *= scale;
|
||||
}
|
||||
}
|
||||
|
||||
static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
|
||||
const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[row*ncols + col];
|
||||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
if (block_size > WARP_SIZE) {
|
||||
|
||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
/*
|
||||
DPCT1118:3: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
tmp = s_sum[lane_id];
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
}
|
||||
|
||||
const float mean = tmp / ncols;
|
||||
const float scale = sycl::rsqrt(mean + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[row*ncols + col] = scale * x[row*ncols + col];
|
||||
}
|
||||
}
|
||||
|
||||
template<int QUANT_BLOCK_TILE>
|
||||
static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
const int ix = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2)) * QUANT_BLOCK_TILE;
|
||||
|
||||
if (ix >= kx_padded) {
|
||||
return;
|
||||
@@ -578,23 +377,39 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
|
||||
|
||||
const int ib = i_padded / QK8_1; // block index
|
||||
const int iqs = i_padded % QK8_1; // quant index
|
||||
|
||||
const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
|
||||
float amax = sycl::fabs((float)xi);
|
||||
float sum = xi;
|
||||
|
||||
typedef sycl::vec<float, QUANT_BLOCK_TILE> TC;
|
||||
typedef sycl::vec<int8_t, QUANT_BLOCK_TILE> TQ;
|
||||
TC zeros;
|
||||
TQ qzeros;
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
|
||||
item_ct1.get_sub_group(), amax, mask));
|
||||
sum +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask);
|
||||
for (int i = 0; i < QUANT_BLOCK_TILE; i++)
|
||||
{
|
||||
zeros[i] = 0.f;
|
||||
qzeros[i] = 0;
|
||||
}
|
||||
const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros;
|
||||
float sum = xi[0];
|
||||
float amax = sycl::fabs(xi[0]);
|
||||
#pragma unroll
|
||||
for (int i = 1; i < QUANT_BLOCK_TILE; i++)
|
||||
{
|
||||
sum += xi[i];
|
||||
amax = sycl::fmax(sycl::fabs(xi[i]), amax);
|
||||
}
|
||||
sum = warp_reduce_sum(sum, item_ct1);
|
||||
amax = warp_reduce_max(amax, item_ct1);
|
||||
|
||||
const float d = amax / 127;
|
||||
const int8_t q = amax == 0.0f ? 0 : sycl::round(xi / d);
|
||||
TQ q = qzeros;
|
||||
if (amax != 0.0f)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int i = 0; i < QUANT_BLOCK_TILE; i++) {
|
||||
q[i] = sycl::round(xi[i] / d);
|
||||
}
|
||||
}
|
||||
|
||||
y[ib].qs[iqs] = q;
|
||||
*(TQ *)&y[ib].qs[iqs] = q;
|
||||
|
||||
if (iqs > 0) {
|
||||
return;
|
||||
@@ -728,7 +543,7 @@ static void mul_mat_p021_f16_f32(
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -781,7 +596,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -978,114 +793,6 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
||||
const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
|
||||
return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
|
||||
}
|
||||
|
||||
struct rope_corr_dims {
|
||||
float v[4];
|
||||
};
|
||||
|
||||
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
||||
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
||||
static void rope_yarn(
|
||||
float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
|
||||
float * cos_theta, float * sin_theta
|
||||
) {
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale);
|
||||
}
|
||||
*cos_theta = sycl::cos(theta) * mscale;
|
||||
*sin_theta = sycl::sin(theta) * mscale;
|
||||
}
|
||||
|
||||
// rope == RoPE == rotary positional embedding
|
||||
template<typename T, bool has_pos>
|
||||
static void rope(
|
||||
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
|
||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims
|
||||
,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1));
|
||||
|
||||
if (col >= ncols) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
const int i = row*ncols + col;
|
||||
const int i2 = row/p_delta_rows;
|
||||
|
||||
const int p = has_pos ? pos[i2] : 0;
|
||||
const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols);
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + 1];
|
||||
|
||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
template<typename T, bool has_pos, bool has_freq_facs>
|
||||
static void rope_neox(
|
||||
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
|
||||
const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
|
||||
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1));
|
||||
|
||||
if (col >= ncols) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
const int ib = col / n_dims;
|
||||
const int ic = col % n_dims;
|
||||
|
||||
if (ib > 0) {
|
||||
const int i = row*ncols + ib*n_dims + ic;
|
||||
|
||||
dst[i + 0] = x[i + 0];
|
||||
dst[i + 1] = x[i + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const int i = row*ncols + ib*n_dims + ic/2;
|
||||
const int i2 = row/p_delta_rows;
|
||||
|
||||
float cur_rot = inv_ndims * ic - ib;
|
||||
|
||||
const int p = has_pos ? pos[i2] : 0;
|
||||
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
|
||||
|
||||
const float theta_base =
|
||||
p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + n_dims/2];
|
||||
|
||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int row = item_ct1.get_group(1);
|
||||
@@ -1751,99 +1458,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
|
||||
});
|
||||
}
|
||||
|
||||
static void norm_f32_sycl(const float *x, float *dst, const int ncols,
|
||||
const int nrows, const float eps,
|
||||
queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
if (ncols < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
|
||||
sycl::range<1>(32), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), WARP_SIZE);
|
||||
});
|
||||
});
|
||||
} else {
|
||||
const int work_group_size = get_work_group_size(stream->get_device());
|
||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||
/*
|
||||
DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
|
||||
sycl::range<1>(32), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void group_norm_f32_sycl(const float *x, float *dst,
|
||||
const int num_groups, const int group_size,
|
||||
const int ne_elements, queue_ptr stream) {
|
||||
static const float eps = 1e-6f;
|
||||
if (group_size < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
|
||||
cgh);
|
||||
|
||||
const float eps_ct4 = eps;
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
group_norm_f32(
|
||||
x, dst, group_size, ne_elements, eps_ct4, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), WARP_SIZE);
|
||||
});
|
||||
});
|
||||
} else {
|
||||
const int work_group_size = get_work_group_size(stream->get_device());
|
||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||
/*
|
||||
DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
|
||||
cgh);
|
||||
|
||||
const float eps_ct4 = eps;
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
group_norm_f32(x, dst, group_size, ne_elements,
|
||||
eps_ct4, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
||||
const int ne0, int ne1, int ne2, int ne02,
|
||||
queue_ptr stream) {
|
||||
@@ -1885,64 +1499,22 @@ static void pad_f32_sycl(const float *x, float *dst, const int ne00,
|
||||
});
|
||||
}
|
||||
|
||||
static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
|
||||
const int nrows, const float eps,
|
||||
queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
|
||||
if (ncols < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
|
||||
cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), WARP_SIZE);
|
||||
});
|
||||
});
|
||||
} else {
|
||||
const int work_group_size = get_work_group_size(stream->get_device());
|
||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||
/*
|
||||
DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
|
||||
cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
|
||||
const int ky, const int kx_padded,
|
||||
queue_ptr stream) {
|
||||
const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
|
||||
const sycl::range<3> num_blocks(1, ky, block_num_x);
|
||||
const sycl::range<3> block_size(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE);
|
||||
int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
|
||||
static_assert(QK8_1 % WARP_SIZE == 0);
|
||||
const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(num_blocks * block_size, block_size),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
quantize_q8_1(x, vy, kx, kx_padded, item_ct1);
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -1962,7 +1534,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
|
||||
nchannels_y, item_ct1);
|
||||
});
|
||||
@@ -1982,7 +1554,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
|
||||
row_stride_x, channel_stride_x,
|
||||
nchannels_y / nchannels_x, item_ct1);
|
||||
@@ -2241,117 +1813,13 @@ static void clamp_f32_sycl(const float *x, float *dst, const float min,
|
||||
});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
|
||||
const int32_t *pos, float freq_scale, int p_delta_rows,
|
||||
float freq_base, float ext_factor, float attn_factor,
|
||||
rope_corr_dims corr_dims, queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % 2 == 0);
|
||||
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, num_blocks_x, nrows);
|
||||
if (pos == nullptr) {
|
||||
/*
|
||||
DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows,
|
||||
freq_base, ext_factor, attn_factor, corr_dims,
|
||||
item_ct1);
|
||||
});
|
||||
} else {
|
||||
/*
|
||||
DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows,
|
||||
freq_base, ext_factor, attn_factor, corr_dims,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
|
||||
const int32_t *pos, float freq_scale,
|
||||
int p_delta_rows, float freq_base, float ext_factor,
|
||||
float attn_factor, rope_corr_dims corr_dims,
|
||||
const float * freq_factors, queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % 2 == 0);
|
||||
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, num_blocks_x, nrows);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float inv_ndims = -1.0f / n_dims;
|
||||
|
||||
if (pos == nullptr) {
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
if (freq_factors == nullptr) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
|
||||
p_delta_rows, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, inv_ndims, freq_factors,
|
||||
item_ct1);
|
||||
});
|
||||
} else {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
|
||||
p_delta_rows, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, inv_ndims, freq_factors,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
} else {
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
|
||||
p_delta_rows, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
|
||||
});
|
||||
} else {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
|
||||
p_delta_rows, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
|
||||
const int nrows, queue_ptr stream) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
const sycl::range<3> block_nums(1, nrows, 1);
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
k_sum_rows_f32(x, dst, ncols, item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -2432,7 +1900,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float *
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
|
||||
nrows_y, scale, max_bias, m0,
|
||||
m1, n_head_log2, item_ct1,
|
||||
@@ -2612,12 +2080,6 @@ static inline int get_sycl_env(const char *env_name, int default_val) {
|
||||
return user_number;
|
||||
}
|
||||
|
||||
static inline int get_work_group_size(const sycl::device& device) {
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
return prop.get_max_work_group_size();
|
||||
}
|
||||
|
||||
static void ggml_check_sycl() try {
|
||||
static bool initialized = false;
|
||||
|
||||
@@ -3176,45 +2638,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
||||
ggml_tensor *dst, const float *src0_dd,
|
||||
const float *src1_dd, float *dst_dd,
|
||||
const queue_ptr &main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const float *src0_dd, const float *src1_dd,
|
||||
float *dst_dd,
|
||||
const queue_ptr &main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
int num_groups = dst->op_params[0];
|
||||
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
|
||||
group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const float *src0_dd, const float *src1_dd,
|
||||
@@ -3278,28 +2701,6 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const float *src0_dd, const float *src1_dd,
|
||||
float *dst_dd,
|
||||
const queue_ptr &main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
|
||||
int64_t min_compute_capability = INT_MAX;
|
||||
int64_t max_compute_capability = INT_MIN;
|
||||
@@ -3461,97 +2862,6 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
||||
ggml_tensor *dst, const float *src0_dd,
|
||||
const float *src1_dd, float *dst_dd,
|
||||
const queue_ptr &main_stream) {
|
||||
const ggml_tensor * src2 = dst->src[2];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne2 = dst->ne[2];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
//const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
||||
|
||||
// RoPE alteration for extended context
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
||||
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||
|
||||
const float * freq_factors = nullptr;
|
||||
const int32_t * pos = nullptr;
|
||||
if ((mode & 1) == 0) {
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(src1->ne[0] == ne2);
|
||||
pos = (const int32_t *) src1_dd;
|
||||
}
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
|
||||
#pragma message("TODO: update rope NORM mode to match NEOX mode")
|
||||
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
|
||||
|
||||
if (is_neox) {
|
||||
pos = (const int32_t *) src1_dd;
|
||||
|
||||
if (src2 != nullptr) {
|
||||
freq_factors = (const float *) src2->data;
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
|
||||
}
|
||||
|
||||
rope_corr_dims corr_dims;
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
|
||||
|
||||
// compute
|
||||
if (is_neox) {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
rope_neox_sycl(
|
||||
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, freq_factors, main_stream
|
||||
);
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
|
||||
ne00, n_dims, nrows, pos, freq_scale, ne01,
|
||||
freq_base, ext_factor, attn_factor, corr_dims,
|
||||
freq_factors, main_stream);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
rope_sycl(
|
||||
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, main_stream
|
||||
);
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
rope_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
|
||||
nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, main_stream);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const float *src0_dd, const float *src1_dd,
|
||||
@@ -4576,7 +3886,6 @@ bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
||||
|
||||
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
|
||||
|
||||
int64_t min_compute_capability = INT_MAX;
|
||||
|
||||
if (split) {
|
||||
@@ -6241,7 +5550,9 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
return true;
|
||||
case GGML_OP_ROPE:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
|
||||
@@ -19,5 +19,7 @@
|
||||
#include "dmmv.hpp"
|
||||
#include "mmq.hpp"
|
||||
#include "mmvq.hpp"
|
||||
#include "rope.hpp"
|
||||
#include "norm.hpp"
|
||||
|
||||
#endif // GGML_SYCL_BACKEND_HPP
|
||||
|
||||
@@ -295,5 +295,66 @@ struct ggml_backend_sycl_context {
|
||||
}
|
||||
};
|
||||
|
||||
// common host functions
|
||||
|
||||
static inline int get_work_group_size(const sycl::device& device) {
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
return prop.get_max_work_group_size();
|
||||
}
|
||||
|
||||
|
||||
// common device functions
|
||||
|
||||
static __dpct_inline__ float warp_reduce_sum(float x,
|
||||
const sycl::nd_item<3>& item_ct1) {
|
||||
#pragma unroll
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
/*
|
||||
DPCT1096:98: The right-most dimension of the work-group used in the SYCL
|
||||
kernel that calls this function may be less than "32". The function
|
||||
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
||||
CPU device. Modify the size of the work-group to ensure that the value
|
||||
of the right-most dimension is a multiple of "32".
|
||||
*/
|
||||
x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
static __dpct_inline__ sycl::float2
|
||||
warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) {
|
||||
#pragma unroll
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
|
||||
mask);
|
||||
a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
|
||||
mask);
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float warp_reduce_max(float x,
|
||||
const sycl::nd_item<3>& item_ct1) {
|
||||
#pragma unroll
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
/*
|
||||
DPCT1096:97: The right-most dimension of the work-group used in the SYCL
|
||||
kernel that calls this function may be less than "32". The function
|
||||
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
||||
CPU device. Modify the size of the work-group to ensure that the value
|
||||
of the right-most dimension is a multiple of "32".
|
||||
*/
|
||||
x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
|
||||
item_ct1.get_sub_group(), x, mask));
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
// Helper for vec loading aligned data
|
||||
template <typename Tp, int n>
|
||||
inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
|
||||
return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr);
|
||||
}
|
||||
|
||||
#endif // GGML_SYCL_COMMON_HPP
|
||||
|
||||
@@ -152,12 +152,15 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q4_K(vx, y, item_ct1);
|
||||
dequantize_block_q4_K(vx, y, scale_local_acc.get_pointer(), item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -293,7 +293,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
||||
#if QK_K == 256
|
||||
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
||||
if (j < 4) {
|
||||
d = q[j] & 63; m = q[j + 4] & 63;
|
||||
d = q[j] & 63;
|
||||
m = q[j + 4] & 63;
|
||||
} else {
|
||||
d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
|
||||
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
||||
@@ -303,7 +304,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
|
||||
const block_q4_K * x = (const block_q4_K *) vx;
|
||||
|
||||
const int i = item_ct1.get_group(2);
|
||||
@@ -318,19 +319,26 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
||||
|
||||
dst_t * y = yy + i*QK_K + 64*il + n*ir;
|
||||
|
||||
const float dall = x[i].dm[0];
|
||||
const float dmin = x[i].dm[1];
|
||||
const sycl::half2 dm = x[i].dm;
|
||||
const float dall = dm[0];
|
||||
const float dmin = dm[1];
|
||||
|
||||
const uint8_t * q = x[i].qs + 32*il + n*ir;
|
||||
if (tid < 12)
|
||||
scales_local[tid] = x[i].scales[tid];
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
uint8_t sc, m;
|
||||
get_scale_min_k4(is + 0, x[i].scales, sc, m);
|
||||
const float d1 = dall * sc; const float m1 = dmin * m;
|
||||
get_scale_min_k4(is + 1, x[i].scales, sc, m);
|
||||
const float d2 = dall * sc; const float m2 = dmin * m;
|
||||
get_scale_min_k4(is + 0, scales_local, sc, m);
|
||||
const float d1 = dall * sc;
|
||||
const float m1 = dmin * m;
|
||||
get_scale_min_k4(is + 1, scales_local, sc, m);
|
||||
const float d2 = dall * sc;
|
||||
const float m2 = dmin * m;
|
||||
|
||||
sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir);
|
||||
for (int l = 0; l < n; ++l) {
|
||||
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
||||
y[l +32] = d2 * (q[l] >> 4) - m2;
|
||||
y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
|
||||
y[l +32] = d2 * (q_vec[l] >> 4) - m2;
|
||||
}
|
||||
#else
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
|
||||
+78
-129
@@ -76,7 +76,7 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -104,7 +104,7 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
|
||||
nrows, item_ct1);
|
||||
});
|
||||
@@ -123,9 +123,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
||||
float *__restrict__ dst,
|
||||
const int ncols, int nrows,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
if (row > nrows) return;
|
||||
@@ -139,16 +136,16 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
||||
|
||||
#if QK_K == 256
|
||||
const int tid =
|
||||
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
||||
item_ct1.get_local_id(2) / 2; // 0...15
|
||||
const int ix =
|
||||
item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||
item_ct1.get_local_id(2) % 2; // 0,1
|
||||
|
||||
const int step = 16/K_QUANTS_PER_ITERATION;
|
||||
const int step = 8;
|
||||
|
||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int in = tid - step*im; // 0...15 or 0...7
|
||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int in = tid - step*im; // 0...15 or 0...7
|
||||
|
||||
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
|
||||
const int l0 = 2*in; // 0...14 in steps of 2
|
||||
const int q_offset = 32*im + l0;
|
||||
const int s_offset = 8*im;
|
||||
const int y_offset = 128*im + l0;
|
||||
@@ -157,7 +154,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
||||
const uint8_t * d = (const uint8_t *)aux;
|
||||
const uint8_t * m = (const uint8_t *)(aux + 2);
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
||||
|
||||
const float * y = yy + i * QK_K + y_offset;
|
||||
const uint8_t * q = x[i].qs + q_offset;
|
||||
@@ -172,7 +169,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
||||
aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
|
||||
|
||||
float sum1 = 0, sum2 = 0;
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
|
||||
+ y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
|
||||
+ y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
|
||||
@@ -189,18 +186,15 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
||||
|
||||
}
|
||||
#else
|
||||
const int tid = item_ct1.get_local_id(2) /
|
||||
(2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
||||
const int ix = item_ct1.get_local_id(2) %
|
||||
(2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
||||
const int offset = tid * K_QUANTS_PER_ITERATION;
|
||||
const int tid = item_ct1.get_local_id(2) / 4; // 0...7
|
||||
const int ix = item_ct1.get_local_id(2) % 4; // 0...3
|
||||
const int offset = tid * 2;
|
||||
|
||||
uint32_t uaux[2];
|
||||
const uint8_t * d = (const uint8_t *)uaux;
|
||||
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 4) {
|
||||
const float * y = yy + i * QK_K + offset;
|
||||
const uint8_t * q = x[i].qs + offset;
|
||||
const uint32_t * s = (const uint32_t *)x[i].scales;
|
||||
@@ -212,7 +206,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
||||
x[i].dm.convert<float, sycl::rounding_mode::automatic>();
|
||||
|
||||
float sum1 = 0, sum2 = 0;
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const uint8_t ql = q[l];
|
||||
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
||||
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
||||
@@ -227,7 +221,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -267,14 +261,14 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
|
||||
const int tid =
|
||||
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
item_ct1.get_local_id(2) / 2; // 0...16
|
||||
const int ix =
|
||||
item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||
item_ct1.get_local_id(2) % 2; // 0,1
|
||||
|
||||
const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
|
||||
const int step = 16/K_QUANTS_PER_ITERATION;
|
||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int in = tid - step*im; // 0....15 or 0...7
|
||||
const int n = 2; // iterations in the inner loop
|
||||
const int step = 8;
|
||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int in = tid - step*im; // 0....15 or 0...7
|
||||
|
||||
const uint8_t m = 1 << (4*im);
|
||||
|
||||
@@ -287,7 +281,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
||||
|
||||
const uint16_t s_shift = 4*im;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
||||
|
||||
const float * y = yy + i * QK_K + y_offset;
|
||||
const uint8_t * q = x[i].qs + q_offset;
|
||||
@@ -317,13 +311,13 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
||||
}
|
||||
#else
|
||||
|
||||
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
||||
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
||||
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
||||
const int in = offset/8; // 0 or 1
|
||||
const int im = offset%8; // 0...7
|
||||
const int tid = item_ct1.get_local_id(2)/4; // 0...7
|
||||
const int ix = item_ct1.get_local_id(2)%4; // 0...3
|
||||
const int offset = tid * 2; // 0...14
|
||||
const int in = offset/8; // 0 or 1
|
||||
const int im = offset%8; // 0...7
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 4) {
|
||||
|
||||
const float * y = yy + i * QK_K + offset;
|
||||
const uint8_t * q = x[i].qs + offset;
|
||||
@@ -332,7 +326,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
||||
const float dall = (float)x[i].d;
|
||||
|
||||
float sum = 0;
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const uint8_t hl = x[i].hmask[im+l] >> in;
|
||||
const uint8_t ql = q[l];
|
||||
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
||||
@@ -346,7 +340,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -383,15 +377,15 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
const int tid =
|
||||
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
item_ct1.get_local_id(2) / 2; // 0...16
|
||||
const int ix =
|
||||
item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||
item_ct1.get_local_id(2) % 2; // 0,1
|
||||
|
||||
const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
|
||||
const int step = 4;
|
||||
|
||||
const int il = tid/step; // 0...3
|
||||
const int ir = tid - step*il; // 0...7 or 0...3
|
||||
const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
|
||||
const int n = 4;
|
||||
|
||||
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
const int in = il%2;
|
||||
@@ -403,17 +397,12 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
||||
uint16_t aux[4];
|
||||
const uint8_t * sc = (const uint8_t *)aux;
|
||||
|
||||
#if K_QUANTS_PER_ITERATION == 2
|
||||
uint32_t q32[4];
|
||||
const uint8_t * q4 = (const uint8_t *)q32;
|
||||
#else
|
||||
uint16_t q16[4];
|
||||
const uint8_t * q4 = (const uint8_t *)q16;
|
||||
#endif
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
||||
|
||||
const float * y1 = yy + i*QK_K + y_offset;
|
||||
const float * y2 = y1 + 128;
|
||||
@@ -427,7 +416,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
||||
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
||||
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
||||
|
||||
#if K_QUANTS_PER_ITERATION == 2
|
||||
const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
|
||||
const uint32_t * q2 = q1 + 16;
|
||||
|
||||
@@ -446,38 +434,19 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
||||
tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f +
|
||||
s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) -
|
||||
dmin * smin;
|
||||
#else
|
||||
const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
|
||||
const uint16_t * q2 = q1 + 32;
|
||||
|
||||
q16[0] = q1[0] & 0x0f0f;
|
||||
q16[1] = q1[0] & 0xf0f0;
|
||||
q16[2] = q2[0] & 0x0f0f;
|
||||
q16[3] = q2[0] & 0xf0f0;
|
||||
|
||||
float4 s = {0.f, 0.f, 0.f, 0.f};
|
||||
float smin = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
|
||||
s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
|
||||
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
|
||||
}
|
||||
tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
|
||||
#endif
|
||||
|
||||
}
|
||||
#else
|
||||
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
||||
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
||||
const int tid = item_ct1.get_local_id(2)/4; // 0...15
|
||||
const int ix = item_ct1.get_local_id(2)%4;
|
||||
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
const int step = tid * 2;
|
||||
|
||||
uint16_t aux16[2];
|
||||
const uint8_t * s = (const uint8_t *)aux16;
|
||||
|
||||
float tmp = 0;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 4) {
|
||||
const uint8_t * q = x[i].qs + step;
|
||||
const float * y = yy + i*QK_K + step;
|
||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||
@@ -486,7 +455,7 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
||||
const float d = (float)x[i].dm[0];
|
||||
const float m = (float)x[i].dm[1];
|
||||
float sum = 0.f;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
||||
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
||||
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
||||
@@ -499,7 +468,7 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -608,19 +577,19 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
||||
}
|
||||
|
||||
#else
|
||||
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
||||
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
const int tid = item_ct1.get_local_id(2)/4; // 0...15
|
||||
const int ix = item_ct1.get_local_id(2)%4;
|
||||
const int step = tid * 2;
|
||||
const int im = step/8;
|
||||
const int in = step%8;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 4) {
|
||||
const uint8_t * q = x[i].qs + step;
|
||||
const int8_t * s = x[i].scales;
|
||||
const float * y = yy + i*QK_K + step;
|
||||
const float d = x[i].d;
|
||||
float sum = 0.f;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
const uint8_t h = x[i].qh[in+j] >> im;
|
||||
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
||||
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
||||
@@ -633,7 +602,7 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -645,9 +614,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
||||
|
||||
static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
if (row > nrows) return;
|
||||
@@ -660,22 +626,18 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
||||
#if QK_K == 256
|
||||
|
||||
const int tid =
|
||||
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
item_ct1.get_local_id(2) / 2; // 0...16
|
||||
const int ix =
|
||||
item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||
item_ct1.get_local_id(2) % 2; // 0, 1
|
||||
|
||||
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
||||
const int step = 8;
|
||||
|
||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int in = tid - step*im; // 0...15 or 0...7
|
||||
|
||||
#if K_QUANTS_PER_ITERATION == 1
|
||||
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
||||
const int is = 0;
|
||||
#else
|
||||
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
||||
const int is = in / 4;
|
||||
#endif
|
||||
|
||||
const int ql_offset = 64*im + l0;
|
||||
const int qh_offset = 32*im + l0;
|
||||
const int s_offset = 8*im + is;
|
||||
@@ -683,7 +645,7 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
||||
|
||||
const float * y = yy + i * QK_K + y_offset;
|
||||
const uint8_t * ql = x[i].ql + ql_offset;
|
||||
@@ -692,17 +654,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
#if K_QUANTS_PER_ITERATION == 1
|
||||
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
||||
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
||||
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
||||
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
|
||||
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
|
||||
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
|
||||
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
||||
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
||||
tmp += sum;
|
||||
#else
|
||||
float sum = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
||||
@@ -711,20 +662,18 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
||||
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
||||
}
|
||||
tmp += sum;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
|
||||
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
|
||||
const int tid = item_ct1.get_local_id(2)/4; // 0...7
|
||||
const int ix = item_ct1.get_local_id(2)%4; // 0...3
|
||||
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
const int step = tid * 2;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
for (int i = ix; i < num_blocks_per_row; i += 4) {
|
||||
|
||||
const float * y = yy + i * QK_K + step;
|
||||
const uint8_t * ql = x[i].ql + step;
|
||||
@@ -734,7 +683,7 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
||||
const float d = x[i+0].d;
|
||||
|
||||
float sum = 0;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
||||
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
||||
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
||||
@@ -748,7 +697,7 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -774,7 +723,7 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
@@ -795,7 +744,7 @@ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
@@ -816,7 +765,7 @@ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
@@ -837,7 +786,7 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
@@ -858,7 +807,7 @@ static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
@@ -870,13 +819,13 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
|
||||
const int ny = 2;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, ny, 32);
|
||||
const sycl::range<3> block_dims(1, ny, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -886,13 +835,13 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int ny = 1;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, ny, 32);
|
||||
const sycl::range<3> block_dims(1, ny, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -902,13 +851,13 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int ny = 1;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, ny, 32);
|
||||
const sycl::range<3> block_dims(1, ny, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -918,10 +867,10 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const sycl::range<3> block_dims(1, 1, 32);
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -931,13 +880,13 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int ny = 1;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, ny, 32);
|
||||
const sycl::range<3> block_dims(1, ny, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -255,7 +255,7 @@ namespace dpct
|
||||
void set_pitch(size_t pitch) { _pitch = pitch; }
|
||||
|
||||
size_t get_x() { return _x; }
|
||||
void set_x(size_t x) { _x = x; };
|
||||
void set_x(size_t x) { _x = x; }
|
||||
|
||||
size_t get_y() { return _y; }
|
||||
void set_y(size_t y) { _y = y; }
|
||||
@@ -1056,7 +1056,7 @@ namespace dpct
|
||||
#error "Only support Windows and Linux."
|
||||
#endif
|
||||
next_free = mapped_address_space;
|
||||
};
|
||||
}
|
||||
|
||||
public:
|
||||
using buffer_id_t = int;
|
||||
@@ -1077,7 +1077,7 @@ namespace dpct
|
||||
#else
|
||||
#error "Only support Windows and Linux."
|
||||
#endif
|
||||
};
|
||||
}
|
||||
|
||||
mem_mgr(const mem_mgr &) = delete;
|
||||
mem_mgr &operator=(const mem_mgr &) = delete;
|
||||
|
||||
+64
-61
@@ -37,7 +37,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -85,7 +85,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -133,7 +133,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -181,7 +181,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -229,7 +229,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -277,7 +277,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -325,7 +325,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -373,7 +373,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -421,7 +421,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -470,7 +470,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
@@ -495,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
|
||||
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
@@ -519,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
|
||||
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
@@ -543,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
|
||||
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
@@ -567,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
|
||||
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
@@ -591,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
|
||||
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
@@ -615,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
|
||||
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
@@ -639,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
|
||||
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
@@ -663,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
|
||||
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
@@ -687,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
|
||||
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
@@ -711,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
|
||||
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
@@ -734,8 +734,8 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS, block_iq2_xxs, 1>(
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
@@ -759,8 +759,8 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS, block_iq2_xs, 1>(
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
@@ -784,8 +784,8 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>(
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
@@ -809,8 +809,8 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS, block_iq3_xxs, 1>(
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
@@ -833,8 +833,8 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
@@ -858,7 +858,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
@@ -879,7 +879,7 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
@@ -901,7 +901,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
@@ -923,8 +923,8 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS, block_iq4_xs, 1>(
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
@@ -936,7 +936,7 @@ void ggml_sycl_op_mul_mat_vec_q(
|
||||
const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
|
||||
float *dst_dd_i, const int64_t row_low, const int64_t row_high,
|
||||
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
||||
const int64_t src1_ncols, const int64_t src1_padded_col_size,
|
||||
const dpct::queue_ptr &stream) {
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
@@ -948,77 +948,80 @@ void ggml_sycl_op_mul_mat_vec_q(
|
||||
int id;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||
|
||||
const size_t q8_1_ts = sizeof(block_q8_1);
|
||||
const size_t q8_1_bs = QK8_1;
|
||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||
// nrows_dst == nrows of the matrix that the kernel writes into
|
||||
const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff;
|
||||
|
||||
switch (src0->type) {
|
||||
for (int i = 0; i < src1_ncols; i++)
|
||||
{
|
||||
const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
|
||||
const char* src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
|
||||
float* dst_dd_i_bs = dst_dd_i + i * dst->ne[0];
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_ddf_i;
|
||||
(void) src1_ncols;
|
||||
(void) src1_padded_row_size;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,370 @@
|
||||
#include "norm.hpp"
|
||||
|
||||
static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
|
||||
const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
|
||||
const int nthreads = item_ct1.get_local_range(2);
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
assert(nwarps % WARP_SIZE == 0);
|
||||
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[row * ncols + col];
|
||||
mean_var.x() += xi;
|
||||
mean_var.y() += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
||||
if (block_size > WARP_SIZE) {
|
||||
|
||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = mean_var;
|
||||
}
|
||||
/*
|
||||
DPCT1118:0: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
mean_var = 0.f;
|
||||
int nreduce = nwarps / WARP_SIZE;
|
||||
for (size_t i = 0; i < nreduce; i += 1)
|
||||
{
|
||||
mean_var += s_sum[lane_id + i * WARP_SIZE];
|
||||
}
|
||||
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
||||
}
|
||||
|
||||
const float mean = mean_var.x() / ncols;
|
||||
const float var = mean_var.y() / ncols - mean * mean;
|
||||
const float inv_std = sycl::rsqrt(var + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
|
||||
}
|
||||
}
|
||||
|
||||
static void group_norm_f32(const float* x, float* dst, const int group_size, const int ne_elements, const float eps,
|
||||
const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
|
||||
int start = item_ct1.get_group(2) * group_size;
|
||||
int end = start + group_size;
|
||||
const int nthreads = item_ct1.get_local_range(2);
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
assert(nwarps % WARP_SIZE == 0);
|
||||
start += item_ct1.get_local_id(2);
|
||||
|
||||
if (end >= ne_elements) {
|
||||
end = ne_elements;
|
||||
}
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int j = start; j < end; j += block_size) {
|
||||
tmp += x[j];
|
||||
}
|
||||
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
if (block_size > WARP_SIZE) {
|
||||
|
||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
/*
|
||||
DPCT1118:1: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
tmp = 0.f;
|
||||
int nreduce = nwarps / WARP_SIZE;
|
||||
for (size_t i = 0; i < nreduce; i += 1)
|
||||
{
|
||||
tmp += s_sum[lane_id + i * WARP_SIZE];
|
||||
}
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
}
|
||||
|
||||
float mean = tmp / group_size;
|
||||
tmp = 0.0f;
|
||||
|
||||
for (int j = start; j < end; j += block_size) {
|
||||
float xi = x[j] - mean;
|
||||
dst[j] = xi;
|
||||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
if (block_size > WARP_SIZE) {
|
||||
|
||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
/*
|
||||
DPCT1118:2: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
tmp = s_sum[lane_id];
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
}
|
||||
|
||||
float variance = tmp / group_size;
|
||||
float scale = sycl::rsqrt(variance + eps);
|
||||
for (int j = start; j < end; j += block_size) {
|
||||
dst[j] *= scale;
|
||||
}
|
||||
}
|
||||
|
||||
static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
|
||||
const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int nthreads = item_ct1.get_local_range(2);
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
assert(nwarps % WARP_SIZE == 0);
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[row * ncols + col];
|
||||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
if (block_size > WARP_SIZE) {
|
||||
|
||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
/*
|
||||
DPCT1118:3: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
int nreduce = nwarps / WARP_SIZE;
|
||||
tmp = 0.f;
|
||||
for (size_t i = 0; i < nreduce; i += 1)
|
||||
{
|
||||
tmp += s_sum[lane_id + i * WARP_SIZE];
|
||||
}
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
}
|
||||
|
||||
const float mean = tmp / ncols;
|
||||
const float scale = sycl::rsqrt(mean + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[row * ncols + col] = scale * x[row * ncols + col];
|
||||
}
|
||||
}
|
||||
|
||||
static void norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
const int nrows, const float eps,
|
||||
queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
if (ncols < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
}
|
||||
else {
|
||||
const int work_group_size = get_work_group_size(stream->get_device());
|
||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||
/*
|
||||
DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
|
||||
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void group_norm_f32_sycl(const float* x, float* dst,
|
||||
const int num_groups, const int group_size,
|
||||
const int ne_elements, queue_ptr stream) {
|
||||
static const float eps = 1e-6f;
|
||||
if (group_size < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
const float eps_ct4 = eps;
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
group_norm_f32(
|
||||
x, dst, group_size, ne_elements, eps_ct4, item_ct1,
|
||||
nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
}
|
||||
else {
|
||||
const int work_group_size = get_work_group_size(stream->get_device());
|
||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||
/*
|
||||
DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
||||
cgh);
|
||||
|
||||
const float eps_ct4 = eps;
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
group_norm_f32(x, dst, group_size, ne_elements,
|
||||
eps_ct4, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
const int nrows, const float eps,
|
||||
queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
|
||||
if (ncols < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
}
|
||||
else {
|
||||
const int work_group_size = get_work_group_size(stream->get_device());
|
||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||
/*
|
||||
DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
||||
cgh);
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
|
||||
ggml_tensor* dst, const float* src0_dd,
|
||||
const float* src1_dd, float* dst_dd,
|
||||
const queue_ptr& main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
||||
|
||||
(void)src1;
|
||||
(void)dst;
|
||||
(void)src1_dd;
|
||||
}
|
||||
|
||||
void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
|
||||
const ggml_tensor* src1, ggml_tensor* dst,
|
||||
const float* src0_dd, const float* src1_dd,
|
||||
float* dst_dd,
|
||||
const queue_ptr& main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
int num_groups = dst->op_params[0];
|
||||
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
|
||||
group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
|
||||
|
||||
(void)src1;
|
||||
(void)dst;
|
||||
(void)src1_dd;
|
||||
}
|
||||
|
||||
void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
|
||||
const ggml_tensor* src1, ggml_tensor* dst,
|
||||
const float* src0_dd, const float* src1_dd,
|
||||
float* dst_dd,
|
||||
const queue_ptr& main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
||||
|
||||
(void)src1;
|
||||
(void)dst;
|
||||
(void)src1_dd;
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_NORM_HPP
|
||||
#define GGML_SYCL_NORM_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
|
||||
ggml_tensor* dst, const float* src0_dd,
|
||||
const float* src1_dd, float* dst_dd,
|
||||
const queue_ptr& main_stream);
|
||||
|
||||
void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
|
||||
const ggml_tensor* src1, ggml_tensor* dst,
|
||||
const float* src0_dd, const float* src1_dd,
|
||||
float* dst_dd,
|
||||
const queue_ptr& main_stream);
|
||||
|
||||
void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
|
||||
const ggml_tensor* src1, ggml_tensor* dst,
|
||||
const float* src0_dd, const float* src1_dd,
|
||||
float* dst_dd,
|
||||
const queue_ptr& main_stream);
|
||||
|
||||
#endif // GGML_SYCL_NORM_HPP
|
||||
@@ -16,7 +16,7 @@
|
||||
#define GGML_SYCL_MAX_STREAMS 8
|
||||
#define GGML_SYCL_MAX_BUFFERS 256
|
||||
|
||||
#define WARP_SIZE 32
|
||||
#define WARP_SIZE GGML_SYCL_WARP_SIZE
|
||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||
|
||||
#define SYCL_GELU_BLOCK_SIZE 256
|
||||
@@ -50,12 +50,6 @@
|
||||
#define GGML_SYCL_MMV_Y 1
|
||||
#endif
|
||||
|
||||
#ifndef K_QUANTS_PER_ITERATION
|
||||
#define K_QUANTS_PER_ITERATION 2
|
||||
#else
|
||||
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
|
||||
#endif
|
||||
|
||||
#ifndef GGML_SYCL_PEER_MAX_BATCH_SIZE
|
||||
#define GGML_SYCL_PEER_MAX_BATCH_SIZE 128
|
||||
#endif // GGML_SYCL_PEER_MAX_BATCH_SIZE
|
||||
|
||||
@@ -0,0 +1,275 @@
|
||||
#include "rope.hpp"
|
||||
|
||||
struct rope_corr_dims {
|
||||
float v[2];
|
||||
};
|
||||
|
||||
static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
||||
const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
|
||||
return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
|
||||
}
|
||||
|
||||
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
||||
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
||||
static void rope_yarn(
|
||||
float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
|
||||
float * cos_theta, float * sin_theta) {
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale);
|
||||
}
|
||||
*cos_theta = sycl::cos(theta) * mscale;
|
||||
*sin_theta = sycl::sin(theta) * mscale;
|
||||
}
|
||||
|
||||
template<typename T, bool has_ff>
|
||||
static void rope_norm(
|
||||
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1));
|
||||
|
||||
if (i0 >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
const int i = row*ne0 + i0;
|
||||
|
||||
dst[i + 0] = x[i + 0];
|
||||
dst[i + 1] = x[i + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const int i = row*ne0 + i0;
|
||||
const int i2 = row/p_delta_rows;
|
||||
|
||||
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + 1];
|
||||
|
||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
template<typename T, bool has_ff>
|
||||
static void rope_neox(
|
||||
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1));
|
||||
|
||||
if (i0 >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
const int i = row*ne0 + i0;
|
||||
|
||||
dst[i + 0] = x[i + 0];
|
||||
dst[i + 1] = x[i + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const int i = row*ne0 + i0/2;
|
||||
const int i2 = row/p_delta_rows;
|
||||
|
||||
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + n_dims/2];
|
||||
|
||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void rope_norm_sycl(
|
||||
const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows,
|
||||
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, num_blocks_x, nr);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
/*
|
||||
DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_norm<T, false>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
|
||||
ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
|
||||
item_ct1);
|
||||
});
|
||||
} else {
|
||||
/*
|
||||
DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_norm<T, true>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
|
||||
ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void rope_neox_sycl(
|
||||
const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows,
|
||||
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, num_blocks_x, nr);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, false>(x, dst, ne0, n_dims, pos, freq_scale,
|
||||
p_delta_rows, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, freq_factors,
|
||||
item_ct1);
|
||||
});
|
||||
} else {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, true>(x, dst, ne0, n_dims, pos, freq_scale,
|
||||
p_delta_rows, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, freq_factors,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_sycl_op_rope(
|
||||
ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream) {
|
||||
const ggml_tensor * src2 = dst->src[2];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t nr = ggml_nrows(src0);
|
||||
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
//const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
||||
|
||||
// RoPE alteration for extended context
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
||||
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
|
||||
const int32_t * pos = (const int32_t *) src1_dd;
|
||||
|
||||
const float * freq_factors = nullptr;
|
||||
if (src2 != nullptr) {
|
||||
freq_factors = (const float *) src2->data;
|
||||
}
|
||||
|
||||
rope_corr_dims corr_dims;
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
|
||||
|
||||
// compute
|
||||
if (is_neox) {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
rope_neox_sycl(
|
||||
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, freq_factors, main_stream
|
||||
);
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
rope_neox_sycl(
|
||||
(const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, freq_factors, main_stream
|
||||
);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
rope_norm_sycl(
|
||||
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, freq_factors, main_stream
|
||||
);
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
rope_norm_sycl(
|
||||
(const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, freq_factors, main_stream
|
||||
);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_dd;
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_ROPE_HPP
|
||||
#define GGML_SYCL_ROPE_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
void ggml_sycl_op_rope(
|
||||
ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream);
|
||||
|
||||
#endif // GGML_SYCL_ROPE_HPP
|
||||
@@ -820,7 +820,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
||||
#if QK_K == 256
|
||||
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
||||
|
||||
#if QR2_XXS == 8
|
||||
const int ib32 = iqs;
|
||||
const uint16_t * q2 = bq2->qs + 4*ib32;
|
||||
const uint8_t * aux8 = (const uint8_t *)q2;
|
||||
@@ -838,26 +837,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
||||
}
|
||||
const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f;
|
||||
return d * sumi;
|
||||
#else
|
||||
// iqs is 0...15
|
||||
const int ib32 = iqs/2;
|
||||
const int il = iqs%2;
|
||||
const uint16_t * q2 = bq2->qs + 4*ib32;
|
||||
const uint8_t * aux8 = (const uint8_t *)q2;
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
||||
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
||||
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * bq8_1[ib32].ds[0] * 0.25f;
|
||||
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
||||
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
|
||||
const int8_t * q8 = bq8_1[ib32].qs + 16*il;
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
|
||||
sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
return d * (sumi1 + sumi2);
|
||||
#endif
|
||||
#else
|
||||
assert(false);
|
||||
return 0.f;
|
||||
|
||||
@@ -144954,4 +144954,3 @@ unsigned char sum_rows_f32_data[] = {
|
||||
|
||||
};
|
||||
const uint64_t sum_rows_f32_len = 2112;
|
||||
|
||||
|
||||
@@ -44,12 +44,6 @@
|
||||
|
||||
#define MAX_VK_BUFFERS 256
|
||||
|
||||
#ifndef K_QUANTS_PER_ITERATION
|
||||
#define K_QUANTS_PER_ITERATION 1
|
||||
#else
|
||||
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
|
||||
#endif
|
||||
|
||||
#define VK_CHECK(err, msg) \
|
||||
do { \
|
||||
vk::Result err_ = (err); \
|
||||
|
||||
+1
-1
@@ -5312,7 +5312,7 @@ void ggml_mul_mat_set_prec(
|
||||
as -> [cols, rows, n_expert]
|
||||
ids -> [n_experts_used, n_tokens] (i32)
|
||||
b -> [cols, n_expert_used, n_tokens]
|
||||
c -> [cols, n_expert_used, n_tokens]
|
||||
c -> [rows, n_expert_used, n_tokens]
|
||||
|
||||
in b, n_experts_used can be broadcasted to match the n_expert_used of ids
|
||||
|
||||
|
||||
@@ -2,8 +2,6 @@
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
|
||||
#define K_QUANTS_PER_ITERATION 2
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
#define EXPERT_COUNT 8
|
||||
#endif
|
||||
|
||||
@@ -15,22 +15,22 @@ void main() {
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||
const uint tid = gl_LocalInvocationID.x/2; // 0...16
|
||||
const uint ix = gl_LocalInvocationID.x%2; // 0, 1
|
||||
|
||||
const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
||||
const uint step = 8;
|
||||
|
||||
const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const uint v_in = tid - step*v_im; // 0...15 or 0...7
|
||||
|
||||
const uint l0 = K_QUANTS_PER_ITERATION*v_in; // 0...15
|
||||
const uint l0 = 2*v_in; // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint s_offset = 8*v_im;
|
||||
const uint y_offset = 128*v_im + l0;
|
||||
|
||||
tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x);
|
||||
@@ -38,7 +38,7 @@ void main() {
|
||||
|
||||
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
|
||||
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
sum1 += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3)
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3)
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3)
|
||||
|
||||
@@ -15,17 +15,17 @@ void main() {
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||
const uint tid = gl_LocalInvocationID.x/2; // 0...16
|
||||
const uint ix = gl_LocalInvocationID.x%2; // 0, 1
|
||||
|
||||
const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
||||
const uint step = 8;
|
||||
|
||||
const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const uint v_in = tid - step*v_im; // 0...15 or 0...7
|
||||
|
||||
const uint8_t m = uint8_t(1 << (4 * v_im));
|
||||
|
||||
const uint l0 = K_QUANTS_PER_ITERATION*v_in; // 0...15
|
||||
const uint l0 = 2*v_in; // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint y_offset = 128*v_im + l0;
|
||||
|
||||
@@ -33,13 +33,13 @@ void main() {
|
||||
|
||||
const uint s_shift = 4 * v_im;
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
sum += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4))
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4))
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4))
|
||||
|
||||
@@ -15,14 +15,14 @@ void main() {
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||
const uint tid = gl_LocalInvocationID.x/2; // 0...16
|
||||
const uint ix = gl_LocalInvocationID.x%2; // 0, 1
|
||||
|
||||
const uint step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
|
||||
const uint step = 4;
|
||||
|
||||
const uint il = tid/step; // 0...3
|
||||
const uint ir = tid - step*il; // 0...7 or 0...3
|
||||
const uint n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
|
||||
const uint n = 4;
|
||||
|
||||
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
const uint v_in = il % 2;
|
||||
@@ -33,7 +33,7 @@ void main() {
|
||||
|
||||
tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) {
|
||||
const uint y1_idx = i * QUANT_K + y_offset;
|
||||
const uint y2_idx = y1_idx + 128;
|
||||
|
||||
@@ -49,7 +49,6 @@ void main() {
|
||||
const uint8_t sc6 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 8] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 4] & 0xc0) >> 2));
|
||||
const uint8_t sc7 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 9] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 5] & 0xc0) >> 2));
|
||||
|
||||
#if K_QUANTS_PER_ITERATION == 2
|
||||
const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf);
|
||||
const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf);
|
||||
const uint8_t q4_2 = uint8_t(data_a[ib0 + i].qs[q_offset + 2] & 0xf);
|
||||
@@ -78,27 +77,6 @@ void main() {
|
||||
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7
|
||||
);
|
||||
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
|
||||
#else
|
||||
const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf);
|
||||
const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf);
|
||||
const uint8_t q4_2 = uint8_t(data_a[ib0 + i].qs[q_offset ] >> 4);
|
||||
const uint8_t q4_3 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] >> 4);
|
||||
const uint8_t q4_4 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf);
|
||||
const uint8_t q4_5 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf);
|
||||
const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4);
|
||||
const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4);
|
||||
|
||||
const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx ]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1);
|
||||
const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3);
|
||||
const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx ]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_5);
|
||||
const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7);
|
||||
const FLOAT_TYPE smin = FLOAT_TYPE(
|
||||
FLOAT_TYPE(data_b[b_offset + y1_idx]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7
|
||||
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7
|
||||
);
|
||||
|
||||
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) + sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin);
|
||||
#endif
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
|
||||
@@ -15,21 +15,16 @@ void main() {
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||
const uint tid = gl_LocalInvocationID.x/2; // 0...16
|
||||
const uint ix = gl_LocalInvocationID.x%2; // 0, 1
|
||||
|
||||
const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
||||
const uint step = 8;
|
||||
|
||||
const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const uint v_in = tid - step*v_im; // 0...15 or 0...7
|
||||
|
||||
#if K_QUANTS_PER_ITERATION == 1
|
||||
const uint l0 = v_in; // 0...15
|
||||
const uint is = 0;
|
||||
#else
|
||||
const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28
|
||||
const uint is = v_in / 4;
|
||||
#endif
|
||||
|
||||
const uint ql_offset = 64*v_im + l0;
|
||||
const uint qh_offset = 32*v_im + l0;
|
||||
@@ -38,22 +33,11 @@ void main() {
|
||||
|
||||
tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||
|
||||
#if K_QUANTS_PER_ITERATION == 1
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32)
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32)
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32)
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32)
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32)
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32)
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32)
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32);
|
||||
tmp[16 * ix + tid] += sum;
|
||||
#else
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 4; ++l) {
|
||||
sum += FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32)
|
||||
@@ -62,7 +46,6 @@ void main() {
|
||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32);
|
||||
}
|
||||
tmp[16 * ix + tid] += sum;
|
||||
#endif
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
|
||||
+5
-5
@@ -3,7 +3,7 @@
|
||||
This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
|
||||
(GGML Universal File) format.
|
||||
|
||||
See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py)
|
||||
See [convert_hf_to_gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py)
|
||||
as an example for its usage.
|
||||
|
||||
## Installation
|
||||
@@ -15,13 +15,13 @@ pip install gguf
|
||||
|
||||
[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
|
||||
|
||||
[scripts/gguf-dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-dump.py) — Dumps a GGUF file's metadata to the console.
|
||||
[scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
|
||||
|
||||
[scripts/gguf-set-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-set-metadata.py) — Allows changing simple metadata values in a GGUF file by key.
|
||||
[scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
|
||||
|
||||
[scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files.
|
||||
[scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
|
||||
|
||||
[scripts/gguf-new-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-new-metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
|
||||
[scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
|
||||
|
||||
## Development
|
||||
Maintainers who participate in development of this package are advised to install it in editable mode:
|
||||
|
||||
@@ -50,6 +50,8 @@ class Keys:
|
||||
POOLING_TYPE = "{arch}.pooling_type"
|
||||
LOGIT_SCALE = "{arch}.logit_scale"
|
||||
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
||||
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
|
||||
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
@@ -64,6 +66,7 @@ class Keys:
|
||||
Q_LORA_RANK = "{arch}.attention.q_lora_rank"
|
||||
KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
|
||||
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
|
||||
SLIDING_WINDOW = "{arch}.attention.sliding_window"
|
||||
|
||||
class Rope:
|
||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||
@@ -150,16 +153,19 @@ class MODEL_ARCH(IntEnum):
|
||||
INTERNLM2 = auto()
|
||||
MINICPM = auto()
|
||||
GEMMA = auto()
|
||||
GEMMA2 = auto()
|
||||
STARCODER2 = auto()
|
||||
MAMBA = auto()
|
||||
XVERSE = auto()
|
||||
COMMAND_R = auto()
|
||||
DBRX = auto()
|
||||
OLMO = auto()
|
||||
OPENELM = auto()
|
||||
ARCTIC = auto()
|
||||
DEEPSEEK2 = auto()
|
||||
BITNET = auto()
|
||||
T5 = auto()
|
||||
JAIS = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@@ -180,10 +186,13 @@ class MODEL_TENSOR(IntEnum):
|
||||
ATTN_NORM = auto()
|
||||
ATTN_NORM_2 = auto()
|
||||
ATTN_OUT_NORM = auto()
|
||||
ATTN_POST_NORM = auto()
|
||||
ATTN_ROT_EMBD = auto()
|
||||
FFN_GATE_INP = auto()
|
||||
FFN_GATE_INP_SHEXP = auto()
|
||||
FFN_NORM = auto()
|
||||
FFN_PRE_NORM = auto()
|
||||
FFN_POST_NORM = auto()
|
||||
FFN_GATE = auto()
|
||||
FFN_DOWN = auto()
|
||||
FFN_UP = auto()
|
||||
@@ -270,16 +279,19 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.INTERNLM2: "internlm2",
|
||||
MODEL_ARCH.MINICPM: "minicpm",
|
||||
MODEL_ARCH.GEMMA: "gemma",
|
||||
MODEL_ARCH.GEMMA2: "gemma2",
|
||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||
MODEL_ARCH.MAMBA: "mamba",
|
||||
MODEL_ARCH.XVERSE: "xverse",
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
MODEL_ARCH.DBRX: "dbrx",
|
||||
MODEL_ARCH.OLMO: "olmo",
|
||||
MODEL_ARCH.OPENELM: "openelm",
|
||||
MODEL_ARCH.ARCTIC: "arctic",
|
||||
MODEL_ARCH.DEEPSEEK2: "deepseek2",
|
||||
MODEL_ARCH.BITNET: "bitnet",
|
||||
MODEL_ARCH.T5: "t5",
|
||||
MODEL_ARCH.JAIS: "jais",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@@ -303,9 +315,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
||||
MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
|
||||
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
@@ -751,6 +766,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
],
|
||||
MODEL_ARCH.GEMMA2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_POST_NORM,
|
||||
MODEL_TENSOR.FFN_PRE_NORM,
|
||||
MODEL_TENSOR.FFN_POST_NORM,
|
||||
],
|
||||
MODEL_ARCH.STARCODER2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
@@ -833,6 +863,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.OPENELM: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.ARCTIC: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
@@ -928,6 +971,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.ENC_FFN_UP,
|
||||
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
||||
],
|
||||
MODEL_ARCH.JAIS: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
||||
@@ -480,8 +480,11 @@ class GGUFWriter:
|
||||
def add_leading_dense_block_count(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length)
|
||||
|
||||
def add_feed_forward_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||
def add_feed_forward_length(self, length: int | Sequence[int]) -> None:
|
||||
if isinstance(length, int):
|
||||
self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||
else:
|
||||
self.add_array(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_expert_feed_forward_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||
@@ -495,11 +498,17 @@ class GGUFWriter:
|
||||
def add_decoder_start_token_id(self, id: int) -> None:
|
||||
self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
|
||||
|
||||
def add_head_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
|
||||
def add_head_count(self, count: int | Sequence[int]) -> None:
|
||||
if isinstance(count, int):
|
||||
self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
|
||||
else:
|
||||
self.add_array(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
|
||||
|
||||
def add_head_count_kv(self, count: int) -> None:
|
||||
self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
|
||||
def add_head_count_kv(self, count: int | Sequence[int]) -> None:
|
||||
if isinstance(count, int):
|
||||
self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
|
||||
else:
|
||||
self.add_array(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
|
||||
|
||||
def add_key_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length)
|
||||
@@ -516,6 +525,12 @@ class GGUFWriter:
|
||||
def add_logit_scale(self, value: float) -> None:
|
||||
self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
|
||||
|
||||
def add_attn_logit_softcapping(self, value: float) -> None:
|
||||
self.add_float32(Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
|
||||
|
||||
def add_final_logit_softcapping(self, value: float) -> None:
|
||||
self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
|
||||
|
||||
def add_expert_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
|
||||
|
||||
@@ -546,6 +561,9 @@ class GGUFWriter:
|
||||
def add_relative_attn_buckets_count(self, value: int) -> None:
|
||||
self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
|
||||
|
||||
def add_sliding_window(self, value: int) -> None:
|
||||
self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
|
||||
|
||||
def add_pooling_type(self, value: PoolingType) -> None:
|
||||
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ class TensorNameMap:
|
||||
# Token embeddings
|
||||
MODEL_TENSOR.TOKEN_EMBD: (
|
||||
"gpt_neox.embed_in", # gptneox
|
||||
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx
|
||||
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais
|
||||
"transformer.word_embeddings", # falcon
|
||||
"word_embeddings", # bloom
|
||||
"model.embed_tokens", # llama-hf
|
||||
@@ -24,6 +24,7 @@ class TensorNameMap:
|
||||
"backbone.embedding", # mamba
|
||||
"backbone.embeddings", # mamba-hf
|
||||
"transformer.in_out_embed", # Grok
|
||||
"transformer.token_embeddings", # openelm
|
||||
"shared", # t5
|
||||
),
|
||||
|
||||
@@ -37,6 +38,7 @@ class TensorNameMap:
|
||||
"word_embeddings_layernorm", # bloom
|
||||
"embeddings.LayerNorm", # bert
|
||||
"emb_ln", # nomic-bert
|
||||
"transformer.norm", # openelm
|
||||
),
|
||||
|
||||
# Position embeddings
|
||||
@@ -49,7 +51,7 @@ class TensorNameMap:
|
||||
# Output
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
|
||||
"output", # llama-pth bloom internlm2
|
||||
"word_embeddings_for_head", # persimmon
|
||||
"lm_head.linear", # phi2
|
||||
@@ -58,7 +60,7 @@ class TensorNameMap:
|
||||
# Output norm
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 gpt-j falcon
|
||||
"transformer.ln_f", # gpt2 gpt-j falcon jais
|
||||
"model.norm", # llama-hf baichuan internlm2
|
||||
"norm", # llama-pth
|
||||
"transformer.norm_f", # mpt dbrx
|
||||
@@ -69,6 +71,7 @@ class TensorNameMap:
|
||||
"model.norm_f", # mamba-qbert
|
||||
"backbone.norm_f", # mamba
|
||||
"transformer.rms_norm", # Grok
|
||||
"transformer.norm", # openelm
|
||||
),
|
||||
|
||||
# Rope frequencies
|
||||
@@ -81,7 +84,7 @@ class TensorNameMap:
|
||||
# Attention norm
|
||||
MODEL_TENSOR.ATTN_NORM: (
|
||||
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen
|
||||
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais
|
||||
"transformer.blocks.{bid}.norm_1", # mpt
|
||||
"transformer.h.{bid}.input_layernorm", # falcon7b
|
||||
"h.{bid}.input_layernorm", # bloom
|
||||
@@ -98,6 +101,7 @@ class TensorNameMap:
|
||||
"backbone.layers.{bid}.norm", # mamba
|
||||
"transformer.decoder_layer.{bid}.rms_norm", # Grok
|
||||
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
|
||||
"transformer.layers.{bid}.attn_norm", # openelm
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
@@ -109,7 +113,7 @@ class TensorNameMap:
|
||||
# Attention query-key-value
|
||||
MODEL_TENSOR.ATTN_QKV: (
|
||||
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
||||
"transformer.h.{bid}.attn.c_attn", # gpt2 qwen
|
||||
"transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais
|
||||
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
||||
"transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx
|
||||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||
@@ -119,7 +123,8 @@ class TensorNameMap:
|
||||
"h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
|
||||
"model.layers.{bid}.self_attn.qkv_proj" # phi3
|
||||
"model.layers.{bid}.self_attn.qkv_proj", # phi3
|
||||
"transformer.layers.{bid}.attn.qkv_proj", # openelm
|
||||
),
|
||||
|
||||
# Attention query
|
||||
@@ -160,7 +165,7 @@ class TensorNameMap:
|
||||
# Attention output
|
||||
MODEL_TENSOR.ATTN_OUT: (
|
||||
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
||||
"transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen
|
||||
"transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais
|
||||
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||
"h.{bid}.self_attention.dense", # bloom
|
||||
@@ -177,6 +182,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
|
||||
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
|
||||
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
|
||||
"transformer.layers.{bid}.attn.out_proj", # openelm
|
||||
),
|
||||
|
||||
# Attention output norm
|
||||
@@ -187,6 +193,10 @@ class TensorNameMap:
|
||||
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_POST_NORM: (
|
||||
"model.layers.{bid}.post_attention_layernorm", # gemma2
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: (
|
||||
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
||||
@@ -198,7 +208,7 @@ class TensorNameMap:
|
||||
# Feed-forward norm
|
||||
MODEL_TENSOR.FFN_NORM: (
|
||||
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_2", # gpt2 refact qwen
|
||||
"transformer.h.{bid}.ln_2", # gpt2 refact qwen jais
|
||||
"h.{bid}.post_attention_layernorm", # bloom
|
||||
"transformer.blocks.{bid}.norm_2", # mpt
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||
@@ -208,6 +218,17 @@ class TensorNameMap:
|
||||
"h.{bid}.ln_2", # gpt2
|
||||
"model.layers.{bid}.ffn_norm", # internlm2
|
||||
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
|
||||
"transformer.layers.{bid}.ffn_norm", # openelm
|
||||
),
|
||||
|
||||
# Post feed-forward norm
|
||||
MODEL_TENSOR.FFN_PRE_NORM: (
|
||||
"model.layers.{bid}.pre_feedforward_layernorm", # gemma2
|
||||
),
|
||||
|
||||
# Post feed-forward norm
|
||||
MODEL_TENSOR.FFN_POST_NORM: (
|
||||
"model.layers.{bid}.post_feedforward_layernorm", # gemma2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
@@ -225,7 +246,7 @@ class TensorNameMap:
|
||||
# Feed-forward up
|
||||
MODEL_TENSOR.FFN_UP: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.h.{bid}.mlp.c_fc", # gpt2 jais
|
||||
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
||||
"h.{bid}.mlp.dense_h_to_4h", # bloom
|
||||
@@ -271,6 +292,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||
"transformer.h.{bid}.mlp.w2", # qwen
|
||||
"transformer.h.{bid}.mlp.c_fc2", # jais
|
||||
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w1", # internlm2
|
||||
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
||||
@@ -294,7 +316,7 @@ class TensorNameMap:
|
||||
# Feed-forward down
|
||||
MODEL_TENSOR.FFN_DOWN: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen
|
||||
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais
|
||||
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||
"h.{bid}.mlp.dense_4h_to_h", # bloom
|
||||
@@ -312,6 +334,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
||||
"model.layers.{bid}.mlp.c_proj", # starcoder2
|
||||
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
|
||||
"transformer.layers.{bid}.ffn.proj_2", # openelm
|
||||
"model.layers.{bid}.residual_mlp.w2", # arctic
|
||||
"encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
|
||||
),
|
||||
@@ -333,7 +356,8 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.q_norm", # cohere
|
||||
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_q" # jina-bert-v2
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.q_norm", # openelm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_K_NORM: (
|
||||
@@ -341,7 +365,8 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.k_norm", # cohere
|
||||
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_k" # jina-bert-v2
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.k_norm", # openelm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ROPE_FREQS: (
|
||||
|
||||
@@ -1,13 +1,4 @@
|
||||
import os
|
||||
|
||||
from importlib import import_module
|
||||
|
||||
|
||||
os.environ["NO_LOCAL_GGUF"] = "TRUE"
|
||||
|
||||
gguf_convert_endian_entrypoint = import_module("scripts.gguf-convert-endian").main
|
||||
gguf_dump_entrypoint = import_module("scripts.gguf-dump").main
|
||||
gguf_set_metadata_entrypoint = import_module("scripts.gguf-set-metadata").main
|
||||
gguf_new_metadata_entrypoint = import_module("scripts.gguf-new-metadata").main
|
||||
|
||||
del import_module, os
|
||||
from .gguf_convert_endian import main as gguf_convert_endian_entrypoint
|
||||
from .gguf_dump import main as gguf_dump_entrypoint
|
||||
from .gguf_set_metadata import main as gguf_set_metadata_entrypoint
|
||||
from .gguf_new_metadata import main as gguf_new_metadata_entrypoint
|
||||
|
||||
+248
-10
@@ -126,19 +126,257 @@ You can use GBNF grammars:
|
||||
- in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
|
||||
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
|
||||
|
||||
Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
|
||||
Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
|
||||
|
||||
Here is also a non-exhaustive list of **unsupported** features:
|
||||
```bash
|
||||
llama-cli \
|
||||
-hfr bartowski/Phi-3-medium-128k-instruct-GGUF \
|
||||
-hff Phi-3-medium-128k-instruct-Q8_0.gguf \
|
||||
-j '{
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"maxLength": 100
|
||||
},
|
||||
"age": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"maximum": 150
|
||||
}
|
||||
},
|
||||
"required": ["name", "age"],
|
||||
"additionalProperties": false
|
||||
},
|
||||
"minItems": 10,
|
||||
"maxItems": 100
|
||||
}' \
|
||||
-p 'Generate a {name, age}[] JSON array with famous actors of all ages.'
|
||||
```
|
||||
|
||||
- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
|
||||
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
|
||||
- `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
|
||||
- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
|
||||
- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
|
||||
- `string` formats `uri`, `email`
|
||||
<details>
|
||||
|
||||
<summary>Show grammar</summary>
|
||||
|
||||
You can convert any schema in command-line with:
|
||||
|
||||
```bash
|
||||
examples/json_schema_to_grammar.py name-age-schema.json
|
||||
```
|
||||
|
||||
```
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
item ::= "{" space item-name-kv "," space item-age-kv "}" space
|
||||
item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space
|
||||
item-age-kv ::= "\"age\"" space ":" space item-age
|
||||
item-name ::= "\"" char{1,100} "\"" space
|
||||
item-name-kv ::= "\"name\"" space ":" space item-name
|
||||
root ::= "[" space item ("," space item){9,99} "]" space
|
||||
space ::= | " " | "\n" [ \t]{0,20}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
Here is also a list of known limitations (contributions welcome):
|
||||
|
||||
- `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations).
|
||||
- `"additionalProperties": true` may produce keys that contain unescaped newlines.
|
||||
- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
|
||||
- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
|
||||
- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
|
||||
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number`
|
||||
- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073)
|
||||
- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$`
|
||||
- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs)
|
||||
- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email`
|
||||
- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
|
||||
|
||||
And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars):
|
||||
|
||||
- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems)
|
||||
- [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
|
||||
- `uniqueItems`
|
||||
- `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
|
||||
- [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
|
||||
- [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
|
||||
- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
|
||||
|
||||
### A word about additionalProperties
|
||||
|
||||
> [!WARNING]
|
||||
> The JSON schemas spec states `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default.
|
||||
> Since this is slow and seems prone to hallucinations, we default to no additional properties.
|
||||
> You can set `"additionalProperties": true` in the the schema of any object to explicitly allow additional properties.
|
||||
|
||||
If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class:
|
||||
|
||||
```python
|
||||
# pip install pydantic
|
||||
import json
|
||||
from typing import Annotated, List
|
||||
from pydantic import BaseModel, Extra, Field
|
||||
class QAPair(BaseModel):
|
||||
class Config:
|
||||
extra = 'allow' # triggers additionalProperties: true in the JSON schema
|
||||
question: str
|
||||
concise_answer: str
|
||||
justification: str
|
||||
|
||||
class Summary(BaseModel):
|
||||
class Config:
|
||||
extra = 'allow'
|
||||
key_facts: List[Annotated[str, Field(pattern='- .{5,}')]]
|
||||
question_answers: List[Annotated[List[QAPair], Field(min_items=5)]]
|
||||
|
||||
print(json.dumps(Summary.model_json_schema(), indent=2))
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary>Show JSON schema & grammar</summary>
|
||||
|
||||
```json
|
||||
{
|
||||
"$defs": {
|
||||
"QAPair": {
|
||||
"additionalProperties": true,
|
||||
"properties": {
|
||||
"question": {
|
||||
"title": "Question",
|
||||
"type": "string"
|
||||
},
|
||||
"concise_answer": {
|
||||
"title": "Concise Answer",
|
||||
"type": "string"
|
||||
},
|
||||
"justification": {
|
||||
"title": "Justification",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"question",
|
||||
"concise_answer",
|
||||
"justification"
|
||||
],
|
||||
"title": "QAPair",
|
||||
"type": "object"
|
||||
}
|
||||
},
|
||||
"additionalProperties": true,
|
||||
"properties": {
|
||||
"key_facts": {
|
||||
"items": {
|
||||
"pattern": "^- .{5,}$",
|
||||
"type": "string"
|
||||
},
|
||||
"title": "Key Facts",
|
||||
"type": "array"
|
||||
},
|
||||
"question_answers": {
|
||||
"items": {
|
||||
"items": {
|
||||
"$ref": "#/$defs/QAPair"
|
||||
},
|
||||
"minItems": 5,
|
||||
"type": "array"
|
||||
},
|
||||
"title": "Question Answers",
|
||||
"type": "array"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"key_facts",
|
||||
"question_answers"
|
||||
],
|
||||
"title": "Summary",
|
||||
"type": "object"
|
||||
}
|
||||
```
|
||||
|
||||
```
|
||||
QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv ( "," space ( QAPair-additional-kv ( "," space QAPair-additional-kv )* ) )? "}" space
|
||||
QAPair-additional-k ::= ["] ( [c] ([o] ([n] ([c] ([i] ([s] ([e] ([_] ([a] ([n] ([s] ([w] ([e] ([r] char+ | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"e] char*) | [^"s] char*) | [^"i] char*) | [^"c] char*) | [^"n] char*) | [^"o] char*) | [j] ([u] ([s] ([t] ([i] ([f] ([i] ([c] ([a] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"a] char*) | [^"c] char*) | [^"i] char*) | [^"f] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"u] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"cjq] char* )? ["] space
|
||||
QAPair-additional-kv ::= QAPair-additional-k ":" space value
|
||||
QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string
|
||||
QAPair-justification-kv ::= "\"justification\"" space ":" space string
|
||||
QAPair-question-kv ::= "\"question\"" space ":" space string
|
||||
additional-k ::= ["] ( [k] ([e] ([y] ([_] ([f] ([a] ([c] ([t] ([s] char+ | [^"s] char*) | [^"t] char*) | [^"c] char*) | [^"a] char*) | [^"f] char*) | [^"_] char*) | [^"y] char*) | [^"e] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] ([_] ([a] ([n] ([s] ([w] ([e] ([r] ([s] char+ | [^"s] char*) | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"kq] char* )? ["] space
|
||||
additional-kv ::= additional-k ":" space value
|
||||
array ::= "[" space ( value ("," space value)* )? "]" space
|
||||
boolean ::= ("true" | "false") space
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
dot ::= [^\x0A\x0D]
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space
|
||||
key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space
|
||||
key-facts-item-1 ::= dot
|
||||
key-facts-kv ::= "\"key_facts\"" space ":" space key-facts
|
||||
null ::= "null" space
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
|
||||
question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space
|
||||
question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space
|
||||
question-answers-item-item ::= QAPair
|
||||
question-answers-kv ::= "\"question_answers\"" space ":" space question-answers
|
||||
root ::= "{" space key-facts-kv "," space question-answers-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space
|
||||
space ::= | " " | "\n" [ \t]{0,20}
|
||||
string ::= "\"" char* "\"" space
|
||||
value ::= object | array | string | number | boolean | null
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
If you're using [Zod](https://zod.dev/), you can make your objects to explicitly allow extra properties w/ `nonstrict()` / `passthrough()` (or explicitly no extra props w/ `z.object(...).strict()` or `z.strictObject(...)`) but note that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always sets `"additionalProperties": false` anyway.
|
||||
|
||||
```js
|
||||
import { z } from 'zod';
|
||||
import { zodToJsonSchema } from 'zod-to-json-schema';
|
||||
|
||||
const Foo = z.object({
|
||||
age: z.number().positive(),
|
||||
email: z.string().email(),
|
||||
}).strict();
|
||||
|
||||
console.log(zodToJsonSchema(Foo));
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary>Show JSON schema & grammar</summary>
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"age": {
|
||||
"type": "number",
|
||||
"exclusiveMinimum": 0
|
||||
},
|
||||
"email": {
|
||||
"type": "string",
|
||||
"format": "email"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"age",
|
||||
"email"
|
||||
],
|
||||
"additionalProperties": false,
|
||||
"$schema": "http://json-schema.org/draft-07/schema#"
|
||||
}
|
||||
```
|
||||
|
||||
```
|
||||
age-kv ::= "\"age\"" space ":" space number
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
email-kv ::= "\"email\"" space ":" space string
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
root ::= "{" space age-kv "," space email-kv "}" space
|
||||
space ::= | " " | "\n" [ \t]{0,20}
|
||||
string ::= "\"" char* "\"" space
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
@@ -89,6 +89,7 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
||||
LLAMA_VOCAB_PRE_TYPE_VIKING = 16,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS = 17,
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
@@ -484,6 +485,13 @@ extern "C" {
|
||||
// Get a llama model tensor
|
||||
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
|
||||
|
||||
// Returns true if the model contains an encoder that requires llama_encode() call
|
||||
LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
|
||||
|
||||
// For encoder-decoder models, this function returns id of the token that must be provided
|
||||
// to the decoder to start generating output sequence. For other models, it returns -1.
|
||||
LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
|
||||
|
||||
// Returns 0 on success
|
||||
LLAMA_API uint32_t llama_model_quantize(
|
||||
const char * fname_inp,
|
||||
@@ -769,6 +777,14 @@ extern "C" {
|
||||
// Frees a batch of tokens allocated with llama_batch_init()
|
||||
LLAMA_API void llama_batch_free(struct llama_batch batch);
|
||||
|
||||
// Processes a batch of tokens with the ecoder part of the encoder-decoder model.
|
||||
// Stores the encoder output internally for later use by the decoder cross-attention layers.
|
||||
// 0 - success
|
||||
// < 0 - error
|
||||
LLAMA_API int32_t llama_encode(
|
||||
struct llama_context * ctx,
|
||||
struct llama_batch batch);
|
||||
|
||||
// Positive return values does not mean a fatal error, but rather a warning.
|
||||
// 0 - success
|
||||
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
||||
|
||||
@@ -73,6 +73,8 @@ __ggml_vocab_test__
|
||||
__ggml_vocab_test__
|
||||
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
|
||||
__ggml_vocab_test__
|
||||
!!!!!!
|
||||
__ggml_vocab_test__
|
||||
3
|
||||
__ggml_vocab_test__
|
||||
33
|
||||
@@ -91,6 +93,10 @@ __ggml_vocab_test__
|
||||
__ggml_vocab_test__
|
||||
333333333
|
||||
__ggml_vocab_test__
|
||||
Cửa Việt
|
||||
__ggml_vocab_test__
|
||||
discards
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
1027
|
||||
1005 3690
|
||||
7592 1010 1061 1005 2035 999 2129 2024 2017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995
|
||||
999 999 999 999 999 999
|
||||
1017
|
||||
3943
|
||||
21211
|
||||
@@ -40,4 +41,6 @@
|
||||
21211 22394 22394
|
||||
21211 22394 22394 2509
|
||||
21211 22394 22394 22394
|
||||
12731 2050 19710
|
||||
5860 18117
|
||||
100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 100 1017 3943 21211 21211 2509 21211 22394 21211 22394 2509 21211 22394 22394 21211 22394 22394 2509 1017 1012 1017 1017 1012 1012 1017 1017 1012 1012 1012 1017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995 1011 1011 1011 1011 1011 1011 1027 1027 1027 1027 1027 1027 1027 1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325 1005 1005 1005 1005 1005 1005 1036 1036 1036 1036 1036 1036 1036 1000 1000 1000 1000 1012 1012 1012 1012 1012 1012 999 999 999 999 999 999 1029 1029 1029 1029 1029 1029 1045 1005 2310 2042 1005 2409 2002 1005 1055 2045 1010 1005 2128 2017 2469 1029 1005 1049 2025 2469 1045 1005 2222 2191 2009 1010 1005 1040 2017 2066 2070 5572 1029 2057 1005 2310 1037 1005 2222
|
||||
|
||||
@@ -73,6 +73,8 @@ __ggml_vocab_test__
|
||||
__ggml_vocab_test__
|
||||
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
|
||||
__ggml_vocab_test__
|
||||
!!!!!!
|
||||
__ggml_vocab_test__
|
||||
3
|
||||
__ggml_vocab_test__
|
||||
33
|
||||
@@ -91,6 +93,10 @@ __ggml_vocab_test__
|
||||
__ggml_vocab_test__
|
||||
333333333
|
||||
__ggml_vocab_test__
|
||||
Cửa Việt
|
||||
__ggml_vocab_test__
|
||||
discards
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user