turboquant: squash-merge TheTom/llama-cpp-turboquant feature/turboquant-kv-cache
Squashes the entire TurboQuant KV-cache feature branch from https://github.com/TheTom/llama-cpp-turboquant (tip 5aeb2fdbe) onto our master. Includes: TurboQuant KV-cache types (turbo2_0, turbo3_0, turbo4_0, tq3_1s, tq4_1s), GGML_OP_TURBO_WHT op, CUDA + Metal kernels (including TQ-rotated mul_mm path), CPU reference paths, HIP template instances, perplexity tooling, and 18 post-upstream-sync fixes (CVE-2026-21869 server clamp, HIP FA pool retention, n_head_v reshape, sparse-V CUDA gating, etc.). Conflict-resolution notes (review carefully before depending on these paths): - common/arg.cpp, common/speculative.cpp: master's refactored speculative API kept (params.speculative.types / ngram_mod struct, per-sinfo n_low/i_last). - ggml-cuda/fattn.cu: head-size exclusion lists unioned (now exclude both 192 and 640 alongside other sizes). - ggml-cuda/ggml-cuda.cu: both master's ADD/SUB/MUL/DIV F16 widening AND TurboQuant's GGML_OP_TURBO_WHT support cases kept. - ggml-metal-device.h/.cpp: master's new get_pipeline_mul_mv_ext signature (const ggml_tensor * op) kept; TurboQuant's get_pipeline_turbo_wht added. - ggml-metal-ops.cpp: TurboQuant's TQ-rotated mul_mm path preserved; non-TQ else-branch adapted to master's pipeline.nr0/nr1/nsg dispatch API. - ggml-vulkan.cpp: master's spec-constant-driven flash_attn pipeline iteration taken (over TurboQuant's CREATE_FA-per-type macro approach). TURBO3_0 added to the fa_kv_ok lambda for type validation. - ggml-vulkan/flash_attn_base.glsl, vulkan-shaders-gen.cpp: master's new spec-constant FA shader generation kept; TurboQuant's DATA_A_TURBO3_0 macro path NOT carried over. *** Vulkan TURBO3_0 flash-attention paths need re-implementation against the new spec-constant API. *** Vulkan TURBO3_0 inference will likely fail until that work is redone. Squash base: 7fc1c4ef78 (TheTom's last upstream merge point).
This commit is contained in:
@@ -16,9 +16,9 @@
|
||||
rocmPackages,
|
||||
vulkan-headers,
|
||||
vulkan-loader,
|
||||
spirv-headers,
|
||||
openssl,
|
||||
shaderc,
|
||||
spirv-headers,
|
||||
useBlas ?
|
||||
builtins.all (x: !x) [
|
||||
useCuda
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
github: [TheTom]
|
||||
@@ -0,0 +1,109 @@
|
||||
name: TurboQuant+ Release
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'tqp-v*'
|
||||
|
||||
env:
|
||||
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON"
|
||||
|
||||
jobs:
|
||||
macos-metal:
|
||||
runs-on: macos-14
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DCMAKE_INSTALL_RPATH='@loader_path' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Pack
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf turboquant-plus-${{ github.ref_name }}-macos-arm64-metal.tar.gz \
|
||||
-s ",./,turboquant-plus-${{ github.ref_name }}/," -C ./build/bin .
|
||||
|
||||
- name: Upload
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: macos-arm64-metal
|
||||
path: turboquant-plus-${{ github.ref_name }}-macos-arm64-metal.tar.gz
|
||||
|
||||
windows-cuda:
|
||||
runs-on: windows-2022
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
with:
|
||||
cuda_version: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install Ninja
|
||||
run: choco install ninja
|
||||
|
||||
- name: Build
|
||||
shell: cmd
|
||||
run: |
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DGGML_CUDA_FA_ALL_QUANTS=ON ^
|
||||
${{ env.CMAKE_ARGS }}
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS%
|
||||
|
||||
- name: Pack
|
||||
run: |
|
||||
cp LICENSE ./build/bin/Release/
|
||||
$dst='.\build\bin\Release\'
|
||||
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a turboquant-plus-${{ github.ref_name }}-windows-x64-cuda${{ matrix.cuda }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: windows-x64-cuda${{ matrix.cuda }}
|
||||
path: turboquant-plus-${{ github.ref_name }}-windows-x64-cuda${{ matrix.cuda }}.zip
|
||||
|
||||
release:
|
||||
needs: [macos-metal, windows-cuda]
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
steps:
|
||||
- name: Download artifacts
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
path: ./release
|
||||
merge-multiple: true
|
||||
|
||||
- name: Create Release
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
tag_name: ${{ github.ref_name }}
|
||||
name: TurboQuant+ ${{ github.ref_name }}
|
||||
files: ./release/*
|
||||
draft: false
|
||||
prerelease: false
|
||||
@@ -0,0 +1,362 @@
|
||||
=== SMEM M5 Benchmark: baseline ===
|
||||
Model: Qwen3.5-35B-A3B-Q8_0.gguf
|
||||
Date: Sat Mar 28 21:45:40 CDT 2026
|
||||
|
||||
--- turbo3 @ short ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x105cffcb0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x105cfeb30 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 6.440 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | tg128 | 78.47 ± 0.56 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo3 @ 8192 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1040cfae0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1040ce960 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.010 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | pp8192 | 2144.16 ± 30.18 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | tg128 | 78.90 ± 0.24 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo3 @ 16384 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x10500fc00 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x10500ea80 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.008 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | pp16384 | 1704.41 ± 21.63 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | tg128 | 78.64 ± 0.44 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo3 @ 32768 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x101c8fb00 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x101c8e980 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.013 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | pp32768 | 1238.85 ± 6.06 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | tg128 | 78.17 ± 0.69 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ short ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x103c17f70 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x103c16df0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.008 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 80.40 ± 0.72 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ 8192 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x103e57d30 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x103e56bb0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.010 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | pp8192 | 2048.90 ± 43.42 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 79.84 ± 0.95 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ 16384 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1060bf740 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1060be5c0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.009 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | pp16384 | 1605.18 ± 20.70 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 79.45 ± 1.55 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ 32768 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1040ef870 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1040ee6f0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.010 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | pp32768 | 1157.30 ± 8.01 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 80.64 ± 0.72 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- q8_0 @ short ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1055e78c0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1055e6740 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.008 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | q8_0 | q8_0 | 1 | tg128 | 85.48 ± 1.34 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- q8_0 @ 8192 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x105ac8540 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x105ac73c0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.010 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | q8_0 | q8_0 | 1 | pp8192 | 2106.47 ± 64.66 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | q8_0 | q8_0 | 1 | tg128 | 76.72 ± 2.13 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- q8_0 @ 16384 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x103fefa70 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x103fee8f0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.008 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | q8_0 | q8_0 | 1 | pp16384 | 1723.71 ± 28.56 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | q8_0 | q8_0 | 1 | tg128 | 78.09 ± 3.70 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- q8_0 @ 32768 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1035f7b10 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1035f6990 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.008 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | q8_0 | q8_0 | 1 | pp32768 | 1216.99 ± 28.64 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | q8_0 | q8_0 | 1 | tg128 | 86.83 ± 0.34 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
=== Done: baseline ===
|
||||
@@ -0,0 +1,413 @@
|
||||
=== SMEM M5 Benchmark: smem ===
|
||||
Model: Qwen3.5-35B-A3B-Q8_0.gguf
|
||||
Date: Sat Mar 28 22:02:19 CDT 2026
|
||||
|
||||
--- turbo3 @ short ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x104fbb670 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x104fbb5f0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 7.366 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | tg128 | 18.39 ± 0.76 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo3 @ 8192 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x101ee3e50 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x101ee3dd0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.009 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | pp16384 | 1337.26 ± 261.92 |
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | pp8192 | 1442.03 ± 393.22 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | tg128 | 40.38 ± 18.10 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo3 @ 32768 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x105a3f890 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x105a3e710 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: turbo3/4 SMEM pre-dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.010 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | tg128 | 58.20 ± 8.75 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo3 @ 16384 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x103d7b200 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x103d7b180 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.009 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | pp16384 | 792.76 ± 57.30 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | tg128 | 16.47 ± 1.39 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo3 @ 32768 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x104dc31e0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x104dc3160 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.009 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | pp32768 | 806.43 ± 177.53 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | tg128 | 16.19 ± 1.11 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ short ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x105ccfa30 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x105cce8b0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: turbo3/4 SMEM pre-dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.008 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 16.93 ± 0.97 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ 8192 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x10561bc80 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x10561ab00 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: turbo3/4 SMEM pre-dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.008 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | pp8192 | 942.18 ± 77.19 |
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | pp32768 | 941.24 ± 180.34 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 44.84 ± 18.74 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ 16384 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1038a3d70 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1038a2bf0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: turbo3/4 SMEM pre-dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.008 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo3 | turbo3 | 1 | tg128 | 61.97 ± 9.79 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ short ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x10170b580 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x10170b500 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.008 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 17.82 ± 0.64 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ 8192 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x103dab490 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x103dab410 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.009 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | pp16384 | 1187.08 ± 274.35 |
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | pp8192 | 1098.56 ± 217.82 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 50.13 ± 12.92 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ 32768 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x105f20300 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x105f1f180 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: turbo3/4 SMEM pre-dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.008 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 58.25 ± 4.07 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ 16384 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x10588f220 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x10588f1a0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.008 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | pp16384 | 755.20 ± 28.45 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 15.58 ± 1.31 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
--- turbo4 @ 32768 ---
|
||||
ggml_metal_device_init: testing tensor API for f16 support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x1018533e0 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_device_init: testing tensor API for bfloat support
|
||||
ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
|
||||
ggml_metal_library_compile_pipeline: loaded dummy_kernel 0x101853360 | th_max = 1024 | th_width = 32
|
||||
ggml_metal_library_init: using embedded metal library
|
||||
ggml_metal_library_init: turbo3 sparse V dequant enabled
|
||||
ggml_metal_library_init: loaded in 0.009 sec
|
||||
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
|
||||
ggml_metal_device_init: GPU name: MTL0
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyApple10 (1010)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
|
||||
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
|
||||
ggml_metal_device_init: simdgroup reduction = true
|
||||
ggml_metal_device_init: simdgroup matrix mul. = true
|
||||
ggml_metal_device_init: has unified memory = true
|
||||
ggml_metal_device_init: has bfloat = true
|
||||
ggml_metal_device_init: has tensor = true
|
||||
ggml_metal_device_init: use residency sets = true
|
||||
ggml_metal_device_init: use shared buffers = true
|
||||
ggml_metal_device_init: recommendedMaxWorkingSetSize = 115448.73 MB
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | pp32768 | 732.00 ± 172.10 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 16.29 ± 1.78 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
|
||||
SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
|
||||
SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
|
||||
SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
|
||||
=== Done: smem ===
|
||||
| model | size | params | backend | threads | type_k | type_v | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | pp32768 | 1018.88 ± 235.19 |
|
||||
| qwen35moe 35B.A3B Q8_0 | 34.36 GiB | 34.66 B | MTL,BLAS | 1 | turbo4 | turbo4 | 1 | tg128 | 81.62 ± 0.05 |
|
||||
|
||||
build: 13afec1 (178)
|
||||
|
||||
SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
|
||||
SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
|
||||
SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
|
||||
SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
|
||||
=== Done: smem ===
|
||||
@@ -406,6 +406,9 @@ const std::vector<ggml_type> kv_cache_types = {
|
||||
GGML_TYPE_IQ4_NL,
|
||||
GGML_TYPE_Q5_0,
|
||||
GGML_TYPE_Q5_1,
|
||||
GGML_TYPE_TURBO2_0,
|
||||
GGML_TYPE_TURBO3_0,
|
||||
GGML_TYPE_TURBO4_0,
|
||||
};
|
||||
|
||||
static ggml_type kv_cache_type_from_str(const std::string & s) {
|
||||
|
||||
@@ -44,6 +44,12 @@
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <io.h>
|
||||
#ifndef fileno
|
||||
#define fileno _fileno
|
||||
#endif
|
||||
#ifndef isatty
|
||||
#define isatty _isatty
|
||||
#endif
|
||||
#else
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
@@ -0,0 +1,95 @@
|
||||
# TurboQuant on AMD Instinct MI300X & MI355X (ROCm/HIP)
|
||||
|
||||
## Summary
|
||||
|
||||
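TurboQuant KV cache compression (turbo2/turbo3/turbo4) builds and runs correctly on AMD Instinct MI300X (gfx942) and MI355X (gfx950). MI300X requires zero code changes. MI355X requires adding CDNA4 (gfx950) architecture support: a `CDNA4` define in the HIP vendor header plus matching updates in `common.cuh` and `mma.cuh` (see "Code Changes for gfx950" below).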
|
||||
|
||||
## Test Environment
|
||||
|
||||
| Component | MI300X | MI355X |
|
||||
|-----------|--------|--------|
|
||||
| GPU | MI300X (gfx942), 192 GB HBM3 | MI355X (gfx950), 288 GB HBM3e |
|
||||
| ROCm | 7.0.2 | 7.0.1 |
|
||||
| Wave Size | 64 | 64 |
|
||||
| Build | `-DAMDGPU_TARGETS="gfx942"` | `-DAMDGPU_TARGETS="gfx950"` |
|
||||
| Model | Qwen2.5-1.5B Q4_K_M (1.04 GiB) | same |
|
||||
|
||||
## WHT Kernel Correctness
|
||||
|
||||
Standalone roundtrip test (forward WHT → inverse WHT) confirms the Walsh-Hadamard Transform kernel works correctly on HIP with 64-wide wavefronts:
|
||||
|
||||
```
|
||||
=== TurboQuant WHT Roundtrip Test (HIP/gfx942) ===
|
||||
Total elements: 512 (4 heads x 128 dim)
|
||||
Forward WHT zeros: 0 / 512
|
||||
Roundtrip max error: 2.980232e-07
|
||||
Roundtrip RMSE: 6.816018e-08
|
||||
Result: PASS ✅
|
||||
```
|
||||
|
||||
The kernel relies only on shared memory and `__syncthreads()` (no warp shuffles), so it does not depend on the warp width and works without modification on the 64-wide wavefronts of these GPUs.
|
||||
|
||||
## Performance Results
|
||||
|
||||
### MI300X (single GPU, Qwen2.5-1.5B Q4_K_M)
|
||||
|
||||
| KV Cache | pp512 (tok/s) | tg128 (tok/s) | Prefill vs f16 | Decode vs f16 |
|
||||
|----------|--------------|--------------|----------------|---------------|
|
||||
| f16 | 24,453 ± 230 | 181.2 ± 2.0 | baseline | baseline |
|
||||
| turbo3 | ~25,200 | ~160 | **+3%** | 88% |
|
||||
| turbo4 | 25,427 ± 17 | 161.1 ± 0.2 | **+4%** | 89% |
|
||||
|
||||
### MI355X (single GPU, Qwen2.5-1.5B Q4_K_M)
|
||||
|
||||
| KV Cache | pp512 (tok/s) | tg128 (tok/s) | Prefill vs f16 | Decode vs f16 |
|
||||
|----------|--------------|--------------|----------------|---------------|
|
||||
| f16+FA | 40,013 ± 902 | 254.5 ± 1.0 | baseline | baseline |
|
||||
| turbo3 | 39,140 ± 475 | 162.3 ± 0.1 | 98% | 64% |
|
||||
| turbo4 | 39,232 ± 508 | 214.1 ± 0.7 | 98% | **84%** |
|
||||
|
||||
### Key Observations
|
||||
|
||||
1. **MI300X prefill is faster with TurboQuant** (+3-4%) — less KV cache data to write to HBM.
|
||||
2. **MI300X decode at 88-89% of f16** — consistent with Apple Silicon community results.
|
||||
3. **MI355X turbo4 decode at 84%** — turbo4 outperforms turbo3 in decode due to simpler 4-bit dequant.
|
||||
4. **MI355X turbo3 decode at 64%** — the 3-bit codebook + sign extraction is more expensive on gfx950.
|
||||
5. **MI355X non-FA MMQ path crashes** (xf32 MFMA issue) — turbo types force FA and work correctly.
|
||||
|
||||
## Build Instructions
|
||||
|
||||
```bash
|
||||
git clone https://github.com/TheTom/llama-cpp-turboquant.git
|
||||
cd llama-cpp-turboquant
|
||||
git checkout feature/turboquant-kv-cache
|
||||
|
||||
# MI300X (gfx942) — works without code changes
|
||||
cmake -B build -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx942"
|
||||
cmake --build build --config Release -j
|
||||
|
||||
# MI355X (gfx950) — requires the CDNA4 define patch (see "Code Changes for gfx950" below)
|
||||
cmake -B build -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx950"
|
||||
cmake --build build --config Release -j
|
||||
|
||||
# Test
|
||||
HIP_VISIBLE_DEVICES=0 ./build/bin/llama-bench \
|
||||
-m model.gguf -ctk turbo3 -ctv turbo3 -ngl 99 -r 3 -p 512 -n 128
|
||||
```
|
||||
|
||||
## Code Changes for gfx950 (MI355X)
|
||||
|
||||
Three files are modified to add CDNA4 (gfx950) architecture support:
|
||||
|
||||
1. **`ggml/src/ggml-cuda/vendors/hip.h`** — Add `CDNA4` define for `__gfx950__`, include in `CDNA` family
|
||||
2. **`ggml/src/ggml-cuda/common.cuh`** — Add `GGML_CUDA_CC_CDNA4` constant and `GGML_CUDA_CC_IS_CDNA4` macro
|
||||
3. **`ggml/src/ggml-cuda/mma.cuh`** — Route CDNA4 to compatible MFMA instructions (bf16_1k, i32x16x32_i8, f32x16x4f32 — NOT xf32, which doesn't exist on gfx950)
|
||||
|
||||
## Known Limitations
|
||||
|
||||
- **MI355X non-FA MMQ crashes**: The default (non-flash-attention) matrix multiply path crashes on gfx950 because the xf32 MFMA instruction (`mfma_f32_16x16x8_xf32`) is not available there. TurboQuant types force flash attention and work correctly. Standard f16/q8_0 KV cache types need the `-fa 1` flag on MI355X.
|
||||
- **llama-cli text output**: Interactive mode produces empty tokens on ROCm (display issue), but `llama-bench` confirms computation is correct.
|
||||
|
||||
## Tested By
|
||||
|
||||
Andy Luo (@andyluo7)
|
||||
- AMD Instinct MI300X (gfx942), ROCm 7.0.2 — April 2026
|
||||
- AMD Instinct MI355X (gfx950), ROCm 7.0.1 — April 2026
|
||||
@@ -11,7 +11,7 @@ extern "C" {
|
||||
#define RPC_PROTO_PATCH_VERSION 0
|
||||
|
||||
#ifdef __cplusplus
|
||||
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
|
||||
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
|
||||
#endif
|
||||
|
||||
#define GGML_RPC_MAX_SERVERS 16
|
||||
|
||||
+18
-2
@@ -181,7 +181,7 @@
|
||||
# define GGML_API __declspec(dllimport) extern
|
||||
# endif
|
||||
# else
|
||||
# define GGML_API __attribute__ ((visibility ("default"))) extern
|
||||
# define GGML_API __attribute__ ((visibility ("default")))
|
||||
# endif
|
||||
#else
|
||||
# define GGML_API extern
|
||||
@@ -429,7 +429,12 @@ extern "C" {
|
||||
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
|
||||
GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
|
||||
GGML_TYPE_Q1_0 = 41,
|
||||
GGML_TYPE_COUNT = 42,
|
||||
GGML_TYPE_TURBO2_0 = 42, // TurboQuant 2-bit KV cache: WHT + 2-bit PolarQuant
|
||||
GGML_TYPE_TURBO3_0 = 43, // TurboQuant 3-bit KV cache: WHT + 3-bit PolarQuant
|
||||
GGML_TYPE_TURBO4_0 = 44, // TurboQuant 4-bit KV cache: WHT + 4-bit PolarQuant
|
||||
GGML_TYPE_TQ3_1S = 45, // TurboQuant 3-bit weight: WHT-rotated 8-level Lloyd-Max, block_size=32
|
||||
GGML_TYPE_TQ4_1S = 46, // TurboQuant 4-bit weight: WHT-rotated 16-level Lloyd-Max, block_size=32
|
||||
GGML_TYPE_COUNT = 47,
|
||||
};
|
||||
|
||||
// precision
|
||||
@@ -567,6 +572,7 @@ extern "C" {
|
||||
GGML_OP_RWKV_WKV7,
|
||||
GGML_OP_SOLVE_TRI,
|
||||
GGML_OP_GATED_DELTA_NET,
|
||||
GGML_OP_TURBO_WHT,
|
||||
|
||||
GGML_OP_UNARY,
|
||||
|
||||
@@ -2555,6 +2561,16 @@ extern "C" {
|
||||
struct ggml_tensor * beta,
|
||||
struct ggml_tensor * state);
|
||||
|
||||
// TurboQuant Walsh-Hadamard Transform (O(d log d) rotation for KV cache compression)
|
||||
// Applies WHT rotation to consecutive groups of group_size elements (64 or 128) along ne[0]: sign1 → butterfly → sign2 → normalize
|
||||
// direction: 0 = forward (signs1 → WHT → signs2), 1 = inverse (signs2 → WHT → signs1)
|
||||
GGML_API struct ggml_tensor * ggml_turbo_wht(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int direction,
|
||||
int group_size, // 0 = auto (64 or 128 from ne[0])
|
||||
struct ggml_tensor * scale); // NULL = no InnerQ scaling
|
||||
|
||||
// custom operators
|
||||
|
||||
typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
|
||||
|
||||
@@ -206,6 +206,7 @@ add_library(ggml-base
|
||||
ggml-threading.h
|
||||
ggml-quants.c
|
||||
ggml-quants.h
|
||||
ggml-turbo-quant.c
|
||||
gguf.cpp)
|
||||
|
||||
set_target_properties(ggml-base PROPERTIES
|
||||
|
||||
@@ -277,6 +277,97 @@ typedef struct {
|
||||
} block_tq2_0;
|
||||
static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
|
||||
|
||||
// TurboQuant 3-bit MSE-only: 3-bit PolarQuant indices (no QJL)
|
||||
// Storage block size = 128 (QK_TURBO3: one block per rotation group)
|
||||
// Transform group size = 128 (head_dim, for rotation Gaussianization)
|
||||
// Per block: norm(fp16) + 2-bit indices (32 bytes) + 1-bit extra (16 bytes) = 50 bytes per 128 values
|
||||
// = 3.125 bits/value → ~5.1× compression vs fp16
|
||||
// The 3-bit index is split: lower 2 bits in qs[], upper 1 bit in signs[]
|
||||
#define QK_TURBO3 128 // Block size 128: one block per rotation group, eliminates redundant norms
|
||||
#define QK_TURBO3_GROUP 128 // rotation group size = head_dim
|
||||
// Derived: FA template nl parameters (auto-scale with block size)
|
||||
#define NL_TURBO3 (QK_TURBO3 / 16) // non-vec FA iterations per block
|
||||
#define NL_TURBO3_VEC (QK_TURBO3 / 4) // vec FA iterations per block
|
||||
// TURBO3_0 storage block: one fp16 norm followed by QK_TURBO3 3-bit indices, each index
// split into its low 2 bits (qs) and its high bit (signs).
typedef struct {
|
||||
ggml_half norm; // 2 bytes: vector L2 norm (for rescaling)
|
||||
uint8_t qs[QK_TURBO3 / 4]; // QK_TURBO3/4 = 32 bytes: lower 2-bit indices (4 per byte)
|
||||
uint8_t signs[QK_TURBO3 / 8]; // QK_TURBO3/8 = 16 bytes: upper 1-bit of 3-bit index (8 per byte)
|
||||
} block_turbo3_0; // 2 + 32 + 16 = 50 bytes total with QK_TURBO3 == 128
|
||||
static_assert(sizeof(block_turbo3_0) == sizeof(ggml_half) + QK_TURBO3/4 + QK_TURBO3/8, "wrong turbo3_0 block size/padding");
|
||||
|
||||
// TurboQuant 4-bit: either 4-bit PolarQuant indices (default) or legacy 3-bit PolarQuant indices + 1-bit QJL signs
|
||||
// TURBO4_USE_4BIT: switch between 4-bit PolarQuant (new) and 3-bit+QJL (legacy)
|
||||
// Default: 4-bit on all backends (Metal + CUDA validated)
|
||||
#ifndef TURBO4_USE_4BIT
|
||||
# define TURBO4_USE_4BIT 1
|
||||
#endif
|
||||
|
||||
#define QK_TURBO4 128
|
||||
|
||||
#if TURBO4_USE_4BIT
|
||||
// 4-bit PolarQuant: 16 optimal centroids, nibble packed, no QJL
|
||||
// Per block: norm(fp16) + rnorm(fp16, reserved) + 4-bit indices (64 bytes)
|
||||
// = 68 bytes per 128 values = 4.25 bits/value → 3.8× compression vs fp16
|
||||
// TURBO4_0 storage block, 4-bit PolarQuant layout (selected when TURBO4_USE_4BIT is non-zero).
typedef struct {
|
||||
ggml_half norm; // 2 bytes
|
||||
ggml_half rnorm; // 2 bytes (reserved, unused in 4-bit mode)
|
||||
uint8_t qs[QK_TURBO4 / 2]; // QK_TURBO4/2 = 64 bytes: 4-bit PolarQuant indices (nibble packed, 2 per byte)
|
||||
} block_turbo4_0; // 2 + 2 + 64 = 68 bytes total
|
||||
static_assert(sizeof(block_turbo4_0) == 68, "wrong turbo4_0 block size");
|
||||
#else
|
||||
// Legacy 3-bit PolarQuant + 1-bit QJL (original paper design)
|
||||
// Per block: norm(fp16) + rnorm(fp16) + 3-bit indices (48 bytes) + 1-bit QJL signs (16 bytes)
|
||||
// = 68 bytes per 128 values = 4.25 bits/value → 3.8× compression vs fp16
|
||||
// TURBO4_0 storage block, legacy layout (selected when TURBO4_USE_4BIT is 0).
// Same total size as the 4-bit layout: the index array is smaller and a sign array is added.
typedef struct {
|
||||
ggml_half norm; // 2 bytes
|
||||
ggml_half rnorm; // 2 bytes: residual norm for QJL scale
|
||||
uint8_t qs[QK_TURBO4 * 3 / 8]; // QK_TURBO4*3/8 = 48 bytes: 3-bit PolarQuant indices
|
||||
uint8_t signs[QK_TURBO4 / 8]; // QK_TURBO4/8 = 16 bytes: 1-bit QJL signs
|
||||
} block_turbo4_0; // 2 + 2 + 48 + 16 = 68 bytes total
|
||||
static_assert(sizeof(block_turbo4_0) == 2*sizeof(ggml_half) + QK_TURBO4*3/8 + QK_TURBO4/8, "wrong turbo4_0 block size");
|
||||
#endif
|
||||
|
||||
static_assert(QK_TURBO4 == 128, "turbo4 kernels assume QK_TURBO4 == 128");
|
||||
|
||||
// TurboQuant 2-bit: 2-bit PolarQuant indices only (no QJL)
|
||||
// Per block: norm(fp16) + 2-bit indices (32 bytes) = 34 bytes per 128 values
|
||||
// = 2.125 bits/value → ~7.5× compression vs fp16
|
||||
// 4 centroids (Lloyd-Max for N(0, 1/128)): {-0.133462, -0.039994, 0.039994, 0.133462}
|
||||
#define QK_TURBO2 128 // Block size 128: one block per rotation group
|
||||
#define QK_TURBO2_GROUP 128 // rotation group size = head_dim
|
||||
// Derived: FA template nl parameters (auto-scale with block size)
|
||||
#define NL_TURBO2 (QK_TURBO2 / 16) // non-vec FA iterations per block
|
||||
#define NL_TURBO2_VEC (QK_TURBO2 / 4) // vec FA iterations per block
|
||||
// TURBO2_0 storage block: one fp16 norm followed by QK_TURBO2 2-bit indices.
typedef struct {
|
||||
ggml_half norm; // 2 bytes: corrected L2 norm
|
||||
uint8_t qs[QK_TURBO2 / 4]; // QK_TURBO2/4 = 32 bytes: 2-bit indices (4 per byte)
|
||||
} block_turbo2_0; // 2 + 32 = 34 bytes total with QK_TURBO2 == 128
|
||||
static_assert(sizeof(block_turbo2_0) == sizeof(ggml_half) + QK_TURBO2/4, "wrong turbo2_0 block size/padding");
|
||||
|
||||
// TQ3_1S: WHT-rotated 3-bit weight quantization (8-level Lloyd-Max for N(0,1))
|
||||
// Block size 32, dual half-block scales (d0 for [0..15], d1 for [16..31])
|
||||
// Per block: d0(fp16) + d1(fp16) + 3-bit indices packed (12 bytes) = 16 bytes per 32 values
|
||||
// = 4.0 bits/value
|
||||
#define QK_TQ3_0 32
|
||||
// TQ3_1S weight block: two fp16 half-block scales plus 32 packed 3-bit indices.
// Note: the element-count macro for this type is named QK_TQ3_0 (value 32).
typedef struct {
|
||||
ggml_half d0; // 2 bytes: scale for first 16 elements
|
||||
ggml_half d1; // 2 bytes: scale for last 16 elements
|
||||
uint8_t qs[QK_TQ3_0 * 3 / 8]; // 12 bytes: 3-bit indices packed (4 groups of 8 indices, 3 bytes per group)
|
||||
} block_tq3_1s; // 2 + 2 + 12 = 16 bytes total
|
||||
static_assert(sizeof(block_tq3_1s) == 16, "wrong tq3_1s block size");
|
||||
|
||||
// TQ4_1S: WHT-rotated 4-bit weight quantization (16-level Lloyd-Max for N(0,1))
|
||||
// Block size 32, dual half-block scales (d0 for [0..15], d1 for [16..31])
|
||||
// Per block: d0(fp16) + d1(fp16) + 4-bit indices packed (16 bytes) = 20 bytes per 32 values
|
||||
// = 5.0 bits/value
|
||||
#define QK_TQ4_1S 32
|
||||
// TQ4_1S weight block: two fp16 half-block scales plus 32 nibble-packed 4-bit indices.
typedef struct {
|
||||
ggml_half d0; // 2 bytes: scale for first 16 elements
|
||||
ggml_half d1; // 2 bytes: scale for last 16 elements
|
||||
uint8_t qs[QK_TQ4_1S / 2]; // QK_TQ4_1S/2 = 16 bytes: 4-bit indices nibble-packed (2 per byte)
|
||||
} block_tq4_1s; // 2 + 2 + 16 = 20 bytes total
|
||||
static_assert(sizeof(block_tq4_1s) == 20, "wrong tq4_1s block size");
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
//
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "quants.h"
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-threading.h"
|
||||
#include "unary-ops.h"
|
||||
#include "binary-ops.h"
|
||||
@@ -208,6 +209,23 @@ typedef pthread_t ggml_thread_t;
|
||||
#include <TargetConditionals.h>
|
||||
#endif
|
||||
|
||||
// Forward declarations — defined below, after utility functions
|
||||
static void ggml_vec_dot_turbo3_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
const void * GGML_RESTRICT vx, size_t bx,
|
||||
const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
static void ggml_vec_dot_turbo2_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
const void * GGML_RESTRICT vx, size_t bx,
|
||||
const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
static void ggml_vec_dot_turbo4_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
const void * GGML_RESTRICT vx, size_t bx,
|
||||
const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
static void ggml_vec_dot_tq3_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
const void * GGML_RESTRICT vx, size_t bx,
|
||||
const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
static void ggml_vec_dot_tq4_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
const void * GGML_RESTRICT vx, size_t bx,
|
||||
const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
[GGML_TYPE_F32] = {
|
||||
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp32,
|
||||
@@ -403,6 +421,36 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
[GGML_TYPE_I32] = {
|
||||
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
|
||||
},
|
||||
[GGML_TYPE_TURBO3_0] = {
|
||||
.from_float = (ggml_from_float_t) quantize_row_turbo3_0_ref,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_turbo3_0_f32,
|
||||
.vec_dot_type = GGML_TYPE_F32,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_TURBO2_0] = {
|
||||
.from_float = (ggml_from_float_t) quantize_row_turbo2_0_ref,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_turbo2_0_f32,
|
||||
.vec_dot_type = GGML_TYPE_F32,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_TURBO4_0] = {
|
||||
.from_float = (ggml_from_float_t) quantize_row_turbo4_0_ref,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_turbo4_0_f32,
|
||||
.vec_dot_type = GGML_TYPE_F32,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_TQ3_1S] = {
|
||||
.from_float = (ggml_from_float_t) quantize_row_tq3_1s_ref,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_tq3_1s_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_TQ4_1S] = {
|
||||
.from_float = (ggml_from_float_t) quantize_row_tq4_1s_ref,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_tq4_1s_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
},
|
||||
};
|
||||
|
||||
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
|
||||
@@ -2047,6 +2095,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_gated_delta_net(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_TURBO_WHT:
|
||||
{
|
||||
ggml_compute_forward_turbo_wht(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_MAP_CUSTOM1:
|
||||
{
|
||||
ggml_compute_forward_map_custom1(params, tensor);
|
||||
@@ -2227,6 +2279,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
case GGML_OP_GATED_DELTA_NET:
|
||||
case GGML_OP_TURBO_WHT:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
@@ -2947,6 +3000,10 @@ struct ggml_cplan ggml_graph_plan(
|
||||
const int64_t per_thread = S_v + (K > 1 ? S_v * S_v : 0);
|
||||
cur = per_thread * sizeof(float) * n_tasks;
|
||||
} break;
|
||||
case GGML_OP_TURBO_WHT:
|
||||
{
|
||||
cur = 0; // no extra workspace needed
|
||||
} break;
|
||||
case GGML_OP_COUNT:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
@@ -3385,6 +3442,119 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
// TurboQuant3 vec_dot: dequantize turbo3 block to f32, then dot with f32 operand.
|
||||
// Used by CPU flash attention for models with D not supported by CUDA FA (e.g. D=192).
|
||||
static void ggml_vec_dot_turbo3_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
                                      const void * GGML_RESTRICT vx, size_t bx,
                                      const void * GGML_RESTRICT vy, size_t by, int nrc) {
    // Single-row dot product only; the stride arguments are not consulted.
    GGML_ASSERT(nrc == 1);
    GGML_UNUSED(bs);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(nrc);

    // Expand the quantized operand (vx) into a heap scratch row of n floats.
    float * tmp = (float *)malloc(n * sizeof(float));
    GGML_ASSERT(tmp != NULL);
    ggml_get_type_traits(GGML_TYPE_TURBO3_0)->to_float(vx, tmp, n);

    // Accumulate tmp[k] * rhs[k] in ascending index order.
    const float * rhs = (const float *)vy;
    float acc = 0.0f;
    int k = 0;
    while (k < n) {
        acc += tmp[k] * rhs[k];
        ++k;
    }

    free(tmp);
    *s = acc;
}
|
||||
|
||||
// TurboQuant2 vec_dot: dequantize turbo2 block to f32, then dot with f32 operand.
|
||||
static void ggml_vec_dot_turbo2_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
                                      const void * GGML_RESTRICT vx, size_t bx,
                                      const void * GGML_RESTRICT vy, size_t by, int nrc) {
    // Single-row dot product only; the stride arguments are not consulted.
    GGML_ASSERT(nrc == 1);
    GGML_UNUSED(bs);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(nrc);

    // Expand the quantized operand (vx) into a heap scratch row of n floats.
    float * tmp = (float *)malloc(n * sizeof(float));
    GGML_ASSERT(tmp != NULL);
    ggml_get_type_traits(GGML_TYPE_TURBO2_0)->to_float(vx, tmp, n);

    // Accumulate tmp[k] * rhs[k] in ascending index order.
    const float * rhs = (const float *)vy;
    float acc = 0.0f;
    int k = 0;
    while (k < n) {
        acc += tmp[k] * rhs[k];
        ++k;
    }

    free(tmp);
    *s = acc;
}
|
||||
|
||||
// TurboQuant4 vec_dot: dequantize turbo4 block to f32, then dot with f32 operand.
|
||||
static void ggml_vec_dot_turbo4_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
                                      const void * GGML_RESTRICT vx, size_t bx,
                                      const void * GGML_RESTRICT vy, size_t by, int nrc) {
    // Single-row dot product only; the stride arguments are not consulted.
    GGML_ASSERT(nrc == 1);
    GGML_UNUSED(bs);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(nrc);

    // Expand the quantized operand (vx) into a heap scratch row of n floats.
    float * tmp = (float *)malloc(n * sizeof(float));
    GGML_ASSERT(tmp != NULL);
    ggml_get_type_traits(GGML_TYPE_TURBO4_0)->to_float(vx, tmp, n);

    // Accumulate tmp[k] * rhs[k] in ascending index order.
    const float * rhs = (const float *)vy;
    float acc = 0.0f;
    int k = 0;
    while (k < n) {
        acc += tmp[k] * rhs[k];
        ++k;
    }

    free(tmp);
    *s = acc;
}
|
||||
|
||||
// TQ3_1S vec_dot: dequantize tq3_1s block to f32, then dot with q8_0.
|
||||
// TODO: optimize with SIMD intrinsics for ARM NEON / AVX2
|
||||
static void ggml_vec_dot_tq3_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
                                     const void * GGML_RESTRICT vx, size_t bx,
                                     const void * GGML_RESTRICT vy, size_t by, int nrc) {
    // Single-row dot product only; the stride arguments are not consulted.
    GGML_ASSERT(nrc == 1);
    GGML_UNUSED(bs);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(nrc);

    // Left operand: TQ3_1S row expanded to n floats.
    float * tmp = (float *)malloc(n * sizeof(float));
    GGML_ASSERT(tmp != NULL);
    ggml_get_type_traits(GGML_TYPE_TQ3_1S)->to_float(vx, tmp, n);

    // Right operand: Q8_0 row expanded to n floats.
    float * tmp2 = (float *)malloc(n * sizeof(float));
    GGML_ASSERT(tmp2 != NULL);
    ggml_get_type_traits(GGML_TYPE_Q8_0)->to_float(vy, tmp2, n);

    // Accumulate the element-wise products in ascending index order.
    float acc = 0.0f;
    int k = 0;
    while (k < n) {
        acc += tmp[k] * tmp2[k];
        ++k;
    }

    free(tmp);
    free(tmp2);
    *s = acc;
}
|
||||
|
||||
// TQ4_1S vec_dot: dequantize tq4_1s block to f32, then dot with q8_0.
|
||||
// TODO: optimize with SIMD intrinsics
|
||||
static void ggml_vec_dot_tq4_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
                                     const void * GGML_RESTRICT vx, size_t bx,
                                     const void * GGML_RESTRICT vy, size_t by, int nrc) {
    // Single-row dot product only; the stride arguments are not consulted.
    GGML_ASSERT(nrc == 1);
    GGML_UNUSED(bs);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(nrc);

    // Left operand: TQ4_1S row expanded to n floats.
    float * tmp = (float *)malloc(n * sizeof(float));
    GGML_ASSERT(tmp != NULL);
    ggml_get_type_traits(GGML_TYPE_TQ4_1S)->to_float(vx, tmp, n);

    // Right operand: Q8_0 row expanded to n floats.
    float * tmp2 = (float *)malloc(n * sizeof(float));
    GGML_ASSERT(tmp2 != NULL);
    ggml_get_type_traits(GGML_TYPE_Q8_0)->to_float(vy, tmp2, n);

    // Accumulate the element-wise products in ascending index order.
    float acc = 0.0f;
    int k = 0;
    while (k < n) {
        acc += tmp[k] * tmp2[k];
        ++k;
    }

    free(tmp);
    free(tmp2);
    *s = acc;
}
|
||||
|
||||
void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
    // f32 -> f32 "conversion" is a raw copy of n floats from x into y.
    const size_t n_bytes = n * sizeof(float);
    memcpy(y, x, n_bytes);
}
|
||||
|
||||
@@ -12,6 +12,10 @@
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
|
||||
extern "C" {
|
||||
// Global used to hand the WHT group size to the turbo quantize functions; it is assigned in
// ggml_compute_forward_set_rows_f32 when the destination type is TURBO2_0/TURBO3_0/TURBO4_0.
// NOTE(review): if GGML_API expands to a bare visibility attribute (no `extern`), this line
// defines the variable in this translation unit instead of declaring it - confirm the symbol
// has exactly one definition across the ggml libraries.
GGML_API int turbo3_cpu_wht_group_size;
|
||||
}
|
||||
|
||||
// ggml_compute_forward_dup
|
||||
|
||||
static void ggml_compute_forward_dup_same_cont(
|
||||
@@ -679,6 +683,8 @@ void ggml_compute_forward_add(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_TQ1_0:
|
||||
case GGML_TYPE_TQ2_0:
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
@@ -1130,6 +1136,8 @@ void ggml_compute_forward_add1(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_TQ1_0:
|
||||
case GGML_TYPE_TQ2_0:
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
@@ -1260,6 +1268,8 @@ void ggml_compute_forward_acc(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_TQ1_0:
|
||||
case GGML_TYPE_TQ2_0:
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
@@ -4395,6 +4405,8 @@ void ggml_compute_forward_out_prod(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_TQ1_0:
|
||||
case GGML_TYPE_TQ2_0:
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
@@ -4672,6 +4684,8 @@ void ggml_compute_forward_set(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_TQ1_0:
|
||||
case GGML_TYPE_TQ2_0:
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
@@ -4896,6 +4910,8 @@ void ggml_compute_forward_get_rows(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_TQ1_0:
|
||||
case GGML_TYPE_TQ2_0:
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
@@ -4978,6 +4994,13 @@ static void ggml_compute_forward_set_rows_f32(
|
||||
|
||||
ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;
|
||||
|
||||
// For turbo types: communicate WHT group size to the quantize function via global
|
||||
if (dst->type == GGML_TYPE_TURBO3_0 || dst->type == GGML_TYPE_TURBO4_0 || dst->type == GGML_TYPE_TURBO2_0) {
|
||||
int gs = 0;
|
||||
memcpy(&gs, dst->op_params, sizeof(int));
|
||||
turbo3_cpu_wht_group_size = (gs == 64 || gs == 128) ? gs : 0;
|
||||
}
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; ++i03) {
|
||||
for (int64_t i02 = 0; i02 < ne02; ++i02) {
|
||||
for (int64_t i = ir0; i < ir1; ++i) {
|
||||
@@ -5622,6 +5645,11 @@ void ggml_compute_forward_clamp(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_TQ1_0:
|
||||
case GGML_TYPE_TQ2_0:
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
case GGML_TYPE_TURBO2_0:
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
case GGML_TYPE_TURBO4_0:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
@@ -10686,6 +10714,104 @@ void ggml_compute_forward_gated_delta_net(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_turbo_wht
|
||||
|
||||
// WHT sign arrays (must match Metal shader turbo_wht_signs1/2)
|
||||
static const float turbo_wht_s1[128] = {-1,1,1,-1,-1,1,-1,1,-1,-1,1,1,1,1,1,1,1,-1,1,-1,1,-1,-1,1,1,1,-1,1,1,-1,-1,-1,-1,1,1,-1,1,1,-1,1,-1,1,1,-1,-1,1,-1,1,1,1,1,-1,-1,-1,-1,-1,1,-1,1,1,1,1,-1,1,-1,-1,1,-1,-1,-1,1,-1,-1,-1,1,-1,-1,-1,1,1,1,-1,-1,1,1,1,-1,-1,1,1,-1,1,1,-1,1,-1,-1,1,1,-1,1,-1,1,-1,1,1,1,1,-1,1,-1,1,1,-1,1,1,-1,-1,-1,-1,-1,1,1,-1,1,1,-1,1};
|
||||
static const float turbo_wht_s2[128] = {1,1,1,1,-1,1,1,-1,1,-1,-1,-1,1,-1,-1,-1,1,1,-1,-1,1,-1,1,-1,1,-1,-1,1,-1,1,1,1,1,1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,1,1,1,-1,1,-1,1,1,1,-1,-1,1,-1,-1,-1,-1,-1,-1,1,1,1,-1,1,-1,-1,-1,-1,1,-1,1,-1,1,-1,-1,1,1,-1,1,-1,1,1,-1,1,-1,-1,-1,-1,1,-1,-1,1,-1,1,-1,1,1,1,-1,-1,1,-1,1,-1,1,1,-1,-1,1,-1,1,-1,1,1,-1,1,-1,1,-1,-1,-1,-1,-1,1,-1};
|
||||
|
||||
static void ggml_compute_forward_turbo_wht_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
const ggml_tensor * src = dst->src[0];
|
||||
const ggml_tensor * scale_tensor = dst->src[1]; // InnerQ scale_inv (may be NULL)
|
||||
const float * src_data = (const float *) src->data;
|
||||
float * dst_data = (float *) dst->data;
|
||||
const float * scale_inv = scale_tensor ? (const float *) scale_tensor->data : NULL;
|
||||
|
||||
int direction;
|
||||
int group_size;
|
||||
memcpy(&direction, dst->op_params + 0, sizeof(int));
|
||||
memcpy(&group_size, dst->op_params + sizeof(int), sizeof(int));
|
||||
|
||||
const int64_t head_dim = src->ne[0];
|
||||
const int64_t n_heads = ggml_nelements(src) / head_dim;
|
||||
const int64_t groups_per_head = head_dim / group_size;
|
||||
const int tail_size = (int)(head_dim % group_size);
|
||||
const int64_t n_groups = groups_per_head * n_heads;
|
||||
|
||||
const float inv_sqrt = 1.0f / sqrtf((float)group_size);
|
||||
|
||||
// Parallel over groups
|
||||
const int64_t ith = params->ith;
|
||||
const int64_t nth = params->nth;
|
||||
const int64_t grp_start = (n_groups * ith) / nth;
|
||||
const int64_t grp_end = (n_groups * (ith + 1)) / nth;
|
||||
|
||||
// Select sign arrays: for 64-group, use first 64 elements of the 128-element arrays
|
||||
const float * s_first = (direction == 0) ? turbo_wht_s1 : turbo_wht_s2;
|
||||
const float * s_second = (direction == 0) ? turbo_wht_s2 : turbo_wht_s1;
|
||||
|
||||
for (int64_t g = grp_start; g < grp_end; g++) {
|
||||
const int64_t head_idx = g / groups_per_head;
|
||||
const int64_t grp_in_head = g % groups_per_head;
|
||||
const int64_t base = head_idx * head_dim + grp_in_head * group_size;
|
||||
|
||||
float x[128]; // max group_size
|
||||
const float * in = src_data + base;
|
||||
|
||||
// InnerQ forward: apply scale_inv BEFORE signs+WHT (for Q pre-rotation)
|
||||
if (direction == 0 && scale_inv != NULL) {
|
||||
for (int i = 0; i < group_size; i++) x[i] = in[i] * scale_inv[i % group_size];
|
||||
} else {
|
||||
for (int i = 0; i < group_size; i++) x[i] = in[i];
|
||||
}
|
||||
|
||||
// Apply first signs
|
||||
for (int i = 0; i < group_size; i++) x[i] *= s_first[i];
|
||||
|
||||
// WHT butterfly (log2(group_size) stages)
|
||||
for (int h = 1; h < group_size; h *= 2) {
|
||||
for (int i = 0; i < group_size; i += h * 2) {
|
||||
for (int j = i; j < i + h; j++) {
|
||||
float a = x[j], b = x[j + h];
|
||||
x[j] = a + b;
|
||||
x[j + h] = a - b;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize + second signs
|
||||
float * out = dst_data + base;
|
||||
for (int i = 0; i < group_size; i++) {
|
||||
float val = x[i] * inv_sqrt * s_second[i];
|
||||
// InnerQ inverse: apply scale_inv AFTER WHT+signs (for V un-rotation)
|
||||
if (direction == 1 && scale_inv != NULL) {
|
||||
val *= scale_inv[i % group_size];
|
||||
}
|
||||
out[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy tail elements unchanged (identity pass-through)
|
||||
if (tail_size > 0 && ith == 0) {
|
||||
const int64_t tail_offset = groups_per_head * group_size;
|
||||
for (int64_t h = 0; h < n_heads; h++) {
|
||||
const int64_t base = h * head_dim + tail_offset;
|
||||
memcpy(dst_data + base, src_data + base, tail_size * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_turbo_wht(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
switch (dst->src[0]->type) {
|
||||
case GGML_TYPE_F32: ggml_compute_forward_turbo_wht_f32(params, dst); break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_rwkv_wkv7
|
||||
|
||||
static void ggml_compute_forward_rwkv_wkv7_f32(
|
||||
|
||||
@@ -104,6 +104,7 @@ void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, s
|
||||
void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_turbo_wht(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
@@ -120,7 +120,28 @@ if (CUDAToolkit_FOUND)
|
||||
template-instances/fattn-vec-instance-f16-f16.cu
|
||||
template-instances/fattn-vec-instance-q4_0-q4_0.cu
|
||||
template-instances/fattn-vec-instance-q8_0-q8_0.cu
|
||||
template-instances/fattn-vec-instance-bf16-bf16.cu)
|
||||
template-instances/fattn-vec-instance-bf16-bf16.cu
|
||||
template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu
|
||||
template-instances/fattn-vec-instance-turbo3_0-q8_0.cu
|
||||
template-instances/fattn-vec-instance-q8_0-turbo3_0.cu
|
||||
template-instances/fattn-vec-instance-f16-turbo3_0.cu
|
||||
template-instances/fattn-vec-instance-turbo3_0-f16.cu
|
||||
template-instances/fattn-vec-instance-turbo2_0-turbo2_0.cu
|
||||
template-instances/fattn-vec-instance-turbo2_0-q8_0.cu
|
||||
template-instances/fattn-vec-instance-q8_0-turbo2_0.cu
|
||||
template-instances/fattn-vec-instance-f16-turbo2_0.cu
|
||||
template-instances/fattn-vec-instance-turbo2_0-f16.cu
|
||||
template-instances/fattn-vec-instance-turbo3_0-turbo2_0.cu
|
||||
template-instances/fattn-vec-instance-turbo2_0-turbo3_0.cu
|
||||
template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu
|
||||
template-instances/fattn-vec-instance-turbo4_0-q8_0.cu
|
||||
template-instances/fattn-vec-instance-q8_0-turbo4_0.cu
|
||||
template-instances/fattn-vec-instance-f16-turbo4_0.cu
|
||||
template-instances/fattn-vec-instance-turbo4_0-f16.cu
|
||||
template-instances/fattn-vec-instance-turbo4_0-turbo3_0.cu
|
||||
template-instances/fattn-vec-instance-turbo3_0-turbo4_0.cu
|
||||
template-instances/fattn-vec-instance-turbo4_0-turbo2_0.cu
|
||||
template-instances/fattn-vec-instance-turbo2_0-turbo4_0.cu)
|
||||
endif()
|
||||
|
||||
ggml_add_backend_library(ggml-cuda
|
||||
|
||||
@@ -804,7 +804,7 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
|
||||
static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
|
||||
#if defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
|
||||
// ROCm does not support fp8 in software on devices with fp8 hardware,
|
||||
// but CDNA3 supports only e4m3_fnuz (no inf).
|
||||
// but CDNA3 supports only e4m3_fnuz (no inf). CDNA4 (gfx950) uses standard e4m3fn.
|
||||
const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
|
||||
const __hip_fp8_e4m3_fnuz xf = *reinterpret_cast<const __hip_fp8_e4m3_fnuz *>(&bits);
|
||||
return static_cast<float>(xf) / 2;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "convert.cuh"
|
||||
#include "dequantize.cuh"
|
||||
#include "turbo-quant.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
@@ -502,6 +503,50 @@ static void dequantize_block_cont_cuda(const void * __restrict__ vx, dst_t * __r
|
||||
dequantize_block_cuda<qk, qr, dequantize_kernel, dst_t>(vx, y, k, 1, 1, 1, k/qk, k/qk, k/qk, stream);
|
||||
}
|
||||
|
||||
// Fast warp-cooperative TQ4_1S dequant: one warp per 32-element block.
|
||||
// WHT via __shfl_xor_sync — 16× less compute than the per-element generic template.
|
||||
template <typename dst_t>
static __global__ void k_dequantize_tq4_1s_warp(
        const block_tq4_1s * __restrict__ vx, dst_t * __restrict__ y,
        const int64_t n_elements) {
    // threadIdx.x selects the element inside a 32-value quantized block,
    // threadIdx.y selects which of the blockDim.y quantized blocks of this thread block.
    const int     lane    = threadIdx.x;
    const int64_t ib      = (int64_t)blockIdx.x * blockDim.y + threadIdx.y;
    const int64_t out_idx = ib * 32 + lane;
    if (out_idx >= n_elements) {
        return;
    }

    const block_tq4_1s * blk = vx + ib;

    // Two 4-bit indices per byte: even lanes take the low nibble, odd lanes the high nibble.
    const int     shift = (lane & 1) * 4;
    const uint8_t q     = (blk->qs[lane / 2] >> shift) & 0xF;

    // d0 scales lanes 0..15, d1 scales lanes 16..31.
    const float scale = (lane < 16) ? __half2float(blk->d0) : __half2float(blk->d1);

    float v = TQ4_CENTROIDS_WEIGHT[q] * scale;

    // 5-stage butterfly across the 32 lanes, exchanging values with the lane whose index
    // differs in one bit.
#pragma unroll
    for (int stride = 1; stride < 32; stride <<= 1) {
        const float other = __shfl_xor_sync(0xffffffff, v, stride);
        v = (lane & stride) ? (other - v) : (v + other);
    }

    // 0.17677669529663688f is 1/sqrt(32); combined with the per-lane sign.
    v *= 0.17677669529663688f * TQ_WEIGHT_SIGNS[lane];

    y[out_idx] = (dst_t)v;
}
|
||||
|
||||
template <typename dst_t>
static void dequantize_tq4_1s_warp_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    // The kernel consumes whole 32-element quantized blocks only.
    GGML_ASSERT(k % 32 == 0);

    const int     warps_per_block = 4;
    const int64_t n_blocks        = k / 32;

    // ceil(n_blocks / warps_per_block) thread blocks, each 32 x warps_per_block threads
    const dim3 grid_dims((n_blocks + warps_per_block - 1) / warps_per_block);
    const dim3 block_dims(32, warps_per_block);

    const block_tq4_1s * src = (const block_tq4_1s *)vx;
    k_dequantize_tq4_1s_warp<<<grid_dims, block_dims, 0, stream>>>(src, y, k);
}
|
||||
|
||||
// Non-contiguous version for general tensor layouts
|
||||
template <typename dst_t>
static void dequantize_tq4_1s_warp_nc_cuda(
        const void * vx, dst_t * y,
        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
        const int64_t s01, const int64_t s02, const int64_t s03,
        cudaStream_t stream) {
    // Strided layouts are not handled by the warp kernel: forward every argument to the
    // generic per-element dequantizer instantiated for TQ4_1S.
    dequantize_block_cuda<QK_TQ4_1S, QR_TQ4_1S, dequantize_tq4_1s, dst_t>(
        vx, y,
        ne00, ne01, ne02, ne03,
        s01, s02, s03,
        stream);
}
|
||||
|
||||
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
|
||||
if (k % CUDA_Q8_0_NE_ALIGN == 0) {
|
||||
@@ -758,6 +803,16 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
return dequantize_row_mxfp4_cuda;
|
||||
case GGML_TYPE_NVFP4:
|
||||
return dequantize_row_nvfp4_cuda;
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
return dequantize_block_cont_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
|
||||
case GGML_TYPE_TURBO2_0:
|
||||
return dequantize_block_cont_cuda<QK_TURBO2, QR_TURBO2, dequantize_turbo2_0>;
|
||||
case GGML_TYPE_TURBO4_0:
|
||||
return dequantize_block_cont_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
return dequantize_tq4_1s_warp_cuda<half>; // fast warp-cooperative WHT
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
return dequantize_block_cont_cuda<QK_TQ3_0, QR_TQ3_1S, dequantize_tq3_1s>;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cont_cuda<float>;
|
||||
case GGML_TYPE_BF16:
|
||||
@@ -813,6 +868,16 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
return dequantize_row_mxfp4_cuda;
|
||||
case GGML_TYPE_NVFP4:
|
||||
return dequantize_row_nvfp4_cuda;
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
return dequantize_block_cont_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
|
||||
case GGML_TYPE_TURBO2_0:
|
||||
return dequantize_block_cont_cuda<QK_TURBO2, QR_TURBO2, dequantize_turbo2_0>;
|
||||
case GGML_TYPE_TURBO4_0:
|
||||
return dequantize_block_cont_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
return dequantize_tq4_1s_warp_cuda<float>; // fast warp-cooperative WHT
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
return dequantize_block_cont_cuda<QK_TQ3_0, QR_TQ3_1S, dequantize_tq3_1s>;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cont_cuda<half>;
|
||||
case GGML_TYPE_BF16:
|
||||
@@ -838,6 +903,16 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
return dequantize_block_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
|
||||
case GGML_TYPE_TURBO2_0:
|
||||
return dequantize_block_cuda<QK_TURBO2, QR_TURBO2, dequantize_turbo2_0>;
|
||||
case GGML_TYPE_TURBO4_0:
|
||||
return dequantize_block_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
return dequantize_block_cuda<QK_TQ4_1S, QR_TQ4_1S, dequantize_tq4_1s>;
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
return dequantize_block_cuda<QK_TQ3_0, QR_TQ3_1S, dequantize_tq3_1s>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16>;
|
||||
default:
|
||||
@@ -884,6 +959,16 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
return dequantize_block_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
|
||||
case GGML_TYPE_TURBO2_0:
|
||||
return dequantize_block_cuda<QK_TURBO2, QR_TURBO2, dequantize_turbo2_0>;
|
||||
case GGML_TYPE_TURBO4_0:
|
||||
return dequantize_block_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
return dequantize_block_cuda<QK_TQ4_1S, QR_TQ4_1S, dequantize_tq4_1s>;
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
return dequantize_block_cuda<QK_TQ3_0, QR_TQ3_1S, dequantize_tq3_1s>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16, float>;
|
||||
default:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "common.cuh"
|
||||
#include "turbo-quant.cuh"
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q1_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
|
||||
const block_q1_0 * x = (const block_q1_0 *) vx;
|
||||
@@ -97,3 +98,104 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
|
||||
v.x *= d;
|
||||
v.y *= d;
|
||||
}
|
||||
|
||||
// Turbo4: 4-bit PolarQuant (nibble packed), block size 128
|
||||
// iqs is the element index within the block (even), produces elements iqs and iqs+1
|
||||
static __device__ __forceinline__ void dequantize_turbo4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
|
||||
const block_turbo4_0 * x = (const block_turbo4_0 *) vx;
|
||||
const float norm = __half2float(x[ib].norm);
|
||||
v.x = turbo4_dequant_element(&x[ib], iqs + 0, norm);
|
||||
v.y = turbo4_dequant_element(&x[ib], iqs + 1, norm);
|
||||
}
|
||||
|
||||
// Turbo3: 3-bit PolarQuant (2-bit qs + 1-bit sign), block size 32
|
||||
// iqs is the element index within the block (even), produces elements iqs and iqs+1
|
||||
static __device__ __forceinline__ void dequantize_turbo3_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
|
||||
const block_turbo3_0 * x = (const block_turbo3_0 *) vx;
|
||||
const float norm = __half2float(x[ib].norm);
|
||||
v.x = turbo3_dequant_element(&x[ib], iqs + 0, norm);
|
||||
v.y = turbo3_dequant_element(&x[ib], iqs + 1, norm);
|
||||
}
|
||||
|
||||
// Turbo2: 2-bit PolarQuant (2-bit qs only, no sign), block size 32
|
||||
static __device__ __forceinline__ void dequantize_turbo2_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
|
||||
const block_turbo2_0 * x = (const block_turbo2_0 *) vx;
|
||||
const float norm = __half2float(x[ib].norm);
|
||||
v.x = turbo2_dequant_element(&x[ib], iqs + 0, norm);
|
||||
v.y = turbo2_dequant_element(&x[ib], iqs + 1, norm);
|
||||
}
|
||||
|
||||
// TQ4_1S: 4-bit weight type with inverse WHT, block size 32, dual half-block scales
|
||||
// Cold path only (convert.cu) — dequants full block, applies inverse RHT, returns pair
|
||||
static __device__ __forceinline__ void dequantize_tq4_1s(const void * vx, const int64_t ib, const int iqs, float2 & v) {
|
||||
const block_tq4_1s * x = (const block_tq4_1s *) vx;
|
||||
const float d0 = __half2float(x[ib].d0);
|
||||
const float d1 = __half2float(x[ib].d1);
|
||||
|
||||
// Dequant full block (centroid lookup + scale)
|
||||
float buf[32];
|
||||
for (int j = 0; j < 32; j++) {
|
||||
uint8_t idx = (x[ib].qs[j / 2] >> ((j & 1) * 4)) & 0xF;
|
||||
float d = (j < 16) ? d0 : d1;
|
||||
buf[j] = TQ4_CENTROIDS_WEIGHT[idx] * d;
|
||||
}
|
||||
|
||||
// Inverse RHT: WHT butterfly then normalize+unsign
|
||||
for (int step = 1; step < 32; step <<= 1) {
|
||||
for (int i = 0; i < 32; i += step << 1) {
|
||||
for (int j = i; j < i + step; j++) {
|
||||
float a = buf[j], b = buf[j + step];
|
||||
buf[j] = a + b; buf[j + step] = a - b;
|
||||
}
|
||||
}
|
||||
}
|
||||
const float inv_sqrt32 = 0.17677669529663688f;
|
||||
for (int j = 0; j < 32; j++) buf[j] *= inv_sqrt32 * TQ_WEIGHT_SIGNS[j];
|
||||
|
||||
v.x = buf[iqs];
|
||||
v.y = buf[iqs + 1];
|
||||
}
|
||||
|
||||
// TQ3_1S: 3-bit weight type with inverse WHT, block size 32, dual half-block scales
|
||||
// 3-bit packing: 4 groups of 8 indices in 3 bytes each (24 bits = 8 * 3-bit)
|
||||
static __device__ __forceinline__ void dequantize_tq3_1s(const void * vx, const int64_t ib, const int iqs, float2 & v) {
|
||||
const block_tq3_1s * x = (const block_tq3_1s *) vx;
|
||||
const float d0 = __half2float(x[ib].d0);
|
||||
const float d1 = __half2float(x[ib].d1);
|
||||
|
||||
// Unpack all 32 3-bit indices (4 groups of 8 in 3 bytes)
|
||||
float buf[32];
|
||||
for (int g = 0; g < 4; g++) {
|
||||
const uint8_t * qp = x[ib].qs + g * 3;
|
||||
uint8_t idx[8];
|
||||
idx[0] = qp[0] & 7;
|
||||
idx[1] = (qp[0] >> 3) & 7;
|
||||
idx[2] = ((qp[0] >> 6) | (qp[1] << 2)) & 7;
|
||||
idx[3] = (qp[1] >> 1) & 7;
|
||||
idx[4] = (qp[1] >> 4) & 7;
|
||||
idx[5] = ((qp[1] >> 7) | (qp[2] << 1)) & 7;
|
||||
idx[6] = (qp[2] >> 2) & 7;
|
||||
idx[7] = (qp[2] >> 5) & 7;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
int j = g * 8 + i;
|
||||
float d = (j < 16) ? d0 : d1;
|
||||
buf[j] = TQ3_CENTROIDS_WEIGHT[idx[i]] * d;
|
||||
}
|
||||
}
|
||||
|
||||
// Inverse RHT: WHT butterfly then normalize+unsign
|
||||
for (int step = 1; step < 32; step <<= 1) {
|
||||
for (int i = 0; i < 32; i += step << 1) {
|
||||
for (int j = i; j < i + step; j++) {
|
||||
float a = buf[j], b = buf[j + step];
|
||||
buf[j] = a + b; buf[j + step] = a - b;
|
||||
}
|
||||
}
|
||||
}
|
||||
const float inv_sqrt32 = 0.17677669529663688f;
|
||||
for (int j = 0; j < 32; j++) buf[j] *= inv_sqrt32 * TQ_WEIGHT_SIGNS[j];
|
||||
|
||||
v.x = buf[iqs];
|
||||
v.y = buf[iqs + 1];
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include "common.cuh"
|
||||
#include "convert.cuh"
|
||||
#include "vecdotq.cuh"
|
||||
#include "turbo-quant.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
@@ -288,6 +289,164 @@ static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q8_0(
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Turbo3 KQ dot product: dequantize K from turbo3 blocks, dot with Q (float2/half2)
|
||||
// Uses float Q path (like f16), not q8_1 integer path.
|
||||
// Q_v is half2[] or float2[] with D/2 pairs, partitioned nthreads-strided.
|
||||
//
|
||||
// Matches the f16 pattern: outer loop steps by nthreads*cpy_ne, inner loop
|
||||
// processes cpy_ne pairs per thread per iteration so Q_v and K indices stay aligned.
|
||||
// elem0 = 2*k_KQ is always even, so elem0 and elem0+1 always share the same
|
||||
// turbo3 block (ib), qs byte, and signs byte — loaded once per pair.
|
||||
template <int D, int nthreads>
|
||||
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_turbo3_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_turbo3_0 * K_turbo = (const block_turbo3_0 *) K_c;
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
|
||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
|
||||
const int k_KQ = k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne + k_KQ_1;
|
||||
|
||||
// elem0 is always even; elem0 and elem1 are always in the same block,
|
||||
// the same qs byte (j0%4 ∈ {0,2}), and the same signs byte (j0%8 ∈ {0,2,4,6}).
|
||||
const int elem0 = k_KQ * 2; // always even
|
||||
const int ib = elem0 / QK_TURBO3; // shared block index
|
||||
const int j0 = elem0 % QK_TURBO3; // always even, 0..30
|
||||
|
||||
// Single loads for the shared block fields
|
||||
const float norm = __half2float(K_turbo[ib].norm);
|
||||
const uint8_t qs_byte = K_turbo[ib].qs[j0 / 4]; // covers both j0 and j0+1
|
||||
const uint8_t sgn_byte = K_turbo[ib].signs[j0 / 8]; // covers both j0 and j0+1
|
||||
|
||||
// Extract 3-bit indices for elem0 and elem1 from shared bytes
|
||||
const int shift = (j0 % 4) * 2; // 0 or 4
|
||||
const uint8_t idx0 = ((qs_byte >> shift) & 0x3) | (((sgn_byte >> (j0 % 8)) & 0x1) << 2);
|
||||
const uint8_t idx1 = ((qs_byte >> (shift+2)) & 0x3) | (((sgn_byte >> (j0 % 8 + 1)) & 0x1) << 2);
|
||||
|
||||
float2 kv;
|
||||
kv.x = TURBO_CENTROIDS_3BIT[idx0] * norm;
|
||||
kv.y = TURBO_CENTROIDS_3BIT[idx1] * norm;
|
||||
|
||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
const half2 qv = ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1];
|
||||
ggml_cuda_mad(sum, make_float2(kv.x, kv.y), __half22float2(qv));
|
||||
#else
|
||||
const float2 qv = ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1];
|
||||
sum += kv.x * qv.x + kv.y * qv.y;
|
||||
#endif // V_DOT2_F32_F16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Turbo2 KQ dot product: dequantize K from turbo2 blocks, dot with Q (float2/half2)
|
||||
// Same structure as turbo3 but reads 2-bit indices from qs only (no signs).
|
||||
template <int D, int nthreads>
|
||||
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_turbo2_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_turbo2_0 * K_turbo = (const block_turbo2_0 *) K_c;
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
|
||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
|
||||
const int k_KQ = k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne + k_KQ_1;
|
||||
|
||||
const int elem0 = k_KQ * 2;
|
||||
const int ib = elem0 / QK_TURBO2;
|
||||
const int j0 = elem0 % QK_TURBO2;
|
||||
|
||||
const float norm = __half2float(K_turbo[ib].norm);
|
||||
const uint8_t qs_byte = K_turbo[ib].qs[j0 / 4];
|
||||
|
||||
const int shift = (j0 % 4) * 2;
|
||||
const uint8_t idx0 = (qs_byte >> shift) & 0x3;
|
||||
const uint8_t idx1 = (qs_byte >> (shift+2)) & 0x3;
|
||||
|
||||
float2 kv;
|
||||
kv.x = TURBO_CENTROIDS_2BIT[idx0] * norm;
|
||||
kv.y = TURBO_CENTROIDS_2BIT[idx1] * norm;
|
||||
|
||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
const half2 qv = ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1];
|
||||
ggml_cuda_mad(sum, make_float2(kv.x, kv.y), __half22float2(qv));
|
||||
#else
|
||||
const float2 qv = ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1];
|
||||
sum += kv.x * qv.x + kv.y * qv.y;
|
||||
#endif // V_DOT2_F32_F16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Turbo4 KQ dot product: dequantize K from turbo4 blocks, dot with Q (float2/half2)
|
||||
// 4-bit nibble packed: qs[j/2] >> ((j%2)*4) & 0xF
|
||||
template <int D, int nthreads>
|
||||
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_turbo4_0(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||
|
||||
const block_turbo4_0 * K_turbo = (const block_turbo4_0 *) K_c;
|
||||
GGML_UNUSED(Q_q8);
|
||||
GGML_UNUSED(Q_ds_v);
|
||||
|
||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
|
||||
const int k_KQ = k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne + k_KQ_1;
|
||||
|
||||
const int elem0 = k_KQ * 2; // always even
|
||||
const int ib = elem0 / QK_TURBO4; // block index
|
||||
const int j0 = elem0 % QK_TURBO4; // always even
|
||||
|
||||
const float norm = __half2float(K_turbo[ib].norm);
|
||||
// Both j0 and j0+1 are adjacent nibbles: j0/2 == (j0+1)/2 when j0 is even
|
||||
const uint8_t qs_byte = K_turbo[ib].qs[j0 / 2];
|
||||
|
||||
const uint8_t idx0 = (qs_byte >> 0) & 0xF; // low nibble = j0
|
||||
const uint8_t idx1 = (qs_byte >> 4) & 0xF; // high nibble = j0+1
|
||||
|
||||
float2 kv;
|
||||
kv.x = TURBO_CENTROIDS_4BIT[idx0] * norm;
|
||||
kv.y = TURBO_CENTROIDS_4BIT[idx1] * norm;
|
||||
|
||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
const half2 qv = ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1];
|
||||
ggml_cuda_mad(sum, make_float2(kv.x, kv.y), __half22float2(qv));
|
||||
#else
|
||||
const float2 qv = ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1];
|
||||
sum += kv.x * qv.x + kv.y * qv.y;
|
||||
#endif // V_DOT2_F32_F16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template <typename Tds, int ni>
|
||||
static __device__ __forceinline__ void quantize_q8_1_to_shared(
|
||||
const float * __restrict__ x, const float scale, int * __restrict__ yq32, void * __restrict__ yds) {
|
||||
@@ -577,6 +736,186 @@ static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict
|
||||
}
|
||||
}
|
||||
|
||||
// Turbo3 V dequantize: extract `ne` float/half values at position i0.
|
||||
//
|
||||
// Optimised for the ne==4 path (used by the VEC kernel with turbo3 V):
|
||||
// i0 is always a multiple of 4 from the VEC kernel access pattern, so all 4
|
||||
// elements share one qs byte and one signs byte — we load each once.
|
||||
template <typename T, int ne>
|
||||
static __device__ __forceinline__ void dequantize_V_turbo3_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
|
||||
const block_turbo3_0 * x = (const block_turbo3_0 *) vx;
|
||||
|
||||
const int64_t ib = i0 / QK_TURBO3;
|
||||
const int j0 = i0 % QK_TURBO3;
|
||||
const float norm = __half2float(x[ib].norm);
|
||||
|
||||
static_assert(ne == 2 || ne == 4, "bad ne");
|
||||
|
||||
if constexpr (ne == 4) {
|
||||
// When j0 % 4 == 0 (always true from VEC kernel), all 4 elements share one
|
||||
// qs byte (4 elements per byte) and one signs byte (8 elements per byte).
|
||||
const uint8_t qs_byte = x[ib].qs[j0 / 4];
|
||||
const uint8_t sgn_byte = x[ib].signs[j0 / 8];
|
||||
const int shift_s = j0 % 8; // 0 or 4
|
||||
|
||||
const uint8_t idx0 = ((qs_byte >> 0) & 0x3) | (((sgn_byte >> (shift_s+0)) & 0x1) << 2);
|
||||
const uint8_t idx1 = ((qs_byte >> 2) & 0x3) | (((sgn_byte >> (shift_s+1)) & 0x1) << 2);
|
||||
const uint8_t idx2 = ((qs_byte >> 4) & 0x3) | (((sgn_byte >> (shift_s+2)) & 0x1) << 2);
|
||||
const uint8_t idx3 = ((qs_byte >> 6) & 0x3) | (((sgn_byte >> (shift_s+3)) & 0x1) << 2);
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, half>) {
|
||||
((half2 *) dst)[0] = make_half2(
|
||||
__float2half(TURBO_CENTROIDS_3BIT[idx0] * norm),
|
||||
__float2half(TURBO_CENTROIDS_3BIT[idx1] * norm));
|
||||
((half2 *) dst)[1] = make_half2(
|
||||
__float2half(TURBO_CENTROIDS_3BIT[idx2] * norm),
|
||||
__float2half(TURBO_CENTROIDS_3BIT[idx3] * norm));
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
((float2 *) dst)[0] = make_float2(
|
||||
TURBO_CENTROIDS_3BIT[idx0] * norm,
|
||||
TURBO_CENTROIDS_3BIT[idx1] * norm);
|
||||
((float2 *) dst)[1] = make_float2(
|
||||
TURBO_CENTROIDS_3BIT[idx2] * norm,
|
||||
TURBO_CENTROIDS_3BIT[idx3] * norm);
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
} else { // ne == 2
|
||||
#ifdef FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, half>) {
|
||||
float v0 = turbo3_dequant_element(&x[ib], j0, norm);
|
||||
float v1 = turbo3_dequant_element(&x[ib], j0+1, norm);
|
||||
((half2 *) dst)[0] = make_half2(__float2half(v0), __float2half(v1));
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
((float *) dst)[0] = turbo3_dequant_element(&x[ib], j0, norm);
|
||||
((float *) dst)[1] = turbo3_dequant_element(&x[ib], j0+1, norm);
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Turbo2 V dequantize: extract `ne` float/half values at position i0.
|
||||
template <typename T, int ne>
|
||||
static __device__ __forceinline__ void dequantize_V_turbo2_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
|
||||
const block_turbo2_0 * x = (const block_turbo2_0 *) vx;
|
||||
|
||||
const int64_t ib = i0 / QK_TURBO2;
|
||||
const int j0 = i0 % QK_TURBO2;
|
||||
const float norm = __half2float(x[ib].norm);
|
||||
|
||||
static_assert(ne == 2 || ne == 4, "bad ne");
|
||||
|
||||
if constexpr (ne == 4) {
|
||||
const uint8_t qs_byte = x[ib].qs[j0 / 4];
|
||||
|
||||
const uint8_t idx0 = (qs_byte >> 0) & 0x3;
|
||||
const uint8_t idx1 = (qs_byte >> 2) & 0x3;
|
||||
const uint8_t idx2 = (qs_byte >> 4) & 0x3;
|
||||
const uint8_t idx3 = (qs_byte >> 6) & 0x3;
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, half>) {
|
||||
((half2 *) dst)[0] = make_half2(
|
||||
__float2half(TURBO_CENTROIDS_2BIT[idx0] * norm),
|
||||
__float2half(TURBO_CENTROIDS_2BIT[idx1] * norm));
|
||||
((half2 *) dst)[1] = make_half2(
|
||||
__float2half(TURBO_CENTROIDS_2BIT[idx2] * norm),
|
||||
__float2half(TURBO_CENTROIDS_2BIT[idx3] * norm));
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
((float2 *) dst)[0] = make_float2(
|
||||
TURBO_CENTROIDS_2BIT[idx0] * norm,
|
||||
TURBO_CENTROIDS_2BIT[idx1] * norm);
|
||||
((float2 *) dst)[1] = make_float2(
|
||||
TURBO_CENTROIDS_2BIT[idx2] * norm,
|
||||
TURBO_CENTROIDS_2BIT[idx3] * norm);
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
} else { // ne == 2
|
||||
#ifdef FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, half>) {
|
||||
float v0 = turbo2_dequant_element(&x[ib], j0, norm);
|
||||
float v1 = turbo2_dequant_element(&x[ib], j0+1, norm);
|
||||
((half2 *) dst)[0] = make_half2(__float2half(v0), __float2half(v1));
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
((float *) dst)[0] = turbo2_dequant_element(&x[ib], j0, norm);
|
||||
((float *) dst)[1] = turbo2_dequant_element(&x[ib], j0+1, norm);
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Turbo4 V dequantize: extract `ne` float/half values at position i0.
|
||||
// 4-bit nibble packed, block size 128.
|
||||
template <typename T, int ne>
|
||||
static __device__ __forceinline__ void dequantize_V_turbo4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
|
||||
const block_turbo4_0 * x = (const block_turbo4_0 *) vx;
|
||||
|
||||
const int64_t ib = i0 / QK_TURBO4;
|
||||
const int j0 = i0 % QK_TURBO4;
|
||||
const float norm = __half2float(x[ib].norm);
|
||||
|
||||
static_assert(ne == 2 || ne == 4, "bad ne");
|
||||
|
||||
if constexpr (ne == 4) {
|
||||
// j0 is always a multiple of 4 from the VEC kernel access pattern.
|
||||
// 4 consecutive elements span 2 qs bytes: j0/2 and j0/2+1.
|
||||
const uint8_t qs_byte0 = x[ib].qs[j0 / 2]; // elements j0, j0+1
|
||||
const uint8_t qs_byte1 = x[ib].qs[j0 / 2 + 1]; // elements j0+2, j0+3
|
||||
|
||||
const uint8_t idx0 = (qs_byte0 >> 0) & 0xF;
|
||||
const uint8_t idx1 = (qs_byte0 >> 4) & 0xF;
|
||||
const uint8_t idx2 = (qs_byte1 >> 0) & 0xF;
|
||||
const uint8_t idx3 = (qs_byte1 >> 4) & 0xF;
|
||||
|
||||
#ifdef FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, half>) {
|
||||
((half2 *) dst)[0] = make_half2(
|
||||
__float2half(TURBO_CENTROIDS_4BIT[idx0] * norm),
|
||||
__float2half(TURBO_CENTROIDS_4BIT[idx1] * norm));
|
||||
((half2 *) dst)[1] = make_half2(
|
||||
__float2half(TURBO_CENTROIDS_4BIT[idx2] * norm),
|
||||
__float2half(TURBO_CENTROIDS_4BIT[idx3] * norm));
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
((float2 *) dst)[0] = make_float2(
|
||||
TURBO_CENTROIDS_4BIT[idx0] * norm,
|
||||
TURBO_CENTROIDS_4BIT[idx1] * norm);
|
||||
((float2 *) dst)[1] = make_float2(
|
||||
TURBO_CENTROIDS_4BIT[idx2] * norm,
|
||||
TURBO_CENTROIDS_4BIT[idx3] * norm);
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
} else { // ne == 2
|
||||
#ifdef FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, half>) {
|
||||
float v0 = turbo4_dequant_element(&x[ib], j0, norm);
|
||||
float v1 = turbo4_dequant_element(&x[ib], j0+1, norm);
|
||||
((half2 *) dst)[0] = make_half2(__float2half(v0), __float2half(v1));
|
||||
} else
|
||||
#endif // FP16_AVAILABLE
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
((float *) dst)[0] = turbo4_dequant_element(&x[ib], j0, norm);
|
||||
((float *) dst)[1] = turbo4_dequant_element(&x[ib], j0+1, norm);
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <ggml_type type_K, int D, int nthreads>
|
||||
constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
|
||||
if constexpr (type_K == GGML_TYPE_F16) {
|
||||
@@ -593,6 +932,12 @@ constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
|
||||
return vec_dot_fattn_vec_KQ_q8_0<D, nthreads>;
|
||||
} else if constexpr (type_K == GGML_TYPE_BF16) {
|
||||
return vec_dot_fattn_vec_KQ_bf16<D, nthreads>;
|
||||
} else if constexpr (type_K == GGML_TYPE_TURBO3_0) {
|
||||
return vec_dot_fattn_vec_KQ_turbo3_0<D, nthreads>;
|
||||
} else if constexpr (type_K == GGML_TYPE_TURBO2_0) {
|
||||
return vec_dot_fattn_vec_KQ_turbo2_0<D, nthreads>;
|
||||
} else if constexpr (type_K == GGML_TYPE_TURBO4_0) {
|
||||
return vec_dot_fattn_vec_KQ_turbo4_0<D, nthreads>;
|
||||
} else {
|
||||
static_assert(type_K == -1, "bad type");
|
||||
return nullptr;
|
||||
@@ -615,6 +960,12 @@ constexpr __device__ dequantize_V_t get_dequantize_V() {
|
||||
return dequantize_V_q8_0<T, ne>;
|
||||
} else if constexpr (type_V == GGML_TYPE_BF16) {
|
||||
return dequantize_V_bf16<float, ne>;
|
||||
} else if constexpr (type_V == GGML_TYPE_TURBO3_0) {
|
||||
return dequantize_V_turbo3_0<T, ne>;
|
||||
} else if constexpr (type_V == GGML_TYPE_TURBO2_0) {
|
||||
return dequantize_V_turbo2_0<T, ne>;
|
||||
} else if constexpr (type_V == GGML_TYPE_TURBO4_0) {
|
||||
return dequantize_V_turbo4_0<T, ne>;
|
||||
} else {
|
||||
static_assert(type_V == -1, "bad type");
|
||||
return nullptr;
|
||||
@@ -946,8 +1297,32 @@ void launch_fattn(
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
|
||||
#ifdef GGML_USE_HIP
|
||||
// HIP/ROCm: bypass the memory pool for f16 temp buffers.
|
||||
// The legacy pool (ggml_cuda_pool_leg) retains peak-sized allocations permanently.
|
||||
// For quantized KV dequant, this means the f16 temp buffer stays allocated,
|
||||
// consuming more VRAM than the quantized KV compression saves — causing OOM.
|
||||
// Using raw alloc+free ensures the memory is released after the kernel completes.
|
||||
struct hip_f16_alloc {
|
||||
half * ptr = nullptr;
|
||||
cudaStream_t stream;
|
||||
hip_f16_alloc(cudaStream_t s) : stream(s) {}
|
||||
~hip_f16_alloc() {
|
||||
if (ptr) {
|
||||
cudaStreamSynchronize(stream);
|
||||
cudaFree(ptr);
|
||||
}
|
||||
}
|
||||
void alloc(size_t nelements) {
|
||||
CUDA_CHECK(cudaMalloc(&ptr, nelements * sizeof(half)));
|
||||
}
|
||||
};
|
||||
hip_f16_alloc K_f16(main_stream);
|
||||
hip_f16_alloc V_f16(main_stream);
|
||||
#else
|
||||
ggml_cuda_pool_alloc<half> K_f16(pool);
|
||||
ggml_cuda_pool_alloc<half> V_f16(pool);
|
||||
#endif
|
||||
ggml_cuda_pool_alloc<int> KV_max(pool);
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
||||
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
||||
|
||||
@@ -50,10 +50,17 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
GGML_ASSERT(V->ne[0] == K->ne[0]);
|
||||
ggml_cuda_flash_attn_ext_tile_case<512, 512>(ctx, dst);
|
||||
} break;
|
||||
#ifndef GGML_USE_HIP
|
||||
// D>=576 tile kernels exceed HIP local memory limit (67584 > 65536)
|
||||
case 576: {
|
||||
GGML_ASSERT(V->ne[0] == 512);
|
||||
ggml_cuda_flash_attn_ext_tile_case<576, 512>(ctx, dst);
|
||||
} break;
|
||||
case 640: {
|
||||
GGML_ASSERT(V->ne[0] == 512);
|
||||
ggml_cuda_flash_attn_ext_tile_case<640, 512>(ctx, dst);
|
||||
} break;
|
||||
#endif
|
||||
default: {
|
||||
GGML_ABORT("Unsupported head size");
|
||||
} break;
|
||||
|
||||
@@ -84,6 +84,10 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 16, 256, 2, 64, 64)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -152,6 +156,10 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 32, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 4, 128, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 8, 256, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 16, 256, 2, 32, 64)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -229,6 +237,11 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 16, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 32, 512, 1, 128, 64)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -306,6 +319,11 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 16, 256, 4, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 32, 256, 2, 128, 64)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1239,7 +1257,7 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
|
||||
// On NVIDIA (Pascal and older) the GQA optimizations seem to be detrimental in some cases.
|
||||
// However, for DKQ == 576, DV == 512 only the kernel variant with GQA optimizations is implemented.
|
||||
// However, for DKQ == 576/640, DV == 512 only the kernel variant with GQA optimizations is implemented.
|
||||
const bool nvidia = GGML_CUDA_CC_IS_NVIDIA(ggml_cuda_info().devices[ggml_cuda_get_device()].cc);
|
||||
const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
|
||||
const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;
|
||||
@@ -1345,3 +1363,4 @@ extern DECL_FATTN_TILE_CASE(256, 256);
|
||||
extern DECL_FATTN_TILE_CASE(320, 256);
|
||||
extern DECL_FATTN_TILE_CASE(512, 512);
|
||||
extern DECL_FATTN_TILE_CASE(576, 512);
|
||||
extern DECL_FATTN_TILE_CASE(640, 512);
|
||||
|
||||
@@ -17,7 +17,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
#endif // __clang__
|
||||
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
__launch_bounds__(ggml_cuda_fattn_vec_get_nthreads_device(), 1)
|
||||
__launch_bounds__(ggml_cuda_fattn_vec_get_nthreads_device(), 2)
|
||||
static __global__ void flash_attn_ext_vec(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
@@ -75,17 +75,30 @@ static __global__ void flash_attn_ext_vec(
|
||||
#endif // GGML_USE_HIP
|
||||
|
||||
constexpr int nthreads = ggml_cuda_fattn_vec_get_nthreads_device();
|
||||
constexpr int nthreads_KQ = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_KQ_q;
|
||||
constexpr int nthreads_V = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_V_q;
|
||||
// Turbo KV types (turbo2/turbo3/turbo4) use the float Q path (like f16/bf16), not the q8_1 integer path
|
||||
constexpr bool K_is_unquantized = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16 || type_K == GGML_TYPE_TURBO3_0 || type_K == GGML_TYPE_TURBO2_0 || type_K == GGML_TYPE_TURBO4_0);
|
||||
constexpr bool V_is_unquantized = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16 || type_V == GGML_TYPE_TURBO3_0 || type_V == GGML_TYPE_TURBO2_0 || type_V == GGML_TYPE_TURBO4_0);
|
||||
constexpr bool K_is_turbo = (type_K == GGML_TYPE_TURBO3_0 || type_K == GGML_TYPE_TURBO2_0 || type_K == GGML_TYPE_TURBO4_0);
|
||||
// Turbo KQ dot does byte extraction + centroid lookup + scalar mul, not vectorized f16 loads.
|
||||
// nthreads_KQ=1: each thread computes a full KQ product alone — eliminates warp_reduce_sum
|
||||
// shuffle and halves KQ loop iterations. Each thread holds full Q vector in registers.
|
||||
constexpr int nthreads_KQ = K_is_turbo ? 1 : (K_is_unquantized ? 128 / cpy_nb : nthreads_KQ_q);
|
||||
constexpr bool V_is_turbo = (type_V == GGML_TYPE_TURBO3_0 || type_V == GGML_TYPE_TURBO2_0 || type_V == GGML_TYPE_TURBO4_0);
|
||||
// Turbo V dequant is scalar (byte extract + LUT), not vectorized loads.
|
||||
// For turbo V, nthreads_V is nthreads_V_q / 8 (clamped to at least 1). Since
|
||||
// V_cols_per_iter = WARP_SIZE / nthreads_V, fewer threads per V row means more
|
||||
// V positions are processed per outer loop iteration, which reduces loop
|
||||
// overhead and improves ILP in the V aggregation phase.
|
||||
constexpr int nthreads_V = V_is_unquantized ? (V_is_turbo ? (nthreads_V_q / 8 < 1 ? 1 : nthreads_V_q / 8) : 128 / cpy_nb) : nthreads_V_q;
|
||||
|
||||
static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K");
|
||||
static_assert(WARP_SIZE % nthreads_V == 0, "bad nthreads_V");
|
||||
|
||||
constexpr int V_rows_per_thread = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 2*cpy_ne : 4;
|
||||
constexpr int V_rows_per_thread = V_is_unquantized ? ((type_V == GGML_TYPE_TURBO3_0 || type_V == GGML_TYPE_TURBO2_0 || type_V == GGML_TYPE_TURBO4_0) ? 4 : 2*cpy_ne) : 4;
|
||||
constexpr int V_cols_per_iter = WARP_SIZE / nthreads_V;
|
||||
|
||||
constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ<type_K, D, nthreads_KQ>();
|
||||
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16 && type_K != GGML_TYPE_BF16;
|
||||
constexpr bool Q_q8_1 = !K_is_unquantized;
|
||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, half, V_rows_per_thread>();
|
||||
#else
|
||||
@@ -120,6 +133,23 @@ static __global__ void flash_attn_ext_vec(
|
||||
__shared__ float KQ[ne_KQ > ne_combine ? ne_KQ : ne_combine];
|
||||
#endif // V_DOT2_F32_F16_AVAILABLE
|
||||
|
||||
// Shared-memory LUT for turbo KQ scoring: precompute Q[d] * centroid[c] once,
|
||||
// then the hot loop does turbo_lut[d][idx] (shmem read, no multiply).
|
||||
// turbo4 excluded: 16 centroids × D exceeds shmem budget.
|
||||
// Stride = n_centroids+1 to avoid bank conflicts.
|
||||
constexpr int n_centroids_lut = (D <= 256 && type_K == GGML_TYPE_TURBO3_0) ? 8 :
|
||||
(D <= 256 && type_K == GGML_TYPE_TURBO2_0) ? 4 : 0;
|
||||
constexpr int lut_stride = n_centroids_lut > 0 ? n_centroids_lut + 1 : 1;
|
||||
__shared__ half turbo_lut[n_centroids_lut > 0 ? D : 1][lut_stride];
|
||||
|
||||
// Sparse V: skip V dequant for positions with negligible attention weights.
|
||||
// At long context, most V positions contribute < 1e-6 to the output — skipping
|
||||
// their dequant saves significant compute (especially for quantized V types).
|
||||
constexpr float sparse_v_threshold_f = 1e-6f;
|
||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
const half sparse_v_threshold_h = __float2half(sparse_v_threshold_f);
|
||||
#endif
|
||||
|
||||
float KQ_max[ncols];
|
||||
float KQ_sum[ncols];
|
||||
#pragma unroll
|
||||
@@ -236,6 +266,20 @@ static __global__ void flash_attn_ext_vec(
|
||||
#endif // V_DOT2_F32_F16_AVAILABLE
|
||||
}
|
||||
|
||||
// Build shared-memory LUT: turbo_lut[d][c] = half(Q[d] * scale * centroid[c])
|
||||
if constexpr (n_centroids_lut > 0 && ncols == 1) {
|
||||
const float * centroids_ptr = (type_K == GGML_TYPE_TURBO3_0) ? TURBO_CENTROIDS_3BIT :
|
||||
TURBO_CENTROIDS_2BIT;
|
||||
const float * Q_f = (const float *)(Q + 0*nb01);
|
||||
for (int d = tid; d < D; d += nthreads) {
|
||||
const float q_val = Q_f[d] * scale;
|
||||
for (int c = 0; c < n_centroids_lut; c++) {
|
||||
turbo_lut[d][c] = __float2half(q_val * centroids_ptr[c]);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
K += blockIdx.y*nthreads * nb11;
|
||||
V += blockIdx.y*nthreads * nb21;
|
||||
@@ -259,8 +303,50 @@ static __global__ void flash_attn_ext_vec(
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
float sum = vec_dot_KQ(K + i_KQ*nb11, Q_reg[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum<nthreads_KQ>(sum);
|
||||
float sum;
|
||||
if constexpr (n_centroids_lut > 0 && ncols == 1 && type_K == GGML_TYPE_TURBO3_0) {
|
||||
// LUT scoring: 8 elements per iteration (2 qs bytes + 1 signs byte)
|
||||
const block_turbo3_0 * K_turbo = (const block_turbo3_0 *)(K + i_KQ*nb11);
|
||||
sum = 0.0f;
|
||||
for (int d0 = 0; d0 < D; d0 += 8) {
|
||||
const int ib = d0 / QK_TURBO3;
|
||||
const int jj = d0 % QK_TURBO3;
|
||||
const float norm = __half2float(K_turbo[ib].norm);
|
||||
const uint8_t qs0 = K_turbo[ib].qs[jj / 4];
|
||||
const uint8_t qs1 = K_turbo[ib].qs[jj / 4 + 1];
|
||||
const uint8_t sgn = K_turbo[ib].signs[jj / 8];
|
||||
sum += (__half2float(turbo_lut[d0 ][((qs0>>0)&3)|((sgn>>0&1)<<2)]) +
|
||||
__half2float(turbo_lut[d0+1][((qs0>>2)&3)|((sgn>>1&1)<<2)]) +
|
||||
__half2float(turbo_lut[d0+2][((qs0>>4)&3)|((sgn>>2&1)<<2)]) +
|
||||
__half2float(turbo_lut[d0+3][((qs0>>6)&3)|((sgn>>3&1)<<2)]) +
|
||||
__half2float(turbo_lut[d0+4][((qs1>>0)&3)|((sgn>>4&1)<<2)]) +
|
||||
__half2float(turbo_lut[d0+5][((qs1>>2)&3)|((sgn>>5&1)<<2)]) +
|
||||
__half2float(turbo_lut[d0+6][((qs1>>4)&3)|((sgn>>6&1)<<2)]) +
|
||||
__half2float(turbo_lut[d0+7][((qs1>>6)&3)|((sgn>>7&1)<<2)])) * norm;
|
||||
}
|
||||
} else if constexpr (n_centroids_lut > 0 && ncols == 1 && type_K == GGML_TYPE_TURBO2_0) {
|
||||
// LUT scoring for turbo2: 8 elements per iteration (2 qs bytes, no signs)
|
||||
const block_turbo2_0 * K_turbo = (const block_turbo2_0 *)(K + i_KQ*nb11);
|
||||
sum = 0.0f;
|
||||
for (int d0 = 0; d0 < D; d0 += 8) {
|
||||
const int ib = d0 / QK_TURBO2;
|
||||
const int jj = d0 % QK_TURBO2;
|
||||
const float norm = __half2float(K_turbo[ib].norm);
|
||||
const uint8_t qs0 = K_turbo[ib].qs[jj / 4];
|
||||
const uint8_t qs1 = K_turbo[ib].qs[jj / 4 + 1];
|
||||
sum += (__half2float(turbo_lut[d0 ][(qs0>>0)&3]) +
|
||||
__half2float(turbo_lut[d0+1][(qs0>>2)&3]) +
|
||||
__half2float(turbo_lut[d0+2][(qs0>>4)&3]) +
|
||||
__half2float(turbo_lut[d0+3][(qs0>>6)&3]) +
|
||||
__half2float(turbo_lut[d0+4][(qs1>>0)&3]) +
|
||||
__half2float(turbo_lut[d0+5][(qs1>>2)&3]) +
|
||||
__half2float(turbo_lut[d0+6][(qs1>>4)&3]) +
|
||||
__half2float(turbo_lut[d0+7][(qs1>>6)&3])) * norm;
|
||||
}
|
||||
} else {
|
||||
sum = vec_dot_KQ(K + i_KQ*nb11, Q_reg[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum<nthreads_KQ>(sum);
|
||||
}
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum = logit_softcap*tanhf(sum);
|
||||
@@ -284,12 +370,12 @@ static __global__ void flash_attn_ext_vec(
|
||||
for (int offset = nthreads_KQ; offset < WARP_SIZE; offset <<= 1) {
|
||||
KQ_max_new[j] = fmaxf(KQ_max_new[j], __shfl_xor_sync(0xFFFFFFFF, KQ_max_new[j], offset, WARP_SIZE));
|
||||
}
|
||||
const float KQ_max_scale = expf(KQ_max[j] - KQ_max_new[j]);
|
||||
const float KQ_max_scale = __expf(KQ_max[j] - KQ_max_new[j]);
|
||||
KQ_max[j] = KQ_max_new[j];
|
||||
|
||||
KQ_reg[j] = expf(KQ_reg[j] - KQ_max[j]);
|
||||
KQ_reg[j] = __expf(KQ_reg[j] - KQ_max[j]);
|
||||
KQ_sum[j] = KQ_sum[j]*KQ_max_scale + KQ_reg[j];
|
||||
KQ[j*nthreads + tid] = KQ_reg[j];
|
||||
if constexpr (!V_is_turbo) { KQ[j*nthreads + tid] = KQ_reg[j]; }
|
||||
|
||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
@@ -307,7 +393,7 @@ static __global__ void flash_attn_ext_vec(
|
||||
}
|
||||
|
||||
#ifndef GGML_USE_HIP
|
||||
__syncwarp();
|
||||
if constexpr (!V_is_turbo) { __syncwarp(); }
|
||||
#endif // GGML_USE_HIP
|
||||
|
||||
#pragma unroll
|
||||
@@ -318,8 +404,29 @@ static __global__ void flash_attn_ext_vec(
|
||||
half2 KQ_k[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ_k[j] = __half2half2(KQ[j*nthreads + k]);
|
||||
if constexpr (V_is_turbo) {
|
||||
const float kq_val = __shfl_sync(0xFFFFFFFF, KQ_reg[j], k0 + (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V));
|
||||
KQ_k[j] = make_half2(__float2half(kq_val), __float2half(kq_val));
|
||||
} else {
|
||||
KQ_k[j] = __half2half2(KQ[j*nthreads + k]);
|
||||
}
|
||||
}
|
||||
|
||||
// Sparse V: skip V dequant if all attention weights for this position are negligible.
|
||||
// For turbo types, the check is compiled out: at typical decode context lengths
|
||||
// (< ~4K tokens) with threshold 1e-6, no positions are ever skipped, so the
|
||||
// per-position branch is pure overhead (misprediction + comparison cost). This
|
||||
// also dodges the warp-divergence regression on turbo paths that motivated the
|
||||
// April 24 revert (commit f2dc968).
|
||||
if constexpr (!V_is_turbo) {
|
||||
bool dominated = true;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
if (__hgt(__low2half(KQ_k[j]), sparse_v_threshold_h)) { dominated = false; break; }
|
||||
}
|
||||
if (dominated) { continue; }
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
half2 tmp[V_rows_per_thread/2];
|
||||
@@ -347,19 +454,142 @@ static __global__ void flash_attn_ext_vec(
|
||||
float KQ_k[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ_k[j] = KQ[j*nthreads + k];
|
||||
if constexpr (V_is_turbo) {
|
||||
KQ_k[j] = __shfl_sync(0xFFFFFFFF, KQ_reg[j], k0 + (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V));
|
||||
} else {
|
||||
KQ_k[j] = KQ[j*nthreads + k];
|
||||
}
|
||||
}
|
||||
|
||||
// Sparse V: skip V dequant if all attention weights for this position are negligible.
|
||||
// Compiled out for turbo types — see half2 path comment above.
|
||||
if constexpr (!V_is_turbo) {
|
||||
bool dominated = true;
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
float2 tmp[V_rows_per_thread/2];
|
||||
dequantize_V(V + k*nb21, tmp,
|
||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
if (KQ_k[j] >= sparse_v_threshold_f) { dominated = false; break; }
|
||||
}
|
||||
if (dominated) { continue; }
|
||||
}
|
||||
|
||||
// Turbo V path: precompute scaled centroids once per block to eliminate
|
||||
// per-element norm multiply. centroid[idx]*norm is computed 8/4/16 times
|
||||
// (once per centroid) instead of D times (once per element).
|
||||
if constexpr (type_V == GGML_TYPE_TURBO3_0) {
|
||||
const block_turbo3_0 * vb = (const block_turbo3_0 *)(V + k*nb21);
|
||||
int prev_ib = -1;
|
||||
float sc[8];
|
||||
|
||||
#pragma unroll
|
||||
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
const int i0 = 2*i_VKQ_0 + (threadIdx.x % nthreads_V)*V_rows_per_thread;
|
||||
const int ib = i0 / QK_TURBO3;
|
||||
const int j0 = i0 % QK_TURBO3;
|
||||
|
||||
if (ib != prev_ib) {
|
||||
prev_ib = ib;
|
||||
const float norm = __half2float(vb[ib].norm);
|
||||
#pragma unroll
|
||||
for (int c = 0; c < 8; ++c) { sc[c] = TURBO_CENTROIDS_3BIT[c] * norm; }
|
||||
}
|
||||
|
||||
const uint8_t qs_byte = vb[ib].qs[j0 / 4];
|
||||
const uint8_t sgn_byte = vb[ib].signs[j0 / 8];
|
||||
const int shift_s = j0 % 8;
|
||||
|
||||
const uint8_t idx0 = ((qs_byte >> 0) & 0x3) | (((sgn_byte >> (shift_s+0)) & 0x1) << 2);
|
||||
const uint8_t idx1 = ((qs_byte >> 2) & 0x3) | (((sgn_byte >> (shift_s+1)) & 0x1) << 2);
|
||||
const uint8_t idx2 = ((qs_byte >> 4) & 0x3) | (((sgn_byte >> (shift_s+2)) & 0x1) << 2);
|
||||
const uint8_t idx3 = ((qs_byte >> 6) & 0x3) | (((sgn_byte >> (shift_s+3)) & 0x1) << 2);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].x += tmp[i_VKQ_1].x*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].y += tmp[i_VKQ_1].y*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 0].x += sc[idx0]*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 0].y += sc[idx1]*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 1].x += sc[idx2]*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 1].y += sc[idx3]*KQ_k[j];
|
||||
}
|
||||
}
|
||||
} else if constexpr (type_V == GGML_TYPE_TURBO2_0) {
|
||||
const block_turbo2_0 * vb = (const block_turbo2_0 *)(V + k*nb21);
|
||||
int prev_ib = -1;
|
||||
float sc[4];
|
||||
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
const int i0 = 2*i_VKQ_0 + (threadIdx.x % nthreads_V)*V_rows_per_thread;
|
||||
const int ib = i0 / QK_TURBO2;
|
||||
const int j0 = i0 % QK_TURBO2;
|
||||
|
||||
if (ib != prev_ib) {
|
||||
prev_ib = ib;
|
||||
const float norm = __half2float(vb[ib].norm);
|
||||
#pragma unroll
|
||||
for (int c = 0; c < 4; ++c) { sc[c] = TURBO_CENTROIDS_2BIT[c] * norm; }
|
||||
}
|
||||
|
||||
const uint8_t qs_byte = vb[ib].qs[j0 / 4];
|
||||
|
||||
const uint8_t idx0 = (qs_byte >> 0) & 0x3;
|
||||
const uint8_t idx1 = (qs_byte >> 2) & 0x3;
|
||||
const uint8_t idx2 = (qs_byte >> 4) & 0x3;
|
||||
const uint8_t idx3 = (qs_byte >> 6) & 0x3;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 0].x += sc[idx0]*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 0].y += sc[idx1]*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 1].x += sc[idx2]*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 1].y += sc[idx3]*KQ_k[j];
|
||||
}
|
||||
}
|
||||
} else if constexpr (type_V == GGML_TYPE_TURBO4_0) {
|
||||
const block_turbo4_0 * vb = (const block_turbo4_0 *)(V + k*nb21);
|
||||
int prev_ib = -1;
|
||||
float sc[16];
|
||||
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
const int i0 = 2*i_VKQ_0 + (threadIdx.x % nthreads_V)*V_rows_per_thread;
|
||||
const int ib = i0 / QK_TURBO4;
|
||||
const int j0 = i0 % QK_TURBO4;
|
||||
|
||||
if (ib != prev_ib) {
|
||||
prev_ib = ib;
|
||||
const float norm = __half2float(vb[ib].norm);
|
||||
#pragma unroll
|
||||
for (int c = 0; c < 16; ++c) { sc[c] = TURBO_CENTROIDS_4BIT[c] * norm; }
|
||||
}
|
||||
|
||||
const uint8_t qs_byte0 = vb[ib].qs[j0 / 2];
|
||||
const uint8_t qs_byte1 = vb[ib].qs[j0 / 2 + 1];
|
||||
|
||||
const uint8_t idx0 = (qs_byte0 >> 0) & 0xF;
|
||||
const uint8_t idx1 = (qs_byte0 >> 4) & 0xF;
|
||||
const uint8_t idx2 = (qs_byte1 >> 0) & 0xF;
|
||||
const uint8_t idx3 = (qs_byte1 >> 4) & 0xF;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 0].x += sc[idx0]*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 0].y += sc[idx1]*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 1].x += sc[idx2]*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + 1].y += sc[idx3]*KQ_k[j];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
float2 tmp[V_rows_per_thread/2];
|
||||
dequantize_V(V + k*nb21, tmp,
|
||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
||||
#pragma unroll
|
||||
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].x += tmp[i_VKQ_1].x*KQ_k[j];
|
||||
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].y += tmp[i_VKQ_1].y*KQ_k[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -379,10 +609,10 @@ static __global__ void flash_attn_ext_vec(
|
||||
}
|
||||
|
||||
const float kqmax_new_j = fmaxf(sink, KQ_max[j]);
|
||||
const float KQ_max_scale = expf(KQ_max[j] - kqmax_new_j);
|
||||
const float KQ_max_scale = __expf(KQ_max[j] - kqmax_new_j);
|
||||
KQ_max[j] = kqmax_new_j;
|
||||
|
||||
KQ_sum[j] = KQ_sum[j]*KQ_max_scale + (threadIdx.x == 0 ? expf(sink - KQ_max[j]) : 0.0f);
|
||||
KQ_sum[j] = KQ_sum[j]*KQ_max_scale + (threadIdx.x == 0 ? __expf(sink - KQ_max[j]) : 0.0f);
|
||||
|
||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
@@ -428,7 +658,7 @@ static __global__ void flash_attn_ext_vec(
|
||||
|
||||
float kqmax_new = KQ_max_shared[j_VKQ][threadIdx.x];
|
||||
kqmax_new = warp_reduce_max(kqmax_new);
|
||||
const float kqmax_scale = expf(KQ_max[j_VKQ] - kqmax_new);
|
||||
const float kqmax_scale = __expf(KQ_max[j_VKQ] - kqmax_new);
|
||||
KQ_max[j_VKQ] = kqmax_new;
|
||||
|
||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
@@ -598,3 +828,99 @@ EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)
|
||||
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_BF16)
|
||||
|
||||
// TurboQuant3 — turbo3 K + turbo3 V (KV cache uses same type)
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
|
||||
|
||||
// Mixed turbo3/q8_0 KV cache types
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
|
||||
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
|
||||
|
||||
// Mixed f16/turbo3 KV cache types
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
|
||||
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
|
||||
|
||||
// TurboQuant2 -- turbo2 K + turbo2 V
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
|
||||
|
||||
// Mixed turbo2/q8_0 KV cache types
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
|
||||
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
|
||||
|
||||
// Mixed f16/turbo2 KV cache types
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
|
||||
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
|
||||
|
||||
// Mixed turbo3/turbo2 KV cache types
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
|
||||
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
|
||||
|
||||
// TurboQuant4 — turbo4 K + turbo4 V (KV cache uses same type)
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
|
||||
|
||||
// Mixed turbo4/q8_0 KV cache types
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
|
||||
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
|
||||
|
||||
// Mixed f16/turbo4 KV cache types
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
|
||||
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
|
||||
|
||||
// Mixed turbo4/turbo3 KV cache types
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
|
||||
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
|
||||
|
||||
// Mixed turbo4/turbo2 KV cache types
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
|
||||
|
||||
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
|
||||
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
|
||||
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
|
||||
|
||||
@@ -58,6 +58,8 @@
|
||||
#include "ggml-cuda/gated_delta_net.cuh"
|
||||
#include "ggml-cuda/set.cuh"
|
||||
#include "ggml-cuda/set-rows.cuh"
|
||||
#include "ggml-cuda/turbo-wht.cuh"
|
||||
#include "ggml-cuda/mmvq-tq.cuh"
|
||||
#include "ggml-cuda/pad_reflect_1d.cuh"
|
||||
#include "ggml-cuda/solve_tri.cuh"
|
||||
#include "ggml-cuda/tri.cuh"
|
||||
@@ -679,10 +681,49 @@ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer,
|
||||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
}
|
||||
|
||||
// TQ4_1S load-time q8_0 conversion: ON by default for best prefill speed.
// Native TQ4_1S decode is faster (+29-33%) but prefill is 2× slower because
// cuBLAS dequant-to-f16 requires per-element inverse WHT.
// Opt-out: GGML_TQ_NATIVE=1 for decode-heavy workloads (saves 1.7× VRAM).
//
// Returns true when TQ4_1S tensors should be converted to q8_0 at load time.
// The environment is consulted once, on the first call, and the answer is cached
// for the rest of the process. Only a GGML_TQ_NATIVE value whose first character
// is '1' disables the conversion; an unset variable or any other value keeps it on.
static bool ggml_tq_convert_q8() {
    // An initialised function-local static runs its initialiser exactly once, even when
    // several threads make the first call concurrently (guaranteed since C++11). This
    // replaces a manual "-1 means not yet read" sentinel that had no such guarantee.
    static const bool convert = [] {
        const char * env = getenv("GGML_TQ_NATIVE");
        return !(env != nullptr && env[0] == '1'); // default ON, GGML_TQ_NATIVE=1 disables
    }();
    return convert;
}
|
||||
|
||||
// Copies `size` bytes from host memory `data` into `tensor` at byte `offset`, on the device
// recorded in the buffer's context, and waits on the per-thread stream before returning.
//
// Load-time conversion: when ggml_tq_convert_q8() is true (the default; GGML_TQ_NATIVE=1 turns
// it off) and the call writes a complete TQ4_1S tensor, the payload is staged in a temporary
// device allocation, converted to q8_0 into tensor->data, and the tensor's type and byte
// strides are rewritten in place so that it describes q8_0 data afterwards.
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;

    ggml_cuda_set_device(ctx->device);

    // Only a full-tensor write (offset 0, size == ggml_nbytes) of a TQ4_1S tensor is converted.
    const bool convert_to_q8_0 =
        ggml_tq_convert_q8() &&
        tensor->type == GGML_TYPE_TQ4_1S &&
        offset == 0 &&
        size == ggml_nbytes(tensor);

    if (!convert_to_q8_0) {
        // Regular path: plain host-to-device copy, then wait for it to complete.
        CUDA_CHECK(cudaMemcpyAsync((char *) tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
        return;
    }

    const int64_t n_elements = ggml_nelements(tensor);

    // Stage the raw TQ4_1S bytes in a scratch device buffer.
    void * staging;
    CUDA_CHECK(cudaMalloc(&staging, size));
    CUDA_CHECK(cudaMemcpyAsync(staging, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));

    // Convert scratch (TQ4_1S) -> tensor->data (q8_0).
    // NOTE(review): this assumes tensor->data was allocated with room for the q8_0 layout;
    // confirm against the buffer type's get_alloc_size.
    ggml_cuda_convert_tq4_1s_to_q8_0(staging, tensor->data, n_elements, cudaStreamPerThread);
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));

    // The stream has been synchronized, so the scratch buffer is no longer in use.
    CUDA_CHECK(cudaFree(staging));

    // Re-describe the tensor as q8_0: element type first, then the byte strides per dimension.
    tensor->type  = GGML_TYPE_Q8_0;
    tensor->nb[0] = ggml_type_size(GGML_TYPE_Q8_0);
    tensor->nb[1] = tensor->nb[0] * (tensor->ne[0] / ggml_blck_size(GGML_TYPE_Q8_0));
    for (int dim = 2; dim < GGML_MAX_DIMS; ++dim) {
        tensor->nb[dim] = tensor->nb[dim - 1] * tensor->ne[dim - 1];
    }
}
|
||||
@@ -803,6 +844,13 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
||||
size_t size = ggml_nbytes(tensor);
|
||||
int64_t ne0 = tensor->ne[0];
|
||||
|
||||
// TQ4_1S → q8_0 load-time conversion: allocate q8_0-sized space when conversion is enabled (default; GGML_TQ_NATIVE=1 disables it)
|
||||
if (ggml_tq_convert_q8() && tensor->type == GGML_TYPE_TQ4_1S) {
|
||||
// q8_0 block: 34 bytes per 32 elements. TQ4_1S block: 20 bytes per 32 elements.
|
||||
const int64_t n_blocks = ggml_nelements(tensor) / QK_TQ4_1S;
|
||||
size = n_blocks * sizeof(block_q8_0);
|
||||
}
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
|
||||
@@ -2505,7 +2553,9 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
|
||||
ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) &&
|
||||
src0->view_src;
|
||||
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
|
||||
const bool is_tq_weight = (src0->type == GGML_TYPE_TQ4_1S || src0->type == GGML_TYPE_TQ3_1S);
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && !is_tq_weight &&
|
||||
src1->type == GGML_TYPE_F32 &&
|
||||
dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||
|
||||
// fusion is not universally faster on Pascal
|
||||
@@ -2547,10 +2597,12 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
bool use_mul_mat_f = !ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
|
||||
// TQ weight types never use mmvq/mmq: batches up to MMVQ_MAX_BATCH_SIZE take the fused TQ kernel, larger ones fall through to the cuBLAS-based paths below
|
||||
const bool is_tq_weight = (src0->type == GGML_TYPE_TQ4_1S || src0->type == GGML_TYPE_TQ3_1S);
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && !is_tq_weight
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
|
||||
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear && !is_tq_weight
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
|
||||
bool any_gpus_with_slow_fp16 = false;
|
||||
@@ -2614,6 +2666,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
|
||||
} else if (use_mul_mat_q) {
|
||||
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
|
||||
} else if (!split && is_tq_weight && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE) {
|
||||
// Fused TQ weight mul_mat with pre-rotated activations via warp shuffle WHT
|
||||
// Handles ne[1]=1 (decode) and ne[1]≤8 (multi-token / speculative decoding)
|
||||
ggml_cuda_mul_mat_tq(ctx, src0, src1, dst);
|
||||
} else if (!split && is_tq_weight && src0->type == GGML_TYPE_TQ4_1S) {
|
||||
// Large prefill: runtime TQ4_1S → q8_0 scratch conversion + cuBLAS
|
||||
// Gets tensor core throughput without permanent 1.7× VRAM cost
|
||||
ggml_cuda_mul_mat_tq4_1s_cublas(ctx, src0, src1, dst);
|
||||
} else {
|
||||
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
|
||||
}
|
||||
@@ -2633,16 +2693,18 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
|
||||
// [TAG_MUL_MAT_ID_CUDA_GRAPHS]
|
||||
// TQ weight types use dequant-to-f16 cuBLAS path only (no mmvq/mmq kernels)
|
||||
const bool is_tq_weight_id = (src0->type == GGML_TYPE_TQ4_1S || src0->type == GGML_TYPE_TQ3_1S);
|
||||
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
|
||||
if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
|
||||
if (ggml_is_quantized(src0->type)) {
|
||||
if (ggml_is_quantized(src0->type) && !is_tq_weight_id) {
|
||||
const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc);
|
||||
if (ne2 <= mmvq_mmid_max) {
|
||||
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
} else if (!ggml_is_quantized(src0->type)) {
|
||||
if (GGML_CUDA_CC_IS_AMD(cc)) {
|
||||
ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
@@ -2800,6 +2862,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_SET_ROWS:
|
||||
ggml_cuda_op_set_rows(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_TURBO_WHT:
|
||||
ggml_cuda_turbo_wht(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SET:
|
||||
ggml_cuda_op_set(ctx, dst);
|
||||
break;
|
||||
@@ -3258,10 +3323,11 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
||||
}
|
||||
|
||||
// [TAG_MUL_MAT_ID_CUDA_GRAPHS]
|
||||
const bool is_tq_w = (node->src[0]->type == GGML_TYPE_TQ4_1S || node->src[0]->type == GGML_TYPE_TQ3_1S);
|
||||
if (node->op == GGML_OP_MUL_MAT_ID) {
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int mmvq_mmid_max = get_mmvq_mmid_max_batch(node->src[0]->type, cc);
|
||||
if (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > mmvq_mmid_max) {
|
||||
if (!ggml_is_quantized(node->src[0]->type) || is_tq_w || node->ne[2] > mmvq_mmid_max) {
|
||||
// under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
|
||||
// TODO: figure out a way to enable for larger batch sizes, without hurting performance
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18958
|
||||
@@ -5160,6 +5226,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -5180,6 +5248,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -5191,9 +5261,18 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
} break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
// turbo types require head_dim divisible by appropriate group size
|
||||
if ((op->type == GGML_TYPE_TURBO3_0 || op->type == GGML_TYPE_TURBO2_0) && op->src[0]->ne[0] % 64 != 0) {
|
||||
return false;
|
||||
}
|
||||
// turbo4 block size is 128, so head_dim must be divisible by 128
|
||||
if (op->type == GGML_TYPE_TURBO4_0 && op->src[0]->ne[0] % 128 != 0) {
|
||||
return false;
|
||||
}
|
||||
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
|
||||
op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
|
||||
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
|
||||
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL ||
|
||||
op->type == GGML_TYPE_TURBO3_0 || op->type == GGML_TYPE_TURBO2_0 || op->type == GGML_TYPE_TURBO4_0) &&
|
||||
op->src[0]->type == GGML_TYPE_F32 &&
|
||||
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
|
||||
} break;
|
||||
@@ -5323,6 +5402,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
(op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16) &&
|
||||
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
|
||||
case GGML_OP_TURBO_WHT:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
|
||||
op->src[0]->ne[0] % 32 == 0;
|
||||
case GGML_OP_SSM_SCAN: {
|
||||
if (op->src[3]->ne[0] == 1) {
|
||||
// Mamba2
|
||||
|
||||
@@ -0,0 +1,568 @@
|
||||
/*
|
||||
* Fused mul_mat for TQ4_1S / TQ3_1S weight types.
|
||||
*
|
||||
* ne[1]≤8: dp4a multi-token kernel (weight reuse across tokens)
|
||||
 *   ne[1]>8: runtime TQ4_1S→fp16 dequant scratch + cuBLAS tensor core GEMM
|
||||
*/
|
||||
|
||||
#include "mmvq-tq.cuh"
|
||||
#include "turbo-quant.cuh"
|
||||
#include "convert.cuh"
|
||||
|
||||
#define MMVQ_TQ_NWARPS 4
|
||||
|
||||
// ============================================================================
|
||||
// Pre-rotate activation to q8_1 format (for TQ4_1S dp4a path)
|
||||
// ============================================================================
|
||||
|
||||
// Rotates F32 activations in 32-element blocks and quantizes each block to q8_1.
//
// Per block: multiply by the per-lane sign table (TQ_WEIGHT_SIGNS), run a 5-stage
// Hadamard-style butterfly through warp shuffles, scale by 0.17677669529663688 (1/sqrt(32)),
// then quantize to int8 with scale d = amax / 127. Lane 0 stores (d, sum of rotated values)
// in `ds`.
//
// Indexing treats threadIdx.x as the element within a block and threadIdx.y as the block
// within the thread block. Shuffles use a full mask, so n_elements must be a multiple of 32
// for every warp to be either fully active or fully exited.
static __global__ void tq_prerotate_q8_1(
        const float * __restrict__ src,
        block_q8_1  * __restrict__ dst,
        const int                  n_elements) {

    const int tid  = threadIdx.x;
    const int iblk = blockIdx.x * blockDim.y + threadIdx.y;
    const int isrc = iblk * 32 + tid;
    if (isrc >= n_elements) {
        return;
    }

    // sign flip, then in-warp butterfly
    float r = src[isrc];
    r *= TQ_WEIGHT_SIGNS[tid];

#pragma unroll
    for (int stride = 1; stride < 32; stride <<= 1) {
        const float partner = __shfl_xor_sync(0xffffffff, r, stride);
        if (tid & stride) {
            r = partner - r;
        } else {
            r = r + partner;
        }
    }
    r *= 0.17677669529663688f;

    // block-wide max(|r|) and sum(r); the two reductions are independent of each other
    float blk_amax = fabsf(r);
    float blk_sum  = r;
#pragma unroll
    for (int stride = 16; stride > 0; stride >>= 1) {
        blk_amax = fmaxf(blk_amax, __shfl_xor_sync(0xffffffff, blk_amax, stride));
        blk_sum += __shfl_xor_sync(0xffffffff, blk_sum, stride);
    }

    const float d  = blk_amax / 127.0f;
    const float id = (d > 0.0f) ? 127.0f / blk_amax : 0.0f; // all-zero block -> all-zero quants

    block_q8_1 & out = dst[iblk];
    out.qs[tid] = (int8_t)roundf(r * id);
    if (tid == 0) {
        out.ds = make_half2(__float2half(d), __float2half(blk_sum));
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// TQ4_1S: dp4a path with fixed int8 centroid LUT + q8_1 activation
|
||||
// ============================================================================
|
||||
|
||||
// Fixed int8 centroid table: centroid_i8[i] = round(TQ4_CENTROIDS_WEIGHT[i] * 127 / 2.733)
|
||||
// Rescale factor to recover float centroids: 2.733 / 127
|
||||
static constexpr float TQ4_CENTROID_I8_RESCALE = 2.733f / 127.0f;
|
||||
|
||||
// Register-based centroid lookup: maps 4 qs bytes (1 uint32) to 2 packed 4× centroid_i8 for dp4a.
|
||||
// Processes a full uint32 at once, sharing nibble extraction across both byte pairs.
|
||||
__device__ __forceinline__ void tq4_cents8_reg(uint32_t four_bytes, int &c0, int &c1) {
|
||||
// Centroid i8 values packed into 4 registers (little-endian byte order):
|
||||
// [-127,-96,-75,-58] [-44,-31,-18,-6] [6,18,31,44] [58,75,96,127]
|
||||
constexpr uint32_t CR03 = 0xC6B5A081u;
|
||||
constexpr uint32_t CR47 = 0xFAEEE1D4u;
|
||||
constexpr uint32_t CR8B = 0x2C1F1206u;
|
||||
constexpr uint32_t CRCF = 0x7F604B3Au;
|
||||
|
||||
// Extract all 8 nibbles from 4 bytes at once (shared across both pairs)
|
||||
const uint32_t lo = four_bytes & 0x0F0F0F0Fu;
|
||||
const uint32_t hi = (four_bytes >> 4) & 0x0F0F0F0Fu;
|
||||
|
||||
// Interleave: bytes 0-1 → sel0 [n0,n1,n2,n3], bytes 2-3 → sel1 [n4,n5,n6,n7]
|
||||
const uint32_t sel0 = __byte_perm(lo, hi, 0x5140u);
|
||||
const uint32_t sel1 = __byte_perm(lo, hi, 0x7362u);
|
||||
|
||||
// Lookup centroids for sel0 (elements from qs bytes 0-1)
|
||||
{
|
||||
const uint32_t flo = __byte_perm(CR03, CR47, sel0);
|
||||
const uint32_t fhi = __byte_perm(CR8B, CRCF, sel0);
|
||||
const uint32_t msb = (sel0 >> 3) & 0x01010101u;
|
||||
const uint32_t psel = 0x03020100u | (msb << 2);
|
||||
c0 = (int)__byte_perm(flo, fhi, psel);
|
||||
}
|
||||
|
||||
// Lookup centroids for sel1 (elements from qs bytes 2-3)
|
||||
{
|
||||
const uint32_t flo = __byte_perm(CR03, CR47, sel1);
|
||||
const uint32_t fhi = __byte_perm(CR8B, CRCF, sel1);
|
||||
const uint32_t msb = (sel1 >> 3) & 0x01010101u;
|
||||
const uint32_t psel = 0x03020100u | (msb << 2);
|
||||
c1 = (int)__byte_perm(flo, fhi, psel);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Pre-rotate activation to half (for TQ3_1S scalar path)
|
||||
// ============================================================================
|
||||
|
||||
// Rotates F32 activations in 32-element blocks and stores the result as half.
//
// Same rotation as the q8_1 variant in this file: per-lane sign (TQ_WEIGHT_SIGNS), 5-stage
// butterfly via warp shuffles, scale by 0.17677669529663688 (1/sqrt(32)); no quantization.
// threadIdx.x is the element within a block, threadIdx.y the block within the thread block.
// Shuffles use a full mask, so n_elements must be a multiple of 32.
static __global__ void tq_prerotate_activation(
        const float * __restrict__ src,
        half        * __restrict__ dst,
        const int                  n_elements) {

    const int tid  = threadIdx.x;
    const int iblk = blockIdx.x * blockDim.y + threadIdx.y;
    const int i    = iblk * 32 + tid;
    if (i >= n_elements) {
        return;
    }

    float r = src[i];
    r *= TQ_WEIGHT_SIGNS[tid];

#pragma unroll
    for (int stride = 1; stride < 32; stride <<= 1) {
        const float partner = __shfl_xor_sync(0xffffffff, r, stride);
        if (tid & stride) {
            r = partner - r;
        } else {
            r = r + partner;
        }
    }
    r *= 0.17677669529663688f;

    dst[i] = __float2half(r);
}
|
||||
|
||||
// Returns the 3-bit index of element `lane` from a packed qs array.
// Eight consecutive elements share 3 bytes (24 bits, little-endian); element e of such a
// group occupies bits [3e+2 : 3e].
static __device__ __forceinline__ uint8_t tq3_extract_index(const uint8_t * __restrict__ qs, int lane) {
    const uint8_t * triple = qs + (lane / 8) * 3;

    uint32_t word = triple[0];
    word |= (uint32_t)triple[1] << 8;
    word |= (uint32_t)triple[2] << 16;

    const int shift = (lane % 8) * 3;
    return (word >> shift) & 7;
}
|
||||
|
||||
// ============================================================================
|
||||
// Multi-token TQ4_1S dp4a kernel (ncols_dst ≤ 8)
|
||||
// Weight data loaded once per block, reused across all ncols_dst tokens.
|
||||
// ============================================================================
|
||||
|
||||
template <int ncols_dst>
|
||||
static __global__ void mul_mat_tq4_1s_dp4a_multi(
|
||||
const void * __restrict__ vx,
|
||||
const block_q8_1 * __restrict__ vy_q8,
|
||||
float * __restrict__ dst,
|
||||
const int ncols_x,
|
||||
const int nrows_x,
|
||||
const int stride_col_y,
|
||||
const int stride_col_dst) {
|
||||
|
||||
const int row = blockIdx.x * MMVQ_TQ_NWARPS + threadIdx.y;
|
||||
if (row >= nrows_x) return;
|
||||
|
||||
const int lane = threadIdx.x;
|
||||
const int blocks_per_row = ncols_x / QK_TQ4_1S;
|
||||
const block_tq4_1s * x_row = ((const block_tq4_1s *) vx) + (int64_t)row * blocks_per_row;
|
||||
|
||||
float sumf[ncols_dst] = {};
|
||||
|
||||
for (int ib = lane; ib < blocks_per_row; ib += WARP_SIZE) {
|
||||
const block_tq4_1s * blk = &x_row[ib];
|
||||
const float fd0 = __half2float(blk->d0);
|
||||
const float fd1 = __half2float(blk->d1);
|
||||
|
||||
// Load weight once, reuse across all tokens
|
||||
const uint32_t * qs32 = (const uint32_t *)(blk->qs);
|
||||
const uint32_t w0 = qs32[0], w1 = qs32[1], w2 = qs32[2], w3 = qs32[3];
|
||||
|
||||
int c0_0, c1_0, c0_1, c1_1, c0_2, c1_2, c0_3, c1_3;
|
||||
tq4_cents8_reg(w0, c0_0, c1_0);
|
||||
tq4_cents8_reg(w1, c0_1, c1_1);
|
||||
tq4_cents8_reg(w2, c0_2, c1_2);
|
||||
tq4_cents8_reg(w3, c0_3, c1_3);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; j++) {
|
||||
const block_q8_1 * a_blk = &vy_q8[j * stride_col_y + ib];
|
||||
const float d_act = __half2float((__half)a_blk->ds.x);
|
||||
const int * a_qs = (const int *)(a_blk->qs);
|
||||
|
||||
const int s0 = ggml_cuda_dp4a(c0_0, a_qs[0], ggml_cuda_dp4a(c1_0, a_qs[1],
|
||||
ggml_cuda_dp4a(c0_1, a_qs[2], ggml_cuda_dp4a(c1_1, a_qs[3], 0))));
|
||||
const int s1 = ggml_cuda_dp4a(c0_2, a_qs[4], ggml_cuda_dp4a(c1_2, a_qs[5],
|
||||
ggml_cuda_dp4a(c0_3, a_qs[6], ggml_cuda_dp4a(c1_3, a_qs[7], 0))));
|
||||
|
||||
sumf[j] += d_act * (fd0 * (float)s0 + fd1 * (float)s1);
|
||||
}
|
||||
}
|
||||
|
||||
// Apply centroid int8→float rescale + warp reduction
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; j++)
|
||||
sumf[j] *= TQ4_CENTROID_I8_RESCALE;
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = 16; offset > 0; offset >>= 1) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; j++)
|
||||
sumf[j] += __shfl_xor_sync(0xffffffff, sumf[j], offset);
|
||||
}
|
||||
|
||||
if (lane == 0) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; j++)
|
||||
dst[j * stride_col_dst + row] = sumf[j];
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Multi-token TQ3_1S scalar kernel (ncols_dst ≤ 8)
|
||||
// ============================================================================
|
||||
|
||||
template <int ncols_dst>
|
||||
static __global__ void mul_mat_tq3_1s_multi(
|
||||
const void * __restrict__ vx,
|
||||
const half * __restrict__ vy_rot,
|
||||
float * __restrict__ dst,
|
||||
const int ncols_x,
|
||||
const int nrows_x,
|
||||
const int stride_col_y,
|
||||
const int stride_col_dst) {
|
||||
|
||||
__shared__ float s_lut[8];
|
||||
if (threadIdx.y == 0 && threadIdx.x < 8) {
|
||||
s_lut[threadIdx.x] = TQ3_CENTROIDS_WEIGHT[threadIdx.x];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const int row = blockIdx.x * MMVQ_TQ_NWARPS + threadIdx.y;
|
||||
if (row >= nrows_x) return;
|
||||
|
||||
const int lane = threadIdx.x;
|
||||
const int blocks_per_row = ncols_x / QK_TQ3_0;
|
||||
const block_tq3_1s * x_row = ((const block_tq3_1s *) vx) + (int64_t)row * blocks_per_row;
|
||||
|
||||
float sumf[ncols_dst] = {};
|
||||
|
||||
for (int ib = 0; ib < blocks_per_row; ib++) {
|
||||
const float d = (lane < 16) ? __half2float(x_row[ib].d0) : __half2float(x_row[ib].d1);
|
||||
const uint8_t idx = tq3_extract_index(x_row[ib].qs, lane);
|
||||
const float w = s_lut[idx] * d;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; j++) {
|
||||
const float act = __half2float(vy_rot[j * stride_col_y + ib * QK_TQ3_0 + lane]);
|
||||
sumf[j] += act * w;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = 16; offset > 0; offset >>= 1) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; j++)
|
||||
sumf[j] += __shfl_xor_sync(0xffffffff, sumf[j], offset);
|
||||
}
|
||||
|
||||
if (lane == 0) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; j++)
|
||||
dst[j * stride_col_dst + row] = sumf[j];
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// TQ4_1S scalar/half kernel (AMD fallback — no dp4a)
|
||||
// Same pattern as TQ3_1S: pre-rotated half activations, scalar centroid lookup.
|
||||
// On RDNA4, sudot4 throughput differs from NVIDIA dp4a — this path is faster.
|
||||
// ============================================================================
|
||||
|
||||
template <int ncols_dst>
|
||||
static __global__ void mul_mat_tq4_1s_scalar_multi(
|
||||
const void * __restrict__ vx,
|
||||
const half * __restrict__ vy_rot,
|
||||
float * __restrict__ dst,
|
||||
const int ncols_x,
|
||||
const int nrows_x,
|
||||
const int stride_col_y,
|
||||
const int stride_col_dst) {
|
||||
|
||||
__shared__ float s_lut[16];
|
||||
if (threadIdx.y == 0 && threadIdx.x < 16) {
|
||||
s_lut[threadIdx.x] = TQ4_CENTROIDS_WEIGHT[threadIdx.x];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const int row = blockIdx.x * MMVQ_TQ_NWARPS + threadIdx.y;
|
||||
if (row >= nrows_x) return;
|
||||
|
||||
const int lane = threadIdx.x;
|
||||
const int blocks_per_row = ncols_x / QK_TQ4_1S;
|
||||
const block_tq4_1s * x_row = ((const block_tq4_1s *) vx) + (int64_t)row * blocks_per_row;
|
||||
|
||||
float sumf[ncols_dst] = {};
|
||||
|
||||
for (int ib = 0; ib < blocks_per_row; ib++) {
|
||||
const float d = (lane < 16) ? __half2float(x_row[ib].d0) : __half2float(x_row[ib].d1);
|
||||
const uint8_t idx = (x_row[ib].qs[lane / 2] >> ((lane & 1) * 4)) & 0xF;
|
||||
const float w = s_lut[idx] * d;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; j++) {
|
||||
const float act = __half2float(vy_rot[j * stride_col_y + ib * QK_TQ4_1S + lane]);
|
||||
sumf[j] += act * w;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = 16; offset > 0; offset >>= 1) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; j++)
|
||||
sumf[j] += __shfl_xor_sync(0xffffffff, sumf[j], offset);
|
||||
}
|
||||
|
||||
if (lane == 0) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; j++)
|
||||
dst[j * stride_col_dst + row] = sumf[j];
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Dispatch: ne[1]=1 (decode), ne[1]≤8 (multi-token dp4a / scalar)
|
||||
// ne[1]>8 handled by ggml_cuda_mul_mat_tq4_1s_cublas (runtime dequant + cuBLAS)
|
||||
// AMD: uses scalar half path for TQ4_1S (dp4a regresses on RDNA4)
|
||||
// ============================================================================
|
||||
|
||||
template <int ncols_dst>
|
||||
static void launch_tq4_1s_multi(
|
||||
const void * src0_d, const block_q8_1 * q8_buf,
|
||||
float * dst_d, int ncols_x, int nrows_x,
|
||||
int stride_col_y, int stride_col_dst, cudaStream_t stream) {
|
||||
const dim3 block(WARP_SIZE, MMVQ_TQ_NWARPS);
|
||||
const dim3 grid((nrows_x + MMVQ_TQ_NWARPS - 1) / MMVQ_TQ_NWARPS);
|
||||
mul_mat_tq4_1s_dp4a_multi<ncols_dst><<<grid, block, 0, stream>>>(
|
||||
src0_d, q8_buf, dst_d, ncols_x, nrows_x, stride_col_y, stride_col_dst);
|
||||
}
|
||||
|
||||
template <int ncols_dst>
|
||||
static void launch_tq4_1s_scalar_multi(
|
||||
const void * src0_d, const half * act_buf,
|
||||
float * dst_d, int ncols_x, int nrows_x,
|
||||
int stride_col_y, int stride_col_dst, cudaStream_t stream) {
|
||||
const dim3 block(WARP_SIZE, MMVQ_TQ_NWARPS);
|
||||
const dim3 grid((nrows_x + MMVQ_TQ_NWARPS - 1) / MMVQ_TQ_NWARPS);
|
||||
mul_mat_tq4_1s_scalar_multi<ncols_dst><<<grid, block, 0, stream>>>(
|
||||
src0_d, act_buf, dst_d, ncols_x, nrows_x, stride_col_y, stride_col_dst);
|
||||
}
|
||||
|
||||
template <int ncols_dst>
|
||||
static void launch_tq3_1s_multi(
|
||||
const void * src0_d, const half * act_buf,
|
||||
float * dst_d, int ncols_x, int nrows_x,
|
||||
int stride_col_y, int stride_col_dst, cudaStream_t stream) {
|
||||
const dim3 block(WARP_SIZE, MMVQ_TQ_NWARPS);
|
||||
const dim3 grid((nrows_x + MMVQ_TQ_NWARPS - 1) / MMVQ_TQ_NWARPS);
|
||||
mul_mat_tq3_1s_multi<ncols_dst><<<grid, block, 0, stream>>>(
|
||||
src0_d, act_buf, dst_d, ncols_x, nrows_x, stride_col_y, stride_col_dst);
|
||||
}
|
||||
|
||||
// Fused mul_mat for TQ4_1S / TQ3_1S weights (src0) with F32 activations (src1), F32 output.
//
// Work is enqueued on ctx.stream() in two phases:
//   1. Pre-rotate all activations in 32-element blocks into a pool-allocated scratch buffer:
//      q8_1 for the dp4a path, half for the scalar path.
//   2. Launch the multi-token kernel on at most 8 dst columns at a time.
//
// Path selection: TQ4_1S on non-AMD devices uses the dp4a kernel; everything else
// (TQ3_1S on any device, TQ4_1S on AMD) uses the scalar half kernels.
//
// Only ne[0] / ne[1] of the operands are consulted: src1 is read as a dense
// [ncols_x x ncols_dst] F32 matrix and dst is written as dense [nrows_x x ncols_dst].
// NOTE(review): higher dims and non-contiguous src1/dst are not handled here — presumably the
// caller guarantees 2D contiguous operands; confirm at the dispatch site.
//
// Both paths accept any ncols_dst. Previously the dp4a path only had cases for 1..8 columns
// and silently launched nothing for larger batches, leaving dst unwritten.
void ggml_cuda_mul_mat_tq(ggml_backend_cuda_context & ctx,
                          const ggml_tensor * src0,
                          const ggml_tensor * src1,
                          ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_TQ4_1S || src0->type == GGML_TYPE_TQ3_1S);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    const int ncols_x   = src0->ne[0];
    const int nrows_x   = src0->ne[1];
    const int ncols_dst = src1->ne[1];
    GGML_ASSERT(ncols_x % 32 == 0);

    // Nothing to compute; also avoids launching kernels with an empty grid.
    if (ncols_x == 0 || nrows_x == 0 || ncols_dst == 0) {
        return;
    }

    const void  * src0_d = src0->data;
    const float * src1_d = (const float *) src1->data;
    float       * dst_d  = (float *) dst->data;
    cudaStream_t  stream = ctx.stream();

    const int id = ggml_cuda_get_device();
    const int cc = ggml_cuda_info().devices[id].cc;

    const int n_total_elements = ncols_x * ncols_dst;
    const int n_total_blocks   = n_total_elements / 32;
    const int stride_col_dst   = nrows_x;

    const bool use_dp4a = !GGML_CUDA_CC_IS_AMD(cc) && src0->type == GGML_TYPE_TQ4_1S;

    // Pre-rotation launch geometry: one warp per 32-element block, 4 warps per thread block.
    const int  wpb = 4;
    const dim3 rot_block(32, wpb);
    const dim3 rot_grid((n_total_blocks + wpb - 1) / wpb);

    // The kernels are templated on the column count (1..8), so larger inputs are processed
    // in groups of 8 columns with a smaller final group.
    const int max_cols_per_launch = 8;

    // Expands to a switch over the runtime batch size that instantiates LAUNCH(N) for N = 1..8.
#define TQ_FOR_EACH_BATCH_SIZE(n, LAUNCH)                                      \
    switch (n) {                                                               \
        case 1: LAUNCH(1); break;                                              \
        case 2: LAUNCH(2); break;                                              \
        case 3: LAUNCH(3); break;                                              \
        case 4: LAUNCH(4); break;                                              \
        case 5: LAUNCH(5); break;                                              \
        case 6: LAUNCH(6); break;                                              \
        case 7: LAUNCH(7); break;                                              \
        case 8: LAUNCH(8); break;                                              \
        default: GGML_ASSERT(false && "unreachable batch size");               \
    }

    if (use_dp4a) {
        // NVIDIA TQ4_1S: dp4a int8 path (optimized for Turing+ dp4a throughput)
        ggml_cuda_pool_alloc<block_q8_1> q8_1_buf(ctx.pool(id), n_total_blocks);

        // Phase 1: Pre-rotate all tokens → q8_1
        tq_prerotate_q8_1<<<rot_grid, rot_block, 0, stream>>>(src1_d, q8_1_buf.get(), n_total_elements);

        // Phase 2: multi-token dp4a kernel, 8 columns per launch
        const int stride_col_y = ncols_x / 32; // q8_1 blocks per column

#define TQ_LAUNCH_DP4A(N) \
        launch_tq4_1s_multi<N>(src0_d, q8_j, dst_j, ncols_x, nrows_x, stride_col_y, stride_col_dst, stream)

        for (int j = 0; j < ncols_dst; j += max_cols_per_launch) {
            const int remaining = ncols_dst - j;
            const int batch     = remaining < max_cols_per_launch ? remaining : max_cols_per_launch;

            const block_q8_1 * q8_j  = q8_1_buf.get() + (int64_t) j * stride_col_y;
            float            * dst_j = dst_d + (int64_t) j * nrows_x;

            TQ_FOR_EACH_BATCH_SIZE(batch, TQ_LAUNCH_DP4A)
        }
#undef TQ_LAUNCH_DP4A
    } else {
        // Scalar half path: TQ3_1S (all vendors) + TQ4_1S on AMD (dp4a regresses on RDNA4)
        ggml_cuda_pool_alloc<half> act_buf(ctx.pool(id), n_total_elements);

        // Phase 1: Pre-rotate all tokens → half
        tq_prerotate_activation<<<rot_grid, rot_block, 0, stream>>>(src1_d, act_buf.get(), n_total_elements);

        // Phase 2: multi-token scalar kernel, 8 columns per launch
        const int  stride_col_y = ncols_x; // half elements per column
        const bool is_tq4       = (src0->type == GGML_TYPE_TQ4_1S);

#define TQ_LAUNCH_SCALAR(N)                                                                                           \
        if (is_tq4) {                                                                                                 \
            launch_tq4_1s_scalar_multi<N>(src0_d, act_j, dst_j, ncols_x, nrows_x, stride_col_y, stride_col_dst, stream); \
        } else {                                                                                                      \
            launch_tq3_1s_multi<N>(src0_d, act_j, dst_j, ncols_x, nrows_x, stride_col_y, stride_col_dst, stream);     \
        }

        for (int j = 0; j < ncols_dst; j += max_cols_per_launch) {
            const int remaining = ncols_dst - j;
            const int batch     = remaining < max_cols_per_launch ? remaining : max_cols_per_launch;

            const half * act_j = act_buf.get() + (int64_t) j * ncols_x;
            float      * dst_j = dst_d + (int64_t) j * nrows_x;

            TQ_FOR_EACH_BATCH_SIZE(batch, TQ_LAUNCH_SCALAR)
        }
#undef TQ_LAUNCH_SCALAR
    }

#undef TQ_FOR_EACH_BATCH_SIZE
}
|
||||
|
||||
|
||||
// ============================================================================
|
||||
// Load-time conversion: TQ4_1S → q8_0 (opt-in via GGML_TQ_CONVERT_Q8=1)
|
||||
// ============================================================================
|
||||
|
||||
// Converts TQ4_1S blocks to q8_0 blocks, one warp per 32-element block.
//
// Lane l dequantizes element l (centroid[index] * d0 or d1), the warp then applies the
// 5-stage butterfly, the 0.17677669529663688 (1/sqrt(32)) scale and the per-lane sign
// (TQ_WEIGHT_SIGNS) — the reverse order of the activation pre-rotation in this file — and
// finally requantizes to int8 with scale d = amax / 127 (written by lane 0).
//
// threadIdx.x is the element within a block, threadIdx.y the block within the thread block.
static __global__ void k_convert_tq4_1s_to_q8_0(
        const block_tq4_1s * __restrict__ src,
        block_q8_0         * __restrict__ dst,
        const int n_blocks) {

    const int iblk = blockIdx.x * blockDim.y + threadIdx.y;
    if (iblk >= n_blocks) {
        return; // uniform per warp: iblk depends only on threadIdx.y
    }
    const int tid = threadIdx.x;
    const block_tq4_1s & in = src[iblk];

    // dequantize this lane's element (still in the rotated domain)
    const float   d_half = (tid < 16) ? __half2float(in.d0) : __half2float(in.d1);
    const int     nib_sh = (tid & 1) * 4;
    const uint8_t idx    = (in.qs[tid / 2] >> nib_sh) & 0xF;
    float r = TQ4_CENTROIDS_WEIGHT[idx] * d_half;

    // rotate back: butterfly, normalize, sign
#pragma unroll
    for (int stride = 1; stride < 32; stride <<= 1) {
        const float partner = __shfl_xor_sync(0xffffffff, r, stride);
        if (tid & stride) {
            r = partner - r;
        } else {
            r = r + partner;
        }
    }
    r *= 0.17677669529663688f;
    r *= TQ_WEIGHT_SIGNS[tid];

    // block-wide max(|r|) for the q8_0 scale
    float blk_amax = fabsf(r);
#pragma unroll
    for (int stride = 16; stride > 0; stride >>= 1) {
        blk_amax = fmaxf(blk_amax, __shfl_xor_sync(0xffffffff, blk_amax, stride));
    }

    const float d  = blk_amax / 127.0f;
    const float id = (d > 0.0f) ? 127.0f / blk_amax : 0.0f; // all-zero block -> all-zero quants

    block_q8_0 & out = dst[iblk];
    out.qs[tid] = (int8_t)roundf(r * id);
    if (tid == 0) {
        out.d = __float2half(d);
    }
}
|
||||
|
||||
void ggml_cuda_convert_tq4_1s_to_q8_0(const void * src_tq4, void * dst_q8, int64_t n_elements, cudaStream_t stream) {
|
||||
GGML_ASSERT(n_elements % QK_TQ4_1S == 0);
|
||||
const int n_blocks = n_elements / QK_TQ4_1S;
|
||||
const int wpb = 4;
|
||||
const dim3 block(32, wpb);
|
||||
const dim3 grid((n_blocks + wpb - 1) / wpb);
|
||||
k_convert_tq4_1s_to_q8_0<<<grid, block, 0, stream>>>(
|
||||
(const block_tq4_1s *)src_tq4, (block_q8_0 *)dst_q8, n_blocks);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Large prefill: runtime TQ4_1S → fp16 dequant scratch + f32→fp16 activations + cuBLAS fp16 GEMM
|
||||
// Gets tensor core throughput without permanent 1.7× VRAM cost.
|
||||
// ============================================================================
|
||||
|
||||
// TQ4_1S mat-mul through cuBLAS: dequantizes the whole weight matrix and converts the
// activations to fp16 scratch buffers (pool-allocated, released on return), then runs one
// fp16 GEMM with fp32 compute/output on ctx.stream().
//
//   src0: TQ4_1S weights, ne[0] = K, ne[1] = M
//   src1: F32 activations, ne[0] = K, ne[1] = N
//   dst : F32, leading dimension dst->ne[0]
//
// Only ne[0] / ne[1] are consulted; both inputs are converted as flat arrays of K*M and K*N
// elements.
void ggml_cuda_mul_mat_tq4_1s_cublas(ggml_backend_cuda_context & ctx,
                                     const ggml_tensor * src0,
                                     const ggml_tensor * src1,
                                     ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_TQ4_1S);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0]; // K (hidden dim)
    const int64_t ne01 = src0->ne[1]; // M (rows = output features)
    const int64_t ne10 = src1->ne[0]; // K
    const int64_t ne11 = src1->ne[1]; // N (tokens)
    GGML_ASSERT(ne00 == ne10);

    const int    device = ggml_cuda_get_device();
    cudaStream_t stream = ctx.stream();

    const int64_t n_weight_elems = ne00 * ne01;
    const int64_t n_act_elems    = ne10 * ne11;

    // Weights: TQ4_1S → fp16 using the registered to-fp16 converter for the type.
    ggml_cuda_pool_alloc<half> weights_f16(ctx.pool(device), n_weight_elems);
    {
        const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(GGML_TYPE_TQ4_1S);
        GGML_ASSERT(to_fp16 != nullptr);
        to_fp16((const char *)src0->data, weights_f16.get(), n_weight_elems, stream);
    }

    // Activations: f32 → fp16.
    ggml_cuda_pool_alloc<half> acts_f16(ctx.pool(device), n_act_elems);
    {
        const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(GGML_TYPE_F32);
        GGML_ASSERT(to_fp16 != nullptr);
        to_fp16((const char *)src1->data, acts_f16.get(), n_act_elems, stream);
    }

    // GEMM: dst[M×N] = src0[M×K]^T × src1[K×N], fp16 inputs, fp32 compute and output.
    const float   alpha = 1.0f;
    const float   beta  = 0.0f;
    const int64_t ldc   = dst->ne[0]; // M

    cublasHandle_t handle = ctx.cublas_handle(device);
    CUBLAS_CHECK(cublasSetStream(handle, stream));
    CUBLAS_CHECK(
        cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                     ne01, ne11, ne00,
                     &alpha, weights_f16.get(),   CUDA_R_16F, ne00,
                             acts_f16.get(),      CUDA_R_16F, ne10,
                     &beta,  (float *)dst->data,  CUDA_R_32F, ldc,
                     CUBLAS_COMPUTE_32F,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
|
||||
@@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
// Fused TQ weight mul_mat: handles ne[1]=1 (decode) and ne[1]>1 (prefill/speculative)
|
||||
void ggml_cuda_mul_mat_tq(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
// Legacy single-token alias
|
||||
inline void ggml_cuda_mul_mat_vec_tq(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_mul_mat_tq(ctx, src0, src1, dst);
|
||||
}
|
||||
|
||||
// Large prefill: runtime TQ4_1S → fp16 dequant scratch + cuBLAS fp16 GEMM
|
||||
void ggml_cuda_mul_mat_tq4_1s_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
// Load-time conversion: TQ4_1S → q8_0 in VRAM (dequant + requantize)
|
||||
void ggml_cuda_convert_tq4_1s_to_q8_0(const void * src_tq4, void * dst_q8, int64_t n_elements, cudaStream_t stream);
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "set-rows.cuh"
|
||||
#include "cpy-utils.cuh"
|
||||
#include "turbo-quant.cuh"
|
||||
|
||||
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
|
||||
|
||||
@@ -209,6 +210,928 @@ static void set_rows_cuda(
|
||||
}
|
||||
}
|
||||
|
||||
// ---- TurboQuant3 set_rows: GROUP_SIZE-element groups with WHT rotation + norm correction ----
|
||||
//
|
||||
// Templated on GROUP_SIZE (128 or 64).
|
||||
// Parallel kernel: one CUDA block per group, GROUP_SIZE threads per block.
|
||||
// Thread j handles element j within the group.
|
||||
//
|
||||
// Steps (all parallel):
|
||||
// 1. Load element j from global memory
|
||||
// 2. Parallel L2 norm (warp reduce + inter-warp via shared memory)
|
||||
// 3. Normalize
|
||||
// 4. Forward WHT (log2(GROUP_SIZE) butterfly stages, shared memory)
|
||||
// 5. Quantize element j to 3-bit centroid index
|
||||
// 6. Pack qs (warp shuffle) and signs (__ballot_sync) into turbo3 block, no atomics
|
||||
// 7. Parallel reconstruction norm (same pattern as step 2)
|
||||
// 8. Write corrected norm (one thread per sub-block)
|
||||
|
||||
template <typename idx_t, int GROUP_SIZE>
|
||||
__launch_bounds__(128) // max of 128 or 64
|
||||
static __global__ void k_set_rows_turbo3(
|
||||
const float * __restrict__ src0,
|
||||
const idx_t * __restrict__ src1,
|
||||
block_turbo3_0 * __restrict__ dst,
|
||||
const int64_t ne00,
|
||||
const int64_t ne01,
|
||||
const int64_t ne10,
|
||||
const int64_t ne11,
|
||||
const int64_t ne12,
|
||||
const int64_t ne13,
|
||||
const int64_t s01,
|
||||
const int64_t s02,
|
||||
const int64_t s03,
|
||||
const int64_t s10,
|
||||
const int64_t s11,
|
||||
const int64_t s12,
|
||||
const int64_t s1,
|
||||
const int64_t s2,
|
||||
const int64_t s3) {
|
||||
|
||||
static_assert(GROUP_SIZE == 128 || GROUP_SIZE == 64, "GROUP_SIZE must be 128 or 64");
|
||||
|
||||
// blockIdx.x = flat group index; threadIdx.x = element within group (0..GROUP_SIZE-1)
|
||||
const int j = threadIdx.x;
|
||||
|
||||
// Decode blockIdx.x → (i_grp, i01, i02, i03)
|
||||
constexpr int blocks_per_group = GROUP_SIZE / QK_TURBO3;
|
||||
const int64_t n_groups_per_row = ne00 / GROUP_SIZE;
|
||||
const int64_t g = blockIdx.x;
|
||||
const int64_t i_grp = g % n_groups_per_row;
|
||||
int64_t tmp = g / n_groups_per_row;
|
||||
const int64_t i01 = tmp % ne01;
|
||||
tmp = tmp / ne01;
|
||||
const int64_t i02 = tmp % ne12;
|
||||
const int64_t i03 = tmp / ne12;
|
||||
|
||||
const int64_t i12 = i02;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
const int64_t i10 = i01;
|
||||
|
||||
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
|
||||
const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03;
|
||||
block_turbo3_0 * dst_row_ptr = (block_turbo3_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
|
||||
block_turbo3_0 * blk_base = dst_row_ptr + i_grp * blocks_per_group;
|
||||
|
||||
// ---- Step 1: Load element j (coalesced) ----
|
||||
__shared__ float x[GROUP_SIZE];
|
||||
x[j] = src_row[i_grp * GROUP_SIZE + j];
|
||||
__syncthreads();
|
||||
|
||||
// ---- InnerQ: calibrate on original (unscaled) values ----
|
||||
if (d_innerq_calibrating) {
|
||||
atomicAdd(&d_innerq_sq_accum[j], x[j] * x[j]);
|
||||
if (j == 0) atomicAdd(&d_innerq_count, 1);
|
||||
}
|
||||
|
||||
// ---- InnerQ: apply channel scale (only when active) ----
|
||||
if (d_innerq_active) {
|
||||
x[j] *= d_innerq_scale[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// ---- Step 2: Parallel L2 norm ----
|
||||
constexpr int n_warps = GROUP_SIZE / WARP_SIZE;
|
||||
__shared__ float warp_accum[n_warps];
|
||||
float v = x[j];
|
||||
float v2 = v * v;
|
||||
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
|
||||
v2 += __shfl_xor_sync(0xffffffff, v2, offset);
|
||||
if (j % WARP_SIZE == 0)
|
||||
warp_accum[j / WARP_SIZE] = v2;
|
||||
__syncthreads();
|
||||
|
||||
__shared__ float s_norm_sq;
|
||||
if (j == 0) {
|
||||
float total = 0.0f;
|
||||
for (int w = 0; w < n_warps; w++) total += warp_accum[w];
|
||||
s_norm_sq = total;
|
||||
}
|
||||
__syncthreads();
|
||||
const float grp_norm = sqrtf(s_norm_sq);
|
||||
const float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f;
|
||||
|
||||
// ---- Step 3: Normalize ----
|
||||
x[j] *= inv_norm;
|
||||
__syncthreads();
|
||||
|
||||
// ---- Step 4: Forward WHT (signs1 → butterfly → signs2, normalized) ----
|
||||
if (GROUP_SIZE == 128) {
|
||||
x[j] *= TURBO_WHT_SIGNS1[j];
|
||||
} else {
|
||||
x[j] *= TURBO_WHT_SIGNS1_64[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
#define WHT_STAGE_SHARED(h) \
|
||||
if (j % (2*(h)) < (h)) { float a = x[j], b = x[j+(h)]; x[j] = a+b; x[j+(h)] = a-b; } \
|
||||
__syncthreads();
|
||||
|
||||
// Butterfly stages: loop from h=1 to h<GROUP_SIZE, doubling each time
|
||||
WHT_STAGE_SHARED(1)
|
||||
WHT_STAGE_SHARED(2)
|
||||
WHT_STAGE_SHARED(4)
|
||||
WHT_STAGE_SHARED(8)
|
||||
WHT_STAGE_SHARED(16)
|
||||
WHT_STAGE_SHARED(32)
|
||||
if (GROUP_SIZE == 128) { WHT_STAGE_SHARED(64) }
|
||||
#undef WHT_STAGE_SHARED
|
||||
|
||||
constexpr float inv_sqrt_group = (GROUP_SIZE == 128) ? 0.08838834764831845f : 0.125f;
|
||||
if (GROUP_SIZE == 128) {
|
||||
x[j] = x[j] * inv_sqrt_group * TURBO_WHT_SIGNS2[j];
|
||||
} else {
|
||||
x[j] = x[j] * inv_sqrt_group * TURBO_WHT_SIGNS2_64[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// ---- Step 5: Quantize element j ----
|
||||
const float rv = x[j];
|
||||
const uint8_t idx = turbo_nearest_centroid_3bit(rv);
|
||||
|
||||
// ---- Step 6: Pack qs and signs (warp-cooperative, no atomics) ----
|
||||
// Each warp handles 32 elements. With QK_TURBO3 > WARP_SIZE, multiple warps
|
||||
// share one block and write to different byte offsets within it.
|
||||
const int warp_id = j / WARP_SIZE;
|
||||
const int lane = j % WARP_SIZE;
|
||||
const int elem_in_block = j % QK_TURBO3;
|
||||
block_turbo3_0 * blk = blk_base + (j / QK_TURBO3);
|
||||
|
||||
// Pack qs: 4 elements per byte, 2 bits each.
|
||||
// All 4 threads in a qs-group gather their low2 bits via shuffle.
|
||||
const int qs_byte_idx = elem_in_block / 4;
|
||||
const uint8_t my_low2 = idx & 0x3;
|
||||
uint8_t qs_byte = 0;
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 4; k++) {
|
||||
uint8_t contrib = __shfl_sync(0xffffffff, my_low2, (lane & ~3) + k);
|
||||
qs_byte |= contrib << (k * 2);
|
||||
}
|
||||
if (lane % 4 == 0) blk->qs[qs_byte_idx] = qs_byte;
|
||||
|
||||
// Pack signs: 8 elements per byte, 1 bit each. __ballot_sync across warp.
|
||||
// Ballot is per-warp (32 bits); extract local byte, write to global position in block.
|
||||
const uint32_t ballot = __ballot_sync(0xffffffff, (idx >> 2) & 1);
|
||||
const int local_signs_byte = lane / 8; // byte within 32-bit ballot (0..3)
|
||||
const int global_signs_byte = elem_in_block / 8; // byte within block's signs array
|
||||
const uint8_t signs_byte = (uint8_t)((ballot >> (local_signs_byte * 8)) & 0xFF);
|
||||
if (lane % 8 == 0) blk->signs[global_signs_byte] = signs_byte;
|
||||
|
||||
// ---- Step 7: Reconstruction norm (parallel, same pattern as step 2) ----
|
||||
const float c = TURBO_CENTROIDS_3BIT[idx];
|
||||
float rc = c * c;
|
||||
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
|
||||
rc += __shfl_xor_sync(0xffffffff, rc, offset);
|
||||
if (j % WARP_SIZE == 0)
|
||||
warp_accum[j / WARP_SIZE] = rc;
|
||||
__syncthreads();
|
||||
|
||||
__shared__ float s_recon_sq;
|
||||
if (j == 0) {
|
||||
float total = 0.0f;
|
||||
for (int w = 0; w < n_warps; w++) total += warp_accum[w];
|
||||
s_recon_sq = total;
|
||||
}
|
||||
__syncthreads();
|
||||
const float recon_norm = sqrtf(s_recon_sq);
|
||||
const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm;
|
||||
|
||||
// ---- Step 8: Write corrected norm (one per turbo3 block) ----
|
||||
if (elem_in_block == 0) blk->norm = __float2half(corrected_norm);
|
||||
|
||||
GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne13);
|
||||
}
|
||||
|
||||
// ---- TurboQuant3 tail kernel: straight 3-bit quantize without WHT rotation ----
|
||||
//
|
||||
// For head dims not divisible by 128 (e.g. 576 = 4*128 + 64), the remainder
|
||||
// elements can't use the 128-element WHT. They are quantised directly into
|
||||
// standard turbo3 blocks. Q is also NOT rotated for these positions (the graph
|
||||
// guards on ne[0] % 128), so <Q_tail, K_tail> stays in the original space.
|
||||
//
|
||||
// One CUDA block per row, with tail_size threads (must be multiple of 32).
|
||||
|
||||
template <typename idx_t>
|
||||
static __global__ void k_set_rows_turbo3_tail(
|
||||
const float * __restrict__ src0,
|
||||
const idx_t * __restrict__ src1,
|
||||
block_turbo3_0 * __restrict__ dst,
|
||||
const int64_t ne00,
|
||||
const int64_t ne01,
|
||||
const int64_t ne10,
|
||||
const int64_t ne11,
|
||||
const int64_t ne12,
|
||||
const int64_t ne13,
|
||||
const int64_t s01,
|
||||
const int64_t s02,
|
||||
const int64_t s03,
|
||||
const int64_t s10,
|
||||
const int64_t s11,
|
||||
const int64_t s12,
|
||||
const int64_t s1,
|
||||
const int64_t s2,
|
||||
const int64_t s3,
|
||||
const int tail_size) {
|
||||
|
||||
const int j = threadIdx.x; // 0 .. tail_size-1
|
||||
|
||||
// Decode blockIdx.x → (i01, i02, i03)
|
||||
int64_t tmp = blockIdx.x;
|
||||
const int64_t i01 = tmp % ne01; tmp /= ne01;
|
||||
const int64_t i02 = tmp % ne12;
|
||||
const int64_t i03 = tmp / ne12;
|
||||
|
||||
const int64_t i11 = i01 % ne11;
|
||||
const int64_t i10 = i01;
|
||||
const int64_t i12 = i02;
|
||||
|
||||
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
|
||||
const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03;
|
||||
block_turbo3_0 * dst_row_ptr = (block_turbo3_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
|
||||
|
||||
// Tail starts after all full 128-element groups
|
||||
const int64_t n_full = ne00 / QK_TURBO3_GROUP;
|
||||
const int64_t tail_start = n_full * QK_TURBO3_GROUP;
|
||||
block_turbo3_0 * blk_base = dst_row_ptr + n_full * (QK_TURBO3_GROUP / QK_TURBO3);
|
||||
|
||||
// ---- Load ----
|
||||
const float val = src_row[tail_start + j];
|
||||
|
||||
// ---- L2 norm over the tail group (warp reduce + inter-warp) ----
|
||||
const int n_warps = tail_size / WARP_SIZE;
|
||||
const int warp_id = j / WARP_SIZE;
|
||||
const int lane = j % WARP_SIZE;
|
||||
|
||||
__shared__ float warp_accum[4]; // max 3 warps (tail ≤ 96)
|
||||
float v2 = val * val;
|
||||
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
|
||||
v2 += __shfl_xor_sync(0xffffffff, v2, offset);
|
||||
if (lane == 0) warp_accum[warp_id] = v2;
|
||||
__syncthreads();
|
||||
|
||||
__shared__ float s_norm_sq;
|
||||
if (j == 0) {
|
||||
float total = 0.0f;
|
||||
for (int w = 0; w < n_warps; w++) total += warp_accum[w];
|
||||
s_norm_sq = total;
|
||||
}
|
||||
__syncthreads();
|
||||
const float grp_norm = sqrtf(s_norm_sq);
|
||||
const float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f;
|
||||
|
||||
// ---- Normalize (no WHT!) ----
|
||||
const float rv = val * inv_norm;
|
||||
|
||||
// ---- Quantize ----
|
||||
const uint8_t idx = turbo_nearest_centroid_3bit(rv);
|
||||
|
||||
// ---- Pack qs and signs (same warp-cooperative logic) ----
|
||||
block_turbo3_0 * blk = blk_base + warp_id;
|
||||
|
||||
const uint8_t my_low2 = idx & 0x3;
|
||||
uint8_t qs_byte = 0;
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 4; k++) {
|
||||
uint8_t contrib = __shfl_sync(0xffffffff, my_low2, (lane & ~3) + k);
|
||||
qs_byte |= contrib << (k * 2);
|
||||
}
|
||||
if (lane % 4 == 0) blk->qs[lane / 4] = qs_byte;
|
||||
|
||||
const uint32_t ballot = __ballot_sync(0xffffffff, (idx >> 2) & 1);
|
||||
const int signs_byte_idx = lane / 8;
|
||||
const uint8_t signs_byte = (uint8_t)((ballot >> (signs_byte_idx * 8)) & 0xFF);
|
||||
if (lane % 8 == 0) blk->signs[signs_byte_idx] = signs_byte;
|
||||
|
||||
// ---- Reconstruction norm ----
|
||||
const float c = TURBO_CENTROIDS_3BIT[idx];
|
||||
float rc = c * c;
|
||||
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
|
||||
rc += __shfl_xor_sync(0xffffffff, rc, offset);
|
||||
if (lane == 0) warp_accum[warp_id] = rc;
|
||||
__syncthreads();
|
||||
|
||||
__shared__ float s_recon_sq;
|
||||
if (j == 0) {
|
||||
float total = 0.0f;
|
||||
for (int w = 0; w < n_warps; w++) total += warp_accum[w];
|
||||
s_recon_sq = total;
|
||||
}
|
||||
__syncthreads();
|
||||
const float recon_norm = sqrtf(s_recon_sq);
|
||||
const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm;
|
||||
|
||||
if (lane == 0) blk->norm = __float2half(corrected_norm);
|
||||
|
||||
GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne13);
|
||||
}
|
||||
|
||||
template<typename idx_t>
|
||||
static void set_rows_cuda_turbo3(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0,
|
||||
const ggml_tensor * src1,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const idx_t * src1_d = (const idx_t *)src1->data;
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
GGML_ASSERT(ne00 % QK_TURBO3 == 0); // must be block-aligned (32)
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
// Read WHT group size from op_params (set by llama-kv-cache.cpp based on head_dim).
|
||||
// Default to 128 if not set (backward compat with head_dim=128 models).
|
||||
int group_size = 128;
|
||||
memcpy(&group_size, dst->op_params, sizeof(int));
|
||||
if (group_size != 64 && group_size != 128) group_size = 128;
|
||||
GGML_ASSERT(ne00 % group_size == 0);
|
||||
|
||||
const int64_t n_full_groups = ne00 / group_size;
|
||||
const int tail_size = (int)(ne00 % group_size);
|
||||
|
||||
const int64_t s01 = nb01/sizeof(float);
|
||||
const int64_t s02 = nb02/sizeof(float);
|
||||
const int64_t s03 = nb03/sizeof(float);
|
||||
const int64_t s10 = nb10/sizeof(idx_t);
|
||||
const int64_t s11 = nb11/sizeof(idx_t);
|
||||
const int64_t s12 = nb12/sizeof(idx_t);
|
||||
|
||||
// InnerQ: check/finalize calibration before kernel launch
|
||||
turbo_innerq_check_finalize(group_size, ne00);
|
||||
|
||||
// Launch 1: full groups with WHT rotation
|
||||
if (n_full_groups > 0) {
|
||||
const int64_t ne_total = n_full_groups * ne01 * ne02 * ne03;
|
||||
if (group_size == 128) {
|
||||
k_set_rows_turbo3<idx_t, 128><<<(int)ne_total, 128, 0, stream>>>(
|
||||
src0_d, src1_d, (block_turbo3_0 *)dst->data,
|
||||
ne00, ne01, ne10, ne11, ne12, ne13,
|
||||
s01, s02, s03, s10, s11, s12,
|
||||
nb1, nb2, nb3);
|
||||
} else {
|
||||
k_set_rows_turbo3<idx_t, 64><<<(int)ne_total, 64, 0, stream>>>(
|
||||
src0_d, src1_d, (block_turbo3_0 *)dst->data,
|
||||
ne00, ne01, ne10, ne11, ne12, ne13,
|
||||
s01, s02, s03, s10, s11, s12,
|
||||
nb1, nb2, nb3);
|
||||
}
|
||||
}
|
||||
|
||||
// Launch 2: tail elements (no WHT, straight quantize)
|
||||
// Not needed for 64-aligned dims but kept for potential future use
|
||||
if (tail_size > 0) {
|
||||
GGML_ASSERT(tail_size % QK_TURBO3 == 0); // tail must be block-aligned
|
||||
const int64_t n_rows = ne01 * ne02 * ne03;
|
||||
k_set_rows_turbo3_tail<idx_t><<<(int)n_rows, tail_size, 0, stream>>>(
|
||||
src0_d, src1_d, (block_turbo3_0 *)dst->data,
|
||||
ne00, ne01, ne10, ne11, ne12, ne13,
|
||||
s01, s02, s03, s10, s11, s12,
|
||||
nb1, nb2, nb3, tail_size);
|
||||
}
|
||||
}
|
||||
|
||||
// ---- TurboQuant2 set_rows: GROUP_SIZE-element groups with WHT rotation + norm correction ----
|
||||
//
|
||||
// Same structure as turbo3 but 2-bit quantization only (no signs byte).
|
||||
|
||||
template <typename idx_t, int GROUP_SIZE>
|
||||
__launch_bounds__(128)
|
||||
static __global__ void k_set_rows_turbo2(
|
||||
const float * __restrict__ src0,
|
||||
const idx_t * __restrict__ src1,
|
||||
block_turbo2_0 * __restrict__ dst,
|
||||
const int64_t ne00,
|
||||
const int64_t ne01,
|
||||
const int64_t ne10,
|
||||
const int64_t ne11,
|
||||
const int64_t ne12,
|
||||
const int64_t ne13,
|
||||
const int64_t s01,
|
||||
const int64_t s02,
|
||||
const int64_t s03,
|
||||
const int64_t s10,
|
||||
const int64_t s11,
|
||||
const int64_t s12,
|
||||
const int64_t s1,
|
||||
const int64_t s2,
|
||||
const int64_t s3) {
|
||||
|
||||
static_assert(GROUP_SIZE == 128 || GROUP_SIZE == 64, "GROUP_SIZE must be 128 or 64");
|
||||
|
||||
const int j = threadIdx.x;
|
||||
|
||||
constexpr int blocks_per_group = GROUP_SIZE / QK_TURBO2;
|
||||
const int64_t n_groups_per_row = ne00 / GROUP_SIZE;
|
||||
const int64_t g = blockIdx.x;
|
||||
const int64_t i_grp = g % n_groups_per_row;
|
||||
int64_t tmp = g / n_groups_per_row;
|
||||
const int64_t i01 = tmp % ne01;
|
||||
tmp = tmp / ne01;
|
||||
const int64_t i02 = tmp % ne12;
|
||||
const int64_t i03 = tmp / ne12;
|
||||
|
||||
const int64_t i12 = i02;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
const int64_t i10 = i01;
|
||||
|
||||
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
|
||||
const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03;
|
||||
block_turbo2_0 * dst_row_ptr = (block_turbo2_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
|
||||
block_turbo2_0 * blk_base = dst_row_ptr + i_grp * blocks_per_group;
|
||||
|
||||
// ---- Step 1: Load element j (coalesced) ----
|
||||
__shared__ float x[GROUP_SIZE];
|
||||
x[j] = src_row[i_grp * GROUP_SIZE + j];
|
||||
__syncthreads();
|
||||
|
||||
// ---- InnerQ: calibrate on original (unscaled) values ----
|
||||
if (d_innerq_calibrating) {
|
||||
atomicAdd(&d_innerq_sq_accum[j], x[j] * x[j]);
|
||||
if (j == 0) atomicAdd(&d_innerq_count, 1);
|
||||
}
|
||||
|
||||
// ---- InnerQ: apply channel scale (only when active) ----
|
||||
if (d_innerq_active) {
|
||||
x[j] *= d_innerq_scale[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// ---- Step 2: Parallel L2 norm ----
|
||||
constexpr int n_warps = GROUP_SIZE / WARP_SIZE;
|
||||
__shared__ float warp_accum[n_warps];
|
||||
float v = x[j];
|
||||
float v2 = v * v;
|
||||
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
|
||||
v2 += __shfl_xor_sync(0xffffffff, v2, offset);
|
||||
if (j % WARP_SIZE == 0)
|
||||
warp_accum[j / WARP_SIZE] = v2;
|
||||
__syncthreads();
|
||||
|
||||
__shared__ float s_norm_sq;
|
||||
if (j == 0) {
|
||||
float total = 0.0f;
|
||||
for (int w = 0; w < n_warps; w++) total += warp_accum[w];
|
||||
s_norm_sq = total;
|
||||
}
|
||||
__syncthreads();
|
||||
const float grp_norm = sqrtf(s_norm_sq);
|
||||
const float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f;
|
||||
|
||||
// ---- Step 3: Normalize ----
|
||||
x[j] *= inv_norm;
|
||||
__syncthreads();
|
||||
|
||||
// ---- Step 4: Forward WHT ----
|
||||
if (GROUP_SIZE == 128) {
|
||||
x[j] *= TURBO_WHT_SIGNS1[j];
|
||||
} else {
|
||||
x[j] *= TURBO_WHT_SIGNS1_64[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
#define WHT_STAGE_SHARED_T2(h) \
|
||||
if (j % (2*(h)) < (h)) { float a = x[j], b = x[j+(h)]; x[j] = a+b; x[j+(h)] = a-b; } \
|
||||
__syncthreads();
|
||||
|
||||
WHT_STAGE_SHARED_T2(1)
|
||||
WHT_STAGE_SHARED_T2(2)
|
||||
WHT_STAGE_SHARED_T2(4)
|
||||
WHT_STAGE_SHARED_T2(8)
|
||||
WHT_STAGE_SHARED_T2(16)
|
||||
WHT_STAGE_SHARED_T2(32)
|
||||
if (GROUP_SIZE == 128) { WHT_STAGE_SHARED_T2(64) }
|
||||
#undef WHT_STAGE_SHARED_T2
|
||||
|
||||
constexpr float inv_sqrt_group = (GROUP_SIZE == 128) ? 0.08838834764831845f : 0.125f;
|
||||
if (GROUP_SIZE == 128) {
|
||||
x[j] = x[j] * inv_sqrt_group * TURBO_WHT_SIGNS2[j];
|
||||
} else {
|
||||
x[j] = x[j] * inv_sqrt_group * TURBO_WHT_SIGNS2_64[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// ---- Step 5: Quantize element j to 2-bit centroid ----
|
||||
const float rv = x[j];
|
||||
const uint8_t idx = turbo_nearest_centroid_2bit(rv);
|
||||
|
||||
// ---- Step 6: Pack qs (warp-cooperative, no atomics) ----
|
||||
// Each warp handles 32 elements. With QK_TURBO2 > WARP_SIZE, multiple warps
|
||||
// share one block and write to different byte offsets within it.
|
||||
const int warp_id = j / WARP_SIZE;
|
||||
const int lane = j % WARP_SIZE;
|
||||
const int elem_in_block = j % QK_TURBO2;
|
||||
block_turbo2_0 * blk = blk_base + (j / QK_TURBO2);
|
||||
|
||||
// Pack qs: 4 elements per byte, 2 bits each.
|
||||
const uint8_t my_bits = idx & 0x3;
|
||||
uint8_t qs_byte = 0;
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 4; k++) {
|
||||
uint8_t contrib = __shfl_sync(0xffffffff, my_bits, (lane & ~3) + k);
|
||||
qs_byte |= contrib << (k * 2);
|
||||
}
|
||||
if (lane % 4 == 0) blk->qs[elem_in_block / 4] = qs_byte;
|
||||
|
||||
// No signs packing needed for turbo2
|
||||
|
||||
// ---- Step 7: Reconstruction norm ----
|
||||
const float c = TURBO_CENTROIDS_2BIT[idx];
|
||||
float rc = c * c;
|
||||
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
|
||||
rc += __shfl_xor_sync(0xffffffff, rc, offset);
|
||||
if (j % WARP_SIZE == 0)
|
||||
warp_accum[j / WARP_SIZE] = rc;
|
||||
__syncthreads();
|
||||
|
||||
__shared__ float s_recon_sq;
|
||||
if (j == 0) {
|
||||
float total = 0.0f;
|
||||
for (int w = 0; w < n_warps; w++) total += warp_accum[w];
|
||||
s_recon_sq = total;
|
||||
}
|
||||
__syncthreads();
|
||||
const float recon_norm = sqrtf(s_recon_sq);
|
||||
const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm;
|
||||
|
||||
// ---- Step 8: Write corrected norm (one per turbo2 block) ----
|
||||
if (elem_in_block == 0) blk->norm = __float2half(corrected_norm);
|
||||
|
||||
GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne13);
|
||||
}
|
||||
|
||||
// ---- TurboQuant2 tail kernel: straight 2-bit quantize without WHT rotation ----
|
||||
|
||||
template <typename idx_t>
|
||||
static __global__ void k_set_rows_turbo2_tail(
|
||||
const float * __restrict__ src0,
|
||||
const idx_t * __restrict__ src1,
|
||||
block_turbo2_0 * __restrict__ dst,
|
||||
const int64_t ne00,
|
||||
const int64_t ne01,
|
||||
const int64_t ne10,
|
||||
const int64_t ne11,
|
||||
const int64_t ne12,
|
||||
const int64_t ne13,
|
||||
const int64_t s01,
|
||||
const int64_t s02,
|
||||
const int64_t s03,
|
||||
const int64_t s10,
|
||||
const int64_t s11,
|
||||
const int64_t s12,
|
||||
const int64_t s1,
|
||||
const int64_t s2,
|
||||
const int64_t s3,
|
||||
const int tail_size) {
|
||||
|
||||
const int j = threadIdx.x;
|
||||
|
||||
int64_t tmp = blockIdx.x;
|
||||
const int64_t i01 = tmp % ne01; tmp /= ne01;
|
||||
const int64_t i02 = tmp % ne12;
|
||||
const int64_t i03 = tmp / ne12;
|
||||
|
||||
const int64_t i11 = i01 % ne11;
|
||||
const int64_t i10 = i01;
|
||||
const int64_t i12 = i02;
|
||||
|
||||
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
|
||||
const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03;
|
||||
block_turbo2_0 * dst_row_ptr = (block_turbo2_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
|
||||
|
||||
const int64_t n_full = ne00 / QK_TURBO2_GROUP;
|
||||
const int64_t tail_start = n_full * QK_TURBO2_GROUP;
|
||||
block_turbo2_0 * blk_base = dst_row_ptr + n_full * (QK_TURBO2_GROUP / QK_TURBO2);
|
||||
|
||||
// ---- Load ----
|
||||
const float val = src_row[tail_start + j];
|
||||
|
||||
// ---- L2 norm ----
|
||||
const int n_warps = tail_size / WARP_SIZE;
|
||||
const int warp_id = j / WARP_SIZE;
|
||||
const int lane = j % WARP_SIZE;
|
||||
|
||||
__shared__ float warp_accum[4];
|
||||
float v2 = val * val;
|
||||
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
|
||||
v2 += __shfl_xor_sync(0xffffffff, v2, offset);
|
||||
if (lane == 0) warp_accum[warp_id] = v2;
|
||||
__syncthreads();
|
||||
|
||||
__shared__ float s_norm_sq;
|
||||
if (j == 0) {
|
||||
float total = 0.0f;
|
||||
for (int w = 0; w < n_warps; w++) total += warp_accum[w];
|
||||
s_norm_sq = total;
|
||||
}
|
||||
__syncthreads();
|
||||
const float grp_norm = sqrtf(s_norm_sq);
|
||||
const float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f;
|
||||
|
||||
// ---- Normalize (no WHT!) ----
|
||||
const float rv = val * inv_norm;
|
||||
|
||||
// ---- Quantize ----
|
||||
const uint8_t idx = turbo_nearest_centroid_2bit(rv);
|
||||
|
||||
// ---- Pack qs ----
|
||||
block_turbo2_0 * blk = blk_base + warp_id;
|
||||
|
||||
const uint8_t my_bits = idx & 0x3;
|
||||
uint8_t qs_byte = 0;
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 4; k++) {
|
||||
uint8_t contrib = __shfl_sync(0xffffffff, my_bits, (lane & ~3) + k);
|
||||
qs_byte |= contrib << (k * 2);
|
||||
}
|
||||
if (lane % 4 == 0) blk->qs[lane / 4] = qs_byte;
|
||||
|
||||
// ---- Reconstruction norm ----
|
||||
const float c = TURBO_CENTROIDS_2BIT[idx];
|
||||
float rc = c * c;
|
||||
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
|
||||
rc += __shfl_xor_sync(0xffffffff, rc, offset);
|
||||
if (lane == 0) warp_accum[warp_id] = rc;
|
||||
__syncthreads();
|
||||
|
||||
__shared__ float s_recon_sq;
|
||||
if (j == 0) {
|
||||
float total = 0.0f;
|
||||
for (int w = 0; w < n_warps; w++) total += warp_accum[w];
|
||||
s_recon_sq = total;
|
||||
}
|
||||
__syncthreads();
|
||||
const float recon_norm = sqrtf(s_recon_sq);
|
||||
const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm;
|
||||
|
||||
if (lane == 0) blk->norm = __float2half(corrected_norm);
|
||||
|
||||
GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(ne00);
|
||||
}
|
||||
|
||||
template<typename idx_t>
|
||||
static void set_rows_cuda_turbo2(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0,
|
||||
const ggml_tensor * src1,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const idx_t * src1_d = (const idx_t *)src1->data;
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
GGML_ASSERT(ne00 % QK_TURBO2 == 0);
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
int group_size = 128;
|
||||
memcpy(&group_size, dst->op_params, sizeof(int));
|
||||
if (group_size != 64 && group_size != 128) group_size = 128;
|
||||
GGML_ASSERT(ne00 % group_size == 0);
|
||||
|
||||
const int64_t n_full_groups = ne00 / group_size;
|
||||
const int tail_size = (int)(ne00 % group_size);
|
||||
|
||||
const int64_t s01 = nb01/sizeof(float);
|
||||
const int64_t s02 = nb02/sizeof(float);
|
||||
const int64_t s03 = nb03/sizeof(float);
|
||||
const int64_t s10 = nb10/sizeof(idx_t);
|
||||
const int64_t s11 = nb11/sizeof(idx_t);
|
||||
const int64_t s12 = nb12/sizeof(idx_t);
|
||||
|
||||
// InnerQ: check/finalize calibration before kernel launch
|
||||
turbo_innerq_check_finalize(group_size, ne00);
|
||||
|
||||
if (n_full_groups > 0) {
|
||||
const int64_t ne_total = n_full_groups * ne01 * ne02 * ne03;
|
||||
if (group_size == 128) {
|
||||
k_set_rows_turbo2<idx_t, 128><<<(int)ne_total, 128, 0, stream>>>(
|
||||
src0_d, src1_d, (block_turbo2_0 *)dst->data,
|
||||
ne00, ne01, ne10, ne11, ne12, ne13,
|
||||
s01, s02, s03, s10, s11, s12,
|
||||
nb1, nb2, nb3);
|
||||
} else {
|
||||
k_set_rows_turbo2<idx_t, 64><<<(int)ne_total, 64, 0, stream>>>(
|
||||
src0_d, src1_d, (block_turbo2_0 *)dst->data,
|
||||
ne00, ne01, ne10, ne11, ne12, ne13,
|
||||
s01, s02, s03, s10, s11, s12,
|
||||
nb1, nb2, nb3);
|
||||
}
|
||||
}
|
||||
|
||||
if (tail_size > 0) {
|
||||
GGML_ASSERT(tail_size % QK_TURBO2 == 0);
|
||||
const int64_t n_rows = ne01 * ne02 * ne03;
|
||||
k_set_rows_turbo2_tail<idx_t><<<(int)n_rows, tail_size, 0, stream>>>(
|
||||
src0_d, src1_d, (block_turbo2_0 *)dst->data,
|
||||
ne00, ne01, ne10, ne11, ne12, ne13,
|
||||
s01, s02, s03, s10, s11, s12,
|
||||
nb1, nb2, nb3, tail_size);
|
||||
}
|
||||
}
|
||||
|
||||
// ---- TurboQuant4 set_rows: 128-element groups with WHT rotation + 4-bit quantization ----
|
||||
//
|
||||
// turbo4 block size IS the WHT group size (128), so 1 CUDA block = 1 turbo4 block.
|
||||
// 128 threads per block, thread j handles element j.
|
||||
// 4-bit centroids (16 values), nibble packed: qs[j/2] |= (idx & 0xF) << ((j%2)*4)
|
||||
|
||||
template <typename idx_t>
|
||||
__launch_bounds__(128)
|
||||
static __global__ void k_set_rows_turbo4(
|
||||
const float * __restrict__ src0,
|
||||
const idx_t * __restrict__ src1,
|
||||
block_turbo4_0 * __restrict__ dst,
|
||||
const int64_t ne00,
|
||||
const int64_t ne01,
|
||||
const int64_t ne10,
|
||||
const int64_t ne11,
|
||||
const int64_t ne12,
|
||||
const int64_t ne13,
|
||||
const int64_t s01,
|
||||
const int64_t s02,
|
||||
const int64_t s03,
|
||||
const int64_t s10,
|
||||
const int64_t s11,
|
||||
const int64_t s12,
|
||||
const int64_t s1,
|
||||
const int64_t s2,
|
||||
const int64_t s3) {
|
||||
|
||||
// blockIdx.x = flat block index; threadIdx.x = element within block (0..127)
|
||||
const int j = threadIdx.x;
|
||||
|
||||
// Decode blockIdx.x → (i_blk, i01, i02, i03)
|
||||
const int64_t n_blocks_per_row = ne00 / QK_TURBO4;
|
||||
const int64_t g = blockIdx.x;
|
||||
const int64_t i_blk = g % n_blocks_per_row;
|
||||
int64_t tmp = g / n_blocks_per_row;
|
||||
const int64_t i01 = tmp % ne01;
|
||||
tmp = tmp / ne01;
|
||||
const int64_t i02 = tmp % ne12;
|
||||
const int64_t i03 = tmp / ne12;
|
||||
|
||||
const int64_t i12 = i02;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
const int64_t i10 = i01;
|
||||
|
||||
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
|
||||
const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03;
|
||||
block_turbo4_0 * dst_row_ptr = (block_turbo4_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
|
||||
block_turbo4_0 * blk = dst_row_ptr + i_blk;
|
||||
|
||||
// ---- Step 1: Load element j (coalesced) ----
|
||||
__shared__ float x[128];
|
||||
x[j] = src_row[i_blk * QK_TURBO4 + j];
|
||||
__syncthreads();
|
||||
|
||||
// ---- InnerQ: calibrate on original (unscaled) values ----
|
||||
if (d_innerq_calibrating) {
|
||||
atomicAdd(&d_innerq_sq_accum[j], x[j] * x[j]);
|
||||
if (j == 0) atomicAdd(&d_innerq_count, 1);
|
||||
}
|
||||
|
||||
// ---- InnerQ: apply channel scale (only when active) ----
|
||||
if (d_innerq_active) {
|
||||
x[j] *= d_innerq_scale[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// ---- Step 2: Parallel L2 norm ----
|
||||
constexpr int n_warps = 128 / WARP_SIZE; // = 4
|
||||
__shared__ float warp_accum[n_warps];
|
||||
float v = x[j];
|
||||
float v2 = v * v;
|
||||
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
|
||||
v2 += __shfl_xor_sync(0xffffffff, v2, offset);
|
||||
if (j % WARP_SIZE == 0)
|
||||
warp_accum[j / WARP_SIZE] = v2;
|
||||
__syncthreads();
|
||||
|
||||
__shared__ float s_norm_sq;
|
||||
if (j == 0) {
|
||||
float total = 0.0f;
|
||||
for (int w = 0; w < n_warps; w++) total += warp_accum[w];
|
||||
s_norm_sq = total;
|
||||
}
|
||||
__syncthreads();
|
||||
const float grp_norm = sqrtf(s_norm_sq);
|
||||
const float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f;
|
||||
|
||||
// ---- Step 3: Normalize ----
|
||||
x[j] *= inv_norm;
|
||||
__syncthreads();
|
||||
|
||||
// ---- Step 4: Forward WHT (signs1 → butterfly → signs2, normalized) ----
|
||||
x[j] *= TURBO_WHT_SIGNS1[j];
|
||||
__syncthreads();
|
||||
|
||||
#define WHT_STAGE_SHARED_T4(h) \
|
||||
if (j % (2*(h)) < (h)) { float a = x[j], b = x[j+(h)]; x[j] = a+b; x[j+(h)] = a-b; } \
|
||||
__syncthreads();
|
||||
|
||||
WHT_STAGE_SHARED_T4(1)
|
||||
WHT_STAGE_SHARED_T4(2)
|
||||
WHT_STAGE_SHARED_T4(4)
|
||||
WHT_STAGE_SHARED_T4(8)
|
||||
WHT_STAGE_SHARED_T4(16)
|
||||
WHT_STAGE_SHARED_T4(32)
|
||||
WHT_STAGE_SHARED_T4(64)
|
||||
#undef WHT_STAGE_SHARED_T4
|
||||
|
||||
constexpr float inv_sqrt_128 = 0.08838834764831845f;
|
||||
x[j] = x[j] * inv_sqrt_128 * TURBO_WHT_SIGNS2[j];
|
||||
__syncthreads();
|
||||
|
||||
// ---- Step 5: Quantize element j to 4-bit centroid ----
|
||||
const float rv = x[j];
|
||||
const uint8_t idx = turbo_nearest_centroid_4bit(rv);
|
||||
|
||||
// ---- Step 6: Pack qs (nibble packed, warp-cooperative) ----
|
||||
// 2 elements per byte, 4 bits each.
|
||||
// Thread pairs (j, j+1) share a qs byte.
|
||||
const int lane = j % WARP_SIZE;
|
||||
const uint8_t my_nibble = idx & 0xF;
|
||||
uint8_t qs_byte = 0;
|
||||
// Gather nibble from partner thread
|
||||
uint8_t partner_nibble = __shfl_sync(0xffffffff, my_nibble, lane ^ 1);
|
||||
if (j % 2 == 0) {
|
||||
qs_byte = my_nibble | (partner_nibble << 4);
|
||||
blk->qs[j / 2] = qs_byte;
|
||||
}
|
||||
|
||||
// ---- Step 7: Reconstruction norm (parallel) ----
|
||||
const float c = TURBO_CENTROIDS_4BIT[idx];
|
||||
float rc = c * c;
|
||||
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
|
||||
rc += __shfl_xor_sync(0xffffffff, rc, offset);
|
||||
if (j % WARP_SIZE == 0)
|
||||
warp_accum[j / WARP_SIZE] = rc;
|
||||
__syncthreads();
|
||||
|
||||
__shared__ float s_recon_sq;
|
||||
if (j == 0) {
|
||||
float total = 0.0f;
|
||||
for (int w = 0; w < n_warps; w++) total += warp_accum[w];
|
||||
s_recon_sq = total;
|
||||
}
|
||||
__syncthreads();
|
||||
const float recon_norm = sqrtf(s_recon_sq);
|
||||
const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm;
|
||||
|
||||
// ---- Step 8: Write corrected norm and zero rnorm (one thread) ----
|
||||
if (j == 0) {
|
||||
blk->norm = __float2half(corrected_norm);
|
||||
blk->rnorm = __float2half(0.0f);
|
||||
}
|
||||
|
||||
GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne13);
|
||||
}
|
||||
|
||||
template<typename idx_t>
|
||||
static void set_rows_cuda_turbo4(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0,
|
||||
const ggml_tensor * src1,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const idx_t * src1_d = (const idx_t *)src1->data;
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
GGML_ASSERT(ne00 % QK_TURBO4 == 0); // must be block-aligned (128)
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
// turbo4 block size = WHT group size = 128, always
|
||||
const int64_t n_blocks = ne00 / QK_TURBO4;
|
||||
|
||||
const int64_t s01 = nb01/sizeof(float);
|
||||
const int64_t s02 = nb02/sizeof(float);
|
||||
const int64_t s03 = nb03/sizeof(float);
|
||||
const int64_t s10 = nb10/sizeof(idx_t);
|
||||
const int64_t s11 = nb11/sizeof(idx_t);
|
||||
const int64_t s12 = nb12/sizeof(idx_t);
|
||||
|
||||
// InnerQ: check/finalize calibration before kernel launch
|
||||
turbo_innerq_check_finalize(QK_TURBO4, ne00);
|
||||
|
||||
if (n_blocks > 0) {
|
||||
const int64_t ne_total = n_blocks * ne01 * ne02 * ne03;
|
||||
k_set_rows_turbo4<idx_t><<<(int)ne_total, 128, 0, stream>>>(
|
||||
src0_d, src1_d, (block_turbo4_0 *)dst->data,
|
||||
ne00, ne01, ne10, ne11, ne12, ne13,
|
||||
s01, s02, s03, s10, s11, s12,
|
||||
nb1, nb2, nb3);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename src_t, typename idx_t>
|
||||
static void set_rows_cuda(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const src_t * src0_d = (const src_t *)src0->data;
|
||||
@@ -309,6 +1232,12 @@ static void set_rows_cuda(ggml_backend_cuda_context & ctx, const ggml_tensor * s
|
||||
nb1, nb2, nb3,
|
||||
stream
|
||||
);
|
||||
} else if (dst->type == GGML_TYPE_TURBO3_0) {
|
||||
set_rows_cuda_turbo3<idx_t>(ctx, src0, src1, dst);
|
||||
} else if (dst->type == GGML_TYPE_TURBO2_0) {
|
||||
set_rows_cuda_turbo2<idx_t>(ctx, src0, src1, dst);
|
||||
} else if (dst->type == GGML_TYPE_TURBO4_0) {
|
||||
set_rows_cuda_turbo4<idx_t>(ctx, src0, src1, dst);
|
||||
} else {
|
||||
GGML_ABORT("unsupported type %s", ggml_type_name(dst->type));
|
||||
}
|
||||
|
||||
@@ -4,3 +4,4 @@
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 1, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(640, 512, 1, 16);
|
||||
|
||||
@@ -4,3 +4,4 @@
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 2, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(640, 512, 2, 16);
|
||||
|
||||
@@ -4,3 +4,4 @@
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 4, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(640, 512, 4, 16);
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-tile.cuh"
|
||||
|
||||
DECL_FATTN_TILE_CASE(640, 512);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: f16 K + turbo2 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: f16 K + turbo3 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: f16 K + turbo4 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: q8_0 K + turbo2 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: q8_0 K + turbo3 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: q8_0 K + turbo4 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo2 K + f16 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo2 K + q8_0 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// TurboQuant2 CUDA flash attention vec kernel instantiation
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo2 K + turbo3 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo2 K + turbo4 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo3 K + f16 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo3 K + q8_0 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo3 K + turbo2 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// TurboQuant3 CUDA flash attention vec kernel instantiation
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo3 K + turbo4 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo4 K + f16 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo4 K + q8_0 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo4 K + turbo2 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// Mixed KV: turbo4 K + turbo3 V
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
|
||||
@@ -0,0 +1,7 @@
|
||||
// TurboQuant4 CUDA flash attention vec kernel instantiation
|
||||
|
||||
#include "../fattn-vec.cuh"
|
||||
|
||||
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
|
||||
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
|
||||
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
|
||||
@@ -0,0 +1,32 @@
|
||||
#include "turbo-innerq.cuh"
|
||||
#include <cstring>
|
||||
|
||||
// Host-side shared state for InnerQ cross-TU communication
|
||||
TURBO_IQ_API bool g_innerq_finalized = false;
|
||||
TURBO_IQ_API float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS] = {
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
|
||||
};
|
||||
|
||||
static bool g_innerq_tensor_needs_update = false;
|
||||
|
||||
void turbo_innerq_publish(const float * scale_inv, int group_size) {
|
||||
for (int i = 0; i < group_size && i < INNERQ_MAX_CHANNELS; i++) {
|
||||
g_innerq_scale_inv_host[i] = scale_inv[i];
|
||||
}
|
||||
for (int i = group_size; i < INNERQ_MAX_CHANNELS; i++) {
|
||||
g_innerq_scale_inv_host[i] = 1.0f;
|
||||
}
|
||||
g_innerq_finalized = true;
|
||||
g_innerq_tensor_needs_update = true;
|
||||
}
|
||||
|
||||
// Reports whether scales have been published (turbo_innerq_publish) since the
// last call to turbo_innerq_mark_tensor_updated().
TURBO_IQ_API bool turbo_innerq_needs_tensor_update(void) {
    const bool update_pending = g_innerq_tensor_needs_update;
    return update_pending;
}
|
||||
|
||||
// Acknowledges that the consumer has applied the published scales: clears the
// pending flag that turbo_innerq_publish() raised.
TURBO_IQ_API void turbo_innerq_mark_tensor_updated(void) {
    constexpr bool kNoUpdatePending = false;
    g_innerq_tensor_needs_update = kNoUpdatePending;
}
|
||||
@@ -0,0 +1,34 @@
|
||||
#pragma once
|
||||
|
||||
// TurboQuant InnerQ per-channel equalization — cross-TU shared state
|
||||
// The host-side state lives in turbo-innerq.cu; device-side state is per-TU
|
||||
// in turbo-quant.cuh (only set-rows.cu needs device access).
|
||||
|
||||
#define INNERQ_MAX_CHANNELS 128
|
||||
|
||||
#ifdef GGML_BACKEND_SHARED
|
||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef GGML_BACKEND_BUILD
|
||||
# define TURBO_IQ_API __declspec(dllexport)
|
||||
# else
|
||||
# define TURBO_IQ_API __declspec(dllimport)
|
||||
# endif
|
||||
# else
|
||||
# define TURBO_IQ_API __attribute__((visibility("default")))
|
||||
# endif
|
||||
#else
|
||||
# define TURBO_IQ_API
|
||||
#endif
|
||||
|
||||
// Host-side shared state (defined in turbo-innerq.cu)
|
||||
TURBO_IQ_API extern bool g_innerq_finalized;
|
||||
TURBO_IQ_API extern float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS];
|
||||
|
||||
// Called from set-rows.cu after InnerQ finalization to publish scale_inv
|
||||
void turbo_innerq_publish(const float * scale_inv, int group_size);
|
||||
|
||||
// Called from llama-kv-cache.cpp (or equivalent) to check if tensor needs update
|
||||
TURBO_IQ_API bool turbo_innerq_needs_tensor_update(void);
|
||||
|
||||
// Called after tensor update to clear the flag
|
||||
TURBO_IQ_API void turbo_innerq_mark_tensor_updated(void);
|
||||
@@ -0,0 +1,453 @@
|
||||
/*
|
||||
* TurboQuant CUDA kernels for KV cache compression
|
||||
* Based on: arXiv 2504.19874 (ICLR 2026)
|
||||
*
|
||||
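* Implements GGML_TYPE_TURBO3_0 (3-bit PolarQuant, block size 32) together with the 2-bit (TURBO2_0) and 4-bit (TURBO4_0) variants and the TQ3_1S / TQ4_1S weight constants.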
|
||||
* Constants, WHT rotation, quantize/dequantize device functions.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.cuh"
|
||||
#include "turbo-innerq.cuh"
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
|
||||
// ---- Quantization ratios for dequantize_block template ----
|
||||
#define QR_TURBO3 1 // Each dequantize call produces 2 consecutive elements (like q8_0)
|
||||
#define QR_TURBO2 1 // Each dequantize call produces 2 consecutive elements (like q8_0)
|
||||
#define QR_TURBO4 1 // Each dequantize call produces 2 consecutive elements (like q8_0)
|
||||
|
||||
// ---- 2-bit centroids (Lloyd-Max for N(0, 1/128)) ----
|
||||
|
||||
static __constant__ float TURBO_CENTROIDS_2BIT[4] = {
|
||||
-0.133462f, -0.039994f, 0.039994f, 0.133462f
|
||||
};
|
||||
|
||||
static __constant__ float TURBO_MID_2BIT[3] = {
|
||||
-0.086728f, 0.0f, 0.086728f
|
||||
};
|
||||
|
||||
// ---- 3-bit centroids (Lloyd-Max for N(0, 1/128)) ----
|
||||
|
||||
static __constant__ float TURBO_CENTROIDS_3BIT[8] = {
|
||||
-0.190685f, -0.117832f, -0.065717f, -0.021460f,
|
||||
0.021460f, 0.065717f, 0.117832f, 0.190685f
|
||||
};
|
||||
|
||||
// ---- Midpoints for nearest centroid lookup ----
|
||||
|
||||
static __constant__ float TURBO_MID_3BIT[7] = {
|
||||
-0.154259f, -0.091775f, -0.043589f, 0.0f,
|
||||
0.043589f, 0.091775f, 0.154259f
|
||||
};
|
||||
|
||||
// ---- WHT sign arrays (seed=42) ----
|
||||
|
||||
static __constant__ float TURBO_WHT_SIGNS1[128] = {
|
||||
-1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
|
||||
1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f,
|
||||
-1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f,
|
||||
1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f,
|
||||
-1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f,
|
||||
1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f,
|
||||
-1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f,
|
||||
1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f
|
||||
};
|
||||
|
||||
static __constant__ float TURBO_WHT_SIGNS2[128] = {
|
||||
1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f,
|
||||
1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f,
|
||||
1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f,
|
||||
1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f,
|
||||
1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f,
|
||||
-1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f,
|
||||
1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f,
|
||||
-1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f
|
||||
};
|
||||
|
||||
// ---- 64-element WHT sign arrays (first 64 of the 128-element arrays) ----
|
||||
|
||||
static __constant__ float TURBO_WHT_SIGNS1_64[64] = {
|
||||
-1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
|
||||
1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f,
|
||||
-1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f,
|
||||
1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f
|
||||
};
|
||||
|
||||
static __constant__ float TURBO_WHT_SIGNS2_64[64] = {
|
||||
1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f,
|
||||
1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f,
|
||||
1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f,
|
||||
1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f
|
||||
};
|
||||
|
||||
// ---- Fast Walsh-Hadamard Transform (in-place, normalized) ----
|
||||
// O(n log n) = 896 ops for n=128
|
||||
|
||||
// In-place Walsh-Hadamard transform of 128 floats, scaled by 1/sqrt(128).
// Each of the 7 stages combines elements that are `span` apart into
// (sum, difference) pairs; the pairs within one stage are disjoint.
static __device__ __forceinline__ void turbo_fwht_128(float * x) {
    constexpr int   n_elems = 128;
    constexpr float norm    = 0.08838834764831845f; // 1/sqrt(128)

    for (int span = 1; span < n_elems; span *= 2) {
        for (int pair = 0; pair < n_elems / 2; ++pair) {
            // pair -> lower index: skip over the upper half of every 2*span block
            const int lo = (pair / span) * (2 * span) + (pair % span);
            const int hi = lo + span;

            const float u = x[lo];
            const float v = x[hi];
            x[lo] = u + v;
            x[hi] = u - v;
        }
    }

    for (int k = 0; k < n_elems; ++k) {
        x[k] *= norm;
    }
}
|
||||
|
||||
// ---- Fast Walsh-Hadamard Transform for 64-element groups ----
|
||||
// O(n log n) = 384 ops for n=64
|
||||
|
||||
// In-place Walsh-Hadamard transform of 64 floats, scaled by 1/sqrt(64) = 0.125.
// Each of the 6 stages combines elements that are `span` apart into
// (sum, difference) pairs; the pairs within one stage are disjoint.
static __device__ __forceinline__ void turbo_fwht_64(float * x) {
    constexpr int   n_elems = 64;
    constexpr float norm    = 0.125f; // 1/sqrt(64)

    for (int span = 1; span < n_elems; span *= 2) {
        for (int pair = 0; pair < n_elems / 2; ++pair) {
            // pair -> lower index: skip over the upper half of every 2*span block
            const int lo = (pair / span) * (2 * span) + (pair % span);
            const int hi = lo + span;

            const float u = x[lo];
            const float v = x[hi];
            x[lo] = u + v;
            x[hi] = u - v;
        }
    }

    for (int k = 0; k < n_elems; ++k) {
        x[k] *= norm;
    }
}
|
||||
|
||||
// ---- Forward rotation: signs1 → FWHT → signs2 ----
|
||||
|
||||
// Forward rotation of a 128-element group, in place:
// multiply by TURBO_WHT_SIGNS1, run the normalized 128-point WHT, then
// multiply by TURBO_WHT_SIGNS2.
static __device__ __forceinline__ void turbo_rotate_forward(float * x) {
    constexpr int n_elems = 128;

    for (int k = 0; k < n_elems; ++k) {
        x[k] = x[k] * TURBO_WHT_SIGNS1[k];
    }

    turbo_fwht_128(x);

    for (int k = 0; k < n_elems; ++k) {
        x[k] = x[k] * TURBO_WHT_SIGNS2[k];
    }
}
|
||||
|
||||
// ---- Forward rotation for 64-element groups ----
|
||||
|
||||
// Forward rotation of a 64-element group, in place:
// multiply by TURBO_WHT_SIGNS1_64, run the normalized 64-point WHT, then
// multiply by TURBO_WHT_SIGNS2_64.
static __device__ __forceinline__ void turbo_rotate_forward_64(float * x) {
    constexpr int n_elems = 64;

    for (int k = 0; k < n_elems; ++k) {
        x[k] = x[k] * TURBO_WHT_SIGNS1_64[k];
    }

    turbo_fwht_64(x);

    for (int k = 0; k < n_elems; ++k) {
        x[k] = x[k] * TURBO_WHT_SIGNS2_64[k];
    }
}
|
||||
|
||||
// ---- InnerQ per-channel equalization ----
|
||||
// Equalizes K channel variances before WHT rotation to reduce quantization error.
|
||||
// Enabled via TURBO_INNERQ=N env var (N = calibration token count).
|
||||
// Math: <Q/s, s*K> = <Q, K> preserves dot products.
|
||||
// INNERQ_MAX_CHANNELS is defined in turbo-innerq.cuh
|
||||
|
||||
static __device__ float d_innerq_scale[INNERQ_MAX_CHANNELS];
|
||||
static __device__ float d_innerq_scale_inv[INNERQ_MAX_CHANNELS];
|
||||
static __device__ float d_innerq_sq_accum[INNERQ_MAX_CHANNELS];
|
||||
static __device__ int d_innerq_count;
|
||||
static __device__ int d_innerq_active; // 0 = scales are identity, 1 = scales applied
|
||||
static __device__ int d_innerq_calibrating; // 1 = accumulating K² stats
|
||||
|
||||
static int innerq_enabled = 0; // host: 0=off, 1=calibrating, 2=active
|
||||
static int innerq_target_tokens = 0;
|
||||
static float innerq_strength = 0.5f;
|
||||
static bool innerq_initialized = false;
|
||||
|
||||
// Host: read TURBO_INNERQ env, start calibration if enabled
|
||||
static void turbo_innerq_init(void) {
|
||||
if (innerq_initialized) return;
|
||||
innerq_initialized = true;
|
||||
|
||||
const char * env = getenv("TURBO_INNERQ");
|
||||
if (!env || atoi(env) <= 0) {
|
||||
innerq_enabled = 0;
|
||||
return;
|
||||
}
|
||||
innerq_target_tokens = atoi(env);
|
||||
innerq_enabled = 1; // calibrating
|
||||
|
||||
const char * env_str = getenv("TURBO_INNERQ_STRENGTH");
|
||||
if (env_str) innerq_strength = atof(env_str);
|
||||
if (innerq_strength <= 0.0f || innerq_strength > 1.0f) innerq_strength = 0.5f;
|
||||
|
||||
// Zero accumulators and set calibrating flag on device
|
||||
float zeros[INNERQ_MAX_CHANNELS] = {0};
|
||||
int zero = 0, one = 1;
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_sq_accum, zeros, sizeof(zeros)));
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_count, &zero, sizeof(int)));
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &zero, sizeof(int)));
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &one, sizeof(int)));
|
||||
|
||||
GGML_LOG_INFO("%s: InnerQ calibration started (target=%d tokens, strength=%.2f)\n",
|
||||
__func__, innerq_target_tokens, innerq_strength);
|
||||
}
|
||||
|
||||
// Host: finalize calibration — compute scales, upload, activate
|
||||
static void turbo_innerq_finalize(int group_size) {
|
||||
// Read accumulators from device
|
||||
float sq_accum[INNERQ_MAX_CHANNELS];
|
||||
int count = 0;
|
||||
CUDA_CHECK(cudaMemcpyFromSymbol(sq_accum, d_innerq_sq_accum, group_size * sizeof(float)));
|
||||
CUDA_CHECK(cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)));
|
||||
|
||||
if (count <= 0) {
|
||||
GGML_LOG_WARN("%s: InnerQ calibration got 0 tokens, disabling\n", __func__);
|
||||
innerq_enabled = 0;
|
||||
int zero = 0;
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
|
||||
return;
|
||||
}
|
||||
|
||||
// Compute per-channel RMS
|
||||
float rms[INNERQ_MAX_CHANNELS];
|
||||
float mean_rms = 0.0f;
|
||||
float max_ratio = 0.0f, min_ratio = 1e30f;
|
||||
for (int i = 0; i < group_size; i++) {
|
||||
rms[i] = sqrtf(sq_accum[i] / (float)count);
|
||||
mean_rms += rms[i];
|
||||
}
|
||||
mean_rms /= (float)group_size;
|
||||
|
||||
// Compute scale[i] = (mean_rms / channel_rms[i])^strength, clamp to [0.5, 2.0]
|
||||
float scale[INNERQ_MAX_CHANNELS];
|
||||
float scale_inv[INNERQ_MAX_CHANNELS];
|
||||
for (int i = 0; i < group_size; i++) {
|
||||
float ratio = (rms[i] > 1e-10f) ? (mean_rms / rms[i]) : 1.0f;
|
||||
float s = powf(ratio, innerq_strength);
|
||||
if (s < 0.5f) s = 0.5f;
|
||||
if (s > 2.0f) s = 2.0f;
|
||||
scale[i] = s;
|
||||
scale_inv[i] = 1.0f / s;
|
||||
if (ratio > max_ratio) max_ratio = ratio;
|
||||
if (ratio < min_ratio) min_ratio = ratio;
|
||||
}
|
||||
|
||||
// Auto-skip if max channel ratio < 1.2 (already balanced)
|
||||
if (max_ratio < 1.2f && min_ratio > (1.0f / 1.2f)) {
|
||||
GGML_LOG_INFO("%s: InnerQ auto-disabled (channels already balanced, max_ratio=%.3f)\n",
|
||||
__func__, max_ratio);
|
||||
innerq_enabled = 0;
|
||||
int zero = 0;
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
|
||||
return;
|
||||
}
|
||||
|
||||
// Stop calibrating, upload scales, activate
|
||||
int zero = 0, one = 1;
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale, scale, group_size * sizeof(float)));
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale_inv, scale_inv, group_size * sizeof(float)));
|
||||
CUDA_CHECK(cudaDeviceSynchronize()); // ensure scales are visible before activating
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &one, sizeof(int)));
|
||||
|
||||
innerq_enabled = 2; // active
|
||||
|
||||
// Publish scale_inv to shared host state for cross-TU tensor update
|
||||
turbo_innerq_publish(scale_inv, group_size);
|
||||
|
||||
GGML_LOG_INFO("%s: InnerQ finalized (%d tokens, max_ratio=%.3f, min_ratio=%.3f)\n",
|
||||
__func__, count, max_ratio, min_ratio);
|
||||
}
|
||||
|
||||
// Host: called before each set_rows kernel launch
|
||||
static void turbo_innerq_check_finalize(int group_size, int64_t ne00) {
|
||||
if (!innerq_initialized) {
|
||||
turbo_innerq_init();
|
||||
}
|
||||
if (innerq_enabled == 0) return;
|
||||
|
||||
// InnerQ only works when each WHT group = one head (group_size == head_dim).
|
||||
// For standard models: ne00 = n_heads * head_dim, group_size = head_dim → ne00 % group_size == 0, fine.
|
||||
// For non-standard models (head_dim > group_size, e.g. GLM 576 → 64-group):
|
||||
// ne00 = head_dim (single head), group_size = 64, ne00/group_size = 9 groups per head → WRONG.
|
||||
// Detect: if ne00 / group_size doesn't divide evenly into standard head counts (1,2,4,8,16,32,64,128),
|
||||
// it's likely multi-group-per-head. Simpler check: group_size < 128 means head_dim > 128.
|
||||
const bool multi_group_per_head = (group_size < 128); // 64-group → head_dim > 128, multi-group
|
||||
if (multi_group_per_head) {
|
||||
if (innerq_enabled == 1) {
|
||||
GGML_LOG_WARN("%s: InnerQ disabled (ne00=%lld != group_size=%d, multi-group heads)\n",
|
||||
__func__, (long long)ne00, group_size);
|
||||
innerq_enabled = 0;
|
||||
int zero = 0;
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Check if calibration is complete
|
||||
if (innerq_enabled == 1) {
|
||||
int count = 0;
|
||||
CUDA_CHECK(cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)));
|
||||
if (count >= innerq_target_tokens) {
|
||||
turbo_innerq_finalize(group_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Host: check if InnerQ is currently active (finalized)
|
||||
static bool turbo_innerq_is_active(void) {
|
||||
return innerq_enabled == 2;
|
||||
}
|
||||
|
||||
// ---- 4-bit centroids (Lloyd-Max for N(0, 1/128)) ----
|
||||
|
||||
static __constant__ float TURBO_CENTROIDS_4BIT[16] = {
|
||||
-0.173926f, -0.117195f, -0.089527f, -0.068756f,
|
||||
-0.051262f, -0.035597f, -0.020989f, -0.006938f,
|
||||
0.006938f, 0.020989f, 0.035597f, 0.051262f,
|
||||
0.068756f, 0.089527f, 0.117195f, 0.173926f
|
||||
};
|
||||
|
||||
// ---- Midpoints for nearest 4-bit centroid lookup ----
|
||||
|
||||
static __constant__ float TURBO_MID_4BIT[15] = {
|
||||
-0.145561f, -0.103361f, -0.079142f, -0.060009f,
|
||||
-0.043430f, -0.028293f, -0.013964f, 0.000000f,
|
||||
0.013964f, 0.028293f, 0.043430f, 0.060009f,
|
||||
0.079142f, 0.103361f, 0.145561f
|
||||
};
|
||||
|
||||
// ---- Nearest 4-bit centroid index ----
|
||||
|
||||
// Returns the 4-bit centroid index (0..15) for `val`.
// TURBO_MID_4BIT holds the 15 ascending decision boundaries; the first
// boundary that is strictly greater than `val` identifies the bucket. Values
// that are not below any boundary (including NaN) map to the last bucket.
static __device__ __forceinline__ uint8_t turbo_nearest_centroid_4bit(float val) {
    constexpr int n_boundaries = 15;
#pragma unroll
    for (int bucket = 0; bucket < n_boundaries; ++bucket) {
        if (val < TURBO_MID_4BIT[bucket]) {
            return (uint8_t) bucket;
        }
    }
    return (uint8_t) n_boundaries;
}
|
||||
|
||||
// ---- Per-block quantize for turbo4 (128 elements, expects already-rotated input) ----
|
||||
|
||||
// Packs QK_TURBO4 input values into 4-bit centroid indices, two per byte:
// even elements occupy the low nibble, odd elements the high nibble.
// Only dst->qs is written here.
// NOTE(review): per the section comment the input is expected to be already
// rotated; any normalization is presumably done by the caller — confirm.
static __device__ void quantize_f32_turbo4_0_block(const float * __restrict__ src,
                                                   block_turbo4_0 * __restrict__ dst) {
    for (int byte = 0; byte < QK_TURBO4 / 2; ++byte) {
        const uint8_t lo_code = turbo_nearest_centroid_4bit(src[2 * byte]);
        const uint8_t hi_code = turbo_nearest_centroid_4bit(src[2 * byte + 1]);
        dst->qs[byte] = (uint8_t) ((lo_code & 0xF) | ((hi_code & 0xF) << 4));
    }
}
|
||||
|
||||
// ---- Inline dequant helper: extract one float from turbo4 block ----
|
||||
|
||||
// Decodes element `j` of a turbo4 block: extracts its 4-bit index (low nibble
// for even j, high nibble for odd j), looks up the centroid and scales it by
// `norm`.
static __device__ __forceinline__ float turbo4_dequant_element(
        const block_turbo4_0 * __restrict__ x, int j, float norm) {
    const int     nibble_shift = (j % 2) * 4;
    const uint8_t packed       = x->qs[j / 2];
    const uint8_t code         = (packed >> nibble_shift) & 0xF;
    const float   centroid     = TURBO_CENTROIDS_4BIT[code];
    return centroid * norm;
}
|
||||
|
||||
// ---- Nearest 3-bit centroid index ----
|
||||
|
||||
// Returns the 3-bit centroid index (0..7) for `val`.
// TURBO_MID_3BIT holds the 7 ascending decision boundaries; the first boundary
// that is strictly greater than `val` identifies the bucket. Values that are
// not below any boundary (including NaN) map to the last bucket.
static __device__ __forceinline__ uint8_t turbo_nearest_centroid_3bit(float val) {
    constexpr int n_boundaries = 7;
#pragma unroll
    for (int bucket = 0; bucket < n_boundaries; ++bucket) {
        if (val < TURBO_MID_3BIT[bucket]) {
            return (uint8_t) bucket;
        }
    }
    return (uint8_t) n_boundaries;
}
|
||||
|
||||
// ---- Per-block quantize (32 elements, expects already-rotated input) ----
|
||||
// Used by set_rows after group-level WHT rotation
|
||||
|
||||
// Packs QK_TURBO3 input values into 3-bit centroid indices split over two
// arrays: the low two bits go to dst->qs (four elements per byte), the third
// bit goes to dst->signs (eight elements per byte, bit j % 8).
// Only dst->qs and dst->signs are written here.
static __device__ void quantize_f32_turbo3_0_block(const float * __restrict__ src,
                                                   block_turbo3_0 * __restrict__ dst) {
    // The high-bit bytes are accumulated with OR below, so clear them first.
    for (int s = 0; s < QK_TURBO3 / 8; ++s) {
        dst->signs[s] = 0;
    }

    for (int byte = 0; byte < QK_TURBO3 / 4; ++byte) {
        uint8_t low_bits = 0;
        for (int k = 0; k < 4; ++k) {
            const int     j    = 4 * byte + k;
            const uint8_t code = turbo_nearest_centroid_3bit(src[j]);

            low_bits |= (code & 0x3) << (k * 2);
            if (code & 0x4) {
                dst->signs[j / 8] |= (1 << (j % 8));
            }
        }
        dst->qs[byte] = low_bits;
    }
}
|
||||
|
||||
// ---- Inline dequant helper: extract one float from turbo3 block ----
|
||||
|
||||
// Decodes element `j` of a turbo3 block: reassembles the 3-bit index from the
// two low bits stored in `qs` and the high bit stored in `signs`, looks up the
// centroid and scales it by `norm`.
static __device__ __forceinline__ float turbo3_dequant_element(
        const block_turbo3_0 * __restrict__ x, int j, float norm) {
    const uint8_t low_bits = (x->qs[j / 4] >> ((j % 4) * 2)) & 0x3;
    const uint8_t high_bit = (x->signs[j / 8] >> (j % 8)) & 0x1;
    const uint8_t code     = (high_bit << 2) | low_bits;
    const float   centroid = TURBO_CENTROIDS_3BIT[code];
    return centroid * norm;
}
|
||||
|
||||
// ---- Nearest 2-bit centroid index ----
|
||||
|
||||
// Returns the 2-bit centroid index (0..3) for `val`.
// TURBO_MID_2BIT holds the 3 ascending decision boundaries; the first boundary
// that is strictly greater than `val` identifies the bucket. Values that are
// not below any boundary (including NaN) map to the last bucket.
static __device__ __forceinline__ uint8_t turbo_nearest_centroid_2bit(float val) {
    constexpr int n_boundaries = 3;
#pragma unroll
    for (int bucket = 0; bucket < n_boundaries; ++bucket) {
        if (val < TURBO_MID_2BIT[bucket]) {
            return (uint8_t) bucket;
        }
    }
    return (uint8_t) n_boundaries;
}
|
||||
|
||||
// ---- Per-block quantize for turbo2 (32 elements, expects already-rotated input) ----
|
||||
|
||||
// Packs QK_TURBO2 input values into 2-bit centroid indices, four per byte:
// element j occupies bits [2*(j%4), 2*(j%4)+1] of dst->qs[j/4].
// Only dst->qs is written here.
static __device__ void quantize_f32_turbo2_0_block(const float * __restrict__ src,
                                                   block_turbo2_0 * __restrict__ dst) {
    for (int byte = 0; byte < QK_TURBO2 / 4; ++byte) {
        uint8_t packed = 0;
        for (int k = 0; k < 4; ++k) {
            const uint8_t code = turbo_nearest_centroid_2bit(src[4 * byte + k]);
            packed |= (code & 0x3) << (k * 2);
        }
        dst->qs[byte] = packed;
    }
}
|
||||
|
||||
// ---- Inline dequant helper: extract one float from turbo2 block ----
|
||||
|
||||
// Decodes element `j` of a turbo2 block: extracts its 2-bit index from
// x->qs[j/4], looks up the centroid and scales it by `norm`.
static __device__ __forceinline__ float turbo2_dequant_element(
        const block_turbo2_0 * __restrict__ x, int j, float norm) {
    const int     bit_shift = (j % 4) * 2;
    const uint8_t packed    = x->qs[j / 4];
    const uint8_t code      = (packed >> bit_shift) & 0x3;
    const float   centroid  = TURBO_CENTROIDS_2BIT[code];
    return centroid * norm;
}
|
||||
|
||||
// ============================================================================
|
||||
// Weight compression types (TQ3_1S, TQ4_1S)
|
||||
// These use N(0,1) centroids (NOT N(0,1/128) like KV cache types)
|
||||
// and require inverse WHT (RHT) after centroid lookup.
|
||||
// ============================================================================
|
||||
|
||||
#define QR_TQ4_1S 1 // dequantize produces 2 consecutive elements
|
||||
#define QR_TQ3_1S 1
|
||||
|
||||
// ---- Weight centroids: Lloyd-Max for N(0,1) ----
|
||||
|
||||
static __constant__ float TQ4_CENTROIDS_WEIGHT[16] = {
|
||||
-2.732590f, -2.069017f, -1.618046f, -1.256231f,
|
||||
-0.942340f, -0.656759f, -0.388048f, -0.128395f,
|
||||
0.128395f, 0.388048f, 0.656759f, 0.942340f,
|
||||
1.256231f, 1.618046f, 2.069017f, 2.732590f
|
||||
};
|
||||
|
||||
static __constant__ float TQ3_CENTROIDS_WEIGHT[8] = {
|
||||
-1.996684f, -1.291398f, -0.740341f, -0.247508f,
|
||||
0.230106f, 0.725222f, 1.277503f, 1.988943f
|
||||
};
|
||||
|
||||
// ---- Sign array for weight WHT (golden ratio hash, 32 elements) ----
|
||||
|
||||
static __constant__ float TQ_WEIGHT_SIGNS[32] = {
|
||||
+1.0f, -1.0f, +1.0f, -1.0f, +1.0f, +1.0f, -1.0f, +1.0f,
|
||||
-1.0f, -1.0f, +1.0f, -1.0f, +1.0f, +1.0f, -1.0f, +1.0f,
|
||||
-1.0f, -1.0f, +1.0f, -1.0f, +1.0f, -1.0f, -1.0f, +1.0f,
|
||||
-1.0f, +1.0f, +1.0f, -1.0f, +1.0f, -1.0f, -1.0f, +1.0f
|
||||
};
|
||||
@@ -0,0 +1,189 @@
|
||||
#include "turbo-quant.cuh"
|
||||
#include "turbo-wht.cuh"
|
||||
|
||||
// ─── CUDA kernel ──────────────────────────────────────────────────────────────
|
||||
//
|
||||
// Templated on direction and group_size (128, 64, or 32).
|
||||
// One block per group, group_size threads per block.
|
||||
// direction: 0 = forward (signs1 → WHT → signs2), 1 = inverse (signs2 → WHT → signs1)
|
||||
//
|
||||
// When head_dim is not a multiple of group_size, only the full groups
|
||||
// within each head are processed. Tail elements are left unchanged (identity).
|
||||
//
|
||||
// Algorithm mirrors the CPU implementation in ggml-cpu/ops.cpp:
|
||||
// 1. Apply s_first elementwise
|
||||
// 2. Radix-2 Hadamard butterfly (log2(group_size) stages, in-place)
|
||||
// 3. Normalize by 1/sqrt(group_size) and apply s_second elementwise
|
||||
//
|
||||
// InnerQ scale_inv: when non-null, applies per-channel inverse scaling for
|
||||
// Q/V equalization. For forward (Q rotation): multiply BEFORE signs+WHT.
|
||||
// For inverse (V un-rotation): multiply AFTER WHT+signs.
|
||||
|
||||
template <int direction, int group_size>
|
||||
static __global__ void k_turbo_wht_f32(const float * __restrict__ src,
|
||||
float * __restrict__ dst,
|
||||
const float * __restrict__ scale_inv,
|
||||
int64_t n_groups,
|
||||
int64_t head_dim,
|
||||
int64_t groups_per_head) {
|
||||
static_assert(group_size == 128 || group_size == 64 || group_size == 32, "group_size must be 128, 64, or 32");
|
||||
|
||||
const int64_t g = blockIdx.x;
|
||||
if (g >= n_groups) return;
|
||||
|
||||
const int t = threadIdx.x; // 0 .. group_size-1
|
||||
|
||||
// Map group index to position in the tensor:
|
||||
// each head has groups_per_head full groups, then a gap of tail elements.
|
||||
const int64_t head_idx = g / groups_per_head;
|
||||
const int64_t grp_in_head = g % groups_per_head;
|
||||
const int64_t base = head_idx * head_dim + grp_in_head * group_size;
|
||||
|
||||
__shared__ float x[group_size];
|
||||
|
||||
// Load from global memory
|
||||
x[t] = src[base + t];
|
||||
__syncthreads();
|
||||
|
||||
// InnerQ forward: apply scale_inv BEFORE signs+WHT (for Q pre-rotation)
|
||||
if (direction == 0 && scale_inv != nullptr) {
|
||||
x[t] *= scale_inv[t % group_size];
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Apply first sign array
|
||||
if (group_size == 128) {
|
||||
x[t] *= (direction == 0) ? TURBO_WHT_SIGNS1[t] : TURBO_WHT_SIGNS2[t];
|
||||
} else if (group_size == 64) {
|
||||
x[t] *= (direction == 0) ? TURBO_WHT_SIGNS1_64[t] : TURBO_WHT_SIGNS2_64[t];
|
||||
} else {
|
||||
// group_size == 32: TQ weight signs (same for forward and inverse)
|
||||
x[t] *= TQ_WEIGHT_SIGNS[t];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// WHT butterfly — log2(group_size) stages.
|
||||
// In stage h, threads where (t % (2h)) < h read x[t] and x[t+h],
|
||||
// then write x[t] = a+b and x[t+h] = a-b. Each active thread
|
||||
// owns a disjoint pair, so no intra-stage conflicts exist.
|
||||
#define WHT_STAGE(h) \
|
||||
if (t % (2*(h)) < (h)) { float a = x[t], b = x[t+(h)]; x[t] = a+b; x[t+(h)] = a-b; } \
|
||||
__syncthreads();
|
||||
|
||||
WHT_STAGE(1)
|
||||
WHT_STAGE(2)
|
||||
WHT_STAGE(4)
|
||||
WHT_STAGE(8)
|
||||
WHT_STAGE(16)
|
||||
if (group_size >= 64) { WHT_STAGE(32) }
|
||||
if (group_size == 128) { WHT_STAGE(64) }
|
||||
#undef WHT_STAGE
|
||||
|
||||
// Normalize and apply second sign array, write to output
|
||||
constexpr float inv_sqrt = (group_size == 128) ? 0.08838834764831845f :
|
||||
(group_size == 64) ? 0.125f :
|
||||
0.17677669529663688f; // 1/sqrt(32)
|
||||
float result;
|
||||
if (group_size == 128) {
|
||||
result = x[t] * inv_sqrt *
|
||||
((direction == 0) ? TURBO_WHT_SIGNS2[t] : TURBO_WHT_SIGNS1[t]);
|
||||
} else if (group_size == 64) {
|
||||
result = x[t] * inv_sqrt *
|
||||
((direction == 0) ? TURBO_WHT_SIGNS2_64[t] : TURBO_WHT_SIGNS1_64[t]);
|
||||
} else {
|
||||
// group_size == 32: normalize only (signs already applied before butterfly)
|
||||
result = x[t] * inv_sqrt;
|
||||
}
|
||||
|
||||
// InnerQ inverse: apply scale_inv AFTER WHT+signs (for V un-rotation)
|
||||
if (direction == 1 && scale_inv != nullptr) {
|
||||
result *= scale_inv[t % group_size];
|
||||
}
|
||||
|
||||
dst[base + t] = result;
|
||||
}
|
||||
|
||||
// ─── Simple copy kernel for tail elements (identity pass-through) ────────────
|
||||
|
||||
// Identity copy of the per-head tail that is not covered by a full WHT group.
//
// One thread handles one tail element. For a flat thread id `tid`, the head is
// tid / tail_size and the position inside the tail is the remainder; the element
// copied lives at head * head_dim + tail_offset + position in both src and dst.
// Threads whose id is at or beyond n_heads * tail_size do nothing.
static __global__ void k_turbo_wht_copy_tail(const float * __restrict__ src,
                                             float * __restrict__ dst,
                                             int64_t n_heads,
                                             int64_t head_dim,
                                             int64_t tail_offset,
                                             int tail_size) {
    const int64_t tid   = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const int64_t total = n_heads * tail_size;

    if (tid < total) {
        const int64_t head    = tid / tail_size;
        const int64_t in_tail = tid - head * tail_size; // same as tid % tail_size for tid >= 0
        const int64_t pos     = head * head_dim + tail_offset + in_tail;

        dst[pos] = src[pos];
    }
}
|
||||
|
||||
// ─── Dispatch ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Launches k_turbo_wht_f32<direction, group_size> with one block per group and
// one thread per element of the group.
template <int direction, int group_size>
static void turbo_wht_launch_groups(const float * src, float * dst, const float * scale_inv,
                                    int64_t n_groups, int64_t head_dim, int64_t groups_per_head,
                                    cudaStream_t stream) {
    const dim3 blocks((unsigned int) n_groups);
    const dim3 threads(group_size);
    k_turbo_wht_f32<direction, group_size><<<blocks, threads, 0, stream>>>(
        src, dst, scale_inv, n_groups, head_dim, groups_per_head);
}

// Maps the runtime direction onto the compile-time kernel variant:
// 0 selects the forward kernel, any other value selects the inverse kernel.
template <int group_size>
static void turbo_wht_launch_dir(int direction,
                                 const float * src, float * dst, const float * scale_inv,
                                 int64_t n_groups, int64_t head_dim, int64_t groups_per_head,
                                 cudaStream_t stream) {
    if (direction == 0) {
        turbo_wht_launch_groups<0, group_size>(src, dst, scale_inv, n_groups, head_dim, groups_per_head, stream);
    } else {
        turbo_wht_launch_groups<1, group_size>(src, dst, scale_inv, n_groups, head_dim, groups_per_head, stream);
    }
}

// CUDA dispatch for GGML_OP_TURBO_WHT.
//
// dst->src[0] : contiguous F32 input; ne[0] is treated as the head dimension.
// dst->src[1] : optional F32 InnerQ scale_inv buffer (may be NULL).
// op_params   : direction (0 = forward, otherwise inverse) and group_size (32, 64 or 128).
//
// Every head is split into head_dim / group_size full groups, each transformed by one
// thread block of group_size threads. The remaining head_dim % group_size elements of
// each head are copied through unchanged by k_turbo_wht_copy_tail.
void ggml_cuda_turbo_wht(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src          = dst->src[0];
    const ggml_tensor * scale_tensor = dst->src[1]; // InnerQ scale_inv (may be NULL)

    GGML_ASSERT(src->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src));
    GGML_ASSERT(ggml_is_contiguous(dst));
    // The kernels read the scale buffer as raw floats, so any other type would be misinterpreted.
    GGML_ASSERT(scale_tensor == nullptr || scale_tensor->type == GGML_TYPE_F32);

    int direction;
    int group_size;
    memcpy(&direction,  dst->op_params + 0,           sizeof(int));
    // NOTE(review): this is pointer arithmetic on op_params' element type. If op_params is an
    // int32_t array, `+ sizeof(int)` addresses element 4 rather than element 1 - confirm this
    // matches the code that writes group_size into op_params.
    memcpy(&group_size, dst->op_params + sizeof(int), sizeof(int));

    GGML_ASSERT(group_size == 32 || group_size == 64 || group_size == 128);

    // Nothing to do for empty tensors; this also avoids dividing by head_dim == 0 and
    // launching a kernel with a zero-sized grid.
    if (ggml_nelements(src) == 0) {
        return;
    }

    const int64_t head_dim        = src->ne[0];
    const int64_t n_heads         = ggml_nelements(src) / head_dim;
    const int64_t groups_per_head = head_dim / group_size;
    const int     tail_size       = (int)(head_dim % group_size);
    const int64_t n_groups        = groups_per_head * n_heads;

    // One block per group: the grid's x dimension must not be silently truncated.
    GGML_ASSERT(n_groups <= 2147483647);

    const float * src_ptr       = (const float *) src->data;
    float       * dst_ptr       = (float *) dst->data;
    const float * scale_inv_ptr = scale_tensor ? (const float *) scale_tensor->data : nullptr;

    cudaStream_t stream = ctx.stream();

    // Process full groups
    if (n_groups > 0) {
        switch (group_size) {
            case 128:
                turbo_wht_launch_dir<128>(direction, src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head, stream);
                break;
            case 64:
                turbo_wht_launch_dir<64>(direction, src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head, stream);
                break;
            default: // 32, guaranteed by the assert above
                turbo_wht_launch_dir<32>(direction, src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head, stream);
                break;
        }
    }

    // Pass through tail elements unchanged (no rotation)
    // Not needed for 64-aligned dims but kept for completeness
    if (tail_size > 0) {
        const int64_t total_tail = n_heads * tail_size;
        const int     block_sz   = 256;
        const int     n_blocks   = (int)((total_tail + block_sz - 1) / block_sz);
        k_turbo_wht_copy_tail<<<n_blocks, block_sz, 0, stream>>>(
            src_ptr, dst_ptr, n_heads, head_dim, groups_per_head * group_size, tail_size);
    }
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_turbo_wht(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
Vendored
+29
-3
@@ -33,11 +33,30 @@
|
||||
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
|
||||
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
|
||||
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
|
||||
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
|
||||
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
// __shfl_sync: support both 3-arg (mask, var, srcLane) and 4-arg (mask, var, srcLane, width) calls
|
||||
// The mask argument is dropped entirely: the HIP __shfl* intrinsics used below take no mask, so no cast is needed.
|
||||
#define __SHFL_SYNC_3(mask, var, srcLane) __shfl(var, srcLane, warpSize)
|
||||
#define __SHFL_SYNC_4(mask, var, srcLane, width) __shfl(var, srcLane, width)
|
||||
#define __SHFL_GET_MACRO(_1, _2, _3, _4, NAME, ...) NAME
|
||||
#define __shfl_sync(...) __SHFL_GET_MACRO(__VA_ARGS__, __SHFL_SYNC_4, __SHFL_SYNC_3)(__VA_ARGS__)
|
||||
// __shfl_up_sync: support 3-arg and 4-arg calls (HIP ignores mask)
|
||||
#define __SHFL_UP_SYNC_3(mask, var, delta) __shfl_up(var, delta, warpSize)
|
||||
#define __SHFL_UP_SYNC_4(mask, var, delta, width) __shfl_up(var, delta, width)
|
||||
#define __SHFL_UP_GET(_1, _2, _3, _4, NAME, ...) NAME
|
||||
#define __shfl_up_sync(...) __SHFL_UP_GET(__VA_ARGS__, __SHFL_UP_SYNC_4, __SHFL_UP_SYNC_3)(__VA_ARGS__)
|
||||
// __shfl_xor_sync: support 3-arg and 4-arg calls (HIP ignores mask)
|
||||
#define __SHFL_XOR_SYNC_3(mask, var, laneMask) __shfl_xor(var, laneMask, warpSize)
|
||||
#define __SHFL_XOR_SYNC_4(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define __SHFL_XOR_GET(_1, _2, _3, _4, NAME, ...) NAME
|
||||
#define __shfl_xor_sync(...) __SHFL_XOR_GET(__VA_ARGS__, __SHFL_XOR_SYNC_4, __SHFL_XOR_SYNC_3)(__VA_ARGS__)
|
||||
// __shfl_down_sync: support 3-arg and 4-arg calls (HIP ignores mask)
|
||||
#define __SHFL_DOWN_SYNC_3(mask, var, delta) __shfl_down(var, delta, warpSize)
|
||||
#define __SHFL_DOWN_SYNC_4(mask, var, delta, width) __shfl_down(var, delta, width)
|
||||
#define __SHFL_DOWN_GET(_1, _2, _3, _4, NAME, ...) NAME
|
||||
#define __shfl_down_sync(...) __SHFL_DOWN_GET(__VA_ARGS__, __SHFL_DOWN_SYNC_4, __SHFL_DOWN_SYNC_3)(__VA_ARGS__)
|
||||
#define __all_sync(mask, var) __all(var)
|
||||
#define __any_sync(mask, var) __any(var)
|
||||
#define __ballot_sync(mask, var) ((uint32_t)__ballot(var))
|
||||
#define cublasStrsmBatched hipblasStrsmBatched
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasDestroy hipblasDestroy
|
||||
@@ -121,6 +140,10 @@
|
||||
#define cudaStreamPerThread hipStreamPerThread
|
||||
#define cudaStreamSynchronize hipStreamSynchronize
|
||||
#define cudaStreamWaitEvent hipStreamWaitEvent
|
||||
#define cudaMemcpyToSymbol hipMemcpyToSymbol
|
||||
#define cudaMemcpyFromSymbol hipMemcpyFromSymbol
|
||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||
#define cudaGraphExec_t hipGraphExec_t
|
||||
#define cudaGraphNode_t hipGraphNode_t
|
||||
#define cudaKernelNodeParams hipKernelNodeParams
|
||||
@@ -133,6 +156,9 @@
|
||||
#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
|
||||
#define cudaGraphInstantiate hipGraphInstantiate
|
||||
#define cudaStreamEndCapture hipStreamEndCapture
|
||||
#define cudaStreamCaptureStatus hipStreamCaptureStatus
|
||||
#define cudaStreamIsCapturing hipStreamIsCapturing
|
||||
#define cudaStreamCaptureStatusNone hipStreamCaptureStatusNone
|
||||
#define cudaGraphDestroy hipGraphDestroy
|
||||
#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
|
||||
#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
|
||||
|
||||
@@ -62,6 +62,8 @@ list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")
|
||||
|
||||
file(GLOB GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-tile*.cu")
|
||||
# Exclude D>=576 tile kernels: exceed HIP local memory limit (67584 > 65536)
|
||||
list(FILTER SRCS EXCLUDE REGEX "dkq(576|640)")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
@@ -79,7 +81,28 @@ else()
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu)
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-q8_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo3_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo3_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-f16.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo2_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-q8_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo2_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo2_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-f16.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo2_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo3_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-q8_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo4_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo4_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-f16.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo3_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo4_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo2_0.cu
|
||||
../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo4_0.cu)
|
||||
endif()
|
||||
|
||||
ggml_add_backend_library(ggml-hip
|
||||
|
||||
@@ -650,6 +650,22 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri(ggml_m
|
||||
return res;
|
||||
}
|
||||
|
||||
// Returns the pipeline for the "kernel_turbo_wht" Metal kernel.
// The library cache is consulted first; on a miss the kernel is compiled with an
// empty set of function constants. The returned params always report one
// simdgroup (nsg = 1) and no threadgroup memory (smem = 0).
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_turbo_wht(ggml_metal_library_t lib) {
    static const char kernel_name[] = "kernel_turbo_wht";

    ggml_metal_pipeline_with_params pipe = ggml_metal_library_get_pipeline(lib, kernel_name);

    if (!pipe.pipeline) {
        // cache miss: compile under the same base/name, no function constants are set
        ggml_metal_cv_t consts = ggml_metal_cv_init();
        pipe = ggml_metal_library_compile_pipeline(lib, kernel_name, kernel_name, consts);
        ggml_metal_cv_free(consts);
    }

    pipe.smem = 0;
    pipe.nsg  = 1;

    return pipe;
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext(ggml_metal_library_t lib, const ggml_tensor * op, int nsg, int nxpsg, int r1ptg) {
|
||||
char base[256];
|
||||
char name[256];
|
||||
@@ -813,6 +829,18 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
|
||||
nr0 = N_R0_Q8_0;
|
||||
smem = 32*sizeof(float)*N_R0_Q8_0;
|
||||
} break;
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
{
|
||||
nsg = N_SG_TQ3_1S;
|
||||
nr0 = N_R0_TQ3_1S;
|
||||
smem = 32*sizeof(float)*N_R0_TQ3_1S;
|
||||
} break;
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
{
|
||||
nsg = N_SG_TQ4_1S;
|
||||
nr0 = N_R0_TQ4_1S;
|
||||
smem = 32*sizeof(float)*N_R0_TQ4_1S;
|
||||
} break;
|
||||
case GGML_TYPE_MXFP4:
|
||||
{
|
||||
nsg = N_SG_MXFP4;
|
||||
@@ -931,6 +959,81 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
|
||||
return res;
|
||||
}
|
||||
|
||||
// TQ3_1S / TQ4_1S rotated variant: uses dequantize_*_rotated (no inverse RHT)
|
||||
// Looks up (or compiles) the rotated mul_mm pipeline for the src0/src1 type pair of `op`.
//
// The pipeline is specialised on two bounds-check flags, passed as function constants
// FC_MUL_MM + 0 and FC_MUL_MM + 1:
//   bc_inp - src0->ne[0] is not a multiple of 32
//   bc_out - op->ne[0] is not a multiple of 64, or op->ne[1] is not a multiple of 32
// Threadgroup memory is 8192 bytes when bc_out is set and 4096 + 2048 bytes otherwise.
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_tq_rotated(ggml_metal_library_t lib, const ggml_tensor * op) {
    const ggml_tensor * weights = op->src[0];
    const ggml_tensor * acts    = op->src[1];

    const bool bc_inp = (weights->ne[0] % 32) != 0;
    const bool bc_out = !(op->ne[0] % 64 == 0 && op->ne[1] % 32 == 0);

    char base[256];
    char name[256];

    snprintf(base, sizeof(base), "kernel_mul_mm_%s_rotated_%s", ggml_type_name(weights->type), ggml_type_name(acts->type));
    snprintf(name, sizeof(name), "%s_bci=%d_bco=%d", base, bc_inp, bc_out);

    ggml_metal_pipeline_with_params pipe = ggml_metal_library_get_pipeline(lib, name);

    if (!pipe.pipeline) {
        ggml_metal_cv_t consts = ggml_metal_cv_init();

        ggml_metal_cv_set_bool(consts, bc_inp, FC_MUL_MM + 0);
        ggml_metal_cv_set_bool(consts, bc_out, FC_MUL_MM + 1);

        pipe = ggml_metal_library_compile_pipeline(lib, base, name, consts);

        ggml_metal_cv_free(consts);
    }

    if (bc_out) {
        pipe.smem = 8192;
    } else {
        pipe.smem = 4096 + 2048;
    }

    return pipe;
}
|
||||
|
||||
// TQ3_1S / TQ4_1S rotated MUL_MAT_ID variant
|
||||
// Looks up (or compiles) the rotated mul_mm_id pipeline for the src0/src1 type pair of `op`.
//
// The only specialisation is the input bounds-check flag bc_inp (src0->ne[0] is not a
// multiple of 32), passed as function constant FC_MUL_MM + 0.
// The returned params always request 8192 bytes of threadgroup memory.
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_tq_rotated(ggml_metal_library_t lib, const ggml_tensor * op) {
    const ggml_tensor * weights = op->src[0];
    const ggml_tensor * acts    = op->src[1];

    const bool bc_inp = (weights->ne[0] % 32) != 0;

    char base[256];
    char name[256];

    snprintf(base, sizeof(base), "kernel_mul_mm_id_%s_rotated_%s", ggml_type_name(weights->type), ggml_type_name(acts->type));
    snprintf(name, sizeof(name), "%s_bci=%d", base, bc_inp);

    ggml_metal_pipeline_with_params pipe = ggml_metal_library_get_pipeline(lib, name);

    if (!pipe.pipeline) {
        ggml_metal_cv_t consts = ggml_metal_cv_init();

        ggml_metal_cv_set_bool(consts, bc_inp, FC_MUL_MM + 0);

        pipe = ggml_metal_library_compile_pipeline(lib, base, name, consts);

        ggml_metal_cv_free(consts);
    }

    pipe.smem = 8192;

    return pipe;
}
|
||||
|
||||
// TQ3_1S / TQ4_1S activation pre-rotation pipeline (shared by both)
|
||||
// Returns the activation rotation pipeline: "kernel_tq3_unrotate_act" when `inverse`
// is true, "kernel_tq3_rotate_act" otherwise. The kernel has no specialisation, so the
// cache name equals the base name and it is compiled without function constants.
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tq3_rotate_act(ggml_metal_library_t lib, bool inverse) {
    const char * base;
    if (inverse) {
        base = "kernel_tq3_unrotate_act";
    } else {
        base = "kernel_tq3_rotate_act";
    }

    char name[256];
    snprintf(name, sizeof(name), "%s", base);

    ggml_metal_pipeline_with_params pipe = ggml_metal_library_get_pipeline(lib, name);

    if (!pipe.pipeline) {
        pipe = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
    }

    return pipe;
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0(ggml_metal_library_t lib, int ne02, int ne20) {
|
||||
char base[256];
|
||||
char name[256];
|
||||
@@ -943,7 +1046,14 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0(g
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
|
||||
}
|
||||
|
||||
// Graph reservation may pass worst-case ne20=ne02 (e.g. 256*256*2=128KB).
|
||||
// At runtime ne20 is the actual n_expert_used (e.g. 8), keeping shmem within limits.
|
||||
// Cap to 32KB (Apple Silicon threadgroup memory limit) to prevent reservation assert
|
||||
// on high-expert-count MoE models (Qwen3.5-35B with 256 experts).
|
||||
res.smem = (size_t) ne02*ne20*sizeof(uint16_t);
|
||||
if (res.smem > 32768) {
|
||||
res.smem = 32768;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
@@ -1037,6 +1147,18 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_id(ggml_m
|
||||
nr0 = N_R0_Q8_0;
|
||||
smem = 32*sizeof(float)*N_R0_Q8_0;
|
||||
} break;
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
{
|
||||
nsg = N_SG_TQ3_1S;
|
||||
nr0 = N_R0_TQ3_1S;
|
||||
smem = 32*sizeof(float)*N_R0_TQ3_1S;
|
||||
} break;
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
{
|
||||
nsg = N_SG_TQ4_1S;
|
||||
nr0 = N_R0_TQ4_1S;
|
||||
smem = 32*sizeof(float)*N_R0_TQ4_1S;
|
||||
} break;
|
||||
case GGML_TYPE_MXFP4:
|
||||
{
|
||||
nsg = N_SG_MXFP4;
|
||||
@@ -1387,12 +1509,17 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext(
|
||||
// do bounds checks for the mask?
|
||||
const bool bc_mask = op->src[3] && (op->src[3]->ne[1] % 8 != 0);
|
||||
|
||||
snprintf(base, 256, "kernel_%s_%s_dk%d_dv%d",
|
||||
// Asymmetric K/V: always encode both K and V types in the pipeline name.
|
||||
// Symmetric case: ktype == vtype, so the name just has the type twice.
|
||||
// This avoids ambiguity if a type name contains underscores (e.g. q4_0).
|
||||
snprintf(base, 256, "kernel_%s_k%s_v%s_dk%d_dv%d",
|
||||
"flash_attn_ext",
|
||||
ggml_type_name(op->src[1]->type),
|
||||
ggml_type_name(op->src[2]->type),
|
||||
dk,
|
||||
dv);
|
||||
|
||||
|
||||
snprintf(name, 256, "%s_mask=%d_sinks=%d_bias=%d_scap=%d_kvpad=%d_bcm=%d_ns10=%d_ns20=%d_nsg=%d",
|
||||
base,
|
||||
has_mask,
|
||||
@@ -1450,12 +1577,16 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_v
|
||||
const int32_t ns10 = op->src[1]->nb[1]/op->src[1]->nb[0];
|
||||
const int32_t ns20 = op->src[2]->nb[1]/op->src[2]->nb[0];
|
||||
|
||||
snprintf(base, 256, "kernel_%s_%s_dk%d_dv%d",
|
||||
// Asymmetric K/V: always encode both K and V types in the pipeline name.
|
||||
// Uses k/v prefix to avoid ambiguity with type names containing underscores.
|
||||
snprintf(base, 256, "kernel_%s_k%s_v%s_dk%d_dv%d",
|
||||
"flash_attn_ext_vec",
|
||||
ggml_type_name(op->src[1]->type),
|
||||
ggml_type_name(op->src[2]->type),
|
||||
dk,
|
||||
dv);
|
||||
|
||||
|
||||
snprintf(name, 256, "%s_mask=%d_sink=%d_bias=%d_scap=%d_kvpad=%d_ns10=%d_ns20=%d_nsg=%d_nwg=%d",
|
||||
base,
|
||||
has_mask,
|
||||
|
||||
@@ -129,8 +129,12 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_turbo_wht (ggml_metal_library_t lib);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, const struct ggml_tensor * op, int nsg, int nxpsg, int r1ptg);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_tq_rotated(ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_tq_rotated(ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tq3_rotate_act (ggml_metal_library_t lib, bool inverse);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0 (ggml_metal_library_t lib, int ne02, int ne20);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
|
||||
@@ -225,6 +225,43 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
|
||||
#endif
|
||||
|
||||
// TurboQuant: auto-select dequant path based on hardware
|
||||
// M1/M2/M3/M4 (no tensor API): 4-mag LUT (+38-45% decode at long ctx)
|
||||
// M5+ (has tensor API): 8-entry full LUT (best decode speed)
|
||||
{
|
||||
const char * force_4mag = getenv("TURBO_FORCE_4MAG");
|
||||
// NOTE: TURBO_USE_4MAG is only defined below for devices without the tensor API
|
||||
// (pre-M5: constant cache too slow for the 8-entry LUT) or when TURBO_FORCE_4MAG=1.
|
||||
// Intended future behaviour on M5+: 4-mag for mid-context (8K-20K), 8-LUT otherwise;
|
||||
// that runtime, context-depth-based selection is not implemented yet (see TODO below).
|
||||
if (!ggml_metal_device_get_props(dev)->has_tensor || (force_4mag && force_4mag[0] == '1')) {
|
||||
[prep setObject:@"1" forKey:@"TURBO_USE_4MAG"];
|
||||
GGML_LOG_INFO("%s: turbo3 using 4-mag LUT%s\n", __func__,
|
||||
force_4mag ? " (forced)" : " (pre-M5 hardware)");
|
||||
}
|
||||
// Sparse V dequant: skip V for negligible attention weights
|
||||
// Enabled by default on all Metal (validated: PPL identical, NIAH 9/9, 30+ testers)
|
||||
// Opt-out via TURBO_SPARSE_V=0
|
||||
const char * sparse_v_env = getenv("TURBO_SPARSE_V");
|
||||
const bool sparse_v_disabled = sparse_v_env && sparse_v_env[0] == '0';
|
||||
if (!sparse_v_disabled) {
|
||||
[prep setObject:@"1" forKey:@"TURBO_SPARSE_V"];
|
||||
GGML_LOG_INFO("%s: turbo3 sparse V dequant enabled (opt-out: TURBO_SPARSE_V=0)\n", __func__);
|
||||
}
|
||||
// TODO: context-adaptive dispatch — compile both 4-mag and 8-LUT
|
||||
// FA kernel instantiations, select based on ne11 (KV cache size)
|
||||
// at dispatch time in ggml_metal_op_flash_attn_ext()
|
||||
}
|
||||
|
||||
// TurboQuant profiling: set TURBO_PROFILE_MODE env var (0-4)
|
||||
{
|
||||
const char * pm = getenv("TURBO_PROFILE_MODE");
|
||||
if (pm && pm[0] >= '0' && pm[0] <= '4') {
|
||||
[prep setObject:[NSString stringWithUTF8String:pm] forKey:@"TURBO_PROFILE_MODE"];
|
||||
GGML_LOG_INFO("%s: TURBO_PROFILE_MODE=%s\n", __func__, pm);
|
||||
}
|
||||
}
|
||||
|
||||
MTLCompileOptions * options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = prep;
|
||||
|
||||
@@ -1164,7 +1201,23 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
return false;
|
||||
}
|
||||
if (op->src[1]->type != op->src[2]->type) {
|
||||
return false;
|
||||
// Allow asymmetric K/V for supported mixed pairs:
|
||||
// - turbo x turbo (any combination)
|
||||
// - q8_0 x turbo (either direction)
|
||||
const bool k_is_turbo = (op->src[1]->type == GGML_TYPE_TURBO2_0 ||
|
||||
op->src[1]->type == GGML_TYPE_TURBO3_0 ||
|
||||
op->src[1]->type == GGML_TYPE_TURBO4_0);
|
||||
const bool v_is_turbo = (op->src[2]->type == GGML_TYPE_TURBO2_0 ||
|
||||
op->src[2]->type == GGML_TYPE_TURBO3_0 ||
|
||||
op->src[2]->type == GGML_TYPE_TURBO4_0);
|
||||
const bool k_is_q8 = (op->src[1]->type == GGML_TYPE_Q8_0);
|
||||
const bool v_is_q8 = (op->src[2]->type == GGML_TYPE_Q8_0);
|
||||
const bool supported = (k_is_turbo && v_is_turbo) ||
|
||||
(k_is_q8 && v_is_turbo) ||
|
||||
(k_is_turbo && v_is_q8);
|
||||
if (!supported) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
switch (op->src[1]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
@@ -1174,6 +1227,9 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_TURBO2_0:
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
case GGML_TYPE_TURBO4_0:
|
||||
break;
|
||||
case GGML_TYPE_BF16:
|
||||
if (!has_bfloat) {
|
||||
@@ -1192,6 +1248,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
return true;
|
||||
case GGML_OP_GATED_DELTA_NET:
|
||||
return has_simdgroup_reduction && op->src[2]->ne[0] % 32 == 0;
|
||||
case GGML_OP_TURBO_WHT:
|
||||
return op->src[0]->ne[0] % 128 == 0;
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
@@ -1215,6 +1273,9 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_TURBO2_0:
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
case GGML_TYPE_TURBO4_0:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -1241,6 +1302,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
@@ -1272,6 +1335,9 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_TURBO2_0:
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
case GGML_TYPE_TURBO4_0:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
||||
@@ -84,6 +84,12 @@
|
||||
#define N_R0_IQ4_XS 2
|
||||
#define N_SG_IQ4_XS 2
|
||||
|
||||
#define N_R0_TQ3_1S 8
|
||||
#define N_SG_TQ3_1S 2
|
||||
|
||||
#define N_R0_TQ4_1S 8
|
||||
#define N_SG_TQ4_1S 2
|
||||
|
||||
// function constants offsets
|
||||
#define FC_FLASH_ATTN_EXT_PAD 100
|
||||
#define FC_FLASH_ATTN_EXT_BLK 200
|
||||
@@ -101,6 +107,9 @@
|
||||
#define FC_SUM_ROWS 1400
|
||||
#define FC_UPSCALE 1500
|
||||
#define FC_GATED_DELTA_NET 1600
|
||||
#define FC_TURBO_WHT 1700
|
||||
#define FC_TURBO_FLASH_P1 1800
|
||||
#define FC_TURBO_FLASH_P2 1900
|
||||
|
||||
// op-specific constants
|
||||
#define OP_FLASH_ATTN_EXT_NQPSG 8
|
||||
@@ -429,6 +438,40 @@ typedef struct {
|
||||
float logit_softcap;
|
||||
} ggml_metal_kargs_flash_attn_ext_vec;
|
||||
|
||||
// TurboFlash two-pass: asymmetric K=q8_0, V=turbo3 fused attention
|
||||
// Pass 1 args (block scoring + partial V accumulation)
|
||||
typedef struct {
|
||||
int32_t ne01; // Q extent along dim 1 (rows, cf. nb01) - NOTE(review): was labelled "query heads"; confirm
|
||||
int32_t ne02; // Q extent along dim 2 (heads, cf. nb02)
|
||||
int32_t ne03; // Q extent along dim 3 (batch, cf. nb03) - NOTE(review): was labelled "query heads"; confirm
|
||||
uint64_t nb01; // Q row stride
|
||||
uint64_t nb02; // Q head stride
|
||||
uint64_t nb03; // Q batch stride
|
||||
int32_t ne11; // KV sequence length (T_kv)
|
||||
int32_t ne_12_2; // KV extent along dim 2 (heads, cf. nb12/nb22)
|
||||
int32_t ne_12_3; // KV extent along dim 3 (batch, cf. nb13/nb23) - NOTE(review): was labelled "head count"; confirm
|
||||
uint64_t nb11; // K row stride (bytes per KV token in K)
|
||||
uint64_t nb12; // K head stride
|
||||
uint64_t nb13; // K batch stride
|
||||
uint64_t nb21; // V row stride (bytes per KV token in V)
|
||||
uint64_t nb22; // V head stride
|
||||
uint64_t nb23; // V batch stride
|
||||
int32_t ne31; // mask dim 1
|
||||
int32_t ne32; // mask dim 2
|
||||
int32_t ne33; // mask dim 3
|
||||
uint64_t nb31; // mask stride 1
|
||||
uint64_t nb32; // mask stride 2
|
||||
uint64_t nb33; // mask stride 3
|
||||
float scale; // attention scale (1/sqrt(dk))
|
||||
int32_t n_blocks; // ceil(ne11 / BLOCK_SIZE)
|
||||
} ggml_metal_kargs_turbo_flash_p1;
|
||||
|
||||
// Pass 2 args (merge partials + inverse WHT + write output)
|
||||
// NOTE(review): field order and types presumably must match the Metal-side struct; do not reorder.
typedef struct {
|
||||
int32_t ne01; // number of query heads (total n_bh)
|
||||
int32_t n_blocks; // number of blocks from pass 1 - presumably equal to the pass-1 n_blocks; confirm at the dispatch site
|
||||
} ggml_metal_kargs_turbo_flash_p2;
|
||||
|
||||
typedef struct {
|
||||
int32_t nrows;
|
||||
} ggml_metal_kargs_flash_attn_ext_vec_reduce;
|
||||
@@ -889,6 +932,11 @@ typedef struct {
|
||||
uint64_t nb3;
|
||||
} ggml_metal_kargs_gated_delta_net;
|
||||
|
||||
// Kernel arguments for kernel_turbo_wht.
typedef struct {
|
||||
int64_t n_elements; // total elements in tensor (host passes ggml_nelements of src[0])
|
||||
int32_t direction; // 0 = forward, 1 = inverse (copied from the first int of op_params)
|
||||
} ggml_metal_kargs_turbo_wht;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne00;
|
||||
int32_t ne01;
|
||||
|
||||
@@ -172,6 +172,26 @@ static bool ggml_metal_op_concurrency_add(ggml_metal_op_t ctx, const ggml_tensor
|
||||
return ggml_mem_ranges_add(ctx->mem_ranges, node);
|
||||
}
|
||||
|
||||
static bool ggml_metal_op_mutates_tq_src1(const ggml_tensor * node) {
|
||||
if (node == nullptr || node->src[0] == nullptr || node->src[1] == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const bool is_tq_weight = node->src[0]->type == GGML_TYPE_TQ3_1S ||
|
||||
node->src[0]->type == GGML_TYPE_TQ4_1S;
|
||||
if (!is_tq_weight) {
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (node->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
||||
struct ggml_tensor * node = ctx->node(idx);
|
||||
|
||||
@@ -209,6 +229,15 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
||||
|
||||
int n_fuse = 1;
|
||||
|
||||
// Rotated TQ weight kernels temporarily rotate src1 in-place before the
|
||||
// matmul and restore it afterwards. The generic range tracker only sees a
|
||||
// read dependency on src1, so sibling projections can be scheduled as
|
||||
// concurrent even though they race on the shared activation buffer.
|
||||
// Gemma4 GEGLU / MoE fan-out is especially sensitive to this hazard.
|
||||
if (ggml_metal_op_mutates_tq_src1(node)) {
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
}
|
||||
|
||||
// check if the current node can run concurrently with other nodes before it
|
||||
// the condition is that:
|
||||
// - the current node cannot write to any previous src or dst ranges
|
||||
@@ -337,6 +366,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
||||
{
|
||||
n_fuse = ggml_metal_op_gated_delta_net(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_TURBO_WHT:
|
||||
{
|
||||
n_fuse = ggml_metal_op_turbo_wht(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
{
|
||||
n_fuse = ggml_metal_op_solve_tri(ctx, idx);
|
||||
@@ -1656,6 +1689,39 @@ int ggml_metal_op_gated_delta_net(ggml_metal_op_t ctx, int idx) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Encodes GGML_OP_TURBO_WHT for node `idx` and returns 1 (a single node is consumed).
//
// Bindings: index 0 = kernel args, index 1 = src[0], index 2 = the op's own buffer.
// The dispatch assigns one thread to each 128-element group, 256 threads per threadgroup.
//
// NOTE(review): the group size is hard-coded to 128 and only the first int of op_params
// (direction) is read; src[1] is not bound here. Confirm that callers never reach this
// path with a different group size or with a scale tensor attached.
int ggml_metal_op_turbo_wht(ggml_metal_op_t ctx, int idx) {
    ggml_tensor * op = ctx->node(idx);

    ggml_metal_encoder_t enc = ctx->enc;
    ggml_metal_library_t lib = ctx->lib;

    // direction is stored as the first int of op_params
    int direction;
    memcpy(&direction, op->op_params, sizeof(int));

    const int64_t n_elements = ggml_nelements(op->src[0]);

    ggml_metal_kargs_turbo_wht args = {
        /*.n_elements =*/ n_elements,
        /*.direction  =*/ direction,
    };

    auto pipeline = ggml_metal_library_get_pipeline_turbo_wht(lib);

    ggml_metal_encoder_set_pipeline(enc, pipeline);
    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);

    // round the number of 128-element groups up to whole threadgroups
    const int     tg_size     = 256;
    const int64_t group_count = n_elements / 128;
    const int     tg_count    = (group_count + tg_size - 1) / tg_size;

    ggml_metal_encoder_dispatch_threadgroups(enc, tg_count, 1, 1, tg_size, 1, 1);

    return 1;
}
|
||||
|
||||
int ggml_metal_op_solve_tri(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_tensor * op = ctx->node(idx);
|
||||
|
||||
@@ -2155,52 +2221,103 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
|
||||
!ggml_is_transposed(op->src[1]) &&
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
props_dev->has_simdgroup_mm && ne00 >= 64 && ne11 > ne11_mm_min) {
|
||||
//GGML_LOG_INFO("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
props_dev->has_simdgroup_mm && ne00 >= 64 &&
|
||||
(ne11 > ne11_mm_min || op->src[0]->type == GGML_TYPE_TQ3_1S || op->src[0]->type == GGML_TYPE_TQ4_1S)) {
|
||||
// Route all TQ weights through the rotated mul_mm path.
|
||||
// Gemma4 decode still degrades on the fused mul_mv kernel even after the broader
|
||||
// TQ backend fixes, while the rotated mul_mm path matches CPU behavior.
|
||||
|
||||
// some Metal matrix data types require aligned pointers
|
||||
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
|
||||
//switch (op->src[0]->type) {
|
||||
// case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
|
||||
// case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break;
|
||||
// case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break;
|
||||
// default: break;
|
||||
//}
|
||||
const bool is_tq_weight = (op->src[0]->type == GGML_TYPE_TQ3_1S || op->src[0]->type == GGML_TYPE_TQ4_1S);
|
||||
|
||||
auto pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
|
||||
// TQ weight optimization: pre-rotate activations, use no-RHT dequant, then un-rotate
|
||||
if (is_tq_weight && ne00 % 32 == 0) {
|
||||
// Step 1: Forward-rotate src1 in-place
|
||||
const int64_t n_act = (int64_t)ne10 * ne11 * ne12 * ne13;
|
||||
int64_t n_act_val = n_act;
|
||||
|
||||
ggml_metal_kargs_mul_mm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.r2 =*/ r2,
|
||||
/*.r3 =*/ r3,
|
||||
};
|
||||
auto pipeline_rot = ggml_metal_library_get_pipeline_tq3_rotate_act(lib, false);
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline_rot);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 0);
|
||||
ggml_metal_encoder_set_bytes (enc, &n_act_val, sizeof(n_act_val), 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (int)(n_act_val / 32), 1, 1, 32, 1, 1);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
|
||||
// Memory barrier between rotate and matmul
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
||||
const size_t smem = pipeline.smem;
|
||||
// Step 2: Dispatch rotated mul_mm (uses no-RHT dequant)
|
||||
auto pipeline_mm = ggml_metal_library_get_pipeline_mul_mm_tq_rotated(lib, op);
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
ggml_metal_kargs_mul_mm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.r2 =*/ r2,
|
||||
/*.r3 =*/ r3,
|
||||
};
|
||||
|
||||
const int nr0 = pipeline.nr0;
|
||||
const int nr1 = pipeline.nr1;
|
||||
const int nsg = pipeline.nsg;
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline_mm);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + nr1 - 1) / nr1), ((ne01 + nr0 - 1) / nr0), ne12 * ne13, 32, nsg, 1);
|
||||
const size_t smem_tq = pipeline_mm.smem;
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem_tq, 0);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
|
||||
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
||||
auto pipeline_unrot = ggml_metal_library_get_pipeline_tq3_rotate_act(lib, true);
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline_unrot);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 0);
|
||||
ggml_metal_encoder_set_bytes (enc, &n_act_val, sizeof(n_act_val), 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (int)(n_act_val / 32), 1, 1, 32, 1, 1);
|
||||
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
} else {
|
||||
auto pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
|
||||
|
||||
ggml_metal_kargs_mul_mm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.r2 =*/ r2,
|
||||
/*.r3 =*/ r3,
|
||||
};
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
|
||||
|
||||
const size_t smem = pipeline.smem;
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
|
||||
const int nr0 = pipeline.nr0;
|
||||
const int nr1 = pipeline.nr1;
|
||||
const int nsg = pipeline.nsg;
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + nr1 - 1) / nr1), ((ne01 + nr0 - 1) / nr0), ne12 * ne13, 32, nsg, 1);
|
||||
}
|
||||
} else {
|
||||
auto pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
|
||||
|
||||
@@ -2243,7 +2360,9 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
|
||||
if (op->src[0]->type == GGML_TYPE_F32 ||
|
||||
op->src[0]->type == GGML_TYPE_F16 ||
|
||||
op->src[0]->type == GGML_TYPE_BF16 ||
|
||||
op->src[0]->type == GGML_TYPE_Q8_0) {
|
||||
op->src[0]->type == GGML_TYPE_Q8_0 ||
|
||||
op->src[0]->type == GGML_TYPE_TQ3_1S ||
|
||||
op->src[0]->type == GGML_TYPE_TQ4_1S) {
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + nr0 - 1)/(nr0)), ((ne11 + nr1 - 1)/nr1), ne12*ne13, 32, nsg, 1);
|
||||
} else {
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + nr0*nsg - 1)/(nr0*nsg)), ((ne11 + nr1 - 1)/nr1), ne12*ne13, 32, nsg, 1);
|
||||
@@ -2303,12 +2422,13 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
|
||||
|
||||
const uint32_t r2 = 1;
|
||||
const uint32_t r3 = 1;
|
||||
const bool is_tq_weight = (op->src[0]->type == GGML_TYPE_TQ3_1S || op->src[0]->type == GGML_TYPE_TQ4_1S);
|
||||
|
||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||
// to the matrix-vector kernel
|
||||
// ne20 = n_used_experts
|
||||
// ne21 = n_rows (batch size)
|
||||
const int ne21_mm_id_min = 32;
|
||||
const int ne21_mm_id_min = is_tq_weight ? 1 : 32;
|
||||
|
||||
if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min)) {
|
||||
// some Metal matrix data types require aligned pointers
|
||||
@@ -2362,40 +2482,96 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
||||
{
|
||||
auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
|
||||
// TQ weight MoE: pre-rotate activations for rotated dispatch
|
||||
if (is_tq_weight && ne00 % 32 == 0) {
|
||||
const int64_t n_act = (int64_t)ne10 * ne11 * ne12 * ne13;
|
||||
int64_t n_act_val = n_act;
|
||||
|
||||
ggml_metal_kargs_mul_mm_id args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne11 =*/ ne11, // n_expert_used (bcast)
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne20 =*/ ne20, // n_expert_used
|
||||
/*.ne21 =*/ ne21, // n_tokens
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.r2 =*/ r2,
|
||||
/*.r3 =*/ r3,
|
||||
};
|
||||
auto pipeline_rot = ggml_metal_library_get_pipeline_tq3_rotate_act(lib, false);
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline_rot);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src1, 0);
|
||||
ggml_metal_encoder_set_bytes (enc, &n_act_val, sizeof(n_act_val), 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (int)(n_act_val / 32), 1, 1, 32, 1, 1);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src1, 2);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_tpe, 3);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_ids, 4);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_dst, 5);
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
||||
const size_t smem = pipeline.smem;
|
||||
auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id_tq_rotated(lib, op);
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
ggml_metal_kargs_mul_mm_id args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne20 =*/ ne20,
|
||||
/*.ne21 =*/ ne21,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.r2 =*/ r2,
|
||||
/*.r3 =*/ r3,
|
||||
};
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src1, 2);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_tpe, 3);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_ids, 4);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_dst, 5);
|
||||
|
||||
const size_t smem = pipeline.smem;
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
|
||||
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
||||
auto pipeline_unrot = ggml_metal_library_get_pipeline_tq3_rotate_act(lib, true);
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline_unrot);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src1, 0);
|
||||
ggml_metal_encoder_set_bytes (enc, &n_act_val, sizeof(n_act_val), 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (int)(n_act_val / 32), 1, 1, 32, 1, 1);
|
||||
|
||||
// Barrier: ensure unrotate completes before any subsequent op reads src1
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
} else {
|
||||
auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
|
||||
|
||||
ggml_metal_kargs_mul_mm_id args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne20 =*/ ne20,
|
||||
/*.ne21 =*/ ne21,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.r2 =*/ r2,
|
||||
/*.r3 =*/ r3,
|
||||
};
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src1, 2);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_tpe, 3);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_ids, 4);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_dst, 5);
|
||||
|
||||
const size_t smem = pipeline.smem;
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto pipeline = ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
|
||||
@@ -2448,7 +2624,9 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
|
||||
if (op->src[0]->type == GGML_TYPE_F32 ||
|
||||
op->src[0]->type == GGML_TYPE_F16 ||
|
||||
op->src[0]->type == GGML_TYPE_BF16 ||
|
||||
op->src[0]->type == GGML_TYPE_Q8_0) {
|
||||
op->src[0]->type == GGML_TYPE_Q8_0 ||
|
||||
op->src[0]->type == GGML_TYPE_TQ3_1S ||
|
||||
op->src[0]->type == GGML_TYPE_TQ4_1S) {
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nr0 - 1)/(nr0), (_ne1 + nr1 - 1)/nr1, ne123, 32, nsg, 1);
|
||||
} else {
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nr0*nsg - 1)/(nr0*nsg), (_ne1 + nr1 - 1)/nr1, ne123, 32, nsg, 1);
|
||||
@@ -2511,9 +2689,57 @@ bool ggml_metal_op_flash_attn_ext_use_vec(const ggml_tensor * op) {
|
||||
const int64_t ne01 = op->src[0]->ne[1]; // batch size
|
||||
|
||||
// use vec kernel if the batch size is small and if the head size is supported
|
||||
// EXPERIMENT: optionally force non-vec (only when TURBO_FORCE_NONVEC=1) for turbo2/turbo3/turbo4 K types; intended for pre-M5 hardware.
|
||||
// The vec kernel uses nl=8 (4 elements per dequant call) which has 4x more
|
||||
// loop iterations than the non-vec nl=2 path. On M2 Pro, this loop overhead
|
||||
// dominates — the non-vec path may be faster even for batch=1.
|
||||
const ggml_type ktype = op->src[1]->type;
|
||||
if (ktype == GGML_TYPE_TURBO2_0 || ktype == GGML_TYPE_TURBO3_0 || ktype == GGML_TYPE_TURBO4_0) {
|
||||
const char * force_nonvec = getenv("TURBO_FORCE_NONVEC");
|
||||
if (force_nonvec && force_nonvec[0] == '1') {
|
||||
return false; // force non-vec path
|
||||
}
|
||||
}
|
||||
return (ne01 < 20) && (ne00 % 32 == 0);
|
||||
}
|
||||
|
||||
// TurboFlash: two-pass fused asymmetric attention for turbo3 V decode
|
||||
// Returns true when V=turbo3, single-token decode (ne01==1), and K is q8_0 or turbo3
|
||||
static bool ggml_metal_op_flash_attn_ext_use_turbo_flash(const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_FLASH_ATTN_EXT);
|
||||
|
||||
const int64_t ne01 = op->src[0]->ne[1]; // batch size (queries)
|
||||
const int64_t ne00 = op->src[0]->ne[0]; // head size
|
||||
|
||||
const ggml_type type_k = op->src[1]->type;
|
||||
const ggml_type type_v = op->src[2]->type;
|
||||
|
||||
// Only for single-token decode (VEC path conditions)
|
||||
if (ne01 != 1) return false;
|
||||
|
||||
// Only for turbo3 V cache
|
||||
if (type_v != GGML_TYPE_TURBO3_0) return false;
|
||||
|
||||
// Only for q8_0 or turbo3 K — asymmetric or symmetric turbo
|
||||
if (type_k != GGML_TYPE_Q8_0 && type_k != GGML_TYPE_TURBO3_0) return false;
|
||||
|
||||
// Only for supported head dims (64, 96, 128) and power-of-2 aligned to 32
|
||||
if (ne00 % 32 != 0) return false;
|
||||
if (ne00 != 64 && ne00 != 96 && ne00 != 128) return false;
|
||||
|
||||
// Check environment variable to opt-out
|
||||
const char * turbo_flash_env = getenv("TURBO_FLASH");
|
||||
if (turbo_flash_env && turbo_flash_env[0] == '0') return false;
|
||||
|
||||
// Check environment variable to force enable (bypasses other checks)
|
||||
if (turbo_flash_env && turbo_flash_env[0] == '1') return true;
|
||||
|
||||
// Default: disabled — TurboFlash two-pass kernel produces corrupt output
|
||||
// on Apple10 (M5 Max) and possibly other Metal4 GPUs. Use TURBO_FLASH=1
|
||||
// to opt-in for testing. See PR #91.
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_FLASH_ATTN_EXT);
|
||||
|
||||
@@ -2625,6 +2851,20 @@ size_t ggml_metal_op_flash_attn_ext_extra_tmp(const ggml_tensor * op) {
|
||||
res += ggml_type_size(GGML_TYPE_F32)*(ne01_max*ne02*ne03*nwg*(ne20 + 2));
|
||||
}
|
||||
|
||||
// TurboFlash two-pass temp is only needed when the TurboFlash path is eligible.
|
||||
// Reserving it unconditionally can massively inflate graph scratch usage for
|
||||
// large-context models even when the normal FA path is selected.
|
||||
if (ggml_metal_op_flash_attn_ext_use_turbo_flash(op)) {
|
||||
// partial_out: float[n_bh * n_blocks * dv]
|
||||
// partial_ms: float[n_bh * n_blocks * 2] (max + sum per block)
|
||||
const int64_t n_bh = ne01 * ne02 * ne03;
|
||||
const int64_t ne11 = op->src[1]->ne[1]; // T_kv
|
||||
const int64_t n_blocks = (ne11 + 63) / 64; // ceil(T_kv / 64)
|
||||
const int64_t dv = ne20;
|
||||
|
||||
res += ggml_type_size(GGML_TYPE_F32) * n_bh * n_blocks * (dv + 2);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -2650,7 +2890,22 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
|
||||
GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(op->src[1]->type == op->src[2]->type);
|
||||
|
||||
// Allow asymmetric K/V quantization for supported mixed pairs
|
||||
{
|
||||
const ggml_type type_k = op->src[1]->type;
|
||||
const ggml_type type_v = op->src[2]->type;
|
||||
if (type_k != type_v) {
|
||||
const bool k_is_turbo = (type_k == GGML_TYPE_TURBO2_0 || type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0);
|
||||
const bool v_is_turbo = (type_v == GGML_TYPE_TURBO2_0 || type_v == GGML_TYPE_TURBO3_0 || type_v == GGML_TYPE_TURBO4_0);
|
||||
const bool k_is_q8 = (type_k == GGML_TYPE_Q8_0);
|
||||
const bool v_is_q8 = (type_v == GGML_TYPE_Q8_0);
|
||||
const bool supported = (k_is_turbo && v_is_turbo) ||
|
||||
(k_is_q8 && v_is_turbo) ||
|
||||
(k_is_turbo && v_is_q8);
|
||||
GGML_ASSERT(supported && "asymmetric K/V types only supported for turbo and q8_0 mixed pairs");
|
||||
}
|
||||
}
|
||||
|
||||
//GGML_ASSERT(ggml_are_same_shape (src1, src2));
|
||||
GGML_ASSERT(ne11 == ne21);
|
||||
@@ -2702,6 +2957,164 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_metal_buffer_id bid_tmp = bid_blk;
|
||||
bid_tmp.offs += ggml_metal_op_flash_attn_ext_extra_blk(op);
|
||||
|
||||
// ==================== TurboFlash two-pass dispatch ====================
|
||||
// Intercept before the normal VEC/non-VEC path when conditions are met:
|
||||
// - V is turbo3, K is q8_0 or turbo3
|
||||
// - Single-token decode (ne01 == 1)
|
||||
// - Supported head dimensions (64, 96, 128)
|
||||
if (ggml_metal_op_flash_attn_ext_use_turbo_flash(op)) {
|
||||
const int32_t dk = ne00; // head dim for K
|
||||
const int32_t dv = ne20; // head dim for V
|
||||
|
||||
// Compute TurboFlash block parameters
|
||||
constexpr int BLOCK_SIZE = 64;
|
||||
const int32_t n_blocks = (ne11 + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
const int32_t n_bh = ne01 * ne02 * ne03; // total query heads across batch
|
||||
|
||||
// Temp buffer layout (within bid_tmp):
|
||||
// [0 .. existing_vec_tmp) — existing VEC temp buffer (skip past it)
|
||||
// [turbo_base .. turbo_base + partial_out_bytes) — partial_out
|
||||
// [turbo_base + partial_out_bytes .. turbo_base + partial_out_bytes + partial_ms_bytes) — partial_ms
|
||||
//
|
||||
// Actually, we placed turbo tmp allocation AFTER the vec tmp in extra_tmp,
|
||||
// so turbo data starts at:
|
||||
{
|
||||
const int64_t nwg = 32;
|
||||
const int64_t ne01_max = std::min(ne01, (int32_t)32);
|
||||
size_t vec_tmp_size = ggml_type_size(GGML_TYPE_F32)*(ne01_max*ne02*ne03*nwg*(ne20 + 2));
|
||||
|
||||
ggml_metal_buffer_id bid_turbo = bid_tmp;
|
||||
bid_turbo.offs += vec_tmp_size;
|
||||
|
||||
const size_t partial_out_bytes = sizeof(float) * n_bh * n_blocks * dv;
|
||||
|
||||
ggml_metal_buffer_id bid_partial_out = bid_turbo;
|
||||
ggml_metal_buffer_id bid_partial_ms = bid_turbo;
|
||||
bid_partial_ms.offs += partial_out_bytes;
|
||||
|
||||
// ---- Pass 1: Block scoring + partial V accumulation ----
|
||||
{
|
||||
ggml_metal_kargs_turbo_flash_p1 args_p1 = {
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.ne_12_2 =*/ ne12,
|
||||
/*.ne_12_3 =*/ ne13,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.nb21 =*/ nb21,
|
||||
/*.nb22 =*/ nb22,
|
||||
/*.nb23 =*/ nb23,
|
||||
/*.ne31 =*/ ne31,
|
||||
/*.ne32 =*/ ne32,
|
||||
/*.ne33 =*/ ne33,
|
||||
/*.nb31 =*/ nb31,
|
||||
/*.nb32 =*/ nb32,
|
||||
/*.nb33 =*/ nb33,
|
||||
/*.scale =*/ scale,
|
||||
/*.n_blocks =*/ n_blocks,
|
||||
};
|
||||
|
||||
// Pipeline name: kernel_turbo_flash_p1_dk{dk}_dv{dv}
|
||||
const ggml_type type_k = op->src[1]->type;
|
||||
const bool k_is_turbo3 = (type_k == GGML_TYPE_TURBO3_0);
|
||||
|
||||
char p1_base[128];
|
||||
char p1_name[256];
|
||||
snprintf(p1_base, 128, "kernel_turbo_flash_p1_dk%d_dv%d", dk, dv);
|
||||
snprintf(p1_name, 256, "%s_mask=%d_dk=%d_dv=%d_kt3=%d",
|
||||
p1_base, has_mask ? 1 : 0, dk, dv, k_is_turbo3 ? 1 : 0);
|
||||
|
||||
// The kernel uses FC_turbo_flash_p1_has_mask and FC_turbo_flash_p1_k_is_turbo3 as function constants
|
||||
ggml_metal_pipeline_with_params res_p1 = ggml_metal_library_get_pipeline(lib, p1_name);
|
||||
if (!res_p1.pipeline) {
|
||||
ggml_metal_cv_t cv = ggml_metal_cv_init();
|
||||
ggml_metal_cv_set_int32(cv, dk, FC_TURBO_FLASH_P1 + 0);
|
||||
ggml_metal_cv_set_int32(cv, dv, FC_TURBO_FLASH_P1 + 1);
|
||||
ggml_metal_cv_set_bool(cv, has_mask, FC_TURBO_FLASH_P1 + 2);
|
||||
ggml_metal_cv_set_bool(cv, k_is_turbo3, FC_TURBO_FLASH_P1 + 3);
|
||||
|
||||
res_p1 = ggml_metal_library_compile_pipeline(lib, p1_base, p1_name, cv);
|
||||
ggml_metal_cv_free(cv);
|
||||
}
|
||||
|
||||
if (!res_p1.pipeline) {
|
||||
// Fall through to normal FA path by NOT returning
|
||||
goto turbo_flash_end;
|
||||
}
|
||||
|
||||
// V4: no shared memory in pass 1 (all registers)
|
||||
// Metal requires at least 16 bytes for threadgroup memory
|
||||
const size_t smem_p1 = 16;
|
||||
constexpr int TG_SIZE = 32; // 1 SIMD group
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, res_p1);
|
||||
ggml_metal_encoder_set_bytes (enc, &args_p1, sizeof(args_p1), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src0, 1); // q
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src1, 2); // k (q8_0)
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src2, 3); // v (turbo3)
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src3, 4); // mask
|
||||
ggml_metal_encoder_set_buffer (enc, bid_partial_out, 5); // partial_out
|
||||
ggml_metal_encoder_set_buffer (enc, bid_partial_ms, 6); // partial_ms
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem_p1, 0);
|
||||
|
||||
// Grid: (n_bh, n_blocks, 1), Threadgroup: (32, 1, 1) — 1 SIMD group
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, n_bh, n_blocks, 1, TG_SIZE, 1, 1);
|
||||
}
|
||||
|
||||
// Barrier between pass 1 and pass 2
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
||||
// ---- Pass 2: Merge partials + inverse WHT + write output ----
|
||||
{
|
||||
ggml_metal_kargs_turbo_flash_p2 args_p2 = {
|
||||
/*.ne01 =*/ n_bh,
|
||||
/*.n_blocks =*/ n_blocks,
|
||||
};
|
||||
|
||||
char p2_base[128];
|
||||
char p2_name[256];
|
||||
snprintf(p2_base, 128, "kernel_turbo_flash_p2_dv%d", dv);
|
||||
snprintf(p2_name, 256, "%s_dv=%d", p2_base, dv);
|
||||
|
||||
ggml_metal_pipeline_with_params res_p2 = ggml_metal_library_get_pipeline(lib, p2_name);
|
||||
if (!res_p2.pipeline) {
|
||||
ggml_metal_cv_t cv = ggml_metal_cv_init();
|
||||
ggml_metal_cv_set_int32(cv, dv, FC_TURBO_FLASH_P2 + 0);
|
||||
|
||||
res_p2 = ggml_metal_library_compile_pipeline(lib, p2_base, p2_name, cv);
|
||||
ggml_metal_cv_free(cv);
|
||||
}
|
||||
|
||||
// Shared memory: DV + 2 floats (shared_out[DV] + global_max + global_sum)
|
||||
const size_t smem_p2 = sizeof(float) * (dv + 2);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, res_p2);
|
||||
ggml_metal_encoder_set_bytes (enc, &args_p2, sizeof(args_p2), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_partial_out, 1); // partial_out
|
||||
ggml_metal_encoder_set_buffer (enc, bid_partial_ms, 2); // partial_ms
|
||||
ggml_metal_encoder_set_buffer (enc, bid_dst, 3); // output
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem_p2, 0);
|
||||
|
||||
// Grid: (n_bh, 1, 1), Threadgroup: (max(dv, 128), 1, 1)
|
||||
// Need at least DV threads for the WHT butterfly
|
||||
const int tg_size = std::max(dv, (int32_t)128);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, n_bh, 1, 1, tg_size, 1, 1);
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
turbo_flash_end:;
|
||||
}
|
||||
// ==================== End TurboFlash ====================
|
||||
|
||||
if (!ggml_metal_op_flash_attn_ext_use_vec(op)) {
|
||||
// half8x8 kernel
|
||||
const int nqptg = OP_FLASH_ATTN_EXT_NQPSG; // queries per threadgroup
|
||||
@@ -2935,7 +3348,8 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
||||
// ne20*(nsg)
|
||||
// each simdgroup has a full f32 head vector in shared mem to accumulate results
|
||||
//
|
||||
#define FATTN_SMEM(nsg) (GGML_PAD(((GGML_PAD(ne00, 128) + 4*ncpsg + 2*GGML_PAD(ne20, 128))*(nsg))*(sizeof(float)/2), 16))
|
||||
// Extra 128 floats (512 bytes) for TurboQuant pre-dequantized block cache in threadgroup memory
|
||||
#define FATTN_SMEM(nsg) (GGML_PAD(((GGML_PAD(ne00, 128) + 4*ncpsg + 2*GGML_PAD(ne20, 128))*(nsg))*(sizeof(float)/2) + 128*sizeof(float), 16))
|
||||
|
||||
int64_t nsg = 1;
|
||||
|
||||
|
||||
@@ -59,6 +59,7 @@ int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_rwkv (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_gated_delta_net (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_turbo_wht (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_solve_tri (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_set (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_cpy (ggml_metal_op_t ctx, int idx);
|
||||
|
||||
+2395
-200
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,49 @@
|
||||
// TurboQuant Fast Walsh-Hadamard rotation for Metal
|
||||
// Replaces 256KB dense matrices with four 128-entry sign arrays (2 KB stored as float) + O(d log d) butterfly
|
||||
// Generated with seed=42 (rotation) and seed=1042 (QJL)
|
||||
|
||||
// --- Rotation sign arrays ---
|
||||
constant float turbo_wht_signs1[128] = {
|
||||
-1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f};
|
||||
constant float turbo_wht_signs2[128] = {
|
||||
1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f};
|
||||
|
||||
// --- QJL sign arrays ---
|
||||
constant float turbo_qjl_wht_signs1[128] = {
|
||||
1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
|
||||
constant float turbo_qjl_wht_signs2[128] = {
|
||||
1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f};
|
||||
|
||||
// --- Fast Walsh-Hadamard Transform (in-place, normalized) ---
// Runs the log2(128) = 7 butterfly stages over the 128 floats at x (128 * 7 = 896 butterfly
// outputs, vs 128 * 128 = 16384 multiply-adds for a dense matvec), then scales every element
// by 1/sqrt(128).
static void turbo_fwht_128(thread float * x) {
    for (int span = 1; span < 128; span <<= 1) {
        const int stride = span << 1;
        for (int base = 0; base < 128; base += stride) {
            for (int k = 0; k < span; k++) {
                const int lo = base + k;
                const int hi = lo + span;

                const float sum  = x[lo] + x[hi];
                const float diff = x[lo] - x[hi];

                x[lo] = sum;
                x[hi] = diff;
            }
        }
    }

    // Normalize by 1/sqrt(128)
    const float scale = 0.08838834764831845f;
    for (int k = 0; k < 128; k++) {
        x[k] = x[k] * scale;
    }
}
|
||||
|
||||
// --- Forward rotation: element-wise multiply by s1, FWHT, element-wise multiply by s2 ---
// Operates in place on the 128 floats at x; s1 and s2 are each indexed 0..127.
static void turbo_rotate_forward(thread float * x, constant float * s1, constant float * s2) {
    for (int k = 0; k < 128; ++k) {
        x[k] = x[k] * s1[k];
    }

    turbo_fwht_128(x);

    for (int k = 0; k < 128; ++k) {
        x[k] = x[k] * s2[k];
    }
}
|
||||
|
||||
// --- Inverse rotation: element-wise multiply by s2, FWHT, element-wise multiply by s1 ---
// Same steps as the forward rotation with the two sign arrays swapped (the normalized FWHT
// is its own inverse). Operates in place on the 128 floats at x.
static void turbo_rotate_inverse(thread float * x, constant float * s1, constant float * s2) {
    for (int k = 0; k < 128; ++k) {
        x[k] = x[k] * s2[k];
    }

    turbo_fwht_128(x);

    for (int k = 0; k < 128; ++k) {
        x[k] = x[k] * s1[k];
    }
}
|
||||
@@ -5480,6 +5480,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
||||
case GGML_TYPE_I64:
|
||||
// nothing to validate
|
||||
break;
|
||||
case GGML_TYPE_TQ3_1S:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
case GGML_TYPE_TURBO4_0:
|
||||
case GGML_TYPE_TURBO2_0:
|
||||
// WHT-rotated / TurboQuant types: no validation is performed yet (scales are NOT checked for NaN/Inf)
|
||||
// TODO: add more thorough validation if needed
|
||||
break;
|
||||
default:
|
||||
{
|
||||
fprintf(stderr, "%s: invalid type %d\n", __func__, type);
|
||||
|
||||
@@ -102,6 +102,27 @@ GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTR
|
||||
GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// TurboQuant KV cache compression (arXiv 2504.19874)
|
||||
GGML_API void quantize_row_turbo3_0_ref(const float * GGML_RESTRICT x, block_turbo3_0 * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT x, block_turbo4_0 * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_turbo3_0(const block_turbo3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_turbo3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_turbo4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API void quantize_row_turbo2_0_ref(const float * GGML_RESTRICT x, block_turbo2_0 * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_turbo2_0(const block_turbo2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_turbo2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// TQ3_1S: WHT-rotated 3-bit weight quantization (8-level Lloyd-Max)
|
||||
GGML_API void quantize_row_tq3_1s_ref(const float * GGML_RESTRICT x, block_tq3_1s * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_tq3_1s(const block_tq3_1s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_tq3_1s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// TQ4_1S: WHT-rotated 4-bit weight quantization (16-level Lloyd-Max)
|
||||
GGML_API void quantize_row_tq4_1s_ref(const float * GGML_RESTRICT x, block_tq4_1s * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_tq4_1s(const block_tq4_1s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_tq4_1s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
GGML_API void iq2xs_init_impl(enum ggml_type type);
|
||||
GGML_API void iq2xs_free_impl(enum ggml_type type);
|
||||
GGML_API void iq3xs_init_impl(int grid_size);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -847,6 +847,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_timestep_embedding_f32;
|
||||
vk_pipeline pipeline_conv_transpose_1d_f32;
|
||||
vk_pipeline pipeline_pool2d_f32;
|
||||
vk_pipeline pipeline_turbo_wht;
|
||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||
// [size_idx][kda] where size_idx: 0=d32, 1=d64, 2=d128
|
||||
@@ -4298,6 +4299,30 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
const uint32_t force_subgroup_size = use_subgroups ? subgroup_size : 0;
|
||||
const uint32_t force_subgroup_size16 = use_subgroups16 ? subgroup_size16 : 0;
|
||||
|
||||
// TQ4_1S uses a dedicated pipeline whose workgroup size is always 32 and
|
||||
// whose reduction path is always the shared-memory variant.
|
||||
//
|
||||
// The Walsh-Hadamard butterfly inside the shader operates on 32-element
|
||||
// blocks with one element per thread, so the workgroup contract is fixed
|
||||
// regardless of what the rest of the mul_mat_vec family picks for the
|
||||
// current DMMV_WG_SIZE bucket. We always use 32 threads per workgroup.
|
||||
//
|
||||
// Reduction choice: the shader uses the SHMEM tree reduction even when
|
||||
// subgroup arithmetic is available. A subgroup-shuffle butterfly + pure
|
||||
// subgroupAdd reduction variant was tried and measured ~70 % slower on
|
||||
// Intel Arc (Mesa Xe HPG), where subgroup shuffles and subgroup adds are
|
||||
// emulated over LDS and end up doing the same amount of LDS traffic as
|
||||
// the explicit shared-memory path but with extra driver overhead. Going
|
||||
// through SHMEM directly is always correct and is fastest on the devices
|
||||
// we can actually measure. Future vendor-specific heuristics can switch
|
||||
// to the hybrid reduction variant on NVIDIA / AMD RDNA if hardware
|
||||
// subgroup shuffles beat the LDS roundtrip there.
|
||||
const uint32_t tq4_1s_wg_size = 32u;
|
||||
const uint32_t tq4_1s_force_sg_size = 0u;
|
||||
const bool tq4_1s_use_subgroups = false;
|
||||
const shader_reduction_mode tq4_1s_reduc = SHADER_REDUCTION_MODE_SHMEM;
|
||||
|
||||
static constexpr uint32_t mul_mat_vec_num_bindings = 5;
|
||||
static constexpr uint32_t mul_mat_vec_id_num_bindings = 6;
|
||||
|
||||
@@ -4339,6 +4364,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32", arr_dmmv_iq4_nl_f32_f32_len[reduc16], arr_dmmv_iq4_nl_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32", arr_dmmv_mxfp4_f32_f32_len[reduc16], arr_dmmv_mxfp4_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_NVFP4][i], "mul_mat_vec_nvfp4_f32_f32", arr_dmmv_nvfp4_f32_f32_len[reduc16], arr_dmmv_nvfp4_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
// TQ4_1S: fixed 32-thread workgroup, shared-memory WHT butterfly,
|
||||
// shared-memory reduction. NUM_ROWS=8 amortises the butterfly cost
|
||||
// across 8 output rows per workgroup.
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_TQ4_1S][i], "mul_mat_vec_tq4_1s_f32_f32", arr_dmmv_tq4_1s_f32_f32_len[tq4_1s_reduc], arr_dmmv_tq4_1s_f32_f32_data[tq4_1s_reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {8, 1, 1}, {tq4_1s_wg_size, 8, i+1}, 1, true, tq4_1s_use_subgroups, tq4_1s_force_sg_size);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[reduc], arr_dmmv_f16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
|
||||
@@ -4365,6 +4394,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32", arr_dmmv_iq4_nl_f16_f32_len[reduc16], arr_dmmv_iq4_nl_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32", arr_dmmv_mxfp4_f16_f32_len[reduc16], arr_dmmv_mxfp4_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_NVFP4][i], "mul_mat_vec_nvfp4_f16_f32", arr_dmmv_nvfp4_f16_f32_len[reduc16], arr_dmmv_nvfp4_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_TQ4_1S][i], "mul_mat_vec_tq4_1s_f16_f32", arr_dmmv_tq4_1s_f16_f32_len[tq4_1s_reduc], arr_dmmv_tq4_1s_f16_f32_data[tq4_1s_reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {8, 1, 1}, {tq4_1s_wg_size, 8, i+1}, 1, true, tq4_1s_use_subgroups, tq4_1s_force_sg_size);
|
||||
|
||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
if (device->integer_dot_product) {
|
||||
@@ -4473,6 +4503,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_MXFP4], "dequant_mxfp4", dequant_mxfp4_len, dequant_mxfp4_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_NVFP4], "dequant_nvfp4", dequant_nvfp4_len, dequant_nvfp4_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_TURBO3_0], "dequant_turbo3_0", dequant_turbo3_0_len, dequant_turbo3_0_data, "main", 2, 5 * sizeof(uint32_t), {128, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_TQ4_1S], "dequant_tq4_1s", dequant_tq4_1s_len, dequant_tq4_1s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
||||
|
||||
// TurboQuant WHT
|
||||
ggml_vk_create_pipeline(device, device->pipeline_turbo_wht, "turbo_wht", turbo_wht_len, turbo_wht_data, "main", 2, 3 * sizeof(uint32_t), {128, 1, 1}, {}, 1);
|
||||
|
||||
// get_rows
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
||||
@@ -4500,6 +4535,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_NVFP4], "get_rows_nvfp4", get_rows_nvfp4_len, get_rows_nvfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_TURBO3_0], "get_rows_turbo3_0", get_rows_turbo3_0_len, get_rows_turbo3_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
||||
@@ -4527,6 +4563,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4], "get_rows_mxfp4_f32", get_rows_mxfp4_f32_len, get_rows_mxfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_NVFP4], "get_rows_nvfp4_f32", get_rows_nvfp4_f32_len, get_rows_nvfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_TURBO3_0], "get_rows_turbo3_0_f32", get_rows_turbo3_0_f32_len, get_rows_turbo3_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
|
||||
@@ -4588,25 +4625,32 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_32, "cpy_transpose_32", cpy_transpose_32_len, cpy_transpose_32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_16, "cpy_transpose_16", cpy_transpose_16_len, cpy_transpose_16_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q1_0], "cpy_f32_q1_0", cpy_f32_q1_0_len, cpy_f32_q1_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
// RTE fp16 rounding is now patched into SPIR-V programmatically (see
|
||||
// ggml_vk_create_pipeline_func), so we only compile a single variant of
|
||||
// each cpy_f32_quant / set_rows shader rather than _rte/non-_rte pairs.
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q1_0], "cpy_f32_q1_0", cpy_f32_q1_0_len, cpy_f32_q1_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
|
||||
#define SET_ROWS(itype) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F32], "set_rows_f32" #itype, set_rows_f32 ## itype ## _len, set_rows_f32 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F16], "set_rows_f16" #itype, set_rows_f16 ## itype ## _len, set_rows_f16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_BF16], "set_rows_bf16" #itype, set_rows_bf16 ## itype ## _len, set_rows_bf16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q1_0], "set_rows_q1_0" #itype, set_rows_q1_0 ## itype ## _len, set_rows_q1_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_0], "set_rows_q4_0" #itype, set_rows_q4_0 ## itype ## _len, set_rows_q4_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_1], "set_rows_q4_1" #itype, set_rows_q4_1 ## itype ## _len, set_rows_q4_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_0], "set_rows_q5_0" #itype, set_rows_q5_0 ## itype ## _len, set_rows_q5_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_1], "set_rows_q5_1" #itype, set_rows_q5_1 ## itype ## _len, set_rows_q5_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q8_0], "set_rows_q8_0" #itype, set_rows_q8_0 ## itype ## _len, set_rows_q8_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_IQ4_NL], "set_rows_iq4_nl" #itype, set_rows_iq4_nl ## itype ## _len, set_rows_iq4_nl ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F32], "set_rows_f32" #itype, set_rows_f32 ## itype ## _len, set_rows_f32 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F16], "set_rows_f16" #itype, set_rows_f16 ## itype ## _len, set_rows_f16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_BF16], "set_rows_bf16" #itype, set_rows_bf16 ## itype ## _len, set_rows_bf16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q1_0], "set_rows_q1_0" #itype, set_rows_q1_0 ## itype ## _len, set_rows_q1_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_0], "set_rows_q4_0" #itype, set_rows_q4_0 ## itype ## _len, set_rows_q4_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_1], "set_rows_q4_1" #itype, set_rows_q4_1 ## itype ## _len, set_rows_q4_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_0], "set_rows_q5_0" #itype, set_rows_q5_0 ## itype ## _len, set_rows_q5_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_1], "set_rows_q5_1" #itype, set_rows_q5_1 ## itype ## _len, set_rows_q5_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q8_0], "set_rows_q8_0" #itype, set_rows_q8_0 ## itype ## _len, set_rows_q8_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_IQ4_NL], "set_rows_iq4_nl" #itype, set_rows_iq4_nl ## itype ## _len, set_rows_iq4_nl ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_TURBO2_0], "set_rows_turbo2_0" #itype, set_rows_turbo2_0 ## itype ## _len, set_rows_turbo2_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_TURBO3_0], "set_rows_turbo3_0" #itype, set_rows_turbo3_0 ## itype ## _len, set_rows_turbo3_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_TURBO4_0], "set_rows_turbo4_0" #itype, set_rows_turbo4_0 ## itype ## _len, set_rows_turbo4_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_TQ4_1S], "set_rows_tq4_1s" #itype, set_rows_tq4_1s ## itype ## _len, set_rows_tq4_1s ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
|
||||
|
||||
SET_ROWS(_i32)
|
||||
SET_ROWS(_i64)
|
||||
@@ -4620,6 +4664,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_TURBO3_0], "cpy_turbo3_0_f32", cpy_turbo3_0_f32_len, cpy_turbo3_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_TURBO3_0), 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_TQ4_1S], "cpy_tq4_1s_f32", cpy_tq4_1s_f32_len, cpy_tq4_1s_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_TQ4_1S), 1, 1}, {}, 1);
|
||||
|
||||
auto get_suffix = [](bool src0_f16, bool src1_f16, bool dst_f16) {
|
||||
std::string s;
|
||||
@@ -6285,6 +6331,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
break;
|
||||
default:
|
||||
return nullptr;
|
||||
@@ -6425,6 +6472,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
break;
|
||||
default:
|
||||
return nullptr;
|
||||
@@ -6440,6 +6488,10 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
||||
if (m < 4096 && k >= 1024) {
|
||||
dmmv_wg = DMMV_WG_SIZE_LARGE;
|
||||
}
|
||||
} else if (a_type == GGML_TYPE_TQ4_1S) {
|
||||
// TQ4_1S needs exactly 32 threads (one subgroup) to cooperate on the
|
||||
// 32-element WHT butterfly in shared memory. Force SUBGROUP-sized wg.
|
||||
dmmv_wg = DMMV_WG_SIZE_SUBGROUP;
|
||||
} else {
|
||||
if (m <= 8192 && k >= 1024) {
|
||||
dmmv_wg = DMMV_WG_SIZE_LARGE;
|
||||
@@ -7591,6 +7643,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
return ctx->device->pipeline_cpy_quant_f32[src->type];
|
||||
default:
|
||||
break;
|
||||
@@ -10422,7 +10476,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
uint32_t ne = ggml_nelements(src0);
|
||||
if (ggml_is_quantized(dst->type)) {
|
||||
if (dst->type == GGML_TYPE_TURBO2_0 ||
|
||||
dst->type == GGML_TYPE_TURBO3_0 ||
|
||||
dst->type == GGML_TYPE_TURBO4_0) {
|
||||
ne = ne / 128;
|
||||
} else if (dst->type == GGML_TYPE_TQ4_1S) {
|
||||
ne = ne / 32;
|
||||
} else if (ggml_is_quantized(dst->type)) {
|
||||
// quants run 32 threads each doing QUANT_K elements
|
||||
ne = CEIL_DIV(ne, 32 * ggml_blck_size(dst->type));
|
||||
} else {
|
||||
@@ -11215,6 +11275,32 @@ static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
});
|
||||
}
|
||||
|
||||
// Records a dispatch of the TurboQuant WHT compute pipeline that reads src0 and writes dst.
//
// The transform direction and group size are read from dst->op_params and forwarded to
// the shader as push constants together with the element count of src0.
static void ggml_vk_turbo_wht(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    int direction;
    int group_size;
    memcpy(&direction, dst->op_params + 0, sizeof(int));
    // NOTE(review): `dst->op_params + sizeof(int)` is pointer arithmetic in units of the
    // op_params element type, not bytes. If op_params is an int32_t array this reads
    // element 4 rather than element 1 - confirm against the code that stores group_size
    // for GGML_OP_TURBO_WHT.
    memcpy(&group_size, dst->op_params + sizeof(int), sizeof(int));

    struct { uint32_t ne; uint32_t direction; uint32_t group_size; } pc = {
        (uint32_t)ggml_nelements(src0),
        (uint32_t)direction,
        (uint32_t)group_size,
    };

    vk_pipeline pipeline = ctx->device->pipeline_turbo_wht;
    GGML_ASSERT(pipeline != nullptr);
    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);

    vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src0, false);
    vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false);

    // Fold the group count into the Y and Z dispatch dimensions so that X stays within
    // maxComputeWorkGroupCount[0]: at most 512 groups along X, 512 rows along Y, and
    // the remainder along Z.
    // NOTE(review): a group_size of 0 would divide by zero here - confirm callers never
    // produce it.
    const uint32_t gs               = (uint32_t)group_size;
    const uint32_t n_groups         = pc.ne / gs;
    const uint32_t groups_per_row   = 512;
    const uint32_t groups_per_slice = 262144; // 512 * 512

    std::array<uint32_t, 3> elements = { pc.ne, 1, 1 };
    if (n_groups > groups_per_slice) {
        elements = { groups_per_row * gs, groups_per_row, CEIL_DIV(n_groups, groups_per_slice) };
    } else if (n_groups > groups_per_row) {
        elements = { groups_per_row * gs, CEIL_DIV(n_groups, groups_per_row), 1 };
    }

    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc, elements);
}
|
||||
|
||||
// Records a GGML_OP_SILU_BACK dispatch over src0 and src1 into dst.
// The first push constant carries the element count of src0; the remaining ones are zero.
static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const uint32_t n_elements = (uint32_t)ggml_nelements(src0);
    ggml_vk_op_f32<vk_op_push_constants>(
        ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK,
        { n_elements, 0, 0.0f, 0.0f, 0.0f, 0.0f });
}
|
||||
@@ -13402,6 +13488,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_SET_ROWS:
|
||||
ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_TURBO_WHT:
|
||||
ggml_vk_turbo_wht(ctx, compute_ctx, src0, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_SILU_BACK:
|
||||
ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node);
|
||||
@@ -15831,6 +15921,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
@@ -15888,6 +15979,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
return true;
|
||||
case GGML_TYPE_Q1_0:
|
||||
return coopmat2;
|
||||
@@ -15932,6 +16024,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
case GGML_TYPE_I32:
|
||||
return true;
|
||||
default:
|
||||
@@ -15951,6 +16044,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_TURBO2_0:
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
case GGML_TYPE_TURBO4_0:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -15991,6 +16088,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_TURBO3_0:
|
||||
case GGML_TYPE_TQ4_1S:
|
||||
return true;
|
||||
default:
|
||||
break;
|
||||
@@ -16245,6 +16344,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
ggml_is_contiguous(op->src[1]) &&
|
||||
ggml_is_contiguous(op));
|
||||
}
|
||||
case GGML_OP_TURBO_WHT:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[0]->ne[0] % 128 == 0;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -30,6 +30,41 @@ void main() {
|
||||
|
||||
const uint a_offset = 0;
|
||||
const uint ib = src_idx;
|
||||
|
||||
#if defined(DATA_A_TQ4_1S)
|
||||
// TQ4_1S requires full inverse WHT after centroid*scale dequant.
|
||||
// Dequant all 32 elements into a buffer, apply butterfly, then write.
|
||||
const float tq4_signs[32] = float[32](
|
||||
+1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
|
||||
-1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
|
||||
-1.0, -1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0,
|
||||
-1.0, +1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0
|
||||
);
|
||||
const float TQ4_INV_SQRT32 = 0.17677669529663688;
|
||||
|
||||
float buf[32];
|
||||
for (int j = 0; j < 32; j += 2) {
|
||||
vec2 v = dequantize(ib, j, a_offset);
|
||||
buf[j] = v.x;
|
||||
buf[j+1] = v.y;
|
||||
}
|
||||
|
||||
// Inverse WHT butterfly (5 stages for 32 elements)
|
||||
for (uint step = 1u; step < 32u; step <<= 1u) {
|
||||
for (uint i = 0u; i < 32u; i += step * 2u) {
|
||||
for (uint j2 = i; j2 < i + step; j2++) {
|
||||
float a2 = buf[j2], b2 = buf[j2 + step];
|
||||
buf[j2] = a2 + b2;
|
||||
buf[j2 + step] = a2 - b2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize and apply sign pattern
|
||||
for (int j = 0; j < 32; j++) {
|
||||
data_d[dst_idx + j] = buf[j] * TQ4_INV_SQRT32 * tq4_signs[j];
|
||||
}
|
||||
#else
|
||||
const vec2 dm = get_dm(ib, a_offset);
|
||||
|
||||
[[unroll]] for (int j = 0; j < QUANT_K; j += 4) {
|
||||
@@ -48,4 +83,5 @@ void main() {
|
||||
data_d[dst_idx + j + 3] = v[3];
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1,8 +1,14 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : enable
|
||||
#extension GL_KHR_shader_subgroup_ballot : enable
|
||||
#extension GL_KHR_shader_subgroup_shuffle : enable
|
||||
#include "types.glsl"
|
||||
|
||||
#if defined(SET_ROWS) && QUANT_K == 1
|
||||
#if defined(SET_ROWS) && (defined(DATA_A_TURBO2_0) || defined(DATA_A_TURBO3_0) || defined(DATA_A_TURBO4_0))
|
||||
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
|
||||
const uint BLOCK_SIZE = 128;
|
||||
#elif defined(SET_ROWS) && QUANT_K == 1
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
const uint BLOCK_SIZE = 512;
|
||||
#else
|
||||
@@ -208,6 +214,109 @@ void quantize(uint dst_idx, uint src_idx)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_TURBO3_0)
|
||||
const float TS1[128] = float[128](
|
||||
-1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, 1, 1, 1,
|
||||
1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1,
|
||||
-1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1,
|
||||
1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1,
|
||||
-1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, 1,
|
||||
1, -1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, -1,
|
||||
-1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, 1, -1, 1,
|
||||
1, -1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, 1, 1, -1, 1
|
||||
);
|
||||
|
||||
const float TS2[128] = float[128](
|
||||
1, 1, 1, 1, -1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, -1,
|
||||
1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, 1,
|
||||
1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1,
|
||||
1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1,
|
||||
1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, 1,
|
||||
-1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1,
|
||||
1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1,
|
||||
-1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1
|
||||
);
|
||||
|
||||
const float TINV = 0.08838834764831845; // 1 / sqrt(128)
|
||||
|
||||
const float TC[8] = float[8](
|
||||
-0.190685, -0.117832, -0.065717, -0.021460,
|
||||
0.021460, 0.065717, 0.117832, 0.190685
|
||||
);
|
||||
|
||||
const float TM[7] = float[7](
|
||||
-0.154259, -0.091775, -0.043589,
|
||||
0.0,
|
||||
0.043589, 0.091775, 0.154259
|
||||
);
|
||||
|
||||
#if defined(SET_ROWS)
|
||||
|
||||
shared float wht[128];
|
||||
shared float sg_acc[16];
|
||||
shared float gnrm;
|
||||
|
||||
// Quantizes the 128 values wht[o .. o+127] into destination block data_q[b].
// Each value is mapped to the first TM threshold it falls below (bucket 0..7);
// the low two bits of the bucket go into qs (4 per byte), bit 2 goes into
// signs (8 per byte). norm is gnrm divided by the L2 norm of the selected TC
// entries, or gnrm itself when that norm is ~0.
void quantize_block(uint b, uint o) {
    float recon_sq = 0.0;

    // One signs byte spans 8 elements, i.e. exactly two qs bytes.
    [[unroll]] for (uint sbyte = 0u; sbyte < 16u; ++sbyte) {
        uint hi_bits = 0u;
        [[unroll]] for (uint half = 0u; half < 2u; ++half) {
            const uint qbyte = sbyte * 2u + half;
            uint lo_bits = 0u;
            [[unroll]] for (uint k = 0u; k < 4u; ++k) {
                const float x = wht[o + qbyte * 4u + k];

                // TM is ascending, so counting the thresholds that x is not
                // below yields the index of the first threshold it is below.
                uint bucket = 0u;
                [[unroll]] for (int m = 0; m < 7; ++m) {
                    if (!(x < TM[m])) {
                        bucket++;
                    }
                }

                recon_sq += TC[bucket] * TC[bucket];
                lo_bits |= (bucket & 0x3u) << (k * 2u);
                hi_bits |= ((bucket >> 2u) & 0x1u) << (half * 4u + k);
            }
            data_q[b].qs[qbyte] = uint8_t(lo_bits);
        }
        data_q[b].signs[sbyte] = uint8_t(hi_bits);
    }

    const float recon_norm = sqrt(recon_sq);
    data_q[b].norm = float16_t((recon_norm > 1e-10) ? (gnrm / recon_norm) : gnrm);
}
|
||||
|
||||
#endif // defined(SET_ROWS)
|
||||
#endif // defined(DATA_A_TURBO3_0)
|
||||
|
||||
#if defined(DATA_A_TQ4_1S)
|
||||
|
||||
const float TQ4_SIGNS[32] = float[32](
|
||||
+1, -1, +1, -1, +1, +1, -1, +1,
|
||||
-1, -1, +1, -1, +1, +1, -1, +1,
|
||||
-1, -1, +1, -1, +1, -1, -1, +1,
|
||||
-1, +1, +1, -1, +1, -1, -1, +1
|
||||
);
|
||||
|
||||
const float TQ4_INV_SQRT32 = 0.17677669529663688; // 1 / sqrt(32)
|
||||
|
||||
const float TQ4_CENTROIDS[16] = float[16](
|
||||
-2.732590, -2.069017, -1.618046, -1.256231,
|
||||
-0.942340, -0.656759, -0.388048, -0.128395,
|
||||
0.128395, 0.388048, 0.656759, 0.942340,
|
||||
1.256231, 1.618046, 2.069017, 2.732590
|
||||
);
|
||||
|
||||
// Returns the index (0..15) of the TQ4_CENTROIDS entry nearest to val.
// The 15 ascending constants are the midpoints between adjacent centroids;
// they are tested in order (a linear scan) and the first one val is below
// decides the index. Values not below any threshold map to 15.
uint tq4_choose_index(float val) {
    return val < -2.400804 ?  0u :
           val < -1.843532 ?  1u :
           val < -1.437139 ?  2u :
           val < -1.099286 ?  3u :
           val < -0.799550 ?  4u :
           val < -0.522404 ?  5u :
           val < -0.258222 ?  6u :
           val <  0.000000 ?  7u :
           val <  0.258222 ?  8u :
           val <  0.522404 ?  9u :
           val <  0.799550 ? 10u :
           val <  1.099286 ? 11u :
           val <  1.437139 ? 12u :
           val <  1.843532 ? 13u :
           val <  2.400804 ? 14u :
                             15u;
}
|
||||
|
||||
#endif // defined(DATA_A_TQ4_1S)
|
||||
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
uint best_index(float x) {
|
||||
if (x <= kvalues_iq4nl[0]) return 0;
|
||||
@@ -270,7 +379,451 @@ void quantize(uint dst_idx, uint src_idx)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(SET_ROWS)
|
||||
#if defined(SET_ROWS) && defined(DATA_A_TURBO3_0)
|
||||
// SET_ROWS entry point for TURBO3_0. Each 128-thread workgroup quantizes one
// 128-element group of a source row; thread t owns element t.
//   1. Stage the group in shared memory and reduce its L2 norm into gnrm.
//   2. Scale by 1/gnrm, multiply by TS1, run the 128-point butterfly, then
//      multiply by TINV * TS2.
//   3. Map each value to a 3-bit bucket using the TM thresholds.
//   4. Pack the low two bits into qs (4 per byte) and bit 2 into signs (8 per byte).
//   5. Store norm = gnrm / ||selected TC entries|| (gnrm when that norm is ~0).
// NOTE(review): the subgroup-based packing assumes invocation t sits in lane
// (t % gl_SubgroupSize) of its subgroup and that the subgroup size is a
// multiple of 8 — confirm the pipeline is created with full subgroups.
void main() {
    const uint t = gl_LocalInvocationID.x;
    const uint g = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
    const uint gpr = p.ne00 / 128;

    // Both exits depend only on workgroup-uniform values, so no invocation is
    // left behind at a later barrier().
    if (gpr == 0) return;
    if (g >= p.ne / 128) return;

    uint tmp = g;
    const uint ig  = tmp % gpr;    tmp /= gpr;
    const uint i01 = tmp % p.ne01; tmp /= p.ne01;
    const uint i02 = tmp % p.ne12;
    const uint i03 = tmp / p.ne12;

    const uint sb = src0_idx(ig * 128, i01, i02, i03) + get_aoffset();
    const uint i1 = data_i[src1_idx(i01, fastmod(i02, p.ne11), fastmod(i03, p.ne12), 0) + get_boffset()] DATA_I_SWIZZLE;
    const uint db = dst_idx(ig, i1, i02, i03) + get_doffset();

    // Step 1: load into shared memory
    wht[t] = data_s[sb + t];
    barrier();

    // Step 2: L2 norm via per-subgroup reduction, combined by thread 0
    float v2 = wht[t] * wht[t];
    v2 = subgroupAdd(v2);
    if (gl_SubgroupInvocationID == 0) sg_acc[gl_SubgroupID] = v2;
    barrier();
    if (t == 0) {
        float total = 0.0;
        for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc[w];
        gnrm = sqrt(total);
    }
    barrier();

    // Step 3: normalize (an all-zero group stays zero), then apply signs1
    wht[t] *= (gnrm > 1e-10) ? (1.0 / gnrm) : 0.0;
    barrier();

    wht[t] *= TS1[t];
    barrier();

    // Step 4: butterfly. In each stage only the lower element of a pair does
    // the update, so every shared slot has a single writer per stage.
    [[unroll]] for (uint h = 1; h < 128; h *= 2) {
        if ((t % (2 * h)) < h) {
            float a = wht[t];
            float b = wht[t + h];
            wht[t]     = a + b;
            wht[t + h] = a - b;
        }
        barrier();
    }

    // Step 5: apply signs2 + scaling
    float rv = wht[t] * TINV * TS2[t];

    // Step 6: quantize -- all 128 threads participate
    uint idx = rv < TM[0] ? 0u : rv < TM[1] ? 1u : rv < TM[2] ? 2u : rv < TM[3] ? 3u :
               rv < TM[4] ? 4u : rv < TM[5] ? 5u : rv < TM[6] ? 6u : 7u;

    // Pack qs: 4 elements per byte via subgroup shuffle
    uint sg_lane = gl_SubgroupInvocationID;
    uint my_low2 = idx & 0x3u;
    uint qs_byte = 0u;
    [[unroll]] for (uint k = 0; k < 4; k++) {
        uint contrib = subgroupShuffle(my_low2, (sg_lane & ~3u) + k);
        qs_byte |= contrib << (k * 2u);
    }
    if (sg_lane % 4u == 0u) {
        data_q[db].qs[t / 4u] = uint8_t(qs_byte);
    }

    // Pack signs: 8 elements per byte via subgroup ballot.
    // The ballot stores 32 invocations per uvec4 component, so lanes 32..63 of
    // a 64-wide subgroup are in .y. Select the component from the lane number
    // instead of always shifting .x (which is only correct for lanes < 32).
    uvec4 ballot = subgroupBallot(((idx >> 2u) & 1u) != 0u);
    if (sg_lane % 8u == 0u) {
        const uint ballot_word = ballot[sg_lane >> 5u];
        data_q[db].signs[t / 8u] = uint8_t((ballot_word >> (sg_lane & 31u)) & 0xFFu);
    }

    // Step 7: reconstruction norm via subgroup reduction (sg_acc is reused;
    // its previous contents were consumed before the barriers above)
    float rc = TC[idx] * TC[idx];
    rc = subgroupAdd(rc);
    if (sg_lane == 0u) sg_acc[gl_SubgroupID] = rc;
    barrier();
    if (t == 0u) {
        float total = 0.0;
        for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc[w];
        float rn = sqrt(total);
        data_q[db].norm = float16_t((rn > 1e-10) ? (gnrm / rn) : gnrm);
    }
}
|
||||
#elif defined(SET_ROWS) && defined(DATA_A_TURBO2_0)
|
||||
// Mirror of the TURBO3_0 block above, adapted for turbo2 (4 centroids,
|
||||
// 2-bit pack, no signs byte). WHT tables and reduction structure are
|
||||
// identical (QK = 128 for both).
|
||||
const float TS1_T2[128] = float[128](
|
||||
-1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, 1, 1, 1,
|
||||
1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1,
|
||||
-1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1,
|
||||
1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1,
|
||||
-1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, 1,
|
||||
1, -1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, -1,
|
||||
-1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, 1, -1, 1,
|
||||
1, -1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, 1, 1, -1, 1
|
||||
);
|
||||
const float TS2_T2[128] = float[128](
|
||||
1, 1, 1, 1, -1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, -1,
|
||||
1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, 1,
|
||||
1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1,
|
||||
1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1,
|
||||
1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, 1,
|
||||
-1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1,
|
||||
1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1,
|
||||
-1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1
|
||||
);
|
||||
const float TINV_T2 = 0.08838834764831845; // 1 / sqrt(128)
|
||||
// Lloyd-Max centroids for N(0, 1/128), 4 levels (matches CENTROIDS_2BIT in C ref)
|
||||
const float TC2[4] = float[4](-0.133462, -0.039994, 0.039994, 0.133462);
|
||||
// Midpoints between adjacent centroids
|
||||
const float TM2[3] = float[3](-0.086728, 0.0, 0.086728);
|
||||
|
||||
shared float wht_t2[128];
|
||||
shared float sg_acc_t2[16];
|
||||
shared float gnrm_t2;
|
||||
|
||||
// SET_ROWS entry point for TURBO2_0. One 128-thread workgroup quantizes one
// 128-element group (one element per thread): normalize by the group's L2
// norm, apply TS1_T2 signs, the 128-point butterfly and TINV_T2 * TS2_T2, then
// store a 2-bit TM2 bucket per element (4 per qs byte) and
// norm = gnrm_t2 / ||selected TC2 entries||.
void main() {
    const uint lid            = gl_LocalInvocationID.x;
    const uint group          = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
    const uint groups_per_row = p.ne00 / 128;

    // Workgroup-uniform exits: nothing to do, or past the last group.
    if (groups_per_row == 0 || group >= p.ne / 128) {
        return;
    }

    // Split the flat group number into (group-in-row, i01, i02, i03).
    const uint ig         = group % groups_per_row;
    const uint row_linear = group / groups_per_row;
    const uint i01        = row_linear % p.ne01;
    const uint plane      = row_linear / p.ne01;
    const uint i02        = plane % p.ne12;
    const uint i03        = plane / p.ne12;

    const uint src_base  = src0_idx(ig * 128, i01, i02, i03) + get_aoffset();
    const uint dst_row   = data_i[src1_idx(i01, fastmod(i02, p.ne11), fastmod(i03, p.ne12), 0) + get_boffset()] DATA_I_SWIZZLE;
    const uint dst_block = dst_idx(ig, dst_row, i02, i03) + get_doffset();

    wht_t2[lid] = data_s[src_base + lid];
    barrier();

    // L2 norm of the group: subgroup partial sums, then thread 0 combines them.
    const float sq_sum = subgroupAdd(wht_t2[lid] * wht_t2[lid]);
    if (gl_SubgroupInvocationID == 0) {
        sg_acc_t2[gl_SubgroupID] = sq_sum;
    }
    barrier();
    if (lid == 0) {
        float total = 0.0;
        for (uint w = 0; w < gl_NumSubgroups; w++) {
            total += sg_acc_t2[w];
        }
        gnrm_t2 = sqrt(total);
    }
    barrier();

    // Normalize (all-zero groups stay zero) and apply the first sign table.
    // Each thread only touches its own slot here.
    const float inv_norm = (gnrm_t2 > 1e-10) ? (1.0 / gnrm_t2) : 0.0;
    wht_t2[lid] = wht_t2[lid] * inv_norm * TS1_T2[lid];
    barrier();

    // Butterfly: the thread whose index has bit h clear updates the pair (lid, lid + h).
    [[unroll]] for (uint h = 1; h < 128; h *= 2) {
        if ((lid & h) == 0u) {
            const float lo = wht_t2[lid];
            const float hi = wht_t2[lid + h];
            wht_t2[lid]     = lo + hi;
            wht_t2[lid + h] = lo - hi;
        }
        barrier();
    }

    const float rotated = wht_t2[lid] * TINV_T2 * TS2_T2[lid];

    // Bucket = first TM2 threshold the value is below (3 when none).
    uint code;
    if (rotated < TM2[0]) {
        code = 0u;
    } else if (rotated < TM2[1]) {
        code = 1u;
    } else if (rotated < TM2[2]) {
        code = 2u;
    } else {
        code = 3u;
    }

    // Gather the four 2-bit codes of each aligned lane quad into one byte.
    const uint lane      = gl_SubgroupInvocationID;
    const uint quad_base = lane & ~3u;
    const uint two_bits  = code & 0x3u;
    const uint packed    =  subgroupShuffle(two_bits, quad_base)
                         | (subgroupShuffle(two_bits, quad_base + 1u) << 2u)
                         | (subgroupShuffle(two_bits, quad_base + 2u) << 4u)
                         | (subgroupShuffle(two_bits, quad_base + 3u) << 6u);
    if ((lane & 3u) == 0u) {
        data_q[dst_block].qs[lid / 4u] = uint8_t(packed);
    }

    // L2 norm of the selected centroids, reduced the same way as above.
    const float recon_sq = subgroupAdd(TC2[code] * TC2[code]);
    if (lane == 0u) {
        sg_acc_t2[gl_SubgroupID] = recon_sq;
    }
    barrier();
    if (lid == 0u) {
        float total = 0.0;
        for (uint w = 0; w < gl_NumSubgroups; w++) {
            total += sg_acc_t2[w];
        }
        const float recon_norm = sqrt(total);
        data_q[dst_block].norm = float16_t((recon_norm > 1e-10) ? (gnrm_t2 / recon_norm) : gnrm_t2);
    }
}
|
||||
|
||||
#elif defined(SET_ROWS) && defined(DATA_A_TURBO4_0)
|
||||
// Mirror of the TURBO3_0 block above, adapted for turbo4 (16 centroids,
|
||||
// 4-bit nibble pack, no signs byte). WHT tables and reduction structure
|
||||
// are identical (QK = 128 for both). The block struct keeps a reserved
|
||||
// rnorm field for ABI parity with the legacy 3-bit + QJL layout.
|
||||
const float TS1_T4[128] = float[128](
|
||||
-1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, 1, 1, 1,
|
||||
1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1,
|
||||
-1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1,
|
||||
1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1,
|
||||
-1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, 1,
|
||||
1, -1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, -1,
|
||||
-1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, 1, -1, 1,
|
||||
1, -1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, 1, 1, -1, 1
|
||||
);
|
||||
const float TS2_T4[128] = float[128](
|
||||
1, 1, 1, 1, -1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, -1,
|
||||
1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, 1,
|
||||
1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1,
|
||||
1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1,
|
||||
1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, 1,
|
||||
-1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1,
|
||||
1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1,
|
||||
-1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1
|
||||
);
|
||||
const float TINV_T4 = 0.08838834764831845; // 1 / sqrt(128)
|
||||
// Lloyd-Max centroids for N(0, 1/128), 16 levels (matches CENTROIDS_4BIT in C ref)
|
||||
const float TC4[16] = float[16](
|
||||
-0.173926, -0.117195, -0.089527, -0.068756,
|
||||
-0.051262, -0.035597, -0.020989, -0.006938,
|
||||
0.006938, 0.020989, 0.035597, 0.051262,
|
||||
0.068756, 0.089527, 0.117195, 0.173926
|
||||
);
|
||||
// 15 midpoints between adjacent centroids
|
||||
const float TM4[15] = float[15](
|
||||
-0.145561, -0.103361, -0.079142, -0.060009,
|
||||
-0.043430, -0.028293, -0.013964, 0.0,
|
||||
0.013964, 0.028293, 0.043430, 0.060009,
|
||||
0.079142, 0.103361, 0.145561
|
||||
);
|
||||
|
||||
shared float wht_t4[128];
|
||||
shared float sg_acc_t4[16];
|
||||
shared float gnrm_t4;
|
||||
|
||||
// SET_ROWS entry point for TURBO4_0. One 128-thread workgroup quantizes one
// 128-element group (one element per thread): normalize by the group's L2
// norm, apply TS1_T4 signs, the 128-point butterfly and TINV_T4 * TS2_T4, then
// store a 4-bit TM4 bucket per element (2 per qs byte), zero rnorm, and write
// norm = gnrm_t4 / ||selected TC4 entries||.
void main() {
    const uint lid            = gl_LocalInvocationID.x;
    const uint group          = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
    const uint groups_per_row = p.ne00 / 128;

    // Workgroup-uniform exits: nothing to do, or past the last group.
    if (groups_per_row == 0 || group >= p.ne / 128) {
        return;
    }

    // Split the flat group number into (group-in-row, i01, i02, i03).
    const uint ig         = group % groups_per_row;
    const uint row_linear = group / groups_per_row;
    const uint i01        = row_linear % p.ne01;
    const uint plane      = row_linear / p.ne01;
    const uint i02        = plane % p.ne12;
    const uint i03        = plane / p.ne12;

    const uint src_base  = src0_idx(ig * 128, i01, i02, i03) + get_aoffset();
    const uint dst_row   = data_i[src1_idx(i01, fastmod(i02, p.ne11), fastmod(i03, p.ne12), 0) + get_boffset()] DATA_I_SWIZZLE;
    const uint dst_block = dst_idx(ig, dst_row, i02, i03) + get_doffset();

    wht_t4[lid] = data_s[src_base + lid];
    barrier();

    // L2 norm of the group: subgroup partial sums, then thread 0 combines them.
    const float sq_sum = subgroupAdd(wht_t4[lid] * wht_t4[lid]);
    if (gl_SubgroupInvocationID == 0) {
        sg_acc_t4[gl_SubgroupID] = sq_sum;
    }
    barrier();
    if (lid == 0) {
        float total = 0.0;
        for (uint w = 0; w < gl_NumSubgroups; w++) {
            total += sg_acc_t4[w];
        }
        gnrm_t4 = sqrt(total);
    }
    barrier();

    // Normalize (all-zero groups stay zero) and apply the first sign table.
    // Each thread only touches its own slot here.
    const float inv_norm = (gnrm_t4 > 1e-10) ? (1.0 / gnrm_t4) : 0.0;
    wht_t4[lid] = wht_t4[lid] * inv_norm * TS1_T4[lid];
    barrier();

    // Butterfly: the thread whose index has bit h clear updates the pair (lid, lid + h).
    [[unroll]] for (uint h = 1; h < 128; h *= 2) {
        if ((lid & h) == 0u) {
            const float lo = wht_t4[lid];
            const float hi = wht_t4[lid + h];
            wht_t4[lid]     = lo + hi;
            wht_t4[lid + h] = lo - hi;
        }
        barrier();
    }

    const float rotated = wht_t4[lid] * TINV_T4 * TS2_T4[lid];

    // TM4 is ascending, so the bucket equals the number of thresholds the
    // value has reached.
    uint code = 0u;
    [[unroll]] for (uint m = 0; m < 15; m++) {
        code += uint(rotated >= TM4[m]);
    }

    // Even lane supplies the low nibble, the following odd lane the high nibble.
    const uint lane      = gl_SubgroupInvocationID;
    const uint pair_base = lane & ~1u;
    const uint nibble    = code & 0xFu;
    const uint packed    = subgroupShuffle(nibble, pair_base)
                         | (subgroupShuffle(nibble, pair_base + 1u) << 4u);
    if ((lane & 1u) == 0u) {
        data_q[dst_block].qs[lid / 2u] = uint8_t(packed);
    }

    // rnorm is not used by this path; it is written as zero.
    if (lid == 0u) {
        data_q[dst_block].rnorm = float16_t(0.0);
    }

    // L2 norm of the selected centroids, reduced the same way as above.
    const float recon_sq = subgroupAdd(TC4[code] * TC4[code]);
    if (lane == 0u) {
        sg_acc_t4[gl_SubgroupID] = recon_sq;
    }
    barrier();
    if (lid == 0u) {
        float total = 0.0;
        for (uint w = 0; w < gl_NumSubgroups; w++) {
            total += sg_acc_t4[w];
        }
        const float recon_norm = sqrt(total);
        data_q[dst_block].norm = float16_t((recon_norm > 1e-10) ? (gnrm_t4 / recon_norm) : gnrm_t4);
    }
}
|
||||
|
||||
#elif defined(SET_ROWS) && defined(DATA_A_TQ4_1S)
|
||||
|
||||
// SET_ROWS entry point for TQ4_1S. Each invocation owns one element of a
// 32-element block: sign flip + 32-point butterfly + 1/sqrt(32) scaling, then
// a separate scale is fitted for elements 0..15 (d0) and 16..31 (d1) by a
// 9-point search around the half-block RMS followed by 6 refinement passes,
// and the final 4-bit centroid indices are nibble-packed into qs.
// NOTE(review): the shuffles address lanes by gl_LocalInvocationID.x, so this
// presumably requires the 32 invocations of a block to form one subgroup with
// lane == t — confirm against the pipeline's workgroup/subgroup setup.
void main() {
    const uint t              = gl_LocalInvocationID.x;
    const uint block_linear   = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
    const uint blocks_per_row = p.ne00 / 32;

    if (blocks_per_row == 0 || block_linear >= p.ne / 32) {
        return;
    }

    // Split the flat block number into (block-in-row, i01, i02, i03).
    const uint ig         = block_linear % blocks_per_row;
    const uint row_linear = block_linear / blocks_per_row;
    const uint i01        = row_linear % p.ne01;
    const uint plane      = row_linear / p.ne01;
    const uint i02        = plane % p.ne12;
    const uint i03        = plane / p.ne12;

    const uint src_base  = src0_idx(ig * 32, i01, i02, i03) + get_aoffset();
    const uint dst_row   = data_i[src1_idx(i01, fastmod(i02, p.ne11), fastmod(i03, p.ne12), 0) + get_boffset()] DATA_I_SWIZZLE;
    const uint dst_block = dst_idx(ig, dst_row, i02, i03) + get_doffset();

    const bool lower_half = t < 16u;

    // Forward transform: sign flip, butterfly across lanes, normalization.
    float val = data_s[src_base + t];
    val *= TQ4_SIGNS[t];
    [[unroll]] for (uint h = 1u; h < 32u; h <<= 1u) {
        const float partner = subgroupShuffleXor(val, h);
        if ((t & h) == 0u) {
            val = val + partner;
        } else {
            val = partner - val;
        }
    }
    val *= TQ4_INV_SQRT32;

    // RMS of each 16-element half.
    const float sq     = val * val;
    const float rms_lo = sqrt(subgroupAdd(lower_half ? sq : 0.0) / 16.0);
    const float rms_hi = sqrt(subgroupAdd(lower_half ? 0.0 : sq) / 16.0);

    // Coarse search: try 9 multiples of the RMS and keep the pair of scales
    // with the smallest total squared error over the block.
    const float SCALES[9] = float[9](0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.35, 1.5);
    float best_d0  = rms_lo;
    float best_d1  = rms_hi;
    float best_err = 1e30;

    for (uint si = 0u; si < 9u; si++) {
        const float cand_d0 = rms_lo * SCALES[si];
        const float cand_d1 = rms_hi * SCALES[si];
        const float my_d    = lower_half ? cand_d0 : cand_d1;
        const float my_inv  = (my_d > 1e-10) ? 1.0 / my_d : 0.0;

        const float centroid = TQ4_CENTROIDS[tq4_choose_index(val * my_inv)];
        const float residual = val - centroid * my_d;
        const float err      = subgroupAdd(residual * residual);

        if (err < best_err) {
            best_err = err;
            best_d0  = cand_d0;
            best_d1  = cand_d1;
        }
    }

    // Refinement: with the centroid assignment fixed, the least-squares scale
    // of a half is sum(val * c) / sum(c * c); repeat 6 times.
    [[unroll]] for (uint iter = 0u; iter < 6u; iter++) {
        const float my_d   = lower_half ? best_d0 : best_d1;
        const float my_inv = (my_d > 1e-10) ? 1.0 / my_d : 0.0;

        const float c  = TQ4_CENTROIDS[tq4_choose_index(val * my_inv)];
        const float vc = val * c;
        const float cc = c * c;

        const float num_lo = subgroupAdd(lower_half ? vc : 0.0);
        const float den_lo = subgroupAdd(lower_half ? cc : 0.0);
        const float num_hi = subgroupAdd(lower_half ? 0.0 : vc);
        const float den_hi = subgroupAdd(lower_half ? 0.0 : cc);

        if (den_lo > 1e-10) {
            best_d0 = num_lo / den_lo;
        }
        if (den_hi > 1e-10) {
            best_d1 = num_hi / den_hi;
        }
    }

    // Final index with the refined scales.
    const float final_d   = lower_half ? best_d0 : best_d1;
    const float final_inv = (final_d > 1e-10) ? 1.0 / final_d : 0.0;
    const uint  cidx      = tq4_choose_index(val * final_inv);

    // Nibble packing: every lane fetches its neighbour's index, but only the
    // even lane of a pair writes (own index low, neighbour's index high).
    const uint neighbour = subgroupShuffle(cidx, t ^ 1u);
    if ((t & 1u) == 0u) {
        data_q[dst_block].qs[t >> 1u] = uint8_t(cidx | (neighbour << 4u));
    }

    // Thread 0 stores both half-block scales.
    if (t == 0u) {
        data_q[dst_block].d0 = float16_t(best_d0);
        data_q[dst_block].d1 = float16_t(best_d1);
    }
}
|
||||
#elif defined(SET_ROWS)
|
||||
|
||||
void main() {
|
||||
#ifdef NEEDS_INIT_IQ_SHMEM
|
||||
|
||||
@@ -651,3 +651,78 @@ vec2 get_dm(uint ib, uint a_offset) {
|
||||
return vec2(1, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_TURBO3_0)
|
||||
// Decodes two consecutive elements (iqs and iqs + 1) of TURBO3_0 block
// a_offset + ib. The 3-bit centroid index is assembled from two bits in qs
// (4 elements per byte) and one bit in signs (8 elements per byte).
// Returns the raw centroid values; the block norm is not applied here.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const float centroids[8] = float[8](
        -0.190685, -0.117832, -0.065717, -0.021460,
         0.021460,  0.065717,  0.117832,  0.190685
    );

    vec2 result;
    for (uint k = 0u; k < 2u; ++k) {
        const uint e  = iqs + k;
        const uint lo = (uint(data_a[a_offset + ib].qs[e / 4u]) >> ((e % 4u) * 2u)) & 0x3u;
        const uint hi = (uint(data_a[a_offset + ib].signs[e / 8u]) >> (e % 8u)) & 0x1u;
        result[k] = centroids[lo | (hi << 2u)];
    }
    return result;
}
|
||||
// Four consecutive elements starting at iqs, built from two dequantize() pairs.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const vec2 first_pair  = dequantize(ib, iqs, a_offset);
    const vec2 second_pair = dequantize(ib, iqs + 2, a_offset);
    return vec4(first_pair, second_pair);
}
|
||||
// Returns (block norm, 0) for TURBO3_0 block a_offset + ib.
vec2 get_dm(uint ib, uint a_offset) {
    const float block_norm = float(data_a[a_offset + ib].norm);
    return vec2(block_norm, 0);
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_TQ4_1S)
|
||||
// Decodes two consecutive elements (iqs and iqs + 1) of TQ4_1S block
// a_offset + ib. iqs is an element index: element e reads nibble (e & 1) of
// qs[e / 2] and is scaled by d0 when e < 16, otherwise by d1.
// The result is centroid * scale only; no inverse rotation is applied here,
// so the caller has to handle that.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const float centroids[16] = float[16](
        -2.732590, -2.069017, -1.618046, -1.256231,
        -0.942340, -0.656759, -0.388048, -0.128395,
         0.128395,  0.388048,  0.656759,  0.942340,
         1.256231,  1.618046,  2.069017,  2.732590
    );

    const float d0 = float(data_a[a_offset + ib].d0);
    const float d1 = float(data_a[a_offset + ib].d1);

    vec2 result;
    for (uint k = 0u; k < 2u; ++k) {
        const uint e      = iqs + k;
        const uint nibble = (uint(data_a[a_offset + ib].qs[e / 2u]) >> ((e & 1u) * 4u)) & 0xFu;
        const float scale = (e < 16u) ? d0 : d1;
        result[k] = centroids[nibble] * scale;
    }
    return result;
}
|
||||
// Four consecutive elements starting at iqs, built from two dequantize() pairs.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const vec2 first_pair  = dequantize(ib, iqs, a_offset);
    const vec2 second_pair = dequantize(ib, iqs + 2, a_offset);
    return vec4(first_pair, second_pair);
}
|
||||
// TQ4_1S has no block-wide scale/offset to report: dequantize() already
// multiplies each element by d0 or d1, so this is the identity pair (1, 0).
vec2 get_dm(uint ib, uint a_offset) {
    const vec2 identity_dm = vec2(1, 0);
    return identity_dm;
}
|
||||
#endif
|
||||
|
||||
@@ -715,6 +715,33 @@ float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_TURBO3_0)
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufTURBO3_0 {
|
||||
block_turbo3_0 block;
|
||||
};
|
||||
|
||||
// Decode callback for TURBO3_0 blocks: returns centroid * block norm for the
// element at coordInBlock[1]. The 3-bit centroid index is assembled from two
// bits in qs (4 elements per byte) and one bit in signs (8 elements per byte).
float16_t dequantFuncTURBO3_0(const in decodeBufTURBO3_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const float centroids[8] = float[8](
        -0.190685, -0.117832, -0.065717, -0.021460,
         0.021460,  0.065717,  0.117832,  0.190685
    );

    const uint elem = coordInBlock[1];

    const uint qs_byte   = uint(bl.block.qs[elem / 4]);
    const uint sign_byte = uint(bl.block.signs[elem / 8]);

    const uint low_bits = (qs_byte >> ((elem % 4) * 2)) & 0x3;
    const uint high_bit = (sign_byte >> (elem % 8)) & 0x1;
    const uint code     = low_bits | (high_bit << 2);

    const float block_norm = float(bl.block.norm);
    return float16_t(centroids[code] * block_norm);
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q1_0)
|
||||
#define dequantFuncA dequantFuncQ1_0
|
||||
#elif defined(DATA_A_Q4_0)
|
||||
@@ -763,6 +790,8 @@ float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords
|
||||
#define dequantFuncA dequantFuncMXFP4
|
||||
#elif defined(DATA_A_NVFP4)
|
||||
#define dequantFuncA dequantFuncNVFP4
|
||||
#elif defined(DATA_A_TURBO3_0)
|
||||
#define dequantFuncA dequantFuncTURBO3_0
|
||||
#elif defined(DATA_A_F32)
|
||||
#define dequantFuncA dequantFuncF32
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,65 @@
|
||||
#version 450
|
||||
|
||||
#include "dequant_head.glsl"
|
||||
|
||||
// 256 threads per workgroup, each thread fully dequants one TQ4_1S block
|
||||
// (32 elements, 20 bytes). Workgroups process 256 blocks = 8192 elements each
|
||||
// so the x dispatch stays under maxComputeWorkGroupCount[0] for large tensors.
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {block_tq4_1s data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
// Dequantizes TQ4_1S blocks, one whole 32-element block per invocation:
// unpack the 4-bit centroid indices and scale them (d0 for elements 0..15,
// d1 for 16..31), run a 5-stage butterfly over the 32 values, then write
// value * 1/sqrt(32) * sign to data_b.
void main() {
    const uint ib = gl_WorkGroupID.x * gl_WorkGroupSize.x + gl_LocalInvocationID.x;
    if (ib >= p.nel / 32) {
        return;
    }

    // 16 centroid levels indexed by the stored nibble
    const float centroids[16] = float[16](
        -2.732590, -2.069017, -1.618046, -1.256231,
        -0.942340, -0.656759, -0.388048, -0.128395,
         0.128395,  0.388048,  0.656759,  0.942340,
         1.256231,  1.618046,  2.069017,  2.732590
    );

    // Per-element sign applied after the butterfly
    const float signs[32] = float[32](
        +1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
        -1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
        -1.0, -1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0,
        -1.0, +1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0
    );

    const float INV_SQRT32 = 0.17677669529663688;

    const float d0 = float(data_a[ib].d0);
    const float d1 = float(data_a[ib].d1);

    // Each qs byte holds two elements: low nibble = even element, high nibble
    // = odd element. Bytes 0..7 belong to the d0 half, bytes 8..15 to d1.
    float buf[32];
    for (uint b = 0u; b < 16u; b++) {
        const uint  packed = uint(data_a[ib].qs[b]);
        const float scale  = (b < 8u) ? d0 : d1;
        buf[2u * b]      = centroids[packed & 0xFu] * scale;
        buf[2u * b + 1u] = centroids[(packed >> 4u) & 0xFu] * scale;
    }

    // Butterfly: each stage has 16 independent pairs (lo, lo + step). For the
    // pair number `pair`, lo keeps the bits below `step` and shifts the
    // remaining bits up by one, which leaves bit `step` clear.
    for (uint step = 1u; step < 32u; step <<= 1u) {
        const uint low_mask = step - 1u;
        for (uint pair = 0u; pair < 16u; pair++) {
            const uint lo = ((pair & ~low_mask) << 1u) | (pair & low_mask);
            const uint hi = lo + step;
            const float a = buf[lo];
            const float b = buf[hi];
            buf[lo] = a + b;
            buf[hi] = a - b;
        }
    }

    // Normalize, apply the sign pattern and store.
    const uint out_base = ib * 32u;
    for (uint e = 0u; e < 32u; e++) {
        data_b[out_base + e] = D_TYPE(buf[e] * INV_SQRT32 * signs[e]);
    }
}
|
||||
@@ -0,0 +1,36 @@
|
||||
#version 450
|
||||
|
||||
#include "dequant_head.glsl"
|
||||
|
||||
// 128 elements per block (QK_TURBO3 = 128)
|
||||
// Each workgroup processes one block. 128 threads, 1 element per thread.
|
||||
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {block_turbo3_0 data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
// Dequantizes TURBO3_0 blocks: workgroup x handles block x, invocation j
// handles element j of that block and writes centroid * block norm to data_b.
void main() {
    const uint block = gl_WorkGroupID.x;
    const uint elem  = gl_LocalInvocationID.x;

    if (block >= p.nel / 128) {
        return;
    }

    const float centroids[8] = float[8](
        -0.190685, -0.117832, -0.065717, -0.021460,
         0.021460,  0.065717,  0.117832,  0.190685
    );

    // 3-bit index = two low bits from qs (4 per byte) + one high bit from signs (8 per byte)
    const uint qs_byte   = uint(data_a[block].qs[elem / 4]);
    const uint sign_byte = uint(data_a[block].signs[elem / 8]);
    const uint low_bits  = (qs_byte >> ((elem % 4) * 2)) & 0x3;
    const uint high_bit  = (sign_byte >> (elem % 8)) & 0x1;
    const uint code      = low_bits | (high_bit << 2);

    const float block_norm = float(data_a[block].norm);
    data_b[block * 128 + elem] = D_TYPE(centroids[code] * block_norm);
}
|
||||
@@ -35,10 +35,12 @@ const uint32_t num_subgroups = SubGroupSize == 0 ? 0 : WorkGroupSize / SubGroupS
|
||||
|
||||
layout (binding = 0) readonly buffer Q {float data_q[];};
|
||||
layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];};
|
||||
#if !defined(DATA_A_TURBO3_0)
|
||||
layout (binding = 1) readonly buffer K {float16_t data_k[];};
|
||||
layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];};
|
||||
layout (binding = 2) readonly buffer V {float16_t data_v[];};
|
||||
layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
|
||||
#endif
|
||||
layout (binding = 3) readonly buffer M {float16_t data_m[];};
|
||||
|
||||
// If SubGroupSize is set to 0 then only use shmem reductions
|
||||
|
||||
@@ -140,6 +140,35 @@ uint fa_quant_r_mmq(uint ty) {
|
||||
#define USE_DECODE_K (FaTypeK != FA_TYPE_F16)
|
||||
#define USE_DECODE_V (FaTypeV != FA_TYPE_F16)
|
||||
|
||||
#if defined(DATA_A_TURBO3_0)
|
||||
const float T3C[8] = float[8](
|
||||
-0.190685, -0.117832, -0.065717, -0.021460,
|
||||
0.021460, 0.065717, 0.117832, 0.190685
|
||||
);
|
||||
// Dequantize four consecutive TURBO3_0 elements (iqs .. iqs+3) of block
// `a_offset + ib`, reading from the K buffer when binding_idx ==
// BINDING_IDX_K and from the V buffer otherwise.
//
// The block index, the buffer selection and the per-block norm do not depend
// on the element, so they are computed once instead of once per element.
// The packed qs/signs bytes are still fetched per element because nothing
// here guarantees that iqs is aligned to a byte boundary.
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
    const uint blk  = a_offset + ib;
    const bool is_k = (binding_idx == BINDING_IDX_K);

    // Only the selected operand of ?: is evaluated, so exactly one buffer is read.
    const float nm = is_k ? float(data_k_t3[blk].norm)
                          : float(data_v_t3[blk].norm);

    FLOAT_TYPEV4 r;
    for (int k = 0; k < 4; k++) {
        const uint j = iqs + uint(k);

        uint qb; // byte holding the low two bits of element j (4 codes per byte)
        uint sb; // byte holding the high bit of element j (8 codes per byte)
        if (is_k) {
            qb = uint(data_k_t3[blk].qs[j / 4]);
            sb = uint(data_k_t3[blk].signs[j / 8]);
        } else {
            qb = uint(data_v_t3[blk].qs[j / 4]);
            sb = uint(data_v_t3[blk].signs[j / 8]);
        }

        const uint lo = (qb >> ((j % 4) * 2)) & 0x3;
        const uint hi = (sb >> (j % 8)) & 0x1;

        // 3-bit code = hi:lo, used as an index into the T3C codebook.
        r[k] = FLOAT_TYPE(T3C[lo | (hi << 2)] * nm);
    }
    return r;
}
|
||||
#endif
|
||||
|
||||
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
|
||||
#include "mul_mat_vec_base.glsl"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
// Lloyd-Max centroids for TQ4_1S (4-bit, 16 levels) — N(0, 1) optimal
|
||||
const float TQ4_CENTROIDS[16] = float[16](
|
||||
-2.732590, -2.069017, -1.618046, -1.256231,
|
||||
-0.942340, -0.656759, -0.388048, -0.128395,
|
||||
0.128395, 0.388048, 0.656759, 0.942340,
|
||||
1.256231, 1.618046, 2.069017, 2.732590
|
||||
);
|
||||
|
||||
// WHT sign pattern for 32-element blocks (shared by TQ3 and TQ4)
|
||||
const float TQ4_SIGNS[32] = float[32](
|
||||
+1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
|
||||
-1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
|
||||
-1.0, -1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0,
|
||||
-1.0, +1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0
|
||||
);
|
||||
|
||||
const float TQ4_INV_SQRT32 = 0.17677669529663688;
|
||||
|
||||
// See the commit message on a850ccc for the full derivation and portability
|
||||
// rationale. Short version: pre-rotate the activation block via forward WHT
|
||||
// in shared memory, then dot-product against the raw centroid*scale weights.
|
||||
//
|
||||
// Shared memory budget: NUM_COLS * 32 floats (max 1 KiB at NUM_COLS=8)
|
||||
// plus whatever tmpsh the reduction helper allocates.
|
||||
|
||||
shared float tq4_smem[8 * 32];
|
||||
|
||||
// Accumulate `num_rows` output rows starting at `first_row` for all NUM_COLS
// activation columns, then hand the per-invocation partial sums to
// reduce_result().
//
// For every 32-element block position along the row:
//   1. each invocation loads one activation per column, applies its sign and
//      stores it in shared memory;
//   2. a 5-stage butterfly transforms each 32-float column slice in place;
//   3. each invocation decodes its own weight (centroid * half-block scale *
//      1/sqrt(32)) for every row;
//   4. weight * transformed activation is added to the accumulators.
//
// NOTE(review): TQ4_SIGNS[lane] and the 32-float shared-memory slices presume
// the workgroup size (spec constant 0) is exactly 32 — confirm against the
// host-side pipeline creation.
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
    const uint lane = gl_LocalInvocationID.x;

    uint a_offset, b_offset, d_offset;
    get_offsets(a_offset, b_offset, d_offset);

    // Per-invocation partial sums, one per (column, row) pair.
    FLOAT_TYPE acc[NUM_COLS][NUM_ROWS];
    [[unroll]] for (uint col = 0; col < NUM_COLS; ++col) {
        [[unroll]] for (uint row = 0; row < NUM_ROWS; ++row) {
            acc[col][row] = FLOAT_TYPE(0);
        }
    }

    const uint blocks_per_row = p.ncols / 32u;

    // Two 4-bit codes per qs byte: even lanes take the low nibble, odd lanes
    // the high nibble.
    const uint  qs_index  = lane >> 1u;
    const uint  qs_shift  = (lane & 1u) << 2u;
    const float lane_sign = TQ4_SIGNS[lane];

    for (uint blk = 0; blk < blocks_per_row; blk++) {
        // --- Stage 1: signed activations into shared memory ---
        [[unroll]] for (uint col = 0; col < NUM_COLS; ++col) {
            const uint src = col * p.batch_stride_b + b_offset + blk * 32u;
            tq4_smem[col * 32u + lane] = float(data_b[src + lane]) * lane_sign;
        }
        barrier();

        // --- Stage 2: in-place butterfly, strides 1, 2, 4, 8, 16 ---
        // Only the lane whose `stride` bit is clear updates a pair; the
        // barrier sits outside the branch so every invocation reaches it.
        [[unroll]] for (uint stride = 1u; stride < 32u; stride <<= 1u) {
            if ((lane & stride) == 0u) {
                const uint mate = lane + stride;
                [[unroll]] for (uint col = 0; col < NUM_COLS; ++col) {
                    const uint slice = col * 32u;
                    const float lo = tq4_smem[slice + lane];
                    const float hi = tq4_smem[slice + mate];
                    tq4_smem[slice + lane] = lo + hi;
                    tq4_smem[slice + mate] = lo - hi;
                }
            }
            barrier();
        }

        // --- Stage 3: decode this lane's weight for every row ---
        // Weights are computed before the accumulation loop so each
        // transformed activation is read from shared memory once per column.
        float weights[NUM_ROWS];
        [[unroll]] for (uint row = 0; row < num_rows; ++row) {
            const uint ib   = (first_row + row) * blocks_per_row + blk;
            const uint code = (uint(data_a[a_offset + ib].qs[qs_index]) >> qs_shift) & 0xFu;

            // d0 scales elements 0..15 of the block, d1 elements 16..31.
            float scale;
            if (lane < 16u) {
                scale = float(data_a[a_offset + ib].d0);
            } else {
                scale = float(data_a[a_offset + ib].d1);
            }
            weights[row] = TQ4_CENTROIDS[code] * scale * TQ4_INV_SQRT32;
        }

        // --- Stage 4: accumulate ---
        [[unroll]] for (uint col = 0; col < NUM_COLS; ++col) {
            const float act = tq4_smem[col * 32u + lane];
            [[unroll]] for (uint row = 0; row < num_rows; ++row) {
                acc[col][row] += FLOAT_TYPE(weights[row] * act);
            }
        }

        // All reads of this block's shared data must finish before the next
        // iteration's stage-1 stores overwrite it.
        barrier();
    }

    reduce_result(acc, d_offset, first_row, num_rows, lane);
}
|
||||
|
||||
// Entry point: each workgroup handles NUM_ROWS consecutive output rows.
// A full tile is dispatched with the compile-time constant NUM_ROWS; the last
// (partial) tile passes the remaining row count; workgroups that start at or
// beyond p.stride_d do nothing.
void main() {
    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);

    // Full tile.
    if (first_row + NUM_ROWS <= p.stride_d) {
        compute_outputs(first_row, NUM_ROWS);
        return;
    }

    // Partial tile at the end of the output.
    if (first_row < p.stride_d) {
        compute_outputs(first_row, p.stride_d - first_row);
    }
}
|
||||
@@ -0,0 +1,67 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
layout (local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (push_constant) uniform parameter { uint ne; uint direction; uint group_size; } p;
|
||||
|
||||
layout (binding = 0) readonly buffer A { float data_a[]; };
|
||||
layout (binding = 1) writeonly buffer D { float data_d[]; };
|
||||
|
||||
shared float x[128];
|
||||
|
||||
// Pre-scramble sign vectors applied before and after the WHT.
|
||||
// direction == 0: pre = S1, post = S2; direction == 1: pre = S2, post = S1.
|
||||
const float S1[128] = float[128](
|
||||
-1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, 1, 1, 1,
|
||||
1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1,
|
||||
-1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1,
|
||||
1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1,
|
||||
-1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, 1,
|
||||
1, -1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, -1,
|
||||
-1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, 1, -1, 1,
|
||||
1, -1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, 1, 1, -1, 1
|
||||
);
|
||||
|
||||
const float S2[128] = float[128](
|
||||
1, 1, 1, 1, -1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, -1,
|
||||
1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, 1,
|
||||
1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1,
|
||||
1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1,
|
||||
1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, 1,
|
||||
-1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1,
|
||||
1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1,
|
||||
-1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1
|
||||
);
|
||||
|
||||
const float INV_SQRT_128 = 0.08838834764831845; // 1 / sqrt(128)
|
||||
|
||||
// 128-point signed Walsh-Hadamard transform of one 128-float group per
// workgroup:  out = post_sign * (1/sqrt(128)) * WHT(pre_sign * in).
// direction == 0 uses pre = S1, post = S2; any other value swaps them.
//
// Changes relative to the previous version:
//  * Out-of-range invocations no longer `return` before the barriers. They
//    contribute 0.0 and skip the final store instead, so every barrier() is
//    reached by the whole workgroup even if p.ne is not a multiple of 128.
//  * The load and the pre-sign multiply are fused; each invocation only
//    touches its own slot at that point, so no barrier is needed between them.
//
// NOTE(review): p.group_size is not read here; the transform width is fixed
// at 128. Confirm the host only dispatches this pipeline for group_size 128.
void main() {
    const uint tid  = gl_LocalInvocationID.x;
    const uint base = (gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x) * 128;
    const uint idx  = base + tid;

    const bool in_range = idx < p.ne;

    const bool  forward   = (p.direction == 0);
    const float pre_sign  = forward ? S1[tid] : S2[tid];
    const float post_sign = forward ? S2[tid] : S1[tid];

    // Only the selected operand of ?: is evaluated, so no out-of-range read.
    x[tid] = in_range ? data_a[idx] * pre_sign : 0.0;
    barrier();

    // In-place butterfly; the lower element of each pair updates both slots.
    [[unroll]] for (uint h = 1; h < 128; h *= 2) {
        if ((tid % (2 * h)) < h) {
            const float a = x[tid];
            const float b = x[tid + h];
            x[tid]     = a + b;
            x[tid + h] = a - b;
        }
        barrier();
    }

    if (in_range) {
        data_d[idx] = x[tid] * INV_SQRT_128 * post_sign;
    }
}
|
||||
@@ -6,6 +6,7 @@
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
|
||||
#if defined(DATA_A_F32)
|
||||
#define QUANT_K 1
|
||||
@@ -1729,6 +1730,70 @@ struct block_nvfp4
|
||||
#define A_TYPE block_nvfp4
|
||||
#endif
|
||||
|
||||
#define QUANT_K_TURBO3_0 128
|
||||
#define QUANT_R_TURBO3_0 1
|
||||
|
||||
struct block_turbo3_0
|
||||
{
|
||||
float16_t norm;
|
||||
uint8_t qs[32]; // 2-bit centroid indices (4 per byte), 128/4 = 32 bytes
|
||||
uint8_t signs[16]; // 1-bit high bit of 3-bit index (8 per byte), 128/8 = 16 bytes
|
||||
};
|
||||
|
||||
#if defined(DATA_A_TURBO3_0)
|
||||
#define QUANT_K QUANT_K_TURBO3_0
|
||||
#define QUANT_R QUANT_R_TURBO3_0
|
||||
#define QUANT_AUXF 1
|
||||
#define A_TYPE block_turbo3_0
|
||||
#endif
|
||||
|
||||
#define QUANT_K_TURBO2_0 128
|
||||
#define QUANT_R_TURBO2_0 1
|
||||
struct block_turbo2_0
|
||||
{
|
||||
float16_t norm;
|
||||
uint8_t qs[32]; // 2-bit centroid indices (4 per byte), 128/4 = 32 bytes
|
||||
};
|
||||
#if defined(DATA_A_TURBO2_0)
|
||||
#define QUANT_K QUANT_K_TURBO2_0
|
||||
#define QUANT_R QUANT_R_TURBO2_0
|
||||
#define QUANT_AUXF 1
|
||||
#define A_TYPE block_turbo2_0
|
||||
#endif
|
||||
|
||||
#define QUANT_K_TURBO4_0 128
|
||||
#define QUANT_R_TURBO4_0 1
|
||||
struct block_turbo4_0
|
||||
{
|
||||
float16_t norm;
|
||||
float16_t rnorm; // reserved in 4-bit mode (kept for ABI parity with legacy)
|
||||
uint8_t qs[64]; // 4-bit centroid indices, nibble-packed (2 per byte), 128/2 = 64 bytes
|
||||
};
|
||||
#if defined(DATA_A_TURBO4_0)
|
||||
#define QUANT_K QUANT_K_TURBO4_0
|
||||
#define QUANT_R QUANT_R_TURBO4_0
|
||||
#define QUANT_AUXF 1
|
||||
#define A_TYPE block_turbo4_0
|
||||
#endif
|
||||
|
||||
|
||||
#define QUANT_K_TQ4_1S 32
|
||||
#define QUANT_R_TQ4_1S 1
|
||||
|
||||
struct block_tq4_1s
|
||||
{
|
||||
float16_t d0; // scale for elements 0-15
|
||||
float16_t d1; // scale for elements 16-31
|
||||
uint8_t qs[16]; // 4-bit nibble-packed centroid indices (2 per byte)
|
||||
};
|
||||
|
||||
#if defined(DATA_A_TQ4_1S)
|
||||
#define QUANT_K QUANT_K_TQ4_1S
|
||||
#define QUANT_R QUANT_R_TQ4_1S
|
||||
#define QUANT_AUXF 1
|
||||
#define A_TYPE block_tq4_1s
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS)
|
||||
const int8_t kvalues_iq4nl_const[16] = {
|
||||
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
|
||||
|
||||
@@ -68,6 +68,8 @@ const std::vector<std::string> type_names = {
|
||||
"mxfp4",
|
||||
"nvfp4",
|
||||
"bf16",
|
||||
"turbo3_0",
|
||||
"tq4_1s",
|
||||
};
|
||||
|
||||
enum MatMulIdType {
|
||||
@@ -563,6 +565,11 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
if (tname == "bf16") {
|
||||
continue;
|
||||
}
|
||||
// TQ4_1S uses a specialized mul_mat_vec shader for small N and
|
||||
// the dequant+f16 matmul fallback for large N. No dedicated mul_mm needed.
|
||||
if (tname == "tq4_1s") {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
// For unaligned, load one at a time for f32/f16, or two at a time for quants
|
||||
@@ -667,7 +674,7 @@ void process_shaders() {
|
||||
for (const auto& tname : type_names) {
|
||||
// mul mat vec
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
|
||||
std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_") || tname == "tq4_1s") ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
|
||||
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPEV2", "vec2"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPEV2", "f16vec2"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}));
|
||||
@@ -750,12 +757,19 @@ void process_shaders() {
|
||||
string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}
|
||||
// turbo3_0 copy-from-quant only; copy-to-quant (cpy_f32_turbo3_0) omitted because the non-SET_ROWS quantize() path lacks the WHT transform
|
||||
string_to_spv("cpy_turbo3_0_f32", "copy_from_quant.comp", {{"DATA_A_TURBO3_0", "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
// tq4_1s copy-from-quant only; copy-to-quant requires WHT forward (handled in SET_ROWS path)
|
||||
string_to_spv("cpy_tq4_1s_f32", "copy_from_quant.comp", {{"DATA_A_TQ4_1S", "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
for (std::string t : {"f32", "f16", "bf16", "q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
|
||||
for (std::string t : {"f32", "f16", "bf16", "q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl", "turbo2_0", "turbo3_0", "turbo4_0", "tq4_1s"}) {
|
||||
string_to_spv("set_rows_" + t + "_i32", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uint"}, {"B_SIZE", "32"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("set_rows_" + t + "_i64", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"B_SIZE", "64"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}
|
||||
|
||||
// TurboQuant WHT operation
|
||||
string_to_spv("turbo_wht", "turbo_wht.comp", {});
|
||||
|
||||
auto get_type_str = [](bool f16) {
|
||||
return f16 ? "float16_t" : "float";
|
||||
};
|
||||
|
||||
+81
-2
@@ -749,6 +749,46 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
||||
.to_float = (ggml_to_float_t) dequantize_row_nvfp4,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_nvfp4_ref,
|
||||
},
|
||||
[GGML_TYPE_TURBO3_0] = {
|
||||
.type_name = "turbo3",
|
||||
.blck_size = QK_TURBO3,
|
||||
.type_size = sizeof(block_turbo3_0),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_turbo3_0,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_turbo3_0_ref,
|
||||
},
|
||||
[GGML_TYPE_TURBO4_0] = {
|
||||
.type_name = "turbo4",
|
||||
.blck_size = QK_TURBO4,
|
||||
.type_size = sizeof(block_turbo4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_turbo4_0,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_turbo4_0_ref,
|
||||
},
|
||||
[GGML_TYPE_TURBO2_0] = {
|
||||
.type_name = "turbo2",
|
||||
.blck_size = QK_TURBO2,
|
||||
.type_size = sizeof(block_turbo2_0),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_turbo2_0,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_turbo2_0_ref,
|
||||
},
|
||||
[GGML_TYPE_TQ3_1S] = {
|
||||
.type_name = "tq3_1s",
|
||||
.blck_size = QK_TQ3_0,
|
||||
.type_size = sizeof(block_tq3_1s),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_tq3_1s,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_tq3_1s_ref,
|
||||
},
|
||||
[GGML_TYPE_TQ4_1S] = {
|
||||
.type_name = "tq4_1s",
|
||||
.blck_size = QK_TQ4_1S,
|
||||
.type_size = sizeof(block_tq4_1s),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_tq4_1s,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_tq4_1s_ref,
|
||||
},
|
||||
[GGML_TYPE_Q2_K] = {
|
||||
.type_name = "q2_K",
|
||||
.blck_size = QK_K,
|
||||
@@ -1063,6 +1103,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"RWKV_WKV7",
|
||||
"SOLVE_TRI",
|
||||
"GATED_DELTA_NET",
|
||||
"TURBO_WHT",
|
||||
|
||||
"UNARY",
|
||||
|
||||
@@ -1080,7 +1121,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"GLU",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
|
||||
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
@@ -1173,6 +1214,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"rwkv_wkv7(r, w, k, v, a, b, s)",
|
||||
"A X = B, A triangular, solve X",
|
||||
"gated_delta_net(q, k, v, g, beta, s)",
|
||||
"turbo_wht(a)",
|
||||
|
||||
"unary(x)",
|
||||
|
||||
@@ -1190,7 +1232,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"glu(x)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
|
||||
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
@@ -6230,6 +6272,38 @@ struct ggml_tensor * ggml_gated_delta_net(
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_turbo_wht
|
||||
|
||||
struct ggml_tensor * ggml_turbo_wht(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int direction,
|
||||
int group_size,
|
||||
struct ggml_tensor * scale) {
|
||||
GGML_ASSERT(ggml_is_contiguous(a));
|
||||
GGML_ASSERT(a->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(direction == 0 || direction == 1);
|
||||
|
||||
// Auto-detect group size from tensor dimension if not specified
|
||||
if (group_size == 0) {
|
||||
group_size = (a->ne[0] % 128 == 0) ? 128 : 64;
|
||||
}
|
||||
GGML_ASSERT(group_size == 32 || group_size == 64 || group_size == 128);
|
||||
GGML_ASSERT(a->ne[0] % group_size == 0);
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne);
|
||||
|
||||
result->op = GGML_OP_TURBO_WHT;
|
||||
result->src[0] = a;
|
||||
result->src[1] = scale; // InnerQ scale_inv (NULL = no scaling)
|
||||
|
||||
// Store direction and group_size in op_params
|
||||
memcpy(result->op_params + 0, &direction, sizeof(int));
|
||||
memcpy(result->op_params + sizeof(int), &group_size, sizeof(int));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
||||
@@ -7714,6 +7788,11 @@ size_t ggml_quantize_chunk(
|
||||
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_TURBO3_0: result = quantize_turbo3_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_TURBO4_0: result = quantize_turbo4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_TURBO2_0: result = quantize_turbo2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_TQ3_1S: result = quantize_tq3_1s(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_TQ4_1S: result = quantize_tq4_1s(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
size_t elemsize = sizeof(ggml_fp16_t);
|
||||
|
||||
@@ -4142,6 +4142,8 @@ class GGMLQuantizationType(IntEnum):
|
||||
MXFP4 = 39
|
||||
NVFP4 = 40
|
||||
Q1_0 = 41
|
||||
TQ3_1S = 45
|
||||
TQ4_1S = 46
|
||||
|
||||
|
||||
class ExpertGatingFuncType(IntEnum):
|
||||
@@ -4196,6 +4198,8 @@ class LlamaFileType(IntEnum):
|
||||
MOSTLY_MXFP4_MOE = 38 # except 1d tensors
|
||||
MOSTLY_NVFP4 = 39 # except 1d tensors
|
||||
MOSTLY_Q1_0 = 40 # except 1d tensors
|
||||
MOSTLY_TQ3_1S = 43 # except 1d tensors
|
||||
MOSTLY_TQ4_1S = 44 # except 1d tensors
|
||||
|
||||
GUESSED = 1024 # not specified in the model file
|
||||
|
||||
@@ -4317,6 +4321,8 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
||||
GGMLQuantizationType.MXFP4: (32, 1 + 16),
|
||||
GGMLQuantizationType.NVFP4: (64, 4 + 32),
|
||||
GGMLQuantizationType.Q1_0: (128, 2 + 16),
|
||||
GGMLQuantizationType.TQ3_1S: (32, 2 + 2 + 12),
|
||||
GGMLQuantizationType.TQ4_1S: (32, 2 + 2 + 16),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -155,6 +155,8 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_TQ3_1S = 43, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_TQ4_1S = 44, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
||||
Executable
+13
@@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash
# build.sh — Configure (first run only) and build the CUDA target.
# Any failing step aborts the script with a non-zero exit status.
set -euo pipefail

REPO_DIR="$(cd "$(dirname "$0")/../.." && pwd)"
cd "$REPO_DIR"

# An existing build-cuda directory is taken to mean the tree is already configured.
if [[ ! -d build-cuda ]]; then
    cmake -B build-cuda \
        -DGGML_CUDA=ON \
        -DCMAKE_CUDA_ARCHITECTURES="89;120" \
        -DCMAKE_BUILD_TYPE=Release \
        -DBUILD_SHARED_LIBS=OFF 2>&1
fi

# stderr is merged into stdout so callers capture one combined log.
cmake --build build-cuda "-j$(nproc)" 2>&1
|
||||
Executable
+140
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env bash
|
||||
# run_experiment.sh — Build, benchmark, and validate a CUDA kernel experiment.
|
||||
# Outputs structured JSON to stdout. All logging goes to stderr.
|
||||
#
|
||||
# Usage: run_experiment.sh <track-name> [--quick]
|
||||
# --quick: skip PPL validation (speed-only iteration)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
TRACK="${1:?Usage: run_experiment.sh <track-name> [--quick]}"
QUICK=false
[[ "${2:-}" == "--quick" ]] && QUICK=true

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
TRACK_DIR="$SCRIPT_DIR/$TRACK"

if [[ ! -f "$TRACK_DIR/baseline.json" ]]; then
    # Build the error object with jq rather than by string concatenation so a
    # track name containing quotes or backslashes still yields valid JSON.
    jq -nc --arg error "No baseline.json found for track: $TRACK" \
        '{status: "error", error: $error}'
    exit 1
fi

# Load track config from baseline.json
BENCH_ARGS=$(jq -r '.bench_args // ""' "$TRACK_DIR/baseline.json")
MODEL=$(jq -r '.model' "$TRACK_DIR/baseline.json")
NO_CONVERT=$(jq -r '.no_convert // false' "$TRACK_DIR/baseline.json")

# For tracks that benchmark the TQ4_1S runtime kernel, disable load-time conversion.
# BENCH_ENV is later expanded unquoted after `env`, so an empty value adds no argument.
BENCH_ENV=""
if [[ "$NO_CONVERT" == "true" ]]; then
    BENCH_ENV="GGML_TQ_NO_CONVERT=1"
fi

# Perplexity check settings; a baseline of 0 or an empty file disables the check.
PPL_BASELINE=$(jq -r '.ppl // 0' "$TRACK_DIR/baseline.json")
PPL_THRESHOLD=$(jq -r '.ppl_threshold // 0.1' "$TRACK_DIR/baseline.json")
PPL_FILE=$(jq -r '.ppl_file // ""' "$TRACK_DIR/baseline.json")
|
||||
|
||||
# ---- Phase 1: Build ----
echo ">>> Building..." >&2
BUILD_START=$(date +%s)

# A failed build is reported as JSON with status "build_failed" and exit code 0,
# so the caller distinguishes outcomes by the status field.
BUILD_OUTPUT=$("$REPO_DIR/scripts/autoresearch/build.sh" 2>&1) || {
    BUILD_END=$(date +%s)
    # First 10 "error:" lines of the build log for the agent.
    # `|| true` is required: with `set -e -o pipefail`, a log without any
    # "error:" line makes grep exit 1, which would abort the script here
    # before the JSON report below is printed.
    ERROR=$(echo "$BUILD_OUTPUT" | grep "error:" | head -10 || true)
    jq -n \
        --arg status "build_failed" \
        --arg error "$ERROR" \
        --argjson build_time_s $((BUILD_END - BUILD_START)) \
        '{status: $status, error: $error, build_time_s: $build_time_s}'
    exit 0
}
BUILD_END=$(date +%s)
echo ">>> Build OK ($(( BUILD_END - BUILD_START ))s)" >&2
|
||||
|
||||
# ---- Phase 2: Benchmark ----
echo ">>> Benchmarking..." >&2
BENCH_START=$(date +%s)

# $BENCH_ENV and $BENCH_ARGS are intentionally unquoted so they word-split
# into zero or more arguments. A non-zero exit (including the 120 s timeout)
# is reported as JSON with status "runtime_crash" and exit code 0.
BENCH_OUTPUT=$(timeout 120 env $BENCH_ENV "$REPO_DIR/build-cuda/bin/llama-bench" \
    -m "$MODEL" -ngl 99 $BENCH_ARGS -r 3 2>&1) || {
    BENCH_END=$(date +%s)
    # `|| true`: under `set -e -o pipefail` a grep without matches would abort
    # the script here, making the "unknown crash" fallback below unreachable.
    ERROR=$(echo "$BENCH_OUTPUT" | grep -iE "error|crash|abort|signal" | head -5 || true)
    jq -n \
        --arg status "runtime_crash" \
        --arg error "${ERROR:-unknown crash}" \
        --argjson build_time_s $((BUILD_END - BUILD_START)) \
        --argjson bench_time_s $((BENCH_END - BENCH_START)) \
        '{status: $status, error: $error, build_time_s: $build_time_s, bench_time_s: $bench_time_s}'
    exit 0
}
BENCH_END=$(date +%s)
|
||||
|
||||
# Parse t/s from llama-bench output
# Format: "| model | size | params | backend | ngl | test | t/s ± err |"
# The t/s value is before the "±" in the last data column
#
# Every helper ends in `|| true`: with `set -e -o pipefail`, a grep that finds
# nothing (or is cut short by `head`) would otherwise abort the whole script
# instead of leaving the value empty for the ${VAR:-...} fallbacks below.

# stdin: raw llama-bench output; stdout: table data rows only
# (lines starting with "|", minus the header and the "---" separator).
bench_data_rows() {
    grep -E "^\|" | grep -v "model" | grep -v "\-\-\-" || true
}

# stdin: table rows; stdout: the first number that precedes a "±", if any.
extract_ts() {
    grep -oP '\d+\.\d+\s*±' | grep -oP '[\d.]+' | head -1 || true
}

# stdin: raw llama-bench output; stdout: t/s of the first data row, if any.
parse_ts() {
    bench_data_rows | extract_ts
}

BENCH_ROWS=$(echo "$BENCH_OUTPUT" | bench_data_rows)
TG128=$(echo "$BENCH_OUTPUT" | parse_ts)

# `wc -l` would report 1 for an empty string, so count only when rows exist.
BENCH_LINES=0
if [[ -n "$BENCH_ROWS" ]]; then
    BENCH_LINES=$(echo "$BENCH_ROWS" | wc -l)
fi

# For tracks with both pp and tg, there will be two data rows:
# the first is taken as pp, the last as tg.
if (( BENCH_LINES > 1 )); then
    PP512=$(echo "$BENCH_ROWS" | sed -n '1p' | extract_ts)
    TG128=$(echo "$BENCH_ROWS" | tail -1 | extract_ts)
else
    PP512=""
fi

echo ">>> Benchmark: tg=${TG128:-?} pp=${PP512:-n/a}" >&2
|
||||
|
||||
# ---- Phase 3: Correctness (unless --quick) ----
# Runs llama-perplexity and reports "ppl_regression" when the result exceeds
# the baseline by more than PPL_THRESHOLD. If no PPL value can be parsed
# (tool failed or timed out) the check is skipped rather than failed.
PPL=""
if [[ "$QUICK" == "false" && -n "$PPL_FILE" && -n "$PPL_BASELINE" && "$PPL_BASELINE" != "0" ]]; then
    echo ">>> PPL check..." >&2
    PPL_OUTPUT=$(timeout 180 env $BENCH_ENV "$REPO_DIR/build-cuda/bin/llama-perplexity" \
        -m "$MODEL" -f "$PPL_FILE" -ngl 99 --chunks 10 2>&1) || true
    # `|| true`: without it, a missing "Final estimate" line makes grep fail
    # and `set -e -o pipefail` aborts the script instead of leaving PPL empty.
    PPL=$(echo "$PPL_OUTPUT" | grep "Final estimate" | grep -oP 'PPL = \K[0-9.]+' || true)

    if [[ -n "$PPL" ]]; then
        # If bc is unavailable or fails, fall back to values that flag a regression.
        PPL_DELTA=$(echo "$PPL - $PPL_BASELINE" | bc -l 2>/dev/null || echo "999")
        if (( $(echo "$PPL_DELTA > $PPL_THRESHOLD" | bc -l 2>/dev/null || echo 1) )); then
            echo ">>> PPL regression: $PPL (baseline: $PPL_BASELINE, delta: $PPL_DELTA)" >&2
            jq -n \
                --arg status "ppl_regression" \
                --argjson ppl "${PPL}" \
                --argjson ppl_baseline "${PPL_BASELINE}" \
                --argjson build_time_s $((BUILD_END - BUILD_START)) \
                --argjson bench_time_s $((BENCH_END - BENCH_START)) \
                --arg tg128 "${TG128:-0}" \
                '{status: $status, tg128: ($tg128|tonumber), ppl: $ppl, ppl_baseline: $ppl_baseline, build_time_s: $build_time_s, bench_time_s: $bench_time_s}'
            exit 0
        fi
        echo ">>> PPL OK: $PPL (baseline: $PPL_BASELINE)" >&2
    fi
fi
|
||||
|
||||
# ---- Phase 4: Compute delta and output ----
BASELINE_TG=$(jq -r '.tg128 // 0' "$TRACK_DIR/baseline.json")
if [[ -n "$TG128" && "$BASELINE_TG" != "0" ]]; then
    # Multiply by 100 BEFORE dividing: with scale=1 bc truncates every division
    # to one decimal, so "(a - b) / b * 100" rounded the ratio to 0.1 first and
    # could only ever report multiples of 10 %.
    DELTA_PCT=$(echo "scale=1; ($TG128 - $BASELINE_TG) * 100 / $BASELINE_TG" | bc -l 2>/dev/null || echo "0")
else
    DELTA_PCT="0"
fi

# GPU temperature (left empty when nvidia-smi is unavailable or fails)
GPU_TEMP=$(nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "")

# pp512 and ppl are added to the report only when they were measured.
jq -n \
    --arg status "success" \
    --arg tg128 "${TG128:-0}" \
    --arg pp512 "${PP512:-}" \
    --arg ppl "${PPL:-}" \
    --argjson baseline_tg128 "${BASELINE_TG}" \
    --arg delta_pct "${DELTA_PCT}%" \
    --argjson build_time_s $((BUILD_END - BUILD_START)) \
    --argjson bench_time_s $((BENCH_END - BENCH_START)) \
    --arg gpu_temp_c "${GPU_TEMP}" \
    '{status: $status, tg128: ($tg128|tonumber), baseline_tg128: $baseline_tg128, delta_pct: $delta_pct, build_time_s: $build_time_s, bench_time_s: $bench_time_s, gpu_temp_c: $gpu_temp_c} + (if $pp512 != "" then {pp512: ($pp512|tonumber)} else {} end) + (if $ppl != "" then {ppl: ($ppl|tonumber)} else {} end)'
|
||||
Executable
+249
@@ -0,0 +1,249 @@
|
||||
#!/usr/bin/env bash
|
||||
# run_track.sh — Outer loop: invoke AI agent to optimize a CUDA kernel track.
|
||||
#
|
||||
# Usage: run_track.sh <track-name> --experiments <N> [--quick-until-improvement] [--max-hours <H>]
|
||||
|
||||
set -uo pipefail
# Note: NOT using set -e — we handle errors explicitly to keep the loop running

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

# ---- Parse args ----
# First positional argument is the track name (a directory next to this
# script); everything after it is an option.
TRACK="${1:?Usage: run_track.sh <track-name> --experiments <N>}"
shift
EXPERIMENTS=10
QUICK_UNTIL_IMPROVEMENT=false
MAX_HOURS=8

while [[ $# -gt 0 ]]; do
    case "$1" in
        --experiments|--max-hours)
            # Without this check a trailing flag dies under `set -u` with a
            # bare "unbound variable" message instead of a usable error.
            if [[ $# -lt 2 ]]; then
                echo "Error: $1 requires a value" >&2
                exit 1
            fi
            if [[ "$1" == "--experiments" ]]; then
                EXPERIMENTS="$2"
            else
                MAX_HOURS="$2"
            fi
            shift 2
            ;;
        --quick-until-improvement) QUICK_UNTIL_IMPROVEMENT=true; shift ;;
        *) echo "Unknown arg: $1" >&2; exit 1 ;;
    esac
done

# Both values are consumed by integer-only constructs later in the script
# (`seq 1 N` and `$(( MAX_HOURS * 3600 ))`), so reject anything that is not a
# plain decimal integer. Leading zeros are refused because shell arithmetic
# would read them as octal.
INT_RE='^(0|[1-9][0-9]*)$'
if ! [[ "$EXPERIMENTS" =~ $INT_RE ]]; then
    echo "Error: --experiments expects a non-negative integer, got: $EXPERIMENTS" >&2
    exit 1
fi
if ! [[ "$MAX_HOURS" =~ $INT_RE ]]; then
    echo "Error: --max-hours expects a non-negative integer, got: $MAX_HOURS" >&2
    exit 1
fi

TRACK_DIR="$SCRIPT_DIR/$TRACK"
if [[ ! -f "$TRACK_DIR/program.md" ]]; then
    echo "Error: $TRACK_DIR/program.md not found" >&2
    exit 1
fi
|
||||
|
||||
# Per-track state files. baseline.json supplies the file the agent may edit
# and is rewritten with fresh measurements; history.jsonl receives one JSON
# line per experiment.
HISTORY_FILE="$TRACK_DIR/history.jsonl"
BASELINE_FILE="$TRACK_DIR/baseline.json"
if [[ ! -f "$BASELINE_FILE" ]]; then
    echo "Error: $BASELINE_FILE not found" >&2
    exit 1
fi
# `// empty` turns a missing key into an empty string rather than the literal
# text "null", which would otherwise be used as a path below.
TARGET_FILE=$(jq -r '.target_file // empty' "$BASELINE_FILE")
if [[ -z "$TARGET_FILE" ]]; then
    echo "Error: no target_file defined in $BASELINE_FILE" >&2
    exit 1
fi
START_TIME=$(date +%s)
MAX_SECONDS=$((MAX_HOURS * 3600))

# ---- Create experiment branch ----
BRANCH_NAME="autoresearch/$TRACK/$(date +%Y%m%d-%H%M%S)"
# Every git command from here on assumes the repository root as cwd.
cd "$REPO_DIR" || { echo "Error: cannot cd to $REPO_DIR" >&2; exit 1; }
if ! git checkout -b "$BRANCH_NAME" 2>/dev/null; then
    # Still best-effort (the run continues), but say so and report the branch
    # that will really receive the commits instead of one that does not exist.
    CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
    echo ">>> WARNING: could not create branch $BRANCH_NAME; continuing on $CURRENT_BRANCH" >&2
    BRANCH_NAME="$CURRENT_BRANCH"
fi
BASELINE_SHA=$(git rev-parse HEAD)
echo ">>> Branch: $BRANCH_NAME (base: ${BASELINE_SHA:0:8})" >&2
|
||||
|
||||
# ---- Establish baseline ----
# Run the unmodified tree once (full mode, no --quick) so later experiments
# are compared against numbers measured in this session.
echo ">>> Establishing baseline..." >&2
BASELINE_RESULT=$("$SCRIPT_DIR/run_experiment.sh" "$TRACK")
jq . <<<"$BASELINE_RESULT" >&2

BASELINE_STATUS=$(jq -r '.status' <<<"$BASELINE_RESULT")
case "$BASELINE_STATUS" in
    success)
        ;;
    *)
        echo "Error: baseline benchmark failed: $BASELINE_STATUS" >&2
        exit 1
        ;;
esac

# Update baseline with current measurements
# Each rewrite goes through a temp file that replaces baseline.json only when
# jq succeeded.
BASELINE_TG=$(jq '.tg128' <<<"$BASELINE_RESULT")
BASELINE_PPL=$(jq '.ppl // empty' <<<"$BASELINE_RESULT")
if jq --argjson tg "$BASELINE_TG" '.tg128 = $tg' "$BASELINE_FILE" > "$BASELINE_FILE.tmp"; then
    mv "$BASELINE_FILE.tmp" "$BASELINE_FILE"
fi
# The result carries a ppl field only when perplexity was measured.
if [[ -n "${BASELINE_PPL:-}" ]]; then
    if jq --argjson ppl "$BASELINE_PPL" '.ppl = $ppl' "$BASELINE_FILE" > "$BASELINE_FILE.tmp"; then
        mv "$BASELINE_FILE.tmp" "$BASELINE_FILE"
    fi
fi

echo ">>> Baseline: ${BASELINE_TG} t/s" >&2
|
||||
|
||||
# ---- Counters ----
CONSECUTIVE_FAILURES=0          # build failures / runtime crashes in a row
CONSECUTIVE_NO_IMPROVEMENT=0    # successful runs in a row that were not faster
TOTAL_KEPT=0
TOTAL_REVERTED=0
BEST_TG=$BASELINE_TG

# ---- Experiment loop ----
# One iteration = ask the agent for a single edit to $TARGET_FILE, benchmark
# it, then either commit it (faster than the best so far) or revert it. Every
# iteration appends one JSON line to $HISTORY_FILE.
for i in $(seq 1 "$EXPERIMENTS"); do
    ELAPSED=$(( $(date +%s) - START_TIME ))
    if (( ELAPSED > MAX_SECONDS )); then
        echo ">>> Time limit reached (${MAX_HOURS}h). Stopping." >&2
        break
    fi

    echo "" >&2
    echo "================================================================" >&2
    echo ">>> Experiment $i / $EXPERIMENTS" >&2
    echo "================================================================" >&2

    # Determine if we should skip PPL this round
    QUICK_FLAG=""
    if [[ "$QUICK_UNTIL_IMPROVEMENT" == "true" ]]; then
        QUICK_FLAG="--quick"
    fi
    # Every 10th experiment always runs PPL
    if (( i % 10 == 0 )); then
        QUICK_FLAG=""
    fi

    # Build the agent prompt
    LAST_RESULT=""
    if [[ -f "$HISTORY_FILE" ]]; then
        LAST_RESULT=$(tail -1 "$HISTORY_FILE" 2>/dev/null || echo "")
    fi

    # After 5 non-improving runs, nudge the agent once and restart the count.
    STALL_HINT=""
    if (( CONSECUTIVE_NO_IMPROVEMENT >= 5 )); then
        STALL_HINT="IMPORTANT: The last $CONSECUTIVE_NO_IMPROVEMENT experiments showed no improvement. Try a fundamentally different approach — different algorithm, different memory access pattern, different thread mapping."
        CONSECUTIVE_NO_IMPROVEMENT=0
    fi

    AGENT_PROMPT="You are optimizing a CUDA kernel. Read the program file and make ONE modification to improve performance.

Read: $TRACK_DIR/program.md
Read: $REPO_DIR/$TARGET_FILE

$(if [[ -n "$LAST_RESULT" ]]; then echo "Last experiment result: $LAST_RESULT"; fi)
$(if [[ -n "$STALL_HINT" ]]; then echo "$STALL_HINT"; fi)

Current best: ${BEST_TG} t/s (baseline: ${BASELINE_TG} t/s)

Rules:
- Modify ONLY: $TARGET_FILE
- Make exactly ONE conceptual change per experiment
- Do NOT run any builds, benchmarks, or git commands
- After editing, briefly describe what you changed and why (one line to stderr)"

    # Invoke the agent
    # All agent output goes to stderr so stdout stays clean. A plain `>&2`
    # replaces the former `tee /dev/stderr | tail -1 > /dev/null`: tee re-opens
    # /dev/stderr, which can truncate the log when stderr is redirected to a
    # regular file.
    echo ">>> Invoking agent..." >&2
    claude -p "$AGENT_PROMPT" \
        --allowedTools Read,Edit \
        --max-turns 20 \
        >&2 || true

    # Check no OTHER source files were modified (ignore config/untracked).
    # This runs before the "did the target change?" test: otherwise edits that
    # touched only non-target files would be reported as "no change" and stay
    # in the working tree for every later build.
    OTHER_CHANGES=$(git diff --name-only -- '*.cu' '*.cuh' '*.cpp' '*.h' '*.c' | grep -vxF "$TARGET_FILE" | head -5)
    if [[ -n "$OTHER_CHANGES" ]]; then
        echo ">>> SAFETY: Agent modified non-target files: $OTHER_CHANGES — reverting all" >&2
        # Revert exactly the modified source files (plus the target) rather
        # than `git checkout -- .`, which would also throw away this script's
        # own uncommitted baseline.json / history.jsonl updates when those
        # files are tracked.
        git diff --name-only -z -- "$TARGET_FILE" '*.cu' '*.cuh' '*.cpp' '*.h' '*.c' | xargs -0 git checkout --
        RESULT='{"status": "safety_revert", "error": "modified non-target files"}'
    elif ! git diff --quiet -- "$TARGET_FILE" 2>/dev/null; then
        # Run the experiment
        echo ">>> Running experiment..." >&2
        # Expands to nothing when QUICK_FLAG is empty (no empty-string argument).
        RESULT=$("$SCRIPT_DIR/run_experiment.sh" "$TRACK" ${QUICK_FLAG:+"$QUICK_FLAG"})
        echo "$RESULT" | jq . >&2
    else
        echo ">>> Agent made no changes. Skipping." >&2
        RESULT='{"status": "no_change"}'
    fi

    STATUS=$(echo "$RESULT" | jq -r '.status')
    TG=$(echo "$RESULT" | jq -r '.tg128 // 0')

    # Decide: keep or revert
    KEPT=false
    if [[ "$STATUS" == "success" ]]; then
        # Check if it's actually faster (bc prints 1 when the comparison holds)
        IMPROVEMENT=$(echo "$TG > $BEST_TG" | bc -l 2>/dev/null || echo 0)
        if [[ "$IMPROVEMENT" == "1" ]]; then
            # Speed improvement found. If we were in quick mode, validate PPL now.
            if [[ -n "$QUICK_FLAG" && "$QUICK_UNTIL_IMPROVEMENT" == "true" ]]; then
                echo ">>> Speed improvement found (+$(echo "$RESULT" | jq -r '.delta_pct')). Validating PPL..." >&2
                PPL_RESULT=$("$SCRIPT_DIR/run_experiment.sh" "$TRACK")
                PPL_STATUS=$(echo "$PPL_RESULT" | jq -r '.status')
                if [[ "$PPL_STATUS" == "success" ]]; then
                    KEPT=true
                else
                    # Keep only a validated change: a crash, build failure or
                    # unparsable output during validation is no more
                    # trustworthy than an explicit PPL regression.
                    if [[ "$PPL_STATUS" == "ppl_regression" ]]; then
                        echo ">>> PPL regression — reverting despite speed gain" >&2
                    else
                        echo ">>> PPL validation failed (${PPL_STATUS:-no status}) — reverting despite speed gain" >&2
                    fi
                    git checkout -- "$TARGET_FILE"
                    KEPT=false
                    if [[ -n "$PPL_STATUS" && "$PPL_STATUS" != "null" ]]; then
                        STATUS="$PPL_STATUS"
                        RESULT="$PPL_RESULT"
                    else
                        # The validation run returned no usable JSON: keep the
                        # quick-run record but relabel it so the history does
                        # not show a reverted "success".
                        STATUS="ppl_validation_failed"
                        RESULT=$(echo "$RESULT" | jq -c '.status = "ppl_validation_failed"')
                    fi
                fi
            else
                KEPT=true
            fi

            if [[ "$KEPT" == "true" ]]; then
                BEST_TG="$TG"
                git add "$TARGET_FILE"
                git commit -m "autoresearch($TRACK): +$(echo "$RESULT" | jq -r '.delta_pct') tg128 (experiment $i)" --no-verify
                # Update baseline
                jq --argjson tg "$TG" '.tg128 = $tg' "$BASELINE_FILE" > "$BASELINE_FILE.tmp" && mv "$BASELINE_FILE.tmp" "$BASELINE_FILE"
                PPL_VAL=$(echo "$RESULT" | jq '.ppl // empty')
                if [[ -n "${PPL_VAL:-}" ]]; then
                    jq --argjson ppl "$PPL_VAL" '.ppl = $ppl' "$BASELINE_FILE" > "$BASELINE_FILE.tmp" && mv "$BASELINE_FILE.tmp" "$BASELINE_FILE"
                fi
                TOTAL_KEPT=$((TOTAL_KEPT + 1))
                CONSECUTIVE_NO_IMPROVEMENT=0
                CONSECUTIVE_FAILURES=0
                echo ">>> KEPT: ${TG} t/s (+$(echo "$RESULT" | jq -r '.delta_pct'))" >&2
            fi
        else
            echo ">>> No improvement (${TG} vs best ${BEST_TG}). Reverting." >&2
            git checkout -- "$TARGET_FILE"
            CONSECUTIVE_NO_IMPROVEMENT=$((CONSECUTIVE_NO_IMPROVEMENT + 1))
        fi
    elif [[ "$STATUS" == "build_failed" ]]; then
        echo ">>> Build failed. Reverting." >&2
        git checkout -- "$TARGET_FILE"
        CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1))
    elif [[ "$STATUS" == "runtime_crash" ]]; then
        echo ">>> Runtime crash. Reverting." >&2
        git checkout -- "$TARGET_FILE"
        CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1))
    else
        echo ">>> Status: $STATUS. Reverting." >&2
        git checkout -- "$TARGET_FILE"
    fi

    if [[ "$KEPT" == "false" ]]; then
        TOTAL_REVERTED=$((TOTAL_REVERTED + 1))
    fi

    # Log to history
    TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    echo "$RESULT" | jq -c --arg exp "$i" --arg ts "$TIMESTAMP" --argjson kept "$KEPT" \
        '. + {experiment: ($exp|tonumber), timestamp: $ts, kept: $kept}' >> "$HISTORY_FILE"

    # Safety: too many consecutive failures
    if (( CONSECUTIVE_FAILURES >= 3 )); then
        echo ">>> WARNING: 3 consecutive failures. Pausing for review." >&2
        echo ">>> Last error: $(echo "$RESULT" | jq -r '.error // .status')" >&2
        CONSECUTIVE_FAILURES=0
        # Don't exit — just reset counter and let the stall hint kick in
    fi

    # GPU cooldown
    sleep 5
done
|
||||
|
||||
# ---- Summary ----
# Final report on stderr: experiment counts, baseline vs. best throughput and
# where the branch and history file can be found.
echo "" >&2
echo "================================================================" >&2
echo ">>> AUTORESEARCH COMPLETE" >&2
echo ">>> Track: $TRACK" >&2
echo ">>> Experiments: $((TOTAL_KEPT + TOTAL_REVERTED))" >&2
echo ">>> Kept: $TOTAL_KEPT" >&2
echo ">>> Reverted: $TOTAL_REVERTED" >&2
echo ">>> Baseline: ${BASELINE_TG} t/s → Best: ${BEST_TG} t/s" >&2
if [[ "$BEST_TG" != "$BASELINE_TG" ]]; then
    # Multiply by 100 BEFORE dividing: with scale=1 bc truncates the quotient
    # to one decimal place, so "(best - base) / base * 100" rounds every gain
    # down to a multiple of 10 and reports gains under 10% as 0.
    TOTAL_GAIN=$(echo "scale=1; ($BEST_TG - $BASELINE_TG) * 100 / $BASELINE_TG" | bc -l 2>/dev/null || echo "?")
    # bc omits the leading zero for values below 1 (".3"); restore it.
    case "$TOTAL_GAIN" in
        .*) TOTAL_GAIN="0$TOTAL_GAIN" ;;
    esac
    echo ">>> Total improvement: +${TOTAL_GAIN}%" >&2
fi
echo ">>> Branch: $BRANCH_NAME" >&2
echo ">>> History: $HISTORY_FILE" >&2
echo "================================================================" >&2
|
||||
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"model": "/tmp/qwen2.5-7b-instruct-q4_0.gguf",
|
||||
"bench_args": "-p 512 -n 128 --cache-type-k turbo3 --cache-type-v turbo3",
|
||||
"tg128": 236.0,
|
||||
"pp512": 15000,
|
||||
"ppl": 0,
|
||||
"ppl_threshold": 0.1,
|
||||
"ppl_file": "/mnt/ai/data/wikitext-2-raw/wiki.test.raw",
|
||||
"target_file": "ggml/src/ggml-cuda/fattn-vec.cuh"
|
||||
}
|
||||
@@ -0,0 +1,102 @@
|
||||
# TurboQuant KV Cache Kernel Optimization
|
||||
|
||||
## Goal
|
||||
Maximize decode t/s for TurboQuant KV cache types (turbo2, turbo3, turbo4) on CUDA
|
||||
(RTX 5090, Blackwell sm_120). Focus on the VEC flash attention decode kernel
|
||||
(`fattn-vec.cuh`) which dominates decode-time compute.
|
||||
|
||||
Current baseline: ~187 t/s with turbo3 KV on Qwen3.5-35B-A3B (Q4_K_M weights).
|
||||
Target: close the gap to q8_0 KV (~200+ t/s).
|
||||
|
||||
## Target File
|
||||
`ggml/src/ggml-cuda/fattn-vec.cuh` — ONLY modify this file.
|
||||
|
||||
## Model & Benchmark
|
||||
- Model: `/mnt/ai/models/huggingface/qwen3.5-35b-a3b-GGUF/Qwen_Qwen3.5-35B-A3B-Q4_K_M.gguf`
|
||||
- Benchmark: `llama-bench -ngl 99 -p 512 -n 128 -r 3 --cache-type-k turbo3 --cache-type-v turbo3`
|
||||
- Correctness: PPL must stay within 0.1 of baseline
|
||||
- Also test: `--cache-type-k turbo4 --cache-type-v turbo4` and `--cache-type-k turbo2 --cache-type-v turbo2`
|
||||
|
||||
## Architecture Overview
|
||||
TurboQuant KV cache compresses K and V tensors using PolarQuant (WHT rotation +
|
||||
Lloyd-Max quantization). Block size = 128, with norm + 2/3/4-bit quantized values.
|
||||
|
||||
### VEC Flash Attention Decode Kernel
|
||||
The VEC kernel handles single-token decode (n_tokens ≤ 2). Each warp computes
|
||||
attention for one head. The kernel has two main phases:
|
||||
|
||||
**KQ scoring (Q × K^T):**
|
||||
- Q is pre-rotated and quantized to q8_1 format
|
||||
- K is stored in turbo format (128-element blocks with norms + quantized values)
|
||||
- Uses shared-memory LUT: precompute Q×centroid products, then score via LUT lookup
|
||||
- turbo3: 8-entry LUT per Q block; turbo2: 4-entry LUT
|
||||
|
||||
**V aggregation (softmax(KQ) × V):**
|
||||
- V is stored in turbo format
|
||||
- Dequant V values, multiply by attention weight, accumulate
|
||||
- Sparse V optimization: skip dequant for negligible attention weights
|
||||
|
||||
### Key Performance Features Already Implemented
|
||||
- Shared-memory Q×centroid LUT (eliminates multiply in KQ inner loop)
|
||||
- q8_1 Q quantization path (int8 Q values for turbo KQ scoring)
|
||||
- __expf fast-math softmax
|
||||
- L2 prefetch for K+V blocks
|
||||
- Sparse V thresholds (skip V dequant for low attention weights)
|
||||
- __launch_bounds__ occupancy 3
|
||||
- nthreads_KQ=8 for turbo types
|
||||
|
||||
## Already Tried — Do NOT Re-explore
|
||||
| Approach | Result | Why it failed |
|
||||
|----------|--------|--------------|
|
||||
| Larger LUT (16-entry for turbo3) | No improvement | 8-entry already covers 3-bit |
|
||||
| Different occupancy (1, 2, 4) | 3 is optimal | Lower occupancy = less latency hiding |
|
||||
| V dequant loop unroll | No improvement | Compiler already unrolling |
|
||||
| `expf` → `__expf` fast-math | Already applied | +0.1%, already in current code |
|
||||
| Sparse V threshold tuning | Already at 1e-3 | Hill-climbed 1e-6→1e-4→5e-4→1e-3→2e-3, diminishing returns. Do NOT keep bumping this — higher thresholds risk PPL regression at long context. The current value is already aggressive. |
|
||||
| L2 prefetch for next K/V blocks | +0.1% | Already tried, marginal gain |
|
||||
| L1 vs L2 prefetch | No difference | Tried both, within noise |
|
||||
| `__launch_bounds__` occupancy 1→2→3 | Occupancy 2 marginally best | Already applied |
|
||||
|
||||
## Promising Directions to Explore
|
||||
Focus on STRUCTURAL changes to the kernel, not parameter tuning.
|
||||
|
||||
### From community discussion (ggml-org/llama.cpp#20969)
|
||||
|
||||
- **Fused K tile loader (dusterbloom/Madreag approach)**: Keep K in compressed TBQ3
|
||||
format in the MMA kernel, fuse dequant into the tile loader. Zero temp buffer for K.
|
||||
This is how Madreag's optimized fork achieves near-parity with q8_0 on prefill.
|
||||
- **cp.async pipeline for V tiles**: Bulk dequant V → fp16, then use cp.async.cg
|
||||
for V tile loads into shared memory. Overlaps V dequant with K scoring compute.
|
||||
- **Hybrid prefill architecture**: Different code paths for prefill (MMA with fused
|
||||
tile loaders) vs decode (VEC with current approach). Prefill benefits most from
|
||||
tile-level fusion.
|
||||
- **Precomputed scaled centroids per V block**: Instead of `centroid[idx] * norm`
|
||||
per element, precompute `scaled_centroid[idx] = centroid[idx] * norm` once per
|
||||
block (4 or 8 entries × 1 float each). Eliminates one multiply per V element.
|
||||
- **Cross-head WHT (AmesianX)**: For models with head_dim=64, apply WHT across
|
||||
multiple KV heads via Kronecker decomposition (H_512 = H_8 ⊗ H_64). Claims
|
||||
better decorrelation for small head dims.
|
||||
|
||||
### Kernel-level ideas
|
||||
|
||||
- **KQ scoring with dp4a**: Q is already q8_1. If K centroids can be mapped to
|
||||
int8 per-block (like we proved with TQ4_0), use dp4a for the KQ dot product.
|
||||
- **Warp specialization**: Dedicate some warps to K prefetch, others to V prefetch.
|
||||
- **Double buffering**: Prefetch next KV block while processing current one
|
||||
using cp.async or separate warp.
|
||||
- **Register pressure reduction**: Profile register usage, reduce if spilling.
|
||||
- **Shared memory V cache**: Cache frequently-accessed V blocks in shmem.
|
||||
- **Half2 accumulation**: Use fp16 for intermediate attention weight accumulation.
|
||||
- **Fused softmax + V aggregation**: Combine the two passes into one.
|
||||
- **Vectorized memory loads**: Use `float4` or `uint4` loads for K/V data.
|
||||
- **Loop interchange**: Change iteration order (heads vs KV positions) for
|
||||
better cache locality.
|
||||
- **Reduce warp reduction overhead**: The `__shfl_xor_sync` reduction at end
|
||||
of KQ scoring runs 5 stages — can we accumulate differently?
|
||||
|
||||
## Constraints
|
||||
- Must not change the turbo block format ABI (shared with Metal/CPU)
|
||||
- Must not modify any file other than fattn-vec.cuh
|
||||
- Must maintain correct attention output (PPL gate catches corruption)
|
||||
- Must work on Blackwell (sm_120) and Ampere (sm_86)
|
||||
- The kernel is templated — changes affect all turbo type instantiations
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"model": "/tmp/qwen2.5-7b-instruct-tq4_1s.gguf",
|
||||
"bench_args": "-p 0 -n 128",
|
||||
"tg128": 69.2,
|
||||
"ppl": 7.599,
|
||||
"ppl_threshold": 0.1,
|
||||
"ppl_file": "/mnt/ai/data/wikitext-2-raw/wiki.test.raw",
|
||||
"target_file": "ggml/src/ggml-cuda/mmvq-tq.cu",
|
||||
"no_convert": true,
|
||||
"coherence_prompts": [
|
||||
{"prompt": "What is the capital of France? One word.", "expect": "Paris"},
|
||||
{"prompt": "What is 2+2? Just the number.", "expect": "4"},
|
||||
{"prompt": "Who wrote Romeo and Juliet? One name.", "expect": "Shakespeare"}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,86 @@
|
||||
{"status":"success","tg128":68.11,"baseline_tg128":67.94,"delta_pct":"0%","build_time_s":4,"bench_time_s":7,"gpu_temp_c":"53","experiment":1,"timestamp":"2026-04-05T16:12:51Z","kept":true}
|
||||
{"status":"success","tg128":53.41,"baseline_tg128":68.11,"delta_pct":"-20.0%","build_time_s":8,"bench_time_s":8,"gpu_temp_c":"53","experiment":2,"timestamp":"2026-04-05T16:24:50Z","kept":false}
|
||||
{"status":"safety_revert","error":"modified non-target files","experiment":3,"timestamp":"2026-04-05T16:31:37Z","kept":false}
|
||||
{"status":"error","error":"No baseline.json found for track: track-weight","experiment":4,"timestamp":"2026-04-05T16:42:20Z","kept":false}
|
||||
{"status":"error","error":"No baseline.json found for track: track-weight","experiment":5,"timestamp":"2026-04-05T16:43:17Z","kept":false}
|
||||
{"status":"error","error":"No baseline.json found for track: track-weight","experiment":6,"timestamp":"2026-04-05T16:44:37Z","kept":false}
|
||||
{"status":"error","error":"No baseline.json found for track: track-weight","experiment":7,"timestamp":"2026-04-05T16:45:37Z","kept":false}
|
||||
{"status":"error","error":"No baseline.json found for track: track-weight","experiment":8,"timestamp":"2026-04-05T16:46:50Z","kept":false}
|
||||
{"status":"error","error":"No baseline.json found for track: track-weight","experiment":9,"timestamp":"2026-04-05T16:47:42Z","kept":false}
|
||||
{"status":"error","error":"No baseline.json found for track: track-weight","experiment":10,"timestamp":"2026-04-05T16:49:54Z","kept":false}
|
||||
{"status":"error","error":"No baseline.json found for track: track-weight","experiment":11,"timestamp":"2026-04-05T16:50:58Z","kept":false}
|
||||
{"status":"error","error":"No baseline.json found for track: track-weight","experiment":12,"timestamp":"2026-04-05T16:51:51Z","kept":false}
|
||||
{"status":"success","tg128":129.00,"baseline_tg128":69.2,"delta_pct":"80.0%","build_time_s":288,"bench_time_s":4,"gpu_temp_c":"59","experiment":13,"timestamp":"2026-04-05T16:59:41Z","kept":true}
|
||||
{"status":"success","tg128":150.85,"baseline_tg128":129.00,"delta_pct":"10.0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"55","experiment":14,"timestamp":"2026-04-05T17:07:38Z","kept":true}
|
||||
{"status":"success","tg128":151.41,"baseline_tg128":150.85,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"50","experiment":15,"timestamp":"2026-04-05T17:09:05Z","kept":true}
|
||||
{"status":"success","tg128":151.79,"baseline_tg128":151.41,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"52","experiment":16,"timestamp":"2026-04-05T17:11:54Z","kept":true}
|
||||
{"status":"success","tg128":151.02,"baseline_tg128":151.79,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"54","experiment":17,"timestamp":"2026-04-05T17:19:57Z","kept":false}
|
||||
{"status":"success","tg128":151.43,"baseline_tg128":151.79,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"51","experiment":18,"timestamp":"2026-04-05T17:26:15Z","kept":false}
|
||||
{"status":"success","tg128":219.12,"baseline_tg128":151.79,"delta_pct":"40.0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"53","experiment":19,"timestamp":"2026-04-05T17:28:31Z","kept":true}
|
||||
{"status":"success","tg128":220.78,"baseline_tg128":219.12,"delta_pct":"0%","build_time_s":9,"bench_time_s":3,"gpu_temp_c":"49","ppl":7.5425,"experiment":20,"timestamp":"2026-04-05T17:30:22Z","kept":true}
|
||||
{"status":"success","tg128":220.46,"baseline_tg128":220.78,"delta_pct":"0%","build_time_s":8,"bench_time_s":2,"gpu_temp_c":"49","experiment":21,"timestamp":"2026-04-05T17:33:02Z","kept":false}
|
||||
{"status":"success","tg128":212.39,"baseline_tg128":220.78,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"52","experiment":22,"timestamp":"2026-04-05T17:35:31Z","kept":false}
|
||||
{"status":"success","tg128":223.52,"baseline_tg128":220.78,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"55","experiment":23,"timestamp":"2026-04-05T17:46:25Z","kept":true}
|
||||
{"status":"success","tg128":223.51,"baseline_tg128":223.52,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"54","experiment":24,"timestamp":"2026-04-05T18:07:17Z","kept":false}
|
||||
{"status":"success","tg128":223.14,"baseline_tg128":223.52,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"53","experiment":25,"timestamp":"2026-04-05T18:20:26Z","kept":false}
|
||||
{"status":"build_failed","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/mmvq-tq.cu(191): error: more than one instance of overloaded function \"__dp4a\" matches the argument list:\n/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/mmvq-tq.cu(192): error: more than one instance of overloaded function \"__dp4a\" matches the argument list:\n/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/mmvq-tq.cu(199): error: more than one instance of overloaded function \"__dp4a\" matches the argument list:\n/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/mmvq-tq.cu(200): error: more than one instance of overloaded function \"__dp4a\" matches the argument list:","build_time_s":1,"experiment":26,"timestamp":"2026-04-05T18:35:52Z","kept":false}
|
||||
{"status":"success","tg128":223.66,"baseline_tg128":223.52,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"56","experiment":27,"timestamp":"2026-04-05T18:37:09Z","kept":true}
|
||||
{"status":"success","tg128":209.52,"baseline_tg128":223.66,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"53","experiment":28,"timestamp":"2026-04-05T18:46:10Z","kept":false}
|
||||
{"status":"success","tg128":223.32,"baseline_tg128":223.66,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"59","experiment":29,"timestamp":"2026-04-05T19:05:19Z","kept":false}
|
||||
{"status":"success","tg128":223.73,"baseline_tg128":223.66,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"56","ppl":7.5425,"experiment":30,"timestamp":"2026-04-05T19:19:40Z","kept":true}
|
||||
{"status":"success","tg128":216.46,"baseline_tg128":223.73,"delta_pct":"0%","build_time_s":8,"bench_time_s":2,"gpu_temp_c":"53","experiment":31,"timestamp":"2026-04-05T19:27:42Z","kept":false}
|
||||
{"status":"no_change","experiment":32,"timestamp":"2026-04-05T19:40:52Z","kept":false}
|
||||
{"status":"success","tg128":223.91,"baseline_tg128":223.73,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"56","experiment":33,"timestamp":"2026-04-05T19:47:35Z","kept":true}
|
||||
{"status":"success","tg128":223.47,"baseline_tg128":223.91,"delta_pct":"0%","build_time_s":8,"bench_time_s":2,"gpu_temp_c":"53","experiment":34,"timestamp":"2026-04-05T19:57:29Z","kept":false}
|
||||
{"status":"success","tg128":223.86,"baseline_tg128":223.91,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"56","experiment":35,"timestamp":"2026-04-05T20:13:18Z","kept":false}
|
||||
{"status":"success","tg128":224.45,"baseline_tg128":223.91,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"56","experiment":36,"timestamp":"2026-04-05T20:41:59Z","kept":true}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x760e8084fb1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x760e795ca737]","build_time_s":8,"bench_time_s":1,"experiment":37,"timestamp":"2026-04-05T20:43:05Z","kept":false}
|
||||
{"status":"success","tg128":221.19,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"49","experiment":38,"timestamp":"2026-04-05T20:44:35Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x74069c1cdb1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x740694fca737]","build_time_s":5,"bench_time_s":1,"experiment":39,"timestamp":"2026-04-05T20:45:49Z","kept":false}
|
||||
{"status":"success","tg128":208.20,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"55","ppl":7.5425,"experiment":40,"timestamp":"2026-04-05T20:52:14Z","kept":false}
|
||||
{"status":"success","tg128":224.23,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"50","experiment":41,"timestamp":"2026-04-05T20:55:47Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x70aa29337b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x70aa225ca737]","build_time_s":5,"bench_time_s":1,"experiment":42,"timestamp":"2026-04-05T20:57:48Z","kept":false}
|
||||
{"status":"success","tg128":219.29,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"55","experiment":43,"timestamp":"2026-04-05T21:04:11Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7c3f28670b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7c3f213ca737]","build_time_s":4,"bench_time_s":1,"experiment":44,"timestamp":"2026-04-05T21:05:11Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x73a442c62b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x73a43b9ca737]","build_time_s":5,"bench_time_s":1,"experiment":45,"timestamp":"2026-04-05T21:06:35Z","kept":false}
|
||||
{"status":"success","tg128":218.41,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"53","experiment":46,"timestamp":"2026-04-05T21:08:49Z","kept":false}
|
||||
{"status":"success","tg128":223.73,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"56","experiment":47,"timestamp":"2026-04-05T21:17:51Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7bad3bc68b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7bad349ca737]","build_time_s":5,"bench_time_s":1,"experiment":48,"timestamp":"2026-04-05T21:18:38Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7f0fe0dd2b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7f0fd9bca737]","build_time_s":6,"bench_time_s":0,"experiment":49,"timestamp":"2026-04-05T21:22:46Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7eb6a0537b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7eb6997ca737]","build_time_s":5,"bench_time_s":1,"experiment":50,"timestamp":"2026-04-05T21:24:04Z","kept":false}
|
||||
{"status":"success","tg128":223.85,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"55","experiment":51,"timestamp":"2026-04-05T21:29:33Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7492739b3b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x74926c7ca737]","build_time_s":5,"bench_time_s":1,"experiment":52,"timestamp":"2026-04-05T21:30:24Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x759986646b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x75997f3ca737]","build_time_s":5,"bench_time_s":0,"experiment":53,"timestamp":"2026-04-05T21:31:34Z","kept":false}
|
||||
{"status":"success","tg128":223.94,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"50","experiment":54,"timestamp":"2026-04-05T21:33:26Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x73c90cb37b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x73c905dca737]","build_time_s":4,"bench_time_s":1,"experiment":55,"timestamp":"2026-04-05T21:34:32Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7f926c661b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7f92653ca737]","build_time_s":5,"bench_time_s":1,"experiment":56,"timestamp":"2026-04-05T21:41:29Z","kept":false}
|
||||
{"status":"success","tg128":225.17,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":6,"bench_time_s":2,"gpu_temp_c":"57","experiment":57,"timestamp":"2026-04-05T21:50:14Z","kept":true}
|
||||
{"status":"success","tg128":222.27,"baseline_tg128":225.17,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"49","experiment":58,"timestamp":"2026-04-05T21:54:06Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7bb3e0137b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7bb3d93ca737]","build_time_s":5,"bench_time_s":1,"experiment":59,"timestamp":"2026-04-05T21:55:23Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7b9533f37b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7b952d1ca737]","build_time_s":5,"bench_time_s":1,"experiment":60,"timestamp":"2026-04-05T21:56:34Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7a274f7b4b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7a27485ca737]","build_time_s":4,"bench_time_s":1,"experiment":61,"timestamp":"2026-04-05T21:57:42Z","kept":false}
|
||||
{"status":"success","tg128":207.76,"baseline_tg128":225.17,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"56","experiment":62,"timestamp":"2026-04-05T22:04:53Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7dee6b826b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7dee645ca737]","build_time_s":5,"bench_time_s":1,"experiment":63,"timestamp":"2026-04-05T22:05:49Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7bddc9b37b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7bddc2dca737]","build_time_s":5,"bench_time_s":1,"experiment":64,"timestamp":"2026-04-05T22:07:04Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x71109ac95b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7110939ca737]","build_time_s":4,"bench_time_s":1,"experiment":65,"timestamp":"2026-04-05T22:11:06Z","kept":false}
|
||||
{"status":"success","tg128":219.26,"baseline_tg128":225.17,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"56","experiment":66,"timestamp":"2026-04-05T22:19:51Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x72b8b9337b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x72b8b25ca737]","build_time_s":6,"bench_time_s":1,"experiment":67,"timestamp":"2026-04-05T22:20:59Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7d8176937b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7d816fbca737]","build_time_s":5,"bench_time_s":1,"experiment":68,"timestamp":"2026-04-05T22:25:11Z","kept":false}
|
||||
{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x784b43537b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x784b3c7ca737]","build_time_s":5,"bench_time_s":1,"experiment":69,"timestamp":"2026-04-05T22:27:18Z","kept":false}
|
||||
{"status":"success","tg128":224.98,"baseline_tg128":225.17,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"58","ppl":7.5425,"experiment":70,"timestamp":"2026-04-05T22:33:37Z","kept":false}
|
||||
{"status":"success","tg128":225.46,"baseline_tg128":225.17,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"51","experiment":71,"timestamp":"2026-04-05T22:39:32Z","kept":true}
|
||||
{"status":"success","tg128":220.92,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":8,"bench_time_s":2,"gpu_temp_c":"55","experiment":72,"timestamp":"2026-04-05T22:51:55Z","kept":false}
|
||||
{"status":"success","tg128":220.68,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"52","experiment":73,"timestamp":"2026-04-05T22:57:46Z","kept":false}
|
||||
{"status":"success","tg128":219.24,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"55","experiment":74,"timestamp":"2026-04-05T23:02:38Z","kept":false}
|
||||
{"status":"success","tg128":219.54,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"53","experiment":75,"timestamp":"2026-04-05T23:07:50Z","kept":false}
|
||||
{"status":"success","tg128":222.66,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"56","experiment":76,"timestamp":"2026-04-05T23:17:39Z","kept":false}
|
||||
{"status":"success","tg128":210.26,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"48","experiment":77,"timestamp":"2026-04-05T23:20:13Z","kept":false}
|
||||
{"status":"success","tg128":220.71,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"53","experiment":78,"timestamp":"2026-04-05T23:22:33Z","kept":false}
|
||||
{"status":"success","tg128":219.25,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"58","experiment":79,"timestamp":"2026-04-05T23:31:36Z","kept":false}
|
||||
{"status":"success","tg128":210.19,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"53","ppl":7.5425,"experiment":80,"timestamp":"2026-04-05T23:38:24Z","kept":false}
|
||||
{"status":"success","tg128":221.41,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"51","experiment":81,"timestamp":"2026-04-05T23:39:51Z","kept":false}
|
||||
{"status":"success","tg128":215.12,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"51","experiment":82,"timestamp":"2026-04-05T23:45:46Z","kept":false}
|
||||
{"status":"success","tg128":210.01,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"55","experiment":83,"timestamp":"2026-04-05T23:52:38Z","kept":false}
|
||||
{"status":"success","tg128":221.42,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"51","experiment":84,"timestamp":"2026-04-05T23:56:44Z","kept":false}
|
||||
{"status":"success","tg128":220.24,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"54","experiment":85,"timestamp":"2026-04-06T00:01:16Z","kept":false}
|
||||
{"status":"success","tg128":225.90,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"56","experiment":86,"timestamp":"2026-04-06T00:20:56Z","kept":true}
|
||||
@@ -0,0 +1,118 @@
|
||||
# TQ4_1S Weight Kernel Optimization
|
||||
|
||||
## Goal
|
||||
Maximize decode t/s for TQ4_1S `mul_mat_vec` on CUDA (RTX 5090, Blackwell sm_120).
|
||||
Current baseline: ~69 t/s. Target: close the gap to q4_0 (267 t/s).
|
||||
|
||||
## Target File
|
||||
`ggml/src/ggml-cuda/mmvq-tq.cu` — ONLY modify this file.
|
||||
|
||||
## Model & Benchmark
|
||||
- Model: `/tmp/qwen2.5-7b-instruct-tq4_1s.gguf`
|
||||
- Benchmark: `llama-bench -ngl 99 -p 0 -n 128 -r 3` (decode only)
|
||||
- Correctness: PPL on wikitext-2 must stay within 0.1 of baseline (7.599)
|
||||
|
||||
## Architecture Overview
|
||||
The TQ4_1S format stores WHT-rotated 4-bit weights with non-linear Lloyd-Max centroids.
|
||||
Block size = 32 elements, dual half-block scales (d0 for [0..15], d1 for [16..31]).
|
||||
20 bytes per block = 5.0 bits/value.
|
||||
|
||||
Dequant per element: `centroid_table[4bit_index] * half_block_scale`
|
||||
Then inverse WHT (Walsh-Hadamard Transform) to recover original weight space.
|
||||
|
||||
The fused mmvq kernel avoids per-block inverse WHT by pre-rotating the activation
|
||||
vector (WHT forward) once, then the inner loop is just:
|
||||
```
|
||||
sum += rotated_activation[lane] * centroid[idx] * d
|
||||
```
|
||||
|
||||
### Current Kernel (V8)
|
||||
- 8 warps per CUDA block (MMVQ_TQ_NWARPS = 8), each warp handles one output row
|
||||
- 32 lanes per warp, each lane handles element `lane` within every block
|
||||
- Activation pre-rotated to float scratch buffer via warp shuffle WHT
|
||||
- Inner loop: 1 float FMA per element per lane
|
||||
- Warp reduction via `__shfl_xor_sync`
|
||||
|
||||
### Block Layout (block_tq4_1s)
|
||||
```c
|
||||
struct block_tq4_1s {
|
||||
half d0; // 2 bytes: scale for elements [0..15]
|
||||
half d1; // 2 bytes: scale for elements [16..31]
|
||||
uint8_t qs[16]; // 16 bytes: 4-bit indices, consecutive pair packing
|
||||
// (qs[j/2] >> ((j&1)*4)) & 0xF = centroid index for element j
|
||||
};
|
||||
```
|
||||
|
||||
### Centroid Table (constant memory)
|
||||
16 Lloyd-Max optimal values for N(0,1):
|
||||
```
|
||||
[-2.733, -2.069, -1.618, -1.256, -0.942, -0.657, -0.388, -0.128,
|
||||
0.128, 0.388, 0.657, 0.942, 1.256, 1.618, 2.069, 2.733]
|
||||
```
|
||||
|
||||
## Confirmed Bottleneck
|
||||
The centroid lookup itself is NOT the bottleneck — confirmed via ablation
|
||||
(replacing `centroid[idx]` with `(idx-8)` gives identical 69 t/s).
|
||||
|
||||
The real bottleneck is:
|
||||
1. **Float32 activation bandwidth**: 4 bytes/element vs q8_1's 1 byte. Each warp
|
||||
reads the full activation vector from global memory per row.
|
||||
2. **Float FMA arithmetic density**: 1 MAC per instruction vs dp4a's 4 MACs.
|
||||
q4_0 processes 8 elements per dp4a pair; V8 processes 1 element per FMA.
|
||||
|
||||
## Already Tried — Do NOT Re-explore
|
||||
| Version | Approach | Result | Why it failed |
|
||||
|---------|----------|--------|--------------|
|
||||
| V9-V11 | Multi-row NR0=2,4 (shmem or registers) | Regressed | __syncthreads overhead or register spill |
|
||||
| V12 | Shmem activation broadcast | ~67 t/s | No help on 5090 (128MB L2 already caching) |
|
||||
| V13 | Loop unroll ×4 | ~69 t/s | Compiler already optimal |
|
||||
| V14 | WMMA tensor cores | 6 t/s | Setup overhead >> throughput for matvec |
|
||||
| V15 | L2 prefetch hints | ~69 t/s | No measurable effect |
|
||||
| V16 | __launch_bounds__ tuning | ~69 t/s | Occupancy changes no effect |
|
||||
| V18 | Per-block int8 LUT + dp4a | 46 t/s | LUT build + pack overhead > dp4a gain |
|
||||
| V19 | ILP 4× unroll | 70 t/s | Negligible improvement |
|
||||
|
||||
## Promising Directions to Explore
|
||||
|
||||
### From community discussion (ggml-org/llama.cpp#20969)
|
||||
|
||||
- **Entropy-coded weight compression (karambaso idea)**: With only 16 centroid
|
||||
values, 4-bit indices have low entropy. Runtime Huffman/ANS decompression in
|
||||
shmem could reduce effective bandwidth 30-50%. Decode is memory-bound, so extra
|
||||
compute for decompression may be free. This reframes the problem: instead of
|
||||
faster dequant, read less data.
|
||||
- **Fused tile loader pattern (from Madreag's KV work)**: Load multiple weight
|
||||
blocks into shmem, dequant in-register from shmem. Amortizes global memory
|
||||
latency across a tile of blocks.
|
||||
- **F32 vs fp16 activation precision**: AmesianX notes WHT amplifies q8_1
|
||||
quantization error ~16x. Our V8 uses f32 activation which avoids this.
|
||||
But fp16 activation would halve bandwidth. Worth testing if the quality
|
||||
tradeoff is acceptable for weights (less sensitive than KV cache).
|
||||
|
||||
### Kernel-level ideas
|
||||
|
||||
- **half2 packed FMA**: Process 2 elements per `__hmul2`/`__hfma2` instruction.
|
||||
Centroids in fp16 constant memory, activation in fp16. 2x arithmetic density.
|
||||
- **Warp-cooperative coalesced loading**: Reorganize memory access so weight loads
|
||||
are fully coalesced (currently scattered due to per-lane block access).
|
||||
- **Register blocking across blocks**: Each lane accumulates across multiple
|
||||
blocks before reducing, keeping partial sums in registers.
|
||||
- **Async memory copy (cp.async)**: Prefetch next block's weight data while
|
||||
computing current block.
|
||||
- **Different warp configurations**: Try 4 or 16 warps instead of 8.
|
||||
- **Two-level tiling**: Load a tile of blocks to shmem, process tile, repeat.
|
||||
This is the same idea as the shmem activation variant (V12), but applied to the weights instead of the activations.
|
||||
- **Vectorized weight loads**: Load 4 bytes (8 nibbles) per lane per iteration
|
||||
instead of extracting one nibble at a time.
|
||||
- **Activation compression**: Quantize pre-rotated activation to fp16 or int8
|
||||
to reduce bandwidth (loses some precision but may be worth it).
|
||||
- **Stream-K style decomposition**: Different work partitioning across warps.
|
||||
- **Per-block centroid pre-scale**: Pre-multiply centroid × d_half into a
|
||||
16-entry fp16 LUT in registers (not shmem). Then inner loop is just
|
||||
`lut[idx] * activation` — a single FMA instead of a multiply followed by an FMA.
|
||||
|
||||
## Constraints
|
||||
- Must not change the block_tq4_1s ABI (format is shared with Metal/CPU)
|
||||
- Must not modify any file other than mmvq-tq.cu
|
||||
- Output must be bit-exact for the same input (deterministic)
|
||||
- Must work on both Blackwell (sm_120) and Ampere (sm_86)
|
||||
Executable
+58
@@ -0,0 +1,58 @@
|
||||
#!/bin/bash
# SMEM Pre-Dequant Benchmark — M5 Max
# Tests SMEM vs baseline at multiple context depths
#
# BEFORE RUNNING:
#   1. cd /Users/tom/local_llms/llama.cpp
#   2. git checkout experiment/smem-pre-dequant
#   3. Build WITHOUT SMEM first (baseline):
#        cmake --build build -j12
#   4. Run: ./scripts/bench-smem-m5.sh baseline
#   5. Build WITH SMEM:
#        TURBO_SMEM_DEQUANT=1 cmake --build build -j12
#   6. Run: ./scripts/bench-smem-m5.sh smem
#
# Uses Qwen3.5-35B-A3B (MoE, fits in memory, attention-heavy)
#
# Arguments:
#   $1  Label for this run; defaults to "baseline". The label "smem" makes the
#       script skip the q8_0 rows.
#
# Environment overrides (each defaults to the path this script always used):
#   LLAMA_BENCH  path to the llama-bench binary
#   MODEL        path to the GGUF model
#   OUTFILE      file that receives a copy of everything printed

set -e

LABEL="${1:-baseline}"
LLAMA_BENCH="${LLAMA_BENCH:-/Users/tom/local_llms/llama.cpp/build/bin/llama-bench}"
MODEL="${MODEL:-/Users/tom/local_llms/models/Qwen3.5-35B-A3B-Q8_0.gguf}"
OUTFILE="${OUTFILE:-/Users/tom/local_llms/llama.cpp/bench-smem-m5-${LABEL}.txt}"

# Values passed to llama-bench -p; 0 is reported as "short".
CONTEXTS=(0 8192 16384 32768)
KV_TYPES=("turbo3" "turbo4" "q8_0")

# The first write truncates OUTFILE; every later write appends.
echo "=== SMEM M5 Benchmark: ${LABEL} ===" | tee "$OUTFILE"
echo "Model: $(basename "$MODEL")" | tee -a "$OUTFILE"
echo "Date: $(date)" | tee -a "$OUTFILE"
echo "" | tee -a "$OUTFILE"

for ctk in "${KV_TYPES[@]}"; do
    for p in "${CONTEXTS[@]}"; do
        if [[ "$ctk" == "q8_0" && "$LABEL" == "smem" ]]; then
            echo "SKIP: q8_0 + smem (q8_0 unaffected by SMEM)" | tee -a "$OUTFILE"
            continue
        fi

        depth_label="short"
        [[ "$p" -gt 0 ]] && depth_label="${p}"

        echo "--- ${ctk} @ ${depth_label} ---" | tee -a "$OUTFILE"

        # K and V caches always use the same type in this sweep.
        ctv="$ctk"
        "$LLAMA_BENCH" \
            -m "$MODEL" \
            -ngl 99 -fa 1 \
            -ctk "$ctk" -ctv "$ctv" \
            -t 1 \
            -p "$p" -n 128 \
            2>&1 | tee -a "$OUTFILE"

        echo "" | tee -a "$OUTFILE"
    done
done

echo "=== Done: ${LABEL} ===" | tee -a "$OUTFILE"
echo "Results saved to: $OUTFILE"
|
||||
Executable
+80
@@ -0,0 +1,80 @@
|
||||
#!/bin/bash
# TurboQuant quality + speed gate — run BEFORE pushing any changes
# Checks: (1) perplexity within 5% of q8_0, (2) context scaling ratio > 0.95
#
# Usage: bash scripts/turbo-quality-gate.sh
# Exit 0 = PASS, Exit 1 = FAIL
#
# Environment overrides:
#   LLAMA  directory containing llama-perplexity
#   MODEL  GGUF model to evaluate
#   WIKI   wikitext-2 raw test file

set -e

LLAMA=${LLAMA:-~/local_llms/llama.cpp/build-turbo/bin}
MODEL=${MODEL:-~/local_llms/models/Qwen3.5-35B-A3B-Q8_0.gguf}
WIKI=${WIKI:-~/local_llms/llama.cpp/wikitext-2-raw/wiki.test.raw}

if [ ! -f "$WIKI" ]; then
    echo "Downloading wikitext-2..."
    bash ~/local_llms/llama.cpp/scripts/get-wikitext-2.sh
fi

FAIL=0

echo "========================================"
echo " TurboQuant Quality + Speed Gate"
echo "========================================"
echo ""

# --- Test 1: Perplexity ---
echo "[1/2] Running perplexity check (8 chunks)..."
# '|| true' keeps 'set -e' from aborting the script when grep finds no match
# (for example because the run crashed); the empty result is reported below.
PPL_TURBO=$("$LLAMA/llama-perplexity" -m "$MODEL" -f "$WIKI" -c 512 -ctk turbo3 -ctv turbo3 -fa on --chunks 8 -ngl 99 2>&1 | grep "Final" | grep -oE 'PPL = [0-9.]+' | grep -oE '[0-9.]+' || true)

if [ -z "$PPL_TURBO" ]; then
    echo " FAIL: Could not get turbo3 perplexity (crash or timeout)"
    FAIL=1
else
    # Hard-coded q8_0 reference perplexity; the limit is 5% above it.
    BASELINE_PPL=6.111
    MAX_PPL=$(echo "$BASELINE_PPL * 1.05" | bc)
    PPL_OK=$(echo "$PPL_TURBO < $MAX_PPL" | bc)
    if [ "$PPL_OK" -eq 1 ]; then
        echo " PASS: turbo3 PPL = $PPL_TURBO (< $MAX_PPL, within 5% of q8_0 $BASELINE_PPL)"
    else
        echo " FAIL: turbo3 PPL = $PPL_TURBO (> $MAX_PPL, exceeds 5% threshold)"
        FAIL=1
    fi
fi
echo ""

# --- Test 2: Context Scaling ---
echo "[2/2] Running context scaling check (4K prefill)..."
# Same '|| true' guard as above so a failed run reaches the FAIL branch.
TURBO_TPS=$("$LLAMA/llama-perplexity" -m "$MODEL" -f "$WIKI" -c 4096 -ctk turbo3 -ctv turbo3 -fa on --chunks 4 -ngl 99 2>&1 | grep "prompt eval" | grep -oE '[0-9.]+ tokens per second' | grep -oE '[0-9.]+' || true)
Q8_TPS=$("$LLAMA/llama-perplexity" -m "$MODEL" -f "$WIKI" -c 4096 -ctk q8_0 -ctv q8_0 -fa on --chunks 4 -ngl 99 2>&1 | grep "prompt eval" | grep -oE '[0-9.]+ tokens per second' | grep -oE '[0-9.]+' || true)

if [ -z "$TURBO_TPS" ] || [ -z "$Q8_TPS" ]; then
    echo " FAIL: Could not measure speed (crash or timeout)"
    echo " turbo3=$TURBO_TPS q8_0=$Q8_TPS"
    FAIL=1
else
    RATIO=$(echo "scale=4; $TURBO_TPS / $Q8_TPS" | bc)
    RATIO_OK=$(echo "$RATIO > 0.95" | bc)
    if [ "$RATIO_OK" -eq 1 ]; then
        echo " PASS: turbo3/q8_0 = ${RATIO}x at 4K context (> 0.95 threshold)"
        echo " turbo3 = $TURBO_TPS tok/s, q8_0 = $Q8_TPS tok/s"
    else
        echo " FAIL: turbo3/q8_0 = ${RATIO}x at 4K context (< 0.95 threshold)"
        echo " turbo3 = $TURBO_TPS tok/s, q8_0 = $Q8_TPS tok/s"
        echo " Context scaling regression detected!"
        FAIL=1
    fi
fi
echo ""

# --- Summary ---
echo "========================================"
if [ "$FAIL" -eq 0 ]; then
    echo " ALL CHECKS PASSED"
    echo "========================================"
    exit 0
else
    echo " CHECKS FAILED — DO NOT PUSH"
    echo "========================================"
    exit 1
fi
|
||||
+27
-2
@@ -3398,8 +3398,16 @@ llama_context * llama_init_from_model(
|
||||
|
||||
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
|
||||
const uint32_t blck_size = ggml_blck_size(params.type_k);
|
||||
const bool k_is_turbo = (params.type_k == GGML_TYPE_TURBO2_0 ||
|
||||
params.type_k == GGML_TYPE_TURBO3_0 ||
|
||||
params.type_k == GGML_TYPE_TURBO4_0);
|
||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||
if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
|
||||
uint32_t head_k = model->hparams.n_embd_head_k(il);
|
||||
// Turbo types zero-pad heads to next multiple of 128 in llama-kv-cache.cpp
|
||||
if (k_is_turbo && head_k % 128 != 0) {
|
||||
head_k = ((head_k + 127) / 128) * 128;
|
||||
}
|
||||
if (head_k % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
|
||||
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
|
||||
return nullptr;
|
||||
@@ -3409,8 +3417,17 @@ llama_context * llama_init_from_model(
|
||||
|
||||
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) {
|
||||
const uint32_t blck_size = ggml_blck_size(params.type_v);
|
||||
const bool v_is_turbo = (params.type_v == GGML_TYPE_TURBO2_0 ||
|
||||
params.type_v == GGML_TYPE_TURBO3_0 ||
|
||||
params.type_v == GGML_TYPE_TURBO4_0);
|
||||
const bool is_mla = model->hparams.is_mla();
|
||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||
if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
|
||||
uint32_t head_v = model->hparams.n_embd_head_v(il);
|
||||
// Turbo types zero-pad; MLA has no separate V cache (V = view of K)
|
||||
if (v_is_turbo && !is_mla && head_v % 128 != 0) {
|
||||
head_v = ((head_v + 127) / 128) * 128;
|
||||
}
|
||||
if (head_v % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
|
||||
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
|
||||
return nullptr;
|
||||
@@ -3418,6 +3435,14 @@ llama_context * llama_init_from_model(
|
||||
}
|
||||
}
|
||||
|
||||
// TurboQuant cache types require flash attention — auto-enable if disabled
|
||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED &&
|
||||
(params.type_k == GGML_TYPE_TURBO2_0 || params.type_k == GGML_TYPE_TURBO3_0 || params.type_k == GGML_TYPE_TURBO4_0 ||
|
||||
params.type_v == GGML_TYPE_TURBO2_0 || params.type_v == GGML_TYPE_TURBO3_0 || params.type_v == GGML_TYPE_TURBO4_0)) {
|
||||
LLAMA_LOG_WARN("%s: turbo cache types require flash_attn — enabling automatically\n", __func__);
|
||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
|
||||
}
|
||||
|
||||
if (ggml_is_quantized(params.type_v) && params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED) {
|
||||
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
|
||||
return nullptr;
|
||||
|
||||
@@ -1958,6 +1958,10 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
k = ggml_permute(ctx0, k, 0, 2, 1, 3);
|
||||
v = ggml_permute(ctx0, v, 0, 2, 1, 3);
|
||||
|
||||
// TurboQuant note: graph-side Q rotation (pre-rotate-queries) is implemented below
|
||||
// in the flash-attn path. The VEC kernel bug (wrong Q/K stride in
|
||||
// vec_dot_fattn_vec_KQ_turbo3_0) was fixed in fattn-common.cuh to match f16 pattern.
|
||||
|
||||
ggml_tensor * cur;
|
||||
|
||||
const bool use_flash_attn = cparams.flash_attn && kq_b == nullptr;
|
||||
@@ -1984,6 +1988,20 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
ggml_flash_attn_ext_add_sinks(cur, sinks);
|
||||
ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
|
||||
|
||||
// TurboQuant: inverse WHT on FA output when V values are WHT-rotated.
|
||||
// For MLA, V is a view of K with different ne[0] (e.g. V=512, K=576).
|
||||
// Group size must come from K (which determines the WHT rotation), not V.
|
||||
if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
|
||||
const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0);
|
||||
const ggml_tensor * group_src = k_is_turbo ? k : v;
|
||||
const int turbo_group = (group_src->ne[0] % 128 == 0) ? 128 : 64;
|
||||
if (cur->ne[0] % turbo_group == 0) {
|
||||
if (!ggml_is_contiguous(cur)) { cur = ggml_cont(ctx0, cur); }
|
||||
ggml_tensor * innerq_scale = mctx ? mctx->get_turbo_innerq_scale_inv() : nullptr;
|
||||
cur = ggml_turbo_wht(ctx0, cur, 1, turbo_group, innerq_scale); // 1 = inverse
|
||||
}
|
||||
}
|
||||
|
||||
if (v_mla) {
|
||||
#if 0
|
||||
// v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
|
||||
@@ -2050,6 +2068,18 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
|
||||
cb(kqv, "kqv", il);
|
||||
|
||||
// TurboQuant: inverse WHT on attention output (non-FA path)
|
||||
if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
|
||||
const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0);
|
||||
const ggml_tensor * group_src = k_is_turbo ? k : v;
|
||||
const int turbo_group = (group_src->ne[0] % 128 == 0) ? 128 : 64;
|
||||
if (kqv->ne[0] % turbo_group == 0) {
|
||||
if (!ggml_is_contiguous(kqv)) { kqv = ggml_cont(ctx0, kqv); }
|
||||
ggml_tensor * innerq_scale = mctx ? mctx->get_turbo_innerq_scale_inv() : nullptr;
|
||||
kqv = ggml_turbo_wht(ctx0, kqv, 1, turbo_group, innerq_scale);
|
||||
}
|
||||
}
|
||||
|
||||
// for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
|
||||
if (v_mla) {
|
||||
kqv = ggml_mul_mat(ctx0, v_mla, kqv);
|
||||
@@ -2067,6 +2097,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
}
|
||||
}
|
||||
|
||||
// TurboQuant: the graph-side inverse WHT on the attention output (undoing the V rotation) is applied earlier in this function, in both the flash-attn and non-FA branches.
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return cur;
|
||||
@@ -2227,9 +2259,48 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
||||
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
||||
|
||||
// TurboQuant pre-rotate-queries: O(d log d) WHT rotation via custom op
|
||||
// Q shape: (n_embd_head, n_head, n_tokens)
|
||||
// For zero-padded models (head_dim not 128-aligned), pad Q to match padded K dim first.
|
||||
if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
|
||||
// Pad Q per-head to next multiple of 128 if needed
|
||||
if (q->ne[0] % 128 != 0) {
|
||||
const int64_t pad = ((q->ne[0] + 127) / 128) * 128 - q->ne[0];
|
||||
q = ggml_pad(ctx0, q, pad, 0, 0, 0);
|
||||
}
|
||||
if (!ggml_is_contiguous(q)) { q = ggml_cont(ctx0, q); }
|
||||
ggml_tensor * innerq_scale = mctx_cur->get_turbo_innerq_scale_inv();
|
||||
q = ggml_turbo_wht(ctx0, q, 0, 0, innerq_scale); // 0 = forward, 0 = auto group size from q->ne[0]
|
||||
}
|
||||
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
// TurboQuant: if V was padded, the output has padded dimensions.
|
||||
// Extract original V head_dim after inverse WHT (applied inside build_attn_mha).
|
||||
// NOTE: gate on v->type (not k->type) for asymmetric configs where K=q8_0 but V=turbo
|
||||
if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
|
||||
const int64_t orig_v_head = hparams.n_embd_head_v(il);
|
||||
// cur is 2D: (n_embd_head * n_head, n_tokens) after build_attn_mha
|
||||
const int64_t padded_v_head = v->ne[0];
|
||||
if (padded_v_head != orig_v_head) {
|
||||
// Reshape to 4D, extract original head_dim, reshape back to 2D
|
||||
// Fix #78 (bingh0): cur shape post-MHA is (n_embd_head * n_head, n_tokens),
|
||||
// not (n_embd_head * n_head_kv, n_tokens). Reshape needs n_head
|
||||
// (Q-head count) so GQA models with n_head != n_head_kv (e.g.
|
||||
// Qwen2.5-0.5B head_dim=64 padded → 128) don't fail the element
|
||||
// count check in ggml_reshape_3d.
|
||||
const int64_t n_head_v = hparams.n_head(il);
|
||||
const int64_t n_tokens_cur = cur->ne[1];
|
||||
cur = ggml_reshape_3d(ctx0, cur, padded_v_head, n_head_v, n_tokens_cur);
|
||||
// ggml_view_3d to extract first orig_v_head elements per head
|
||||
cur = ggml_view_3d(ctx0, cur, orig_v_head, n_head_v, n_tokens_cur,
|
||||
cur->nb[1], cur->nb[2], 0);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
cur = ggml_reshape_2d(ctx0, cur, orig_v_head * n_head_v, n_tokens_cur);
|
||||
}
|
||||
}
|
||||
|
||||
if (inp->self_v_rot) {
|
||||
cur = ggml_mul_mat_aux(ctx0, cur, inp->self_v_rot);
|
||||
}
|
||||
@@ -2318,9 +2389,44 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
||||
ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
|
||||
|
||||
// TurboQuant: pre-rotate Q for K-only (MLA) attention
|
||||
// For zero-padded models, pad Q to match padded K dim first.
|
||||
if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
|
||||
// Pad Q per-head to next multiple of 128 if needed
|
||||
if (q->ne[0] % 128 != 0) {
|
||||
const int64_t pad = ((q->ne[0] + 127) / 128) * 128 - q->ne[0];
|
||||
q = ggml_pad(ctx0, q, pad, 0, 0, 0);
|
||||
}
|
||||
if (!ggml_is_contiguous(q)) { q = ggml_cont(ctx0, q); }
|
||||
ggml_tensor * innerq_scale = mctx_cur->get_turbo_innerq_scale_inv();
|
||||
q = ggml_turbo_wht(ctx0, q, 0, 0, innerq_scale); // 0 = forward, 0 = auto group size
|
||||
}
|
||||
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
// TurboQuant: if V was padded (MLA: V is view of K, may have padded dim),
|
||||
// extract original V head_dim after inverse WHT.
|
||||
if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
|
||||
const int64_t orig_v_head = v_cur->ne[0]; // original V head_dim from model
|
||||
const int64_t padded_v_head = v->ne[0]; // padded V head_dim in cache
|
||||
if (padded_v_head != orig_v_head) {
|
||||
// cur is 2D: (padded_v_head * n_head, n_tokens) after build_attn_mha
|
||||
// Fix #78 (bingh0): cur shape post-MHA is (n_embd_head * n_head, n_tokens),
|
||||
// not (n_embd_head * n_head_kv, n_tokens). Reshape needs n_head
|
||||
// (Q-head count) so GQA models with n_head != n_head_kv (e.g.
|
||||
// Qwen2.5-0.5B head_dim=64 padded → 128) don't fail the element
|
||||
// count check in ggml_reshape_3d.
|
||||
const int64_t n_head_v = hparams.n_head(il);
|
||||
const int64_t n_tokens_cur = cur->ne[1];
|
||||
cur = ggml_reshape_3d(ctx0, cur, padded_v_head, n_head_v, n_tokens_cur);
|
||||
cur = ggml_view_3d(ctx0, cur, orig_v_head, n_head_v, n_tokens_cur,
|
||||
cur->nb[1], cur->nb[2], 0);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
cur = ggml_reshape_2d(ctx0, cur, orig_v_head * n_head_v, n_tokens_cur);
|
||||
}
|
||||
}
|
||||
|
||||
if (wo) {
|
||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
|
||||
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
|
||||
@@ -2406,9 +2512,41 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
||||
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
||||
|
||||
// TurboQuant: pre-rotate Q for ISWA attention (pad to 128-aligned if needed)
|
||||
if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
|
||||
if (q->ne[0] % 128 != 0) {
|
||||
const int64_t pad = ((q->ne[0] + 127) / 128) * 128 - q->ne[0];
|
||||
q = ggml_pad(ctx0, q, pad, 0, 0, 0);
|
||||
}
|
||||
if (!ggml_is_contiguous(q)) { q = ggml_cont(ctx0, q); }
|
||||
ggml_tensor * innerq_scale = mctx_cur->get_turbo_innerq_scale_inv();
|
||||
q = ggml_turbo_wht(ctx0, q, 0, 0, innerq_scale);
|
||||
}
|
||||
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
// TurboQuant: if V was padded, extract original V head_dim after inverse WHT
|
||||
// NOTE: gate on v->type (not k->type) for asymmetric configs where K=q8_0 but V=turbo
|
||||
if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
|
||||
const int64_t orig_v_head = hparams.n_embd_head_v(il);
|
||||
const int64_t padded_v_head = v->ne[0];
|
||||
if (padded_v_head != orig_v_head) {
|
||||
// Fix #78 (bingh0): cur shape post-MHA is (n_embd_head * n_head, n_tokens),
|
||||
// not (n_embd_head * n_head_kv, n_tokens). Reshape needs n_head
|
||||
// (Q-head count) so GQA models with n_head != n_head_kv (e.g.
|
||||
// Qwen2.5-0.5B head_dim=64 padded → 128) don't fail the element
|
||||
// count check in ggml_reshape_3d.
|
||||
const int64_t n_head_v = hparams.n_head(il);
|
||||
const int64_t n_tokens_cur = cur->ne[1];
|
||||
cur = ggml_reshape_3d(ctx0, cur, padded_v_head, n_head_v, n_tokens_cur);
|
||||
cur = ggml_view_3d(ctx0, cur, orig_v_head, n_head_v, n_tokens_cur,
|
||||
cur->nb[1], cur->nb[2], 0);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
cur = ggml_reshape_2d(ctx0, cur, orig_v_head * n_head_v, n_tokens_cur);
|
||||
}
|
||||
}
|
||||
|
||||
if (v_rot) {
|
||||
cur = ggml_mul_mat_aux(ctx0, cur, v_rot);
|
||||
}
|
||||
|
||||
+383
-53
@@ -73,6 +73,30 @@ static ggml_tensor * ggml_mul_mat_aux(
|
||||
return res;
|
||||
}
|
||||
|
||||
// InnerQ: cross-TU shared state for CUDA per-channel equalization.
|
||||
// These are defined in ggml-cuda/turbo-innerq.cu (when CUDA is enabled).
|
||||
// When CUDA is not available, we provide stub implementations.
|
||||
// Length of the InnerQ scale array; the guard lets an earlier definition of
// INNERQ_MAX_CHANNELS take precedence over this default of 128.
#ifndef INNERQ_MAX_CHANNELS
|
||||
#define INNERQ_MAX_CHANNELS 128
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
// CUDA build: the state and the two helpers are only declared here.
// On Windows (excluding MinGW) they are marked __declspec(dllimport);
// on every other platform TURBO_IQ_IMPORT expands to nothing.
#if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# define TURBO_IQ_IMPORT __declspec(dllimport)
|
||||
#else
|
||||
# define TURBO_IQ_IMPORT
|
||||
#endif
|
||||
extern TURBO_IQ_IMPORT bool g_innerq_finalized;
|
||||
extern TURBO_IQ_IMPORT float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS];
|
||||
TURBO_IQ_IMPORT bool turbo_innerq_needs_tensor_update(void);
|
||||
TURBO_IQ_IMPORT void turbo_innerq_mark_tensor_updated(void);
|
||||
#else
|
||||
// Non-CUDA build: file-local stand-ins with the same names.
// turbo_innerq_needs_tensor_update() always returns false and
// turbo_innerq_mark_tensor_updated() does nothing.
static bool g_innerq_finalized = false;
|
||||
static float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS] = {};
|
||||
static bool turbo_innerq_needs_tensor_update(void) { return false; }
|
||||
static void turbo_innerq_mark_tensor_updated(void) {}
|
||||
#endif
|
||||
|
||||
//
|
||||
// llama_kv_cache
|
||||
//
|
||||
@@ -96,6 +120,32 @@ llama_kv_cache::llama_kv_cache(
|
||||
|
||||
GGML_ASSERT(kv_size % n_pad == 0);
|
||||
|
||||
// Auto-asymmetric: when symmetric turbo K+V is requested and the model has
|
||||
// high GQA ratio (few KV heads serving many Q heads), upgrade K to q8_0.
|
||||
// Turbo K quantization error gets amplified by the GQA broadcast factor.
|
||||
// Qwen2.5: 4 KV heads / 28 Q heads = 7:1 → turbo3 K PPL catastrophic (2887 vs 7.4 baseline)
|
||||
// Mistral: 8 KV heads / 32 Q heads = 4:1 → turbo3 K works fine (+4.4% PPL)
|
||||
// Threshold: GQA ratio >= 6 triggers auto-asymmetric.
|
||||
{
|
||||
const bool k_is_turbo = (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0 || type_k == GGML_TYPE_TURBO2_0);
|
||||
if (k_is_turbo) {
|
||||
const uint32_t n_head = hparams.n_head(0);
|
||||
const uint32_t n_head_kv = hparams.n_head_kv(0);
|
||||
const uint32_t gqa_ratio = (n_head_kv > 0) ? n_head / n_head_kv : 1;
|
||||
|
||||
const char * env = getenv("TURBO_AUTO_ASYMMETRIC");
|
||||
const bool disabled = (env && env[0] == '0');
|
||||
|
||||
if (!disabled && gqa_ratio >= 6 && type_k == type_v) {
|
||||
LLAMA_LOG_WARN("%s: auto-asymmetric: GQA ratio %u:1 (n_head=%u, n_head_kv=%u) — "
|
||||
"upgrading K from %s to q8_0 to prevent quality degradation. "
|
||||
"Disable with TURBO_AUTO_ASYMMETRIC=0\n",
|
||||
__func__, gqa_ratio, n_head, n_head_kv, ggml_type_name(type_k));
|
||||
type_k = GGML_TYPE_Q8_0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t n_layer_kv = hparams.n_layer_kv();
|
||||
|
||||
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
|
||||
@@ -111,7 +161,8 @@ llama_kv_cache::llama_kv_cache(
|
||||
auto it = ctx_map.find(buft);
|
||||
if (it == ctx_map.end()) {
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
|
||||
// +3 for turbo rotation matrices (turbo_rotation + turbo_rotation_inv + turbo_innerq_scale_inv)
|
||||
/*.mem_size =*/ size_t((2u*(1 + n_stream)*n_layer_kv + 3)*ggml_tensor_overhead()),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
@@ -204,11 +255,107 @@ llama_kv_cache::llama_kv_cache(
|
||||
throw std::runtime_error("failed to create ggml context for kv cache");
|
||||
}
|
||||
|
||||
// TurboQuant zero-padding: for models with non-128-aligned head_dim (e.g. DeepSeek
|
||||
// head_dim_k=192), pad each head to the next multiple of 128. The padded zeros don't
|
||||
// affect dot products since WHT preserves inner products:
|
||||
// <WHT(Q_padded), WHT(K_padded)> = <Q_padded, K_padded> = <Q, K> + <0, 0> = <Q, K>
|
||||
const uint32_t n_embd_head_k = hparams.n_embd_head_k(il);
|
||||
|
||||
|
||||
const bool has_k = true;
|
||||
const bool has_v = !is_mla;
|
||||
|
||||
ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
|
||||
ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;
|
||||
// Layer-adaptive: use higher precision for quality-sensitive layers
|
||||
// Config: TURBO_LAYER_ADAPTIVE env var controls the strategy
|
||||
// 0 = uniform (default)
|
||||
// 1 = q8_0 K+V for first+last 4 layers
|
||||
// 2 = q8_0 K+V for last 8 layers
|
||||
// 5 = Boundary V: first2+last2 V=turbo4, rest V=turbo2 (K unchanged)
|
||||
// 6 = V-only: last 8 V=turbo4, rest V=turbo2 (K unchanged)
|
||||
// 7 = Boundary V (recommended): first2+last2 V=q8_0, rest V=turbo2 (K unchanged)
|
||||
ggml_type layer_type_k = type_k;
|
||||
ggml_type layer_type_v = type_v;
|
||||
{
|
||||
static const int adaptive_mode = [&]() {
|
||||
const char * env = getenv("TURBO_LAYER_ADAPTIVE");
|
||||
if (env) {
|
||||
int mode = atoi(env);
|
||||
if (mode > 0) {
|
||||
LLAMA_LOG_INFO("llama_kv_cache: layer-adaptive mode %d enabled (env)\n", mode);
|
||||
}
|
||||
return mode;
|
||||
}
|
||||
// Auto-enable Boundary V (mode 7) when V is turbo2
|
||||
if (type_v == GGML_TYPE_TURBO2_0 && hparams.n_layer >= 8) {
|
||||
LLAMA_LOG_INFO("llama_kv_cache: Boundary V auto-enabled for turbo2-V (opt-out: TURBO_LAYER_ADAPTIVE=0)\n");
|
||||
return 7;
|
||||
}
|
||||
return 0;
|
||||
}();
|
||||
const bool is_turbo = (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0 || type_k == GGML_TYPE_TURBO2_0);
|
||||
const bool v_is_turbo = (type_v == GGML_TYPE_TURBO3_0 || type_v == GGML_TYPE_TURBO4_0 || type_v == GGML_TYPE_TURBO2_0);
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
if (adaptive_mode == 1 && is_turbo && n_layer >= 8) {
|
||||
if (il < 4 || il >= n_layer - 4) {
|
||||
layer_type_k = GGML_TYPE_Q8_0;
|
||||
layer_type_v = GGML_TYPE_Q8_0;
|
||||
}
|
||||
} else if (adaptive_mode == 2 && is_turbo && n_layer >= 8) {
|
||||
if (il >= n_layer - 8) {
|
||||
layer_type_k = GGML_TYPE_Q8_0;
|
||||
layer_type_v = GGML_TYPE_Q8_0;
|
||||
}
|
||||
} else if (adaptive_mode == 5 && v_is_turbo && n_layer >= 8) {
|
||||
// Boundary V (turbo4 boundaries): first2+last2 V=turbo4, rest V=turbo2
|
||||
const bool is_boundary = (il < 2 || il >= n_layer - 2);
|
||||
layer_type_v = is_boundary ? GGML_TYPE_TURBO4_0 : GGML_TYPE_TURBO2_0;
|
||||
if (il == 0) {
|
||||
LLAMA_LOG_INFO("llama_kv_cache: Boundary V mode 5: first2+last2 V=turbo4, rest V=turbo2\n");
|
||||
}
|
||||
} else if (adaptive_mode == 6 && v_is_turbo && n_layer >= 8) {
|
||||
// V-only: last 8 V=turbo4, rest V=turbo2
|
||||
layer_type_v = (il >= n_layer - 8) ? GGML_TYPE_TURBO4_0 : GGML_TYPE_TURBO2_0;
|
||||
if (il == 0) {
|
||||
LLAMA_LOG_INFO("llama_kv_cache: V-only LA mode 6: last8 V=turbo4, rest V=turbo2\n");
|
||||
}
|
||||
} else if (adaptive_mode == 7 && v_is_turbo && n_layer >= 8) {
|
||||
// Boundary V (recommended): first2+last2 V=q8_0, rest V=turbo2
|
||||
const bool is_boundary = (il < 2 || il >= n_layer - 2);
|
||||
layer_type_v = is_boundary ? GGML_TYPE_Q8_0 : GGML_TYPE_TURBO2_0;
|
||||
if (il == 0) {
|
||||
LLAMA_LOG_INFO("llama_kv_cache: Boundary V mode 7: first2+last2 V=q8_0, rest V=turbo2\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
// For turbo types, pad K head_dim to next multiple of 128 for full WHT groups
|
||||
uint32_t n_embd_k_gqa_eff = n_embd_k_gqa;
|
||||
const bool k_is_turbo = (layer_type_k == GGML_TYPE_TURBO3_0 || layer_type_k == GGML_TYPE_TURBO4_0 || layer_type_k == GGML_TYPE_TURBO2_0);
|
||||
if (k_is_turbo && n_embd_head_k % 128 != 0) {
|
||||
const uint32_t padded_head_k = ((n_embd_head_k + 127) / 128) * 128;
|
||||
const uint32_t n_head_kv = n_embd_k_gqa / n_embd_head_k;
|
||||
n_embd_k_gqa_eff = n_head_kv * padded_head_k;
|
||||
if (il == 0) {
|
||||
LLAMA_LOG_INFO("%s: turbo zero-padding K head_dim %u -> %u (cache %u -> %u)\n",
|
||||
__func__, n_embd_head_k, padded_head_k, n_embd_k_gqa, n_embd_k_gqa_eff);
|
||||
}
|
||||
}
|
||||
|
||||
// For turbo types, pad V head_dim to next multiple of 128 if needed
|
||||
const uint32_t n_embd_head_v = hparams.n_embd_head_v(il);
|
||||
uint32_t n_embd_v_gqa_eff = n_embd_v_gqa;
|
||||
const bool v_is_turbo = (layer_type_v == GGML_TYPE_TURBO3_0 || layer_type_v == GGML_TYPE_TURBO4_0 || layer_type_v == GGML_TYPE_TURBO2_0);
|
||||
if (v_is_turbo && !is_mla && n_embd_head_v % 128 != 0) {
|
||||
const uint32_t padded_head_v = ((n_embd_head_v + 127) / 128) * 128;
|
||||
const uint32_t n_head_kv = n_embd_v_gqa / n_embd_head_v;
|
||||
n_embd_v_gqa_eff = n_head_kv * padded_head_v;
|
||||
if (il == 0) {
|
||||
LLAMA_LOG_INFO("%s: turbo zero-padding V head_dim %u -> %u (cache %u -> %u)\n",
|
||||
__func__, n_embd_head_v, padded_head_v, n_embd_v_gqa, n_embd_v_gqa_eff);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, layer_type_k, n_embd_k_gqa_eff, kv_size, n_stream) : nullptr;
|
||||
ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, layer_type_v, n_embd_v_gqa_eff, kv_size, n_stream) : nullptr;
|
||||
|
||||
has_k && ggml_format_name(k, "cache_k_l%d", il);
|
||||
has_v && ggml_format_name(v, "cache_v_l%d", il);
|
||||
@@ -217,13 +364,26 @@ llama_kv_cache::llama_kv_cache(
|
||||
std::vector<ggml_tensor *> v_stream;
|
||||
|
||||
for (uint32_t s = 0; s < n_stream; ++s) {
|
||||
k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]) : nullptr);
|
||||
v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr);
|
||||
k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa_eff, kv_size, k->nb[1], s*k->nb[2]) : nullptr);
|
||||
v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa_eff, kv_size, v->nb[1], s*v->nb[2]) : nullptr);
|
||||
}
|
||||
|
||||
map_layer_ids[il] = layers.size();
|
||||
|
||||
layers.push_back({ il, k, v, k_stream, v_stream, });
|
||||
|
||||
// TurboQuant: create rotation matrix tensors (once, shared across layers)
|
||||
if (turbo_rotation == nullptr &&
|
||||
(type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0 || type_k == GGML_TYPE_TURBO2_0)) {
|
||||
turbo_rotation = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 128);
|
||||
ggml_format_name(turbo_rotation, "turbo_rotation"); // R^T
|
||||
turbo_rotation_inv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 128);
|
||||
ggml_format_name(turbo_rotation_inv, "turbo_rotation_inv"); // R
|
||||
|
||||
// InnerQ: per-channel scale_inv tensor (128 floats, initialized to all 1.0)
|
||||
turbo_innerq_scale_inv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, INNERQ_MAX_CHANNELS);
|
||||
ggml_format_name(turbo_innerq_scale_inv, "turbo_innerq_scale_inv");
|
||||
}
|
||||
}
|
||||
|
||||
if (reuse) {
|
||||
@@ -268,6 +428,28 @@ llama_kv_cache::llama_kv_cache(
|
||||
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
||||
|
||||
ggml_backend_buffer_clear(buf, 0);
|
||||
|
||||
// Fill turbo rotation matrices AFTER buffer clear (clear zeroes everything)
|
||||
if (turbo_rotation != nullptr && turbo_rotation->buffer != nullptr && !model.hparams.no_alloc) {
|
||||
#include "turbo-rotation-data.h"
|
||||
// ggml is column-major; C arrays are row-major. Uploading a row-major matrix
|
||||
// unchanged makes ggml read it as its transpose, and ggml_mul_mat(A, x)
|
||||
// computes A^T @ x on that column-major view. The two transposes cancel:
|
||||
// for a row-major stored A, ggml_mul_mat(A, x) yields A @ x (verified by test).
|
||||
// So R is stored as-is for the Q forward rotation, and R^T is stored as-is
|
||||
// for the V inverse rotation.
|
||||
ggml_backend_tensor_set(turbo_rotation, TURBO_ROTATION_R, 0, 128 * 128 * sizeof(float));
|
||||
ggml_backend_tensor_set(turbo_rotation_inv, TURBO_ROTATION_RT, 0, 128 * 128 * sizeof(float));
|
||||
|
||||
// Initialize InnerQ scale_inv to all 1.0 (identity scaling)
|
||||
if (turbo_innerq_scale_inv != nullptr && turbo_innerq_scale_inv->buffer != nullptr) {
|
||||
float ones[INNERQ_MAX_CHANNELS];
|
||||
for (int i = 0; i < INNERQ_MAX_CHANNELS; i++) ones[i] = 1.0f;
|
||||
ggml_backend_tensor_set(turbo_innerq_scale_inv, ones, 0, INNERQ_MAX_CHANNELS * sizeof(float));
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: TurboQuant rotation matrices initialized (128x128)\n", __func__);
|
||||
}
|
||||
ctxs_bufs.emplace_back(std::move(ctx), buf);
|
||||
}
|
||||
|
||||
@@ -281,23 +463,62 @@ llama_kv_cache::llama_kv_cache(
|
||||
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
||||
}
|
||||
|
||||
// TurboQuant: master's #21038 attention rotation is OFF by default on this
|
||||
// fork. Enable per-side via LLAMA_ATTN_ROT_K_OVERRIDE=1 and/or
|
||||
// LLAMA_ATTN_ROT_V_OVERRIDE=1 if your specific model+KV combo benefits.
|
||||
//
|
||||
// Why default OFF: empirical PPL+KLD testing on 7 model families
|
||||
// (gemma-4 26B-A4B/31B/E2B, Qwen2.5-7B, Qwen3.5-2B, Mistral-Small-24B,
|
||||
// phi-4, on q8/turbo4 KV) showed the optimal rotation policy is highly
|
||||
// model-and-quant specific:
|
||||
//
|
||||
// • gemma-4 31B Q8 q8/turbo4: V-only rotation gives -43% PPL (huge win).
|
||||
// • gemma-4 26B-A4B Q8 q8/turbo4: V-only gives -3.9%.
|
||||
// • gemma-4 E2B Q4_K_L q8/turbo4: V-only HURTS by +6.7%.
|
||||
// • phi-4 Q8 q8/turbo4: V-side rotation crashes (graph hash overflow).
|
||||
// • Qwen2.5/3.5/Mistral: rotation effect is within standard error.
|
||||
//
|
||||
// No single default is correct everywhere, including within the same
|
||||
// architecture family (gemma-4 above shows three distinct optima across
|
||||
// three sizes). Per-arch heuristics in code would silently regress users
|
||||
// on variants we haven't tested. Default OFF + per-side env knobs lets
|
||||
// each user tune for their specific config; documented findings in the
|
||||
// README guide the choice.
|
||||
//
|
||||
// Reported by @erazortt (TheTom/turboquant_plus#88).
|
||||
//
|
||||
// LLAMA_ATTN_ROT_DISABLE is retained for historical scripts. With rotation
|
||||
// OFF by default it is redundant on its own, but it is not a no-op:
|
||||
// attn_rot_disable defaults to false, and setting the env var to a non-zero
|
||||
// value acts as a hard lock-out (forces rotation off and blocks overrides),
|
||||
// useful for users who want to guarantee no rotation regardless of any
|
||||
// LLAMA_ATTN_ROT_*_OVERRIDE settings.
|
||||
const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE");
|
||||
const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? atoi(LLAMA_ATTN_ROT_DISABLE) : false;
|
||||
if (attn_rot_disable) {
|
||||
LLAMA_LOG_WARN("%s: attention rotation force disabled (LLAMA_ATTN_ROT_DISABLE)\n", __func__);
|
||||
const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? (atoi(LLAMA_ATTN_ROT_DISABLE) != 0) : false;
|
||||
|
||||
// Default: rotation OFF on both sides (safe across all tested model families).
|
||||
// Override per side via env vars below.
|
||||
attn_rot_k = false;
|
||||
attn_rot_v = false;
|
||||
|
||||
// Per-side overrides. Set LLAMA_ATTN_ROT_K_OVERRIDE=1 / LLAMA_ATTN_ROT_V_OVERRIDE=1
|
||||
// to enable rotation. The cache type and head-dim alignment guards below
|
||||
// still apply: rotation only takes effect on quantized types with
|
||||
// head_dim % 64 == 0 (master's #21038 requirements).
|
||||
const char * ROT_K_OV = getenv("LLAMA_ATTN_ROT_K_OVERRIDE");
|
||||
if (ROT_K_OV && atoi(ROT_K_OV) != 0 && !attn_rot_disable) {
|
||||
attn_rot_k =
|
||||
n_embd_head_k_all > 0 &&
|
||||
ggml_is_quantized(type_k) &&
|
||||
hparams.n_embd_head_k() % 64 == 0;
|
||||
}
|
||||
const char * ROT_V_OV = getenv("LLAMA_ATTN_ROT_V_OVERRIDE");
|
||||
if (ROT_V_OV && atoi(ROT_V_OV) != 0 && !attn_rot_disable) {
|
||||
attn_rot_v =
|
||||
n_embd_head_v_all > 0 &&
|
||||
ggml_is_quantized(type_v) &&
|
||||
hparams.n_embd_head_v() % 64 == 0;
|
||||
}
|
||||
|
||||
attn_rot_k =
|
||||
!attn_rot_disable &&
|
||||
n_embd_head_k_all > 0 &&
|
||||
ggml_is_quantized(type_k) &&
|
||||
hparams.n_embd_head_k() % 64 == 0;
|
||||
|
||||
attn_rot_v =
|
||||
!attn_rot_disable &&
|
||||
n_embd_head_v_all > 0 &&
|
||||
ggml_is_quantized(type_v) &&
|
||||
hparams.n_embd_head_v() % 64 == 0;
|
||||
|
||||
LLAMA_LOG_INFO("%s: attn_rot_k = %d, n_embd_head_k_all = %d\n", __func__, attn_rot_k, n_embd_head_k_all);
|
||||
LLAMA_LOG_INFO("%s: attn_rot_v = %d, n_embd_head_k_all = %d\n", __func__, attn_rot_v, n_embd_head_v_all);
|
||||
@@ -337,6 +558,20 @@ void llama_kv_cache::clear(bool data) {
|
||||
for (auto & [_, buf] : ctxs_bufs) {
|
||||
ggml_backend_buffer_clear(buf.get(), 0);
|
||||
}
|
||||
|
||||
// Re-initialize turbo rotation matrices after buffer clear (clear zeroes everything)
|
||||
if (turbo_rotation != nullptr && turbo_rotation->buffer != nullptr) {
|
||||
#include "turbo-rotation-data.h"
|
||||
ggml_backend_tensor_set(turbo_rotation, TURBO_ROTATION_R, 0, 128 * 128 * sizeof(float));
|
||||
ggml_backend_tensor_set(turbo_rotation_inv, TURBO_ROTATION_RT, 0, 128 * 128 * sizeof(float));
|
||||
|
||||
// Re-initialize InnerQ scale_inv to all 1.0
|
||||
if (turbo_innerq_scale_inv != nullptr && turbo_innerq_scale_inv->buffer != nullptr) {
|
||||
float ones[INNERQ_MAX_CHANNELS];
|
||||
for (int i = 0; i < INNERQ_MAX_CHANNELS; i++) ones[i] = 1.0f;
|
||||
ggml_backend_tensor_set(turbo_innerq_scale_inv, ones, 0, INNERQ_MAX_CHANNELS * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1150,13 +1385,24 @@ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_k
|
||||
const uint64_t kv_size = get_size();
|
||||
const uint64_t n_embd_k_gqa = k->ne[0];
|
||||
|
||||
assert(n_embd_k_gqa == hparams.n_embd_k_gqa(il));
|
||||
// For turbo-padded caches, n_embd_k_gqa may be larger than hparams value
|
||||
const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0);
|
||||
if (k_is_turbo) {
|
||||
assert(n_embd_k_gqa >= hparams.n_embd_k_gqa(il));
|
||||
} else {
|
||||
assert(n_embd_k_gqa == hparams.n_embd_k_gqa(il));
|
||||
}
|
||||
|
||||
// Use padded head_dim for turbo types so the full padded data is returned
|
||||
const uint32_t head_k = hparams.n_embd_head_k(il);
|
||||
const uint32_t head_k_eff = (k_is_turbo && head_k % 128 != 0)
|
||||
? ((head_k + 127) / 128) * 128 : head_k;
|
||||
|
||||
const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
|
||||
|
||||
return ggml_view_4d(ctx, k,
|
||||
hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(k->type, hparams.n_embd_head_k(il)),
|
||||
head_k_eff, hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(k->type, head_k_eff),
|
||||
ggml_row_size(k->type, n_embd_k_gqa),
|
||||
ggml_row_size(k->type, n_embd_k_gqa*kv_size),
|
||||
ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
|
||||
@@ -1170,27 +1416,33 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
|
||||
const uint64_t kv_size = get_size();
|
||||
const uint64_t n_embd_v_gqa = v->ne[0];
|
||||
|
||||
// [TAG_V_CACHE_VARIABLE]
|
||||
// [TAG_V_CACHE_VARIABLE] — for turbo-padded V, cache may be larger
|
||||
assert(n_embd_v_gqa >= hparams.n_embd_v_gqa(il));
|
||||
|
||||
// Use padded head_dim for turbo types
|
||||
const bool v_is_turbo = (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0);
|
||||
const uint32_t head_v = hparams.n_embd_head_v(il);
|
||||
const uint32_t head_v_eff = (v_is_turbo && head_v % 128 != 0)
|
||||
? ((head_v + 127) / 128) * 128 : head_v;
|
||||
|
||||
const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
|
||||
|
||||
if (!v_trans) {
|
||||
// note: v->nb[1] <= v->nb[2]
|
||||
return ggml_view_4d(ctx, v,
|
||||
hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(v->type, hparams.n_embd_head_v(il)), // v->nb[1]
|
||||
ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
|
||||
ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
|
||||
head_v_eff, hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(v->type, head_v_eff), // v->nb[1]
|
||||
ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
|
||||
ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
|
||||
ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
|
||||
}
|
||||
|
||||
// note: v->nb[1] > v->nb[2]
|
||||
return ggml_view_4d(ctx, v,
|
||||
n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns,
|
||||
ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)), // v->nb[1]
|
||||
ggml_row_size(v->type, kv_size), // v->nb[2]
|
||||
ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
|
||||
n_kv, hparams.n_head_kv(il), head_v_eff, ns,
|
||||
ggml_row_size(v->type, kv_size*head_v_eff), // v->nb[1]
|
||||
ggml_row_size(v->type, kv_size), // v->nb[2]
|
||||
ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
|
||||
ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
|
||||
}
|
||||
|
||||
@@ -1201,11 +1453,22 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm
|
||||
|
||||
ggml_tensor * k = layers[ikv].k;
|
||||
|
||||
const int64_t n_embd_head = k_cur->ne[0];
|
||||
int64_t n_embd_head = k_cur->ne[0];
|
||||
const int64_t n_head = k_cur->ne[1];
|
||||
const int64_t n_tokens = k_cur->ne[2];
|
||||
|
||||
const int64_t n_embd_gqa = n_embd_head*n_head;
|
||||
// Turbo zero-padding: pad each head to next multiple of 128 before merging dims.
|
||||
// k_cur shape here is (n_embd_head, n_head, n_tokens).
|
||||
// ggml_pad pads ne[0] with zeros — exactly what we need per-head.
|
||||
const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0);
|
||||
const bool k_needs_pad = k_is_turbo && (n_embd_head % 128 != 0);
|
||||
if (k_needs_pad) {
|
||||
const int64_t pad_amount = ((n_embd_head + 127) / 128) * 128 - n_embd_head;
|
||||
k_cur = ggml_pad(ctx, k_cur, pad_amount, 0, 0, 0);
|
||||
n_embd_head = k_cur->ne[0]; // now 128-aligned
|
||||
}
|
||||
|
||||
int64_t n_embd_gqa = n_embd_head * n_head;
|
||||
|
||||
// we can merge dims 0 and 1
|
||||
// TODO: add ggml helper function for this?
|
||||
@@ -1226,7 +1489,16 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm
|
||||
}
|
||||
|
||||
// store the current K values into the cache
|
||||
return ggml_set_rows(ctx, k, k_cur, k_idxs);
|
||||
ggml_tensor * result = ggml_set_rows(ctx, k, k_cur, k_idxs);
|
||||
|
||||
// For turbo: store WHT group size in op_params so the CUDA kernel knows.
|
||||
// With zero-padding, all groups are always full 128-element WHT groups.
|
||||
if (k_is_turbo) {
|
||||
int32_t wht_group = 128; // always 128 with padding
|
||||
memcpy(result->op_params, &wht_group, sizeof(int32_t));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
|
||||
@@ -1236,11 +1508,20 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
|
||||
|
||||
auto * v = layers[ikv].v;
|
||||
|
||||
const int64_t n_embd_head = v_cur->ne[0];
|
||||
int64_t n_embd_head = v_cur->ne[0];
|
||||
const int64_t n_head = v_cur->ne[1];
|
||||
const int64_t n_tokens = v_cur->ne[2];
|
||||
|
||||
const int64_t n_embd_gqa = n_embd_head*n_head;
|
||||
// Turbo zero-padding: pad V head_dim to next multiple of 128
|
||||
const bool v_is_turbo = (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0);
|
||||
const bool v_needs_pad = v_is_turbo && (n_embd_head % 128 != 0);
|
||||
if (v_needs_pad) {
|
||||
const int64_t pad_amount = ((n_embd_head + 127) / 128) * 128 - n_embd_head;
|
||||
v_cur = ggml_pad(ctx, v_cur, pad_amount, 0, 0, 0);
|
||||
n_embd_head = v_cur->ne[0]; // now 128-aligned
|
||||
}
|
||||
|
||||
int64_t n_embd_gqa = n_embd_head * n_head;
|
||||
|
||||
// we can merge dims 0 and 1
|
||||
GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);
|
||||
@@ -1261,7 +1542,13 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
|
||||
v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
|
||||
}
|
||||
|
||||
return ggml_set_rows(ctx, v, v_cur, v_idxs);
|
||||
ggml_tensor * result = ggml_set_rows(ctx, v, v_cur, v_idxs);
|
||||
// With zero-padding, all groups are always full 128-element WHT groups
|
||||
if (v_is_turbo) {
|
||||
int32_t wht_group = 128; // always 128 with padding
|
||||
memcpy(result->op_params, &wht_group, sizeof(int32_t));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
|
||||
@@ -1315,14 +1602,23 @@ ggml_tensor * llama_kv_cache::build_input_k_rot(ggml_context * ctx) const {
|
||||
ggml_tensor * res = nullptr;
|
||||
|
||||
if (attn_rot_k) {
|
||||
int nrot = 64;
|
||||
|
||||
// TODO: investigate if using the smallest rotation matrix is beneficial also for K (similar as for V)
|
||||
// EXPERIMENT (master TODO): force smallest rotation matrix (nrot=64)
|
||||
// for K, mirroring V's choice. Master defaults to the largest power-of-2
|
||||
// that divides head_dim, but the upstream comment hypothesizes smaller
|
||||
// tiles preserve more local structure → less PPL hit on sensitive models
|
||||
// (gemma-4 26B-A4B reportedly regresses with the largest tile).
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088
|
||||
do {
|
||||
nrot *= 2;
|
||||
} while (n_embd_head_k_all % nrot == 0);
|
||||
nrot /= 2;
|
||||
const char * LLAMA_ATTN_ROT_K_NROT = getenv("LLAMA_ATTN_ROT_K_NROT");
|
||||
int nrot = LLAMA_ATTN_ROT_K_NROT ? atoi(LLAMA_ATTN_ROT_K_NROT) : 64;
|
||||
|
||||
// Original master behavior (largest power-of-2): set LLAMA_ATTN_ROT_K_NROT=0
|
||||
if (nrot == 0) {
|
||||
nrot = 64;
|
||||
do {
|
||||
nrot *= 2;
|
||||
} while (n_embd_head_k_all % nrot == 0);
|
||||
nrot /= 2;
|
||||
}
|
||||
|
||||
res = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot);
|
||||
ggml_set_input(res);
|
||||
@@ -1980,10 +2276,11 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
|
||||
for (const auto & layer : layers) {
|
||||
const uint32_t il = layer.il;
|
||||
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
|
||||
auto * k = layer.k_stream[cr.strm];
|
||||
|
||||
// Use actual tensor width (may be padded for turbo types: e.g. 576→640)
|
||||
const uint32_t n_embd_k_gqa = (uint32_t) k->ne[0];
|
||||
|
||||
// Write key type
|
||||
const int32_t k_type_i = (int32_t) k->type;
|
||||
io.write(&k_type_i, sizeof(k_type_i));
|
||||
@@ -2004,13 +2301,14 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
|
||||
for (const auto & layer : layers) {
|
||||
const uint32_t il = layer.il;
|
||||
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
||||
|
||||
auto * v = layer.v_stream[cr.strm];
|
||||
if (!v) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Use actual tensor width (may be padded for turbo types)
|
||||
const uint32_t n_embd_v_gqa = (uint32_t) v->ne[0];
|
||||
|
||||
// Write value type
|
||||
const int32_t v_type_i = (int32_t) v->type;
|
||||
io.write(&v_type_i, sizeof(v_type_i));
|
||||
@@ -2212,10 +2510,11 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
|
||||
for (const auto & layer : layers) {
|
||||
const uint32_t il = layer.il;
|
||||
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
|
||||
auto * k = layer.k_stream[strm];
|
||||
|
||||
// Use actual tensor width (may be padded for turbo types)
|
||||
const uint32_t n_embd_k_gqa = (uint32_t) k->ne[0];
|
||||
|
||||
// Read type of key
|
||||
int32_t k_type_i_ref;
|
||||
io.read(&k_type_i_ref, sizeof(k_type_i_ref));
|
||||
@@ -2252,13 +2551,14 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
|
||||
for (const auto & layer : layers) {
|
||||
const uint32_t il = layer.il;
|
||||
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
||||
|
||||
auto * v = layer.v_stream[strm];
|
||||
if (!v) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Use actual tensor width (may be padded for turbo types)
|
||||
const uint32_t n_embd_v_gqa = (uint32_t) v->ne[0];
|
||||
|
||||
// Read type of value
|
||||
int32_t v_type_i_ref;
|
||||
io.read(&v_type_i_ref, sizeof(v_type_i_ref));
|
||||
@@ -2416,6 +2716,16 @@ bool llama_kv_cache_context::apply() {
|
||||
kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
|
||||
n_kv = kv->get_n_kv(sinfos[i_cur]);
|
||||
|
||||
// InnerQ: check if CUDA calibration finalized and tensor needs update
|
||||
if (kv->get_turbo_innerq_scale_inv() != nullptr && turbo_innerq_needs_tensor_update()) {
|
||||
ggml_tensor * t = kv->get_turbo_innerq_scale_inv();
|
||||
if (t->buffer != nullptr) {
|
||||
ggml_backend_tensor_set(t, g_innerq_scale_inv_host, 0, INNERQ_MAX_CHANNELS * sizeof(float));
|
||||
turbo_innerq_mark_tensor_updated();
|
||||
LLAMA_LOG_INFO("%s: InnerQ scale_inv tensor updated\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -2449,6 +2759,26 @@ ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) cons
|
||||
return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
|
||||
}
|
||||
|
||||
// Returns the turbo rotation tensor held by the underlying KV cache
// (forwards to kv->get_turbo_rotation()).
ggml_tensor * llama_kv_cache_context::get_turbo_rotation() const {
|
||||
return kv->get_turbo_rotation();
|
||||
}
|
||||
|
||||
// Returns the inverse turbo rotation tensor held by the underlying KV cache
// (forwards to kv->get_turbo_rotation_inv()).
ggml_tensor * llama_kv_cache_context::get_turbo_rotation_inv() const {
|
||||
return kv->get_turbo_rotation_inv();
|
||||
}
|
||||
|
||||
// Alternate name for the forward rotation accessor: forwards to the same
// kv->get_turbo_rotation() call as get_turbo_rotation().
ggml_tensor * llama_kv_cache_context::get_turbo_rot_forward() const {
|
||||
return kv->get_turbo_rotation();
|
||||
}
|
||||
|
||||
// Alternate name for the inverse rotation accessor: forwards to the same
// kv->get_turbo_rotation_inv() call as get_turbo_rotation_inv().
ggml_tensor * llama_kv_cache_context::get_turbo_rot_inverse() const {
|
||||
return kv->get_turbo_rotation_inv();
|
||||
}
|
||||
|
||||
// Returns the InnerQ scale_inv tensor held by the underlying KV cache
// (forwards to kv->get_turbo_innerq_scale_inv()); callers in this file
// check the result against nullptr before using it.
ggml_tensor * llama_kv_cache_context::get_turbo_innerq_scale_inv() const {
|
||||
return kv->get_turbo_innerq_scale_inv();
|
||||
}
|
||||
|
||||
ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
|
||||
return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user