turboquant: squash-merge TheTom/llama-cpp-turboquant feature/turboquant-kv-cache

Squashes the entire TurboQuant KV-cache feature branch from https://github.com/TheTom/llama-cpp-turboquant (tip 5aeb2fdbe) onto our master.

Includes: TurboQuant KV-cache types (turbo2_0, turbo3_0, turbo4_0, tq3_1s, tq4_1s), GGML_OP_TURBO_WHT op, CUDA + Metal kernels (including TQ-rotated mul_mm path), CPU reference paths, HIP template instances, perplexity tooling, and 18 post-upstream-sync fixes (CVE-2026-21869 server clamp, HIP FA pool retention, n_head_v reshape, sparse-V CUDA gating, etc.).

Conflict-resolution notes (review carefully before depending on these paths):
- common/arg.cpp, common/speculative.cpp: master's refactored speculative API kept (params.speculative.types / ngram_mod struct, per-sinfo n_low/i_last).
- ggml-cuda/fattn.cu: head-size exclusion lists unioned (now exclude both 192 and 640 alongside other sizes).
- ggml-cuda/ggml-cuda.cu: both master's ADD/SUB/MUL/DIV F16 widening AND TurboQuant's GGML_OP_TURBO_WHT support cases kept.
- ggml-metal-device.h/.cpp: master's new get_pipeline_mul_mv_ext signature (const ggml_tensor * op) kept; TurboQuant's get_pipeline_turbo_wht added.
- ggml-metal-ops.cpp: TurboQuant's TQ-rotated mul_mm path preserved; non-TQ else-branch adapted to master's pipeline.nr0/nr1/nsg dispatch API.
- ggml-vulkan.cpp: master's spec-constant-driven flash_attn pipeline iteration taken (over TurboQuant's CREATE_FA-per-type macro approach). TURBO3_0 added to the fa_kv_ok lambda for type validation.
- ggml-vulkan/flash_attn_base.glsl, vulkan-shaders-gen.cpp: master's new spec-constant FA shader generation kept; TurboQuant's DATA_A_TURBO3_0 macro path NOT carried over.
  *** Vulkan TURBO3_0 flash-attention paths need re-implementation against the new spec-constant API.
  *** Vulkan TURBO3_0 inference will likely fail until that work is redone.

Squash base: 7fc1c4ef78 (TheTom's last upstream merge point).
2026-05-13 23:01:46 +02:00
parent d14ce3dab4
commit ddebb5ddf6
115 changed files with 24655 additions and 404 deletions
@@ -16,9 +16,9 @@
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
+  spirv-headers,
  openssl,
  shaderc,
-  spirv-headers,
  useBlas ?
    builtins.all (x: !x) [
      useCuda
@@ -0,0 +1 @@
+github: [TheTom]
@@ -0,0 +1,109 @@
+name: TurboQuant+ Release
+
+on:
+  push:
+    tags:
+      - 'tqp-v*'
+
+env:
+  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON"
+
+jobs:
+  macos-metal:
+    runs-on: macos-14
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Build
+        run: |
+          cmake -B build \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DCMAKE_INSTALL_RPATH='@loader_path' \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Pack
+        run: |
+          cp LICENSE ./build/bin/
+          tar -czvf turboquant-plus-${{ github.ref_name }}-macos-arm64-metal.tar.gz \
+            -s ",./,turboquant-plus-${{ github.ref_name }}/," -C ./build/bin .
+
+      - name: Upload
+        uses: actions/upload-artifact@v6
+        with:
+          name: macos-arm64-metal
+          path: turboquant-plus-${{ github.ref_name }}-macos-arm64-metal.tar.gz
+
+  windows-cuda:
+    runs-on: windows-2022
+
+    strategy:
+      matrix:
+        cuda: ['12.4']
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+
+      - name: Install Cuda Toolkit
+        uses: ./.github/actions/windows-setup-cuda
+        with:
+          cuda_version: ${{ matrix.cuda }}
+
+      - name: Install Ninja
+        run: choco install ninja
+
+      - name: Build
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_CUDA=ON ^
+            -DGGML_CUDA_FA_ALL_QUANTS=ON ^
+            ${{ env.CMAKE_ARGS }}
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS%
+
+      - name: Pack
+        run: |
+          cp LICENSE ./build/bin/Release/
+          $dst='.\build\bin\Release\'
+          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          7z a turboquant-plus-${{ github.ref_name }}-windows-x64-cuda${{ matrix.cuda }}.zip .\build\bin\Release\*
+
+      - name: Upload
+        uses: actions/upload-artifact@v6
+        with:
+          name: windows-x64-cuda${{ matrix.cuda }}
+          path: turboquant-plus-${{ github.ref_name }}-windows-x64-cuda${{ matrix.cuda }}.zip
+
+  release:
+    needs: [macos-metal, windows-cuda]
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+
+    steps:
+      - name: Download artifacts
+        uses: actions/download-artifact@v7
+        with:
+          path: ./release
+          merge-multiple: true
+
+      - name: Create Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: ${{ github.ref_name }}
+          name: TurboQuant+ ${{ github.ref_name }}
+          files: ./release/*
+          draft: false
+          prerelease: false
@@ -0,0 +1,362 @@
+=== SMEM M5 Benchmark: baseline ===
+Model: Qwen3.5-35B-A3B-Q8_0.gguf
+Date: Sat Mar 28 21:45:40 CDT 2026
+
+--- turbo3 @ short ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x105cffcb0 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x105cfeb30 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 6.440 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |           tg128 |         78.47 ± 0.56 |
+
+build: 13afec1 (178)
+
+--- turbo3 @ 8192 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1040cfae0 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1040ce960 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.010 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |          pp8192 |      2144.16 ± 30.18 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |           tg128 |         78.90 ± 0.24 |
+
+build: 13afec1 (178)
+
+--- turbo3 @ 16384 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x10500fc00 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x10500ea80 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.008 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |         pp16384 |      1704.41 ± 21.63 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |           tg128 |         78.64 ± 0.44 |
+
+build: 13afec1 (178)
+
+--- turbo3 @ 32768 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x101c8fb00 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x101c8e980 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.013 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |         pp32768 |       1238.85 ± 6.06 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |           tg128 |         78.17 ± 0.69 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ short ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x103c17f70 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x103c16df0 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.008 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |         80.40 ± 0.72 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ 8192 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x103e57d30 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x103e56bb0 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.010 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |          pp8192 |      2048.90 ± 43.42 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |         79.84 ± 0.95 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ 16384 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1060bf740 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1060be5c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.009 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |         pp16384 |      1605.18 ± 20.70 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |         79.45 ± 1.55 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ 32768 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1040ef870 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1040ee6f0 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.010 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |         pp32768 |       1157.30 ± 8.01 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |         80.64 ± 0.72 |
+
+build: 13afec1 (178)
+
+--- q8_0 @ short ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1055e78c0 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1055e6740 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.008 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 |   q8_0 |   q8_0 |  1 |           tg128 |         85.48 ± 1.34 |
+
+build: 13afec1 (178)
+
+--- q8_0 @ 8192 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x105ac8540 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x105ac73c0 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.010 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 |   q8_0 |   q8_0 |  1 |          pp8192 |      2106.47 ± 64.66 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 |   q8_0 |   q8_0 |  1 |           tg128 |         76.72 ± 2.13 |
+
+build: 13afec1 (178)
+
+--- q8_0 @ 16384 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x103fefa70 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x103fee8f0 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.008 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 |   q8_0 |   q8_0 |  1 |         pp16384 |      1723.71 ± 28.56 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 |   q8_0 |   q8_0 |  1 |           tg128 |         78.09 ± 3.70 |
+
+build: 13afec1 (178)
+
+--- q8_0 @ 32768 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1035f7b10 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1035f6990 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.008 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 |   q8_0 |   q8_0 |  1 |         pp32768 |      1216.99 ± 28.64 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 |   q8_0 |   q8_0 |  1 |           tg128 |         86.83 ± 0.34 |
+
+build: 13afec1 (178)
+
+=== Done: baseline ===
@@ -0,0 +1,413 @@
+=== SMEM M5 Benchmark: smem ===
+Model: Qwen3.5-35B-A3B-Q8_0.gguf
+Date: Sat Mar 28 22:02:19 CDT 2026
+
+--- turbo3 @ short ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x104fbb670 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x104fbb5f0 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 7.366 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |           tg128 |         18.39 ± 0.76 |
+
+build: 13afec1 (178)
+
+--- turbo3 @ 8192 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x101ee3e50 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x101ee3dd0 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.009 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |         pp16384 |     1337.26 ± 261.92 |
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |          pp8192 |     1442.03 ± 393.22 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |           tg128 |        40.38 ± 18.10 |
+
+build: 13afec1 (178)
+
+--- turbo3 @ 32768 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x105a3f890 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x105a3e710 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: turbo3/4 SMEM pre-dequant enabled
+ggml_metal_library_init: loaded in 0.010 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |           tg128 |         58.20 ± 8.75 |
+
+build: 13afec1 (178)
+
+--- turbo3 @ 16384 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x103d7b200 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x103d7b180 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.009 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |         pp16384 |       792.76 ± 57.30 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |           tg128 |         16.47 ± 1.39 |
+
+build: 13afec1 (178)
+
+--- turbo3 @ 32768 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x104dc31e0 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x104dc3160 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.009 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |         pp32768 |      806.43 ± 177.53 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |           tg128 |         16.19 ± 1.11 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ short ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x105ccfa30 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x105cce8b0 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: turbo3/4 SMEM pre-dequant enabled
+ggml_metal_library_init: loaded in 0.008 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |         16.93 ± 0.97 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ 8192 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x10561bc80 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x10561ab00 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: turbo3/4 SMEM pre-dequant enabled
+ggml_metal_library_init: loaded in 0.008 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |          pp8192 |       942.18 ± 77.19 |
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |         pp32768 |      941.24 ± 180.34 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |        44.84 ± 18.74 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ 16384 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1038a3d70 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1038a2bf0 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: turbo3/4 SMEM pre-dequant enabled
+ggml_metal_library_init: loaded in 0.008 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo3 | turbo3 |  1 |           tg128 |         61.97 ± 9.79 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ short ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x10170b580 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x10170b500 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.008 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |         17.82 ± 0.64 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ 8192 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x103dab490 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x103dab410 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.009 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |         pp16384 |     1187.08 ± 274.35 |
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |          pp8192 |     1098.56 ± 217.82 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |        50.13 ± 12.92 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ 32768 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x105f20300 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x105f1f180 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: turbo3/4 SMEM pre-dequant enabled
+ggml_metal_library_init: loaded in 0.008 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |         58.25 ± 4.07 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ 16384 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x10588f220 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x10588f1a0 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.008 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |         pp16384 |       755.20 ± 28.45 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |         15.58 ± 1.31 |
+
+build: 13afec1 (178)
+
+--- turbo4 @ 32768 ---
+ggml_metal_device_init: testing tensor API for f16 support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x1018533e0 | th_max = 1024 | th_width =   32
+ggml_metal_device_init: testing tensor API for bfloat support
+ggml_metal_library_compile_pipeline: compiling pipeline: base = 'dummy_kernel', name = 'dummy_kernel'
+ggml_metal_library_compile_pipeline: loaded dummy_kernel                                  0x101853360 | th_max = 1024 | th_width =   32
+ggml_metal_library_init: using embedded metal library
+ggml_metal_library_init: turbo3 sparse V dequant enabled
+ggml_metal_library_init: loaded in 0.009 sec
+ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
+ggml_metal_device_init: GPU name:   MTL0
+ggml_metal_device_init: GPU family: MTLGPUFamilyApple10  (1010)
+ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
+ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4  (5002)
+ggml_metal_device_init: simdgroup reduction   = true
+ggml_metal_device_init: simdgroup matrix mul. = true
+ggml_metal_device_init: has unified memory    = true
+ggml_metal_device_init: has bfloat            = true
+ggml_metal_device_init: has tensor            = true
+ggml_metal_device_init: use residency sets    = true
+ggml_metal_device_init: use shared buffers    = true
+ggml_metal_device_init: recommendedMaxWorkingSetSize  = 115448.73 MB
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |         pp32768 |      732.00 ± 172.10 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |         16.29 ± 1.78 |
+
+build: 13afec1 (178)
+
+SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
+SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
+SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
+SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
+=== Done: smem ===
+| model                          |       size |     params | backend    | threads | type_k | type_v | fa |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -----: | -----: | -: | --------------: | -------------------: |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |         pp32768 |     1018.88 ± 235.19 |
+| qwen35moe 35B.A3B Q8_0         |  34.36 GiB |    34.66 B | MTL,BLAS   |       1 | turbo4 | turbo4 |  1 |           tg128 |         81.62 ± 0.05 |
+
+build: 13afec1 (178)
+
+SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
+SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
+SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
+SKIP: q8_0 + smem (q8_0 unaffected by SMEM)
+=== Done: smem ===
@@ -406,6 +406,9 @@ const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_IQ4_NL,
    GGML_TYPE_Q5_0,
    GGML_TYPE_Q5_1,
+    GGML_TYPE_TURBO2_0,
+    GGML_TYPE_TURBO3_0,
+    GGML_TYPE_TURBO4_0,
 };

 static ggml_type kv_cache_type_from_str(const std::string & s) {
@@ -44,6 +44,12 @@
 #include <string.h>
 #include <fcntl.h>
 #include <io.h>
+#ifndef fileno
+#define fileno _fileno
+#endif
+#ifndef isatty
+#define isatty _isatty
+#endif
 #else
 #include <sys/ioctl.h>
 #include <sys/stat.h>
@@ -0,0 +1,95 @@
+# TurboQuant on AMD Instinct MI300X & MI355X (ROCm/HIP)
+
+## Summary
+
+TurboQuant KV cache compression (turbo2/turbo3/turbo4) builds and runs correctly on AMD Instinct MI300X (gfx942) and MI355X (gfx950). MI300X requires no code changes. MI355X requires adding CDNA4 (gfx950) architecture support in three files — the HIP vendor header, `common.cuh`, and `mma.cuh` (see "Code Changes for gfx950 (MI355X)" below).
+
+## Test Environment
+
+| Component | MI300X | MI355X |
+|-----------|--------|--------|
+| GPU | MI300X (gfx942), 192 GB HBM3 | MI355X (gfx950), 288 GB HBM3e |
+| ROCm | 7.0.2 | 7.0.1 |
+| Wave Size | 64 | 64 |
+| Build | `-DAMDGPU_TARGETS="gfx942"` | `-DAMDGPU_TARGETS="gfx950"` |
+| Model | Qwen2.5-1.5B Q4_K_M (1.04 GiB) | same |
+
+## WHT Kernel Correctness
+
+Standalone roundtrip test (forward WHT → inverse WHT) confirms the Walsh-Hadamard Transform kernel works correctly on HIP with 64-wide wavefronts:
+
+```
+=== TurboQuant WHT Roundtrip Test (HIP/gfx942) ===
+Total elements: 512 (4 heads x 128 dim)
+Forward WHT zeros: 0 / 512
+Roundtrip max error: 2.980232e-07
+Roundtrip RMSE:      6.816018e-08
+Result: PASS ✅
+```
+
+The kernel relies only on shared memory and `__syncthreads()` (no warp shuffles), so it does not depend on the wavefront width and runs unmodified on the 64-wide wavefronts of the MI300X and MI355X.
+
+## Performance Results
+
+### MI300X (single GPU, Qwen2.5-1.5B Q4_K_M)
+
+| KV Cache | pp512 (tok/s) | tg128 (tok/s) | Prefill vs f16 | Decode vs f16 |
+|----------|--------------|--------------|----------------|---------------|
+| f16 | 24,453 ± 230 | 181.2 ± 2.0 | baseline | baseline |
+| turbo3 | ~25,200 | ~160 | **+3%** | 88% |
+| turbo4 | 25,427 ± 17 | 161.1 ± 0.2 | **+4%** | 89% |
+
+### MI355X (single GPU, Qwen2.5-1.5B Q4_K_M)
+
+| KV Cache | pp512 (tok/s) | tg128 (tok/s) | Prefill vs f16 | Decode vs f16 |
+|----------|--------------|--------------|----------------|---------------|
+| f16+FA | 40,013 ± 902 | 254.5 ± 1.0 | baseline | baseline |
+| turbo3 | 39,140 ± 475 | 162.3 ± 0.1 | 98% | 64% |
+| turbo4 | 39,232 ± 508 | 214.1 ± 0.7 | 98% | **84%** |
+
+### Key Observations
+
+1. **MI300X prefill is faster with TurboQuant** (+3-4%) — less KV cache data to write to HBM.
+2. **MI300X decode at 88-89% of f16** — consistent with Apple Silicon community results.
+3. **MI355X turbo4 decode at 84%** — turbo4 outperforms turbo3 in decode due to simpler 4-bit dequant.
+4. **MI355X turbo3 decode at 64%** — the 3-bit codebook + sign extraction is more expensive on gfx950.
+5. **MI355X non-FA MMQ path crashes** (xf32 MFMA issue) — turbo types force FA and work correctly.
+
+## Build Instructions
+
+```bash
+git clone https://github.com/TheTom/llama-cpp-turboquant.git
+cd llama-cpp-turboquant
+git checkout feature/turboquant-kv-cache
+
+# MI300X (gfx942) — works without code changes
+cmake -B build -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx942"
+cmake --build build --config Release -j
+
+# MI355X (gfx950) — requires the CDNA4 define patch (see "Code Changes for gfx950 (MI355X)" below)
+cmake -B build -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx950"
+cmake --build build --config Release -j
+
+# Test
+HIP_VISIBLE_DEVICES=0 ./build/bin/llama-bench \
+  -m model.gguf -ctk turbo3 -ctv turbo3 -ngl 99 -r 3 -p 512 -n 128
+```
+
+## Code Changes for gfx950 (MI355X)
+
+Three files are modified to add CDNA4 (gfx950) architecture support:
+
+1. **`ggml/src/ggml-cuda/vendors/hip.h`** — Add `CDNA4` define for `__gfx950__`, include in `CDNA` family
+2. **`ggml/src/ggml-cuda/common.cuh`** — Add `GGML_CUDA_CC_CDNA4` constant and `GGML_CUDA_CC_IS_CDNA4` macro
+3. **`ggml/src/ggml-cuda/mma.cuh`** — Route CDNA4 to compatible MFMA instructions (bf16_1k, i32x16x32_i8, f32x16x4f32 — NOT xf32 which doesn't exist on gfx950)
+
+## Known Limitations
+
+- **MI355X non-FA MMQ crashes**: The default (non-flash-attention) matrix multiply path crashes on gfx950 because the xf32 MFMA instruction (`mfma_f32_16x16x8_xf32`) is not available there. TurboQuant types force flash attention and work correctly. Standard f16/q8_0 KV cache types need the `-fa 1` flag on MI355X.
+- **llama-cli text output**: Interactive mode produces empty tokens on ROCm (display issue), but `llama-bench` confirms computation is correct.
+
+## Tested By
+
+Andy Luo (@andyluo7)
+- AMD Instinct MI300X (gfx942), ROCm 7.0.2 — April 2026
+- AMD Instinct MI355X (gfx950), ROCm 7.0.1 — April 2026
@@ -11,7 +11,7 @@ extern "C" {
 #define RPC_PROTO_PATCH_VERSION    0

 #ifdef  __cplusplus
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
 #endif

 #define GGML_RPC_MAX_SERVERS       16
@@ -181,7 +181,7 @@
 #            define GGML_API __declspec(dllimport) extern
 #        endif
 #    else
-#        define GGML_API __attribute__ ((visibility ("default"))) extern
+#        define GGML_API __attribute__ ((visibility ("default")))
 #    endif
 #else
 #    define GGML_API extern
@@ -429,7 +429,12 @@ extern "C" {
        GGML_TYPE_MXFP4   = 39, // MXFP4 (1 block)
        GGML_TYPE_NVFP4   = 40, // NVFP4 (4 blocks, E4M3 scale)
        GGML_TYPE_Q1_0    = 41,
-        GGML_TYPE_COUNT   = 42,
+        GGML_TYPE_TURBO2_0 = 42, // TurboQuant 2-bit KV cache: WHT + 2-bit PolarQuant
+        GGML_TYPE_TURBO3_0 = 43, // TurboQuant 3-bit KV cache: WHT + 3-bit PolarQuant
+        GGML_TYPE_TURBO4_0 = 44, // TurboQuant 4-bit KV cache: WHT + 4-bit PolarQuant
+        GGML_TYPE_TQ3_1S  = 45, // TurboQuant 3-bit weight: WHT-rotated 8-level Lloyd-Max, block_size=32
+        GGML_TYPE_TQ4_1S  = 46, // TurboQuant 4-bit weight: WHT-rotated 16-level Lloyd-Max, block_size=32
+        GGML_TYPE_COUNT   = 47,
    };

    // precision
@@ -567,6 +572,7 @@ extern "C" {
        GGML_OP_RWKV_WKV7,
        GGML_OP_SOLVE_TRI,
        GGML_OP_GATED_DELTA_NET,
+        GGML_OP_TURBO_WHT,

        GGML_OP_UNARY,

@@ -2555,6 +2561,16 @@ extern "C" {
            struct ggml_tensor  * beta,
            struct ggml_tensor  * state);

+    // TurboQuant Walsh-Hadamard Transform (O(d log d) rotation for KV cache compression)
+    // Applies a signed WHT rotation independently to each group of `group_size` elements
+    // along ne[0]: sign flip → butterfly → sign flip → normalize
+    // direction: 0 = forward (signs1 → WHT → signs2), 1 = inverse (signs2 → WHT → signs1)
+    // group_size: elements per rotation group; 0 lets the implementation pick 64 or 128 from ne[0]
+    // scale: optional InnerQ per-element scale; pass NULL to disable InnerQ scaling
+    GGML_API struct ggml_tensor * ggml_turbo_wht(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   direction,
+            int                   group_size,    // 0 = auto (64 or 128 from ne[0])
+            struct ggml_tensor  * scale);        // NULL = no InnerQ scaling
+
    // custom operators

    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
@@ -206,6 +206,7 @@ add_library(ggml-base
            ggml-threading.h
            ggml-quants.c
            ggml-quants.h
+            ggml-turbo-quant.c
            gguf.cpp)

 set_target_properties(ggml-base PROPERTIES
@@ -277,6 +277,97 @@ typedef struct {
 } block_tq2_0;
 static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");

+// TurboQuant 3-bit MSE-only: 3-bit PolarQuant indices (no QJL)
+// Storage block size = 128 (QK_TURBO3): one block per rotation group
+// Transform group size = 128 (head_dim, for rotation Gaussianization)
+// Per block: norm(fp16) + 2-bit indices (32 bytes) + 1-bit extra (16 bytes) = 50 bytes per 128 values
+// = 3.125 bits/value → ~5.1× compression vs fp16
+// The 3-bit index is split: lower 2 bits in qs[], upper 1 bit in signs[]
+#define QK_TURBO3 128   // Block size 128: one block per rotation group, eliminates redundant norms
+#define QK_TURBO3_GROUP 128  // rotation group size = head_dim
+// Derived: FA template nl parameters (auto-scale with block size)
+#define NL_TURBO3     (QK_TURBO3 / 16)   // non-vec FA iterations per block
+#define NL_TURBO3_VEC (QK_TURBO3 / 4)    // vec FA iterations per block
+typedef struct {
+    ggml_half  norm;                    //  2 bytes: vector L2 norm (for rescaling)
+    uint8_t    qs[QK_TURBO3 / 4];      // 32 bytes: lower 2-bit indices (4 per byte)
+    uint8_t    signs[QK_TURBO3 / 8];   // 16 bytes: upper 1-bit of 3-bit index (8 per byte)
+} block_turbo3_0;                       // 50 bytes total
+static_assert(sizeof(block_turbo3_0) == sizeof(ggml_half) + QK_TURBO3/4 + QK_TURBO3/8, "wrong turbo3_0 block size/padding");
+
+// TurboQuant 4-bit KV cache block. Two mutually exclusive layouts share the name block_turbo4_0:
+//   TURBO4_USE_4BIT == 1 (default): 4-bit PolarQuant indices, no QJL
+//   TURBO4_USE_4BIT == 0 (legacy):  3-bit PolarQuant indices + 1-bit QJL signs
+// Both layouts occupy 68 bytes per 128 values.
+// Default: 4-bit on all backends (Metal + CUDA validated)
+#ifndef TURBO4_USE_4BIT
+#  define TURBO4_USE_4BIT 1
+#endif
+
+#define QK_TURBO4 128
+
+#if TURBO4_USE_4BIT
+// 4-bit PolarQuant: 16 optimal centroids, nibble packed, no QJL
+// Per block: norm(fp16) + rnorm(fp16, reserved) + 4-bit indices (64 bytes)
+// = 68 bytes per 128 values = 4.25 bits/value → 3.8× compression vs fp16
+typedef struct {
+    ggml_half  norm;                    //  2 bytes
+    ggml_half  rnorm;                   //  2 bytes (reserved, unused in 4-bit mode)
+    uint8_t    qs[QK_TURBO4 / 2];      // 64 bytes: 4-bit PolarQuant indices (nibble packed)
+} block_turbo4_0;                       // 68 bytes total
+static_assert(sizeof(block_turbo4_0) == 68, "wrong turbo4_0 block size");
+#else
+// Legacy 3-bit PolarQuant + 1-bit QJL (original paper design)
+// Per block: norm(fp16) + rnorm(fp16) + 3-bit indices (48 bytes) + 1-bit QJL signs (16 bytes)
+// = 68 bytes per 128 values = 4.25 bits/value → 3.8× compression vs fp16
+typedef struct {
+    ggml_half  norm;                    //  2 bytes
+    ggml_half  rnorm;                   //  2 bytes: residual norm for QJL scale
+    uint8_t    qs[QK_TURBO4 * 3 / 8];  // 48 bytes: 3-bit PolarQuant indices
+    uint8_t    signs[QK_TURBO4 / 8];   // 16 bytes: 1-bit QJL signs
+} block_turbo4_0;                       // 68 bytes total
+static_assert(sizeof(block_turbo4_0) == 2*sizeof(ggml_half) + QK_TURBO4*3/8 + QK_TURBO4/8, "wrong turbo4_0 block size");
+#endif
+
+// The block size is pinned: changing QK_TURBO4 must fail the build rather than silently break kernels.
+static_assert(QK_TURBO4 == 128, "turbo4 kernels assume QK_TURBO4 == 128");
+
+// TurboQuant 2-bit: 2-bit PolarQuant indices only (no QJL)
+// Per block: norm(fp16) + 2-bit indices (32 bytes) = 34 bytes per 128 values
+// = 2.125 bits/value → ~7.5× compression vs fp16
+// 4 centroids (Lloyd-Max for N(0, 1/128)): {-0.133462, -0.039994, 0.039994, 0.133462}
+#define QK_TURBO2 128   // Block size 128: one block per rotation group
+#define QK_TURBO2_GROUP 128  // rotation group size = head_dim
+// Derived: FA template nl parameters (auto-scale with block size)
+#define NL_TURBO2     (QK_TURBO2 / 16)   // non-vec FA iterations per block
+#define NL_TURBO2_VEC (QK_TURBO2 / 4)    // vec FA iterations per block
+typedef struct {
+    ggml_half  norm;                    //  2 bytes: corrected L2 norm
+    uint8_t    qs[QK_TURBO2 / 4];      // 32 bytes: 2-bit indices (4 per byte)
+} block_turbo2_0;                       // 34 bytes total
+static_assert(sizeof(block_turbo2_0) == sizeof(ggml_half) + QK_TURBO2/4, "wrong turbo2_0 block size/padding");
+
+// TQ3_1S: WHT-rotated 3-bit weight quantization (8-level Lloyd-Max for N(0,1))
+// Block size 32, dual half-block scales (d0 for [0..15], d1 for [16..31])
+// Per block: d0(fp16) + d1(fp16) + 3-bit indices packed (12 bytes) = 16 bytes per 32 values
+// = 4.0 bits/value
+// Note: the block-size macro for this type is named QK_TQ3_0 (not QK_TQ3_1S).
+#define QK_TQ3_0 32
+typedef struct {
+    ggml_half d0;                       //  2 bytes: scale for first 16 elements
+    ggml_half d1;                       //  2 bytes: scale for last 16 elements
+    uint8_t   qs[QK_TQ3_0 * 3 / 8];   // 12 bytes: 3-bit indices packed (4 groups of 8 indices, 3 bytes per group)
+} block_tq3_1s;                         // 16 bytes total
+static_assert(sizeof(block_tq3_1s) == 16, "wrong tq3_1s block size");
+
+// TQ4_1S: WHT-rotated 4-bit weight quantization (16-level Lloyd-Max for N(0,1))
+// Block size 32, dual half-block scales (d0 for [0..15], d1 for [16..31])
+// Per block: d0(fp16) + d1(fp16) + 4-bit indices packed (16 bytes) = 20 bytes per 32 values
+// = 5.0 bits/value
+#define QK_TQ4_1S 32
+typedef struct {
+    ggml_half d0;                       //  2 bytes: scale for first 16 elements
+    ggml_half d1;                       //  2 bytes: scale for last 16 elements
+    // Element i lives in qs[i / 2]: even i in the low nibble, odd i in the high nibble
+    // (this is how the CUDA warp dequant kernel in this change reads it).
+    uint8_t   qs[QK_TQ4_1S / 2];      // 16 bytes: 4-bit indices nibble-packed
+} block_tq4_1s;                         // 20 bytes total
+static_assert(sizeof(block_tq4_1s) == 20, "wrong tq4_1s block size");
+
 //
 // Super-block quantization structures
 //
@@ -7,6 +7,7 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-impl.h"
 #include "quants.h"
+#include "ggml-quants.h"
 #include "ggml-threading.h"
 #include "unary-ops.h"
 #include "binary-ops.h"
@@ -208,6 +209,23 @@ typedef pthread_t ggml_thread_t;
 #include <TargetConditionals.h>
 #endif

+// Forward declarations — defined below, after utility functions
+static void ggml_vec_dot_turbo3_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
+                                       const void * GGML_RESTRICT vx, size_t bx,
+                                       const void * GGML_RESTRICT vy, size_t by, int nrc);
+static void ggml_vec_dot_turbo2_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
+                                       const void * GGML_RESTRICT vx, size_t bx,
+                                       const void * GGML_RESTRICT vy, size_t by, int nrc);
+static void ggml_vec_dot_turbo4_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
+                                       const void * GGML_RESTRICT vx, size_t bx,
+                                       const void * GGML_RESTRICT vy, size_t by, int nrc);
+static void ggml_vec_dot_tq3_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
+                                       const void * GGML_RESTRICT vx, size_t bx,
+                                       const void * GGML_RESTRICT vy, size_t by, int nrc);
+static void ggml_vec_dot_tq4_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
+                                       const void * GGML_RESTRICT vx, size_t bx,
+                                       const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F32] = {
        .from_float               = (ggml_from_float_t) ggml_cpu_fp32_to_fp32,
@@ -403,6 +421,36 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
    [GGML_TYPE_I32] = {
        .from_float               = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
    },
+    [GGML_TYPE_TURBO3_0] = {
+        .from_float               = (ggml_from_float_t) quantize_row_turbo3_0_ref,
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_turbo3_0_f32,
+        .vec_dot_type             = GGML_TYPE_F32,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_TURBO2_0] = {
+        .from_float               = (ggml_from_float_t) quantize_row_turbo2_0_ref,
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_turbo2_0_f32,
+        .vec_dot_type             = GGML_TYPE_F32,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_TURBO4_0] = {
+        .from_float               = (ggml_from_float_t) quantize_row_turbo4_0_ref,
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_turbo4_0_f32,
+        .vec_dot_type             = GGML_TYPE_F32,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_TQ3_1S] = {
+        .from_float               = (ggml_from_float_t) quantize_row_tq3_1s_ref,
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_tq3_1s_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_TQ4_1S] = {
+        .from_float               = (ggml_from_float_t) quantize_row_tq4_1s_ref,
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_tq4_1s_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
 };

 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -2047,6 +2095,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_gated_delta_net(params, tensor);
            } break;
+        case GGML_OP_TURBO_WHT:
+            {
+                ggml_compute_forward_turbo_wht(params, tensor);
+            } break;
        case GGML_OP_MAP_CUSTOM1:
            {
                ggml_compute_forward_map_custom1(params, tensor);
@@ -2227,6 +2279,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_COUNT_EQUAL:
        case GGML_OP_SOLVE_TRI:
        case GGML_OP_GATED_DELTA_NET:
+        case GGML_OP_TURBO_WHT:
            {
                n_tasks = n_threads;
            } break;
@@ -2947,6 +3000,10 @@ struct ggml_cplan ggml_graph_plan(
                        const int64_t per_thread = S_v + (K > 1 ? S_v * S_v : 0);
                        cur = per_thread * sizeof(float) * n_tasks;
                    } break;
+                case GGML_OP_TURBO_WHT:
+                    {
+                        cur = 0;  // no extra workspace needed
+                    } break;
                case GGML_OP_COUNT:
                    {
                        GGML_ABORT("fatal error");
@@ -3385,6 +3442,119 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
    return ggml_graph_compute(cgraph, &cplan);
 }

+// ---------------------------------------------------------------------------
+// TurboQuant reference vec_dot kernels (scalar, dequantize-then-dot).
+//
+// All five kernels share the same shape: dequantize the quantized operand(s)
+// into temporary f32 buffers through the type's to_float trait, then take a
+// plain sequential dot product. They only support nrc == 1 and ignore the
+// bs/bx/by strides.
+//
+// NOTE: every call heap-allocates its scratch buffer(s); this is a reference
+// path, not an optimized one.
+// TODO: optimize with SIMD intrinsics for ARM NEON / AVX2
+// ---------------------------------------------------------------------------
+
+// Dequantizes n values of quantized `type` starting at `v` into a freshly
+// malloc'ed f32 buffer. Aborts if the allocation fails. The caller owns the
+// returned buffer and must free() it.
+static float * ggml_turbo_ref_dequantize_alloc(enum ggml_type type, const void * v, int n) {
+    float * out = (float *)malloc(n * sizeof(float));
+    GGML_ASSERT(out != NULL);
+    ggml_get_type_traits(type)->to_float(v, out, n);
+    return out;
+}
+
+// Sequential f32 dot product of a[0..n) and b[0..n), accumulated in a single float.
+static float ggml_turbo_ref_dot_f32(const float * a, const float * b, int n) {
+    float sum = 0.0f;
+    for (int i = 0; i < n; i++) {
+        sum += a[i] * b[i];
+    }
+    return sum;
+}
+
+// *s = dot(dequantize(vx as `type`), vy as f32) over n elements.
+static void ggml_turbo_ref_vec_dot_f32(enum ggml_type type, int n, float * s,
+                                       const void * vx, const void * vy) {
+    float * x = ggml_turbo_ref_dequantize_alloc(type, vx, n);
+    const float sum = ggml_turbo_ref_dot_f32(x, (const float *)vy, n);
+    free(x);
+    *s = sum;
+}
+
+// *s = dot(dequantize(vx as `type`), dequantize(vy as q8_0)) over n elements.
+static void ggml_turbo_ref_vec_dot_q8_0(enum ggml_type type, int n, float * s,
+                                        const void * vx, const void * vy) {
+    float * x = ggml_turbo_ref_dequantize_alloc(type, vx, n);
+    float * y = ggml_turbo_ref_dequantize_alloc(GGML_TYPE_Q8_0, vy, n);
+    const float sum = ggml_turbo_ref_dot_f32(x, y, n);
+    free(x);
+    free(y);
+    *s = sum;
+}
+
+// TurboQuant3 vec_dot: dequantize turbo3 block to f32, then dot with f32 operand.
+// Used by CPU flash attention for models with D not supported by CUDA FA (e.g. D=192).
+static void ggml_vec_dot_turbo3_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
+                                       const void * GGML_RESTRICT vx, size_t bx,
+                                       const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    GGML_ASSERT(nrc == 1);
+    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
+
+    ggml_turbo_ref_vec_dot_f32(GGML_TYPE_TURBO3_0, n, s, vx, vy);
+}
+
+// TurboQuant2 vec_dot: dequantize turbo2 block to f32, then dot with f32 operand.
+static void ggml_vec_dot_turbo2_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
+                                       const void * GGML_RESTRICT vx, size_t bx,
+                                       const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    GGML_ASSERT(nrc == 1);
+    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
+
+    ggml_turbo_ref_vec_dot_f32(GGML_TYPE_TURBO2_0, n, s, vx, vy);
+}
+
+// TurboQuant4 vec_dot: dequantize turbo4 block to f32, then dot with f32 operand.
+static void ggml_vec_dot_turbo4_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
+                                       const void * GGML_RESTRICT vx, size_t bx,
+                                       const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    GGML_ASSERT(nrc == 1);
+    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
+
+    ggml_turbo_ref_vec_dot_f32(GGML_TYPE_TURBO4_0, n, s, vx, vy);
+}
+
+// TQ3_1S vec_dot: dequantize tq3_1s block to f32, then dot with dequantized q8_0.
+static void ggml_vec_dot_tq3_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
+                                       const void * GGML_RESTRICT vx, size_t bx,
+                                       const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    GGML_ASSERT(nrc == 1);
+    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
+
+    ggml_turbo_ref_vec_dot_q8_0(GGML_TYPE_TQ3_1S, n, s, vx, vy);
+}
+
+// TQ4_1S vec_dot: dequantize tq4_1s block to f32, then dot with dequantized q8_0.
+static void ggml_vec_dot_tq4_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
+                                       const void * GGML_RESTRICT vx, size_t bx,
+                                       const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    GGML_ASSERT(nrc == 1);
+    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
+
+    ggml_turbo_ref_vec_dot_q8_0(GGML_TYPE_TQ4_1S, n, s, vx, vy);
+}
+
+
 void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
    memcpy(y, x, n * sizeof(float));
 }
@@ -12,6 +12,10 @@
 #include <cfloat>
 #include <cmath>

+// Global used to pass the WHT group size to the turbo quantize functions;
+// ggml_compute_forward_set_rows_f32 writes it before quantizing rows to a turbo KV-cache type.
+// Presumably defined in ggml-turbo-quant.c — TODO confirm.
+// NOTE(review): inside an extern "C" { } block this line is only a declaration if GGML_API
+// expands to something containing `extern`; otherwise it is a second definition of the
+// variable. Confirm this holds for every GGML_API variant in ggml.h.
+extern "C" {
+GGML_API int turbo3_cpu_wht_group_size;
+}
+
 // ggml_compute_forward_dup

 static void ggml_compute_forward_dup_same_cont(
@@ -679,6 +683,8 @@ void ggml_compute_forward_add(
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_1S:
+        case GGML_TYPE_TQ4_1S:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -1130,6 +1136,8 @@ void ggml_compute_forward_add1(
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_1S:
+        case GGML_TYPE_TQ4_1S:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -1260,6 +1268,8 @@ void ggml_compute_forward_acc(
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_1S:
+        case GGML_TYPE_TQ4_1S:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -4395,6 +4405,8 @@ void ggml_compute_forward_out_prod(
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_1S:
+        case GGML_TYPE_TQ4_1S:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -4672,6 +4684,8 @@ void ggml_compute_forward_set(
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_1S:
+        case GGML_TYPE_TQ4_1S:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -4896,6 +4910,8 @@ void ggml_compute_forward_get_rows(
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_1S:
+        case GGML_TYPE_TQ4_1S:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -4978,6 +4994,13 @@ static void ggml_compute_forward_set_rows_f32(

    ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;

+    // For turbo types: communicate WHT group size to the quantize function via global
+    if (dst->type == GGML_TYPE_TURBO3_0 || dst->type == GGML_TYPE_TURBO4_0 || dst->type == GGML_TYPE_TURBO2_0) {
+        int gs = 0;
+        memcpy(&gs, dst->op_params, sizeof(int));
+        turbo3_cpu_wht_group_size = (gs == 64 || gs == 128) ? gs : 0;
+    }
+
    for (int64_t i03 = 0; i03 < ne03; ++i03) {
        for (int64_t i02 = 0; i02 < ne02; ++i02) {
            for (int64_t i = ir0; i < ir1; ++i) {
@@ -5622,6 +5645,11 @@ void ggml_compute_forward_clamp(
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_1S:
+        case GGML_TYPE_TQ4_1S:
+        case GGML_TYPE_TURBO2_0:
+        case GGML_TYPE_TURBO3_0:
+        case GGML_TYPE_TURBO4_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -10686,6 +10714,104 @@ void ggml_compute_forward_gated_delta_net(
    }
 }

+// ggml_compute_forward_turbo_wht
+
+// WHT sign arrays (must match Metal shader turbo_wht_signs1/2)
+static const float turbo_wht_s1[128] = {-1,1,1,-1,-1,1,-1,1,-1,-1,1,1,1,1,1,1,1,-1,1,-1,1,-1,-1,1,1,1,-1,1,1,-1,-1,-1,-1,1,1,-1,1,1,-1,1,-1,1,1,-1,-1,1,-1,1,1,1,1,-1,-1,-1,-1,-1,1,-1,1,1,1,1,-1,1,-1,-1,1,-1,-1,-1,1,-1,-1,-1,1,-1,-1,-1,1,1,1,-1,-1,1,1,1,-1,-1,1,1,-1,1,1,-1,1,-1,-1,1,1,-1,1,-1,1,-1,1,1,1,1,-1,1,-1,1,1,-1,1,1,-1,-1,-1,-1,-1,1,1,-1,1,1,-1,1};
+static const float turbo_wht_s2[128] = {1,1,1,1,-1,1,1,-1,1,-1,-1,-1,1,-1,-1,-1,1,1,-1,-1,1,-1,1,-1,1,-1,-1,1,-1,1,1,1,1,1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,1,1,1,-1,1,-1,1,1,1,-1,-1,1,-1,-1,-1,-1,-1,-1,1,1,1,-1,1,-1,-1,-1,-1,1,-1,1,-1,1,-1,-1,1,1,-1,1,-1,1,1,-1,1,-1,-1,-1,-1,1,-1,-1,1,-1,1,-1,1,1,1,-1,-1,1,-1,1,-1,1,1,-1,-1,1,-1,1,-1,1,1,-1,1,-1,1,-1,-1,-1,-1,-1,1,-1};
+
+// CPU implementation of GGML_OP_TURBO_WHT for F32 tensors.
+//
+// Every row of ne[0] elements is split into consecutive groups of `group_size` elements and
+// each full group is transformed independently:
+//   direction == 0 (forward): [scale_inv] → signs1 → WHT butterfly → 1/sqrt(group_size) → signs2
+//   otherwise      (inverse): signs2 → WHT butterfly → 1/sqrt(group_size) → signs1
+//                             (followed by scale_inv only when direction == 1)
+// Elements past the last full group of a row (ne[0] % group_size) are copied through unchanged.
+//
+// params: supplies this thread's index (ith) and the thread count (nth); groups are divided
+//         evenly between threads and thread 0 additionally copies the row tails.
+// dst:    output tensor; src[0] is the input, src[1] the optional InnerQ scale (NULL = none),
+//         op_params carry direction and group_size.
+//
+// Input and output are addressed as flat float arrays with the same layout — presumably
+// guaranteed by the graph builder; it is not checked here.
+static void ggml_compute_forward_turbo_wht_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * src = dst->src[0];
+    const ggml_tensor * scale_tensor = dst->src[1];  // InnerQ scale_inv (may be NULL)
+    const float * src_data = (const float *) src->data;
+    float * dst_data = (float *) dst->data;
+    const float * scale_inv = scale_tensor ? (const float *) scale_tensor->data : NULL;
+
+    int direction;
+    int group_size;
+    memcpy(&direction, dst->op_params + 0, sizeof(int));
+    // NOTE(review): if op_params is an int32_t array (as in upstream ggml.h), this pointer
+    // arithmetic advances by sizeof(int) *elements*, not bytes — confirm it matches the slot
+    // that ggml_turbo_wht() writes group_size to.
+    memcpy(&group_size, dst->op_params + sizeof(int), sizeof(int));
+
+    // The scratch buffer and both sign tables hold 128 entries and the butterfly is radix-2,
+    // so reject anything else up front. This also turns an unresolved group_size of 0 into a
+    // clear abort instead of a division by zero below.
+    GGML_ASSERT(group_size > 0 && group_size <= 128);
+    GGML_ASSERT((group_size & (group_size - 1)) == 0);
+
+    const int64_t head_dim        = src->ne[0];
+    const int64_t n_heads         = ggml_nelements(src) / head_dim;
+    const int64_t groups_per_head = head_dim / group_size;
+    const int     tail_size       = (int)(head_dim % group_size);
+    const int64_t n_groups        = groups_per_head * n_heads;
+
+    const float inv_sqrt = 1.0f / sqrtf((float)group_size);
+
+    // Parallel over groups: thread ith handles [grp_start, grp_end)
+    const int64_t ith = params->ith;
+    const int64_t nth = params->nth;
+    const int64_t grp_start = (n_groups * ith) / nth;
+    const int64_t grp_end = (n_groups * (ith + 1)) / nth;
+
+    // Select sign arrays: for 64-group, use first 64 elements of the 128-element arrays
+    const float * s_first = (direction == 0) ? turbo_wht_s1 : turbo_wht_s2;
+    const float * s_second = (direction == 0) ? turbo_wht_s2 : turbo_wht_s1;
+
+    const bool scale_before = direction == 0 && scale_inv != NULL;  // forward: Q pre-rotation
+    const bool scale_after  = direction == 1 && scale_inv != NULL;  // inverse: V un-rotation
+
+    for (int64_t g = grp_start; g < grp_end; g++) {
+        const int64_t head_idx    = g / groups_per_head;
+        const int64_t grp_in_head = g % groups_per_head;
+        const int64_t base        = head_idx * head_dim + grp_in_head * group_size;
+
+        float x[128];  // max group_size (enforced by the assert above)
+        const float * in = src_data + base;
+
+        // InnerQ forward: apply scale_inv BEFORE signs+WHT (for Q pre-rotation)
+        if (scale_before) {
+            for (int i = 0; i < group_size; i++) x[i] = in[i] * scale_inv[i];
+        } else {
+            for (int i = 0; i < group_size; i++) x[i] = in[i];
+        }
+
+        // Apply first signs
+        for (int i = 0; i < group_size; i++) x[i] *= s_first[i];
+
+        // WHT butterfly (log2(group_size) stages)
+        for (int h = 1; h < group_size; h *= 2) {
+            for (int i = 0; i < group_size; i += h * 2) {
+                for (int j = i; j < i + h; j++) {
+                    float a = x[j], b = x[j + h];
+                    x[j] = a + b;
+                    x[j + h] = a - b;
+                }
+            }
+        }
+
+        // Normalize + second signs
+        float * out = dst_data + base;
+        for (int i = 0; i < group_size; i++) {
+            float val = x[i] * inv_sqrt * s_second[i];
+            // InnerQ inverse: apply scale_inv AFTER WHT+signs (for V un-rotation)
+            if (scale_after) {
+                val *= scale_inv[i];
+            }
+            out[i] = val;
+        }
+    }
+
+    // Copy tail elements unchanged (identity pass-through); done once, by thread 0
+    if (tail_size > 0 && ith == 0) {
+        const int64_t tail_offset = groups_per_head * group_size;
+        for (int64_t h = 0; h < n_heads; h++) {
+            const int64_t base = h * head_dim + tail_offset;
+            memcpy(dst_data + base, src_data + base, tail_size * sizeof(float));
+        }
+    }
+}
+
+// CPU entry point for GGML_OP_TURBO_WHT: dispatches on the type of src[0].
+// Only F32 input is implemented; any other source type aborts.
+void ggml_compute_forward_turbo_wht(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    if (src0->type != GGML_TYPE_F32) {
+        GGML_ABORT("fatal error");
+    }
+
+    ggml_compute_forward_turbo_wht_f32(params, dst);
+}
+
 // ggml_compute_forward_rwkv_wkv7

 static void ggml_compute_forward_rwkv_wkv7_f32(
@@ -104,6 +104,7 @@ void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, s
 void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_turbo_wht(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -120,7 +120,28 @@ if (CUDAToolkit_FOUND)
            template-instances/fattn-vec-instance-f16-f16.cu
            template-instances/fattn-vec-instance-q4_0-q4_0.cu
            template-instances/fattn-vec-instance-q8_0-q8_0.cu
-            template-instances/fattn-vec-instance-bf16-bf16.cu)
+            template-instances/fattn-vec-instance-bf16-bf16.cu
+            template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu
+            template-instances/fattn-vec-instance-turbo3_0-q8_0.cu
+            template-instances/fattn-vec-instance-q8_0-turbo3_0.cu
+            template-instances/fattn-vec-instance-f16-turbo3_0.cu
+            template-instances/fattn-vec-instance-turbo3_0-f16.cu
+            template-instances/fattn-vec-instance-turbo2_0-turbo2_0.cu
+            template-instances/fattn-vec-instance-turbo2_0-q8_0.cu
+            template-instances/fattn-vec-instance-q8_0-turbo2_0.cu
+            template-instances/fattn-vec-instance-f16-turbo2_0.cu
+            template-instances/fattn-vec-instance-turbo2_0-f16.cu
+            template-instances/fattn-vec-instance-turbo3_0-turbo2_0.cu
+            template-instances/fattn-vec-instance-turbo2_0-turbo3_0.cu
+            template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu
+            template-instances/fattn-vec-instance-turbo4_0-q8_0.cu
+            template-instances/fattn-vec-instance-q8_0-turbo4_0.cu
+            template-instances/fattn-vec-instance-f16-turbo4_0.cu
+            template-instances/fattn-vec-instance-turbo4_0-f16.cu
+            template-instances/fattn-vec-instance-turbo4_0-turbo3_0.cu
+            template-instances/fattn-vec-instance-turbo3_0-turbo4_0.cu
+            template-instances/fattn-vec-instance-turbo4_0-turbo2_0.cu
+            template-instances/fattn-vec-instance-turbo2_0-turbo4_0.cu)
    endif()

    ggml_add_backend_library(ggml-cuda
@@ -804,7 +804,7 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
 static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
 #if defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
    // ROCm does not support fp8 in software on devices with fp8 hardware,
-    // but CDNA3 supports only e4m3_fnuz (no inf).
+    // but CDNA3 supports only e4m3_fnuz (no inf). CDNA4 (gfx950) uses standard e4m3fn.
    const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
    const __hip_fp8_e4m3_fnuz xf = *reinterpret_cast<const __hip_fp8_e4m3_fnuz *>(&bits);
    return static_cast<float>(xf) / 2;
@@ -1,5 +1,6 @@
 #include "convert.cuh"
 #include "dequantize.cuh"
+#include "turbo-quant.cuh"

 #include <cstdint>

@@ -502,6 +503,50 @@ static void dequantize_block_cont_cuda(const void * __restrict__ vx, dst_t * __r
    dequantize_block_cuda<qk, qr, dequantize_kernel, dst_t>(vx, y, k, 1, 1, 1, k/qk, k/qk, k/qk, stream);
 }

+// Fast warp-cooperative TQ4_1S dequant: one warp per 32-element block.
+// WHT via __shfl_xor_sync — 16× less compute than the per-element generic template.
+//
+// Launch layout: blockDim = (32, warps_per_block); threadIdx.x is the element index inside a
+// TQ4_1S block and threadIdx.y selects which TQ4_1S block of this CUDA block is processed.
+//   vx:         input TQ4_1S blocks
+//   y:          output, one dst_t per element
+//   n_elements: total number of elements to produce
+template <typename dst_t>
+static __global__ void k_dequantize_tq4_1s_warp(
+        const block_tq4_1s * __restrict__ vx, dst_t * __restrict__ y,
+        const int64_t n_elements) {
+    const int64_t block_idx = (int64_t)blockIdx.x * blockDim.y + threadIdx.y;
+    const int lane = threadIdx.x;
+    // When n_elements is a multiple of 32 (the contiguous launcher asserts this) all 32 lanes
+    // of a row exit together, so the shuffles below never involve a partially exited row.
+    if (block_idx * 32 + lane >= n_elements) return;
+
+    const block_tq4_1s * blk = &vx[block_idx];
+    // Half-block scales: d0 covers elements [0..15], d1 covers [16..31]
+    const float d = (lane < 16) ? __half2float(blk->d0) : __half2float(blk->d1);
+    // Two indices per byte: even lane in the low nibble, odd lane in the high nibble
+    const uint8_t idx = (blk->qs[lane / 2] >> ((lane & 1) * 4)) & 0xF;
+    float val = TQ4_CENTROIDS_WEIGHT[idx] * d;
+
+    // 32-point WHT butterfly across the lanes (5 stages). The shuffle width is passed
+    // explicitly: it equals the CUDA default of 32 and keeps the exchange inside one
+    // 32-lane row regardless of the hardware wavefront size.
+    #pragma unroll
+    for (int h = 1; h < 32; h <<= 1) {
+        float o = __shfl_xor_sync(0xffffffff, val, h, 32);
+        val = (lane & h) ? (o - val) : (val + o);
+    }
+    // 0.17677669529663688 = 1/sqrt(32): WHT normalization, then the per-element sign
+    val *= 0.17677669529663688f * TQ_WEIGHT_SIGNS[lane];
+
+    y[block_idx * 32 + lane] = (dst_t)val;
+}
+
+// Host launcher for k_dequantize_tq4_1s_warp on a contiguous buffer of k elements.
+// k must be a multiple of 32 (one TQ4_1S block). Each CUDA block is 32 x 4 threads and
+// therefore covers four TQ4_1S blocks; the grid is rounded up to cover all of them.
+template <typename dst_t>
+static void dequantize_tq4_1s_warp_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    GGML_ASSERT(k % 32 == 0);
+
+    const int     warps_per_block = 4;
+    const int64_t n_tq_blocks     = k / 32;
+
+    const dim3 threads(32, warps_per_block);
+    const dim3 blocks((n_tq_blocks + warps_per_block - 1) / warps_per_block);
+
+    const block_tq4_1s * src = (const block_tq4_1s *)vx;
+    k_dequantize_tq4_1s_warp<<<blocks, threads, 0, stream>>>(src, y, k);
+}
+
+// TQ4_1S dequant for general (non-contiguous) tensor layouts.
+// This is a rare path, so it does not use the warp-cooperative kernel: it forwards all
+// extents (ne00..ne03) and strides (s01..s03) to the generic per-element template.
+template <typename dst_t>
+static void dequantize_tq4_1s_warp_nc_cuda(const void * vx, dst_t * y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
+    dequantize_block_cuda<QK_TQ4_1S, QR_TQ4_1S, dequantize_tq4_1s, dst_t>(
+        vx, y,
+        ne00, ne01, ne02, ne03,
+        s01, s02, s03,
+        stream);
+}
+
 static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
@@ -758,6 +803,16 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
            return dequantize_row_mxfp4_cuda;
        case GGML_TYPE_NVFP4:
            return dequantize_row_nvfp4_cuda;
+        case GGML_TYPE_TURBO3_0:
+            return dequantize_block_cont_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
+        case GGML_TYPE_TURBO2_0:
+            return dequantize_block_cont_cuda<QK_TURBO2, QR_TURBO2, dequantize_turbo2_0>;
+        case GGML_TYPE_TURBO4_0:
+            return dequantize_block_cont_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
+        case GGML_TYPE_TQ4_1S:
+            return dequantize_tq4_1s_warp_cuda<half>;  // fast warp-cooperative WHT
+        case GGML_TYPE_TQ3_1S:
+            return dequantize_block_cont_cuda<QK_TQ3_0, QR_TQ3_1S, dequantize_tq3_1s>;
        case GGML_TYPE_F32:
            return convert_unary_cont_cuda<float>;
        case GGML_TYPE_BF16:
@@ -813,6 +868,16 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
            return dequantize_row_mxfp4_cuda;
        case GGML_TYPE_NVFP4:
            return dequantize_row_nvfp4_cuda;
+        case GGML_TYPE_TURBO3_0:
+            return dequantize_block_cont_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
+        case GGML_TYPE_TURBO2_0:
+            return dequantize_block_cont_cuda<QK_TURBO2, QR_TURBO2, dequantize_turbo2_0>;
+        case GGML_TYPE_TURBO4_0:
+            return dequantize_block_cont_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
+        case GGML_TYPE_TQ4_1S:
+            return dequantize_tq4_1s_warp_cuda<float>;  // fast warp-cooperative WHT
+        case GGML_TYPE_TQ3_1S:
+            return dequantize_block_cont_cuda<QK_TQ3_0, QR_TQ3_1S, dequantize_tq3_1s>;
        case GGML_TYPE_F16:
            return convert_unary_cont_cuda<half>;
        case GGML_TYPE_BF16:
@@ -838,6 +903,16 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_TURBO3_0:
+            return dequantize_block_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
+        case GGML_TYPE_TURBO2_0:
+            return dequantize_block_cuda<QK_TURBO2, QR_TURBO2, dequantize_turbo2_0>;
+        case GGML_TYPE_TURBO4_0:
+            return dequantize_block_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
+        case GGML_TYPE_TQ4_1S:
+            return dequantize_block_cuda<QK_TQ4_1S, QR_TQ4_1S, dequantize_tq4_1s>;
+        case GGML_TYPE_TQ3_1S:
+            return dequantize_block_cuda<QK_TQ3_0, QR_TQ3_1S, dequantize_tq3_1s>;
        case GGML_TYPE_BF16:
            return convert_unary_cuda<nv_bfloat16>;
        default:
@@ -884,6 +959,16 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_TURBO3_0:
+            return dequantize_block_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
+        case GGML_TYPE_TURBO2_0:
+            return dequantize_block_cuda<QK_TURBO2, QR_TURBO2, dequantize_turbo2_0>;
+        case GGML_TYPE_TURBO4_0:
+            return dequantize_block_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
+        case GGML_TYPE_TQ4_1S:
+            return dequantize_block_cuda<QK_TQ4_1S, QR_TQ4_1S, dequantize_tq4_1s>;
+        case GGML_TYPE_TQ3_1S:
+            return dequantize_block_cuda<QK_TQ3_0, QR_TQ3_1S, dequantize_tq3_1s>;
        case GGML_TYPE_BF16:
            return convert_unary_cuda<nv_bfloat16, float>;
        default:
@@ -1,4 +1,5 @@
 #include "common.cuh"
+#include "turbo-quant.cuh"

 static __device__ __forceinline__ void dequantize_q1_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
    const block_q1_0 * x = (const block_q1_0 *) vx;
@@ -97,3 +98,104 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
    v.x *= d;
    v.y *= d;
 }
+
+// Turbo4 (4-bit PolarQuant, nibble packed, block size 128): dequantize one pair of neighbouring elements.
+// iqs is the (even) element index inside block ib; v.x receives element iqs and v.y element iqs+1.
+static __device__ __forceinline__ void dequantize_turbo4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
+    const block_turbo4_0 * blk = (const block_turbo4_0 *) vx + ib;
+    const float scale = __half2float(blk->norm);  // per-block fp16 norm, widened once for both elements
+    v.x = turbo4_dequant_element(blk, iqs + 0, scale);
+    v.y = turbo4_dequant_element(blk, iqs + 1, scale);
+}
+
+// Turbo3 (3-bit PolarQuant: 2-bit qs + 1-bit sign, block size 32): dequantize one pair of neighbouring elements.
+// iqs is the (even) element index inside block ib; v.x receives element iqs and v.y element iqs+1.
+static __device__ __forceinline__ void dequantize_turbo3_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
+    const block_turbo3_0 * blk = (const block_turbo3_0 *) vx + ib;
+    const float scale = __half2float(blk->norm);  // per-block fp16 norm, widened once for both elements
+    v.x = turbo3_dequant_element(blk, iqs + 0, scale);
+    v.y = turbo3_dequant_element(blk, iqs + 1, scale);
+}
+
+// Turbo2 (2-bit PolarQuant: 2-bit qs only, no sign, block size 32): v.x = element iqs, v.y = element iqs+1 of block ib.
+static __device__ __forceinline__ void dequantize_turbo2_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
+    const block_turbo2_0 * blk = (const block_turbo2_0 *) vx + ib;
+    const float scale = __half2float(blk->norm);  // per-block fp16 norm, widened once for both elements
+    v.x = turbo2_dequant_element(blk, iqs + 0, scale);
+    v.y = turbo2_dequant_element(blk, iqs + 1, scale);
+}
+
+// TQ4_1S: 4-bit weight type with inverse WHT, block size 32, dual half-block scales (d0 for elements 0..15, d1 for 16..31)
+// Cold path only (convert.cu): every call rebuilds the whole 32-element block, applies the inverse RHT, and returns only elements iqs (v.x) and iqs+1 (v.y)
+static __device__ __forceinline__ void dequantize_tq4_1s(const void * vx, const int64_t ib, const int iqs, float2 & v) {
+    const block_tq4_1s * x = (const block_tq4_1s *) vx;
+    const float d0 = __half2float(x[ib].d0);
+    const float d1 = __half2float(x[ib].d1);
+
+    // Dequant full block (centroid lookup + scale); qs holds two 4-bit indices per byte, even j in the low nibble
+    float buf[32];
+    for (int j = 0; j < 32; j++) {
+        uint8_t idx = (x[ib].qs[j / 2] >> ((j & 1) * 4)) & 0xF;
+        float d = (j < 16) ? d0 : d1;
+        buf[j] = TQ4_CENTROIDS_WEIGHT[idx] * d;
+    }
+
+    // Inverse RHT: in-place, unnormalized WHT butterfly (step = 1, 2, 4, 8, 16), then normalize+unsign
+    for (int step = 1; step < 32; step <<= 1) {
+        for (int i = 0; i < 32; i += step << 1) {
+            for (int j = i; j < i + step; j++) {
+                float a = buf[j], b = buf[j + step];
+                buf[j] = a + b; buf[j + step] = a - b;
+            }
+        }
+    }
+    const float inv_sqrt32 = 0.17677669529663688f; // 1/sqrt(32): normalization for the 32-point transform
+    for (int j = 0; j < 32; j++) buf[j] *= inv_sqrt32 * TQ_WEIGHT_SIGNS[j];
+
+    v.x = buf[iqs];
+    v.y = buf[iqs + 1]; // requires iqs <= 30; not checked here
+}
+
+// TQ3_1S: 3-bit weight type with inverse WHT, block size 32, dual half-block scales (d0 for elements 0..15, d1 for 16..31)
+// 3-bit packing: 4 groups of 8 indices in 3 bytes each (24 bits = 8 * 3-bit); returns elements iqs (v.x) and iqs+1 (v.y) of the rebuilt block
+static __device__ __forceinline__ void dequantize_tq3_1s(const void * vx, const int64_t ib, const int iqs, float2 & v) {
+    const block_tq3_1s * x = (const block_tq3_1s *) vx;
+    const float d0 = __half2float(x[ib].d0);
+    const float d1 = __half2float(x[ib].d1);
+
+    // Unpack all 32 3-bit indices (4 groups of 8 in 3 bytes), least-significant bits first
+    float buf[32];
+    for (int g = 0; g < 4; g++) {
+        const uint8_t * qp = x[ib].qs + g * 3;
+        uint8_t idx[8];
+        idx[0] =  qp[0]       & 7;
+        idx[1] = (qp[0] >> 3) & 7;
+        idx[2] = ((qp[0] >> 6) | (qp[1] << 2)) & 7; // straddles bytes 0/1: bits 6-7 of qp[0] plus bit 0 of qp[1]
+        idx[3] = (qp[1] >> 1) & 7;
+        idx[4] = (qp[1] >> 4) & 7;
+        idx[5] = ((qp[1] >> 7) | (qp[2] << 1)) & 7; // straddles bytes 1/2: bit 7 of qp[1] plus bits 0-1 of qp[2]
+        idx[6] = (qp[2] >> 2) & 7;
+        idx[7] = (qp[2] >> 5) & 7;
+
+        for (int i = 0; i < 8; i++) {
+            int j = g * 8 + i;
+            float d = (j < 16) ? d0 : d1;
+            buf[j] = TQ3_CENTROIDS_WEIGHT[idx[i]] * d;
+        }
+    }
+
+    // Inverse RHT: in-place, unnormalized WHT butterfly (step = 1, 2, 4, 8, 16), then normalize+unsign
+    for (int step = 1; step < 32; step <<= 1) {
+        for (int i = 0; i < 32; i += step << 1) {
+            for (int j = i; j < i + step; j++) {
+                float a = buf[j], b = buf[j + step];
+                buf[j] = a + b; buf[j + step] = a - b;
+            }
+        }
+    }
+    const float inv_sqrt32 = 0.17677669529663688f; // 1/sqrt(32): normalization for the 32-point transform
+    for (int j = 0; j < 32; j++) buf[j] *= inv_sqrt32 * TQ_WEIGHT_SIGNS[j];
+
+    v.x = buf[iqs];
+    v.y = buf[iqs + 1]; // requires iqs <= 30; not checked here
+}
@@ -3,6 +3,7 @@
 #include "common.cuh"
 #include "convert.cuh"
 #include "vecdotq.cuh"
+#include "turbo-quant.cuh"

 #include <cstdint>

@@ -288,6 +289,164 @@ static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q8_0(
    return sum;
 }

+// Turbo3 KQ dot product: dequantize K from turbo3 blocks, dot with Q (float2/half2), return this thread's partial sum.
+// Uses float Q path (like f16), not q8_1 integer path; Q_q8 and Q_ds_v are unused.
+// Q_v is half2[] or float2[] with D/2 pairs, partitioned nthreads-strided.
+//
+// Matches the f16 pattern: outer loop steps by nthreads*cpy_ne, inner loop
+// processes cpy_ne pairs per thread per iteration so Q_v and K indices stay aligned.
+// elem0 = 2*k_KQ is always even, so elem0 and elem0+1 always share the same
+// turbo3 block (ib), qs byte, and signs byte — loaded once per pair.
+template <int D, int nthreads>
+static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_turbo3_0(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
+
+    const block_turbo3_0 * K_turbo = (const block_turbo3_0 *) K_c;
+    GGML_UNUSED(Q_q8);
+    GGML_UNUSED(Q_ds_v);
+
+    constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
+    constexpr int cpy_ne = cpy_nb / 4; // number of 4-byte units per maximum-size copy
+
+    float sum = 0.0f;
+
+#pragma unroll
+    for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
+#pragma unroll
+        for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
+            const int k_KQ = k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne + k_KQ_1; // pair index within the K row
+
+            // elem0 is always even; elem0 and elem1 are always in the same block,
+            // the same qs byte (j0%4 ∈ {0,2}), and the same signs byte (j0%8 ∈ {0,2,4,6}).
+            const int elem0 = k_KQ * 2;                  // always even
+            const int ib    = elem0 / QK_TURBO3;          // shared block index
+            const int j0    = elem0 % QK_TURBO3;          // always even, 0..30
+
+            // Single loads for the shared block fields
+            const float     norm     = __half2float(K_turbo[ib].norm);
+            const uint8_t   qs_byte  = K_turbo[ib].qs[j0 / 4];      // covers both j0 and j0+1
+            const uint8_t   sgn_byte = K_turbo[ib].signs[j0 / 8];   // covers both j0 and j0+1
+
+            // Build the 3-bit indices for elem0 and elem1: bits 0-1 from the qs byte, bit 2 from the signs byte
+            const int     shift  = (j0 % 4) * 2;                     // 0 or 4
+            const uint8_t idx0   = ((qs_byte >> shift)     & 0x3) | (((sgn_byte >> (j0 % 8))     & 0x1) << 2);
+            const uint8_t idx1   = ((qs_byte >> (shift+2)) & 0x3) | (((sgn_byte >> (j0 % 8 + 1)) & 0x1) << 2);
+
+            float2 kv; // dequantized K pair: centroid * block norm
+            kv.x = TURBO_CENTROIDS_3BIT[idx0] * norm;
+            kv.y = TURBO_CENTROIDS_3BIT[idx1] * norm;
+
+#ifdef V_DOT2_F32_F16_AVAILABLE
+            const half2 qv = ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]; // Q_v holds only this thread's pairs
+            ggml_cuda_mad(sum, make_float2(kv.x, kv.y), __half22float2(qv));
+#else
+            const float2 qv = ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]; // Q_v holds only this thread's pairs
+            sum += kv.x * qv.x + kv.y * qv.y;
+#endif // V_DOT2_F32_F16_AVAILABLE
+        }
+    }
+
+    return sum;
+}
+
+// Turbo2 KQ dot product: dequantize K from turbo2 blocks, dot with Q (float2/half2), return this thread's partial sum.
+// Same structure as turbo3 but reads 2-bit indices from qs only (no signs); Q_q8 and Q_ds_v are unused.
+template <int D, int nthreads>
+static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_turbo2_0(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
+
+    const block_turbo2_0 * K_turbo = (const block_turbo2_0 *) K_c;
+    GGML_UNUSED(Q_q8);
+    GGML_UNUSED(Q_ds_v);
+
+    constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
+    constexpr int cpy_ne = cpy_nb / 4; // number of 4-byte units per maximum-size copy
+
+    float sum = 0.0f;
+
+#pragma unroll
+    for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
+#pragma unroll
+        for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
+            const int k_KQ = k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne + k_KQ_1; // pair index within the K row
+
+            const int elem0 = k_KQ * 2;              // first element of the pair, always even
+            const int ib    = elem0 / QK_TURBO2;     // block holding both elements of the pair
+            const int j0    = elem0 % QK_TURBO2;     // even offset inside that block
+
+            const float     norm     = __half2float(K_turbo[ib].norm);
+            const uint8_t   qs_byte  = K_turbo[ib].qs[j0 / 4];  // 4 two-bit indices per byte; holds j0 and j0+1
+
+            const int     shift  = (j0 % 4) * 2;     // 0 or 4, because j0 is even
+            const uint8_t idx0   = (qs_byte >> shift)     & 0x3;
+            const uint8_t idx1   = (qs_byte >> (shift+2)) & 0x3;
+
+            float2 kv; // dequantized K pair: centroid * block norm
+            kv.x = TURBO_CENTROIDS_2BIT[idx0] * norm;
+            kv.y = TURBO_CENTROIDS_2BIT[idx1] * norm;
+
+#ifdef V_DOT2_F32_F16_AVAILABLE
+            const half2 qv = ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]; // Q_v holds only this thread's pairs
+            ggml_cuda_mad(sum, make_float2(kv.x, kv.y), __half22float2(qv));
+#else
+            const float2 qv = ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]; // Q_v holds only this thread's pairs
+            sum += kv.x * qv.x + kv.y * qv.y;
+#endif // V_DOT2_F32_F16_AVAILABLE
+        }
+    }
+
+    return sum;
+}
+
+// Turbo4 KQ dot product: dequantize K from turbo4 blocks, dot with Q (float2/half2), return this thread's partial sum.
+// 4-bit nibble packed: qs[j/2] >> ((j%2)*4) & 0xF; Q_q8 and Q_ds_v are unused.
+template <int D, int nthreads>
+static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_turbo4_0(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
+
+    const block_turbo4_0 * K_turbo = (const block_turbo4_0 *) K_c;
+    GGML_UNUSED(Q_q8);
+    GGML_UNUSED(Q_ds_v);
+
+    constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
+    constexpr int cpy_ne = cpy_nb / 4; // number of 4-byte units per maximum-size copy
+
+    float sum = 0.0f;
+
+#pragma unroll
+    for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
+#pragma unroll
+        for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
+            const int k_KQ = k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne + k_KQ_1; // pair index within the K row
+
+            const int elem0 = k_KQ * 2;                   // always even
+            const int ib    = elem0 / QK_TURBO4;           // block index
+            const int j0    = elem0 % QK_TURBO4;           // always even
+
+            const float   norm    = __half2float(K_turbo[ib].norm);
+            // j0 and j0+1 are the two nibbles of one byte: j0/2 == (j0+1)/2 when j0 is even
+            const uint8_t qs_byte = K_turbo[ib].qs[j0 / 2];
+
+            const uint8_t idx0 = (qs_byte >> 0) & 0xF;    // low nibble = j0
+            const uint8_t idx1 = (qs_byte >> 4) & 0xF;    // high nibble = j0+1
+
+            float2 kv; // dequantized K pair: centroid * block norm
+            kv.x = TURBO_CENTROIDS_4BIT[idx0] * norm;
+            kv.y = TURBO_CENTROIDS_4BIT[idx1] * norm;
+
+#ifdef V_DOT2_F32_F16_AVAILABLE
+            const half2 qv = ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]; // Q_v holds only this thread's pairs
+            ggml_cuda_mad(sum, make_float2(kv.x, kv.y), __half22float2(qv));
+#else
+            const float2 qv = ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]; // Q_v holds only this thread's pairs
+            sum += kv.x * qv.x + kv.y * qv.y;
+#endif // V_DOT2_F32_F16_AVAILABLE
+        }
+    }
+
+    return sum;
+}
+
 template <typename Tds, int ni>
 static __device__ __forceinline__ void quantize_q8_1_to_shared(
    const float * __restrict__ x, const float scale, int * __restrict__ yq32, void * __restrict__ yds) {
@@ -577,6 +736,186 @@ static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict
    }
 }

+// Turbo3 V dequantize: write `ne` (2 or 4) consecutive values starting at element i0 to dst as T (half or float).
+//
+// Optimised for the ne==4 path (used by the VEC kernel with turbo3 V):
+// i0 is always a multiple of 4 from the VEC kernel access pattern, so all 4
+// elements share one qs byte and one signs byte — we load each once.
+template <typename T, int ne>
+static __device__ __forceinline__ void dequantize_V_turbo3_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
+    const block_turbo3_0 * x = (const block_turbo3_0 *) vx;
+
+    const int64_t ib   = i0 / QK_TURBO3;   // block containing element i0
+    const int     j0   = i0 % QK_TURBO3;   // offset of i0 inside that block
+    const float   norm = __half2float(x[ib].norm);
+
+    static_assert(ne == 2 || ne == 4, "bad ne");
+
+    if constexpr (ne == 4) {
+        // Precondition: j0 % 4 == 0 (always true from VEC kernel). Then all 4 elements share one
+        // qs byte (4 elements per byte) and one signs byte (8 elements per byte); the fixed shifts below rely on it.
+        const uint8_t qs_byte  = x[ib].qs[j0 / 4];
+        const uint8_t sgn_byte = x[ib].signs[j0 / 8];
+        const int     shift_s  = j0 % 8;   // 0 or 4
+
+        const uint8_t idx0 = ((qs_byte >> 0) & 0x3) | (((sgn_byte >> (shift_s+0)) & 0x1) << 2); // bits 0-1 from qs, bit 2 from signs
+        const uint8_t idx1 = ((qs_byte >> 2) & 0x3) | (((sgn_byte >> (shift_s+1)) & 0x1) << 2);
+        const uint8_t idx2 = ((qs_byte >> 4) & 0x3) | (((sgn_byte >> (shift_s+2)) & 0x1) << 2);
+        const uint8_t idx3 = ((qs_byte >> 6) & 0x3) | (((sgn_byte >> (shift_s+3)) & 0x1) << 2);
+
+#ifdef FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, half>) {
+            ((half2 *) dst)[0] = make_half2(
+                __float2half(TURBO_CENTROIDS_3BIT[idx0] * norm),
+                __float2half(TURBO_CENTROIDS_3BIT[idx1] * norm));
+            ((half2 *) dst)[1] = make_half2(
+                __float2half(TURBO_CENTROIDS_3BIT[idx2] * norm),
+                __float2half(TURBO_CENTROIDS_3BIT[idx3] * norm));
+        } else
+#endif // FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, float>) {
+            ((float2 *) dst)[0] = make_float2(
+                TURBO_CENTROIDS_3BIT[idx0] * norm,
+                TURBO_CENTROIDS_3BIT[idx1] * norm);
+            ((float2 *) dst)[1] = make_float2(
+                TURBO_CENTROIDS_3BIT[idx2] * norm,
+                TURBO_CENTROIDS_3BIT[idx3] * norm);
+        } else {
+            static_assert(std::is_same_v<T, void>, "unsupported type"); // rejects any T other than the handled ones at compile time
+        }
+    } else { // ne == 2: generic per-element helper, no alignment assumption beyond j0+1 staying inside the block
+#ifdef FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, half>) {
+            float v0 = turbo3_dequant_element(&x[ib], j0,   norm);
+            float v1 = turbo3_dequant_element(&x[ib], j0+1, norm);
+            ((half2 *) dst)[0] = make_half2(__float2half(v0), __float2half(v1));
+        } else
+#endif // FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, float>) {
+            ((float *) dst)[0] = turbo3_dequant_element(&x[ib], j0,   norm);
+            ((float *) dst)[1] = turbo3_dequant_element(&x[ib], j0+1, norm);
+        } else {
+            static_assert(std::is_same_v<T, void>, "unsupported type"); // rejects any T other than the handled ones at compile time
+        }
+    }
+}
+
+// Turbo2 V dequantize: write `ne` (2 or 4) consecutive values starting at element i0 to dst as T (half or float).
+template <typename T, int ne>
+static __device__ __forceinline__ void dequantize_V_turbo2_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
+    const block_turbo2_0 * x = (const block_turbo2_0 *) vx;
+
+    const int64_t ib   = i0 / QK_TURBO2;   // block containing element i0
+    const int     j0   = i0 % QK_TURBO2;   // offset of i0 inside that block
+    const float   norm = __half2float(x[ib].norm);
+
+    static_assert(ne == 2 || ne == 4, "bad ne");
+
+    if constexpr (ne == 4) {
+        const uint8_t qs_byte = x[ib].qs[j0 / 4]; // 4 two-bit indices per byte; the fixed shifts below require j0 % 4 == 0
+
+        const uint8_t idx0 = (qs_byte >> 0) & 0x3;
+        const uint8_t idx1 = (qs_byte >> 2) & 0x3;
+        const uint8_t idx2 = (qs_byte >> 4) & 0x3;
+        const uint8_t idx3 = (qs_byte >> 6) & 0x3;
+
+#ifdef FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, half>) {
+            ((half2 *) dst)[0] = make_half2(
+                __float2half(TURBO_CENTROIDS_2BIT[idx0] * norm),
+                __float2half(TURBO_CENTROIDS_2BIT[idx1] * norm));
+            ((half2 *) dst)[1] = make_half2(
+                __float2half(TURBO_CENTROIDS_2BIT[idx2] * norm),
+                __float2half(TURBO_CENTROIDS_2BIT[idx3] * norm));
+        } else
+#endif // FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, float>) {
+            ((float2 *) dst)[0] = make_float2(
+                TURBO_CENTROIDS_2BIT[idx0] * norm,
+                TURBO_CENTROIDS_2BIT[idx1] * norm);
+            ((float2 *) dst)[1] = make_float2(
+                TURBO_CENTROIDS_2BIT[idx2] * norm,
+                TURBO_CENTROIDS_2BIT[idx3] * norm);
+        } else {
+            static_assert(std::is_same_v<T, void>, "unsupported type"); // rejects any T other than the handled ones at compile time
+        }
+    } else { // ne == 2: generic per-element helper, no alignment assumption beyond j0+1 staying inside the block
+#ifdef FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, half>) {
+            float v0 = turbo2_dequant_element(&x[ib], j0,   norm);
+            float v1 = turbo2_dequant_element(&x[ib], j0+1, norm);
+            ((half2 *) dst)[0] = make_half2(__float2half(v0), __float2half(v1));
+        } else
+#endif // FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, float>) {
+            ((float *) dst)[0] = turbo2_dequant_element(&x[ib], j0,   norm);
+            ((float *) dst)[1] = turbo2_dequant_element(&x[ib], j0+1, norm);
+        } else {
+            static_assert(std::is_same_v<T, void>, "unsupported type"); // rejects any T other than the handled ones at compile time
+        }
+    }
+}
+
+// Turbo4 V dequantize: write `ne` (2 or 4) consecutive values starting at element i0 to dst as T (half or float).
+// 4-bit nibble packed (even element in the low nibble), block size 128.
+template <typename T, int ne>
+static __device__ __forceinline__ void dequantize_V_turbo4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
+    const block_turbo4_0 * x = (const block_turbo4_0 *) vx;
+
+    const int64_t ib   = i0 / QK_TURBO4;   // block containing element i0
+    const int     j0   = i0 % QK_TURBO4;   // offset of i0 inside that block
+    const float   norm = __half2float(x[ib].norm);
+
+    static_assert(ne == 2 || ne == 4, "bad ne");
+
+    if constexpr (ne == 4) {
+        // Precondition: j0 is a multiple of 4 (VEC kernel access pattern), so both bytes below lie in this block.
+        // 4 consecutive elements span 2 qs bytes: j0/2 and j0/2+1.
+        const uint8_t qs_byte0 = x[ib].qs[j0 / 2];      // elements j0, j0+1
+        const uint8_t qs_byte1 = x[ib].qs[j0 / 2 + 1];  // elements j0+2, j0+3
+
+        const uint8_t idx0 = (qs_byte0 >> 0) & 0xF;
+        const uint8_t idx1 = (qs_byte0 >> 4) & 0xF;
+        const uint8_t idx2 = (qs_byte1 >> 0) & 0xF;
+        const uint8_t idx3 = (qs_byte1 >> 4) & 0xF;
+
+#ifdef FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, half>) {
+            ((half2 *) dst)[0] = make_half2(
+                __float2half(TURBO_CENTROIDS_4BIT[idx0] * norm),
+                __float2half(TURBO_CENTROIDS_4BIT[idx1] * norm));
+            ((half2 *) dst)[1] = make_half2(
+                __float2half(TURBO_CENTROIDS_4BIT[idx2] * norm),
+                __float2half(TURBO_CENTROIDS_4BIT[idx3] * norm));
+        } else
+#endif // FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, float>) {
+            ((float2 *) dst)[0] = make_float2(
+                TURBO_CENTROIDS_4BIT[idx0] * norm,
+                TURBO_CENTROIDS_4BIT[idx1] * norm);
+            ((float2 *) dst)[1] = make_float2(
+                TURBO_CENTROIDS_4BIT[idx2] * norm,
+                TURBO_CENTROIDS_4BIT[idx3] * norm);
+        } else {
+            static_assert(std::is_same_v<T, void>, "unsupported type"); // rejects any T other than the handled ones at compile time
+        }
+    } else { // ne == 2: generic per-element helper, no alignment assumption beyond j0+1 staying inside the block
+#ifdef FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, half>) {
+            float v0 = turbo4_dequant_element(&x[ib], j0,   norm);
+            float v1 = turbo4_dequant_element(&x[ib], j0+1, norm);
+            ((half2 *) dst)[0] = make_half2(__float2half(v0), __float2half(v1));
+        } else
+#endif // FP16_AVAILABLE
+        if constexpr (std::is_same_v<T, float>) {
+            ((float *) dst)[0] = turbo4_dequant_element(&x[ib], j0,   norm);
+            ((float *) dst)[1] = turbo4_dequant_element(&x[ib], j0+1, norm);
+        } else {
+            static_assert(std::is_same_v<T, void>, "unsupported type"); // rejects any T other than the handled ones at compile time
+        }
+    }
+}
+
 template <ggml_type type_K, int D, int nthreads>
 constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
    if constexpr (type_K == GGML_TYPE_F16) {
@@ -593,6 +932,12 @@ constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
        return vec_dot_fattn_vec_KQ_q8_0<D, nthreads>;
    } else if constexpr (type_K == GGML_TYPE_BF16) {
        return vec_dot_fattn_vec_KQ_bf16<D, nthreads>;
+    } else if constexpr (type_K == GGML_TYPE_TURBO3_0) {
+        return vec_dot_fattn_vec_KQ_turbo3_0<D, nthreads>;
+    } else if constexpr (type_K == GGML_TYPE_TURBO2_0) {
+        return vec_dot_fattn_vec_KQ_turbo2_0<D, nthreads>;
+    } else if constexpr (type_K == GGML_TYPE_TURBO4_0) {
+        return vec_dot_fattn_vec_KQ_turbo4_0<D, nthreads>;
    } else {
        static_assert(type_K == -1, "bad type");
        return nullptr;
@@ -615,6 +960,12 @@ constexpr __device__ dequantize_V_t get_dequantize_V() {
        return dequantize_V_q8_0<T, ne>;
    } else if constexpr (type_V == GGML_TYPE_BF16) {
        return dequantize_V_bf16<float, ne>;
+    } else if constexpr (type_V == GGML_TYPE_TURBO3_0) {
+        return dequantize_V_turbo3_0<T, ne>;
+    } else if constexpr (type_V == GGML_TYPE_TURBO2_0) {
+        return dequantize_V_turbo2_0<T, ne>;
+    } else if constexpr (type_V == GGML_TYPE_TURBO4_0) {
+        return dequantize_V_turbo4_0<T, ne>;
    } else {
        static_assert(type_V == -1, "bad type");
        return nullptr;
@@ -946,8 +1297,32 @@ void launch_fattn(
    const int cc  = ggml_cuda_info().devices[id].cc;
    const int nsm = ggml_cuda_info().devices[id].nsm;

+#ifdef GGML_USE_HIP
+    // HIP/ROCm: bypass the memory pool for f16 temp buffers.
+    // The legacy pool (ggml_cuda_pool_leg) retains peak-sized allocations permanently.
+    // For quantized KV dequant, this means the f16 temp buffer stays allocated,
+    // consuming more VRAM than the quantized KV compression saves — causing OOM.
+    // Using raw alloc+free ensures the memory is released after the kernel completes.
+    struct hip_f16_alloc { // owns one raw device buffer of half values, released when this object goes out of scope
+        half * ptr = nullptr;   // stays nullptr until alloc() is called
+        cudaStream_t stream;    // synchronized in the destructor before ptr is freed
+        explicit hip_f16_alloc(cudaStream_t s) : stream(s) {}
+        hip_f16_alloc(const hip_f16_alloc &) = delete;             // sole owner: a copy would cudaFree ptr twice
+        hip_f16_alloc & operator=(const hip_f16_alloc &) = delete; // same reason
+        ~hip_f16_alloc() {
+            if (ptr) {
+                cudaStreamSynchronize(stream); // work queued on the stream may still be using the buffer
+                cudaFree(ptr);
+            }
+        }
+        void alloc(size_t nelements) { CUDA_CHECK(cudaMalloc(&ptr, nelements * sizeof(half))); } // call at most once: a second call would overwrite (leak) the previous buffer
+    };
+    hip_f16_alloc K_f16(main_stream);
+    hip_f16_alloc V_f16(main_stream);
+#else
    ggml_cuda_pool_alloc<half>   K_f16(pool);
    ggml_cuda_pool_alloc<half>   V_f16(pool);
+#endif
    ggml_cuda_pool_alloc<int>    KV_max(pool);
    ggml_cuda_pool_alloc<float>  dst_tmp(pool);
    ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
@@ -50,10 +50,17 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
            GGML_ASSERT(V->ne[0] == K->ne[0]);
            ggml_cuda_flash_attn_ext_tile_case<512, 512>(ctx, dst);
        } break;
+#ifndef GGML_USE_HIP
+        // D>=576 tile kernels exceed HIP local memory limit (67584 > 65536)
        case 576: {
            GGML_ASSERT(V->ne[0] == 512);
            ggml_cuda_flash_attn_ext_tile_case<576, 512>(ctx, dst);
        } break;
+        case 640: {
+            GGML_ASSERT(V->ne[0] == 512);
+            ggml_cuda_flash_attn_ext_tile_case<640, 512>(ctx, dst);
+        } break;
+#endif
        default: {
            GGML_ABORT("Unsupported head size");
        } break;
@@ -84,6 +84,10 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512,  4, 128, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512,  8, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 16, 256, 2,  64,  64)
+
    return 0;
 }

@@ -152,6 +156,10 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  32,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512,  4, 128, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512,  8, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 16, 256, 2,  32,  64)
+
    return 0;
 }

@@ -229,6 +237,11 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512,  4, 128, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512,  8, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 16, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 32, 512, 1, 128,  64)
+
    return 0;
 }

@@ -306,6 +319,11 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512,  4, 128, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512,  8, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 16, 256, 4,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(640, 512, 32, 256, 2, 128,  64)
+
    return 0;
 }

@@ -1239,7 +1257,7 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
    const int gqa_ratio = Q->ne[2] / K->ne[2];

    // On NVIDIA (Pascal and older) the GQA optimizations seem to be detrimental in some cases.
-    // However, for DKQ == 576, DV == 512 only the kernel variant with GQA optimizations is implemented.
+    // However, for DKQ == 576/640, DV == 512 only the kernel variant with GQA optimizations is implemented.
    const bool nvidia = GGML_CUDA_CC_IS_NVIDIA(ggml_cuda_info().devices[ggml_cuda_get_device()].cc);
    const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
    const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;
@@ -1345,3 +1363,4 @@ extern DECL_FATTN_TILE_CASE(256, 256);
 extern DECL_FATTN_TILE_CASE(320, 256);
 extern DECL_FATTN_TILE_CASE(512, 512);
 extern DECL_FATTN_TILE_CASE(576, 512);
+extern DECL_FATTN_TILE_CASE(640, 512);
@@ -17,7 +17,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
 #pragma clang diagnostic ignored "-Wpass-failed"
 #endif // __clang__
 template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
-__launch_bounds__(ggml_cuda_fattn_vec_get_nthreads_device(), 1)
+__launch_bounds__(ggml_cuda_fattn_vec_get_nthreads_device(), 2)
 static __global__ void flash_attn_ext_vec(
        const char * __restrict__ Q,
        const char * __restrict__ K,
@@ -75,17 +75,30 @@ static __global__ void flash_attn_ext_vec(
 #endif // GGML_USE_HIP

    constexpr int nthreads    = ggml_cuda_fattn_vec_get_nthreads_device();
-    constexpr int nthreads_KQ = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_KQ_q;
-    constexpr int nthreads_V  = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_V_q;
+    // Turbo types (turbo2/turbo3/turbo4) use the float Q path (like f16/bf16), not the q8_1 integer path
+    constexpr bool K_is_unquantized = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16 || type_K == GGML_TYPE_TURBO3_0 || type_K == GGML_TYPE_TURBO2_0 || type_K == GGML_TYPE_TURBO4_0);
+    constexpr bool V_is_unquantized = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16 || type_V == GGML_TYPE_TURBO3_0 || type_V == GGML_TYPE_TURBO2_0 || type_V == GGML_TYPE_TURBO4_0);
+    constexpr bool K_is_turbo = (type_K == GGML_TYPE_TURBO3_0 || type_K == GGML_TYPE_TURBO2_0 || type_K == GGML_TYPE_TURBO4_0);
+    // Turbo KQ dot does byte extraction + centroid lookup + scalar mul, not vectorized f16 loads.
+    // nthreads_KQ=1: each thread computes a full KQ product alone — eliminates warp_reduce_sum
+    // shuffle and halves KQ loop iterations. Each thread holds full Q vector in registers.
+    constexpr int nthreads_KQ = K_is_turbo ? 1 : (K_is_unquantized ? 128 / cpy_nb : nthreads_KQ_q);
+    constexpr bool V_is_turbo = (type_V == GGML_TYPE_TURBO3_0 || type_V == GGML_TYPE_TURBO2_0 || type_V == GGML_TYPE_TURBO4_0);
+    // Turbo V dequant is scalar (byte extract + LUT), not vectorized loads.
+    // For turbo V types nthreads_V is nthreads_V_q / 8, clamped to at least 1. Because
+    // V_cols_per_iter = WARP_SIZE / nthreads_V, every outer-loop iteration then covers up to
+    // 8x as many V positions as it would with nthreads_V_q threads per position, reducing
+    // loop overhead and exposing more ILP from concurrent V rows.
+    constexpr int nthreads_V  = V_is_unquantized ? (V_is_turbo ? (nthreads_V_q / 8 < 1 ? 1 : nthreads_V_q / 8) : 128 / cpy_nb) : nthreads_V_q;

    static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K");
    static_assert(WARP_SIZE % nthreads_V  == 0, "bad nthreads_V");

-    constexpr int V_rows_per_thread = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 2*cpy_ne : 4;
+    constexpr int V_rows_per_thread = V_is_unquantized ? ((type_V == GGML_TYPE_TURBO3_0 || type_V == GGML_TYPE_TURBO2_0 || type_V == GGML_TYPE_TURBO4_0) ? 4 : 2*cpy_ne) : 4;
    constexpr int V_cols_per_iter   = WARP_SIZE / nthreads_V;

    constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ<type_K, D, nthreads_KQ>();
-    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16 && type_K != GGML_TYPE_BF16;
+    constexpr bool Q_q8_1 = !K_is_unquantized;
 #ifdef V_DOT2_F32_F16_AVAILABLE
    constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, half,  V_rows_per_thread>();
 #else
@@ -120,6 +133,23 @@ static __global__ void flash_attn_ext_vec(
    __shared__ float  KQ[ne_KQ > ne_combine ? ne_KQ : ne_combine];
 #endif // V_DOT2_F32_F16_AVAILABLE

+    // Shared-memory LUT for turbo KQ scoring: precompute Q[d] * centroid[c] once,
+    // then the hot loop does turbo_lut[d][idx] (shmem read, no multiply).
+    // turbo4 excluded: 16 centroids × D exceeds shmem budget.
+    // Stride = n_centroids+1 to avoid bank conflicts.
+    constexpr int n_centroids_lut = (D <= 256 && type_K == GGML_TYPE_TURBO3_0) ? 8 :
+                                    (D <= 256 && type_K == GGML_TYPE_TURBO2_0) ? 4 : 0;
+    constexpr int lut_stride = n_centroids_lut > 0 ? n_centroids_lut + 1 : 1;
+    __shared__ half turbo_lut[n_centroids_lut > 0 ? D : 1][lut_stride];
+
+    // Sparse V: skip V dequant for positions with negligible attention weights.
+    // At long context, most V positions contribute < 1e-6 to the output — skipping
+    // their dequant saves significant compute (especially for quantized V types).
+    constexpr float sparse_v_threshold_f = 1e-6f;
+#ifdef V_DOT2_F32_F16_AVAILABLE
+    const     half  sparse_v_threshold_h = __float2half(sparse_v_threshold_f);
+#endif
+
    float KQ_max[ncols];
    float KQ_sum[ncols];
 #pragma unroll
@@ -236,6 +266,20 @@ static __global__ void flash_attn_ext_vec(
 #endif // V_DOT2_F32_F16_AVAILABLE
    }

+    // Build shared-memory LUT: turbo_lut[d][c] = half(Q[d] * scale * centroid[c])
+    if constexpr (n_centroids_lut > 0 && ncols == 1) {
+        const float * centroids_ptr = (type_K == GGML_TYPE_TURBO3_0) ? TURBO_CENTROIDS_3BIT :
+                                      TURBO_CENTROIDS_2BIT;
+        const float * Q_f = (const float *)(Q + 0*nb01);
+        for (int d = tid; d < D; d += nthreads) {
+            const float q_val = Q_f[d] * scale;
+            for (int c = 0; c < n_centroids_lut; c++) {
+                turbo_lut[d][c] = __float2half(q_val * centroids_ptr[c]);
+            }
+        }
+        __syncthreads();
+    }
+
    const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
    K     += blockIdx.y*nthreads * nb11;
    V     += blockIdx.y*nthreads * nb21;
@@ -259,8 +303,50 @@ static __global__ void flash_attn_ext_vec(

 #pragma unroll
            for (int j = 0; j < ncols; ++j) {
-                float sum = vec_dot_KQ(K + i_KQ*nb11, Q_reg[j], Q_i32[j], Q_ds[j]);
-                sum = warp_reduce_sum<nthreads_KQ>(sum);
+                float sum;
+                if constexpr (n_centroids_lut > 0 && ncols == 1 && type_K == GGML_TYPE_TURBO3_0) {
+                    // LUT scoring: 8 elements per iteration (2 qs bytes + 1 signs byte)
+                    const block_turbo3_0 * K_turbo = (const block_turbo3_0 *)(K + i_KQ*nb11);
+                    sum = 0.0f;
+                    for (int d0 = 0; d0 < D; d0 += 8) {
+                        const int ib = d0 / QK_TURBO3;
+                        const int jj = d0 % QK_TURBO3;
+                        const float norm = __half2float(K_turbo[ib].norm);
+                        const uint8_t qs0 = K_turbo[ib].qs[jj / 4];
+                        const uint8_t qs1 = K_turbo[ib].qs[jj / 4 + 1];
+                        const uint8_t sgn = K_turbo[ib].signs[jj / 8];
+                        sum += (__half2float(turbo_lut[d0  ][((qs0>>0)&3)|((sgn>>0&1)<<2)]) +
+                                __half2float(turbo_lut[d0+1][((qs0>>2)&3)|((sgn>>1&1)<<2)]) +
+                                __half2float(turbo_lut[d0+2][((qs0>>4)&3)|((sgn>>2&1)<<2)]) +
+                                __half2float(turbo_lut[d0+3][((qs0>>6)&3)|((sgn>>3&1)<<2)]) +
+                                __half2float(turbo_lut[d0+4][((qs1>>0)&3)|((sgn>>4&1)<<2)]) +
+                                __half2float(turbo_lut[d0+5][((qs1>>2)&3)|((sgn>>5&1)<<2)]) +
+                                __half2float(turbo_lut[d0+6][((qs1>>4)&3)|((sgn>>6&1)<<2)]) +
+                                __half2float(turbo_lut[d0+7][((qs1>>6)&3)|((sgn>>7&1)<<2)])) * norm;
+                    }
+                } else if constexpr (n_centroids_lut > 0 && ncols == 1 && type_K == GGML_TYPE_TURBO2_0) {
+                    // LUT scoring for turbo2: 8 elements per iteration (2 qs bytes, no signs)
+                    const block_turbo2_0 * K_turbo = (const block_turbo2_0 *)(K + i_KQ*nb11);
+                    sum = 0.0f;
+                    for (int d0 = 0; d0 < D; d0 += 8) {
+                        const int ib = d0 / QK_TURBO2;
+                        const int jj = d0 % QK_TURBO2;
+                        const float norm = __half2float(K_turbo[ib].norm);
+                        const uint8_t qs0 = K_turbo[ib].qs[jj / 4];
+                        const uint8_t qs1 = K_turbo[ib].qs[jj / 4 + 1];
+                        sum += (__half2float(turbo_lut[d0  ][(qs0>>0)&3]) +
+                                __half2float(turbo_lut[d0+1][(qs0>>2)&3]) +
+                                __half2float(turbo_lut[d0+2][(qs0>>4)&3]) +
+                                __half2float(turbo_lut[d0+3][(qs0>>6)&3]) +
+                                __half2float(turbo_lut[d0+4][(qs1>>0)&3]) +
+                                __half2float(turbo_lut[d0+5][(qs1>>2)&3]) +
+                                __half2float(turbo_lut[d0+6][(qs1>>4)&3]) +
+                                __half2float(turbo_lut[d0+7][(qs1>>6)&3])) * norm;
+                    }
+                } else {
+                    sum = vec_dot_KQ(K + i_KQ*nb11, Q_reg[j], Q_i32[j], Q_ds[j]);
+                    sum = warp_reduce_sum<nthreads_KQ>(sum);
+                }

                if (use_logit_softcap) {
                    sum = logit_softcap*tanhf(sum);
@@ -284,12 +370,12 @@ static __global__ void flash_attn_ext_vec(
            for (int offset = nthreads_KQ; offset < WARP_SIZE; offset <<= 1) {
                KQ_max_new[j] = fmaxf(KQ_max_new[j], __shfl_xor_sync(0xFFFFFFFF, KQ_max_new[j], offset, WARP_SIZE));
            }
-            const float KQ_max_scale = expf(KQ_max[j] - KQ_max_new[j]);
+            const float KQ_max_scale = __expf(KQ_max[j] - KQ_max_new[j]);
            KQ_max[j] = KQ_max_new[j];

-            KQ_reg[j] = expf(KQ_reg[j] - KQ_max[j]);
+            KQ_reg[j] = __expf(KQ_reg[j] - KQ_max[j]);
            KQ_sum[j] = KQ_sum[j]*KQ_max_scale + KQ_reg[j];
-            KQ[j*nthreads + tid] = KQ_reg[j];
+            if constexpr (!V_is_turbo) { KQ[j*nthreads + tid] = KQ_reg[j]; }

 #ifdef V_DOT2_F32_F16_AVAILABLE
            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
@@ -307,7 +393,7 @@ static __global__ void flash_attn_ext_vec(
        }

 #ifndef GGML_USE_HIP
-        __syncwarp();
+        if constexpr (!V_is_turbo) { __syncwarp(); }
 #endif // GGML_USE_HIP

 #pragma unroll
@@ -318,8 +404,29 @@ static __global__ void flash_attn_ext_vec(
            half2 KQ_k[ncols];
 #pragma unroll
            for (int j = 0; j < ncols; ++j) {
-                KQ_k[j] = __half2half2(KQ[j*nthreads + k]);
+                if constexpr (V_is_turbo) {
+                    const float kq_val = __shfl_sync(0xFFFFFFFF, KQ_reg[j], k0 + (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V));
+                    KQ_k[j] = make_half2(__float2half(kq_val), __float2half(kq_val));
+                } else {
+                    KQ_k[j] = __half2half2(KQ[j*nthreads + k]);
+                }
            }
+
+            // Sparse V: skip V dequant if all attention weights for this position are negligible.
+            // For turbo types, the check is compiled out: at typical decode context lengths
+            // (< ~4K tokens) with threshold 1e-6, no positions are ever skipped, so the
+            // per-position branch is pure overhead (misprediction + comparison cost). This
+            // also dodges the warp-divergence regression on turbo paths that motivated the
+            // April 24 revert (commit f2dc968).
+            if constexpr (!V_is_turbo) {
+                bool dominated = true;
+#pragma unroll
+                for (int j = 0; j < ncols; ++j) {
+                    if (__hgt(__low2half(KQ_k[j]), sparse_v_threshold_h)) { dominated = false; break; }
+                }
+                if (dominated) { continue; }
+            }
+
 #pragma unroll
            for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
                half2 tmp[V_rows_per_thread/2];
@@ -347,19 +454,142 @@ static __global__ void flash_attn_ext_vec(
            float KQ_k[ncols];
 #pragma unroll
            for (int j = 0; j < ncols; ++j) {
-                KQ_k[j] = KQ[j*nthreads + k];
+                if constexpr (V_is_turbo) {
+                    KQ_k[j] = __shfl_sync(0xFFFFFFFF, KQ_reg[j], k0 + (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V));
+                } else {
+                    KQ_k[j] = KQ[j*nthreads + k];
+                }
            }
+
+            // Sparse V: skip V dequant if all attention weights for this position are negligible.
+            // Compiled out for turbo types — see half2 path comment above.
+            if constexpr (!V_is_turbo) {
+                bool dominated = true;
 #pragma unroll
-            for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
-                float2 tmp[V_rows_per_thread/2];
-                dequantize_V(V + k*nb21, tmp,
-                    2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
+                for (int j = 0; j < ncols; ++j) {
+                    if (KQ_k[j] >= sparse_v_threshold_f) { dominated = false; break; }
+                }
+                if (dominated) { continue; }
+            }
+
+            // Turbo V path: precompute scaled centroids once per block to eliminate
+            // per-element norm multiply.  centroid[idx]*norm is computed 8/4/16 times
+            // (once per centroid) instead of D times (once per element).
+            if constexpr (type_V == GGML_TYPE_TURBO3_0) {
+                const block_turbo3_0 * vb = (const block_turbo3_0 *)(V + k*nb21);
+                int prev_ib = -1;
+                float sc[8];
+
 #pragma unroll
-                for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
+                for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
+                    const int i0 = 2*i_VKQ_0 + (threadIdx.x % nthreads_V)*V_rows_per_thread;
+                    const int ib = i0 / QK_TURBO3;
+                    const int j0 = i0 % QK_TURBO3;
+
+                    if (ib != prev_ib) {
+                        prev_ib = ib;
+                        const float norm = __half2float(vb[ib].norm);
+#pragma unroll
+                        for (int c = 0; c < 8; ++c) { sc[c] = TURBO_CENTROIDS_3BIT[c] * norm; }
+                    }
+
+                    const uint8_t qs_byte  = vb[ib].qs[j0 / 4];
+                    const uint8_t sgn_byte = vb[ib].signs[j0 / 8];
+                    const int     shift_s  = j0 % 8;
+
+                    const uint8_t idx0 = ((qs_byte >> 0) & 0x3) | (((sgn_byte >> (shift_s+0)) & 0x1) << 2);
+                    const uint8_t idx1 = ((qs_byte >> 2) & 0x3) | (((sgn_byte >> (shift_s+1)) & 0x1) << 2);
+                    const uint8_t idx2 = ((qs_byte >> 4) & 0x3) | (((sgn_byte >> (shift_s+2)) & 0x1) << 2);
+                    const uint8_t idx3 = ((qs_byte >> 6) & 0x3) | (((sgn_byte >> (shift_s+3)) & 0x1) << 2);
+
 #pragma unroll
                    for (int j = 0; j < ncols; ++j) {
-                        VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].x += tmp[i_VKQ_1].x*KQ_k[j];
-                        VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].y += tmp[i_VKQ_1].y*KQ_k[j];
+                        VKQ[j][i_VKQ_0/nthreads_V + 0].x += sc[idx0]*KQ_k[j];
+                        VKQ[j][i_VKQ_0/nthreads_V + 0].y += sc[idx1]*KQ_k[j];
+                        VKQ[j][i_VKQ_0/nthreads_V + 1].x += sc[idx2]*KQ_k[j];
+                        VKQ[j][i_VKQ_0/nthreads_V + 1].y += sc[idx3]*KQ_k[j];
+                    }
+                }
+            } else if constexpr (type_V == GGML_TYPE_TURBO2_0) {
+                const block_turbo2_0 * vb = (const block_turbo2_0 *)(V + k*nb21);
+                int prev_ib = -1;
+                float sc[4];
+
+#pragma unroll
+                for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
+                    const int i0 = 2*i_VKQ_0 + (threadIdx.x % nthreads_V)*V_rows_per_thread;
+                    const int ib = i0 / QK_TURBO2;
+                    const int j0 = i0 % QK_TURBO2;
+
+                    if (ib != prev_ib) {
+                        prev_ib = ib;
+                        const float norm = __half2float(vb[ib].norm);
+#pragma unroll
+                        for (int c = 0; c < 4; ++c) { sc[c] = TURBO_CENTROIDS_2BIT[c] * norm; }
+                    }
+
+                    const uint8_t qs_byte = vb[ib].qs[j0 / 4];
+
+                    const uint8_t idx0 = (qs_byte >> 0) & 0x3;
+                    const uint8_t idx1 = (qs_byte >> 2) & 0x3;
+                    const uint8_t idx2 = (qs_byte >> 4) & 0x3;
+                    const uint8_t idx3 = (qs_byte >> 6) & 0x3;
+
+#pragma unroll
+                    for (int j = 0; j < ncols; ++j) {
+                        VKQ[j][i_VKQ_0/nthreads_V + 0].x += sc[idx0]*KQ_k[j];
+                        VKQ[j][i_VKQ_0/nthreads_V + 0].y += sc[idx1]*KQ_k[j];
+                        VKQ[j][i_VKQ_0/nthreads_V + 1].x += sc[idx2]*KQ_k[j];
+                        VKQ[j][i_VKQ_0/nthreads_V + 1].y += sc[idx3]*KQ_k[j];
+                    }
+                }
+            } else if constexpr (type_V == GGML_TYPE_TURBO4_0) {
+                const block_turbo4_0 * vb = (const block_turbo4_0 *)(V + k*nb21);
+                int prev_ib = -1;
+                float sc[16];
+
+#pragma unroll
+                for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
+                    const int i0 = 2*i_VKQ_0 + (threadIdx.x % nthreads_V)*V_rows_per_thread;
+                    const int ib = i0 / QK_TURBO4;
+                    const int j0 = i0 % QK_TURBO4;
+
+                    if (ib != prev_ib) {
+                        prev_ib = ib;
+                        const float norm = __half2float(vb[ib].norm);
+#pragma unroll
+                        for (int c = 0; c < 16; ++c) { sc[c] = TURBO_CENTROIDS_4BIT[c] * norm; }
+                    }
+
+                    const uint8_t qs_byte0 = vb[ib].qs[j0 / 2];
+                    const uint8_t qs_byte1 = vb[ib].qs[j0 / 2 + 1];
+
+                    const uint8_t idx0 = (qs_byte0 >> 0) & 0xF;
+                    const uint8_t idx1 = (qs_byte0 >> 4) & 0xF;
+                    const uint8_t idx2 = (qs_byte1 >> 0) & 0xF;
+                    const uint8_t idx3 = (qs_byte1 >> 4) & 0xF;
+
+#pragma unroll
+                    for (int j = 0; j < ncols; ++j) {
+                        VKQ[j][i_VKQ_0/nthreads_V + 0].x += sc[idx0]*KQ_k[j];
+                        VKQ[j][i_VKQ_0/nthreads_V + 0].y += sc[idx1]*KQ_k[j];
+                        VKQ[j][i_VKQ_0/nthreads_V + 1].x += sc[idx2]*KQ_k[j];
+                        VKQ[j][i_VKQ_0/nthreads_V + 1].y += sc[idx3]*KQ_k[j];
+                    }
+                }
+            } else {
+#pragma unroll
+                for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
+                    float2 tmp[V_rows_per_thread/2];
+                    dequantize_V(V + k*nb21, tmp,
+                        2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
+#pragma unroll
+                    for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
+#pragma unroll
+                        for (int j = 0; j < ncols; ++j) {
+                            VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].x += tmp[i_VKQ_1].x*KQ_k[j];
+                            VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].y += tmp[i_VKQ_1].y*KQ_k[j];
+                        }
                    }
                }
            }
@@ -379,10 +609,10 @@ static __global__ void flash_attn_ext_vec(
            }

            const float kqmax_new_j = fmaxf(sink, KQ_max[j]);
-            const float KQ_max_scale = expf(KQ_max[j] - kqmax_new_j);
+            const float KQ_max_scale = __expf(KQ_max[j] - kqmax_new_j);
            KQ_max[j] = kqmax_new_j;

-            KQ_sum[j] = KQ_sum[j]*KQ_max_scale + (threadIdx.x == 0 ? expf(sink - KQ_max[j]) : 0.0f);
+            KQ_sum[j] = KQ_sum[j]*KQ_max_scale + (threadIdx.x == 0 ? __expf(sink - KQ_max[j]) : 0.0f);

 #ifdef V_DOT2_F32_F16_AVAILABLE
            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
@@ -428,7 +658,7 @@ static __global__ void flash_attn_ext_vec(

        float kqmax_new = KQ_max_shared[j_VKQ][threadIdx.x];
        kqmax_new = warp_reduce_max(kqmax_new);
-        const float kqmax_scale = expf(KQ_max[j_VKQ] - kqmax_new);
+        const float kqmax_scale = __expf(KQ_max[j_VKQ] - kqmax_new);
        KQ_max[j_VKQ] = kqmax_new;

 #ifdef V_DOT2_F32_F16_AVAILABLE
@@ -598,3 +828,99 @@ EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_BF16)
+
+// TurboQuant3 — turbo3 K + turbo3 V (KV cache uses same type)
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
+
+// Mixed turbo3/q8_0 KV cache types
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
+
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
+
+// Mixed f16/turbo3 KV cache types
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
+
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
+
+// TurboQuant2 -- turbo2 K + turbo2 V
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
+
+// Mixed turbo2/q8_0 KV cache types
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
+
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
+
+// Mixed f16/turbo2 KV cache types
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
+
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
+
+// Mixed turbo3/turbo2 KV cache types
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
+
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
+
+// TurboQuant4 — turbo4 K + turbo4 V (KV cache uses same type)
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+
+// Mixed turbo4/q8_0 KV cache types
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
+
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
+
+// Mixed f16/turbo4 KV cache types
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
+
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
+
+// Mixed turbo4/turbo3 KV cache types
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
+
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
+
+// Mixed turbo4/turbo2 KV cache types
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
+
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
@@ -58,6 +58,8 @@
 #include "ggml-cuda/gated_delta_net.cuh"
 #include "ggml-cuda/set.cuh"
 #include "ggml-cuda/set-rows.cuh"
+#include "ggml-cuda/turbo-wht.cuh"
+#include "ggml-cuda/mmvq-tq.cuh"
 #include "ggml-cuda/pad_reflect_1d.cuh"
 #include "ggml-cuda/solve_tri.cuh"
 #include "ggml-cuda/tri.cuh"
@@ -679,10 +681,49 @@ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer,
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }

+// Returns true when TQ4_1S tensors should be converted to q8_0 at load time (ON by default, for best prefill speed).
+// Authors' rationale: native TQ4_1S decode is faster (+29-33%) but prefill is 2× slower because
+// cuBLAS dequant-to-f16 requires per-element inverse WHT.
+// Opt-out: set GGML_TQ_NATIVE=1 for decode-heavy workloads (saves 1.7× VRAM).
+static bool ggml_tq_convert_q8() {
+    static int val = -1;  // -1 = environment not read yet; NOTE(review): lazy init is unsynchronized — confirm callers cannot race
+    if (val == -1) {
+        const char * env = getenv("GGML_TQ_NATIVE");
+        val = (env && env[0] == '1') ? 0 : 1;  // default ON; GGML_TQ_NATIVE starting with '1' disables (only the first character is checked); cached for later calls
+    }
+    return val == 1;
+}
+
 static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;

    ggml_cuda_set_device(ctx->device);
+    // TQ4_1S → q8_0 load-time conversion: on by default, disabled with GGML_TQ_NATIVE=1 (see ggml_tq_convert_q8). Only whole-tensor uploads (offset 0, full size) are converted.
+    if (ggml_tq_convert_q8() && tensor->type == GGML_TYPE_TQ4_1S && offset == 0 && size == ggml_nbytes(tensor)) {
+        const int64_t n_elements = ggml_nelements(tensor);
+
+        // Stage the raw TQ4_1S bytes in a temporary device buffer (freed below once the conversion has been synchronized)
+        void * tmp_tq4;
+        CUDA_CHECK(cudaMalloc(&tmp_tq4, size));
+        CUDA_CHECK(cudaMemcpyAsync(tmp_tq4, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+
+        // Convert TQ4_1S (tmp) → q8_0 directly into tensor->data; this relies on tensor->data having a q8_0-sized allocation
+        ggml_cuda_convert_tq4_1s_to_q8_0(tmp_tq4, tensor->data, n_elements, cudaStreamPerThread);
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+
+        CUDA_CHECK(cudaFree(tmp_tq4));
+
+        // Retag the tensor as q8_0 and recompute its byte strides nb[] to match the q8_0 layout
+        tensor->type = GGML_TYPE_Q8_0;
+        tensor->nb[0] = ggml_type_size(GGML_TYPE_Q8_0);
+        tensor->nb[1] = tensor->nb[0] * (tensor->ne[0] / ggml_blck_size(GGML_TYPE_Q8_0));
+        for (int i = 2; i < GGML_MAX_DIMS; i++) {
+            tensor->nb[i] = tensor->nb[i-1] * tensor->ne[i-1];
+        }
+
+        return;
+    }
+
    CUDA_CHECK(cudaMemcpyAsync((char *) tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
@@ -803,6 +844,13 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
    size_t size = ggml_nbytes(tensor);
    int64_t ne0 = tensor->ne[0];

+    // TQ4_1S → q8_0 load-time conversion: allocate q8_0-sized space whenever the conversion is enabled (default; GGML_TQ_NATIVE=1 disables it)
+    if (ggml_tq_convert_q8() && tensor->type == GGML_TYPE_TQ4_1S) {
+        // q8_0 block: 34 bytes per 32 elements. TQ4_1S block: 20 bytes per 32 elements.
+        const int64_t n_blocks = ggml_nelements(tensor) / QK_TQ4_1S;
+        size = n_blocks * sizeof(block_q8_0);
+    }
+
    if (ggml_is_quantized(tensor->type)) {
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
@@ -2505,7 +2553,9 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
                                   ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) &&
                                   src0->view_src;

-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
+    const bool is_tq_weight = (src0->type == GGML_TYPE_TQ4_1S || src0->type == GGML_TYPE_TQ3_1S);
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && !is_tq_weight &&
+                             src1->type == GGML_TYPE_F32 &&
                             dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;

    // fusion is not universally faster on Pascal
@@ -2547,10 +2597,12 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
    bool use_mul_mat_f     = !ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
+    // TQ weight types never take the mmvq/mmq paths; they are dispatched separately below (fused TQ kernel for small batches, cuBLAS-based paths otherwise)
+    const bool is_tq_weight = (src0->type == GGML_TYPE_TQ4_1S || src0->type == GGML_TYPE_TQ3_1S);
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && !is_tq_weight
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-    bool use_mul_mat_q     = ggml_is_quantized(src0->type) && !bad_padding_clear
+    bool use_mul_mat_q     = ggml_is_quantized(src0->type) && !bad_padding_clear && !is_tq_weight
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

    bool any_gpus_with_slow_fp16 = false;
@@ -2614,6 +2666,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
    } else if (use_mul_mat_q) {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
+    } else if (!split && is_tq_weight && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE) {
+        // Fused TQ weight mul_mat with pre-rotated activations via warp shuffle WHT
+        // Handles ne[1]=1 (decode) and ne[1]≤8 (multi-token / speculative decoding)
+        ggml_cuda_mul_mat_tq(ctx, src0, src1, dst);
+    } else if (!split && is_tq_weight && src0->type == GGML_TYPE_TQ4_1S) {
+        // Large prefill: runtime TQ4_1S → q8_0 scratch conversion + cuBLAS
+        // Gets tensor core throughput without permanent 1.7× VRAM cost
+        ggml_cuda_mul_mat_tq4_1s_cublas(ctx, src0, src1, dst);
    } else {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
    }
@@ -2633,16 +2693,18 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;

    // [TAG_MUL_MAT_ID_CUDA_GRAPHS]
+    // TQ weight types use dequant-to-f16 cuBLAS path only (no mmvq/mmq kernels)
+    const bool is_tq_weight_id = (src0->type == GGML_TYPE_TQ4_1S || src0->type == GGML_TYPE_TQ3_1S);
    if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
        if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
-            if (ggml_is_quantized(src0->type)) {
+            if (ggml_is_quantized(src0->type) && !is_tq_weight_id) {
                const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc);
                if (ne2 <= mmvq_mmid_max) {
                    ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
                    return;
                }
-            } else {
+            } else if (!ggml_is_quantized(src0->type)) {
                if (GGML_CUDA_CC_IS_AMD(cc)) {
                    ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
                    return;
@@ -2800,6 +2862,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_SET_ROWS:
            ggml_cuda_op_set_rows(ctx, dst);
            break;
+        case GGML_OP_TURBO_WHT:
+            ggml_cuda_turbo_wht(ctx, dst);
+            break;
        case GGML_OP_SET:
            ggml_cuda_op_set(ctx, dst);
            break;
@@ -3258,10 +3323,11 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
        }

        // [TAG_MUL_MAT_ID_CUDA_GRAPHS]
+        const bool is_tq_w = (node->src[0]->type == GGML_TYPE_TQ4_1S || node->src[0]->type == GGML_TYPE_TQ3_1S);
        if (node->op == GGML_OP_MUL_MAT_ID) {
            const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
            const int mmvq_mmid_max = get_mmvq_mmid_max_batch(node->src[0]->type, cc);
-            if (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > mmvq_mmid_max) {
+            if (!ggml_is_quantized(node->src[0]->type) || is_tq_w || node->ne[2] > mmvq_mmid_max) {
                // under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
                // TODO: figure out a way to enable for larger batch sizes, without hurting performance
                // ref: https://github.com/ggml-org/llama.cpp/pull/18958
@@ -5160,6 +5226,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                    case GGML_TYPE_IQ4_NL:
                    case GGML_TYPE_IQ4_XS:
                    case GGML_TYPE_BF16:
+                    case GGML_TYPE_TQ4_1S:
+                    case GGML_TYPE_TQ3_1S:
                        return true;
                    default:
                        return false;
@@ -5180,6 +5248,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_TQ4_1S:
+                    case GGML_TYPE_TQ3_1S:
                        return true;
                    default:
                        return false;
@@ -5191,9 +5261,18 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            } break;
        case GGML_OP_SET_ROWS:
            {
+                // turbo types require head_dim divisible by appropriate group size
+                if ((op->type == GGML_TYPE_TURBO3_0 || op->type == GGML_TYPE_TURBO2_0) && op->src[0]->ne[0] % 64 != 0) {
+                    return false;
+                }
+                // turbo4 block size is 128, so head_dim must be divisible by 128
+                if (op->type == GGML_TYPE_TURBO4_0 && op->src[0]->ne[0] % 128 != 0) {
+                    return false;
+                }
                return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
                       op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
-                       op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
+                       op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL ||
+                       op->type == GGML_TYPE_TURBO3_0 || op->type == GGML_TYPE_TURBO2_0 || op->type == GGML_TYPE_TURBO4_0) &&
                       op->src[0]->type == GGML_TYPE_F32 &&
                       (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
            } break;
@@ -5323,6 +5402,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                   (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16) &&
                   (op->type         == GGML_TYPE_F32 || op->type         == GGML_TYPE_F16);
+        case GGML_OP_TURBO_WHT:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
+                   op->src[0]->ne[0] % 32 == 0;
        case GGML_OP_SSM_SCAN: {
            if (op->src[3]->ne[0] == 1) {
                // Mamba2
@@ -0,0 +1,568 @@
+/*
+ * Fused mul_mat for TQ4_1S / TQ3_1S weight types.
+ *
+ * ne[1]≤8: dp4a multi-token kernel (weight reuse across tokens)
+ * ne[1]>8: runtime TQ4_1S→fp16 scratch dequant + cuBLAS tensor core GEMM (TQ4_1S only)
+ */
+
+#include "mmvq-tq.cuh"
+#include "turbo-quant.cuh"
+#include "convert.cuh"
+
+#define MMVQ_TQ_NWARPS 4
+
+// ============================================================================
+// Pre-rotate activation to q8_1 format (for TQ4_1S dp4a path)
+// ============================================================================
+
+// Rotates one 32-element activation group per 32-lane row of the thread block and
+// quantizes the rotated values into one block_q8_1.
+//   src        : f32 activations, read linearly (element index = block_idx * 32 + lane)
+//   dst        : output, one block_q8_1 per 32-element group
+//   n_elements : number of f32 values in src. The launcher in this file derives it from
+//                ncols_x, which it asserts to be a multiple of 32, so all 32 lanes of a
+//                row take the same side of the bounds check below.
+static __global__ void tq_prerotate_q8_1(
+        const float * __restrict__ src,
+        block_q8_1  * __restrict__ dst,
+        const int n_elements) {
+
+    // One 32-element group per threadIdx.y row; threadIdx.x is the element within the group.
+    const int block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+    const int lane = threadIdx.x;
+    const int offset = block_idx * 32 + lane;
+    if (offset >= n_elements) return;
+
+    // Per-element sign flip, applied before the butterfly.
+    float val = src[offset];
+    val *= TQ_WEIGHT_SIGNS[lane];
+
+    // 5 butterfly stages (h = 1, 2, 4, 8, 16) exchanging values between lanes via xor shuffle.
+    #pragma unroll
+    for (int h = 1; h < 32; h <<= 1) {
+        float o = __shfl_xor_sync(0xffffffff, val, h);
+        val = (lane & h) ? (o - val) : (val + o);
+    }
+    // 1/sqrt(32) normalisation of the 32-point transform.
+    val *= 0.17677669529663688f;
+
+    // Max |val| over the 32 lanes -> quantization scale.
+    float amax = fabsf(val);
+    #pragma unroll
+    for (int off = 16; off > 0; off >>= 1)
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, off));
+
+    // Sum of the rotated (unquantized) values over the 32 lanes, stored in ds.y below.
+    float sum = val;
+    #pragma unroll
+    for (int off = 16; off > 0; off >>= 1)
+        sum += __shfl_xor_sync(0xffffffff, sum, off);
+
+    // d maps int8 back to float; id is its inverse, forced to 0 for an all-zero group.
+    const float d = amax / 127.0f;
+    const float id = (d > 0.0f) ? 127.0f / amax : 0.0f;
+
+    dst[block_idx].qs[lane] = (int8_t)roundf(val * id);
+    if (lane == 0) {
+        // ds = (scale, sum); only one lane writes the per-block header.
+        dst[block_idx].ds = make_half2(__float2half(d), __float2half(sum));
+    }
+}
+
+// ============================================================================
+// TQ4_1S: dp4a path with fixed int8 centroid LUT + q8_1 activation
+// ============================================================================
+
+// Fixed int8 centroid table: centroid_i8[i] = round(TQ4_CENTROIDS_WEIGHT[i] * 127 / 2.733)
+// Rescale factor to recover float centroids: 2.733 / 127
+// The dp4a kernel in this file multiplies its accumulated sums by this factor once,
+// after the per-block loop, instead of rescaling every centroid.
+static constexpr float TQ4_CENTROID_I8_RESCALE = 2.733f / 127.0f;
+
+// Looks up four int8 centroids for four packed 4-bit indices.
+//   idx4 : indices in bits [15:0], element k in bits [4k+3 : 4k]; upper 16 bits are ignored.
+//   returns the four centroids packed as bytes 0..3 (element k in byte k), ready for dp4a.
+//
+// __byte_perm takes its selector as four 4-bit fields in the LOW 16 bits (one field per
+// output byte) and uses only the low 3 bits of each field. A byte-wide selector such as
+// 0x03020100 is therefore wrong: its bits [7:4] and [15:12] are zero, so output bytes 1
+// and 3 would always pick input byte 0.
+static __device__ __forceinline__ int tq4_cents4_reg(uint32_t idx4) {
+    // Centroid i8 values packed into 4 registers (little-endian byte order):
+    // [-127,-96,-75,-58] [-44,-31,-18,-6] [6,18,31,44] [58,75,96,127]
+    constexpr uint32_t CR03 = 0xC6B5A081u;
+    constexpr uint32_t CR47 = 0xFAEEE1D4u;
+    constexpr uint32_t CR8B = 0x2C1F1206u;
+    constexpr uint32_t CRCF = 0x7F604B3Au;
+
+    // Low 3 bits of every index select within an 8-entry half of the table.
+    const uint32_t sel = idx4 & 0x7777u;
+    const uint32_t flo = __byte_perm(CR03, CR47, sel);   // candidates from centroids 0..7
+    const uint32_t fhi = __byte_perm(CR8B, CRCF, sel);   // candidates from centroids 8..15
+
+    // Bit 3 of index k decides between flo byte k (selector k) and fhi byte k (selector 4+k).
+    const uint32_t psel = 0x3210u | ((idx4 & 0x8888u) >> 1);
+    return (int)__byte_perm(flo, fhi, psel);
+}
+
+// Register-based centroid lookup: maps 4 qs bytes (1 uint32, eight 4-bit indices) to two
+// ints of 4 packed int8 centroids each, for dp4a.
+//   c0 <- elements 0..3 (qs bytes 0-1), c1 <- elements 4..7 (qs bytes 2-3)
+// Element order matches the scalar decode in this file: element 2k is the low nibble of
+// byte k, element 2k+1 the high nibble. Two consecutive qs bytes are thus already four
+// nibble-packed indices in element order and can be used as the selector directly.
+__device__ __forceinline__ void tq4_cents8_reg(uint32_t four_bytes, int &c0, int &c1) {
+    c0 = tq4_cents4_reg(four_bytes & 0xFFFFu);
+    c1 = tq4_cents4_reg(four_bytes >> 16);
+}
+
+// ============================================================================
+// Pre-rotate activation to half (for TQ3_1S scalar path)
+// ============================================================================
+
+// Rotates one 32-element activation group per 32-lane row of the thread block and
+// stores the result as half, one output element per input element.
+//   src        : f32 activations, read linearly
+//   dst        : rotated activations, same indexing as src
+//   n_elements : number of values; the launcher in this file derives it from ncols_x,
+//                which it asserts to be a multiple of 32, so all 32 lanes of a row take
+//                the same side of the bounds check.
+static __global__ void tq_prerotate_activation(
+        const float * __restrict__ src,
+        half        * __restrict__ dst,
+        const int n_elements) {
+
+    // One 32-element group per threadIdx.y row; threadIdx.x is the element within the group.
+    const int block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+    const int lane = threadIdx.x;
+    const int offset = block_idx * 32 + lane;
+    if (offset >= n_elements) return;
+
+    // Per-element sign flip, applied before the butterfly.
+    float val = src[offset];
+    val *= TQ_WEIGHT_SIGNS[lane];
+
+    // 5 butterfly stages (h = 1, 2, 4, 8, 16) exchanging values between lanes via xor shuffle.
+    #pragma unroll
+    for (int h = 1; h < 32; h <<= 1) {
+        float o = __shfl_xor_sync(0xffffffff, val, h);
+        val = (lane & h) ? (o - val) : (val + o);
+    }
+    // 1/sqrt(32) normalisation of the 32-point transform.
+    val *= 0.17677669529663688f;
+    dst[offset] = __float2half(val);
+}
+
+// Returns the 3-bit index of element `lane` from a packed index array: every 8 elements
+// occupy 3 consecutive bytes (24 bits), least-significant bits first.
+static __device__ __forceinline__ uint8_t tq3_extract_index(const uint8_t * __restrict__ qs, int lane) {
+    const int slot = lane % 8;                    // position inside the 8-element group
+    const uint8_t * triple = &qs[(lane / 8) * 3]; // the group's 3 bytes
+
+    // Assemble the 24-bit little-endian word from its most significant byte down.
+    uint32_t bits = (uint32_t)triple[2];
+    bits = (bits << 8) | (uint32_t)triple[1];
+    bits = (bits << 8) | (uint32_t)triple[0];
+
+    return (bits >> (3 * slot)) & 7;
+}
+
+// ============================================================================
+// Multi-token TQ4_1S dp4a kernel (ncols_dst ≤ 8)
+// Weight data loaded once per block, reused across all ncols_dst tokens.
+// ============================================================================
+
+// dst[j * stride_col_dst + row] = dot(weight row `row`, pre-rotated q8_1 column j), j < ncols_dst.
+//   vx             : block_tq4_1s weights, ncols_x / QK_TQ4_1S blocks per row, rows back to back
+//   vy_q8          : q8_1 activations, column j starts at block index j * stride_col_y
+//   dst            : f32 output
+//   ncols_x        : elements per weight row
+//   nrows_x        : number of weight rows
+//   stride_col_y   : column stride of vy_q8, in block_q8_1 units
+//   stride_col_dst : column stride of dst, in floats
+// Each threadIdx.y row of the block owns one weight row; its lanes stride over that
+// row's blocks (lane, lane + WARP_SIZE, ...).
+template <int ncols_dst>
+static __global__ void mul_mat_tq4_1s_dp4a_multi(
+        const void       * __restrict__ vx,
+        const block_q8_1 * __restrict__ vy_q8,
+        float            * __restrict__ dst,
+        const int ncols_x,
+        const int nrows_x,
+        const int stride_col_y,
+        const int stride_col_dst) {
+
+    // The row index depends only on threadIdx.y, so a whole lane row leaves together.
+    const int row = blockIdx.x * MMVQ_TQ_NWARPS + threadIdx.y;
+    if (row >= nrows_x) return;
+
+    const int lane = threadIdx.x;
+    const int blocks_per_row = ncols_x / QK_TQ4_1S;
+    const block_tq4_1s * x_row = ((const block_tq4_1s *) vx) + (int64_t)row * blocks_per_row;
+
+    // Per-lane partial sums, one per output column.
+    float sumf[ncols_dst] = {};
+
+    for (int ib = lane; ib < blocks_per_row; ib += WARP_SIZE) {
+        const block_tq4_1s * blk = &x_row[ib];
+        // d0 scales the first 16 elements of the block, d1 the last 16 (see s0/s1 below).
+        const float fd0 = __half2float(blk->d0);
+        const float fd1 = __half2float(blk->d1);
+
+        // Load weight once, reuse across all tokens
+        const uint32_t * qs32 = (const uint32_t *)(blk->qs);
+        const uint32_t w0 = qs32[0], w1 = qs32[1], w2 = qs32[2], w3 = qs32[3];
+
+        // Decode the 32 indices into eight ints of 4 packed int8 centroids each.
+        int c0_0, c1_0, c0_1, c1_1, c0_2, c1_2, c0_3, c1_3;
+        tq4_cents8_reg(w0, c0_0, c1_0);
+        tq4_cents8_reg(w1, c0_1, c1_1);
+        tq4_cents8_reg(w2, c0_2, c1_2);
+        tq4_cents8_reg(w3, c0_3, c1_3);
+
+        #pragma unroll
+        for (int j = 0; j < ncols_dst; j++) {
+            const block_q8_1 * a_blk = &vy_q8[j * stride_col_y + ib];
+            // Only the scale (ds.x) of the q8_1 block is used; ds.y is not read here.
+            const float d_act = __half2float((__half)a_blk->ds.x);
+            const int * a_qs = (const int *)(a_blk->qs);
+
+            // s0: integer dot product over elements 0..15, s1: over elements 16..31.
+            const int s0 = ggml_cuda_dp4a(c0_0, a_qs[0], ggml_cuda_dp4a(c1_0, a_qs[1],
+                           ggml_cuda_dp4a(c0_1, a_qs[2], ggml_cuda_dp4a(c1_1, a_qs[3], 0))));
+            const int s1 = ggml_cuda_dp4a(c0_2, a_qs[4], ggml_cuda_dp4a(c1_2, a_qs[5],
+                           ggml_cuda_dp4a(c0_3, a_qs[6], ggml_cuda_dp4a(c1_3, a_qs[7], 0))));
+
+            sumf[j] += d_act * (fd0 * (float)s0 + fd1 * (float)s1);
+        }
+    }
+
+    // Apply centroid int8→float rescale + warp reduction
+    #pragma unroll
+    for (int j = 0; j < ncols_dst; j++)
+        sumf[j] *= TQ4_CENTROID_I8_RESCALE;
+
+    // Xor-shuffle reduction over the 32 lanes; every lane ends up with the full sum.
+    #pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        #pragma unroll
+        for (int j = 0; j < ncols_dst; j++)
+            sumf[j] += __shfl_xor_sync(0xffffffff, sumf[j], offset);
+    }
+
+    // One lane writes the results for this row.
+    if (lane == 0) {
+        #pragma unroll
+        for (int j = 0; j < ncols_dst; j++)
+            dst[j * stride_col_dst + row] = sumf[j];
+    }
+}
+
+// ============================================================================
+// Multi-token TQ3_1S scalar kernel (ncols_dst ≤ 8)
+// ============================================================================
+
+// dst[j * stride_col_dst + row] = dot(weight row `row`, pre-rotated half column j), j < ncols_dst.
+//   vx             : block_tq3_1s weights, rows back to back
+//   vy_rot         : rotated activations as half, column j starts at element j * stride_col_y
+//   dst            : f32 output
+//   ncols_x        : elements per weight row
+//   nrows_x        : number of weight rows
+//   stride_col_y   : column stride of vy_rot, in half elements
+//   stride_col_dst : column stride of dst, in floats
+// Each threadIdx.y row of the block owns one weight row; lane `l` handles element `l`
+// of every block in that row.
+template <int ncols_dst>
+static __global__ void mul_mat_tq3_1s_multi(
+        const void  * __restrict__ vx,
+        const half  * __restrict__ vy_rot,
+        float       * __restrict__ dst,
+        const int ncols_x,
+        const int nrows_x,
+        const int stride_col_y,
+        const int stride_col_dst) {
+
+    // Stage the 8 centroids in shared memory once per thread block.
+    __shared__ float s_lut[8];
+    if (threadIdx.y == 0 && threadIdx.x < 8) {
+        s_lut[threadIdx.x] = TQ3_CENTROIDS_WEIGHT[threadIdx.x];
+    }
+    // Must come before the early return below so every thread reaches the barrier.
+    __syncthreads();
+
+    const int row  = blockIdx.x * MMVQ_TQ_NWARPS + threadIdx.y;
+    if (row >= nrows_x) return;
+
+    const int lane = threadIdx.x;
+    // NOTE(review): the block size is taken from QK_TQ3_0 while the data is block_tq3_1s —
+    // presumably both are 32 elements; confirm against the type definitions.
+    const int blocks_per_row = ncols_x / QK_TQ3_0;
+    const block_tq3_1s * x_row = ((const block_tq3_1s *) vx) + (int64_t)row * blocks_per_row;
+
+    // Per-lane partial sums, one per output column.
+    float sumf[ncols_dst] = {};
+
+    for (int ib = 0; ib < blocks_per_row; ib++) {
+        // d0 scales elements 0..15 of the block, d1 elements 16..31.
+        const float d = (lane < 16) ? __half2float(x_row[ib].d0) : __half2float(x_row[ib].d1);
+        const uint8_t idx = tq3_extract_index(x_row[ib].qs, lane);
+        // Weight decoded once, reused for all columns.
+        const float w = s_lut[idx] * d;
+
+        #pragma unroll
+        for (int j = 0; j < ncols_dst; j++) {
+            const float act = __half2float(vy_rot[j * stride_col_y + ib * QK_TQ3_0 + lane]);
+            sumf[j] += act * w;
+        }
+    }
+
+    // Xor-shuffle reduction over the 32 lanes; every lane ends up with the full sum.
+    #pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        #pragma unroll
+        for (int j = 0; j < ncols_dst; j++)
+            sumf[j] += __shfl_xor_sync(0xffffffff, sumf[j], offset);
+    }
+
+    // One lane writes the results for this row.
+    if (lane == 0) {
+        #pragma unroll
+        for (int j = 0; j < ncols_dst; j++)
+            dst[j * stride_col_dst + row] = sumf[j];
+    }
+}
+
+// ============================================================================
+// TQ4_1S scalar/half kernel (AMD fallback — no dp4a)
+// Same pattern as TQ3_1S: pre-rotated half activations, scalar centroid lookup.
+// On RDNA4, sudot4 throughput differs from NVIDIA dp4a — this path is faster.
+// ============================================================================
+
+// dst[j * stride_col_dst + row] = dot(weight row `row`, pre-rotated half column j), j < ncols_dst.
+//   vx             : block_tq4_1s weights, rows back to back
+//   vy_rot         : rotated activations as half, column j starts at element j * stride_col_y
+//   dst            : f32 output
+//   ncols_x        : elements per weight row
+//   nrows_x        : number of weight rows
+//   stride_col_y   : column stride of vy_rot, in half elements
+//   stride_col_dst : column stride of dst, in floats
+// Each threadIdx.y row of the block owns one weight row; lane `l` handles element `l`
+// of every block in that row.
+template <int ncols_dst>
+static __global__ void mul_mat_tq4_1s_scalar_multi(
+        const void  * __restrict__ vx,
+        const half  * __restrict__ vy_rot,
+        float       * __restrict__ dst,
+        const int ncols_x,
+        const int nrows_x,
+        const int stride_col_y,
+        const int stride_col_dst) {
+
+    // Stage the 16 centroids in shared memory once per thread block.
+    __shared__ float s_lut[16];
+    if (threadIdx.y == 0 && threadIdx.x < 16) {
+        s_lut[threadIdx.x] = TQ4_CENTROIDS_WEIGHT[threadIdx.x];
+    }
+    // Must come before the early return below so every thread reaches the barrier.
+    __syncthreads();
+
+    const int row  = blockIdx.x * MMVQ_TQ_NWARPS + threadIdx.y;
+    if (row >= nrows_x) return;
+
+    const int lane = threadIdx.x;
+    const int blocks_per_row = ncols_x / QK_TQ4_1S;
+    const block_tq4_1s * x_row = ((const block_tq4_1s *) vx) + (int64_t)row * blocks_per_row;
+
+    // Per-lane partial sums, one per output column.
+    float sumf[ncols_dst] = {};
+
+    for (int ib = 0; ib < blocks_per_row; ib++) {
+        // d0 scales elements 0..15 of the block, d1 elements 16..31.
+        const float d = (lane < 16) ? __half2float(x_row[ib].d0) : __half2float(x_row[ib].d1);
+        // Two 4-bit indices per byte: even element in the low nibble, odd element in the high nibble.
+        const uint8_t idx = (x_row[ib].qs[lane / 2] >> ((lane & 1) * 4)) & 0xF;
+        // Weight decoded once, reused for all columns.
+        const float w = s_lut[idx] * d;
+
+        #pragma unroll
+        for (int j = 0; j < ncols_dst; j++) {
+            const float act = __half2float(vy_rot[j * stride_col_y + ib * QK_TQ4_1S + lane]);
+            sumf[j] += act * w;
+        }
+    }
+
+    // Xor-shuffle reduction over the 32 lanes; every lane ends up with the full sum.
+    #pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        #pragma unroll
+        for (int j = 0; j < ncols_dst; j++)
+            sumf[j] += __shfl_xor_sync(0xffffffff, sumf[j], offset);
+    }
+
+    // One lane writes the results for this row.
+    if (lane == 0) {
+        #pragma unroll
+        for (int j = 0; j < ncols_dst; j++)
+            dst[j * stride_col_dst + row] = sumf[j];
+    }
+}
+
+// ============================================================================
+// Dispatch: ne[1]=1 (decode), ne[1]≤8 (multi-token dp4a / scalar)
+// ne[1]>8 handled by ggml_cuda_mul_mat_tq4_1s_cublas (runtime dequant + cuBLAS)
+// AMD: uses scalar half path for TQ4_1S (dp4a regresses on RDNA4)
+// ============================================================================
+
+// Host launcher for mul_mat_tq4_1s_dp4a_multi<ncols_dst>: WARP_SIZE x MMVQ_TQ_NWARPS threads
+// per block, one threadIdx.y row per weight row, grid rounded up to cover nrows_x rows.
+template <int ncols_dst>
+static void launch_tq4_1s_multi(
+        const void * src0_d, const block_q8_1 * q8_buf,
+        float * dst_d, int ncols_x, int nrows_x,
+        int stride_col_y, int stride_col_dst, cudaStream_t stream) {
+    const int  n_thread_blocks = (nrows_x + MMVQ_TQ_NWARPS - 1) / MMVQ_TQ_NWARPS;
+    const dim3 threads(WARP_SIZE, MMVQ_TQ_NWARPS, 1);
+    const dim3 blocks(n_thread_blocks, 1, 1);
+
+    mul_mat_tq4_1s_dp4a_multi<ncols_dst><<<blocks, threads, 0, stream>>>(
+        src0_d, q8_buf, dst_d,
+        ncols_x, nrows_x,
+        stride_col_y, stride_col_dst);
+}
+
+// Host launcher for mul_mat_tq4_1s_scalar_multi<ncols_dst>: WARP_SIZE x MMVQ_TQ_NWARPS threads
+// per block, one threadIdx.y row per weight row, grid rounded up to cover nrows_x rows.
+template <int ncols_dst>
+static void launch_tq4_1s_scalar_multi(
+        const void * src0_d, const half * act_buf,
+        float * dst_d, int ncols_x, int nrows_x,
+        int stride_col_y, int stride_col_dst, cudaStream_t stream) {
+    const int  n_thread_blocks = (nrows_x + MMVQ_TQ_NWARPS - 1) / MMVQ_TQ_NWARPS;
+    const dim3 threads(WARP_SIZE, MMVQ_TQ_NWARPS, 1);
+    const dim3 blocks(n_thread_blocks, 1, 1);
+
+    mul_mat_tq4_1s_scalar_multi<ncols_dst><<<blocks, threads, 0, stream>>>(
+        src0_d, act_buf, dst_d,
+        ncols_x, nrows_x,
+        stride_col_y, stride_col_dst);
+}
+
+// Host launcher for mul_mat_tq3_1s_multi<ncols_dst>: WARP_SIZE x MMVQ_TQ_NWARPS threads
+// per block, one threadIdx.y row per weight row, grid rounded up to cover nrows_x rows.
+template <int ncols_dst>
+static void launch_tq3_1s_multi(
+        const void * src0_d, const half * act_buf,
+        float * dst_d, int ncols_x, int nrows_x,
+        int stride_col_y, int stride_col_dst, cudaStream_t stream) {
+    const int  n_thread_blocks = (nrows_x + MMVQ_TQ_NWARPS - 1) / MMVQ_TQ_NWARPS;
+    const dim3 threads(WARP_SIZE, MMVQ_TQ_NWARPS, 1);
+    const dim3 blocks(n_thread_blocks, 1, 1);
+
+    mul_mat_tq3_1s_multi<ncols_dst><<<blocks, threads, 0, stream>>>(
+        src0_d, act_buf, dst_d,
+        ncols_x, nrows_x,
+        stride_col_y, stride_col_dst);
+}
+
+// Expands to a switch over 1..8 that invokes LAUNCH(N) with N as a compile-time constant,
+// so the kernels can be instantiated per column count. Any other value is a logic error.
+#define TQ_DISPATCH_NCOLS(n, LAUNCH)                                   \
+    switch (n) {                                                       \
+        case 1: LAUNCH(1); break;                                      \
+        case 2: LAUNCH(2); break;                                      \
+        case 3: LAUNCH(3); break;                                      \
+        case 4: LAUNCH(4); break;                                      \
+        case 5: LAUNCH(5); break;                                      \
+        case 6: LAUNCH(6); break;                                      \
+        case 7: LAUNCH(7); break;                                      \
+        case 8: LAUNCH(8); break;                                      \
+        default: GGML_ABORT("unsupported TQ mul_mat column count");    \
+    }
+
+// Fused mul_mat for TQ4_1S / TQ3_1S weights: dst = src0 * src1.
+//   src0 : TQ4_1S or TQ3_1S weights, dense 2D (ne[0] = K, ne[1] = rows)
+//   src1 : f32 activations, dense 2D (ne[0] = K, ne[1] = columns/tokens)
+//   dst  : f32 output, dense 2D (rows x columns)
+// All src1 columns are pre-rotated in one kernel launch (to q8_1 on the dp4a path, to
+// half on the scalar path). The matmul kernels are then launched on chunks of at most
+// 8 columns, since they are instantiated for 1..8 columns only.
+// Both paths chunk, so the dp4a path no longer leaves dst unwritten when ne[1] > 8.
+void ggml_cuda_mul_mat_tq(ggml_backend_cuda_context & ctx,
+                           const ggml_tensor * src0,
+                           const ggml_tensor * src1,
+                           ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_TQ4_1S || src0->type == GGML_TYPE_TQ3_1S);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    // The kernels index all three tensors as dense 2D arrays through their raw data
+    // pointers; any other layout would silently read or write the wrong elements.
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(src0->ne[2] == 1 && src0->ne[3] == 1);
+    GGML_ASSERT(src1->ne[2] == 1 && src1->ne[3] == 1);
+    GGML_ASSERT(src1->ne[0] == src0->ne[0]);
+
+    const int ncols_x   = src0->ne[0];
+    const int nrows_x   = src0->ne[1];
+    const int ncols_dst = src1->ne[1];
+    GGML_ASSERT(ncols_x % 32 == 0);
+
+    const void  * src0_d = src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       * dst_d  = (float *) dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    const int id = ggml_cuda_get_device();
+    const int cc = ggml_cuda_info().devices[id].cc;
+    const int n_total_elements = ncols_x * ncols_dst;
+    const bool use_dp4a = !GGML_CUDA_CC_IS_AMD(cc) && src0->type == GGML_TYPE_TQ4_1S;
+
+    // Nothing to compute; also avoids launching kernels with a zero-sized grid.
+    if (n_total_elements == 0 || nrows_x == 0) {
+        return;
+    }
+
+    // Launch geometry of the pre-rotation kernels: one 32-lane row per 32-element group.
+    const int n_total_blocks = n_total_elements / 32;
+    const int wpb = 4;
+    const dim3 rot_block(32, wpb);
+    const dim3 rot_grid((n_total_blocks + wpb - 1) / wpb);
+
+    // The matmul kernels are instantiated for 1..8 columns.
+    constexpr int max_cols_per_launch = 8;
+    const int stride_col_dst = nrows_x;
+
+    if (use_dp4a) {
+        // NVIDIA TQ4_1S: dp4a int8 path (optimized for Turing+ dp4a throughput)
+        ggml_cuda_pool_alloc<block_q8_1> q8_1_buf(ctx.pool(id), n_total_blocks);
+
+        // Phase 1: Pre-rotate all tokens → q8_1
+        tq_prerotate_q8_1<<<rot_grid, rot_block, 0, stream>>>(src1_d, q8_1_buf.get(), n_total_elements);
+
+        // Phase 2: matmul on chunks of up to 8 columns
+        const int stride_col_y = ncols_x / 32;  // q8_1 blocks per column
+
+        #define LAUNCH_DP4A(N) \
+            launch_tq4_1s_multi<N>(src0_d, q8_j, dst_j, ncols_x, nrows_x, stride_col_y, stride_col_dst, stream)
+
+        for (int j = 0; j < ncols_dst; j += max_cols_per_launch) {
+            const int remaining = ncols_dst - j;
+            const int batch = remaining < max_cols_per_launch ? remaining : max_cols_per_launch;
+            const block_q8_1 * q8_j  = q8_1_buf.get() + (int64_t) j * stride_col_y;
+            float            * dst_j = dst_d          + (int64_t) j * stride_col_dst;
+            TQ_DISPATCH_NCOLS(batch, LAUNCH_DP4A)
+        }
+        #undef LAUNCH_DP4A
+    } else {
+        // Scalar half path: TQ3_1S (all vendors) + TQ4_1S on AMD (dp4a regresses on RDNA4)
+        ggml_cuda_pool_alloc<half> act_buf(ctx.pool(id), n_total_elements);
+
+        // Phase 1: Pre-rotate all tokens → half
+        tq_prerotate_activation<<<rot_grid, rot_block, 0, stream>>>(src1_d, act_buf.get(), n_total_elements);
+
+        // Phase 2: matmul on chunks of up to 8 columns
+        const int stride_col_y = ncols_x;  // half elements per column
+        const bool is_tq4 = (src0->type == GGML_TYPE_TQ4_1S);
+
+        // Picks the kernel family matching the weight type.
+        #define LAUNCH_SCALAR(N) \
+            do { \
+                if (is_tq4) { launch_tq4_1s_scalar_multi<N>(src0_d, act_j, dst_j, ncols_x, nrows_x, stride_col_y, stride_col_dst, stream); } \
+                else        { launch_tq3_1s_multi<N>(src0_d, act_j, dst_j, ncols_x, nrows_x, stride_col_y, stride_col_dst, stream); } \
+            } while (0)
+
+        for (int j = 0; j < ncols_dst; j += max_cols_per_launch) {
+            const int remaining = ncols_dst - j;
+            const int batch = remaining < max_cols_per_launch ? remaining : max_cols_per_launch;
+            const half * act_j = act_buf.get() + (int64_t) j * stride_col_y;
+            float      * dst_j = dst_d         + (int64_t) j * stride_col_dst;
+            TQ_DISPATCH_NCOLS(batch, LAUNCH_SCALAR)
+        }
+        #undef LAUNCH_SCALAR
+    }
+}
+
+#undef TQ_DISPATCH_NCOLS
+
+
+// ============================================================================
+// Load-time conversion: TQ4_1S → q8_0 (opt-in via GGML_TQ_CONVERT_Q8=1)
+// ============================================================================
+
+// Converts TQ4_1S blocks to q8_0 blocks, one 32-lane row of the thread block per block.
+//   src      : n_blocks TQ4_1S blocks
+//   dst      : n_blocks q8_0 blocks (same block index as src)
+//   n_blocks : number of blocks to convert
+// Lane `l` decodes element `l`. The 32 values are then pushed through the butterfly and
+// multiplied by TQ_WEIGHT_SIGNS — the same two operations the activation pre-rotation
+// kernels in this file apply, but in the opposite order — and finally re-quantized to int8.
+static __global__ void k_convert_tq4_1s_to_q8_0(
+        const block_tq4_1s * __restrict__ src,
+        block_q8_0         * __restrict__ dst,
+        const int n_blocks) {
+
+    // block_idx depends only on threadIdx.y, so a whole lane row leaves together.
+    const int block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+    if (block_idx >= n_blocks) return;
+    const int lane = threadIdx.x;
+    const block_tq4_1s * blk = &src[block_idx];
+
+    // d0 scales elements 0..15, d1 elements 16..31; two 4-bit indices per qs byte
+    // (even element in the low nibble, odd element in the high nibble).
+    const float d_scale = (lane < 16) ? __half2float(blk->d0) : __half2float(blk->d1);
+    const uint8_t idx = (blk->qs[lane / 2] >> ((lane & 1) * 4)) & 0xF;
+    float val = TQ4_CENTROIDS_WEIGHT[idx] * d_scale;
+
+    // 5 butterfly stages (h = 1, 2, 4, 8, 16) exchanging values between lanes via xor shuffle.
+    #pragma unroll
+    for (int h = 1; h < 32; h <<= 1) {
+        float o = __shfl_xor_sync(0xffffffff, val, h);
+        val = (lane & h) ? (o - val) : (val + o);
+    }
+    // 1/sqrt(32) normalisation, then the per-element sign flip.
+    val *= 0.17677669529663688f;
+    val *= TQ_WEIGHT_SIGNS[lane];
+
+    // Max |val| over the 32 lanes -> q8_0 scale.
+    float amax = fabsf(val);
+    #pragma unroll
+    for (int off = 16; off > 0; off >>= 1)
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, off));
+
+    // d maps int8 back to float; id is its inverse, forced to 0 for an all-zero block.
+    const float d = amax / 127.0f;
+    const float id = (d > 0.0f) ? 127.0f / amax : 0.0f;
+
+    dst[block_idx].qs[lane] = (int8_t)roundf(val * id);
+    if (lane == 0) dst[block_idx].d = __float2half(d);
+}
+
+// Launches k_convert_tq4_1s_to_q8_0 on `stream` to convert n_elements TQ4_1S-quantized
+// values at src_tq4 into q8_0 blocks at dst_q8 (both device pointers).
+//   n_elements : number of weights, must be a non-negative multiple of QK_TQ4_1S.
+//                0 is a no-op.
+void ggml_cuda_convert_tq4_1s_to_q8_0(const void * src_tq4, void * dst_q8, int64_t n_elements, cudaStream_t stream) {
+    GGML_ASSERT(n_elements >= 0);
+    GGML_ASSERT(n_elements % QK_TQ4_1S == 0);
+    if (n_elements == 0) {
+        // A zero-sized grid is an invalid launch configuration; there is nothing to do anyway.
+        return;
+    }
+
+    const int wpb = 4;
+    // The kernel indexes blocks with int (including the rounded-up tail of the grid),
+    // so refuse inputs whose block count would not fit instead of silently truncating.
+    const int64_t n_blocks64 = n_elements / QK_TQ4_1S;
+    GGML_ASSERT(n_blocks64 <= 0x7fffffff - wpb);
+    const int n_blocks = (int) n_blocks64;
+
+    const dim3 block(32, wpb);
+    const dim3 grid((n_blocks + wpb - 1) / wpb);
+    k_convert_tq4_1s_to_q8_0<<<grid, block, 0, stream>>>(
+        (const block_tq4_1s *)src_tq4, (block_q8_0 *)dst_q8, n_blocks);
+}
+
+// ============================================================================
+// Large prefill: runtime TQ4_1S → fp16 scratch dequant + f32 → fp16 activations + cuBLAS
+// Gets tensor core throughput without permanent 1.7× VRAM cost.
+// ============================================================================
+
+// dst = src0 * src1 through a cuBLAS fp16 GEMM with fp32 accumulation.
+//   src0 : TQ4_1S weights, dense 2D (ne[0] = K, ne[1] = M)
+//   src1 : f32 activations, dense 2D (ne[0] = K, ne[1] = N)
+//   dst  : f32 output, dense 2D (ne[0] = M, ne[1] = N)
+// Both operands are converted to fp16 into scratch buffers taken from ctx.pool(id);
+// there is no intermediate q8_0 stage. The weight scratch holds the whole matrix
+// (M * K halves) for the duration of the call.
+void ggml_cuda_mul_mat_tq4_1s_cublas(ggml_backend_cuda_context & ctx,
+                                      const ggml_tensor * src0,
+                                      const ggml_tensor * src1,
+                                      ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_TQ4_1S);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];  // K (hidden dim)
+    const int64_t ne01 = src0->ne[1];  // M (rows = output features)
+    const int64_t ne10 = src1->ne[0];  // K
+    const int64_t ne11 = src1->ne[1];  // N (tokens)
+    GGML_ASSERT(ne00 == ne10);
+
+    // The conversions and the GEMM below treat all three tensors as dense 2D arrays
+    // with leading dimensions K, K and M; anything else would silently compute garbage.
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(src0->ne[2] == 1 && src0->ne[3] == 1);
+    GGML_ASSERT(src1->ne[2] == 1 && src1->ne[3] == 1);
+    GGML_ASSERT(dst->ne[0] == ne01 && dst->ne[1] == ne11);
+
+    const int id = ggml_cuda_get_device();
+    cudaStream_t stream = ctx.stream();
+
+    const int64_t n_elements = ne00 * ne01;
+
+    // Step 1: TQ4_1S → fp16 through the type's registered to-fp16 converter
+    ggml_cuda_pool_alloc<half> src0_f16(ctx.pool(id), n_elements);
+    {
+        const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(GGML_TYPE_TQ4_1S);
+        GGML_ASSERT(to_fp16 != nullptr);
+        to_fp16((const char *)src0->data, src0_f16.get(), n_elements, stream);
+    }
+
+    // Step 2: src1 f32 → fp16
+    ggml_cuda_pool_alloc<half> src1_f16(ctx.pool(id), ne10 * ne11);
+    {
+        const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(GGML_TYPE_F32);
+        GGML_ASSERT(to_fp16 != nullptr);
+        to_fp16((const char *)src1->data, src1_f16.get(), ne10 * ne11, stream);
+    }
+
+    // Step 3: cuBLAS fp16 GEMM with fp32 compute (tensor cores)
+    // dst[M×N] = src0[M×K]^T × src1[K×N]
+    // cuBLAS is column-major: the row-major M×K weight buffer is a K×M column-major
+    // matrix with lda = K, hence CUBLAS_OP_T on the first operand.
+    const float alpha = 1.0f;
+    const float beta  = 0.0f;
+    const int64_t ldc = dst->ne[0];  // M
+
+    CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
+    CUBLAS_CHECK(
+        cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+                ne01, ne11, ne00,
+                &alpha, src0_f16.get(), CUDA_R_16F, ne00,
+                        src1_f16.get(), CUDA_R_16F, ne10,
+                &beta,  (float *)dst->data, CUDA_R_32F, ldc,
+                CUBLAS_COMPUTE_32F,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+}
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "common.cuh"
+
+// Fused TQ weight mul_mat: handles ne[1]=1 (decode) and ne[1]>1 (prefill/speculative)
+// src0 must be TQ4_1S or TQ3_1S, src1 and dst must be F32 (asserted by the implementation).
+void ggml_cuda_mul_mat_tq(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+// Legacy single-token alias
+// Forwards unchanged to ggml_cuda_mul_mat_tq.
+inline void ggml_cuda_mul_mat_vec_tq(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_mul_mat_tq(ctx, src0, src1, dst);
+}
+
+// Large prefill: runtime TQ4_1S → fp16 scratch dequant + cuBLAS fp16 GEMM
+// TQ4_1S only (asserted by the implementation); src1 and dst must be F32.
+void ggml_cuda_mul_mat_tq4_1s_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+// Load-time conversion: TQ4_1S → q8_0 in VRAM (dequant + requantize)
+// n_elements is the number of weights and must be a multiple of the TQ4_1S block size;
+// the kernel is enqueued on `stream`.
+void ggml_cuda_convert_tq4_1s_to_q8_0(const void * src_tq4, void * dst_q8, int64_t n_elements, cudaStream_t stream);
@@ -1,5 +1,6 @@
 #include "set-rows.cuh"
 #include "cpy-utils.cuh"
+#include "turbo-quant.cuh"

 typedef void (*set_rows_kernel_t)(const char * src, char * dst);

@@ -209,6 +210,928 @@ static void set_rows_cuda(
    }
 }

+// ---- TurboQuant3 set_rows: GROUP_SIZE-element groups with WHT rotation + norm correction ----
+//
+// Templated on GROUP_SIZE (128 or 64) and on the row-index element type idx_t.
+// Parallel kernel: one CUDA block per group, GROUP_SIZE threads per block.
+// Thread j handles element j within the group.
+//
+// Launch contract (nothing here is checked inside the kernel):
+//   - blockIdx.x is a flat index over (group within row, i01, i02, i03), decoded below
+//   - blockDim.x must equal GROUP_SIZE: every thread indexes x[threadIdx.x] unguarded
+//   - all warp shuffles/ballots use the full 0xffffffff mask, so every lane must be active
+//
+// Parameters:
+//   src0           float source; s01/s02/s03 are its strides in float elements
+//   src1           destination row index per source row; s10/s11/s12 are strides in idx_t elements
+//   dst            turbo3 destination; s1/s2/s3 are byte strides (applied to a char pointer)
+//   ne00           source row length; ne00 / GROUP_SIZE groups are processed per row
+//   ne01/ne11/ne12 extents used to decode blockIdx.x and to index src1
+//   ne10/ne13      unused
+//
+// NOTE(review): blockIdx.x is split into i02/i03 using ne12, where the usual set_rows decode
+// needs src0's ne02 (which is not passed in), and src1 is indexed with (i01, i01 % ne11, i02).
+// That agrees with ggml's set_rows broadcast indexing only when ne02 == ne11 == ne12 == 1 —
+// confirm that no caller passes batched src0 / index tensors.
+//
+// Steps (all parallel):
+//   1. Load element j from global memory
+//   2. Parallel L2 norm (warp reduce + inter-warp via shared memory)
+//   3. Normalize
+//   4. Forward WHT (log2(GROUP_SIZE) butterfly stages, shared memory)
+//   5. Quantize element j to 3-bit centroid index
+//   6. Pack qs (warp shuffle) and signs (__ballot_sync) into turbo3 block, no atomics
+//   7. Parallel reconstruction norm (same pattern as step 2)
+//   8. Write corrected norm (one thread per sub-block)
+
+template <typename idx_t, int GROUP_SIZE>
+__launch_bounds__(128)  // max of 128 or 64
+static __global__ void k_set_rows_turbo3(
+        const float * __restrict__ src0,
+        const idx_t * __restrict__ src1,
+        block_turbo3_0 * __restrict__ dst,
+        const int64_t ne00,
+        const int64_t ne01,
+        const int64_t ne10,
+        const int64_t ne11,
+        const int64_t ne12,
+        const int64_t ne13,
+        const int64_t s01,
+        const int64_t s02,
+        const int64_t s03,
+        const int64_t s10,
+        const int64_t s11,
+        const int64_t s12,
+        const int64_t s1,
+        const int64_t s2,
+        const int64_t s3) {
+
+    static_assert(GROUP_SIZE == 128 || GROUP_SIZE == 64, "GROUP_SIZE must be 128 or 64");
+
+    // blockIdx.x = flat group index; threadIdx.x = element within group (0..GROUP_SIZE-1)
+    const int j = threadIdx.x;
+
+    // Decode blockIdx.x → (i_grp, i01, i02, i03)
+    // (see the NOTE(review) in the header: the i02/i03 split uses ne12, not src0's ne02)
+    constexpr int blocks_per_group = GROUP_SIZE / QK_TURBO3;
+    const int64_t n_groups_per_row = ne00 / GROUP_SIZE;
+    const int64_t g = blockIdx.x;
+    const int64_t i_grp = g % n_groups_per_row;
+    int64_t       tmp   = g / n_groups_per_row;
+    const int64_t i01   = tmp % ne01;
+    tmp                 = tmp / ne01;
+    const int64_t i02   = tmp % ne12;
+    const int64_t i03   = tmp / ne12;
+
+    const int64_t i12 = i02;
+    const int64_t i11 = i01 % ne11;
+    const int64_t i10 = i01;
+
+    // dst_row is the destination row read from src1; it is used unchecked as a byte offset
+    // multiplier (dst_row*s1), so the index tensor must contain valid row numbers.
+    const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
+    const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03;
+    block_turbo3_0 * dst_row_ptr = (block_turbo3_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
+    block_turbo3_0 * blk_base    = dst_row_ptr + i_grp * blocks_per_group;
+
+    // ---- Step 1: Load element j (coalesced) ----
+    __shared__ float x[GROUP_SIZE];
+    x[j] = src_row[i_grp * GROUP_SIZE + j];
+    __syncthreads();
+
+    // ---- InnerQ: calibrate on original (unscaled) values ----
+    // d_innerq_* are declared outside this block (presumably in turbo-quant.cuh). The
+    // accumulator is indexed by the position within the group, so all groups of all rows
+    // add into the same GROUP_SIZE slots; the counter is bumped once per group.
+    if (d_innerq_calibrating) {
+        atomicAdd(&d_innerq_sq_accum[j], x[j] * x[j]);
+        if (j == 0) atomicAdd(&d_innerq_count, 1);
+    }
+
+    // ---- InnerQ: apply channel scale (only when active) ----
+    if (d_innerq_active) {
+        x[j] *= d_innerq_scale[j];
+    }
+    __syncthreads();
+
+    // ---- Step 2: Parallel L2 norm ----
+    // Butterfly (xor) shuffle leaves the warp-wide sum of squares in every lane; lane 0 of
+    // each warp publishes it, then thread 0 adds the per-warp partials.
+    constexpr int n_warps = GROUP_SIZE / WARP_SIZE;
+    __shared__ float warp_accum[n_warps];
+    float v = x[j];
+    float v2 = v * v;
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
+        v2 += __shfl_xor_sync(0xffffffff, v2, offset);
+    if (j % WARP_SIZE == 0)
+        warp_accum[j / WARP_SIZE] = v2;
+    __syncthreads();
+
+    __shared__ float s_norm_sq;
+    if (j == 0) {
+        float total = 0.0f;
+        for (int w = 0; w < n_warps; w++) total += warp_accum[w];
+        s_norm_sq = total;
+    }
+    __syncthreads();
+    const float grp_norm  = sqrtf(s_norm_sq);
+    // A (near-)zero group is normalized to all zeros instead of dividing by ~0.
+    const float inv_norm  = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f;
+
+    // ---- Step 3: Normalize ----
+    x[j] *= inv_norm;
+    __syncthreads();
+
+    // ---- Step 4: Forward WHT (signs1 → butterfly → signs2, normalized) ----
+    if (GROUP_SIZE == 128) {
+        x[j] *= TURBO_WHT_SIGNS1[j];
+    } else {
+        x[j] *= TURBO_WHT_SIGNS1_64[j];
+    }
+    __syncthreads();
+
+// One in-place butterfly stage with half-span h: the lower thread of each pair (j, j+h)
+// updates both elements, then the whole block synchronizes before the next stage.
+#define WHT_STAGE_SHARED(h) \
+    if (j % (2*(h)) < (h)) { float a = x[j], b = x[j+(h)]; x[j] = a+b; x[j+(h)] = a-b; } \
+    __syncthreads();
+
+    // Butterfly stages, manually unrolled: h = 1, 2, 4, ... while h < GROUP_SIZE
+    WHT_STAGE_SHARED(1)
+    WHT_STAGE_SHARED(2)
+    WHT_STAGE_SHARED(4)
+    WHT_STAGE_SHARED(8)
+    WHT_STAGE_SHARED(16)
+    WHT_STAGE_SHARED(32)
+    if (GROUP_SIZE == 128) { WHT_STAGE_SHARED(64) }
+#undef WHT_STAGE_SHARED
+
+    // 1/sqrt(GROUP_SIZE): 1/sqrt(128) or 1/sqrt(64)
+    constexpr float inv_sqrt_group = (GROUP_SIZE == 128) ? 0.08838834764831845f : 0.125f;
+    if (GROUP_SIZE == 128) {
+        x[j] = x[j] * inv_sqrt_group * TURBO_WHT_SIGNS2[j];
+    } else {
+        x[j] = x[j] * inv_sqrt_group * TURBO_WHT_SIGNS2_64[j];
+    }
+    __syncthreads();
+
+    // ---- Step 5: Quantize element j ----
+    const float rv = x[j];
+    const uint8_t idx = turbo_nearest_centroid_3bit(rv);
+
+    // ---- Step 6: Pack qs and signs (warp-cooperative, no atomics) ----
+    // Each warp handles 32 elements. When QK_TURBO3 is larger than WARP_SIZE, multiple warps
+    // share one block and write to different byte offsets within it.
+    // NOTE(review): warp_id is computed but not used anywhere below in this kernel.
+    const int warp_id = j / WARP_SIZE;
+    const int lane    = j % WARP_SIZE;
+    const int elem_in_block = j % QK_TURBO3;
+    block_turbo3_0 * blk = blk_base + (j / QK_TURBO3);
+
+    // Pack qs: 4 elements per byte, 2 bits each.
+    // All 4 threads in a qs-group gather their low2 bits via shuffle.
+    // Every lane builds the byte (the shuffle needs all lanes), but only the first lane of
+    // each aligned group of four stores it.
+    const int qs_byte_idx = elem_in_block / 4;
+    const uint8_t my_low2 = idx & 0x3;
+    uint8_t qs_byte = 0;
+#pragma unroll
+    for (int k = 0; k < 4; k++) {
+        uint8_t contrib = __shfl_sync(0xffffffff, my_low2, (lane & ~3) + k);
+        qs_byte |= contrib << (k * 2);
+    }
+    if (lane % 4 == 0) blk->qs[qs_byte_idx] = qs_byte;
+
+    // Pack signs: 8 elements per byte, 1 bit each.  __ballot_sync across warp.
+    // The bit stored here is bit 2 of the 3-bit centroid index (the low two bits went to qs).
+    // Ballot is per-warp (32 bits); extract local byte, write to global position in block.
+    const uint32_t ballot = __ballot_sync(0xffffffff, (idx >> 2) & 1);
+    const int local_signs_byte = lane / 8;             // byte within 32-bit ballot (0..3)
+    const int global_signs_byte = elem_in_block / 8;   // byte within block's signs array
+    const uint8_t signs_byte = (uint8_t)((ballot >> (local_signs_byte * 8)) & 0xFF);
+    if (lane % 8 == 0) blk->signs[global_signs_byte] = signs_byte;
+
+    // ---- Step 7: Reconstruction norm (parallel, same pattern as step 2) ----
+    // Sum of squared chosen centroids over the whole group; warp_accum is reused, which is
+    // safe because its step-2 reader (thread 0) finished before the barriers above.
+    const float c = TURBO_CENTROIDS_3BIT[idx];
+    float rc = c * c;
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
+        rc += __shfl_xor_sync(0xffffffff, rc, offset);
+    if (j % WARP_SIZE == 0)
+        warp_accum[j / WARP_SIZE] = rc;
+    __syncthreads();
+
+    __shared__ float s_recon_sq;
+    if (j == 0) {
+        float total = 0.0f;
+        for (int w = 0; w < n_warps; w++) total += warp_accum[w];
+        s_recon_sq = total;
+    }
+    __syncthreads();
+    const float recon_norm     = sqrtf(s_recon_sq);
+    // Stored scale = original group norm / norm of the quantized vector; falls back to the
+    // plain group norm when the reconstruction norm is (near) zero.
+    const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm;
+
+    // ---- Step 8: Write corrected norm (one per turbo3 block) ----
+    // Every block of the group receives the same group-wide value.
+    if (elem_in_block == 0) blk->norm = __float2half(corrected_norm);
+
+    GGML_UNUSED(ne10);
+    GGML_UNUSED(ne13);
+}
+
+// ---- TurboQuant3 tail kernel: straight 3-bit quantize without WHT rotation ----
+//
+// For head dims not divisible by 128 (e.g. 576 = 4*128 + 64), the remainder
+// elements can't use the 128-element WHT. They are quantised directly into
+// standard turbo3 blocks.  Q is also NOT rotated for these positions (the graph
+// guards on ne[0] % 128), so <Q_tail, K_tail> stays in the original space.
+//
+// One CUDA block per row, with tail_size threads (must be multiple of 32).
+// blockIdx.x is decoded into (i01, i02, i03); thread j handles tail element j.
+//
+// Parameters follow k_set_rows_turbo3 (s01..s03 in floats, s10..s12 in idx_t elements,
+// s1..s3 in bytes); tail_size is the number of trailing elements per row and must match
+// blockDim.x. ne10 and ne13 are unused.
+//
+// NOTE(review): the tail offset is derived from the compile-time QK_TURBO3_GROUP, not from
+// the runtime group size chosen by the launcher — confirm both agree if this path is enabled.
+// NOTE(review): each warp writes to its own block (blk_base + warp_id) and indexes qs/signs
+// by lane, which presumes QK_TURBO3 == WARP_SIZE — confirm.
+// NOTE(review): the (i02, i03) decode and src1 indexing share the ne12-vs-ne02 caveat of
+// k_set_rows_turbo3. The launcher in this file asserts ne00 % group_size == 0, so this
+// kernel currently looks unreachable from it.
+
+template <typename idx_t>
+static __global__ void k_set_rows_turbo3_tail(
+        const float * __restrict__ src0,
+        const idx_t * __restrict__ src1,
+        block_turbo3_0 * __restrict__ dst,
+        const int64_t ne00,
+        const int64_t ne01,
+        const int64_t ne10,
+        const int64_t ne11,
+        const int64_t ne12,
+        const int64_t ne13,
+        const int64_t s01,
+        const int64_t s02,
+        const int64_t s03,
+        const int64_t s10,
+        const int64_t s11,
+        const int64_t s12,
+        const int64_t s1,
+        const int64_t s2,
+        const int64_t s3,
+        const int tail_size) {
+
+    const int j = threadIdx.x;  // 0 .. tail_size-1
+
+    // Decode blockIdx.x → (i01, i02, i03)
+    int64_t tmp = blockIdx.x;
+    const int64_t i01 = tmp % ne01; tmp /= ne01;
+    const int64_t i02 = tmp % ne12;
+    const int64_t i03 = tmp / ne12;
+
+    const int64_t i11 = i01 % ne11;
+    const int64_t i10 = i01;
+    const int64_t i12 = i02;
+
+    // Destination row comes from the index tensor and is used unchecked.
+    const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
+    const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03;
+    block_turbo3_0 * dst_row_ptr = (block_turbo3_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
+
+    // Tail starts after all full 128-element groups
+    const int64_t n_full = ne00 / QK_TURBO3_GROUP;
+    const int64_t tail_start = n_full * QK_TURBO3_GROUP;
+    block_turbo3_0 * blk_base = dst_row_ptr + n_full * (QK_TURBO3_GROUP / QK_TURBO3);
+
+    // ---- Load ----
+    const float val = src_row[tail_start + j];
+
+    // ---- L2 norm over the tail group (warp reduce + inter-warp) ----
+    const int n_warps = tail_size / WARP_SIZE;
+    const int warp_id = j / WARP_SIZE;
+    const int lane    = j % WARP_SIZE;
+
+    // Fixed capacity of 4 per-warp partials: tail_size must not exceed 4 * WARP_SIZE.
+    __shared__ float warp_accum[4];  // max 3 warps (tail ≤ 96)
+    float v2 = val * val;
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
+        v2 += __shfl_xor_sync(0xffffffff, v2, offset);
+    if (lane == 0) warp_accum[warp_id] = v2;
+    __syncthreads();
+
+    __shared__ float s_norm_sq;
+    if (j == 0) {
+        float total = 0.0f;
+        for (int w = 0; w < n_warps; w++) total += warp_accum[w];
+        s_norm_sq = total;
+    }
+    __syncthreads();
+    const float grp_norm = sqrtf(s_norm_sq);
+    // A (near-)zero tail is normalized to all zeros instead of dividing by ~0.
+    const float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f;
+
+    // ---- Normalize (no WHT!) ----
+    const float rv = val * inv_norm;
+
+    // ---- Quantize ----
+    const uint8_t idx = turbo_nearest_centroid_3bit(rv);
+
+    // ---- Pack qs and signs (same warp-cooperative logic) ----
+    block_turbo3_0 * blk = blk_base + warp_id;
+
+    // Low two index bits: four lanes per byte, stored by the first lane of each group of four.
+    const uint8_t my_low2 = idx & 0x3;
+    uint8_t qs_byte = 0;
+#pragma unroll
+    for (int k = 0; k < 4; k++) {
+        uint8_t contrib = __shfl_sync(0xffffffff, my_low2, (lane & ~3) + k);
+        qs_byte |= contrib << (k * 2);
+    }
+    if (lane % 4 == 0) blk->qs[lane / 4] = qs_byte;
+
+    // Bit 2 of the index: one ballot bit per lane, one byte per 8 lanes.
+    const uint32_t ballot = __ballot_sync(0xffffffff, (idx >> 2) & 1);
+    const int signs_byte_idx = lane / 8;
+    const uint8_t signs_byte = (uint8_t)((ballot >> (signs_byte_idx * 8)) & 0xFF);
+    if (lane % 8 == 0) blk->signs[signs_byte_idx] = signs_byte;
+
+    // ---- Reconstruction norm ----
+    // Computed over the whole tail (all warps), reusing warp_accum after the barriers above.
+    const float c = TURBO_CENTROIDS_3BIT[idx];
+    float rc = c * c;
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
+        rc += __shfl_xor_sync(0xffffffff, rc, offset);
+    if (lane == 0) warp_accum[warp_id] = rc;
+    __syncthreads();
+
+    __shared__ float s_recon_sq;
+    if (j == 0) {
+        float total = 0.0f;
+        for (int w = 0; w < n_warps; w++) total += warp_accum[w];
+        s_recon_sq = total;
+    }
+    __syncthreads();
+    const float recon_norm     = sqrtf(s_recon_sq);
+    const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm;
+
+    // Each warp's block stores the same tail-wide corrected norm.
+    if (lane == 0) blk->norm = __float2half(corrected_norm);
+
+    GGML_UNUSED(ne10);
+    GGML_UNUSED(ne13);
+}
+
+// Host-side launcher for SET_ROWS with a TURBO3_0 destination.
+//
+//   ctx   CUDA backend context; kernels are enqueued on ctx.stream()
+//   src0  float rows to store
+//   src1  destination row index for each source row (element type idx_t)
+//   dst   turbo3 destination; the first int of dst->op_params carries the WHT group size
+//         (64 or 128, any other value falls back to 128)
+//
+// One CUDA block is launched per group_size-element group of every source row.
+// Aborts through GGML_ASSERT when the row length is not a multiple of QK_TURBO3 or of the
+// group size, when the tensors are batched in a way the kernels cannot index, or when the
+// launch would exceed the CUDA gridDim.x limit.
+template<typename idx_t>
+static void set_rows_cuda_turbo3(
+        ggml_backend_cuda_context & ctx,
+        const ggml_tensor * src0,
+        const ggml_tensor * src1,
+        ggml_tensor * dst) {
+
+    const float * src0_d = (const float *)src0->data;
+    const idx_t * src1_d = (const idx_t *)src1->data;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_ASSERT(ne00 % QK_TURBO3 == 0);  // must be block-aligned (32)
+
+    // The turbo3 kernels split blockIdx.x into (i02, i03) with ne12 instead of ne02 and read
+    // src1 at (i01, i01 % ne11, i02). That only matches set_rows indexing for un-batched
+    // inputs; anything else would read/write the wrong rows, so reject it explicitly.
+    GGML_ASSERT(ne02 == 1 && ne11 == 1 && ne12 == 1);
+
+    cudaStream_t stream = ctx.stream();
+
+    // Read WHT group size from op_params (set by llama-kv-cache.cpp based on head_dim).
+    // Values other than 64/128 (e.g. an unset, zeroed op_params) fall back to 128.
+    int group_size = 128;
+    memcpy(&group_size, dst->op_params, sizeof(int));
+    if (group_size != 64 && group_size != 128) group_size = 128;
+    GGML_ASSERT(ne00 % group_size == 0);
+
+    const int64_t n_full_groups   = ne00 / group_size;
+    const int     tail_size       = (int)(ne00 % group_size);
+
+    // Kernel-side strides: element units for src0/src1, bytes (nb1..nb3) for dst.
+    const int64_t s01 = nb01/sizeof(float);
+    const int64_t s02 = nb02/sizeof(float);
+    const int64_t s03 = nb03/sizeof(float);
+    const int64_t s10 = nb10/sizeof(idx_t);
+    const int64_t s11 = nb11/sizeof(idx_t);
+    const int64_t s12 = nb12/sizeof(idx_t);
+
+    // gridDim.x is limited to 2^31 - 1; the launches below narrow to int, so guard the range.
+    const int64_t max_grid_x = 0x7FFFFFFF;
+
+    // InnerQ: check/finalize calibration before kernel launch
+    turbo_innerq_check_finalize(group_size, ne00);
+
+    // Launch 1: full groups with WHT rotation
+    if (n_full_groups > 0) {
+        const int64_t ne_total = n_full_groups * ne01 * ne02 * ne03;
+        GGML_ASSERT(ne_total <= max_grid_x);
+        if (ne_total > 0) {  // a zero-sized grid is an invalid launch configuration
+            if (group_size == 128) {
+                k_set_rows_turbo3<idx_t, 128><<<(int)ne_total, 128, 0, stream>>>(
+                    src0_d, src1_d, (block_turbo3_0 *)dst->data,
+                    ne00, ne01, ne10, ne11, ne12, ne13,
+                    s01, s02, s03, s10, s11, s12,
+                    nb1, nb2, nb3);
+            } else {
+                k_set_rows_turbo3<idx_t, 64><<<(int)ne_total, 64, 0, stream>>>(
+                    src0_d, src1_d, (block_turbo3_0 *)dst->data,
+                    ne00, ne01, ne10, ne11, ne12, ne13,
+                    s01, s02, s03, s10, s11, s12,
+                    nb1, nb2, nb3);
+            }
+        }
+    }
+
+    // Launch 2: tail elements (no WHT, straight quantize)
+    // Currently unreachable: the assert above forces ne00 % group_size == 0, so tail_size
+    // is always 0. Kept for potential future use.
+    if (tail_size > 0) {
+        GGML_ASSERT(tail_size % QK_TURBO3 == 0);  // tail must be block-aligned
+        const int64_t n_rows = ne01 * ne02 * ne03;
+        GGML_ASSERT(n_rows <= max_grid_x);
+        if (n_rows > 0) {
+            k_set_rows_turbo3_tail<idx_t><<<(int)n_rows, tail_size, 0, stream>>>(
+                src0_d, src1_d, (block_turbo3_0 *)dst->data,
+                ne00, ne01, ne10, ne11, ne12, ne13,
+                s01, s02, s03, s10, s11, s12,
+                nb1, nb2, nb3, tail_size);
+        }
+    }
+}
+
+// ---- TurboQuant2 set_rows: GROUP_SIZE-element groups with WHT rotation + norm correction ----
+//
+// Same structure as turbo3 but 2-bit quantization only (no signs byte).
+//
+// One CUDA block per group, GROUP_SIZE (128 or 64) threads per block; thread j handles
+// element j of the group. Launch contract (not checked inside the kernel):
+//   - blockIdx.x is a flat index over (group within row, i01, i02, i03), decoded below
+//   - blockDim.x must equal GROUP_SIZE: every thread indexes x[threadIdx.x] unguarded
+//   - all warp shuffles use the full 0xffffffff mask, so every lane must be active
+//
+// Parameters:
+//   src0           float source; s01/s02/s03 are its strides in float elements
+//   src1           destination row index per source row; s10/s11/s12 are strides in idx_t elements
+//   dst            turbo2 destination; s1/s2/s3 are byte strides (applied to a char pointer)
+//   ne00           source row length; ne00 / GROUP_SIZE groups are processed per row
+//   ne01/ne11/ne12 extents used to decode blockIdx.x and to index src1
+//   ne10/ne13      unused
+//
+// NOTE(review): blockIdx.x is split into i02/i03 using ne12, where the usual set_rows decode
+// needs src0's ne02 (which is not passed in), and src1 is indexed with (i01, i01 % ne11, i02).
+// That agrees with ggml's set_rows broadcast indexing only when ne02 == ne11 == ne12 == 1 —
+// confirm that no caller passes batched src0 / index tensors.
+
+template <typename idx_t, int GROUP_SIZE>
+__launch_bounds__(128)
+static __global__ void k_set_rows_turbo2(
+        const float * __restrict__ src0,
+        const idx_t * __restrict__ src1,
+        block_turbo2_0 * __restrict__ dst,
+        const int64_t ne00,
+        const int64_t ne01,
+        const int64_t ne10,
+        const int64_t ne11,
+        const int64_t ne12,
+        const int64_t ne13,
+        const int64_t s01,
+        const int64_t s02,
+        const int64_t s03,
+        const int64_t s10,
+        const int64_t s11,
+        const int64_t s12,
+        const int64_t s1,
+        const int64_t s2,
+        const int64_t s3) {
+
+    static_assert(GROUP_SIZE == 128 || GROUP_SIZE == 64, "GROUP_SIZE must be 128 or 64");
+
+    // threadIdx.x = element within the group (0..GROUP_SIZE-1)
+    const int j = threadIdx.x;
+
+    // Decode blockIdx.x into (i_grp, i01, i02, i03); see the NOTE(review) in the header.
+    constexpr int blocks_per_group = GROUP_SIZE / QK_TURBO2;
+    const int64_t n_groups_per_row = ne00 / GROUP_SIZE;
+    const int64_t g = blockIdx.x;
+    const int64_t i_grp = g % n_groups_per_row;
+    int64_t       tmp   = g / n_groups_per_row;
+    const int64_t i01   = tmp % ne01;
+    tmp                 = tmp / ne01;
+    const int64_t i02   = tmp % ne12;
+    const int64_t i03   = tmp / ne12;
+
+    const int64_t i12 = i02;
+    const int64_t i11 = i01 % ne11;
+    const int64_t i10 = i01;
+
+    // Destination row comes from the index tensor and is used unchecked as a byte-offset
+    // multiplier (dst_row*s1).
+    const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
+    const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03;
+    block_turbo2_0 * dst_row_ptr = (block_turbo2_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
+    block_turbo2_0 * blk_base    = dst_row_ptr + i_grp * blocks_per_group;
+
+    // ---- Step 1: Load element j (coalesced) ----
+    __shared__ float x[GROUP_SIZE];
+    x[j] = src_row[i_grp * GROUP_SIZE + j];
+    __syncthreads();
+
+    // ---- InnerQ: calibrate on original (unscaled) values ----
+    // d_innerq_* are declared outside this block (presumably in turbo-quant.cuh). All groups
+    // accumulate into the same GROUP_SIZE slots; the counter is bumped once per group.
+    if (d_innerq_calibrating) {
+        atomicAdd(&d_innerq_sq_accum[j], x[j] * x[j]);
+        if (j == 0) atomicAdd(&d_innerq_count, 1);
+    }
+
+    // ---- InnerQ: apply channel scale (only when active) ----
+    if (d_innerq_active) {
+        x[j] *= d_innerq_scale[j];
+    }
+    __syncthreads();
+
+    // ---- Step 2: Parallel L2 norm ----
+    // Xor-shuffle reduction per warp, lane 0 publishes the partial, thread 0 sums the partials.
+    constexpr int n_warps = GROUP_SIZE / WARP_SIZE;
+    __shared__ float warp_accum[n_warps];
+    float v = x[j];
+    float v2 = v * v;
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
+        v2 += __shfl_xor_sync(0xffffffff, v2, offset);
+    if (j % WARP_SIZE == 0)
+        warp_accum[j / WARP_SIZE] = v2;
+    __syncthreads();
+
+    __shared__ float s_norm_sq;
+    if (j == 0) {
+        float total = 0.0f;
+        for (int w = 0; w < n_warps; w++) total += warp_accum[w];
+        s_norm_sq = total;
+    }
+    __syncthreads();
+    const float grp_norm  = sqrtf(s_norm_sq);
+    // A (near-)zero group is normalized to all zeros instead of dividing by ~0.
+    const float inv_norm  = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f;
+
+    // ---- Step 3: Normalize ----
+    x[j] *= inv_norm;
+    __syncthreads();
+
+    // ---- Step 4: Forward WHT ----
+    // signs1 → butterfly stages → 1/sqrt(GROUP_SIZE) scaling and signs2
+    if (GROUP_SIZE == 128) {
+        x[j] *= TURBO_WHT_SIGNS1[j];
+    } else {
+        x[j] *= TURBO_WHT_SIGNS1_64[j];
+    }
+    __syncthreads();
+
+// One in-place butterfly stage with half-span h: the lower thread of each pair (j, j+h)
+// updates both elements, then the whole block synchronizes before the next stage.
+#define WHT_STAGE_SHARED_T2(h) \
+    if (j % (2*(h)) < (h)) { float a = x[j], b = x[j+(h)]; x[j] = a+b; x[j+(h)] = a-b; } \
+    __syncthreads();
+
+    // Stages are manually unrolled: h = 1, 2, 4, ... while h < GROUP_SIZE
+    WHT_STAGE_SHARED_T2(1)
+    WHT_STAGE_SHARED_T2(2)
+    WHT_STAGE_SHARED_T2(4)
+    WHT_STAGE_SHARED_T2(8)
+    WHT_STAGE_SHARED_T2(16)
+    WHT_STAGE_SHARED_T2(32)
+    if (GROUP_SIZE == 128) { WHT_STAGE_SHARED_T2(64) }
+#undef WHT_STAGE_SHARED_T2
+
+    // 1/sqrt(GROUP_SIZE): 1/sqrt(128) or 1/sqrt(64)
+    constexpr float inv_sqrt_group = (GROUP_SIZE == 128) ? 0.08838834764831845f : 0.125f;
+    if (GROUP_SIZE == 128) {
+        x[j] = x[j] * inv_sqrt_group * TURBO_WHT_SIGNS2[j];
+    } else {
+        x[j] = x[j] * inv_sqrt_group * TURBO_WHT_SIGNS2_64[j];
+    }
+    __syncthreads();
+
+    // ---- Step 5: Quantize element j to 2-bit centroid ----
+    const float rv = x[j];
+    const uint8_t idx = turbo_nearest_centroid_2bit(rv);
+
+    // ---- Step 6: Pack qs (warp-cooperative, no atomics) ----
+    // Each warp handles 32 elements. When QK_TURBO2 is larger than WARP_SIZE, multiple warps
+    // share one block and write to different byte offsets within it.
+    // NOTE(review): warp_id is computed but not used anywhere below in this kernel.
+    const int warp_id = j / WARP_SIZE;
+    const int lane    = j % WARP_SIZE;
+    const int elem_in_block = j % QK_TURBO2;
+    block_turbo2_0 * blk = blk_base + (j / QK_TURBO2);
+
+    // Pack qs: 4 elements per byte, 2 bits each.
+    // Every lane builds the byte (the shuffle needs all lanes), but only the first lane of
+    // each aligned group of four stores it.
+    const uint8_t my_bits = idx & 0x3;
+    uint8_t qs_byte = 0;
+#pragma unroll
+    for (int k = 0; k < 4; k++) {
+        uint8_t contrib = __shfl_sync(0xffffffff, my_bits, (lane & ~3) + k);
+        qs_byte |= contrib << (k * 2);
+    }
+    if (lane % 4 == 0) blk->qs[elem_in_block / 4] = qs_byte;
+
+    // No signs packing needed for turbo2
+
+    // ---- Step 7: Reconstruction norm ----
+    // Sum of squared chosen centroids over the whole group; warp_accum is reused after the
+    // barriers above.
+    const float c = TURBO_CENTROIDS_2BIT[idx];
+    float rc = c * c;
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
+        rc += __shfl_xor_sync(0xffffffff, rc, offset);
+    if (j % WARP_SIZE == 0)
+        warp_accum[j / WARP_SIZE] = rc;
+    __syncthreads();
+
+    __shared__ float s_recon_sq;
+    if (j == 0) {
+        float total = 0.0f;
+        for (int w = 0; w < n_warps; w++) total += warp_accum[w];
+        s_recon_sq = total;
+    }
+    __syncthreads();
+    const float recon_norm     = sqrtf(s_recon_sq);
+    // Stored scale = original group norm / norm of the quantized vector; falls back to the
+    // plain group norm when the reconstruction norm is (near) zero.
+    const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm;
+
+    // ---- Step 8: Write corrected norm (one per turbo2 block) ----
+    // Every block of the group receives the same group-wide value.
+    if (elem_in_block == 0) blk->norm = __float2half(corrected_norm);
+
+    GGML_UNUSED(ne10);
+    GGML_UNUSED(ne13);
+}
+
+// ---- TurboQuant2 tail kernel: straight 2-bit quantize without WHT rotation ----
+//
+// One CUDA block per row with tail_size threads; blockIdx.x is decoded into (i01, i02, i03)
+// and thread j handles the j-th element after the last full QK_TURBO2_GROUP-sized group.
+// Strides: s01..s03 in floats, s10..s12 in idx_t elements, s1..s3 in bytes.
+//
+// NOTE(review): the tail offset is derived from the compile-time QK_TURBO2_GROUP, not from
+// the runtime group size chosen by the launcher — confirm both agree if this path is enabled.
+// NOTE(review): each warp writes to its own block (blk_base + warp_id) and indexes qs by
+// lane, which presumes QK_TURBO2 == WARP_SIZE — confirm.
+// NOTE(review): the (i02, i03) decode uses ne12 and src1 is indexed with (i01, i01 % ne11,
+// i02), as in k_set_rows_turbo2. The launcher in this file asserts ne00 % group_size == 0,
+// so this kernel currently looks unreachable from it.
+
+template <typename idx_t>
+static __global__ void k_set_rows_turbo2_tail(
+        const float * __restrict__ src0,
+        const idx_t * __restrict__ src1,
+        block_turbo2_0 * __restrict__ dst,
+        const int64_t ne00,
+        const int64_t ne01,
+        const int64_t ne10,
+        const int64_t ne11,
+        const int64_t ne12,
+        const int64_t ne13,
+        const int64_t s01,
+        const int64_t s02,
+        const int64_t s03,
+        const int64_t s10,
+        const int64_t s11,
+        const int64_t s12,
+        const int64_t s1,
+        const int64_t s2,
+        const int64_t s3,
+        const int tail_size) {
+
+    const int j = threadIdx.x;  // 0 .. tail_size-1
+
+    // Decode blockIdx.x into (i01, i02, i03)
+    int64_t tmp = blockIdx.x;
+    const int64_t i01 = tmp % ne01; tmp /= ne01;
+    const int64_t i02 = tmp % ne12;
+    const int64_t i03 = tmp / ne12;
+
+    const int64_t i11 = i01 % ne11;
+    const int64_t i10 = i01;
+    const int64_t i12 = i02;
+
+    // Destination row comes from the index tensor and is used unchecked.
+    const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
+    const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03;
+    block_turbo2_0 * dst_row_ptr = (block_turbo2_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
+
+    // Tail starts after all full QK_TURBO2_GROUP-element groups
+    const int64_t n_full = ne00 / QK_TURBO2_GROUP;
+    const int64_t tail_start = n_full * QK_TURBO2_GROUP;
+    block_turbo2_0 * blk_base = dst_row_ptr + n_full * (QK_TURBO2_GROUP / QK_TURBO2);
+
+    // ---- Load ----
+    const float val = src_row[tail_start + j];
+
+    // ---- L2 norm ----
+    const int n_warps = tail_size / WARP_SIZE;
+    const int warp_id = j / WARP_SIZE;
+    const int lane    = j % WARP_SIZE;
+
+    // Fixed capacity of 4 per-warp partials: tail_size must not exceed 4 * WARP_SIZE.
+    __shared__ float warp_accum[4];
+    float v2 = val * val;
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
+        v2 += __shfl_xor_sync(0xffffffff, v2, offset);
+    if (lane == 0) warp_accum[warp_id] = v2;
+    __syncthreads();
+
+    __shared__ float s_norm_sq;
+    if (j == 0) {
+        float total = 0.0f;
+        for (int w = 0; w < n_warps; w++) total += warp_accum[w];
+        s_norm_sq = total;
+    }
+    __syncthreads();
+    const float grp_norm = sqrtf(s_norm_sq);
+    // A (near-)zero tail is normalized to all zeros instead of dividing by ~0.
+    const float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f;
+
+    // ---- Normalize (no WHT!) ----
+    const float rv = val * inv_norm;
+
+    // ---- Quantize ----
+    const uint8_t idx = turbo_nearest_centroid_2bit(rv);
+
+    // ---- Pack qs ----
+    // Four lanes per byte; the first lane of each aligned group of four stores the byte.
+    block_turbo2_0 * blk = blk_base + warp_id;
+
+    const uint8_t my_bits = idx & 0x3;
+    uint8_t qs_byte = 0;
+#pragma unroll
+    for (int k = 0; k < 4; k++) {
+        uint8_t contrib = __shfl_sync(0xffffffff, my_bits, (lane & ~3) + k);
+        qs_byte |= contrib << (k * 2);
+    }
+    if (lane % 4 == 0) blk->qs[lane / 4] = qs_byte;
+
+    // ---- Reconstruction norm ----
+    // Computed over the whole tail (all warps), reusing warp_accum after the barriers above.
+    const float c = TURBO_CENTROIDS_2BIT[idx];
+    float rc = c * c;
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
+        rc += __shfl_xor_sync(0xffffffff, rc, offset);
+    if (lane == 0) warp_accum[warp_id] = rc;
+    __syncthreads();
+
+    __shared__ float s_recon_sq;
+    if (j == 0) {
+        float total = 0.0f;
+        for (int w = 0; w < n_warps; w++) total += warp_accum[w];
+        s_recon_sq = total;
+    }
+    __syncthreads();
+    const float recon_norm     = sqrtf(s_recon_sq);
+    const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm;
+
+    // Each warp's block stores the same tail-wide corrected norm.
+    if (lane == 0) blk->norm = __float2half(corrected_norm);
+
+    GGML_UNUSED(ne10);
+    GGML_UNUSED(ne13);
+    // NOTE(review): ne00 is actually read above (n_full), so this marker is redundant.
+    GGML_UNUSED(ne00);
+}
+
+// Host-side launcher for SET_ROWS with a TURBO2_0 destination.
+//
+//   ctx   CUDA backend context; kernels are enqueued on ctx.stream()
+//   src0  float rows to store
+//   src1  destination row index for each source row (element type idx_t)
+//   dst   turbo2 destination; the first int of dst->op_params carries the WHT group size
+//         (64 or 128, any other value falls back to 128)
+//
+// One CUDA block is launched per group_size-element group of every source row.
+// Aborts through GGML_ASSERT when the row length is not a multiple of QK_TURBO2 or of the
+// group size, when the tensors are batched in a way the kernels cannot index, or when the
+// launch would exceed the CUDA gridDim.x limit.
+template<typename idx_t>
+static void set_rows_cuda_turbo2(
+        ggml_backend_cuda_context & ctx,
+        const ggml_tensor * src0,
+        const ggml_tensor * src1,
+        ggml_tensor * dst) {
+
+    const float * src0_d = (const float *)src0->data;
+    const idx_t * src1_d = (const idx_t *)src1->data;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_ASSERT(ne00 % QK_TURBO2 == 0);
+
+    // The turbo2 kernels split blockIdx.x into (i02, i03) with ne12 instead of ne02 and read
+    // src1 at (i01, i01 % ne11, i02). That only matches set_rows indexing for un-batched
+    // inputs; anything else would read/write the wrong rows, so reject it explicitly.
+    GGML_ASSERT(ne02 == 1 && ne11 == 1 && ne12 == 1);
+
+    cudaStream_t stream = ctx.stream();
+
+    // WHT group size from op_params; values other than 64/128 fall back to 128.
+    int group_size = 128;
+    memcpy(&group_size, dst->op_params, sizeof(int));
+    if (group_size != 64 && group_size != 128) group_size = 128;
+    GGML_ASSERT(ne00 % group_size == 0);
+
+    const int64_t n_full_groups   = ne00 / group_size;
+    const int     tail_size       = (int)(ne00 % group_size);
+
+    // Kernel-side strides: element units for src0/src1, bytes (nb1..nb3) for dst.
+    const int64_t s01 = nb01/sizeof(float);
+    const int64_t s02 = nb02/sizeof(float);
+    const int64_t s03 = nb03/sizeof(float);
+    const int64_t s10 = nb10/sizeof(idx_t);
+    const int64_t s11 = nb11/sizeof(idx_t);
+    const int64_t s12 = nb12/sizeof(idx_t);
+
+    // gridDim.x is limited to 2^31 - 1; the launches below narrow to int, so guard the range.
+    const int64_t max_grid_x = 0x7FFFFFFF;
+
+    // InnerQ: check/finalize calibration before kernel launch
+    turbo_innerq_check_finalize(group_size, ne00);
+
+    // Launch 1: full groups with WHT rotation
+    if (n_full_groups > 0) {
+        const int64_t ne_total = n_full_groups * ne01 * ne02 * ne03;
+        GGML_ASSERT(ne_total <= max_grid_x);
+        if (ne_total > 0) {  // a zero-sized grid is an invalid launch configuration
+            if (group_size == 128) {
+                k_set_rows_turbo2<idx_t, 128><<<(int)ne_total, 128, 0, stream>>>(
+                    src0_d, src1_d, (block_turbo2_0 *)dst->data,
+                    ne00, ne01, ne10, ne11, ne12, ne13,
+                    s01, s02, s03, s10, s11, s12,
+                    nb1, nb2, nb3);
+            } else {
+                k_set_rows_turbo2<idx_t, 64><<<(int)ne_total, 64, 0, stream>>>(
+                    src0_d, src1_d, (block_turbo2_0 *)dst->data,
+                    ne00, ne01, ne10, ne11, ne12, ne13,
+                    s01, s02, s03, s10, s11, s12,
+                    nb1, nb2, nb3);
+            }
+        }
+    }
+
+    // Launch 2: tail elements (no WHT, straight quantize)
+    // Currently unreachable: the assert above forces ne00 % group_size == 0, so tail_size
+    // is always 0.
+    if (tail_size > 0) {
+        GGML_ASSERT(tail_size % QK_TURBO2 == 0);
+        const int64_t n_rows = ne01 * ne02 * ne03;
+        GGML_ASSERT(n_rows <= max_grid_x);
+        if (n_rows > 0) {
+            k_set_rows_turbo2_tail<idx_t><<<(int)n_rows, tail_size, 0, stream>>>(
+                src0_d, src1_d, (block_turbo2_0 *)dst->data,
+                ne00, ne01, ne10, ne11, ne12, ne13,
+                s01, s02, s03, s10, s11, s12,
+                nb1, nb2, nb3, tail_size);
+        }
+    }
+}
+
+// ---- TurboQuant4 set_rows: 128-element groups with WHT rotation + 4-bit quantization ----
+//
+// turbo4 block size IS the WHT group size (128), so 1 CUDA block = 1 turbo4 block.
+// 128 threads per block, thread j handles element j.
+// 4-bit centroids (16 values), nibble packed: qs[j/2] |= (idx & 0xF) << ((j%2)*4)
+//
+// Launch contract (not checked inside the kernel):
+//   - blockIdx.x is a flat index over (block within row, i01, i02, i03), decoded below
+//   - blockDim.x must be 128: every thread indexes x[threadIdx.x] unguarded
+//   - all warp shuffles use the full 0xffffffff mask, so every lane must be active
+//
+// Parameters:
+//   src0           float source; s01/s02/s03 are its strides in float elements
+//   src1           destination row index per source row; s10/s11/s12 are strides in idx_t elements
+//   dst            turbo4 destination; s1/s2/s3 are byte strides (applied to a char pointer)
+//   ne00           source row length; ne00 / QK_TURBO4 blocks are processed per row
+//   ne01/ne11/ne12 extents used to decode blockIdx.x and to index src1
+//   ne10/ne13      unused
+//
+// NOTE(review): blockIdx.x is split into i02/i03 using ne12, where the usual set_rows decode
+// needs src0's ne02 (which is not passed in), and src1 is indexed with (i01, i01 % ne11, i02).
+// That agrees with ggml's set_rows broadcast indexing only when ne02 == ne11 == ne12 == 1 —
+// confirm that no caller passes batched src0 / index tensors.
+
+template <typename idx_t>
+__launch_bounds__(128)
+static __global__ void k_set_rows_turbo4(
+        const float * __restrict__ src0,
+        const idx_t * __restrict__ src1,
+        block_turbo4_0 * __restrict__ dst,
+        const int64_t ne00,
+        const int64_t ne01,
+        const int64_t ne10,
+        const int64_t ne11,
+        const int64_t ne12,
+        const int64_t ne13,
+        const int64_t s01,
+        const int64_t s02,
+        const int64_t s03,
+        const int64_t s10,
+        const int64_t s11,
+        const int64_t s12,
+        const int64_t s1,
+        const int64_t s2,
+        const int64_t s3) {
+
+    // blockIdx.x = flat block index; threadIdx.x = element within block (0..127)
+    const int j = threadIdx.x;
+
+    // Decode blockIdx.x → (i_blk, i01, i02, i03)
+    // (see the NOTE(review) in the header: the i02/i03 split uses ne12, not src0's ne02)
+    const int64_t n_blocks_per_row = ne00 / QK_TURBO4;
+    const int64_t g = blockIdx.x;
+    const int64_t i_blk = g % n_blocks_per_row;
+    int64_t       tmp   = g / n_blocks_per_row;
+    const int64_t i01   = tmp % ne01;
+    tmp                 = tmp / ne01;
+    const int64_t i02   = tmp % ne12;
+    const int64_t i03   = tmp / ne12;
+
+    const int64_t i12 = i02;
+    const int64_t i11 = i01 % ne11;
+    const int64_t i10 = i01;
+
+    // Destination row comes from the index tensor and is used unchecked as a byte-offset
+    // multiplier (dst_row*s1).
+    const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
+    const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03;
+    block_turbo4_0 * dst_row_ptr = (block_turbo4_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
+    block_turbo4_0 * blk = dst_row_ptr + i_blk;
+
+    // ---- Step 1: Load element j (coalesced) ----
+    // NOTE(review): x is sized with the literal 128 while the load offset uses QK_TURBO4;
+    // the two are presumably equal (see header comment) — confirm.
+    __shared__ float x[128];
+    x[j] = src_row[i_blk * QK_TURBO4 + j];
+    __syncthreads();
+
+    // ---- InnerQ: calibrate on original (unscaled) values ----
+    // d_innerq_* are declared outside this block (presumably in turbo-quant.cuh). All blocks
+    // accumulate into the same 128 slots; the counter is bumped once per block.
+    if (d_innerq_calibrating) {
+        atomicAdd(&d_innerq_sq_accum[j], x[j] * x[j]);
+        if (j == 0) atomicAdd(&d_innerq_count, 1);
+    }
+
+    // ---- InnerQ: apply channel scale (only when active) ----
+    if (d_innerq_active) {
+        x[j] *= d_innerq_scale[j];
+    }
+    __syncthreads();
+
+    // ---- Step 2: Parallel L2 norm ----
+    // Xor-shuffle reduction per warp, lane 0 publishes the partial, thread 0 sums the partials.
+    constexpr int n_warps = 128 / WARP_SIZE;  // = 4
+    __shared__ float warp_accum[n_warps];
+    float v = x[j];
+    float v2 = v * v;
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
+        v2 += __shfl_xor_sync(0xffffffff, v2, offset);
+    if (j % WARP_SIZE == 0)
+        warp_accum[j / WARP_SIZE] = v2;
+    __syncthreads();
+
+    __shared__ float s_norm_sq;
+    if (j == 0) {
+        float total = 0.0f;
+        for (int w = 0; w < n_warps; w++) total += warp_accum[w];
+        s_norm_sq = total;
+    }
+    __syncthreads();
+    const float grp_norm  = sqrtf(s_norm_sq);
+    // A (near-)zero block is normalized to all zeros instead of dividing by ~0.
+    const float inv_norm  = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f;
+
+    // ---- Step 3: Normalize ----
+    x[j] *= inv_norm;
+    __syncthreads();
+
+    // ---- Step 4: Forward WHT (signs1 → butterfly → signs2, normalized) ----
+    x[j] *= TURBO_WHT_SIGNS1[j];
+    __syncthreads();
+
+// One in-place butterfly stage with half-span h: the lower thread of each pair (j, j+h)
+// updates both elements, then the whole block synchronizes before the next stage.
+#define WHT_STAGE_SHARED_T4(h) \
+    if (j % (2*(h)) < (h)) { float a = x[j], b = x[j+(h)]; x[j] = a+b; x[j+(h)] = a-b; } \
+    __syncthreads();
+
+    // Seven stages (h = 1 .. 64) cover the 128-element transform.
+    WHT_STAGE_SHARED_T4(1)
+    WHT_STAGE_SHARED_T4(2)
+    WHT_STAGE_SHARED_T4(4)
+    WHT_STAGE_SHARED_T4(8)
+    WHT_STAGE_SHARED_T4(16)
+    WHT_STAGE_SHARED_T4(32)
+    WHT_STAGE_SHARED_T4(64)
+#undef WHT_STAGE_SHARED_T4
+
+    constexpr float inv_sqrt_128 = 0.08838834764831845f;
+    x[j] = x[j] * inv_sqrt_128 * TURBO_WHT_SIGNS2[j];
+    __syncthreads();
+
+    // ---- Step 5: Quantize element j to 4-bit centroid ----
+    const float rv = x[j];
+    const uint8_t idx = turbo_nearest_centroid_4bit(rv);
+
+    // ---- Step 6: Pack qs (nibble packed, warp-cooperative) ----
+    // 2 elements per byte, 4 bits each.
+    // Thread pairs (j, j+1) share a qs byte.
+    const int lane = j % WARP_SIZE;
+    const uint8_t my_nibble = idx & 0xF;
+    uint8_t qs_byte = 0;
+    // Gather nibble from partner thread
+    // (all lanes take part in the shuffle; only the even thread of each pair stores:
+    //  its own nibble goes to the low half, the odd partner's to the high half)
+    uint8_t partner_nibble = __shfl_sync(0xffffffff, my_nibble, lane ^ 1);
+    if (j % 2 == 0) {
+        qs_byte = my_nibble | (partner_nibble << 4);
+        blk->qs[j / 2] = qs_byte;
+    }
+
+    // ---- Step 7: Reconstruction norm (parallel) ----
+    // Sum of squared chosen centroids over the block; warp_accum is reused after the
+    // barriers above.
+    const float c = TURBO_CENTROIDS_4BIT[idx];
+    float rc = c * c;
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
+        rc += __shfl_xor_sync(0xffffffff, rc, offset);
+    if (j % WARP_SIZE == 0)
+        warp_accum[j / WARP_SIZE] = rc;
+    __syncthreads();
+
+    __shared__ float s_recon_sq;
+    if (j == 0) {
+        float total = 0.0f;
+        for (int w = 0; w < n_warps; w++) total += warp_accum[w];
+        s_recon_sq = total;
+    }
+    __syncthreads();
+    const float recon_norm     = sqrtf(s_recon_sq);
+    // Stored scale = original norm / norm of the quantized vector; falls back to the plain
+    // norm when the reconstruction norm is (near) zero.
+    const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm;
+
+    // ---- Step 8: Write corrected norm and zero rnorm (one thread) ----
+    if (j == 0) {
+        blk->norm  = __float2half(corrected_norm);
+        blk->rnorm = __float2half(0.0f);
+    }
+
+    GGML_UNUSED(ne10);
+    GGML_UNUSED(ne13);
+}
+
+// Host-side launcher for SET_ROWS with a TURBO4_0 destination.
+//
+//   ctx   CUDA backend context; the kernel is enqueued on ctx.stream()
+//   src0  float rows to store
+//   src1  destination row index for each source row (element type idx_t)
+//   dst   turbo4 destination
+//
+// One CUDA block of 128 threads is launched per QK_TURBO4-element block of every source row.
+// Aborts through GGML_ASSERT when the row length is not a multiple of QK_TURBO4, when the
+// tensors are batched in a way the kernel cannot index, or when the launch would exceed the
+// CUDA gridDim.x limit.
+template<typename idx_t>
+static void set_rows_cuda_turbo4(
+        ggml_backend_cuda_context & ctx,
+        const ggml_tensor * src0,
+        const ggml_tensor * src1,
+        ggml_tensor * dst) {
+
+    const float * src0_d = (const float *)src0->data;
+    const idx_t * src1_d = (const idx_t *)src1->data;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_ASSERT(ne00 % QK_TURBO4 == 0);  // must be block-aligned (128)
+
+    // k_set_rows_turbo4 splits blockIdx.x into (i02, i03) with ne12 instead of ne02 and reads
+    // src1 at (i01, i01 % ne11, i02). That only matches set_rows indexing for un-batched
+    // inputs; anything else would read/write the wrong rows, so reject it explicitly.
+    GGML_ASSERT(ne02 == 1 && ne11 == 1 && ne12 == 1);
+
+    cudaStream_t stream = ctx.stream();
+
+    // turbo4 block size = WHT group size = 128, always
+    const int64_t n_blocks = ne00 / QK_TURBO4;
+
+    // Kernel-side strides: element units for src0/src1, bytes (nb1..nb3) for dst.
+    const int64_t s01 = nb01/sizeof(float);
+    const int64_t s02 = nb02/sizeof(float);
+    const int64_t s03 = nb03/sizeof(float);
+    const int64_t s10 = nb10/sizeof(idx_t);
+    const int64_t s11 = nb11/sizeof(idx_t);
+    const int64_t s12 = nb12/sizeof(idx_t);
+
+    // InnerQ: check/finalize calibration before kernel launch
+    turbo_innerq_check_finalize(QK_TURBO4, ne00);
+
+    if (n_blocks > 0) {
+        const int64_t ne_total = n_blocks * ne01 * ne02 * ne03;
+        // gridDim.x is limited to 2^31 - 1; the launch narrows to int, so guard the range.
+        GGML_ASSERT(ne_total <= 0x7FFFFFFF);
+        if (ne_total > 0) {  // a zero-sized grid is an invalid launch configuration
+            k_set_rows_turbo4<idx_t><<<(int)ne_total, 128, 0, stream>>>(
+                src0_d, src1_d, (block_turbo4_0 *)dst->data,
+                ne00, ne01, ne10, ne11, ne12, ne13,
+                s01, s02, s03, s10, s11, s12,
+                nb1, nb2, nb3);
+        }
+    }
+}
+
 template<typename src_t, typename idx_t>
 static void set_rows_cuda(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const src_t * src0_d = (const src_t *)src0->data;
@@ -309,6 +1232,12 @@ static void set_rows_cuda(ggml_backend_cuda_context & ctx, const ggml_tensor * s
            nb1, nb2, nb3,
            stream
        );
+    } else if (dst->type == GGML_TYPE_TURBO3_0) {
+        set_rows_cuda_turbo3<idx_t>(ctx, src0, src1, dst);
+    } else if (dst->type == GGML_TYPE_TURBO2_0) {
+        set_rows_cuda_turbo2<idx_t>(ctx, src0, src1, dst);
+    } else if (dst->type == GGML_TYPE_TURBO4_0) {
+        set_rows_cuda_turbo4<idx_t>(ctx, src0, src1, dst);
    } else {
        GGML_ABORT("unsupported type %s", ggml_type_name(dst->type));
    }
@@ -4,3 +4,4 @@

 DECL_FATTN_MMA_F16_CASE(192, 128, 1, 16);
 DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
+DECL_FATTN_MMA_F16_CASE(640, 512, 1, 16);
@@ -4,3 +4,4 @@

 DECL_FATTN_MMA_F16_CASE(192, 128, 2, 16);
 DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
+DECL_FATTN_MMA_F16_CASE(640, 512, 2, 16);
@@ -4,3 +4,4 @@

 DECL_FATTN_MMA_F16_CASE(192, 128, 4, 16);
 DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
+DECL_FATTN_MMA_F16_CASE(640, 512, 4, 16);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-tile.cuh"
+
+DECL_FATTN_TILE_CASE(640, 512);
@@ -0,0 +1,7 @@
+// Mixed KV: f16 K + turbo2 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO2_0);
@@ -0,0 +1,7 @@
+// Mixed KV: f16 K + turbo3 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO3_0);
@@ -0,0 +1,7 @@
+// Mixed KV: f16 K + turbo4 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_TURBO4_0);
@@ -0,0 +1,7 @@
+// Mixed KV: q8_0 K + turbo2 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0);
@@ -0,0 +1,7 @@
+// Mixed KV: q8_0 K + turbo3 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0);
@@ -0,0 +1,7 @@
+// Mixed KV: q8_0 K + turbo4 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO4_0);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo2 K + f16 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_F16);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo2 K + q8_0 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0);
@@ -0,0 +1,7 @@
+// TurboQuant2 CUDA flash attention vec kernel instantiation: turbo2 K + turbo2 V; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo2 K + turbo3 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo2 K + turbo4 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO4_0);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo3 K + f16 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_F16);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo3 K + q8_0 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo3 K + turbo2 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO2_0);
@@ -0,0 +1,7 @@
+// TurboQuant3 CUDA flash attention vec kernel instantiation: turbo3 K + turbo3 V; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo3 K + turbo4 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo4 K + f16 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_F16);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo4 K + q8_0 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_Q8_0);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo4 K + turbo2 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO2_0);
@@ -0,0 +1,7 @@
+// Mixed KV: turbo4 K + turbo3 V. Flash-attention vec kernel instantiations; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO3_0);
@@ -0,0 +1,7 @@
+// TurboQuant4 CUDA flash attention vec kernel instantiation: turbo4 K + turbo4 V; macro args are (size, K type, V type).
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
@@ -0,0 +1,32 @@
+#include "turbo-innerq.cuh"
+#include <cstring>
+
+// Host-side shared state for InnerQ cross-TU communication (filled in by turbo_innerq_publish)
+TURBO_IQ_API bool  g_innerq_finalized = false;  // set to true by turbo_innerq_publish()
+TURBO_IQ_API float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS] = {  // per-channel 1/scale; every initialiser is 1 (identity)
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+};
+
+static bool g_innerq_tensor_needs_update = false;  // raised by turbo_innerq_publish(), lowered by turbo_innerq_mark_tensor_updated()
+
+// Publishes freshly computed InnerQ inverse scales to the host-side shared table.
+//
+//   scale_inv  - source array; its first min(group_size, INNERQ_MAX_CHANNELS) entries are read
+//   group_size - number of valid entries in scale_inv
+//
+// Entries of g_innerq_scale_inv_host beyond the copied range are reset to 1.0f (identity).
+// Afterwards g_innerq_finalized is set and the tensor-update flag is raised.
+void turbo_innerq_publish(const float * scale_inv, int group_size) {
+    // Clamp once so that neither a negative nor an oversized group_size can index
+    // outside the fixed-size table (a negative value used to start the fill loop
+    // below at a negative index).
+    int n_copy = group_size;
+    if (n_copy < 0) {
+        n_copy = 0;
+    }
+    if (n_copy > INNERQ_MAX_CHANNELS) {
+        n_copy = INNERQ_MAX_CHANNELS;
+    }
+
+    for (int i = 0; i < n_copy; i++) {
+        g_innerq_scale_inv_host[i] = scale_inv[i];
+    }
+    for (int i = n_copy; i < INNERQ_MAX_CHANNELS; i++) {
+        g_innerq_scale_inv_host[i] = 1.0f;
+    }
+    g_innerq_finalized = true;
+    g_innerq_tensor_needs_update = true;
+}
+
+TURBO_IQ_API bool turbo_innerq_needs_tensor_update(void) {  // true after turbo_innerq_publish() until turbo_innerq_mark_tensor_updated()
+    return g_innerq_tensor_needs_update;
+}
+
+TURBO_IQ_API void turbo_innerq_mark_tensor_updated(void) {  // acknowledges the published scales by clearing the pending flag
+    g_innerq_tensor_needs_update = false;
+}
@@ -0,0 +1,34 @@
+#pragma once
+
+// TurboQuant InnerQ per-channel equalization — cross-TU shared state
+// The host-side state lives in turbo-innerq.cu; device-side state is per-TU
+// in turbo-quant.cuh (only set-rows.cu needs device access).
+
+#define INNERQ_MAX_CHANNELS 128  // length of the per-channel InnerQ scale tables
+
+#ifdef GGML_BACKEND_SHARED  // TURBO_IQ_API: export/import decoration, only needed for shared-library builds
+#  if defined(_WIN32) && !defined(__MINGW32__)
+#    ifdef GGML_BACKEND_BUILD
+#      define TURBO_IQ_API __declspec(dllexport)
+#    else
+#      define TURBO_IQ_API __declspec(dllimport)
+#    endif
+#  else
+#    define TURBO_IQ_API __attribute__((visibility("default")))
+#  endif
+#else  // static build: TURBO_IQ_API expands to nothing
+#  define TURBO_IQ_API
+#endif
+
+// Host-side shared state (defined in turbo-innerq.cu)
+TURBO_IQ_API extern bool  g_innerq_finalized;  // true once scales have been published
+TURBO_IQ_API extern float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS];  // per-channel 1/scale
+
+// Called from set-rows.cu after InnerQ finalization to publish scale_inv (declared without TURBO_IQ_API)
+void turbo_innerq_publish(const float * scale_inv, int group_size);
+
+// Called from llama-kv-cache.cpp (or equivalent) to check if tensor needs update
+TURBO_IQ_API bool turbo_innerq_needs_tensor_update(void);
+
+// Called after tensor update to clear the flag
+TURBO_IQ_API void turbo_innerq_mark_tensor_updated(void);
@@ -0,0 +1,453 @@
+/*
+ * TurboQuant CUDA kernels for KV cache compression
+ * Based on: arXiv 2504.19874 (ICLR 2026)
+ *
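+ * Implements GGML_TYPE_TURBO3_0 (3-bit PolarQuant, block size 32) plus the turbo2/turbo4 KV-cache variants and the TQ3_1S/TQ4_1S weight codebooks.
+ * Constants, WHT rotation, InnerQ calibration state, quantize/dequantize device functions.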
+ */
+
+#pragma once
+
+#include "common.cuh"
+#include "turbo-innerq.cuh"
+#include <cstdlib>
+#include <cmath>
+
+// ---- Quantization ratios for dequantize_block template ----
+#define QR_TURBO3 1  // Each dequantize call produces 2 consecutive elements (like q8_0)
+#define QR_TURBO2 1  // Each dequantize call produces 2 consecutive elements (like q8_0)
+#define QR_TURBO4 1  // Each dequantize call produces 2 consecutive elements (like q8_0)
+
+// ---- 2-bit centroids (Lloyd-Max for N(0, 1/128)); multiplied by the block norm in turbo2_dequant_element ----
+
+static __constant__ float TURBO_CENTROIDS_2BIT[4] = {
+    -0.133462f, -0.039994f, 0.039994f, 0.133462f
+};
+
+static __constant__ float TURBO_MID_2BIT[3] = {  // midpoints between adjacent 2-bit centroids (decision thresholds)
+    -0.086728f, 0.0f, 0.086728f
+};
+
+// ---- 3-bit centroids (Lloyd-Max for N(0, 1/128)); multiplied by the block norm in turbo3_dequant_element ----
+
+static __constant__ float TURBO_CENTROIDS_3BIT[8] = {
+    -0.190685f, -0.117832f, -0.065717f, -0.021460f,
+     0.021460f,  0.065717f,  0.117832f,  0.190685f
+};
+
+// ---- Midpoints between adjacent 3-bit centroids, compared in order by turbo_nearest_centroid_3bit ----
+
+static __constant__ float TURBO_MID_3BIT[7] = {
+    -0.154259f, -0.091775f, -0.043589f, 0.0f,
+     0.043589f,  0.091775f,  0.154259f
+};
+
+// ---- WHT sign arrays (seed=42): SIGNS1 is applied before and SIGNS2 after the FWHT in turbo_rotate_forward ----
+
+static __constant__ float TURBO_WHT_SIGNS1[128] = {
+    -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+    1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f,
+    -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f,
+    1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f,
+    -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f,
+    1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f,
+    -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f,
+    1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f
+};
+
+static __constant__ float TURBO_WHT_SIGNS2[128] = {
+    1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f,
+    1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f,
+    1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f,
+    1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f,
+    1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f,
+    -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f,
+    1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f,
+    -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f
+};
+
+// ---- 64-element WHT sign arrays (first 64 of the 128-element arrays), used by turbo_rotate_forward_64 ----
+
+static __constant__ float TURBO_WHT_SIGNS1_64[64] = {
+    -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+    1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f,
+    -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f,
+    1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f
+};
+
+static __constant__ float TURBO_WHT_SIGNS2_64[64] = {
+    1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f,
+    1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f,
+    1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f,
+    1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f
+};
+
+// ---- In-place, normalized fast Walsh-Hadamard transform over 128 floats ----
+// 7 butterfly stages of 128 add/sub results each (896 in total), then a 1/sqrt(128) scale.
+
+static __device__ __forceinline__ void turbo_fwht_128(float * x) {
+    for (int half = 1; half < 128; half <<= 1) {
+        const int span = half << 1;
+        for (int base = 0; base < 128; base += span) {
+            float * lo = x + base;   // first half of the current butterfly group
+            float * hi = lo + half;  // second half of the current butterfly group
+            for (int k = 0; k < half; ++k) {
+                const float sum  = lo[k] + hi[k];
+                const float diff = lo[k] - hi[k];
+                lo[k] = sum;
+                hi[k] = diff;
+            }
+        }
+    }
+    const float inv_sqrt_n = 0.08838834764831845f;  // 1/sqrt(128)
+    for (int k = 0; k < 128; ++k) {
+        x[k] = x[k] * inv_sqrt_n;
+    }
+}
+
+// ---- In-place, normalized fast Walsh-Hadamard transform over 64 floats ----
+// 6 butterfly stages of 64 add/sub results each (384 in total), then a 1/sqrt(64) scale.
+
+static __device__ __forceinline__ void turbo_fwht_64(float * x) {
+    for (int half = 1; half < 64; half <<= 1) {
+        const int span = half << 1;
+        for (int base = 0; base < 64; base += span) {
+            float * lo = x + base;   // first half of the current butterfly group
+            float * hi = lo + half;  // second half of the current butterfly group
+            for (int k = 0; k < half; ++k) {
+                const float sum  = lo[k] + hi[k];
+                const float diff = lo[k] - hi[k];
+                lo[k] = sum;
+                hi[k] = diff;
+            }
+        }
+    }
+    const float inv_sqrt_n = 0.125f;  // 1/sqrt(64)
+    for (int k = 0; k < 64; ++k) {
+        x[k] = x[k] * inv_sqrt_n;
+    }
+}
+
+// ---- Forward rotation of a 128-element group, in place ----
+// Order of operations: multiply by TURBO_WHT_SIGNS1, normalized FWHT, multiply by TURBO_WHT_SIGNS2.
+
+static __device__ __forceinline__ void turbo_rotate_forward(float * x) {
+    for (int k = 0; k < 128; ++k) {
+        x[k] = x[k] * TURBO_WHT_SIGNS1[k];
+    }
+    turbo_fwht_128(x);
+    for (int k = 0; k < 128; ++k) {
+        x[k] = x[k] * TURBO_WHT_SIGNS2[k];
+    }
+}
+
+// ---- Forward rotation of a 64-element group, in place ----
+// Order of operations: multiply by TURBO_WHT_SIGNS1_64, normalized FWHT, multiply by TURBO_WHT_SIGNS2_64.
+
+static __device__ __forceinline__ void turbo_rotate_forward_64(float * x) {
+    for (int k = 0; k < 64; ++k) {
+        x[k] = x[k] * TURBO_WHT_SIGNS1_64[k];
+    }
+    turbo_fwht_64(x);
+    for (int k = 0; k < 64; ++k) {
+        x[k] = x[k] * TURBO_WHT_SIGNS2_64[k];
+    }
+}
+
+// ---- InnerQ per-channel equalization ----
+// Equalizes K channel variances before WHT rotation to reduce quantization error.
+// Enabled via TURBO_INNERQ=N env var (N = calibration token count); TURBO_INNERQ_STRENGTH sets the exponent.
+// Math: <Q/s, s*K> = <Q, K> preserves dot products.
+// INNERQ_MAX_CHANNELS is defined in turbo-innerq.cuh
+
+static __device__ float d_innerq_scale[INNERQ_MAX_CHANNELS];      // per-channel scale s, uploaded by turbo_innerq_finalize
+static __device__ float d_innerq_scale_inv[INNERQ_MAX_CHANNELS];  // per-channel 1/s, uploaded by turbo_innerq_finalize
+static __device__ float d_innerq_sq_accum[INNERQ_MAX_CHANNELS];   // per-channel sum of squares, read back by turbo_innerq_finalize
+static __device__ int   d_innerq_count;        // sample counter, compared against innerq_target_tokens on the host
+static __device__ int   d_innerq_active;       // 0 = scales are identity, 1 = scales applied
+static __device__ int   d_innerq_calibrating;  // 1 = accumulating K² stats
+
+static int  innerq_enabled       = 0;  // host: 0=off, 1=calibrating, 2=active
+static int  innerq_target_tokens = 0;  // host: parsed from TURBO_INNERQ
+static float innerq_strength     = 0.5f;  // host: exponent applied to mean_rms/rms, from TURBO_INNERQ_STRENGTH
+static bool  innerq_initialized  = false;  // host: set once turbo_innerq_init() has run
+
+// Host: one-time InnerQ setup driven by environment variables.
+//   TURBO_INNERQ=N          - N > 0 starts calibration with target N; unset or <= 0 leaves InnerQ off
+//   TURBO_INNERQ_STRENGTH=s - exponent used when the scales are computed; values outside (0, 1]
+//                             (including NaN) fall back to 0.5
+// When enabled, zeroes the device accumulators/counters and raises d_innerq_calibrating.
+// Every call after the first returns immediately.
+static void turbo_innerq_init(void) {
+    if (innerq_initialized) {
+        return;
+    }
+    innerq_initialized = true;
+
+    // Parse the token target once instead of calling atoi() twice on the same string.
+    const char * env    = getenv("TURBO_INNERQ");
+    const int    target = env ? atoi(env) : 0;
+    if (target <= 0) {
+        innerq_enabled = 0;
+        return;
+    }
+    innerq_target_tokens = target;
+    innerq_enabled = 1;  // calibrating
+
+    const char * env_str = getenv("TURBO_INNERQ_STRENGTH");
+    if (env_str) {
+        innerq_strength = (float) atof(env_str);
+    }
+    // Written as a negated range test so that NaN is rejected as well: with the former
+    // "<= 0 || > 1" form a NaN strength passed through and would later turn every
+    // computed scale into NaN.
+    if (!(innerq_strength > 0.0f && innerq_strength <= 1.0f)) {
+        innerq_strength = 0.5f;
+    }
+
+    // Zero accumulators and set calibrating flag on device
+    float zeros[INNERQ_MAX_CHANNELS] = {0};
+    int zero = 0, one = 1;
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_sq_accum, zeros, sizeof(zeros)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_count, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &one, sizeof(int)));
+
+    GGML_LOG_INFO("%s: InnerQ calibration started (target=%d tokens, strength=%.2f)\n",
+                   __func__, innerq_target_tokens, innerq_strength);
+}
+
+// Host: finalize calibration — compute scales, upload, activate.
+//
+// Reads the per-channel sum-of-squares accumulators and the counter back from the device,
+// derives scale[i] = clamp((mean_rms / rms[i])^innerq_strength, 0.5, 2.0) for the first
+// group_size channels and then either
+//   - disables InnerQ (counter is 0, or every mean_rms/rms ratio lies inside (1/1.2, 1.2)), or
+//   - uploads scale / scale_inv, sets d_innerq_active and publishes scale_inv to the host table.
+//
+// group_size must be in [1, INNERQ_MAX_CHANNELS]: it sizes every copy into the fixed-size
+// buffers and device symbols below and is the divisor of the RMS mean.
+static void turbo_innerq_finalize(int group_size) {
+    GGML_ASSERT(group_size > 0 && group_size <= INNERQ_MAX_CHANNELS);
+
+    // Read accumulators from device
+    float sq_accum[INNERQ_MAX_CHANNELS];
+    int count = 0;
+    CUDA_CHECK(cudaMemcpyFromSymbol(sq_accum, d_innerq_sq_accum, group_size * sizeof(float)));
+    CUDA_CHECK(cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)));
+
+    if (count <= 0) {
+        GGML_LOG_WARN("%s: InnerQ calibration got 0 tokens, disabling\n", __func__);
+        innerq_enabled = 0;
+        int zero = 0;
+        CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
+        return;
+    }
+
+    // Compute per-channel RMS
+    float rms[INNERQ_MAX_CHANNELS];
+    float mean_rms = 0.0f;
+    float max_ratio = 0.0f, min_ratio = 1e30f;
+    for (int i = 0; i < group_size; i++) {
+        rms[i] = sqrtf(sq_accum[i] / (float)count);
+        mean_rms += rms[i];
+    }
+    mean_rms /= (float)group_size;
+
+    // Compute scale[i] = (mean_rms / channel_rms[i])^strength, clamp to [0.5, 2.0]
+    float scale[INNERQ_MAX_CHANNELS];
+    float scale_inv[INNERQ_MAX_CHANNELS];
+    for (int i = 0; i < group_size; i++) {
+        // Channels with (near-)zero RMS keep the identity ratio instead of dividing by ~0.
+        float ratio = (rms[i] > 1e-10f) ? (mean_rms / rms[i]) : 1.0f;
+        float s = powf(ratio, innerq_strength);
+        if (s < 0.5f) s = 0.5f;
+        if (s > 2.0f) s = 2.0f;
+        scale[i] = s;
+        scale_inv[i] = 1.0f / s;
+        // The min/max statistics track the unclamped ratio, not the clamped scale.
+        if (ratio > max_ratio) max_ratio = ratio;
+        if (ratio < min_ratio) min_ratio = ratio;
+    }
+
+    // Auto-skip if every channel ratio is within (1/1.2, 1.2) (already balanced)
+    if (max_ratio < 1.2f && min_ratio > (1.0f / 1.2f)) {
+        GGML_LOG_INFO("%s: InnerQ auto-disabled (channels already balanced, max_ratio=%.3f)\n",
+                       __func__, max_ratio);
+        innerq_enabled = 0;
+        int zero = 0;
+        CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
+        return;
+    }
+
+    // Stop calibrating, upload scales, activate
+    int zero = 0, one = 1;
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale, scale, group_size * sizeof(float)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale_inv, scale_inv, group_size * sizeof(float)));
+    CUDA_CHECK(cudaDeviceSynchronize());  // ensure scales are visible before activating
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &one, sizeof(int)));
+
+    innerq_enabled = 2;  // active
+
+    // Publish scale_inv to shared host state for cross-TU tensor update
+    turbo_innerq_publish(scale_inv, group_size);
+
+    GGML_LOG_INFO("%s: InnerQ finalized (%d tokens, max_ratio=%.3f, min_ratio=%.3f)\n",
+                   __func__, count, max_ratio, min_ratio);
+}
+
+// Host: InnerQ housekeeping, called before each set_rows kernel launch.
+//   group_size - WHT group size used by the caller
+//   ne00       - row length of the source tensor (only used in the warning message)
+//
+// Runs the lazy initialization, then:
+//   - returns immediately when InnerQ is off;
+//   - for group_size < 128, disables a running calibration (with a warning) and returns;
+//   - while calibrating, reads the device counter and finalizes once the target is reached.
+static void turbo_innerq_check_finalize(int group_size, int64_t ne00) {
+    // turbo_innerq_init() returns immediately once it has run, so it can be called unconditionally.
+    turbo_innerq_init();
+
+    if (innerq_enabled == 0) {
+        return;
+    }
+
+    // InnerQ assumes one WHT group per head (group_size == head_dim). Per the original
+    // rationale, a group size below 128 is taken to mean head_dim > 128, i.e. several
+    // groups per head (e.g. GLM 576 with 64-element groups), which InnerQ does not support.
+    if (group_size < 128) {
+        if (innerq_enabled == 1) {
+            GGML_LOG_WARN("%s: InnerQ disabled (ne00=%lld != group_size=%d, multi-group heads)\n",
+                           __func__, (long long)ne00, group_size);
+            innerq_enabled = 0;
+            int stop = 0;
+            CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &stop, sizeof(int)));
+        }
+        return;
+    }
+
+    if (innerq_enabled != 1) {
+        return;  // already active: nothing left to finalize
+    }
+
+    // Still calibrating: finalize as soon as the device counter reaches the target.
+    int n_seen = 0;
+    CUDA_CHECK(cudaMemcpyFromSymbol(&n_seen, d_innerq_count, sizeof(int)));
+    if (n_seen >= innerq_target_tokens) {
+        turbo_innerq_finalize(group_size);
+    }
+}
+
+// Host: true once turbo_innerq_finalize() has uploaded the scales (innerq_enabled == 2)
+static bool turbo_innerq_is_active(void) {
+    return innerq_enabled == 2;
+}
+
+// ---- 4-bit centroids (Lloyd-Max for N(0, 1/128)); multiplied by the block norm in turbo4_dequant_element ----
+
+static __constant__ float TURBO_CENTROIDS_4BIT[16] = {
+    -0.173926f, -0.117195f, -0.089527f, -0.068756f,
+    -0.051262f, -0.035597f, -0.020989f, -0.006938f,
+     0.006938f,  0.020989f,  0.035597f,  0.051262f,
+     0.068756f,  0.089527f,  0.117195f,  0.173926f
+};
+
+// ---- Midpoints between adjacent 4-bit centroids, compared in order by turbo_nearest_centroid_4bit ----
+
+static __constant__ float TURBO_MID_4BIT[15] = {
+    -0.145561f, -0.103361f, -0.079142f, -0.060009f,
+    -0.043430f, -0.028293f, -0.013964f,  0.000000f,
+     0.013964f,  0.028293f,  0.043430f,  0.060009f,
+     0.079142f,  0.103361f,  0.145561f
+};
+
+// ---- Nearest 4-bit centroid index ----
+
+static __device__ __forceinline__ uint8_t turbo_nearest_centroid_4bit(float val) {
+    if      (val < TURBO_MID_4BIT[ 0]) return  0;
+    else if (val < TURBO_MID_4BIT[ 1]) return  1;
+    else if (val < TURBO_MID_4BIT[ 2]) return  2;
+    else if (val < TURBO_MID_4BIT[ 3]) return  3;
+    else if (val < TURBO_MID_4BIT[ 4]) return  4;
+    else if (val < TURBO_MID_4BIT[ 5]) return  5;
+    else if (val < TURBO_MID_4BIT[ 6]) return  6;
+    else if (val < TURBO_MID_4BIT[ 7]) return  7;
+    else if (val < TURBO_MID_4BIT[ 8]) return  8;
+    else if (val < TURBO_MID_4BIT[ 9]) return  9;
+    else if (val < TURBO_MID_4BIT[10]) return 10;
+    else if (val < TURBO_MID_4BIT[11]) return 11;
+    else if (val < TURBO_MID_4BIT[12]) return 12;
+    else if (val < TURBO_MID_4BIT[13]) return 13;
+    else if (val < TURBO_MID_4BIT[14]) return 14;
+    else                               return 15;
+}
+
+// ---- Per-block quantize for turbo4 (128 elements, expects already-rotated input) ----
+// Packs two 4-bit centroid indices per byte of dst->qs: even element in the low nibble,
+// odd element in the high nibble. Only qs is written here.
+
+static __device__ void quantize_f32_turbo4_0_block(const float * __restrict__ src,
+                                                    block_turbo4_0 * __restrict__ dst) {
+    for (int b = 0; b < QK_TURBO4 / 2; ++b) {
+        const uint8_t lo = turbo_nearest_centroid_4bit(src[2 * b]);
+        const uint8_t hi = turbo_nearest_centroid_4bit(src[2 * b + 1]);
+        dst->qs[b] = (uint8_t) ((lo & 0xF) | ((hi & 0xF) << 4));
+    }
+}
+
+// ---- Inline dequant helper for turbo4 ----
+// Looks up the 4-bit code of element j (low nibble for even j, high nibble for odd j)
+// and returns its centroid scaled by norm.
+
+static __device__ __forceinline__ float turbo4_dequant_element(
+        const block_turbo4_0 * __restrict__ x, int j, float norm) {
+    const int     shift = (j % 2) * 4;
+    const uint8_t code  = (x->qs[j / 2] >> shift) & 0xF;
+    return norm * TURBO_CENTROIDS_4BIT[code];
+}
+
+// ---- Nearest 3-bit centroid index ----
+
+static __device__ __forceinline__ uint8_t turbo_nearest_centroid_3bit(float val) {
+    if      (val < TURBO_MID_3BIT[0]) return 0;
+    else if (val < TURBO_MID_3BIT[1]) return 1;
+    else if (val < TURBO_MID_3BIT[2]) return 2;
+    else if (val < TURBO_MID_3BIT[3]) return 3;
+    else if (val < TURBO_MID_3BIT[4]) return 4;
+    else if (val < TURBO_MID_3BIT[5]) return 5;
+    else if (val < TURBO_MID_3BIT[6]) return 6;
+    else                              return 7;
+}
+
+// ---- Per-block quantize for turbo3 (32 elements, expects already-rotated input) ----
+// Used by set_rows after group-level WHT rotation.
+// Each 3-bit code is split: its low two bits go into dst->qs (four codes per byte),
+// its top bit goes into dst->signs (eight codes per byte).
+
+static __device__ void quantize_f32_turbo3_0_block(const float * __restrict__ src,
+                                                    block_turbo3_0 * __restrict__ dst) {
+    for (int b = 0; b < QK_TURBO3 / 4; ++b) {
+        dst->qs[b] = 0;
+    }
+    for (int b = 0; b < QK_TURBO3 / 8; ++b) {
+        dst->signs[b] = 0;
+    }
+
+    for (int e = 0; e < QK_TURBO3; ++e) {
+        const uint8_t code    = turbo_nearest_centroid_3bit(src[e]);
+        const int     lo_bits = code & 0x3;
+        const int     hi_bit  = (code >> 2) & 0x1;
+        dst->qs[e / 4]    |= lo_bits << ((e % 4) * 2);
+        dst->signs[e / 8] |= hi_bit << (e % 8);  // no-op when the top bit is clear
+    }
+}
+
+// ---- Inline dequant helper for turbo3 ----
+// Reassembles the 3-bit code of element j from qs (low two bits) and signs (top bit)
+// and returns its centroid scaled by norm.
+
+static __device__ __forceinline__ float turbo3_dequant_element(
+        const block_turbo3_0 * __restrict__ x, int j, float norm) {
+    const int     lo_shift = (j % 4) * 2;
+    const uint8_t lo_bits  = (x->qs[j / 4] >> lo_shift) & 0x3;
+    const uint8_t hi_bit   = (x->signs[j / 8] >> (j % 8)) & 0x1;
+    const uint8_t code     = (uint8_t) ((hi_bit << 2) | lo_bits);
+    return norm * TURBO_CENTROIDS_3BIT[code];
+}
+
+// ---- Nearest 2-bit centroid index ----
+
+static __device__ __forceinline__ uint8_t turbo_nearest_centroid_2bit(float val) {
+    if      (val < TURBO_MID_2BIT[0]) return 0;
+    else if (val < TURBO_MID_2BIT[1]) return 1;
+    else if (val < TURBO_MID_2BIT[2]) return 2;
+    else                              return 3;
+}
+
+// ---- Per-block quantize for turbo2 (32 elements, expects already-rotated input) ----
+// Packs four 2-bit centroid indices per byte of dst->qs, element 4*b + k at bit offset 2*k.
+// Only qs is written here.
+
+static __device__ void quantize_f32_turbo2_0_block(const float * __restrict__ src,
+                                                    block_turbo2_0 * __restrict__ dst) {
+    for (int b = 0; b < QK_TURBO2 / 4; ++b) {
+        uint8_t packed = 0;
+        for (int k = 0; k < 4; ++k) {
+            const uint8_t code = turbo_nearest_centroid_2bit(src[4 * b + k]);
+            packed |= (code & 0x3) << (2 * k);
+        }
+        dst->qs[b] = packed;
+    }
+}
+
+// ---- Inline dequant helper for turbo2 ----
+// Looks up the 2-bit code of element j (four codes per byte of qs) and returns its
+// centroid scaled by norm.
+
+static __device__ __forceinline__ float turbo2_dequant_element(
+        const block_turbo2_0 * __restrict__ x, int j, float norm) {
+    const int     shift = (j % 4) * 2;
+    const uint8_t code  = (x->qs[j / 4] >> shift) & 0x3;
+    return norm * TURBO_CENTROIDS_2BIT[code];
+}
+
+// ============================================================================
+// Weight compression types (TQ3_1S, TQ4_1S)
+// These use N(0,1) centroids (NOT N(0,1/128) like KV cache types)
+// and require inverse WHT (RHT) after centroid lookup.
+// ============================================================================
+
+#define QR_TQ4_1S 1  // dequantize produces 2 consecutive elements
+#define QR_TQ3_1S 1  // same ratio as QR_TQ4_1S
+
+// ---- Weight centroids: Lloyd-Max for N(0,1) ----
+
+static __constant__ float TQ4_CENTROIDS_WEIGHT[16] = {  // mirror-symmetric about 0
+    -2.732590f, -2.069017f, -1.618046f, -1.256231f,
+    -0.942340f, -0.656759f, -0.388048f, -0.128395f,
+     0.128395f,  0.388048f,  0.656759f,  0.942340f,
+     1.256231f,  1.618046f,  2.069017f,  2.732590f
+};
+
+static __constant__ float TQ3_CENTROIDS_WEIGHT[8] = {  // NOTE(review): not mirror-symmetric (e.g. -1.996684 vs 1.988943), unlike the 4-bit table — confirm against the CPU reference
+    -1.996684f, -1.291398f, -0.740341f, -0.247508f,
+     0.230106f,  0.725222f,  1.277503f,  1.988943f
+};
+
+// ---- Sign array for weight WHT (golden ratio hash, 32 elements) ----
+
+static __constant__ float TQ_WEIGHT_SIGNS[32] = {
+    +1.0f, -1.0f, +1.0f, -1.0f, +1.0f, +1.0f, -1.0f, +1.0f,
+    -1.0f, -1.0f, +1.0f, -1.0f, +1.0f, +1.0f, -1.0f, +1.0f,
+    -1.0f, -1.0f, +1.0f, -1.0f, +1.0f, -1.0f, -1.0f, +1.0f,
+    -1.0f, +1.0f, +1.0f, -1.0f, +1.0f, -1.0f, -1.0f, +1.0f
+};
@@ -0,0 +1,189 @@
+#include "turbo-quant.cuh"
+#include "turbo-wht.cuh"
+
+// ─── CUDA kernel ──────────────────────────────────────────────────────────────
+//
+// Templated on direction and group_size (128, 64, or 32).
+// One block per group, group_size threads per block.
+// direction: 0 = forward (signs1 → WHT → signs2), 1 = inverse (signs2 → WHT → signs1)
+// For group_size == 32 both directions run TQ_WEIGHT_SIGNS → WHT → normalize (no second sign array).
+// When head_dim is not a multiple of group_size, only the full groups
+// within each head are processed.  Tail elements are not touched here (see k_turbo_wht_copy_tail).
+//
+// Algorithm mirrors the CPU implementation in ggml-cpu/ops.cpp:
+//   1. Apply s_first elementwise
+//   2. Radix-2 Hadamard butterfly (log2(group_size) stages, in-place)
+//   3. Normalize by 1/sqrt(group_size) and apply s_second elementwise
+//
+// InnerQ scale_inv: when non-null, applies per-channel inverse scaling for
+// Q/V equalization. For forward (Q rotation): multiply BEFORE signs+WHT.
+// For inverse (V un-rotation): multiply AFTER WHT+signs.
+
+template <int direction, int group_size>
+static __global__ void k_turbo_wht_f32(const float * __restrict__ src,
+                                        float * __restrict__ dst,
+                                        const float * __restrict__ scale_inv,
+                                        int64_t n_groups,
+                                        int64_t head_dim,
+                                        int64_t groups_per_head) {
+    static_assert(group_size == 128 || group_size == 64 || group_size == 32, "group_size must be 128, 64, or 32");
+
+    const int64_t g = blockIdx.x;
+    if (g >= n_groups) return;  // g is per-block, so the whole block leaves together before any barrier
+
+    const int t = threadIdx.x;  // 0 .. group_size-1
+
+    // Map group index to position in the tensor:
+    // each head has groups_per_head full groups, then a gap of tail elements.
+    const int64_t head_idx     = g / groups_per_head;
+    const int64_t grp_in_head  = g % groups_per_head;
+    const int64_t base         = head_idx * head_dim + grp_in_head * group_size;
+
+    __shared__ float x[group_size];
+
+    // Load from global memory
+    x[t] = src[base + t];
+    __syncthreads();
+
+    // InnerQ forward: apply scale_inv BEFORE signs+WHT (for Q pre-rotation)
+    if (direction == 0 && scale_inv != nullptr) {
+        x[t] *= scale_inv[t % group_size];  // NOTE(review): index depends only on t, so every group reads scale_inv[0..group_size-1] — confirm this is intended
+        __syncthreads();
+    }
+
+    // Apply first sign array
+    if (group_size == 128) {
+        x[t] *= (direction == 0) ? TURBO_WHT_SIGNS1[t] : TURBO_WHT_SIGNS2[t];
+    } else if (group_size == 64) {
+        x[t] *= (direction == 0) ? TURBO_WHT_SIGNS1_64[t] : TURBO_WHT_SIGNS2_64[t];
+    } else {
+        // group_size == 32: TQ weight signs (same for forward and inverse)
+        x[t] *= TQ_WEIGHT_SIGNS[t];
+    }
+    __syncthreads();
+
+    // WHT butterfly — log2(group_size) stages.
+    // In stage h, threads where (t % (2h)) < h read x[t] and x[t+h],
+    // then write x[t] = a+b and x[t+h] = a-b.  Each active thread
+    // owns a disjoint pair, so no intra-stage conflicts exist.
+#define WHT_STAGE(h) \
+    if (t % (2*(h)) < (h)) { float a = x[t], b = x[t+(h)]; x[t] = a+b; x[t+(h)] = a-b; } \
+    __syncthreads();
+
+    WHT_STAGE(1)
+    WHT_STAGE(2)
+    WHT_STAGE(4)
+    WHT_STAGE(8)
+    WHT_STAGE(16)
+    if (group_size >= 64) { WHT_STAGE(32) }
+    if (group_size == 128) { WHT_STAGE(64) }
+#undef WHT_STAGE
+
+    // Normalize and apply second sign array, write to output
+    constexpr float inv_sqrt = (group_size == 128) ? 0.08838834764831845f :
+                               (group_size == 64)  ? 0.125f :
+                                                     0.17677669529663688f; // 1/sqrt(32)
+    float result;
+    if (group_size == 128) {
+        result = x[t] * inv_sqrt *
+            ((direction == 0) ? TURBO_WHT_SIGNS2[t] : TURBO_WHT_SIGNS1[t]);
+    } else if (group_size == 64) {
+        result = x[t] * inv_sqrt *
+            ((direction == 0) ? TURBO_WHT_SIGNS2_64[t] : TURBO_WHT_SIGNS1_64[t]);
+    } else {
+        // group_size == 32: normalize only (signs already applied before butterfly)
+        result = x[t] * inv_sqrt;
+    }
+
+    // InnerQ inverse: apply scale_inv AFTER WHT+signs (for V un-rotation)
+    if (direction == 1 && scale_inv != nullptr) {
+        result *= scale_inv[t % group_size];  // same per-thread index as the forward path
+    }
+
+    dst[base + t] = result;
+}
+
+// ─── Simple copy kernel for tail elements (identity pass-through) ────────────
+
+static __global__ void k_turbo_wht_copy_tail(const float * __restrict__ src,
+                                              float * __restrict__ dst,
+                                              int64_t n_heads,
+                                              int64_t head_dim,
+                                              int64_t tail_offset,
+                                              int tail_size) {
+    // One thread per tail element; threads are numbered head-major across all tails.
+    const int64_t tid = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < n_heads * tail_size) {
+        // Tail element k of head h sits at h*head_dim + tail_offset + k.
+        const int64_t pos = (tid / tail_size) * head_dim + tail_offset + (tid % tail_size);
+        dst[pos] = src[pos];
+    }
+}
+
+// ─── Dispatch ─────────────────────────────────────────────────────────────────
+
+void ggml_cuda_turbo_wht(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src = dst->src[0];
+    const ggml_tensor * scale_tensor = dst->src[1];  // InnerQ scale_inv (may be NULL)
+
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    int direction;
+    int group_size;
+    memcpy(&direction, dst->op_params + 0, sizeof(int));
+    memcpy(&group_size, dst->op_params + sizeof(int), sizeof(int));  // NOTE(review): if op_params is an int32_t array this offset is in elements, not bytes — confirm it matches the code that writes the params
+
+    const int64_t head_dim        = src->ne[0];                       // each row of ne[0] elements is treated as one head
+    const int64_t n_heads         = ggml_nelements(src) / head_dim;
+
+    GGML_ASSERT(group_size == 32 || group_size == 64 || group_size == 128);
+    const int64_t groups_per_head = head_dim / group_size;            // full groups only
+    const int     tail_size       = (int)(head_dim % group_size);     // leftover elements per head, copied verbatim below
+    const int64_t n_groups        = groups_per_head * n_heads;
+
+    const float * src_ptr = (const float *) src->data;
+    float       * dst_ptr = (float       *) dst->data;
+    const float * scale_inv_ptr = scale_tensor ? (const float *) scale_tensor->data : nullptr;
+
+    cudaStream_t stream = ctx.stream();
+
+    // Process full groups
+    if (n_groups > 0) {
+        dim3 blocks(n_groups);  // one block per full group
+        if (group_size == 128) {
+            dim3 threads(128);
+            if (direction == 0) {
+                k_turbo_wht_f32<0, 128><<<blocks, threads, 0, stream>>>(src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head);
+            } else {
+                k_turbo_wht_f32<1, 128><<<blocks, threads, 0, stream>>>(src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head);
+            }
+        } else if (group_size == 64) {
+            dim3 threads(64);
+            if (direction == 0) {
+                k_turbo_wht_f32<0, 64><<<blocks, threads, 0, stream>>>(src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head);
+            } else {
+                k_turbo_wht_f32<1, 64><<<blocks, threads, 0, stream>>>(src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head);
+            }
+        } else {
+            dim3 threads(32);
+            if (direction == 0) {
+                k_turbo_wht_f32<0, 32><<<blocks, threads, 0, stream>>>(src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head);
+            } else {
+                k_turbo_wht_f32<1, 32><<<blocks, threads, 0, stream>>>(src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head);
+            }
+        }
+    }
+
+    // Pass through tail elements unchanged (no rotation)
+    // Only launched when head_dim is not a multiple of group_size (e.g. head_dim 192 with group_size 128)
+    if (tail_size > 0) {
+        const int64_t total_tail = n_heads * tail_size;
+        const int block_sz = 256;
+        const int n_blocks = (int)((total_tail + block_sz - 1) / block_sz);
+        k_turbo_wht_copy_tail<<<n_blocks, block_sz, 0, stream>>>(
+            src_ptr, dst_ptr, n_heads, head_dim, groups_per_head * group_size, tail_size);
+    }
+}
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "common.cuh"
+
+void ggml_cuda_turbo_wht(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -33,11 +33,30 @@
 #define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
 #define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
 #define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
-#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
-#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+// __shfl_sync: support both 3-arg (mask, var, srcLane) and 4-arg (mask, var, srcLane, width) calls
+// The mask is accepted for CUDA source compatibility and dropped: the __shfl* calls below take no mask.
+#define __SHFL_SYNC_3(mask, var, srcLane)        __shfl(var, srcLane, warpSize)
+#define __SHFL_SYNC_4(mask, var, srcLane, width) __shfl(var, srcLane, width)
+#define __SHFL_GET_MACRO(_1, _2, _3, _4, NAME, ...) NAME
+#define __shfl_sync(...) __SHFL_GET_MACRO(__VA_ARGS__, __SHFL_SYNC_4, __SHFL_SYNC_3)(__VA_ARGS__)
+// __shfl_up_sync: support 3-arg and 4-arg calls (HIP ignores mask)
+#define __SHFL_UP_SYNC_3(mask, var, delta)        __shfl_up(var, delta, warpSize)
+#define __SHFL_UP_SYNC_4(mask, var, delta, width) __shfl_up(var, delta, width)
+#define __SHFL_UP_GET(_1, _2, _3, _4, NAME, ...) NAME
+#define __shfl_up_sync(...) __SHFL_UP_GET(__VA_ARGS__, __SHFL_UP_SYNC_4, __SHFL_UP_SYNC_3)(__VA_ARGS__)
+// __shfl_xor_sync: support 3-arg and 4-arg calls (HIP ignores mask)
+#define __SHFL_XOR_SYNC_3(mask, var, laneMask)        __shfl_xor(var, laneMask, warpSize)
+#define __SHFL_XOR_SYNC_4(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define __SHFL_XOR_GET(_1, _2, _3, _4, NAME, ...) NAME
+#define __shfl_xor_sync(...) __SHFL_XOR_GET(__VA_ARGS__, __SHFL_XOR_SYNC_4, __SHFL_XOR_SYNC_3)(__VA_ARGS__)
+// __shfl_down_sync: support 3-arg and 4-arg calls (HIP ignores mask)
+#define __SHFL_DOWN_SYNC_3(mask, var, delta)        __shfl_down(var, delta, warpSize)
+#define __SHFL_DOWN_SYNC_4(mask, var, delta, width) __shfl_down(var, delta, width)
+#define __SHFL_DOWN_GET(_1, _2, _3, _4, NAME, ...) NAME
+#define __shfl_down_sync(...) __SHFL_DOWN_GET(__VA_ARGS__, __SHFL_DOWN_SYNC_4, __SHFL_DOWN_SYNC_3)(__VA_ARGS__)
 #define __all_sync(mask, var) __all(var)
 #define __any_sync(mask, var) __any(var)
+#define __ballot_sync(mask, var) ((uint32_t)__ballot(var)) // NOTE(review): result is narrowed to 32 bits — confirm no caller needs lanes >= 32 on 64-wide wavefronts
 #define cublasStrsmBatched hipblasStrsmBatched
 #define cublasCreate hipblasCreate
 #define cublasDestroy hipblasDestroy
@@ -121,6 +140,10 @@
 #define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent hipStreamWaitEvent
+#define cudaMemcpyToSymbol hipMemcpyToSymbol
+#define cudaMemcpyFromSymbol hipMemcpyFromSymbol
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define cudaGraphExec_t hipGraphExec_t
 #define cudaGraphNode_t hipGraphNode_t
 #define cudaKernelNodeParams hipKernelNodeParams
@@ -133,6 +156,9 @@
 #define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
 #define cudaGraphInstantiate hipGraphInstantiate
 #define cudaStreamEndCapture hipStreamEndCapture
+#define cudaStreamCaptureStatus hipStreamCaptureStatus
+#define cudaStreamIsCapturing hipStreamIsCapturing
+#define cudaStreamCaptureStatusNone hipStreamCaptureStatusNone
 #define cudaGraphDestroy hipGraphDestroy
 #define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
 #define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
@@ -62,6 +62,8 @@ list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")

 file(GLOB   GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
 file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-tile*.cu")
+# Exclude D>=576 tile kernels: exceed HIP local memory limit (67584 > 65536)
+list(FILTER SRCS EXCLUDE REGEX "dkq(576|640)")
 list(APPEND GGML_SOURCES_ROCM ${SRCS})
 file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
 list(APPEND GGML_SOURCES_ROCM ${SRCS})
@@ -79,7 +81,28 @@ else()
        ../ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu
        ../ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu
        ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu)
+        ../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-q8_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo3_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo3_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-f16.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo2_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-q8_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo2_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo2_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-f16.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo2_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo3_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-q8_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo4_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-f16-turbo4_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-f16.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo3_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo4_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo2_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo4_0.cu)
 endif()

 ggml_add_backend_library(ggml-hip
@@ -650,6 +650,22 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri(ggml_m
    return res;
 }

+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_turbo_wht(ggml_metal_library_t lib) {
+    // Single unspecialized kernel: the same string serves as base name and lookup name.
+    const char * kernel = "kernel_turbo_wht";
+    ggml_metal_pipeline_with_params pipe = ggml_metal_library_get_pipeline(lib, kernel);
+    if (!pipe.pipeline) {
+        // Not available yet: compile it with an empty set of function constants.
+        ggml_metal_cv_t consts = ggml_metal_cv_init();
+        pipe = ggml_metal_library_compile_pipeline(lib, kernel, kernel, consts);
+        ggml_metal_cv_free(consts);
+    }
+    // Fixed launch parameters reported to the caller.
+    pipe.nsg  = 1;
+    pipe.smem = 0;
+    return pipe;
+}
+
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext(ggml_metal_library_t lib, const ggml_tensor * op, int nsg, int nxpsg, int r1ptg) {
    char base[256];
    char name[256];
@@ -813,6 +829,18 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
                nr0 = N_R0_Q8_0;
                smem = 32*sizeof(float)*N_R0_Q8_0;
            } break;
+        case GGML_TYPE_TQ3_1S:
+            {
+                nsg = N_SG_TQ3_1S;
+                nr0 = N_R0_TQ3_1S;
+                smem = 32*sizeof(float)*N_R0_TQ3_1S;
+            } break;
+        case GGML_TYPE_TQ4_1S:
+            {
+                nsg = N_SG_TQ4_1S;
+                nr0 = N_R0_TQ4_1S;
+                smem = 32*sizeof(float)*N_R0_TQ4_1S;
+            } break;
        case GGML_TYPE_MXFP4:
            {
                nsg = N_SG_MXFP4;
@@ -931,6 +959,81 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
    return res;
 }

+// Pipeline lookup for the rotated TQ3_1S / TQ4_1S mul_mm kernels ("kernel_mul_mm_<src0>_rotated_<src1>").
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_tq_rotated(ggml_metal_library_t lib, const ggml_tensor * op) {
+    const ggml_tensor * weights = op->src[0];
+    const ggml_tensor * acts    = op->src[1];
+
+    // Flags set when src0's first dimension is not a multiple of 32 / the output is not a multiple of 64x32.
+    const bool needs_bc_inp = (weights->ne[0] % 32) != 0;
+    const bool needs_bc_out = (op->ne[0] % 64) != 0 || (op->ne[1] % 32) != 0;
+
+    // `kernel` selects the shader function; `key` additionally encodes the specialization.
+    char kernel[256];
+    char key[256];
+    snprintf(kernel, sizeof(kernel), "kernel_mul_mm_%s_rotated_%s", ggml_type_name(weights->type), ggml_type_name(acts->type));
+    snprintf(key,    sizeof(key),    "%s_bci=%d_bco=%d", kernel, needs_bc_inp, needs_bc_out);
+
+    ggml_metal_pipeline_with_params pipe = ggml_metal_library_get_pipeline(lib, key);
+    if (!pipe.pipeline) {
+        // Not available yet: compile with the two flags baked in as function constants.
+        ggml_metal_cv_t consts = ggml_metal_cv_init();
+        ggml_metal_cv_set_bool(consts, needs_bc_inp, FC_MUL_MM + 0);
+        ggml_metal_cv_set_bool(consts, needs_bc_out, FC_MUL_MM + 1);
+        pipe = ggml_metal_library_compile_pipeline(lib, kernel, key, consts);
+        ggml_metal_cv_free(consts);
+    }
+
+    // 8192 bytes when the output flag is set, 6144 (4096 + 2048) otherwise.
+    pipe.smem = needs_bc_out ? 8192 : 6144;
+
+    return pipe;
+}
+
+// MUL_MAT_ID counterpart of the rotated TQ3_1S / TQ4_1S pipeline ("kernel_mul_mm_id_<src0>_rotated_<src1>").
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_tq_rotated(ggml_metal_library_t lib, const ggml_tensor * op) {
+    const ggml_tensor * weights = op->src[0];
+    const ggml_tensor * acts    = op->src[1];
+
+    // Flag set when the first dimension of src0 is not a multiple of 32.
+    const bool needs_bc_inp = (weights->ne[0] % 32) != 0;
+
+    // `kernel` selects the shader function; `key` additionally encodes the specialization.
+    char kernel[256];
+    char key[256];
+    snprintf(kernel, sizeof(kernel), "kernel_mul_mm_id_%s_rotated_%s", ggml_type_name(weights->type), ggml_type_name(acts->type));
+    snprintf(key,    sizeof(key),    "%s_bci=%d", kernel, needs_bc_inp);
+
+    ggml_metal_pipeline_with_params pipe = ggml_metal_library_get_pipeline(lib, key);
+    if (!pipe.pipeline) {
+        // Not available yet: compile with the flag baked in as a function constant.
+        ggml_metal_cv_t consts = ggml_metal_cv_init();
+        ggml_metal_cv_set_bool(consts, needs_bc_inp, FC_MUL_MM + 0);
+        pipe = ggml_metal_library_compile_pipeline(lib, kernel, key, consts);
+        ggml_metal_cv_free(consts);
+    }
+
+    // The reported size is a fixed 8192 bytes, independent of the output shape.
+    pipe.smem = 8192;
+
+    return pipe;
+}
+
+// Pipeline for rotating (or, with `inverse`, un-rotating) activations; shared by TQ3_1S and TQ4_1S.
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tq3_rotate_act(ggml_metal_library_t lib, bool inverse) {
+    const char * kernel = "kernel_tq3_rotate_act";
+    if (inverse) {
+        kernel = "kernel_tq3_unrotate_act";
+    }
+
+    // No specialization: the kernel name doubles as the lookup name.
+    ggml_metal_pipeline_with_params pipe = ggml_metal_library_get_pipeline(lib, kernel);
+    if (!pipe.pipeline) {
+        pipe = ggml_metal_library_compile_pipeline(lib, kernel, kernel, nullptr);
+    }
+    return pipe;
+}
+
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0(ggml_metal_library_t lib, int ne02, int ne20) {
    char base[256];
    char name[256];
@@ -943,7 +1046,14 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0(g
        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
    }

+    // Graph reservation may pass worst-case ne20=ne02 (e.g. 256*256*2=128KB).
+    // At runtime ne20 is the actual n_expert_used (e.g. 8), keeping shmem within limits.
+    // Cap to 32KB (Apple Silicon threadgroup memory limit) to prevent reservation assert
+    // on high-expert-count MoE models (Qwen3.5-35B with 256 experts).
    res.smem = (size_t) ne02*ne20*sizeof(uint16_t);
+    if (res.smem > 32768) {
+        res.smem = 32768;
+    }

    return res;
 }
@@ -1037,6 +1147,18 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_id(ggml_m
                nr0 = N_R0_Q8_0;
                smem = 32*sizeof(float)*N_R0_Q8_0;
            } break;
+        case GGML_TYPE_TQ3_1S:
+            {
+                nsg = N_SG_TQ3_1S;
+                nr0 = N_R0_TQ3_1S;
+                smem = 32*sizeof(float)*N_R0_TQ3_1S;
+            } break;
+        case GGML_TYPE_TQ4_1S:
+            {
+                nsg = N_SG_TQ4_1S;
+                nr0 = N_R0_TQ4_1S;
+                smem = 32*sizeof(float)*N_R0_TQ4_1S;
+            } break;
        case GGML_TYPE_MXFP4:
            {
                nsg = N_SG_MXFP4;
@@ -1387,12 +1509,17 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext(
    // do bounds checks for the mask?
    const bool bc_mask = op->src[3] && (op->src[3]->ne[1] % 8 != 0);

-    snprintf(base, 256, "kernel_%s_%s_dk%d_dv%d",
+    // Asymmetric K/V: always encode both K and V types in the pipeline name.
+    // Symmetric case: ktype == vtype, so the name just has the type twice.
+    // This avoids ambiguity if a type name contains underscores (e.g. q4_0).
+    snprintf(base, 256, "kernel_%s_k%s_v%s_dk%d_dv%d",
            "flash_attn_ext",
            ggml_type_name(op->src[1]->type),
+            ggml_type_name(op->src[2]->type),
            dk,
            dv);

+
    snprintf(name, 256, "%s_mask=%d_sinks=%d_bias=%d_scap=%d_kvpad=%d_bcm=%d_ns10=%d_ns20=%d_nsg=%d",
            base,
            has_mask,
@@ -1450,12 +1577,16 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_v
    const int32_t ns10 = op->src[1]->nb[1]/op->src[1]->nb[0];
    const int32_t ns20 = op->src[2]->nb[1]/op->src[2]->nb[0];

-    snprintf(base, 256, "kernel_%s_%s_dk%d_dv%d",
+    // Asymmetric K/V: always encode both K and V types in the pipeline name.
+    // Uses k/v prefix to avoid ambiguity with type names containing underscores.
+    snprintf(base, 256, "kernel_%s_k%s_v%s_dk%d_dv%d",
            "flash_attn_ext_vec",
            ggml_type_name(op->src[1]->type),
+            ggml_type_name(op->src[2]->type),
            dk,
            dv);

+
    snprintf(name, 256, "%s_mask=%d_sink=%d_bias=%d_scap=%d_kvpad=%d_ns10=%d_ns20=%d_nsg=%d_nwg=%d",
            base,
            has_mask,
@@ -129,8 +129,12 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv              (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net   (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri         (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_turbo_wht         (ggml_metal_library_t lib);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext        (ggml_metal_library_t lib, const struct ggml_tensor * op, int nsg, int nxpsg, int r1ptg);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm            (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_tq_rotated(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_tq_rotated(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tq3_rotate_act   (ggml_metal_library_t lib, bool inverse);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv            (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0    (ggml_metal_library_t lib, int ne02, int ne20);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id         (ggml_metal_library_t lib, const struct ggml_tensor * op);
@@ -225,6 +225,43 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
                [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
 #endif

+                // TurboQuant: auto-select dequant path based on hardware
+                // M1/M2/M3/M4 (no tensor API): 4-mag LUT (+38-45% decode at long ctx)
+                // M5+ (has tensor API): 8-entry full LUT (best decode speed)
+                {
+                    const char * force_4mag = getenv("TURBO_FORCE_4MAG");
+                    // TURBO_USE_4MAG is a compile-time choice for the whole library:
+                    // it is defined on hardware without the tensor API (pre-M5) or
+                    // when TURBO_FORCE_4MAG starts with '1'; otherwise it is left unset.
+                    const bool forced_4mag = force_4mag && force_4mag[0] == '1';
+                    if (!ggml_metal_device_get_props(dev)->has_tensor || forced_4mag) {
+                        [prep setObject:@"1" forKey:@"TURBO_USE_4MAG"];
+                        GGML_LOG_INFO("%s: turbo3 using 4-mag LUT%s\n", __func__,
+                            forced_4mag ? " (forced)" : " (pre-M5 hardware)");  // report "forced" only when the override is what is set to '1'
+                    }
+                    // Sparse V dequant: skip V for negligible attention weights
+                    // Enabled by default on all Metal (validated: PPL identical, NIAH 9/9, 30+ testers)
+                    // Opt-out via TURBO_SPARSE_V=0
+                    const char * sparse_v_env = getenv("TURBO_SPARSE_V");
+                    const bool sparse_v_disabled = sparse_v_env && sparse_v_env[0] == '0';
+                    if (!sparse_v_disabled) {
+                        [prep setObject:@"1" forKey:@"TURBO_SPARSE_V"];
+                        GGML_LOG_INFO("%s: turbo3 sparse V dequant enabled (opt-out: TURBO_SPARSE_V=0)\n", __func__);
+                    }
+                    // TODO: context-adaptive dispatch — compile both 4-mag and 8-LUT
+                    // FA kernel instantiations, select based on ne11 (KV cache size)
+                    // at dispatch time in ggml_metal_op_flash_attn_ext()
+                }
+
+                // TurboQuant profiling: set TURBO_PROFILE_MODE env var (0-4)
+                {
+                    const char * pm = getenv("TURBO_PROFILE_MODE");
+                    if (pm && pm[0] >= '0' && pm[0] <= '4' && pm[1] == '\0') {  // exactly one digit 0-4: the text is pasted verbatim into a preprocessor macro
+                        [prep setObject:[NSString stringWithUTF8String:pm] forKey:@"TURBO_PROFILE_MODE"];
+                        GGML_LOG_INFO("%s: TURBO_PROFILE_MODE=%s\n", __func__, pm);
+                    }
+                }
+
                MTLCompileOptions * options = [MTLCompileOptions new];
                options.preprocessorMacros = prep;

@@ -1164,7 +1201,23 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                return false;
            }
            if (op->src[1]->type != op->src[2]->type) {
-                return false;
+                // Allow asymmetric K/V for supported mixed pairs:
+                // - turbo x turbo (any combination)
+                // - q8_0 x turbo (either direction)
+                const bool k_is_turbo = (op->src[1]->type == GGML_TYPE_TURBO2_0 ||
+                                         op->src[1]->type == GGML_TYPE_TURBO3_0 ||
+                                         op->src[1]->type == GGML_TYPE_TURBO4_0);
+                const bool v_is_turbo = (op->src[2]->type == GGML_TYPE_TURBO2_0 ||
+                                         op->src[2]->type == GGML_TYPE_TURBO3_0 ||
+                                         op->src[2]->type == GGML_TYPE_TURBO4_0);
+                const bool k_is_q8 = (op->src[1]->type == GGML_TYPE_Q8_0);
+                const bool v_is_q8 = (op->src[2]->type == GGML_TYPE_Q8_0);
+                const bool supported = (k_is_turbo && v_is_turbo) ||
+                                       (k_is_q8 && v_is_turbo) ||
+                                       (k_is_turbo && v_is_q8);
+                if (!supported) {
+                    return false;
+                }
            }
            switch (op->src[1]->type) {
                case GGML_TYPE_F32:
@@ -1174,6 +1227,9 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                case GGML_TYPE_Q4_1:
                case GGML_TYPE_Q5_0:
                case GGML_TYPE_Q5_1:
+                case GGML_TYPE_TURBO2_0:
+                case GGML_TYPE_TURBO3_0:
+                case GGML_TYPE_TURBO4_0:
                    break;
                case GGML_TYPE_BF16:
                    if (!has_bfloat) {
@@ -1192,6 +1248,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
            return true;
        case GGML_OP_GATED_DELTA_NET:
            return has_simdgroup_reduction && op->src[2]->ne[0] % 32 == 0;
+        case GGML_OP_TURBO_WHT:
+            return op->src[0]->ne[0] % 128 == 0;
        case GGML_OP_SOLVE_TRI:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
@@ -1215,6 +1273,9 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                           case GGML_TYPE_Q5_1:
                           case GGML_TYPE_IQ4_NL:
                           case GGML_TYPE_I32:
+                           case GGML_TYPE_TURBO2_0:
+                           case GGML_TYPE_TURBO3_0:
+                           case GGML_TYPE_TURBO4_0:
                                return true;
                           default:
                                return false;
@@ -1241,6 +1302,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_TQ3_1S:
+                    case GGML_TYPE_TQ4_1S:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_F16:
@@ -1272,6 +1335,9 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_IQ4_NL:
+                    case GGML_TYPE_TURBO2_0:
+                    case GGML_TYPE_TURBO3_0:
+                    case GGML_TYPE_TURBO4_0:
                        return true;
                    default:
                        return false;
@@ -84,6 +84,12 @@
 #define N_R0_IQ4_XS 2
 #define N_SG_IQ4_XS 2

+#define N_R0_TQ3_1S 8
+#define N_SG_TQ3_1S 2
+
+#define N_R0_TQ4_1S 8
+#define N_SG_TQ4_1S 2
+
 // function constants offsets
 #define FC_FLASH_ATTN_EXT_PAD          100
 #define FC_FLASH_ATTN_EXT_BLK          200
@@ -101,6 +107,9 @@
 #define FC_SUM_ROWS                    1400
 #define FC_UPSCALE                     1500
 #define FC_GATED_DELTA_NET             1600
+#define FC_TURBO_WHT                   1700
+#define FC_TURBO_FLASH_P1              1800
+#define FC_TURBO_FLASH_P2              1900

 // op-specific constants
 #define OP_FLASH_ATTN_EXT_NQPSG 8
@@ -429,6 +438,40 @@ typedef struct {
    float    logit_softcap;
 } ggml_metal_kargs_flash_attn_ext_vec;

+// TurboFlash two-pass: asymmetric K=q8_0, V=turbo3 fused attention
+// Pass 1 args (block scoring + partial V accumulation)
+typedef struct {
+    int32_t  ne01;      // Q extent along dim 1 (pairs with nb01, the Q row stride)
+    int32_t  ne02;      // Q extent along dim 2 (pairs with nb02, the Q head stride)
+    int32_t  ne03;      // Q extent along dim 3 (pairs with nb03, the Q batch stride)
+    uint64_t nb01;      // Q row stride
+    uint64_t nb02;      // Q head stride
+    uint64_t nb03;      // Q batch stride
+    int32_t  ne11;      // KV sequence length (T_kv)
+    int32_t  ne_12_2;   // KV extent along dim 2 (heads) — presumably shared by K and V, TODO confirm
+    int32_t  ne_12_3;   // KV extent along dim 3 (batch) — presumably shared by K and V, TODO confirm
+    uint64_t nb11;      // K row stride (bytes per KV token in K)
+    uint64_t nb12;      // K head stride
+    uint64_t nb13;      // K batch stride
+    uint64_t nb21;      // V row stride (bytes per KV token in V)
+    uint64_t nb22;      // V head stride
+    uint64_t nb23;      // V batch stride
+    int32_t  ne31;      // mask dim 1
+    int32_t  ne32;      // mask dim 2
+    int32_t  ne33;      // mask dim 3
+    uint64_t nb31;      // mask stride 1
+    uint64_t nb32;      // mask stride 2
+    uint64_t nb33;      // mask stride 3
+    float    scale;     // attention scale (1/sqrt(dk))
+    int32_t  n_blocks;  // ceil(ne11 / BLOCK_SIZE)
+} ggml_metal_kargs_turbo_flash_p1;
+
+// Pass 2 args (merge partials + inverse WHT + write output)
+typedef struct {
+    int32_t  ne01;      // number of query heads (total n_bh); NOTE(review): not the same quantity as ne01 in the pass-1 struct — confirm
+    int32_t  n_blocks;  // number of blocks from pass 1
+} ggml_metal_kargs_turbo_flash_p2;
+
 typedef struct {
    int32_t  nrows;
 } ggml_metal_kargs_flash_attn_ext_vec_reduce;
@@ -889,6 +932,11 @@ typedef struct {
    uint64_t nb3;
 } ggml_metal_kargs_gated_delta_net;

+typedef struct {
+    int64_t  n_elements;  // total elements in the source tensor (host: ggml_nelements(src[0]))
+    int32_t  direction;   // 0 = forward, 1 = inverse (host copies it from op_params)
+} ggml_metal_kargs_turbo_wht;
+
 typedef struct {
    int32_t  ne00;
    int32_t  ne01;
@@ -172,6 +172,26 @@ static bool ggml_metal_op_concurrency_add(ggml_metal_op_t ctx, const ggml_tensor
    return ggml_mem_ranges_add(ctx->mem_ranges, node);
 }

+static bool ggml_metal_op_mutates_tq_src1(const ggml_tensor * node) {
+    // True for MUL_MAT / MUL_MAT_ID nodes whose weight operand (src[0]) is a
+    // TQ3_1S or TQ4_1S tensor; false otherwise, including when the node or
+    // either of its first two sources is missing.
+    if (!node || !node->src[0] || !node->src[1]) {
+        return false;
+    }
+
+    const bool is_matmul = node->op == GGML_OP_MUL_MAT ||
+                           node->op == GGML_OP_MUL_MAT_ID;
+    if (!is_matmul) {
+        return false;
+    }
+
+    // only the TQ weight formats are reported
+    const ggml_type wtype = node->src[0]->type;
+
+    return wtype == GGML_TYPE_TQ3_1S || wtype == GGML_TYPE_TQ4_1S;
+}
+
 static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
    struct ggml_tensor * node = ctx->node(idx);

@@ -209,6 +229,15 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {

    int n_fuse = 1;

+    // Rotated TQ weight kernels temporarily rotate src1 in-place before the
+    // matmul and restore it afterwards. The generic range tracker only sees a
+    // read dependency on src1, so sibling projections can be scheduled as
+    // concurrent even though they race on the shared activation buffer.
+    // Gemma4 GEGLU / MoE fan-out is especially sensitive to this hazard.
+    if (ggml_metal_op_mutates_tq_src1(node)) {
+        ggml_metal_op_concurrency_reset(ctx);
+    }
+
    // check if the current node can run concurrently with other nodes before it
    // the condition is that:
    //  - the current node cannot write to any previous src or dst ranges
@@ -337,6 +366,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
            {
                n_fuse = ggml_metal_op_gated_delta_net(ctx, idx);
            } break;
+        case GGML_OP_TURBO_WHT:
+            {
+                n_fuse = ggml_metal_op_turbo_wht(ctx, idx);
+            } break;
        case GGML_OP_SOLVE_TRI:
            {
                n_fuse = ggml_metal_op_solve_tri(ctx, idx);
@@ -1656,6 +1689,39 @@ int ggml_metal_op_gated_delta_net(ggml_metal_op_t ctx, int idx) {
    return 1;
 }

+int ggml_metal_op_turbo_wht(ggml_metal_op_t ctx, int idx) {
+    // Encode GGML_OP_TURBO_WHT for node idx (src[0] -> slot 1, node -> slot 2); returns 1.
+    ggml_tensor * op = ctx->node(idx);
+
+    // Dispatch geometry: one thread per 128-element group, 256 threads per threadgroup.
+    constexpr int64_t group_len  = 128;
+    constexpr int     tg_threads = 256;
+
+    // The transform direction is read from the start of op_params.
+    int direction;
+    memcpy(&direction, op->op_params, sizeof(direction));
+    const int64_t n_total = ggml_nelements(op->src[0]);
+
+    ggml_metal_kargs_turbo_wht args = {
+        /*.n_elements =*/ n_total,
+        /*.direction  =*/ direction,
+    };
+
+    ggml_metal_encoder_t enc = ctx->enc;
+    auto pipeline = ggml_metal_library_get_pipeline_turbo_wht(ctx->lib);
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
+
+    // Round the group count up to a whole number of threadgroups.
+    const int n_tg = (n_total / group_len + tg_threads - 1) / tg_threads;
+    ggml_metal_encoder_dispatch_threadgroups(enc, n_tg, 1, 1, tg_threads, 1, 1);
+
+    return 1;
+}
+
 int ggml_metal_op_solve_tri(ggml_metal_op_t ctx, int idx) {
    ggml_tensor * op = ctx->node(idx);

@@ -2155,52 +2221,103 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
        !ggml_is_transposed(op->src[1]) &&
        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-        props_dev->has_simdgroup_mm && ne00 >= 64 && ne11 > ne11_mm_min) {
-        //GGML_LOG_INFO("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+        props_dev->has_simdgroup_mm && ne00 >= 64 &&
+        (ne11 > ne11_mm_min || op->src[0]->type == GGML_TYPE_TQ3_1S || op->src[0]->type == GGML_TYPE_TQ4_1S)) {
+        // Route all TQ weights through the rotated mul_mm path.
+        // Gemma4 decode still degrades on the fused mul_mv kernel even after the broader
+        // TQ backend fixes, while the rotated mul_mm path matches CPU behavior.

-        // some Metal matrix data types require aligned pointers
-        // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
-        //switch (op->src[0]->type) {
-        //    case GGML_TYPE_F32:  GGML_ASSERT(nb01 % 16 == 0); break;
-        //    case GGML_TYPE_F16:  GGML_ASSERT(nb01 % 8  == 0); break;
-        //    case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8  == 0); break;
-        //    default: break;
-        //}
+        const bool is_tq_weight = (op->src[0]->type == GGML_TYPE_TQ3_1S || op->src[0]->type == GGML_TYPE_TQ4_1S);

-        auto pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
+        // TQ weight optimization: pre-rotate activations, use no-RHT dequant, then un-rotate
+        if (is_tq_weight && ne00 % 32 == 0) {
+            // Step 1: Forward-rotate src1 in-place
+            const int64_t n_act = (int64_t)ne10 * ne11 * ne12 * ne13;
+            int64_t n_act_val = n_act;

-        ggml_metal_kargs_mul_mm args = {
-            /*.ne00 =*/ ne00,
-            /*.ne02 =*/ ne02,
-            /*.nb01 =*/ nb01,
-            /*.nb02 =*/ nb02,
-            /*.nb03 =*/ nb03,
-            /*.ne12 =*/ ne12,
-            /*.nb10 =*/ nb10,
-            /*.nb11 =*/ nb11,
-            /*.nb12 =*/ nb12,
-            /*.nb13 =*/ nb13,
-            /*.ne0  =*/ ne0,
-            /*.ne1  =*/ ne1,
-            /*.r2   =*/ r2,
-            /*.r3   =*/ r3,
-        };
+            auto pipeline_rot = ggml_metal_library_get_pipeline_tq3_rotate_act(lib, false);
+            ggml_metal_encoder_set_pipeline(enc, pipeline_rot);
+            ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 0);
+            ggml_metal_encoder_set_bytes   (enc, &n_act_val, sizeof(n_act_val), 1);
+            ggml_metal_encoder_dispatch_threadgroups(enc, (int)(n_act_val / 32), 1, 1, 32, 1, 1);

-        ggml_metal_encoder_set_pipeline(enc, pipeline);
-        ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
-        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
-        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
-        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3);
+            // Memory barrier between rotate and matmul
+            ggml_metal_op_concurrency_reset(ctx);

-        const size_t smem = pipeline.smem;
+            // Step 2: Dispatch rotated mul_mm (uses no-RHT dequant)
+            auto pipeline_mm = ggml_metal_library_get_pipeline_mul_mm_tq_rotated(lib, op);

-        ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+            ggml_metal_kargs_mul_mm args = {
+                /*.ne00 =*/ ne00,
+                /*.ne02 =*/ ne02,
+                /*.nb01 =*/ nb01,
+                /*.nb02 =*/ nb02,
+                /*.nb03 =*/ nb03,
+                /*.ne12 =*/ ne12,
+                /*.nb10 =*/ nb10,
+                /*.nb11 =*/ nb11,
+                /*.nb12 =*/ nb12,
+                /*.nb13 =*/ nb13,
+                /*.ne0  =*/ ne0,
+                /*.ne1  =*/ ne1,
+                /*.r2   =*/ r2,
+                /*.r3   =*/ r3,
+            };

-        const int nr0 = pipeline.nr0;
-        const int nr1 = pipeline.nr1;
-        const int nsg = pipeline.nsg;
+            ggml_metal_encoder_set_pipeline(enc, pipeline_mm);
+            ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+            ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+            ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+            ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3);

-        ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + nr1 - 1) / nr1), ((ne01 + nr0 - 1) / nr0), ne12 * ne13, 32, nsg, 1);
+            const size_t smem_tq = pipeline_mm.smem;
+            ggml_metal_encoder_set_threadgroup_memory_size(enc, smem_tq, 0);
+            ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
+
+            ggml_metal_op_concurrency_reset(ctx);
+
+            auto pipeline_unrot = ggml_metal_library_get_pipeline_tq3_rotate_act(lib, true);
+            ggml_metal_encoder_set_pipeline(enc, pipeline_unrot);
+            ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 0);
+            ggml_metal_encoder_set_bytes   (enc, &n_act_val, sizeof(n_act_val), 1);
+            ggml_metal_encoder_dispatch_threadgroups(enc, (int)(n_act_val / 32), 1, 1, 32, 1, 1);
+
+            ggml_metal_op_concurrency_reset(ctx);
+        } else {
+            auto pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
+
+            ggml_metal_kargs_mul_mm args = {
+                /*.ne00 =*/ ne00,
+                /*.ne02 =*/ ne02,
+                /*.nb01 =*/ nb01,
+                /*.nb02 =*/ nb02,
+                /*.nb03 =*/ nb03,
+                /*.ne12 =*/ ne12,
+                /*.nb10 =*/ nb10,
+                /*.nb11 =*/ nb11,
+                /*.nb12 =*/ nb12,
+                /*.nb13 =*/ nb13,
+                /*.ne0  =*/ ne0,
+                /*.ne1  =*/ ne1,
+                /*.r2   =*/ r2,
+                /*.r3   =*/ r3,
+            };
+
+            ggml_metal_encoder_set_pipeline(enc, pipeline);
+            ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+            ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+            ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+            ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3);
+
+            const size_t smem = pipeline.smem;
+            ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+            const int nr0 = pipeline.nr0;
+            const int nr1 = pipeline.nr1;
+            const int nsg = pipeline.nsg;
+
+            ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + nr1 - 1) / nr1), ((ne01 + nr0 - 1) / nr0), ne12 * ne13, 32, nsg, 1);
+        }
    } else {
        auto pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);

@@ -2243,7 +2360,9 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
        if (op->src[0]->type == GGML_TYPE_F32 ||
            op->src[0]->type == GGML_TYPE_F16 ||
            op->src[0]->type == GGML_TYPE_BF16 ||
-            op->src[0]->type == GGML_TYPE_Q8_0) {
+            op->src[0]->type == GGML_TYPE_Q8_0 ||
+            op->src[0]->type == GGML_TYPE_TQ3_1S ||
+            op->src[0]->type == GGML_TYPE_TQ4_1S) {
            ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + nr0 - 1)/(nr0)), ((ne11 + nr1 - 1)/nr1), ne12*ne13, 32, nsg, 1);
        } else {
            ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + nr0*nsg - 1)/(nr0*nsg)), ((ne11 + nr1 - 1)/nr1), ne12*ne13, 32, nsg, 1);
@@ -2303,12 +2422,13 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {

    const uint32_t r2 = 1;
    const uint32_t r3 = 1;
+    const bool is_tq_weight = (op->src[0]->type == GGML_TYPE_TQ3_1S || op->src[0]->type == GGML_TYPE_TQ4_1S);

    // find the break-even point where the matrix-matrix kernel becomes more efficient compared
    // to the matrix-vector kernel
    // ne20 = n_used_experts
    // ne21 = n_rows (batch size)
-    const int ne21_mm_id_min = 32;
+    const int ne21_mm_id_min = is_tq_weight ? 1 : 32;

    if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min)) {
        // some Metal matrix data types require aligned pointers
@@ -2362,40 +2482,96 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
        ggml_metal_op_concurrency_reset(ctx);

        {
-            auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
+            // TQ weight MoE: pre-rotate activations for rotated dispatch
+            if (is_tq_weight && ne00 % 32 == 0) {
+                const int64_t n_act = (int64_t)ne10 * ne11 * ne12 * ne13;
+                int64_t n_act_val = n_act;

-            ggml_metal_kargs_mul_mm_id args = {
-                /*.ne00  =*/ ne00,
-                /*.ne02  =*/ ne02,
-                /*.nb01  =*/ nb01,
-                /*.nb02  =*/ nb02,
-                /*.nb03  =*/ nb03,
-                /*.ne11  =*/ ne11, // n_expert_used (bcast)
-                /*.nb10  =*/ nb10,
-                /*.nb11  =*/ nb11,
-                /*.nb12  =*/ nb12,
-                /*.nb13  =*/ nb13,
-                /*.ne20  =*/ ne20, // n_expert_used
-                /*.ne21  =*/ ne21, // n_tokens
-                /*.ne0   =*/ ne0,
-                /*.ne1   =*/ ne1,
-                /*.r2    =*/ r2,
-                /*.r3    =*/ r3,
-            };
+                auto pipeline_rot = ggml_metal_library_get_pipeline_tq3_rotate_act(lib, false);
+                ggml_metal_encoder_set_pipeline(enc, pipeline_rot);
+                ggml_metal_encoder_set_buffer  (enc, bid_src1, 0);
+                ggml_metal_encoder_set_bytes   (enc, &n_act_val, sizeof(n_act_val), 1);
+                ggml_metal_encoder_dispatch_threadgroups(enc, (int)(n_act_val / 32), 1, 1, 32, 1, 1);

-            ggml_metal_encoder_set_pipeline(enc, pipeline);
-            ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
-            ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
-            ggml_metal_encoder_set_buffer  (enc, bid_src1, 2);
-            ggml_metal_encoder_set_buffer  (enc, bid_tpe,  3);
-            ggml_metal_encoder_set_buffer  (enc, bid_ids,  4);
-            ggml_metal_encoder_set_buffer  (enc, bid_dst,  5);
+                ggml_metal_op_concurrency_reset(ctx);

-            const size_t smem = pipeline.smem;
+                auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id_tq_rotated(lib, op);

-            ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+                ggml_metal_kargs_mul_mm_id args = {
+                    /*.ne00  =*/ ne00,
+                    /*.ne02  =*/ ne02,
+                    /*.nb01  =*/ nb01,
+                    /*.nb02  =*/ nb02,
+                    /*.nb03  =*/ nb03,
+                    /*.ne11  =*/ ne11,
+                    /*.nb10  =*/ nb10,
+                    /*.nb11  =*/ nb11,
+                    /*.nb12  =*/ nb12,
+                    /*.nb13  =*/ nb13,
+                    /*.ne20  =*/ ne20,
+                    /*.ne21  =*/ ne21,
+                    /*.ne0   =*/ ne0,
+                    /*.ne1   =*/ ne1,
+                    /*.r2    =*/ r2,
+                    /*.r3    =*/ r3,
+                };

-            ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
+                ggml_metal_encoder_set_pipeline(enc, pipeline);
+                ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+                ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+                ggml_metal_encoder_set_buffer  (enc, bid_src1, 2);
+                ggml_metal_encoder_set_buffer  (enc, bid_tpe,  3);
+                ggml_metal_encoder_set_buffer  (enc, bid_ids,  4);
+                ggml_metal_encoder_set_buffer  (enc, bid_dst,  5);
+
+                const size_t smem = pipeline.smem;
+                ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+                ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
+
+                ggml_metal_op_concurrency_reset(ctx);
+
+                auto pipeline_unrot = ggml_metal_library_get_pipeline_tq3_rotate_act(lib, true);
+                ggml_metal_encoder_set_pipeline(enc, pipeline_unrot);
+                ggml_metal_encoder_set_buffer  (enc, bid_src1, 0);
+                ggml_metal_encoder_set_bytes   (enc, &n_act_val, sizeof(n_act_val), 1);
+                ggml_metal_encoder_dispatch_threadgroups(enc, (int)(n_act_val / 32), 1, 1, 32, 1, 1);
+
+                // Barrier: ensure unrotate completes before any subsequent op reads src1
+                ggml_metal_op_concurrency_reset(ctx);
+            } else {
+                auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
+
+                ggml_metal_kargs_mul_mm_id args = {
+                    /*.ne00  =*/ ne00,
+                    /*.ne02  =*/ ne02,
+                    /*.nb01  =*/ nb01,
+                    /*.nb02  =*/ nb02,
+                    /*.nb03  =*/ nb03,
+                    /*.ne11  =*/ ne11,
+                    /*.nb10  =*/ nb10,
+                    /*.nb11  =*/ nb11,
+                    /*.nb12  =*/ nb12,
+                    /*.nb13  =*/ nb13,
+                    /*.ne20  =*/ ne20,
+                    /*.ne21  =*/ ne21,
+                    /*.ne0   =*/ ne0,
+                    /*.ne1   =*/ ne1,
+                    /*.r2    =*/ r2,
+                    /*.r3    =*/ r3,
+                };
+
+                ggml_metal_encoder_set_pipeline(enc, pipeline);
+                ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+                ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+                ggml_metal_encoder_set_buffer  (enc, bid_src1, 2);
+                ggml_metal_encoder_set_buffer  (enc, bid_tpe,  3);
+                ggml_metal_encoder_set_buffer  (enc, bid_ids,  4);
+                ggml_metal_encoder_set_buffer  (enc, bid_dst,  5);
+
+                const size_t smem = pipeline.smem;
+                ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+                ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
+            }
        }
    } else {
        auto pipeline = ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
@@ -2448,7 +2624,9 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
        if (op->src[0]->type == GGML_TYPE_F32 ||
            op->src[0]->type == GGML_TYPE_F16 ||
            op->src[0]->type == GGML_TYPE_BF16 ||
-            op->src[0]->type == GGML_TYPE_Q8_0) {
+            op->src[0]->type == GGML_TYPE_Q8_0 ||
+            op->src[0]->type == GGML_TYPE_TQ3_1S ||
+            op->src[0]->type == GGML_TYPE_TQ4_1S) {
            ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nr0 - 1)/(nr0), (_ne1 + nr1 - 1)/nr1, ne123, 32, nsg, 1);
        } else {
            ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nr0*nsg - 1)/(nr0*nsg), (_ne1 + nr1 - 1)/nr1, ne123, 32, nsg, 1);
@@ -2511,9 +2689,57 @@ bool ggml_metal_op_flash_attn_ext_use_vec(const ggml_tensor * op) {
    const int64_t ne01 = op->src[0]->ne[1]; // batch size

    // use vec kernel if the batch size is small and if the head size is supported
+    // EXPERIMENT (opt-in via TURBO_FORCE_NONVEC=1): force the non-vec path when K is
+    // turbo2/turbo3/turbo4. The vec kernel uses nl=8 (4 elements per dequant call),
+    // i.e. 4x more loop iterations than the non-vec nl=2 path. On M2 Pro this loop
+    // overhead dominates, so the non-vec path may be faster even for batch=1.
+    const ggml_type ktype = op->src[1]->type;
+    if (ktype == GGML_TYPE_TURBO2_0 || ktype == GGML_TYPE_TURBO3_0 || ktype == GGML_TYPE_TURBO4_0) {
+        const char * force_nonvec = getenv("TURBO_FORCE_NONVEC");
+        if (force_nonvec && force_nonvec[0] == '1') {
+            return false;  // force non-vec path
+        }
+    }
    return (ne01 < 20) && (ne00 % 32 == 0);
 }

+// TurboFlash: two-pass fused attention for turbo3 V decode (opt-in).
+// Returns true only when TURBO_FLASH starts with '1' AND the op is eligible:
+// single query row (ne01 == 1), V = turbo3, K = q8_0 or turbo3, and a head
+// size of 64, 96 or 128. In every other case (including TURBO_FLASH unset)
+// it returns false and the regular flash-attention paths are used.
+static bool ggml_metal_op_flash_attn_ext_use_turbo_flash(const ggml_tensor * op) {
+    assert(op->op == GGML_OP_FLASH_ATTN_EXT);
+
+    const int64_t n_queries = op->src[0]->ne[1];
+    const int64_t head_size = op->src[0]->ne[0];
+
+    const ggml_type k_type = op->src[1]->type;
+    const ggml_type v_type = op->src[2]->type;
+
+    // single-token decode only
+    const bool single_query = (n_queries == 1);
+
+    // V must be turbo3; K may be q8_0 (asymmetric) or turbo3 (symmetric)
+    const bool types_ok = (v_type == GGML_TYPE_TURBO3_0) &&
+                          (k_type == GGML_TYPE_Q8_0 || k_type == GGML_TYPE_TURBO3_0);
+
+    // each accepted head size is already a multiple of 32
+    const bool head_ok = (head_size == 64 || head_size == 96 || head_size == 128);
+
+    if (!single_query || !types_ok || !head_ok) {
+        return false;
+    }
+
+    // Disabled by default: the two-pass kernel produces corrupt output on
+    // Apple10 (M5 Max) and possibly other Metal4 GPUs (see PR #91). Any value
+    // of TURBO_FLASH that does not begin with '1' (or an unset variable)
+    // leaves it off; the eligibility checks above are never bypassed.
+    const char * turbo_flash_env = getenv("TURBO_FLASH");
+
+    return turbo_flash_env != nullptr && turbo_flash_env[0] == '1';
+}
+
 size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) {
    assert(op->op == GGML_OP_FLASH_ATTN_EXT);

@@ -2625,6 +2851,20 @@ size_t ggml_metal_op_flash_attn_ext_extra_tmp(const ggml_tensor * op) {
        res += ggml_type_size(GGML_TYPE_F32)*(ne01_max*ne02*ne03*nwg*(ne20 + 2));
    }

+    // TurboFlash two-pass temp is only needed when the TurboFlash path is eligible.
+    // Reserving it unconditionally can massively inflate graph scratch usage for
+    // large-context models even when the normal FA path is selected.
+    if (ggml_metal_op_flash_attn_ext_use_turbo_flash(op)) {
+        // partial_out: float[n_bh * n_blocks * dv]
+        // partial_ms:  float[n_bh * n_blocks * 2]  (max + sum per block)
+        const int64_t n_bh = ne01 * ne02 * ne03;
+        const int64_t ne11 = op->src[1]->ne[1];  // T_kv
+        const int64_t n_blocks = (ne11 + 63) / 64;  // ceil(T_kv / 64)
+        const int64_t dv = ne20;
+
+        res += ggml_type_size(GGML_TYPE_F32) * n_bh * n_blocks * (dv + 2);
+    }
+
    return res;
 }

@@ -2650,7 +2890,22 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
    GGML_ASSERT(ne00 % 4 == 0);

    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT(op->src[1]->type == op->src[2]->type);
+
+    // Allow asymmetric K/V quantization for supported mixed pairs
+    {
+        const ggml_type type_k = op->src[1]->type;
+        const ggml_type type_v = op->src[2]->type;
+        if (type_k != type_v) {
+            const bool k_is_turbo = (type_k == GGML_TYPE_TURBO2_0 || type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0);
+            const bool v_is_turbo = (type_v == GGML_TYPE_TURBO2_0 || type_v == GGML_TYPE_TURBO3_0 || type_v == GGML_TYPE_TURBO4_0);
+            const bool k_is_q8 = (type_k == GGML_TYPE_Q8_0);
+            const bool v_is_q8 = (type_v == GGML_TYPE_Q8_0);
+            const bool supported = (k_is_turbo && v_is_turbo) ||
+                                   (k_is_q8 && v_is_turbo) ||
+                                   (k_is_turbo && v_is_q8);
+            GGML_ASSERT(supported && "asymmetric K/V types only supported for turbo and q8_0 mixed pairs");
+        }
+    }

    //GGML_ASSERT(ggml_are_same_shape (src1, src2));
    GGML_ASSERT(ne11 == ne21);
@@ -2702,6 +2957,164 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
    ggml_metal_buffer_id bid_tmp = bid_blk;
    bid_tmp.offs += ggml_metal_op_flash_attn_ext_extra_blk(op);

+    // ==================== TurboFlash two-pass dispatch ====================
+    // Intercept before the normal VEC/non-VEC path when conditions are met:
+    //   - V is turbo3, K is q8_0 or turbo3
+    //   - Single-token decode (ne01 == 1)
+    //   - Supported head dimensions (64, 96, 128)
+    if (ggml_metal_op_flash_attn_ext_use_turbo_flash(op)) {
+        const int32_t dk = ne00;  // head dim for K
+        const int32_t dv = ne20;  // head dim for V
+
+        // Compute TurboFlash block parameters
+        constexpr int BLOCK_SIZE = 64;
+        const int32_t n_blocks = (ne11 + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        const int32_t n_bh = ne01 * ne02 * ne03;  // total query heads across batch
+
+        // Temp buffer layout (within bid_tmp):
+        //   [0 .. vec_tmp_size)  — VEC temp buffer (skipped)
+        //   [vec_tmp_size .. vec_tmp_size + partial_out_bytes)  — partial_out
+        //   [vec_tmp_size + partial_out_bytes .. + partial_ms_bytes)  — partial_ms
+        //
+        // extra_tmp reserves the TurboFlash scratch after the VEC scratch, so it starts at vec_tmp_size.
+        // NOTE(review): confirm extra_tmp always reserves the VEC part here (e.g. with TURBO_FORCE_NONVEC=1).
+        {
+            const int64_t nwg = 32;
+            const int64_t ne01_max = std::min(ne01, (int32_t)32);
+            size_t vec_tmp_size = ggml_type_size(GGML_TYPE_F32)*(ne01_max*ne02*ne03*nwg*(ne20 + 2));
+
+            ggml_metal_buffer_id bid_turbo = bid_tmp;
+            bid_turbo.offs += vec_tmp_size;
+
+            const size_t partial_out_bytes = sizeof(float) * n_bh * n_blocks * dv;
+
+            ggml_metal_buffer_id bid_partial_out = bid_turbo;
+            ggml_metal_buffer_id bid_partial_ms  = bid_turbo;
+            bid_partial_ms.offs += partial_out_bytes;
+
+            // ---- Pass 1: Block scoring + partial V accumulation ----
+            {
+                ggml_metal_kargs_turbo_flash_p1 args_p1 = {
+                    /*.ne01      =*/ ne01,
+                    /*.ne02      =*/ ne02,
+                    /*.ne03      =*/ ne03,
+                    /*.nb01      =*/ nb01,
+                    /*.nb02      =*/ nb02,
+                    /*.nb03      =*/ nb03,
+                    /*.ne11      =*/ ne11,
+                    /*.ne_12_2   =*/ ne12,
+                    /*.ne_12_3   =*/ ne13,
+                    /*.nb11      =*/ nb11,
+                    /*.nb12      =*/ nb12,
+                    /*.nb13      =*/ nb13,
+                    /*.nb21      =*/ nb21,
+                    /*.nb22      =*/ nb22,
+                    /*.nb23      =*/ nb23,
+                    /*.ne31      =*/ ne31,
+                    /*.ne32      =*/ ne32,
+                    /*.ne33      =*/ ne33,
+                    /*.nb31      =*/ nb31,
+                    /*.nb32      =*/ nb32,
+                    /*.nb33      =*/ nb33,
+                    /*.scale     =*/ scale,
+                    /*.n_blocks  =*/ n_blocks,
+                };
+
+                // Pipeline name: kernel_turbo_flash_p1_dk{dk}_dv{dv}
+                const ggml_type type_k = op->src[1]->type;
+                const bool k_is_turbo3 = (type_k == GGML_TYPE_TURBO3_0);
+
+                char p1_base[128];
+                char p1_name[256];
+                snprintf(p1_base, 128, "kernel_turbo_flash_p1_dk%d_dv%d", dk, dv);
+                snprintf(p1_name, 256, "%s_mask=%d_dk=%d_dv=%d_kt3=%d",
+                        p1_base, has_mask ? 1 : 0, dk, dv, k_is_turbo3 ? 1 : 0);
+
+                // The kernel uses FC_turbo_flash_p1_has_mask and FC_turbo_flash_p1_k_is_turbo3 as function constants
+                ggml_metal_pipeline_with_params res_p1 = ggml_metal_library_get_pipeline(lib, p1_name);
+                if (!res_p1.pipeline) {
+                    ggml_metal_cv_t cv = ggml_metal_cv_init();
+                    ggml_metal_cv_set_int32(cv, dk,          FC_TURBO_FLASH_P1 + 0);
+                    ggml_metal_cv_set_int32(cv, dv,          FC_TURBO_FLASH_P1 + 1);
+                    ggml_metal_cv_set_bool(cv,  has_mask,     FC_TURBO_FLASH_P1 + 2);
+                    ggml_metal_cv_set_bool(cv,  k_is_turbo3,  FC_TURBO_FLASH_P1 + 3);
+
+                    res_p1 = ggml_metal_library_compile_pipeline(lib, p1_base, p1_name, cv);
+                    ggml_metal_cv_free(cv);
+                }
+
+                if (!res_p1.pipeline) {
+                    // Fall through to normal FA path by NOT returning
+                    goto turbo_flash_end;
+                }
+
+                // V4: no shared memory in pass 1 (all registers)
+                // Metal requires at least 16 bytes for threadgroup memory
+                const size_t smem_p1 = 16;
+                constexpr int TG_SIZE = 32;  // 1 SIMD group
+
+                ggml_metal_encoder_set_pipeline(enc, res_p1);
+                ggml_metal_encoder_set_bytes   (enc, &args_p1, sizeof(args_p1), 0);
+                ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);  // q
+                ggml_metal_encoder_set_buffer  (enc, bid_src1, 2);  // k (q8_0)
+                ggml_metal_encoder_set_buffer  (enc, bid_src2, 3);  // v (turbo3)
+                ggml_metal_encoder_set_buffer  (enc, bid_src3, 4);  // mask
+                ggml_metal_encoder_set_buffer  (enc, bid_partial_out, 5);  // partial_out
+                ggml_metal_encoder_set_buffer  (enc, bid_partial_ms,  6);  // partial_ms
+
+                ggml_metal_encoder_set_threadgroup_memory_size(enc, smem_p1, 0);
+
+                // Grid: (n_bh, n_blocks, 1), Threadgroup: (32, 1, 1) — 1 SIMD group
+                ggml_metal_encoder_dispatch_threadgroups(enc, n_bh, n_blocks, 1, TG_SIZE, 1, 1);
+            }
+
+            // Barrier between pass 1 and pass 2
+            ggml_metal_op_concurrency_reset(ctx);
+
+            // ---- Pass 2: Merge partials + inverse WHT + write output ----
+            {
+                ggml_metal_kargs_turbo_flash_p2 args_p2 = {
+                    /*.ne01     =*/ n_bh,
+                    /*.n_blocks =*/ n_blocks,
+                };
+
+                char p2_base[128];
+                char p2_name[256];
+                snprintf(p2_base, 128, "kernel_turbo_flash_p2_dv%d", dv);
+                snprintf(p2_name, 256, "%s_dv=%d", p2_base, dv);
+
+                ggml_metal_pipeline_with_params res_p2 = ggml_metal_library_get_pipeline(lib, p2_name);
+                if (!res_p2.pipeline) {
+                    ggml_metal_cv_t cv = ggml_metal_cv_init();
+                    ggml_metal_cv_set_int32(cv, dv, FC_TURBO_FLASH_P2 + 0);
+
+                    res_p2 = ggml_metal_library_compile_pipeline(lib, p2_base, p2_name, cv);
+                    ggml_metal_cv_free(cv);
+                }
+
+                // Shared memory: DV + 2 floats (shared_out[DV] + global_max + global_sum)
+                const size_t smem_p2 = sizeof(float) * (dv + 2);
+
+                ggml_metal_encoder_set_pipeline(enc, res_p2);
+                ggml_metal_encoder_set_bytes   (enc, &args_p2, sizeof(args_p2), 0);
+                ggml_metal_encoder_set_buffer  (enc, bid_partial_out, 1);  // partial_out
+                ggml_metal_encoder_set_buffer  (enc, bid_partial_ms,  2);  // partial_ms
+                ggml_metal_encoder_set_buffer  (enc, bid_dst,         3);  // output
+
+                ggml_metal_encoder_set_threadgroup_memory_size(enc, smem_p2, 0);
+
+                // Grid: (n_bh, 1, 1), Threadgroup: (max(dv, 128), 1, 1)
+                // Need at least DV threads for the WHT butterfly
+                const int tg_size = std::max(dv, (int32_t)128);
+                ggml_metal_encoder_dispatch_threadgroups(enc, n_bh, 1, 1, tg_size, 1, 1);
+            }
+        }
+
+        return 1;
+    turbo_flash_end:;
+    }
+    // ==================== End TurboFlash ====================
+
    if (!ggml_metal_op_flash_attn_ext_use_vec(op)) {
        // half8x8 kernel
        const int nqptg = OP_FLASH_ATTN_EXT_NQPSG; // queries per threadgroup
@@ -2935,7 +3348,8 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
        // ne20*(nsg)
        // each simdgroup has a full f32 head vector in shared mem to accumulate results
        //
-#define FATTN_SMEM(nsg) (GGML_PAD(((GGML_PAD(ne00, 128) + 4*ncpsg + 2*GGML_PAD(ne20, 128))*(nsg))*(sizeof(float)/2), 16))
+// Extra 128 floats (512 bytes) for TurboQuant pre-dequantized block cache in threadgroup memory
+#define FATTN_SMEM(nsg) (GGML_PAD(((GGML_PAD(ne00, 128) + 4*ncpsg + 2*GGML_PAD(ne20, 128))*(nsg))*(sizeof(float)/2) + 128*sizeof(float), 16))

        int64_t nsg = 1;

@@ -59,6 +59,7 @@ int ggml_metal_op_ssm_conv          (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_ssm_scan          (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_rwkv              (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_gated_delta_net   (ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_turbo_wht         (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_solve_tri         (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_set               (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_cpy               (ggml_metal_op_t ctx, int idx);
@@ -0,0 +1,49 @@
+// TurboQuant Fast Walsh-Hadamard rotation for Metal
+// Replaces 256KB of dense matrices with four 512-byte sign arrays (2KB total) + O(d log d) butterfly
+// Generated with seed=42 (rotation) and seed=1042 (QJL)
+
+// --- Rotation sign arrays ---
+constant float turbo_wht_signs1[128] = {
+    -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f};
+constant float turbo_wht_signs2[128] = {
+    1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f};
+
+// --- QJL sign arrays ---
+constant float turbo_qjl_wht_signs1[128] = {
+    1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
+constant float turbo_qjl_wht_signs2[128] = {
+    1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f};
+
+// --- In-place, normalized fast Walsh-Hadamard transform over 128 floats ---
+// 7 butterfly stages x 128 add/sub = 896 operations, vs 128*128 = 16384 for a dense matvec
+static void turbo_fwht_128(thread float * x) {
+    for (int half = 1; half < 128; half <<= 1) {
+        for (int base = 0; base < 128; base += 2 * half) {
+            for (int lo = base; lo < base + half; ++lo) {
+                const float u = x[lo];
+                const float v = x[lo + half];
+                x[lo]        = u + v;
+                x[lo + half] = u - v;
+            }
+        }
+    }
+    // Scale every element by 1/sqrt(128) so the transform is its own inverse
+    const float norm = 0.08838834764831845f; // 1/sqrt(128)
+    for (int k = 0; k < 128; ++k) {
+        x[k] = x[k] * norm;
+    }
+}
+
+// --- Forward rotation: scale by s1, run the normalized FWHT, then scale by s2 (all in place) ---
+static void turbo_rotate_forward(thread float * x, constant float * s1, constant float * s2) {
+    for (int k = 0; k < 128; ++k) { x[k] = x[k] * s1[k]; }
+    turbo_fwht_128(x);
+    for (int k = 0; k < 128; ++k) { x[k] = x[k] * s2[k]; }
+}
+
+// --- Inverse rotation: s2, then FWHT, then s1 (reverses the forward rotation when s1/s2 hold +/-1 signs) ---
+static void turbo_rotate_inverse(thread float * x, constant float * s1, constant float * s2) {
+    for (int k = 0; k < 128; ++k) { x[k] = x[k] * s2[k]; }
+    turbo_fwht_128(x);
+    for (int k = 0; k < 128; ++k) { x[k] = x[k] * s1[k]; }
+}
@@ -5480,6 +5480,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
        case GGML_TYPE_I64:
            // nothing to validate
            break;
+        case GGML_TYPE_TQ3_1S:
+        case GGML_TYPE_TQ4_1S:
+        case GGML_TYPE_TURBO3_0:
+        case GGML_TYPE_TURBO4_0:
+        case GGML_TYPE_TURBO2_0:
+            // WHT-rotated / TurboQuant types: no validation is performed here yet
+            // TODO: validate that the block scales are not NaN/Inf
+            break;
        default:
            {
                fprintf(stderr, "%s: invalid type %d\n", __func__, type);
@@ -102,6 +102,27 @@ GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTR
 GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

+// TurboQuant KV cache compression (arXiv 2504.19874)
+GGML_API void quantize_row_turbo3_0_ref(const float * GGML_RESTRICT x, block_turbo3_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT x, block_turbo4_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_turbo3_0(const block_turbo3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API size_t quantize_turbo3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_turbo4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API void quantize_row_turbo2_0_ref(const float * GGML_RESTRICT x, block_turbo2_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_turbo2_0(const block_turbo2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API size_t quantize_turbo2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
+// TQ3_1S: WHT-rotated 3-bit weight quantization (8-level Lloyd-Max)
+GGML_API void quantize_row_tq3_1s_ref(const float * GGML_RESTRICT x, block_tq3_1s * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_tq3_1s(const block_tq3_1s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API size_t quantize_tq3_1s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
+// TQ4_1S: WHT-rotated 4-bit weight quantization (16-level Lloyd-Max)
+GGML_API void quantize_row_tq4_1s_ref(const float * GGML_RESTRICT x, block_tq4_1s * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_tq4_1s(const block_tq4_1s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API size_t quantize_tq4_1s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
 GGML_API void iq2xs_init_impl(enum ggml_type type);
 GGML_API void iq2xs_free_impl(enum ggml_type type);
 GGML_API void iq3xs_init_impl(int grid_size);
@@ -847,6 +847,7 @@ struct vk_device_struct {
    vk_pipeline pipeline_timestep_embedding_f32;
    vk_pipeline pipeline_conv_transpose_1d_f32;
    vk_pipeline pipeline_pool2d_f32;
+    vk_pipeline pipeline_turbo_wht;
    vk_pipeline pipeline_rwkv_wkv6_f32;
    vk_pipeline pipeline_rwkv_wkv7_f32;
    // [size_idx][kda] where size_idx: 0=d32, 1=d64, 2=d128
@@ -4298,6 +4299,30 @@ static void ggml_vk_load_shaders(vk_device& device) {

    const uint32_t force_subgroup_size = use_subgroups ? subgroup_size : 0;
    const uint32_t force_subgroup_size16 = use_subgroups16 ? subgroup_size16 : 0;
+
+    // TQ4_1S uses a dedicated pipeline whose workgroup size is always 32 and
+    // whose reduction path is always the shared-memory variant.
+    //
+    // The Walsh-Hadamard butterfly inside the shader operates on 32-element
+    // blocks with one element per thread, so the workgroup contract is fixed
+    // regardless of what the rest of the mul_mat_vec family picks for the
+    // current DMMV_WG_SIZE bucket.  We always use 32 threads per workgroup.
+    //
+    // Reduction choice: the shader uses the SHMEM tree reduction even when
+    // subgroup arithmetic is available.  A subgroup-shuffle butterfly + pure
+    // subgroupAdd reduction variant was tried and measured ~70% slower on
+    // Intel Arc (Mesa Xe HPG), where subgroup shuffles and subgroup adds are
+    // emulated over LDS and end up doing the same amount of LDS traffic as
+    // the explicit shared-memory path but with extra driver overhead.  Going
+    // through SHMEM directly is always correct and is fastest on the devices
+    // we can actually measure.  Future vendor-specific heuristics can switch
+    // to the hybrid reduction variant on NVIDIA / AMD RDNA if hardware
+    // subgroup shuffles beat the LDS roundtrip there.
+    const uint32_t tq4_1s_wg_size            = 32u;
+    const uint32_t tq4_1s_force_sg_size      = 0u;
+    const bool     tq4_1s_use_subgroups      = false;
+    const shader_reduction_mode tq4_1s_reduc = SHADER_REDUCTION_MODE_SHMEM;
+
    static constexpr uint32_t mul_mat_vec_num_bindings = 5;
    static constexpr uint32_t mul_mat_vec_id_num_bindings = 6;

@@ -4339,6 +4364,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i],  "mul_mat_vec_iq4_nl_f32_f32",  arr_dmmv_iq4_nl_f32_f32_len[reduc16],  arr_dmmv_iq4_nl_f32_f32_data[reduc16],  "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i],   "mul_mat_vec_mxfp4_f32_f32",   arr_dmmv_mxfp4_f32_f32_len[reduc16],   arr_dmmv_mxfp4_f32_f32_data[reduc16],   "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_NVFP4][i],   "mul_mat_vec_nvfp4_f32_f32",   arr_dmmv_nvfp4_f32_f32_len[reduc16],   arr_dmmv_nvfp4_f32_f32_data[reduc16],   "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
+            // TQ4_1S: fixed 32-thread workgroup, shared-memory WHT butterfly,
+            // shared-memory reduction.  NUM_ROWS=8 amortises the butterfly cost
+            // across 8 output rows per workgroup.
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_TQ4_1S][i],  "mul_mat_vec_tq4_1s_f32_f32",  arr_dmmv_tq4_1s_f32_f32_len[tq4_1s_reduc],  arr_dmmv_tq4_1s_f32_f32_data[tq4_1s_reduc],  "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {8, 1, 1}, {tq4_1s_wg_size, 8, i+1}, 1, true, tq4_1s_use_subgroups, tq4_1s_force_sg_size);

            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32",  arr_dmmv_f32_f16_f32_len[reduc],  arr_dmmv_f32_f16_f32_data[reduc],  "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size);
            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32",  arr_dmmv_f16_f16_f32_len[reduc],  arr_dmmv_f16_f16_f32_data[reduc],  "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
@@ -4365,6 +4394,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_NL][i],  "mul_mat_vec_iq4_nl_f16_f32",  arr_dmmv_iq4_nl_f16_f32_len[reduc16],  arr_dmmv_iq4_nl_f16_f32_data[reduc16],  "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i],   "mul_mat_vec_mxfp4_f16_f32",   arr_dmmv_mxfp4_f16_f32_len[reduc16],   arr_dmmv_mxfp4_f16_f32_data[reduc16],   "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_NVFP4][i],   "mul_mat_vec_nvfp4_f16_f32",   arr_dmmv_nvfp4_f16_f32_len[reduc16],   arr_dmmv_nvfp4_f16_f32_data[reduc16],   "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
+            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_TQ4_1S][i],  "mul_mat_vec_tq4_1s_f16_f32",  arr_dmmv_tq4_1s_f16_f32_len[tq4_1s_reduc],  arr_dmmv_tq4_1s_f16_f32_data[tq4_1s_reduc],  "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {8, 1, 1}, {tq4_1s_wg_size, 8, i+1}, 1, true, tq4_1s_use_subgroups, tq4_1s_force_sg_size);

 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
            if (device->integer_dot_product) {
@@ -4473,6 +4503,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL],  "dequant_iq4_nl",  dequant_iq4_nl_len,  dequant_iq4_nl_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_MXFP4],   "dequant_mxfp4",   dequant_mxfp4_len,   dequant_mxfp4_data,   "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_NVFP4],   "dequant_nvfp4",   dequant_nvfp4_len,   dequant_nvfp4_data,   "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_TURBO3_0], "dequant_turbo3_0", dequant_turbo3_0_len, dequant_turbo3_0_data, "main", 2, 5 * sizeof(uint32_t), {128, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_TQ4_1S],  "dequant_tq4_1s",  dequant_tq4_1s_len,  dequant_tq4_1s_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+
+    // TurboQuant WHT
+    ggml_vk_create_pipeline(device, device->pipeline_turbo_wht, "turbo_wht", turbo_wht_len, turbo_wht_data, "main", 2, 3 * sizeof(uint32_t), {128, 1, 1}, {}, 1);

    // get_rows
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32",  get_rows_f32_len,  get_rows_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -4500,6 +4535,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL],  "get_rows_iq4_nl",  get_rows_iq4_nl_len,  get_rows_iq4_nl_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4],   "get_rows_mxfp4",   get_rows_mxfp4_len,   get_rows_mxfp4_data,   "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_NVFP4],   "get_rows_nvfp4",   get_rows_nvfp4_len,   get_rows_nvfp4_data,   "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_TURBO3_0], "get_rows_turbo3_0", get_rows_turbo3_0_len, get_rows_turbo3_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32],     "get_rows_i32",     get_rows_i32_len,     get_rows_i32_data,     "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32",  get_rows_f32_f32_len,  get_rows_f32_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -4527,6 +4563,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL],  "get_rows_iq4_nl_f32",  get_rows_iq4_nl_f32_len,  get_rows_iq4_nl_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4],   "get_rows_mxfp4_f32",   get_rows_mxfp4_f32_len,   get_rows_mxfp4_f32_data,   "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_NVFP4],   "get_rows_nvfp4_f32",   get_rows_nvfp4_f32_len,   get_rows_nvfp4_f32_data,   "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_TURBO3_0], "get_rows_turbo3_0_f32", get_rows_turbo3_0_f32_len, get_rows_turbo3_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
@@ -4588,25 +4625,32 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_32, "cpy_transpose_32", cpy_transpose_32_len, cpy_transpose_32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_16, "cpy_transpose_16", cpy_transpose_16_len, cpy_transpose_16_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);

-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q1_0], "cpy_f32_q1_0", cpy_f32_q1_0_len, cpy_f32_q1_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
+    // RTE fp16 rounding is now patched into SPIR-V programmatically (see
+    // ggml_vk_create_pipeline_func), so we only compile a single variant of
+    // each cpy_f32_quant / set_rows shader rather than _rte/non-_rte pairs.
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q1_0],   "cpy_f32_q1_0",   cpy_f32_q1_0_len,   cpy_f32_q1_0_data,   "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0],   "cpy_f32_q4_0",   cpy_f32_q4_0_len,   cpy_f32_q4_0_data,   "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1],   "cpy_f32_q4_1",   cpy_f32_q4_1_len,   cpy_f32_q4_1_data,   "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0],   "cpy_f32_q5_0",   cpy_f32_q5_0_len,   cpy_f32_q5_0_data,   "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1],   "cpy_f32_q5_1",   cpy_f32_q5_1_len,   cpy_f32_q5_1_data,   "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0],   "cpy_f32_q8_0",   cpy_f32_q8_0_len,   cpy_f32_q8_0_data,   "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);

 #define SET_ROWS(itype) \
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F32],  "set_rows_f32" #itype,  set_rows_f32 ## itype ## _len,  set_rows_f32 ## itype ## _data,  "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F16],  "set_rows_f16" #itype,  set_rows_f16 ## itype ## _len,  set_rows_f16 ## itype ## _data,  "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_BF16], "set_rows_bf16" #itype, set_rows_bf16 ## itype ## _len, set_rows_bf16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q1_0], "set_rows_q1_0" #itype, set_rows_q1_0 ## itype ## _len, set_rows_q1_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_0], "set_rows_q4_0" #itype, set_rows_q4_0 ## itype ## _len, set_rows_q4_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_1], "set_rows_q4_1" #itype, set_rows_q4_1 ## itype ## _len, set_rows_q4_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_0], "set_rows_q5_0" #itype, set_rows_q5_0 ## itype ## _len, set_rows_q5_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_1], "set_rows_q5_1" #itype, set_rows_q5_1 ## itype ## _len, set_rows_q5_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q8_0], "set_rows_q8_0" #itype, set_rows_q8_0 ## itype ## _len, set_rows_q8_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_IQ4_NL], "set_rows_iq4_nl" #itype, set_rows_iq4_nl ## itype ## _len, set_rows_iq4_nl ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F32],      "set_rows_f32" #itype,      set_rows_f32 ## itype ## _len,      set_rows_f32 ## itype ## _data,      "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F16],      "set_rows_f16" #itype,      set_rows_f16 ## itype ## _len,      set_rows_f16 ## itype ## _data,      "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_BF16],     "set_rows_bf16" #itype,     set_rows_bf16 ## itype ## _len,     set_rows_bf16 ## itype ## _data,     "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q1_0],     "set_rows_q1_0" #itype,     set_rows_q1_0 ## itype ## _len,     set_rows_q1_0 ## itype ## _data,     "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_0],     "set_rows_q4_0" #itype,     set_rows_q4_0 ## itype ## _len,     set_rows_q4_0 ## itype ## _data,     "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_1],     "set_rows_q4_1" #itype,     set_rows_q4_1 ## itype ## _len,     set_rows_q4_1 ## itype ## _data,     "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_0],     "set_rows_q5_0" #itype,     set_rows_q5_0 ## itype ## _len,     set_rows_q5_0 ## itype ## _data,     "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_1],     "set_rows_q5_1" #itype,     set_rows_q5_1 ## itype ## _len,     set_rows_q5_1 ## itype ## _data,     "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q8_0],     "set_rows_q8_0" #itype,     set_rows_q8_0 ## itype ## _len,     set_rows_q8_0 ## itype ## _data,     "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_IQ4_NL],   "set_rows_iq4_nl" #itype,   set_rows_iq4_nl ## itype ## _len,   set_rows_iq4_nl ## itype ## _data,   "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_TURBO2_0], "set_rows_turbo2_0" #itype, set_rows_turbo2_0 ## itype ## _len, set_rows_turbo2_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_TURBO3_0], "set_rows_turbo3_0" #itype, set_rows_turbo3_0 ## itype ## _len, set_rows_turbo3_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_TURBO4_0], "set_rows_turbo4_0" #itype, set_rows_turbo4_0 ## itype ## _len, set_rows_turbo4_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_TQ4_1S],   "set_rows_tq4_1s" #itype,   set_rows_tq4_1s ## itype ## _len,   set_rows_tq4_1s ## itype ## _data,   "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);

    SET_ROWS(_i32)
    SET_ROWS(_i64)
@@ -4620,6 +4664,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_TURBO3_0], "cpy_turbo3_0_f32", cpy_turbo3_0_f32_len, cpy_turbo3_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_TURBO3_0), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_TQ4_1S], "cpy_tq4_1s_f32", cpy_tq4_1s_f32_len, cpy_tq4_1s_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_TQ4_1S), 1, 1}, {}, 1);

    auto get_suffix = [](bool src0_f16, bool src1_f16, bool dst_f16) {
        std::string s;
@@ -6285,6 +6331,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_NVFP4:
+        case GGML_TYPE_TQ4_1S:
            break;
        default:
            return nullptr;
@@ -6425,6 +6472,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_NVFP4:
+        case GGML_TYPE_TQ4_1S:
            break;
        default:
            return nullptr;
@@ -6440,6 +6488,10 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
            if (m < 4096 && k >= 1024) {
                dmmv_wg = DMMV_WG_SIZE_LARGE;
            }
+        } else if (a_type == GGML_TYPE_TQ4_1S) {
+            // TQ4_1S needs exactly 32 threads (one subgroup) to cooperate on the
+            // 32-element WHT butterfly in shared memory. Force SUBGROUP-sized wg.
+            dmmv_wg = DMMV_WG_SIZE_SUBGROUP;
        } else {
            if (m <= 8192 && k >= 1024) {
                dmmv_wg = DMMV_WG_SIZE_LARGE;
@@ -7591,6 +7643,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_TURBO3_0:
+        case GGML_TYPE_TQ4_1S:
            return ctx->device->pipeline_cpy_quant_f32[src->type];
        default:
            break;
@@ -10422,7 +10476,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
    case GGML_OP_SET_ROWS:
        {
            uint32_t ne = ggml_nelements(src0);
-            if (ggml_is_quantized(dst->type)) {
+            if (dst->type == GGML_TYPE_TURBO2_0 ||
+                dst->type == GGML_TYPE_TURBO3_0 ||
+                dst->type == GGML_TYPE_TURBO4_0) {
+                ne = ne / 128;
+            } else if (dst->type == GGML_TYPE_TQ4_1S) {
+                ne = ne / 32;
+            } else if (ggml_is_quantized(dst->type)) {
                // quants run 32 threads each doing QUANT_K elements
                ne = CEIL_DIV(ne, 32 * ggml_blck_size(dst->type));
            } else {
@@ -11215,6 +11275,32 @@ static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
    });
 }

+// Records the dispatch for GGML_OP_TURBO_WHT.
+//   ctx/subctx : backend context and the command context being recorded into
+//   src0       : input tensor; its element count is passed to the shader as `ne`
+//   dst        : output tensor; direction and group size are read from dst->op_params
+// One descriptor set is requested and src0/dst are bound as the two buffers.
+static void ggml_vk_turbo_wht(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    int direction, group_size;
+    memcpy(&direction, dst->op_params + 0, sizeof(int));
+    // NOTE(review): if op_params is an int32_t array, `op_params + sizeof(int)`
+    // advances by sizeof(int) *elements*, not bytes. Left unchanged because it
+    // must mirror wherever the op constructor stores group_size -- confirm.
+    memcpy(&group_size, dst->op_params + sizeof(int), sizeof(int));
+    // group_size is a divisor below; fail loudly instead of dividing by zero
+    // (or wrapping a negative value to a huge unsigned one).
+    GGML_ASSERT(group_size > 0);
+    const uint32_t gs = (uint32_t)group_size;
+    // Parameter block handed to the dispatch: element count, direction, group size.
+    struct { uint32_t ne; uint32_t direction; uint32_t group_size; } pc = {
+        (uint32_t)ggml_nelements(src0), (uint32_t)direction, gs,
+    };
+    vk_pipeline pipeline = ctx->device->pipeline_turbo_wht;
+    GGML_ASSERT(pipeline != nullptr);
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
+    vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src0, false);
+    vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false);
+    // Spread the groups across Y/Z to stay within maxComputeWorkGroupCount[0].
+    // The X extent is given in elements (512 groups' worth); each Y slice then
+    // covers 512 groups and each Z slice covers 512 * 512 = 262144 groups.
+    const uint32_t n_groups = pc.ne / gs;
+    std::array<uint32_t, 3> elements;
+    if (n_groups > 262144) {
+        elements = { 512 * gs, 512, CEIL_DIV(n_groups, 262144) };
+    } else if (n_groups > 512) {
+        elements = { 512 * gs, CEIL_DIV(n_groups, 512), 1 };
+    } else {
+        elements = { pc.ne, 1, 1 };
+    }
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc, elements);
+}
+
 static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f });
 }
@@ -13402,6 +13488,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
    case GGML_OP_SET_ROWS:
        ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node);

+        break;
+    case GGML_OP_TURBO_WHT:
+        ggml_vk_turbo_wht(ctx, compute_ctx, src0, node);
+
        break;
    case GGML_OP_SILU_BACK:
        ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node);
@@ -15831,6 +15921,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                    case GGML_TYPE_IQ4_NL:
                    case GGML_TYPE_MXFP4:
                    case GGML_TYPE_NVFP4:
+                    case GGML_TYPE_TQ4_1S:
                        break;
                    default:
                        return false;
@@ -15888,6 +15979,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_TURBO3_0:
                        return true;
                    case GGML_TYPE_Q1_0:
                        return coopmat2;
@@ -15932,6 +16024,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                    case GGML_TYPE_IQ4_NL:
                    case GGML_TYPE_MXFP4:
                    case GGML_TYPE_NVFP4:
+                    case GGML_TYPE_TURBO3_0:
                    case GGML_TYPE_I32:
                        return true;
                    default:
@@ -15951,6 +16044,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                    case GGML_TYPE_IQ4_NL:
+                    case GGML_TYPE_TURBO2_0:
+                    case GGML_TYPE_TURBO3_0:
+                    case GGML_TYPE_TURBO4_0:
+                    case GGML_TYPE_TQ4_1S:
                        return true;
                    default:
                        return false;
@@ -15991,6 +16088,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                    case GGML_TYPE_IQ4_NL:
+                    case GGML_TYPE_TURBO3_0:
+                    case GGML_TYPE_TQ4_1S:
                        return true;
                    default:
                        break;
@@ -16245,6 +16344,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                    ggml_is_contiguous(op->src[1]) &&
                    ggml_is_contiguous(op));
            }
+        case GGML_OP_TURBO_WHT:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[0]->ne[0] % 128 == 0;
        default:
            return false;
    }
@@ -30,6 +30,41 @@ void main() {

    const uint a_offset = 0;
    const uint ib = src_idx;
+
+#if defined(DATA_A_TQ4_1S)
+    // TQ4_1S requires full inverse WHT after centroid*scale dequant.
+    // Dequant all 32 elements into a buffer, apply butterfly, then write.
+    const float tq4_signs[32] = float[32](
+        +1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
+        -1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
+        -1.0, -1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0,
+        -1.0, +1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0
+    );
+    const float TQ4_INV_SQRT32 = 0.17677669529663688;
+
+    float buf[32];
+    for (int j = 0; j < 32; j += 2) {
+        vec2 v = dequantize(ib, j, a_offset);
+        buf[j]   = v.x;
+        buf[j+1] = v.y;
+    }
+
+    // Inverse WHT butterfly (5 stages for 32 elements)
+    for (uint step = 1u; step < 32u; step <<= 1u) {
+        for (uint i = 0u; i < 32u; i += step * 2u) {
+            for (uint j2 = i; j2 < i + step; j2++) {
+                float a2 = buf[j2], b2 = buf[j2 + step];
+                buf[j2]        = a2 + b2;
+                buf[j2 + step] = a2 - b2;
+            }
+        }
+    }
+
+    // Normalize and apply sign pattern
+    for (int j = 0; j < 32; j++) {
+        data_d[dst_idx + j] = buf[j] * TQ4_INV_SQRT32 * tq4_signs[j];
+    }
+#else
    const vec2 dm = get_dm(ib, a_offset);

    [[unroll]] for (int j = 0; j < QUANT_K; j += 4) {
@@ -48,4 +83,5 @@ void main() {
        data_d[dst_idx + j + 3] = v[3];
 #endif
    }
+#endif
 }
@@ -1,8 +1,14 @@
 #version 450

+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#extension GL_KHR_shader_subgroup_ballot : enable
+#extension GL_KHR_shader_subgroup_shuffle : enable
 #include "types.glsl"

-#if defined(SET_ROWS) && QUANT_K == 1
+#if defined(SET_ROWS) && (defined(DATA_A_TURBO2_0) || defined(DATA_A_TURBO3_0) || defined(DATA_A_TURBO4_0))
+layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
+const uint BLOCK_SIZE = 128;
+#elif defined(SET_ROWS) && QUANT_K == 1
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 const uint BLOCK_SIZE = 512;
 #else
@@ -208,6 +214,109 @@ void quantize(uint dst_idx, uint src_idx)
 }
 #endif

+#if defined(DATA_A_TURBO3_0)
+const float TS1[128] = float[128](
+    -1,  1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,
+     1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1, -1,
+    -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1,
+     1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1,  1,  1,  1, -1,  1,
+    -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1,  1,
+     1, -1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,  1, -1,
+    -1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,
+     1, -1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,  1,  1, -1,  1
+);
+
+const float TS2[128] = float[128](
+     1,  1,  1,  1, -1,  1,  1, -1,  1, -1, -1, -1,  1, -1, -1, -1,
+     1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1,  1,
+     1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,  1, -1,
+     1, -1,  1,  1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,
+     1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1,  1,
+    -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1,
+     1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1,
+    -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1
+);
+
+const float TINV = 0.08838834764831845;  // 1 / sqrt(128)
+
+const float TC[8] = float[8](
+    -0.190685, -0.117832, -0.065717, -0.021460,
+     0.021460,  0.065717,  0.117832,  0.190685
+);
+
+const float TM[7] = float[7](
+    -0.154259, -0.091775, -0.043589,
+     0.0,
+     0.043589,  0.091775,  0.154259
+);
+
+#if defined(SET_ROWS)
+
+shared float wht[128];
+shared float sg_acc[16];
+shared float gnrm;
+
+void quantize_block(uint b, uint o) { // Serially quantizes the 128 values wht[o .. o+127] into block data_q[b]. NOTE(review): not called by the main() variants visible in this chunk -- confirm it is still needed.
+    [[unroll]] for (int j = 0; j < 32; ++j) data_q[b].qs[j]    = uint8_t(0); // clear first: the low-2-bit fields are OR-ed in below
+    [[unroll]] for (int j = 0; j < 16; ++j) data_q[b].signs[j] = uint8_t(0); // clear first: the high bits are OR-ed in below
+    float rs = 0.0; // running sum of squared chosen centroids
+    [[unroll]] for (int j = 0; j < 128; ++j) {
+        float v  = wht[o + j];
+        uint  i  = v < TM[0] ? 0 : v < TM[1] ? 1 : v < TM[2] ? 2 : v < TM[3] ? 3 : // index = first midpoint in TM that v is below, else 7
+                   v < TM[4] ? 4 : v < TM[5] ? 5 : v < TM[6] ? 6 : 7;
+        rs += TC[i] * TC[i];
+        uint low2 = i & 0x3; // low two bits of the 3-bit index go to qs (4 per byte)
+        uint hi1  = (i >> 2) & 0x1; // top bit of the 3-bit index goes to signs (8 per byte)
+        data_q[b].qs[j / 4]    |= uint8_t(low2 << ((j % 4) * 2)); // read-modify-write on the destination buffer
+        data_q[b].signs[j / 8] |= uint8_t(hi1  << (j % 8));
+    }
+    float rn = sqrt(rs); // L2 norm of the chosen centroids
+    data_q[b].norm = float16_t((rn > 1e-10) ? (gnrm / rn) : gnrm); // stored norm = shared gnrm / centroid norm; falls back to gnrm when the centroid norm is ~0
+}
+
+#endif // defined(SET_ROWS)
+#endif // defined(DATA_A_TURBO3_0)
+
+#if defined(DATA_A_TQ4_1S)
+
+const float TQ4_SIGNS[32] = float[32](
+    +1, -1, +1, -1, +1, +1, -1, +1,
+    -1, -1, +1, -1, +1, +1, -1, +1,
+    -1, -1, +1, -1, +1, -1, -1, +1,
+    -1, +1, +1, -1, +1, -1, -1, +1
+);
+
+const float TQ4_INV_SQRT32 = 0.17677669529663688;  // 1 / sqrt(32)
+
+const float TQ4_CENTROIDS[16] = float[16](
+    -2.732590, -2.069017, -1.618046, -1.256231,
+    -0.942340, -0.656759, -0.388048, -0.128395,
+     0.128395,  0.388048,  0.656759,  0.942340,
+     1.256231,  1.618046,  2.069017,  2.732590
+);
+
+// Nearest-centroid lookup for the 16 TQ4 centroids. Scans the 15 ascending
+// midpoint thresholds in order and returns the index of the first one that
+// val is strictly below, or 15 when val is below none of them.
+uint tq4_choose_index(float val) {
+    const float cut[15] = float[15](
+        -2.400804, -1.843532, -1.437139, -1.099286, -0.799550,
+        -0.522404, -0.258222,  0.000000,  0.258222,  0.522404,
+         0.799550,  1.099286,  1.437139,  1.843532,  2.400804
+    );
+    for (uint k = 0u; k < 15u; ++k) {
+        if (val < cut[k]) {
+            return k;
+        }
+    }
+    return 15u;
+}
+
+#endif // defined(DATA_A_TQ4_1S)
+
 #if defined(DATA_A_IQ4_NL)
 uint best_index(float x) {
    if (x <= kvalues_iq4nl[0]) return 0;
@@ -270,7 +379,451 @@ void quantize(uint dst_idx, uint src_idx)
 }
 #endif

-#if defined(SET_ROWS)
+#if defined(SET_ROWS) && defined(DATA_A_TURBO3_0)
+// SET_ROWS entry point for TURBO3_0. Each 128-thread workgroup handles one
+// 128-element group of a source row: L2-normalize, apply TS1 signs, run the
+// Walsh-Hadamard butterfly, apply TS2 signs + 1/sqrt(128), pick one of 8
+// centroids per element, and pack the result into data_q[db].
+// Assumes gl_LocalInvocationID.x maps linearly onto subgroup lanes and that
+// the subgroup size is a multiple of 8 (sg_acc has room for 16 subgroups).
+void main() {
+    const uint t   = gl_LocalInvocationID.x;
+    const uint g   = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
+    const uint gpr = p.ne00 / 128;  // 128-element groups per source row
+
+    // g depends only on the workgroup ID, so both exits are workgroup-uniform
+    // and cannot strand threads at a later barrier().
+    if (gpr == 0) return;
+    if (g >= p.ne / 128) return;
+
+    // Split the flat group index into (group-in-row, i01, i02, i03) using the
+    // source extents. i02 is split off with ne02; ne12 is only the extent that
+    // i03 wraps against in the row-index lookup below.
+    uint tmp = g;
+    const uint ig  = tmp % gpr; tmp /= gpr;
+    const uint i01 = tmp % p.ne01; tmp /= p.ne01;
+    const uint i02 = tmp % p.ne02;
+    const uint i03 = tmp / p.ne02;
+
+    const uint sb  = src0_idx(ig * 128, i01, i02, i03) + get_aoffset();
+    const uint i1  = data_i[src1_idx(i01, fastmod(i02, p.ne11), fastmod(i03, p.ne12), 0) + get_boffset()] DATA_I_SWIZZLE;
+    const uint db  = dst_idx(ig, i1, i02, i03) + get_doffset();
+
+    // Step 1: load into shared memory
+    wht[t] = data_s[sb + t];
+    barrier();
+
+    // Step 2: L2 norm -- subgroup partial sums, then thread 0 combines them
+    float v2 = wht[t] * wht[t];
+    v2 = subgroupAdd(v2);
+    if (gl_SubgroupInvocationID == 0) sg_acc[gl_SubgroupID] = v2;
+    barrier();
+    if (t == 0) {
+        float total = 0.0;
+        for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc[w];
+        gnrm = sqrt(total);
+    }
+    barrier();
+
+    // Step 3: normalize (all-zero groups stay zero), then signs1 -> butterfly
+    wht[t] *= (gnrm > 1e-10) ? (1.0 / gnrm) : 0.0;
+    barrier();
+
+    wht[t] *= TS1[t];
+    barrier();
+
+    // Each (t, t + h) pair is owned by the thread whose index has bit h clear.
+    [[unroll]] for (uint h = 1; h < 128; h *= 2) {
+        if ((t % (2 * h)) < h) {
+            float a    = wht[t];
+            float b    = wht[t + h];
+            wht[t]     = a + b;
+            wht[t + h] = a - b;
+        }
+        barrier();
+    }
+
+    // Step 4: apply signs2 + 1/sqrt(128) scaling
+    float rv = wht[t] * TINV * TS2[t];
+
+    // Step 5: quantize -- all 128 threads participate
+    uint idx = rv < TM[0] ? 0u : rv < TM[1] ? 1u : rv < TM[2] ? 2u : rv < TM[3] ? 3u :
+              rv < TM[4] ? 4u : rv < TM[5] ? 5u : rv < TM[6] ? 6u : 7u;
+
+    // Pack qs: low 2 bits of 4 neighbouring elements per byte via subgroup shuffle
+    uint sg_lane = gl_SubgroupInvocationID;
+    uint my_low2 = idx & 0x3u;
+    uint qs_byte = 0u;
+    [[unroll]] for (uint k = 0; k < 4; k++) {
+        uint contrib = subgroupShuffle(my_low2, (sg_lane & ~3u) + k);
+        qs_byte |= contrib << (k * 2u);
+    }
+    if (sg_lane % 4u == 0u) {
+        data_q[db].qs[t / 4u] = uint8_t(qs_byte);
+    }
+
+    // Pack signs: high bit of 8 neighbouring elements per byte via subgroup ballot.
+    // The ballot is a uvec4 bitmask; lanes 32 and above live in .y/.z/.w, so the
+    // 32-bit word has to be selected by lane (matters for 64-wide subgroups).
+    uvec4 ballot = subgroupBallot(((idx >> 2u) & 1u) != 0u);
+    if (sg_lane % 8u == 0u) {
+        const uint word = ballot[sg_lane / 32u];
+        data_q[db].signs[t / 8u] = uint8_t((word >> (sg_lane % 32u)) & 0xFFu);
+    }
+
+    // Step 6: reconstruction norm via subgroup reduction. sg_acc is reused; the
+    // butterfly barriers above order this write after thread 0's earlier reads.
+    float rc = TC[idx] * TC[idx];
+    rc = subgroupAdd(rc);
+    if (sg_lane == 0u) sg_acc[gl_SubgroupID] = rc;
+    barrier();
+    if (t == 0u) {
+        float total = 0.0;
+        for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc[w];
+        float rn = sqrt(total);
+        // stored norm = input norm / centroid norm (input norm alone if that is ~0)
+        data_q[db].norm = float16_t((rn > 1e-10) ? (gnrm / rn) : gnrm);
+    }
+}
+#elif defined(SET_ROWS) && defined(DATA_A_TURBO2_0)
+// Mirror of the TURBO3_0 block above, adapted for turbo2 (4 centroids,
+// 2-bit pack, no signs byte). WHT tables and reduction structure are
+// identical (QK = 128 for both).
+const float TS1_T2[128] = float[128](
+    -1,  1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,
+     1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1, -1,
+    -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1,
+     1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1,  1,  1,  1, -1,  1,
+    -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1,  1,
+     1, -1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,  1, -1,
+    -1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,
+     1, -1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,  1,  1, -1,  1
+);
+const float TS2_T2[128] = float[128](
+     1,  1,  1,  1, -1,  1,  1, -1,  1, -1, -1, -1,  1, -1, -1, -1,
+     1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1,  1,
+     1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,  1, -1,
+     1, -1,  1,  1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,
+     1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1,  1,
+    -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1,
+     1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1,
+    -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1
+);
+const float TINV_T2 = 0.08838834764831845;  // 1 / sqrt(128)
+// Lloyd-Max centroids for N(0, 1/128), 4 levels (matches CENTROIDS_2BIT in C ref)
+const float TC2[4] = float[4](-0.133462, -0.039994, 0.039994, 0.133462);
+// Midpoints between adjacent centroids
+const float TM2[3] = float[3](-0.086728, 0.0, 0.086728);
+
+shared float wht_t2[128];
+shared float sg_acc_t2[16];
+shared float gnrm_t2;
+
+// SET_ROWS entry point for TURBO2_0. Each 128-thread workgroup handles one
+// 128-element group of a source row: L2-normalize, apply TS1_T2 signs, run the
+// Walsh-Hadamard butterfly, apply TS2_T2 signs + 1/sqrt(128), pick one of 4
+// centroids per element, and pack 4 two-bit indices per byte into data_q[db].
+// Assumes gl_LocalInvocationID.x maps linearly onto subgroup lanes and that
+// the subgroup size is a multiple of 4 and at least 8 (sg_acc_t2 has 16 slots).
+void main() {
+    const uint t   = gl_LocalInvocationID.x;
+    const uint g   = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
+    const uint gpr = p.ne00 / 128;  // 128-element groups per source row
+
+    // g depends only on the workgroup ID, so both exits are workgroup-uniform.
+    if (gpr == 0) return;
+    if (g >= p.ne / 128) return;
+
+    // Split the flat group index using the source extents. i02 is split off
+    // with ne02; ne12 is only the extent i03 wraps against in the lookup below.
+    uint tmp = g;
+    const uint ig  = tmp % gpr; tmp /= gpr;
+    const uint i01 = tmp % p.ne01; tmp /= p.ne01;
+    const uint i02 = tmp % p.ne02;
+    const uint i03 = tmp / p.ne02;
+
+    const uint sb  = src0_idx(ig * 128, i01, i02, i03) + get_aoffset();
+    const uint i1  = data_i[src1_idx(i01, fastmod(i02, p.ne11), fastmod(i03, p.ne12), 0) + get_boffset()] DATA_I_SWIZZLE;
+    const uint db  = dst_idx(ig, i1, i02, i03) + get_doffset();
+
+    // Load one element per thread into shared memory
+    wht_t2[t] = data_s[sb + t];
+    barrier();
+
+    // L2 norm: subgroup partial sums, then thread 0 combines them
+    float v2 = wht_t2[t] * wht_t2[t];
+    v2 = subgroupAdd(v2);
+    if (gl_SubgroupInvocationID == 0) sg_acc_t2[gl_SubgroupID] = v2;
+    barrier();
+    if (t == 0) {
+        float total = 0.0;
+        for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc_t2[w];
+        gnrm_t2 = sqrt(total);
+    }
+    barrier();
+
+    // Normalize (all-zero groups stay zero), then signs1 -> butterfly
+    wht_t2[t] *= (gnrm_t2 > 1e-10) ? (1.0 / gnrm_t2) : 0.0;
+    barrier();
+
+    wht_t2[t] *= TS1_T2[t];
+    barrier();
+
+    // Each (t, t + h) pair is owned by the thread whose index has bit h clear.
+    [[unroll]] for (uint h = 1; h < 128; h *= 2) {
+        if ((t % (2 * h)) < h) {
+            float a    = wht_t2[t];
+            float b    = wht_t2[t + h];
+            wht_t2[t]     = a + b;
+            wht_t2[t + h] = a - b;
+        }
+        barrier();
+    }
+
+    // signs2 + 1/sqrt(128) scaling
+    float rv = wht_t2[t] * TINV_T2 * TS2_T2[t];
+
+    // Quantize to nearest of 4 centroids (2-bit index, no signs byte)
+    uint idx = rv < TM2[0] ? 0u : rv < TM2[1] ? 1u : rv < TM2[2] ? 2u : 3u;
+
+    // Pack qs: 4 elements per byte (full 2-bit each, no high bit)
+    uint sg_lane = gl_SubgroupInvocationID;
+    uint qs_byte = 0u;
+    [[unroll]] for (uint k = 0; k < 4; k++) {
+        uint contrib = subgroupShuffle(idx & 0x3u, (sg_lane & ~3u) + k);
+        qs_byte |= contrib << (k * 2u);
+    }
+    if (sg_lane % 4u == 0u) {
+        data_q[db].qs[t / 4u] = uint8_t(qs_byte);
+    }
+
+    // Reconstruction norm via subgroup reduction. sg_acc_t2 is reused; the
+    // butterfly barriers above order this write after thread 0's earlier reads.
+    float rc = TC2[idx] * TC2[idx];
+    rc = subgroupAdd(rc);
+    if (sg_lane == 0u) sg_acc_t2[gl_SubgroupID] = rc;
+    barrier();
+    if (t == 0u) {
+        float total = 0.0;
+        for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc_t2[w];
+        float rn = sqrt(total);
+        // stored norm = input norm / centroid norm (input norm alone if that is ~0)
+        data_q[db].norm = float16_t((rn > 1e-10) ? (gnrm_t2 / rn) : gnrm_t2);
+    }
+}
+
+#elif defined(SET_ROWS) && defined(DATA_A_TURBO4_0)
+// Mirror of the TURBO3_0 block above, adapted for turbo4 (16 centroids,
+// 4-bit nibble pack, no signs byte). WHT tables and reduction structure
+// are identical (QK = 128 for both). The block struct keeps a reserved
+// rnorm field for ABI parity with the legacy 3-bit + QJL layout.
+const float TS1_T4[128] = float[128](
+    -1,  1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,
+     1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1, -1,
+    -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1,
+     1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1,  1,  1,  1, -1,  1,
+    -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1,  1,
+     1, -1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,  1, -1,
+    -1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,
+     1, -1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,  1,  1, -1,  1
+);
+const float TS2_T4[128] = float[128](
+     1,  1,  1,  1, -1,  1,  1, -1,  1, -1, -1, -1,  1, -1, -1, -1,
+     1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1,  1,
+     1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,  1, -1,
+     1, -1,  1,  1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,
+     1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1,  1,
+    -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1,
+     1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1,
+    -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1
+);
+const float TINV_T4 = 0.08838834764831845;  // 1 / sqrt(128)
+// Lloyd-Max centroids for N(0, 1/128), 16 levels (matches CENTROIDS_4BIT in C ref)
+const float TC4[16] = float[16](
+    -0.173926, -0.117195, -0.089527, -0.068756,
+    -0.051262, -0.035597, -0.020989, -0.006938,
+     0.006938,  0.020989,  0.035597,  0.051262,
+     0.068756,  0.089527,  0.117195,  0.173926
+);
+// 15 midpoints between adjacent centroids
+const float TM4[15] = float[15](
+    -0.145561, -0.103361, -0.079142, -0.060009,
+    -0.043430, -0.028293, -0.013964,  0.0,
+     0.013964,  0.028293,  0.043430,  0.060009,
+     0.079142,  0.103361,  0.145561
+);
+
+shared float wht_t4[128];
+shared float sg_acc_t4[16];
+shared float gnrm_t4;
+
+// SET_ROWS entry point for TURBO4_0. Each 128-thread workgroup handles one
+// 128-element group of a source row: L2-normalize, apply TS1_T4 signs, run the
+// Walsh-Hadamard butterfly, apply TS2_T4 signs + 1/sqrt(128), pick one of 16
+// centroids per element, and pack 2 nibbles per byte into data_q[db].
+// Assumes gl_LocalInvocationID.x maps linearly onto subgroup lanes and that
+// the subgroup size is even and at least 8 (sg_acc_t4 has 16 slots).
+void main() {
+    const uint t   = gl_LocalInvocationID.x;
+    const uint g   = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
+    const uint gpr = p.ne00 / 128;  // 128-element groups per source row
+
+    // g depends only on the workgroup ID, so both exits are workgroup-uniform.
+    if (gpr == 0) return;
+    if (g >= p.ne / 128) return;
+
+    // Split the flat group index using the source extents. i02 is split off
+    // with ne02; ne12 is only the extent i03 wraps against in the lookup below.
+    uint tmp = g;
+    const uint ig  = tmp % gpr; tmp /= gpr;
+    const uint i01 = tmp % p.ne01; tmp /= p.ne01;
+    const uint i02 = tmp % p.ne02;
+    const uint i03 = tmp / p.ne02;
+
+    const uint sb  = src0_idx(ig * 128, i01, i02, i03) + get_aoffset();
+    const uint i1  = data_i[src1_idx(i01, fastmod(i02, p.ne11), fastmod(i03, p.ne12), 0) + get_boffset()] DATA_I_SWIZZLE;
+    const uint db  = dst_idx(ig, i1, i02, i03) + get_doffset();
+
+    // Load one element per thread into shared memory
+    wht_t4[t] = data_s[sb + t];
+    barrier();
+
+    // L2 norm: subgroup partial sums, then thread 0 combines them
+    float v2 = wht_t4[t] * wht_t4[t];
+    v2 = subgroupAdd(v2);
+    if (gl_SubgroupInvocationID == 0) sg_acc_t4[gl_SubgroupID] = v2;
+    barrier();
+    if (t == 0) {
+        float total = 0.0;
+        for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc_t4[w];
+        gnrm_t4 = sqrt(total);
+    }
+    barrier();
+
+    // Normalize (all-zero groups stay zero), then signs1 -> butterfly
+    wht_t4[t] *= (gnrm_t4 > 1e-10) ? (1.0 / gnrm_t4) : 0.0;
+    barrier();
+
+    wht_t4[t] *= TS1_T4[t];
+    barrier();
+
+    // Each (t, t + h) pair is owned by the thread whose index has bit h clear.
+    [[unroll]] for (uint h = 1; h < 128; h *= 2) {
+        if ((t % (2 * h)) < h) {
+            float a    = wht_t4[t];
+            float b    = wht_t4[t + h];
+            wht_t4[t]     = a + b;
+            wht_t4[t + h] = a - b;
+        }
+        barrier();
+    }
+
+    // signs2 + 1/sqrt(128) scaling
+    float rv = wht_t4[t] * TINV_T4 * TS2_T4[t];
+
+    // Quantize to nearest of 16 centroids (4-bit index, no signs byte):
+    // idx ends up as the count of leading midpoints that rv is >= to.
+    uint idx = 0u;
+    [[unroll]] for (uint i = 0; i < 15; i++) {
+        if (rv >= TM4[i]) idx = i + 1u;
+    }
+
+    // Pack qs: 2 elements per byte (even lane -> low nibble, odd -> high nibble)
+    uint sg_lane = gl_SubgroupInvocationID;
+    uint pair_low  = subgroupShuffle(idx & 0xFu, sg_lane & ~1u);
+    uint pair_high = subgroupShuffle(idx & 0xFu, (sg_lane & ~1u) + 1u);
+    uint qs_byte = pair_low | (pair_high << 4u);
+    if (sg_lane % 2u == 0u) {
+        data_q[db].qs[t / 2u] = uint8_t(qs_byte);
+    }
+
+    // Reset rnorm field (reserved in 4-bit mode)
+    if (t == 0u) {
+        data_q[db].rnorm = float16_t(0.0);
+    }
+
+    // Reconstruction norm via subgroup reduction. sg_acc_t4 is reused; the
+    // butterfly barriers above order this write after thread 0's earlier reads.
+    float rc = TC4[idx] * TC4[idx];
+    rc = subgroupAdd(rc);
+    if (sg_lane == 0u) sg_acc_t4[gl_SubgroupID] = rc;
+    barrier();
+    if (t == 0u) {
+        float total = 0.0;
+        for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc_t4[w];
+        float rn = sqrt(total);
+        // stored norm = input norm / centroid norm (input norm alone if that is ~0)
+        data_q[db].norm = float16_t((rn > 1e-10) ? (gnrm_t4 / rn) : gnrm_t4);
+    }
+}
+
+#elif defined(SET_ROWS) && defined(DATA_A_TQ4_1S)
+
+// SET_ROWS entry point for TQ4_1S. One thread per element of a 32-element
+// block: sign flip + Walsh-Hadamard butterfly via subgroup shuffles, two
+// half-block scales (d0 for elements 0..15, d1 for 16..31) found by a coarse
+// search plus least-squares refinement, then 4-bit nibble packing.
+// NOTE(review): the shuffles and reductions use t as the lane id, so this
+// assumes the 32 threads of the workgroup form a single subgroup -- confirm
+// the pipeline guarantees that on devices whose subgroup size is below 32.
+void main() {
+    const uint t   = gl_LocalInvocationID.x;    // 0..31, one per block element
+    const uint g   = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
+    const uint gpr = p.ne00 / 32;  // blocks per row
+
+    // g depends only on the workgroup ID, so both exits are workgroup-uniform.
+    if (gpr == 0) return;
+    if (g >= p.ne / 32) return;
+
+    // Split the flat block index using the source extents. i02 is split off
+    // with ne02; ne12 is only the extent i03 wraps against in the lookup below.
+    uint tmp = g;
+    const uint ig  = tmp % gpr; tmp /= gpr;
+    const uint i01 = tmp % p.ne01; tmp /= p.ne01;
+    const uint i02 = tmp % p.ne02;
+    const uint i03 = tmp / p.ne02;
+
+    const uint sb  = src0_idx(ig * 32, i01, i02, i03) + get_aoffset();
+    const uint i1  = data_i[src1_idx(i01, fastmod(i02, p.ne11), fastmod(i03, p.ne12), 0) + get_boffset()] DATA_I_SWIZZLE;
+    const uint db  = dst_idx(ig, i1, i02, i03) + get_doffset();
+
+    // Step 1: load one f32 input per thread
+    float val = data_s[sb + t];
+
+    // Step 2: forward transform via subgroup ops
+    // Sign flip
+    val *= TQ4_SIGNS[t];
+
+    // Butterfly via subgroupShuffleXor: the lane with bit h clear keeps the
+    // sum, its partner keeps (partner's lower value - own value).
+    [[unroll]] for (uint h = 1u; h < 32u; h <<= 1u) {
+        const float other = subgroupShuffleXor(val, h);
+        val = ((t & h) == 0u) ? (val + other) : (other - val);
+    }
+
+    // Normalize by 1/sqrt(32)
+    val *= TQ4_INV_SQRT32;
+
+    // Step 3: RMS of each 16-element half as the initial scale
+    float sq = val * val;
+    float sum_sq_lo = subgroupAdd((t < 16u) ? sq : 0.0);
+    float sum_sq_hi = subgroupAdd((t >= 16u) ? sq : 0.0);
+    float rms_lo = sqrt(sum_sq_lo / 16.0);
+    float rms_hi = sqrt(sum_sq_hi / 16.0);
+
+    // Step 4: 9-point scale search -- per the original author this matches the
+    // CPU quantize_row_tq4_1s_ref. Both halves share one multiplier per trial
+    // and the trial with the lowest total squared error over the block wins.
+    const float SCALES[9] = float[9](0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.35, 1.5);
+    float best_d0 = rms_lo;
+    float best_d1 = rms_hi;
+    float best_err_total = 1e30;
+
+    for (uint si = 0u; si < 9u; si++) {
+        float d0 = rms_lo * SCALES[si];
+        float d1 = rms_hi * SCALES[si];
+        float inv = (t < 16u)
+            ? ((d0 > 1e-10) ? 1.0 / d0 : 0.0)
+            : ((d1 > 1e-10) ? 1.0 / d1 : 0.0);
+
+        uint idx = tq4_choose_index(val * inv);
+        float c = TQ4_CENTROIDS[idx];
+        float d = (t < 16u) ? d0 : d1;
+        float diff2 = val - c * d;
+        float local_err = diff2 * diff2;
+
+        // err_total is identical on every lane, so the update is uniform
+        float err_total = subgroupAdd(local_err);
+        if (err_total < best_err_total) {
+            best_err_total = err_total;
+            best_d0 = d0;
+            best_d1 = d1;
+        }
+    }
+
+    // Step 5: iterative refinement (6 iterations): re-pick centroids with the
+    // current scales, then set each scale to sum(val*c) / sum(c*c) of its half.
+    [[unroll]] for (uint iter = 0u; iter < 6u; iter++) {
+        float inv = (t < 16u)
+            ? ((best_d0 > 1e-10) ? 1.0 / best_d0 : 0.0)
+            : ((best_d1 > 1e-10) ? 1.0 / best_d1 : 0.0);
+
+        uint idx = tq4_choose_index(val * inv);
+        float c = TQ4_CENTROIDS[idx];
+
+        float num_lo = subgroupAdd((t < 16u) ? val * c : 0.0);
+        float den_lo = subgroupAdd((t < 16u) ? c * c   : 0.0);
+        float num_hi = subgroupAdd((t >= 16u) ? val * c : 0.0);
+        float den_hi = subgroupAdd((t >= 16u) ? c * c   : 0.0);
+
+        if (den_lo > 1e-10) best_d0 = num_lo / den_lo;
+        if (den_hi > 1e-10) best_d1 = num_hi / den_hi;
+    }
+
+    // Step 6: final quantization with the refined scales
+    float inv_final = (t < 16u)
+        ? ((best_d0 > 1e-10) ? 1.0 / best_d0 : 0.0)
+        : ((best_d1 > 1e-10) ? 1.0 / best_d1 : 0.0);
+    uint cidx = tq4_choose_index(val * inv_final);
+
+    // Step 7: nibble packing -- two 4-bit indices per byte. Even element goes
+    // in the low nibble, its odd neighbour (lane t^1) in the high nibble.
+    // Every lane takes part in the shuffle; only even lanes write a byte.
+    uint partner_idx = subgroupShuffle(cidx, t ^ 1u);
+    if ((t & 1u) == 0u) {
+        data_q[db].qs[t >> 1u] = uint8_t(cidx | (partner_idx << 4u));
+    }
+
+    // Step 8: store scales (thread 0 writes both)
+    if (t == 0u) {
+        data_q[db].d0 = float16_t(best_d0);
+        data_q[db].d1 = float16_t(best_d1);
+    }
+}
+#elif defined(SET_ROWS)

 void main() {
 #ifdef NEEDS_INIT_IQ_SHMEM
@@ -651,3 +651,78 @@ vec2 get_dm(uint ib, uint a_offset) {
    return vec2(1, 0);
 }
 #endif
+
+#if defined(DATA_A_TURBO3_0)
+// Decodes two consecutive TURBO3_0 elements, iqs and iqs + 1, of block
+// data_a[a_offset + ib] to their raw centroid values. The block norm is not
+// applied here. iqs is an element index within the block; qs stores 4
+// two-bit fields per byte and signs stores 8 one-bit fields per byte.
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    // PolarQuant 3-bit centroids (Lloyd-Max for Gaussian)
+    const float lut[8] = float[8](
+        -0.190685, -0.117832, -0.065717, -0.021460,
+         0.021460,  0.065717,  0.117832,  0.190685
+    );
+
+    vec2 pair;
+    for (uint k = 0u; k < 2u; ++k) {
+        const uint e  = iqs + k;
+        // low two bits of the index come from qs, the top bit from signs
+        const uint lo = (uint(data_a[a_offset + ib].qs[e / 4u]) >> ((e % 4u) * 2u)) & 0x3u;
+        const uint hi = (uint(data_a[a_offset + ib].signs[e / 8u]) >> (e % 8u)) & 0x1u;
+        pair[k] = lut[lo | (hi << 2u)];
+    }
+    return pair;
+}
+// Decodes four consecutive elements starting at iqs as two adjacent pairs.
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const vec2 head = dequantize(ib, iqs, a_offset);
+    const vec2 tail = dequantize(ib, iqs + 2, a_offset);
+    return vec4(head, tail);
+}
+// Returns (scale, offset) for the block: the stored norm and a zero offset.
+vec2 get_dm(uint ib, uint a_offset) {
+    const float block_norm = float(data_a[a_offset + ib].norm);
+    return vec2(block_norm, 0);
+}
+#endif
+
+#if defined(DATA_A_TQ4_1S)
+// Decodes two consecutive TQ4_1S elements, iqs and iqs + 1, of block
+// data_a[a_offset + ib]. iqs is an element index within the block (two
+// nibbles per qs byte). Each centroid is multiplied by d0 for elements below
+// 16 and by d1 otherwise. The result is centroid * scale only: the inverse
+// rotation is NOT applied here and is left to the caller.
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    // TQ4_1S: 16-level Lloyd-Max centroids for N(0,1)
+    const float lut[16] = float[16](
+        -2.732590, -2.069017, -1.618046, -1.256231,
+        -0.942340, -0.656759, -0.388048, -0.128395,
+         0.128395,  0.388048,  0.656759,  0.942340,
+         1.256231,  1.618046,  2.069017,  2.732590
+    );
+
+    const float scale_lo = float(data_a[a_offset + ib].d0);
+    const float scale_hi = float(data_a[a_offset + ib].d1);
+
+    vec2 pair;
+    for (uint k = 0u; k < 2u; ++k) {
+        const uint e   = iqs + k;
+        // even element -> low nibble, odd element -> high nibble
+        const uint nib = (uint(data_a[a_offset + ib].qs[e / 2u]) >> ((e & 1u) * 4u)) & 0xFu;
+        pair[k] = lut[nib] * ((e < 16u) ? scale_lo : scale_hi);
+    }
+    return pair;
+}
+// Decodes four consecutive elements starting at iqs as two adjacent pairs.
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const vec2 head = dequantize(ib, iqs, a_offset);
+    const vec2 tail = dequantize(ib, iqs + 2, a_offset);
+    return vec4(head, tail);
+}
+// Returns the identity (scale 1, offset 0): TQ4_1S has no block-wide
+// scale/min because dequantize() already applies the per-half scales.
+vec2 get_dm(uint ib, uint a_offset) {
+    const vec2 identity_dm = vec2(1, 0);
+    return identity_dm;
+}
+#endif
@@ -715,6 +715,33 @@ float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords
 }
 #endif

+#if defined(DATA_A_TURBO3_0)
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufTURBO3_0 {
+   block_turbo3_0 block;
+};
+
+// Decode one TURBO3_0 element for the buffer-reference decode path.
+// Only coordInBlock[1] is read (as the element index inside the block);
+// blockCoords is unused. Returns centroid * norm as float16_t.
+float16_t dequantFuncTURBO3_0(const in decodeBufTURBO3_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float lut[8] = float[8](
+        -0.190685, -0.117832, -0.065717, -0.021460,
+         0.021460,  0.065717,  0.117832,  0.190685
+    );
+    const uint elem = coordInBlock[1];
+
+    // Low two bits of the code: four 2-bit fields per qs byte.
+    const uint lo = (uint(bl.block.qs[elem >> 2u]) >> ((elem & 3u) << 1u)) & 0x3u;
+
+    // Third (high) bit of the code: eight 1-bit fields per signs byte.
+    const uint hi = (uint(bl.block.signs[elem >> 3u]) >> (elem & 7u)) & 0x1u;
+
+    const float scale = float(bl.block.norm);
+    return float16_t(lut[(hi << 2u) | lo] * scale);
+}
+#endif
+
 #if defined(DATA_A_Q1_0)
 #define dequantFuncA dequantFuncQ1_0
 #elif defined(DATA_A_Q4_0)
@@ -763,6 +790,8 @@ float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords
 #define dequantFuncA dequantFuncMXFP4
 #elif defined(DATA_A_NVFP4)
 #define dequantFuncA dequantFuncNVFP4
+#elif defined(DATA_A_TURBO3_0)
+#define dequantFuncA dequantFuncTURBO3_0
 #elif defined(DATA_A_F32)
 #define dequantFuncA dequantFuncF32
 #endif
@@ -0,0 +1,65 @@
+#version 450
+
+#include "dequant_head.glsl"
+
+// 256 threads per workgroup, each thread fully dequants one TQ4_1S block
+// (32 elements, 20 bytes). Workgroups process 256 blocks = 8192 elements each
+// so the x dispatch stays under maxComputeWorkGroupCount[0] for large tensors.
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_tq4_1s data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+// Full TQ4_1S dequantisation: one invocation decodes one 32-element block,
+// runs a 5-stage butterfly over it, then scales by 1/sqrt(32) and applies the
+// per-element sign pattern before writing 32 outputs.
+void main() {
+    // Lloyd-Max centroids for 4-bit normal quantization
+    const float centroids[16] = float[16](
+        -2.732590, -2.069017, -1.618046, -1.256231,
+        -0.942340, -0.656759, -0.388048, -0.128395,
+         0.128395,  0.388048,  0.656759,  0.942340,
+         1.256231,  1.618046,  2.069017,  2.732590
+    );
+
+    // Per-element +/-1 factors applied after the butterfly
+    const float signs[32] = float[32](
+        +1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
+        -1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
+        -1.0, -1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0,
+        -1.0, +1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0
+    );
+
+    const float INV_SQRT32 = 0.17677669529663688;
+
+    // One invocation per block; invocations past the last block do nothing.
+    const uint ib = gl_WorkGroupID.x * gl_WorkGroupSize.x + gl_LocalInvocationID.x;
+    if (ib >= p.nel / 32) return;
+
+    const float scale_lo = float(data_a[ib].d0);  // elements 0..15
+    const float scale_hi = float(data_a[ib].d1);  // elements 16..31
+
+    // Unpack byte by byte: low nibble is the even element, high nibble the odd one.
+    float vals[32];
+    for (uint b = 0u; b < 16u; ++b) {
+        const uint  packed = uint(data_a[ib].qs[b]);
+        const float scale  = (b < 8u) ? scale_lo : scale_hi;
+        vals[2u * b]      = centroids[packed & 0xFu] * scale;
+        vals[2u * b + 1u] = centroids[(packed >> 4u) & 0xFu] * scale;
+    }
+
+    // Butterfly, 5 stages. Each stage has 16 independent pairs (lo, lo + span);
+    // pair number `pr` maps to `lo` by inserting a zero bit at the span position.
+    for (uint span = 1u; span < 32u; span <<= 1u) {
+        const uint low_mask = span - 1u;
+        for (uint pr = 0u; pr < 16u; ++pr) {
+            const uint  lo   = ((pr & ~low_mask) << 1u) | (pr & low_mask);
+            const uint  hi   = lo + span;
+            const float sum  = vals[lo] + vals[hi];
+            const float diff = vals[lo] - vals[hi];
+            vals[lo] = sum;
+            vals[hi] = diff;
+        }
+    }
+
+    // Normalize and apply sign pattern
+    const uint out_base = ib * 32u;
+    for (uint e = 0u; e < 32u; ++e) {
+        data_b[out_base + e] = D_TYPE(vals[e] * INV_SQRT32 * signs[e]);
+    }
+}
@@ -0,0 +1,36 @@
+#version 450
+
+#include "dequant_head.glsl"
+
+// 128 elements per block (QK_TURBO3 = 128)
+// Each workgroup processes one block. 128 threads, 1 element per thread.
+layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_turbo3_0 data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+// TURBO3_0 dequantisation: workgroup = block, invocation = element.
+// Writes centroid * norm for each of the block's 128 elements.
+void main() {
+    const float lut[8] = float[8](
+        -0.190685, -0.117832, -0.065717, -0.021460,
+         0.021460,  0.065717,  0.117832,  0.190685
+    );
+
+    const uint blk = gl_WorkGroupID.x;
+    if (blk >= p.nel / 128) return;
+
+    const uint elem = gl_LocalInvocationID.x;  // 0..127 within the block
+
+    // 3-bit code = 2 low bits from qs (4 per byte) + 1 high bit from signs (8 per byte)
+    const uint lo   = (uint(data_a[blk].qs[elem >> 2u]) >> ((elem & 3u) << 1u)) & 0x3u;
+    const uint hi   = (uint(data_a[blk].signs[elem >> 3u]) >> (elem & 7u)) & 0x1u;
+    const uint code = (hi << 2u) | lo;
+
+    const float scale = float(data_a[blk].norm);
+    data_b[blk * 128 + elem] = D_TYPE(lut[code] * scale);
+}
@@ -35,10 +35,12 @@ const uint32_t num_subgroups = SubGroupSize == 0 ? 0 : WorkGroupSize / SubGroupS

 layout (binding = 0) readonly buffer Q {float data_q[];};
 layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];};
+#if !defined(DATA_A_TURBO3_0)
 layout (binding = 1) readonly buffer K {float16_t data_k[];};
 layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];};
 layout (binding = 2) readonly buffer V {float16_t data_v[];};
 layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
+#endif
 layout (binding = 3) readonly buffer M {float16_t data_m[];};

 // If SubGroupSize is set to 0 then only use shmem reductions
@@ -140,6 +140,35 @@ uint fa_quant_r_mmq(uint ty) {
 #define USE_DECODE_K (FaTypeK != FA_TYPE_F16)
 #define USE_DECODE_V (FaTypeV != FA_TYPE_F16)

+#if defined(DATA_A_TURBO3_0)
+const float T3C[8] = float[8](
+    -0.190685, -0.117832, -0.065717, -0.021460,
+     0.021460,  0.065717,  0.117832,  0.190685
+);
+// Flash-attention TURBO3_0 decode: returns elements iqs..iqs+3 of block
+// (a_offset + ib), read from data_k_t3 when binding_idx == BINDING_IDX_K and
+// from data_v_t3 otherwise. Each element is T3C[code] * norm.
+FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
+    const bool from_k = (binding_idx == BINDING_IDX_K);
+    const uint blk    = a_offset + ib;
+
+    // The norm is per block, so it is the same for all four elements.
+    float scale;
+    if (from_k) {
+        scale = float(data_k_t3[blk].norm);
+    } else {
+        scale = float(data_v_t3[blk].norm);
+    }
+
+    FLOAT_TYPEV4 res;
+    for (int k = 0; k < 4; k++) {
+        const uint elem = iqs + uint(k);
+        uint qbyte;
+        uint sbyte;
+        if (from_k) {
+            qbyte = uint(data_k_t3[blk].qs[elem / 4]);
+            sbyte = uint(data_k_t3[blk].signs[elem / 8]);
+        } else {
+            qbyte = uint(data_v_t3[blk].qs[elem / 4]);
+            sbyte = uint(data_v_t3[blk].signs[elem / 8]);
+        }
+        // 2 low bits from qs, 1 high bit from signs
+        const uint low2 = (qbyte >> ((elem % 4) * 2)) & 0x3;
+        const uint top1 = (sbyte >> (elem % 8)) & 0x1;
+        res[k] = FLOAT_TYPE(T3C[low2 | (top1 << 2)] * scale);
+    }
+    return res;
+}
+#endif
+
 #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))


@@ -0,0 +1,119 @@
+#version 450
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.glsl"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+// Lloyd-Max centroids for TQ4_1S (4-bit, 16 levels) — N(0, 1) optimal
+const float TQ4_CENTROIDS[16] = float[16](
+    -2.732590, -2.069017, -1.618046, -1.256231,
+    -0.942340, -0.656759, -0.388048, -0.128395,
+     0.128395,  0.388048,  0.656759,  0.942340,
+     1.256231,  1.618046,  2.069017,  2.732590
+);
+
+// WHT sign pattern for 32-element blocks (shared by TQ3 and TQ4)
+const float TQ4_SIGNS[32] = float[32](
+    +1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
+    -1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0,
+    -1.0, -1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0,
+    -1.0, +1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0
+);
+
+const float TQ4_INV_SQRT32 = 0.17677669529663688;
+
+// See the commit message on a850ccc for the full derivation and portability
+// rationale.  Short version: pre-rotate the activation block via forward WHT
+// in shared memory, then dot-product against the raw centroid*scale weights.
+//
+// Shared memory budget: NUM_COLS * 32 floats (max 1 KiB at NUM_COLS=8)
+// plus whatever tmpsh the reduction helper allocates.
+
+shared float tq4_smem[8 * 32];
+
+// Accumulates dot products of `num_rows` TQ4_1S weight rows (starting at
+// `first_row`) against NUM_COLS activation columns, then hands the partial
+// sums to reduce_result().
+//
+// Per 32-element block: the activation slice is sign-flipped and transformed
+// in shared memory (stages 1-2), each invocation decodes the weight element
+// at position `tid` for every row (stage 3), and multiplies it with the
+// transformed activation at the same position (stage 4).
+//
+// NOTE(review): `tid` indexes TQ4_SIGNS (32 entries) and the 32-wide rows of
+// tq4_smem directly, while local_size_x comes from specialization constant 0.
+// This presumably requires the pipeline to be created with a workgroup size
+// of exactly 32 — confirm on the host side.
+// Every barrier() below is reached by all invocations only if the whole
+// workgroup calls this function, which holds when `first_row` is uniform
+// across the workgroup.
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    const uint tid = gl_LocalInvocationID.x;
+
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    // Per-invocation partial sums, one per (column, row) pair.
+    FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint n = 0; n < NUM_ROWS; ++n) {
+            temp[j][n] = FLOAT_TYPE(0);
+        }
+    }
+
+    // Loop-invariant per-invocation values: which qs byte / nibble holds this
+    // invocation's element, and its sign factor.
+    const uint num_blocks_per_row = p.ncols / 32u;
+    const uint byte_idx     = tid / 2u;
+    const uint nibble_shift = (tid & 1u) * 4u;
+    const float sign_tid    = TQ4_SIGNS[tid];
+
+    for (uint blk = 0; blk < num_blocks_per_row; blk++) {
+        // --- Stage 1: load activation, sign-flip, write to shared memory ---
+        [[unroll]] for (uint c = 0; c < NUM_COLS; ++c) {
+            const uint b_base = c * p.batch_stride_b + b_offset + blk * 32u;
+            tq4_smem[c * 32u + tid] = float(data_b[b_base + tid]) * sign_tid;
+        }
+        barrier();
+
+        // --- Stage 2: forward WHT butterfly in shared memory (5 stages) ---
+        // Invocations whose `step` bit is clear own the pair (tid, tid + step);
+        // the barrier after each stage publishes the writes to the next stage.
+        [[unroll]] for (uint step = 1u; step < 32u; step <<= 1u) {
+            if ((tid & step) == 0u) {
+                const uint partner = tid + step;
+                [[unroll]] for (uint c = 0; c < NUM_COLS; ++c) {
+                    const uint base = c * 32u;
+                    const float a = tq4_smem[base + tid];
+                    const float b = tq4_smem[base + partner];
+                    tq4_smem[base + tid]     = a + b;
+                    tq4_smem[base + partner] = a - b;
+                }
+            }
+            barrier();
+        }
+
+        // --- Stage 3: dequant all rows' weights for this block position ---
+        // Pre-computing the weight for every row before touching the column
+        // accumulator lets the compiler treat the smem read in stage 4 as
+        // loop-invariant across rows, which is the Vulkan analogue of the
+        // "hot loop load dedup" optimisation in the CUDA kernel (PR #57).
+        // The 1/sqrt(32) normalisation of the transform is folded into the weight.
+        float w_vals[NUM_ROWS];
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            const uint ib  = (first_row + n) * num_blocks_per_row + blk;
+            const uint idx = (uint(data_a[a_offset + ib].qs[byte_idx]) >> nibble_shift) & 0xFu;
+            const float d  = (tid < 16u)
+                ? float(data_a[a_offset + ib].d0)
+                : float(data_a[a_offset + ib].d1);
+            w_vals[n] = TQ4_CENTROIDS[idx] * d * TQ4_INV_SQRT32;
+        }
+
+        // --- Stage 4: accumulate dot products ---
+        // Read the rotated activation once per column; reuse across all rows.
+        [[unroll]] for (uint c = 0; c < NUM_COLS; ++c) {
+            const float b_rotated = tq4_smem[c * 32u + tid];
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                temp[c][n] += FLOAT_TYPE(w_vals[n] * b_rotated);
+            }
+        }
+
+        // Ensure every thread is done reading before the next block's store.
+        barrier();
+    }
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+// Entry point: each workgroup handles NUM_ROWS consecutive output rows.
+// `first_row` depends only on workgroup IDs, so both branches below are taken
+// uniformly by the whole workgroup.
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // Full tile: pass the constant NUM_ROWS as the row count.
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        // Tile starts past the last row (bounded by p.stride_d): nothing to do.
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        // Partial tile at the end: only the remaining rows.
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
@@ -0,0 +1,67 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_control_flow_attributes : require
+
+layout (local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
+
+layout (push_constant) uniform parameter { uint ne; uint direction; uint group_size; } p;
+
+layout (binding = 0) readonly  buffer A { float data_a[]; };
+layout (binding = 1) writeonly buffer D { float data_d[]; };
+
+shared float x[128];
+
+// Pre-scramble sign vectors applied before and after the WHT.
+// direction == 0: pre = S1, post = S2; direction == 1: pre = S2, post = S1.
+const float S1[128] = float[128](
+    -1,  1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,
+     1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1, -1,
+    -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1,
+     1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1,  1,  1,  1, -1,  1,
+    -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1,  1,
+     1, -1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,  1, -1,
+    -1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,
+     1, -1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,  1,  1, -1,  1
+);
+
+const float S2[128] = float[128](
+     1,  1,  1,  1, -1,  1,  1, -1,  1, -1, -1, -1,  1, -1, -1, -1,
+     1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1,  1,
+     1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,  1, -1,
+     1, -1,  1,  1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,
+     1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1,  1,
+    -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1,
+     1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1,
+    -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1
+);
+
+const float INV_SQRT_128 = 0.08838834764831845;  // 1 / sqrt(128)
+
+// Sign-scrambled 128-point Walsh-Hadamard transform, one workgroup per
+// 128-element group, one invocation per element:
+//   out = post_sign * (1/sqrt(128)) * WHT(pre_sign * in)
+// with (pre, post) = (S1, S2) when p.direction == 0 and (S2, S1) otherwise.
+//
+// Invocations whose element index is >= p.ne do not return early: barrier()
+// must be reached by every invocation of the workgroup, so they take part in
+// the butterfly with a zero input and simply skip the global load and store.
+//
+// NOTE(review): p.group_size is declared in the push constants but never read
+// here; the transform always spans 128 elements. Confirm the host only
+// dispatches this shader for group_size == 128.
+void main() {
+    const uint tid  = gl_LocalInvocationID.x;
+    const uint base = (gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x) * 128;
+    const uint gid  = base + tid;
+
+    const bool in_range = gid < p.ne;
+
+    const float fs = (p.direction == 0) ? S1[tid] : S2[tid];
+    const float ss = (p.direction == 0) ? S2[tid] : S1[tid];
+
+    // Load and pre-scramble in one step; each invocation only touches its own
+    // slot, so a single barrier before the butterfly is sufficient.
+    float v = 0.0;
+    if (in_range) {
+        v = data_a[gid] * fs;
+    }
+    x[tid] = v;
+    barrier();
+
+    // 7-stage butterfly: the invocation with the lower index of each pair
+    // (tid, tid + h) updates both entries; the barrier publishes the stage.
+    [[unroll]] for (uint h = 1; h < 128; h *= 2) {
+        if ((tid % (2 * h)) < h) {
+            float a    = x[tid];
+            float b    = x[tid + h];
+            x[tid]     = a + b;
+            x[tid + h] = a - b;
+        }
+        barrier();
+    }
+
+    if (in_range) {
+        data_d[gid] = x[tid] * INV_SQRT_128 * ss;
+    }
+}
@@ -6,6 +6,7 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
 #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
 #extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_8bit_storage : require

 #if defined(DATA_A_F32)
 #define QUANT_K 1
@@ -1729,6 +1730,70 @@ struct block_nvfp4
 #define A_TYPE block_nvfp4
 #endif

+#define QUANT_K_TURBO3_0 128
+#define QUANT_R_TURBO3_0 1
+
+// TURBO3_0 block: 128 elements, each a 3-bit centroid index split across two
+// bit planes (qs = low 2 bits, signs = high bit), plus one fp16 norm.
+struct block_turbo3_0
+{
+    float16_t norm;     // per-block scale applied to the centroid value
+    uint8_t qs[32];     // low 2 bits of each 3-bit centroid index (4 per byte), 128/4 = 32 bytes
+    uint8_t signs[16]; // 1-bit high bit of 3-bit index (8 per byte), 128/8 = 16 bytes
+};
+
+#if defined(DATA_A_TURBO3_0)
+#define QUANT_K QUANT_K_TURBO3_0
+#define QUANT_R QUANT_R_TURBO3_0
+#define QUANT_AUXF 1
+#define A_TYPE block_turbo3_0
+#endif
+
+#define QUANT_K_TURBO2_0 128
+#define QUANT_R_TURBO2_0 1
+// TURBO2_0 block: 128 elements as 2-bit centroid indices plus one fp16 norm.
+struct block_turbo2_0
+{
+    float16_t norm;     // per-block fp16 norm
+    uint8_t qs[32];     // 2-bit centroid indices (4 per byte), 128/4 = 32 bytes
+};
+#if defined(DATA_A_TURBO2_0)
+#define QUANT_K QUANT_K_TURBO2_0
+#define QUANT_R QUANT_R_TURBO2_0
+#define QUANT_AUXF 1
+#define A_TYPE block_turbo2_0
+#endif
+
+#define QUANT_K_TURBO4_0 128
+#define QUANT_R_TURBO4_0 1
+// TURBO4_0 block: 128 elements as nibble-packed 4-bit centroid indices, with
+// two fp16 header fields (norm and a reserved rnorm).
+struct block_turbo4_0
+{
+    float16_t norm;     // per-block fp16 norm
+    float16_t rnorm;    // reserved in 4-bit mode (kept for ABI parity with legacy)
+    uint8_t qs[64];     // 4-bit centroid indices, nibble-packed (2 per byte), 128/2 = 64 bytes
+};
+#if defined(DATA_A_TURBO4_0)
+#define QUANT_K QUANT_K_TURBO4_0
+#define QUANT_R QUANT_R_TURBO4_0
+#define QUANT_AUXF 1
+#define A_TYPE block_turbo4_0
+#endif
+
+
+#define QUANT_K_TQ4_1S 32
+#define QUANT_R_TQ4_1S 1
+
+// TQ4_1S block: 32 elements as nibble-packed 4-bit centroid indices, with a
+// separate fp16 scale for each half of the block.
+struct block_tq4_1s
+{
+    float16_t d0;      // scale for elements 0-15
+    float16_t d1;      // scale for elements 16-31
+    uint8_t qs[16];    // 4-bit nibble-packed centroid indices (2 per byte)
+};
+
+#if defined(DATA_A_TQ4_1S)
+#define QUANT_K QUANT_K_TQ4_1S
+#define QUANT_R QUANT_R_TQ4_1S
+#define QUANT_AUXF 1
+#define A_TYPE block_tq4_1s
+#endif
+
 #if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS)
 const int8_t kvalues_iq4nl_const[16] = {
    int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
@@ -68,6 +68,8 @@ const std::vector<std::string> type_names = {
    "mxfp4",
    "nvfp4",
    "bf16",
+    "turbo3_0",
+    "tq4_1s",
 };

 enum MatMulIdType {
@@ -563,6 +565,11 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
        if (tname == "bf16") {
            continue;
        }
+        // TQ4_1S uses a specialized mul_mat_vec shader for small N and
+        // the dequant+f16 matmul fallback for large N. No dedicated mul_mm needed.
+        if (tname == "tq4_1s") {
+            continue;
+        }

        std::string data_a_key = "DATA_A_" + to_uppercase(tname);
        // For unaligned, load one at a time for f32/f16, or two at a time for quants
@@ -667,7 +674,7 @@ void process_shaders() {
    for (const auto& tname : type_names) {
        // mul mat vec
        std::string data_a_key = "DATA_A_" + to_uppercase(tname);
-        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
+        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_") || tname == "tq4_1s") ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";

        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPEV2", "vec2"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}}));
        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPEV2", "f16vec2"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}));
@@ -750,12 +757,19 @@ void process_shaders() {
        string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
        string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }
+    // turbo3_0 copy-from-quant only; copy-to-quant (cpy_f32_turbo3_0) omitted because the non-SET_ROWS quantize() path lacks the WHT transform
+    string_to_spv("cpy_turbo3_0_f32", "copy_from_quant.comp", {{"DATA_A_TURBO3_0", "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+    // tq4_1s copy-from-quant only; copy-to-quant requires WHT forward (handled in SET_ROWS path)
+    string_to_spv("cpy_tq4_1s_f32", "copy_from_quant.comp", {{"DATA_A_TQ4_1S", "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});

-    for (std::string t : {"f32", "f16", "bf16", "q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
+    for (std::string t : {"f32", "f16", "bf16", "q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl", "turbo2_0", "turbo3_0", "turbo4_0", "tq4_1s"}) {
        string_to_spv("set_rows_" + t + "_i32", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uint"}, {"B_SIZE", "32"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
        string_to_spv("set_rows_" + t + "_i64", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"B_SIZE", "64"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }

+    // TurboQuant WHT operation
+    string_to_spv("turbo_wht", "turbo_wht.comp", {});
+
    auto get_type_str = [](bool f16) {
        return f16 ? "float16_t" : "float";
    };
@@ -749,6 +749,46 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
        .to_float                 = (ggml_to_float_t) dequantize_row_nvfp4,
        .from_float_ref           = (ggml_from_float_t)quantize_row_nvfp4_ref,
    },
+    [GGML_TYPE_TURBO3_0] = {
+        .type_name                = "turbo3",
+        .blck_size                = QK_TURBO3,
+        .type_size                = sizeof(block_turbo3_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_turbo3_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_turbo3_0_ref,
+    },
+    [GGML_TYPE_TURBO4_0] = {
+        .type_name                = "turbo4",
+        .blck_size                = QK_TURBO4,
+        .type_size                = sizeof(block_turbo4_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_turbo4_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_turbo4_0_ref,
+    },
+    [GGML_TYPE_TURBO2_0] = {
+        .type_name                = "turbo2",
+        .blck_size                = QK_TURBO2,
+        .type_size                = sizeof(block_turbo2_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_turbo2_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_turbo2_0_ref,
+    },
+    [GGML_TYPE_TQ3_1S] = {
+        .type_name                = "tq3_1s",
+        .blck_size                = QK_TQ3_0,
+        .type_size                = sizeof(block_tq3_1s),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_tq3_1s,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_tq3_1s_ref,
+    },
+    [GGML_TYPE_TQ4_1S] = {
+        .type_name                = "tq4_1s",
+        .blck_size                = QK_TQ4_1S,
+        .type_size                = sizeof(block_tq4_1s),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_tq4_1s,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_tq4_1s_ref,
+    },
    [GGML_TYPE_Q2_K] = {
        .type_name                = "q2_K",
        .blck_size                = QK_K,
@@ -1063,6 +1103,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "RWKV_WKV7",
    "SOLVE_TRI",
    "GATED_DELTA_NET",
+    "TURBO_WHT",

    "UNARY",

@@ -1080,7 +1121,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "GLU",
 };

-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@@ -1173,6 +1214,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "rwkv_wkv7(r, w, k, v, a, b, s)",
    "A X = B, A triangular, solve X",
    "gated_delta_net(q, k, v, g, beta, s)",
+    "turbo_wht(a)",

    "unary(x)",

@@ -1190,7 +1232,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "glu(x)",
 };

-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -6230,6 +6272,38 @@ struct ggml_tensor * ggml_gated_delta_net(
    return result;
 }

+// ggml_turbo_wht
+
+// Builds a GGML_OP_TURBO_WHT node over `a`.
+//
+//   a          contiguous F32 tensor; ne[0] must be a multiple of the group size
+//   direction  0 or 1, stored verbatim in op_params
+//   group_size 32, 64 or 128; 0 selects 128 when ne[0] is a multiple of 128,
+//              otherwise 64
+//   scale      optional tensor stored as src[1] (NULL = no scaling)
+//
+// Returns a new F32 tensor with the same shape as `a`.
+struct ggml_tensor * ggml_turbo_wht(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   direction,
+        int                   group_size,
+        struct ggml_tensor  * scale) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    GGML_ASSERT(direction == 0 || direction == 1);
+
+    // group_size == 0 means "pick from the row length"
+    if (group_size == 0) {
+        if (a->ne[0] % 128 == 0) {
+            group_size = 128;
+        } else {
+            group_size = 64;
+        }
+    }
+    GGML_ASSERT(group_size == 32 || group_size == 64 || group_size == 128);
+    GGML_ASSERT(a->ne[0] % group_size == 0);
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne);
+
+    // Parameters consumed by the backends.
+    // NOTE(review): `op_params + sizeof(int)` is pointer arithmetic in units of
+    // op_params' element type, not bytes. If op_params is an int32_t array this
+    // stores group_size in element 4 rather than element 1 — confirm that every
+    // reader uses the same expression before changing it.
+    memcpy(result->op_params + 0, &direction, sizeof(int));
+    memcpy(result->op_params + sizeof(int), &group_size, sizeof(int));
+
+    result->op     = GGML_OP_TURBO_WHT;
+    result->src[0] = a;
+    result->src[1] = scale;  // InnerQ scale_inv (NULL = no scaling)
+
+    return result;
+}
+
 ////////////////////////////////////////////////////////////////////////////////

 struct ggml_hash_set ggml_hash_set_new(size_t size) {
@@ -7714,6 +7788,11 @@ size_t ggml_quantize_chunk(
        case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_TURBO3_0: result = quantize_turbo3_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_TURBO4_0: result = quantize_turbo4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_TURBO2_0: result = quantize_turbo2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_TQ3_1S:  result = quantize_tq3_1s(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_TQ4_1S:  result = quantize_tq4_1s(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_F16:
            {
                size_t elemsize = sizeof(ggml_fp16_t);
@@ -4142,6 +4142,8 @@ class GGMLQuantizationType(IntEnum):
    MXFP4   = 39
    NVFP4   = 40
    Q1_0    = 41
+    TQ3_1S  = 45
+    TQ4_1S  = 46


 class ExpertGatingFuncType(IntEnum):
@@ -4196,6 +4198,8 @@ class LlamaFileType(IntEnum):
    MOSTLY_MXFP4_MOE     = 38  # except 1d tensors
    MOSTLY_NVFP4         = 39  # except 1d tensors
    MOSTLY_Q1_0          = 40  # except 1d tensors
+    MOSTLY_TQ3_1S        = 43  # except 1d tensors
+    MOSTLY_TQ4_1S        = 44  # except 1d tensors

    GUESSED              = 1024  # not specified in the model file

@@ -4317,6 +4321,8 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.MXFP4:   (32, 1 + 16),
    GGMLQuantizationType.NVFP4:   (64, 4 + 32),
    GGMLQuantizationType.Q1_0:    (128, 2 + 16),
+    GGMLQuantizationType.TQ3_1S:  (32, 2 + 2 + 12),
+    GGMLQuantizationType.TQ4_1S:  (32, 2 + 2 + 16),
 }


@@ -155,6 +155,8 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_MXFP4_MOE     = 38, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_NVFP4         = 39, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q1_0          = 40, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_TQ3_1S        = 43, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_TQ4_1S        = 44, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# build.sh — Build CUDA target. Exits non-zero on failure.
+set -euo pipefail
+REPO_DIR="$(cd "$(dirname "$0")/../.." && pwd)"
+cd "$REPO_DIR"
+
+# Configure if build dir doesn't exist
+if [[ ! -d build-cuda ]]; then
+    cmake -B build-cuda -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="89;120" \
+        -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF 2>&1
+fi
+
+cmake --build build-cuda -j$(nproc) 2>&1
@@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+# run_experiment.sh — Build, benchmark, and validate a CUDA kernel experiment.
+# Outputs structured JSON to stdout. All logging goes to stderr.
+#
+# Usage: run_experiment.sh <track-name> [--quick]
+#   --quick: skip PPL validation (speed-only iteration)
+
+set -euo pipefail
+
+TRACK="${1:?Usage: run_experiment.sh <track-name> [--quick]}"
+QUICK=false
+[[ "${2:-}" == "--quick" ]] && QUICK=true
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
+TRACK_DIR="$SCRIPT_DIR/$TRACK"
+
+if [[ ! -f "$TRACK_DIR/baseline.json" ]]; then
+    # Build the error object with jq so the track name is JSON-escaped; plain
+    # string concatenation yields invalid JSON for names containing quotes or
+    # backslashes.
+    jq -n \
+        --arg status "error" \
+        --arg error "No baseline.json found for track: $TRACK" \
+        '{status: $status, error: $error}'
+    exit 1
+fi
+
+# Load track config from baseline.json
+BENCH_ARGS=$(jq -r '.bench_args // ""' "$TRACK_DIR/baseline.json")
+MODEL=$(jq -r '.model' "$TRACK_DIR/baseline.json")
+NO_CONVERT=$(jq -r '.no_convert // false' "$TRACK_DIR/baseline.json")
+
+# For tracks that benchmark the TQ4_1S runtime kernel, disable load-time conversion
+BENCH_ENV=""
+if [[ "$NO_CONVERT" == "true" ]]; then
+    BENCH_ENV="GGML_TQ_NO_CONVERT=1"
+fi
+
+# Perplexity gate settings: the check is skipped later when ppl is 0 or
+# ppl_file is empty.
+PPL_BASELINE=$(jq -r '.ppl // 0' "$TRACK_DIR/baseline.json")
+PPL_THRESHOLD=$(jq -r '.ppl_threshold // 0.1' "$TRACK_DIR/baseline.json")
+PPL_FILE=$(jq -r '.ppl_file // ""' "$TRACK_DIR/baseline.json")
+
+# ---- Phase 1: Build ----
+echo ">>> Building..." >&2
+BUILD_START=$(date +%s)
+
+BUILD_OUTPUT=$("$REPO_DIR/scripts/autoresearch/build.sh" 2>&1) || {
+    BUILD_END=$(date +%s)
+    # Extract last 20 lines of error for the agent
+    ERROR=$(echo "$BUILD_OUTPUT" | grep "error:" | head -10)
+    jq -n \
+        --arg status "build_failed" \
+        --arg error "$ERROR" \
+        --argjson build_time_s $((BUILD_END - BUILD_START)) \
+        '{status: $status, error: $error, build_time_s: $build_time_s}'
+    exit 0
+}
+BUILD_END=$(date +%s)
+echo ">>> Build OK ($(( BUILD_END - BUILD_START ))s)" >&2
+
+# ---- Phase 2: Benchmark ----
+# A crash or timeout is reported as JSON on stdout with exit status 0.
+echo ">>> Benchmarking..." >&2
+BENCH_START=$(date +%s)
+
+# BENCH_ENV and BENCH_ARGS are deliberately unquoted: each expands to zero or
+# more words.
+BENCH_OUTPUT=$(timeout 120 env $BENCH_ENV "$REPO_DIR/build-cuda/bin/llama-bench" \
+    -m "$MODEL" -ngl 99 $BENCH_ARGS -r 3 2>&1) || {
+    BENCH_END=$(date +%s)
+    # `|| true`: with `set -e` + `pipefail`, a grep that matches nothing would
+    # abort the script before the JSON below is emitted, so the
+    # "unknown crash" default would never be reached.
+    ERROR=$(grep -iE "error|crash|abort|signal" <<<"$BENCH_OUTPUT" | head -5 || true)
+    jq -n \
+        --arg status "runtime_crash" \
+        --arg error "${ERROR:-unknown crash}" \
+        --argjson build_time_s $((BUILD_END - BUILD_START)) \
+        --argjson bench_time_s $((BENCH_END - BENCH_START)) \
+        '{status: $status, error: $error, build_time_s: $build_time_s, bench_time_s: $bench_time_s}'
+    exit 0
+}
+BENCH_END=$(date +%s)
+
+# Parse t/s from llama-bench output
+# Format: "| model | size | params | backend | ngl | test | t/s ± err |"
+# The t/s value is before the "±" in the last data column
+
+# Table rows on stdin with the header and separator rows removed.
+bench_rows() {
+    grep -E "^\|" | grep -v "model" | grep -v "\-\-\-"
+}
+
+# First "<number> ±" value found on stdin.
+first_ts() {
+    grep -oP '\d+\.\d+\s*±' | grep -oP '[\d.]+' | head -1
+}
+
+# First t/s value of the first data row of a full llama-bench output.
+parse_ts() {
+    bench_rows | first_ts
+}
+
+# Every extraction ends in `|| true`: an unparsable output must leave the
+# variable empty (handled by the ${VAR:-...} defaults further down) instead of
+# terminating the script through `set -e` + `pipefail`.
+TG128=$(echo "$BENCH_OUTPUT" | parse_ts) || true
+DATA_ROWS=$(echo "$BENCH_OUTPUT" | bench_rows) || true
+BENCH_LINES=$(grep -c . <<<"$DATA_ROWS") || true
+
+# For tracks with both pp and tg, there will be two data rows:
+# the first is prompt processing, the last is token generation.
+if (( BENCH_LINES > 1 )); then
+    PP512=$(head -1 <<<"$DATA_ROWS" | first_ts) || true
+    TG128=$(tail -1 <<<"$DATA_ROWS" | first_ts) || true
+else
+    PP512=""
+fi
+
+echo ">>> Benchmark: tg=${TG128:-?} pp=${PP512:-n/a}" >&2
+
+# ---- Phase 3: Correctness (unless --quick) ----
+# Runs llama-perplexity and reports "ppl_regression" when the measured PPL
+# exceeds the baseline by more than PPL_THRESHOLD.
+PPL=""
+if [[ "$QUICK" == "false" && -n "$PPL_FILE" && -n "$PPL_BASELINE" && "$PPL_BASELINE" != "0" ]]; then
+    echo ">>> PPL check..." >&2
+    PPL_OUTPUT=$(timeout 180 env $BENCH_ENV "$REPO_DIR/build-cuda/bin/llama-perplexity" \
+        -m "$MODEL" -f "$PPL_FILE" -ngl 99 --chunks 10 2>&1) || true
+    # `|| true`: when the run timed out or crashed there is no "Final estimate"
+    # line; without it the failing grep would abort the script under
+    # `set -e` + `pipefail` instead of reaching the empty-PPL handling below.
+    PPL=$(grep "Final estimate" <<<"$PPL_OUTPUT" | grep -oP 'PPL = \K[0-9.]+' | head -1) || true
+
+    if [[ -n "$PPL" ]]; then
+        PPL_DELTA=$(echo "$PPL - $PPL_BASELINE" | bc -l 2>/dev/null || echo "999")
+        if (( $(echo "$PPL_DELTA > $PPL_THRESHOLD" | bc -l 2>/dev/null || echo 1) )); then
+            echo ">>> PPL regression: $PPL (baseline: $PPL_BASELINE, delta: $PPL_DELTA)" >&2
+            jq -n \
+                --arg status "ppl_regression" \
+                --argjson ppl "${PPL}" \
+                --argjson ppl_baseline "${PPL_BASELINE}" \
+                --argjson build_time_s $((BUILD_END - BUILD_START)) \
+                --argjson bench_time_s $((BENCH_END - BENCH_START)) \
+                --arg tg128 "${TG128:-0}" \
+                '{status: $status, tg128: ($tg128|tonumber), ppl: $ppl, ppl_baseline: $ppl_baseline, build_time_s: $build_time_s, bench_time_s: $bench_time_s}'
+            exit 0
+        fi
+        echo ">>> PPL OK: $PPL (baseline: $PPL_BASELINE)" >&2
+    else
+        echo ">>> PPL check inconclusive: no \"Final estimate\" line in llama-perplexity output" >&2
+    fi
+fi
+
+# ---- Phase 4: Compute delta and output ----
+BASELINE_TG=$(jq -r '.tg128 // 0' "$TRACK_DIR/baseline.json")
+if [[ -n "$TG128" && "$BASELINE_TG" != "0" ]]; then
+    # Multiply by 100 BEFORE dividing: bc truncates every division to `scale`
+    # digits, so "(a - b) / b * 100" with scale=1 can only yield multiples of
+    # 10 (a 5% gain would be reported as 0).
+    DELTA_PCT=$(echo "scale=1; ($TG128 - $BASELINE_TG) * 100 / $BASELINE_TG" | bc -l 2>/dev/null || echo "0")
+    # bc may print nothing (e.g. on a runtime error); keep the field well-formed.
+    DELTA_PCT="${DELTA_PCT:-0}"
+else
+    DELTA_PCT="0"
+fi
+
+# GPU temperature (first GPU only; empty when nvidia-smi is unavailable)
+GPU_TEMP=$(nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "")
+
+# pp512 and ppl are only included in the result when they were measured.
+jq -n \
+    --arg status "success" \
+    --arg tg128 "${TG128:-0}" \
+    --arg pp512 "${PP512:-}" \
+    --arg ppl "${PPL:-}" \
+    --argjson baseline_tg128 "${BASELINE_TG}" \
+    --arg delta_pct "${DELTA_PCT}%" \
+    --argjson build_time_s $((BUILD_END - BUILD_START)) \
+    --argjson bench_time_s $((BENCH_END - BENCH_START)) \
+    --arg gpu_temp_c "${GPU_TEMP}" \
+    '{status: $status, tg128: ($tg128|tonumber), baseline_tg128: $baseline_tg128, delta_pct: $delta_pct, build_time_s: $build_time_s, bench_time_s: $bench_time_s, gpu_temp_c: $gpu_temp_c} + (if $pp512 != "" then {pp512: ($pp512|tonumber)} else {} end) + (if $ppl != "" then {ppl: ($ppl|tonumber)} else {} end)'
@@ -0,0 +1,249 @@
+#!/usr/bin/env bash
+# run_track.sh — Outer loop: invoke AI agent to optimize a CUDA kernel track.
+#
+# Usage: run_track.sh <track-name> --experiments <N> [--quick-until-improvement] [--max-hours <H>]
+
+set -uo pipefail
+# Note: NOT using set -e — we handle errors explicitly to keep the loop running
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+# ---- Parse args ----
+TRACK="${1:?Usage: run_track.sh <track-name> --experiments <N>}"
+shift
+EXPERIMENTS=10
+QUICK_UNTIL_IMPROVEMENT=false
+MAX_HOURS=8
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --experiments) EXPERIMENTS="$2"; shift 2 ;;
+        --quick-until-improvement) QUICK_UNTIL_IMPROVEMENT=true; shift ;;
+        --max-hours) MAX_HOURS="$2"; shift 2 ;;
+        *) echo "Unknown arg: $1" >&2; exit 1 ;;
+    esac
+done
+
+TRACK_DIR="$SCRIPT_DIR/$TRACK"
+if [[ ! -f "$TRACK_DIR/program.md" ]]; then
+    echo "Error: $TRACK_DIR/program.md not found" >&2
+    exit 1
+fi
+
+TARGET_FILE=$(jq -r '.target_file' "$TRACK_DIR/baseline.json")
+HISTORY_FILE="$TRACK_DIR/history.jsonl"
+BASELINE_FILE="$TRACK_DIR/baseline.json"
+START_TIME=$(date +%s)
+MAX_SECONDS=$((MAX_HOURS * 3600))
+
+# ---- Create experiment branch ----
+BRANCH_NAME="autoresearch/$TRACK/$(date +%Y%m%d-%H%M%S)"
+cd "$REPO_DIR"
+git checkout -b "$BRANCH_NAME" 2>/dev/null || true
+BASELINE_SHA=$(git rev-parse HEAD)
+echo ">>> Branch: $BRANCH_NAME (base: ${BASELINE_SHA:0:8})" >&2
+
+# ---- Establish baseline: one run_experiment.sh pass (no --quick flag) before the agent edits anything; abort unless it reports "success" ----
+echo ">>> Establishing baseline..." >&2
+BASELINE_RESULT=$("$SCRIPT_DIR/run_experiment.sh" "$TRACK")
+echo "$BASELINE_RESULT" | jq . >&2
+
+BASELINE_STATUS=$(echo "$BASELINE_RESULT" | jq -r '.status')
+if [[ "$BASELINE_STATUS" != "success" ]]; then
+    echo "Error: baseline benchmark failed: $BASELINE_STATUS" >&2
+    exit 1
+fi
+
+# Write the freshly measured tg128 (and ppl, when the run reported one) back into baseline.json via a temp file + mv
+BASELINE_TG=$(echo "$BASELINE_RESULT" | jq '.tg128')
+BASELINE_PPL=$(echo "$BASELINE_RESULT" | jq '.ppl // empty')
+jq --argjson tg "$BASELINE_TG" '.tg128 = $tg' "$BASELINE_FILE" > "$BASELINE_FILE.tmp" && mv "$BASELINE_FILE.tmp" "$BASELINE_FILE"
+if [[ -n "${BASELINE_PPL:-}" ]]; then
+    jq --argjson ppl "$BASELINE_PPL" '.ppl = $ppl' "$BASELINE_FILE" > "$BASELINE_FILE.tmp" && mv "$BASELINE_FILE.tmp" "$BASELINE_FILE"
+fi
+
+echo ">>> Baseline: ${BASELINE_TG} t/s" >&2
+
+# ---- Counters: failure / no-improvement streaks, kept / reverted totals, and the best tg128 seen so far ----
+CONSECUTIVE_FAILURES=0
+CONSECUTIVE_NO_IMPROVEMENT=0
+TOTAL_KEPT=0
+TOTAL_REVERTED=0
+BEST_TG=$BASELINE_TG
+
+# ---- Experiment loop ----
+for i in $(seq 1 "$EXPERIMENTS"); do
+    ELAPSED=$(( $(date +%s) - START_TIME ))
+    if (( ELAPSED > MAX_SECONDS )); then
+        echo ">>> Time limit reached (${MAX_HOURS}h). Stopping." >&2
+        break
+    fi
+
+    echo "" >&2
+    echo "================================================================" >&2
+    echo ">>> Experiment $i / $EXPERIMENTS" >&2
+    echo "================================================================" >&2
+
+    # Determine if we should skip PPL this round: --quick is passed on only when --quick-until-improvement was given
+    QUICK_FLAG=""
+    if [[ "$QUICK_UNTIL_IMPROVEMENT" == "true" ]]; then
+        QUICK_FLAG="--quick"
+    fi
+    # Every 10th experiment always runs PPL (the quick flag is cleared for that round)
+    if (( i % 10 == 0 )); then
+        QUICK_FLAG=""
+    fi
+
+    # Build the agent prompt: feed back the most recent history.jsonl entry, if the file exists
+    LAST_RESULT=""
+    if [[ -f "$HISTORY_FILE" ]]; then
+        LAST_RESULT=$(tail -1 "$HISTORY_FILE" 2>/dev/null || echo "")
+    fi
+    # After 5 or more non-improving experiments in a row, add a "change approach" hint and restart the streak counter
+    STALL_HINT=""
+    if (( CONSECUTIVE_NO_IMPROVEMENT >= 5 )); then
+        STALL_HINT="IMPORTANT: The last $CONSECUTIVE_NO_IMPROVEMENT experiments showed no improvement. Try a fundamentally different approach — different algorithm, different memory access pattern, different thread mapping."
+        CONSECUTIVE_NO_IMPROVEMENT=0
+    fi
+
+    AGENT_PROMPT="You are optimizing a CUDA kernel. Read the program file and make ONE modification to improve performance.
+
+Read: $TRACK_DIR/program.md
+Read: $REPO_DIR/$TARGET_FILE
+
+$(if [[ -n "$LAST_RESULT" ]]; then echo "Last experiment result: $LAST_RESULT"; fi)
+$(if [[ -n "$STALL_HINT" ]]; then echo "$STALL_HINT"; fi)
+
+Current best: ${BEST_TG} t/s (baseline: ${BASELINE_TG} t/s)
+
+Rules:
+- Modify ONLY: $TARGET_FILE
+- Make exactly ONE conceptual change per experiment
+- Do NOT run any builds, benchmarks, or git commands
+- After editing, briefly describe what you changed and why (one line to stderr)"
+
+    # Invoke the agent with --allowedTools Read,Edit and --max-turns 20. Its stdout+stderr are merged and mirrored to our stderr (tee); the tail output is discarded, and `|| true` keeps the loop going if the pipeline fails.
+    echo ">>> Invoking agent..." >&2
+    claude -p "$AGENT_PROMPT" \
+        --allowedTools Read,Edit \
+        --max-turns 20 \
+        2>&1 | tee /dev/stderr | tail -1 > /dev/null || true
+
+    # Safety check first, regardless of whether the target changed: the agent may only touch $TARGET_FILE.
+    # (Previously non-target edits were only noticed when the target was modified too; otherwise they were logged as "no_change" and left in the tree.)
+    OTHER_CHANGES=$(git diff --name-only -- '*.cu' '*.cuh' '*.cpp' '*.h' '*.c' | grep -vF "$TARGET_FILE" | head -5)
+    if [[ -n "$OTHER_CHANGES" ]]; then
+        echo ">>> SAFETY: Agent modified non-target files: $OTHER_CHANGES — reverting all" >&2
+        # Revert the whole work tree EXCEPT this track's directory: a bare `git checkout -- .` would also throw away
+        # the uncommitted baseline.json / history.jsonl updates this script writes there (both live inside the repo).
+        git checkout -- . ":(exclude)${TRACK_DIR#"$REPO_DIR"/}"
+        RESULT='{"status": "safety_revert", "error": "modified non-target files"}'
+    elif ! git diff --quiet -- "$TARGET_FILE" 2>/dev/null; then
+        # Only the target file changed: build and benchmark it ($QUICK_FLAG is deliberately unquoted so an empty value adds no argument)
+        echo ">>> Running experiment..." >&2
+        RESULT=$("$SCRIPT_DIR/run_experiment.sh" "$TRACK" $QUICK_FLAG)
+        echo "$RESULT" | jq . >&2
+    else
+        echo ">>> Agent made no changes. Skipping." >&2
+        RESULT='{"status": "no_change"}'
+    fi
+
+    STATUS=$(echo "$RESULT" | jq -r '.status')
+    TG=$(echo "$RESULT" | jq -r '.tg128 // 0')
+
+    # Decide: keep or revert. An experiment is kept only if it benchmarks faster than BEST_TG and, when PPL was skipped, a full re-run succeeds.
+    KEPT=false
+    if [[ "$STATUS" == "success" ]]; then
+        # Check if it's actually faster (bc prints 1 when the comparison holds)
+        IMPROVEMENT=$(echo "$TG > $BEST_TG" | bc -l 2>/dev/null || echo 0)
+        if [[ "$IMPROVEMENT" == "1" ]]; then
+            # Speed improvement found. If we were in quick mode, validate PPL now.
+            if [[ -n "$QUICK_FLAG" && "$QUICK_UNTIL_IMPROVEMENT" == "true" ]]; then
+                echo ">>> Speed improvement found (+$(echo "$RESULT" | jq -r '.delta_pct')). Validating PPL..." >&2
+                PPL_RESULT=$("$SCRIPT_DIR/run_experiment.sh" "$TRACK")
+                PPL_STATUS=$(echo "$PPL_RESULT" | jq -r '.status')
+                if [[ "$PPL_STATUS" == "success" ]]; then
+                    KEPT=true
+                else
+                    # Only a clean pass clears the gate: ppl_regression, a crash, a build error or empty output all revert
+                    echo ">>> PPL validation failed (${PPL_STATUS:-no result}) — reverting despite speed gain" >&2
+                    git checkout -- "$TARGET_FILE"
+                    STATUS="${PPL_STATUS:-ppl_check_failed}"
+                    [[ -n "$PPL_RESULT" ]] && RESULT="$PPL_RESULT"
+                fi
+            else
+                KEPT=true
+            fi
+
+            if [[ "$KEPT" == "true" ]]; then
+                BEST_TG="$TG"
+                git add "$TARGET_FILE"
+                # Commit only the target file; baseline.json is updated on disk below but not committed here
+                git commit -m "autoresearch($TRACK): +$(echo "$RESULT" | jq -r '.delta_pct') tg128 (experiment $i)" --no-verify
+                # Update baseline
+                jq --argjson tg "$TG" '.tg128 = $tg' "$BASELINE_FILE" > "$BASELINE_FILE.tmp" && mv "$BASELINE_FILE.tmp" "$BASELINE_FILE"
+                PPL_VAL=$(echo "$RESULT" | jq '.ppl // empty')
+                if [[ -n "${PPL_VAL:-}" ]]; then
+                    jq --argjson ppl "$PPL_VAL" '.ppl = $ppl' "$BASELINE_FILE" > "$BASELINE_FILE.tmp" && mv "$BASELINE_FILE.tmp" "$BASELINE_FILE"
+                fi
+                TOTAL_KEPT=$((TOTAL_KEPT + 1))
+                CONSECUTIVE_NO_IMPROVEMENT=0
+                CONSECUTIVE_FAILURES=0
+                echo ">>> KEPT: ${TG} t/s (+$(echo "$RESULT" | jq -r '.delta_pct'))" >&2
+            fi
+        else
+            echo ">>> No improvement (${TG} vs best ${BEST_TG}). Reverting." >&2
+            git checkout -- "$TARGET_FILE"
+            CONSECUTIVE_NO_IMPROVEMENT=$((CONSECUTIVE_NO_IMPROVEMENT + 1))
+        fi
+    elif [[ "$STATUS" == "build_failed" ]]; then
+        echo ">>> Build failed. Reverting." >&2
+        git checkout -- "$TARGET_FILE"
+        CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1))
+    elif [[ "$STATUS" == "runtime_crash" ]]; then
+        echo ">>> Runtime crash. Reverting." >&2
+        git checkout -- "$TARGET_FILE"
+        CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1))
+    else
+        echo ">>> Status: $STATUS. Reverting." >&2
+        git checkout -- "$TARGET_FILE"
+    fi
+
+    if [[ "$KEPT" == "false" ]]; then
+        TOTAL_REVERTED=$((TOTAL_REVERTED + 1))
+    fi
+
+    # Log to history: append the result as one compact JSON line, tagged with the experiment number, a UTC timestamp and the kept flag
+    TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
+    echo "$RESULT" | jq -c --arg exp "$i" --arg ts "$TIMESTAMP" --argjson kept "$KEPT" \
+        '. + {experiment: ($exp|tonumber), timestamp: $ts, kept: $kept}' >> "$HISTORY_FILE"
+
+    # Safety: too many consecutive failures (only build_failed and runtime_crash results advance this counter)
+    if (( CONSECUTIVE_FAILURES >= 3 )); then
+        echo ">>> WARNING: 3 consecutive failures. Pausing for review." >&2
+        echo ">>> Last error: $(echo "$RESULT" | jq -r '.error // .status')" >&2
+        CONSECUTIVE_FAILURES=0
+        # Don't exit — just reset counter and let the stall hint kick in
+    fi
+
+    # GPU cooldown
+    sleep 5
+done
+
+# ---- Summary ----
+echo "" >&2
+echo "================================================================" >&2
+echo ">>> AUTORESEARCH COMPLETE" >&2
+echo ">>> Track: $TRACK" >&2
+echo ">>> Experiments: $((TOTAL_KEPT + TOTAL_REVERTED))" >&2
+echo ">>> Kept: $TOTAL_KEPT" >&2
+echo ">>> Reverted: $TOTAL_REVERTED" >&2
+echo ">>> Baseline: ${BASELINE_TG} t/s → Best: ${BEST_TG} t/s" >&2
+if [[ "$BEST_TG" != "$BASELINE_TG" ]]; then
+    TOTAL_GAIN=$(echo "scale=1; ($BEST_TG - $BASELINE_TG) / $BASELINE_TG * 100" | bc -l 2>/dev/null || echo "?")
+    echo ">>> Total improvement: +${TOTAL_GAIN}%" >&2
+fi
+echo ">>> Branch: $BRANCH_NAME" >&2
+echo ">>> History: $HISTORY_FILE" >&2
+echo "================================================================" >&2
@@ -0,0 +1,10 @@
+{
+  "model": "/tmp/qwen2.5-7b-instruct-q4_0.gguf",
+  "bench_args": "-p 512 -n 128 --cache-type-k turbo3 --cache-type-v turbo3",
+  "tg128": 236.0,
+  "pp512": 15000,
+  "ppl": 0,
+  "ppl_threshold": 0.1,
+  "ppl_file": "/mnt/ai/data/wikitext-2-raw/wiki.test.raw",
+  "target_file": "ggml/src/ggml-cuda/fattn-vec.cuh"
+}
@@ -0,0 +1,102 @@
+# TurboQuant KV Cache Kernel Optimization
+
+## Goal
+Maximize decode t/s for TurboQuant KV cache types (turbo2, turbo3, turbo4) on CUDA
+(RTX 5090, Blackwell sm_120). Focus on the VEC flash attention decode kernel
+(`fattn-vec.cuh`) which dominates decode-time compute.
+
+Current baseline: ~187 t/s with turbo3 KV on Qwen3.5-35B-A3B (Q4_K_M weights).
+Target: close the gap to q8_0 KV (~200+ t/s).
+
+## Target File
+`ggml/src/ggml-cuda/fattn-vec.cuh` — ONLY modify this file.
+
+## Model & Benchmark
+- Model: `/mnt/ai/models/huggingface/qwen3.5-35b-a3b-GGUF/Qwen_Qwen3.5-35B-A3B-Q4_K_M.gguf`
+- Benchmark: `llama-bench -ngl 99 -p 512 -n 128 -r 3 --cache-type-k turbo3 --cache-type-v turbo3`
+- Correctness: PPL must stay within 0.1 of baseline
+- Also test: `--cache-type-k turbo4 --cache-type-v turbo4` and `--cache-type-k turbo2 --cache-type-v turbo2`
+
+## Architecture Overview
+TurboQuant KV cache compresses K and V tensors using PolarQuant (WHT rotation +
+Lloyd-Max quantization). Block size = 128, with norm + 2/3/4-bit quantized values.
+
+### VEC Flash Attention Decode Kernel
+The VEC kernel handles single-token decode (n_tokens ≤ 2). Each warp computes
+attention for one head. The kernel has two main phases:
+
+**KQ scoring (Q × K^T):**
+- Q is pre-rotated and quantized to q8_1 format
+- K is stored in turbo format (128-element blocks with norms + quantized values)
+- Uses shared-memory LUT: precompute Q×centroid products, then score via LUT lookup
+- turbo3: 8-entry LUT per Q block; turbo2: 4-entry LUT
+
+**V aggregation (softmax(KQ) × V):**
+- V is stored in turbo format
+- Dequant V values, multiply by attention weight, accumulate
+- Sparse V optimization: skip dequant for negligible attention weights
+
+### Key Performance Features Already Implemented
+- Shared-memory Q×centroid LUT (eliminates multiply in KQ inner loop)
+- q8_1 Q quantization path (int8 Q values for turbo KQ scoring)
+- __expf fast-math softmax
+- L2 prefetch for K+V blocks
+- Sparse V thresholds (skip V dequant for low attention weights)
+- __launch_bounds__ occupancy 3
+- nthreads_KQ=8 for turbo types
+
+## Already Tried — Do NOT Re-explore
+| Approach | Result | Why it failed |
+|----------|--------|--------------|
+| Larger LUT (16-entry for turbo3) | No improvement | 8-entry already covers 3-bit |
+| Different occupancy (1, 2, 4) | 3 is optimal | Lower occupancy = less latency hiding |
+| V dequant loop unroll | No improvement | Compiler already unrolling |
+| `expf` → `__expf` fast-math | Already applied | +0.1%, already in current code |
+| Sparse V threshold tuning | Already at 1e-3 | Hill-climbed 1e-6→1e-4→5e-4→1e-3→2e-3, diminishing returns. Do NOT keep bumping this — higher thresholds risk PPL regression at long context. The current value is already aggressive. |
+| L2 prefetch for next K/V blocks | +0.1% | Already tried, marginal gain |
+| L1 vs L2 prefetch | No difference | Tried both, within noise |
+| `__launch_bounds__` occupancy 1→2→3 | Occupancy 3 marginally best | Already applied |
+
+## Promising Directions to Explore
+Focus on STRUCTURAL changes to the kernel, not parameter tuning.
+
+### From community discussion (ggml-org/llama.cpp#20969)
+
+- **Fused K tile loader (dusterbloom/Madreag approach)**: Keep K in compressed TBQ3
+  format in the MMA kernel, fuse dequant into the tile loader. Zero temp buffer for K.
+  This is how Madreag's optimized fork achieves near-parity with q8_0 on prefill.
+- **cp.async pipeline for V tiles**: Bulk dequant V → fp16, then use cp.async.cg
+  for V tile loads into shared memory. Overlaps V dequant with K scoring compute.
+- **Hybrid prefill architecture**: Different code paths for prefill (MMA with fused
+  tile loaders) vs decode (VEC with current approach). Prefill benefits most from
+  tile-level fusion.
+- **Precomputed scaled centroids per V block**: Instead of `centroid[idx] * norm`
+  per element, precompute `scaled_centroid[idx] = centroid[idx] * norm` once per
+  block (4 or 8 entries × 1 float each). Eliminates one multiply per V element.
+- **Cross-head WHT (AmesianX)**: For models with head_dim=64, apply WHT across
+  multiple KV heads via Kronecker decomposition (H_512 = H_8 ⊗ H_64). Claims
+  better decorrelation for small head dims.
+
+### Kernel-level ideas
+
+- **KQ scoring with dp4a**: Q is already q8_1. If K centroids can be mapped to
+  int8 per-block (like we proved with TQ4_0), dp4a for KQ dot product.
+- **Warp specialization**: Dedicate some warps to K prefetch, others to V prefetch.
+- **Double buffering**: Prefetch next KV block while processing current one
+  using cp.async or separate warp.
+- **Register pressure reduction**: Profile register usage, reduce if spilling.
+- **Shared memory V cache**: Cache frequently-accessed V blocks in shmem.
+- **Half2 accumulation**: Use fp16 for intermediate attention weight accumulation.
+- **Fused softmax + V aggregation**: Combine the two passes into one.
+- **Vectorized memory loads**: Use `float4` or `uint4` loads for K/V data.
+- **Loop interchange**: Change iteration order (heads vs KV positions) for
+  better cache locality.
+- **Reduce warp reduction overhead**: The `__shfl_xor_sync` reduction at end
+  of KQ scoring runs 5 stages — can we accumulate differently?
+
+## Constraints
+- Must not change the turbo block format ABI (shared with Metal/CPU)
+- Must not modify any file other than fattn-vec.cuh
+- Must maintain correct attention output (PPL gate catches corruption)
+- Must work on Blackwell (sm_120) and Ampere (sm_86)
+- The kernel is templated — changes affect all turbo type instantiations
@@ -0,0 +1,15 @@
+{
+  "model": "/tmp/qwen2.5-7b-instruct-tq4_1s.gguf",
+  "bench_args": "-p 0 -n 128",
+  "tg128": 69.2,
+  "ppl": 7.599,
+  "ppl_threshold": 0.1,
+  "ppl_file": "/mnt/ai/data/wikitext-2-raw/wiki.test.raw",
+  "target_file": "ggml/src/ggml-cuda/mmvq-tq.cu",
+  "no_convert": true,
+  "coherence_prompts": [
+    {"prompt": "What is the capital of France? One word.", "expect": "Paris"},
+    {"prompt": "What is 2+2? Just the number.", "expect": "4"},
+    {"prompt": "Who wrote Romeo and Juliet? One name.", "expect": "Shakespeare"}
+  ]
+}
@@ -0,0 +1,86 @@
+{"status":"success","tg128":68.11,"baseline_tg128":67.94,"delta_pct":"0%","build_time_s":4,"bench_time_s":7,"gpu_temp_c":"53","experiment":1,"timestamp":"2026-04-05T16:12:51Z","kept":true}
+{"status":"success","tg128":53.41,"baseline_tg128":68.11,"delta_pct":"-20.0%","build_time_s":8,"bench_time_s":8,"gpu_temp_c":"53","experiment":2,"timestamp":"2026-04-05T16:24:50Z","kept":false}
+{"status":"safety_revert","error":"modified non-target files","experiment":3,"timestamp":"2026-04-05T16:31:37Z","kept":false}
+{"status":"error","error":"No baseline.json found for track: track-weight","experiment":4,"timestamp":"2026-04-05T16:42:20Z","kept":false}
+{"status":"error","error":"No baseline.json found for track: track-weight","experiment":5,"timestamp":"2026-04-05T16:43:17Z","kept":false}
+{"status":"error","error":"No baseline.json found for track: track-weight","experiment":6,"timestamp":"2026-04-05T16:44:37Z","kept":false}
+{"status":"error","error":"No baseline.json found for track: track-weight","experiment":7,"timestamp":"2026-04-05T16:45:37Z","kept":false}
+{"status":"error","error":"No baseline.json found for track: track-weight","experiment":8,"timestamp":"2026-04-05T16:46:50Z","kept":false}
+{"status":"error","error":"No baseline.json found for track: track-weight","experiment":9,"timestamp":"2026-04-05T16:47:42Z","kept":false}
+{"status":"error","error":"No baseline.json found for track: track-weight","experiment":10,"timestamp":"2026-04-05T16:49:54Z","kept":false}
+{"status":"error","error":"No baseline.json found for track: track-weight","experiment":11,"timestamp":"2026-04-05T16:50:58Z","kept":false}
+{"status":"error","error":"No baseline.json found for track: track-weight","experiment":12,"timestamp":"2026-04-05T16:51:51Z","kept":false}
+{"status":"success","tg128":129.00,"baseline_tg128":69.2,"delta_pct":"80.0%","build_time_s":288,"bench_time_s":4,"gpu_temp_c":"59","experiment":13,"timestamp":"2026-04-05T16:59:41Z","kept":true}
+{"status":"success","tg128":150.85,"baseline_tg128":129.00,"delta_pct":"10.0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"55","experiment":14,"timestamp":"2026-04-05T17:07:38Z","kept":true}
+{"status":"success","tg128":151.41,"baseline_tg128":150.85,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"50","experiment":15,"timestamp":"2026-04-05T17:09:05Z","kept":true}
+{"status":"success","tg128":151.79,"baseline_tg128":151.41,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"52","experiment":16,"timestamp":"2026-04-05T17:11:54Z","kept":true}
+{"status":"success","tg128":151.02,"baseline_tg128":151.79,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"54","experiment":17,"timestamp":"2026-04-05T17:19:57Z","kept":false}
+{"status":"success","tg128":151.43,"baseline_tg128":151.79,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"51","experiment":18,"timestamp":"2026-04-05T17:26:15Z","kept":false}
+{"status":"success","tg128":219.12,"baseline_tg128":151.79,"delta_pct":"40.0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"53","experiment":19,"timestamp":"2026-04-05T17:28:31Z","kept":true}
+{"status":"success","tg128":220.78,"baseline_tg128":219.12,"delta_pct":"0%","build_time_s":9,"bench_time_s":3,"gpu_temp_c":"49","ppl":7.5425,"experiment":20,"timestamp":"2026-04-05T17:30:22Z","kept":true}
+{"status":"success","tg128":220.46,"baseline_tg128":220.78,"delta_pct":"0%","build_time_s":8,"bench_time_s":2,"gpu_temp_c":"49","experiment":21,"timestamp":"2026-04-05T17:33:02Z","kept":false}
+{"status":"success","tg128":212.39,"baseline_tg128":220.78,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"52","experiment":22,"timestamp":"2026-04-05T17:35:31Z","kept":false}
+{"status":"success","tg128":223.52,"baseline_tg128":220.78,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"55","experiment":23,"timestamp":"2026-04-05T17:46:25Z","kept":true}
+{"status":"success","tg128":223.51,"baseline_tg128":223.52,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"54","experiment":24,"timestamp":"2026-04-05T18:07:17Z","kept":false}
+{"status":"success","tg128":223.14,"baseline_tg128":223.52,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"53","experiment":25,"timestamp":"2026-04-05T18:20:26Z","kept":false}
+{"status":"build_failed","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/mmvq-tq.cu(191): error: more than one instance of overloaded function \"__dp4a\" matches the argument list:\n/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/mmvq-tq.cu(192): error: more than one instance of overloaded function \"__dp4a\" matches the argument list:\n/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/mmvq-tq.cu(199): error: more than one instance of overloaded function \"__dp4a\" matches the argument list:\n/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/mmvq-tq.cu(200): error: more than one instance of overloaded function \"__dp4a\" matches the argument list:","build_time_s":1,"experiment":26,"timestamp":"2026-04-05T18:35:52Z","kept":false}
+{"status":"success","tg128":223.66,"baseline_tg128":223.52,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"56","experiment":27,"timestamp":"2026-04-05T18:37:09Z","kept":true}
+{"status":"success","tg128":209.52,"baseline_tg128":223.66,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"53","experiment":28,"timestamp":"2026-04-05T18:46:10Z","kept":false}
+{"status":"success","tg128":223.32,"baseline_tg128":223.66,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"59","experiment":29,"timestamp":"2026-04-05T19:05:19Z","kept":false}
+{"status":"success","tg128":223.73,"baseline_tg128":223.66,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"56","ppl":7.5425,"experiment":30,"timestamp":"2026-04-05T19:19:40Z","kept":true}
+{"status":"success","tg128":216.46,"baseline_tg128":223.73,"delta_pct":"0%","build_time_s":8,"bench_time_s":2,"gpu_temp_c":"53","experiment":31,"timestamp":"2026-04-05T19:27:42Z","kept":false}
+{"status":"no_change","experiment":32,"timestamp":"2026-04-05T19:40:52Z","kept":false}
+{"status":"success","tg128":223.91,"baseline_tg128":223.73,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"56","experiment":33,"timestamp":"2026-04-05T19:47:35Z","kept":true}
+{"status":"success","tg128":223.47,"baseline_tg128":223.91,"delta_pct":"0%","build_time_s":8,"bench_time_s":2,"gpu_temp_c":"53","experiment":34,"timestamp":"2026-04-05T19:57:29Z","kept":false}
+{"status":"success","tg128":223.86,"baseline_tg128":223.91,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"56","experiment":35,"timestamp":"2026-04-05T20:13:18Z","kept":false}
+{"status":"success","tg128":224.45,"baseline_tg128":223.91,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"56","experiment":36,"timestamp":"2026-04-05T20:41:59Z","kept":true}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x760e8084fb1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x760e795ca737]","build_time_s":8,"bench_time_s":1,"experiment":37,"timestamp":"2026-04-05T20:43:05Z","kept":false}
+{"status":"success","tg128":221.19,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"49","experiment":38,"timestamp":"2026-04-05T20:44:35Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x74069c1cdb1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x740694fca737]","build_time_s":5,"bench_time_s":1,"experiment":39,"timestamp":"2026-04-05T20:45:49Z","kept":false}
+{"status":"success","tg128":208.20,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"55","ppl":7.5425,"experiment":40,"timestamp":"2026-04-05T20:52:14Z","kept":false}
+{"status":"success","tg128":224.23,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"50","experiment":41,"timestamp":"2026-04-05T20:55:47Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x70aa29337b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x70aa225ca737]","build_time_s":5,"bench_time_s":1,"experiment":42,"timestamp":"2026-04-05T20:57:48Z","kept":false}
+{"status":"success","tg128":219.29,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"55","experiment":43,"timestamp":"2026-04-05T21:04:11Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7c3f28670b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7c3f213ca737]","build_time_s":4,"bench_time_s":1,"experiment":44,"timestamp":"2026-04-05T21:05:11Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x73a442c62b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x73a43b9ca737]","build_time_s":5,"bench_time_s":1,"experiment":45,"timestamp":"2026-04-05T21:06:35Z","kept":false}
+{"status":"success","tg128":218.41,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"53","experiment":46,"timestamp":"2026-04-05T21:08:49Z","kept":false}
+{"status":"success","tg128":223.73,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"56","experiment":47,"timestamp":"2026-04-05T21:17:51Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7bad3bc68b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7bad349ca737]","build_time_s":5,"bench_time_s":1,"experiment":48,"timestamp":"2026-04-05T21:18:38Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7f0fe0dd2b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7f0fd9bca737]","build_time_s":6,"bench_time_s":0,"experiment":49,"timestamp":"2026-04-05T21:22:46Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7eb6a0537b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7eb6997ca737]","build_time_s":5,"bench_time_s":1,"experiment":50,"timestamp":"2026-04-05T21:24:04Z","kept":false}
+{"status":"success","tg128":223.85,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"55","experiment":51,"timestamp":"2026-04-05T21:29:33Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7492739b3b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x74926c7ca737]","build_time_s":5,"bench_time_s":1,"experiment":52,"timestamp":"2026-04-05T21:30:24Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x759986646b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x75997f3ca737]","build_time_s":5,"bench_time_s":0,"experiment":53,"timestamp":"2026-04-05T21:31:34Z","kept":false}
+{"status":"success","tg128":223.94,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"50","experiment":54,"timestamp":"2026-04-05T21:33:26Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x73c90cb37b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x73c905dca737]","build_time_s":4,"bench_time_s":1,"experiment":55,"timestamp":"2026-04-05T21:34:32Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7f926c661b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7f92653ca737]","build_time_s":5,"bench_time_s":1,"experiment":56,"timestamp":"2026-04-05T21:41:29Z","kept":false}
+{"status":"success","tg128":225.17,"baseline_tg128":224.45,"delta_pct":"0%","build_time_s":6,"bench_time_s":2,"gpu_temp_c":"57","experiment":57,"timestamp":"2026-04-05T21:50:14Z","kept":true}
+{"status":"success","tg128":222.27,"baseline_tg128":225.17,"delta_pct":"0%","build_time_s":8,"bench_time_s":3,"gpu_temp_c":"49","experiment":58,"timestamp":"2026-04-05T21:54:06Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7bb3e0137b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7bb3d93ca737]","build_time_s":5,"bench_time_s":1,"experiment":59,"timestamp":"2026-04-05T21:55:23Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7b9533f37b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7b952d1ca737]","build_time_s":5,"bench_time_s":1,"experiment":60,"timestamp":"2026-04-05T21:56:34Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7a274f7b4b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7a27485ca737]","build_time_s":4,"bench_time_s":1,"experiment":61,"timestamp":"2026-04-05T21:57:42Z","kept":false}
+{"status":"success","tg128":207.76,"baseline_tg128":225.17,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"56","experiment":62,"timestamp":"2026-04-05T22:04:53Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7dee6b826b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7dee645ca737]","build_time_s":5,"bench_time_s":1,"experiment":63,"timestamp":"2026-04-05T22:05:49Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7bddc9b37b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7bddc2dca737]","build_time_s":5,"bench_time_s":1,"experiment":64,"timestamp":"2026-04-05T22:07:04Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x71109ac95b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7110939ca737]","build_time_s":4,"bench_time_s":1,"experiment":65,"timestamp":"2026-04-05T22:11:06Z","kept":false}
+{"status":"success","tg128":219.26,"baseline_tg128":225.17,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"56","experiment":66,"timestamp":"2026-04-05T22:19:51Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x72b8b9337b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x72b8b25ca737]","build_time_s":6,"bench_time_s":1,"experiment":67,"timestamp":"2026-04-05T22:20:59Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x7d8176937b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x7d816fbca737]","build_time_s":5,"bench_time_s":1,"experiment":68,"timestamp":"2026-04-05T22:25:11Z","kept":false}
+{"status":"runtime_crash","error":"/mnt/ai/projects/llama-cpp-turboquant/ggml/src/ggml-cuda/ggml-cuda.cu:100: CUDA error\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-base.so.0(ggml_abort+0x15b)[0x784b43537b1b]\n/mnt/ai/projects/llama-cpp-turboquant/build-cuda/bin/libggml-cuda.so.0(_Z15ggml_cuda_errorPKcS0_S0_iS0_+0xb7)[0x784b3c7ca737]","build_time_s":5,"bench_time_s":1,"experiment":69,"timestamp":"2026-04-05T22:27:18Z","kept":false}
+{"status":"success","tg128":224.98,"baseline_tg128":225.17,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"58","ppl":7.5425,"experiment":70,"timestamp":"2026-04-05T22:33:37Z","kept":false}
+{"status":"success","tg128":225.46,"baseline_tg128":225.17,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"51","experiment":71,"timestamp":"2026-04-05T22:39:32Z","kept":true}
+{"status":"success","tg128":220.92,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":8,"bench_time_s":2,"gpu_temp_c":"55","experiment":72,"timestamp":"2026-04-05T22:51:55Z","kept":false}
+{"status":"success","tg128":220.68,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"52","experiment":73,"timestamp":"2026-04-05T22:57:46Z","kept":false}
+{"status":"success","tg128":219.24,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"55","experiment":74,"timestamp":"2026-04-05T23:02:38Z","kept":false}
+{"status":"success","tg128":219.54,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"53","experiment":75,"timestamp":"2026-04-05T23:07:50Z","kept":false}
+{"status":"success","tg128":222.66,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"56","experiment":76,"timestamp":"2026-04-05T23:17:39Z","kept":false}
+{"status":"success","tg128":210.26,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"48","experiment":77,"timestamp":"2026-04-05T23:20:13Z","kept":false}
+{"status":"success","tg128":220.71,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"53","experiment":78,"timestamp":"2026-04-05T23:22:33Z","kept":false}
+{"status":"success","tg128":219.25,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"58","experiment":79,"timestamp":"2026-04-05T23:31:36Z","kept":false}
+{"status":"success","tg128":210.19,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"53","ppl":7.5425,"experiment":80,"timestamp":"2026-04-05T23:38:24Z","kept":false}
+{"status":"success","tg128":221.41,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":4,"bench_time_s":3,"gpu_temp_c":"51","experiment":81,"timestamp":"2026-04-05T23:39:51Z","kept":false}
+{"status":"success","tg128":215.12,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"51","experiment":82,"timestamp":"2026-04-05T23:45:46Z","kept":false}
+{"status":"success","tg128":210.01,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"55","experiment":83,"timestamp":"2026-04-05T23:52:38Z","kept":false}
+{"status":"success","tg128":221.42,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":3,"gpu_temp_c":"51","experiment":84,"timestamp":"2026-04-05T23:56:44Z","kept":false}
+{"status":"success","tg128":220.24,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"54","experiment":85,"timestamp":"2026-04-06T00:01:16Z","kept":false}
+{"status":"success","tg128":225.90,"baseline_tg128":225.46,"delta_pct":"0%","build_time_s":5,"bench_time_s":2,"gpu_temp_c":"56","experiment":86,"timestamp":"2026-04-06T00:20:56Z","kept":true}
@@ -0,0 +1,118 @@
+# TQ4_1S Weight Kernel Optimization
+
+## Goal
+Maximize decode t/s for TQ4_1S `mul_mat_vec` on CUDA (RTX 5090, Blackwell sm_120).
+Current baseline: ~69 t/s. Target: close the gap to q4_0 (267 t/s).
+
+## Target File
+`ggml/src/ggml-cuda/mmvq-tq.cu` — ONLY modify this file.
+
+## Model & Benchmark
+- Model: `/tmp/qwen2.5-7b-instruct-tq4_1s.gguf`
+- Benchmark: `llama-bench -ngl 99 -p 0 -n 128 -r 3` (decode only)
+- Correctness: PPL on wikitext-2 must stay within 0.1 of baseline (7.599)
+
+## Architecture Overview
+The TQ4_1S format stores WHT-rotated 4-bit weights with non-linear Lloyd-Max centroids.
+Block size = 32 elements, dual half-block scales (d0 for [0..15], d1 for [16..31]).
+20 bytes per block = 5.0 bits/value.
+
+Dequant per element: `centroid_table[4bit_index] * half_block_scale`
+Then inverse WHT (Walsh-Hadamard Transform) to recover original weight space.
+
+The fused mmvq kernel avoids per-block inverse WHT by pre-rotating the activation
+vector (WHT forward) once, then the inner loop is just:
+```
+sum += rotated_activation[lane] * centroid[idx] * d
+```
+
+### Current Kernel (V8)
+- 8 warps per CUDA block (MMVQ_TQ_NWARPS = 8), each warp handles one output row
+- 32 lanes per warp, each lane handles element `lane` within every block
+- Activation pre-rotated to float scratch buffer via warp shuffle WHT
+- Inner loop: 1 float FMA per element per lane
+- Warp reduction via `__shfl_xor_sync`
+
+### Block Layout (block_tq4_1s)
+```c
+struct block_tq4_1s {
+    half d0;           // 2 bytes: scale for elements [0..15]
+    half d1;           // 2 bytes: scale for elements [16..31]
+    uint8_t qs[16];    // 16 bytes: 4-bit indices, consecutive pair packing
+                       // (qs[j/2] >> ((j&1)*4)) & 0xF = centroid index for element j
+};
+```
+
+### Centroid Table (constant memory)
+16 Lloyd-Max optimal values for N(0,1):
+```
+[-2.733, -2.069, -1.618, -1.256, -0.942, -0.657, -0.388, -0.128,
+  0.128,  0.388,  0.657,  0.942,  1.256,  1.618,  2.069,  2.733]
+```
+
+## Confirmed Bottleneck
+The centroid lookup itself is NOT the bottleneck — confirmed via ablation
+(replacing `centroid[idx]` with `(idx-8)` gives identical 69 t/s).
+
+The real bottleneck is:
+1. **Float32 activation bandwidth**: 4 bytes/element vs q8_1's 1 byte. Each warp
+   reads the full activation vector from global memory per row.
+2. **Float FMA arithmetic density**: 1 MAC per instruction vs dp4a's 4 MACs.
+   q4_0 processes 8 elements per dp4a pair; V8 processes 1 element per FMA.
+
+## Already Tried — Do NOT Re-explore
+| Version | Approach | Result | Why it failed |
+|---------|----------|--------|--------------|
+| V9-V11 | Multi-row NR0=2,4 (shmem or registers) | Regressed | `__syncthreads` overhead or register spill |
+| V12 | Shmem activation broadcast | ~67 t/s | No help on 5090 (128MB L2 already caching) |
+| V13 | Loop unroll ×4 | ~69 t/s | Compiler already optimal |
+| V14 | WMMA tensor cores | 6 t/s | Setup overhead >> throughput for matvec |
+| V15 | L2 prefetch hints | ~69 t/s | No measurable effect |
+| V16 | `__launch_bounds__` tuning | ~69 t/s | Occupancy changes no effect |
+| V18 | Per-block int8 LUT + dp4a | 46 t/s | LUT build + pack overhead > dp4a gain |
+| V19 | ILP 4× unroll | 70 t/s | Negligible improvement |
+
+## Promising Directions to Explore
+
+### From community discussion (ggml-org/llama.cpp#20969)
+
+- **Entropy-coded weight compression (karambaso idea)**: With only 16 centroid
+  values, 4-bit indices have low entropy. Runtime Huffman/ANS decompression in
+  shmem could reduce effective bandwidth 30-50%. Decode is memory-bound, so extra
+  compute for decompression may be free. This reframes the problem: instead of
+  faster dequant, read less data.
+- **Fused tile loader pattern (from Madreag's KV work)**: Load multiple weight
+  blocks into shmem, dequant in-register from shmem. Amortizes global memory
+  latency across a tile of blocks.
+- **F32 vs fp16 activation precision**: AmesianX notes WHT amplifies q8_1
+  quantization error ~16x. Our V8 uses f32 activation which avoids this.
+  But fp16 activation would halve bandwidth. Worth testing if the quality
+  tradeoff is acceptable for weights (less sensitive than KV cache).
+
+### Kernel-level ideas
+
+- **half2 packed FMA**: Process 2 elements per `__hmul2`/`__hfma2` instruction.
+  Centroids in fp16 constant memory, activation in fp16. 2x arithmetic density.
+- **Warp-cooperative coalesced loading**: Reorganize memory access so weight loads
+  are fully coalesced (currently scattered due to per-lane block access).
+- **Register blocking across blocks**: Each lane accumulates across multiple
+  blocks before reducing, keeping partial sums in registers.
+- **Async memory copy (cp.async)**: Prefetch next block's weight data while
+  computing current block.
+- **Different warp configurations**: Try 4 or 16 warps instead of 8.
+- **Two-level tiling**: Load a tile of blocks to shmem, process tile, repeat.
+  This is the shmem activation variant (V12) but for weights instead.
+- **Vectorized weight loads**: Load 4 bytes (8 nibbles) per lane per iteration
+  instead of extracting one nibble at a time.
+- **Activation compression**: Quantize pre-rotated activation to fp16 or int8
+  to reduce bandwidth (loses some precision but may be worth it).
+- **Stream-K style decomposition**: Different work partitioning across warps.
+- **Per-block centroid pre-scale**: Pre-multiply centroid × d_half into a
+  16-entry fp16 LUT in registers (not shmem). Then the inner loop is just
+  `sum += lut[idx] * activation` — one FMA instead of a multiply plus an FMA.
+
+## Constraints
+- Must not change the block_tq4_1s ABI (format is shared with Metal/CPU)
+- Must not modify any file other than mmvq-tq.cu
+- Output must be bit-exact for the same input (deterministic)
+- Must work on both Blackwell (sm_120) and Ampere (sm_86)
@@ -0,0 +1,58 @@
+#!/bin/bash
+# SMEM Pre-Dequant Benchmark — M5 Max
+# Tests SMEM vs baseline at multiple context depths
+#
+# BEFORE RUNNING:
+#   1. cd /Users/tom/local_llms/llama.cpp
+#   2. git checkout experiment/smem-pre-dequant
+#   3. Build WITHOUT SMEM first (baseline):
+#      cmake --build build -j12
+#   4. Run: ./scripts/bench-smem-m5.sh baseline
+#   5. Build WITH SMEM:
+#      TURBO_SMEM_DEQUANT=1 cmake --build build -j12
+#   6. Run: ./scripts/bench-smem-m5.sh smem
+#
+# Uses Qwen3.5-35B-A3B (MoE, fits in memory, attention-heavy)
+
+set -e  # NOTE: without pipefail a failing llama-bench behind '| tee' does not stop the sweep
+
+LABEL="${1:-baseline}"
+LLAMA_BENCH="/Users/tom/local_llms/llama.cpp/build/bin/llama-bench"
+MODEL="/Users/tom/local_llms/models/Qwen3.5-35B-A3B-Q8_0.gguf"
+OUTFILE="/Users/tom/local_llms/llama.cpp/bench-smem-m5-${LABEL}.txt"
+
+CONTEXTS=(0 8192 16384 32768)
+KV_TYPES=("turbo3" "turbo4" "q8_0")
+
+echo "=== SMEM M5 Benchmark: ${LABEL} ===" | tee "$OUTFILE"
+echo "Model: $(basename "$MODEL")" | tee -a "$OUTFILE"
+echo "Date: $(date)" | tee -a "$OUTFILE"
+echo "" | tee -a "$OUTFILE"
+
+for ctk in "${KV_TYPES[@]}"; do
+    for p in "${CONTEXTS[@]}"; do
+        if [[ "$ctk" == "q8_0" && "$LABEL" == "smem" ]]; then
+            echo "SKIP: q8_0 + smem (q8_0 unaffected by SMEM)" | tee -a "$OUTFILE"
+            continue
+        fi
+
+        depth_label="short"
+        [[ $p -gt 0 ]] && depth_label="${p}"
+
+        echo "--- ${ctk} @ ${depth_label} ---" | tee -a "$OUTFILE"
+
+        ctv="$ctk"  # K and V cache always use the same type in this sweep
+        "$LLAMA_BENCH" \
+            -m "$MODEL" \
+            -ngl 99 -fa 1 \
+            -ctk "$ctk" -ctv "$ctv" \
+            -t 1 \
+            -p "$p" -n 128 \
+            2>&1 | tee -a "$OUTFILE"
+
+        echo "" | tee -a "$OUTFILE"
+    done
+done
+
+echo "=== Done: ${LABEL} ===" | tee -a "$OUTFILE"
+echo "Results saved to: $OUTFILE"
@@ -0,0 +1,80 @@
+#!/bin/bash
+# TurboQuant quality + speed gate — run BEFORE pushing any changes
+# Checks: (1) perplexity within 5% of q8_0, (2) context scaling ratio > 0.95
+#
+# Usage: [LLAMA=<bin dir>] [MODEL=<gguf>] [WIKI=<wiki.test.raw>] bash scripts/turbo-quality-gate.sh
+# Exit 0 = PASS, Exit 1 = FAIL
+
+set -e  # metric captures below end in '|| true' so a crashed run reaches the FAIL branches
+
+LLAMA=${LLAMA:-~/local_llms/llama.cpp/build-turbo/bin}
+MODEL=${MODEL:-~/local_llms/models/Qwen3.5-35B-A3B-Q8_0.gguf}
+WIKI=${WIKI:-~/local_llms/llama.cpp/wikitext-2-raw/wiki.test.raw}
+
+if [ ! -f "$WIKI" ]; then
+    echo "Downloading wikitext-2..."
+    bash ~/local_llms/llama.cpp/scripts/get-wikitext-2.sh
+fi
+
+FAIL=0
+
+echo "========================================"
+echo "  TurboQuant Quality + Speed Gate"
+echo "========================================"
+echo ""
+
+# --- Test 1: Perplexity (empty result = crash/timeout, handled below) ---
+echo "[1/2] Running perplexity check (8 chunks)..."
+PPL_TURBO=$("$LLAMA/llama-perplexity" -m "$MODEL" -f "$WIKI" -c 512 -ctk turbo3 -ctv turbo3 -fa on --chunks 8 -ngl 99 2>&1 | grep "Final" | grep -oE 'PPL = [0-9.]+' | grep -oE '[0-9.]+') || true
+
+if [ -z "$PPL_TURBO" ]; then
+    echo "  FAIL: Could not get turbo3 perplexity (crash or timeout)"
+    FAIL=1
+else
+    BASELINE_PPL=6.111
+    MAX_PPL=$(echo "$BASELINE_PPL * 1.05" | bc)
+    PPL_OK=$(echo "$PPL_TURBO < $MAX_PPL" | bc)
+    if [ "$PPL_OK" -eq 1 ]; then
+        echo "  PASS: turbo3 PPL = $PPL_TURBO (< $MAX_PPL, within 5% of q8_0 $BASELINE_PPL)"
+    else
+        echo "  FAIL: turbo3 PPL = $PPL_TURBO (> $MAX_PPL, exceeds 5% threshold)"
+        FAIL=1
+    fi
+fi
+echo ""
+
+# --- Test 2: Context Scaling (empty result = crash/timeout, handled below) ---
+echo "[2/2] Running context scaling check (4K prefill)..."
+TURBO_TPS=$("$LLAMA/llama-perplexity" -m "$MODEL" -f "$WIKI" -c 4096 -ctk turbo3 -ctv turbo3 -fa on --chunks 4 -ngl 99 2>&1 | grep "prompt eval" | grep -oE '[0-9.]+ tokens per second' | grep -oE '[0-9.]+') || true
+Q8_TPS=$("$LLAMA/llama-perplexity" -m "$MODEL" -f "$WIKI" -c 4096 -ctk q8_0 -ctv q8_0 -fa on --chunks 4 -ngl 99 2>&1 | grep "prompt eval" | grep -oE '[0-9.]+ tokens per second' | grep -oE '[0-9.]+') || true
+
+if [ -z "$TURBO_TPS" ] || [ -z "$Q8_TPS" ]; then
+    echo "  FAIL: Could not measure speed (crash or timeout)"
+    echo "  turbo3=$TURBO_TPS q8_0=$Q8_TPS"
+    FAIL=1
+else
+    RATIO=$(echo "scale=4; $TURBO_TPS / $Q8_TPS" | bc)
+    RATIO_OK=$(echo "$RATIO > 0.95" | bc)
+    if [ "$RATIO_OK" -eq 1 ]; then
+        echo "  PASS: turbo3/q8_0 = ${RATIO}x at 4K context (> 0.95 threshold)"
+        echo "  turbo3 = $TURBO_TPS tok/s, q8_0 = $Q8_TPS tok/s"
+    else
+        echo "  FAIL: turbo3/q8_0 = ${RATIO}x at 4K context (< 0.95 threshold)"
+        echo "  turbo3 = $TURBO_TPS tok/s, q8_0 = $Q8_TPS tok/s"
+        echo "  Context scaling regression detected!"
+        FAIL=1
+    fi
+fi
+echo ""
+
+# --- Summary ---
+echo "========================================"
+if [ "$FAIL" -eq 0 ]; then
+    echo "  ALL CHECKS PASSED"
+    echo "========================================"
+    exit 0
+else
+    echo "  CHECKS FAILED — DO NOT PUSH"
+    echo "========================================"
+    exit 1
+fi
@@ -3398,8 +3398,16 @@ llama_context * llama_init_from_model(

    if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
        const uint32_t blck_size = ggml_blck_size(params.type_k);
+        const bool k_is_turbo = (params.type_k == GGML_TYPE_TURBO2_0 ||
+                                 params.type_k == GGML_TYPE_TURBO3_0 ||
+                                 params.type_k == GGML_TYPE_TURBO4_0);
        for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
-            if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
+            uint32_t head_k = model->hparams.n_embd_head_k(il);
+            // Turbo types zero-pad heads to next multiple of 128 in llama-kv-cache.cpp
+            if (k_is_turbo && head_k % 128 != 0) {
+                head_k = ((head_k + 127) / 128) * 128;
+            }
+            if (head_k % blck_size != 0) {
                LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
                    __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
                return nullptr;
@@ -3409,8 +3417,17 @@ llama_context * llama_init_from_model(

    if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) {
        const uint32_t blck_size = ggml_blck_size(params.type_v);
+        const bool v_is_turbo = (params.type_v == GGML_TYPE_TURBO2_0 ||
+                                 params.type_v == GGML_TYPE_TURBO3_0 ||
+                                 params.type_v == GGML_TYPE_TURBO4_0);
+        const bool is_mla = model->hparams.is_mla();
        for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
-            if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
+            uint32_t head_v = model->hparams.n_embd_head_v(il);
+            // Turbo types zero-pad; MLA has no separate V cache (V = view of K)
+            if (v_is_turbo && !is_mla && head_v % 128 != 0) {
+                head_v = ((head_v + 127) / 128) * 128;
+            }
+            if (head_v % blck_size != 0) {
                LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
                    __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
                return nullptr;
@@ -3418,6 +3435,14 @@ llama_context * llama_init_from_model(
        }
    }

+    // TurboQuant cache types require flash attention — auto-enable if disabled
+    if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED &&
+        (params.type_k == GGML_TYPE_TURBO2_0 || params.type_k == GGML_TYPE_TURBO3_0 || params.type_k == GGML_TYPE_TURBO4_0 ||
+         params.type_v == GGML_TYPE_TURBO2_0 || params.type_v == GGML_TYPE_TURBO3_0 || params.type_v == GGML_TYPE_TURBO4_0)) {
+        LLAMA_LOG_WARN("%s: turbo cache types require flash_attn — enabling automatically\n", __func__);
+        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+    }
+
    if (ggml_is_quantized(params.type_v) && params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED) {
        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
        return nullptr;
@@ -1958,6 +1958,10 @@ ggml_tensor * llm_graph_context::build_attn_mha(
    k = ggml_permute(ctx0, k, 0, 2, 1, 3);
    v = ggml_permute(ctx0, v, 0, 2, 1, 3);

+    // TurboQuant note: graph-side Q rotation (pre-rotate-queries) is applied by the
+    // build_attn() callers before this function is invoked. The VEC kernel bug (wrong Q/K stride
+    // in vec_dot_fattn_vec_KQ_turbo3_0) was fixed in fattn-common.cuh to match f16 pattern.
+
    ggml_tensor * cur;

    const bool use_flash_attn = cparams.flash_attn && kq_b == nullptr;
@@ -1984,6 +1988,20 @@ ggml_tensor * llm_graph_context::build_attn_mha(
        ggml_flash_attn_ext_add_sinks(cur, sinks);
        ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);

+        // TurboQuant: inverse WHT on FA output when V values are WHT-rotated.
+        // For MLA, V is a view of K with different ne[0] (e.g. V=512, K=576).
+        // Group size must come from K (which determines the WHT rotation), not V.
+        if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
+            const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0);
+            const ggml_tensor * group_src = k_is_turbo ? k : v;
+            const int turbo_group = (group_src->ne[0] % 128 == 0) ? 128 : 64;
+            if (cur->ne[0] % turbo_group == 0) {
+                if (!ggml_is_contiguous(cur)) { cur = ggml_cont(ctx0, cur); }
+                ggml_tensor * innerq_scale = mctx ? mctx->get_turbo_innerq_scale_inv() : nullptr;
+                cur = ggml_turbo_wht(ctx0, cur, 1, turbo_group, innerq_scale);  // 1 = inverse
+            }
+        }
+
        if (v_mla) {
 #if 0
            // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
@@ -2050,6 +2068,18 @@ ggml_tensor * llm_graph_context::build_attn_mha(
        ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
        cb(kqv, "kqv", il);

+        // TurboQuant: inverse WHT on attention output (non-FA path)
+        if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
+            const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0);
+            const ggml_tensor * group_src = k_is_turbo ? k : v;
+            const int turbo_group = (group_src->ne[0] % 128 == 0) ? 128 : 64;
+            if (kqv->ne[0] % turbo_group == 0) {
+                if (!ggml_is_contiguous(kqv)) { kqv = ggml_cont(ctx0, kqv); }
+                ggml_tensor * innerq_scale = mctx ? mctx->get_turbo_innerq_scale_inv() : nullptr;
+                kqv = ggml_turbo_wht(ctx0, kqv, 1, turbo_group, innerq_scale);
+            }
+        }
+
        // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
        if (v_mla) {
            kqv = ggml_mul_mat(ctx0, v_mla, kqv);
@@ -2067,6 +2097,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
        }
    }

+    // TurboQuant: the graph-side inverse WHT on the attention output (undoes V rotation) is applied in the FA and non-FA branches above.
+
    ggml_build_forward_expand(gf, cur);

    return cur;
@@ -2227,9 +2259,48 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
    ggml_tensor * v = mctx_cur->get_v(ctx0, il);

+    // TurboQuant pre-rotate-queries: O(d log d) WHT rotation via custom op
+    // Q shape: (n_embd_head, n_head, n_tokens)
+    // For zero-padded models (head_dim not 128-aligned), pad Q to match padded K dim first.
+    if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
+        // Pad Q per-head to next multiple of 128 if needed
+        if (q->ne[0] % 128 != 0) {
+            const int64_t pad = ((q->ne[0] + 127) / 128) * 128 - q->ne[0];
+            q = ggml_pad(ctx0, q, pad, 0, 0, 0);
+        }
+        if (!ggml_is_contiguous(q)) { q = ggml_cont(ctx0, q); }
+        ggml_tensor * innerq_scale = mctx_cur->get_turbo_innerq_scale_inv();
+        q = ggml_turbo_wht(ctx0, q, 0, 0, innerq_scale);  // 0 = forward, 0 = auto group size from q->ne[0]
+    }
+
    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
    cb(cur, "kqv_out", il);

+    // TurboQuant: if V was padded, the output has padded dimensions.
+    // Extract original V head_dim after inverse WHT (applied inside build_attn_mha).
+    // NOTE: gate on v->type (not k->type) for asymmetric configs where K=q8_0 but V=turbo
+    if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
+        const int64_t orig_v_head = hparams.n_embd_head_v(il);
+        // cur is 2D: (n_embd_head * n_head, n_tokens) after build_attn_mha
+        const int64_t padded_v_head = v->ne[0];
+        if (padded_v_head != orig_v_head) {
+            // Reshape to 4D, extract original head_dim, reshape back to 2D
+            // Fix #78 (bingh0): cur shape post-MHA is (n_embd_head * n_head, n_tokens),
+            // not (n_embd_head * n_head_kv, n_tokens). Reshape needs n_head
+            // (Q-head count) so GQA models with n_head != n_head_kv (e.g.
+            // Qwen2.5-0.5B head_dim=64 padded → 128) don't fail the element
+            // count check in ggml_reshape_3d.
+            const int64_t n_head_v = hparams.n_head(il);
+            const int64_t n_tokens_cur = cur->ne[1];
+            cur = ggml_reshape_3d(ctx0, cur, padded_v_head, n_head_v, n_tokens_cur);
+            // ggml_view_3d to extract first orig_v_head elements per head
+            cur = ggml_view_3d(ctx0, cur, orig_v_head, n_head_v, n_tokens_cur,
+                               cur->nb[1], cur->nb[2], 0);
+            cur = ggml_cont(ctx0, cur);
+            cur = ggml_reshape_2d(ctx0, cur, orig_v_head * n_head_v, n_tokens_cur);
+        }
+    }
+
    if (inp->self_v_rot) {
        cur = ggml_mul_mat_aux(ctx0, cur, inp->self_v_rot);
    }
@@ -2318,9 +2389,44 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
    ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);

+    // TurboQuant: pre-rotate Q for K-only (MLA) attention
+    // For zero-padded models, pad Q to match padded K dim first.
+    if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
+        // Pad Q per-head to next multiple of 128 if needed
+        if (q->ne[0] % 128 != 0) {
+            const int64_t pad = ((q->ne[0] + 127) / 128) * 128 - q->ne[0];
+            q = ggml_pad(ctx0, q, pad, 0, 0, 0);
+        }
+        if (!ggml_is_contiguous(q)) { q = ggml_cont(ctx0, q); }
+        ggml_tensor * innerq_scale = mctx_cur->get_turbo_innerq_scale_inv();
+        q = ggml_turbo_wht(ctx0, q, 0, 0, innerq_scale);  // 0 = forward, 0 = auto group size
+    }
+
    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
    cb(cur, "kqv_out", il);

+    // TurboQuant: if V was padded (MLA: V is view of K, may have padded dim),
+    // extract original V head_dim after inverse WHT.
+    if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
+        const int64_t orig_v_head = v_cur->ne[0];  // original V head_dim from model
+        const int64_t padded_v_head = v->ne[0];     // padded V head_dim in cache
+        if (padded_v_head != orig_v_head) {
+            // cur is 2D: (padded_v_head * n_head, n_tokens) after build_attn_mha
+            // Fix #78 (bingh0): cur shape post-MHA is (n_embd_head * n_head, n_tokens),
+            // not (n_embd_head * n_head_kv, n_tokens). Reshape needs n_head
+            // (Q-head count) so GQA models with n_head != n_head_kv (e.g.
+            // Qwen2.5-0.5B head_dim=64 padded → 128) don't fail the element
+            // count check in ggml_reshape_3d.
+            const int64_t n_head_v = hparams.n_head(il);
+            const int64_t n_tokens_cur = cur->ne[1];
+            cur = ggml_reshape_3d(ctx0, cur, padded_v_head, n_head_v, n_tokens_cur);
+            cur = ggml_view_3d(ctx0, cur, orig_v_head, n_head_v, n_tokens_cur,
+                               cur->nb[1], cur->nb[2], 0);
+            cur = ggml_cont(ctx0, cur);
+            cur = ggml_reshape_2d(ctx0, cur, orig_v_head * n_head_v, n_tokens_cur);
+        }
+    }
+
    if (wo) {
        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
@@ -2406,9 +2512,41 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
    ggml_tensor * v = mctx_cur->get_v(ctx0, il);

+    // TurboQuant: pre-rotate Q for ISWA attention (pad to 128-aligned if needed)
+    if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
+        if (q->ne[0] % 128 != 0) {
+            const int64_t pad = ((q->ne[0] + 127) / 128) * 128 - q->ne[0];
+            q = ggml_pad(ctx0, q, pad, 0, 0, 0);
+        }
+        if (!ggml_is_contiguous(q)) { q = ggml_cont(ctx0, q); }
+        ggml_tensor * innerq_scale = mctx_cur->get_turbo_innerq_scale_inv();
+        q = ggml_turbo_wht(ctx0, q, 0, 0, innerq_scale);
+    }
+
    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
    cb(cur, "kqv_out", il);

+    // TurboQuant: if V was padded, extract original V head_dim after inverse WHT
+    // NOTE: gate on v->type (not k->type) for asymmetric configs where K=q8_0 but V=turbo
+    if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
+        const int64_t orig_v_head = hparams.n_embd_head_v(il);
+        const int64_t padded_v_head = v->ne[0];
+        if (padded_v_head != orig_v_head) {
+            // Fix #78 (bingh0): cur shape post-MHA is (n_embd_head * n_head, n_tokens),
+            // not (n_embd_head * n_head_kv, n_tokens). Reshape needs n_head
+            // (Q-head count) so GQA models with n_head != n_head_kv (e.g.
+            // Qwen2.5-0.5B head_dim=64 padded → 128) don't fail the element
+            // count check in ggml_reshape_3d.
+            const int64_t n_head_v = hparams.n_head(il);
+            const int64_t n_tokens_cur = cur->ne[1];
+            cur = ggml_reshape_3d(ctx0, cur, padded_v_head, n_head_v, n_tokens_cur);
+            cur = ggml_view_3d(ctx0, cur, orig_v_head, n_head_v, n_tokens_cur,
+                               cur->nb[1], cur->nb[2], 0);
+            cur = ggml_cont(ctx0, cur);
+            cur = ggml_reshape_2d(ctx0, cur, orig_v_head * n_head_v, n_tokens_cur);
+        }
+    }
+
    if (v_rot) {
        cur = ggml_mul_mat_aux(ctx0, cur, v_rot);
    }
@@ -73,6 +73,30 @@ static ggml_tensor * ggml_mul_mat_aux(
    return res;
 }

+// InnerQ: cross-TU shared state for CUDA per-channel equalization.
+// CUDA builds: the symbols are defined in ggml-cuda/turbo-innerq.cu and only declared (imported) here.
+// Non-CUDA builds: file-local stubs are provided; the stub update check always returns false.
+#ifndef INNERQ_MAX_CHANNELS
+#define INNERQ_MAX_CHANNELS 128
+#endif
+
+#ifdef GGML_USE_CUDA
+#if defined(_WIN32) && !defined(__MINGW32__)  // non-MinGW Windows: mark the symbols as DLL imports
+#  define TURBO_IQ_IMPORT __declspec(dllimport)
+#else
+#  define TURBO_IQ_IMPORT
+#endif
+extern TURBO_IQ_IMPORT bool  g_innerq_finalized;
+extern TURBO_IQ_IMPORT float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS];
+TURBO_IQ_IMPORT bool turbo_innerq_needs_tensor_update(void);
+TURBO_IQ_IMPORT void turbo_innerq_mark_tensor_updated(void);
+#else  // !GGML_USE_CUDA: zero-initialised, file-local stand-ins
+static bool  g_innerq_finalized = false;
+static float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS] = {};
+static bool turbo_innerq_needs_tensor_update(void) { return false; }
+static void turbo_innerq_mark_tensor_updated(void) {}
+#endif  // GGML_USE_CUDA
+
 //
 // llama_kv_cache
 //
@@ -96,6 +120,32 @@ llama_kv_cache::llama_kv_cache(

    GGML_ASSERT(kv_size % n_pad == 0);

+    // Auto-asymmetric: when symmetric turbo K+V is requested and the model has
+    // high GQA ratio (few KV heads serving many Q heads), upgrade K to q8_0.
+    // Turbo K quantization error gets amplified by the GQA broadcast factor.
+    // Qwen2.5: 4 KV heads / 28 Q heads = 7:1 → turbo3 K PPL catastrophic (2887 vs 7.4 baseline)
+    // Mistral:  8 KV heads / 32 Q heads = 4:1 → turbo3 K works fine (+4.4% PPL)
+    // Threshold: GQA ratio >= 6 triggers auto-asymmetric.
+    {
+        const bool k_is_turbo = (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0 || type_k == GGML_TYPE_TURBO2_0);
+        if (k_is_turbo) {
+            const uint32_t n_head    = hparams.n_head(0);
+            const uint32_t n_head_kv = hparams.n_head_kv(0);
+            const uint32_t gqa_ratio = (n_head_kv > 0) ? n_head / n_head_kv : 1;
+
+            const char * env = getenv("TURBO_AUTO_ASYMMETRIC");
+            const bool disabled = (env && env[0] == '0');
+
+            if (!disabled && gqa_ratio >= 6 && type_k == type_v) {
+                LLAMA_LOG_WARN("%s: auto-asymmetric: GQA ratio %u:1 (n_head=%u, n_head_kv=%u) — "
+                               "upgrading K from %s to q8_0 to prevent quality degradation. "
+                               "Disable with TURBO_AUTO_ASYMMETRIC=0\n",
+                               __func__, gqa_ratio, n_head, n_head_kv, ggml_type_name(type_k));
+                type_k = GGML_TYPE_Q8_0;
+            }
+        }
+    }
+
    const uint32_t n_layer_kv = hparams.n_layer_kv();

    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
@@ -111,7 +161,8 @@ llama_kv_cache::llama_kv_cache(
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
-                /*.mem_size   =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
+                // +3 for turbo rotation matrices (turbo_rotation + turbo_rotation_inv + turbo_innerq_scale_inv)
+                /*.mem_size   =*/ size_t((2u*(1 + n_stream)*n_layer_kv + 3)*ggml_tensor_overhead()),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
@@ -204,11 +255,107 @@ llama_kv_cache::llama_kv_cache(
            throw std::runtime_error("failed to create ggml context for kv cache");
        }

+        // TurboQuant zero-padding: for models with non-128-aligned head_dim (e.g. DeepSeek
+        // head_dim_k=192), pad each head to the next multiple of 128. The padded zeros don't
+        // affect dot products since WHT preserves inner products:
+        //   <WHT(Q_padded), WHT(K_padded)> = <Q_padded, K_padded> = <Q, K> + <0, 0> = <Q, K>
+        const uint32_t n_embd_head_k = hparams.n_embd_head_k(il);
+
+
        const bool has_k = true;
        const bool has_v = !is_mla;

-        ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
-        ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;
+        // Layer-adaptive: use higher precision for quality-sensitive layers
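+        // Config: TURBO_LAYER_ADAPTIVE env var controls the strategy (adaptive_mode is a static local: resolved once per process)
+        //   0 = uniform (default when the env var is unset, unless V is turbo2 with n_layer >= 8, which auto-selects 7)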
+        //   1 = q8_0 K+V for first+last 4 layers
+        //   2 = q8_0 K+V for last 8 layers
+        //   5 = Boundary V: first2+last2 V=turbo4, rest V=turbo2 (K unchanged)
+        //   6 = V-only: last 8 V=turbo4, rest V=turbo2 (K unchanged)
+        //   7 = Boundary V (recommended): first2+last2 V=q8_0, rest V=turbo2 (K unchanged)
+        ggml_type layer_type_k = type_k;
+        ggml_type layer_type_v = type_v;
+        {
+            static const int adaptive_mode = [&]() {
+                const char * env = getenv("TURBO_LAYER_ADAPTIVE");
+                if (env) {
+                    int mode = atoi(env);
+                    if (mode > 0) {
+                        LLAMA_LOG_INFO("llama_kv_cache: layer-adaptive mode %d enabled (env)\n", mode);
+                    }
+                    return mode;
+                }
+                // Auto-enable Boundary V (mode 7) when V is turbo2
+                if (type_v == GGML_TYPE_TURBO2_0 && hparams.n_layer >= 8) {
+                    LLAMA_LOG_INFO("llama_kv_cache: Boundary V auto-enabled for turbo2-V (opt-out: TURBO_LAYER_ADAPTIVE=0)\n");
+                    return 7;
+                }
+                return 0;
+            }();
+            const bool is_turbo = (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0 || type_k == GGML_TYPE_TURBO2_0);
+            const bool v_is_turbo = (type_v == GGML_TYPE_TURBO3_0 || type_v == GGML_TYPE_TURBO4_0 || type_v == GGML_TYPE_TURBO2_0);
+            const uint32_t n_layer = hparams.n_layer;
+            if (adaptive_mode == 1 && is_turbo && n_layer >= 8) {
+                if (il < 4 || il >= n_layer - 4) {
+                    layer_type_k = GGML_TYPE_Q8_0;
+                    layer_type_v = GGML_TYPE_Q8_0;
+                }
+            } else if (adaptive_mode == 2 && is_turbo && n_layer >= 8) {
+                if (il >= n_layer - 8) {
+                    layer_type_k = GGML_TYPE_Q8_0;
+                    layer_type_v = GGML_TYPE_Q8_0;
+                }
+            } else if (adaptive_mode == 5 && v_is_turbo && n_layer >= 8) {
+                // Boundary V (turbo4 boundaries): first2+last2 V=turbo4, rest V=turbo2
+                const bool is_boundary = (il < 2 || il >= n_layer - 2);
+                layer_type_v = is_boundary ? GGML_TYPE_TURBO4_0 : GGML_TYPE_TURBO2_0;
+                if (il == 0) {
+                    LLAMA_LOG_INFO("llama_kv_cache: Boundary V mode 5: first2+last2 V=turbo4, rest V=turbo2\n");
+                }
+            } else if (adaptive_mode == 6 && v_is_turbo && n_layer >= 8) {
+                // V-only: last 8 V=turbo4, rest V=turbo2
+                layer_type_v = (il >= n_layer - 8) ? GGML_TYPE_TURBO4_0 : GGML_TYPE_TURBO2_0;
+                if (il == 0) {
+                    LLAMA_LOG_INFO("llama_kv_cache: V-only LA mode 6: last8 V=turbo4, rest V=turbo2\n");
+                }
+            } else if (adaptive_mode == 7 && v_is_turbo && n_layer >= 8) {
+                // Boundary V (recommended): first2+last2 V=q8_0, rest V=turbo2
+                const bool is_boundary = (il < 2 || il >= n_layer - 2);
+                layer_type_v = is_boundary ? GGML_TYPE_Q8_0 : GGML_TYPE_TURBO2_0;
+                if (il == 0) {
+                    LLAMA_LOG_INFO("llama_kv_cache: Boundary V mode 7: first2+last2 V=q8_0, rest V=turbo2\n");
+                }
+            }
+        }
+        // For turbo types, pad K head_dim to next multiple of 128 for full WHT groups
+        uint32_t n_embd_k_gqa_eff = n_embd_k_gqa;
+        const bool k_is_turbo = (layer_type_k == GGML_TYPE_TURBO3_0 || layer_type_k == GGML_TYPE_TURBO4_0 || layer_type_k == GGML_TYPE_TURBO2_0);
+        if (k_is_turbo && n_embd_head_k % 128 != 0) {
+            const uint32_t padded_head_k = ((n_embd_head_k + 127) / 128) * 128;
+            const uint32_t n_head_kv = n_embd_k_gqa / n_embd_head_k;
+            n_embd_k_gqa_eff = n_head_kv * padded_head_k;
+            if (il == 0) {
+                LLAMA_LOG_INFO("%s: turbo zero-padding K head_dim %u -> %u (cache %u -> %u)\n",
+                               __func__, n_embd_head_k, padded_head_k, n_embd_k_gqa, n_embd_k_gqa_eff);
+            }
+        }
+
+        // For turbo types, pad V head_dim to next multiple of 128 if needed
+        const uint32_t n_embd_head_v = hparams.n_embd_head_v(il);
+        uint32_t n_embd_v_gqa_eff = n_embd_v_gqa;
+        const bool v_is_turbo = (layer_type_v == GGML_TYPE_TURBO3_0 || layer_type_v == GGML_TYPE_TURBO4_0 || layer_type_v == GGML_TYPE_TURBO2_0);
+        if (v_is_turbo && !is_mla && n_embd_head_v % 128 != 0) {
+            const uint32_t padded_head_v = ((n_embd_head_v + 127) / 128) * 128;
+            const uint32_t n_head_kv = n_embd_v_gqa / n_embd_head_v;
+            n_embd_v_gqa_eff = n_head_kv * padded_head_v;
+            if (il == 0) {
+                LLAMA_LOG_INFO("%s: turbo zero-padding V head_dim %u -> %u (cache %u -> %u)\n",
+                               __func__, n_embd_head_v, padded_head_v, n_embd_v_gqa, n_embd_v_gqa_eff);
+            }
+        }
+
+        ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, layer_type_k, n_embd_k_gqa_eff, kv_size, n_stream) : nullptr;
+        ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, layer_type_v, n_embd_v_gqa_eff, kv_size, n_stream) : nullptr;

        has_k && ggml_format_name(k, "cache_k_l%d", il);
        has_v && ggml_format_name(v, "cache_v_l%d", il);
@@ -217,13 +364,26 @@ llama_kv_cache::llama_kv_cache(
        std::vector<ggml_tensor *> v_stream;

        for (uint32_t s = 0; s < n_stream; ++s) {
-            k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]) : nullptr);
-            v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr);
+            k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa_eff, kv_size, k->nb[1], s*k->nb[2]) : nullptr);
+            v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa_eff, kv_size, v->nb[1], s*v->nb[2]) : nullptr);
        }

        map_layer_ids[il] = layers.size();

        layers.push_back({ il, k, v, k_stream, v_stream, });
+
+        // TurboQuant: create rotation matrix tensors (once, shared across layers)
+        if (turbo_rotation == nullptr &&
+            (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0 || type_k == GGML_TYPE_TURBO2_0)) {
+            turbo_rotation = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 128);
+            ggml_format_name(turbo_rotation, "turbo_rotation");  // forward rotation, filled with TURBO_ROTATION_R
+            turbo_rotation_inv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 128);
+            ggml_format_name(turbo_rotation_inv, "turbo_rotation_inv");  // inverse rotation, filled with TURBO_ROTATION_RT
+
+            // InnerQ: per-channel scale_inv tensor (128 floats, initialized to all 1.0)
+            turbo_innerq_scale_inv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, INNERQ_MAX_CHANNELS);
+            ggml_format_name(turbo_innerq_scale_inv, "turbo_innerq_scale_inv");
+        }
    }

    if (reuse) {
@@ -268,6 +428,28 @@ llama_kv_cache::llama_kv_cache(
        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);

        ggml_backend_buffer_clear(buf, 0);
+
+        // Fill turbo rotation matrices AFTER buffer clear (clear zeroes everything)
+        if (turbo_rotation != nullptr && turbo_rotation->buffer != nullptr && !model.hparams.no_alloc) {
+            #include "turbo-rotation-data.h"
+            // Layout: TURBO_ROTATION_R / TURBO_ROTATION_RT are row-major C arrays and are
+            // uploaded byte-for-byte (no transpose on upload).
+            // ggml_mul_mat(A, x) dots each stored row of A with x, so a row-major
+            // matrix uploaded as-is acts as A @ x (verified by test).
+            // turbo_rotation     <- R   (Q forward rotation)
+            // turbo_rotation_inv <- R^T (V inverse rotation)
+            ggml_backend_tensor_set(turbo_rotation, TURBO_ROTATION_R, 0, 128 * 128 * sizeof(float));
+            ggml_backend_tensor_set(turbo_rotation_inv, TURBO_ROTATION_RT, 0, 128 * 128 * sizeof(float));
+
+            // Initialize InnerQ scale_inv to all 1.0 (identity scaling)
+            if (turbo_innerq_scale_inv != nullptr && turbo_innerq_scale_inv->buffer != nullptr) {
+                float ones[INNERQ_MAX_CHANNELS];
+                for (int i = 0; i < INNERQ_MAX_CHANNELS; i++) ones[i] = 1.0f;
+                ggml_backend_tensor_set(turbo_innerq_scale_inv, ones, 0, INNERQ_MAX_CHANNELS * sizeof(float));
+            }
+
+            LLAMA_LOG_INFO("%s: TurboQuant rotation matrices initialized (128x128)\n", __func__);
+        }
        ctxs_bufs.emplace_back(std::move(ctx), buf);
    }

@@ -281,23 +463,62 @@ llama_kv_cache::llama_kv_cache(
                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
    }

+    // TurboQuant: master's #21038 attention rotation is OFF by default on this
+    // fork. Enable per-side via LLAMA_ATTN_ROT_K_OVERRIDE=1 and/or
+    // LLAMA_ATTN_ROT_V_OVERRIDE=1 if your specific model+KV combo benefits.
+    //
+    // Why default OFF: empirical PPL+KLD testing on 7 model families
+    // (gemma-4 26B-A4B/31B/E2B, Qwen2.5-7B, Qwen3.5-2B, Mistral-Small-24B,
+    // phi-4, on q8/turbo4 KV) showed the optimal rotation policy is highly
+    // model-and-quant specific:
+    //
+    //   • gemma-4 31B Q8 q8/turbo4: V-only rotation gives -43% PPL (huge win).
+    //   • gemma-4 26B-A4B Q8 q8/turbo4: V-only gives -3.9%.
+    //   • gemma-4 E2B Q4_K_L q8/turbo4: V-only HURTS by +6.7%.
+    //   • phi-4 Q8 q8/turbo4: V-side rotation crashes (graph hash overflow).
+    //   • Qwen2.5/3.5/Mistral: rotation effect is within standard error.
+    //
+    // No single default is correct everywhere, including within the same
+    // architecture family (gemma-4 above shows three distinct optima across
+    // three sizes). Per-arch heuristics in code would silently regress users
+    // on variants we haven't tested. Default OFF + per-side env knobs lets
+    // each user tune for their specific config; documented findings in the
+    // README guide the choice.
+    //
+    // Reported by @erazortt (TheTom/turboquant_plus#88).
+    //
+    // LLAMA_ATTN_ROT_DISABLE is retained for historical scripts. With rotation
+    // OFF by default it is redundant unless an override is also set, but it is
+    // not a no-op: it acts as a hard lock-out. Any non-zero value forces
+    // rotation off on both sides and blocks the LLAMA_ATTN_ROT_*_OVERRIDE
+    // settings below, for users who want to guarantee no rotation. Unset (or
+    // 0) leaves attn_rot_disable=false, so the overrides take effect.
    const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE");
-    const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? atoi(LLAMA_ATTN_ROT_DISABLE) : false;
-    if (attn_rot_disable) {
-        LLAMA_LOG_WARN("%s: attention rotation force disabled (LLAMA_ATTN_ROT_DISABLE)\n", __func__);
+    const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? (atoi(LLAMA_ATTN_ROT_DISABLE) != 0) : false;
+
+    // Default: rotation OFF on both sides (safe across all tested model families).
+    // Override per side via env vars below.
+    attn_rot_k = false;
+    attn_rot_v = false;
+
+    // Per-side overrides. Set LLAMA_ATTN_ROT_K_OVERRIDE=1 / LLAMA_ATTN_ROT_V_OVERRIDE=1
+    // to enable rotation. The cache type and head-dim alignment guards below
+    // still apply: rotation only takes effect on quantized types with
+    // head_dim % 64 == 0 (master's #21038 requirements).
+    const char * ROT_K_OV = getenv("LLAMA_ATTN_ROT_K_OVERRIDE");
+    if (ROT_K_OV && atoi(ROT_K_OV) != 0 && !attn_rot_disable) {
+        attn_rot_k =
+            n_embd_head_k_all > 0 &&
+            ggml_is_quantized(type_k) &&
+            hparams.n_embd_head_k() % 64 == 0;
+    }
+    const char * ROT_V_OV = getenv("LLAMA_ATTN_ROT_V_OVERRIDE");
+    if (ROT_V_OV && atoi(ROT_V_OV) != 0 && !attn_rot_disable) {
+        attn_rot_v =
+            n_embd_head_v_all > 0 &&
+            ggml_is_quantized(type_v) &&
+            hparams.n_embd_head_v() % 64 == 0;
    }
-
-    attn_rot_k =
-        !attn_rot_disable &&
-        n_embd_head_k_all > 0 &&
-        ggml_is_quantized(type_k) &&
-        hparams.n_embd_head_k() % 64 == 0;
-
-    attn_rot_v =
-        !attn_rot_disable &&
-        n_embd_head_v_all > 0 &&
-        ggml_is_quantized(type_v) &&
-        hparams.n_embd_head_v() % 64 == 0;

    LLAMA_LOG_INFO("%s: attn_rot_k = %d, n_embd_head_k_all = %d\n", __func__, attn_rot_k, n_embd_head_k_all);
    LLAMA_LOG_INFO("%s: attn_rot_v = %d, n_embd_head_k_all = %d\n", __func__, attn_rot_v, n_embd_head_v_all);
@@ -337,6 +558,20 @@ void llama_kv_cache::clear(bool data) {
        for (auto & [_, buf] : ctxs_bufs) {
            ggml_backend_buffer_clear(buf.get(), 0);
        }
+
+        // Re-initialize turbo rotation matrices after buffer clear (clear zeroes everything)
+        if (turbo_rotation != nullptr && turbo_rotation->buffer != nullptr) {
+            #include "turbo-rotation-data.h"
+            ggml_backend_tensor_set(turbo_rotation, TURBO_ROTATION_R, 0, 128 * 128 * sizeof(float));
+            ggml_backend_tensor_set(turbo_rotation_inv, TURBO_ROTATION_RT, 0, 128 * 128 * sizeof(float));
+
+            // Re-initialize InnerQ scale_inv to all 1.0
+            if (turbo_innerq_scale_inv != nullptr && turbo_innerq_scale_inv->buffer != nullptr) {
+                float ones[INNERQ_MAX_CHANNELS];
+                for (int i = 0; i < INNERQ_MAX_CHANNELS; i++) ones[i] = 1.0f;
+                ggml_backend_tensor_set(turbo_innerq_scale_inv, ones, 0, INNERQ_MAX_CHANNELS * sizeof(float));
+            }
+        }
    }
 }

@@ -1150,13 +1385,24 @@ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_k
    const uint64_t kv_size      = get_size();
    const uint64_t n_embd_k_gqa = k->ne[0];

-    assert(n_embd_k_gqa == hparams.n_embd_k_gqa(il));
+    // For turbo-padded caches, n_embd_k_gqa may be larger than hparams value
+    const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0);
+    if (k_is_turbo) {
+        assert(n_embd_k_gqa >= hparams.n_embd_k_gqa(il));
+    } else {
+        assert(n_embd_k_gqa == hparams.n_embd_k_gqa(il));
+    }
+
+    // Use padded head_dim for turbo types so the full padded data is returned
+    const uint32_t head_k = hparams.n_embd_head_k(il);
+    const uint32_t head_k_eff = (k_is_turbo && head_k % 128 != 0)
+        ? ((head_k + 127) / 128) * 128 : head_k;

    const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;

    return ggml_view_4d(ctx, k,
-            hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns,
-            ggml_row_size(k->type, hparams.n_embd_head_k(il)),
+            head_k_eff, hparams.n_head_kv(il), n_kv, ns,
+            ggml_row_size(k->type, head_k_eff),
            ggml_row_size(k->type, n_embd_k_gqa),
            ggml_row_size(k->type, n_embd_k_gqa*kv_size),
            ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
@@ -1170,27 +1416,33 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
    const uint64_t kv_size      = get_size();
    const uint64_t n_embd_v_gqa = v->ne[0];

-    // [TAG_V_CACHE_VARIABLE]
+    // [TAG_V_CACHE_VARIABLE] — for turbo-padded V, cache may be larger
    assert(n_embd_v_gqa >= hparams.n_embd_v_gqa(il));

+    // Use padded head_dim for turbo types
+    const bool v_is_turbo = (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0);
+    const uint32_t head_v = hparams.n_embd_head_v(il);
+    const uint32_t head_v_eff = (v_is_turbo && head_v % 128 != 0)
+        ? ((head_v + 127) / 128) * 128 : head_v;
+
    const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;

    if (!v_trans) {
        // note: v->nb[1] <= v->nb[2]
        return ggml_view_4d(ctx, v,
-                hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns,
-                ggml_row_size(v->type, hparams.n_embd_head_v(il)),          // v->nb[1]
-                ggml_row_size(v->type, n_embd_v_gqa),                   // v->nb[2]
-                ggml_row_size(v->type, n_embd_v_gqa*kv_size),           // v->nb[3]
+                head_v_eff, hparams.n_head_kv(il), n_kv, ns,
+                ggml_row_size(v->type, head_v_eff),                      // v->nb[1]
+                ggml_row_size(v->type, n_embd_v_gqa),                    // v->nb[2]
+                ggml_row_size(v->type, n_embd_v_gqa*kv_size),            // v->nb[3]
                ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
    }

    // note: v->nb[1] > v->nb[2]
    return ggml_view_4d(ctx, v,
-            n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns,
-            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)),  // v->nb[1]
-            ggml_row_size(v->type, kv_size),                        // v->nb[2]
-            ggml_row_size(v->type, kv_size*n_embd_v_gqa),           // v->nb[3]
+            n_kv, hparams.n_head_kv(il), head_v_eff, ns,
+            ggml_row_size(v->type, kv_size*head_v_eff),              // v->nb[1]
+            ggml_row_size(v->type, kv_size),                         // v->nb[2]
+            ggml_row_size(v->type, kv_size*n_embd_v_gqa),            // v->nb[3]
            ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
 }

@@ -1201,11 +1453,22 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm

    ggml_tensor * k = layers[ikv].k;

-    const int64_t n_embd_head = k_cur->ne[0];
+    int64_t n_embd_head = k_cur->ne[0];
    const int64_t n_head      = k_cur->ne[1];
    const int64_t n_tokens    = k_cur->ne[2];

-    const int64_t n_embd_gqa = n_embd_head*n_head;
+    // Turbo zero-padding: pad each head to next multiple of 128 before merging dims.
+    // k_cur shape here is (n_embd_head, n_head, n_tokens).
+    // ggml_pad pads ne[0] with zeros — exactly what we need per-head.
+    const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0);
+    const bool k_needs_pad = k_is_turbo && (n_embd_head % 128 != 0);
+    if (k_needs_pad) {
+        const int64_t pad_amount = ((n_embd_head + 127) / 128) * 128 - n_embd_head;
+        k_cur = ggml_pad(ctx, k_cur, pad_amount, 0, 0, 0);
+        n_embd_head = k_cur->ne[0];  // now 128-aligned
+    }
+
+    int64_t n_embd_gqa = n_embd_head * n_head;

    // we can merge dims 0 and 1
    // TODO: add ggml helper function for this?
@@ -1226,7 +1489,16 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm
    }

    // store the current K values into the cache
-    return ggml_set_rows(ctx, k, k_cur, k_idxs);
+    ggml_tensor * result = ggml_set_rows(ctx, k, k_cur, k_idxs);
+
+    // For turbo: store WHT group size in op_params so the CUDA kernel knows.
+    // With zero-padding, all groups are always full 128-element WHT groups.
+    if (k_is_turbo) {
+        int32_t wht_group = 128;  // always 128 with padding
+        memcpy(result->op_params, &wht_group, sizeof(int32_t));
+    }
+
+    return result;
 }

 ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
@@ -1236,11 +1508,20 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm

    auto * v = layers[ikv].v;

-    const int64_t n_embd_head = v_cur->ne[0];
+    int64_t n_embd_head = v_cur->ne[0];
    const int64_t n_head      = v_cur->ne[1];
    const int64_t n_tokens    = v_cur->ne[2];

-    const int64_t n_embd_gqa = n_embd_head*n_head;
+    // Turbo zero-padding: pad V head_dim to next multiple of 128
+    const bool v_is_turbo = (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0);
+    const bool v_needs_pad = v_is_turbo && (n_embd_head % 128 != 0);
+    if (v_needs_pad) {
+        const int64_t pad_amount = ((n_embd_head + 127) / 128) * 128 - n_embd_head;
+        v_cur = ggml_pad(ctx, v_cur, pad_amount, 0, 0, 0);
+        n_embd_head = v_cur->ne[0];  // now 128-aligned
+    }
+
+    int64_t n_embd_gqa = n_embd_head * n_head;

    // we can merge dims 0 and 1
    GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);
@@ -1261,7 +1542,13 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
            v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
        }

-        return ggml_set_rows(ctx, v, v_cur, v_idxs);
+        ggml_tensor * result = ggml_set_rows(ctx, v, v_cur, v_idxs);
+        // With zero-padding, all groups are always full 128-element WHT groups
+        if (v_is_turbo) {
+            int32_t wht_group = 128;  // always 128 with padding
+            memcpy(result->op_params, &wht_group, sizeof(int32_t));
+        }
+        return result;
    }

    if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
@@ -1315,14 +1602,23 @@ ggml_tensor * llama_kv_cache::build_input_k_rot(ggml_context * ctx) const {
    ggml_tensor * res = nullptr;

    if (attn_rot_k) {
-        int nrot = 64;
-
-        // TODO: investigate if using the smallest rotation matrix is beneficial also for K (similar as for V)
+        // EXPERIMENT (master TODO): force smallest rotation matrix (nrot=64)
+        // for K, mirroring V's choice. Master defaults to the largest power-of-2
+        // that divides head_dim, but the upstream comment hypothesizes smaller
+        // tiles preserve more local structure → less PPL hit on sensitive models
+        // (gemma-4 26B-A4B reportedly regresses with the largest tile).
        // ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088
-        do {
-            nrot *= 2;
-        } while (n_embd_head_k_all % nrot == 0);
-        nrot /= 2;
+        const char * LLAMA_ATTN_ROT_K_NROT = getenv("LLAMA_ATTN_ROT_K_NROT");
+        int nrot = LLAMA_ATTN_ROT_K_NROT ? atoi(LLAMA_ATTN_ROT_K_NROT) : 64;
+
+        // Original master behavior (largest power-of-2): set LLAMA_ATTN_ROT_K_NROT=0
+        if (nrot == 0) {
+            nrot = 64;
+            do {
+                nrot *= 2;
+            } while (n_embd_head_k_all % nrot == 0);
+            nrot /= 2;
+        }

        res = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot);
        ggml_set_input(res);
@@ -1980,10 +2276,11 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
    for (const auto & layer : layers) {
        const uint32_t il = layer.il;

-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-
        auto * k = layer.k_stream[cr.strm];

+        // Use actual tensor width (may be padded for turbo types: e.g. 576→640)
+        const uint32_t n_embd_k_gqa = (uint32_t) k->ne[0];
+
        // Write key type
        const int32_t k_type_i = (int32_t) k->type;
        io.write(&k_type_i, sizeof(k_type_i));
@@ -2004,13 +2301,14 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
        for (const auto & layer : layers) {
            const uint32_t il = layer.il;

-            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
            auto * v = layer.v_stream[cr.strm];
            if (!v) {
                continue;
            }

+            // Use actual tensor width (may be padded for turbo types)
+            const uint32_t n_embd_v_gqa = (uint32_t) v->ne[0];
+
            // Write value type
            const int32_t v_type_i = (int32_t) v->type;
            io.write(&v_type_i, sizeof(v_type_i));
@@ -2212,10 +2510,11 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
    for (const auto & layer : layers) {
        const uint32_t il = layer.il;

-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-
        auto * k = layer.k_stream[strm];

+        // Use actual tensor width (may be padded for turbo types)
+        const uint32_t n_embd_k_gqa = (uint32_t) k->ne[0];
+
        // Read type of key
        int32_t k_type_i_ref;
        io.read(&k_type_i_ref, sizeof(k_type_i_ref));
@@ -2252,13 +2551,14 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
        for (const auto & layer : layers) {
            const uint32_t il = layer.il;

-            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
            auto * v = layer.v_stream[strm];
            if (!v) {
                continue;
            }

+            // Use actual tensor width (may be padded for turbo types)
+            const uint32_t n_embd_v_gqa = (uint32_t) v->ne[0];
+
            // Read type of value
            int32_t v_type_i_ref;
            io.read(&v_type_i_ref, sizeof(v_type_i_ref));
@@ -2416,6 +2716,16 @@ bool llama_kv_cache_context::apply() {
    kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
    n_kv = kv->get_n_kv(sinfos[i_cur]);

+    // InnerQ: check if CUDA calibration finalized and tensor needs update
+    if (kv->get_turbo_innerq_scale_inv() != nullptr && turbo_innerq_needs_tensor_update()) {
+        ggml_tensor * t = kv->get_turbo_innerq_scale_inv();
+        if (t->buffer != nullptr) {
+            ggml_backend_tensor_set(t, g_innerq_scale_inv_host, 0, INNERQ_MAX_CHANNELS * sizeof(float));
+            turbo_innerq_mark_tensor_updated();
+            LLAMA_LOG_INFO("%s: InnerQ scale_inv tensor updated\n", __func__);
+        }
+    }
+
    return true;
 }

@@ -2449,6 +2759,26 @@ ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) cons
    return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
 }

+ggml_tensor * llama_kv_cache_context::get_turbo_rotation() const {
+    return kv->get_turbo_rotation();
+}
+
+ggml_tensor * llama_kv_cache_context::get_turbo_rotation_inv() const {
+    return kv->get_turbo_rotation_inv();
+}
+
+ggml_tensor * llama_kv_cache_context::get_turbo_rot_forward() const {
+    return kv->get_turbo_rotation();
+}
+
+ggml_tensor * llama_kv_cache_context::get_turbo_rot_inverse() const {
+    return kv->get_turbo_rotation_inv();
+}
+
+ggml_tensor * llama_kv_cache_context::get_turbo_innerq_scale_inv() const {
+    return kv->get_turbo_innerq_scale_inv();
+}
+
 ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
    return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
 }
--- a/Show More
+++ b/Show More