cont : remove /api/tags

server : remove /api endpoints
2026-04-20 15:45:42 +03:00 · 2026-04-20 15:34:18 +03:00
162 changed files with 4377 additions and 11928 deletions
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
+ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04

 ## Build Image

@@ -2,19 +2,7 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
 ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
 ARG UBUNTU_VERSION=24.04

-# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
-ARG IGC_VERSION=v2.30.1
-ARG IGC_VERSION_FULL=2_2.30.1+20950
-ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
-ARG IGDGMM_VERSION=22.9.0
-
-# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
-ARG NPU_DRIVER_VERSION=v1.32.0
-ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
-ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
-
-# Optional proxy build arguments
+# Optional proxy build arguments - empty by default
 ARG http_proxy=
 ARG https_proxy=

@@ -90,47 +78,13 @@ ARG http_proxy
 ARG https_proxy

 RUN apt-get update \
-    && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
+    && apt-get install -y libgomp1 libtbb12 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

-# Install GPU drivers
-ARG IGC_VERSION
-ARG IGC_VERSION_FULL
-ARG COMPUTE_RUNTIME_VERSION
-ARG COMPUTE_RUNTIME_VERSION_FULL
-ARG IGDGMM_VERSION
-RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
-    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && dpkg --install *.deb \
-    && rm -rf /tmp/neo/
-
-# Install NPU drivers
-ARG NPU_DRIVER_VERSION
-ARG NPU_DRIVER_FULL
-ARG LIBZE1_VERSION
-RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
-    && wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
-    && tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
-    && dpkg --install *.deb \
-    && rm -rf /tmp/npu/
-
-RUN cd /tmp \
-    && wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
-    && dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
-    && rm libze1_${LIBZE1_VERSION}_amd64.deb
-
 COPY --from=build /app/lib/ /app/

 ### Full (all binaries)
@@ -1,113 +0,0 @@
-name: CI (snapdragon)
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - master
-    paths:
-      - '.github/workflows/build-and-test-snapdragon.yml'
-      - 'ggml/include/ggml-hexagon.h'
-      - 'ggml/src/ggml-hexagon/**'
-      - 'docs/backend/snapdragon/**'
-      - 'scripts/snapdragon/**'
-      - 'CMakePresets.json'
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths:
-      - '.github/workflows/build-and-test-snapdragon.yml'
-      - 'ggml/include/ggml-hexagon.h'
-      - 'ggml/src/ggml-hexagon/**'
-      - 'docs/backend/snapdragon/**'
-      - 'scripts/snapdragon/**'
-      - 'CMakePresets.json'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  android-ndk-snapdragon:
-    runs-on: ubuntu-latest
-    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Build Llama.CPP for Snapdragon Android
-        id: build_llama_cpp_snapdragon_android
-        run: |
-          cp docs/backend/snapdragon/CMakeUserPresets.json .
-          cmake --preset arm64-android-snapdragon-release -B build
-          cmake --build build
-          cmake --install build --prefix pkg-adb/llama.cpp
-
-      - name: Upload Llama.CPP Snapdragon Android Build Artifact
-        if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-cpp-android-arm64-snapdragon
-          path: pkg-adb/llama.cpp
-
-  check-secret:
-    runs-on: ubuntu-latest
-    outputs:
-      has-key: ${{ steps.check.outputs.has-key }}
-    steps:
-      - id: check
-        run: echo "has-key=${{ secrets.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
-
-  test-snapdragon-qdc:
-    name: Test on QDC Android Device (${{ matrix.device }})
-    needs: [android-ndk-snapdragon, check-secret]
-    if: needs.check-secret.outputs.has-key == 'true'
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        device: [SM8750, SM8650, SM8850]
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-
-      - name: Download build artifact
-        uses: actions/download-artifact@v4
-        with:
-          name: llama-cpp-android-arm64-snapdragon
-          path: pkg-snapdragon/
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.x'
-          cache: pip
-
-      - name: Install QDC SDK wheel
-        run: |
-          curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
-          unzip qdc_sdk.zip -d qdc_sdk
-          pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
-
-      - name: Run QDC tests (${{ matrix.device }})
-        run: |
-          python scripts/snapdragon/qdc/run_qdc_jobs.py \
-              --test       all \
-              --pkg-dir    pkg-snapdragon/llama.cpp \
-              --model-url  "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
-              --device     ${{ matrix.device }}
-        env:
-          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
-
-      - name: Cleanup
-        if: always()
-        run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip
@@ -1,24 +1,26 @@
 name: CI (android)

 on:
-  workflow_dispatch:
+  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
-    paths:
-      - '.github/workflows/build-android.yml'
-      - '**/CMakeLists.txt'
-      - '**/.cmake'
-      - '**/*.h'
-      - '**/*.hpp'
-      - '**/*.c'
-      - '**/*.cpp'
+    paths: [
+      '.github/workflows/build-android.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]

  pull_request:
    types: [opened, synchronize, reopened]
-    paths:
-      - '.github/workflows/build-android.yml'
-      - 'examples/llama.android/**'
+    paths: [
+      '.github/workflows/build-android.yml',
+      'examples/llama.android/**'
+    ]

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -65,24 +67,35 @@ jobs:
    defaults:
      run:
        shell: bash
+    strategy:
+      matrix:
+        include:
+          - build: 'arm64-cpu'
+            defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
+          - build: 'arm64-snapdragon'
+            defines: '--preset arm64-android-snapdragon-release'

    steps:
      - name: Clone
+        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          lfs: false

-      - name: Build
-        id: ndk_build
+      - name: Build Llama.CPP for Hexagon Android
+        id: build_llama_cpp_hexagon_android
        run: |
-          cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
+          if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
+            cp docs/backend/snapdragon/CMakeUserPresets.json .
+          fi
+          cmake ${{ matrix.defines }} -B build
          cmake --build build
          cmake --install build --prefix pkg-adb/llama.cpp

-      - name: Upload Android Build Artifact
-        if: ${{ always() && steps.ndk_build.outcome == 'success' }}
+      - name: Upload Llama.CPP Hexagon Android Build Artifact
+        if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
        uses: actions/upload-artifact@v6
        with:
-          name: llama-cpp-android-arm64-cpu
+          name: llama-cpp-android-${{ matrix.build }}
          path: pkg-adb/llama.cpp
@@ -1,120 +0,0 @@
-name: CI (openvino)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-openvino.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-openvino.yml',
-      'ggml/src/ggml-openvino/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-24-openvino:
-    name: ubuntu-24-openvino-${{ matrix.openvino_device }}
-
-    concurrency:
-      group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
-      cancel-in-progress: false
-
-    strategy:
-      matrix:
-        include:
-          - variant: cpu
-            runner: '"ubuntu-24.04"'
-            openvino_device: "CPU"
-          - variant: gpu
-            runner: '["self-hosted","Linux","Intel","OpenVINO"]'
-            openvino_device: "GPU"
-
-    runs-on: ${{ fromJSON(matrix.runner) }}
-
-    env:
-      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.0"
-      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        if: runner.environment == 'github-hosted'
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
-          sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
-
-      - name: Use OpenVINO Toolkit Cache
-        if: runner.environment == 'github-hosted'
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-      - name: Install OpenVINO dependencies
-        run: |
-          cd ./openvino_toolkit
-          chmod +x ./install_dependencies/install_openvino_dependencies.sh
-          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source ./openvino_toolkit/setupvars.sh
-          cmake -B build/ReleaseOV -G Ninja \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENVINO=ON
-          time cmake --build build/ReleaseOV --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        # TODO: fix and re-enable the `test-llama-archs` test below
-        run: |
-          cd ${{ github.workspace }}
-          if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
-            export GGML_OPENVINO_DEVICE=GPU
-          fi
-          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
@@ -265,10 +265,6 @@ jobs:
  ggml-ci-intel-openvino-gpu-low-perf:
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

-    concurrency:
-      group: openvino-gpu-${{ github.head_ref || github.ref }}
-      cancel-in-progress: false
-
    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
      OPENVINO_VERSION_MAJOR: "2026.0"
@@ -1,142 +0,0 @@
-name: CI (sycl)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-sycl.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-sycl.yml',
-      'ggml/src/ggml-sycl/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-
-  ubuntu-24-sycl:
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-  windows-latest-sycl:
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: windows-latest-sycl
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
@@ -555,6 +555,186 @@ jobs:
            -DGGML_MUSA=ON
          time cmake --build build --config Release -j $(nproc)

+  ubuntu-22-sycl:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-22-sycl
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx
+          time cmake --build build --config Release -j $(nproc)
+
+  ubuntu-22-sycl-fp16:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-22-sycl-fp16
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DGGML_SYCL_F16=ON
+          time cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-openvino:
+      name: ubuntu-24-openvino-${{ matrix.openvino_device }}
+      strategy:
+        matrix:
+          include:
+            - variant: cpu
+              runner: '"ubuntu-24.04"'
+              openvino_device: "CPU"
+            - variant: gpu
+              runner: '["self-hosted","Linux","X64","Intel"]'
+              openvino_device: "GPU"
+
+      runs-on: ${{ fromJSON(matrix.runner) }}
+
+      env:
+        # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+        OPENVINO_VERSION_MAJOR: "2026.0"
+        OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+
+      steps:
+        - name: Clone
+          id: checkout
+          uses: actions/checkout@v6
+
+        - name: ccache
+          if: runner.environment == 'github-hosted'
+          uses: ggml-org/ccache-action@v1.2.21
+          with:
+            key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
+            evict-old-files: 1d
+            save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+        - name: Dependencies
+          id: depends
+          run: |
+            sudo apt-get update
+            sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
+            sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
+
+        - name: Use OpenVINO Toolkit Cache
+          if: runner.environment == 'github-hosted'
+          uses: actions/cache@v5
+          id: cache-openvino
+          with:
+            path: ./openvino_toolkit
+            key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+        - name: Setup OpenVINO Toolkit
+          if: steps.cache-openvino.outputs.cache-hit != 'true'
+          uses: ./.github/actions/linux-setup-openvino
+          with:
+            path: ./openvino_toolkit
+            version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+            version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
+        - name: Install OpenVINO dependencies
+          run: |
+            cd ./openvino_toolkit
+            chmod +x ./install_dependencies/install_openvino_dependencies.sh
+            echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
+
+        - name: Build
+          id: cmake_build
+          run: |
+            source ./openvino_toolkit/setupvars.sh
+            cmake -B build/ReleaseOV -G Ninja \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DGGML_OPENVINO=ON
+            time cmake --build build/ReleaseOV --config Release -j $(nproc)
+
+        - name: Test
+          id: cmake_test
+          # TODO: fix and re-enable the `test-llama-archs` test below
+          run: |
+            cd ${{ github.workspace }}
+            if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
+              export GGML_OPENVINO_DEVICE=GPU
+            fi
+            ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000

  windows-latest:
    runs-on: windows-2025
@@ -763,6 +943,39 @@ jobs:
          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
          cmake --build build --config Release

+  windows-latest-sycl:
+    runs-on: windows-2022
+
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: windows-latest-sycl
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Install
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+
+      - name: Build
+        id: cmake_build
+        run:  examples/sycl/win-build-sycl.bat

  windows-latest-hip:
    runs-on: windows-2022
@@ -598,29 +598,15 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -628,6 +614,10 @@ jobs:
          variant: ccache
          evict-old-files: 1d

+      - name: Install
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
      - name: Build
        id: cmake_build
        shell: cmd
@@ -680,82 +670,6 @@ jobs:
          path: llama-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip

-  ubuntu-24-sycl:
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-
  ubuntu-22-rocm:
    runs-on: ubuntu-22.04

@@ -1131,7 +1045,6 @@ jobs:
      - ubuntu-cpu
      - ubuntu-vulkan
      - ubuntu-24-openvino
-      - ubuntu-24-sycl
      - android-arm64
      - macOS-cpu
      - ios-xcode-build
@@ -1220,8 +1133,6 @@ jobs:
            - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
            - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)

            **Android:**
            - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
@@ -145,5 +145,3 @@ poetry.toml
 /.windsurf/
 # emscripten
 a.out.*
-
-AGENTS.local.md
@@ -23,7 +23,6 @@
 /ci/                                    @ggerganov
 /cmake/                                 @ggerganov
 /common/                                @ggml-org/llama-common
-/common/fit.*                           @JohannesGaessler
 /common/jinja/                          @CISC
 /common/ngram-map.*                     @srogmann
 /convert_*.py                           @CISC
@@ -73,8 +73,6 @@ add_library(${TARGET}
    debug.h
    download.cpp
    download.h
-    fit.cpp
-    fit.h
    hf-cache.cpp
    hf-cache.h
    http.h
@@ -2426,20 +2426,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_env("LLAMA_ARG_FIT"));
-    add_opt(common_arg(
-        { "-fitp", "--fit-print" }, "[on|off]",
-        string_format("print the estimated required memory ('on' or 'off', default: '%s')", params.fit_params_print ? "on" : "off"),
-        [](common_params & params, const std::string & value) {
-            if (is_truthy(value)) {
-                params.fit_params_print = true;
-            } else if (is_falsey(value)) {
-                params.fit_params_print = false;
-            } else {
-                throw std::runtime_error(
-                    string_format("error: unknown value for --fit-print: '%s'\n", value.c_str()));
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_FIT_PARAMS}).set_env("LLAMA_ARG_FIT_ESTIMATE"));
    add_opt(common_arg(
        { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
        string_format("target margin per device for --fit, comma-separated list of values, "
@@ -3122,14 +3108,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
        [](common_params & params, int value) {
            if (value < -1) { throw std::invalid_argument("invalid value"); }
-            params.sampling.reasoning_budget_tokens = value;
+            params.reasoning_budget = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
    add_opt(common_arg(
        {"--reasoning-budget-message"}, "MESSAGE",
        "message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
        [](common_params & params, const std::string & value) {
-            params.sampling.reasoning_budget_message = value;
+            params.reasoning_budget_message = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
    add_opt(common_arg(
@@ -3902,17 +3888,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

-    add_opt(common_arg(
-        {"--spec-default"},
-        string_format("enable default speculative decoding config"),
-        [](common_params & params) {
-            params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
-            params.speculative.ngram_size_n = 24;
-            params.speculative.n_min = 48;
-            params.speculative.n_max = 64;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
-
    return ctx_arg;
 }

@@ -397,25 +397,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
    return render_message_to_json(msgs, c);
 }

-json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
-    if (tools.empty()) {
-        return json();
-    }
-
-    auto result = json::array();
-    for (const auto & tool : tools) {
-        result.push_back({
-            { "type",     "function" },
-            { "function", {
-                { "name", tool.name },
-                { "description", tool.description },
-                { "parameters", json::parse(tool.parameters) },
-            }},
-        });
-    }
-    return result;
-}
-
 std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
    std::vector<common_chat_tool> result;

@@ -451,6 +432,56 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
    return result;
 }

+json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
+    if (tools.empty()) {
+        return json();
+    }
+
+    auto result = json::array();
+    for (const auto & tool : tools) {
+        result.push_back({
+            { "type",     "function" },
+            { "function",
+             {
+                  { "name", tool.name },
+                  { "description", tool.description },
+                  { "parameters", json::parse(tool.parameters) },
+              }                      },
+        });
+    }
+    return result;
+}
+
+json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
+    json delta = json::object();
+    if (!diff.reasoning_content_delta.empty()) {
+        delta["reasoning_content"] = diff.reasoning_content_delta;
+    }
+    if (!diff.content_delta.empty()) {
+        delta["content"] = diff.content_delta;
+    }
+    if (diff.tool_call_index != std::string::npos) {
+        json tool_call;
+        tool_call["index"] = diff.tool_call_index;
+        if (!diff.tool_call_delta.id.empty()) {
+            tool_call["id"]   = diff.tool_call_delta.id;
+            tool_call["type"] = "function";
+        }
+        if (!diff.tool_call_delta.name.empty() || !diff.tool_call_delta.arguments.empty()) {
+            json function = json::object();
+            if (!diff.tool_call_delta.name.empty()) {
+                function["name"] = diff.tool_call_delta.name;
+            }
+            if (!diff.tool_call_delta.arguments.empty()) {
+                function["arguments"] = diff.tool_call_delta.arguments;
+            }
+            tool_call["function"] = function;
+        }
+        delta["tool_calls"] = json::array({ tool_call });
+    }
+    return delta;
+}
+
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
    if (use_jinja) {
        try {
@@ -544,26 +575,6 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
    return tmpls->has_explicit_template;
 }

-// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
-// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
-static bool is_lfm2_template(const std::string & src) {
-    return src.find("<|tool_list_start|>") != std::string::npos &&
-           src.find("<|tool_list_end|>")   != std::string::npos;
-}
-
-common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
-    common_chat_prompt_preset asr_preset;
-    asr_preset.system = "";
-    asr_preset.user   = "Transcribe audio to text";
-
-    if (chat_templates && chat_templates->template_default && is_lfm2_template(chat_templates->template_default->source())) {
-        asr_preset.system = "Perform ASR.";
-        asr_preset.user   = "";
-    }
-
-    return asr_preset;
-}
-
 std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
    if (!variant.empty()) {
        if (variant == "tool_use") {
@@ -2073,7 +2084,10 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        return common_chat_params_init_kimi_k2(tmpl, params);
    }

-    if (is_lfm2_template(src)) {
+    // LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
+    // and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
+    if (src.find("<|tool_list_start|>") != std::string::npos &&
+        src.find("<|tool_list_end|>") != std::string::npos) {
        LOG_DBG("Using specialized template: LFM2\n");
        return common_chat_params_init_lfm2(tmpl, params);
    }
@@ -2382,3 +2396,4 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
    GGML_ASSERT(chat_templates->template_default != nullptr);
    return chat_templates->template_default->caps.to_map();
 }
+
@@ -256,13 +256,14 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);

-std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
-
 // DEPRECATED: only used in tests
 nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);

+std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
 nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);

+nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
+
 // get template caps, useful for reporting to server /props endpoint
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);

@@ -274,11 +275,3 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        const common_chat_template &          tmpl,
        const std::string &                   src,
        autoparser::generation_params & params);
-
-// specialized per-task preset
-struct common_chat_prompt_preset {
-    std::string system;
-    std::string user;
-};
-
-common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
@@ -3,7 +3,6 @@

 #include "build-info.h"
 #include "common.h"
-#include "fit.h"
 #include "log.h"
 #include "llama.h"
 #include "sampling.h"
@@ -1148,7 +1147,7 @@ common_init_result::common_init_result(common_params & params) :

    if (params.fit_params) {
        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
-        common_fit_params(params.model.path.c_str(), &mparams, &cparams,
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split,
            params.tensor_buft_overrides.data(),
            params.fit_params_target.data(),
@@ -274,7 +274,6 @@ struct common_params_sampling {
    std::vector<llama_token> reasoning_budget_start;           // start tag token sequence
    std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
    std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
-    std::string              reasoning_budget_message;         // message injected before end tag when budget exhausted

    bool backend_sampling = false;

@@ -421,12 +420,11 @@ struct common_params {
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    int32_t n_gpu_layers       = -1;    // number of layers to store in VRAM, -1 is auto, <= -2 is all
-    int32_t main_gpu           = 0;     // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]  = {0};   // how split tensors should be distributed across GPUs
-    bool    fit_params         = true;  // whether to fit unset model/context parameters to free device memory
-    bool    fit_params_print   = false; // print the estimated required memory to run the model
-    int32_t fit_params_min_ctx = 4096;  // minimum context size to set when trying to reduce memory use
+    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
+    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

    // margin per device in bytes for fitting parameters to free memory:
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
@@ -582,6 +580,8 @@ struct common_params {
    bool force_pure_content_parser = false;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
+    int reasoning_budget = -1;
+    std::string reasoning_budget_message; // message injected before end tag when budget exhausted
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time

@@ -746,11 +746,6 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
           str.compare(0, prefix.size(), prefix) == 0;
 }

-// remove when moving to c++20
-inline bool string_starts_with(std::string_view str, char prefix) {
-    return !str.empty() && str.front() == prefix;
-}
-
 // remove when moving to c++20
 inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    return str.size() >= suffix.size() &&
@@ -1,951 +0,0 @@
-#include "fit.h"
-
-#include "log.h"
-
-#include "../src/llama-ext.h"
-
-#include <array>
-#include <cassert>
-#include <stdexcept>
-#include <cinttypes>
-#include <set>
-#include <string>
-#include <vector>
-
-// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
-// enum to identify part of a layer for distributing its tensors:
-enum common_layer_fraction_t {
-    LAYER_FRACTION_NONE = 0, // nothing
-    LAYER_FRACTION_ATTN = 1, // attention
-    LAYER_FRACTION_UP   = 2, // attention + up
-    LAYER_FRACTION_GATE = 3, // attention + up + gate
-    LAYER_FRACTION_MOE  = 4, // everything but sparse MoE weights
-};
-
-class common_params_fit_exception : public std::runtime_error {
-    using std::runtime_error::runtime_error;
-};
-
-static std::vector<llama_device_memory_data> common_get_device_memory_data(
-        const char * path_model,
-        const llama_model_params * mparams,
-        const llama_context_params * cparams,
-        std::vector<ggml_backend_dev_t> & devs,
-        uint32_t & hp_ngl,
-        uint32_t & hp_n_ctx_train,
-        uint32_t & hp_n_expert,
-        ggml_log_level log_level) {
-    struct user_data_t {
-        struct {
-            ggml_log_callback callback;
-            void * user_data;
-        } original_logger;
-        ggml_log_level min_level; // prints below this log level go to debug log
-    };
-    user_data_t ud;
-    llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
-    ud.min_level = log_level;
-
-    llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
-        const user_data_t * ud = (const user_data_t *) user_data;
-        const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
-        ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
-    }, &ud);
-
-    llama_model_params mparams_copy = *mparams;
-    mparams_copy.no_alloc  = true;
-    mparams_copy.use_mmap  = false;
-    mparams_copy.use_mlock = false;
-
-    llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
-    if (model == nullptr) {
-        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
-        throw std::runtime_error("failed to load model");
-    }
-
-    llama_context * ctx = llama_init_from_model(model, *cparams);
-    if (ctx == nullptr) {
-        llama_model_free(model);
-        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
-        throw std::runtime_error("failed to create llama_context from model");
-    }
-
-    const size_t nd = llama_model_n_devices(model);
-    std::vector<llama_device_memory_data> ret(nd + 1);
-
-    llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
-
-    for (const auto & [buft, mb] : memory_breakdown) {
-        if (ggml_backend_buft_is_host(buft)) {
-            ret.back().mb.model   += mb.model;
-            ret.back().mb.context += mb.context;
-            ret.back().mb.compute += mb.compute;
-            continue;
-        }
-
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
-        if (!dev) {
-            continue;
-        }
-        for (size_t i = 0; i < nd; i++) {
-            if (dev == llama_model_get_device(model, i)) {
-                ret[i].mb.model   += mb.model;
-                ret[i].mb.context += mb.context;
-                ret[i].mb.compute += mb.compute;
-                break;
-            }
-        }
-    }
-
-    {
-        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-        if (cpu_dev == nullptr) {
-            throw std::runtime_error("no CPU backend found");
-        }
-        size_t free;
-        size_t total;
-        ggml_backend_dev_memory(cpu_dev, &free, &total);
-        ret.back().free  = free;
-        ret.back().total = total;
-    }
-    for (size_t i = 0; i < nd; i++) {
-        size_t free;
-        size_t total;
-        ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
-
-        // devices can return 0 bytes for free and total memory if they do not
-        // have any to report. in this case, we will use the host memory as a fallback
-        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
-        if (free == 0 && total == 0) {
-            free  = ret.back().free;
-            total = ret.back().total;
-        }
-        ret[i].free  = free;
-        ret[i].total = total;
-    }
-
-    devs.clear();
-    for (int i = 0; i < llama_model_n_devices(model); i++) {
-        devs.push_back(llama_model_get_device(model, i));
-    }
-
-    hp_ngl         = llama_model_n_layer(model);
-    hp_n_ctx_train = llama_model_n_ctx_train(model);
-    hp_n_expert    = llama_model_n_expert(model);
-
-    common_memory_breakdown_print(ctx);
-
-    llama_free(ctx);
-    llama_model_free(model);
-    llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
-
-    return ret;
-}
-
-static void common_params_fit_impl(
-        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
-        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
-    if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
-        throw common_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
-    }
-    constexpr int64_t MiB = 1024*1024;
-    typedef std::vector<llama_device_memory_data> dmds_t;
-    const llama_model_params default_mparams = llama_model_default_params();
-
-    std::vector<ggml_backend_dev_t> devs;
-    uint32_t hp_ngl = 0; // hparams.n_gpu_layers
-    uint32_t hp_nct = 0; // hparams.n_ctx_train
-    uint32_t hp_nex = 0; // hparams.n_expert
-
-    // step 1: get data for default parameters and check whether any changes are necessary in the first place
-
-    LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
-    const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-    const size_t nd = devs.size(); // number of devices
-
-    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
-    margins.reserve(nd);
-    if (nd == 0) {
-        margins.push_back(margins_s[0]);
-    } else {
-        for (size_t id = 0; id < nd; id++) {
-            margins.push_back(margins_s[id]);
-        }
-    }
-
-    std::vector<std::string> dev_names;
-    {
-        dev_names.reserve(nd);
-        size_t max_length = 0;
-        for (const auto & dev : devs) {
-            std::string name = ggml_backend_dev_name(dev);
-            name += " (";
-            name += ggml_backend_dev_description(dev);
-            name += ")";
-            dev_names.push_back(name);
-            max_length = std::max(max_length, name.length());
-        }
-        for (std::string & dn : dev_names) {
-            dn.insert(dn.end(), max_length - dn.length(), ' ');
-        }
-    }
-
-    int64_t sum_free            = 0;
-    int64_t sum_projected_free  = 0;
-    int64_t sum_projected_used  = 0;
-    int64_t sum_projected_model = 0;
-    std::vector<int64_t> projected_free_per_device;
-    projected_free_per_device.reserve(nd);
-
-    if (nd == 0) {
-        sum_projected_used = dmds_full.back().mb.total();
-        sum_free           = dmds_full.back().total;
-        sum_projected_free = sum_free - sum_projected_used;
-        LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
-            __func__, sum_projected_used/MiB, sum_free/MiB);
-        if (sum_projected_free >= margins[0]) {
-            LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
-                __func__, sum_projected_free/MiB, margins[0]/MiB);
-            return;
-        }
-    } else {
-        if (nd > 1) {
-            LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
-        }
-        for (size_t id = 0; id < nd; id++) {
-            const llama_device_memory_data & dmd = dmds_full[id];
-
-            const int64_t projected_used = dmd.mb.total();
-            const int64_t projected_free = dmd.free - projected_used;
-            projected_free_per_device.push_back(projected_free);
-
-            sum_free            += dmd.free;
-            sum_projected_used  += projected_used;
-            sum_projected_free  += projected_free;
-            sum_projected_model += dmd.mb.model;
-
-            if (nd > 1) {
-                LOG_INF("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
-                    __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
-            }
-        }
-        assert(sum_free >= 0 && sum_projected_used >= 0);
-        LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-            __func__, sum_projected_used/MiB, sum_free/MiB);
-        if (nd == 1) {
-            if (projected_free_per_device[0] >= margins[0]) {
-                LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                    __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
-                return;
-            }
-        } else {
-            bool changes_needed = false;
-            for (size_t id = 0; id < nd; id++) {
-                if (projected_free_per_device[id] < margins[id]) {
-                    changes_needed = true;
-                    break;
-                }
-            }
-            if (!changes_needed) {
-                LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
-                return;
-            }
-        }
-    }
-
-    // step 2: try reducing memory use by reducing the context size
-
-    {
-        int64_t global_surplus = sum_projected_free;
-        if (nd == 0) {
-            global_surplus -= margins[0];
-        } else {
-            for (size_t id = 0; id < nd; id++) {
-                global_surplus -= margins[id];
-            }
-        }
-        if (global_surplus < 0) {
-            if (nd <= 1) {
-                LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
-                    __func__, margins[0]/MiB, -global_surplus/MiB);
-            } else {
-                LOG_INF(
-                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
-                    __func__, -global_surplus/MiB);
-            }
-            if (cparams->n_ctx == 0) {
-                if (hp_nct > n_ctx_min) {
-                    int64_t sum_used_target = sum_free;
-                    if (nd == 0) {
-                        sum_used_target -= margins[0];
-                    } else {
-                        for (size_t id = 0; id < nd; id++) {
-                            sum_used_target -= margins[id];
-                        }
-                    }
-                    if (nd > 1) {
-                        // for multiple devices we need to be more conservative in terms of how much context we think can fit:
-                        //   - for dense models only whole layers can be assigned to devices
-                        //   - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
-                        //   - on average we expect a waste of 0.5 layers/tensors per device
-                        //   - use slightly more than the expected average for nd devices to be safe
-                        const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
-                        sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
-                    }
-
-                    int64_t sum_projected_used_min_ctx = 0;
-                    cparams->n_ctx = n_ctx_min;
-                    const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-                    if (nd == 0) {
-                        sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
-                    } else {
-                        for (size_t id = 0; id < nd; id++) {
-                            sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
-                        }
-                    }
-                    if (sum_used_target > sum_projected_used_min_ctx) {
-                        // linear interpolation between minimum and maximum context size:
-                        cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
-                            / (sum_projected_used - sum_projected_used_min_ctx);
-                        cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
-
-                        const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
-                        const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
-                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
-                        if (nd <= 1) {
-                            LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
-                            return;
-                        }
-                        LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
-                    } else {
-                        const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
-                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
-                    }
-                } else {
-                    if (n_ctx_min == UINT32_MAX) {
-                        LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
-                    } else {
-                        LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
-                            __func__, hp_nct, n_ctx_min);
-                    }
-                }
-            } else {
-                LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
-            }
-        }
-    }
-    if (nd == 0) {
-        throw common_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
-    }
-
-    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
-        throw common_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
-    }
-    if (nd > 1) {
-        if (!tensor_split) {
-            throw common_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
-        }
-        if (mparams->tensor_split) {
-            for (size_t id = 0; id < nd; id++) {
-                if (mparams->tensor_split[id] != 0.0f) {
-                    throw common_params_fit_exception("model_params::tensor_split already set by user, abort");
-                }
-            }
-        }
-        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            throw common_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
-        }
-    }
-    if (!tensor_buft_overrides) {
-        throw common_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
-    }
-    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
-        throw common_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
-    }
-
-    // step 3: iteratively fill the back to front with "dense" layers
-    //   - for a dense model simply fill full layers, giving each device a contiguous slice of the model
-    //   - for a MoE model, same as dense model but with all MoE tensors in system memory
-
-    // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
-    auto get_overflow_pattern = [&](const size_t il, const common_layer_fraction_t lf) -> const char * {
-        constexpr size_t n_strings = 1000;
-        if (il >= n_strings) {
-            throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
-        }
-        switch (lf) {
-            case LAYER_FRACTION_ATTN: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*";
-                }
-                return patterns[il].c_str();
-            }
-            case LAYER_FRACTION_UP: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*";
-                }
-                return patterns[il].c_str();
-            }
-            case LAYER_FRACTION_GATE: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
-                }
-                return patterns[il].c_str();
-            }
-            case LAYER_FRACTION_MOE: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps";
-                }
-                return patterns[il].c_str();
-            }
-            default:
-                GGML_ABORT("fatal error");
-        }
-    };
-
-    struct ngl_t {
-        uint32_t n_layer = 0; // number of total layers
-        uint32_t n_part  = 0; // number of partial layers, <= n_layer
-
-        // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
-        common_layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
-
-        uint32_t n_full() const {
-            assert(n_layer >= n_part);
-            return n_layer - n_part;
-        }
-    };
-
-    const size_t ntbo = llama_max_tensor_buft_overrides();
-
-    // utility function to set n_gpu_layers and tensor_split
-    auto set_ngl_tensor_split_tbo = [&](
-            const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
-            llama_model_params & mparams) {
-        mparams.n_gpu_layers = 0;
-        for (size_t id = 0; id < nd; id++) {
-            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
-            if (nd > 1) {
-                tensor_split[id] = ngl_per_device[id].n_layer;
-            }
-        }
-        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
-        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
-
-        mparams.tensor_split = tensor_split;
-
-        size_t itbo = 0;
-        for (size_t id = 0; id < nd; id++) {
-            il0 += ngl_per_device[id].n_full();
-            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
-                if (itbo + 1 >= ntbo) {
-                    tensor_buft_overrides[itbo].pattern = nullptr;
-                    tensor_buft_overrides[itbo].buft    = nullptr;
-                    itbo++;
-                    mparams.tensor_buft_overrides = tensor_buft_overrides;
-                    throw common_params_fit_exception("llama_max_tensor_buft_overrides() == "
-                        + std::to_string(ntbo) + " is insufficient for model");
-                }
-                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
-                tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
-                itbo++;
-            }
-            il0 += ngl_per_device[id].n_part;
-        }
-        tensor_buft_overrides[itbo].pattern = nullptr;
-        tensor_buft_overrides[itbo].buft    = nullptr;
-        itbo++;
-        mparams.tensor_buft_overrides = tensor_buft_overrides;
-    };
-
-    // utility function that returns the memory use per device for given numbers of layers per device
-    auto get_memory_for_layers = [&](
-            const char * func_name,
-            const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
-        llama_model_params mparams_copy = *mparams;
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
-
-        const dmds_t dmd_nl = common_get_device_memory_data(
-            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-
-        LOG_INF("%s: memory for test allocation by device:\n", func_name);
-        for (size_t id = 0; id < nd; id++) {
-            const ngl_t & n = ngl_per_device[id];
-            LOG_INF(
-                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
-                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
-        }
-
-        std::vector<int64_t> ret;
-        ret.reserve(nd);
-        for (size_t id = 0; id < nd; id++) {
-            ret.push_back(dmd_nl[id].mb.total());
-        }
-        return ret;
-    };
-
-    int64_t global_surplus_cpu_moe = 0;
-    if (hp_nex > 0) {
-        const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
-        ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
-        tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
-        tensor_buft_overrides[1] = {nullptr, nullptr};
-        mparams->tensor_buft_overrides = tensor_buft_overrides;
-
-        LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
-        const dmds_t dmds_cpu_moe = common_get_device_memory_data(
-            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-
-        for (size_t id = 0; id < nd; id++) {
-            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
-            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
-        }
-
-        if (global_surplus_cpu_moe > 0) {
-            LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
-                __func__, global_surplus_cpu_moe/MiB);
-        } else {
-            LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
-                __func__, -global_surplus_cpu_moe/MiB);
-        }
-
-        // reset
-        tensor_buft_overrides[0] = {nullptr, nullptr};
-        mparams->tensor_buft_overrides = tensor_buft_overrides;
-    }
-
-    std::vector<int64_t> targets; // maximum acceptable memory use per device
-    targets.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        targets.push_back(dmds_full[id].free - margins[id]);
-        LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
-    }
-
-    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
-    overflow_bufts.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
-    }
-
-    std::vector<ngl_t> ngl_per_device(nd);
-    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
-
-    // optimize the number of layers per device using the method of false position:
-    //   - ngl_per_device has 0 layers for each device, lower bound
-    //   - try a "high" configuration where a device is given all unassigned layers
-    //   - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
-    //   - check memory use of our guess, replace either the low or high bound
-    //   - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
-    //   - the last device has the output layer, which cannot be a partial layer
-    if (hp_nex == 0) {
-        LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
-    } else {
-        LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
-    }
-    for (int id = nd - 1; id >= 0; id--) {
-        uint32_t n_unassigned = hp_ngl + 1;
-        for (size_t jd = id + 1; jd < nd; ++jd) {
-            assert(n_unassigned >= ngl_per_device[jd].n_layer);
-            n_unassigned -= ngl_per_device[jd].n_layer;
-        }
-
-        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
-        ngl_per_device_high[id].n_layer = n_unassigned;
-        if (hp_nex > 0) {
-            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
-        }
-        if (ngl_per_device_high[id].n_layer > 0) {
-            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
-            if (mem_high[id] > targets[id]) {
-                assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
-                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
-                while (delta > 1) {
-                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
-                    step_size = std::max(step_size, uint32_t(1));
-                    step_size = std::min(step_size, delta - 1);
-
-                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
-                    ngl_per_device_test[id].n_layer += step_size;
-                    if (hp_nex) {
-                        ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
-                            step_size - 1 : step_size; // the first layer is the output layer which must always be full
-                    }
-                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
-
-                    if (mem_test[id] <= targets[id]) {
-                        ngl_per_device = ngl_per_device_test;
-                        mem            = mem_test;
-                        LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
-                    } else {
-                        ngl_per_device_high = ngl_per_device_test;
-                        mem_high            = mem_test;
-                        LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
-                    }
-                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                }
-            } else {
-                assert(ngl_per_device_high[id].n_layer == n_unassigned);
-                ngl_per_device = ngl_per_device_high;
-                mem            = mem_high;
-                LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
-            }
-        }
-
-        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
-            "%s:   - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
-            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
-    }
-    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
-        return;
-    }
-
-    // step 4: for a MoE model where all dense tensors fit,
-    //     convert the dense-only layers in the back to full layers in the front until all devices are full
-    // essentially the same procedure as for the dense-only layers except front-to-back
-    // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
-
-    size_t id_dense_start = nd;
-    for (int id = nd - 1; id >= 0; id--) {
-        if (ngl_per_device[id].n_layer > 0) {
-            id_dense_start = id;
-            continue;
-        }
-        break;
-    }
-    assert(id_dense_start < nd);
-
-    LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
-    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
-        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
-        for (size_t jd = id_dense_start; jd < nd; jd++) {
-            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
-            ngl_per_device_high[id].n_layer += n_layer_move;
-            ngl_per_device_high[jd].n_layer -= n_layer_move;
-            ngl_per_device_high[jd].n_part = 0;
-        }
-        size_t id_dense_start_high = nd - 1;
-        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
-
-        if (mem_high[id] > targets[id]) {
-            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
-            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
-            while (delta > 1) {
-                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
-                step_size = std::max(step_size, uint32_t(1));
-                step_size = std::min(step_size, delta - 1);
-
-                std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
-                size_t id_dense_start_test = id_dense_start;
-                uint32_t n_converted_test = 0;
-                for (;id_dense_start_test < nd; id_dense_start_test++) {
-                    const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
-                    ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
-                    ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
-                    ngl_per_device_test[id].n_layer += n_convert_jd;
-                    n_converted_test += n_convert_jd;
-
-                    if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
-                        break;
-                    }
-                }
-                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
-
-                if (mem_test[id] <= targets[id]) {
-                    ngl_per_device = ngl_per_device_test;
-                    mem            = mem_test;
-                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
-                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-                } else {
-                    ngl_per_device_high = ngl_per_device_test;
-                    mem_high            = mem_test;
-                    id_dense_start_high = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
-                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
-                }
-                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
-                delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
-            }
-        } else {
-            ngl_per_device = ngl_per_device_high;
-            mem            = mem_high;
-            id_dense_start = id_dense_start_high;
-            LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
-                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-        }
-
-        // try to fit at least part of one more layer
-        if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
-            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
-            size_t id_dense_start_test = id_dense_start;
-            ngl_per_device_test[id_dense_start_test].n_layer--;
-            ngl_per_device_test[id_dense_start_test].n_part--;
-            ngl_per_device_test[id].n_layer++;
-            ngl_per_device_test[id].n_part++;
-            if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
-                id_dense_start_test++;
-            }
-            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
-            std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
-            if (id < nd - 1) {
-                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
-            }
-            LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
-                ngl_per_device = ngl_per_device_test;
-                overflow_bufts = overflow_bufts_test;
-                mem            = mem_test;
-                id_dense_start = id_dense_start_test;
-                LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
-                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-
-                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
-                    ngl_per_device = ngl_per_device_test;
-                    overflow_bufts = overflow_bufts_test;
-                    mem            = mem_test;
-                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
-                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-                }
-            } else {
-                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
-                    ngl_per_device = ngl_per_device_test;
-                    overflow_bufts = overflow_bufts_test;
-                    mem            = mem_test;
-                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
-                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-                }
-            }
-        }
-
-        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
-            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
-            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
-    }
-
-    // print info for devices that were not changed during the conversion from dense only to full layers:
-    for (size_t id = id_dense_start + 1; id < nd; id++) {
-        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
-            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
-            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
-    }
-
-    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
-}
-
-// Public entry point for fitting: runs common_params_fit_impl and translates the exceptions
-// it throws into a common_params_fit_status. The outcome and the elapsed time are logged.
-enum common_params_fit_status common_fit_params(
-        const char * path_model,
-        llama_model_params * mparams,
-        llama_context_params * cparams,
-        float * tensor_split,
-        llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t * margins,
-        uint32_t n_ctx_min,
-        ggml_log_level log_level) {
-    common_params_fit_status result = COMMON_PARAMS_FIT_STATUS_SUCCESS;
-
-    const int64_t t_start_us = llama_time_us();
-    try {
-        common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
-        LOG_INF("%s: successfully fit params to free device memory\n", __func__);
-    } catch (const common_params_fit_exception & err) {
-        // keep this handler ahead of the std::runtime_error one
-        // (the order matters if the fit exception derives from it - not visible from here)
-        LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, err.what());
-        result = COMMON_PARAMS_FIT_STATUS_FAILURE;
-    } catch (const std::runtime_error & err) {
-        LOG_ERR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, err.what());
-        result = COMMON_PARAMS_FIT_STATUS_ERROR;
-    }
-    const int64_t t_end_us = llama_time_us();
-
-    // microseconds -> seconds
-    const double elapsed_s = (t_end_us - t_start_us) * 1e-6;
-    LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, elapsed_s);
-    return result;
-}
-
-// Logs a table with the memory breakdown of a context: one row per device of the model,
-// one row for the host and one row for every buffer type not covered by either.
-// All values are printed in MiB.
-void common_memory_breakdown_print(const struct llama_context * ctx) {
-    const auto * model = llama_get_model(ctx);
-
-    // collect the devices of the model
-    std::vector<ggml_backend_dev_t> devices;
-    for (int i = 0; i < llama_model_n_devices(model); i++) {
-        devices.push_back(llama_model_get_device(model, i));
-    }
-
-    llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
-
-    // each row: {format template, label, total, free, self, model, context, compute, unaccounted}
-    std::vector<std::array<std::string, 9>> table_data;
-    table_data.reserve(devices.size());
-    const std::string template_header = "%s: | %s | %s   %s    %s   %s   %s   %s    %s |\n";
-    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
-    const std::string template_other  = "%s: | %s | %s   %s    %s = %s + %s + %s    %s |\n";
-
-    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
-
-    constexpr size_t MiB = 1024 * 1024;
-    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
-
-    // track seen buffer types to avoid double counting:
-    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
-
-    // accumulative memory breakdown for each device and for host:
-    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
-    llama_memory_breakdown_data              mb_host;
-
-    for (const auto & buft_mb : memory_breakdown) {
-        ggml_backend_buffer_type_t          buft = buft_mb.first;
-        const llama_memory_breakdown_data & mb   = buft_mb.second;
-        if (ggml_backend_buft_is_host(buft)) {
-            mb_host.model   += mb.model;
-            mb_host.context += mb.context;
-            mb_host.compute += mb.compute;
-            seen_buffer_types.insert(buft);
-            continue;
-        }
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
-        if (dev) {
-            // find the index of the device among the model devices, -1 if it is not one of them
-            int i_dev = -1;
-            for (size_t i = 0; i < devices.size(); i++) {
-                if (devices[i] == dev) {
-                    i_dev = i;
-                    break;
-                }
-            }
-            if (i_dev != -1) {
-                mb_dev[i_dev].model   += mb.model;
-                mb_dev[i_dev].context += mb.context;
-                mb_dev[i_dev].compute += mb.compute;
-                seen_buffer_types.insert(buft);
-                continue;
-            }
-        }
-        // buffer types that reach this point are printed separately below
-    }
-
-    // print memory breakdown for each device:
-    for (size_t i = 0; i < devices.size(); i++) {
-        ggml_backend_dev_t dev = devices[i];
-        llama_memory_breakdown_data mb = mb_dev[i];
-
-        const std::string name = ggml_backend_dev_name(dev);
-        std::string desc = ggml_backend_dev_description(dev);
-        // strip vendor/brand prefixes one after another to keep the label short
-        for (const std::string & prefix : desc_prefixes_strip) {
-            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
-                desc = desc.substr(prefix.length());
-            }
-        }
-
-        size_t free  = 0;
-        size_t total = 0;
-        ggml_backend_dev_memory(dev, &free, &total);
-
-        const size_t self = mb.model + mb.context + mb.compute;
-        // use signed arithmetic: if self + free exceeds total (e.g. inconsistent backend reporting)
-        // an unsigned subtraction would wrap around and print a huge bogus value
-        const int64_t unaccounted = int64_t(total) - int64_t(self) - int64_t(free);
-
-        table_data.push_back({
-            template_gpu,
-            "  - " + name + " (" + desc + ")",
-            std::to_string(total / MiB),
-            std::to_string(free / MiB),
-            std::to_string(self / MiB),
-            std::to_string(mb.model / MiB),
-            std::to_string(mb.context / MiB),
-            std::to_string(mb.compute / MiB),
-            std::to_string(unaccounted / int64_t(MiB))});
-    }
-
-    // print memory breakdown for host:
-    {
-        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
-        table_data.push_back({
-            template_other,
-            "  - Host",
-            "", // total
-            "", // free
-            std::to_string(self / MiB),
-            std::to_string(mb_host.model / MiB),
-            std::to_string(mb_host.context / MiB),
-            std::to_string(mb_host.compute / MiB),
-            ""}); // unaccounted
-    }
-
-    // print memory breakdown for all remaining buffer types:
-    for (const auto & buft_mb : memory_breakdown) {
-        ggml_backend_buffer_type_t          buft = buft_mb.first;
-        const llama_memory_breakdown_data & mb   = buft_mb.second;
-        if (seen_buffer_types.count(buft) == 1) {
-            continue;
-        }
-        const std::string name = ggml_backend_buft_name(buft);
-        const size_t self = mb.model + mb.context + mb.compute;
-        table_data.push_back({
-            template_other,
-            "  - " + name,
-            "", // total
-            "", // free
-            std::to_string(self / MiB),
-            std::to_string(mb.model / MiB),
-            std::to_string(mb.context / MiB),
-            std::to_string(mb.compute / MiB),
-            ""}); // unaccounted
-        seen_buffer_types.insert(buft);
-    }
-
-    // pad all cells of a column to the same width (column 0 holds the format template):
-    //   - the label column (j == 1) is padded on the right, i.e. left-aligned
-    //   - all numeric columns are padded on the left, i.e. right-aligned
-    for (size_t j = 1; j < table_data[0].size(); j++) {
-        size_t max_len = 0;
-        for (const auto & td : table_data) {
-            max_len = std::max(max_len, td[j].length());
-        }
-        for (auto & td : table_data) {
-            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
-        }
-    }
-    for (const auto & td : table_data) {
-        LOG_INF(td[0].c_str(),
-            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
-            td[6].c_str(), td[7].c_str(), td[8].c_str());
-    }
-}
-
-// Prints the estimated memory use to stdout, one line per device followed by a "Host" line.
-// Every line has the form "<name> <model> <context> <compute> " with the sizes in MiB.
-void common_fit_print(
-        const char * path_model,
-        llama_model_params * mparams,
-        llama_context_params * cparams) {
-    // outputs of common_get_device_memory_data:
-    std::vector<ggml_backend_dev_t> devs;
-    uint32_t hp_ngl = 0; // hparams.n_gpu_layers
-    uint32_t hp_nct = 0; // hparams.n_ctx_train
-    uint32_t hp_nex = 0; // hparams.n_expert
-
-    const auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
-
-    // one entry per device plus a trailing entry that is printed as the host
-    GGML_ASSERT(dmd.size() == devs.size() + 1);
-
-    for (size_t id = 0; id < devs.size(); id++) {
-        const auto & mb = dmd[id].mb;
-        printf("%s %zu %zu %zu \n",
-            ggml_backend_dev_name(devs[id]), mb.model/1024/1024, mb.context/1024/1024, mb.compute/1024/1024);
-    }
-
-    const auto & mb_host = dmd.back().mb;
-    printf("Host %zu %zu %zu \n",
-        mb_host.model/1024/1024, mb_host.context/1024/1024, mb_host.compute/1024/1024);
-}
@@ -1,32 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-enum common_params_fit_status { // result of common_fit_params
-    COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
-    COMMON_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
-    COMMON_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occurred, e.g. because no model could be found at the specified path
-};
-
-// fits mparams and cparams to free device memory (assumes system memory is unlimited)
-//   - returns COMMON_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory, otherwise one of the other common_params_fit_status values
-//   - this function is NOT thread safe because it modifies the global llama logger state
-//   - only parameters that have the same value as in llama_default_model_params are modified
-//     with the exception of the context size which is modified if and only if equal to 0
-enum common_params_fit_status common_fit_params(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams,
-                                      float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
-    struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                     size_t * margins,               // margins of memory to leave per device in bytes
-                                   uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
-                        enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
-
-// print estimated memory to stdout: one "<name> <model> <context> <compute>" line (sizes in MiB) per device, followed by one such line for the host
-void common_fit_print(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams);
-
-void common_memory_breakdown_print(const struct llama_context * ctx); // logs a table (in MiB) with the memory breakdown per device, for the host and for any other buffer types
@@ -1,3 +1,4 @@
+#include "log.h"
 #include "value.h"
 #include "runtime.h"
 #include "caps.h"
@@ -106,16 +106,10 @@ struct statement {
    size_t pos; // position in source, for debugging
    virtual ~statement() = default;
    virtual std::string type() const { return "Statement"; }
-
    // execute_impl must be overridden by derived classes
-    virtual value execute_impl(context &) { throw_exec_error(); }
+    virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); }
    // execute is the public method to execute a statement with error handling
    value execute(context &);
-
-private:
-    [[noreturn]] void throw_exec_error() const {
-        throw std::runtime_error("cannot exec " + type());
-    }
 };

 // Type Checking Utilities
@@ -149,7 +143,7 @@ struct program : public statement {
    program() = default;
    explicit program(statements && body) : body(std::move(body)) {}
    std::string type() const override { return "Program"; }
-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead");
    }
 };
@@ -201,7 +195,7 @@ struct break_statement : public statement {
        }
    };

-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw break_statement::signal();
    }
 };
@@ -215,7 +209,7 @@ struct continue_statement : public statement {
        }
    };

-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw continue_statement::signal();
    }
 };
@@ -515,7 +509,7 @@ struct slice_expression : public expression {
        chk_type<expression>(this->step_expr);
    }
    std::string type() const override { return "SliceExpression"; }
-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw std::runtime_error("must be handled by MemberExpression");
    }
 };
@@ -590,10 +590,6 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
    return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0;
 }

-[[noreturn]] static value string_join_not_implemented(const func_args &) {
-    throw not_implemented_exception("String join builtin not implemented");
-}
-
 const func_builtins & value_string_t::get_builtins() const {
    static const func_builtins builtins = {
        {"default", default_value},
@@ -855,7 +851,9 @@ const func_builtins & value_string_t::get_builtins() const {
            res->val_str.mark_input_based_on(val_input->as_string());
            return res;
        }},
-        {"join", string_join_not_implemented},
+        {"join", [](const func_args &) -> value {
+            throw not_implemented_exception("String join builtin not implemented");
+        }},
    };
    return builtins;
 }
@@ -886,9 +884,6 @@ const func_builtins & value_bool_t::get_builtins() const {
    return builtins;
 }

-[[noreturn]] static value array_unique_not_implemented(const func_args &) {
-    throw not_implemented_exception("Array unique builtin not implemented");
-}

 const func_builtins & value_array_t::get_builtins() const {
    static const func_builtins builtins = {
@@ -1089,14 +1084,13 @@ const func_builtins & value_array_t::get_builtins() const {
            std::reverse(arr.begin(), arr.end());
            return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
        }},
-        {"unique", array_unique_not_implemented},
+        {"unique", [](const func_args &) -> value {
+            throw not_implemented_exception("Array unique builtin not implemented");
+        }},
    };
    return builtins;
 }

-[[noreturn]] static value object_join_not_implemented(const func_args &) {
-    throw not_implemented_exception("object join not implemented");
-}

 const func_builtins & value_object_t::get_builtins() const {
    if (!has_builtins) {
@@ -1189,7 +1183,9 @@ const func_builtins & value_object_t::get_builtins() const {
            });
            return result;
        }},
-        {"join", object_join_not_implemented},
+        {"join", [](const func_args &) -> value {
+            throw not_implemented_exception("object join not implemented");
+        }},
    };
    return builtins;
 }
@@ -129,25 +129,27 @@ struct value_t {
    // Note: only for debugging and error reporting purposes
    virtual std::string type() const { return ""; }

-    virtual int64_t as_int() const { throw_type_error("is not an int value"); }
-    virtual double as_float() const { throw_type_error("is not a float value"); }
-    virtual string as_string() const { throw_type_error("is not a string value"); }
-    virtual bool as_bool() const { throw_type_error("is not a bool value"); }
-    virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
-    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
-    virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
+    virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
+    virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
+    virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
+    virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
+    virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
+    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
    virtual bool is_none() const { return false; }
    virtual bool is_undefined() const { return false; }
-    virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }
+    virtual const func_builtins & get_builtins() const {
+        throw std::runtime_error("No builtins available for type " + type());
+    }

-    virtual bool has_key(const value &) { throw_type_error("is not an object value"); }
-    virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); }
-    virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
-    virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); }
-    virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
-    virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); }
-    virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); }
-    virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); }
+    virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
+    virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }

    virtual bool is_numeric() const { return false; }
    virtual bool is_hashable() const { return false; }
@@ -161,11 +163,6 @@ struct value_t {
    // Note: only for debugging purposes
    virtual std::string as_repr() const { return as_string().str(); }

-private:
-    [[noreturn]] void throw_type_error(const char* expected) const {
-        throw std::runtime_error(type() + " " + expected);
-    }
-
 protected:
    virtual bool equivalent(const value_t &) const = 0;
    virtual bool nonequal(const value_t & other) const { return !equivalent(other); }
@@ -1,12 +1,10 @@
 #include "sampling.h"

 #include "common.h"
-#include "fit.h"
+#include "ggml.h"
 #include "log.h"
 #include "reasoning-budget.h"

-#include "ggml.h"
-
 #include <algorithm>
 #include <cctype>
 #include <climits>
@@ -513,7 +511,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %%      (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
        LOG_INF("%s:    graphs reused = %10d\n", __func__, data.n_reused);

-        common_memory_breakdown_print(ctx);
+        llama_memory_breakdown_print(ctx);
    }
 }

@@ -749,7 +749,6 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {

                    mod.reset();
                    n_low = 0;
-                    i_last = 0;
                }
            } else {
                n_low = 0;
@@ -746,12 +746,7 @@ class ModelBase:

        if (not quant_algo or not quant_layers) and quant_config_file.is_file():
            with open(quant_config_file, "r", encoding="utf-8") as f:
-                hf_quant_config = json.load(f)
-                quant_config = hf_quant_config.get("quantization") or {}
-                producer = hf_quant_config.get("producer") or {}
-                producer_name = (producer.get("name") or "").lower()
-                if quant_method is None:
-                    self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
+                quant_config = json.load(f).get("quantization") or {}
                quant_algo = quant_config.get("quant_algo", quant_algo)
                quant_layers = quant_config.get("quantized_layers", quant_layers) or {}

@@ -11860,7 +11855,7 @@ class LLaDAMoEModel(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register("HunYuanDenseV1ForCausalLM")
+@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
 class HunYuanModel(TextModel):
    model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE

@@ -11999,58 +11994,28 @@ class HunYuanModel(TextModel):


@ModelBase.register("HunYuanVLForConditionalGeneration")
-class HunyuanVLVisionModel(MmprojModel):
-    # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
-    # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
-    # Each variant maps to a different projector type in clip.cpp so image
-    # preprocessing follows the correct code path.
-
+class HunyuanOCRVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
-        # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
+        # HunyuanOCR uses max_image_size instead of image_size
        if "image_size" not in self.hparams_vision:
            self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)

-    @staticmethod
-    def is_ocr_variant(hparams: dict) -> bool:
-        """Return True for HunyuanOCR, False for HunyuanVL.
-
-        The projector's output dim must equal the text model's hidden_size by
-        construction (that's what "projector" means). HunyuanOCR pairs a 1B text
-        backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
-        ViT -> LLM projection dim is a hard architectural signature, not a
-        magic number.
-        """
-        vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
-        return vision_out == 1024
-
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        assert self.hparams_vision is not None
-        vcfg = self.hparams_vision
-
-        if self.is_ocr_variant(self.global_config):
-            # --- HunyuanOCR ---
-            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
-            self.gguf_writer.add_vision_use_gelu(True)
-            self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
-            self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
-            self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
-            self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
-            return
-
-        # --- HunyuanVL ---
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
-        self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
-        self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
-        self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
-        self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
-        self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
+        hparams = self.hparams_vision
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
+        self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
+        self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
+        self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if not name.startswith("vit."):
-            return
+            return  # skip text tensors
        # strip CLS token (row 0) from position embeddings so resize_position_embeddings works
        if "position_embedding" in name:
            data_torch = data_torch[1:]  # [n_patches+1, n_embd] -> [n_patches, n_embd]
@@ -12058,66 +12023,11 @@ class HunyuanVLVisionModel(MmprojModel):

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
-        # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
        if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
            return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)


-@ModelBase.register("HunYuanVLForConditionalGeneration")
-class HunyuanVLTextModel(HunYuanModel):
-    # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
-    # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
-    # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
-    # the config and pick the matching GGUF architecture.
-    model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
-
-    @staticmethod
-    def _is_ocr_config(hparams: dict) -> bool:
-        # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
-        # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
-        # HunyuanVLVisionModel.is_ocr_variant.
-        return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
-
-    def __init__(self, dir_model: Path, *args, **kwargs):
-        raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
-        if self._is_ocr_config(raw_hparams):
-            self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
-        else:
-            self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
-        super().__init__(dir_model, *args, **kwargs)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
-        # the HunYuan-Dense arch which already handles standard rope in super().
-        if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
-            return
-
-        if self.rope_parameters.get("rope_type") != "xdrope":
-            return
-
-        # defaults for HunyuanVL. The C++ side later computes:
-        #   freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
-        self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
-        self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-        self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1)))
-
-        ctx_len = int(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len)
-        self.gguf_writer.add_context_length(ctx_len)
-
-        self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"]))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # Skip vision tensors — they are written by HunyuanVLVisionModel
-        if name.startswith("vit."):
-            return
-        yield from super().modify_tensors(data_torch, name, bid)
-
-
@ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
    model_arch = gguf.MODEL_ARCH.SMOLLM3
@@ -244,6 +244,7 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
 - `-fa 1` is required when running llama-bench with the OpenVINO backend.
  - `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
 - `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
+- For Intel GPU and NPU detection in containers, the GPU and NPU user-space drivers/libraries must be present inside the image. These will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)

 > [!NOTE]
 > The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
@@ -273,6 +274,8 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
 Run llama.cpp with OpenVINO backend Docker container.
 Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.

+> [!NOTE]
+> Intel GPU and NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).

 ```bash
 #  Run Docker container
@@ -31,8 +31,6 @@ SYCL cross-platform capabilities enable support for other vendor GPUs as well.

 ## Recommended Release

-### Windows
-
 The following releases are verified and recommended:

 |Commit ID|Tag|Release|Verified  Platform| Update date|
@@ -41,13 +39,6 @@ The following releases are verified and recommended:
 |3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
 |fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||

-### Ubuntu 24.04
-
-The release packages for Ubuntu 24.04 x64 (FP32/FP16) only include the binary files of the llama.cpp SYCL backend. They require the target machine to have pre-installed Intel GPU drivers and oneAPI packages that are the same version as the build package. To get the version and installation info, refer to release.yml: ubuntu-24-sycl -> Download & Install oneAPI.
-
-It is recommended to use them with Intel Docker.
-
-The packages for FP32 and FP16 would have different accuracy and performance on LLMs. Please choose it acording to the test result.

 ## News

@@ -238,7 +229,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,

 |Verified release|
 |-|
-|2025.3.3 |
 |2025.2.1|
 |2025.1|
 |2024.1|
@@ -249,27 +249,18 @@ build: 6a8cf8914 (6733)
  ```

 - `GGML_HEXAGON_PROFILE=1`
-  Enables Op profiling:
+  Generates a host-side profile for the ggml-hexagon Ops.

-  - `1` Basic profile with per-op `usecs` and `cycles` counters
-  - `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
-  - `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
-
-  The logging output can be either saved into a file for post-processing or it can be piped directly into the post-processing tool to generate the report.
-  Examples:
-
-      `GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
-
- `GGML_HEXAGON_OPSTAGE=0x0`
-  Allows enabling specific stages of the Op processing pipeline:
+- `GGML_HEXAGON_OPMASK=0x0`
+  Allows enabling specific stages of the processing pipeline:

  - `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
  - `0x2` Enable Op Compute (MUL_MAT, etc.)

  Examples:

-      `GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled
-      `GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
+      `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
+      `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default)

 - `GGML_HEXAGON_OPFILTER=regex`
  Allows filtering (disabling) Ops that match the regex pattern:
@@ -25,11 +25,7 @@ MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
 OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
 TYPE="${OUTTYPE:-f16}"
 METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
-if [[ -n "$MMPROJ" ]]; then
-    CONVERTED_MODEL="${OUTPUT_DIR}/mmproj-${MODEL_NAME}.gguf"
-else
-    CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
-fi
+CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"

 echo "Model path: ${MODEL_PATH}"
 echo "Model name: ${MODEL_NAME}"
@@ -42,7 +38,6 @@ if [[ -n "$DEBUG" ]]; then
 else
    CMD_ARGS=("python")
 fi
-
 CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose")
 CMD_ARGS+=("${MODEL_PATH}")
 CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
@@ -55,3 +50,7 @@ CMD_ARGS+=("--outtype" "${TYPE}")
 echo ""
 echo "The environment variable CONVERTED_MODEL can be set to this path using:"
 echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
+if [[ -n "$MMPROJ" ]]; then
+    mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
+    echo "The mmproj model was created in $(realpath "$mmproj_file")"
+fi
@@ -8,24 +8,8 @@
 #include <clocale>
 #include <cstdio>
 #include <cstring>
-#include <cinttypes>
 #include <string>
 #include <vector>
-#include <utility>
-
-struct spec_checkpoint {
-    int64_t n_tokens = 0;
-
-    std::vector<uint8_t> data;
-
-    size_t size() const {
-        return data.size();
-    }
-
-    bool empty() const {
-        return data.empty();
-    }
-};

 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");
@@ -62,14 +46,6 @@ int main(int argc, char ** argv) {
    model_tgt = llama_init_tgt->model();
    ctx_tgt   = llama_init_tgt->context();

-    // check if the context supports partial sequence removal
-    const auto ctx_seq_rm = common_context_can_seq_rm(ctx_tgt);
-    const bool use_ckpt = (ctx_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
-
-    if (use_ckpt) {
-        LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
-    }
-
    const llama_vocab * vocab = llama_model_get_vocab(model_tgt);

    // load the draft model
@@ -143,7 +119,7 @@ int main(int argc, char ** argv) {
    const auto t_enc_start = ggml_time_us();

    // target model sampling context
-    common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling));
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);

    // eval the prompt
    llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
@@ -166,61 +142,21 @@ int main(int argc, char ** argv) {

    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);

-    size_t n_draft = 0;
-
-    llama_tokens draft;
-    spec_checkpoint spec_ckpt;
-
    const auto t_enc_end = ggml_time_us();

    const auto t_dec_start = ggml_time_us();

    while (true) {
-        // generate or reuse draft tokens
+        // optionally, generate draft tokens that can be appended to the target batch
        //
        // this is the most important part of the speculation. the more probable tokens that are provided here
        // the better the performance will be. in theory, this computation can be performed asynchronously and even
        // offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
        // from a cache or lookup tables.
        //
-        if (draft.empty()) {
-            // generate a new draft
-            draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
+        llama_tokens draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);

-            if ((int) draft.size() > params_spec.n_max) {
-                LOG_WRN("draft size %zu exceeds max %d, truncating\n", draft.size(), params_spec.n_max);
-                draft.resize(params_spec.n_max);
-            }
-
-            if ((int) draft.size() < params_spec.n_min) {
-                LOG_DBG("ignoring small draft: %zu < %d\n", draft.size(), params_spec.n_min);
-                draft.clear();
-            }
-
-            // save the original draft size
-            n_draft = draft.size();
-
-            // save a checkpoint of the target context before evaluating the draft
-            // this allows us to restore the state if partial draft acceptance occurs
-            if (!draft.empty() && use_ckpt) {
-                const size_t ckpt_size = llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-                spec_ckpt.data.resize(ckpt_size);
-
-                const size_t n = llama_state_seq_get_data_ext(ctx_tgt, spec_ckpt.data.data(), ckpt_size, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-                GGML_ASSERT(n == ckpt_size);
-
-                spec_ckpt.n_tokens = (int64_t) prompt_tgt.size();
-                LOG_DBG("created speculative checkpoint (n_tokens = %" PRId64 ", size = %.3f MiB)\n",
-                        spec_ckpt.n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
-            }
-        } else {
-            // we have a previous (partial) draft to reuse from checkpoint restoration
-            if (use_ckpt) {
-                GGML_ASSERT(!spec_ckpt.empty());
-            }
-        }
-
-        GGML_ASSERT(n_draft > 0);
+        //LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());

        // always have a token to evaluate from before - id_last
        common_batch_clear(batch_tgt);
@@ -242,12 +178,6 @@ int main(int argc, char ** argv) {
            llama_decode(ctx_tgt, batch_tgt);
        }

-        // only save the sampler sampler state if we use checkpoints
-        common_sampler_ptr smpl_save;
-        if (use_ckpt) {
-            smpl_save.reset(common_sampler_clone(smpl.get()));
-        }
-
        // sample from the full target batch and return the accepted tokens based on the target sampler
        //
        // for each token to be accepted, the sampler would have to sample that same token
@@ -255,38 +185,14 @@ int main(int argc, char ** argv) {
        // available logits from the batch and sample the next token until we run out of logits or the sampler
        // disagrees with the draft
        //
-        auto ids = common_sampler_sample_and_accept_n(smpl.get(), ctx_tgt, draft);
+        const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);

        //LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());

        GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token

-        // check for partial draft acceptance:
-        // if the context doesn't support partial sequence removal, restore the checkpoint
-        // and make the accepted tokens the new partial draft for the next iteration
-        if (use_ckpt && ids.size() - 1 < draft.size()) {
-            LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size());
-
-            draft = std::move(ids);
-
-            const size_t n = llama_state_seq_set_data_ext(ctx_tgt, spec_ckpt.data.data(), spec_ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-            GGML_ASSERT(n == spec_ckpt.size());
-
-            llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, spec_ckpt.n_tokens, -1);
-
-            prompt_tgt.resize(spec_ckpt.n_tokens);
-            smpl = std::move(smpl_save);
-
-            n_past = (int) prompt_tgt.size();
-
-            continue;
-        }
-
-        common_speculative_accept(spec, ids.size() - 1);
-
-        // full acceptance: consume the draft and commit accepted tokens
        n_past    += ids.size() - 1;
-        n_drafted += n_draft; // note: we ignore the discarded small drafts
+        n_drafted += draft.size(); // note: we ignore the discarded small drafts
        n_accept  += ids.size() - 1;
        n_predict += ids.size();

@@ -316,9 +222,6 @@ int main(int argc, char ** argv) {

        LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);

-        // clear the draft since it has been consumed
-        draft.clear();
-
        {
            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);

@@ -351,10 +254,11 @@ int main(int argc, char ** argv) {

    LOG_INF("\n");
    LOG_INF("target:\n\n");
-    common_perf_print(ctx_tgt, smpl.get());
+    common_perf_print(ctx_tgt, smpl);

    llama_batch_free(batch_tgt);

+    common_sampler_free(smpl);
    common_speculative_free(spec);

    llama_backend_free();
@@ -4,8 +4,8 @@ project("ggml" C CXX ASM)

 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
-set(GGML_VERSION_MINOR 10)
-set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_MINOR 9)
+set(GGML_VERSION_PATCH 11)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -213,7 +213,7 @@ set   (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
 set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")

 option(GGML_HIP                             "ggml: use HIP"                                   OFF)
-option(GGML_HIP_GRAPHS                      "ggml: use HIP graph"                              ON)
+option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
 option(GGML_HIP_RCCL                        "ggml: use ROCm Collective Comm. Library"         OFF)
 option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
@@ -473,7 +473,7 @@ target_link_libraries(ggml-base PRIVATE Threads::Threads)
 find_library(MATH_LIBRARY m)
 if (MATH_LIBRARY)
    if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
-        target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
+        target_link_libraries(ggml-base PRIVATE m)
    endif()
 endif()

@@ -1133,7 +1133,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
        if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) {
            t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);
            if (t_ij->view_offs > 0 && split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
-                GGML_ASSERT(tensor->ne[split_dim] != 0);
+                GGML_ASSERT(ne[split_dim] != 0 && tensor->ne[split_dim] != 0);
                const int split_dim_view_src = ggml_backend_meta_get_split_state(tensor->view_src, /*assume_sync =*/ true).axis;
                GGML_ASSERT(split_dim_view_src >= 0 && split_dim_view_src < GGML_MAX_DIMS);

@@ -1170,28 +1170,6 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer

        simple_tensors.push_back(t_ij);
    }
-
-    // If one of the sources has a zero-sized slice, disable the computation:
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (tensor->src[i] == nullptr || !ggml_backend_buffer_is_meta(tensor->src[i]->buffer)) {
-            continue;
-        }
-
-        const ggml_backend_meta_split_state split_state_src = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
-        if (split_state_src.axis < 0 || split_state_src.axis >= GGML_MAX_DIMS) {
-            continue;
-        }
-        for (size_t j = 0; j < n_simple_bufs; j++) {
-            int64_t ne_sum = 0;
-            for (size_t s = 0; s < split_state_src.n_segments; s++) {
-                ne_sum += split_state_src.ne[s*n_simple_bufs + j];
-            }
-            if (ne_sum == 0) {
-                simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
-            }
-        }
-    }
-
    buf_ctx->simple_tensors[tensor] = simple_tensors;

    return GGML_STATUS_SUCCESS;
@@ -1464,20 +1442,17 @@ struct ggml_backend_meta_context {
    struct backend_config {
        ggml_backend_t backend;

-        std::vector<cgraph_config>           cgraphs;
-        std::vector<ggml_tensor *>           nodes;
-        std::vector<ggml_backend_buffer_ptr> bufs;
+        std::vector<cgraph_config> cgraphs; // indexed by subgraph; the allreduce fallback reads cgraphs[i].cgraph_main
+        std::vector<ggml_tensor *> nodes;
+        ggml_backend_buffer_ptr    buf; // scratch buffer receiving peer copies; NOTE(review): now shared by every reduce step (was one buffer per step) - confirm a step's ADD finishes before the next step's copy lands

-        backend_config(ggml_backend_t backend, const size_t n_reduce_steps) : backend(backend) {
-            bufs.resize(n_reduce_steps);
-        }
+        backend_config(ggml_backend_t backend) : backend(backend) {} // buf is left default-constructed; graph_compute allocates it when max_tmp_size grows
    };
    std::string                 name;
    std::vector<backend_config> backend_configs;
    ggml_context_ptr            ctx;
    std::vector<ggml_cgraph *>  cgraphs_aux;
    std::vector<ggml_tensor *>  nodes_aux;
-    size_t                      n_reduce_steps;
    int                         max_nnodes    = 0;
    size_t                      max_tmp_size  = 0;
    size_t                      max_subgraphs = 0;
@@ -1489,7 +1464,6 @@ struct ggml_backend_meta_context {

    ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
        const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
-        n_reduce_steps = std::ceil(std::log2(n_devs));
        name = "Meta(";
        std::vector<ggml_backend_t> simple_backends;
        backend_configs.reserve(n_devs);
@@ -1501,7 +1475,7 @@ struct ggml_backend_meta_context {
            }
            name += ggml_backend_dev_name(simple_dev);
            simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
-            backend_configs.emplace_back(simple_backends.back(), n_reduce_steps);
+            backend_configs.emplace_back(simple_backends.back());
        }
        name += ")";

@@ -1531,6 +1505,17 @@ struct ggml_backend_meta_context {
            ggml_backend_free(bc.backend);
        }
    }
+
+    // Number of pairwise exchange steps used by the generic allreduce: ceil(log2(number of backends)).
+    // Counted by integer doubling so 0 or 1 backends give 0 steps without evaluating log2(0) == -inf
+    // (converting that to size_t is undefined) and without any float -> integer rounding.
+    size_t n_reduce_steps() const {
+        size_t n_steps = 0;
+        for (size_t span = 1; span < backend_configs.size(); span *= 2) {
+            n_steps++;
+        }
+        return n_steps;
+    }
 };

 static const char * ggml_backend_meta_get_name(ggml_backend_t backend) {
@@ -1683,36 +1661,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,

                ggml_tensor * node = cgraph->nodes[id];
                int32_t n_used = ggml_node_get_use_count(cgraph, id);
-
-                // Skip MIRRORED nodes that don't consume node
-                auto skip_unrelated = [&]() {
-                    while (id + 1 < cgraph->n_nodes) {
-                        ggml_tensor * next = cgraph->nodes[id+1];
-                        if (ggml_backend_meta_get_split_state(next, false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
-                            break;
-                        }
-                        bool safe = true;
-                        for (int s = 0; s < GGML_MAX_SRC; s++) {
-                            if (next->src[s] == nullptr) {
-                                continue;
-                            }
-                            if (next->src[s] == node) {
-                                safe = false;
-                                break;
-                            }
-                            if (ggml_backend_meta_get_split_state(next->src[s], false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
-                                safe = false;
-                                break;
-                            }
-                        }
-                        if (!safe) {
-                            break;
-                        }
-                        id++;
-                    }
-                };
-
-                skip_unrelated();
                if (id + 1 >= cgraph->n_nodes) {
                    return idr;
                }
@@ -1727,12 +1675,10 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
                        n_used = ggml_node_get_use_count(cgraph, id);
                    }
                }
-                // Chain of MULs with MIRRORED src[1]
-                while (true) {
-                    skip_unrelated();
-                    if (id + 1 >= cgraph->n_nodes) {
-                        return idr;
-                    }
+                if (id + 1 >= cgraph->n_nodes) {
+                    return idr;
+                }
+                {
                    ggml_tensor * next = cgraph->nodes[id+1];
                    if (next->op == GGML_OP_MUL && next->src[0] == node &&
                            ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
@@ -1740,8 +1686,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
                        id++;
                        idr = id;
                        n_used = ggml_node_get_use_count(cgraph, id);
-                    } else {
-                        break;
                    }
                }

@@ -1810,17 +1754,16 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
        if (max_tmp_size > backend_ctx->max_tmp_size) {
            for (size_t j = 0; j < n_backends; j++) {
                auto & bcj = backend_ctx->backend_configs[j];
-                for (size_t i = 0; i < backend_ctx->n_reduce_steps; i++) {
-                    bcj.bufs[i].reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
-                }
+                bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
            }
            backend_ctx->max_tmp_size = max_tmp_size;
        }

        if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
            backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
-            const size_t n_nodes_per_device = 3 * backend_ctx->n_reduce_steps; // tmp + ADD (+zeroing) graph per step and device
-            const size_t n_cgraphs_per_device = 2 * backend_ctx->n_reduce_steps; // ADD ( + zeroing) graph per step and device
+            const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
+            const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
+            const size_t n_cgraphs_per_device = n_reduce_steps;    // 1 ADD graph per step
            const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
            const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
            const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
@@ -1869,6 +1812,11 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
    size_t iga = 0; // i graph aux
    size_t ina = 0; // i node aux

+    // Returns the next entry of backend_ctx->cgraphs_aux and advances the iga cursor. FIXME usage_counts
+    auto get_cgraph_aux = [&]() -> ggml_cgraph * {
+        ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
+        return ret;
+    };
    auto get_node_aux = [&](ggml_tensor * t) -> ggml_tensor * {
        ggml_tensor * ret = backend_ctx->nodes_aux[ina++];
        memset(ret, 0, sizeof(ggml_tensor));
@@ -1880,110 +1828,75 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
        }
        return ret;
    };
-    auto set_tmp_data = [&](ggml_tensor * tensor, const size_t j, const size_t i_buf) {
-        auto & bcj = backend_ctx->backend_configs[j];
-        ggml_backend_buffer_ptr & buf_ptr = bcj.bufs[i_buf];
-        if (!buf_ptr || ggml_backend_buffer_get_size(buf_ptr.get()) < backend_ctx->max_tmp_size) {
-            buf_ptr.reset(ggml_backend_alloc_buffer(bcj.backend, backend_ctx->max_tmp_size));
-        }
-        tensor->buffer = buf_ptr.get();
-        tensor->data   = ggml_backend_buffer_get_base(buf_ptr.get());
-    };
-    // FIXME usage_counts
-    auto get_cgraph_aux = [&]() -> ggml_cgraph * {
-        ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
-        return ret;
-    };

    // Preferentially use backend-specific allreduce_tensor_async (e.g. NCCL for CUDA), use a generic fallback if unavailable:
    auto allreduce_fallback = [&](size_t i) -> ggml_status {
        std::vector<ggml_cgraph *> step_cgraphs(n_backends, nullptr);

-        // Zero out nodes that were disabled due to having a zero-sized slice:
-        for (size_t j = 0; j < n_backends; j++) {
-            auto & bcj = backend_ctx->backend_configs[j];
-            ggml_tensor * node = bcj.cgraphs[i].cgraph_main->nodes[bcj.cgraphs[i].cgraph_main->n_nodes - 1];
-            if (node->flags & GGML_TENSOR_FLAG_COMPUTE) {
-                continue;
-            }
-            ggml_tensor * node_zero = get_node_aux(node);
-            node_zero->op = GGML_OP_SCALE; // FIXME 0.0f * NaN == NaN
-            node_zero->src[0] = node;
-            ggml_set_op_params_f32(node_zero, 0, 0.0f);
-            node_zero->data = node->data;
-            node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE;
-
-            step_cgraphs[j] = get_cgraph_aux();
-            step_cgraphs[j]->nodes[0] = node_zero;
-            step_cgraphs[j]->n_nodes = 1;
-            const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, step_cgraphs[j]);
-            if (status != GGML_STATUS_SUCCESS) {
-                return status;
-            }
-        }
-        std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
-
-        auto push_data = [&](const size_t j_src, const size_t j_dst, const size_t i_buf) {
-            assert(step_cgraphs[j_dst] == nullptr);
-            auto & bcj_src = backend_ctx->backend_configs[j_src];
-            auto & bcj_dst = backend_ctx->backend_configs[j_dst];
-
-            ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
-            ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
-            GGML_ASSERT(ggml_is_contiguous(node_src));
-            GGML_ASSERT(ggml_is_contiguous(node_dst));
-
-            ggml_tensor * node_tmp = get_node_aux(node_dst);
-            set_tmp_data(node_tmp, j_dst, i_buf);
-
-            ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_tmp);
-
-            ggml_tensor * node_red = get_node_aux(node_dst);
-            node_red->view_src = node_dst->view_src == nullptr ? node_dst : node_dst->view_src;
-            node_red->view_offs = node_dst->view_offs;
-            node_red->op = GGML_OP_ADD;
-            node_red->src[0] = node_dst;
-            node_red->src[1] = node_tmp;
-            node_red->flags |= GGML_TENSOR_FLAG_COMPUTE;
-            ggml_backend_view_init(node_red);
-
-            ggml_cgraph * cgraph_aux = get_cgraph_aux();
-            cgraph_aux->nodes[0] = node_red;
-            cgraph_aux->n_nodes = 1;
-            step_cgraphs[j_dst] = cgraph_aux;
-        };
-
-        size_t offset_j = n_backends/2;
-        while ((offset_j & (offset_j - 1)) != 0) {
-            offset_j--;
-        }
-        const size_t offset_j_max = offset_j;
-        size_t i_buf = 0;
-
-        // If n_backends is not a power of 2, fold in the excess prior to butterfly reduction:
-        for (size_t j_src = 2*offset_j_max; j_src < n_backends; j_src++) {
-            const size_t j_dst = j_src - 2*offset_j_max;
-            push_data(j_src, j_dst, i_buf);
-            const ggml_status status = ggml_backend_graph_compute_async(backend_ctx->backend_configs[j_dst].backend, step_cgraphs[j_dst]);
-            if (status != GGML_STATUS_SUCCESS) {
-                return status;
-            }
-            i_buf = 1;
-        }
-
-        // Butterfly reduction:
-        for (; offset_j >= 1; offset_j /= 2) {
+        for (size_t offset_j = 1; offset_j < n_backends; offset_j *= 2) { // NOTE(review): XOR pairing below only reaches every backend when n_backends is a power of 2
            std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);

-            for (size_t j = 0; j < 2*offset_j_max; j++) {
+            for (size_t j = 0; j < n_backends; j++) { // e.g. n_backends == 3: backend 1 is never paired with backend 2, so it ends with a partial sum - confirm such counts are rejected upstream
                const size_t j_other = j ^ offset_j;
-                if (j_other >= n_backends) {
+                if (j_other > j) { // each pair is handled once, from its higher index; this also guarantees j_other < n_backends
                    continue;
                }
-                push_data(j, j_other, i_buf);
+
+                auto & bcj1 = backend_ctx->backend_configs[j];
+                auto & bcj2 = backend_ctx->backend_configs[j_other];
+
+                ggml_tensor * node1 = bcj1.cgraphs[i].cgraph_main->nodes[bcj1.cgraphs[i].cgraph_main->n_nodes - 1];
+                ggml_tensor * node2 = bcj2.cgraphs[i].cgraph_main->nodes[bcj2.cgraphs[i].cgraph_main->n_nodes - 1];
+                GGML_ASSERT(ggml_is_contiguous(node1));
+                GGML_ASSERT(ggml_is_contiguous(node2));
+
+                // Tmp tensors to receive P2P copies
+                ggml_tensor * node_tmp_1 = get_node_aux(node1);
+                node_tmp_1->buffer = bcj1.buf.get();
+                node_tmp_1->data = ggml_backend_buffer_get_base(bcj1.buf.get());
+
+                ggml_tensor * node_tmp_2 = get_node_aux(node2);
+                node_tmp_2->buffer = bcj2.buf.get();
+                node_tmp_2->data = ggml_backend_buffer_get_base(bcj2.buf.get());
+
+                // 2 P2P copies: exchange full buffers
+                ggml_backend_tensor_copy_async(bcj1.backend, bcj2.backend, node1, node_tmp_2);
+                ggml_backend_tensor_copy_async(bcj2.backend, bcj1.backend, node2, node_tmp_1);
+
+                // Local ADD: node1 += tmp1 (in-place via view)
+                ggml_tensor * node_red_1 = get_node_aux(node1);
+                node_red_1->view_src = node1->view_src == nullptr ? node1 : node1->view_src;
+                node_red_1->view_offs = node1->view_offs;
+                node_red_1->op = GGML_OP_ADD;
+                node_red_1->src[0] = node1;
+                node_red_1->src[1] = node_tmp_1;
+                node_red_1->flags |= GGML_TENSOR_FLAG_COMPUTE;
+                ggml_backend_view_init(node_red_1);
+
+                // Local ADD: node2 += tmp2 (in-place via view)
+                ggml_tensor * node_red_2 = get_node_aux(node2);
+                node_red_2->view_src = node2->view_src == nullptr ? node2 : node2->view_src;
+                node_red_2->view_offs = node2->view_offs;
+                node_red_2->op = GGML_OP_ADD;
+                node_red_2->src[0] = node2;
+                node_red_2->src[1] = node_tmp_2;
+                node_red_2->flags |= GGML_TENSOR_FLAG_COMPUTE;
+                ggml_backend_view_init(node_red_2);
+
+                // Build 1-node cgraphs for the ADD ops
+                ggml_cgraph * cgraph_aux_1 = get_cgraph_aux();
+                cgraph_aux_1->nodes[0] = node_red_1;
+                cgraph_aux_1->n_nodes = 1;
+                step_cgraphs[j] = cgraph_aux_1;
+
+                ggml_cgraph * cgraph_aux_2 = get_cgraph_aux();
+                cgraph_aux_2->nodes[0] = node_red_2;
+                cgraph_aux_2->n_nodes = 1;
+                step_cgraphs[j_other] = cgraph_aux_2;
            }

-            for (size_t j = 0; j < 2*offset_j_max; j++) {
+            // Execute local ADDs for this step
+            for (size_t j = 0; j < n_backends; j++) {
                if (step_cgraphs[j] == nullptr) {
                    continue;
                }
@@ -1993,20 +1906,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
                    return status;
                }
            }
-            i_buf++;
        }
-        assert(i_buf == backend_ctx->n_reduce_steps);
-
-        // If n_backends is not a power of 2, copy back the reduced tensors to the excess:
-        for (size_t j = 2*offset_j_max; j < n_backends; j++) {
-            auto & bcj_src = backend_ctx->backend_configs[j - 2*offset_j_max];
-            auto & bcj_dst = backend_ctx->backend_configs[j];
-
-            ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
-            ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
-            ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_dst);
-        }
-
        return GGML_STATUS_SUCCESS;
    };

@@ -83,6 +83,7 @@
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // quants.c
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
+#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
@@ -151,6 +151,8 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    const block_q1_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

+    float sumf = 0.0f;
+
 #if defined(__ARM_NEON)
    float32x4_t sumv = vdupq_n_f32(0.0f);

@@ -210,13 +212,31 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        }
    }

-    *s = vaddvq_f32(sumv);
+    sumf = vaddvq_f32(sumv);
 #else
-    UNUSED(nb);
-    UNUSED(x);
-    UNUSED(y);
-    ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    // Scalar fallback: walk every Q1_0 block of x and accumulate its dot product with the matching Q8_0 blocks of y
+    for (int i = 0; i < nb; i++) {
+        const float d0 = GGML_FP16_TO_FP32(x[i].d); // per-block factor of the Q1_0 block
+
+        // Each Q1_0 block is paired with 4 consecutive Q8_0 blocks: y[i*4 + 0] .. y[i*4 + 3]
+        for (int k = 0; k < 4; k++) {
+            const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d); // per-block factor of the k-th Q8_0 block
+
+            int sumi = 0; // integer dot product of this Q8_0 block with its +/-1 weights
+            for (int j = 0; j < QK8_0; j++) {
+                const int bit_index = k * QK8_0 + j; // position of this weight's bit within x[i].qs
+                const int byte_index = bit_index / 8;
+                const int bit_offset = bit_index % 8; // bits are read starting from the least significant bit of each byte
+
+                const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1; // set bit -> +1, clear bit -> -1
+                sumi += xi * y[i*4 + k].qs[j];
+            }
+            sumf += d0 * d1 * sumi; // apply both block factors to the integer partial sum
+        }
+    }
 #endif
+
+    *s = sumf;
 }


@@ -274,18 +274,6 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
 }
 #endif
 #elif defined(__SSSE3__)
-static inline __m128i bytes_from_bits_16(const uint8_t * x) {
-    uint16_t x16;
-    memcpy(&x16, x, sizeof(uint16_t));
-
-    const __m128i shuf_mask = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
-    __m128i bytes = _mm_shuffle_epi8(_mm_set1_epi16((short) x16), shuf_mask);
-    const __m128i bit_mask = _mm_set_epi64x(0x7fbfdfeff7fbfdfe, 0x7fbfdfeff7fbfdfe);
-    bytes = _mm_or_si128(bytes, bit_mask);
-
-    return _mm_cmpeq_epi8(bytes, _mm_set1_epi64x(-1));
-}
-
 // horizontally add 4x4 floats
 static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
    __m128 res_0 =_mm_hadd_ps(a, b);
@@ -552,152 +540,6 @@ static inline __m128i get_scale_shuffle(int i) {
 }
 #endif

-void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    const int qk = QK1_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q1_0 * GGML_RESTRICT x = vx;
-    const block_q8_0 * GGML_RESTRICT y = vy;
-
-#if defined(__AVX2__)
-    const __m256i ones_8 = _mm256_set1_epi8(1);
-    const __m256i ones_16 = _mm256_set1_epi16(1);
-    const __m256i byte_shuf = _mm256_setr_epi8(
-            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
-            2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
-    const __m256i bit_masks = _mm256_setr_epi8(
-            1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128,
-            1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128);
-    const __m256i zero = _mm256_setzero_si256();
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int ib = 0; ib < nb; ++ib) {
-        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
-        const uint32_t * GGML_RESTRICT qs32 = (const uint32_t *) x[ib].qs;
-        const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
-
-        __m256 acc_block;
-        {
-            const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[0].qs);
-            const __m256i sm = _mm256_cmpeq_epi8(
-                    _mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[0]), byte_shuf), bit_masks), zero);
-            const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
-            const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
-            acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), _mm256_cvtepi32_ps(s32));
-        }
-        for (int K = 1; K < 4; ++K) {
-            const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[K].qs);
-            const __m256i sm = _mm256_cmpeq_epi8(
-                    _mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[K]), byte_shuf), bit_masks), zero);
-            const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
-            const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
-            acc_block = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), _mm256_cvtepi32_ps(s32), acc_block);
-        }
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__AVX__)
-    const __m128i ones_8 = _mm_set1_epi8(1);
-    const __m128i ones_16 = _mm_set1_epi16(1);
-    const __m128i zero = _mm_setzero_si128();
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int ib = 0; ib < nb; ++ib) {
-        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
-        const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
-        __m256 acc_block;
-        {
-            const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[0]);
-            const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
-            const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
-            const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[0]);
-            const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[16]);
-            const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
-            const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
-            const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
-            const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
-            const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
-            const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
-            const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
-            const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
-            const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
-            acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), q);
-        }
-        for(int K = 1; K < 4; ++K) {
-            const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[(K) * 4]);
-            const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
-            const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
-            const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[0]);
-            const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[16]);
-            const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
-            const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
-            const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
-            const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
-            const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
-            const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
-            const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
-            const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
-            const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
-            acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(K)].d)), q));
-        }
-#undef Q1_AVX_BLOCK
-
-        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__SSSE3__)
-    const __m128i ones_8 = _mm_set1_epi8(1);
-    const __m128i ones_16 = _mm_set1_epi16(1);
-    const __m128i zero = _mm_setzero_si128();
-    __m128 acc_0 = _mm_setzero_ps();
-    __m128 acc_1 = _mm_setzero_ps();
-    __m128 acc_2 = _mm_setzero_ps();
-    __m128 acc_3 = _mm_setzero_ps();
-
-    for (int ib = 0; ib < nb; ++ib) {
-        const __m128 d0 = _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
-        const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
-
-#define Q1_SSSE3_BLOCK(QS_OFF, Y_IDX, ACC) \
-        { \
-            const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 0]); \
-            const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 2]); \
-            const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[0]); \
-            const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[16]); \
-            const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero); \
-            const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero); \
-            const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0); \
-            const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1); \
-            const __m128i sum_0 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_0), ones_16); \
-            const __m128i sum_1 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_1), ones_16); \
-            const __m128 q = _mm_cvtepi32_ps(_mm_add_epi32(sum_0, sum_1)); \
-            (ACC) = _mm_add_ps((ACC), _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(Y_IDX)].d))), q)); \
-        }
-        Q1_SSSE3_BLOCK(0,  0, acc_0)
-        Q1_SSSE3_BLOCK(4,  1, acc_1)
-        Q1_SSSE3_BLOCK(8,  2, acc_2)
-        Q1_SSSE3_BLOCK(12, 3, acc_3)
-#undef Q1_SSSE3_BLOCK
-    }
-
-    *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
-#else
-    UNUSED(nb);
-    UNUSED(x);
-    UNUSED(y);
-    ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
-#endif
-}
-
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -137,28 +137,22 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
    float sumf = 0.0;

    for (int i = 0; i < nb; i++) {
-        const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float d0 = GGML_FP16_TO_FP32(x[i].d);

        float sumi = 0.0f;

        for (int k = 0; k < 4; k++) {
-            const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
-            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
+            const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
+
            int sumi_block = 0;

-            const uint8_t * GGML_RESTRICT bits = &x[i].qs[k * 4];
-            const int8_t  * GGML_RESTRICT qy   = yb->qs;
+            for (int j = 0; j < QK8_0; j++) {
+                const int bit_index = k * QK8_0 + j;
+                const int byte_index = bit_index / 8;
+                const int bit_offset = bit_index % 8;

-            for (int b = 0; b < 4; ++b, qy += 8) {
-                const unsigned mask = bits[b];
-                sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
-                           +  ((mask & 0x02) ? qy[1] : -qy[1])
-                           +  ((mask & 0x04) ? qy[2] : -qy[2])
-                           +  ((mask & 0x08) ? qy[3] : -qy[3])
-                           +  ((mask & 0x10) ? qy[4] : -qy[4])
-                           +  ((mask & 0x20) ? qy[5] : -qy[5])
-                           +  ((mask & 0x40) ? qy[6] : -qy[6])
-                           +  ((mask & 0x80) ? qy[7] : -qy[7]);
+                const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
+                sumi_block += xi * y[i*4 + k].qs[j];
            }

            sumi += d1 * sumi_block;
@@ -368,21 +368,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
    }

    ~ggml_cuda_pool_leg() {
-        clear_pool();
-        GGML_ASSERT(pool_size == 0);
-    }
-
-    void clear_pool() {
        ggml_cuda_set_device(device);
        for (int i = 0; i < MAX_BUFFERS; ++i) {
            ggml_cuda_buffer & b = buffer_pool[i];
            if (b.ptr != nullptr) {
                CUDA_CHECK(cudaFree(b.ptr));
                pool_size -= b.size;
-                b.ptr  = nullptr;
-                b.size = 0;
            }
        }
+        GGML_ASSERT(pool_size == 0); // each freed buffer subtracted its size above; a non-zero remainder means the pool accounting is off
    }

    void * alloc(size_t size, size_t * actual_size) override {
@@ -427,20 +421,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
        size_t look_ahead_size = (size_t) (1.05 * size);
        look_ahead_size = 256 * ((look_ahead_size + 255)/256);
        ggml_cuda_set_device(device);
-        cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
-        if (err == cudaErrorMemoryAllocation) {
-            (void)cudaGetLastError();
-            const size_t cached_bytes = pool_size;
-            GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, flushing %.2f MiB of cached buffers and retrying\n",
-                           device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
-            CUDA_CHECK(cudaDeviceSynchronize());
-            clear_pool();
-            err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
-            if (err == cudaSuccess) {
-                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
-            }
-        }
-        CUDA_CHECK(err);
+        CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
        *actual_size = look_ahead_size;
        pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
@@ -1222,13 +1203,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
    // For small tensors, simply reduce them as FP32.
    // The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
    if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
-        for (size_t i = 0; i < n_backends; ++i) {
-            if ((tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
-                ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
-                ggml_cuda_set_device(cuda_ctx->device);
-                CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, ggml_nbytes(tensors[i]), cuda_ctx->stream()));
-            }
-        }
        NCCL_CHECK(ncclGroupStart());
        for (size_t i = 0; i < n_backends; ++i) {
            ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
@@ -1250,11 +1224,7 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
        tmp[i].alloc(ne);

        ggml_cuda_set_device(cuda_ctx->device);
-        if (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) {
-            to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
-        } else {
-            CUDA_CHECK(cudaMemsetAsync(tmp[i].get(), 0, ne * sizeof(nv_bfloat16), cuda_ctx->stream()));
-        }
+        to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
        CUDA_CHECK(cudaGetLastError());
    }

@@ -3592,30 +3562,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph *                cgraph,
        return true;
    }

-    if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_SQR
-     && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_RELU) {
-        const ggml_tensor * unary = cgraph->nodes[node_idx];
-        const ggml_tensor * sqr   = cgraph->nodes[node_idx+1];
-
-        if (ggml_get_unary_op(unary) != GGML_UNARY_OP_RELU) {
-            return false;
-        }
-
-        if (unary->type != GGML_TYPE_F32 && unary->type != GGML_TYPE_F16) {
-            return false;
-        }
-
-        if (unary->type != sqr->type) {
-            return false;
-        }
-
-        if (!ggml_is_contiguous(unary->src[0])) {
-            return false;
-        }
-
-        return true;
-    }
-
    if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
     && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
        const ggml_tensor *scale  = cgraph->nodes[node_idx];
@@ -4124,12 +4070,6 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
                        continue;
                    }

-                    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) {
-                        ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i+1]);
-                        i++;
-                        continue;
-                    }
-
                    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
                        i += 2;
                        ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
@@ -65,11 +65,6 @@ static __device__ __forceinline__ float op_sqr(float x) {
    return x * x;
 }

-static __device__ __forceinline__ float op_relu_sqr(float x) {
-    const float r = fmaxf(x, 0.0f);
-    return r * r;
-}
-
 static __device__ __forceinline__ float op_sqrt(float x) {
    return sqrtf(x);
 }
@@ -620,21 +615,3 @@ void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary
            GGML_ABORT("Unsupported unary op for fused unary+mul");
    }
 }
-
-/* fused relu + sqr */
-
-void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node) {
-    const ggml_tensor * src = relu_node->src[0];
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src));
-    GGML_ASSERT(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
-    GGML_ASSERT(src->type == sqr_node->type);
-
-    const int k = ggml_nelements(src);
-    if (src->type == GGML_TYPE_F16) {
-        unary_cuda<op_relu_sqr>((const half *)src->data, (half *)sqr_node->data, k, stream);
-    } else {
-        unary_cuda<op_relu_sqr>((const float *)src->data, (float *)sqr_node->data, k, stream);
-    }
-}
@@ -91,8 +91,6 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node);

-void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node);
-
 __device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
    return x / (1.0f + expf(-x));
 }
@@ -58,7 +58,6 @@
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
-#define cudaErrorMemoryAllocation hipErrorOutOfMemory
 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
@@ -42,7 +42,6 @@
 #define cudaDeviceProp musaDeviceProp
 #define cudaDeviceSynchronize musaDeviceSynchronize
 #define cudaError_t musaError_t
-#define cudaErrorMemoryAllocation musaErrorMemoryAllocation
 #define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
 #define cudaEventCreateWithFlags musaEventCreateWithFlags
@@ -12,12 +12,9 @@
 #include <cstddef>
 #include <stdexcept>
 #include <string>
-#include <sstream>
-#include <iomanip>
 #include <unordered_set>
 #include <unordered_map>
 #include <regex>
-#include <queue>

 #ifdef _WIN32
 #    include <sal.h>
@@ -44,26 +41,18 @@
 #include "htp_iface.h"
 #include "htp-drv.h"

-using intvec  = std::vector<int>;
-using uintvec = std::vector<unsigned int>;
-using u32vec  = std::vector<uint32_t>;
-
 static size_t opt_ndev         = 1;
 static size_t opt_nhvx         = 0; // use all
 static int    opt_arch         = 0; // autodetect
 static int    opt_etm          = 0;
 static int    opt_verbose      = 0;
-static int    opt_profile      = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
+static int    opt_profile      = 0;
 static int    opt_hostbuf      = 1; // hostbuf ON by default
 static int    opt_use_hmx      = 1; // when set, enable HMX; when 0, use HVX only

-// Default PMU events, if profiling with PMU (mode=2) is enabled
-// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
-//     https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
-static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
-
 // Enable all stages by default
-static int opt_opstage  = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
+static int opt_opmask   = HTP_OPMASK_QUEUE | HTP_OPMASK_COMPUTE;
+static int opt_opsync   = 0;  // synchronous ops
 static int opt_opbatch  = 1024; // max number of ops in a batch
 static int opt_opqueue  = 16;   // max number of pending batches
 static std::regex* opt_opfilter = NULL; // regex of ops to not claim
@@ -115,26 +104,19 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct
 }

 static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
-                                      uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
+                                      uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
    if (!opt_profile) return;

    op_desc desc(op);
-
-    char pmu_str[256] = "";
-    if (opt_profile > 1) {
-        static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
-        sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
-                pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
-    }
-
-    GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
-            ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
+    GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
+                ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
+                op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
 }

 // ** backend sessions

 struct ggml_hexagon_opbatch;
-struct ggml_hexagon_opqueue;
+struct ggml_hexagon_opshm;

 struct ggml_hexagon_session {
    std::string      name;
@@ -150,8 +132,8 @@ struct ggml_hexagon_session {
    bool             valid_iface;

    std::atomic<int>      op_pending;
-    ggml_hexagon_opbatch* op_batch;
-    ggml_hexagon_opqueue* op_queue;
+    ggml_hexagon_opbatch *op_batch;
+    ggml_hexagon_opshm   *op_shm;

    ggml_backend_buffer_type buffer_type        = {};
    ggml_backend_buffer_type repack_buffer_type = {};
@@ -1539,14 +1521,65 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf

 // Backend session implementation

+// Fixed-size block pool carved out of one pinned shared buffer.
+// The buffer is split into max_pending blocks, each large enough to hold the
+// descriptors (buffers, tensors, ops) of one batch of up to max_batch ops.
+// block_mask[i] == true means block #i is free.
+struct ggml_hexagon_opshm {
+    ggml_hexagon_shared_buffer *sbuf;       // backing shared buffer (owned)
+
+    std::vector<bool> block_mask;           // per-block free flag (true = free)
+    size_t            block_size;           // size of one block in bytes
+
+    uint8_t * base()     const { return this->sbuf->base; }
+    int       fd()       const { return this->sbuf->fd;   }
+    size_t    n_blocks() const { return this->block_mask.size(); }
+
+    // Allocates the shared buffer and marks all max_pending blocks as free.
+    ggml_hexagon_opshm(ggml_hexagon_session *sess, size_t max_batch, size_t max_pending) {
+        size_t n_bufs    = HTP_OP_MAX_BUFS;
+        size_t n_ops     = max_batch;
+        size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
+
+        block_mask.resize(max_pending, true);
+
+        block_size = sizeof(htp_buf_desc) * n_bufs    +
+                     sizeof(htp_tensor)   * n_tensors +
+                     sizeof(htp_op_desc)  * n_ops;
+
+        sbuf = new ggml_hexagon_shared_buffer(sess, block_size * block_mask.size(), true /* pinned */);
+
+        if (opt_verbose) {
+            GGML_LOG_INFO("ggml-hex: %s allocated shared buf %zu : block-size %zu max-batch %zu max-pending %zu\n",
+                    sess->c_name(), (size_t) sbuf->size, block_size, max_batch, max_pending);
+        }
+    }
+
+    // sbuf is owned through a raw pointer; a copy would delete it twice.
+    ggml_hexagon_opshm(const ggml_hexagon_opshm &)             = delete;
+    ggml_hexagon_opshm & operator=(const ggml_hexagon_opshm &) = delete;
+
+    ~ggml_hexagon_opshm() {
+        delete sbuf;
+    }
+
+    // Returns the address of the first free block and marks it as used,
+    // or nullptr when every block is currently in use.
+    uint8_t * allocate() {
+        auto it = std::find(block_mask.begin(), block_mask.end(), true);
+        if (it == block_mask.end()) {
+            return nullptr;
+        }
+
+        const size_t i    = (size_t) std::distance(block_mask.begin(), it);
+        uint8_t *    addr = sbuf->base + (i * block_size);
+        block_mask[i]     = false;
+
+        HEX_VERBOSE("ggml-hex: %s allocated op shm #%u %p\n", sbuf->sess->c_name(), (unsigned int) i, (void*) addr);
+        return addr;
+    }
+
+    // Marks the block containing addr as free again.
+    // addr must point into this pool; a null or foreign pointer aborts
+    // instead of indexing block_mask out of range.
+    void release(uint8_t * addr) {
+        GGML_ASSERT(addr != nullptr && addr >= sbuf->base);
+
+        const size_t i = (size_t) (addr - sbuf->base) / block_size;
+        GGML_ASSERT(i < block_mask.size());
+
+        block_mask[i] = true;
+        HEX_VERBOSE("ggml-hex: %s released op shm #%u %p\n", sbuf->sess->c_name(), (unsigned int) i, (void*) addr);
+    }
+};
+
 struct ggml_hexagon_opbatch {
-    ggml_hexagon_session*            sess;
+    const char* name;

-    std::vector<const ggml_tensor*>  ops;       // pointers to original ops
-
-    std::vector<htp_buf_desc>        h_bufs;    // htp buffer descriptors
-    std::vector<htp_tensor>          h_tens;    // htp tensor descriptors
-    std::vector<htp_op_desc>         h_ops;     // htp op descriptors
+    std::vector<htp_buf_desc> buffers;
+    std::vector<htp_tensor>   tensors;
+    std::vector<htp_op_desc>  ops;

    std::unordered_map<int, int>                b_map; // buffer fd   to index
    std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr  to index
@@ -1573,21 +1606,19 @@ struct ggml_hexagon_opbatch {
        d_map.clear();
    }

-    ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size) {
-        this->sess = sess;
+    ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t max_batch) {
+        name = sess->c_name();

        n_bufs_max = HTP_OP_MAX_BUFS;
-        n_ops_max  = batch_size;
+        n_ops_max  = max_batch;
        n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;

        b_vmem_max = HTP_OP_MAX_VMEM;

+        buffers.resize(n_bufs_max);
+        tensors.resize(n_tens_max);
        ops.resize(n_ops_max);

-        h_bufs.resize(n_bufs_max);
-        h_tens.resize(n_tens_max);
-        h_ops.resize(n_ops_max);
-
        b_map.reserve(n_bufs_max);
        t_map.reserve(n_tens_max);
        d_map.reserve(n_tens_max);
@@ -1609,7 +1640,7 @@ struct ggml_hexagon_opbatch {

        b_map.insert({sbuf->fd, bi});

-        htp_buf_desc &b = h_bufs[bi];
+        htp_buf_desc &b = buffers[bi];
        b.base = (uint64_t) sbuf->base;
        b.fd   = sbuf->fd;
        b.size = sbuf->size;
@@ -1633,7 +1664,7 @@ struct ggml_hexagon_opbatch {
        // First lookup by tensor data
        auto range = d_map.equal_range(t->data);
        for (auto it = range.first; it != range.second; ++it) {
-            htp_tensor * h = &h_tens[it->second];
+            htp_tensor * h = &tensors[it->second];
            if (same_shape(h, t)) { return it->second; }
        }

@@ -1651,7 +1682,7 @@ struct ggml_hexagon_opbatch {
        uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
        size_t   t_size   = ggml_nbytes(t);

-        htp_tensor &h = h_tens[ti];
+        htp_tensor &h = tensors[ti];
        h.bi    = add_buffer(sbuf);
        h.data  = t_offset;
        h.size  = t_size;
@@ -1706,170 +1737,65 @@ struct ggml_hexagon_opbatch {
    // assumes that fit_op() was called first and returned true
    void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
        // Add new op
-
-        unsigned int n = n_ops++;
+        htp_op_desc &o = ops[n_ops++];
        GGML_ASSERT(n_ops <= n_ops_max);

-        ops[n] = t;
-
-        htp_op_desc &o = h_ops[n];
        memcpy(&o.params, &t->op_params, sizeof(t->op_params));
        o.opcode = opcode;
        o.flags  = 0;

-        if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
+        if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
            o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
        }

-        ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
+        ggml_hexagon_dump_op_exec(name, t, o.flags);

        for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
            o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
        }
        o.dst = add_tensor(t);
    }
-};

-struct ggml_hexagon_opqueue {
-    // Shared buffer for storing batches
-    ggml_hexagon_shared_buffer *shm_buf;
-    size_t                      shm_blk_size;
+    size_t flush(uint8_t * mem_addr, size_t mem_size) {
+        static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
+        static_assert(sizeof(htp_tensor)   % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
+        static_assert(sizeof(htp_op_desc)  % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");

-    using opvec = std::vector<const ggml_tensor*>;
+        const size_t b_size = sizeof(htp_buf_desc) * n_bufs;
+        const size_t t_size = sizeof(htp_tensor)   * n_tens;
+        const size_t o_size = sizeof(htp_op_desc)  * n_ops;

-    std::queue<unsigned int>    done;       // completed batch ids
-    std::vector<opvec>          op_cache;   // per batch op cache
-    std::vector<uint64_t>       start_usec; // per batch start time
+        const size_t m_size = b_size + t_size + o_size;
+        GGML_ASSERT(m_size <= mem_size);

-    ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
-        size_t n_bufs    = HTP_OP_MAX_BUFS;
-        size_t n_ops     = batch_size;
-        size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
+        uint8_t * b_ptr = (uint8_t *) mem_addr;
+        uint8_t * t_ptr = (uint8_t *) b_ptr + b_size;
+        uint8_t * o_ptr = (uint8_t *) t_ptr + t_size;

-        shm_blk_size = sizeof(htp_buf_desc)  * n_bufs    +
-                       sizeof(htp_tensor)    * n_tensors +
-                       sizeof(htp_op_desc)   * n_ops     +
-                       sizeof(htp_prof_desc) * n_ops;
+        memcpy(b_ptr, (void *) buffers.data(), b_size);
+        memcpy(t_ptr, (void *) tensors.data(), t_size);
+        memcpy(o_ptr, (void *) ops.data(),     o_size);

-        shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
-
-        op_cache.resize(depth);
-        start_usec.resize(depth, 0);
-
-        // init done queue
-        for (unsigned int i = 0; i < depth; i++) { done.push(i); }
-
-        if (opt_verbose) {
-            GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
-                    sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
-        }
-    }
-
-    ~ggml_hexagon_opqueue() {
-        delete shm_buf;
-    }
-
-    // push new batch
-    bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
-        static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
-        static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
-        static_assert(sizeof(htp_buf_desc)    % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
-        static_assert(sizeof(htp_tensor)      % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
-        static_assert(sizeof(htp_op_desc)     % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
-        static_assert(sizeof(htp_prof_desc)   % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
-
-        if (done.empty()) { return false; }
-
-        req.id        = done.front(); done.pop(); // batch id
-        req.n_bufs    = op_batch->n_bufs;
-        req.n_tensors = op_batch->n_tens;
-        req.n_ops     = op_batch->n_ops;
-
-        op_cache[req.id]   = op_batch->ops;
-        start_usec[req.id] = ggml_time_us();
-
-        const size_t b_size = sizeof(htp_buf_desc)  * req.n_bufs;
-        const size_t t_size = sizeof(htp_tensor)    * req.n_tensors;
-        const size_t o_size = sizeof(htp_op_desc)   * req.n_ops;
-        const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
-
-        dbuf.ptr      = shm_buf->base + (req.id * shm_blk_size);
-        dbuf.fd       = shm_buf->fd;
-        dbuf.flags    = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
-        dbuf.offset   = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
-        dbuf.size     = b_size + t_size + o_size + p_size;
-
-        GGML_ASSERT(dbuf.size <= shm_blk_size);
-
-        uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
-        uint8_t * b_ptr = m_ptr; m_ptr += b_size;
-        uint8_t * t_ptr = m_ptr; m_ptr += t_size;
-        uint8_t * o_ptr = m_ptr;
-
-        memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
-        memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
-        memcpy(o_ptr, (void *) op_batch->h_ops.data(),  o_size);
-
-        HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
-                shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
-                b_size, t_size, o_size, (size_t) dbuf.size);
-
-        op_batch->reset();
+        HEX_VERBOSE("ggml-hex: %s flush-opbatch : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu\n",
+                name, n_bufs, n_tens, n_ops, b_vmem, b_size, t_size, o_size);

        if (opt_verbose > 1) {
            htp_buf_desc *b = (htp_buf_desc*) b_ptr;
-            for (unsigned int i=0; i < req.n_bufs; i++) {
-                GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
+            for (unsigned int i=0; i < n_bufs; i++) {
+                GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", name, i,
                            b[i].fd, (void *) b[i].base, (size_t) b[i].size);
            }
            htp_tensor *t = (htp_tensor*) t_ptr;
-            for (unsigned int i=0; i < req.n_tensors; i++) {
+            for (unsigned int i=0; i < n_tens; i++) {
                GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
-                            shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
+                            name, i, t[i].bi, t[i].data, t[i].size,
                            (size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
            }
        }

-        return true;
-    }
+        reset();

-    void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
-        GGML_ASSERT(rsp.id < op_cache.size());
-
-        done.push(rsp.id);
-
-        const size_t b_size = sizeof(htp_buf_desc)  * rsp.n_bufs;
-        const size_t t_size = sizeof(htp_tensor)    * rsp.n_tensors;
-        const size_t o_size = sizeof(htp_op_desc)   * rsp.n_ops;
-        const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
-
-        const size_t m_size = b_size + t_size + o_size + p_size;
-        GGML_ASSERT(m_size <= shm_blk_size);
-
-        HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
-                shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
-                (size_t) dbuf.size, b_size, t_size, o_size);
-
-        uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
-        uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
-
-        if (opt_profile && rsp.n_ops > 0) {
-            auto & ops = op_cache[rsp.id];
-
-            uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
-            uint32_t htp_usec   = 0;
-
-            GGML_ASSERT(rsp.n_ops <= ops.size());
-
-            const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
-            for (uint32_t i = 0; i < rsp.n_ops; i++) {
-                htp_usec += pd[i].usecs;
-                ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
-            }
-
-            GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
-                           shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
-        }
+        return m_size;
    }
 };

@@ -1898,12 +1824,17 @@ void ggml_hexagon_session::flush_pending(bool all) {
            GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
        }

+        op_shm->release((uint8_t*) dbuf.ptr);
+
        if (rsp.status != HTP_STATUS_OK) {
            GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
            // TODO: handle errors
        }

-        op_queue->pop(rsp, dbuf);
+        // FIXME: profile will be per opreq
+        // this->prof_usecs  = rsp.prof_usecs;
+        // this->prof_cycles = rsp.prof_cycles;
+        // this->prof_pkts   = rsp.prof_pkts;

        this->op_pending--;  // atomic dec

@@ -1914,17 +1845,28 @@ void ggml_hexagon_session::flush_pending(bool all) {
 void ggml_hexagon_session::flush_batch() {
    if (op_batch->empty()) { return; }

-    htp_opbatch_req req {};
-    dspqueue_buffer dbuf{};
+    htp_opbatch_req req;
+    req.n_bufs    = op_batch->n_bufs;
+    req.n_tensors = op_batch->n_tens;
+    req.n_ops     = op_batch->n_ops;

-    if (!op_queue->push(req, dbuf, op_batch)) {
+    dspqueue_buffer dbuf;
+    dbuf.fd     = op_shm->fd();
+    dbuf.flags  = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
+    dbuf.ptr    = op_shm->allocate();
+    if (!dbuf.ptr) {
        flush_pending(false);
-        op_queue->push(req, dbuf, op_batch);
+        dbuf.ptr = op_shm->allocate();
    }

+    dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) op_shm->base();
+    dbuf.size   = op_batch->flush((uint8_t*) dbuf.ptr, op_shm->block_size);
+
    // Bump pending flag (cleared in the session::flush once we get the response)
    this->op_pending++;  // atomic inc

+    HEX_VERBOSE("ggml-hex: %s: queue-opbatch : %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
+
    int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
    if (err != 0) {
        GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
@@ -2074,33 +2016,25 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
    }

    if (opt_etm) {
-        err = htp_iface_etm(this->handle, 1);
+        err = htp_iface_enable_etm(this->handle);
        if (err != 0) {
            GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
        }
    }

-    if (opt_profile) {
-        htp_iface_pmu_conf pmu_conf{};
-        std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
-
-        err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
-        if (err != 0) {
-            GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
-        }
-    }
-
-    // Allocate buffers and state for op batching
-    this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
-    this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
-
-    // Start processing op batch requests
+    // Start the DSP-side service. We need to pass the queue ID to the
+    // DSP in a FastRPC call; the DSP side will import the queue and start
+    // listening for packets in a callback.
    err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx);
    if (err != 0) {
        GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
        throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
    }
    this->valid_iface = true;
+
+    // Allocate buffers and state for op batching
+    this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
+    this->op_shm   = new ggml_hexagon_opshm(this, opt_opbatch, opt_opqueue);
 }

 void ggml_hexagon_session::release() noexcept(true) {
@@ -2109,7 +2043,7 @@ void ggml_hexagon_session::release() noexcept(true) {
    int err;

    delete this->op_batch;
-    delete this->op_queue;
+    delete this->op_shm;

    // Stop the DSP-side service and close the queue
    if (this->valid_iface) {
@@ -2120,20 +2054,12 @@ void ggml_hexagon_session::release() noexcept(true) {
    }

    if (opt_etm) {
-        err = htp_iface_etm(this->handle, 0);
+        err = htp_iface_disable_etm(this->handle);
        if (err != 0) {
            GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
        }
    }

-    if (opt_profile) {
-        htp_iface_pmu_conf pmu_conf{};
-        err = htp_iface_profiler(this->handle, 0, &pmu_conf);
-        if (err != 0) {
-            GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
-        }
-    }
-
    if (this->valid_queue) {
        err = dspqueue_close(queue);
        if (err != 0) {
@@ -2151,7 +2077,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
    repack_buffer_type.device = dev;

    op_batch = nullptr;
-    op_queue = nullptr;
+    op_shm   = nullptr;

    try {
        allocate(dev_id);
@@ -2670,62 +2596,6 @@ static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * se
    return true;
 }

-static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * dst  = op;
-
-    // diag only supports F32 currently
-    if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
-        return false;
-    }
-
-    // Input must have ne[1] == 1 (vector input)
-    if (src0->ne[1] != 1) {
-        return false;
-    }
-
-    // Output must be square in first two dimensions
-    if (dst->ne[0] != dst->ne[1] || dst->ne[0] != src0->ne[0]) {
-        return false;
-    }
-
-    GGML_UNUSED(sess);
-    return true;
-}
-
-static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
-    const struct ggml_tensor * src0 = op->src[0]; // A
-    const struct ggml_tensor * src1 = op->src[1]; // B
-    const struct ggml_tensor * dst  = op;         // X
-
-    if (!src0 || !src1) {
-        return false;
-    }
-
-    if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
-        return false;
-    }
-
-    if (src0->ne[0] != src0->ne[1]) {
-        return false;
-    }
-
-    if (src0->ne[1] != src1->ne[1]) {
-        return false;
-    }
-
-    if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
-        return false;
-    }
-
-    if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
-        return false;
-    }
-
-    GGML_UNUSED(sess);
-    return true;
-}
-
 static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
    auto sess = static_cast<ggml_hexagon_session *>(backend->context);
    return sess->c_name();
@@ -2762,9 +2632,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
        case GGML_OP_ROPE:           return HTP_OP_ROPE;
        case GGML_OP_REPEAT:         return HTP_OP_REPEAT;
        case GGML_OP_CUMSUM:         return HTP_OP_CUMSUM;
-        case GGML_OP_FILL:           return HTP_OP_FILL;
-        case GGML_OP_DIAG:           return HTP_OP_DIAG;
-        case GGML_OP_SOLVE_TRI:      return HTP_OP_SOLVE_TRI;
+
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(t)) {
                case GGML_UNARY_OP_SILU:     return HTP_OP_UNARY_SILU;
@@ -2805,7 +2673,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg

    for (int i = 0; i < graph->n_nodes; ++i) {
        ggml_tensor * n = graph->nodes[i];
-        if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
+        if (op_is_compute(n)) {
            sess->enqueue_op(op_remap_to_htp(n), n);
        }
    }
@@ -3161,17 +3029,6 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se
    return true;
 }

-static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
-    const struct ggml_tensor * dst = op;
-
-    if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
-        return false;
-    }
-
-    GGML_UNUSED(sess);
-    return true;
-}
-
 static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    auto sess = static_cast<ggml_hexagon_session *>(dev->context);

@@ -3302,18 +3159,6 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
            supp = ggml_hexagon_supported_cumsum(sess, op);
            break;

-        case GGML_OP_FILL:
-            supp = ggml_hexagon_supported_fill(sess, op);
-            break;
-
-        case GGML_OP_DIAG:
-            supp = ggml_hexagon_supported_diag(sess, op);
-            break;
-
-        case GGML_OP_SOLVE_TRI:
-            supp = ggml_hexagon_supported_solve_tri(sess, op);
-            break;
-
        default:
            break;
    }
@@ -3449,26 +3294,6 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
    return NULL;
 }

-template<typename T> std::vector<T> str_to_vec(const char* str) {
-    std::stringstream ss(str);
-    std::vector<T> v;
-    std::string    t;
-
-    while (std::getline(ss, t, ',')) {
-        v.push_back(std::stoul(t, nullptr, 0));
-    }
-
-    return v;
-}
-
-template<typename T, int BASE=10> std::string vec_to_str(std::vector<T> v) {
-    std::stringstream ss;
-    ss << std::setbase(BASE) << std::showbase;
-    for (auto i : v) { ss << i << ','; }
-    auto str = ss.str(); str.pop_back(); // drop last comma
-    return str;
-}
-
 static void ggml_hexagon_init(ggml_backend_reg * reg) {
    // Basic sanity checks to make sure definitions match
    static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
@@ -3482,7 +3307,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {

    const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
    const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
-    const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
+    const char * str_opmask  = getenv("GGML_HEXAGON_OPMASK");
+    const char * str_opsync  = getenv("GGML_HEXAGON_OPSYNC");
    const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
    const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
    const char * str_opfilter= getenv("GGML_HEXAGON_OPFILTER");
@@ -3495,30 +3321,19 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {

    auto RE_ICASE = std::regex_constants::icase;

-    opt_opfilter     = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
-    opt_verbose      = str_verbose  ? atoi(str_verbose)             : 0;
-    opt_hostbuf      = str_hostbuf  ? atoi(str_hostbuf)             : opt_hostbuf;
-    opt_opstage      = str_opstage  ? strtoul(str_opstage, NULL, 0) : opt_opstage;
-    opt_opbatch      = str_opbatch  ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
-    opt_opqueue      = str_opqueue  ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
-    opt_etm          = str_etm      ? atoi(str_etm)                 : 0;
-    opt_nhvx         = str_nhvx     ? strtoul(str_nhvx, NULL, 0)    : opt_nhvx;
-    opt_use_hmx      = str_use_hmx  ? atoi(str_use_hmx)             : opt_use_hmx;
-    opt_ndev         = str_ndev     ? strtoul(str_ndev, NULL, 0)    : opt_ndev;
-    opt_hostbuf      = str_hostbuf  ? atoi(str_hostbuf)             : opt_hostbuf;
-
-    if (str_profile) {
-        opt_pmu_evt = [&]() -> std::vector<uint32_t> {
-            auto v  = str_to_vec<uint32_t>(str_profile);
-            switch (v.size()) {
-                case 1:  opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
-                case 8:  opt_profile = 2;    return v;           // mode with custom  pmu events
-                default: opt_profile = 0;    return {};          // garbage input
-            }}();
-        if (opt_profile == 1) opt_pmu_evt = {};
-        GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
-                vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
-    }
+    opt_opfilter     = str_opfilter     ? new std::regex(str_opfilter, RE_ICASE) : NULL;
+    opt_verbose      = str_verbose ? atoi(str_verbose) : 0;
+    opt_hostbuf      = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
+    opt_opmask       = str_opmask  ? strtoul(str_opmask, NULL, 0)  : opt_opmask;
+    opt_opsync       = str_opsync  ? atoi(str_opsync)              : opt_opsync;
+    opt_opbatch      = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
+    opt_opqueue      = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
+    opt_profile      = str_profile ? atoi(str_profile) : 0;
+    opt_etm          = str_etm     ? atoi(str_etm)     : 0;
+    opt_nhvx         = str_nhvx    ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
+    opt_use_hmx      = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
+    opt_ndev         = str_ndev    ? strtoul(str_ndev, NULL, 0) : opt_ndev;
+    opt_hostbuf      = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;

    if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
        opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
@@ -34,9 +34,6 @@ add_library(${HTP_LIB} SHARED
    argsort-ops.c
    ssm-conv.c
    cumsum-ops.c
-    fill-ops.c
-    diag-ops.c
-    solve-tri-ops.c
 )

 target_compile_definitions(${HTP_LIB} PRIVATE
@@ -1,216 +0,0 @@
-#pragma clang diagnostic ignored "-Wunused-but-set-variable"
-
-#include <HAP_farf.h>
-#include <HAP_perf.h>
-
-#define GGML_COMMON_DECL_C
-#include "ggml-common.h"
-#include "htp-ctx.h"
-#include "htp-ops.h"
-#include "hvx-types.h"
-#include "hex-utils.h"
-#include "hvx-copy.h"
-#include "hex-dma.h"
-// Declares src0/dst plus the shape (ne*) and byte-stride (nb*) locals used by the diag kernels; expects `octx` in scope.
-#define htp_diag_tensors_preamble                           \
-    const struct htp_tensor * restrict src0 = octx->src[0]; \
-    const struct htp_tensor * restrict dst  = octx->dst;    \
-                                                     \
-    const uint32_t ne02 = src0->ne[2];               \
-                                                     \
-    const uint32_t ne0 = dst->ne[0];                 \
-    const uint32_t ne1 = dst->ne[1];                 \
-                                                     \
-    const uint32_t nb02 = src0->nb[2];               \
-    const uint32_t nb03 = src0->nb[3];               \
-                                                     \
-    const uint32_t nb1 = dst->nb[1];                 \
-    const uint32_t nb2 = dst->nb[2];                 \
-    const uint32_t nb3 = dst->nb[3];
-
-struct htp_diag_context {  // state shared by every diag worker thread for one op invocation
-    struct htp_ops_context * octx;
-    size_t          src_batch_size;          // bytes in one source vector: src0->ne[0] * sizeof(float)
-    size_t          dst_row_size;            // bytes in one output row: dst->ne[0] * sizeof(float)
-    size_t          src_batch_size_aligned;  // src_batch_size rounded up to VLEN
-    size_t          dst_row_size_aligned;    // dst_row_size rounded up to VLEN
-    uint32_t        batches_per_thread;      // ceil(total_batches / n_threads)
-    uint32_t        total_batches;           // src0->ne[2] * src0->ne[3]
-};
-// Worker preamble: casts the opaque `data` argument back to the diag context, then declares the tensor locals.
-#define htp_diag_preamble                                              \
-    struct htp_diag_context * dctx = (struct htp_diag_context *) data; \
-    struct htp_ops_context *  octx = dctx->octx;                       \
-    htp_diag_tensors_preamble;
-// Builds one row of diag(src) in dst: splat 0.0f over the row, then copy the single element src[row_idx] to dst[row_idx].
-static inline void hvx_diag_row_f32(const float * restrict src, float * restrict dst,
-                                    uint32_t row_idx, uint32_t n) {
-    hvx_splat_f32_a((uint8_t *) dst, 0.0f, n);  // NOTE(review): the _a suffix suggests dst must be vector-aligned — confirm for the non-DMA caller
-    dst[row_idx] = src[row_idx];  // row_idx is not checked against n here
-}
-
-// ---------------------------------------------------------------------------
-// Per thread worker, DMA path: fetch one src vector into the scratchpad, build
-// each output row there, and DMA every finished row back to dst.
-// ---------------------------------------------------------------------------
-static void diag_thread_f32_dma(unsigned int nth, unsigned int ith, void * data) {
-    htp_diag_preamble;
-    dma_queue * dma_queue = octx->ctx->dma[ith];  // DMA queue selected by this worker's thread index
-    // t1/t2 are raw qtimer ticks; they only feed the FARF timing line at the end
-    uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
-    // this worker owns the half-open batch range [ib0, ib1)
-    const uint32_t ib0 = dctx->batches_per_thread * ith;
-    const uint32_t ib1 = MIN(ib0 + dctx->batches_per_thread, dctx->total_batches);
-    // rounding batches_per_thread up can leave trailing workers with no batches; they exit without logging
-    if (ib0 >= ib1) {
-        return;
-    }
-
-    const size_t src_batch_size         = dctx->src_batch_size;
-    const size_t dst_row_size           = dctx->dst_row_size;
-    const size_t src_batch_size_aligned = dctx->src_batch_size_aligned;
-    const size_t dst_row_size_aligned   = dctx->dst_row_size_aligned;
-
-    const uint8_t * src_data = (const uint8_t *) src0->data;
-    uint8_t *       dst_data = (uint8_t *) dst->data;
-
-    // 1 src buffer + 1 dst row buffer per thread in VTCM
-    uint8_t * src_spad = octx->src0_spad.data + (ith * src_batch_size_aligned);
-    uint8_t * dst_spad = octx->dst_spad.data  + (ith * dst_row_size_aligned);
-
-    for (uint32_t ib = ib0; ib < ib1; ib++) {
-        const uint32_t i3 = ib / ne02;  // split the flat batch index into (i3, i2)
-        const uint32_t i2 = ib % ne02;
-
-        const uint8_t * src_batch = src_data + i3 * nb03 + i2 * nb02;
-
-        // Fetch source vector into VTCM
-        dma_queue_push_ddr_to_vtcm(dma_queue,
-                                   dma_make_ptr(src_spad, src_batch),
-                                   src_batch_size_aligned, src_batch_size, 1);
-        dma_queue_flush(dma_queue);  // presumably blocks until the fetch has landed — confirm in hex-dma.h
-
-        const float * src_spad_f32 = (const float *) src_spad;
-        float       * dst_spad_f32 = (float *) dst_spad;
-        // dst_spad is a single row buffer reused for every i1, hence the flush after each writeback
-        for (uint32_t i1 = 0; i1 < ne1; i1++) {
-            // Compute row in VTCM
-            hvx_diag_row_f32(src_spad_f32, dst_spad_f32, i1, ne0);
-
-            // Write completed row back to DDR
-            uint8_t * dst_row = dst_data + i3 * nb3 + i2 * nb2 + i1 * nb1;
-            dma_queue_push_vtcm_to_ddr(dma_queue,
-                                       dma_make_ptr(dst_row, dst_spad),
-                                       dst_row_size, dst_row_size_aligned, 1);  // NOTE(review): unaligned/aligned sizes are swapped relative to the fetch above — confirm the argument order in hex-dma.h
-            dma_queue_flush(dma_queue);
-        }
-    }
-
-    t2 = HAP_perf_get_qtimer_count();
-
-    FARF(HIGH, "diag-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
-         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ib0, ib1,
-         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
-         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
-}
-
-// ---------------------------------------------------------------------------
-// Per thread worker, direct path: every output row is written straight into
-// dst->data (no DMA, no scratchpad).
-// ---------------------------------------------------------------------------
-static void diag_thread_f32(unsigned int nth, unsigned int ith, void * data) {
-    htp_diag_preamble;
-    // t1/t2 are raw qtimer ticks; they only feed the FARF timing line at the end
-    uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
-
-    const uint8_t * src_data = (const uint8_t *) src0->data;
-    uint8_t *       dst_data = (uint8_t *) dst->data;
-    // this worker owns the half-open batch range [ib0, ib1); an empty range simply skips the loop
-    const uint32_t ib0 = dctx->batches_per_thread * ith;
-    const uint32_t ib1 = MIN(ib0 + dctx->batches_per_thread, dctx->total_batches);
-
-    for (uint32_t ib = ib0; ib < ib1; ib++) {
-        const uint32_t i3 = ib / ne02;  // split the flat batch index into (i3, i2)
-        const uint32_t i2 = ib % ne02;
-
-        const float * restrict src_batch = (const float *)(src_data + i3 * nb03 + i2 * nb02);
-
-        for (uint32_t i1 = 0; i1 < ne1; i1++) {
-            float * restrict dst_row = (float *)(dst_data + i3 * nb3 + i2 * nb2 + i1 * nb1);
-            hvx_diag_row_f32(src_batch, dst_row, i1, ne0);  // row i1 receives src_batch[i1] at column i1
-        }
-    }
-
-    t2 = HAP_perf_get_qtimer_count();
-
-    FARF(HIGH, "diag-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
-         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ib0, ib1,
-         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
-         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
-}
-
-int op_diag_f32(struct htp_ops_context * octx) {  // F32 DIAG entry point: size the scratchpads, then fan out to the worker pool
-    const struct htp_tensor * src0 = octx->src[0];
-    const struct htp_tensor * dst  = octx->dst;
-
-    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
-        return HTP_STATUS_OK;
-    }
-    // one batch = one src vector (src0->ne[0] floats) that expands into dst->ne[1] output rows
-    const uint32_t total_batches = src0->ne[2] * src0->ne[3];
-    const uint32_t n_threads     = MIN(octx->n_threads, total_batches);  // never more workers than batches
-
-    const size_t src_batch_size         = src0->ne[0] * sizeof(float);
-    const size_t dst_row_size           = dst->ne[0] * sizeof(float);
-    const size_t src_batch_size_aligned = hex_round_up(src_batch_size, VLEN);
-    const size_t dst_row_size_aligned   = hex_round_up(dst_row_size, VLEN);
-
-    // 1 src buffer + 1 dst row buffer per thread
-    const size_t spad_per_thread = src_batch_size_aligned + dst_row_size_aligned;
-
-    octx->src0_spad.size_per_thread = src_batch_size_aligned;
-    octx->dst_spad.size_per_thread  = dst_row_size_aligned;
-
-    octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
-    octx->dst_spad.size  = n_threads * octx->dst_spad.size_per_thread;
-    // scratchpads are carved from the start of VTCM: all src slices first, then all dst slices
-    octx->src0_spad.data = octx->ctx->vtcm_base;                        octx->src0_spad.src = NULL;
-    octx->dst_spad.data  = octx->src0_spad.data + octx->src0_spad.size; octx->dst_spad.src  = NULL;
-
-    struct htp_diag_context dctx = {
-        .octx                   = octx,
-        .src_batch_size         = src_batch_size,
-        .dst_row_size           = dst_row_size,
-        .src_batch_size_aligned = src_batch_size_aligned,
-        .dst_row_size_aligned   = dst_row_size_aligned,
-        .batches_per_thread     = (total_batches + n_threads - 1) / n_threads,  // NOTE(review): divides by zero when total_batches == 0 (n_threads is then 0)
-        .total_batches          = total_batches,
-    };
-    // scratchpads do not fit in VTCM -> direct path; otherwise use the DMA path
-    if (octx->ctx->vtcm_size < spad_per_thread * n_threads) {
-        worker_pool_run_func(octx->ctx->worker_pool, diag_thread_f32, &dctx, n_threads);
-    } else {
-        worker_pool_run_func(octx->ctx->worker_pool, diag_thread_f32_dma, &dctx, n_threads);
-    }
-
-    return HTP_STATUS_OK;
-}
-
-int op_diag(struct htp_ops_context * octx) {  // DIAG dispatcher: selects the kernel from the destination tensor type
-    const struct htp_tensor * dst = octx->dst;
-
-    int err = HTP_STATUS_OK;
-
-    switch (dst->type) {
-        case HTP_TYPE_F32:
-            err = op_diag_f32(octx);
-            break;
-        default:
-            err = HTP_STATUS_NO_SUPPORT;  // only F32 destinations are handled
-            break;
-    }
-
-    return err;
-}
@@ -1,123 +0,0 @@
-#pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-function"
-#pragma clang diagnostic ignored "-Wunused-but-set-variable"
-
-#include <HAP_farf.h>
-#include <HAP_perf.h>
-
-#include <string.h>
-
-#include "hvx-copy.h"
-#include "hvx-utils.h"
-
-#define GGML_COMMON_DECL_C
-#include "ggml-common.h"
-#include "htp-ctx.h"
-#include "htp-ops.h"
-
-// ggml op_params layout for FILL:
-//   op_params[0] (as float) - the scalar fill value
-// Declares dst plus its shape (ne0..ne3), byte strides (nb1..nb3) and flat row count nr; expects `octx` in scope.
-#define fill_preamble \
-    const struct htp_tensor * dst = octx->dst; \
-    \
-    const uint32_t ne0 = dst->ne[0]; \
-    const uint32_t ne1 = dst->ne[1]; \
-    const uint32_t ne2 = dst->ne[2]; \
-    const uint32_t ne3 = dst->ne[3]; \
-    \
-    const uint32_t nb1 = dst->nb[1]; \
-    const uint32_t nb2 = dst->nb[2]; \
-    const uint32_t nb3 = dst->nb[3]; \
-    \
-    const uint32_t nr = ne1 * ne2 * ne3;
-
-struct htp_fill_context {  // state shared by every fill worker thread for one op invocation
-    struct htp_ops_context * octx;
-    uint32_t nrows_per_thread;  // ceil(total_rows / n_threads)
-    uint32_t total_rows;  // ne1 * ne2 * ne3
-    bool     opt_path;    // true: treat dst as one flat array; false: address each row through nb1/nb2/nb3
-    HVX_Vector splat_vec; // fill value already broadcast to a full vector (F32 or F16 lanes)
-    uint32_t   elem_size; // bytes per element: sizeof(float) or sizeof(_Float16)
-};
-
-static void fill_thread(unsigned int nth, unsigned int ith, void * data) {  // worker: fills this thread's share of dst rows with the splat value
-    const struct htp_fill_context * fctx = (const struct htp_fill_context *) data;
-    struct htp_ops_context        * octx = fctx->octx;
-    fill_preamble;
-
-    // Parallelise over the flat row index spanning ne1*ne2*ne3
-    const uint32_t ir0 = fctx->nrows_per_thread * ith;
-    const uint32_t ir1 = MIN(ir0 + fctx->nrows_per_thread, fctx->total_rows);
-    // NOTE(review): when ir0 > total_rows (trailing worker after round-up) ir1 < ir0, so the opt path's elem_end - elem_start wraps — confirm this cannot happen
-    uint64_t t1 = HAP_perf_get_qtimer_count();
-
-    if (fctx->opt_path) {
-        // Opt path: tensor is fully contiguous, treat as flat array
-        const uint32_t elem_start = ir0 * ne0;
-        const uint32_t elem_end = ir1 * ne0;
-        uint8_t * dst_ptr = (uint8_t *) dst->data + elem_start * fctx->elem_size;
-        hvx_splat_u(dst_ptr, fctx->splat_vec, elem_end - elem_start, fctx->elem_size);
-    } else {
-        // Non-contiguous path: must respect strides
-        for (uint32_t ir = ir0; ir < ir1; ++ir) {
-            const uint32_t i1 = ir % ne1;  // unflatten ir into (i1, i2, i3), with i1 fastest
-            const uint32_t i2 = (ir / ne1) % ne2;
-            const uint32_t i3 = ir / (ne1 * ne2);
-            uint8_t * dst_ptr = (uint8_t *) dst->data + i1*nb1 + i2*nb2 + i3*nb3;
-            hvx_splat_u(dst_ptr, fctx->splat_vec, ne0, fctx->elem_size);
-        }
-    }
-
-    uint64_t t2 = HAP_perf_get_qtimer_count();
-    FARF(HIGH, "fill %u/%u: rows %u:%u usec %u\n",
-         ith, nth, ir0, ir1, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
-}
-
-int op_fill(struct htp_ops_context * octx) {  // FILL entry point: writes the scalar from op_params[0] into every element of dst (F32 or F16)
-    fill_preamble;
-
-    if (dst->type != HTP_TYPE_F32 && dst->type != HTP_TYPE_F16) {
-        return HTP_STATUS_NO_SUPPORT;
-    }
-
-    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
-        return HTP_STATUS_OK;
-    }
-
-    // nr = ne1*ne2*ne3 (flat row count across all outer dims); parallelise over it.
-    const uint32_t n_threads = MIN(nr, octx->n_threads);
-
-    // Optimize if fully contiguous: skip stride arithmetic, treat as flat array
-    const bool opt_path = (nb2 == nb1 * ne1) && (nb3 == nb2 * ne2);  // NOTE(review): nb1 is never compared with ne0 * elem_size, so padded rows would still take the flat path — confirm callers only pass dense rows
-
-    FARF(HIGH, "fill: (%ux%ux%ux%u) type=%u opt=%d\n",
-         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->type, (int) opt_path);
-    // op_params[0] carries the fill value as float bits; memcpy reads it without a type-punning cast
-    float val_f32 = 0.f;
-    memcpy(&val_f32, &octx->op_params[0], sizeof(float));
-
-    struct htp_fill_context fctx = {
-        .octx             = octx,
-        .nrows_per_thread = (nr + n_threads - 1) / n_threads,  // NOTE(review): divides by zero when nr == 0 (n_threads is then 0)
-        .total_rows       = nr,
-        .opt_path         = opt_path,
-    };
-    // broadcast the value once here so every worker reuses the same vector
-    switch (dst->type) {
-    case HTP_TYPE_F32:
-        fctx.splat_vec = hvx_vec_splat_f32(val_f32);
-        fctx.elem_size = sizeof(float);
-        break;
-    case HTP_TYPE_F16:
-        fctx.splat_vec = hvx_vec_splat_f16((_Float16) val_f32);
-        fctx.elem_size = sizeof(_Float16);
-        break;
-    default:
-        return HTP_STATUS_NO_SUPPORT;  // not reachable: other types were rejected at the top
-    }
-
-    worker_pool_run_func(octx->ctx->worker_pool, fill_thread, &fctx, n_threads);
-
-    return HTP_STATUS_OK;
-}
@@ -4,7 +4,6 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <qurt_memory.h>
-#include <qurt.h>

 #include "hexagon_types.h"
 #include "hexagon_protos.h"
@@ -101,31 +100,4 @@ static inline void hex_pause() {
    asm volatile(" pause(#255)\n");
 }

-#ifndef HEX_NUM_PMU_COUNTERS
-#define HEX_NUM_PMU_COUNTERS 8  // overridable; note hex_get_pmu below always writes 8 entries regardless
-#endif
-// Reads PMU counters 0..7 into counters[0..7]; the caller must provide room for 8 values.
-static inline void hex_get_pmu(uint32_t counters[]) {
-#if __HVX_ARCH__ >= 79  // newer targets: read the upmucnt0..7 registers directly via inline asm
-    asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
-    asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
-    asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
-    asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
-    asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
-    asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
-    asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
-    asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
-#else  // older targets: one qurt_pmu_get call per counter
-    counters[0] = qurt_pmu_get(QURT_PMUCNT0);
-    counters[1] = qurt_pmu_get(QURT_PMUCNT1);
-    counters[2] = qurt_pmu_get(QURT_PMUCNT2);
-    counters[3] = qurt_pmu_get(QURT_PMUCNT3);
-    counters[4] = qurt_pmu_get(QURT_PMUCNT4);
-    counters[5] = qurt_pmu_get(QURT_PMUCNT5);
-    counters[6] = qurt_pmu_get(QURT_PMUCNT6);
-    counters[7] = qurt_pmu_get(QURT_PMUCNT7);
-    // qurt_pmu_get_pmucnt(counters);
-#endif
-}
-
 #endif /* HEX_UTILS_H */
@@ -10,7 +10,6 @@
 #include <dspqueue.h>
 #include <stdatomic.h>
 #include <stdint.h>
-#include <stdbool.h>

 #define HTP_MAX_NTHREADS 10
 #define HTP_MAX_MMAPS    16
@@ -67,9 +66,7 @@ struct htp_context {
    int                    thread_id;
    int                    thread_prio;

-    bool                   hmx_enabled;
-    bool                   etm;
-    uint32_t               profiler;
+    int                    hmx_enabled;

    uint8_t *              vtcm_base;
    size_t                 vtcm_size;
@@ -101,8 +98,5 @@ int op_repeat(struct htp_ops_context * octx);
 int op_argsort(struct htp_ops_context * octx);
 int op_ssm_conv(struct htp_ops_context * octx);
 int op_cumsum(struct htp_ops_context * octx);
-int op_fill(struct htp_ops_context * octx);
-int op_diag(struct htp_ops_context * octx);
-int op_solve_tri(struct htp_ops_context * octx);

 #endif /* HTP_CTX_H */
@@ -42,9 +42,9 @@ enum htp_data_type {

 // Mask to enable various stages of the Ops.
 // Used for debugging and profiling.
-enum htp_op_stage {
-    HTP_OPSTAGE_QUEUE    = (1 << 0),  // Enable Queueing (ie calls into NPU)
-    HTP_OPSTAGE_COMPUTE  = (1 << 1),  // Enable Compute
+enum htp_op_mask {  // bit flags, combined with bitwise OR
+    HTP_OPMASK_QUEUE    = (1 << 0),  // bit 0: enable queueing (i.e. calls into the DSP)
+    HTP_OPMASK_COMPUTE  = (1 << 1),  // bit 1: enable compute
 };

 // Do not reorder first 4 (used as an index)
@@ -80,9 +80,7 @@ enum htp_op_code {
    HTP_OP_SSM_CONV,
    HTP_OP_REPEAT,
    HTP_OP_CUMSUM,
-    HTP_OP_FILL,
-    HTP_OP_DIAG,
-    HTP_OP_SOLVE_TRI,
+
    HTP_OP_INVALID
 };

@@ -137,45 +135,27 @@ struct htp_op_desc {
    int32_t  params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
    uint16_t src[HTP_OP_MAX_INPUTS];    // Input tensors indices
    uint16_t dst;                       // Output tensor index
-};

-enum htp_profiler_mode {
-    HTP_PROF_DISABLED = 0,  // no per-op measurements are taken
-    HTP_PROF_BASIC    = 1,  // per-op elapsed usecs and cycle count
-    HTP_PROF_PMU      = 2,  // BASIC plus per-op deltas of the PMU counters
-};
-// Number of PMU counter slots carried in each profile descriptor
-#define HTP_PROF_PMU_NCNT 8
-
-// Profile descriptor
-struct htp_prof_desc {
-    uint32_t opcode;                 // GGML/HTP Op
-    uint32_t usecs;                  // Number of usec
-    uint32_t cycles;                 // Number of cycles
-    uint32_t pad;                    // Unused
-    uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters
+    // the rest is filled in-place by the NPU
+    uint32_t prof_usecs;                // Number of usec per request
+    uint32_t prof_cycles;               // Number of cycles per request
+    uint32_t prof_pkts;                 // Number of instruction packets per request
+    uint32_t unused;
 };

 struct htp_opbatch_req {  // batch request header; the arrays listed below are packed back-to-back, in this order, in dspqueue buffer 0
-    uint32_t id;          // Batch id
    uint32_t n_bufs;      // Number of buffers
    uint32_t n_tensors;   // Number of tensors
    uint32_t n_ops;       // Number of ops
    uint32_t flags;       // unused
-    uint32_t pad;         // unused
    // struct htp_buf_desc  bufs[];    -- dspqueue buf 0
    // struct htp_tensor    tensors[]; -- dspqueue buf 0
    // struct htp_op_desc   ops[];     -- dspqueue buf 0
 };

 struct htp_opbatch_rsp {
-    uint32_t id;         // Batch id
    uint32_t status;     // HTP_STATUS_...
-    uint32_t n_bufs;     // Number of buffers
-    uint32_t n_tensors;  // Number of tensors
-    uint32_t n_ops;      // Number of op profile descriptors
-    uint32_t pad;        // unused
-    // struct htp_prof_desc profs[];  -- dspqueue buf 0
+    // struct htp_op_desc ops[];    -- dspqueue buf 0 (same buffer as the request; prof_* fields filled in by the NPU)
 };

 #endif /* HTP_OPS_H */
@@ -6,17 +6,13 @@
 #include "AEEStdDef.idl"
 #include "remote.idl"

-struct htp_iface_pmu_conf {  // PMU event selection passed to htp_iface::profiler
-    uint32 events[8];  // one event id per PMU counter; the DSP side packs bits [7:0] and bits [9:8] of each id
-};
-
 interface htp_iface : remote_handle64 {
    AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx);
    AEEResult stop();
    AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned);  // NOTE(review): fd is uint32 here, but the C implementation in this change takes `int fd`
    AEEResult munmap(in uint32 fd);  // NOTE(review): same uint32 vs int mismatch for fd
-    AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
-    AEEResult etm(in uint32 enable);
+    AEEResult enable_etm();   // DSP side calls HAP_user_etm_enable and returns its status
+    AEEResult disable_etm();  // DSP side calls HAP_user_etm_disable and returns its status
 };

 #endif /* HTP_IDL */
@@ -256,18 +256,6 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
    return Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b));
 }

-static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {  // a + b; this variant (the __HVX_ARCH__ < 79 side, per the closing #endif comment) goes through a qf32 intermediate
-    return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b));
-}
-
-static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {  // a - b, converted back from qf32
-    return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b));
-}
-
-static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {  // a * b, converted back from qf32
-    return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b));
-}
-
 #else

 static inline HVX_Vector hvx_vec_add_f16_f16(HVX_Vector a, HVX_Vector b)
@@ -285,18 +273,6 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
    return Q6_Vhf_vmpy_VhfVhf(a, b);
 }

-static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {  // a + b; this variant (the #else side) uses the sf intrinsic directly, no qf32 round-trip
-    return Q6_Vsf_vadd_VsfVsf(a, b);
-}
-
-static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {  // a - b, direct sf intrinsic
-    return Q6_Vsf_vsub_VsfVsf(a, b);
-}
-
-static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {  // a * b, direct sf intrinsic
-    return Q6_Vsf_vmpy_VsfVsf(a, b);
-}
-
 #endif // __HVX_ARCH__ < 79

 #endif /* HVX_BASE_H */
@@ -27,7 +27,6 @@
 #include "htp-ctx.h"
 #include "htp-ops.h"
 #include "htp-ops.h"
-#include "htp_iface.h"
 #include "worker-pool.h"

 AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
@@ -104,54 +103,6 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
    return AEE_SUCCESS;
 }

-AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) {  // enable (non-zero) or disable (zero) ETM; returns the HAP call's status
-    int err = enable ? HAP_user_etm_enable() : HAP_user_etm_disable();  // `handle` is unused and no state is recorded in the context here
-    if (err) {
-        if (err == AEE_EVERSIONNOTSUPPORT) {
-            FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n");
-        } else {
-            FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", err);
-        }
-    }
-    return err;  // failures are logged above and passed through unchanged
-}
-
-AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) {  // store the profiling mode; for HTP_PROF_PMU also program the PMU
-    struct htp_context * ctx = (struct htp_context *) handle;
-    if (!ctx) {
-        return AEE_EBADPARM;
-    }
-    // PMU registers are only touched in HTP_PROF_PMU mode; every other mode just records `mode` below
-    if (mode == HTP_PROF_PMU) {
-        const uint32_t* events = pmu_conf->events;  // NOTE(review): pmu_conf is dereferenced without a NULL check
-
-        // Pack 4 event IDs (low 8 bits) into each 32-bit config register
-        uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0;
-        for (; i < HEX_NUM_PMU_COUNTERS/2; i++) {
-            evtcfg  |= ((events[i + 0] & 0xFF) << (i * 8));
-            evtcfg1 |= ((events[i + 4] & 0xFF) << (i * 8));  // NOTE(review): the +4 offset assumes exactly 8 counters, whatever HEX_NUM_PMU_COUNTERS is set to
-        }
-
-        // For events >255 pack high 2 bits of all 8 event IDs into cfg register
-        // 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc.
-        for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
-            cfg |= (((events[i] >> 8) & 3) << (i * 2));
-        }
-
-        FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg);
-
-        // Configure PMU registers
-        qurt_pmu_set(QURT_PMUCFG,     cfg);
-        qurt_pmu_set(QURT_PMUEVTCFG,  evtcfg);
-        qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1);
-        qurt_pmu_enable(1);
-    }
-    // read back per op by the packet callback to decide what to measure
-    ctx->profiler = mode;
-
-    return AEE_SUCCESS;
-}
-
 AEEResult htp_iface_close(remote_handle64 handle) {
    struct htp_context * ctx = (struct htp_context *) handle;

@@ -178,19 +129,35 @@ AEEResult htp_iface_close(remote_handle64 handle) {
        }
    }

-    if (ctx->profiler) {
-        qurt_pmu_enable(1);
-    }
-
-    if (ctx->etm) {
-        HAP_user_etm_disable();
-    }
-
    free(ctx);
    return AEE_SUCCESS;
 }

-AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32 pinned) {
+AEEResult htp_iface_enable_etm(remote_handle64 handle) {  // turn ETM on; `handle` is unused
+    int err = HAP_user_etm_enable();
+    if (err) {
+        if (err == AEE_EVERSIONNOTSUPPORT) {
+            FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
+        } else {
+            FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
+        }
+    }
+    return err;  // 0 on success; otherwise the HAP error code, already logged above
+}
+
+AEEResult htp_iface_disable_etm(remote_handle64 handle) {  // turn ETM off; `handle` is unused
+    int err = HAP_user_etm_disable();
+    if (err) {
+        if (err == AEE_EVERSIONNOTSUPPORT) {
+            FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
+        } else {
+            FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
+        }
+    }
+    return err;  // 0 on success; otherwise the HAP error code, already logged above
+}
+
+AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t pinned) {
    struct htp_context * ctx = (struct htp_context *) handle;
    if (!ctx) {
        return AEE_EBADPARM;
@@ -237,7 +204,7 @@ AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32
    return AEE_ENOMEMORY;
 }

-AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
+AEEResult htp_iface_munmap(remote_handle64 handle, int fd) {
    struct htp_context * ctx = (struct htp_context *) handle;
    if (!ctx) {
        return AEE_EBADPARM;
@@ -467,39 +434,19 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
 struct profile_data {  // per-op measurement: holds start snapshots after profile_start, deltas after profile_stop
    uint64_t usecs;   // raw qtimer count until profile_stop converts the difference to microseconds
    uint64_t cycles;  // hex_get_cycles() snapshot, then the cycle delta
-    uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
+    uint64_t pkts;    // hex_get_pktcnt() snapshot, then the delta
 };

-static inline void profile_start(uint32_t mode, struct profile_data * d) {
-    switch (mode) {
-        case HTP_PROF_PMU:
-            hex_get_pmu(d->pmu_counters);
-            // fallthrough
-        case HTP_PROF_BASIC:
-            d->usecs  = HAP_perf_get_qtimer_count();
-            d->cycles = hex_get_cycles();
-            break;
-        default:
-            break;
-    }
+static inline void profile_start(struct profile_data * d) {  // snapshot the counters before an op runs; now unconditional (no mode switch)
+    d->usecs  = HAP_perf_get_qtimer_count();  // raw timer ticks for now, despite the field name
+    d->cycles = hex_get_cycles();
+    d->pkts   = hex_get_pktcnt();
 }

-static inline void profile_stop(uint32_t mode, struct profile_data * d) {
-    uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
-    switch (mode) {
-        case HTP_PROF_PMU:
-            hex_get_pmu(pmu_counters);
-            for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
-                d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i];
-            }
-            // fallthrough
-        case HTP_PROF_BASIC:
-            d->usecs  = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
-            d->cycles = hex_get_cycles() - d->cycles;
-            break;
-        default:
-            break;
-    }
+static inline void profile_stop(struct profile_data * d) {  // turn the snapshots taken by profile_start into deltas, in place
+    d->usecs  = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);  // tick delta converted to microseconds
+    d->cycles = hex_get_cycles() - d->cycles;
+    d->pkts   = hex_get_pktcnt() - d->pkts;
 }

 static int execute_op(struct htp_ops_context * octx) {
@@ -567,15 +514,6 @@ static int execute_op(struct htp_ops_context * octx) {
        case HTP_OP_CUMSUM:
            return op_cumsum(octx);

-        case HTP_OP_FILL:
-            return op_fill(octx);
-
-        case HTP_OP_DIAG:
-            return op_diag(octx);
-
-        case HTP_OP_SOLVE_TRI:
-            return op_solve_tri(octx);
-
        case HTP_OP_INVALID:
            break;

@@ -782,32 +720,29 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
            continue;
        }

-        // Reset poll count for valid requests
-        poll_count = DSPQUEUE_POLL_COUNT;
-
        const uint32_t n_bufs = req.n_bufs;
        const uint32_t n_tens = req.n_tensors;
        const uint32_t n_ops  = req.n_ops;

-        const uint32_t b_size = sizeof(struct htp_buf_desc)  * n_bufs;
-        const uint32_t t_size = sizeof(struct htp_tensor)    * n_tens;
-        const uint32_t o_size = sizeof(struct htp_op_desc)   * n_ops;
-        const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;
+        const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
+        const uint32_t t_size = sizeof(struct htp_tensor)   * n_tens;
+        const uint32_t o_size = sizeof(struct htp_op_desc)  * n_ops;

-        if (dbuf.size < b_size + t_size + o_size + p_size) {
+        if (dbuf.size < b_size + t_size + o_size) {
            FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
            break;
        }

-        FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
-                n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
+        // Reset poll count for valid requests
+        poll_count = DSPQUEUE_POLL_COUNT;

-        // Setup descriptor pointers
        uint8_t * m_ptr = dbuf.ptr;
-        struct htp_buf_desc* bufs = (struct htp_buf_desc*)  m_ptr; m_ptr += b_size;
-        struct htp_tensor*   tens = (struct htp_tensor*)    m_ptr; m_ptr += t_size;
-        struct htp_op_desc*   ops = (struct htp_op_desc*)   m_ptr; m_ptr += o_size;
-        struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr;
+        struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
+        struct htp_tensor*   tens = (struct htp_tensor*)   m_ptr; m_ptr += t_size;
+        struct htp_op_desc*   ops = (struct htp_op_desc*)  m_ptr;
+
+        FARF(HIGH, "processing opbatch: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u",
+                n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);

        prep_op_bufs(ctx, bufs, n_bufs);
        prep_tensors(ctx, bufs, tens, n_tens);
@@ -819,34 +754,22 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {

        for (uint32_t i=0; i < n_ops; i++) {
            struct profile_data prof;
-
-            profile_start(ctx->profiler, &prof);
+            profile_start(&prof);

            proc_op_req(octx, tens, i, &ops[i]);

-            profile_stop(ctx->profiler, &prof);
-
-            if (ctx->profiler) {
-                pds[i].opcode = ops[i].opcode;
-                pds[i].usecs  = prof.usecs;
-                pds[i].cycles = prof.cycles;
-                for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
-                    pds[i].pmu[j] = prof.pmu_counters[j];
-                }
-            }
+            profile_stop(&prof);
+            ops[i].prof_usecs  = prof.usecs;
+            ops[i].prof_cycles = prof.cycles;
+            ops[i].prof_pkts   = prof.pkts;
        }

        // dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);

        struct htp_opbatch_rsp rsp;
-        rsp.id        = req.id;
-        rsp.status    = HTP_STATUS_OK;
-        rsp.n_bufs    = n_bufs;
-        rsp.n_tensors = n_tens;
-        rsp.n_ops     = n_ops;
+        rsp.status = HTP_STATUS_OK; // FIXME

        dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
-
        err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE);
        if (err != 0) {
            FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
@@ -3017,10 +3017,6 @@ int op_matmul(struct htp_ops_context * octx) {
    const int act_stride = (int)(src1->nb[1] / sizeof(float));
    const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16));

-    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
-        return HTP_STATUS_OK;
-    }
-
    if (src0->type == HTP_TYPE_F16) {
        if (is_batched) {
            hmx_matmul_w16a32_batched_params_t batch_params = {
@@ -1,267 +0,0 @@
-#pragma clang diagnostic ignored "-Wunused-but-set-variable"
-
-#include <HAP_farf.h>
-#include <HAP_perf.h>
-#include <string.h>
-
-#define GGML_COMMON_DECL_C
-#include "ggml-common.h"
-#include "htp-ctx.h"
-#include "htp-ops.h"
-#include "hvx-types.h"
-#include "hvx-utils.h"
-
-struct htp_solve_tri_context {
-    struct htp_ops_context * octx;
-    uint32_t                 jobs_per_thread;
-    uint32_t                 total_jobs;
-    uint32_t                 k_chunks;
-    uint32_t                 col_block;
-};
-
-static inline void solve_tri_row_scalar(const float * A_row,
-                                        const float * B_row,
-                                        float *       X,
-                                        uint32_t      row,
-                                        uint32_t      k,
-                                        uint32_t      col0,
-                                        uint32_t      coln,
-                                        float         inv_diag) {
-    for (uint32_t col = col0; col < col0 + coln; ++col) {
-        float sum = 0.0f;
-        for (uint32_t t = 0; t < row; ++t) {
-            sum += A_row[t] * X[t * k + col];
-        }
-        X[row * k + col] = (B_row[col] - sum) * inv_diag;
-    }
-}
-
-static inline HVX_Vector hvx_load_partial_f32(const float * src, uint32_t n) {
-    HVX_Vector v = *((const HVX_UVector *) src);
-    HVX_VectorPred mask = Q6_Q_vsetq2_R(n * sizeof(float));
-    return Q6_V_vmux_QVV(mask, v, Q6_V_vzero());
-}
-
-static inline void solve_tri_row_hvx(const float * A_row,
-                                     const float * B_row,
-                                     float *       X,
-                                     uint32_t      row,
-                                     uint32_t      k,
-                                     uint32_t      col0,
-                                     uint32_t      coln,
-                                     float         inv_diag) {
-    const bool full = (coln == VLEN_FP32);
-
-    HVX_Vector sum_v = Q6_V_vzero();
-    for (uint32_t t = 0; t < row; ++t) {
-        const float   a         = A_row[t];
-        const float * x_row_col = X + t * k + col0;
-
-        HVX_Vector x_v = full ? *((const HVX_UVector *) x_row_col) : hvx_load_partial_f32(x_row_col, coln);
-        HVX_Vector a_v = hvx_vec_splat_f32(a);
-        sum_v          = hvx_vec_add_f32_f32(sum_v, hvx_vec_mul_f32_f32(x_v, a_v));
-    }
-
-    const float * b_row_col = B_row + col0;
-    float *       x_out_col = X + row * k + col0;
-
-    HVX_Vector b_v        = full ? *((const HVX_UVector *) b_row_col) : hvx_load_partial_f32(b_row_col, coln);
-    HVX_Vector inv_diag_v = hvx_vec_splat_f32(inv_diag);
-
-    HVX_Vector out_v = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(b_v, sum_v), inv_diag_v);
-    hvx_vec_store_u((void *) x_out_col, coln * sizeof(float), out_v);
-}
-
-// Batch-level thread: each job is one full batch.
-static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void * data) {
-    struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
-    struct htp_ops_context *       octx = sctx->octx;
-
-    const struct htp_tensor * src0 = octx->src[0];  // A
-    const struct htp_tensor * src1 = octx->src[1];  // B
-    const struct htp_tensor * dst  = octx->dst;     // X
-
-    const uint32_t n = src0->ne[0];
-    const uint32_t k = src1->ne[0];
-
-    const uint32_t ne02 = src0->ne[2];
-
-    const uint32_t col_block = VLEN_FP32;
-    const uint32_t k_full    = (k / col_block) * col_block;
-
-    const uint32_t start_batch = sctx->jobs_per_thread * ith;
-    const uint32_t end_batch   = MIN(start_batch + sctx->jobs_per_thread, sctx->total_jobs);
-
-    uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
-
-    for (uint32_t batch = start_batch; batch < end_batch; ++batch) {
-        const uint32_t i03 = batch / ne02;
-        const uint32_t i02 = batch - i03 * ne02;
-
-        const float * A_batch =
-            (const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]);
-        const float * B_batch =
-            (const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]);
-        float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]);
-
-        for (uint32_t row = 0; row < n; ++row) {
-            const float   diag     = A_batch[row * n + row];
-            const float   inv_diag = 1.0f / diag;
-            const float * A_row    = A_batch + row * n;
-            const float * B_row    = B_batch + row * k;
-
-            uint32_t col0 = 0;
-            for (; col0 < k_full; col0 += col_block) {
-                solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, col_block, inv_diag);
-            }
-
-            if (col0 < k) {
-                const uint32_t coln = k - col0;
-                if (coln >= 8) {
-                    solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
-                } else {
-                    solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
-                }
-            }
-        }
-    }
-
-    t2 = HAP_perf_get_qtimer_count();
-
-    FARF(HIGH, "solve-tri-batch %d/%d: A=(%ux%u) B=(%ux%u) batch %u:%u usec %u\n",
-         ith, nth, n, n, k, n, start_batch, end_batch,
-         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
-}
-
-// Chunk-level thread: each job is one (batch, col_chunk) pair.
-static void solve_tri_chunk_thread_f32(unsigned int nth, unsigned int ith, void * data) {
-    struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
-    struct htp_ops_context *       octx = sctx->octx;
-
-    const struct htp_tensor * src0 = octx->src[0];  // A
-    const struct htp_tensor * src1 = octx->src[1];  // B
-    const struct htp_tensor * dst  = octx->dst;     // X
-
-    const uint32_t n = src0->ne[0];
-    const uint32_t k = src1->ne[0];
-
-    const uint32_t ne02 = src0->ne[2];
-
-    const uint32_t start_job = sctx->jobs_per_thread * ith;
-    const uint32_t end_job   = MIN(start_job + sctx->jobs_per_thread, sctx->total_jobs);
-
-    uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
-
-    for (uint32_t job = start_job; job < end_job; ++job) {
-        const uint32_t batch = job / sctx->k_chunks;
-        const uint32_t chunk = job - batch * sctx->k_chunks;
-
-        const uint32_t i03 = batch / ne02;
-        const uint32_t i02 = batch - i03 * ne02;
-
-        const uint32_t col0 = chunk * sctx->col_block;
-        const uint32_t coln = MIN(sctx->col_block, k - col0);
-
-        const float * A_batch =
-            (const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]);
-        const float * B_batch =
-            (const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]);
-        float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]);
-
-        const bool use_hvx = (coln >= 8);
-
-        for (uint32_t row = 0; row < n; ++row) {
-            const float diag     = A_batch[row * n + row];
-            const float inv_diag = 1.0f / diag;
-
-            const float * A_row = A_batch + row * n;
-            const float * B_row = B_batch + row * k;
-
-            if (use_hvx) {
-                solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
-            } else {
-                solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
-            }
-        }
-    }
-
-    t2 = HAP_perf_get_qtimer_count();
-
-    FARF(HIGH, "solve-tri-chunk %d/%d: A=(%ux%u) B=(%ux%u) job %u:%u usec %u\n",
-         ith, nth, n, n, k, n, start_job, end_job,
-         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
-}
-
-int op_solve_tri(struct htp_ops_context * octx) {
-    const struct htp_tensor * src0 = octx->src[0];  // A
-    const struct htp_tensor * src1 = octx->src[1];  // B
-    const struct htp_tensor * dst  = octx->dst;     // X
-
-    if (src0->type != HTP_TYPE_F32 || src1->type != HTP_TYPE_F32 || dst->type != HTP_TYPE_F32) {
-        return HTP_STATUS_NO_SUPPORT;
-    }
-
-    // left=true, lower=true, uni=false only
-    if (src0->ne[0] != src0->ne[1]) {
-        return HTP_STATUS_INVAL_PARAMS;
-    }
-    if (src0->ne[1] != src1->ne[1]) {
-        return HTP_STATUS_INVAL_PARAMS;
-    }
-    if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
-        return HTP_STATUS_INVAL_PARAMS;
-    }
-    if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] ||
-        dst->ne[3] != src1->ne[3]) {
-        return HTP_STATUS_INVAL_PARAMS;
-    }
-
-    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
-        return HTP_STATUS_OK;
-    }
-
-    const uint32_t k = src1->ne[0];
-
-    const uint32_t col_block     = VLEN_FP32;
-    const uint32_t k_chunks      = (k + col_block - 1) / col_block;
-    const uint32_t total_batches = src0->ne[2] * src0->ne[3];
-    const bool     batched       = total_batches >= (uint32_t) octx->n_threads;
-
-    FARF(HIGH, "solve-tri: (%ux%ux%ux%u) x (%ux%ux%ux%u) -> (%ux%ux%ux%u) : batched %d\n",
-         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-         src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
-         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], batched);
-
-    if (batched) {
-        // Batch-level parallelism
-        const uint32_t n_threads = MIN((uint32_t) octx->n_threads, total_batches);
-
-        struct htp_solve_tri_context sctx = {
-            .octx            = octx,
-            .jobs_per_thread = (total_batches + n_threads - 1) / n_threads,
-            .total_jobs      = total_batches,
-            .k_chunks        = k_chunks,
-            .col_block       = col_block,
-        };
-
-        worker_pool_run_func(octx->ctx->worker_pool, solve_tri_batch_thread_f32, &sctx, n_threads);
-    } else {
-        // Chunk-level parallelism
-        const uint32_t total_jobs = total_batches * k_chunks;
-        const uint32_t n_threads  = MIN((uint32_t) octx->n_threads, MAX(total_jobs, 1));
-
-        struct htp_solve_tri_context sctx = {
-            .octx            = octx,
-            .jobs_per_thread = (total_jobs + n_threads - 1) / n_threads,
-            .total_jobs      = total_jobs,
-            .k_chunks        = k_chunks,
-            .col_block       = col_block,
-        };
-
-        worker_pool_run_func(octx->ctx->worker_pool, solve_tri_chunk_thread_f32, &sctx, n_threads);
-    }
-
-    return HTP_STATUS_OK;
-}
@@ -8,7 +8,7 @@ CatalogFile = libggml-htp.cat
 PnpLockDown = 1

 [DestinationDirs]
-Drivers_Dir = 13
+Drivers_Dir = 6

 [SourceDisksNames]
 1 = %DiskId%
@@ -18,7 +18,6 @@ libggml-htp-v68.so = 1
 libggml-htp-v69.so = 1
 libggml-htp-v73.so = 1
 libggml-htp-v75.so = 1
-libggml-htp-v79.so = 1
 libggml-htp-v81.so = 1

 [ControlFlags]
@@ -32,7 +31,6 @@ libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
-libggml-htp-v79.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v81.so,,,0x10 ;COPYFLG_NO_OVERWRITE

 [Strings]
@@ -814,7 +814,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
            }

            // print MTL GPU family:
-            GGML_LOG_INFO("%s: GPU name:   %s (%s)\n", __func__, dev->props.name, dev->props.desc);
+            GGML_LOG_INFO("%s: GPU name:   %s\n", __func__, dev->props.name);

            // determine max supported GPU family
            // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
@@ -931,13 +931,13 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
 }

 struct ggml_metal_event {
-    void * obj; // id<MTLSharedEvent>
+    void * obj; // id<MTLEvent>

    atomic_int value;
 };

 void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
-    id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
+    id<MTLEvent> event = (id<MTLEvent>)ev->obj;

    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;

@@ -945,7 +945,7 @@ void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t
 }

 void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
-    id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
+    id<MTLEvent> event = (id<MTLEvent>)ev->obj;

    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;

@@ -953,7 +953,7 @@ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cm
 }

 ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
-    id<MTLSharedEvent> event = [dev->mtl_device newSharedEvent];
+    id<MTLEvent> event = [dev->mtl_device newEvent];

    ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));

@@ -964,7 +964,7 @@ ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
 }

 void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
-    id<MTLSharedEvent> event = ev->obj;
+    id<MTLEvent> event = ev->obj;
    [event release];

    free(ev);
@@ -973,13 +973,14 @@ void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev
 }

 void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
-    id<MTLSharedEvent> event = ev->obj;
-    const bool res = [event waitUntilSignaledValue:atomic_load_explicit(&ev->value, memory_order_relaxed) timeoutMS:60000];
-    if (!res) {
-        GGML_ABORT("%s: failed to wait for event\n", __func__);
-    }
+    @autoreleasepool {
+        id<MTLEvent> event = ev->obj;

-    GGML_UNUSED(dev);
+        id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
+        [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
 }

 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
@@ -918,10 +918,6 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
        static std::vector<ggml_backend_device_ptr> devs;

        if (!initialized) {
-            // workaround macOS limitation (kIOGPUCommandBufferCallbackErrorImpactingInteractivity) until proper fix becomes possible
-            // ref: https://github.com/ggml-org/llama.cpp/issues/20141#issuecomment-4272947703
-            setenv("AGX_RELAX_CDM_CTXSTORE_TIMEOUT", "1", true);
-
            static ggml_backend_metal_reg_ptr reg_ctx(ggml_backend_metal_reg_init());

            for (int i = 0; i < g_devices; ++i) {
@@ -19,6 +19,7 @@
 #include <iomanip>
 #include <map>
 #include <memory>
+#include <mutex>
 #include <openvino/core/dimension.hpp>
 #include <openvino/core/except.hpp>
 #include <openvino/core/node.hpp>
@@ -206,22 +207,8 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
        break;
    }
    case GGML_OP_ROPE: {
-        const int mode = node->op_params[2];
-        switch (mode) {
-       case GGML_ROPE_TYPE_NEOX: {
-            op_case = 0x00010000;
-            break;
-        }
-       case GGML_ROPE_TYPE_IMROPE: {
-            op_case = 0x00020000;
-            break;
-        }
-        default:
-            op_case = 0x00000000;
-            break;
-        }
        if (node->src[0]->op == GGML_OP_VIEW) {
-            op_case = (op_case | 0x00000002);
+            op_case = 2;
        }
        break;
    }
@@ -586,6 +573,9 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
 }

 std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
+    static std::mutex weights_mutex;
+    std::lock_guard<std::mutex> lock(weights_mutex);
+
    std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
    auto * nodes = cgraph->nodes;
    auto n_nodes = cgraph->n_nodes;
@@ -6,7 +6,6 @@
 #include <cstring>
 #include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
 #include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
-#include <openvino/runtime/properties.hpp>
 #include <optional>

 ov::Core & ov_singleton_core() {
@@ -43,13 +42,11 @@ void ggml_openvino_device_config::init() {
            {"NPUW_DQ",                           "YES"   },
            {"NPUW_DQ_FULL",                      "NO"    },
        };
-        if (cache_dir && strlen(cache_dir) > 0) {
+        if (cache_dir) {
            compile_config["NPUW_CACHE_DIR"] = cache_dir;
-            compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
        }
-    } else if (cache_dir && strlen(cache_dir) > 0) {
-        compile_config.insert(ov::cache_dir(cache_dir));
-        compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
+    } else if (cache_dir) {
+        ov_singleton_core().set_property(ov::cache_dir(cache_dir));
    }

    // Initialize remote context with queue sharing for GPU
@@ -262,12 +259,10 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
            layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
            int64_t n_blocks = n_elements / layout.weights_per_block;
            layout.scales_size = n_blocks * sizeof(uint16_t);
-            // For symmetric quantization, no zp needed (weights stored as signed)
-            if (layout.is_symmetric) {
-                layout.zp_size = 0;
-            } else {
-                layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
-            }
+            // For symmetric quantization, we only need one zp value (not one per block)
+            // Zero points are stored in U4 or U8 format matching the weight type
+            size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
+            layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;

            layout.weights_offset = 0;
            layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
@@ -318,12 +313,10 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
    // Scales: F16 per block
    int64_t n_blocks = n_elements / layout.weights_per_block;
    layout.scales_size = n_blocks * sizeof(uint16_t);  // F16 = 2 bytes
-    // For symmetric quantization, no zp needed (weights stored as signed)
-    if (layout.is_symmetric) {
-        layout.zp_size = 0;
-    } else {
-        layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
-    }
+    // Zero points: U4 or U8 matching weight type
+    // For symmetric quantization, we only need one zp value (not one per block)
+    size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
+    layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;

    // Layout in buffer: [weights | scales | zp] with alignment
    layout.weights_offset = 0;
@@ -145,18 +145,13 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
    return ctx->data;
 }

-static bool is_stateful_enabled() {
-    static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION");
-    return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0;
-}
-
 static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;

    // Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
    if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
-        !is_stateful_enabled()) {
+        !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
        GGML_ASSERT(ctx->tensor_extras.empty());
        auto device = ctx->device;
        auto size = ctx->size;
@@ -605,14 +600,6 @@ bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) {

 static void ggml_backend_openvino_free(ggml_backend_t backend) {
    ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
-
-    if (ctx->runtime_context) {
-        auto r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
-        if (--r_ctx->backend_count == 0) {
-            r_ctx->clear_caches();
-        }
-    }
-
    delete ctx;
    delete backend;
 }
@@ -657,12 +644,7 @@ static ggml_guid_t ggml_backend_openvino_guid(void) {
 }

 static std::shared_ptr<ov_runtime_context> get_ov_runtime_context_ptr() {
-    static std::shared_ptr<ov_runtime_context> r_ctx = [] {
-        auto ctx = std::make_shared<ov_runtime_context>();
-        ctx->device = ggml_openvino_get_device_name();
-        ctx->stateful = is_stateful_enabled() && !ggml_openvino_is_npu();
-        return ctx;
-    }();
+    static std::shared_ptr<ov_runtime_context> r_ctx = std::make_shared<ov_runtime_context>();
    return r_ctx;
 }

@@ -687,7 +669,8 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
    }

    std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
-    r_ctx->backend_count++;
+    r_ctx->device = ggml_openvino_get_device_name();
+    r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu();

    ggml_backend_t openvino_backend = new ggml_backend{
        /* .guid      = */ ggml_backend_openvino_guid(),
@@ -900,7 +883,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
        const int32_t * op_params = op->op_params;
        const int n_dims = op_params[1];
        const int mode = op_params[2];
-        if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_IMROPE) {
+        if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
            // GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode);
            return true;
        }
@@ -913,6 +896,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
            // GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
            return true;
        }
+        float freq_scale;
+        float ext_factor;
+        memcpy(&freq_scale, op_params + 6, sizeof(float));
+        memcpy(&ext_factor, op_params + 7, sizeof(float));
+        if (ext_factor != 0.0f) {
+            // GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
+            return true;
+        }
        if (op->src[0]->op == GGML_OP_VIEW) {
            if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
                // GGML_LOG_WARN(
@@ -922,12 +913,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
                return true;
            }
        }
-        if (mode == GGML_ROPE_TYPE_IMROPE &&
-            (op->src[2] != 0 || ((const float *) op_params)[6] != 1 || ((const float *) op_params)[7] != 0 ||
-             ((const float *) op_params)[8] != 1)) {
-            // GGML_LOG_WARN("OpenVINO backend does not support IMROPE with freq_factors, freq_scale, ext_factor, and attn_factor\n");
-            return true;
-        }
        break;
    }
    default:
@@ -957,7 +942,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
                                                 // GGML_OP_SOFT_MAX,
                                                 GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
    static const std::set<ggml_unary_op> supported_unary_ops{
-        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_SILU,
    };
    static const std::set<ggml_glu_op> supported_glu_ops{
@@ -46,7 +46,6 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) {

 // Extracts (weight, scales, zp) from Q4_0 tensors.
 // Data layout is: |16 bit scale|32 x 4bit weights|.
-// When zp_arr is empty (symmetric), weights are stored as signed i4 (value - 8).
 void extract_q4_0_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
@@ -56,32 +55,28 @@ void extract_q4_0_data(const ggml_tensor * tensor,
    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto * zp = static_cast<uint8_t *>(zp_arr.data());

-    bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4);  // Signed i4 path
+    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization

-    if (!is_symmetric) {
-        auto * zp = static_cast<uint8_t *>(zp_arr.data());
-        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
-            scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
+    // For Q4_0, zero point is always 8
+    if (is_scalar_zp) {
+        zp[0] = 8 | (8 << 4);  // Pack two 4-bit values
+    }
+
+    ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+        scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
+        // For asymmetric quantization, compute per-block zero points
+        if (!is_scalar_zp) {
            // Pack two 4-bit zero points per byte
            if (i % 2 == 0) {
                zp[i / 2] = 8;          // Lower nibble
            } else {
                zp[i / 2] |= (8 << 4);  // Upper nibble
            }
-            unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
-        });
-    } else {
-        // Symmetric: unpack as u4 then convert to i4 by subtracting 8 (XOR each nibble)
-        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
-            scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
-            unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
-            // Convert u4 to i4: subtract 8 from each nibble. XOR 0x88 flips each nibble by 8.
-            for (int j = 0; j < 16; ++j) {
-                weights[i * 16 + j] ^= 0x88;
-            }
-        });
-    }
+        }
+        unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
+    });
 }

 // Extracts (weight, scales, zp) from Q4_1 tensors.
@@ -128,7 +123,6 @@ void extract_q4_1_data(const ggml_tensor * tensor,

 // Extracts (weight, scales, zp) from Q8_0 tensors.
 // Data layout is: |16 bit scale|32 x 8bit weights|.
-// When zp_arr is empty (symmetric), weights are stored as signed i8 directly.
 void extract_q8_0_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
@@ -139,30 +133,29 @@ void extract_q8_0_data(const ggml_tensor * tensor,
    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto * zp = static_cast<uint8_t *>(zp_arr.data());

-    bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8);  // Signed i8 path
+    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization

-    if (!is_symmetric) {
-        auto * zp = static_cast<uint8_t *>(zp_arr.data());
-        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
-            uint8_t * block_data = data + i * bytes_per_block;
-            scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
-            zp[i] = 128;
-            for (size_t j = 0; j < weights_per_block; ++j) {
-                uint8_t x = block_data[j + 2];
-                x ^= 1 << 7;  // Convert int8 to uint8 by flipping sign bit
-                weights[i * weights_per_block + j] = x;
-            }
-        });
-    } else {
-        // Symmetric: store original int8 values directly (no unsigned bias)
-        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
-            uint8_t * block_data = data + i * bytes_per_block;
-            scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
-            // Copy int8 weights as-is (the tensor element type is i8)
-            memcpy(weights + i * weights_per_block, block_data + 2, weights_per_block);
-        });
+    // For Q8_0, zero point is always 128
+    if (is_scalar_zp) {
+        zp[0] = 128;
    }
+
+    ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+        uint8_t * block_data = data + i * bytes_per_block;
+        scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
+        // For asymmetric quantization, store per-block zero points
+        if (!is_scalar_zp) {
+            zp[i] = 128;
+        }
+        for (size_t j = 0; j < weights_per_block; ++j) {
+            uint8_t x = block_data[j + 2];  // j+2 to skip the scale bytes.
+            // Original data is int8_t; flipping the sign bit adds a bias of 128, giving the uint8 value (zero point 128).
+            x ^= 1 << 7;
+            weights[i * weights_per_block + j] = x;
+        }
+    });
 }

 void unpack_256_4(const uint8_t * data, uint8_t * dst) {
@@ -263,62 +256,44 @@ void extract_q6_k_data(const ggml_tensor * tensor,
    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto * zp = static_cast<uint8_t *>(zp_arr.data());

-    bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8);  // Signed i8 path
+    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization

-    if (!is_symmetric) {
-        auto * zp = static_cast<uint8_t *>(zp_arr.data());
-        ov::parallel_for(n_super_block, [&](size_t i) {
-            uint8_t * block_data = data + i * bytes_per_block;
-            float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
-            for (size_t j = 0; j < 16; j++) {
-                scales[j + i * 16] =
-                    ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
+    // For Q6_K, zero point is always 32
+    if (is_scalar_zp) {
+        zp[0] = 32;
+    }
+
+    ov::parallel_for(n_super_block, [&](size_t i) {
+        uint8_t * block_data = data + i * bytes_per_block;
+
+        float scale_factor =
+            static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));  // (128+64+16)/2
+
+        for (size_t j = 0; j < 16; j++) {
+            scales[j + i * 16] =
+                ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
+            // For asymmetric quantization, store per-block zero points
+            if (!is_scalar_zp) {
                zp[j + i * 16] = 32;
            }
-            uint8_t * ql = block_data;
-            uint8_t * qh = block_data + 128;
-            for (int64_t j = 0; j < 32; ++j) {
-                weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
-                weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
-                weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
-                weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
-                weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
-                weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
-                weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
-                weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
-            }
-        });
-    } else {
-        // Symmetric: subtract 32 from each weight to store as signed i8
-        ov::parallel_for(n_super_block, [&](size_t i) {
-            uint8_t * block_data = data + i * bytes_per_block;
-            float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
-            for (size_t j = 0; j < 16; j++) {
-                scales[j + i * 16] =
-                    ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
-            }
-            uint8_t * ql = block_data;
-            uint8_t * qh = block_data + 128;
-            auto * signed_weights = reinterpret_cast<int8_t *>(weights);
-            for (int64_t j = 0; j < 32; ++j) {
-                signed_weights[i * 256 + j] = static_cast<int8_t>((ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4)) - 32;
-                signed_weights[i * 256 + j + 32] =
-                    static_cast<int8_t>((ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4)) - 32;
-                signed_weights[i * 256 + j + 64] = static_cast<int8_t>((ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4)) - 32;
-                signed_weights[i * 256 + j + 96] =
-                    static_cast<int8_t>((ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4)) - 32;
-                signed_weights[i * 256 + j + 128] =
-                    static_cast<int8_t>((ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4)) - 32;
-                signed_weights[i * 256 + j + 160] =
-                    static_cast<int8_t>((ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4)) - 32;
-                signed_weights[i * 256 + j + 192] =
-                    static_cast<int8_t>((ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4)) - 32;
-                signed_weights[i * 256 + j + 224] =
-                    static_cast<int8_t>((ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4)) - 32;
-            }
-        });
-    }
+        }
+
+        uint8_t * ql = block_data;
+        uint8_t * qh = block_data + 128;
+
+        for (int64_t j = 0; j < 32; ++j) {
+            weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
+            weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
+            weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
+            weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
+            weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
+            weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
+            weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
+            weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
+        }
+    });
 }

 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
@@ -414,10 +389,11 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
                                       size_t group_size,
                                       bool use_bias) {
    ov::Shape orig_shape = weight.get_shape();
-    bool is_signed = (weight.get_element_type() == ov::element::i8);  // Symmetric: signed weights, no ZP

    // Expand dimensions for scales and zp/bias
    auto scale_shape = scales.get_shape();
+    auto zp_shape = zp.get_shape();
+    bool is_scalar_zp = zp_shape.empty();  // Symmetric quantization

    ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};

@@ -427,48 +403,37 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
    } else {
        scale_shape.push_back(1);
        scales.set_shape(scale_shape);
-        if (!is_signed && zp.get_size() > 0) {
-            auto zp_shape = zp.get_shape();
+        // For symmetric quantization, zp remains scalar (don't resize)
+        if (!is_scalar_zp) {
            zp_shape.push_back(1);
            zp.set_shape(zp_shape);
        }
    }

+    // Create graph nodes
+    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
+                                                               static_cast<uint8_t *>(weight.data()), nullptr);
+    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
    auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
+    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);

    ov::Output<ov::Node> result;
-    if (is_signed) {
-        // Signed path: q * s (no zero point subtraction needed)
-        auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i8, packed_shape,
-                                                                   static_cast<uint8_t *>(weight.data()), nullptr);
-        weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
-        auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
-        result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+    if (use_bias && !is_scalar_zp) {
+        // Bias path: w * s + b (zp tensor holds f16 bias values)
+        auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
+        auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+        result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
    } else {
-        // Unsigned path
-        auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
-                                                                   static_cast<uint8_t *>(weight.data()), nullptr);
-        weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
-        auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
-
-        if (use_bias && zp.get_size() > 0) {
-            // Bias path: w * s + b (zp tensor holds f16 bias values)
-            auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
-            auto w_s =
-                std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
-            result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
-        } else {
-            // Zero point path: (w - zp) * s
-            auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
-            float zp_value;
-            if (ov::op::util::get_single_value(zero_point, zp_value)) {
-                zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
-            }
-            auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
-            auto w_zp =
-                std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
-            result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+        // Zero point path: (w - zp) * s
+        auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
+        float zp_value;
+        if (ov::op::util::get_single_value(zero_point, zp_value)) {
+            zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
        }
+        auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
+        auto w_zp =
+            std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
+        result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
    }

    if (packed_shape.size() != 2) {
@@ -487,10 +452,11 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
                                       size_t group_size,
                                       bool use_bias) {
    ov::Shape orig_weight_shape = weight.get_shape();
-    bool is_signed = (weight.get_element_type() == ov::element::i4);  // Symmetric: signed weights, no ZP

    // Expand dimensions for scales and zp/bias
    ov::Shape scale_shape = scales.get_shape();
+    auto zp_shape = zp.get_shape();
+    bool is_scalar_zp = zp_shape.empty();  // Symmetric quantization

    // Create INT4 weight tensor
    ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
@@ -501,48 +467,36 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
    } else {
        scale_shape.push_back(1);
        scales.set_shape(scale_shape);
-        if (!is_signed && zp.get_size() > 0) {
-            auto zp_shape = zp.get_shape();
+        // For symmetric quantization, zp remains scalar (don't resize)
+        if (!is_scalar_zp) {
            zp_shape.push_back(1);
            zp.set_shape(zp_shape);
        }
    }

+    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
+                                                               static_cast<uint8_t *>(weight.data()), nullptr);
+    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
    auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);

    ov::Output<ov::Node> result;
-    if (is_signed) {
-        // Signed path: q * s (no zero point subtraction needed)
-        auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i4, packed_shape,
-                                                                   static_cast<uint8_t *>(weight.data()), nullptr);
-        weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
-        auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
-        result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+    if (use_bias && !is_scalar_zp) {
+        // Bias path: w * s + b (zp tensor holds f16 bias values)
+        auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
+        auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+        result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
    } else {
-        // Unsigned path
-        auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
-                                                                   static_cast<uint8_t *>(weight.data()), nullptr);
-        weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
-        auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
-
-        if (use_bias && zp.get_size() > 0) {
-            // Bias path: w * s + b (zp tensor holds f16 bias values)
-            auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
-            auto w_s =
-                std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
-            result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
-        } else {
-            // Zero point path: (w - zp) * s
-            auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
-            float zp_value;
-            if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
-                zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
-            }
-            auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
-            auto w_zp =
-                std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
-            result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+        // Zero point path: (w - zp) * s
+        auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
+        float zp_value;
+        if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
+            zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
        }
+        auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
+        auto w_zp =
+            std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
+        result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
    }

    if (packed_shape.size() != 2) {
@@ -745,32 +699,24 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo

    // Quantized path (normal extraction or quantized requant)
    // Create weight/scale/zp tensors - shared between both paths
-    // For symmetric quantization, use signed types (i4/i8) and no ZP tensor
-    ov::element::Type weight_type = layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) :
-                                                          (layout.is_u4 ? ov::element::u4 : ov::element::u8);
+    ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
    ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
+    ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;

    if (output_base_ptr) {
        uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
        result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
        result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-        if (!layout.is_symmetric) {
-            ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-            result.zp = ov::Tensor(zp_type, scale_shape, buf_base + layout.zp_offset);
-        }
-        // else: result.zp remains default-constructed (empty) for symmetric
+        result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
    } else {
        result.weights = ov::Tensor(weight_type, node_shape);
        result.scales = ov::Tensor(ov::element::f16, scale_shape);
-        if (!layout.is_symmetric) {
-            if (use_bias) {
-                result.zp = ov::Tensor(ov::element::f16, scale_shape);
-            } else {
-                ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-                result.zp = ov::Tensor(zp_type, scale_shape);
-            }
+        if (use_bias && !layout.is_symmetric) {
+            // bias only has effect for asymmetric quant
+            result.zp = ov::Tensor(ov::element::f16, zp_shape);
+        } else {
+            result.zp = ov::Tensor(weight_type, zp_shape);
        }
-        // else: result.zp remains default-constructed (empty) for symmetric
    }

    if (layout.is_requant && layout.requant_type.has_value()) {
@@ -795,75 +741,59 @@ void quantize_q4_0(const float * x,

    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4);  // Signed i4 path
+    auto * zp = static_cast<uint8_t *>(zp_arr.data());
+    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization

-    if (!is_symmetric) {
-        auto * zp = static_cast<uint8_t *>(zp_arr.data());
-        for (int i = 0; i < nb; i++) {
-            float amax = 0.0f;
-            float max = 0.0f;
-            for (int j = 0; j < qk; j++) {
-                const float v = x[i * qk + j];
-                if (amax < fabsf(v)) {
-                    amax = fabsf(v);
-                    max = v;
-                }
+    // For Q4_0, zero point is always 8
+    if (is_scalar_zp) {
+        zp[0] = 8 | (8 << 4);  // Pack two 4-bit values
+    }
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f;  // absolute max
+        float max = 0.0f;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
            }
-            const float d = max / -8;
-            if (d == 0) {
-                scales[i] = ov::float16(1.0f);
+        }
+
+        const float d = max / -8;
+
+        if (d == 0) {
+            scales[i] = ov::float16(1.0f);
+            // zp is already set to 8 for symmetric, or set per-block for asymmetric
+            if (!is_scalar_zp) {
                if (i % 2 == 0) {
                    zp[i / 2] = 8;
                } else {
                    zp[i / 2] |= (8 << 4);
                }
-                memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
-                continue;
            }
-            const float id = 1.0f / d;
-            scales[i] = ov::float16(d);
+            memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
+            continue;
+        }
+
+        const float id = 1.0f / d;
+        scales[i] = ov::float16(d);
+        // For asymmetric quantization, store per-block zero points
+        if (!is_scalar_zp) {
            if (i % 2 == 0) {
                zp[i / 2] = 8;
            } else {
                zp[i / 2] |= (8 << 4);
            }
-            for (int j = 0; j < qk / 2; ++j) {
-                const float x0 = x[i * qk + 2 * j] * id;
-                const float x1 = x[i * qk + 2 * j + 1] * id;
-                const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
-                const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
-                weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
-            }
        }
-    } else {
-        // Symmetric: produce signed i4 values in [-8, 7]
-        for (int i = 0; i < nb; i++) {
-            float amax = 0.0f;
-            float max = 0.0f;
-            for (int j = 0; j < qk; j++) {
-                const float v = x[i * qk + j];
-                if (amax < fabsf(v)) {
-                    amax = fabsf(v);
-                    max = v;
-                }
-            }
-            const float d = max / -8;
-            if (d == 0) {
-                scales[i] = ov::float16(1.0f);
-                // i4 value 0 packed: 0x00
-                memset(weights + i * qk / 2, 0, qk / 2);
-                continue;
-            }
-            const float id = 1.0f / d;
-            scales[i] = ov::float16(d);
-            for (int j = 0; j < qk / 2; ++j) {
-                const float x0 = x[i * qk + 2 * j] * id;
-                const float x1 = x[i * qk + 2 * j + 1] * id;
-                // Signed i4: range [-8, 7]. Quantize as round(x*id), then pack as 4-bit two's complement.
-                int8_t si0 = (int8_t) std::max(-8, std::min(7, (int) roundf(x0)));
-                int8_t si1 = (int8_t) std::max(-8, std::min(7, (int) roundf(x1)));
-                weights[i * qk / 2 + j] = (si0 & 0x0F) | ((si1 & 0x0F) << 4);
-            }
+
+        for (int j = 0; j < qk / 2; ++j) {
+            const float x0 = x[i * qk + 2 * j] * id;
+            const float x1 = x[i * qk + 2 * j + 1] * id;
+            const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
+            const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
+            weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
        }
    }
 }
@@ -879,42 +809,36 @@ void quantize_q8_0(const float * x,

    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8);  // Signed i8 path
+    auto * zp = static_cast<uint8_t *>(zp_arr.data());
+    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization

-    if (!is_symmetric) {
-        auto * zp = static_cast<uint8_t *>(zp_arr.data());
-        for (int i = 0; i < nb; i++) {
-            float amax = 0.0f;
-            for (int j = 0; j < qk; j++) {
-                const float v = x[i * qk + j];
-                amax = std::max(amax, fabsf(v));
-            }
-            const float d = amax / 127.0f;
-            const float id = d ? 1.0f / d : 0.0f;
-            scales[i] = ov::float16(d);
-            zp[i] = 128;
-            for (int j = 0; j < qk; ++j) {
-                const float x0 = x[i * qk + j] * id;
-                const int8_t xi0 = roundf(x0);
-                weights[i * qk + j] = (uint8_t) (xi0 + 128);
+    // For Q8_0, zero point is always 128
+    if (is_scalar_zp) {
+        zp[0] = 128;
+    }
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f;  // absolute max
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
            }
        }
-    } else {
-        // Symmetric: store signed int8 values directly
-        auto * signed_weights = reinterpret_cast<int8_t *>(weights);
-        for (int i = 0; i < nb; i++) {
-            float amax = 0.0f;
-            for (int j = 0; j < qk; j++) {
-                const float v = x[i * qk + j];
-                amax = std::max(amax, fabsf(v));
-            }
-            const float d = amax / 127.0f;
-            const float id = d ? 1.0f / d : 0.0f;
-            scales[i] = ov::float16(d);
-            for (int j = 0; j < qk; ++j) {
-                const float x0 = x[i * qk + j] * id;
-                signed_weights[i * qk + j] = (int8_t) roundf(x0);
-            }
+
+        const float d = amax / 127.0f;
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        // For asymmetric quantization, store per-block zero points
+        if (!is_scalar_zp) {
+            zp[i] = 128;
+        }
+
+        for (int j = 0; j < qk; ++j) {
+            const float x0 = x[i * qk + j] * id;
+            const int8_t xi0 = roundf(x0);
+            weights[i * qk + j] = (uint8_t) (xi0 + 128);
        }
    }
 }
@@ -937,8 +861,12 @@ void quantize_q8_1(const float * x,

        for (int j = 0; j < qk; j++) {
            const float v = x[i * qk + j];
-            min = std::min(v, min);
-            max = std::max(v, max);
+            if (v < min) {
+                min = v;
+            }
+            if (v > max) {
+                max = v;
+            }
        }

        const float d = (max - min) / ((1 << 8) - 1);
@@ -9,17 +9,12 @@
 #include <openvino/op/add.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
-#include <openvino/op/convert.hpp>
-#include <openvino/op/cos.hpp>
-#include <openvino/op/gather.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/shape_of.hpp>
-#include <openvino/op/sin.hpp>
 #include <openvino/op/slice.hpp>
 #include <openvino/op/split.hpp>
 #include <openvino/op/subtract.hpp>
-#include <openvino/op/transpose.hpp>
 #include <openvino/op/unsqueeze.hpp>
 #include <vector>

@@ -38,12 +33,6 @@ OutputVector translate_rope(const NodeContext & context) {
    auto data_node = context.get_input(0).get_node_shared_ptr();
    auto output_shape = context.get_output_shape().to_shape();
    int32_t * op_params = context.get_output_op_params();
-    const int mode = (op_case & 0xFFFF0000) >> 16;
-    op_case = (op_case & 0x0000FFFF);
-
-    constexpr int TYPE_NORMAL = 0;
-    constexpr int TYPE_NEOX = 1;
-    constexpr int TYPE_IMROPE = 2;

    Output<Node> cos_theta_node;
    Output<Node> sin_theta_node;
@@ -56,7 +45,7 @@ OutputVector translate_rope(const NodeContext & context) {
        if (context.get_input_size() == 3) {
            rope_freqs_weight = context.get_input(2).get_node_shared_ptr();
        }
-        auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE);
+        auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight);
        sin_theta_node = sin_cos.first;
        cos_theta_node = sin_cos.second;
    }
@@ -76,7 +65,11 @@ OutputVector translate_rope(const NodeContext & context) {
        }
    }

-    if (mode == TYPE_NORMAL) {
+    const int mode = op_params[2];
+    constexpr int ROPE_TYPE_NORMAL = 0;
+    constexpr int ROPE_TYPE_NEOX = 2;
+
+    if (mode == ROPE_TYPE_NORMAL) {
        auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
@@ -104,7 +97,7 @@ OutputVector translate_rope(const NodeContext & context) {
        auto data_shape = ov::op::v0::Constant::create(
            ov::element::i64, {4}, std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
        res = std::make_shared<ov::op::v1::Reshape>(stack, data_shape, false);
-    } else if (mode == TYPE_NEOX) {
+    } else if (mode == ROPE_TYPE_NEOX) {
        auto data_split = std::make_shared<ov::op::v1::Split>(
            data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2);
        Output<Node> slice_data_node_0 = data_split->outputs()[0];
@@ -119,25 +112,6 @@ OutputVector translate_rope(const NodeContext & context) {
            std::make_shared<ov::op::v1::Multiply>(slice_data_node_1, cos_theta_node));

        res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{first_half_node, second_half_node}, -1);
-    } else if (mode == TYPE_IMROPE) {
-        int64_t n_dims = data_node->get_shape()[3];
-        auto cos_sin_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{1,-1,1,(n_dims >> 1)});
-        auto cos_reshaped = std::make_shared<ov::op::v1::Reshape>(cos_theta_node, cos_sin_shape, true);
-        auto sin_reshaped = std::make_shared<ov::op::v1::Reshape>(sin_theta_node, cos_sin_shape, true);
-
-        auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3});
-        auto split_a = std::make_shared<ov::op::v1::Split>(data_node, split_axis, 2);
-        auto x0 = split_a->output(0);
-        auto x1 = split_a->output(1);
-        auto mul_a = std::make_shared<ov::op::v1::Multiply>(x0, cos_reshaped);
-        auto mul_b = std::make_shared<ov::op::v1::Multiply>(x1, sin_reshaped);
-        auto sub = std::make_shared<ov::op::v1::Subtract>(mul_a, mul_b);
-
-        auto mul_c = std::make_shared<ov::op::v1::Multiply>(x0, sin_reshaped);
-        auto mul_d = std::make_shared<ov::op::v1::Multiply>(x1, cos_reshaped);
-        auto add = std::make_shared<ov::op::v1::Add>(mul_c, mul_d);
-
-        res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{sub, add}, 3);
    }

    return rename_outputs_with_suffix({res}, context.get_name());
@@ -1,25 +0,0 @@
-#include "../node_context.h"
-#include "../op_table.h"
-#include "../utils.h"
-
-#include <openvino/core/node_output.hpp>
-#include <openvino/op/gelu.hpp>
-
-namespace ov {
-namespace frontend {
-namespace ggml {
-namespace op {
-
-OutputVector translate_unary_gelu(const NodeContext & context) {
-    num_inputs_check(context, 1, 1);
-
-    auto input = context.get_input(0);
-    auto res = std::make_shared<ov::op::v7::Gelu>(input);
-
-    return rename_outputs_with_suffix({res}, context.get_name());
-}
-
-}  // namespace op
-}  // namespace ggml
-}  // namespace frontend
-}  // namespace ov
@@ -31,7 +31,6 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
        {"GGML_OP_SOFT_MAX",       op::translate_soft_max                         },
        {"GGML_OP_SUB",            op::translate_1to1_match_2_inputs<v1::Subtract>},
        {"GGML_OP_TRANSPOSE",      op::translate_transpose                        },
-        {"GGML_UNARY_OP_GELU",     op::translate_unary_gelu                       },
        {"GGML_UNARY_OP_SILU",     op::translate_unary_silu                       },
        {"GGML_OP_VIEW",           op::translate_view                             },
        {"GGML_GLU_OP_SWIGLU",     op::translate_glu_swiglu                       },
@@ -21,7 +21,6 @@ GGML_OP_CONVERTER(translate_rms_norm);
 GGML_OP_CONVERTER(translate_rope);
 GGML_OP_CONVERTER(translate_scale);
 GGML_OP_CONVERTER(translate_unary_silu);
-GGML_OP_CONVERTER(translate_unary_gelu);
 GGML_OP_CONVERTER(translate_soft_max);
 GGML_OP_CONVERTER(translate_transpose);
 GGML_OP_CONVERTER(translate_view);
@@ -0,0 +1,123 @@
+#include "eliminate_zp.h"
+
+#include <openvino/core/graph_util.hpp>
+#include <openvino/core/parallel.hpp>
+#include <openvino/core/rt_info.hpp>
+#include <openvino/op/constant.hpp>
+#include <openvino/op/convert.hpp>
+#include <openvino/op/multiply.hpp>
+#include <openvino/op/subtract.hpp>
+#include <openvino/pass/pattern/op/label.hpp>
+#include <openvino/pass/pattern/op/pattern.hpp>
+#include <openvino/pass/pattern/op/wrap_type.hpp>
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace pass {
+
+EliminateZeroPoints::EliminateZeroPoints() {
+    // Matches the dequantization pattern
+    //   Multiply(scale, Subtract(Convert(Constant data), Convert(Constant zero_point)))
+    // where scale is any input and zero_point holds exactly one element.
+    // The zero point is folded into the weights so the Subtract disappears:
+    //   - u4 data with zp == 8 (q4_0) becomes an i4 Constant holding data - 8;
+    //   - u8 data with zp == 128 (q8_0) or 32 (q6_k) becomes an i8 Constant holding data - zp.
+    // Any other type/zero-point combination leaves the graph untouched.
+
+    auto m_data_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
+    auto m_data_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_data_constant});
+
+    auto m_zp_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
+    auto m_zp_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_zp_constant});
+
+    auto m_subtract = ov::pass::pattern::wrap_type<ov::op::v1::Subtract>({m_data_convert, m_zp_convert});
+    auto m_scale = ov::pass::pattern::any_input();
+    auto m_multiply = ov::pass::pattern::wrap_type<ov::op::v1::Multiply>({m_scale, m_subtract});
+
+    const auto callback = [=](ov::pass::pattern::Matcher & m) {
+        const auto & pattern_map = m.get_pattern_value_map();
+
+        auto multiply_node =
+            std::dynamic_pointer_cast<ov::op::v1::Multiply>(pattern_map.at(m_multiply).get_node_shared_ptr());
+        auto subtract_node =
+            std::dynamic_pointer_cast<ov::op::v1::Subtract>(pattern_map.at(m_subtract).get_node_shared_ptr());
+        auto data_constant =
+            std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_data_constant).get_node_shared_ptr());
+        auto zp_constant =
+            std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_zp_constant).get_node_shared_ptr());
+
+        if (!multiply_node || !subtract_node || !data_constant || !zp_constant) {
+            return false;
+        }
+
+        if (ov::shape_size(zp_constant->get_shape()) != 1) {  // single-element zero point only
+            return false;
+        }
+
+        auto data_type = data_constant->get_element_type();
+        auto zp_data = zp_constant->cast_vector<int>();
+
+        if (zp_data.empty()) {
+            return false;
+        }
+
+        int zp_value = zp_data[0];
+
+        bool should_eliminate = false;
+        ov::element::Type target_type;  // signed type of the replacement Constant
+
+        if (data_type == ov::element::u4 && zp_value == 8) {
+            should_eliminate = true;
+            target_type = ov::element::i4;
+        } else if (data_type == ov::element::u8 && (zp_value == 128 || zp_value == 32)) {
+            should_eliminate = true;
+            target_type = ov::element::i8;
+        }
+
+        if (!should_eliminate) {
+            return false;
+        }
+
+        auto data_shape = data_constant->get_shape();
+        size_t total_elements = ov::shape_size(data_shape);
+
+        std::shared_ptr<ov::op::v0::Constant> new_constant;
+
+        // TODO improve performance (cast_vector unpacks the whole constant into a temporary vector first)
+        if (data_type == ov::element::u4) {  // zp_value is known to be 8 here
+            auto data_values = data_constant->cast_vector<uint8_t>();
+            std::vector<int8_t> adjusted_values(total_elements);
+
+            ov::parallel_for(total_elements, [&](size_t i) {
+                adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - 8);
+            });
+
+            new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
+        } else if (data_type == ov::element::u8) {  // zp_value is 128 or 32 here
+            auto data_values = data_constant->cast_vector<uint8_t>();
+            std::vector<int8_t> adjusted_values(total_elements);
+
+            ov::parallel_for(total_elements, [&, zp_value](size_t i) {
+                adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - zp_value);
+            });
+
+            new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
+        }
+
+        auto new_convert =
+            std::make_shared<ov::op::v0::Convert>(new_constant, subtract_node->get_output_element_type(0));
+        ov::replace_node(subtract_node, new_convert);  // the Multiply now reads Convert(new_constant)
+
+        return true;  // report that the graph was modified
+    };
+
+    register_matcher(
+        std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
+        callback);
+}
+
+}  // namespace pass
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "openvino/pass/matcher_pass.hpp"
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace pass {
+
+// Matcher pass that removes the zero-point Subtract from weight dequantization
+// subgraphs of the form Multiply(scale, Subtract(Convert(data), Convert(zero_point)))
+// by folding a single-element zero point into a signed (i4/i8) weight Constant.
+// The constructor definition lists the exact type/zero-point combinations handled.
+class EliminateZeroPoints : public ov::pass::MatcherPass {
+public:
+    OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints")
+    // Builds the pattern and registers the rewrite callback with the matcher.
+    EliminateZeroPoints();
+};
+
+}  // namespace pass
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
@@ -1,41 +0,0 @@
-// Copyright (C) 2018-2026 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <openvino/core/core_visibility.hpp>
-#include <openvino/core/node.hpp>
-#include <openvino/core/runtime_attribute.hpp>
-
-namespace ov {
-
-/**
- * @brief Holds weightless caching attributes of a single constant.
- *
- * WeightlessCacheAttribute class represents runtime info attribute that holds
- * the values of original size of the constant in bytes and the binary offset of the
- * constant's data in the weights file used by the weightless caching mechanism. It's
- * not copyable in case the data was changed (the original node was replaced by a new
- * one produced during the tranformation pipeline) - in that case weightless caching
- * can't be used for that constant.
- */
-class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute {
-public:
-    OPENVINO_RTTI("WeightlessCacheAttribute", "0", RuntimeAttribute)
-
-    WeightlessCacheAttribute() = delete;
-
-    WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype)
-        : original_size(original_size),
-          bin_offset(bin_offset),
-          original_dtype(original_dtype) {}
-
-    bool is_copyable() const override;
-
-    size_t original_size;
-    size_t bin_offset;
-    ov::element::Type original_dtype;
-};
-
-}  // namespace ov
@@ -3,16 +3,15 @@
 #include "ggml-openvino/openvino/node_context.h"
 #include "ggml-openvino/openvino/utils.h"
 #include "input_model.h"
+#include "pass/eliminate_zp.h"
 #include "pass/mark_decompression_convert_constant_folding.h"
 #include "pass/squeeze_matmul.h"
-#include "rt_info/weightless_caching_attributes.hpp"

 #include <cstdint>
 #include <cstdlib>
 #include <map>
 #include <memory>
 #include <openvino/core/node.hpp>
-#include <openvino/core/preprocess/pre_post_process.hpp>
 #include <openvino/op/add.hpp>
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
@@ -34,6 +33,7 @@
 #include <openvino/op/unsqueeze.hpp>
 #include <openvino/pass/constant_folding.hpp>
 #include <openvino/pass/make_stateful.hpp>
+#include <openvino/core/preprocess/pre_post_process.hpp>

 namespace ov {
 namespace frontend {
@@ -240,31 +240,6 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
    resulting_model = std::make_shared<Model>(results, used_params);

    apply_transformations(resulting_model);
-
-    // Set WeightlessCacheAttribute on large constants to avoid unnecessary memory copies
-    // in the NPUW plugin. Without this attribute, NPUW's LazyTensor constructor
-    // (lazy_tensor.cpp, op::Const::Const) will memcpy every constant "in case export
-    // occurs", doubling memory usage per compile_model call.
-    //
-    // The bin_offset field serves as a unique key (not a real file offset) — this is
-    // the same convention the GPU plugin uses for non-IR models (see
-    // Plugin::set_weightless_cache_attributes in intel_gpu/src/plugin/plugin.cpp).
-    // Each constant must have a distinct bin_offset, otherwise GPU's weightless cache
-    // import will map multiple constants to the same data.
-    //
-    // Small constants (< 16 elements) are excluded since they may be introduced by
-    // optimization patterns and the overhead is negligible.
-    size_t offset = 0;
-    for (auto & node : resulting_model->get_ordered_ops()) {
-        if (auto cnst = ov::as_type_ptr<ov::op::v0::Constant>(node);
-            cnst && cnst->get_byte_size() / cnst->get_element_type().size() >= 16) {
-            auto & rt_info = cnst->get_rt_info();
-            if (rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()) == rt_info.end()) {
-                rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] =
-                    ov::WeightlessCacheAttribute(cnst->get_byte_size(), offset++, cnst->get_element_type());
-            }
-        }
-    }
    return resulting_model;
 }

@@ -282,6 +257,7 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
        }

        if (ggml_model_decoder->is_static()) {
+            manager.register_pass<pass::EliminateZeroPoints>();
            manager.register_pass<pass::SqueezeMatmul>();
        }
        manager.run_passes(model);
@@ -2,7 +2,6 @@

 #include "ggml-impl.h"

-#include <cmath>
 #include <cstddef>
 #include <ctime>
 #include <memory>
@@ -14,7 +13,6 @@
 #include <openvino/op/gather.hpp>
 #include <openvino/op/maximum.hpp>
 #include <openvino/op/multiply.hpp>
-#include <openvino/op/reshape.hpp>
 #include <openvino/op/shape_of.hpp>
 #include <openvino/op/sin.hpp>
 #include <openvino/op/squeeze.hpp>
@@ -89,11 +87,8 @@ ov::Output<ov::Node> rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], fl
    auto ramp_y =
        std::make_shared<ov::op::v1::Divide>(std::make_shared<ov::op::v1::Subtract>(dim_ids, corr_low), denom);
    auto ramp_clamped = std::make_shared<ov::op::v0::Clamp>(ramp_y, 0.0f, 1.0f);
-    // rope_yarn_ramp returns (1 - clamp(y)), so invert before scaling
-    auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
-    auto ramp_inverted = std::make_shared<ov::op::v1::Subtract>(one, ramp_clamped);
    auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor});
-    auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_inverted, ext_factor_node);
+    auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_clamped, ext_factor_node);
    return ramp_mix;
 }

@@ -120,7 +115,6 @@ void ggml_rope_yarn_corr_dims(int n_dims,
 std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params,
                                                           std::shared_ptr<ov::Node> inp_pos,
                                                           std::shared_ptr<ov::Node> rope_freqs_weight,
-                                                           bool imrope,
                                                           bool stateful) {
    if (stateful) {
        inp_pos = std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
@@ -128,13 +122,6 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
        auto pos_perm =
            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
        inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
-    } else if (imrope) {
-        inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
-        auto pos_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{5}, {0, 0, 0, 4, -1});
-        inp_pos = std::make_shared<ov::op::v1::Reshape>(inp_pos, pos_shape, true);
-        auto pos_transpose_shape =
-            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{5}, std::vector<int64_t>{0, 1, 2, 4, 3});
-        inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_transpose_shape);
    } else {
        inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
        auto pos_perm =
@@ -149,7 +136,6 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
    float beta_fast;
    float beta_slow;
    const int n_dims = rope_params[1];
-    const size_t n_dims_half = n_dims >> 1;
    const int n_ctx_orig = rope_params[4];
    memcpy(&freq_base, rope_params + 5, sizeof(float));
    memcpy(&freq_scale, rope_params + 6, sizeof(float));
@@ -160,74 +146,57 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params

    const float theta_scale = powf(freq_base, -2.0f / n_dims);

-    std::vector<float> factor(n_dims_half);
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+    std::vector<float> factor(n_dims / 2);
+    factor[0] = 1.0f;
+    for (size_t i = 1; i < factor.size(); i++) {
+        factor[i] = theta_scale * factor[i - 1];
+    }

    Output<Node> freq_factors;
+    if (stateful) {
+        freq_factors =
+            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
+    } else {
+        freq_factors =
+            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor);
+    }
+    if (rope_freqs_weight) {
+        freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
+    }
+
+    auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
+    auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
+        theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));

    Output<Node> theta;
    float mscale = attn_factor;
-    if (imrope) {
-        std::vector<int64_t> gather_indices(n_dims_half);
-        for (size_t j = 0; j < n_dims_half; j++) {
-            gather_indices[j] = j % 3;
-            factor[j] = std::pow(theta_scale, j);
-        }
-        auto gather_indices_const =
-            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{n_dims_half}, gather_indices);
-        auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {4});
-        inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, gather_indices_const, gather_axis);
-        auto factor_const = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{n_dims_half}, factor);
-        theta = std::make_shared<ov::op::v1::Multiply>(inp_pos, factor_const);
+    if (ext_factor == 0.0f) {
+        theta = theta_interp;
    } else {
-        float corr_dims[2];
-        ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
-        factor[0] = 1.0f;
-        for (size_t i = 1; i < factor.size(); i++) {
-            factor[i] = theta_scale * factor[i - 1];
-        }
+        auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
+        Output<Node> one;
        if (stateful) {
-            freq_factors =
-                std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
+            one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
        } else {
-            freq_factors =
-                std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor);
-        }
-        if (rope_freqs_weight) {
-            freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
+            one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
        }
+        auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);

-        auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
-        auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
-            theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
-
-        if (ext_factor == 0.0f) {
-            theta = theta_interp;
-        } else {
-            auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
-            Output<Node> one;
-            if (stateful) {
-                one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
-            } else {
-                one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
-            }
-            auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
-
-            theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
-                                                      std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
-            mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
-        }
+        theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
+                                                  std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
+        mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
    }

    Output<Node> cos_theta = std::make_shared<ov::op::v0::Cos>(theta);
    Output<Node> sin_theta = std::make_shared<ov::op::v0::Sin>(theta);

-    if (!imrope) {
-        auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
-
-        cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
-        sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
-    }
+    auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});

+    cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
+    sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
    return std::make_pair(sin_theta, cos_theta);
 }

@@ -67,7 +67,6 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::
 std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
                                                           std::shared_ptr<ov::Node> inp_pos,
                                                           std::shared_ptr<ov::Node> rope_freqs_weight = nullptr,
-                                                           bool imrope = false,
                                                           bool stateful = false);

 ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);
@@ -81,8 +81,8 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
 enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) {
    auto & core = ov_singleton_core();
    const auto & config = ggml_openvino_get_compile_config();
-    const auto & device = r_ctx->device;
-    const auto & stateful = r_ctx->stateful;
+    auto device = r_ctx->device;
+    bool stateful = r_ctx->stateful;
    static auto is_static = false;

    if (is_naive(cgraph)) {
@@ -106,26 +106,14 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
    int64_t infer_end_time;

    {
-        std::shared_ptr<decoder_runtime_ctx> entry;
+        std::lock_guard<std::mutex> lock(r_ctx->ov_compute_mutex);
+
+        auto it = r_ctx->decoder_cache.find(key);
+
+        cache_hit = it != r_ctx->decoder_cache.end();
        ModelParams old_m_params;
-
-        {
-            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
-            auto it = r_ctx->decoder_cache.find(key);
-            cache_hit = it != r_ctx->decoder_cache.end();
-            if (cache_hit) {
-                entry = it->second;
-            } else {
-                auto mutex = std::make_shared<std::mutex>();
-                entry = std::make_shared<decoder_runtime_ctx>(mutex);
-                r_ctx->decoder_cache[key] = entry;
-            }
-        }
-
-        std::lock_guard<std::mutex> lock(*(entry->mutex));
-
        if (cache_hit) {
-            ggml_decoder = entry->ptr;
+            ggml_decoder = it->second;
            old_m_params = ggml_decoder->get_model_params();
            cache_hit = old_m_params.can_reuse_dynamically(m_params);
        }
@@ -138,10 +126,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
                ggml_decoder->update_io(cgraph);
            }
            ggml_decoder->add_extra_inputs();
-            {
-                std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
-                infer_request = r_ctx->infer_request_cache.at(key);
-            }
+            infer_request = r_ctx->infer_request_cache.at(key);

            if (stateful) {
                const auto * inp_pos = get_inp_pos_tensor(cgraph);
@@ -185,10 +170,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
            conversion_end_time = decoder_end_time;
            compile_end_time = decoder_end_time;
        } else {
-            {
-                std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
-                r_ctx->infer_request_cache.erase(key);
-            }
+            r_ctx->infer_request_cache.erase(key);

            std::shared_ptr<ov::Model> model;
            auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
@@ -217,7 +199,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
            }
            compile_end_time = ggml_time_us();
            infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
-            entry->ptr = ggml_decoder;
+            r_ctx->infer_request_cache[key] = infer_request;
+            r_ctx->decoder_cache[key] = ggml_decoder;

            std::vector<std::string> ov_input_names;
            std::vector<std::string> ov_output_names;
@@ -227,13 +210,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
            for (const auto & ov_output : model->get_results()) {
                ov_output_names.push_back(ov_output->get_friendly_name());
            }
-
-            {
-                std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
-                r_ctx->infer_request_cache[key] = infer_request;
-                r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
-                r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
-            }
+            r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
+            r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);

            if (stateful) {
                const auto * inp_pos = get_inp_pos_tensor(cgraph);
@@ -246,13 +224,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
            }
        }

-        std::vector<std::string> ov_input_names;
-        std::vector<std::string> ov_output_names;
-        {
-            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
-            ov_input_names = r_ctx->ov_input_names_cache[key];
-            ov_output_names = r_ctx->ov_output_names_cache[key];
-        }
+        auto ov_input_names = r_ctx->ov_input_names_cache[key];
+        auto ov_output_names = r_ctx->ov_output_names_cache[key];

        for (size_t i = 0; i < ov_input_names.size(); i++) {
            auto param_name = ov_input_names[i];
@@ -333,26 +306,12 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
    int64_t compile_end_time;
    int64_t infer_end_time;

-    std::shared_ptr<decoder_runtime_ctx> entry;
+    auto it = r_ctx->decoder_cache.find(key);
+
+    cache_hit = it != r_ctx->decoder_cache.end();
    ModelParams old_m_params;
-
-    {
-        std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
-        auto it = r_ctx->decoder_cache.find(key);
-        cache_hit = it != r_ctx->decoder_cache.end();
-        if (cache_hit) {
-            entry = it->second;
-        } else {
-            auto mutex = std::make_shared<std::mutex>();
-            entry = std::make_shared<decoder_runtime_ctx>(mutex);
-            r_ctx->decoder_cache[key] = entry;
-        }
-    }
-
-    std::lock_guard<std::mutex> lock(*(entry->mutex));
-
    if (cache_hit) {
-        ggml_decoder = entry->ptr;
+        ggml_decoder = it->second;
        old_m_params = ggml_decoder->get_model_params();
        cache_hit = old_m_params.can_reuse_statically(m_params);
    }
@@ -366,21 +325,14 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
            ggml_decoder->update_io(cgraph);
        }
        ggml_decoder->add_extra_inputs();
-        {
-            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
-            infer_request =
-                is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
-        }
+        infer_request = is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);

        decoder_end_time = ggml_time_us();
        conversion_end_time = decoder_end_time;
        compile_end_time = decoder_end_time;
    } else {
-        {
-            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
-            r_ctx->infer_request_cache.erase(key);
-            r_ctx->infer_request_cache_prefill.erase(key);
-        }
+        r_ctx->infer_request_cache.erase(key);
+        r_ctx->infer_request_cache_prefill.erase(key);

        std::shared_ptr<ov::Model> model;
        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
@@ -420,14 +372,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
            compiled_model_decode = core.compile_model(model_decode, device, config);
        }

-        auto infer_request_prefill = std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
-        auto infer_request_decode = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
+        r_ctx->infer_request_cache_prefill[key] =
+            std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
+        r_ctx->infer_request_cache[key] =
+            std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
        compile_end_time = ggml_time_us();

        model = is_prefill ? model_prefill : model_decode;
        ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
-        infer_request = is_prefill ? infer_request_prefill : infer_request_decode;
-        entry->ptr = ggml_decoder;
+        infer_request = is_prefill ? r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key];
+        r_ctx->decoder_cache[key] = ggml_decoder;

        std::vector<std::string> ov_input_names;
        std::vector<std::string> ov_output_names;
@@ -437,29 +391,18 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
        for (const auto & ov_output : model->get_results()) {
            ov_output_names.push_back(ov_output->get_friendly_name());
        }
-
-        {
-            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
-            r_ctx->infer_request_cache_prefill[key] = infer_request_prefill;
-            r_ctx->infer_request_cache[key] = infer_request_decode;
-            r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
-            r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
-        }
+        r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
+        r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
    }

-    std::vector<std::string> ov_input_names_local;
-    std::vector<std::string> ov_output_names_local;
-    {
-        std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
-        ov_input_names_local = r_ctx->ov_input_names_cache[key];
-        ov_output_names_local = r_ctx->ov_output_names_cache[key];
-    }
+    auto ov_input_names = r_ctx->ov_input_names_cache[key];
+    auto ov_output_names = r_ctx->ov_output_names_cache[key];

    if (is_prefill) {
        auto inp_len = inp_pos->ne[0];
        for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
-            for (size_t i = 0; i < ov_input_names_local.size(); i++) {
-                auto param_name = ov_input_names_local[i];
+            for (size_t i = 0; i < ov_input_names.size(); i++) {
+                auto param_name = ov_input_names[i];
                auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index);
                infer_request->set_input_tensor(i, input_tensor);

@@ -469,8 +412,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
                }
            }

-            for (size_t i = 0; i < ov_output_names_local.size(); i++) {
-                auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
+            for (size_t i = 0; i < ov_output_names.size(); i++) {
+                auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
                auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
                infer_request->set_output_tensor(i, output_tensor);
            }
@@ -478,16 +421,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
            infer_request->infer();

            if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-                for (size_t i = 0; i < ov_output_names_local.size(); i++) {
+                for (size_t i = 0; i < ov_output_names.size(); i++) {
                    const auto output_tensor = infer_request->get_output_tensor(i);
-                    print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
+                    print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
                }
            }
        }
        infer_end_time = ggml_time_us();
    } else {
-        for (size_t i = 0; i < ov_input_names_local.size(); i++) {
-            auto param_name = ov_input_names_local[i];
+        for (size_t i = 0; i < ov_input_names.size(); i++) {
+            auto param_name = ov_input_names[i];
            auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name);
            infer_request->set_input_tensor(i, input_tensor);

@@ -497,8 +440,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
            }
        }

-        for (size_t i = 0; i < ov_output_names_local.size(); i++) {
-            auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
+        for (size_t i = 0; i < ov_output_names.size(); i++) {
+            auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
            auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
            infer_request->set_output_tensor(i, output_tensor);
        }
@@ -507,9 +450,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
        infer_end_time = ggml_time_us();

        if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-            for (size_t i = 0; i < ov_output_names_local.size(); i++) {
+            for (size_t i = 0; i < ov_output_names.size(); i++) {
                const auto output_tensor = infer_request->get_output_tensor(i);
-                print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
+                print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
            }
        }
    }
@@ -3,15 +3,12 @@
 #include "ggml-impl.h"

 #include <algorithm>
-#include <atomic>
 #include <cstddef>
 #include <memory>
-#include <mutex>
 #include <openvino/runtime/core.hpp>
 #include <openvino/runtime/infer_request.hpp>
 #include <string>
 #include <unordered_map>
-#include <utility>
 #include <vector>

 struct graph_key {
@@ -43,17 +40,11 @@ struct graph_key_hash {
    }
 };

-struct decoder_runtime_ctx {
-    decoder_runtime_ctx(std::shared_ptr<std::mutex> mutex) : mutex(std::move(mutex)) {}
-    std::shared_ptr<std::mutex> mutex;
-    std::shared_ptr<GgmlOvDecoder> ptr;
-};
-
 struct ov_runtime_context {
-    mutable std::mutex ctx_mutex;
+    std::mutex ov_compute_mutex;
    std::string device;
    bool stateful;
-    std::unordered_map<graph_key, std::shared_ptr<decoder_runtime_ctx>, graph_key_hash> decoder_cache;
+    std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
    std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
    std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
    std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
@@ -62,22 +53,11 @@ struct ov_runtime_context {
    //      Simultaneous stateful inference request support to be added.
    size_t stateful_kv_size;
    std::map<std::string, std::string> kv_state_input_name_map;
-    std::atomic<int> backend_count;

    ov_runtime_context() :
        device("CPU"),
        stateful(false),
-        stateful_kv_size(0),
-        backend_count(0) {}
-
-    void clear_caches() {
-        std::lock_guard<std::mutex> lock(ctx_mutex);
-        decoder_cache.clear();
-        infer_request_cache.clear();
-        infer_request_cache_prefill.clear();
-        ov_input_names_cache.clear();
-        ov_output_names_cache.clear();
-    }
+        stateful_kv_size(0) {}
 };

 enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend);
@@ -28,13 +28,6 @@

 namespace syclexp = sycl::ext::oneapi::experimental;

-#if defined(__INTEL_LLVM_COMPILER) && __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
-    #include <sycl/ext/oneapi/bfloat16.hpp>
-    #ifndef GGML_SYCL_HAS_BF16
-        #define GGML_SYCL_HAS_BF16
-    #endif
-#endif
-
 #if GGML_SYCL_DNNL
 #include "dnnl.hpp"
 #include "dnnl_sycl.hpp"
@@ -2,6 +2,13 @@
 #include "dequantize.hpp"
 #include "presets.hpp"

+#if defined(__INTEL_LLVM_COMPILER)
+    #if __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
+        #include <sycl/ext/oneapi/bfloat16.hpp>
+        #define GGML_SYCL_HAS_BF16
+    #endif
+#endif
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
                             const sycl::nd_item<3> &item_ct1) {
@@ -760,22 +767,6 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
 }


-#ifdef GGML_SYCL_HAS_BF16
-to_bf16_sycl_t ggml_get_to_bf16_sycl(ggml_type type, ggml_tensor * /*dst*/) {
-    switch (type) {
-        case GGML_TYPE_F32:
-            return convert_unary_sycl<float>;
-        case GGML_TYPE_F16:
-            return convert_unary_sycl<sycl::half>;
-        case GGML_TYPE_BF16:
-            return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
-        default:
-            GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
-            return nullptr;
-    }
-}
-#endif
-
 to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32:
@@ -23,11 +23,6 @@ typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
 to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst);
 to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor * dst);

-#ifdef GGML_SYCL_HAS_BF16
-typedef to_t_sycl_t<sycl::ext::oneapi::bfloat16> to_bf16_sycl_t;
-to_bf16_sycl_t ggml_get_to_bf16_sycl(ggml_type type, ggml_tensor * dst);
-#endif
-
 // Nc = Non-contiguous
 template <typename T>
 using to_t_nc_sycl_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
@@ -40,19 +35,15 @@ template<typename dst_t, typename src_t>
 inline dst_t ggml_sycl_cast(src_t x) {
    if constexpr (std::is_same_v<dst_t, src_t>) {
        return x;
-#ifdef GGML_SYCL_HAS_BF16
    } else if constexpr (std::is_same_v<dst_t, sycl::ext::oneapi::bfloat16>) {
        return sycl::ext::oneapi::bfloat16(float(x));
    } else if constexpr (std::is_same_v<src_t, sycl::ext::oneapi::bfloat16>) {
        return static_cast<float>(x);
-#endif
    } else if constexpr (std::is_same_v<src_t, sycl::float2> && std::is_same_v<dst_t, sycl::half2>) {
        return x.template convert<sycl::half, sycl::rounding_mode::rte>();
-#ifdef GGML_SYCL_HAS_BF16
    } else if constexpr (std::is_same_v<src_t, sycl::float2> &&
                         std::is_same_v<dst_t, sycl::vec<sycl::ext::oneapi::bfloat16, 2>>) {
        return {x.x, x.y};
-#endif
    } else if constexpr(std::is_same_v<dst_t, int32_t>) {
        return int32_t(x);
    } else {
@@ -29,9 +29,6 @@ public:
    static constexpr dt to_dt() {
        if constexpr (std::is_same_v<T, float>) return dt::f32;
        else if constexpr (std::is_same_v<T, sycl::half>) return dt::f16;
-#ifdef GGML_SYCL_HAS_BF16
-        else if constexpr (std::is_same_v<T, sycl::ext::oneapi::bfloat16>) return dt::bf16;
-#endif
        else static_assert(0);
    }

@@ -2176,31 +2176,6 @@ inline void ggml_sycl_op_mul_mat_sycl(
 #else
    bool use_fp16 = false;
 #endif
-
-#if GGML_SYCL_DNNL && defined(GGML_SYCL_HAS_BF16)
-    // Fast path for bf16 src0
-    if (src0->type == GGML_TYPE_BF16 && !g_ggml_sycl_disable_dnn && ggml_is_contiguous(src0) &&
-        row_diff == src0->ne[1]) {
-        using bf16_t = sycl::ext::oneapi::bfloat16;
-        ggml_sycl_pool_alloc<bf16_t> src1_as_bf16(ctx.pool(), src1_ncols*ne10);
-        if (src1->type != GGML_TYPE_BF16) {
-            const to_bf16_sycl_t to_bf16_sycl = ggml_get_to_bf16_sycl(src1->type, dst);
-            GGML_ASSERT(to_bf16_sycl != nullptr);
-            to_bf16_sycl(src1_ddf_i, src1_as_bf16.get(), src1_ncols*ne10, stream);
-        } else {
-            stream->memcpy(src1_as_bf16.get(), src1_ddf_i, src1_ncols*ne10*sizeof(bf16_t));
-        }
-        DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10,
-                                  src0_dd_i, DnnlGemmWrapper::to_dt<bf16_t>(),
-                                  src1_as_bf16.get(), DnnlGemmWrapper::to_dt<bf16_t>(),
-                                  dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
-        GGML_UNUSED(dst);
-        GGML_UNUSED(src1_ddq_i);
-        GGML_UNUSED(src1_padded_row_size);
-        return;
-    }
-#endif
-
    if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
        row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
        ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
@@ -3808,51 +3783,6 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
    }
 }

-// Fused MoE TG fast path. Returns false to fall back to the per-expert loop below.
-static bool ggml_sycl_mul_mat_id_mmvq_fused(
-    ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
-    const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst)
-{
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    if (ne12 != 1) return false;
-    if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false;
-    if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
-    if (!ggml_is_contiguous(src1)) return false;
-
-    // Reorder layout not supported; fall back.
-    const ggml_tensor_extra_gpu * src0_extra =
-        static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
-    if (src0_extra && src0_extra->optimized_feature.reorder) return false;
-
-    const int64_t n_ids_per_group = ids->ne[0];
-    if (ids->ne[1] != 1) return false;
-    if (ne11 != 1 && ne11 != n_ids_per_group) return false;
-
-    const queue_ptr stream           = ctx.stream();
-    const int       src1_padded_cols = GGML_PAD((int) ne10, MATRIX_ROW_PADDING);
-    const int       n_experts_used   = (int) n_ids_per_group;
-    const int       nrows            = (int) src0->ne[1];
-
-    ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
-        (size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
-    char * src1_ddq = src1_q8_alloc.get();
-    quantize_row_q8_1_sycl<quantize_q8_1>(
-        (const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
-        src1_padded_cols, stream);
-
-    const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
-    const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
-
-    return ggml_sycl_mul_mat_vec_q_id(
-        src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
-        (float *) dst->data, (int) ne10, nrows, n_experts_used,
-        /*expert_weight_stride=*/ src0->nb[2],
-        /*dst_row_stride=*/ dst->nb[1],
-        src1_row_stride, stream);
-}
-
 static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
                                 ggml_tensor *dst) try {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
@@ -3868,12 +3798,6 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
    const int64_t n_as = ne02;
    const int64_t n_ids = ids->ne[0];

-    if (ne12 == 1) {
-        if (ggml_sycl_mul_mat_id_mmvq_fused(ctx, src0, src1, ids, dst)) {
-            return;
-        }
-    }
-
    std::vector<char> ids_host(ggml_nbytes(ids));
    const char * ids_dev = (const char *) ids->data;

@@ -3924,9 +3848,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
            }
        }
    } else {
-        const int64_t n_routed_rows = ids->ne[1] * n_ids;
-        ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne10);
-        ggml_sycl_pool_alloc<char>  dst_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne0);
+        ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
+        ggml_sycl_pool_alloc<char>  dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));

        src1_row.data = src1_contiguous.get();
        dst_row.data  =  dst_contiguous.get();
@@ -1199,154 +1199,3 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
    GGML_UNUSED(src1_ddf_i);
    GGML_UNUSED(ctx);
 }
-
-// src1_row_stride: 0 for shared src1 (gate/up proj), else per-expert stride (down proj).
-template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
-static void mul_mat_vec_q_moe(
-    const void * __restrict__ vx_base, const void * __restrict__ vy_base,
-    float * __restrict__ dst_base, const int32_t * __restrict__ ids_dev,
-    const int ncols, const int nrows,
-    const size_t expert_weight_stride, const size_t dst_row_stride,
-    const size_t src1_row_stride,
-    const sycl::nd_item<3> & item_ct1) {
-
-    const int expert_idx = item_ct1.get_group(1);
-    const int i02        = ids_dev[expert_idx];
-
-    const char * vx = (const char *) vx_base + (size_t) i02 * expert_weight_stride;
-    const char * vy = (const char *) vy_base + (size_t) expert_idx * src1_row_stride;
-    float *      dst = (float *) ((char *) dst_base + (size_t) expert_idx * dst_row_stride);
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int     blocks_per_row  = ncols / qk;
-    constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi;
-
-    float tmp = 0.0f;
-
-    const block_q_t *  x = (const block_q_t *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row * blocks_per_row + i;
-        const int iby = i * (qk / QK8_1);
-
-        for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
-            const int iqs = elem + vdr * (item_ct1.get_local_id(2) % (qi / vdr));
-            tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
-        }
-    }
-
-#pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
-        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
-static void launch_mul_mat_vec_q_moe(
-    const void * vx_base, const void * vy, const int32_t * ids_dev,
-    float * dst_base, const int ncols, const int nrows, const int n_experts_used,
-    const size_t expert_weight_stride, const size_t dst_row_stride,
-    const size_t src1_row_stride,
-    dpct::queue_ptr stream) {
-    const int            block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, (unsigned) n_experts_used, (unsigned) block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->submit([&](sycl::handler & cgh) {
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                mul_mat_vec_q_moe<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
-                    vx_base, vy, dst_base, ids_dev, ncols, nrows,
-                    expert_weight_stride, dst_row_stride, src1_row_stride, item);
-            });
-    });
-}
-
-bool ggml_sycl_mul_mat_vec_q_id(
-    enum ggml_type     src0_type,
-    const void *       vx_base,
-    const void *       vy,
-    const int32_t *    ids_dev,
-    float *            dst_base,
-    int                ncols,
-    int                nrows,
-    int                n_experts_used,
-    size_t             expert_weight_stride,
-    size_t             dst_row_stride,
-    size_t             src1_row_stride,
-    dpct::queue_ptr    stream) {
-    switch (src0_type) {
-        case GGML_TYPE_Q4_0:
-            launch_mul_mat_vec_q_moe<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        case GGML_TYPE_Q4_1:
-            launch_mul_mat_vec_q_moe<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        case GGML_TYPE_Q5_0:
-            launch_mul_mat_vec_q_moe<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        case GGML_TYPE_Q5_1:
-            launch_mul_mat_vec_q_moe<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        case GGML_TYPE_Q8_0:
-            launch_mul_mat_vec_q_moe<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        case GGML_TYPE_Q2_K:
-            launch_mul_mat_vec_q_moe<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        case GGML_TYPE_Q3_K:
-            launch_mul_mat_vec_q_moe<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        case GGML_TYPE_Q4_K:
-            launch_mul_mat_vec_q_moe<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        case GGML_TYPE_Q5_K:
-            launch_mul_mat_vec_q_moe<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        case GGML_TYPE_Q6_K:
-            launch_mul_mat_vec_q_moe<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        case GGML_TYPE_MXFP4:
-            launch_mul_mat_vec_q_moe<QK_MXFP4, QI_MXFP4, block_mxfp4, VDR_MXFP4_Q8_1_MMVQ, vec_dot_mxfp4_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        case GGML_TYPE_NVFP4:
-            launch_mul_mat_vec_q_moe<QK_NVFP4, QI_NVFP4, block_nvfp4, VDR_NVFP4_Q8_1_MMVQ, vec_dot_nvfp4_q8_1>(
-                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
-                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
-            return true;
-        default:
-            return false;
-    }
-}
@@ -24,20 +24,4 @@ void ggml_sycl_op_mul_mat_vec_q(
    const int64_t src1_ncols, const int64_t src1_padded_row_size,
    const dpct::queue_ptr &stream);

-// Requires standard (non-reorder) block layout for src0.
-// Returns false if src0_type isn't handled; caller should fall back.
-bool ggml_sycl_mul_mat_vec_q_id(
-    enum ggml_type     src0_type,
-    const void *       vx_base,             // start of stacked expert weights
-    const void *       vy,                  // pre-quantized src1 (Q8_1)
-    const int32_t *    ids_dev,             // device-side int32, length n_experts_used
-    float *            dst_base,
-    int                ncols,
-    int                nrows,
-    int                n_experts_used,
-    size_t             expert_weight_stride, // bytes between experts in vx_base
-    size_t             dst_row_stride,       // bytes between dst rows
-    size_t             src1_row_stride,      // 0 = shared src1, else per-expert stride in bytes
-    dpct::queue_ptr    stream);
-
 #endif // GGML_SYCL_MMVQ_HPP
@@ -4,11 +4,7 @@
 namespace utils {
 template<typename T>
 static constexpr bool is_arithmetic_v() {
-    return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half>
-#ifdef GGML_SYCL_HAS_BF16
-        || std::is_same_v<T, sycl::ext::oneapi::bfloat16>
-#endif
-        ;
+    return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half> || std::is_same_v<T, sycl::ext::oneapi::bfloat16>;
 }
 }

@@ -185,7 +181,6 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s
                stream
            );
            break;
-#ifdef GGML_SYCL_HAS_BF16
        case GGML_TYPE_BF16:
            set_rows_sycl<TIn, TIdx, sycl::ext::oneapi::bfloat16>(
                src0_d, src1_d, (char *)dst->data,
@@ -198,7 +193,6 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s
                stream
            );
            break;
-#endif
        case GGML_TYPE_Q8_0:
            set_rows_sycl_q<TIdx, block_q8_0, QK8_0, cpy_blck_f32_q8_0>(src0_d, src1_d, (block_q8_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
            break;
@@ -792,7 +792,6 @@ struct vk_device_struct {
    vk_pipeline pipeline_arange_f32;

    vk_pipeline pipeline_fill_f32;
-    vk_pipeline pipeline_fill_f16;

    vk_pipeline pipeline_geglu[2];
    vk_pipeline pipeline_reglu[2];
@@ -4578,7 +4577,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_fill_f16, "fill_f16", fill_f16_len, fill_f16_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

 #define CREATE_GLU(name)  \
    ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true);   \
@@ -9846,9 +9844,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
        if (dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_fill_f32;
        }
-        if (dst->type == GGML_TYPE_F16) {
-            return ctx->device->pipeline_fill_f16;
-        }
        return nullptr;
    default:
        return nullptr;
@@ -15718,9 +15713,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                || (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32)
                || (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16);
        case GGML_OP_ARANGE:
-            return op->type == GGML_TYPE_F32;
        case GGML_OP_FILL:
-            return op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
+            return op->type == GGML_TYPE_F32;
        case GGML_OP_SCALE:
            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_PAD:
@@ -889,7 +889,6 @@ void process_shaders() {
    string_to_spv("add1_f32_f32",   "add1.comp",        {{"A_TYPE", "float"},       {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    string_to_spv("arange_f32",     "arange.comp",      {{"A_TYPE", "float"},       {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    string_to_spv("fill_f32",       "fill.comp",        {{"D_TYPE", "float"},       {"FLOAT_TYPE", "float"}});
-    string_to_spv("fill_f16",       "fill.comp",        {{"D_TYPE", "float16_t"},   {"FLOAT_TYPE", "float"}});
    string_to_spv("step_f16",       "step.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("step_f32",       "step.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
    string_to_spv("round_f16",      "round.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
@@ -44,9 +44,18 @@
 // Matrix-vector multiplication parameters
 #define WEBGPU_MUL_MAT_VEC_WG_SIZE 256

-#define WEBGPU_MUL_MAT_VEC_FLOAT_OUTPUTS_PER_WG    4
-#define WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG 4
-#define WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG      4
+// Must be a multiple of 4 to work with the vectorized paths, and must evenly
+// divide WEBGPU_MUL_MAT_VEC_WG_SIZE
+#define WEBGPU_MUL_MAT_VEC_FLOAT_OUTPUTS_PER_WG 64
+#define WEBGPU_MUL_MAT_VEC_FLOAT_TILE_K         256
+
+#define WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG 64
+#define WEBGPU_MUL_MAT_VEC_LEGACY_Q_TILE_K         256
+
+// Requires 32 threads per output (wg_size/outputs_per_wg == 32)
+#define WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG 8
+// Requires an even number of k-quant blocks per tile (at least two)
+#define WEBGPU_MUL_MAT_VEC_K_Q_TILE_K         512

 // default size for legacy matrix multiplication
 #define WEBGPU_MUL_MAT_WG_SIZE 256
@@ -69,7 +78,6 @@ struct ggml_webgpu_shader_lib_context {
    bool     inplace                  = false;
    bool     overlap                  = false;
    bool     src_overlap              = false;
-    bool     supports_subgroups       = false;
    bool     supports_subgroup_matrix = false;
    uint32_t sg_mat_m                 = 0;
    uint32_t sg_mat_n                 = 0;
@@ -194,28 +202,6 @@ struct ggml_webgpu_row_norm_pipeline_key_hash {
    }
 };

-/** RMS_NORM + MUL **/
-
-struct ggml_webgpu_rms_norm_mul_pipeline_key {
-    bool inplace;      // rn_src == dst
-    bool overlap;      // mul_src == dst
-    bool src_overlap;  // rn_src == mul_src
-
-    bool operator==(const ggml_webgpu_rms_norm_mul_pipeline_key & other) const {
-        return inplace == other.inplace && overlap == other.overlap && src_overlap == other.src_overlap;
-    }
-};
-
-struct ggml_webgpu_rms_norm_mul_pipeline_key_hash {
-    size_t operator()(const ggml_webgpu_rms_norm_mul_pipeline_key & key) const {
-        size_t seed = 0;
-        ggml_webgpu_hash_combine(seed, key.inplace);
-        ggml_webgpu_hash_combine(seed, key.overlap);
-        ggml_webgpu_hash_combine(seed, key.src_overlap);
-        return seed;
-    }
-};
-
 /** Pad **/
 struct ggml_webgpu_pad_pipeline_key {
    bool circular;
@@ -262,46 +248,6 @@ struct ggml_webgpu_ssm_conv_pipeline_key {
    }
 };

-/** CONV 2D */
-struct ggml_webgpu_conv2d_pipeline_key {
-    ggml_type weight_type;
-    ggml_type input_type;
-    ggml_type output_type;
-
-    bool operator==(const ggml_webgpu_conv2d_pipeline_key & other) const {
-        return weight_type == other.weight_type && input_type == other.input_type && output_type == other.output_type;
-    }
-};
-
-struct ggml_webgpu_conv2d_pipeline_key_hash {
-    size_t operator()(const ggml_webgpu_conv2d_pipeline_key & key) const {
-        size_t seed = 0;
-        ggml_webgpu_hash_combine(seed, key.weight_type);
-        ggml_webgpu_hash_combine(seed, key.input_type);
-        ggml_webgpu_hash_combine(seed, key.output_type);
-        return seed;
-    }
-};
-
-/** Im2Col **/
-struct ggml_webgpu_im2col_pipeline_key {
-    ggml_type input_type;
-    ggml_type output_type;
-
-    bool operator==(const ggml_webgpu_im2col_pipeline_key & other) const {
-        return input_type == other.input_type && output_type == other.output_type;
-    }
-};
-
-struct ggml_webgpu_im2col_pipeline_key_hash {
-    size_t operator()(const ggml_webgpu_im2col_pipeline_key & key) const {
-        size_t seed = 0;
-        ggml_webgpu_hash_combine(seed, key.input_type);
-        ggml_webgpu_hash_combine(seed, key.output_type);
-        return seed;
-    }
-};
-
 /** Gated Delta Net **/
 struct ggml_webgpu_gated_delta_net_pipeline_key {
    int type;
@@ -436,27 +382,19 @@ struct ggml_webgpu_unary_pipeline_key_hash {

 /** FlashAttention */

-enum ggml_webgpu_flash_attn_path : uint32_t {
-    GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX = 0u,
-    GGML_WEBGPU_FLASH_ATTN_PATH_TILE            = 1u,
-    GGML_WEBGPU_FLASH_ATTN_PATH_VEC             = 2u,
-};
-
 struct ggml_webgpu_flash_attn_pipeline_key {
    ggml_type kv_type;
    uint32_t  head_dim_qk;
    uint32_t  head_dim_v;
    bool      kv_direct;
-    bool      kv_overlap;
    bool      has_mask;
    bool      has_sinks;
    bool      uses_logit_softcap;
-    uint32_t  path;

    bool operator==(const ggml_webgpu_flash_attn_pipeline_key & other) const {
        return kv_type == other.kv_type && head_dim_qk == other.head_dim_qk && head_dim_v == other.head_dim_v &&
-               kv_direct == other.kv_direct && kv_overlap == other.kv_overlap && has_mask == other.has_mask &&
-               has_sinks == other.has_sinks && uses_logit_softcap == other.uses_logit_softcap && path == other.path;
+               kv_direct == other.kv_direct && has_mask == other.has_mask && has_sinks == other.has_sinks &&
+               uses_logit_softcap == other.uses_logit_softcap;
    }
 };

@@ -467,70 +405,39 @@ struct ggml_webgpu_flash_attn_pipeline_key_hash {
        ggml_webgpu_hash_combine(seed, key.head_dim_qk);
        ggml_webgpu_hash_combine(seed, key.head_dim_v);
        ggml_webgpu_hash_combine(seed, key.kv_direct);
-        ggml_webgpu_hash_combine(seed, key.kv_overlap);
        ggml_webgpu_hash_combine(seed, key.has_mask);
        ggml_webgpu_hash_combine(seed, key.has_sinks);
        ggml_webgpu_hash_combine(seed, key.uses_logit_softcap);
-        ggml_webgpu_hash_combine(seed, key.path);
        return seed;
    }
 };

 struct ggml_webgpu_flash_attn_decisions {
-    uint32_t path      = GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX;
-    uint32_t q_tile    = 0;
-    uint32_t kv_tile   = 0;
-    uint32_t wg_size   = 0;
-    bool     kv_direct = false;
+    uint32_t q_tile  = 0;
+    uint32_t kv_tile = 0;
+    uint32_t wg_size = 0;
 };

-inline constexpr uint32_t GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH = 4u;
-inline constexpr uint32_t GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE       = 4u;
-
-inline uint32_t ggml_webgpu_flash_attn_pick_vec_ne(const ggml_webgpu_flash_attn_pipeline_key & key) {
-    if (key.path != GGML_WEBGPU_FLASH_ATTN_PATH_VEC || key.kv_type != GGML_TYPE_F16 ||
-        key.head_dim_qk != key.head_dim_v) {
-        return 1u;
-    }
-
-    switch (key.head_dim_qk) {
-        case 64:
-        case 192:
-        case 576:
-            return 2u;
-        case 96:
-            return 4u;
-        default:
-            return 1u;
-    }
-}
+struct ggml_webgpu_flash_attn_vec_decisions {
+    uint32_t kv_tile = 0;
+    uint32_t wg_size = 0;
+};

 inline ggml_webgpu_flash_attn_pipeline_key ggml_webgpu_flash_attn_make_pipeline_key(
-    const ggml_webgpu_shader_lib_context & context,
-    uint32_t                               path) {
+    const ggml_webgpu_shader_lib_context & context) {
    const bool has_mask  = context.src3 != nullptr;
    const bool has_sinks = context.src4 != nullptr;
-    bool       kv_direct = false;
-    if (path != GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
-        uint32_t kv_direct_align = GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH;
-        if (path == GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX) {
-            kv_direct_align = context.sg_mat_k;
-        }
-        kv_direct = (context.src1->type == GGML_TYPE_F16) &&
-                    (context.src0->ne[0] % std::max(1u, kv_direct_align) == 0) &&
-                    (context.src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD == 0);
-    }
+    // Check sg_mat_k before using it as a divisor: a zero value would make the modulo
+    // undefined. In that case kv_direct is simply left false.
+    const bool kv_direct = (context.sg_mat_k != 0) && (context.src0->ne[0] % context.sg_mat_k == 0) &&
+                           (context.src1->type == GGML_TYPE_F16) && (context.src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD == 0);

    ggml_webgpu_flash_attn_pipeline_key key = {};
    key.kv_type                             = context.src1->type;
    key.head_dim_qk                         = (uint32_t) context.src0->ne[0];
    key.head_dim_v                          = (uint32_t) context.src2->ne[0];
    key.kv_direct                           = kv_direct;
-    key.kv_overlap                          = context.src_overlap;
    key.has_mask                            = has_mask;
    key.has_sinks                           = has_sinks;
    key.uses_logit_softcap                  = ggml_get_op_params_f32(context.dst, 2) != 0.0f;
-    key.path                                = path;
    return key;
 }

@@ -593,16 +500,8 @@ inline size_t ggml_webgpu_flash_attn_wg_mem_bytes(uint32_t q_tile,

 inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_context &      context,
                                                   const ggml_webgpu_flash_attn_pipeline_key & key) {
-    const size_t limit_bytes    = context.wg_mem_limit_bytes;
-    uint32_t     q_tile         = context.sg_mat_m;
-    uint32_t     kv_granularity = context.sg_mat_n;
-    if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
-        q_tile         = GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE;
-        kv_granularity = std::max(1u, context.max_subgroup_size);
-    } else if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
-        q_tile         = 1u;
-        kv_granularity = 8u;
-    }
+    const size_t limit_bytes  = context.wg_mem_limit_bytes;
+    const size_t q_tile       = context.sg_mat_m;
    const size_t base_q_bytes = (key.head_dim_qk + key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES +
                                2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES;
    size_t bytes_per_kv = 0;
@@ -615,90 +514,23 @@ inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_
    bytes_per_kv += q_tile;
    bytes_per_kv *= GGML_WEBGPU_F16_SIZE_BYTES;
    const uint32_t max_kv_tile = (limit_bytes - base_q_bytes) / bytes_per_kv;
-    return (max_kv_tile / kv_granularity) * kv_granularity;
+    return (max_kv_tile / context.sg_mat_n) * context.sg_mat_n;
 }

-inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
-    const ggml_webgpu_shader_lib_context & context,
-    size_t                                 storage_offset_alignment) {
-    ggml_webgpu_flash_attn_decisions decisions = {};
-    const size_t                     alignment = std::max<size_t>(1u, storage_offset_alignment);
-    const auto *                     K         = context.src1;
-    const auto *                     V         = context.src2;
-    GGML_ASSERT(K != nullptr);
-    GGML_ASSERT(V != nullptr);
+inline uint32_t ggml_webgpu_flash_attn_vec_get_kv_tile(const ggml_webgpu_shader_lib_context & context) {
+    const ggml_webgpu_flash_attn_pipeline_key key         = ggml_webgpu_flash_attn_make_pipeline_key(context);
+    const uint32_t                            min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
+    uint32_t                                  kv_tile     = std::max(context.sg_mat_n, std::min(32u, min_kv_tile));
+    kv_tile                                               = (kv_tile / context.sg_mat_n) * context.sg_mat_n;

-    const auto flash_attn_tensor_offset = [](const ggml_tensor * tensor) -> size_t {
-        constexpr uintptr_t ptr_base_addr = 0x1000u;
-        const ggml_tensor * base          = tensor->view_src != nullptr ? tensor->view_src : tensor;
-        return reinterpret_cast<uintptr_t>(base->data) - ptr_base_addr + tensor->view_offs;
-    };
-
-    const uint32_t k_offset_elems =
-        (uint32_t) ((flash_attn_tensor_offset(K) & (alignment - 1)) / ggml_type_size(K->type));
-    const uint32_t v_offset_elems =
-        (uint32_t) ((flash_attn_tensor_offset(V) & (alignment - 1)) / ggml_type_size(V->type));
-    const bool f16_vec4_aligned = (k_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u) &&
-                                  (v_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u);
-    const bool kv_vec_type_supported =
-        K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0;
-    const bool use_vec = context.supports_subgroups && (context.src0->ne[1] < 20) && (context.src0->ne[0] % 32 == 0) &&
-                         (context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
-                         kv_vec_type_supported && (K->type != GGML_TYPE_F16 || f16_vec4_aligned) &&
-                         (context.src2->type == K->type);
-    const bool use_tile = context.supports_subgroups && !context.supports_subgroup_matrix && K->type == GGML_TYPE_F16 &&
-                          V->type == GGML_TYPE_F16 && f16_vec4_aligned &&
-                          (context.src0->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
-                          (context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) && !use_vec;
-
-    decisions.path = use_vec  ? GGML_WEBGPU_FLASH_ATTN_PATH_VEC :
-                     use_tile ? GGML_WEBGPU_FLASH_ATTN_PATH_TILE :
-                                GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX;
-
-    const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context, decisions.path);
-    decisions.kv_direct                           = key.kv_direct;
-
-    if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
-        const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
-        decisions.q_tile           = 1u;
-        decisions.kv_tile          = std::max(8u, std::min(32u, min_kv_tile));
-        decisions.kv_tile          = (decisions.kv_tile / 8u) * 8u;
-        decisions.wg_size          = std::max(1u, std::min<uint32_t>(32u, context.max_subgroup_size));
-        if (decisions.kv_direct) {
-            decisions.kv_tile = std::min(decisions.kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
-            while (GGML_WEBGPU_KV_SEQ_PAD % decisions.kv_tile != 0) {
-                decisions.kv_tile -= 8u;
-            }
-        }
-        return decisions;
-    }
-
-    decisions.q_tile =
-        decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ? GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE : context.sg_mat_m;
-    decisions.kv_tile = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ?
-                            std::min(64u, ggml_webgpu_flash_attn_max_kv_tile(context, key)) :
-                            std::min(ggml_webgpu_flash_attn_max_kv_tile(context, key),
-                                     context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
-    decisions.wg_size = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ?
-                            GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE :
-                            std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE);
-
-    if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
-        const uint32_t tile_kv_granularity = std::max(1u, context.max_subgroup_size);
-        decisions.kv_tile =
-            std::max(tile_kv_granularity, (decisions.kv_tile / tile_kv_granularity) * tile_kv_granularity);
-    }
-
-    if (decisions.kv_direct) {
-        GGML_ASSERT(decisions.kv_tile <= GGML_WEBGPU_KV_SEQ_PAD);
-        while (GGML_WEBGPU_KV_SEQ_PAD % decisions.kv_tile != 0) {
-            decisions.kv_tile -= decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ?
-                                     std::max(1u, context.max_subgroup_size) :
-                                     context.sg_mat_n;
+    if (key.kv_direct) {
+        kv_tile = std::min(kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
+        while (GGML_WEBGPU_KV_SEQ_PAD % kv_tile != 0) {
+            kv_tile -= context.sg_mat_n;
        }
    }

-    return decisions;
+    return kv_tile;
 }

 /** Matrix Multiplication **/
@@ -743,6 +575,7 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key_hash {

 struct ggml_webgpu_mul_mat_vec_shader_decisions {
    uint32_t wg_size;
+    uint32_t tile_k;
    uint32_t outputs_per_wg;
    uint32_t vec_size;
 };
@@ -910,17 +743,16 @@ class ggml_webgpu_shader_lib {
    std::unordered_map<int, webgpu_pipeline> cumsum_pipelines;         // key is fixed, no variants yet
    std::unordered_map<ggml_webgpu_row_norm_pipeline_key, webgpu_pipeline, ggml_webgpu_row_norm_pipeline_key_hash>
        row_norm_pipelines;                                            // op/inplace
-
    std::unordered_map<ggml_webgpu_get_rows_pipeline_key, webgpu_pipeline, ggml_webgpu_get_rows_pipeline_key_hash>
-        get_rows_pipelines;   // src_type, vectorized
+        get_rows_pipelines;                                            // src_type, vectorized
    std::unordered_map<ggml_webgpu_unary_pipeline_key, webgpu_pipeline, ggml_webgpu_unary_pipeline_key_hash>
-        unary_pipelines;      // type/op/inplace
+        unary_pipelines;                                               // type/op/inplace
    std::unordered_map<ggml_webgpu_scale_pipeline_key, webgpu_pipeline, ggml_webgpu_scale_pipeline_key_hash>
-        scale_pipelines;      // inplace
+        scale_pipelines;                                               // inplace
    std::unordered_map<ggml_webgpu_solve_tri_pipeline_key, webgpu_pipeline, ggml_webgpu_solve_tri_pipeline_key_hash>
-        solve_tri_pipelines;  // type
+        solve_tri_pipelines;                                           // type
    std::unordered_map<ggml_webgpu_ssm_conv_pipeline_key, webgpu_pipeline, ggml_webgpu_ssm_conv_pipeline_key_hash>
-        ssm_conv_pipelines;   // type/vectorized
+        ssm_conv_pipelines;                                            // type/vectorized
    std::unordered_map<ggml_webgpu_gated_delta_net_pipeline_key,
                       webgpu_pipeline,
                       ggml_webgpu_gated_delta_net_pipeline_key_hash>
@@ -935,6 +767,8 @@ class ggml_webgpu_shader_lib {
        repeat_pipelines;           // type
    std::unordered_map<ggml_webgpu_flash_attn_pipeline_key, webgpu_pipeline, ggml_webgpu_flash_attn_pipeline_key_hash>
        flash_attn_pipelines;
+    std::unordered_map<ggml_webgpu_flash_attn_pipeline_key, webgpu_pipeline, ggml_webgpu_flash_attn_pipeline_key_hash>
+        flash_attn_vec_pipelines;
    std::unordered_map<ggml_webgpu_flash_attn_vec_reduce_pipeline_key,
                       webgpu_pipeline,
                       ggml_webgpu_flash_attn_vec_reduce_pipeline_key_hash>
@@ -964,15 +798,6 @@ class ggml_webgpu_shader_lib {
        rope_pipelines;
    std::unordered_map<ggml_webgpu_soft_max_pipeline_key, webgpu_pipeline, ggml_webgpu_soft_max_pipeline_key_hash>
        soft_max_pipelines;
-    std::unordered_map<ggml_webgpu_conv2d_pipeline_key, webgpu_pipeline, ggml_webgpu_conv2d_pipeline_key_hash>
-        conv2d_pipelines;
-    std::unordered_map<ggml_webgpu_im2col_pipeline_key, webgpu_pipeline, ggml_webgpu_im2col_pipeline_key_hash>
-        im2col_pipelines;
-
-    std::unordered_map<ggml_webgpu_rms_norm_mul_pipeline_key,
-                       webgpu_pipeline,
-                       ggml_webgpu_rms_norm_mul_pipeline_key_hash>
-        rms_norm_mul_pipelines;

  public:
    ggml_webgpu_shader_lib(wgpu::Device device) { this->device = device; }
@@ -1501,7 +1326,7 @@ class ggml_webgpu_shader_lib {
        ggml_webgpu_mul_mat_vec_pipeline_key key = {};
        key.src0_type                            = context.src0->type;
        key.src1_type                            = context.src1->type;
-        key.vectorized                           = (context.src0->ne[0] % 4 == 0 &&
+        key.vectorized                           = (context.src0->ne[0] % 4 == 0 && context.dst->ne[0] % 4 == 0 &&
                          (context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
                                                       1 :
                                                       0;
@@ -1512,8 +1337,7 @@ class ggml_webgpu_shader_lib {
        }

        std::vector<std::string> defines;
-        std::string              variant    = "mul_mat_vec";
-        const char *             shader_src = wgsl_mul_mat_vec;
+        std::string              variant = "mul_mat_vec";

        // src0 type (matrix row)
        switch (context.src0->type) {
@@ -1562,25 +1386,25 @@ class ggml_webgpu_shader_lib {
        defines.push_back(key.vectorized ? "VEC" : "SCALAR");

        uint32_t wg_size        = WEBGPU_MUL_MAT_VEC_WG_SIZE;
+        uint32_t tile_k         = WEBGPU_MUL_MAT_VEC_FLOAT_TILE_K;
        uint32_t outputs_per_wg = WEBGPU_MUL_MAT_VEC_FLOAT_OUTPUTS_PER_WG;

        if (key.src0_type >= GGML_TYPE_Q2_K) {
+            tile_k         = WEBGPU_MUL_MAT_VEC_K_Q_TILE_K;
            outputs_per_wg = WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG;
        } else if (key.src0_type >= GGML_TYPE_Q4_0) {
+            tile_k         = WEBGPU_MUL_MAT_VEC_LEGACY_Q_TILE_K;
            outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG;
        }

        defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
+        defines.push_back(std::string("TILE_K=") + std::to_string(tile_k));
        defines.push_back(std::string("OUTPUTS_PER_WG=") + std::to_string(outputs_per_wg));
-        defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
-        variant += context.supports_subgroups ? "_sg_reduce" : "_wg_reduce";
-        if (key.vectorized) {
-            variant += "_vectorized";
-        }

-        auto processed            = preprocessor.preprocess(shader_src, defines);
+        auto processed            = preprocessor.preprocess(wgsl_mul_mat_vec, defines);
        auto decisions            = std::make_shared<ggml_webgpu_mul_mat_vec_shader_decisions>();
        decisions->wg_size        = wg_size;
+        decisions->tile_k         = tile_k;
        decisions->outputs_per_wg = outputs_per_wg;
        decisions->vec_size       = key.vectorized ? 4 : 1;

@@ -1989,43 +1813,6 @@ class ggml_webgpu_shader_lib {
        return unary_pipelines[key];
    }

-    webgpu_pipeline get_rms_norm_mul_pipeline(const ggml_webgpu_shader_lib_context & context) {
-        ggml_webgpu_rms_norm_mul_pipeline_key key = {};
-        key.inplace                               = context.inplace;
-        key.overlap                               = context.overlap;
-        key.src_overlap                           = context.src_overlap;
-
-        auto it = rms_norm_mul_pipelines.find(key);
-        if (it != rms_norm_mul_pipelines.end()) {
-            return it->second;
-        }
-
-        std::vector<std::string> defines;
-        std::string              op_name = "RMS_NORM_MUL";
-        std::string              variant = op_name;
-
-        if (key.inplace) {
-            defines.push_back("INPLACE");
-            variant += "_inplace";
-        } else if (key.overlap) {
-            defines.push_back("OVERLAP");
-            variant += "_overlap";
-        } else if (key.src_overlap) {
-            defines.push_back("SRC_OVERLAP");
-            variant += "_src_overlap";
-        }
-
-        defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
-
-        auto processed              = preprocessor.preprocess(wgsl_rms_norm_mul, defines);
-        auto decisions              = std::make_shared<ggml_webgpu_generic_shader_decisions>();
-        decisions->wg_size          = context.max_wg_size;
-        webgpu_pipeline pipeline    = ggml_webgpu_create_pipeline(device, processed, variant);
-        pipeline.context            = decisions;
-        rms_norm_mul_pipelines[key] = pipeline;
-        return rms_norm_mul_pipelines[key];
-    }
-
    webgpu_pipeline get_binary_pipeline(const ggml_webgpu_shader_lib_context & context) {
        ggml_webgpu_binary_pipeline_key key = {};
        key.type                            = context.dst->type;
@@ -2156,19 +1943,14 @@ class ggml_webgpu_shader_lib {
        return repeat_pipelines[key];
    }

-    webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_shader_lib_context & context,
-                                            size_t                                 storage_offset_alignment) {
-        const ggml_webgpu_flash_attn_decisions decisions =
-            ggml_webgpu_flash_attn_get_decisions(context, storage_offset_alignment);
-        ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context, decisions.path);
-        auto                                it  = flash_attn_pipelines.find(key);
+    webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_shader_lib_context & context) {
+        const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
+        auto                                      it  = flash_attn_pipelines.find(key);
        if (it != flash_attn_pipelines.end()) {
            return it->second;
        }
        std::vector<std::string> defines;
-        std::string              variant = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC  ? "flash_attn_vec" :
-                                           decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ? "flash_attn_tile" :
-                                                                                                "flash_attn";
+        std::string              variant = "flash_attn";

        switch (key.kv_type) {
            case GGML_TYPE_F32:
@@ -2190,12 +1972,7 @@ class ggml_webgpu_shader_lib {

        if (key.has_mask) {
            defines.push_back("MASK");
-            if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
-                defines.push_back("BLK");
-                variant += "_mask_blk";
-            } else {
-                variant += "_mask";
-            }
+            variant += "_mask";
        }
        if (key.has_sinks) {
            defines.push_back("SINKS");
@@ -2209,9 +1986,88 @@ class ggml_webgpu_shader_lib {
            defines.push_back("KV_DIRECT");
            variant += "_kvdirect";
        }
-        if (key.kv_overlap) {
-            defines.push_back("KV_OVERLAP");
-            variant += "_kv_overlap";
+
+        defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(key.head_dim_qk));
+        variant += std::string("_hsqk") + std::to_string(key.head_dim_qk);
+
+        defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v));
+        variant += std::string("_hsv") + std::to_string(key.head_dim_v);
+
+        defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
+        defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
+        defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
+
+        auto decisions    = std::make_shared<ggml_webgpu_flash_attn_decisions>();
+        decisions->q_tile = context.sg_mat_m;
+
+        const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
+        uint32_t       kv_tile = std::min(min_kv_tile, context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
+
+        if (key.kv_direct) {
+            kv_tile = std::min(kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
+            while (GGML_WEBGPU_KV_SEQ_PAD % kv_tile != 0) {
+                kv_tile -= context.sg_mat_n;
+            }
+        }
+
+        decisions->kv_tile = kv_tile;
+        decisions->wg_size = std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE);
+
+        defines.push_back(std::string("Q_TILE=") + std::to_string(decisions->q_tile));
+        defines.push_back(std::string("KV_TILE=") + std::to_string(decisions->kv_tile));
+        defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions->wg_size));
+
+        webgpu_pipeline pipeline =
+            ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn, defines), variant);
+        pipeline.context          = decisions;
+        flash_attn_pipelines[key] = pipeline;
+        return flash_attn_pipelines[key];
+    }
+
+    webgpu_pipeline get_flash_attn_vec_pipeline(const ggml_webgpu_shader_lib_context & context) {
+        const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
+        auto                                      it  = flash_attn_vec_pipelines.find(key);
+        if (it != flash_attn_vec_pipelines.end()) {
+            return it->second;
+        }
+
+        std::vector<std::string> defines;
+        std::string              variant = "flash_attn_vec";
+
+        switch (key.kv_type) {
+            case GGML_TYPE_F32:
+                defines.push_back("KV_F32");
+                break;
+            case GGML_TYPE_F16:
+                defines.push_back("KV_F16");
+                break;
+            case GGML_TYPE_Q4_0:
+                defines.push_back("KV_Q4_0");
+                break;
+            case GGML_TYPE_Q8_0:
+                defines.push_back("KV_Q8_0");
+                break;
+            default:
+                GGML_ABORT("Unsupported KV type for flash attention shader");
+        }
+        variant += std::string("_") + ggml_type_name(key.kv_type);
+
+        if (key.has_mask) {
+            defines.push_back("MASK");
+            defines.push_back("BLK");
+            variant += "_mask_blk";
+        }
+        if (key.has_sinks) {
+            defines.push_back("SINKS");
+            variant += "_sinks";
+        }
+        if (key.uses_logit_softcap) {
+            defines.push_back("LOGIT_SOFTCAP");
+            variant += "_lgsc";
+        }
+        if (key.kv_direct) {
+            defines.push_back("KV_DIRECT");
+            variant += "_kvdirect";
        }

        defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(key.head_dim_qk));
@@ -2220,37 +2076,46 @@ class ggml_webgpu_shader_lib {
        defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v));
        variant += std::string("_hsv") + std::to_string(key.head_dim_v);

-        const char * shader_src = wgsl_flash_attn;
-        if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
-            defines.push_back("KV_GRANULARITY=8");
-            defines.push_back(std::string("VEC_NE=") + std::to_string(ggml_webgpu_flash_attn_pick_vec_ne(key)) + "u");
-            shader_src = wgsl_flash_attn_vec_split;
-        } else if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
-            shader_src = wgsl_flash_attn_tile;
-            defines.push_back("MAX_SUBGROUP_SIZE=" + std::to_string(context.max_subgroup_size));
-            defines.push_back("KV_STAGE_STRIDE=" + std::to_string(std::max(key.head_dim_qk, key.head_dim_v)));
-            variant += "_tile";
-        } else {
-            defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
-            defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
-            defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
+        defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
+        defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
+        defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
+        defines.push_back("Q_TILE=1");
+
+        auto decisions     = std::make_shared<ggml_webgpu_flash_attn_vec_decisions>();
+        decisions->kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(context);
+        decisions->wg_size = std::max(1u, std::min<uint32_t>(32u, context.max_subgroup_size));
+        uint32_t vec_ne    = 1u;
+
+        // Keep conservative defaults unless this is the f16 vec-split shape family.
+        if (key.kv_type == GGML_TYPE_F16 && key.head_dim_qk == key.head_dim_v) {
+            switch (key.head_dim_qk) {
+                case 64:
+                case 192:
+                case 576:
+                    vec_ne = 2u;
+                    break;
+                case 96:
+                    vec_ne = 4u;
+                    break;
+                default:
+                    break;
+            }
        }

-        auto pipeline_decisions = std::make_shared<ggml_webgpu_flash_attn_decisions>(decisions);
-        defines.push_back(std::string("Q_TILE=") + std::to_string(decisions.q_tile));
-        defines.push_back(std::string("KV_TILE=") + std::to_string(decisions.kv_tile));
-        defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions.wg_size));
+        defines.push_back(std::string("KV_TILE=") + std::to_string(decisions->kv_tile));
+        defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions->wg_size));
+        defines.push_back(std::string("VEC_NE=") + std::to_string(vec_ne) + "u");

        webgpu_pipeline pipeline =
-            ggml_webgpu_create_pipeline(device, preprocessor.preprocess(shader_src, defines), variant);
-        pipeline.context          = pipeline_decisions;
-        flash_attn_pipelines[key] = pipeline;
-        return flash_attn_pipelines[key];
+            ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn_vec_split, defines), variant);
+        pipeline.context              = decisions;
+        flash_attn_vec_pipelines[key] = pipeline;
+        return flash_attn_vec_pipelines[key];
    }

-    webgpu_pipeline get_flash_attn_blk_pipeline(const ggml_webgpu_shader_lib_context & context, uint32_t kv_tile) {
+    webgpu_pipeline get_flash_attn_blk_pipeline(const ggml_webgpu_shader_lib_context & context) {
        ggml_webgpu_flash_attn_blk_pipeline_key key = {};
-        key.kv_tile                                 = kv_tile;
+        key.kv_tile                                 = ggml_webgpu_flash_attn_vec_get_kv_tile(context);
        auto it                                     = flash_attn_blk_pipelines.find(key);
        if (it != flash_attn_blk_pipelines.end()) {
            return it->second;
@@ -2525,84 +2390,6 @@ class ggml_webgpu_shader_lib {
        return soft_max_pipelines[key];
    }

-    webgpu_pipeline get_conv2d_pipeline(const ggml_webgpu_shader_lib_context & context) {
-        ggml_webgpu_conv2d_pipeline_key key = {};
-        key.weight_type                     = context.src0->type;
-        key.input_type                      = context.src1->type;
-        key.output_type                     = context.dst->type;
-
-        auto it = conv2d_pipelines.find(key);
-        if (it != conv2d_pipelines.end()) {
-            return it->second;
-        }
-
-        std::vector<std::string> defines;
-        std::string              variant = "conv_2d";
-
-        auto push_type_defines = [&](const char * prefix, ggml_type type) {
-            std::string s_prefix = prefix;
-            if (type == GGML_TYPE_F32) {
-                defines.push_back(s_prefix + "_F32");
-            } else if (type == GGML_TYPE_F16) {
-                defines.push_back(s_prefix + "_F16");
-            } else {
-                GGML_ABORT("Unsupported type for CONV_2D shader");
-            }
-        };
-
-        push_type_defines("WEIGHT", key.weight_type);
-        push_type_defines("INPUT", key.input_type);
-        push_type_defines("OUTPUT", key.output_type);
-
-        defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
-
-        auto processed           = preprocessor.preprocess(wgsl_conv2d, defines);
-        auto decisions           = std::make_shared<ggml_webgpu_generic_shader_decisions>();
-        decisions->wg_size       = context.max_wg_size;
-        webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
-        pipeline.context         = decisions;
-        conv2d_pipelines[key]    = pipeline;
-        return conv2d_pipelines[key];
-    }
-
-    webgpu_pipeline get_im2col_pipeline(const ggml_webgpu_shader_lib_context & context) {
-        ggml_webgpu_im2col_pipeline_key key = {};
-        key.input_type                      = context.src1->type;
-        key.output_type                     = context.dst->type;
-
-        auto it = im2col_pipelines.find(key);
-        if (it != im2col_pipelines.end()) {
-            return it->second;
-        }
-
-        std::vector<std::string> defines;
-        std::string              variant = "im2col";
-
-        auto push_type_defines = [&](const char * prefix, ggml_type type) {
-            std::string s_prefix = prefix;
-            if (type == GGML_TYPE_F32) {
-                defines.push_back(s_prefix + "_F32");
-            } else if (type == GGML_TYPE_F16) {
-                defines.push_back(s_prefix + "_F16");
-            } else {
-                GGML_ABORT("Unsupported type for IM2COL shader");
-            }
-        };
-
-        push_type_defines("INPUT", key.input_type);
-        push_type_defines("OUTPUT", key.output_type);
-
-        defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
-
-        auto processed           = preprocessor.preprocess(wgsl_im2col, defines);
-        auto decisions           = std::make_shared<ggml_webgpu_generic_shader_decisions>();
-        decisions->wg_size       = context.max_wg_size;
-        webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
-        pipeline.context         = decisions;
-        im2col_pipelines[key]    = pipeline;
-        return im2col_pipelines[key];
-    }
-
  private:
    static webgpu_pipeline ggml_webgpu_create_pipeline(wgpu::Device & device,
                                                       std::string    shader_code,
@@ -8,7 +8,6 @@
 #include "ggml-backend-impl.h"
 #include "ggml-impl.h"
 #include "ggml-webgpu-shader-lib.hpp"
-#include "ggml.h"

 #ifdef __EMSCRIPTEN__
 #    include <emscripten/emscripten.h>
@@ -182,7 +181,6 @@ struct webgpu_dispatch_desc {

 struct webgpu_capabilities {
    wgpu::Limits limits;
-    bool         supports_subgroups       = false;
    bool         supports_subgroup_matrix = false;

    uint32_t sg_mat_m = 0;
@@ -212,7 +210,6 @@ struct webgpu_global_context_struct {
    wgpu::Buffer    memset_params_buf;
    webgpu_pipeline memset_pipeline;

-    // TODO: We should rework the CPU profiling time handling to make it more useful. ref: https://github.com/ggml-org/llama.cpp/pull/22050
 #ifdef GGML_WEBGPU_CPU_PROFILE
    // Profiling: labeled CPU time in ms (total)
    std::unordered_map<std::string, double> cpu_time_ms;
@@ -220,6 +217,11 @@ struct webgpu_global_context_struct {
    std::unordered_map<std::string, double> cpu_detail_ms;
 #endif

+#ifdef GGML_WEBGPU_GPU_PROFILE
+    // Profiling: per-shader GPU time in ms
+    std::unordered_map<std::string, double> shader_gpu_time_ms;
+#endif
+
 #ifdef GGML_WEBGPU_DEBUG
    wgpu::Buffer debug_host_buf;
    wgpu::Buffer debug_dev_buf;
@@ -265,12 +267,10 @@ struct webgpu_context_struct {
    size_t memset_bytes_per_thread;

 #ifdef GGML_WEBGPU_GPU_PROFILE
-    // Profiling: per-shader GPU time in ms
-    std::unordered_map<std::string, double> shader_gpu_time_ms;
-    wgpu::Buffer                            profile_timestamp_dev_buf;
-    wgpu::Buffer                            profile_timestamp_host_buf;
-    wgpu::QuerySet                          profile_timestamp_query_set;
-    uint32_t                                profile_timestamp_query_count = 0;
+    wgpu::Buffer   profile_timestamp_dev_buf;
+    wgpu::Buffer   profile_timestamp_host_buf;
+    wgpu::QuerySet profile_timestamp_query_set;
+    uint32_t       profile_timestamp_query_count = 0;
 #endif

    ~webgpu_context_struct() {
@@ -389,6 +389,23 @@ static size_t ggml_webgpu_tensor_misalignment(webgpu_context & ctx, const ggml_t
    return offset & (ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment - 1);
 }

+static bool ggml_webgpu_flash_attn_use_vec(webgpu_global_context & global_ctx,
+                                           const ggml_tensor *     Q,
+                                           const ggml_tensor *     K,
+                                           const ggml_tensor *     V) {
+    const size_t   alignment = global_ctx->capabilities.limits.minStorageBufferOffsetAlignment;
+    const uint32_t k_offset_elems =
+        (uint32_t) ((ggml_webgpu_tensor_offset(K) & (alignment - 1)) / ggml_type_size(K->type));
+    const uint32_t v_offset_elems =
+        (uint32_t) ((ggml_webgpu_tensor_offset(V) & (alignment - 1)) / ggml_type_size(V->type));
+    const bool f16_vec4_aligned = (k_offset_elems % 4u == 0u) && (v_offset_elems % 4u == 0u);
+    const bool kv_vec_type_supported =
+        K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0;
+
+    return (Q->ne[1] < 20) && (Q->ne[0] % 32 == 0) && (V->ne[0] % 4 == 0) && kv_vec_type_supported &&
+           (K->type != GGML_TYPE_F16 || f16_vec4_aligned) && (V->type == K->type);
+}
+
 static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, const ggml_tensor * t) {
    size_t offset = ggml_webgpu_tensor_offset(t);
    return offset & ~(ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment - 1);
@@ -695,12 +712,12 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) {
 #ifdef GGML_WEBGPU_GPU_PROFILE
    std::cout << "\n[ggml_webgpu gpu profiling summary]\n";
    double total_gpu = 0.0;
-    for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) {
+    for (const auto & kv : ctx->webgpu_ctx->global_ctx->shader_gpu_time_ms) {
        total_gpu += kv.second;
    }
    std::cout << "ggml_webgpu: total gpu time (all shaders): " << total_gpu << " ms\n";
    std::cout << "\nggml_webgpu: gpu breakdown:\n";
-    for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) {
+    for (const auto & kv : ctx->webgpu_ctx->global_ctx->shader_gpu_time_ms) {
        double pct = (total_gpu > 0.0) ? (kv.second / total_gpu * 100.0) : 0.0;
        std::cout << "ggml_webgpu:  " << kv.first << ": " << kv.second << " ms (" << std::fixed << std::setprecision(2)
                  << pct << "%)\n";
@@ -905,170 +922,6 @@ static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context & ctx,
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }

-static webgpu_encoded_op ggml_webgpu_conv_2d(webgpu_context & ctx,
-                                             ggml_tensor *    src0,
-                                             ggml_tensor *    src1,
-                                             ggml_tensor *    dst) {
-    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
-    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
-    const int32_t p0 = ggml_get_op_params_i32(dst, 2);
-    const int32_t p1 = ggml_get_op_params_i32(dst, 3);
-    const int32_t d0 = ggml_get_op_params_i32(dst, 4);
-    const int32_t d1 = ggml_get_op_params_i32(dst, 5);
-
-    std::vector<uint32_t> params = {
-        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
-        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
-        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
-
-        (uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
-        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
-        (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
-        (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
-
-        (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
-        (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
-        (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
-        (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
-
-        (uint32_t) (dst->nb[0] / ggml_type_size(dst->type)),
-        (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
-        (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
-        (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
-
-        (uint32_t) src0->ne[0],
-        (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2],
-
-        (uint32_t) src1->ne[0],
-        (uint32_t) src1->ne[1],
-
-        (uint32_t) dst->ne[0],
-        (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3],
-
-        (uint32_t) s0,
-        (uint32_t) s1,
-        (uint32_t) p0,
-        (uint32_t) p1,
-        (uint32_t) d0,
-        (uint32_t) d1,
-    };
-
-    std::vector<wgpu::BindGroupEntry> entries = {
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0),
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1),
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst),
-    };
-
-    ggml_webgpu_shader_lib_context shader_lib_ctx = {};
-    shader_lib_ctx.src0                           = src0;
-    shader_lib_ctx.src1                           = src1;
-    shader_lib_ctx.dst                            = dst;
-    shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
-
-    webgpu_pipeline pipeline = ctx->shader_lib->get_conv2d_pipeline(shader_lib_ctx);
-
-    auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
-
-    uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
-    uint32_t wg_x     = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
-    uint32_t wg_y     = CEIL_DIV(total_wg, wg_x);
-
-    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
-}
-
-static webgpu_encoded_op ggml_webgpu_im2col(webgpu_context & ctx,
-                                            ggml_tensor *    src0,
-                                            ggml_tensor *    src1,
-                                            ggml_tensor *    dst) {
-    const int32_t s0    = ggml_get_op_params_i32(dst, 0);
-    const int32_t s1    = ggml_get_op_params_i32(dst, 1);
-    const int32_t p0    = ggml_get_op_params_i32(dst, 2);
-    const int32_t p1    = ggml_get_op_params_i32(dst, 3);
-    const int32_t d0    = ggml_get_op_params_i32(dst, 4);
-    const int32_t d1    = ggml_get_op_params_i32(dst, 5);
-    const bool    is_2D = ggml_get_op_params_i32(dst, 6) == 1;
-
-    const uint32_t KW = src0->ne[0];
-    const uint32_t KH = is_2D ? src0->ne[1] : 1;
-    const uint32_t IC = is_2D ? src0->ne[2] : src0->ne[1];
-
-    const uint32_t IW = src1->ne[0];
-    const uint32_t IH = is_2D ? src1->ne[1] : 1;
-    const uint32_t N  = is_2D ? src1->ne[3] : src1->ne[2];
-
-    const uint32_t OW = dst->ne[1];
-    const uint32_t OH = is_2D ? dst->ne[2] : 1;
-
-    const uint32_t si0 = (uint32_t) (src1->nb[0] / ggml_type_size(src1->type));
-    const uint32_t si1 = is_2D ? (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)) : 0;
-    const uint32_t si2 = is_2D ? (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)) :
-                                 (uint32_t) (src1->nb[1] / ggml_type_size(src1->type));
-    const uint32_t si3 = is_2D ? (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)) :
-                                 (uint32_t) (src1->nb[2] / ggml_type_size(src1->type));
-
-    const uint32_t so0 = (uint32_t) (dst->nb[0] / ggml_type_size(dst->type));
-    const uint32_t so1 = (uint32_t) (dst->nb[1] / ggml_type_size(dst->type));
-    const uint32_t so2 = is_2D ? (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)) : 0;
-    const uint32_t so3 = is_2D ? (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)) :
-                                 (uint32_t) (dst->nb[2] / ggml_type_size(dst->type));
-
-    std::vector<uint32_t> params = {
-        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
-        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
-
-        si0,
-        si1,
-        si2,
-        si3,
-        so0,
-        so1,
-        so2,
-        so3,
-
-        KW,
-        KH,
-        IC,
-
-        IW,
-        IH,
-        N,
-
-        OW,
-        OH,
-
-        (uint32_t) s0,
-        (uint32_t) s1,
-        (uint32_t) p0,
-        (uint32_t) p1,
-        (uint32_t) d0,
-        (uint32_t) d1,
-    };
-
-    std::vector<wgpu::BindGroupEntry> entries = {
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src1),
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst),
-    };
-
-    ggml_webgpu_shader_lib_context shader_lib_ctx = {};
-    shader_lib_ctx.src0                           = src0;
-    shader_lib_ctx.src1                           = src1;
-    shader_lib_ctx.dst                            = dst;
-    shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
-
-    webgpu_pipeline pipeline = ctx->shader_lib->get_im2col_pipeline(shader_lib_ctx);
-
-    auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
-
-    uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
-    uint32_t wg_x     = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
-    uint32_t wg_y     = CEIL_DIV(total_wg, wg_x);
-
-    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
-}
-
 static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context & ctx,
                                              ggml_tensor *    src0,
                                              ggml_tensor *    src1,
@@ -1311,11 +1164,14 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
                case GGML_TYPE_Q8_0:
                case GGML_TYPE_Q8_1:
                case GGML_TYPE_Q6_K:
+                    use_fast = true;
+                    break;
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
                case GGML_TYPE_Q4_K:
                case GGML_TYPE_Q5_K:
-                case GGML_TYPE_Q3_K:
-                case GGML_TYPE_Q2_K:
-                    use_fast = true;
+                    // we don't have fast mat-vec for these types, but we do have (semi) fast mat-mat
+                    use_fast = !is_vec;
                    break;
                default:
                    break;
@@ -1326,12 +1182,10 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
    }

    ggml_webgpu_shader_lib_context shader_lib_ctx = {};
-
-    shader_lib_ctx.src0                     = src0;
-    shader_lib_ctx.src1                     = src1;
-    shader_lib_ctx.dst                      = dst;
+    shader_lib_ctx.src0                           = src0;
+    shader_lib_ctx.src1                           = src1;
+    shader_lib_ctx.dst                            = dst;
    shader_lib_ctx.max_wg_size              = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
-    shader_lib_ctx.supports_subgroups       = ctx->global_ctx->capabilities.supports_subgroups;
    shader_lib_ctx.supports_subgroup_matrix = ctx->global_ctx->capabilities.supports_subgroup_matrix;
    shader_lib_ctx.sg_mat_m                 = ctx->global_ctx->capabilities.sg_mat_m;
    shader_lib_ctx.sg_mat_n                 = ctx->global_ctx->capabilities.sg_mat_n;
@@ -1433,8 +1287,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
    shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;

    // Get or create pipeline
-    webgpu_pipeline gather_pipeline;
-    webgpu_pipeline main_pipeline;
+    webgpu_pipeline gather_pipeline, main_pipeline;

    std::vector<webgpu_dispatch_desc> dispatches;

@@ -1550,6 +1403,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
    return ggml_backend_webgpu_build_multi(ctx, dispatches);
 }

+#ifndef __EMSCRIPTEN__
 static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
                                                ggml_tensor *    Q,
                                                ggml_tensor *    K,
@@ -1567,29 +1421,13 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
    float m0          = powf(2.0f, -(max_bias) / n_head_log2);
    float m1          = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

-    const int  has_mask   = (mask != nullptr);
-    const int  has_sinks  = (sinks != nullptr);
-    const bool kv_overlap = ggml_webgpu_tensor_overlap(K, V) && K->type == V->type;
-
-    uint32_t offset_k       = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, K) / ggml_type_size(K->type));
-    uint32_t offset_v       = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, V) / ggml_type_size(V->type));
-    size_t   kv_bind_offset = 0;
-    size_t   kv_bind_size   = 0;
-    if (kv_overlap) {
-        const size_t k_bind_offset = ggml_webgpu_tensor_align_offset(ctx, K);
-        const size_t v_bind_offset = ggml_webgpu_tensor_align_offset(ctx, V);
-        const size_t k_bind_end    = k_bind_offset + ggml_webgpu_tensor_binding_size(ctx, K);
-        const size_t v_bind_end    = v_bind_offset + ggml_webgpu_tensor_binding_size(ctx, V);
-        kv_bind_offset             = std::min(k_bind_offset, v_bind_offset);
-        kv_bind_size               = std::max(k_bind_end, v_bind_end) - kv_bind_offset;
-        offset_k = (uint32_t) ((ggml_webgpu_tensor_offset(K) - kv_bind_offset) / ggml_type_size(K->type));
-        offset_v = (uint32_t) ((ggml_webgpu_tensor_offset(V) - kv_bind_offset) / ggml_type_size(V->type));
-    }
+    const int has_mask  = (mask != nullptr);
+    const int has_sinks = (sinks != nullptr);

    std::vector<uint32_t> params = {
        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, Q) / ggml_type_size(Q->type)),
-        offset_k,
-        offset_v,
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, K) / ggml_type_size(K->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, V) / ggml_type_size(V->type)),
        has_mask ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mask) / ggml_type_size(mask->type)) : 0,
        has_sinks ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, sinks) / ggml_type_size(sinks->type)) : 0,
        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
@@ -1617,15 +1455,10 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
    };
    std::vector<wgpu::BindGroupEntry> entries = {
        ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, Q),
+        ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, K),
+        ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, V),
    };
-    if (kv_overlap) {
-        entries.push_back(
-            ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), kv_bind_offset, kv_bind_size));
-    } else {
-        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, K));
-        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, V));
-    }
-    uint32_t binding_index = kv_overlap ? 2u : 3u;
+    uint32_t binding_index = 3;
    if (has_mask) {
        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, binding_index++, mask));
    }
@@ -1641,25 +1474,25 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
    shader_lib_ctx.src3                           = mask;
    shader_lib_ctx.src4                           = sinks;
    shader_lib_ctx.dst                            = dst;
-    shader_lib_ctx.src_overlap                    = kv_overlap;
-    shader_lib_ctx.supports_subgroups             = ctx->global_ctx->capabilities.supports_subgroups;
-    shader_lib_ctx.supports_subgroup_matrix       = ctx->global_ctx->capabilities.supports_subgroup_matrix;
    shader_lib_ctx.max_wg_size        = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
    shader_lib_ctx.wg_mem_limit_bytes = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
    shader_lib_ctx.sg_mat_m           = ctx->global_ctx->capabilities.sg_mat_m;
    shader_lib_ctx.sg_mat_n           = ctx->global_ctx->capabilities.sg_mat_n;
    shader_lib_ctx.sg_mat_k           = ctx->global_ctx->capabilities.sg_mat_k;
    shader_lib_ctx.max_subgroup_size  = ctx->global_ctx->capabilities.max_subgroup_size;
-    webgpu_pipeline pipeline          = ctx->shader_lib->get_flash_attn_pipeline(
-        shader_lib_ctx, ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
-    auto * decisions = static_cast<ggml_webgpu_flash_attn_decisions *>(pipeline.context.get());
+    const bool      use_vec           = ggml_webgpu_flash_attn_use_vec(ctx->global_ctx, Q, K, V);
+    webgpu_pipeline pipeline          = use_vec ? ctx->shader_lib->get_flash_attn_vec_pipeline(shader_lib_ctx) :
+                                                  ctx->shader_lib->get_flash_attn_pipeline(shader_lib_ctx);

-    if (decisions->path != GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
+    if (!use_vec) {
+        auto *   decisions   = static_cast<ggml_webgpu_flash_attn_decisions *>(pipeline.context.get());
        uint32_t wg_per_head = CEIL_DIV(Q->ne[1], decisions->q_tile);
        uint32_t wg_x        = wg_per_head * Q->ne[2] * Q->ne[3];  // wg per head * number of heads * number of batches
        return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
    }

+    auto * decisions = static_cast<ggml_webgpu_flash_attn_vec_decisions *>(pipeline.context.get());
+
    wgpu::Buffer blk_buf         = {};
    uint64_t     blk_size_bytes  = 0;
    uint32_t     blk_nblk0       = 0;
@@ -1698,12 +1531,10 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
        tmp_bind_size   = tmp_size_bytes;
        scratch_offset  = ROUNDUP_POW2(scratch_offset + tmp_size_bytes, align_bytes);
    } else {
-        // nwg==1 writes final dst directly in vec-split; bind tmp to a tiny non-overlapping scratch region.
-        tmp_size_bytes  = WEBGPU_STORAGE_BUF_BINDING_MULT;
+        // nwg==1 writes final dst directly in vec-split; keep tmp binding valid without extra allocation.
        tmp_buf         = ggml_webgpu_tensor_buf(dst);
-        tmp_bind_offset = scratch_offset;
-        tmp_bind_size   = tmp_size_bytes;
-        scratch_offset  = ROUNDUP_POW2(scratch_offset + tmp_size_bytes, align_bytes);
+        tmp_bind_offset = ggml_webgpu_tensor_align_offset(ctx, dst);
+        tmp_bind_size   = ggml_webgpu_tensor_binding_size(ctx, dst);
    }

    webgpu_pipeline                   blk_pipeline;
@@ -1718,7 +1549,7 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
        const uint64_t blk_elems    = (uint64_t) blk_nblk0 * blk_nblk1 * blk_batch_count;
        blk_size_bytes              = ROUNDUP_POW2(blk_elems * sizeof(uint32_t), WEBGPU_STORAGE_BUF_BINDING_MULT);
        const ggml_webgpu_shader_lib_context blk_shader_ctx = shader_lib_ctx;
-        blk_pipeline = ctx->shader_lib->get_flash_attn_blk_pipeline(blk_shader_ctx, decisions->kv_tile);
+        blk_pipeline = ctx->shader_lib->get_flash_attn_blk_pipeline(blk_shader_ctx);

        blk_params = {
            (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mask) / ggml_type_size(mask->type)),  // offset_mask
@@ -1750,19 +1581,12 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
    std::vector<wgpu::BindGroupEntry> split_entries = {
        ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(Q), ggml_webgpu_tensor_align_offset(ctx, Q),
                                          ggml_webgpu_tensor_binding_size(ctx, Q)),
+        ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), ggml_webgpu_tensor_align_offset(ctx, K),
+                                          ggml_webgpu_tensor_binding_size(ctx, K)),
+        ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(V), ggml_webgpu_tensor_align_offset(ctx, V),
+                                          ggml_webgpu_tensor_binding_size(ctx, V)),
    };
-    if (kv_overlap) {
-        split_entries.push_back(
-            ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), kv_bind_offset, kv_bind_size));
-    } else {
-        split_entries.push_back(ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K),
-                                                                  ggml_webgpu_tensor_align_offset(ctx, K),
-                                                                  ggml_webgpu_tensor_binding_size(ctx, K)));
-        split_entries.push_back(ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(V),
-                                                                  ggml_webgpu_tensor_align_offset(ctx, V),
-                                                                  ggml_webgpu_tensor_binding_size(ctx, V)));
-    }
-    uint32_t split_binding_index = kv_overlap ? 2u : 3u;
+    uint32_t split_binding_index = 3;
    if (has_mask) {
        split_entries.push_back(ggml_webgpu_make_bind_group_entry(split_binding_index++, ggml_webgpu_tensor_buf(mask),
                                                                  ggml_webgpu_tensor_align_offset(ctx, mask),
@@ -1832,6 +1656,7 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,

    return ggml_backend_webgpu_build_multi(ctx, dispatches);
 }
+#endif  // __EMSCRIPTEN__

 static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
    bool is_unary = dst->op == GGML_OP_UNARY;
@@ -2066,96 +1891,6 @@ static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context & ctx, ggml_tensor *
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }

-static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context & ctx,
-                                                                 ggml_tensor *    rn_src,
-                                                                 ggml_tensor *    rn_dst,
-                                                                 ggml_tensor *    mul_src0,
-                                                                 ggml_tensor *    mul_src1,
-                                                                 ggml_tensor *    dst) {
-    ggml_tensor * mul_src;
-
-    if (ggml_webgpu_tensor_equal(rn_dst, mul_src0)) {
-        mul_src = mul_src1;
-    } else if (ggml_webgpu_tensor_equal(rn_dst, mul_src1)) {
-        mul_src = mul_src0;
-    } else {
-        GGML_ABORT("rms_norm must be equal to the one of mul_src0 and mul_src1");
-    }
-
-    bool overlap = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) ||
-                   (ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst));
-    bool inplace     = ggml_webgpu_tensor_equal(rn_src, dst);
-    bool src_overlap = ggml_webgpu_tensor_overlap(rn_src, mul_src);
-
-    uint32_t offset_merged_rn_src               = 0;
-    uint32_t offset_merged_mul_src              = 0;
-    size_t   rn_src_webgpu_tensor_align_offset  = ggml_webgpu_tensor_align_offset(ctx, rn_src);
-    size_t   mul_src_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, mul_src);
-
-    if (src_overlap) {
-        size_t min_offset = std::min(rn_src_webgpu_tensor_align_offset, mul_src_webgpu_tensor_align_offset);
-        offset_merged_rn_src =
-            (uint32_t) ((rn_src_webgpu_tensor_align_offset - min_offset) / ggml_type_size(rn_src->type));
-        offset_merged_mul_src =
-            (uint32_t) ((mul_src_webgpu_tensor_align_offset - min_offset) / ggml_type_size(mul_src->type));
-    }
-
-    std::vector<uint32_t> params = {
-        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, rn_src) / ggml_type_size(rn_src->type)),
-        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mul_src) / ggml_type_size(mul_src->type)),
-        offset_merged_rn_src,
-        offset_merged_mul_src,
-        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
-        (uint32_t) (rn_src->nb[1] / ggml_type_size(rn_src->type)),
-        (uint32_t) (rn_src->nb[2] / ggml_type_size(rn_src->type)),
-        (uint32_t) (rn_src->nb[3] / ggml_type_size(rn_src->type)),
-        (uint32_t) (mul_src->nb[1] / ggml_type_size(mul_src->type)),
-        (uint32_t) (mul_src->nb[2] / ggml_type_size(mul_src->type)),
-        (uint32_t) (mul_src->nb[3] / ggml_type_size(mul_src->type)),
-        (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
-        (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
-        (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
-        (uint32_t) mul_src->ne[0],
-        (uint32_t) mul_src->ne[1],
-        (uint32_t) mul_src->ne[2],
-        (uint32_t) mul_src->ne[3],
-        (uint32_t) dst->ne[0],
-        (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3],
-        ggml_webgpu_u32_from_f32(ggml_get_op_params_f32(rn_dst, 0))  // epsilon, treated as f32 in the shader
-    };
-
-    std::vector<wgpu::BindGroupEntry> entries;
-
-    if (inplace || overlap) {
-        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src));
-        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src));
-    } else if (src_overlap) {
-        size_t merged_offset = std::min(rn_src_webgpu_tensor_align_offset, mul_src_webgpu_tensor_align_offset);
-        size_t merged_end =
-            std::max(rn_src_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, rn_src),
-                     mul_src_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, mul_src));
-        entries.push_back(ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(rn_src), merged_offset,
-                                                            merged_end - merged_offset));
-        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst));
-    } else {
-        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src));
-        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src));
-        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst));
-    }
-
-    ggml_webgpu_shader_lib_context shader_lib_ctx = {};
-    shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
-    shader_lib_ctx.inplace     = inplace;
-    shader_lib_ctx.overlap     = overlap;
-    shader_lib_ctx.src_overlap = src_overlap;
-
-    webgpu_pipeline pipeline = ctx->shader_lib->get_rms_norm_mul_pipeline(shader_lib_ctx);
-
-    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, ggml_nrows(dst));
-}
-
 static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
    bool inplace = ggml_webgpu_tensor_equal(src, dst);

@@ -2652,48 +2387,15 @@ static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context & ctx, ggml_tensor
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }

-static bool ggml_webgpu_can_fuse_rms_norm_mul(const struct ggml_cgraph * cgraph, int node_idx) {
-    if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
-        return false;
-    }
-
-    // additional constraints specific to this fusion
-    const ggml_tensor * rms_norm = cgraph->nodes[node_idx];
-    const ggml_tensor * mul      = cgraph->nodes[node_idx + 1];
-
-    GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT(rms_norm->type == GGML_TYPE_F32);
-    // rms_norm only supports f32
-    if (mul->src[0]->type != GGML_TYPE_F32 || mul->src[1]->type != GGML_TYPE_F32 || mul->type != GGML_TYPE_F32) {
-        return false;
-    }
-    // if rms_norm is the B operand, then we don't handle broadcast
-    if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm)) {
-        return false;
-    }
-    // rms_norm shader assumes contiguous rows
-    if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
-        return false;
-    }
-
-    return true;
-}
-
 // Returns the encoded command, or std::nullopt if the operation is a no-op
-static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
-                                                           ggml_cgraph *  cgraph,
-                                                           int            node_idx,
-                                                           int &          num_encoded_ops) {
-    ggml_tensor ** nodes = cgraph->nodes;
-    ggml_tensor *  node  = nodes[node_idx];
-
+static std::optional<webgpu_encoded_op> ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
    if (ggml_is_empty(node)) {
        return std::nullopt;
    }
    if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
        return std::nullopt;
    }
-    WEBGPU_LOG_DEBUG("ggml_webgpu_encode(" << node << ", " << ggml_op_name(node->op) << ")");
+    WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")");

    ggml_tensor * src0 = node->src[0];
    ggml_tensor * src1 = node->src[1];
@@ -2721,7 +2423,11 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
        case GGML_OP_MUL_MAT_ID:
            return ggml_webgpu_mul_mat_id(ctx, src0, src1, src2, node);
        case GGML_OP_FLASH_ATTN_EXT:
+#ifndef __EMSCRIPTEN__
            return ggml_webgpu_flash_attn(ctx, src0, src1, src2, node->src[3], node->src[4], node);
+#else
+            return std::nullopt;
+#endif
        case GGML_OP_ADD:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
@@ -2732,13 +2438,6 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
        case GGML_OP_REPEAT:
            return ggml_webgpu_repeat(ctx, src0, node);
        case GGML_OP_RMS_NORM:
-            if (ggml_webgpu_can_fuse_rms_norm_mul(cgraph, node_idx)) {
-                num_encoded_ops        = 2;
-                ggml_tensor * mul_node = nodes[node_idx + 1];
-                return ggml_webgpu_rms_norm_mul(ctx, src0, node, mul_node->src[0], mul_node->src[1], mul_node);
-            } else {
-                return ggml_webgpu_row_norm(ctx, src0, node);
-            }
        case GGML_OP_L2_NORM:
            return ggml_webgpu_row_norm(ctx, src0, node);
        case GGML_OP_ROPE:
@@ -2779,10 +2478,6 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
        case GGML_OP_SUM:
        case GGML_OP_SUM_ROWS:
            return ggml_webgpu_sum_rows(ctx, src0, node);
-        case GGML_OP_CONV_2D:
-            return ggml_webgpu_conv_2d(ctx, src0, src1, node);
-        case GGML_OP_IM2COL:
-            return ggml_webgpu_im2col(ctx, src0, src1, node);
        default:
            return std::nullopt;
    }
@@ -2815,7 +2510,7 @@ static void ggml_backend_webgpu_collect_profile_results(webgpu_context &
    for (size_t i = 0; i < pipeline_names.size(); ++i) {
        // WebGPU timestamps are in ns; convert to ms.
        const double elapsed_ms = double(ts_data[2 * i + 1] - ts_data[2 * i]) * 1e-6;
-        ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms;
+        ctx->global_ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms;
    }

    ctx->profile_timestamp_host_buf.Unmap();
@@ -2851,8 +2546,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
    uint32_t num_inflight_batches = 0;
    bool     contains_set_rows    = false;
    bool     batch_compute_passes = true;
-    int      num_encoded_ops      = 1;
-    int      node_idx             = 0;

 #ifdef GGML_WEBGPU_GPU_PROFILE
    ctx->profile_timestamp_query_count = 0;
@@ -2865,11 +2558,11 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
        ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
    }

-    while (node_idx < cgraph->n_nodes) {
-        if (cgraph->nodes[node_idx]->op == GGML_OP_SET_ROWS) {
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) {
            contains_set_rows = true;
        }
-        if (auto cmd = ggml_webgpu_encode(ctx, cgraph, node_idx, num_encoded_ops)) {
+        if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) {
            commands.push_back(*cmd);
            num_batched_kernels += cmd.value().num_kernels;
 #ifdef GGML_WEBGPU_GPU_PROFILE
@@ -2894,9 +2587,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
            ctx->param_arena.reset();
            commands.clear();
        }
-
-        node_idx += num_encoded_ops;
-        num_encoded_ops = 1;
    }

    if (ctx->active_compute_pass) {
@@ -2926,107 +2616,22 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
    return GGML_STATUS_SUCCESS;
 }

-struct ggml_backend_webgpu_event_context {
-    webgpu_global_context global_ctx;
-    wgpu::Future          future;
-    bool                  recorded = false;
-};
-
-static ggml_backend_event_t ggml_backend_webgpu_device_event_new(ggml_backend_dev_t device) {
-    ggml_backend_webgpu_device_context * dev_ctx = (ggml_backend_webgpu_device_context *) device->context;
-
-    auto * event_ctx      = new ggml_backend_webgpu_event_context();
-    event_ctx->global_ctx = dev_ctx->webgpu_global_ctx;
-
-    auto * event   = new ggml_backend_event;
-    event->device  = device;
-    event->context = event_ctx;
-    return event;
-}
-
-static void ggml_backend_webgpu_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
-    GGML_UNUSED(dev);
-    delete static_cast<ggml_backend_webgpu_event_context *>(event->context);
-    delete event;
-}
-
-static void ggml_backend_webgpu_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
-    GGML_UNUSED(dev);
-    ggml_backend_webgpu_event_context * event_ctx = (ggml_backend_webgpu_event_context *) event->context;
-    if (!event_ctx->recorded) {
-        return;
-    }
-    wgpu::WaitStatus status =
-        event_ctx->global_ctx->instance.WaitAny(event_ctx->future, WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);
-    if (status == wgpu::WaitStatus::TimedOut) {
-        GGML_ABORT("ggml_webgpu: event_synchronize timed out after %u ms\n", WEBGPU_RUNTIME_WAIT_TIMEOUT_MS);
-    }
-    event_ctx->recorded = false;
-}
-
-static void ggml_backend_webgpu_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
-    ggml_backend_webgpu_context *       backend_ctx = (ggml_backend_webgpu_context *) backend->context;
-    ggml_backend_webgpu_event_context * event_ctx   = (ggml_backend_webgpu_event_context *) event->context;
-
-    event_ctx->future = backend_ctx->webgpu_ctx->global_ctx->queue.OnSubmittedWorkDone(
-        wgpu::CallbackMode::AllowSpontaneous, [](wgpu::QueueWorkDoneStatus, wgpu::StringView) {});
-    event_ctx->recorded = true;
-}
-
-static void ggml_backend_webgpu_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
-    GGML_UNUSED(backend);
-    ggml_backend_webgpu_device_event_synchronize(nullptr, event);
-}
-
-static void ggml_backend_webgpu_set_tensor_async(ggml_backend_t backend,
-                                                 ggml_tensor *  tensor,
-                                                 const void *   data,
-                                                 size_t         offset,
-                                                 size_t         size) {
-    GGML_UNUSED(backend);
-    auto * buf_ctx      = (ggml_backend_webgpu_buffer_context *) tensor->buffer->context;
-    size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
-
-    // Write aligned portion
-    buf_ctx->global_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4);
-
-    if (size % 4 != 0) {
-        // If size is not a multiple of 4, we need to memset the remaining bytes
-        size_t remaining_size = size % 4;
-
-        // pack the remaining bytes into a uint32_t
-        uint32_t val32 = 0;
-
-        for (size_t i = 0; i < remaining_size; i++) {
-            ((uint8_t *) &val32)[i] = ((const uint8_t *) data)[size - remaining_size + i];
-        }
-        // memset the remaining bytes
-        ggml_backend_webgpu_buffer_memset(buf_ctx->global_ctx, buf_ctx->buffer, val32,
-                                          total_offset + (size - remaining_size), remaining_size);
-    }
-}
-
-static void ggml_backend_webgpu_synchronize(ggml_backend_t backend) {
-    ggml_backend_webgpu_context * backend_ctx = (ggml_backend_webgpu_context *) backend->context;
-    ggml_backend_webgpu_wait_queue(backend_ctx->webgpu_ctx->global_ctx);
-}
-
 static ggml_backend_i ggml_backend_webgpu_i = {
    /* .get_name                = */ ggml_backend_webgpu_name,
    /* .free                    = */ ggml_backend_webgpu_free,
-    /* .set_tensor_async        = */ ggml_backend_webgpu_set_tensor_async,
+    /* .set_tensor_async        = */ NULL,
    /* .get_tensor_async        = */ NULL,
    /* .get_tensor_2d_async     = */ NULL,
    /* .set_tensor_2d_async     = */ NULL,
    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ ggml_backend_webgpu_synchronize,
+    /* .synchronize             = */ NULL,
    /* .graph_plan_create       = */ NULL,
    /* .graph_plan_free         = */ NULL,
    /* .graph_plan_update       = */ NULL,
    /* .graph_plan_compute      = */ NULL,
    /* .graph_compute           = */ ggml_backend_webgpu_graph_compute,
-    /* .event_record            = */ ggml_backend_webgpu_event_record,
-    /* .event_wait              = */ ggml_backend_webgpu_event_wait,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
    /* .graph_optimize          = */ NULL,
 };

@@ -3264,19 +2869,13 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
                        ctx->webgpu_global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
                    shader_lib_ctx.wg_mem_limit_bytes =
                        ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
-                    shader_lib_ctx.supports_subgroups = ctx->webgpu_global_ctx->capabilities.supports_subgroups;
-                    shader_lib_ctx.supports_subgroup_matrix =
-                        ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix;
                    shader_lib_ctx.sg_mat_m          = ctx->webgpu_global_ctx->capabilities.sg_mat_m;
                    shader_lib_ctx.sg_mat_n          = ctx->webgpu_global_ctx->capabilities.sg_mat_n;
                    shader_lib_ctx.sg_mat_k          = ctx->webgpu_global_ctx->capabilities.sg_mat_k;
                    shader_lib_ctx.max_subgroup_size = ctx->webgpu_global_ctx->capabilities.max_subgroup_size;

-                    const ggml_webgpu_flash_attn_decisions decisions = ggml_webgpu_flash_attn_get_decisions(
-                        shader_lib_ctx, ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
-
-                    if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
-                        const uint32_t kv_tile = decisions.kv_tile;
+                    if (ggml_webgpu_flash_attn_use_vec(ctx->webgpu_global_ctx, Q, K, V)) {
+                        const uint32_t kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(shader_lib_ctx);

                        const uint32_t vec_nwg_cap = std::max(
                            1u, std::min<uint32_t>(32u, ctx->webgpu_global_ctx->capabilities.max_subgroup_size));
@@ -3296,8 +2895,6 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
                            const size_t   tmp_size_bytes  = ROUNDUP_POW2(
                                (tmp_data_elems + tmp_stats_elems) * sizeof(float), WEBGPU_STORAGE_BUF_BINDING_MULT);
                            res += tmp_size_bytes + align;
-                        } else {
-                            res += WEBGPU_STORAGE_BUF_BINDING_MULT + align;
                        }
                        if (mask != nullptr) {
                            const uint32_t blk_nblk0       = CEIL_DIV((uint32_t) K->ne[1], kv_tile);
@@ -3443,15 +3040,13 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
    ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
    // we require f16 support
    GGML_ASSERT(ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
-    ctx->webgpu_global_ctx->capabilities.supports_subgroups =
-        ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups);

-    bool valid_subgroup_matrix_config = false;
 #ifndef __EMSCRIPTEN__
    // Accept f16 subgroup matrix configurations (square or non-square).
    // NVIDIA GPUs typically report square configs (e.g. 16x16x16),
    // while Intel Xe2 GPUs report non-square configs (e.g. 8x16x16).
    // The shaders are already parameterized to handle any M/N/K dimensions.
+    bool valid_subgroup_matrix_config = false;
    if (ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
        for (size_t i = 0; i < subgroup_matrix_configs.configCount; i++) {
            const wgpu::SubgroupMatrixConfig config = subgroup_matrix_configs.configs[i];
@@ -3465,8 +3060,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
            }
        }
    }
-#endif
    ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix = valid_subgroup_matrix_config;
+#endif

    // For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
    // Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
@@ -3477,14 +3072,11 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
 #ifndef __EMSCRIPTEN__
    required_features.push_back(wgpu::FeatureName::ImplicitDeviceSynchronization);
    if (ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix) {
+        required_features.push_back(wgpu::FeatureName::Subgroups);
        required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
    }
 #endif

-    if (ctx->webgpu_global_ctx->capabilities.supports_subgroups) {
-        required_features.push_back(wgpu::FeatureName::Subgroups);
-    }
-
 #ifdef GGML_WEBGPU_GPU_PROFILE
    required_features.push_back(wgpu::FeatureName::TimestampQuery);
 #endif
@@ -3514,12 +3106,12 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
    // Enable Dawn-specific toggles to increase native performance
    // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
    //       only for native performance?
-    const char * const          deviceEnabledToggles[]  = { "disable_robustness", "disable_workgroup_init",
-                                                            "disable_polyfills_on_integer_div_and_mod" };
-    const char * const          deviceDisabledToggles[] = { "timestamp_quantization" };
+    const char * const deviceEnabledToggles[]  = { "skip_validation", "disable_robustness", "disable_workgroup_init",
+                                                   "disable_polyfills_on_integer_div_and_mod" };
+    const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
    wgpu::DawnTogglesDescriptor deviceTogglesDesc;
    deviceTogglesDesc.enabledToggles      = deviceEnabledToggles;
-    deviceTogglesDesc.enabledToggleCount  = 3;
+    deviceTogglesDesc.enabledToggleCount  = 4;
    deviceTogglesDesc.disabledToggles     = deviceDisabledToggles;
    deviceTogglesDesc.disabledToggleCount = 1;

@@ -3797,63 +3389,33 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
            break;
        case GGML_OP_FLASH_ATTN_EXT:
            {
+#ifndef __EMSCRIPTEN__
+                if (!ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix) {
+                    break;
+                }
+                // Head dimensions must be divisible by subgroup matrix dimensions
+                if (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k != 0 ||
+                    src2->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_n != 0) {
+                    break;
+                }
+                // Head dimensions must fit in workgroup memory with minimum tile sizes
+                size_t     limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
+                const bool has_mask    = op->src[3] != nullptr;
+                const bool kv_direct   = src1->type == GGML_TYPE_F16 &&
+                                       (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k) == 0 &&
+                                       (src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD) == 0;
+                const size_t min_bytes = ggml_webgpu_flash_attn_wg_mem_bytes(
+                    ctx->webgpu_global_ctx->capabilities.sg_mat_m, ctx->webgpu_global_ctx->capabilities.sg_mat_n,
+                    (uint32_t) src0->ne[0], (uint32_t) src2->ne[0], has_mask, kv_direct);
+                if (min_bytes > limit_bytes) {
+                    break;
+                }
+
                supports_op = src0->type == GGML_TYPE_F32 &&
                              (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 ||
                               src1->type == GGML_TYPE_Q4_0 || src1->type == GGML_TYPE_Q8_0) &&
                              src2->type == src1->type && op->type == GGML_TYPE_F32;
-                if (!supports_op) {
-                    break;
-                }
-                ggml_webgpu_shader_lib_context shader_lib_ctx = {};
-                shader_lib_ctx.src0                           = src0;
-                shader_lib_ctx.src1                           = src1;
-                shader_lib_ctx.src2                           = src2;
-                shader_lib_ctx.src3                           = op->src[3];
-                shader_lib_ctx.src4                           = op->src[4];
-                shader_lib_ctx.dst                            = const_cast<ggml_tensor *>(op);
-                shader_lib_ctx.supports_subgroups             = ctx->webgpu_global_ctx->capabilities.supports_subgroups;
-                shader_lib_ctx.supports_subgroup_matrix = ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix;
-                shader_lib_ctx.wg_mem_limit_bytes =
-                    ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
-                shader_lib_ctx.sg_mat_m          = ctx->webgpu_global_ctx->capabilities.sg_mat_m;
-                shader_lib_ctx.sg_mat_n          = ctx->webgpu_global_ctx->capabilities.sg_mat_n;
-                shader_lib_ctx.sg_mat_k          = ctx->webgpu_global_ctx->capabilities.sg_mat_k;
-                shader_lib_ctx.max_subgroup_size = ctx->webgpu_global_ctx->capabilities.max_subgroup_size;
-
-                const ggml_webgpu_flash_attn_decisions decisions = ggml_webgpu_flash_attn_get_decisions(
-                    shader_lib_ctx, ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
-                const size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
-                const bool   has_mask    = op->src[3] != nullptr;
-                if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
-                    const size_t min_bytes =
-                        ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0],
-                                                            (uint32_t) src2->ne[0], has_mask, decisions.kv_direct);
-                    if (min_bytes > limit_bytes) {
-                        supports_op = false;
-                    }
-                    break;
-                }
-
-                if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
-                    const size_t min_bytes =
-                        ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0],
-                                                            (uint32_t) src2->ne[0], has_mask, decisions.kv_direct);
-                    if (min_bytes > limit_bytes) {
-                        supports_op = false;
-                    }
-                    break;
-                }
-
-                if (!ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix) {
-                    supports_op = false;
-                    break;
-                }
-                const size_t min_bytes =
-                    ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0],
-                                                        (uint32_t) src2->ne[0], has_mask, decisions.kv_direct);
-                if (min_bytes > limit_bytes) {
-                    supports_op = false;
-                }
+#endif
                break;
            }
        case GGML_OP_RMS_NORM:
@@ -3929,15 +3491,6 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
        case GGML_OP_SOLVE_TRI:
            supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
            break;
-        case GGML_OP_CONV_2D:
-            supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
-                          (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
-                          (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
-            break;
-        case GGML_OP_IM2COL:
-            supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
-                          (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-            break;
        case GGML_OP_SSM_CONV:
            supports_op = op->type == GGML_TYPE_F32;
            break;
@@ -4031,9 +3584,9 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = {
    /* .supports_op          = */ ggml_backend_webgpu_device_supports_op,
    /* .supports_buft        = */ ggml_backend_webgpu_device_supports_buft,
    /* .offload_op           = */ NULL,
-    /* .event_new            = */ ggml_backend_webgpu_device_event_new,
-    /* .event_free           = */ ggml_backend_webgpu_device_event_free,
-    /* .event_synchronize    = */ ggml_backend_webgpu_device_event_synchronize,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
 };

 /* End GGML Backend Device Interface */
@@ -45,13 +45,6 @@ fn load_u16_at_src0(byte_offset: u32) -> u32 {
    return (word >> shift) & 0xFFFFu;
 }

-// Always reads the 4-byte-aligned word containing byte_offset.
-// Caller extracts the 16-bit half it needs via & 0xFFFFu or >> 16u.
-// this is used in k-quants for better performance
-fn load_u32_at_src0_aligned(byte_offset: u32) -> u32 {
-    return src0[(byte_offset & ~3u) / 4u];
-}
-
 fn load_u32_at_src0(byte_offset: u32) -> u32 {
    let word_idx = byte_offset / 4u;
    let shift = (byte_offset & 0x3u) * 8u;
@@ -1,165 +0,0 @@
-#include "common_decls.tmpl"
-enable f16;
-
-@group(0) @binding(0)
-#if defined(WEIGHT_F32)
-var<storage, read_write> weights: array<f32>;
-#elif defined(WEIGHT_F16)
-var<storage, read_write> weights: array<f16>;
-#endif
-
-@group(0) @binding(1)
-#if defined(INPUT_F32)
-var<storage, read_write> input: array<f32>;
-#elif defined(INPUT_F16)
-var<storage, read_write> input: array<f16>;
-#endif
-
-@group(0) @binding(2)
-#if defined(OUTPUT_F32)
-var<storage, read_write> output: array<f32>;
-#elif defined(OUTPUT_F16)
-var<storage, read_write> output: array<f16>;
-#endif
-
-struct Params {
-    offset_w: u32,
-    offset_i: u32,
-    offset_o: u32,
-
-    // element strides
-    sw0: u32, sw1: u32, sw2: u32, sw3: u32,
-    si0: u32, si1: u32, si2: u32, si3: u32,
-    so0: u32, so1: u32, so2: u32, so3: u32,
-
-    // kernel dimensions
-    KW: u32, KH: u32, IC: u32,
-    // input dimensions
-    IW: u32, IH: u32,
-    // output dimensions
-    OW: u32, OH: u32, OC_out: u32, N_out: u32,
-
-    // stride
-    s0: u32, s1: u32,
-    // padding
-    p0: u32, p1: u32,
-    // dilation
-    d0: u32, d1: u32,
-};
-
-@group(0) @binding(3)
-var<uniform> params: Params;
-
-fn load_weight(idx: u32) -> f32 {
-    #if defined(WEIGHT_F32)
-        return weights[idx];
-    #elif defined(WEIGHT_F16)
-        return f32(weights[idx]);
-    #endif
-}
-
-fn load_input(idx: u32) -> f32 {
-    #if defined(INPUT_F32)
-        return input[idx];
-    #elif defined(INPUT_F16)
-        return f32(input[idx]);
-    #endif
-}
-
-fn store_output(idx: u32, val: f32) {
-    #if defined(OUTPUT_F32)
-        output[idx] = val;
-    #elif defined(OUTPUT_F16)
-        output[idx] = f16(val);
-    #endif
-}
-
-fn ceil_div_u32(x: u32, y: u32) -> u32 {
-    return (x + y - 1) / y;
-}
-
-// returns the first valid kernel index k such that base + k * step >= 0
-fn first_valid_k(base: i32, step: u32) -> u32 {
-    if (base >= 0) {
-        return 0;
-    }
-
-    return ceil_div_u32(u32(-base), step);
-}
-
-// returns the first invalid kernel index k such that base + k * step >= limit so valid k are in [0, end_valid_k)
-fn end_valid_k(base: i32, step: u32, limit: u32, k_max: u32) -> u32 {
-    let remaining = i32(limit) - base;
-    if (remaining <= 0) {
-        return 0;
-    }
-
-    return min(k_max, ceil_div_u32(u32(remaining), step));
-}
-
-@compute @workgroup_size(WG_SIZE)
-fn main(
-    @builtin(global_invocation_id) gid: vec3<u32>,
-    @builtin(num_workgroups) num_wg: vec3<u32>
-) {
-
-    let threads_per_group = u32(WG_SIZE);
-    let i_out = gid.x + (num_wg.x * threads_per_group) * gid.y;
-    let n_out = params.OW * params.OH * params.OC_out * params.N_out;
-
-    var sum: f32 = 0.0;
-    if (i_out >= n_out) {
-        return;
-    }
-
-    // Kernel layout: [KW, KH, IC, ..]
-    // Input layout:  [IW, IH, .., ..]
-    // Output layout: [OW, OH, OC, N]
-
-    var i = i_out;
-    let n = i / (params.OC_out * params.OH * params.OW);
-    i = i % (params.OC_out * params.OH * params.OW);
-    let oc = i / (params.OH * params.OW);
-    i = i % (params.OH * params.OW);
-    let oh = i / params.OW;
-    let ow = i % params.OW;
-
-    let ow_base = i32(ow * params.s0) - i32(params.p0);
-    let oh_base = i32(oh * params.s1) - i32(params.p1);
-
-    // clip the valid kernel window once
-    let kw_begin = first_valid_k(ow_base, params.d0);
-    let kw_end = end_valid_k(ow_base, params.d0, params.IW, params.KW);
-    let kh_begin = first_valid_k(oh_base, params.d1);
-    let kh_end = end_valid_k(oh_base, params.d1, params.IH, params.KH);
-
-    // entire receptive field is out of bounds
-    if (kw_begin >= kw_end || kh_begin >= kh_end) {
-        let out_idx = params.offset_o + ow * params.so0 + oh * params.so1 + oc * params.so2 + n * params.so3;
-        store_output(out_idx, 0.0);
-        return;
-    }
-
-    let weight_oc_base = params.offset_w + oc * params.sw3;
-    let input_n_base = params.offset_i + n * params.si3;
-
-    for (var ic: u32 = 0; ic < params.IC; ic += 1) {
-        let w_base_ic = ic * params.sw2 + weight_oc_base;
-        let in_base = ic * params.si2 + input_n_base;
-
-        for (var kh: u32 = kh_begin; kh < kh_end; kh += 1) {
-            let ih = u32(oh_base + i32(kh * params.d1));
-            let w_row_base = w_base_ic + kh * params.sw1;
-            let in_row_base = in_base + ih * params.si1;
-            for (var kw: u32 = kw_begin; kw < kw_end; kw += 1) {
-                let iw = u32(ow_base + i32(kw * params.d0));
-                let w_idx = w_row_base + kw * params.sw0;
-                let in_idx = in_row_base + iw * params.si0;
-                sum += load_weight(w_idx) * load_input(in_idx);
-            }
-        }
-    }
-
-    let out_idx = params.offset_o + ow * params.so0 + oh * params.so1 + oc * params.so2 + n * params.so3;
-    store_output(out_idx, sum);
-}
@@ -138,55 +138,26 @@ struct Params {
 };

@group(0) @binding(0) var<storage, read_write> Q: array<f32>;
-#ifdef KV_OVERLAP
-@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
-#define V K
-#else
@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
@group(0) @binding(2) var<storage, read_write> V: array<KV_TYPE>;
-#endif

 #if defined(MASK) && defined(SINKS)
-#ifdef KV_OVERLAP
-@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
-@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
-#define DST_BINDING 4
-#define PARAMS_BINDING 5
-#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
 #define DST_BINDING 5
 #define PARAMS_BINDING 6
-#endif
 #elif defined(MASK)
-#ifdef KV_OVERLAP
-@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
-#define DST_BINDING 3
-#define PARAMS_BINDING 4
-#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
 #define DST_BINDING 4
 #define PARAMS_BINDING 5
-#endif
 #elif defined(SINKS)
-#ifdef KV_OVERLAP
-@group(0) @binding(2) var<storage, read_write> sinks: array<f32>;
-#define DST_BINDING 3
-#define PARAMS_BINDING 4
-#else
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
 #define DST_BINDING 4
 #define PARAMS_BINDING 5
-#endif
-#else
-#ifdef KV_OVERLAP
-#define DST_BINDING 2
-#define PARAMS_BINDING 3
 #else
 #define DST_BINDING 3
 #define PARAMS_BINDING 4
 #endif
-#endif

@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<vec4<f32>>;
@group(0) @binding(PARAMS_BINDING) var<uniform> params: Params;
@@ -1,330 +0,0 @@
-enable f16;
-enable subgroups;
-
-#define HEAD_DIM_QK 64
-#define HEAD_DIM_V 64
-#define KV_STAGE_STRIDE 64
-#define Q_TILE 4
-#define KV_TILE 64
-#define WG_SIZE 128
-
-struct Params {
-    offset_q: u32,
-    offset_k: u32,
-    offset_v: u32,
-    offset_mask: u32,
-    offset_sinks: u32,
-    offset_dst: u32,
-
-    n_heads: u32,
-    seq_len_q: u32,
-    seq_len_kv: u32,
-
-    stride_q1: u32,
-    stride_q2: u32,
-    stride_q3: u32,
-    stride_k1: u32,
-    stride_k2: u32,
-    stride_k3: u32,
-    stride_v1: u32,
-    stride_v2: u32,
-    stride_v3: u32,
-    stride_mask3: u32,
-
-    q_per_kv: u32,
-
-    scale: f32,
-    max_bias: f32,
-    logit_softcap: f32,
-    n_head_log2: f32,
-    m0: f32,
-    m1: f32,
-};
-
-@group(0) @binding(0) var<storage, read_write> Q: array<f32>;
-#ifdef KV_OVERLAP
-@group(0) @binding(1) var<storage, read_write> K: array<vec4<f16>>;
-#define V K
-#else
-@group(0) @binding(1) var<storage, read_write> K: array<vec4<f16>>;
-@group(0) @binding(2) var<storage, read_write> V: array<vec4<f16>>;
-#endif
-
-#if defined(MASK) && defined(SINKS)
-#ifdef KV_OVERLAP
-@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
-@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
-#define DST_BINDING 4
-#define PARAMS_BINDING 5
-#else
-@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
-@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
-#define DST_BINDING 5
-#define PARAMS_BINDING 6
-#endif
-#elif defined(MASK)
-#ifdef KV_OVERLAP
-@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
-#define DST_BINDING 3
-#define PARAMS_BINDING 4
-#else
-@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
-#define DST_BINDING 4
-#define PARAMS_BINDING 5
-#endif
-#elif defined(SINKS)
-#ifdef KV_OVERLAP
-@group(0) @binding(2) var<storage, read_write> sinks: array<f32>;
-#define DST_BINDING 3
-#define PARAMS_BINDING 4
-#else
-@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
-#define DST_BINDING 4
-#define PARAMS_BINDING 5
-#endif
-#else
-#ifdef KV_OVERLAP
-#define DST_BINDING 2
-#define PARAMS_BINDING 3
-#else
-#define DST_BINDING 3
-#define PARAMS_BINDING 4
-#endif
-#endif
-
-@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<vec4<f32>>;
-@group(0) @binding(PARAMS_BINDING) var<uniform> params: Params;
-
-const FLOAT_MIN: f32 = -1.0e9;
-const Q_CHUNKS: u32 = HEAD_DIM_QK / 4u;
-const V_CHUNKS: u32 = HEAD_DIM_V / 4u;
-const SCORE_REGS_PER_LANE: u32 = (KV_TILE + MAX_SUBGROUP_SIZE - 1u) / MAX_SUBGROUP_SIZE;
-const OUT_REGS_PER_LANE: u32 = (V_CHUNKS + MAX_SUBGROUP_SIZE - 1u) / MAX_SUBGROUP_SIZE;
-
-var<workgroup> q_shmem: array<f16, Q_TILE * HEAD_DIM_QK>;
-var<workgroup> kv_shmem: array<f16, KV_TILE * KV_STAGE_STRIDE>;
-var<workgroup> p_shmem: array<f32, Q_TILE * KV_TILE>;
-
-@compute @workgroup_size(WG_SIZE)
-fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
-        @builtin(local_invocation_id) local_id: vec3<u32>,
-        @builtin(subgroup_id) subgroup_id: u32,
-        @builtin(subgroup_size) subgroup_size: u32,
-        @builtin(num_subgroups) num_subgroups: u32,
-        @builtin(subgroup_invocation_id) sg_inv_id: u32) {
-    if (subgroup_size == 0u || num_subgroups < Q_TILE) {
-        return;
-    }
-
-    let wg_per_head = (params.seq_len_q + Q_TILE - 1u) / Q_TILE;
-    let wg_per_batch = wg_per_head * params.n_heads;
-
-    let dst2_stride = HEAD_DIM_V * params.n_heads;
-    let dst3_stride = dst2_stride * params.seq_len_q;
-
-    let batch_idx = wg_id.x / wg_per_batch;
-    let q_batch_offset = params.offset_q + batch_idx * params.stride_q3;
-    let k_batch_offset = params.offset_k + batch_idx * params.stride_k3;
-    let v_batch_offset = params.offset_v + batch_idx * params.stride_v3;
-    let dst_batch_offset = params.offset_dst + batch_idx * dst3_stride;
-    let wg_in_batch = wg_id.x % wg_per_batch;
-
-    let head_idx = wg_in_batch / wg_per_head;
-    let q_head_offset = q_batch_offset + head_idx * params.stride_q2;
-    let k_head_idx = head_idx / params.q_per_kv;
-    let v_head_offset = v_batch_offset + k_head_idx * params.stride_v2;
-    let k_head_offset = k_batch_offset + k_head_idx * params.stride_k2;
-
-    let wg_in_head = wg_in_batch % wg_per_head;
-    let q_row_start = wg_in_head * Q_TILE;
-    let global_q_row = q_row_start + subgroup_id;
-    let row_active = subgroup_id < Q_TILE && global_q_row < params.seq_len_q;
-
-#ifdef MASK
-    let mask_global_offset = params.offset_mask + batch_idx * params.stride_mask3 + q_row_start * params.seq_len_kv;
-#endif
-
-    let dst_global_offset = dst_batch_offset + q_row_start * dst2_stride + head_idx * HEAD_DIM_V;
-
-    let head = f32(head_idx);
-    let slope = select(1.0,
-                       select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
-                              pow(params.m0, head + 1.0),
-                              head < params.n_head_log2),
-                       params.max_bias > 0.0);
-
-    for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
-        let q_tile_row = elem_idx / HEAD_DIM_QK;
-        let q_col = elem_idx % HEAD_DIM_QK;
-        let head_q_row = q_row_start + q_tile_row;
-        let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
-        q_shmem[elem_idx] = f16(select(
-            0.0,
-            Q[global_q_row_offset + q_col] * params.scale,
-            head_q_row < params.seq_len_q));
-    }
-
-    workgroupBarrier();
-
-    var row_max = FLOAT_MIN;
-    var exp_sum = 0.0;
-    var out_regs: array<vec4<f32>, OUT_REGS_PER_LANE>;
-    for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) {
-        out_regs[reg_idx] = vec4<f32>(0.0);
-    }
-
-    let q_base = subgroup_id * HEAD_DIM_QK;
-    let subgroup_p_offset = subgroup_id * KV_TILE;
-
-    for (var kv_tile = 0u; kv_tile < params.seq_len_kv; kv_tile += KV_TILE) {
-        let kv_count = min(KV_TILE, params.seq_len_kv - kv_tile);
-        let score_slots = min(SCORE_REGS_PER_LANE, (kv_count + subgroup_size - 1u) / subgroup_size);
-        let out_slots = min(OUT_REGS_PER_LANE, (V_CHUNKS + subgroup_size - 1u) / subgroup_size);
-        var local_scores: array<f32, SCORE_REGS_PER_LANE>;
-        for (var slot = 0u; slot < SCORE_REGS_PER_LANE; slot += 1u) {
-            local_scores[slot] = FLOAT_MIN;
-        }
-
-        for (var vec_idx_local = local_id.x; vec_idx_local < kv_count * Q_CHUNKS; vec_idx_local += WG_SIZE) {
-            let kv_local = vec_idx_local / Q_CHUNKS;
-            let chunk = vec_idx_local % Q_CHUNKS;
-            let global_k_row = kv_tile + kv_local;
-            let k_vec_index = (k_head_offset + global_k_row * params.stride_k1 + chunk * 4u) >> 2u;
-            let k4 = K[k_vec_index];
-            let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-            kv_shmem[kv_off + 0u] = k4.x;
-            kv_shmem[kv_off + 1u] = k4.y;
-            kv_shmem[kv_off + 2u] = k4.z;
-            kv_shmem[kv_off + 3u] = k4.w;
-        }
-
-        workgroupBarrier();
-
-        var local_max = FLOAT_MIN;
-        if (row_active) {
-            for (var slot = 0u; slot < score_slots; slot += 1u) {
-                let kv_local = sg_inv_id + slot * subgroup_size;
-                if (kv_local >= kv_count) {
-                    continue;
-                }
-
-                let global_k_row = kv_tile + kv_local;
-                var dot_val = 0.0;
-                for (var chunk = 0u; chunk < Q_CHUNKS; chunk += 1u) {
-                    let q_off = q_base + chunk * 4u;
-                    let qv = vec4<f32>(
-                        f32(q_shmem[q_off + 0u]),
-                        f32(q_shmem[q_off + 1u]),
-                        f32(q_shmem[q_off + 2u]),
-                        f32(q_shmem[q_off + 3u]));
-                    let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-                    let kv = vec4<f32>(
-                        f32(kv_shmem[kv_off + 0u]),
-                        f32(kv_shmem[kv_off + 1u]),
-                        f32(kv_shmem[kv_off + 2u]),
-                        f32(kv_shmem[kv_off + 3u]));
-                    dot_val += dot(qv, kv);
-                }
-#ifdef LOGIT_SOFTCAP
-                dot_val = params.logit_softcap * tanh(dot_val);
-#endif
-#ifdef MASK
-                let mask_idx = mask_global_offset + subgroup_id * params.seq_len_kv + global_k_row;
-                dot_val += slope * f32(mask[mask_idx]);
-#endif
-                local_scores[slot] = dot_val;
-                local_max = max(local_max, dot_val);
-            }
-        }
-
-        let tile_max = subgroupMax(local_max);
-        let new_max = max(row_max, tile_max);
-        let cur_exp = exp(row_max - new_max);
-        exp_sum *= cur_exp;
-        for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) {
-            out_regs[reg_idx] *= cur_exp;
-        }
-
-        var local_sum = 0.0;
-        for (var slot = 0u; slot < score_slots; slot += 1u) {
-            let kv_local = sg_inv_id + slot * subgroup_size;
-            if (row_active && kv_local < kv_count) {
-                let p = exp(local_scores[slot] - new_max);
-                p_shmem[subgroup_p_offset + kv_local] = p;
-                local_sum += p;
-            }
-        }
-
-        workgroupBarrier();
-
-        for (var vec_idx_local = local_id.x; vec_idx_local < kv_count * V_CHUNKS; vec_idx_local += WG_SIZE) {
-            let kv_local = vec_idx_local / V_CHUNKS;
-            let chunk = vec_idx_local % V_CHUNKS;
-            let global_v_row = kv_tile + kv_local;
-            let v_vec_index = (v_head_offset + global_v_row * params.stride_v1 + chunk * 4u) >> 2u;
-            let v4 = V[v_vec_index];
-            let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-            kv_shmem[kv_off + 0u] = v4.x;
-            kv_shmem[kv_off + 1u] = v4.y;
-            kv_shmem[kv_off + 2u] = v4.z;
-            kv_shmem[kv_off + 3u] = v4.w;
-        }
-
-        workgroupBarrier();
-
-        let tile_sum = subgroupAdd(local_sum);
-        exp_sum += tile_sum;
-        row_max = new_max;
-
-        if (row_active) {
-            for (var reg_idx = 0u; reg_idx < out_slots; reg_idx += 1u) {
-                let chunk = sg_inv_id + reg_idx * subgroup_size;
-                if (chunk >= V_CHUNKS) {
-                    continue;
-                }
-
-                var acc = out_regs[reg_idx];
-                for (var kv_local = 0u; kv_local < kv_count; kv_local += 1u) {
-                    let p = p_shmem[subgroup_p_offset + kv_local];
-                    let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-                    let v4 = vec4<f32>(
-                        f32(kv_shmem[kv_off + 0u]),
-                        f32(kv_shmem[kv_off + 1u]),
-                        f32(kv_shmem[kv_off + 2u]),
-                        f32(kv_shmem[kv_off + 3u]));
-                    acc += p * v4;
-                }
-                out_regs[reg_idx] = acc;
-            }
-        }
-
-        workgroupBarrier();
-    }
-
-#ifdef SINKS
-    if (row_active) {
-        let sink_score = sinks[params.offset_sinks + head_idx];
-        let sink_max = max(row_max, sink_score);
-        let sink_scale = exp(row_max - sink_max);
-        for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) {
-            out_regs[reg_idx] *= sink_scale;
-        }
-        exp_sum = exp_sum * sink_scale + exp(sink_score - sink_max);
-        row_max = sink_max;
-    }
-#endif
-
-    if (row_active) {
-        let inv_exp_sum = select(0.0, 1.0 / exp_sum, exp_sum != 0.0);
-        let row_base = dst_global_offset + subgroup_id * dst2_stride;
-        let out_slots = min(OUT_REGS_PER_LANE, (V_CHUNKS + subgroup_size - 1u) / subgroup_size);
-        for (var reg_idx = 0u; reg_idx < out_slots; reg_idx += 1u) {
-            let chunk = sg_inv_id + reg_idx * subgroup_size;
-            if (chunk >= V_CHUNKS) {
-                continue;
-            }
-            let dst_vec_index = (row_base + chunk * 4u) >> 2u;
-            dst[dst_vec_index] = out_regs[reg_idx] * inv_exp_sum;
-        }
-    }
-}
@@ -15,7 +15,7 @@ struct Params {
    nblk1: u32,
 };

-@group(0) @binding(0) var<storage, read_write> mask: array<f16>;
+@group(0) @binding(0) var<storage, read> mask: array<f16>;
@group(0) @binding(1) var<storage, read_write> blk: array<u32>;
@group(0) @binding(2) var<uniform> params: Params;

@@ -1,6 +1,8 @@
+diagnostic(off, chromium.subgroup_matrix_uniformity);
 diagnostic(off, subgroup_uniformity);
 enable f16;
 enable subgroups;
+enable chromium_experimental_subgroup_matrix;

 #ifdef KV_F32
 #define KV_TYPE f32
@@ -11,14 +13,19 @@ enable subgroups;
 #define HEAD_DIM_QK 64
 #define HEAD_DIM_V 64

-#define KV_GRANULARITY 8
+
+#define SG_MAT_M 8
+#define SG_MAT_N 8
+#define SG_MAT_K 8
+
+#define Q_TILE SG_MAT_M
 #define KV_TILE 16
 #define WG_SIZE 64
 #ifndef VEC_NE
 #define VEC_NE 4u
 #endif

-#define KV_BLOCKS (KV_TILE / KV_GRANULARITY)
+#define KV_BLOCKS (KV_TILE / SG_MAT_N)

 #define BLOCK_SIZE 32
 #define BLOCKS_K ((HEAD_DIM_QK + BLOCK_SIZE - 1) / BLOCK_SIZE)
@@ -90,14 +97,6 @@ struct Params {
 };

@group(0) @binding(0) var<storage, read_write> Q: array<f32>;
-#ifdef KV_OVERLAP
-#if defined(KV_Q4_0) || defined(KV_Q8_0)
-@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
-#else
-@group(0) @binding(1) var<storage, read_write> K: array<vec4<KV_TYPE>>;
-#endif
-#define V K
-#else
 #if defined(KV_Q4_0) || defined(KV_Q8_0)
@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
 #else
@@ -108,22 +107,7 @@ struct Params {
 #else
@group(0) @binding(2) var<storage, read_write> V: array<vec4<KV_TYPE>>;
 #endif
-#endif
 #if defined(MASK) && defined(SINKS)
-#ifdef KV_OVERLAP
-@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
-@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
-#ifdef BLK
-#define BLK_BINDING 4
-#define TMP_BINDING 5
-#define DST_BINDING 6
-#define PARAMS_BINDING 7
-#else
-#define TMP_BINDING 4
-#define DST_BINDING 5
-#define PARAMS_BINDING 6
-#endif
-#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
 #ifdef BLK
@@ -136,21 +120,7 @@ struct Params {
 #define DST_BINDING 6
 #define PARAMS_BINDING 7
 #endif
-#endif
 #elif defined(MASK)
-#ifdef KV_OVERLAP
-@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
-#ifdef BLK
-#define BLK_BINDING 3
-#define TMP_BINDING 4
-#define DST_BINDING 5
-#define PARAMS_BINDING 6
-#else
-#define TMP_BINDING 3
-#define DST_BINDING 4
-#define PARAMS_BINDING 5
-#endif
-#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
 #ifdef BLK
 #define BLK_BINDING 4
@@ -162,30 +132,16 @@ struct Params {
 #define DST_BINDING 5
 #define PARAMS_BINDING 6
 #endif
-#endif
 #elif defined(SINKS)
-#ifdef KV_OVERLAP
-@group(0) @binding(2) var<storage, read_write> sinks: array<f32>;
-#define TMP_BINDING 3
-#define DST_BINDING 4
-#define PARAMS_BINDING 5
-#else
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
 #define TMP_BINDING 4
 #define DST_BINDING 5
 #define PARAMS_BINDING 6
-#endif
-#else
-#ifdef KV_OVERLAP
-#define TMP_BINDING 2
-#define DST_BINDING 3
-#define PARAMS_BINDING 4
 #else
 #define TMP_BINDING 3
 #define DST_BINDING 4
 #define PARAMS_BINDING 5
 #endif
-#endif

 #ifdef BLK
@group(0) @binding(BLK_BINDING) var<storage, read_write> blk: array<u32>;
@@ -197,7 +153,7 @@ struct Params {
 // Just a very small float value.
 const FLOAT_MIN: f32 = -1.0e9;

-var<workgroup> q_shmem: array<f16, HEAD_DIM_QK>;
+var<workgroup> q_shmem: array<f16, Q_TILE * HEAD_DIM_QK>;

 #ifndef KV_DIRECT
 const kv_shmem_size = KV_TILE * max(HEAD_DIM_QK, HEAD_DIM_V);
@@ -205,27 +161,31 @@ const kv_shmem_size = KV_TILE * max(HEAD_DIM_QK, HEAD_DIM_V);
 var<workgroup> kv_shmem: array<f16, kv_shmem_size>;
 #endif

-var<workgroup> o_shmem: array<f16, HEAD_DIM_V>;
+var<workgroup> o_shmem: array<f16, Q_TILE * HEAD_DIM_V>;

 #ifdef MASK
 // storage for mask values
-var<workgroup> mask_shmem: array<f16, KV_TILE>;
+var<workgroup> mask_shmem: array<f16, Q_TILE * KV_TILE>;
 #endif

 // note that we reuse the same storage for both since we only need one at a time
-var<workgroup> inter_shmem: array<f16, KV_TILE>;
+var<workgroup> inter_shmem: array<f16, Q_TILE * KV_TILE>;

 // Storage for row max and exp sum during online softmax
-fn calc_softmax_term(kv_idx: u32, slope: f32, has_bias: bool, apply_mask: bool) -> f32 {
+var<workgroup> row_max_shmem: array<f32, Q_TILE>;
+var<workgroup> exp_sum_shmem: array<f32, Q_TILE>;
+var<workgroup> blk_state_wg: u32;
+
+fn calc_softmax_term(kv_idx: u32, q_tile_row: u32, slope: f32, has_bias: bool, apply_mask: bool) -> f32 {
    var v = select(FLOAT_MIN,
-                   f32(inter_shmem[kv_idx]) * params.scale,
+                   f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]) * params.scale,
                   kv_idx < KV_TILE);
 #ifdef LOGIT_SOFTCAP
    v = params.logit_softcap * tanh(v);
 #endif
 #ifdef MASK
    if (apply_mask) {
-        var mask_val = select(0.0, f32(mask_shmem[kv_idx]), kv_idx < KV_TILE);
+        var mask_val = select(0.0,f32(mask_shmem[q_tile_row * KV_TILE + kv_idx]), kv_idx < KV_TILE);
        v += select(mask_val, slope * mask_val, has_bias);
    }
 #endif
@@ -239,17 +199,19 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
    @builtin(subgroup_size) subgroup_size: u32,
    @builtin(num_subgroups) num_subgroups: u32,
    @builtin(subgroup_invocation_id) sg_inv_id: u32) {
-    // Vec path processes exactly one query row per workgroup, so subgroup 0 can
-    // keep the running softmax state in private storage.
-    var row_max = FLOAT_MIN;
-    var exp_sum = 0.0;

-    for (var i = local_id.x; i < HEAD_DIM_V; i += WG_SIZE) {
+    // initialize row max for online softmax
+    for (var i = local_id.x; i < Q_TILE; i += WG_SIZE) {
+        row_max_shmem[i] = FLOAT_MIN;
+        exp_sum_shmem[i] = 0.0;
+    }
+
+    for (var i = local_id.x; i < Q_TILE * HEAD_DIM_V; i += WG_SIZE) {
        o_shmem[i] = 0.0;
    }

    // workgroups per head/batch
-    let wg_per_head = params.seq_len_q;
+    let wg_per_head = (params.seq_len_q + Q_TILE - 1u) / Q_TILE;
    let wg_per_batch = wg_per_head * params.n_heads;

    let dst2_stride = HEAD_DIM_V * params.n_heads;
@@ -273,9 +235,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
    let k_head_offset = k_batch_offset + k_head_idx * params.stride_k2;
    let v_head_offset = v_batch_offset + v_head_idx * params.stride_v2;

-    // Vec path handles one Q row per workgroup.
+    // starting Q row for this workgroup
    let wg_in_head = wg_in_batch % wg_per_head;
-    let q_row_start = wg_in_head;
+    let q_row_start = wg_in_head * Q_TILE;

 #ifdef MASK
    // mask offset
@@ -286,18 +248,21 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
    let has_bias = params.max_bias > 0.0;
    let slope = select(1.0, select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), pow(params.m0, head + 1.0), head < params.n_head_log2), has_bias);

-    // load the single Q row into shared memory
-    for (var elem_idx = local_id.x; elem_idx < HEAD_DIM_QK; elem_idx += WG_SIZE) {
-        let global_q_row_offset = q_head_offset + q_row_start * params.stride_q1;
+    // load q tile into shared memory
+    for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
+        let q_row = elem_idx / HEAD_DIM_QK;
+        let q_col = elem_idx % HEAD_DIM_QK;
+        let head_q_row = q_row_start + q_row;
+        let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
        q_shmem[elem_idx] = f16(select(
            0.0,
-            Q[global_q_row_offset + elem_idx],
-            q_row_start < params.seq_len_q));
+            Q[global_q_row_offset + q_col],
+            head_q_row < params.seq_len_q && q_col < HEAD_DIM_QK));
    }

    for (var kv_tile = iwg * KV_TILE; kv_tile < params.seq_len_kv; kv_tile += KV_TILE * params.nwg) {
 #ifdef BLK
-        let q_blk = q_row_start;
+        let q_blk = q_row_start / Q_TILE;
        let kv_blk = kv_tile / KV_TILE;
        let blk_batch = select(0u, batch_idx, params.stride_mask3 > 0u);
        let blk_idx = params.blk_base + (blk_batch * params.blk_nblk1 + q_blk) * params.blk_nblk0 + kv_blk;
@@ -305,9 +270,13 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
 #else
        let blk_state_local = 1u;
 #endif
-        let blk_state = blk_state_local;
+        if (local_id.x == 0u) {
+            blk_state_wg = blk_state_local;
+        }
+        workgroupBarrier();
+        let blk_state = blk_state_wg;
        let skip_tile = blk_state == 0u;
-        for (var elem_idx = local_id.x; elem_idx < KV_TILE; elem_idx += WG_SIZE) {
+        for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
            inter_shmem[elem_idx] = f16(0.0);
        }

@@ -391,14 +360,20 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
        let num_of_threads = subgroup_size / VEC_NE;
        let tx = sg_inv_id % num_of_threads;
        let ty = sg_inv_id / num_of_threads;
-          if (subgroup_id == 0u && q_row_start < params.seq_len_q) {
+          for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) {
+              let global_q_row = q_row_start + q_tile_row;
+              if (global_q_row >= params.seq_len_q) {
+                  continue;
+              }
+              let local_q_row_offset = q_tile_row * HEAD_DIM_QK;
+
              for (var kv_base : u32 = 0u; kv_base < KV_TILE; kv_base += VEC_NE) {
                  let kv_idx = kv_base + ty;
                  var partial_sum: f32 = 0.0;
                  let kv_valid = kv_idx < KV_TILE && (kv_tile + kv_idx) < params.seq_len_kv;
                  if (kv_valid) {
                    for (var i = tx; i < (HEAD_DIM_QK / 4u); i += num_of_threads) {
-                        let q_off = i * 4u;
+                        let q_off = local_q_row_offset + i * 4u;

                        let qv = vec4<f32>(
                            f32(q_shmem[q_off + 0u]),
@@ -435,7 +410,8 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,

                  let sum_bcast = subgroupShuffle(sum, num_of_threads * ty);
                  if (tx == 0u && kv_valid) {
-                      inter_shmem[kv_idx] = f16(sum_bcast);
+                      let dst_idx = q_tile_row * KV_TILE + kv_idx;
+                      inter_shmem[dst_idx] = f16(sum_bcast);
                  }
              }
          }
@@ -446,10 +422,13 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
      let apply_mask = !skip_tile && (blk_state != 2u);
      if (apply_mask) {
          // load mask tile into shared memory for this KV block
-          for (var elem_idx = local_id.x; elem_idx < KV_TILE; elem_idx += WG_SIZE) {
-              let global_k_col = kv_tile + elem_idx;
-              let mask_in_bounds = q_row_start < params.seq_len_q && global_k_col < params.seq_len_kv;
-              let mask_idx = mask_global_offset + global_k_col;
+          for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
+              let mask_row = elem_idx / KV_TILE;
+              let mask_col = elem_idx % KV_TILE;
+              let global_q_row = q_row_start + mask_row;
+              let global_k_col = kv_tile + mask_col;
+              let mask_in_bounds = global_q_row < params.seq_len_q && global_k_col < params.seq_len_kv;
+              let mask_idx = mask_global_offset + mask_row * params.seq_len_kv + global_k_col;
              mask_shmem[elem_idx] = select(0.0, mask[mask_idx], mask_in_bounds);
          }
      }
@@ -460,40 +439,50 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
      workgroupBarrier();

      // online softmax
-      if (!skip_tile && subgroup_id == 0u && q_row_start < params.seq_len_q) {
-          var prev_max = row_max;
-          var final_max = prev_max;
-          // pass 1: compute final max across the full KV tile in chunks
-          for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
-              let kv_idx = kv_offset + sg_inv_id;
-              let kv_valid = kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE;
-              let softmax_term = select(FLOAT_MIN,
-                                        calc_softmax_term(kv_idx, slope, has_bias, apply_mask),
-                                        kv_valid);
-              final_max = subgroupMax(max(final_max, softmax_term));
-          }
-
-          var total_exp_term: f32 = 0.0;
-          // pass 2: compute exp sum and write P using final_max
-          for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
-              let kv_idx = kv_offset + sg_inv_id;
-              let softmax_term = calc_softmax_term(kv_idx, slope, has_bias, apply_mask);
-              let cur_p = select(0.0,
-                                 exp(softmax_term - final_max),
-                                 kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE);
-              total_exp_term += subgroupAdd(cur_p);
-              if (kv_idx < KV_TILE) {
-                  inter_shmem[kv_idx] = f16(cur_p);
+      if (!skip_tile) {
+          for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) {
+              let global_q_row = q_row_start + q_tile_row;
+              if (global_q_row >= params.seq_len_q) {
+                  break;
              }
-          }

-          let cur_exp = exp(prev_max - final_max);
+              var prev_max = row_max_shmem[q_tile_row];
+              var final_max = prev_max;
+              // pass 1: compute final max across the full KV tile in chunks
+              for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
+                  let kv_idx = kv_offset + sg_inv_id;
+                  let kv_valid = kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE;
+                  let softmax_term = select(FLOAT_MIN,
+                                            calc_softmax_term(kv_idx, q_tile_row, slope, has_bias, apply_mask),
+                                            kv_valid);
+                  final_max = subgroupMax(max(final_max, softmax_term));
+              }

-          row_max = final_max;
-          exp_sum = exp_sum * cur_exp + total_exp_term;
+              var total_exp_term: f32 = 0.0;
+              // pass 2: compute exp sum and write P using final_max
+              for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
+                  let kv_idx = kv_offset + sg_inv_id;
+                  let softmax_term = calc_softmax_term(kv_idx, q_tile_row, slope, has_bias, apply_mask);
+                  let cur_p = select(0.0,
+                                     exp(softmax_term - final_max),
+                                     kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE);
+                  total_exp_term += subgroupAdd(cur_p);
+                  if (kv_idx < KV_TILE) {
+                      inter_shmem[kv_idx + q_tile_row * KV_TILE] = f16(cur_p);
+                  }
+              }

-          for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
-              o_shmem[elem_idx] = f16(f32(o_shmem[elem_idx]) * cur_exp);
+              let cur_exp = exp(prev_max - final_max);
+
+              if (sg_inv_id == 0) {
+                  row_max_shmem[q_tile_row] = final_max;
+                  exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * cur_exp + total_exp_term;
+              }
+
+              for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
+                  let idx = q_tile_row * HEAD_DIM_V + elem_idx;
+                  o_shmem[idx] = f16(f32(o_shmem[idx]) * cur_exp);
+              }
          }
      }

@@ -573,13 +562,15 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
      workgroupBarrier();

      if (!skip_tile) {
-          // we have P (KV_TILE) in inter_shmem and V (KV_TILE x head_dim_v) in kv_shmem
+          // we have P (Q_TILE x KV_TILE) in inter_shmem and V (KV_TILE x head_dim_v) in kv_shmem
          // we want to compute O += P * V across the full KV tile
          let ne_threads : u32 = VEC_NE;
          let nl_threads = max(1u, subgroup_size / ne_threads);
          let tx_pv = sg_inv_id % nl_threads;
          let ty_pv = sg_inv_id / nl_threads;
-          if (subgroup_id == 0u && q_row_start < params.seq_len_q) {
+          for (var q_tile_row = subgroup_id;
+               q_tile_row < Q_TILE;
+               q_tile_row += num_subgroups) {
              for (var vec_col = tx_pv; vec_col < (HEAD_DIM_V / 4u); vec_col += nl_threads) {
                  var lo = vec4<f32>(0.0, 0.0, 0.0, 0.0);
                  for (var cc = 0u; cc < KV_TILE / ne_threads; cc += 1u) {
@@ -589,7 +580,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
                          continue;
                      }

-                      let p = f32(inter_shmem[kv_idx]);
+                      let p = f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]);
 #ifdef KV_DIRECT
                      let v_idx = v_head_offset + v_row * params.stride_v1 + vec_col * 4u;
                      let v4 = vec4<f32>(V[v_idx >> 2u]);
@@ -630,10 +621,11 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,

                  if (ty_pv == 0u) {
                      let elem_base = vec_col * 4u;
-                      o_shmem[elem_base + 0u] = f16(f32(o_shmem[elem_base + 0u]) + lo_x);
-                      o_shmem[elem_base + 1u] = f16(f32(o_shmem[elem_base + 1u]) + lo_y);
-                      o_shmem[elem_base + 2u] = f16(f32(o_shmem[elem_base + 2u]) + lo_z);
-                      o_shmem[elem_base + 3u] = f16(f32(o_shmem[elem_base + 3u]) + lo_w);
+                      let o_base_idx = q_tile_row * HEAD_DIM_V + elem_base;
+                      o_shmem[o_base_idx + 0u] = f16(f32(o_shmem[o_base_idx + 0u]) + lo_x);
+                      o_shmem[o_base_idx + 1u] = f16(f32(o_shmem[o_base_idx + 1u]) + lo_y);
+                      o_shmem[o_base_idx + 2u] = f16(f32(o_shmem[o_base_idx + 2u]) + lo_z);
+                      o_shmem[o_base_idx + 3u] = f16(f32(o_shmem[o_base_idx + 3u]) + lo_w);
                  }
              }
          }
@@ -645,46 +637,70 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,

 #ifdef SINKS
    // Sinks are global terms and must be applied exactly once across split workgroups.
-    if (iwg == 0u && subgroup_id == 0u && q_row_start < params.seq_len_q) {
-        var prev_max = row_max;
+    if (iwg == 0u) {
+        for (var q_tile_row = subgroup_id;
+             q_tile_row < Q_TILE;
+             q_tile_row += num_subgroups) {
+                let global_q_row = q_row_start + q_tile_row;
+                if (global_q_row >= params.seq_len_q) {
+                    break;
+                }

-        // for non-sink threads, exp(FLOAT_MIN) effectively zeroes out their contribution to the sum
-        let sink_val = select(FLOAT_MIN, sinks[params.offset_sinks + head_idx], sg_inv_id == 0u);
-        let new_max = subgroupMax(max(prev_max, sink_val));
-        let max_exp = exp(prev_max - new_max);
-        let sink_exp = exp(sink_val - new_max);
+                var prev_max = row_max_shmem[q_tile_row];

-        let sink_exp_sum = subgroupAdd(sink_exp);
+                // for non-sink threads, exp(FLOAT_MIN) effectively zeroes out their contribution to the sum
+                let sink_val = select(FLOAT_MIN, sinks[params.offset_sinks + head_idx], sg_inv_id == 0);
+                let new_max = subgroupMax(max(prev_max, sink_val));
+                let max_exp = exp(prev_max - new_max);
+                let sink_exp = exp(sink_val - new_max);

-        row_max = new_max;
-        exp_sum = exp_sum * max_exp + sink_exp_sum;
+                let sink_exp_sum = subgroupAdd(sink_exp);

-        for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
-            o_shmem[elem_idx] = f16(f32(o_shmem[elem_idx]) * max_exp);
+                if (sg_inv_id == 0) {
+                    row_max_shmem[q_tile_row] = new_max;
+                    exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * max_exp + sink_exp_sum;
+                }
+
+            for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
+                let idx = q_tile_row * HEAD_DIM_V + elem_idx;
+                o_shmem[idx] = f16(f32(o_shmem[idx]) * max_exp);
+            }
        }
+        workgroupBarrier();
    }
-    workgroupBarrier();
 #endif
    let rows_per_batch = params.n_heads * params.seq_len_q;
-    if (subgroup_id == 0u && q_row_start < params.seq_len_q) {
+    for (var q_tile_row = subgroup_id;
+         q_tile_row < Q_TILE;
+         q_tile_row += num_subgroups) {
+
+        let global_q_row = q_row_start + q_tile_row;
+        if (global_q_row >= params.seq_len_q) { break; }
+
        if (params.nwg == 1u) {
+            let exp_sum = exp_sum_shmem[q_tile_row];
            let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0.0);
-            let row_base: u32 = params.offset_dst + batch_idx * dst3_stride + q_row_start * dst2_stride +
-                                head_idx * HEAD_DIM_V;
+            let row_base: u32 =
+                params.offset_dst + batch_idx * dst3_stride + global_q_row * dst2_stride + head_idx * HEAD_DIM_V;

            for (var elem_base = sg_inv_id * 4u; elem_base < HEAD_DIM_V; elem_base += subgroup_size * 4u) {
+                let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u);
+                let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u);
+                let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u);
+                let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u);
+
                let v = vec4<f32>(
-                    f32(o_shmem[elem_base + 0u]) * scale,
-                    f32(o_shmem[elem_base + 1u]) * scale,
-                    f32(o_shmem[elem_base + 2u]) * scale,
-                    f32(o_shmem[elem_base + 3u]) * scale
+                    f32(o_shmem[i0]) * scale,
+                    f32(o_shmem[i1]) * scale,
+                    f32(o_shmem[i2]) * scale,
+                    f32(o_shmem[i3]) * scale
                );

                let dst_vec_index: u32 = (row_base + elem_base) >> 2u;
                dst[dst_vec_index] = v;
            }
        } else {
-            let rid = batch_idx * rows_per_batch + head_idx * params.seq_len_q + q_row_start;
+            let rid = batch_idx * rows_per_batch + head_idx * params.seq_len_q + global_q_row;
            let tmp_row_data_base = params.tmp_data_base + rid * (HEAD_DIM_V * params.nwg) + iwg * HEAD_DIM_V;
            let tmp_row_stats_base = params.tmp_stats_base + rid * (2u * params.nwg) + 2u * iwg;

@@ -692,16 +708,21 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
                elem_base < HEAD_DIM_V;
                elem_base += subgroup_size * 4u) {

+                let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u);
+                let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u);
+                let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u);
+                let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u);
+
                let tbase = tmp_row_data_base + elem_base;
-                tmp[tbase + 0u] = f32(o_shmem[elem_base + 0u]);
-                tmp[tbase + 1u] = f32(o_shmem[elem_base + 1u]);
-                tmp[tbase + 2u] = f32(o_shmem[elem_base + 2u]);
-                tmp[tbase + 3u] = f32(o_shmem[elem_base + 3u]);
+                tmp[tbase + 0u] = f32(o_shmem[i0]);
+                tmp[tbase + 1u] = f32(o_shmem[i1]);
+                tmp[tbase + 2u] = f32(o_shmem[i2]);
+                tmp[tbase + 3u] = f32(o_shmem[i3]);
            }

            if (sg_inv_id == 0u) {
-                tmp[tmp_row_stats_base + 0u] = exp_sum;
-                tmp[tmp_row_stats_base + 1u] = row_max;
+                tmp[tmp_row_stats_base + 0u] = exp_sum_shmem[q_tile_row];
+                tmp[tmp_row_stats_base + 1u] = row_max_shmem[q_tile_row];
            }
        }
    }
@@ -1,101 +0,0 @@
-#include "common_decls.tmpl"
-enable f16;
-
-@group(0) @binding(0)
-#if defined(INPUT_F32)
-var<storage, read_write> input: array<f32>;
-#elif defined(INPUT_F16)
-var<storage, read_write> input: array<f16>;
-#endif
-
-@group(0) @binding(1)
-#if defined(OUTPUT_F32)
-var<storage, read_write> output: array<f32>;
-#elif defined(OUTPUT_F16)
-var<storage, read_write> output: array<f16>;
-#endif
-
-struct Params {
-    offset_i: u32,
-    offset_o: u32,
-
-    // element strides
-    si0: u32, si1: u32, si2: u32, si3: u32,
-    so0: u32, so1: u32, so2: u32, so3: u32,
-
-    KW: u32, KH: u32, IC: u32,
-    IW: u32, IH: u32, N: u32,
-    OW: u32, OH: u32,
-
-    // stride
-    s0: u32, s1: u32,
-    // padding
-    p0: u32, p1: u32,
-    // dilation
-    d0: u32, d1: u32,
-}
-
-@group(0) @binding(2)
-var<uniform> params: Params;
-
-fn load_input(idx: u32) -> f32 {
-    #if defined(INPUT_F32)
-        return input[idx];
-    #elif defined(INPUT_F16)
-        return f32(input[idx]);
-    #endif
-}
-
-fn store_output(idx: u32, val: f32) {
-    #if defined(OUTPUT_F32)
-        output[idx] = val;
-    #elif defined(OUTPUT_F16)
-        output[idx] = f16(val);
-    #endif
-}
-
-@compute @workgroup_size(WG_SIZE)
-fn main(
-    @builtin(global_invocation_id) gid: vec3<u32>,
-    @builtin(num_workgroups) num_wg: vec3<u32>
-) {
-
-    let threads_per_group = u32(WG_SIZE);
-    let i_out = gid.x + (num_wg.x * threads_per_group) * gid.y;
-    let K = params.KW * params.KH * params.IC;
-    let M = params.OW * params.OH;
-    let total = K * M * params.N;
-
-    if (i_out >= total) {
-        return;
-    }
-
-    // decode (k, m, n)
-    var i = i_out;
-    let n = i / (K * M);
-    i = i % (K * M);
-    let m = i / K;
-    let k = i % K;
-
-    // decode (oh, ow)
-    let oh = m / params.OW;
-    let ow = m % params.OW;
-
-    // decode (kw, kh, ic)
-    let kw = k % params.KW;
-    let tmp = k / params.KW;
-    let kh = tmp % params.KH;
-    let ic = tmp / params.KH;
-
-    let iw_i32 = i32(ow * params.s0 + kw * params.d0) - i32(params.p0);
-    let ih_i32 = i32(oh * params.s1 + kh * params.d1) - i32(params.p1);
-
-    if (iw_i32 >= 0 && iw_i32 < i32(params.IW) && ih_i32 >= 0 && ih_i32 < i32(params.IH)) {
-        let iw = u32(iw_i32);
-        let ih = u32(ih_i32);
-        let in_idx = params.offset_i + iw * params.si0 + ih * params.si1 + ic * params.si2 + n * params.si3;
-        store_output(params.offset_o + k * params.so0 + ow * params.so1 + oh * params.so2 + n * params.so3, load_input(in_idx));
-    } else {
-        store_output(params.offset_o + k * params.so0 + ow * params.so1 + oh * params.so2 + n * params.so3, 0.0);
-    }
-}
@@ -1,154 +0,0 @@
-#ifdef OVERLAP
-
-@group(0) @binding(0)
-var<storage, read_write> rn_src: array<f32>;
-
-@group(0) @binding(1)
-var<storage, read_write> mul_src: array<f32>;
-
-@group(0) @binding(2)
-var<uniform> params: Params;
-
-fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
-    mul_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
-}
-
-#elif INPLACE
-
-@group(0) @binding(0)
-var<storage, read_write> rn_src: array<f32>;
-
-@group(0) @binding(1)
-var<storage, read_write> mul_src: array<f32>;
-
-@group(0) @binding(2)
-var<uniform> params: Params;
-
-fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
-    rn_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
-}
-
-#elif SRC_OVERLAP
-
-@group(0) @binding(0)
-var<storage, read_write> merged_src: array<f32>;
-
-@group(0) @binding(1)
-var<storage, read_write> dst: array<f32>;
-
-@group(0) @binding(2)
-var<uniform> params: Params;
-
-fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
-    dst[dst_offset] = scale * merged_src[rn_src_offset] * merged_src[mul_src_offset];
-}
-
-#else
-
-@group(0) @binding(0)
-var<storage, read_write> rn_src: array<f32>;
-
-@group(0) @binding(1)
-var<storage, read_write> mul_src: array<f32>;
-
-@group(0) @binding(2)
-var<storage, read_write> dst: array<f32>;
-
-@group(0) @binding(3)
-var<uniform> params: Params;
-
-fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
-    dst[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
-}
-
-#endif
-
-struct Params {
-    offset_rn_src: u32,
-    offset_mul_src: u32,
-    offset_merged_rn_src: u32,
-    offset_merged_mul_src: u32,
-    offset_dst: u32,
-
-    stride_rn_src1: u32,
-    stride_rn_src2: u32,
-    stride_rn_src3: u32,
-
-    stride_mul_src1: u32,
-    stride_mul_src2: u32,
-    stride_mul_src3: u32,
-
-    stride_dst1: u32,
-    stride_dst2: u32,
-    stride_dst3: u32,
-
-    mul_src_ne0: u32,
-    mul_src_ne1: u32,
-    mul_src_ne2: u32,
-    mul_src_ne3: u32,
-
-    ne0: u32,
-    ne1: u32,
-    ne2: u32,
-    ne3: u32,
-
-    eps: f32
-};
-
-var<workgroup> scratch: array<f32, WG_SIZE>;
-
-@compute @workgroup_size(WG_SIZE)
-fn main(@builtin(workgroup_id) wid: vec3<u32>,
-        @builtin(local_invocation_id) lid: vec3<u32>) {
-
-    // one thread per row
-    var i = wid.x;
-    let i3 = i / (params.ne2 * params.ne1);
-    i = i % (params.ne2 * params.ne1);
-    let i2 = i / params.ne1;
-    let i1 = i % params.ne1;
-    let i_rn_src_row = params.offset_rn_src + params.offset_merged_rn_src + i3 * params.stride_rn_src3 + i2 * params.stride_rn_src2 + i1 * params.stride_rn_src1;
-    let i_mul_src_row = params.offset_mul_src + params.offset_merged_mul_src + (i3 % params.mul_src_ne3) * params.stride_mul_src3 + (i2 % params.mul_src_ne2) * params.stride_mul_src2 + (i1 % params.mul_src_ne1) * params.stride_mul_src1;
-    let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;
-
-    let elems = (params.ne0 + WG_SIZE - 1) / WG_SIZE;
-
-    var sum = 0.0f;
-    var col = lid.x;
-    for (var j: u32 = 0; j < elems; j++) {
-        if (col >= params.ne0) {
-            break;
-        }
-#ifdef SRC_OVERLAP
-        sum += pow(merged_src[i_rn_src_row + col], 2.0);
-#else
-        sum += pow(rn_src[i_rn_src_row + col], 2.0);
-#endif
-        col += WG_SIZE;
-    }
-
-    scratch[lid.x] = sum;
-
-    workgroupBarrier();
-
-    var offset: u32 = WG_SIZE / 2;
-    while (offset > 0) {
-        if (lid.x < offset) {
-            scratch[lid.x] += scratch[lid.x + offset];
-        }
-        offset = offset / 2;
-        workgroupBarrier();
-    }
-    sum = scratch[0];
-
-    let scale = 1.0/sqrt(sum/f32(params.ne0) + params.eps);
-
-    col = lid.x;
-    for (var j: u32 = 0; j < elems; j++) {
-        if (col >= params.ne0) {
-            break;
-        }
-        update(i_rn_src_row + col, i_dst_row + col, scale, i_mul_src_row + col % params.mul_src_ne0);
-        col += WG_SIZE;
-    }
-}
@@ -7656,7 +7656,7 @@ size_t ggml_quantize_chunk(
               int64_t   nrows,
               int64_t   n_per_row,
           const float * imatrix) {
-    const int64_t n = nrows * n_per_row;
+    const int64_t n = (int64_t) nrows * n_per_row;

    if (ggml_quantize_requires_imatrix(type)) {
        GGML_ASSERT(imatrix != NULL);
@@ -7673,21 +7673,21 @@ size_t ggml_quantize_chunk(
    size_t result = 0;

    switch (type) {
-        case GGML_TYPE_Q1_0:    result = quantize_q1_0   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0:    result = quantize_q4_0   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_1:    result = quantize_q4_1   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q5_0:    result = quantize_q5_0   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q5_1:    result = quantize_q5_1   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q8_0:    result = quantize_q8_0   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_MXFP4:   result = quantize_mxfp4  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_NVFP4:   result = quantize_nvfp4  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q2_K:    result = quantize_q2_K   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q3_K:    result = quantize_q3_K   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_K:    result = quantize_q4_K   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q5_K:    result = quantize_q5_K   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q6_K:    result = quantize_q6_K   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_TQ1_0:   result = quantize_tq1_0  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_TQ2_0:   result = quantize_tq2_0  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q1_0:    result = quantize_q1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_0:    result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_1:    result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q5_0:    result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q5_1:    result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q8_0:    result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_MXFP4:   result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_NVFP4:   result = quantize_nvfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q2_K:    result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q3_K:    result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_K:    result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q5_K:    result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q6_K:    result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_TQ1_0:   result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_TQ2_0:   result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ2_XS:  result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
@@ -7752,9 +7752,9 @@ struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
 }

 bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
-    if (p0->n_threads  != p1->n_threads  ) return false;
-    if (p0->prio       != p1->prio       ) return false;
-    if (p0->poll       != p1->poll       ) return false;
-    if (p0->strict_cpu != p1->strict_cpu ) return false;
+    if (p0->n_threads      != p1->n_threads  )    return false;
+    if (p0->prio           != p1->prio       )    return false;
+    if (p0->poll           != p1->poll       )    return false;
+    if (p0->strict_cpu     != p1->strict_cpu )    return false;
    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
 }
@@ -197,7 +197,6 @@ class Keys:
        FREQ_BASE_SWA             = "{arch}.rope.freq_base_swa"
        SCALING_TYPE              = "{arch}.rope.scaling.type"
        SCALING_FACTOR            = "{arch}.rope.scaling.factor"
-        SCALING_ALPHA             = "{arch}.rope.scaling.alpha"
        SCALING_ATTN_FACTOR       = "{arch}.rope.scaling.attn_factor"
        SCALING_ORIG_CTX_LEN      = "{arch}.rope.scaling.original_context_length"
        SCALING_FINETUNED         = "{arch}.rope.scaling.finetuned"
@@ -472,7 +471,6 @@ class MODEL_ARCH(IntEnum):
    ERNIE4_5_MOE     = auto()
    HUNYUAN_MOE      = auto()
    HUNYUAN_DENSE    = auto()
-    HUNYUAN_VL       = auto()
    SMOLLM3          = auto()
    GPT_OSS          = auto()
    LFM2             = auto()
@@ -959,7 +957,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.FALCON_H1:        "falcon-h1",
    MODEL_ARCH.HUNYUAN_MOE:      "hunyuan-moe",
    MODEL_ARCH.HUNYUAN_DENSE:    "hunyuan-dense",
-    MODEL_ARCH.HUNYUAN_VL:       "hunyuan_vl",
    MODEL_ARCH.SMOLLM3:          "smollm3",
    MODEL_ARCH.GPT_OSS:          "gpt-oss",
    MODEL_ARCH.LFM2:             "lfm2",
@@ -3492,22 +3489,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
-    MODEL_ARCH.HUNYUAN_VL: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_Q_NORM,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_K_NORM,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-    ],
    MODEL_ARCH.SMOLLM3: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@@ -4157,7 +4138,6 @@ class VisionProjectorType:
    YOUTUVL = "youtuvl"
    NEMOTRON_V2_VL = "nemotron_v2_vl"
    HUNYUANOCR     = "hunyuanocr"
-    HUNYUANVL      = "hunyuanvl"


 # Items here are (block size, type size)
@@ -973,9 +973,6 @@ class GGUFWriter:
    def add_rope_scaling_factor(self, value: float) -> None:
        self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)

-    def add_rope_scaling_alpha(self, value: float) -> None:
-        self.add_float32(Keys.Rope.SCALING_ALPHA.format(arch=self.arch), value)
-
    def add_rope_scaling_attn_factors(self, value: float) -> None:
        self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)

@@ -511,6 +511,27 @@ extern "C" {
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

+    enum llama_params_fit_status {
+        LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occurred, e.g. because no model could be found at the specified path
+    };
+
+    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+    //   - returns LLAMA_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory
+    //   - this function is NOT thread safe because it modifies the global llama logger state
+    //   - only parameters that have the same value as in llama_default_model_params are modified
+    //     with the exception of the context size which is modified if and only if equal to 0
+    LLAMA_API enum llama_params_fit_status llama_params_fit(
+                                   const char   * path_model,
+                    struct llama_model_params   * mparams,
+                    struct llama_context_params * cparams,
+                                          float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
+        struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+                                         size_t * margins,               // margins of memory to leave per device in bytes
+                                       uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
+                            enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
+
    LLAMA_API int64_t llama_time_us(void);

    LLAMA_API size_t llama_max_devices(void);
@@ -1525,6 +1546,9 @@ extern "C" {
    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);

+    // print a breakdown of per-device memory use via LLAMA_LOG:
+    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
    //
    // training
    //
@@ -1,991 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test parallel tool-calling capability via chat completions endpoint.
-
-Only run this against models that actually support parallel tool calls — this
-script does not attempt to toggle that setting on the server. Each scenario is
-explicitly worded so that a capable model SHOULD emit multiple tool calls in a
-single assistant turn (either the same tool N times, or several different
-tools at once).
-
-Each test case contains:
-  - tools: list of tool definitions (OpenAI-compatible)
-  - messages: initial conversation messages
-  - mock_tool_responses: dict mapping tool_name -> callable(arguments) -> str (JSON)
-  - expected_parallel: dict describing what constitutes a successful parallel turn
-        {"min_parallel": int,                # minimum tool_calls in one turn
-         "require_same_tool": Optional[str], # all parallel calls must be this tool
-         "require_distinct_tools": Optional[int], # >= N distinct tool names in one turn
-         "min_distinct_args_key": Optional[str]}  # parallel calls must span this
-                                                   # many distinct values of this arg key
-  - validate: callable(turns, all_tool_calls, final_content) -> (passed, reason)
-"""
-
-import argparse
-import json
-import requests
-import sys
-
-# ---------------------------------------------------------------------------
-# Color / formatting helpers
-# ---------------------------------------------------------------------------
-
-RESET = "\x1b[0m"
-BOLD = "\x1b[1m"
-DIM = "\x1b[2m"
-CYAN = "\x1b[36m"
-YELLOW = "\x1b[33m"
-GREEN = "\x1b[32m"
-RED = "\x1b[31m"
-BLUE = "\x1b[34m"
-WHITE = "\x1b[97m"
-MAGENTA = "\x1b[35m"
-
-
-def _print(text="", end="\n"):
-    sys.stdout.write(text + end)
-    sys.stdout.flush()
-
-
-def print_header(title):
-    bar = "─" * 60
-    _print(f"\n{BOLD}{CYAN}┌{bar}┐{RESET}")
-    _print(
-        f"{BOLD}{CYAN}│  {WHITE}{title}{CYAN}{' ' * max(0, 58 - len(title))}│{RESET}"
-    )
-    _print(f"{BOLD}{CYAN}└{bar}┘{RESET}")
-
-
-def print_turn_banner(turn_idx, n_calls):
-    color = MAGENTA if n_calls >= 2 else DIM
-    _print(f"\n  {BOLD}{color}▶ turn {turn_idx} — {n_calls} tool call(s){RESET}")
-
-
-def print_tool_call(name, args):
-    args_str = json.dumps(args)
-    _print(
-        f"    {BOLD}{YELLOW}⚙ {name}{RESET}{DIM}({args_str}){RESET}"
-    )
-
-
-def print_tool_result(result):
-    preview = result[:140] + ("…" if len(result) > 140 else "")
-    _print(f"      {DIM}{BLUE}↳ {preview}{RESET}")
-
-
-def print_model_output(text):
-    sys.stdout.write(text)
-    sys.stdout.flush()
-
-
-def print_pass(reason):
-    _print(f"\n{BOLD}{GREEN}✔ PASS{RESET}  {reason}")
-
-
-def print_fail(reason):
-    _print(f"\n{BOLD}{RED}✘ FAIL{RESET}  {reason}")
-
-
-def print_info(msg):
-    _print(f"{DIM}{msg}{RESET}")
-
-
-def print_warn(msg):
-    _print(f"{BOLD}{YELLOW}⚠ {msg}{RESET}")
-
-
-# ---------------------------------------------------------------------------
-# HTTP helpers
-# ---------------------------------------------------------------------------
-
-
-def chat_completion(url, messages, tools=None, stream=False):
-    payload = {
-        "messages": messages,
-        "stream": stream,
-        "max_tokens": 4096,
-    }
-    if tools:
-        payload["tools"] = tools
-        payload["tool_choice"] = "auto"
-
-    try:
-        response = requests.post(url, json=payload, stream=stream)
-        response.raise_for_status()
-    except requests.exceptions.RequestException as e:
-        body = e.response.content if (e.response is not None) else b""
-        print_fail(f"Request error: {e} | body: {body}")
-        return None
-
-    full_content = ""
-    reasoning_content = ""
-    tool_calls: list[dict] = []
-
-    if stream:
-        for line in response.iter_lines():
-            if not line:
-                continue
-            decoded = line.decode("utf-8")
-            if not decoded.startswith("data: "):
-                continue
-            data_str = decoded[6:]
-            if data_str == "[DONE]":
-                break
-            try:
-                data = json.loads(data_str)
-            except json.JSONDecodeError:
-                continue
-            choices = data.get("choices", [])
-            if not choices:
-                continue
-            delta = choices[0].get("delta", {})
-            if delta.get("reasoning_content"):
-                reasoning_content += delta["reasoning_content"]
-            if delta.get("content"):
-                full_content += delta["content"]
-                print_model_output(delta["content"])
-            for tc in delta.get("tool_calls", []):
-                idx = tc.get("index", 0)
-                while len(tool_calls) <= idx:
-                    tool_calls.append(
-                        {
-                            "id": "",
-                            "type": "function",
-                            "function": {"name": "", "arguments": ""},
-                        }
-                    )
-                if "id" in tc:
-                    tool_calls[idx]["id"] += tc["id"]
-                if "function" in tc:
-                    if "name" in tc["function"]:
-                        tool_calls[idx]["function"]["name"] += tc["function"]["name"]
-                    if "arguments" in tc["function"]:
-                        tool_calls[idx]["function"]["arguments"] += tc["function"][
-                            "arguments"
-                        ]
-    else:
-        data = response.json()
-        choices = data.get("choices", [])
-        if choices:
-            msg = choices[0].get("message", {})
-            full_content = msg.get("content") or ""
-            reasoning_content = msg.get("reasoning_content") or ""
-            tool_calls = msg.get("tool_calls") or []
-            if full_content:
-                print_model_output(full_content)
-
-    result = {"content": full_content, "tool_calls": tool_calls}
-    if reasoning_content:
-        result["reasoning_content"] = reasoning_content
-    return result
-
-
-def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6):
-    """
-    Drive the multi-turn tool-call loop, but record each turn's tool calls
-    separately so parallelism can be validated.
-
-    Returns (turns, all_tool_calls, final_content) where `turns` is a list
-    of dicts: {"index": int, "tool_calls": [...], "content": str}.
-    """
-    msgs = list(messages)
-    turns: list[dict] = []
-    all_tool_calls: list[dict] = []
-
-    for turn_idx in range(max_turns):
-        result = chat_completion(url, msgs, tools=tools, stream=stream)
-        if result is None:
-            return turns, all_tool_calls, None
-
-        tcs = result.get("tool_calls") or []
-        content = result.get("content") or ""
-
-        turns.append(
-            {"index": turn_idx, "tool_calls": list(tcs), "content": content}
-        )
-
-        if not tcs:
-            if content:
-                _print(f"\n{DIM}{'·' * 60}{RESET}")
-                _print(f"{DIM}  model response:{RESET}\n")
-            return turns, all_tool_calls, content
-
-        print_turn_banner(turn_idx, len(tcs))
-        all_tool_calls.extend(tcs)
-
-        assistant_msg: dict = {
-            "role": "assistant",
-            "content": content,
-            "tool_calls": tcs,
-        }
-        reasoning = result.get("reasoning_content")
-        if reasoning:
-            assistant_msg["reasoning_content"] = reasoning
-        msgs.append(assistant_msg)
-
-        for tc in tcs:
-            tool_name = tc["function"]["name"]
-            try:
-                args = json.loads(tc["function"]["arguments"])
-            except json.JSONDecodeError:
-                args = {}
-
-            print_tool_call(tool_name, args)
-
-            mock_fn = mock_tool_responses.get(tool_name)
-            if mock_fn:
-                tool_result = mock_fn(args)
-            else:
-                tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"})
-
-            print_tool_result(tool_result)
-
-            msgs.append(
-                {
-                    "role": "tool",
-                    "tool_call_id": tc.get("id", ""),
-                    "content": tool_result,
-                }
-            )
-
-    return turns, all_tool_calls, None
-
-
-# ---------------------------------------------------------------------------
-# Parallelism helpers
-# ---------------------------------------------------------------------------
-
-
-def _best_parallel_turn(turns):
-    """Return the turn (dict) with the most tool calls, or None if no tools."""
-    tool_turns = [t for t in turns if t["tool_calls"]]
-    if not tool_turns:
-        return None
-    return max(tool_turns, key=lambda t: len(t["tool_calls"]))
-
-
-def _distinct_tool_names(turn):
-    return {tc["function"]["name"] for tc in turn["tool_calls"]}
-
-
-def _distinct_arg_values(turn, key):
-    values = set()
-    for tc in turn["tool_calls"]:
-        try:
-            args = json.loads(tc["function"]["arguments"])
-        except json.JSONDecodeError:
-            continue
-        v = args.get(key)
-        if v is not None:
-            if isinstance(v, str):
-                values.add(v.strip().lower())
-            else:
-                values.add(v)
-    return values
-
-
-def _check_parallel(turns, expected):
-    """
-    Check that at least one turn satisfies the parallel-call expectations.
-    Returns (ok, reason).
-    """
-    best = _best_parallel_turn(turns)
-    if best is None:
-        return False, "No tool calls were made at all"
-
-    min_parallel = expected.get("min_parallel", 2)
-    if len(best["tool_calls"]) < min_parallel:
-        by_turn = [len(t["tool_calls"]) for t in turns]
-        return False, (
-            f"No turn had >= {min_parallel} parallel tool calls "
-            f"(per-turn counts: {by_turn})"
-        )
-
-    require_same = expected.get("require_same_tool")
-    if require_same is not None:
-        names = [tc["function"]["name"] for tc in best["tool_calls"]]
-        if any(n != require_same for n in names):
-            return False, (
-                f"Parallel turn mixed tools; expected all {require_same!r}, got {names}"
-            )
-
-    require_distinct = expected.get("require_distinct_tools")
-    if require_distinct is not None:
-        distinct = _distinct_tool_names(best)
-        if len(distinct) < require_distinct:
-            return False, (
-                f"Parallel turn had only {len(distinct)} distinct tool names "
-                f"({distinct}); need >= {require_distinct}"
-            )
-
-    distinct_key = expected.get("min_distinct_args_key")
-    distinct_count = expected.get("min_distinct_args_count", min_parallel)
-    if distinct_key is not None:
-        values = _distinct_arg_values(best, distinct_key)
-        if len(values) < distinct_count:
-            return False, (
-                f"Parallel turn had only {len(values)} distinct {distinct_key!r} "
-                f"values ({values}); need >= {distinct_count}"
-            )
-
-    return True, (
-        f"Parallel turn had {len(best['tool_calls'])} calls across "
-        f"{len(_distinct_tool_names(best))} distinct tool(s)"
-    )
-
-
-# ---------------------------------------------------------------------------
-# Test case runner
-# ---------------------------------------------------------------------------
-
-
-def run_test(url, test_case, stream):
-    name = test_case["name"]
-    mode = f"{'stream' if stream else 'non-stream'}"
-    print_header(f"{name}  [{mode}]")
-
-    turns, all_tool_calls, final_content = run_agentic_loop(
-        url,
-        messages=test_case["messages"],
-        tools=test_case["tools"],
-        mock_tool_responses=test_case["mock_tool_responses"],
-        stream=stream,
-    )
-
-    if not turns:
-        print_fail("No response from server.")
-        return False
-
-    parallel_ok, parallel_reason = _check_parallel(turns, test_case["expected_parallel"])
-    if not parallel_ok:
-        print_fail(parallel_reason)
-        return False
-
-    passed, reason = test_case["validate"](turns, all_tool_calls, final_content)
-    if passed:
-        print_pass(f"{parallel_reason}; {reason}")
-    else:
-        print_fail(reason)
-    return passed
-
-
-# ---------------------------------------------------------------------------
-# Test case definitions
-# ---------------------------------------------------------------------------
-
-# ---- Test 1: Multi-file read (same tool, multiple distinct paths) ----
-
-_FILE_TOOLS = [
-    {
-        "type": "function",
-        "function": {
-            "name": "read_file",
-            "description": (
-                "Read the full contents of a file from the local filesystem. "
-                "Call this tool in parallel when asked to read several files — "
-                "each path needs its own call."
-            ),
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "path": {
-                        "type": "string",
-                        "description": "Absolute or repo-relative path to a file",
-                    },
-                },
-                "required": ["path"],
-            },
-        },
-    },
-]
-
-_FILE_CONTENTS = {
-    "config/database.yml": "host: db.internal\nport: 5432\nuser: svc_app\n",
-    "config/redis.yml":    "host: cache.internal\nport: 6379\ndb: 0\n",
-    "config/queue.yml":    "broker: rabbitmq.internal\nport: 5672\nvhost: prod\n",
-    "config/auth.yml":     "provider: oidc\nissuer: https://auth.internal\n",
-}
-
-
-def _read_file_mock(args):
-    path = args.get("path", "")
-    norm = path.lstrip("./").lstrip("/")
-    content = _FILE_CONTENTS.get(norm)
-    if content is None:
-        for k, v in _FILE_CONTENTS.items():
-            if path.endswith(k):
-                content = v
-                break
-    if content is None:
-        return json.dumps({"path": path, "error": "not found"})
-    return json.dumps({"path": path, "content": content})
-
-
-MULTIFILE_READ_TEST = {
-    "name": "Parallel multi-file read (same tool, 4 distinct paths)",
-    "tools": _FILE_TOOLS,
-    "messages": [
-        {
-            "role": "user",
-            "content": (
-                "Please read all four of these config files so I can review them "
-                "together: config/database.yml, config/redis.yml, config/queue.yml, "
-                "and config/auth.yml. Call read_file for every path in parallel in "
-                "a single batch — do NOT read them one by one sequentially across "
-                "turns. After you have all four, give me a one-line summary of each."
-            ),
-        }
-    ],
-    "mock_tool_responses": {"read_file": _read_file_mock},
-    "expected_parallel": {
-        "min_parallel": 4,
-        "require_same_tool": "read_file",
-        "min_distinct_args_key": "path",
-        "min_distinct_args_count": 4,
-    },
-    "validate": lambda turns, tcs, content: _validate_multifile(turns, tcs, content),
-}
-
-
-def _validate_multifile(turns, tcs, content):
-    del turns
-    if not content:
-        return False, "No final summary produced"
-    return True, f"{len(tcs)} total read_file calls; content length={len(content)}"
-
-
-# ---- Test 2: Batch TODO marking (same tool, N calls in one turn) ----
-
-_TODO_TOOLS = [
-    {
-        "type": "function",
-        "function": {
-            "name": "mark_todo_complete",
-            "description": (
-                "Mark a single TODO item as complete by ID. When the user wants "
-                "several items marked at once, call this tool in parallel — "
-                "one call per item — rather than sequentially across turns."
-            ),
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "todo_id": {
-                        "type": "string",
-                        "description": "Identifier of the TODO item",
-                    },
-                    "note": {
-                        "type": "string",
-                        "description": "Optional completion note",
-                    },
-                },
-                "required": ["todo_id"],
-            },
-        },
-    },
-]
-
-_TODO_DB = {
-    "T-101": "Draft onboarding doc",
-    "T-102": "Update dependency lockfile",
-    "T-103": "Fix flaky login test",
-    "T-104": "Rotate service credentials",
-    "T-105": "Archive Q4 reports",
-}
-
-
-def _mark_todo_mock(args):
-    tid = args.get("todo_id", "")
-    if tid in _TODO_DB:
-        return json.dumps({"todo_id": tid, "title": _TODO_DB[tid], "status": "done"})
-    return json.dumps({"todo_id": tid, "error": "unknown id"})
-
-
-TODO_BATCH_TEST = {
-    "name": "Batch TODO completion (same tool, 5 IDs in one turn)",
-    "tools": _TODO_TOOLS,
-    "messages": [
-        {
-            "role": "user",
-            "content": (
-                "I finished every item on today's list. Please mark all of the "
-                "following TODOs as complete, in one parallel batch: T-101, T-102, "
-                "T-103, T-104, T-105. Don't mark them one at a time across separate "
-                "turns — issue all five mark_todo_complete calls at once. Afterwards "
-                "confirm which ones succeeded."
-            ),
-        }
-    ],
-    "mock_tool_responses": {"mark_todo_complete": _mark_todo_mock},
-    "expected_parallel": {
-        "min_parallel": 5,
-        "require_same_tool": "mark_todo_complete",
-        "min_distinct_args_key": "todo_id",
-        "min_distinct_args_count": 5,
-    },
-    "validate": lambda turns, tcs, content: _validate_todo(turns, tcs, content),
-}
-
-
-def _validate_todo(turns, tcs, content):
-    del turns
-    if not content:
-        return False, "No confirmation summary produced"
-    return True, f"{len(tcs)} total mark_todo_complete calls"
-
-
-# ---- Test 3: Multi-city weather (same tool, N parallel locations) ----
-
-_WEATHER_TOOLS = [
-    {
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": (
-                "Fetch current weather for ONE city. When the user asks about "
-                "several cities, call this tool in parallel — one call per city — "
-                "instead of sequentially."
-            ),
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "city":  {"type": "string", "description": "City name"},
-                    "units": {
-                        "type": "string",
-                        "enum": ["metric", "imperial"],
-                        "default": "metric",
-                    },
-                },
-                "required": ["city"],
-            },
-        },
-    },
-]
-
-_WEATHER_DB = {
-    "tokyo":  {"city": "Tokyo",  "temp_c": 18.4, "condition": "partly cloudy", "humidity": 64},
-    "london": {"city": "London", "temp_c":  9.1, "condition": "overcast",       "humidity": 81},
-    "new york": {"city": "New York", "temp_c": 12.7, "condition": "clear",      "humidity": 55},
-    "paris":  {"city": "Paris",  "temp_c": 11.3, "condition": "light rain",     "humidity": 78},
-}
-
-
-def _weather_mock(args):
-    city = args.get("city", "").strip().lower()
-    if city.startswith("new york"):
-        city = "new york"
-    if city in _WEATHER_DB:
-        return json.dumps(_WEATHER_DB[city])
-    return json.dumps({"city": args.get("city", ""), "error": "unknown city"})
-
-
-MULTI_WEATHER_TEST = {
-    "name": "Parallel multi-city weather (same tool, 4 cities)",
-    "tools": _WEATHER_TOOLS,
-    "messages": [
-        {
-            "role": "user",
-            "content": (
-                "I'm comparing today's weather across four cities for a travel "
-                "decision: Tokyo, London, New York, and Paris. Please call "
-                "get_weather for all four in parallel in a single turn — don't "
-                "fetch them one at a time. Then rank them from warmest to coolest."
-            ),
-        }
-    ],
-    "mock_tool_responses": {"get_weather": _weather_mock},
-    "expected_parallel": {
-        "min_parallel": 4,
-        "require_same_tool": "get_weather",
-        "min_distinct_args_key": "city",
-        "min_distinct_args_count": 4,
-    },
-    "validate": lambda turns, tcs, content: _validate_weather(turns, tcs, content),
-}
-
-
-def _validate_weather(turns, tcs, content):
-    del turns
-    if not content or not any(
-        kw in content.lower() for kw in ("warmest", "rank", "hot", "cool")
-    ):
-        return False, f"Final content missing a ranking: {content!r}"
-    return True, f"{len(tcs)} total get_weather calls; ranking produced"
-
-
-# ---- Test 4: Trip planning (different tools, parallel in one turn) ----
-
-_TRIP_TOOLS = [
-    {
-        "type": "function",
-        "function": {
-            "name": "search_flights",
-            "description": "Search one-way flights between two airports on a given date.",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "from_airport": {"type": "string", "description": "IATA code, e.g. SFO"},
-                    "to_airport":   {"type": "string", "description": "IATA code, e.g. JFK"},
-                    "date":         {"type": "string", "description": "YYYY-MM-DD"},
-                },
-                "required": ["from_airport", "to_airport", "date"],
-            },
-        },
-    },
-    {
-        "type": "function",
-        "function": {
-            "name": "search_hotels",
-            "description": "Search hotels in a city for a date range.",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "city":       {"type": "string"},
-                    "check_in":   {"type": "string", "description": "YYYY-MM-DD"},
-                    "check_out":  {"type": "string", "description": "YYYY-MM-DD"},
-                    "max_price":  {"type": "integer"},
-                },
-                "required": ["city", "check_in", "check_out"],
-            },
-        },
-    },
-    {
-        "type": "function",
-        "function": {
-            "name": "search_restaurants",
-            "description": "Search restaurants in a city by cuisine.",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "city":    {"type": "string"},
-                    "cuisine": {"type": "string"},
-                },
-                "required": ["city"],
-            },
-        },
-    },
-]
-
-_FLIGHTS_RESULT = {
-    "results": [
-        {"flight": "UA 1552", "depart": "08:15", "arrive": "16:45", "price": 389},
-        {"flight": "AA  20",  "depart": "10:00", "arrive": "18:35", "price": 412},
-    ]
-}
-_HOTELS_RESULT = {
-    "results": [
-        {"name": "Midtown Grand",    "nightly_rate": 245, "rating": 4.3},
-        {"name": "Harbour Boutique", "nightly_rate": 312, "rating": 4.6},
-    ]
-}
-_RESTAURANTS_RESULT = {
-    "results": [
-        {"name": "Trattoria Nona", "cuisine": "italian", "rating": 4.5},
-        {"name": "Osteria Blu",    "cuisine": "italian", "rating": 4.4},
-    ]
-}
-
-TRIP_PLAN_TEST = {
-    "name": "Trip planning (3 different tools in parallel)",
-    "tools": _TRIP_TOOLS,
-    "messages": [
-        {
-            "role": "user",
-            "content": (
-                "I'm flying from SFO to JFK on 2026-06-12 and staying four nights "
-                "(check out 2026-06-16). I'd also like some Italian restaurant "
-                "suggestions in New York. Please call search_flights, search_hotels, "
-                "and search_restaurants in parallel — all three in a single turn, "
-                "since they don't depend on each other. Then give me a concise "
-                "travel summary."
-            ),
-        }
-    ],
-    "mock_tool_responses": {
-        "search_flights": lambda _: json.dumps(_FLIGHTS_RESULT),
-        "search_hotels": lambda _: json.dumps(_HOTELS_RESULT),
-        "search_restaurants": lambda _: json.dumps(_RESTAURANTS_RESULT),
-    },
-    "expected_parallel": {
-        "min_parallel": 3,
-        "require_distinct_tools": 3,
-    },
-    "validate": lambda turns, tcs, content: _validate_trip(turns, tcs, content),
-}
-
-
-def _validate_trip(turns, tcs, content):
-    del turns
-    names = {tc["function"]["name"] for tc in tcs}
-    required = {"search_flights", "search_hotels", "search_restaurants"}
-    missing = required - names
-    if missing:
-        return False, f"Missing tool calls: {missing}"
-    if not content:
-        return False, "No travel summary produced"
-    return True, f"All three tools called; summary length={len(content)}"
-
-
-# ---- Test 5: Portfolio check (same tool, parallel tickers) ----
-
-_STOCK_TOOLS = [
-    {
-        "type": "function",
-        "function": {
-            "name": "get_stock_quote",
-            "description": (
-                "Get the latest quote for ONE ticker. When the user asks about "
-                "multiple tickers, call this tool in parallel — one per symbol — "
-                "rather than sequentially."
-            ),
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "symbol": {"type": "string", "description": "Ticker symbol"},
-                },
-                "required": ["symbol"],
-            },
-        },
-    },
-]
-
-_STOCK_DB = {
-    "AAPL": {"symbol": "AAPL", "price": 218.45, "change_pct": "+0.8%"},
-    "MSFT": {"symbol": "MSFT", "price": 421.10, "change_pct": "+1.2%"},
-    "GOOGL":{"symbol": "GOOGL","price": 175.22, "change_pct": "-0.3%"},
-    "AMZN": {"symbol": "AMZN", "price": 189.76, "change_pct": "+0.5%"},
-    "NVDA": {"symbol": "NVDA", "price": 140.88, "change_pct": "+2.4%"},
-}
-
-
-def _stock_mock(args):
-    sym = args.get("symbol", "").strip().upper()
-    if sym in _STOCK_DB:
-        return json.dumps(_STOCK_DB[sym])
-    return json.dumps({"symbol": sym, "error": "unknown ticker"})
-
-
-PORTFOLIO_TEST = {
-    "name": "Portfolio check (same tool, 5 tickers in parallel)",
-    "tools": _STOCK_TOOLS,
-    "messages": [
-        {
-            "role": "user",
-            "content": (
-                "Pull the latest quote for every ticker in my portfolio — AAPL, "
-                "MSFT, GOOGL, AMZN, and NVDA — in a single parallel batch. These "
-                "lookups are independent, so please don't chain them across turns. "
-                "Once you have all five, tell me which ticker had the biggest "
-                "percentage change today."
-            ),
-        }
-    ],
-    "mock_tool_responses": {"get_stock_quote": _stock_mock},
-    "expected_parallel": {
-        "min_parallel": 5,
-        "require_same_tool": "get_stock_quote",
-        "min_distinct_args_key": "symbol",
-        "min_distinct_args_count": 5,
-    },
-    "validate": lambda turns, tcs, content: _validate_portfolio(turns, tcs, content),
-}
-
-
-def _validate_portfolio(turns, tcs, content):
-    del turns
-    if not content or ("nvda" not in content.lower() and "NVDA" not in content):
-        return False, f"Expected NVDA to be identified as the biggest mover: {content!r}"
-    return True, f"{len(tcs)} total quotes pulled"
-
-
-# ---- Test 6: Mixed — translate + dictionary in parallel for the same word ----
-
-_LANG_TOOLS = [
-    {
-        "type": "function",
-        "function": {
-            "name": "translate_text",
-            "description": "Translate a short text into a target language.",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "text":            {"type": "string"},
-                    "target_language": {"type": "string",
-                                        "description": "ISO 639-1 language code, e.g. 'es'"},
-                },
-                "required": ["text", "target_language"],
-            },
-        },
-    },
-    {
-        "type": "function",
-        "function": {
-            "name": "get_definition",
-            "description": "Get the English dictionary definition of a word.",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "word": {"type": "string"},
-                },
-                "required": ["word"],
-            },
-        },
-    },
-    {
-        "type": "function",
-        "function": {
-            "name": "get_synonyms",
-            "description": "Get English synonyms for a word.",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "word": {"type": "string"},
-                },
-                "required": ["word"],
-            },
-        },
-    },
-]
-
-
-def _translate_mock(args):
-    t = args.get("text", "")
-    lang = args.get("target_language", "")
-    return json.dumps({"source": t, "target_language": lang, "translation": f"[{lang}] {t}"})
-
-
-def _definition_mock(args):
-    w = args.get("word", "")
-    return json.dumps({
-        "word": w,
-        "definition": f"A standard dictionary definition of {w!r}.",
-    })
-
-
-def _synonyms_mock(args):
-    w = args.get("word", "")
-    return json.dumps({
-        "word": w,
-        "synonyms": ["synonym_a", "synonym_b", "synonym_c"],
-    })
-
-
-LANG_TOOLKIT_TEST = {
-    "name": "Language toolkit (translate + definition + synonyms in parallel)",
-    "tools": _LANG_TOOLS,
-    "messages": [
-        {
-            "role": "user",
-            "content": (
-                "For the English word 'resilient', I need three independent "
-                "look-ups at once: (a) translate it into Spanish, (b) fetch its "
-                "dictionary definition, and (c) list its synonyms. These three "
-                "calls don't depend on each other — please issue them in parallel "
-                "in a single turn. Then present the combined results as a short "
-                "language note."
-            ),
-        }
-    ],
-    "mock_tool_responses": {
-        "translate_text":  _translate_mock,
-        "get_definition":  _definition_mock,
-        "get_synonyms":    _synonyms_mock,
-    },
-    "expected_parallel": {
-        "min_parallel": 3,
-        "require_distinct_tools": 3,
-    },
-    "validate": lambda turns, tcs, content: _validate_lang(turns, tcs, content),
-}
-
-
-def _validate_lang(turns, tcs, content):
-    del turns
-    names = {tc["function"]["name"] for tc in tcs}
-    required = {"translate_text", "get_definition", "get_synonyms"}
-    missing = required - names
-    if missing:
-        return False, f"Missing tool calls: {missing}"
-    if not content:
-        return False, "No language note produced"
-    return True, f"All three lookup tools called; note length={len(content)}"
-
-
-# ---------------------------------------------------------------------------
-# All test cases
-# ---------------------------------------------------------------------------
-
-ALL_TEST_CASES = [
-    MULTIFILE_READ_TEST,
-    TODO_BATCH_TEST,
-    MULTI_WEATHER_TEST,
-    TRIP_PLAN_TEST,
-    PORTFOLIO_TEST,
-    LANG_TOOLKIT_TEST,
-]
-
-
-# ---------------------------------------------------------------------------
-# Entry point
-# ---------------------------------------------------------------------------
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description=(
-            "Test llama-server parallel tool-calling capability. Run this only "
-            "against models configured for parallel tool calls — this script "
-            "does not configure that itself."
-        )
-    )
-    parser.add_argument("--host", default="localhost")
-    parser.add_argument("--port", default=8080, type=int)
-    parser.add_argument(
-        "--no-stream", action="store_true", help="Disable streaming mode tests"
-    )
-    parser.add_argument(
-        "--stream-only", action="store_true", help="Only run streaming mode tests"
-    )
-    parser.add_argument(
-        "--test",
-        help="Run only the test whose name contains this substring (case-insensitive)",
-    )
-    args = parser.parse_args()
-
-    url = f"http://{args.host}:{args.port}/v1/chat/completions"
-    print_info(f"Testing server at {url}")
-    print_warn(
-        "This script expects the target model to emit multiple tool calls in a "
-        "single assistant turn. Run it only against parallel-tool-capable models."
-    )
-
-    modes: list[bool] = []
-    if not args.stream_only:
-        modes.append(False)
-    if not args.no_stream:
-        modes.append(True)
-
-    cases: list[dict] = ALL_TEST_CASES
-    if args.test:
-        name_filter = args.test.lower()
-        cases = [c for c in cases if name_filter in str(c["name"]).lower()]
-        if not cases:
-            print_fail(f"No test cases matched '{args.test}'")
-            sys.exit(1)
-
-    total = 0
-    passed = 0
-    for stream in modes:
-        for case in cases:
-            total += 1
-            if run_test(url, case, stream=stream):
-                passed += 1
-
-    color = GREEN if passed == total else RED
-    _print(f"\n{BOLD}{color}{'─' * 60}{RESET}")
-    _print(f"{BOLD}{color}  Results: {passed}/{total} passed{RESET}")
-    _print(f"{BOLD}{color}{'─' * 60}{RESET}\n")
-    sys.exit(0 if passed == total else 1)
-
-
-if __name__ == "__main__":
-    main()
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	35df147d80	cont : remove /api/tags	2026-04-20 15:45:42 +03:00
Georgi Gerganov	c1891fd6eb	server : remove /api endpoints	2026-04-20 15:34:18 +03:00