Compare commits

..

2 Commits

Author SHA1 Message Date
Georgi Gerganov 35df147d80 cont : remove /api/tags 2026-04-20 15:45:42 +03:00
Georgi Gerganov c1891fd6eb server : remove /api endpoints 2026-04-20 15:34:18 +03:00
162 changed files with 4377 additions and 11928 deletions
+1 -1
View File
@@ -1,4 +1,4 @@
ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
## Build Image
+2 -48
View File
@@ -2,19 +2,7 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
ARG UBUNTU_VERSION=24.04
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGDGMM_VERSION=22.9.0
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
ARG NPU_DRIVER_VERSION=v1.32.0
ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
# Optional proxy build arguments
# Optional proxy build arguments - empty by default
ARG http_proxy=
ARG https_proxy=
@@ -90,47 +78,13 @@ ARG http_proxy
ARG https_proxy
RUN apt-get update \
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
&& apt-get install -y libgomp1 libtbb12 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
# Install GPU drivers
ARG IGC_VERSION
ARG IGC_VERSION_FULL
ARG COMPUTE_RUNTIME_VERSION
ARG COMPUTE_RUNTIME_VERSION_FULL
ARG IGDGMM_VERSION
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& dpkg --install *.deb \
&& rm -rf /tmp/neo/
# Install NPU drivers
ARG NPU_DRIVER_VERSION
ARG NPU_DRIVER_FULL
ARG LIBZE1_VERSION
RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
&& wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& dpkg --install *.deb \
&& rm -rf /tmp/npu/
RUN cd /tmp \
&& wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
&& dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
&& rm libze1_${LIBZE1_VERSION}_amd64.deb
COPY --from=build /app/lib/ /app/
### Full (all binaries)
@@ -1,113 +0,0 @@
name: CI (snapdragon)
on:
workflow_dispatch:
push:
branches:
- master
paths:
- '.github/workflows/build-and-test-snapdragon.yml'
- 'ggml/include/ggml-hexagon.h'
- 'ggml/src/ggml-hexagon/**'
- 'docs/backend/snapdragon/**'
- 'scripts/snapdragon/**'
- 'CMakePresets.json'
pull_request:
types: [opened, synchronize, reopened]
paths:
- '.github/workflows/build-and-test-snapdragon.yml'
- 'ggml/include/ggml-hexagon.h'
- 'ggml/src/ggml-hexagon/**'
- 'docs/backend/snapdragon/**'
- 'scripts/snapdragon/**'
- 'CMakePresets.json'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
android-ndk-snapdragon:
runs-on: ubuntu-latest
container:
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
defaults:
run:
shell: bash
steps:
- name: Clone
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Build Llama.CPP for Snapdragon Android
id: build_llama_cpp_snapdragon_android
run: |
cp docs/backend/snapdragon/CMakeUserPresets.json .
cmake --preset arm64-android-snapdragon-release -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp
- name: Upload Llama.CPP Snapdragon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-arm64-snapdragon
path: pkg-adb/llama.cpp
check-secret:
runs-on: ubuntu-latest
outputs:
has-key: ${{ steps.check.outputs.has-key }}
steps:
- id: check
run: echo "has-key=${{ secrets.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
test-snapdragon-qdc:
name: Test on QDC Android Device (${{ matrix.device }})
needs: [android-ndk-snapdragon, check-secret]
if: needs.check-secret.outputs.has-key == 'true'
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
device: [SM8750, SM8650, SM8850]
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Download build artifact
uses: actions/download-artifact@v4
with:
name: llama-cpp-android-arm64-snapdragon
path: pkg-snapdragon/
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
cache: pip
- name: Install QDC SDK wheel
run: |
curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
unzip qdc_sdk.zip -d qdc_sdk
pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
- name: Run QDC tests (${{ matrix.device }})
run: |
python scripts/snapdragon/qdc/run_qdc_jobs.py \
--test all \
--pkg-dir pkg-snapdragon/llama.cpp \
--model-url "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
--device ${{ matrix.device }}
env:
QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
- name: Cleanup
if: always()
run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip
+31 -18
View File
@@ -1,24 +1,26 @@
name: CI (android)
on:
workflow_dispatch:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths:
- '.github/workflows/build-android.yml'
- '**/CMakeLists.txt'
- '**/.cmake'
- '**/*.h'
- '**/*.hpp'
- '**/*.c'
- '**/*.cpp'
# Source/build files whose changes should trigger this workflow on push.
# CMake modules are matched with '**/*.cmake'; the previous '**/.cmake' only
# matched files literally named ".cmake".
paths: [
'.github/workflows/build-android.yml',
'**/CMakeLists.txt',
'**/*.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
pull_request:
types: [opened, synchronize, reopened]
paths:
- '.github/workflows/build-android.yml'
- 'examples/llama.android/**'
paths: [
'.github/workflows/build-android.yml',
'examples/llama.android/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -65,24 +67,35 @@ jobs:
defaults:
run:
shell: bash
strategy:
matrix:
include:
- build: 'arm64-cpu'
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
- build: 'arm64-snapdragon'
defines: '--preset arm64-android-snapdragon-release'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Build
id: ndk_build
- name: Build Llama.CPP for Hexagon Android
id: build_llama_cpp_hexagon_android
run: |
cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
cp docs/backend/snapdragon/CMakeUserPresets.json .
fi
cmake ${{ matrix.defines }} -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp
- name: Upload Android Build Artifact
if: ${{ always() && steps.ndk_build.outcome == 'success' }}
- name: Upload Llama.CPP Hexagon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-arm64-cpu
name: llama-cpp-android-${{ matrix.build }}
path: pkg-adb/llama.cpp
-120
View File
@@ -1,120 +0,0 @@
name: CI (openvino)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-openvino.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-openvino.yml',
'ggml/src/ggml-openvino/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
concurrency:
group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
cancel-in-progress: false
strategy:
matrix:
include:
- variant: cpu
runner: '"ubuntu-24.04"'
openvino_device: "CPU"
- variant: gpu
runner: '["self-hosted","Linux","Intel","OpenVINO"]'
openvino_device: "GPU"
runs-on: ${{ fromJSON(matrix.runner) }}
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
if: runner.environment == 'github-hosted'
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
if: runner.environment == 'github-hosted'
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
cd ${{ github.workspace }}
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
export GGML_OPENVINO_DEVICE=GPU
fi
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
-4
View File
@@ -265,10 +265,6 @@ jobs:
ggml-ci-intel-openvino-gpu-low-perf:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
concurrency:
group: openvino-gpu-${{ github.head_ref || github.ref }}
cancel-in-progress: false
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
-142
View File
@@ -1,142 +0,0 @@
name: CI (sycl)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-sycl.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-sycl.yml',
'ggml/src/ggml-sycl/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-sycl-${{ matrix.build }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
windows-latest-sycl:
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: windows-latest-sycl
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat
+213
View File
@@ -555,6 +555,186 @@ jobs:
-DGGML_MUSA=ON
time cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-sycl
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
time cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl-fp16:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-sycl-fp16
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
time cmake --build build --config Release -j $(nproc)
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
strategy:
matrix:
include:
- variant: cpu
runner: '"ubuntu-24.04"'
openvino_device: "CPU"
- variant: gpu
runner: '["self-hosted","Linux","X64","Intel"]'
openvino_device: "GPU"
runs-on: ${{ fromJSON(matrix.runner) }}
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
if: runner.environment == 'github-hosted'
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
if: runner.environment == 'github-hosted'
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
cd ${{ github.workspace }}
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
export GGML_OPENVINO_DEVICE=GPU
fi
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
windows-latest:
runs-on: windows-2025
@@ -763,6 +943,39 @@ jobs:
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
windows-latest-sycl:
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: windows-latest-sycl
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat
windows-latest-hip:
runs-on: windows-2022
+5 -94
View File
@@ -598,29 +598,15 @@ jobs:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
@@ -628,6 +614,10 @@ jobs:
variant: ccache
evict-old-files: 1d
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: Build
id: cmake_build
shell: cmd
@@ -680,82 +670,6 @@ jobs:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-sycl-${{ matrix.build }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
ubuntu-22-rocm:
runs-on: ubuntu-22.04
@@ -1131,7 +1045,6 @@ jobs:
- ubuntu-cpu
- ubuntu-vulkan
- ubuntu-24-openvino
- ubuntu-24-sycl
- android-arm64
- macOS-cpu
- ios-xcode-build
@@ -1220,8 +1133,6 @@ jobs:
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
**Android:**
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
-2
View File
@@ -145,5 +145,3 @@ poetry.toml
/.windsurf/
# emscripten
a.out.*
AGENTS.local.md
-1
View File
@@ -23,7 +23,6 @@
/ci/ @ggerganov
/cmake/ @ggerganov
/common/ @ggml-org/llama-common
/common/fit.* @JohannesGaessler
/common/jinja/ @CISC
/common/ngram-map.* @srogmann
/convert_*.py @CISC
-2
View File
@@ -73,8 +73,6 @@ add_library(${TARGET}
debug.h
download.cpp
download.h
fit.cpp
fit.h
hf-cache.cpp
hf-cache.h
http.h
+2 -27
View File
@@ -2426,20 +2426,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_env("LLAMA_ARG_FIT"));
add_opt(common_arg(
{ "-fitp", "--fit-print" }, "[on|off]",
string_format("print the estimated required memory ('on' or 'off', default: '%s')", params.fit_params_print ? "on" : "off"),
[](common_params & params, const std::string & value) {
if (is_truthy(value)) {
params.fit_params_print = true;
} else if (is_falsey(value)) {
params.fit_params_print = false;
} else {
throw std::runtime_error(
string_format("error: unknown value for --fit-print: '%s'\n", value.c_str()));
}
}
).set_examples({LLAMA_EXAMPLE_FIT_PARAMS}).set_env("LLAMA_ARG_FIT_ESTIMATE"));
add_opt(common_arg(
{ "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
string_format("target margin per device for --fit, comma-separated list of values, "
@@ -3122,14 +3108,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
[](common_params & params, int value) {
if (value < -1) { throw std::invalid_argument("invalid value"); }
params.sampling.reasoning_budget_tokens = value;
params.reasoning_budget = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--reasoning-budget-message"}, "MESSAGE",
"message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
[](common_params & params, const std::string & value) {
params.sampling.reasoning_budget_message = value;
params.reasoning_budget_message = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
add_opt(common_arg(
@@ -3902,17 +3888,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--spec-default"},
string_format("enable default speculative decoding config"),
[](common_params & params) {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
params.speculative.ngram_size_n = 24;
params.speculative.n_min = 48;
params.speculative.n_max = 64;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
return ctx_arg;
}
+55 -40
View File
@@ -397,25 +397,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
return render_message_to_json(msgs, c);
}
// Builds an OpenAI-compatible "tools" JSON array from a list of tool definitions.
// Returns a default-constructed json() when `tools` is empty; otherwise one
// {"type": "function", "function": {...}} entry per tool, in input order.
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
if (tools.empty()) {
return json();
}
auto result = json::array();
for (const auto & tool : tools) {
result.push_back({
{ "type", "function" },
{ "function", {
{ "name", tool.name },
{ "description", tool.description },
// tool.parameters is passed through json::parse, so it is embedded as a JSON value
{ "parameters", json::parse(tool.parameters) },
}},
});
}
return result;
}
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
std::vector<common_chat_tool> result;
@@ -451,6 +432,56 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
return result;
}
// Serializes tool definitions into an OpenAI-compatible "tools" array.
// An empty input yields a default-constructed json() rather than an empty array.
// Each tool becomes {"type": "function", "function": {name, description, parameters}},
// where `parameters` is the tool's parameter text run through json::parse.
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
    if (tools.empty()) {
        return json();
    }

    json out = json::array();
    for (const common_chat_tool & def : tools) {
        // build the inner "function" object first so a parse failure adds nothing to `out`
        json fn = json::object();
        fn["name"]        = def.name;
        fn["description"] = def.description;
        fn["parameters"]  = json::parse(def.parameters);

        json entry = json::object();
        entry["type"]     = "function";
        entry["function"] = fn;

        out.push_back(entry);
    }
    return out;
}
// Serializes a message diff into an OpenAI-compatible "delta" JSON object.
// Only the parts of `diff` that carry data are emitted:
//   - "reasoning_content" / "content" when the respective delta string is non-empty
//   - "tool_calls" (a one-element array) when diff.tool_call_index is not npos
json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
    json out = json::object();

    if (!diff.reasoning_content_delta.empty()) {
        out["reasoning_content"] = diff.reasoning_content_delta;
    }
    if (!diff.content_delta.empty()) {
        out["content"] = diff.content_delta;
    }

    // an index of npos means this diff has no tool-call part
    if (diff.tool_call_index == std::string::npos) {
        return out;
    }

    const auto & call     = diff.tool_call_delta;
    const bool   has_name = !call.name.empty();
    const bool   has_args = !call.arguments.empty();

    json entry;
    entry["index"] = diff.tool_call_index;

    // "type" is only emitted together with "id"
    if (!call.id.empty()) {
        entry["id"]   = call.id;
        entry["type"] = "function";
    }

    // the "function" object is omitted entirely when it would be empty
    if (has_name || has_args) {
        json fn = json::object();
        if (has_name) {
            fn["name"] = call.name;
        }
        if (has_args) {
            fn["arguments"] = call.arguments;
        }
        entry["function"] = fn;
    }

    out["tool_calls"] = json::array({ entry });
    return out;
}
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
if (use_jinja) {
try {
@@ -544,26 +575,6 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
return tmpls->has_explicit_template;
}
// LFM2 template detection.
// A template source is treated as LFM2 when it contains both tool-list markers,
// <|tool_list_start|> and <|tool_list_end|>; their order and position are not checked.
static bool is_lfm2_template(const std::string & src) {
    if (src.find("<|tool_list_start|>") == std::string::npos) {
        return false;
    }
    return src.find("<|tool_list_end|>") != std::string::npos;
}
// Chooses the system/user prompt pair for an audio transcription (ASR) request.
// When the default template is present and is_lfm2_template() accepts its source,
// the instruction goes in the system prompt and the user prompt is empty.
// In every other case (including a null `chat_templates`) the system prompt is
// empty and the user prompt carries the instruction.
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
    common_chat_prompt_preset preset;

    const bool lfm2 = chat_templates != nullptr
        && chat_templates->template_default
        && is_lfm2_template(chat_templates->template_default->source());

    if (lfm2) {
        preset.system = "Perform ASR.";
        preset.user   = "";
    } else {
        preset.system = "";
        preset.user   = "Transcribe audio to text";
    }
    return preset;
}
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
if (!variant.empty()) {
if (variant == "tool_use") {
@@ -2073,7 +2084,10 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
return common_chat_params_init_kimi_k2(tmpl, params);
}
if (is_lfm2_template(src)) {
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}
@@ -2382,3 +2396,4 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
GGML_ASSERT(chat_templates->template_default != nullptr);
return chat_templates->template_default->caps.to_map();
}
+3 -10
View File
@@ -256,13 +256,14 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
// Parses a JSON array of messages in OpenAI's chat completion API format.
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
// DEPRECATED: only used in tests
nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
// get template caps, useful for reporting to server /props endpoint
std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
@@ -274,11 +275,3 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
const common_chat_template & tmpl,
const std::string & src,
autoparser::generation_params & params);
// specialized per-task preset
// pair of prompt texts chosen for a specific task (see common_chat_get_asr_prompt);
// either field may be left empty
struct common_chat_prompt_preset {
std::string system; // text for the system prompt - NOTE(review): presumably inserted as the system message, confirm against callers
std::string user; // text for the user prompt - NOTE(review): presumably inserted as the user message, confirm against callers
};
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
+1 -2
View File
@@ -3,7 +3,6 @@
#include "build-info.h"
#include "common.h"
#include "fit.h"
#include "log.h"
#include "llama.h"
#include "sampling.h"
@@ -1148,7 +1147,7 @@ common_init_result::common_init_result(common_params & params) :
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
common_fit_params(params.model.path.c_str(), &mparams, &cparams,
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split,
params.tensor_buft_overrides.data(),
params.fit_params_target.data(),
+7 -12
View File
@@ -274,7 +274,6 @@ struct common_params_sampling {
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool backend_sampling = false;
@@ -421,12 +420,11 @@ struct common_params {
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
bool fit_params_print = false; // print the estimated required memory to run the model
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
// margin per device in bytes for fitting parameters to free memory:
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
@@ -582,6 +580,8 @@ struct common_params {
bool force_pure_content_parser = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
int reasoning_budget = -1;
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
@@ -746,11 +746,6 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
str.compare(0, prefix.size(), prefix) == 0;
}
// remove when moving to c++20
// single-character overload: true iff str is non-empty and its first character equals prefix
inline bool string_starts_with(std::string_view str, char prefix) {
    if (str.empty()) {
        return false;
    }
    return str[0] == prefix;
}
// remove when moving to c++20
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
return str.size() >= suffix.size() &&
-951
View File
@@ -1,951 +0,0 @@
#include "fit.h"
#include "log.h"
#include "../src/llama-ext.h"
#include <array>
#include <cassert>
#include <stdexcept>
#include <cinttypes>
#include <set>
#include <string>
#include <vector>
// this enum is only used in common_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
// enum to identify part of a layer for distributing its tensors:
// the comment on each value lists the part of the layer that is NOT matched by the overflow pattern
// built in get_overflow_pattern (inside common_params_fit_impl); the tensors that are matched are
// assigned to the overflow buffer type instead
enum common_layer_fraction_t {
LAYER_FRACTION_NONE = 0, // nothing (get_overflow_pattern has no pattern for this value and aborts)
LAYER_FRACTION_ATTN = 1, // attention (overflow: ffn gate/up/gate_up/down)
LAYER_FRACTION_UP = 2, // attention + up (overflow: ffn gate/gate_up/down)
LAYER_FRACTION_GATE = 3, // attention + up + gate (overflow: ffn down)
LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights (overflow: the *_exps / *_chexps expert tensors)
};
// exception type thrown by common_params_fit_impl when fitting has to be aborted
// (e.g. unsupported split mode, parameters already set by the user, not enough override slots);
// kept distinct from plain std::runtime_error, which the same code uses for model/context load failures
// NOTE(review): presumably the public entry point catches this type specifically - confirm at the call site
class common_params_fit_exception : public std::runtime_error {
// inherit all std::runtime_error constructors (message as std::string or const char *)
using std::runtime_error::runtime_error;
};
static std::vector<llama_device_memory_data> common_get_device_memory_data(
const char * path_model,
const llama_model_params * mparams,
const llama_context_params * cparams,
std::vector<ggml_backend_dev_t> & devs,
uint32_t & hp_ngl,
uint32_t & hp_n_ctx_train,
uint32_t & hp_n_expert,
ggml_log_level log_level) {
struct user_data_t {
struct {
ggml_log_callback callback;
void * user_data;
} original_logger;
ggml_log_level min_level; // prints below this log level go to debug log
};
user_data_t ud;
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
ud.min_level = log_level;
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
const user_data_t * ud = (const user_data_t *) user_data;
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
}, &ud);
llama_model_params mparams_copy = *mparams;
mparams_copy.no_alloc = true;
mparams_copy.use_mmap = false;
mparams_copy.use_mlock = false;
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
if (model == nullptr) {
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
throw std::runtime_error("failed to load model");
}
llama_context * ctx = llama_init_from_model(model, *cparams);
if (ctx == nullptr) {
llama_model_free(model);
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
throw std::runtime_error("failed to create llama_context from model");
}
const size_t nd = llama_model_n_devices(model);
std::vector<llama_device_memory_data> ret(nd + 1);
llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
for (const auto & [buft, mb] : memory_breakdown) {
if (ggml_backend_buft_is_host(buft)) {
ret.back().mb.model += mb.model;
ret.back().mb.context += mb.context;
ret.back().mb.compute += mb.compute;
continue;
}
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (!dev) {
continue;
}
for (size_t i = 0; i < nd; i++) {
if (dev == llama_model_get_device(model, i)) {
ret[i].mb.model += mb.model;
ret[i].mb.context += mb.context;
ret[i].mb.compute += mb.compute;
break;
}
}
}
{
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error("no CPU backend found");
}
size_t free;
size_t total;
ggml_backend_dev_memory(cpu_dev, &free, &total);
ret.back().free = free;
ret.back().total = total;
}
for (size_t i = 0; i < nd; i++) {
size_t free;
size_t total;
ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
// devices can return 0 bytes for free and total memory if they do not
// have any to report. in this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
free = ret.back().free;
total = ret.back().total;
}
ret[i].free = free;
ret[i].total = total;
}
devs.clear();
for (int i = 0; i < llama_model_n_devices(model); i++) {
devs.push_back(llama_model_get_device(model, i));
}
hp_ngl = llama_model_n_layer(model);
hp_n_ctx_train = llama_model_n_ctx_train(model);
hp_n_expert = llama_model_n_expert(model);
common_memory_breakdown_print(ctx);
llama_free(ctx);
llama_model_free(model);
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
return ret;
}
static void common_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
throw common_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
}
constexpr int64_t MiB = 1024*1024;
typedef std::vector<llama_device_memory_data> dmds_t;
const llama_model_params default_mparams = llama_model_default_params();
std::vector<ggml_backend_dev_t> devs;
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
uint32_t hp_nct = 0; // hparams.n_ctx_train
uint32_t hp_nex = 0; // hparams.n_expert
// step 1: get data for default parameters and check whether any changes are necessary in the first place
LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
const size_t nd = devs.size(); // number of devices
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
margins.reserve(nd);
if (nd == 0) {
margins.push_back(margins_s[0]);
} else {
for (size_t id = 0; id < nd; id++) {
margins.push_back(margins_s[id]);
}
}
std::vector<std::string> dev_names;
{
dev_names.reserve(nd);
size_t max_length = 0;
for (const auto & dev : devs) {
std::string name = ggml_backend_dev_name(dev);
name += " (";
name += ggml_backend_dev_description(dev);
name += ")";
dev_names.push_back(name);
max_length = std::max(max_length, name.length());
}
for (std::string & dn : dev_names) {
dn.insert(dn.end(), max_length - dn.length(), ' ');
}
}
int64_t sum_free = 0;
int64_t sum_projected_free = 0;
int64_t sum_projected_used = 0;
int64_t sum_projected_model = 0;
std::vector<int64_t> projected_free_per_device;
projected_free_per_device.reserve(nd);
if (nd == 0) {
sum_projected_used = dmds_full.back().mb.total();
sum_free = dmds_full.back().total;
sum_projected_free = sum_free - sum_projected_used;
LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (sum_projected_free >= margins[0]) {
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
__func__, sum_projected_free/MiB, margins[0]/MiB);
return;
}
} else {
if (nd > 1) {
LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
}
for (size_t id = 0; id < nd; id++) {
const llama_device_memory_data & dmd = dmds_full[id];
const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used;
projected_free_per_device.push_back(projected_free);
sum_free += dmd.free;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
sum_projected_model += dmd.mb.model;
if (nd > 1) {
LOG_INF("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
}
}
assert(sum_free >= 0 && sum_projected_used >= 0);
LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (nd == 1) {
if (projected_free_per_device[0] >= margins[0]) {
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
return;
}
} else {
bool changes_needed = false;
for (size_t id = 0; id < nd; id++) {
if (projected_free_per_device[id] < margins[id]) {
changes_needed = true;
break;
}
}
if (!changes_needed) {
LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
return;
}
}
}
// step 2: try reducing memory use by reducing the context size
{
int64_t global_surplus = sum_projected_free;
if (nd == 0) {
global_surplus -= margins[0];
} else {
for (size_t id = 0; id < nd; id++) {
global_surplus -= margins[id];
}
}
if (global_surplus < 0) {
if (nd <= 1) {
LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
__func__, margins[0]/MiB, -global_surplus/MiB);
} else {
LOG_INF(
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, -global_surplus/MiB);
}
if (cparams->n_ctx == 0) {
if (hp_nct > n_ctx_min) {
int64_t sum_used_target = sum_free;
if (nd == 0) {
sum_used_target -= margins[0];
} else {
for (size_t id = 0; id < nd; id++) {
sum_used_target -= margins[id];
}
}
if (nd > 1) {
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
// - for dense models only whole layers can be assigned to devices
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
// - on average we expect a waste of 0.5 layers/tensors per device
// - use slightly more than the expected average for nd devices to be safe
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
}
int64_t sum_projected_used_min_ctx = 0;
cparams->n_ctx = n_ctx_min;
const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
if (nd == 0) {
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
} else {
for (size_t id = 0; id < nd; id++) {
sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
}
}
if (sum_used_target > sum_projected_used_min_ctx) {
// linear interpolation between minimum and maximum context size:
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
/ (sum_projected_used - sum_projected_used_min_ctx);
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (nd <= 1) {
LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
return;
}
LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
} else {
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
}
} else {
if (n_ctx_min == UINT32_MAX) {
LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
} else {
LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
__func__, hp_nct, n_ctx_min);
}
}
} else {
LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
}
}
}
if (nd == 0) {
throw common_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
}
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
throw common_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
}
if (nd > 1) {
if (!tensor_split) {
throw common_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
}
if (mparams->tensor_split) {
for (size_t id = 0; id < nd; id++) {
if (mparams->tensor_split[id] != 0.0f) {
throw common_params_fit_exception("model_params::tensor_split already set by user, abort");
}
}
}
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
throw common_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
}
}
if (!tensor_buft_overrides) {
throw common_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
}
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
throw common_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
}
// step 3: iteratively fill the back to front with "dense" layers
// - for a dense model simply fill full layers, giving each device a contiguous slice of the model
// - for a MoE model, same as dense model but with all MoE tensors in system memory
// utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
auto get_overflow_pattern = [&](const size_t il, const common_layer_fraction_t lf) -> const char * {
constexpr size_t n_strings = 1000;
if (il >= n_strings) {
throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
}
switch (lf) {
case LAYER_FRACTION_ATTN: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_UP: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_GATE: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_MOE: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps";
}
return patterns[il].c_str();
}
default:
GGML_ABORT("fatal error");
}
};
struct ngl_t {
uint32_t n_layer = 0; // number of total layers
uint32_t n_part = 0; // number of partial layers, <= n_layer
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
common_layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
uint32_t n_full() const {
assert(n_layer >= n_part);
return n_layer - n_part;
}
};
const size_t ntbo = llama_max_tensor_buft_overrides();
// utility function to set n_gpu_layers and tensor_split
auto set_ngl_tensor_split_tbo = [&](
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
llama_model_params & mparams) {
mparams.n_gpu_layers = 0;
for (size_t id = 0; id < nd; id++) {
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
if (nd > 1) {
tensor_split[id] = ngl_per_device[id].n_layer;
}
}
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
mparams.tensor_split = tensor_split;
size_t itbo = 0;
for (size_t id = 0; id < nd; id++) {
il0 += ngl_per_device[id].n_full();
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
if (itbo + 1 >= ntbo) {
tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
throw common_params_fit_exception("llama_max_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model");
}
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
itbo++;
}
il0 += ngl_per_device[id].n_part;
}
tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
};
// utility function that returns the memory use per device for given numbers of layers per device
auto get_memory_for_layers = [&](
const char * func_name,
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
llama_model_params mparams_copy = *mparams;
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
const dmds_t dmd_nl = common_get_device_memory_data(
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
LOG_INF("%s: memory for test allocation by device:\n", func_name);
for (size_t id = 0; id < nd; id++) {
const ngl_t & n = ngl_per_device[id];
LOG_INF(
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
}
std::vector<int64_t> ret;
ret.reserve(nd);
for (size_t id = 0; id < nd; id++) {
ret.push_back(dmd_nl[id].mb.total());
}
return ret;
};
int64_t global_surplus_cpu_moe = 0;
if (hp_nex > 0) {
const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
tensor_buft_overrides[1] = {nullptr, nullptr};
mparams->tensor_buft_overrides = tensor_buft_overrides;
LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
const dmds_t dmds_cpu_moe = common_get_device_memory_data(
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (size_t id = 0; id < nd; id++) {
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
}
if (global_surplus_cpu_moe > 0) {
LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
__func__, global_surplus_cpu_moe/MiB);
} else {
LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
__func__, -global_surplus_cpu_moe/MiB);
}
// reset
tensor_buft_overrides[0] = {nullptr, nullptr};
mparams->tensor_buft_overrides = tensor_buft_overrides;
}
std::vector<int64_t> targets; // maximum acceptable memory use per device
targets.reserve(nd);
for (size_t id = 0; id < nd; id++) {
targets.push_back(dmds_full[id].free - margins[id]);
LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
}
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
overflow_bufts.reserve(nd);
for (size_t id = 0; id < nd; id++) {
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
}
std::vector<ngl_t> ngl_per_device(nd);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
// optimize the number of layers per device using the method of false position:
// - ngl_per_device has 0 layers for each device, lower bound
// - try a "high" configuration where a device is given all unassigned layers
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
// - check memory use of our guess, replace either the low or high bound
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
// - the last device has the output layer, which cannot be a partial layer
if (hp_nex == 0) {
LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
} else {
LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
}
for (int id = nd - 1; id >= 0; id--) {
uint32_t n_unassigned = hp_ngl + 1;
for (size_t jd = id + 1; jd < nd; ++jd) {
assert(n_unassigned >= ngl_per_device[jd].n_layer);
n_unassigned -= ngl_per_device[jd].n_layer;
}
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
ngl_per_device_high[id].n_layer = n_unassigned;
if (hp_nex > 0) {
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
}
if (ngl_per_device_high[id].n_layer > 0) {
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
step_size = std::min(step_size, delta - 1);
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
ngl_per_device_test[id].n_layer += step_size;
if (hp_nex) {
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
step_size - 1 : step_size; // the first layer is the output layer which must always be full
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} else {
ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test;
LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
}
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
}
} else {
assert(ngl_per_device_high[id].n_layer == n_unassigned);
ngl_per_device = ngl_per_device_high;
mem = mem_high;
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
}
}
const int64_t projected_margin = dmds_full[id].free - mem[id];
LOG_INF(
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
}
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
return;
}
// step 4: for a MoE model where all dense tensors fit,
// convert the dense-only layers in the back to full layers in the front until all devices are full
// essentially the same procedure as for the dense-only layers except front-to-back
// also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
size_t id_dense_start = nd;
for (int id = nd - 1; id >= 0; id--) {
if (ngl_per_device[id].n_layer > 0) {
id_dense_start = id;
continue;
}
break;
}
assert(id_dense_start < nd);
LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
for (size_t jd = id_dense_start; jd < nd; jd++) {
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
ngl_per_device_high[id].n_layer += n_layer_move;
ngl_per_device_high[jd].n_layer -= n_layer_move;
ngl_per_device_high[jd].n_part = 0;
}
size_t id_dense_start_high = nd - 1;
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
step_size = std::min(step_size, delta - 1);
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
uint32_t n_converted_test = 0;
for (;id_dense_start_test < nd; id_dense_start_test++) {
const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
ngl_per_device_test[id].n_layer += n_convert_jd;
n_converted_test += n_convert_jd;
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
break;
}
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
} else {
ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test;
id_dense_start_high = id_dense_start_test;
LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
}
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
}
} else {
ngl_per_device = ngl_per_device_high;
mem = mem_high;
id_dense_start = id_dense_start_high;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
// try to fit at least part of one more layer
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
ngl_per_device_test[id_dense_start_test].n_layer--;
ngl_per_device_test[id_dense_start_test].n_part--;
ngl_per_device_test[id].n_layer++;
ngl_per_device_test[id].n_part++;
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
id_dense_start_test++;
}
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
if (id < nd - 1) {
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
}
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
} else {
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
}
}
const int64_t projected_margin = dmds_full[id].free - mem[id];
LOG_INF(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
// print info for devices that were not changed during the conversion from dense only to full layers:
for (size_t id = id_dense_start + 1; id < nd; id++) {
const int64_t projected_margin = dmds_full[id].free - mem[id];
LOG_INF(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
}
enum common_params_fit_status common_fit_params(
const char * path_model,
llama_model_params * mparams,
llama_context_params * cparams,
float * tensor_split,
llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins,
uint32_t n_ctx_min,
ggml_log_level log_level) {
const int64_t t0_us = llama_time_us();
common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
try {
common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
LOG_INF("%s: successfully fit params to free device memory\n", __func__);
} catch (const common_params_fit_exception & e) {
LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
status = COMMON_PARAMS_FIT_STATUS_FAILURE;
} catch (const std::runtime_error & e) {
LOG_ERR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
status = COMMON_PARAMS_FIT_STATUS_ERROR;
}
const int64_t t1_us = llama_time_us();
LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
return status;
}
// Log a table with the memory breakdown of a context (all sizes in MiB).
//
// Rows, in order:
//   - one row per device of the context's model: total, free, self (= model + context + compute), unaccounted
//   - one "Host" row that accumulates all host buffer types
//   - one row per remaining buffer type that is neither a host buffer nor owned by one of the model devices
//
// "unaccounted" is total - self - free. It is computed with signed arithmetic because self + free can
// exceed total, in which case an unsigned subtraction would wrap around and print a huge bogus value.
void common_memory_breakdown_print(const struct llama_context * ctx) {
    const auto * model = llama_get_model(ctx);

    std::vector<ggml_backend_dev_t> devices;
    for (int i = 0; i < llama_model_n_devices(model); i++) {
        devices.push_back(llama_model_get_device(model, i));
    }

    llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);

    // each row: [0] = printf template, [1] = row label, [2..8] = column values
    std::vector<std::array<std::string, 9>> table_data;
    table_data.reserve(devices.size() + 2); // devices + header + host

    const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
    const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
    const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";

    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});

    constexpr size_t MiB = 1024 * 1024;

    // vendor/product prefixes that are removed from the device description to keep the table narrow
    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};

    // track seen buffer types to avoid double counting:
    std::set<ggml_backend_buffer_type_t> seen_buffer_types;

    // accumulative memory breakdown for each device and for host:
    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
    llama_memory_breakdown_data mb_host;

    for (const auto & buft_mb : memory_breakdown) {
        ggml_backend_buffer_type_t buft = buft_mb.first;
        const llama_memory_breakdown_data & mb = buft_mb.second;

        if (ggml_backend_buft_is_host(buft)) {
            mb_host.model += mb.model;
            mb_host.context += mb.context;
            mb_host.compute += mb.compute;
            seen_buffer_types.insert(buft);
            continue;
        }

        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
        if (!dev) {
            // no owning device: handled below as a "remaining" buffer type
            continue;
        }

        // look up the device among the model devices; devices.size() means "not found"
        size_t i_dev = devices.size();
        for (size_t i = 0; i < devices.size(); i++) {
            if (devices[i] == dev) {
                i_dev = i;
                break;
            }
        }
        if (i_dev < devices.size()) {
            mb_dev[i_dev].model += mb.model;
            mb_dev[i_dev].context += mb.context;
            mb_dev[i_dev].compute += mb.compute;
            seen_buffer_types.insert(buft);
        }
    }

    // print memory breakdown for each device:
    for (size_t i = 0; i < devices.size(); i++) {
        ggml_backend_dev_t dev = devices[i];
        const llama_memory_breakdown_data & mb = mb_dev[i];

        const std::string name = ggml_backend_dev_name(dev);
        std::string desc = ggml_backend_dev_description(dev);
        // prefixes are tried in list order, so e.g. "NVIDIA GeForce X" is reduced to "X"
        for (const std::string & prefix : desc_prefixes_strip) {
            if (desc.compare(0, prefix.length(), prefix) == 0) {
                desc.erase(0, prefix.length());
            }
        }

        size_t free, total;
        ggml_backend_dev_memory(dev, &free, &total);

        const size_t self = mb.model + mb.context + mb.compute;
        // signed on purpose: self + free may be larger than total
        const int64_t unaccounted = int64_t(total) - int64_t(self) - int64_t(free);

        table_data.push_back({
            template_gpu,
            " - " + name + " (" + desc + ")",
            std::to_string(total / MiB),
            std::to_string(free / MiB),
            std::to_string(self / MiB),
            std::to_string(mb.model / MiB),
            std::to_string(mb.context / MiB),
            std::to_string(mb.compute / MiB),
            std::to_string(unaccounted / int64_t(MiB))});
    }

    // print memory breakdown for host:
    {
        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
        table_data.push_back({
            template_other,
            " - Host",
            "", // total
            "", // free
            std::to_string(self / MiB),
            std::to_string(mb_host.model / MiB),
            std::to_string(mb_host.context / MiB),
            std::to_string(mb_host.compute / MiB),
            ""}); // unaccounted
    }

    // print memory breakdown for all remaining buffer types:
    for (const auto & buft_mb : memory_breakdown) {
        ggml_backend_buffer_type_t buft = buft_mb.first;
        const llama_memory_breakdown_data & mb = buft_mb.second;

        if (seen_buffer_types.count(buft) == 1) {
            continue;
        }

        const std::string name = ggml_backend_buft_name(buft);
        const size_t self = mb.model + mb.context + mb.compute;
        table_data.push_back({
            template_other,
            " - " + name,
            "", // total
            "", // free
            std::to_string(self / MiB),
            std::to_string(mb.model / MiB),
            std::to_string(mb.context / MiB),
            std::to_string(mb.compute / MiB),
            ""}); // unaccounted
        seen_buffer_types.insert(buft);
    }

    // pad every column to its widest cell: the label column (1) is left-aligned, the value columns right-aligned
    for (size_t j = 1; j < table_data[0].size(); j++) {
        size_t max_len = 0;
        for (const auto & td : table_data) {
            max_len = std::max(max_len, td[j].length());
        }
        for (auto & td : table_data) {
            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
        }
    }

    // td[0] holds the printf template of the row, the remaining cells are its arguments
    for (const auto & td : table_data) {
        LOG_INF(td[0].c_str(),
            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
            td[6].c_str(), td[7].c_str(), td[8].c_str());
    }
}
// Print the estimated memory use to stdout.
//
// Output format: one line per device ("<device name> <model> <context> <compute> ") followed by a
// final "Host" line with the same three columns; all sizes are in MiB.
// The data comes from common_get_device_memory_data, which is expected to return one entry per
// device plus one trailing entry for the host (asserted below).
void common_fit_print(
        const char * path_model,
        llama_model_params * mparams,
        llama_context_params * cparams) {
    std::vector<ggml_backend_dev_t> devs;
    uint32_t hp_ngl = 0; // hparams.n_gpu_layers
    uint32_t hp_nct = 0; // hparams.n_ctx_train
    uint32_t hp_nex = 0; // hparams.n_expert

    auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
    GGML_ASSERT(dmd.size() == devs.size() + 1);

    // emits a single row; sizes are passed in bytes and printed in MiB
    const auto print_row = [](const char * label, size_t n_model, size_t n_context, size_t n_compute) {
        printf("%s %zu %zu %zu \n", label, n_model/1024/1024, n_context/1024/1024, n_compute/1024/1024);
    };

    for (size_t id = 0; id < devs.size(); id++) {
        const auto & mb = dmd[id].mb;
        print_row(ggml_backend_dev_name(devs[id]), mb.model, mb.context, mb.compute);
    }

    // the last entry holds the host memory
    const auto & mb_host = dmd.back().mb;
    print_row("Host", mb_host.model, mb_host.context, mb_host.compute);
}
-32
View File
@@ -1,32 +0,0 @@
#pragma once
#include "ggml.h"
// status code returned by common_fit_params
enum common_params_fit_status {
COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
COMMON_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
COMMON_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
};
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
// - returns COMMON_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified
// with the exception of the context size which is modified if and only if equal to 0
enum common_params_fit_status common_fit_params(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams,
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
size_t * margins, // margins of memory to leave per device in bytes
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
// print estimated memory to stdout
void common_fit_print(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams);
// print a table with the memory breakdown (model, context, compute) in MiB to the log:
// one row per device of the model, one row for the host and one row per remaining buffer type
void common_memory_breakdown_print(const struct llama_context * ctx);
+1
View File
@@ -1,3 +1,4 @@
#include "log.h"
#include "value.h"
#include "runtime.h"
#include "caps.h"
+5 -11
View File
@@ -106,16 +106,10 @@ struct statement {
size_t pos; // position in source, for debugging
virtual ~statement() = default;
virtual std::string type() const { return "Statement"; }
// execute_impl must be overridden by derived classes
virtual value execute_impl(context &) { throw_exec_error(); }
virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); }
// execute is the public method to execute a statement with error handling
value execute(context &);
private:
[[noreturn]] void throw_exec_error() const {
throw std::runtime_error("cannot exec " + type());
}
};
// Type Checking Utilities
@@ -149,7 +143,7 @@ struct program : public statement {
program() = default;
explicit program(statements && body) : body(std::move(body)) {}
std::string type() const override { return "Program"; }
[[noreturn]] value execute_impl(context &) override {
value execute_impl(context &) override {
throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead");
}
};
@@ -201,7 +195,7 @@ struct break_statement : public statement {
}
};
[[noreturn]] value execute_impl(context &) override {
value execute_impl(context &) override {
throw break_statement::signal();
}
};
@@ -215,7 +209,7 @@ struct continue_statement : public statement {
}
};
[[noreturn]] value execute_impl(context &) override {
value execute_impl(context &) override {
throw continue_statement::signal();
}
};
@@ -515,7 +509,7 @@ struct slice_expression : public expression {
chk_type<expression>(this->step_expr);
}
std::string type() const override { return "SliceExpression"; }
[[noreturn]] value execute_impl(context &) override {
value execute_impl(context &) override {
throw std::runtime_error("must be handled by MemberExpression");
}
};
+9 -13
View File
@@ -590,10 +590,6 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0;
}
[[noreturn]] static value string_join_not_implemented(const func_args &) {
throw not_implemented_exception("String join builtin not implemented");
}
const func_builtins & value_string_t::get_builtins() const {
static const func_builtins builtins = {
{"default", default_value},
@@ -855,7 +851,9 @@ const func_builtins & value_string_t::get_builtins() const {
res->val_str.mark_input_based_on(val_input->as_string());
return res;
}},
{"join", string_join_not_implemented},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("String join builtin not implemented");
}},
};
return builtins;
}
@@ -886,9 +884,6 @@ const func_builtins & value_bool_t::get_builtins() const {
return builtins;
}
[[noreturn]] static value array_unique_not_implemented(const func_args &) {
throw not_implemented_exception("Array unique builtin not implemented");
}
const func_builtins & value_array_t::get_builtins() const {
static const func_builtins builtins = {
@@ -1089,14 +1084,13 @@ const func_builtins & value_array_t::get_builtins() const {
std::reverse(arr.begin(), arr.end());
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
}},
{"unique", array_unique_not_implemented},
{"unique", [](const func_args &) -> value {
throw not_implemented_exception("Array unique builtin not implemented");
}},
};
return builtins;
}
[[noreturn]] static value object_join_not_implemented(const func_args &) {
throw not_implemented_exception("object join not implemented");
}
const func_builtins & value_object_t::get_builtins() const {
if (!has_builtins) {
@@ -1189,7 +1183,9 @@ const func_builtins & value_object_t::get_builtins() const {
});
return result;
}},
{"join", object_join_not_implemented},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("object join not implemented");
}},
};
return builtins;
}
+18 -21
View File
@@ -129,25 +129,27 @@ struct value_t {
// Note: only for debugging and error reporting purposes
virtual std::string type() const { return ""; }
virtual int64_t as_int() const { throw_type_error("is not an int value"); }
virtual double as_float() const { throw_type_error("is not a float value"); }
virtual string as_string() const { throw_type_error("is not a string value"); }
virtual bool as_bool() const { throw_type_error("is not a bool value"); }
virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
virtual bool is_none() const { return false; }
virtual bool is_undefined() const { return false; }
virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }
virtual const func_builtins & get_builtins() const {
throw std::runtime_error("No builtins available for type " + type());
}
virtual bool has_key(const value &) { throw_type_error("is not an object value"); }
virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); }
virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); }
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); }
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); }
virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); }
virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }
virtual bool is_numeric() const { return false; }
virtual bool is_hashable() const { return false; }
@@ -161,11 +163,6 @@ struct value_t {
// Note: only for debugging purposes
virtual std::string as_repr() const { return as_string().str(); }
private:
[[noreturn]] void throw_type_error(const char* expected) const {
throw std::runtime_error(type() + " " + expected);
}
protected:
virtual bool equivalent(const value_t &) const = 0;
virtual bool nonequal(const value_t & other) const { return !equivalent(other); }
+2 -4
View File
@@ -1,12 +1,10 @@
#include "sampling.h"
#include "common.h"
#include "fit.h"
#include "ggml.h"
#include "log.h"
#include "reasoning-budget.h"
#include "ggml.h"
#include <algorithm>
#include <cctype>
#include <climits>
@@ -513,7 +511,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
common_memory_breakdown_print(ctx);
llama_memory_breakdown_print(ctx);
}
}
-1
View File
@@ -749,7 +749,6 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
mod.reset();
n_low = 0;
i_last = 0;
}
} else {
n_low = 0;
+12 -102
View File
@@ -746,12 +746,7 @@ class ModelBase:
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
with open(quant_config_file, "r", encoding="utf-8") as f:
hf_quant_config = json.load(f)
quant_config = hf_quant_config.get("quantization") or {}
producer = hf_quant_config.get("producer") or {}
producer_name = (producer.get("name") or "").lower()
if quant_method is None:
self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
quant_config = json.load(f).get("quantization") or {}
quant_algo = quant_config.get("quant_algo", quant_algo)
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
@@ -11860,7 +11855,7 @@ class LLaDAMoEModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("HunYuanDenseV1ForCausalLM")
@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
class HunYuanModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
@@ -11999,58 +11994,28 @@ class HunYuanModel(TextModel):
@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanVLVisionModel(MmprojModel):
# Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
# "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
# Each variant maps to a different projector type in clip.cpp so image
# preprocessing follows the correct code path.
class HunyuanOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
# HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
# HunyuanOCR uses max_image_size instead of image_size
if "image_size" not in self.hparams_vision:
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
@staticmethod
def is_ocr_variant(hparams: dict) -> bool:
"""Return True for HunyuanOCR, False for HunyuanVL.
The projector's output dim must equal the text model's hidden_size by
construction (that's what "projector" means). HunyuanOCR pairs a 1B text
backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
ViT -> LLM projection dim is a hard architectural signature, not a
magic number.
"""
vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
return vision_out == 1024
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
vcfg = self.hparams_vision
if self.is_ocr_variant(self.global_config):
# --- HunyuanOCR ---
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
return
# --- HunyuanVL ---
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
hparams = self.hparams_vision
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if not name.startswith("vit."):
return
return # skip text tensors
# strip CLS token (row 0) from position embeddings so resize_position_embeddings works
if "position_embedding" in name:
data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
@@ -12058,66 +12023,11 @@ class HunyuanVLVisionModel(MmprojModel):
def tensor_force_quant(self, name, new_name, bid, n_dims):
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
# Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanVLTextModel(HunYuanModel):
# The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
# and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
# while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
# the config and pick the matching GGUF architecture.
model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
@staticmethod
def _is_ocr_config(hparams: dict) -> bool:
# OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
# outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
# HunyuanVLVisionModel.is_ocr_variant.
return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
def __init__(self, dir_model: Path, *args, **kwargs):
raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
if self._is_ocr_config(raw_hparams):
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
else:
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
super().__init__(dir_model, *args, **kwargs)
def set_gguf_parameters(self):
super().set_gguf_parameters()
# Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
# the HunYuan-Dense arch which already handles standard rope in super().
if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
return
if self.rope_parameters.get("rope_type") != "xdrope":
return
# defaults for HunyuanVL. The C++ side later computes:
# freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1)))
ctx_len = int(self.hparams["max_position_embeddings"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len)
self.gguf_writer.add_context_length(ctx_len)
self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"]))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Skip vision tensors — they are written by HunyuanVLVisionModel
if name.startswith("vit."):
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("SmolLM3ForCausalLM")
class SmolLM3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.SMOLLM3
+3
View File
@@ -244,6 +244,7 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
- `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
- For Intel GPU and NPU detection in containers, the GPU and NPU user-space drivers/libraries must be present inside the image. These will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)
> [!NOTE]
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
@@ -273,6 +274,8 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
Run llama.cpp with OpenVINO backend Docker container.
Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.
> [!NOTE]
> Intel GPU and NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).
```bash
# Run Docker container
-10
View File
@@ -31,8 +31,6 @@ SYCL cross-platform capabilities enable support for other vendor GPUs as well.
## Recommended Release
### Windows
The following releases are verified and recommended:
|Commit ID|Tag|Release|Verified Platform| Update date|
@@ -41,13 +39,6 @@ The following releases are verified and recommended:
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
### Ubuntu 24.04
The release packages for Ubuntu 24.04 x64 (FP32/FP16) only include the binary files of the llama.cpp SYCL backend. They require the target machine to have pre-installed Intel GPU drivers and oneAPI packages that are the same version as the build package. To get the version and installation info, refer to release.yml: ubuntu-24-sycl -> Download & Install oneAPI.
It is recommended to use them with Intel Docker.
The FP32 and FP16 packages have different accuracy and performance on LLMs. Please choose between them according to your test results.
## News
@@ -238,7 +229,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|Verified release|
|-|
|2025.3.3 |
|2025.2.1|
|2025.1|
|2024.1|
+5 -14
View File
@@ -249,27 +249,18 @@ build: 6a8cf8914 (6733)
```
- `GGML_HEXAGON_PROFILE=1`
Enables Op profiling:
Generates a host-side profile for the ggml-hexagon Ops.
- `1` Basic profile with per-op `usecs` and `cycles` counters
- `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
- `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
The logging output can be either saved into a file for post-processing or it can be piped directly into the post-processing tool to generate the report.
Examples:
`GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
- `GGML_HEXAGON_OPSTAGE=0x0`
Allows enabling specific stages of the Op processing pipeline:
- `GGML_HEXAGON_OPMASK=0x0`
Allows enabling specific stages of the processing pipeline:
- `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
- `0x2` Enable Op Compute (MUL_MAT, etc.)
Examples:
`GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled
`GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
`GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
`GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
- `GGML_HEXAGON_OPFILTER=regex`
Allows filtering (disabling) Ops that match the regex pattern:
@@ -25,11 +25,7 @@ MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
TYPE="${OUTTYPE:-f16}"
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
if [[ -n "$MMPROJ" ]]; then
CONVERTED_MODEL="${OUTPUT_DIR}/mmproj-${MODEL_NAME}.gguf"
else
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
fi
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
echo "Model path: ${MODEL_PATH}"
echo "Model name: ${MODEL_NAME}"
@@ -42,7 +38,6 @@ if [[ -n "$DEBUG" ]]; then
else
CMD_ARGS=("python")
fi
CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose")
CMD_ARGS+=("${MODEL_PATH}")
CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
@@ -55,3 +50,7 @@ CMD_ARGS+=("--outtype" "${TYPE}")
echo ""
echo "The environment variable CONVERTED_MODEL can be set to this path using:"
echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
if [[ -n "$MMPROJ" ]]; then
mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
echo "The mmproj model was created in $(realpath "$mmproj_file")"
fi
@@ -8,24 +8,8 @@
#include <clocale>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <string>
#include <vector>
#include <utility>
// Serialized snapshot of a sequence state together with the token count it corresponds to.
struct spec_checkpoint {
    // token count recorded alongside the snapshot
    int64_t n_tokens = 0;

    // raw snapshot bytes
    std::vector<uint8_t> data;

    // number of bytes currently stored in the snapshot
    size_t size() const { return data.size(); }

    // true when the snapshot holds no bytes
    bool empty() const { return data.empty(); }
};
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
@@ -62,14 +46,6 @@ int main(int argc, char ** argv) {
model_tgt = llama_init_tgt->model();
ctx_tgt = llama_init_tgt->context();
// check if the context supports partial sequence removal
const auto ctx_seq_rm = common_context_can_seq_rm(ctx_tgt);
const bool use_ckpt = (ctx_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
if (use_ckpt) {
LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
}
const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
// load the draft model
@@ -143,7 +119,7 @@ int main(int argc, char ** argv) {
const auto t_enc_start = ggml_time_us();
// target model sampling context
common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling));
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
// eval the prompt
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
@@ -166,61 +142,21 @@ int main(int argc, char ** argv) {
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
size_t n_draft = 0;
llama_tokens draft;
spec_checkpoint spec_ckpt;
const auto t_enc_end = ggml_time_us();
const auto t_dec_start = ggml_time_us();
while (true) {
// generate or reuse draft tokens
// optionally, generate draft tokens that can be appended to the target batch
//
// this is the most important part of the speculation. the more probable tokens that are provided here
// the better the performance will be. in theory, this computation can be performed asynchronously and even
// offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
// from a cache or lookup tables.
//
if (draft.empty()) {
// generate a new draft
draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
llama_tokens draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
if ((int) draft.size() > params_spec.n_max) {
LOG_WRN("draft size %zu exceeds max %d, truncating\n", draft.size(), params_spec.n_max);
draft.resize(params_spec.n_max);
}
if ((int) draft.size() < params_spec.n_min) {
LOG_DBG("ignoring small draft: %zu < %d\n", draft.size(), params_spec.n_min);
draft.clear();
}
// save the original draft size
n_draft = draft.size();
// save a checkpoint of the target context before evaluating the draft
// this allows us to restore the state if partial draft acceptance occurs
if (!draft.empty() && use_ckpt) {
const size_t ckpt_size = llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
spec_ckpt.data.resize(ckpt_size);
const size_t n = llama_state_seq_get_data_ext(ctx_tgt, spec_ckpt.data.data(), ckpt_size, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
GGML_ASSERT(n == ckpt_size);
spec_ckpt.n_tokens = (int64_t) prompt_tgt.size();
LOG_DBG("created speculative checkpoint (n_tokens = %" PRId64 ", size = %.3f MiB)\n",
spec_ckpt.n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
}
} else {
// we have a previous (partial) draft to reuse from checkpoint restoration
if (use_ckpt) {
GGML_ASSERT(!spec_ckpt.empty());
}
}
GGML_ASSERT(n_draft > 0);
//LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
// always have a token to evaluate from before - id_last
common_batch_clear(batch_tgt);
@@ -242,12 +178,6 @@ int main(int argc, char ** argv) {
llama_decode(ctx_tgt, batch_tgt);
}
// only save the sampler state if we use checkpoints
common_sampler_ptr smpl_save;
if (use_ckpt) {
smpl_save.reset(common_sampler_clone(smpl.get()));
}
// sample from the full target batch and return the accepted tokens based on the target sampler
//
// for each token to be accepted, the sampler would have to sample that same token
@@ -255,38 +185,14 @@ int main(int argc, char ** argv) {
// available logits from the batch and sample the next token until we run out of logits or the sampler
// disagrees with the draft
//
auto ids = common_sampler_sample_and_accept_n(smpl.get(), ctx_tgt, draft);
const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
//LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());
GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
// check for partial draft acceptance:
// if the context doesn't support partial sequence removal, restore the checkpoint
// and make the accepted tokens the new partial draft for the next iteration
if (use_ckpt && ids.size() - 1 < draft.size()) {
LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size());
draft = std::move(ids);
const size_t n = llama_state_seq_set_data_ext(ctx_tgt, spec_ckpt.data.data(), spec_ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
GGML_ASSERT(n == spec_ckpt.size());
llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, spec_ckpt.n_tokens, -1);
prompt_tgt.resize(spec_ckpt.n_tokens);
smpl = std::move(smpl_save);
n_past = (int) prompt_tgt.size();
continue;
}
common_speculative_accept(spec, ids.size() - 1);
// full acceptance: consume the draft and commit accepted tokens
n_past += ids.size() - 1;
n_drafted += n_draft; // note: we ignore the discarded small drafts
n_drafted += draft.size(); // note: we ignore the discarded small drafts
n_accept += ids.size() - 1;
n_predict += ids.size();
@@ -316,9 +222,6 @@ int main(int argc, char ** argv) {
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
// clear the draft since it has been consumed
draft.clear();
{
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
@@ -351,10 +254,11 @@ int main(int argc, char ** argv) {
LOG_INF("\n");
LOG_INF("target:\n\n");
common_perf_print(ctx_tgt, smpl.get());
common_perf_print(ctx_tgt, smpl);
llama_batch_free(batch_tgt);
common_sampler_free(smpl);
common_speculative_free(spec);
llama_backend_free();
+3 -3
View File
@@ -4,8 +4,8 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 10)
set(GGML_VERSION_PATCH 0)
set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 11)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -213,7 +213,7 @@ set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_GRAPHS "ggml: use HIP graph" ON)
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_RCCL "ggml: use ROCm Collective Comm. Library" OFF)
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
+1 -1
View File
@@ -473,7 +473,7 @@ target_link_libraries(ggml-base PRIVATE Threads::Threads)
find_library(MATH_LIBRARY m)
if (MATH_LIBRARY)
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
target_link_libraries(ggml-base PRIVATE m)
endif()
endif()
+80 -180
View File
@@ -1133,7 +1133,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) {
t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);
if (t_ij->view_offs > 0 && split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
GGML_ASSERT(tensor->ne[split_dim] != 0);
GGML_ASSERT(ne[split_dim] != 0 && tensor->ne[split_dim] != 0);
const int split_dim_view_src = ggml_backend_meta_get_split_state(tensor->view_src, /*assume_sync =*/ true).axis;
GGML_ASSERT(split_dim_view_src >= 0 && split_dim_view_src < GGML_MAX_DIMS);
@@ -1170,28 +1170,6 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
simple_tensors.push_back(t_ij);
}
// If one of the sources has a zero-sized slice, disable the computation:
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (tensor->src[i] == nullptr || !ggml_backend_buffer_is_meta(tensor->src[i]->buffer)) {
continue;
}
const ggml_backend_meta_split_state split_state_src = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
if (split_state_src.axis < 0 || split_state_src.axis >= GGML_MAX_DIMS) {
continue;
}
for (size_t j = 0; j < n_simple_bufs; j++) {
int64_t ne_sum = 0;
for (size_t s = 0; s < split_state_src.n_segments; s++) {
ne_sum += split_state_src.ne[s*n_simple_bufs + j];
}
if (ne_sum == 0) {
simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
}
}
}
buf_ctx->simple_tensors[tensor] = simple_tensors;
return GGML_STATUS_SUCCESS;
@@ -1464,20 +1442,17 @@ struct ggml_backend_meta_context {
struct backend_config {
ggml_backend_t backend;
std::vector<cgraph_config> cgraphs;
std::vector<ggml_tensor *> nodes;
std::vector<ggml_backend_buffer_ptr> bufs;
std::vector<cgraph_config> cgraphs;
std::vector<ggml_tensor *> nodes;
ggml_backend_buffer_ptr buf;
backend_config(ggml_backend_t backend, const size_t n_reduce_steps) : backend(backend) {
bufs.resize(n_reduce_steps);
}
backend_config(ggml_backend_t backend) : backend(backend) {}
};
std::string name;
std::vector<backend_config> backend_configs;
ggml_context_ptr ctx;
std::vector<ggml_cgraph *> cgraphs_aux;
std::vector<ggml_tensor *> nodes_aux;
size_t n_reduce_steps;
int max_nnodes = 0;
size_t max_tmp_size = 0;
size_t max_subgraphs = 0;
@@ -1489,7 +1464,6 @@ struct ggml_backend_meta_context {
ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
n_reduce_steps = std::ceil(std::log2(n_devs));
name = "Meta(";
std::vector<ggml_backend_t> simple_backends;
backend_configs.reserve(n_devs);
@@ -1501,7 +1475,7 @@ struct ggml_backend_meta_context {
}
name += ggml_backend_dev_name(simple_dev);
simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
backend_configs.emplace_back(simple_backends.back(), n_reduce_steps);
backend_configs.emplace_back(simple_backends.back());
}
name += ")";
@@ -1531,6 +1505,10 @@ struct ggml_backend_meta_context {
ggml_backend_free(bc.backend);
}
}
// Number of pairwise reduction steps needed to combine the results of all
// configured backends, i.e. ceil(log2(n)) for n = backend_configs.size().
//
// Computed with integer arithmetic instead of std::ceil(std::log2(n)):
//  - the result is exact for every n (no float rounding involved),
//  - n == 0 and n == 1 both yield 0, whereas log2(0) is -inf and converting
//    that to size_t is undefined.
// The loop mirrors a doubling-offset reduction (offset = 1, 2, 4, ... < n),
// so the returned value equals the number of iterations such a loop performs.
size_t n_reduce_steps() const {
    const size_t n_backends = backend_configs.size();
    size_t n_steps = 0;
    for (size_t offset = 1; offset < n_backends; offset *= 2) {
        n_steps++;
    }
    return n_steps;
}
};
static const char * ggml_backend_meta_get_name(ggml_backend_t backend) {
@@ -1683,36 +1661,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
ggml_tensor * node = cgraph->nodes[id];
int32_t n_used = ggml_node_get_use_count(cgraph, id);
// Skip MIRRORED nodes that don't consume node
auto skip_unrelated = [&]() {
while (id + 1 < cgraph->n_nodes) {
ggml_tensor * next = cgraph->nodes[id+1];
if (ggml_backend_meta_get_split_state(next, false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
break;
}
bool safe = true;
for (int s = 0; s < GGML_MAX_SRC; s++) {
if (next->src[s] == nullptr) {
continue;
}
if (next->src[s] == node) {
safe = false;
break;
}
if (ggml_backend_meta_get_split_state(next->src[s], false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
safe = false;
break;
}
}
if (!safe) {
break;
}
id++;
}
};
skip_unrelated();
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
@@ -1727,12 +1675,10 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
n_used = ggml_node_get_use_count(cgraph, id);
}
}
// Chain of MULs with MIRRORED src[1]
while (true) {
skip_unrelated();
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_MUL && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
@@ -1740,8 +1686,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
} else {
break;
}
}
@@ -1810,17 +1754,16 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
if (max_tmp_size > backend_ctx->max_tmp_size) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i = 0; i < backend_ctx->n_reduce_steps; i++) {
bcj.bufs[i].reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
backend_ctx->max_tmp_size = max_tmp_size;
}
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
const size_t n_nodes_per_device = 3 * backend_ctx->n_reduce_steps; // tmp + ADD (+zeroing) graph per step and device
const size_t n_cgraphs_per_device = 2 * backend_ctx->n_reduce_steps; // ADD ( + zeroing) graph per step and device
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
@@ -1869,6 +1812,11 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
size_t iga = 0; // i graph aux
size_t ina = 0; // i node aux
// FIXME usage_counts
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
return ret;
};
auto get_node_aux = [&](ggml_tensor * t) -> ggml_tensor * {
ggml_tensor * ret = backend_ctx->nodes_aux[ina++];
memset(ret, 0, sizeof(ggml_tensor));
@@ -1880,110 +1828,75 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
return ret;
};
auto set_tmp_data = [&](ggml_tensor * tensor, const size_t j, const size_t i_buf) {
auto & bcj = backend_ctx->backend_configs[j];
ggml_backend_buffer_ptr & buf_ptr = bcj.bufs[i_buf];
if (!buf_ptr || ggml_backend_buffer_get_size(buf_ptr.get()) < backend_ctx->max_tmp_size) {
buf_ptr.reset(ggml_backend_alloc_buffer(bcj.backend, backend_ctx->max_tmp_size));
}
tensor->buffer = buf_ptr.get();
tensor->data = ggml_backend_buffer_get_base(buf_ptr.get());
};
// FIXME usage_counts
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
return ret;
};
// Preferentially use backend-specific allreduce_tensor_async (e.g. NCCL for CUDA), use a generic fallback if unavailable:
auto allreduce_fallback = [&](size_t i) -> ggml_status {
std::vector<ggml_cgraph *> step_cgraphs(n_backends, nullptr);
// Zero out nodes that were disabled due to having a zero-sized slice:
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
ggml_tensor * node = bcj.cgraphs[i].cgraph_main->nodes[bcj.cgraphs[i].cgraph_main->n_nodes - 1];
if (node->flags & GGML_TENSOR_FLAG_COMPUTE) {
continue;
}
ggml_tensor * node_zero = get_node_aux(node);
node_zero->op = GGML_OP_SCALE; // FIXME 0.0f * NaN == NaN
node_zero->src[0] = node;
ggml_set_op_params_f32(node_zero, 0, 0.0f);
node_zero->data = node->data;
node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE;
step_cgraphs[j] = get_cgraph_aux();
step_cgraphs[j]->nodes[0] = node_zero;
step_cgraphs[j]->n_nodes = 1;
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, step_cgraphs[j]);
if (status != GGML_STATUS_SUCCESS) {
return status;
}
}
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
auto push_data = [&](const size_t j_src, const size_t j_dst, const size_t i_buf) {
assert(step_cgraphs[j_dst] == nullptr);
auto & bcj_src = backend_ctx->backend_configs[j_src];
auto & bcj_dst = backend_ctx->backend_configs[j_dst];
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
GGML_ASSERT(ggml_is_contiguous(node_src));
GGML_ASSERT(ggml_is_contiguous(node_dst));
ggml_tensor * node_tmp = get_node_aux(node_dst);
set_tmp_data(node_tmp, j_dst, i_buf);
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_tmp);
ggml_tensor * node_red = get_node_aux(node_dst);
node_red->view_src = node_dst->view_src == nullptr ? node_dst : node_dst->view_src;
node_red->view_offs = node_dst->view_offs;
node_red->op = GGML_OP_ADD;
node_red->src[0] = node_dst;
node_red->src[1] = node_tmp;
node_red->flags |= GGML_TENSOR_FLAG_COMPUTE;
ggml_backend_view_init(node_red);
ggml_cgraph * cgraph_aux = get_cgraph_aux();
cgraph_aux->nodes[0] = node_red;
cgraph_aux->n_nodes = 1;
step_cgraphs[j_dst] = cgraph_aux;
};
size_t offset_j = n_backends/2;
while ((offset_j & (offset_j - 1)) != 0) {
offset_j--;
}
const size_t offset_j_max = offset_j;
size_t i_buf = 0;
// If n_backends is not a power of 2, fold in the excess prior to butterfly reduction:
for (size_t j_src = 2*offset_j_max; j_src < n_backends; j_src++) {
const size_t j_dst = j_src - 2*offset_j_max;
push_data(j_src, j_dst, i_buf);
const ggml_status status = ggml_backend_graph_compute_async(backend_ctx->backend_configs[j_dst].backend, step_cgraphs[j_dst]);
if (status != GGML_STATUS_SUCCESS) {
return status;
}
i_buf = 1;
}
// Butterfly reduction:
for (; offset_j >= 1; offset_j /= 2) {
for (size_t offset_j = 1; offset_j < n_backends; offset_j *= 2) {
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
for (size_t j = 0; j < 2*offset_j_max; j++) {
for (size_t j = 0; j < n_backends; j++) {
const size_t j_other = j ^ offset_j;
if (j_other >= n_backends) {
if (j_other > j) {
continue;
}
push_data(j, j_other, i_buf);
auto & bcj1 = backend_ctx->backend_configs[j];
auto & bcj2 = backend_ctx->backend_configs[j_other];
ggml_tensor * node1 = bcj1.cgraphs[i].cgraph_main->nodes[bcj1.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_tensor * node2 = bcj2.cgraphs[i].cgraph_main->nodes[bcj2.cgraphs[i].cgraph_main->n_nodes - 1];
GGML_ASSERT(ggml_is_contiguous(node1));
GGML_ASSERT(ggml_is_contiguous(node2));
// Tmp tensors to receive P2P copies
ggml_tensor * node_tmp_1 = get_node_aux(node1);
node_tmp_1->buffer = bcj1.buf.get();
node_tmp_1->data = ggml_backend_buffer_get_base(bcj1.buf.get());
ggml_tensor * node_tmp_2 = get_node_aux(node2);
node_tmp_2->buffer = bcj2.buf.get();
node_tmp_2->data = ggml_backend_buffer_get_base(bcj2.buf.get());
// 2 P2P copies: exchange full buffers
ggml_backend_tensor_copy_async(bcj1.backend, bcj2.backend, node1, node_tmp_2);
ggml_backend_tensor_copy_async(bcj2.backend, bcj1.backend, node2, node_tmp_1);
// Local ADD: node1 += tmp1 (in-place via view)
ggml_tensor * node_red_1 = get_node_aux(node1);
node_red_1->view_src = node1->view_src == nullptr ? node1 : node1->view_src;
node_red_1->view_offs = node1->view_offs;
node_red_1->op = GGML_OP_ADD;
node_red_1->src[0] = node1;
node_red_1->src[1] = node_tmp_1;
node_red_1->flags |= GGML_TENSOR_FLAG_COMPUTE;
ggml_backend_view_init(node_red_1);
// Local ADD: node2 += tmp2 (in-place via view)
ggml_tensor * node_red_2 = get_node_aux(node2);
node_red_2->view_src = node2->view_src == nullptr ? node2 : node2->view_src;
node_red_2->view_offs = node2->view_offs;
node_red_2->op = GGML_OP_ADD;
node_red_2->src[0] = node2;
node_red_2->src[1] = node_tmp_2;
node_red_2->flags |= GGML_TENSOR_FLAG_COMPUTE;
ggml_backend_view_init(node_red_2);
// Build 1-node cgraphs for the ADD ops
ggml_cgraph * cgraph_aux_1 = get_cgraph_aux();
cgraph_aux_1->nodes[0] = node_red_1;
cgraph_aux_1->n_nodes = 1;
step_cgraphs[j] = cgraph_aux_1;
ggml_cgraph * cgraph_aux_2 = get_cgraph_aux();
cgraph_aux_2->nodes[0] = node_red_2;
cgraph_aux_2->n_nodes = 1;
step_cgraphs[j_other] = cgraph_aux_2;
}
for (size_t j = 0; j < 2*offset_j_max; j++) {
// Execute local ADDs for this step
for (size_t j = 0; j < n_backends; j++) {
if (step_cgraphs[j] == nullptr) {
continue;
}
@@ -1993,20 +1906,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
return status;
}
}
i_buf++;
}
assert(i_buf == backend_ctx->n_reduce_steps);
// If n_backends is not a power of 2, copy back the reduced tensors to the excess:
for (size_t j = 2*offset_j_max; j < n_backends; j++) {
auto & bcj_src = backend_ctx->backend_configs[j - 2*offset_j_max];
auto & bcj_dst = backend_ctx->backend_configs[j];
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_dst);
}
return GGML_STATUS_SUCCESS;
};
+1
View File
@@ -83,6 +83,7 @@
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// quants.c
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
+25 -5
View File
@@ -151,6 +151,8 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const block_q1_0 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
float sumf = 0.0f;
#if defined(__ARM_NEON)
float32x4_t sumv = vdupq_n_f32(0.0f);
@@ -210,13 +212,31 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
}
*s = vaddvq_f32(sumv);
sumf = vaddvq_f32(sumv);
#else
UNUSED(nb);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
// Scalar fallback
for (int i = 0; i < nb; i++) {
const float d0 = GGML_FP16_TO_FP32(x[i].d);
// Process 4 Q8_0 blocks
for (int k = 0; k < 4; k++) {
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
int sumi = 0;
for (int j = 0; j < QK8_0; j++) {
const int bit_index = k * QK8_0 + j;
const int byte_index = bit_index / 8;
const int bit_offset = bit_index % 8;
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
sumi += xi * y[i*4 + k].qs[j];
}
sumf += d0 * d1 * sumi;
}
}
#endif
*s = sumf;
}
-158
View File
@@ -274,18 +274,6 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
}
#endif
#elif defined(__SSSE3__)
// Expand 16 packed bits (the two bytes read from x) into 16 byte lanes:
// lane i of the result is 0xFF when bit i of the loaded 16-bit value is set and 0x00 otherwise.
// Lanes 0-7 are derived from byte 0 of the broadcast value, lanes 8-15 from byte 1.
static inline __m128i bytes_from_bits_16(const uint8_t * x) {
uint16_t x16;
// load the two source bytes via memcpy (presumably to avoid a misaligned 16-bit load through a uint8_t pointer)
memcpy(&x16, x, sizeof(uint16_t));
// shuffle control: the low 8 lanes pick byte 0, the high 8 lanes pick byte 1 of the broadcast value
const __m128i shuf_mask = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
__m128i bytes = _mm_shuffle_epi8(_mm_set1_epi16((short) x16), shuf_mask);
// every lane of bit_mask has all bits set except one (0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f),
// so after the OR a lane is all-ones only if that one missing bit was set in the source byte
const __m128i bit_mask = _mm_set_epi64x(0x7fbfdfeff7fbfdfe, 0x7fbfdfeff7fbfdfe);
bytes = _mm_or_si128(bytes, bit_mask);
// all-ones lanes -> 0xFF, everything else -> 0x00
return _mm_cmpeq_epi8(bytes, _mm_set1_epi64x(-1));
}
// horizontally add 4x4 floats
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
__m128 res_0 =_mm_hadd_ps(a, b);
@@ -552,152 +540,6 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif
// Dot product of a q1_0 row (vx) with a q8_0 row (vy); the scalar result is written to *s.
//
//   n   - number of elements, must be a multiple of QK1_0 (asserted)
//   s   - output: receives the accumulated dot product
//   vx  - block_q1_0 data: one fp16 scale `d` plus packed sign bits `qs` per block
//   vy  - block_q8_0 data: every q1_0 block is paired with 4 consecutive q8_0 blocks (y[ib*4 + 0..3])
//   nrc - must be 1 (asserted)
//   bs, bx, by - not used by the SIMD paths; only forwarded to the generic fallback
//
// In all SIMD paths a set bit in x.qs keeps the matching q8 value and a cleared bit negates it;
// the signed values are summed per q8_0 block, scaled by that block's `d`, and finally by the q1_0 `d`.
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK1_0;
const int nb = n / qk;
assert(n % qk == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q1_0 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
#if defined(__AVX2__)
// AVX2 path: one 256-bit register covers the 32 int8 values of a single q8_0 block
const __m256i ones_8 = _mm256_set1_epi8(1);
const __m256i ones_16 = _mm256_set1_epi16(1);
// byte_shuf replicates source byte j/8 into lane j, so each of the 32 lanes sees the byte holding its bit
const __m256i byte_shuf = _mm256_setr_epi8(
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
// bit_masks isolates bit j%8 in lane j
const __m256i bit_masks = _mm256_setr_epi8(
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128,
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128);
const __m256i zero = _mm256_setzero_si256();
__m256 acc = _mm256_setzero_ps();
for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
// view the sign bits as four 32-bit words, one per paired q8_0 block
const uint32_t * GGML_RESTRICT qs32 = (const uint32_t *) x[ib].qs;
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
__m256 acc_block;
// first q8_0 block initialises acc_block (plain multiply), the remaining three are fused-multiply-added
{
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[0].qs);
// sm is all-ones in lanes whose sign bit is CLEAR
const __m256i sm = _mm256_cmpeq_epi8(
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[0]), byte_shuf), bit_masks), zero);
// (qy ^ sm) - sm negates qy where sm is all-ones and leaves it unchanged elsewhere
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
// horizontal widening sum: int8 pairs -> int16 (maddubs with 1), int16 pairs -> int32 (madd with 1)
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), _mm256_cvtepi32_ps(s32));
}
for (int K = 1; K < 4; ++K) {
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[K].qs);
const __m256i sm = _mm256_cmpeq_epi8(
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[K]), byte_shuf), bit_masks), zero);
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
acc_block = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), _mm256_cvtepi32_ps(s32), acc_block);
}
// apply the q1_0 block scale once for all four q8_0 blocks
acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
}
*s = hsum_float_8(acc);
#elif defined(__AVX__)
// AVX (no AVX2) path: integer work is done on 128-bit halves, float accumulation on 256 bits
const __m128i ones_8 = _mm_set1_epi8(1);
const __m128i ones_16 = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m256 acc = _mm256_setzero_ps();
for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
__m256 acc_block;
{
// bytes_from_bits_32 is defined elsewhere; presumably it yields a non-zero byte per set bit - TODO confirm
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[0]);
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[0]);
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[16]);
// sign masks are all-ones where the expanded bit byte is zero
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
// conditional negate: (q ^ m) - m
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), q);
}
// same computation for the remaining three q8_0 blocks; 4 bytes of sign bits per block
for(int K = 1; K < 4; ++K) {
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[(K) * 4]);
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[0]);
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[16]);
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(K)].d)), q));
}
// NOTE(review): Q1_AVX_BLOCK is not defined anywhere in this function; this #undef looks like a leftover - confirm
#undef Q1_AVX_BLOCK
acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
}
*s = hsum_float_8(acc);
#elif defined(__SSSE3__)
// SSSE3 path: four independent 128-bit float accumulators, one per paired q8_0 block
const __m128i ones_8 = _mm_set1_epi8(1);
const __m128i ones_16 = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m128 acc_0 = _mm_setzero_ps();
__m128 acc_1 = _mm_setzero_ps();
__m128 acc_2 = _mm_setzero_ps();
__m128 acc_3 = _mm_setzero_ps();
for (int ib = 0; ib < nb; ++ib) {
const __m128 d0 = _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
// Q1_SSSE3_BLOCK(QS_OFF, Y_IDX, ACC): process q8_0 block Y_IDX using the 4 sign-bit bytes starting at
// x[ib].qs[QS_OFF]; expands the bits with bytes_from_bits_16 (two 16-bit halves), conditionally negates
// the q8 values, sums them to int32 and adds d0 * d(y) * sum into ACC.
#define Q1_SSSE3_BLOCK(QS_OFF, Y_IDX, ACC) \
{ \
const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 0]); \
const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 2]); \
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[0]); \
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[16]); \
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero); \
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero); \
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0); \
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1); \
const __m128i sum_0 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_0), ones_16); \
const __m128i sum_1 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_1), ones_16); \
const __m128 q = _mm_cvtepi32_ps(_mm_add_epi32(sum_0, sum_1)); \
(ACC) = _mm_add_ps((ACC), _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(Y_IDX)].d))), q)); \
}
Q1_SSSE3_BLOCK(0, 0, acc_0)
Q1_SSSE3_BLOCK(4, 1, acc_1)
Q1_SSSE3_BLOCK(8, 2, acc_2)
Q1_SSSE3_BLOCK(12, 3, acc_3)
#undef Q1_SSSE3_BLOCK
}
*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#else
// no supported SIMD extension: defer to the generic implementation
UNUSED(nb);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
+9 -15
View File
@@ -137,28 +137,22 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
float sumf = 0.0;
for (int i = 0; i < nb; i++) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
const float d0 = GGML_FP16_TO_FP32(x[i].d);
float sumi = 0.0f;
for (int k = 0; k < 4; k++) {
const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
int sumi_block = 0;
const uint8_t * GGML_RESTRICT bits = &x[i].qs[k * 4];
const int8_t * GGML_RESTRICT qy = yb->qs;
for (int j = 0; j < QK8_0; j++) {
const int bit_index = k * QK8_0 + j;
const int byte_index = bit_index / 8;
const int bit_offset = bit_index % 8;
for (int b = 0; b < 4; ++b, qy += 8) {
const unsigned mask = bits[b];
sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
+ ((mask & 0x02) ? qy[1] : -qy[1])
+ ((mask & 0x04) ? qy[2] : -qy[2])
+ ((mask & 0x08) ? qy[3] : -qy[3])
+ ((mask & 0x10) ? qy[4] : -qy[4])
+ ((mask & 0x20) ? qy[5] : -qy[5])
+ ((mask & 0x40) ? qy[6] : -qy[6])
+ ((mask & 0x80) ? qy[7] : -qy[7]);
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
sumi_block += xi * y[i*4 + k].qs[j];
}
sumi += d1 * sumi_block;
+3 -63
View File
@@ -368,21 +368,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
}
~ggml_cuda_pool_leg() {
clear_pool();
GGML_ASSERT(pool_size == 0);
}
void clear_pool() {
ggml_cuda_set_device(device);
for (int i = 0; i < MAX_BUFFERS; ++i) {
ggml_cuda_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
CUDA_CHECK(cudaFree(b.ptr));
pool_size -= b.size;
b.ptr = nullptr;
b.size = 0;
}
}
GGML_ASSERT(pool_size == 0);
}
void * alloc(size_t size, size_t * actual_size) override {
@@ -427,20 +421,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
size_t look_ahead_size = (size_t) (1.05 * size);
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
ggml_cuda_set_device(device);
cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
if (err == cudaErrorMemoryAllocation) {
(void)cudaGetLastError();
const size_t cached_bytes = pool_size;
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, flushing %.2f MiB of cached buffers and retrying\n",
device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
CUDA_CHECK(cudaDeviceSynchronize());
clear_pool();
err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
if (err == cudaSuccess) {
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
}
}
CUDA_CHECK(err);
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
*actual_size = look_ahead_size;
pool_size += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC
@@ -1222,13 +1203,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
// For small tensors, simply reduce them as FP32.
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
for (size_t i = 0; i < n_backends; ++i) {
if ((tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
ggml_cuda_set_device(cuda_ctx->device);
CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, ggml_nbytes(tensors[i]), cuda_ctx->stream()));
}
}
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
@@ -1250,11 +1224,7 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
tmp[i].alloc(ne);
ggml_cuda_set_device(cuda_ctx->device);
if (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) {
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
} else {
CUDA_CHECK(cudaMemsetAsync(tmp[i].get(), 0, ne * sizeof(nv_bfloat16), cuda_ctx->stream()));
}
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
}
@@ -3592,30 +3562,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
return true;
}
if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_SQR
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_RELU) {
const ggml_tensor * unary = cgraph->nodes[node_idx];
const ggml_tensor * sqr = cgraph->nodes[node_idx+1];
if (ggml_get_unary_op(unary) != GGML_UNARY_OP_RELU) {
return false;
}
if (unary->type != GGML_TYPE_F32 && unary->type != GGML_TYPE_F16) {
return false;
}
if (unary->type != sqr->type) {
return false;
}
if (!ggml_is_contiguous(unary->src[0])) {
return false;
}
return true;
}
if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
const ggml_tensor *scale = cgraph->nodes[node_idx];
@@ -4124,12 +4070,6 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) {
ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i+1]);
i++;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
i += 2;
ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
-23
View File
@@ -65,11 +65,6 @@ static __device__ __forceinline__ float op_sqr(float x) {
return x * x;
}
// Squared ReLU: x*x for positive x, 0 for everything else.
static __device__ __forceinline__ float op_relu_sqr(float x) {
    // Non-positive inputs (and NaN, for which the comparison is false) map to 0,
    // matching what clamping with fmaxf(x, 0.0f) before squaring produces.
    return x > 0.0f ? x * x : 0.0f;
}
static __device__ __forceinline__ float op_sqrt(float x) {
return sqrtf(x);
}
@@ -620,21 +615,3 @@ void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary
GGML_ABORT("Unsupported unary op for fused unary+mul");
}
}
// Fused RELU + SQR.
//
// Reads the input of `relu_node` and writes the squared-ReLU result directly into
// `sqr_node`'s data, launching a single unary kernel on the context's stream.
// The input must be contiguous, of type F32 or F16, and match the type of `sqr_node`.
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node) {
    const ggml_tensor * src = relu_node->src[0];
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous(src));
    GGML_ASSERT(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
    GGML_ASSERT(src->type == sqr_node->type);

    const int n_elems = ggml_nelements(src);

    switch (src->type) {
        case GGML_TYPE_F16:
            unary_cuda<op_relu_sqr>((const half *)src->data, (half *)sqr_node->data, n_elems, stream);
            break;
        default:
            // only F32 can reach here: the type assertion above rejects everything else
            unary_cuda<op_relu_sqr>((const float *)src->data, (float *)sqr_node->data, n_elems, stream);
            break;
    }
}
-2
View File
@@ -91,8 +91,6 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node);
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node);
__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
return x / (1.0f + expf(-x));
}
-1
View File
@@ -58,7 +58,6 @@
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorMemoryAllocation hipErrorOutOfMemory
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
-1
View File
@@ -42,7 +42,6 @@
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorMemoryAllocation musaErrorMemoryAllocation
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
+150 -335
View File
@@ -12,12 +12,9 @@
#include <cstddef>
#include <stdexcept>
#include <string>
#include <sstream>
#include <iomanip>
#include <unordered_set>
#include <unordered_map>
#include <regex>
#include <queue>
#ifdef _WIN32
# include <sal.h>
@@ -44,26 +41,18 @@
#include "htp_iface.h"
#include "htp-drv.h"
using intvec = std::vector<int>;
using uintvec = std::vector<unsigned int>;
using u32vec = std::vector<uint32_t>;
static size_t opt_ndev = 1;
static size_t opt_nhvx = 0; // use all
static int opt_arch = 0; // autodetect
static int opt_etm = 0;
static int opt_verbose = 0;
static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
static int opt_profile = 0;
static int opt_hostbuf = 1; // hostbuf ON by default
static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
// Default PMU events, if profiling with PMU (mode=2) is enabled
// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
// https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
// Enable all stages by default
static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_COMPUTE;
static int opt_opsync = 0; // synchronous ops
static int opt_opbatch = 1024; // max number of ops in a batch
static int opt_opqueue = 16; // max number of pending batches
static std::regex* opt_opfilter = NULL; // regex of ops to not claim
@@ -115,26 +104,19 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct
}
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
if (!opt_profile) return;
op_desc desc(op);
char pmu_str[256] = "";
if (opt_profile > 1) {
static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
}
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
}
// ** backend sessions
struct ggml_hexagon_opbatch;
struct ggml_hexagon_opqueue;
struct ggml_hexagon_opshm;
struct ggml_hexagon_session {
std::string name;
@@ -150,8 +132,8 @@ struct ggml_hexagon_session {
bool valid_iface;
std::atomic<int> op_pending;
ggml_hexagon_opbatch* op_batch;
ggml_hexagon_opqueue* op_queue;
ggml_hexagon_opbatch *op_batch;
ggml_hexagon_opshm *op_shm;
ggml_backend_buffer_type buffer_type = {};
ggml_backend_buffer_type repack_buffer_type = {};
@@ -1539,14 +1521,65 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
// Backend session implementation
struct ggml_hexagon_opshm {
ggml_hexagon_shared_buffer *sbuf;
std::vector<bool> block_mask;
size_t block_size;
uint8_t * base() const { return this->sbuf->base; }
int fd() const { return this->sbuf->fd; }
size_t n_blocks() const { return this->block_mask.size(); }
ggml_hexagon_opshm(ggml_hexagon_session *sess, size_t max_batch, size_t max_pending) {
size_t n_bufs = HTP_OP_MAX_BUFS;
size_t n_ops = max_batch;
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
block_mask.resize(max_pending, true);
block_size = sizeof(htp_buf_desc) * n_bufs +
sizeof(htp_tensor) * n_tensors +
sizeof(htp_op_desc) * n_ops;
sbuf = new ggml_hexagon_shared_buffer(sess, block_size * block_mask.size(), true /* pinned */);
if (opt_verbose) {
GGML_LOG_INFO("ggml-hex: %s allocated shared buf %zu : block-size %zu max-batch %zu max-pending %zu\n",
sess->c_name(), (size_t) sbuf->size, block_size, max_batch, max_pending);
}
}
~ggml_hexagon_opshm() {
delete sbuf;
}
uint8_t * allocate() {
auto it = std::find(block_mask.begin(), block_mask.end(), true);
if (it == block_mask.end())
return nullptr;
unsigned int i = std::distance(block_mask.begin(), it);
uint8_t* addr = sbuf->base + (i * block_size);
block_mask[i] = false;
HEX_VERBOSE("ggml-hex: %s allocated op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
return addr;
}
void release(uint8_t * addr) {
int i = (addr - sbuf->base) / block_size;
block_mask[i] = true;
HEX_VERBOSE("ggml-hex: %s released op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
}
};
struct ggml_hexagon_opbatch {
ggml_hexagon_session* sess;
const char* name;
std::vector<const ggml_tensor*> ops; // pointers to original ops
std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
std::vector<htp_tensor> h_tens; // htp tensor descriptors
std::vector<htp_op_desc> h_ops; // htp op descriptors
std::vector<htp_buf_desc> buffers;
std::vector<htp_tensor> tensors;
std::vector<htp_op_desc> ops;
std::unordered_map<int, int> b_map; // buffer fd to index
std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr to index
@@ -1573,21 +1606,19 @@ struct ggml_hexagon_opbatch {
d_map.clear();
}
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size) {
this->sess = sess;
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t max_batch) {
name = sess->c_name();
n_bufs_max = HTP_OP_MAX_BUFS;
n_ops_max = batch_size;
n_ops_max = max_batch;
n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
b_vmem_max = HTP_OP_MAX_VMEM;
buffers.resize(n_bufs_max);
tensors.resize(n_tens_max);
ops.resize(n_ops_max);
h_bufs.resize(n_bufs_max);
h_tens.resize(n_tens_max);
h_ops.resize(n_ops_max);
b_map.reserve(n_bufs_max);
t_map.reserve(n_tens_max);
d_map.reserve(n_tens_max);
@@ -1609,7 +1640,7 @@ struct ggml_hexagon_opbatch {
b_map.insert({sbuf->fd, bi});
htp_buf_desc &b = h_bufs[bi];
htp_buf_desc &b = buffers[bi];
b.base = (uint64_t) sbuf->base;
b.fd = sbuf->fd;
b.size = sbuf->size;
@@ -1633,7 +1664,7 @@ struct ggml_hexagon_opbatch {
// First lookup by tensor data
auto range = d_map.equal_range(t->data);
for (auto it = range.first; it != range.second; ++it) {
htp_tensor * h = &h_tens[it->second];
htp_tensor * h = &tensors[it->second];
if (same_shape(h, t)) { return it->second; }
}
@@ -1651,7 +1682,7 @@ struct ggml_hexagon_opbatch {
uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
size_t t_size = ggml_nbytes(t);
htp_tensor &h = h_tens[ti];
htp_tensor &h = tensors[ti];
h.bi = add_buffer(sbuf);
h.data = t_offset;
h.size = t_size;
@@ -1706,170 +1737,65 @@ struct ggml_hexagon_opbatch {
// assumes that fit_op() was called first and returned true
void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
// Add new op
unsigned int n = n_ops++;
htp_op_desc &o = ops[n_ops++];
GGML_ASSERT(n_ops <= n_ops_max);
ops[n] = t;
htp_op_desc &o = h_ops[n];
memcpy(&o.params, &t->op_params, sizeof(t->op_params));
o.opcode = opcode;
o.flags = 0;
if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
}
ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
ggml_hexagon_dump_op_exec(name, t, o.flags);
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
}
o.dst = add_tensor(t);
}
};
struct ggml_hexagon_opqueue {
// Shared buffer for storing batches
ggml_hexagon_shared_buffer *shm_buf;
size_t shm_blk_size;
size_t flush(uint8_t * mem_addr, size_t mem_size) {
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
using opvec = std::vector<const ggml_tensor*>;
const size_t b_size = sizeof(htp_buf_desc) * n_bufs;
const size_t t_size = sizeof(htp_tensor) * n_tens;
const size_t o_size = sizeof(htp_op_desc) * n_ops;
std::queue<unsigned int> done; // completed batch ids
std::vector<opvec> op_cache; // per batch op cache
std::vector<uint64_t> start_usec; // per batch start time
const size_t m_size = b_size + t_size + o_size;
GGML_ASSERT(m_size <= mem_size);
ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
size_t n_bufs = HTP_OP_MAX_BUFS;
size_t n_ops = batch_size;
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
uint8_t * b_ptr = (uint8_t *) mem_addr;
uint8_t * t_ptr = (uint8_t *) b_ptr + b_size;
uint8_t * o_ptr = (uint8_t *) t_ptr + t_size;
shm_blk_size = sizeof(htp_buf_desc) * n_bufs +
sizeof(htp_tensor) * n_tensors +
sizeof(htp_op_desc) * n_ops +
sizeof(htp_prof_desc) * n_ops;
memcpy(b_ptr, (void *) buffers.data(), b_size);
memcpy(t_ptr, (void *) tensors.data(), t_size);
memcpy(o_ptr, (void *) ops.data(), o_size);
shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
op_cache.resize(depth);
start_usec.resize(depth, 0);
// init done queue
for (unsigned int i = 0; i < depth; i++) { done.push(i); }
if (opt_verbose) {
GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
}
}
~ggml_hexagon_opqueue() {
delete shm_buf;
}
// push new batch
bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
if (done.empty()) { return false; }
req.id = done.front(); done.pop(); // batch id
req.n_bufs = op_batch->n_bufs;
req.n_tensors = op_batch->n_tens;
req.n_ops = op_batch->n_ops;
op_cache[req.id] = op_batch->ops;
start_usec[req.id] = ggml_time_us();
const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs;
const size_t t_size = sizeof(htp_tensor) * req.n_tensors;
const size_t o_size = sizeof(htp_op_desc) * req.n_ops;
const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
dbuf.ptr = shm_buf->base + (req.id * shm_blk_size);
dbuf.fd = shm_buf->fd;
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
dbuf.size = b_size + t_size + o_size + p_size;
GGML_ASSERT(dbuf.size <= shm_blk_size);
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
uint8_t * b_ptr = m_ptr; m_ptr += b_size;
uint8_t * t_ptr = m_ptr; m_ptr += t_size;
uint8_t * o_ptr = m_ptr;
memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size);
HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
b_size, t_size, o_size, (size_t) dbuf.size);
op_batch->reset();
HEX_VERBOSE("ggml-hex: %s flush-opbatch : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu\n",
name, n_bufs, n_tens, n_ops, b_vmem, b_size, t_size, o_size);
if (opt_verbose > 1) {
htp_buf_desc *b = (htp_buf_desc*) b_ptr;
for (unsigned int i=0; i < req.n_bufs; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
for (unsigned int i=0; i < n_bufs; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", name, i,
b[i].fd, (void *) b[i].base, (size_t) b[i].size);
}
htp_tensor *t = (htp_tensor*) t_ptr;
for (unsigned int i=0; i < req.n_tensors; i++) {
for (unsigned int i=0; i < n_tens; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
name, i, t[i].bi, t[i].data, t[i].size,
(size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
}
}
return true;
}
reset();
void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
GGML_ASSERT(rsp.id < op_cache.size());
done.push(rsp.id);
const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs;
const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors;
const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops;
const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
const size_t m_size = b_size + t_size + o_size + p_size;
GGML_ASSERT(m_size <= shm_blk_size);
HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
(size_t) dbuf.size, b_size, t_size, o_size);
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
if (opt_profile && rsp.n_ops > 0) {
auto & ops = op_cache[rsp.id];
uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
uint32_t htp_usec = 0;
GGML_ASSERT(rsp.n_ops <= ops.size());
const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
for (uint32_t i = 0; i < rsp.n_ops; i++) {
htp_usec += pd[i].usecs;
ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
}
GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
}
return m_size;
}
};
@@ -1898,12 +1824,17 @@ void ggml_hexagon_session::flush_pending(bool all) {
GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
}
op_shm->release((uint8_t*) dbuf.ptr);
if (rsp.status != HTP_STATUS_OK) {
GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
// TODO: handle errors
}
op_queue->pop(rsp, dbuf);
// FIXME: profile will be per opreq
// this->prof_usecs = rsp.prof_usecs;
// this->prof_cycles = rsp.prof_cycles;
// this->prof_pkts = rsp.prof_pkts;
this->op_pending--; // atomic dec
@@ -1914,17 +1845,28 @@ void ggml_hexagon_session::flush_pending(bool all) {
void ggml_hexagon_session::flush_batch() {
if (op_batch->empty()) { return; }
htp_opbatch_req req {};
dspqueue_buffer dbuf{};
htp_opbatch_req req;
req.n_bufs = op_batch->n_bufs;
req.n_tensors = op_batch->n_tens;
req.n_ops = op_batch->n_ops;
if (!op_queue->push(req, dbuf, op_batch)) {
dspqueue_buffer dbuf;
dbuf.fd = op_shm->fd();
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
dbuf.ptr = op_shm->allocate();
if (!dbuf.ptr) {
flush_pending(false);
op_queue->push(req, dbuf, op_batch);
dbuf.ptr = op_shm->allocate();
}
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) op_shm->base();
dbuf.size = op_batch->flush((uint8_t*) dbuf.ptr, op_shm->block_size);
// Bump pending flag (cleared in the session::flush once we get the response)
this->op_pending++; // atomic inc
HEX_VERBOSE("ggml-hex: %s: queue-opbatch : %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
if (err != 0) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
@@ -2074,33 +2016,25 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
}
if (opt_etm) {
err = htp_iface_etm(this->handle, 1);
err = htp_iface_enable_etm(this->handle);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
}
}
if (opt_profile) {
htp_iface_pmu_conf pmu_conf{};
std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
}
}
// Allocate buffers and state for op batching
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
// Start processing op batch requests
// Start the DSP-side service. We need to pass the queue ID to the
// DSP in a FastRPC call; the DSP side will import the queue and start
// listening for packets in a callback.
err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
}
this->valid_iface = true;
// Allocate buffers and state for op batching
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
this->op_shm = new ggml_hexagon_opshm(this, opt_opbatch, opt_opqueue);
}
void ggml_hexagon_session::release() noexcept(true) {
@@ -2109,7 +2043,7 @@ void ggml_hexagon_session::release() noexcept(true) {
int err;
delete this->op_batch;
delete this->op_queue;
delete this->op_shm;
// Stop the DSP-side service and close the queue
if (this->valid_iface) {
@@ -2120,20 +2054,12 @@ void ggml_hexagon_session::release() noexcept(true) {
}
if (opt_etm) {
err = htp_iface_etm(this->handle, 0);
err = htp_iface_disable_etm(this->handle);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
}
}
if (opt_profile) {
htp_iface_pmu_conf pmu_conf{};
err = htp_iface_profiler(this->handle, 0, &pmu_conf);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
}
}
if (this->valid_queue) {
err = dspqueue_close(queue);
if (err != 0) {
@@ -2151,7 +2077,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
repack_buffer_type.device = dev;
op_batch = nullptr;
op_queue = nullptr;
op_shm = nullptr;
try {
allocate(dev_id);
@@ -2670,62 +2596,6 @@ static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * se
return true;
}
// Returns true when the DIAG op `op` has a shape/type combination this backend accepts:
// F32 input and output, a vector input (ne[1] == 1), and an output whose first two
// dimensions are square and equal to the input length.
static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
    GGML_UNUSED(sess);

    const struct ggml_tensor * vec = op->src[0];

    // diag only supports F32 currently
    const bool is_f32    = vec->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
    // input is a vector
    const bool is_vector = vec->ne[1] == 1;
    // output is square in its first two dimensions and matches the input length
    const bool is_square = op->ne[0] == op->ne[1] && op->ne[0] == vec->ne[0];

    return is_f32 && is_vector && is_square;
}
static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
const struct ggml_tensor * src0 = op->src[0]; // A
const struct ggml_tensor * src1 = op->src[1]; // B
const struct ggml_tensor * dst = op; // X
if (!src0 || !src1) {
return false;
}
if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
return false;
}
if (src0->ne[0] != src0->ne[1]) {
return false;
}
if (src0->ne[1] != src1->ne[1]) {
return false;
}
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
return false;
}
if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
return false;
}
GGML_UNUSED(sess);
return true;
}
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
return sess->c_name();
@@ -2762,9 +2632,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
case GGML_OP_ROPE: return HTP_OP_ROPE;
case GGML_OP_REPEAT: return HTP_OP_REPEAT;
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
case GGML_OP_FILL: return HTP_OP_FILL;
case GGML_OP_DIAG: return HTP_OP_DIAG;
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(t)) {
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
@@ -2805,7 +2673,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * n = graph->nodes[i];
if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
if (op_is_compute(n)) {
sess->enqueue_op(op_remap_to_htp(n), n);
}
}
@@ -3161,17 +3029,6 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se
return true;
}
// Returns true when the FILL op `op` writes an F32 or F16 tensor; any other
// destination type is rejected.
static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
    GGML_UNUSED(sess);

    switch (op->type) {
        case GGML_TYPE_F32:
        case GGML_TYPE_F16:
            return true;
        default:
            return false;
    }
}
static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
@@ -3302,18 +3159,6 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
supp = ggml_hexagon_supported_cumsum(sess, op);
break;
case GGML_OP_FILL:
supp = ggml_hexagon_supported_fill(sess, op);
break;
case GGML_OP_DIAG:
supp = ggml_hexagon_supported_diag(sess, op);
break;
case GGML_OP_SOLVE_TRI:
supp = ggml_hexagon_supported_solve_tri(sess, op);
break;
default:
break;
}
@@ -3449,26 +3294,6 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
return NULL;
}
// Parse a comma-separated list of unsigned integers (e.g. "2" or "0x3,0x111") into a vector.
//
// Each token is converted with std::stoul using base 0, so decimal, 0x-prefixed hex and
// 0-prefixed octal are accepted. The input may be arbitrary text, so malformed input must
// not throw out of this helper: a null pointer, or any token std::stoul rejects (empty,
// non-numeric, out of range), yields an empty vector that callers can treat as invalid.
template<typename T> std::vector<T> str_to_vec(const char* str) {
    std::vector<T> v;
    if (str == nullptr) {
        return v; // constructing a string stream from a null pointer is undefined behaviour
    }

    std::stringstream ss(str);
    std::string t;
    try {
        while (std::getline(ss, t, ',')) {
            v.push_back(std::stoul(t, nullptr, 0));
        }
    } catch (const std::logic_error &) {
        // std::invalid_argument / std::out_of_range thrown by std::stoul: discard partial result
        v.clear();
    }
    return v;
}
// Format the elements of `v` as a comma-separated list in base BASE (with std::showbase,
// so hex values carry a 0x prefix). An empty vector produces an empty string.
template<typename T, int BASE=10> std::string vec_to_str(std::vector<T> v) {
    std::stringstream ss;
    ss << std::setbase(BASE) << std::showbase;

    // Emit the separator before every element except the first instead of trimming a
    // trailing comma afterwards: pop_back() on an empty string is undefined behaviour.
    const char * sep = "";
    for (const auto & e : v) {
        ss << sep << e;
        sep = ",";
    }
    return ss.str();
}
static void ggml_hexagon_init(ggml_backend_reg * reg) {
// Basic sanity checks to make sure definitions match
static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
@@ -3482,7 +3307,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC");
const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
const char * str_opfilter= getenv("GGML_HEXAGON_OPFILTER");
@@ -3495,30 +3321,19 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
auto RE_ICASE = std::regex_constants::icase;
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage;
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
opt_etm = str_etm ? atoi(str_etm) : 0;
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
if (str_profile) {
opt_pmu_evt = [&]() -> std::vector<uint32_t> {
auto v = str_to_vec<uint32_t>(str_profile);
switch (v.size()) {
case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
case 8: opt_profile = 2; return v; // mode with custom pmu events
default: opt_profile = 0; return {}; // garbage input
}}();
if (opt_profile == 1) opt_pmu_evt = {};
GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
}
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask;
opt_opsync = str_opsync ? atoi(str_opsync) : opt_opsync;
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
opt_profile = str_profile ? atoi(str_profile) : 0;
opt_etm = str_etm ? atoi(str_etm) : 0;
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
-3
View File
@@ -34,9 +34,6 @@ add_library(${HTP_LIB} SHARED
argsort-ops.c
ssm-conv.c
cumsum-ops.c
fill-ops.c
diag-ops.c
solve-tri-ops.c
)
target_compile_definitions(${HTP_LIB} PRIVATE
-216
View File
@@ -1,216 +0,0 @@
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#include <HAP_farf.h>
#include <HAP_perf.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "hvx-types.h"
#include "hex-utils.h"
#include "hvx-copy.h"
#include "hex-dma.h"
// Declares the tensor locals shared by the diag workers.
// Requires a `struct htp_ops_context * octx` in scope at the expansion site.
//   src0, dst      - source / destination tensors taken from octx
//   ne02           - src0 extent along dim 2
//   ne0, ne1       - dst extents along dims 0 and 1
//   nb02, nb03     - src0 strides for dims 2 and 3 (the workers add them to byte pointers)
//   nb1, nb2, nb3  - dst strides for dims 1..3 (the workers add them to byte pointers)
#define htp_diag_tensors_preamble \
const struct htp_tensor * restrict src0 = octx->src[0]; \
const struct htp_tensor * restrict dst = octx->dst; \
\
const uint32_t ne02 = src0->ne[2]; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
\
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
// Per-op state shared (read-only) by all diag worker threads.
// Filled in by op_diag_f32() and passed to the workers as their `data` pointer.
struct htp_diag_context {
// Op context the workers pull tensors, scratchpads and DMA queues from
struct htp_ops_context * octx;
// Bytes in one source vector: src0->ne[0] * sizeof(float)
size_t src_batch_size;
// Bytes in one destination row: dst->ne[0] * sizeof(float)
size_t dst_row_size;
// src_batch_size rounded up to a multiple of VLEN (per-thread scratchpad stride)
size_t src_batch_size_aligned;
// dst_row_size rounded up to a multiple of VLEN (per-thread scratchpad stride)
size_t dst_row_size_aligned;
// ceil(total_batches / n_threads): number of batches each worker owns
uint32_t batches_per_thread;
// src0->ne[2] * src0->ne[3]
uint32_t total_batches;
};
// Worker-side preamble: recovers the diag context (`dctx`) and op context (`octx`)
// from the worker's `void * data` argument, then declares the tensor locals via
// htp_diag_tensors_preamble.
#define htp_diag_preamble \
struct htp_diag_context * dctx = (struct htp_diag_context *) data; \
struct htp_ops_context * octx = dctx->octx; \
htp_diag_tensors_preamble;
// Build one row of the diagonal matrix: zero `n` floats of `dst`, then copy the
// single element src[row_idx] into dst[row_idx].
// NOTE(review): uses the `_a` splat helper, which presumably expects an aligned
// dst - confirm against hvx-copy.h.
static inline void hvx_diag_row_f32(const float * restrict src, float * restrict dst,
                                    uint32_t row_idx, uint32_t n) {
    uint8_t * const row_bytes = (uint8_t *) dst;
    hvx_splat_f32_a(row_bytes, 0.0f, n);

    float * const diag_slot = dst + row_idx;
    *diag_slot = src[row_idx];
}
// ---------------------------------------------------------------------------
// Per thread worker (DMA variant): stage the source vector in this thread's
// VTCM scratchpad, build every output row in VTCM, and DMA each finished row
// back to DDR.
//   nth  - number of workers launched (only logged)
//   ith  - index of this worker; selects the DMA queue and scratchpad slices
//   data - struct htp_diag_context * shared by all workers
// ---------------------------------------------------------------------------
static void diag_thread_f32_dma(unsigned int nth, unsigned int ith, void * data) {
    htp_diag_preamble;

    dma_queue * dmaq = octx->ctx->dma[ith];

    const uint64_t t_begin = HAP_perf_get_qtimer_count();

    // Half-open range [ib0, ib1) of batches owned by this worker
    const uint32_t ib0 = dctx->batches_per_thread * ith;
    const uint32_t ib1 = MIN(ib0 + dctx->batches_per_thread, dctx->total_batches);

    // Workers past the end of the batch list have nothing to do
    if (ib0 >= ib1) {
        return;
    }

    const size_t src_bytes         = dctx->src_batch_size;
    const size_t row_bytes         = dctx->dst_row_size;
    const size_t src_bytes_aligned = dctx->src_batch_size_aligned;
    const size_t row_bytes_aligned = dctx->dst_row_size_aligned;

    const uint8_t * src_base = (const uint8_t *) src0->data;
    uint8_t *       dst_base = (uint8_t *) dst->data;

    // Each worker owns one source buffer and one row buffer in VTCM
    uint8_t * vtcm_src = octx->src0_spad.data + (ith * src_bytes_aligned);
    uint8_t * vtcm_row = octx->dst_spad.data + (ith * row_bytes_aligned);

    const float * vtcm_src_f32 = (const float *) vtcm_src;
    float *       vtcm_row_f32 = (float *) vtcm_row;

    for (uint32_t ib = ib0; ib < ib1; ib++) {
        // Split the flat batch index into (i3, i2)
        const uint32_t i3 = ib / ne02;
        const uint32_t i2 = ib - i3 * ne02;

        // Pull the source vector for this batch into VTCM and wait for it
        const uint8_t * ddr_src = src_base + i3 * nb03 + i2 * nb02;
        dma_queue_push_ddr_to_vtcm(dmaq, dma_make_ptr(vtcm_src, ddr_src), src_bytes_aligned, src_bytes, 1);
        dma_queue_flush(dmaq);

        uint8_t * ddr_batch = dst_base + i3 * nb3 + i2 * nb2;

        for (uint32_t i1 = 0; i1 < ne1; i1++) {
            // Row i1: all zeros except the diagonal element
            hvx_diag_row_f32(vtcm_src_f32, vtcm_row_f32, i1, ne0);

            // Flush before the row buffer is reused for the next row
            uint8_t * ddr_row = ddr_batch + i1 * nb1;
            dma_queue_push_vtcm_to_ddr(dmaq, dma_make_ptr(ddr_row, vtcm_row), row_bytes, row_bytes_aligned, 1);
            dma_queue_flush(dmaq);
        }
    }

    const uint64_t t_end = HAP_perf_get_qtimer_count();

    FARF(HIGH, "diag-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ib0, ib1,
         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
         (unsigned) HAP_perf_qtimer_count_to_us(t_end - t_begin));
}
// ---------------------------------------------------------------------------
// Per thread worker (direct variant): build every output row in place in the
// destination tensor, reading the source vector directly (no DMA, no VTCM).
//   nth  - number of workers launched (only logged)
//   ith  - index of this worker
//   data - struct htp_diag_context * shared by all workers
// NOTE(review): hvx_diag_row_f32 uses an `_a` splat helper on the dst row;
// presumably that requires aligned rows - confirm for arbitrary nb1.
// ---------------------------------------------------------------------------
static void diag_thread_f32(unsigned int nth, unsigned int ith, void * data) {
    htp_diag_preamble;

    const uint64_t t_begin = HAP_perf_get_qtimer_count();

    const uint8_t * src_base = (const uint8_t *) src0->data;
    uint8_t *       dst_base = (uint8_t *) dst->data;

    // Half-open range [ib0, ib1) of batches owned by this worker
    const uint32_t ib0 = dctx->batches_per_thread * ith;
    const uint32_t ib1 = MIN(ib0 + dctx->batches_per_thread, dctx->total_batches);

    uint32_t ib = ib0;
    while (ib < ib1) {
        // Split the flat batch index into (i3, i2)
        const uint32_t i3 = ib / ne02;
        const uint32_t i2 = ib - i3 * ne02;

        const float * restrict src_vec = (const float *) (src_base + i3 * nb03 + i2 * nb02);
        uint8_t * dst_batch            = dst_base + i3 * nb3 + i2 * nb2;

        for (uint32_t i1 = 0; i1 < ne1; i1++) {
            float * restrict out_row = (float *) (dst_batch + i1 * nb1);
            hvx_diag_row_f32(src_vec, out_row, i1, ne0);
        }

        ib++;
    }

    const uint64_t t_end = HAP_perf_get_qtimer_count();

    FARF(HIGH, "diag-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ib0, ib1,
         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
         (unsigned) HAP_perf_qtimer_count_to_us(t_end - t_begin));
}
// DIAG for F32 tensors: for every (i2, i3) batch, expands the source vector
// into a matrix whose row i1 is zero except for element i1.
//
// Work is split over batches (src0->ne[2] * src0->ne[3]). If the per-thread
// scratchpads (one source vector + one destination row each) fit in VTCM the
// DMA worker is used, otherwise the direct worker.
//
// Returns HTP_STATUS_OK; does nothing when HTP_OPFLAGS_SKIP_COMPUTE is set or
// when there are no batches to process.
int op_diag_f32(struct htp_ops_context * octx) {
    const struct htp_tensor * src0 = octx->src[0];
    const struct htp_tensor * dst  = octx->dst;

    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
        return HTP_STATUS_OK;
    }

    const uint32_t total_batches = src0->ne[2] * src0->ne[3];
    const uint32_t n_threads     = MIN(octx->n_threads, total_batches);

    // Empty tensor (or no worker threads): nothing to compute. This also keeps
    // the batches_per_thread computation below from dividing by zero.
    if (n_threads == 0) {
        return HTP_STATUS_OK;
    }

    const size_t src_batch_size = src0->ne[0] * sizeof(float);
    const size_t dst_row_size   = dst->ne[0] * sizeof(float);

    const size_t src_batch_size_aligned = hex_round_up(src_batch_size, VLEN);
    const size_t dst_row_size_aligned   = hex_round_up(dst_row_size, VLEN);

    // 1 src buffer + 1 dst row buffer per thread
    const size_t spad_per_thread = src_batch_size_aligned + dst_row_size_aligned;

    octx->src0_spad.size_per_thread = src_batch_size_aligned;
    octx->dst_spad.size_per_thread  = dst_row_size_aligned;

    octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
    octx->dst_spad.size  = n_threads * octx->dst_spad.size_per_thread;

    // Scratchpads are laid out back to back from the start of VTCM
    octx->src0_spad.data = octx->ctx->vtcm_base;
    octx->src0_spad.src  = NULL;
    octx->dst_spad.data  = octx->src0_spad.data + octx->src0_spad.size;
    octx->dst_spad.src   = NULL;

    struct htp_diag_context dctx = {
        .octx                   = octx,
        .src_batch_size         = src_batch_size,
        .dst_row_size           = dst_row_size,
        .src_batch_size_aligned = src_batch_size_aligned,
        .dst_row_size_aligned   = dst_row_size_aligned,
        .batches_per_thread     = (total_batches + n_threads - 1) / n_threads,
        .total_batches          = total_batches,
    };

    if (octx->ctx->vtcm_size < spad_per_thread * n_threads) {
        // Scratchpads do not fit in VTCM: operate directly on the tensors
        worker_pool_run_func(octx->ctx->worker_pool, diag_thread_f32, &dctx, n_threads);
    } else {
        worker_pool_run_func(octx->ctx->worker_pool, diag_thread_f32_dma, &dctx, n_threads);
    }

    return HTP_STATUS_OK;
}
// DIAG entry point: dispatches on the destination tensor type.
// Only HTP_TYPE_F32 is implemented; any other type yields HTP_STATUS_NO_SUPPORT.
int op_diag(struct htp_ops_context * octx) {
    if (octx->dst->type != HTP_TYPE_F32) {
        return HTP_STATUS_NO_SUPPORT;
    }

    return op_diag_f32(octx);
}
-123
View File
@@ -1,123 +0,0 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#include <HAP_farf.h>
#include <HAP_perf.h>
#include <string.h>
#include "hvx-copy.h"
#include "hvx-utils.h"
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
// ggml op_params layout for FILL:
// op_params[0] (as float) - the scalar fill value
// Declares the destination-tensor locals used by the fill code.
// Requires a `struct htp_ops_context * octx` in scope at the expansion site.
//   dst            - destination tensor taken from octx
//   ne0..ne3       - dst extents along dims 0..3
//   nb1, nb2, nb3  - dst strides for dims 1..3 (fill_thread adds them to a byte pointer)
//   nr             - flat row count: ne1 * ne2 * ne3
#define fill_preamble \
const struct htp_tensor * dst = octx->dst; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3]; \
\
const uint32_t nr = ne1 * ne2 * ne3;
// Per-op state shared (read-only) by all fill worker threads.
// Filled in by op_fill() and passed to fill_thread() as its `data` pointer.
struct htp_fill_context {
// Op context the workers read the destination tensor from
struct htp_ops_context * octx;
// ceil(total_rows / n_threads): number of rows each worker owns
uint32_t nrows_per_thread;
uint32_t total_rows; // ne1 * ne2 * ne3
// true: workers treat their row range as one flat run of elements;
// false: workers address each row through nb1/nb2/nb3
bool opt_path;
// Fill value splatted across an HVX vector (f32 or f16 lanes, per dst type)
HVX_Vector splat_vec;
// Size in bytes of one destination element
uint32_t elem_size;
};
// Worker: fills this thread's share of destination rows with the pre-splatted
// fill value.
//   nth  - number of workers launched (only logged)
//   ith  - index of this worker
//   data - const struct htp_fill_context * shared by all workers
static void fill_thread(unsigned int nth, unsigned int ith, void * data) {
    const struct htp_fill_context * fctx = (const struct htp_fill_context *) data;
    struct htp_ops_context *        octx = fctx->octx;
    fill_preamble;

    // Parallelise over the flat row index spanning ne1*ne2*ne3
    const uint32_t ir0 = fctx->nrows_per_thread * ith;
    const uint32_t ir1 = MIN(ir0 + fctx->nrows_per_thread, fctx->total_rows);

    // Rows-per-thread is rounded up, so trailing workers can start past the
    // last row (ir0 > ir1). Without this guard the element count in the
    // contiguous path wraps around as unsigned and the splat writes far past
    // the end of dst.
    if (ir0 >= ir1) {
        return;
    }

    uint64_t t1 = HAP_perf_get_qtimer_count();

    if (fctx->opt_path) {
        // Opt path: tensor is fully contiguous, treat as flat array
        const uint32_t n_elems = (ir1 - ir0) * ne0;
        // Byte offset computed in size_t so it cannot wrap at 32 bits
        uint8_t * dst_ptr = (uint8_t *) dst->data + (size_t) ir0 * ne0 * fctx->elem_size;
        hvx_splat_u(dst_ptr, fctx->splat_vec, n_elems, fctx->elem_size);
    } else {
        // Non-contiguous path: must respect strides
        for (uint32_t ir = ir0; ir < ir1; ++ir) {
            const uint32_t i1 = ir % ne1;
            const uint32_t i2 = (ir / ne1) % ne2;
            const uint32_t i3 = ir / (ne1 * ne2);
            uint8_t * dst_ptr = (uint8_t *) dst->data + i1*nb1 + i2*nb2 + i3*nb3;
            hvx_splat_u(dst_ptr, fctx->splat_vec, ne0, fctx->elem_size);
        }
    }

    uint64_t t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "fill %u/%u: rows %u:%u usec %u\n",
         ith, nth, ir0, ir1, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
int op_fill(struct htp_ops_context * octx) {
fill_preamble;
if (dst->type != HTP_TYPE_F32 && dst->type != HTP_TYPE_F16) {
return HTP_STATUS_NO_SUPPORT;
}
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
return HTP_STATUS_OK;
}
// nr = ne1*ne2*ne3 (flat row count across all outer dims); parallelise over it.
const uint32_t n_threads = MIN(nr, octx->n_threads);
// Optimize if fully contiguous: skip stride arithmetic, treat as flat array
const bool opt_path = (nb2 == nb1 * ne1) && (nb3 == nb2 * ne2);
FARF(HIGH, "fill: (%ux%ux%ux%u) type=%u opt=%d\n",
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->type, (int) opt_path);
float val_f32 = 0.f;
memcpy(&val_f32, &octx->op_params[0], sizeof(float));
struct htp_fill_context fctx = {
.octx = octx,
.nrows_per_thread = (nr + n_threads - 1) / n_threads,
.total_rows = nr,
.opt_path = opt_path,
};
switch (dst->type) {
case HTP_TYPE_F32:
fctx.splat_vec = hvx_vec_splat_f32(val_f32);
fctx.elem_size = sizeof(float);
break;
case HTP_TYPE_F16:
fctx.splat_vec = hvx_vec_splat_f16((_Float16) val_f32);
fctx.elem_size = sizeof(_Float16);
break;
default:
return HTP_STATUS_NO_SUPPORT;
}
worker_pool_run_func(octx->ctx->worker_pool, fill_thread, &fctx, n_threads);
return HTP_STATUS_OK;
}
-28
View File
@@ -4,7 +4,6 @@
#include <stdbool.h>
#include <stdint.h>
#include <qurt_memory.h>
#include <qurt.h>
#include "hexagon_types.h"
#include "hexagon_protos.h"
@@ -101,31 +100,4 @@ static inline void hex_pause() {
asm volatile(" pause(#255)\n");
}
#ifndef HEX_NUM_PMU_COUNTERS
#define HEX_NUM_PMU_COUNTERS 8
#endif
// Reads PMU counters 0..7 into counters[0..7].
// The caller must provide room for at least 8 entries.
static inline void hex_get_pmu(uint32_t counters[]) {
#if __HVX_ARCH__ >= 79
// Arch >= 79: read the upmucnt0..7 registers directly via inline asm
asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
#else
// Older archs: go through the QuRT PMU accessor, one call per counter
counters[0] = qurt_pmu_get(QURT_PMUCNT0);
counters[1] = qurt_pmu_get(QURT_PMUCNT1);
counters[2] = qurt_pmu_get(QURT_PMUCNT2);
counters[3] = qurt_pmu_get(QURT_PMUCNT3);
counters[4] = qurt_pmu_get(QURT_PMUCNT4);
counters[5] = qurt_pmu_get(QURT_PMUCNT5);
counters[6] = qurt_pmu_get(QURT_PMUCNT6);
counters[7] = qurt_pmu_get(QURT_PMUCNT7);
// qurt_pmu_get_pmucnt(counters);
#endif
}
#endif /* HEX_UTILS_H */
+1 -7
View File
@@ -10,7 +10,6 @@
#include <dspqueue.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdbool.h>
#define HTP_MAX_NTHREADS 10
#define HTP_MAX_MMAPS 16
@@ -67,9 +66,7 @@ struct htp_context {
int thread_id;
int thread_prio;
bool hmx_enabled;
bool etm;
uint32_t profiler;
int hmx_enabled;
uint8_t * vtcm_base;
size_t vtcm_size;
@@ -101,8 +98,5 @@ int op_repeat(struct htp_ops_context * octx);
int op_argsort(struct htp_ops_context * octx);
int op_ssm_conv(struct htp_ops_context * octx);
int op_cumsum(struct htp_ops_context * octx);
int op_fill(struct htp_ops_context * octx);
int op_diag(struct htp_ops_context * octx);
int op_solve_tri(struct htp_ops_context * octx);
#endif /* HTP_CTX_H */
+10 -30
View File
@@ -42,9 +42,9 @@ enum htp_data_type {
// Mask to enable various stages of the Ops.
// Used for debugging and profiling.
enum htp_op_stage {
HTP_OPSTAGE_QUEUE = (1 << 0), // Enable Queueing (ie calls into NPU)
HTP_OPSTAGE_COMPUTE = (1 << 1), // Enable Compute
enum htp_op_mask {
HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP)
HTP_OPMASK_COMPUTE = (1 << 1), // Enable Compute
};
// Do not reorder first 4 (used as an index)
@@ -80,9 +80,7 @@ enum htp_op_code {
HTP_OP_SSM_CONV,
HTP_OP_REPEAT,
HTP_OP_CUMSUM,
HTP_OP_FILL,
HTP_OP_DIAG,
HTP_OP_SOLVE_TRI,
HTP_OP_INVALID
};
@@ -137,45 +135,27 @@ struct htp_op_desc {
int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices
uint16_t dst; // Output tensor index
};
enum htp_profiler_mode {
HTP_PROF_DISABLED = 0,
HTP_PROF_BASIC = 1,
HTP_PROF_PMU = 2,
};
#define HTP_PROF_PMU_NCNT 8
// Profile descriptor
struct htp_prof_desc {
uint32_t opcode; // GGML/HTP Op
uint32_t usecs; // Number of usec
uint32_t cycles; // Number of cycles
uint32_t pad; // Unused
uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters
// the rest is filled in-place by the NPU
uint32_t prof_usecs; // Number of usec per request
uint32_t prof_cycles; // Number of cycles per request
uint32_t prof_pkts; // Number of instruction packets per request
uint32_t unused;
};
struct htp_opbatch_req {
uint32_t id; // Batch id
uint32_t n_bufs; // Number of buffers
uint32_t n_tensors; // Number of tensors
uint32_t n_ops; // Number of ops
uint32_t flags; // unused
uint32_t pad; // unused
// struct htp_buf_desc bufs[]; -- dspqueue buf 0
// struct htp_tensor tensors[]; -- dspqueue buf 0
// struct htp_op_desc ops[]; -- dspqueue buf 0
};
struct htp_opbatch_rsp {
uint32_t id; // Batch id
uint32_t status; // HTP_STATUS_...
uint32_t n_bufs; // Number of buffers
uint32_t n_tensors; // Number of tensors
uint32_t n_ops; // Number of op profile descriptors
uint32_t pad; // unused
// struct htp_prof_desc profs[]; -- dspqueue buf 0
// struct htp_op_req ops[]; -- dspqueue buf 0
};
#endif /* HTP_OPS_H */
+2 -6
View File
@@ -6,17 +6,13 @@
#include "AEEStdDef.idl"
#include "remote.idl"
struct htp_iface_pmu_conf {
uint32 events[8];
};
interface htp_iface : remote_handle64 {
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx);
AEEResult stop();
AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned);
AEEResult munmap(in uint32 fd);
AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
AEEResult etm(in uint32 enable);
AEEResult enable_etm();
AEEResult disable_etm();
};
#endif /* HTP_IDL */
-24
View File
@@ -256,18 +256,6 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
return Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b));
}
// Lane-wise a + b on f32 vectors; the add is done in qf32 and converted back to sf.
static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {
    const HVX_Vector sum_qf32 = Q6_Vqf32_vadd_VsfVsf(a, b);
    return Q6_Vsf_equals_Vqf32(sum_qf32);
}
// Lane-wise a - b on f32 vectors; the subtract is done in qf32 and converted back to sf.
static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {
    const HVX_Vector diff_qf32 = Q6_Vqf32_vsub_VsfVsf(a, b);
    return Q6_Vsf_equals_Vqf32(diff_qf32);
}
// Lane-wise a * b on f32 vectors; the multiply is done in qf32 and converted back to sf.
static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
    const HVX_Vector prod_qf32 = Q6_Vqf32_vmpy_VsfVsf(a, b);
    return Q6_Vsf_equals_Vqf32(prod_qf32);
}
#else
static inline HVX_Vector hvx_vec_add_f16_f16(HVX_Vector a, HVX_Vector b)
@@ -285,18 +273,6 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
return Q6_Vhf_vmpy_VhfVhf(a, b);
}
// Lane-wise a + b on f32 vectors using the direct sf add intrinsic.
static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {
    const HVX_Vector sum_sf = Q6_Vsf_vadd_VsfVsf(a, b);
    return sum_sf;
}
// Lane-wise a - b on f32 vectors using the direct sf subtract intrinsic.
static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {
    const HVX_Vector diff_sf = Q6_Vsf_vsub_VsfVsf(a, b);
    return diff_sf;
}
// Lane-wise a * b on f32 vectors using the direct sf multiply intrinsic.
static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
    const HVX_Vector prod_sf = Q6_Vsf_vmpy_VsfVsf(a, b);
    return prod_sf;
}
#endif // __HVX_ARCH__ < 79
#endif /* HVX_BASE_H */
+53 -130
View File
@@ -27,7 +27,6 @@
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-ops.h"
#include "htp_iface.h"
#include "worker-pool.h"
AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
@@ -104,54 +103,6 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
return AEE_SUCCESS;
}
// Turns user ETM tracing on (enable != 0) or off (enable == 0).
// Returns the HAP_user_etm_enable/disable result unchanged; failures are logged.
// `handle` is not used.
AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) {
    int err;
    if (enable) {
        err = HAP_user_etm_enable();
    } else {
        err = HAP_user_etm_disable();
    }

    if (err == 0) {
        return err;
    }

    if (err != AEE_EVERSIONNOTSUPPORT) {
        FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", err);
    } else {
        FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n");
    }

    return err;
}
// Selects the profiler mode for this session and, for HTP_PROF_PMU, programs
// the PMU event-selection registers from pmu_conf->events[0..7].
//   handle   - session handle; must reference a struct htp_context
//   mode     - profiler mode, stored in ctx->profiler
//   pmu_conf - event ids for the 8 PMU counters; only read when mode == HTP_PROF_PMU
// Returns AEE_SUCCESS, or AEE_EBADPARM when the handle is NULL or when PMU mode
// is requested without a PMU configuration.
AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) {
    struct htp_context * ctx = (struct htp_context *) handle;
    if (!ctx) {
        return AEE_EBADPARM;
    }

    if (mode == HTP_PROF_PMU) {
        // Events are dereferenced below; reject a missing config instead of faulting
        if (!pmu_conf) {
            return AEE_EBADPARM;
        }

        const uint32_t* events = pmu_conf->events;

        // Pack 4 event IDs (low 8 bits) into each 32-bit config register:
        // counters 0..3 go to evtcfg, counters 4..7 to evtcfg1
        uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0;
        for (; i < HEX_NUM_PMU_COUNTERS/2; i++) {
            evtcfg  |= ((events[i + 0] & 0xFF) << (i * 8));
            evtcfg1 |= ((events[i + HEX_NUM_PMU_COUNTERS/2] & 0xFF) << (i * 8));
        }

        // For events >255 pack high 2 bits of all 8 event IDs into cfg register
        // 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc.
        for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
            cfg |= (((events[i] >> 8) & 3) << (i * 2));
        }

        FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg);

        // Configure PMU registers
        qurt_pmu_set(QURT_PMUCFG, cfg);
        qurt_pmu_set(QURT_PMUEVTCFG, evtcfg);
        qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1);
        qurt_pmu_enable(1);
    }

    ctx->profiler = mode;
    return AEE_SUCCESS;
}
AEEResult htp_iface_close(remote_handle64 handle) {
struct htp_context * ctx = (struct htp_context *) handle;
@@ -178,19 +129,35 @@ AEEResult htp_iface_close(remote_handle64 handle) {
}
}
if (ctx->profiler) {
qurt_pmu_enable(1);
}
if (ctx->etm) {
HAP_user_etm_disable();
}
free(ctx);
return AEE_SUCCESS;
}
AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32 pinned) {
// Enables user ETM tracing. Returns the HAP_user_etm_enable() result unchanged;
// failures are logged. `handle` is not used.
AEEResult htp_iface_enable_etm(remote_handle64 handle) {
    const int err = HAP_user_etm_enable();
    if (err == 0) {
        return err;
    }

    if (err != AEE_EVERSIONNOTSUPPORT) {
        FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
    } else {
        FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
    }

    return err;
}
// Disables user ETM tracing. Returns the HAP_user_etm_disable() result unchanged;
// failures are logged. `handle` is not used.
AEEResult htp_iface_disable_etm(remote_handle64 handle) {
    const int err = HAP_user_etm_disable();
    if (err == 0) {
        return err;
    }

    if (err != AEE_EVERSIONNOTSUPPORT) {
        FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
    } else {
        FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
    }

    return err;
}
AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t pinned) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
@@ -237,7 +204,7 @@ AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32
return AEE_ENOMEMORY;
}
AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
AEEResult htp_iface_munmap(remote_handle64 handle, int fd) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
@@ -467,39 +434,19 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
struct profile_data {
uint64_t usecs;
uint64_t cycles;
uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
uint64_t pkts;
};
static inline void profile_start(uint32_t mode, struct profile_data * d) {
switch (mode) {
case HTP_PROF_PMU:
hex_get_pmu(d->pmu_counters);
// fallthrough
case HTP_PROF_BASIC:
d->usecs = HAP_perf_get_qtimer_count();
d->cycles = hex_get_cycles();
break;
default:
break;
}
static inline void profile_start(struct profile_data * d) {
d->usecs = HAP_perf_get_qtimer_count();
d->cycles = hex_get_cycles();
d->pkts = hex_get_pktcnt();
}
static inline void profile_stop(uint32_t mode, struct profile_data * d) {
uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
switch (mode) {
case HTP_PROF_PMU:
hex_get_pmu(pmu_counters);
for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i];
}
// fallthrough
case HTP_PROF_BASIC:
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
d->cycles = hex_get_cycles() - d->cycles;
break;
default:
break;
}
static inline void profile_stop(struct profile_data * d) {
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
d->cycles = hex_get_cycles() - d->cycles;
d->pkts = hex_get_pktcnt() - d->pkts;
}
static int execute_op(struct htp_ops_context * octx) {
@@ -567,15 +514,6 @@ static int execute_op(struct htp_ops_context * octx) {
case HTP_OP_CUMSUM:
return op_cumsum(octx);
case HTP_OP_FILL:
return op_fill(octx);
case HTP_OP_DIAG:
return op_diag(octx);
case HTP_OP_SOLVE_TRI:
return op_solve_tri(octx);
case HTP_OP_INVALID:
break;
@@ -782,32 +720,29 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
continue;
}
// Reset poll count for valid requests
poll_count = DSPQUEUE_POLL_COUNT;
const uint32_t n_bufs = req.n_bufs;
const uint32_t n_tens = req.n_tensors;
const uint32_t n_ops = req.n_ops;
const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;
const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
if (dbuf.size < b_size + t_size + o_size + p_size) {
if (dbuf.size < b_size + t_size + o_size) {
FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
break;
}
FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
// Reset poll count for valid requests
poll_count = DSPQUEUE_POLL_COUNT;
// Setup descriptor pointers
uint8_t * m_ptr = dbuf.ptr;
struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; m_ptr += o_size;
struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr;
struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr;
FARF(HIGH, "processing opbatch: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u",
n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
prep_op_bufs(ctx, bufs, n_bufs);
prep_tensors(ctx, bufs, tens, n_tens);
@@ -819,34 +754,22 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
for (uint32_t i=0; i < n_ops; i++) {
struct profile_data prof;
profile_start(ctx->profiler, &prof);
profile_start(&prof);
proc_op_req(octx, tens, i, &ops[i]);
profile_stop(ctx->profiler, &prof);
if (ctx->profiler) {
pds[i].opcode = ops[i].opcode;
pds[i].usecs = prof.usecs;
pds[i].cycles = prof.cycles;
for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
pds[i].pmu[j] = prof.pmu_counters[j];
}
}
profile_stop(&prof);
ops[i].prof_usecs = prof.usecs;
ops[i].prof_cycles = prof.cycles;
ops[i].prof_pkts = prof.pkts;
}
// dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
struct htp_opbatch_rsp rsp;
rsp.id = req.id;
rsp.status = HTP_STATUS_OK;
rsp.n_bufs = n_bufs;
rsp.n_tensors = n_tens;
rsp.n_ops = n_ops;
rsp.status = HTP_STATUS_OK; // FIXME
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE);
if (err != 0) {
FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
-4
View File
@@ -3017,10 +3017,6 @@ int op_matmul(struct htp_ops_context * octx) {
const int act_stride = (int)(src1->nb[1] / sizeof(float));
const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16));
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
return HTP_STATUS_OK;
}
if (src0->type == HTP_TYPE_F16) {
if (is_batched) {
hmx_matmul_w16a32_batched_params_t batch_params = {
-267
View File
@@ -1,267 +0,0 @@
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#include <HAP_farf.h>
#include <HAP_perf.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "hvx-types.h"
#include "hvx-utils.h"
// Per-op state shared (read-only) by the solve-tri worker threads.
// Filled in by op_solve_tri() and passed to the workers as their `data` pointer.
struct htp_solve_tri_context {
// Op context the workers read the A, B and X tensors from
struct htp_ops_context * octx;
// ceil(total_jobs / n_threads): number of jobs each worker owns
uint32_t jobs_per_thread;
// Batch mode: number of batches; chunk mode: batches * k_chunks
uint32_t total_jobs;
// Number of column chunks per batch: ceil(k / col_block)
uint32_t k_chunks;
// Columns per chunk (op_solve_tri sets this to VLEN_FP32)
uint32_t col_block;
};
// Scalar forward-substitution step for one row and a range of columns.
// For each column c in [col0, col0 + coln):
//   X[row][c] = (B_row[c] - sum_{t < row} A_row[t] * X[t][c]) * inv_diag
// X is addressed as a row-major matrix with k columns.
//   A_row    - row `row` of A
//   B_row    - row `row` of B
//   X        - solution matrix; rows < row are read, row `row` is written
//   inv_diag - reciprocal of A's diagonal element for this row
static inline void solve_tri_row_scalar(const float * A_row,
                                        const float * B_row,
                                        float *       X,
                                        uint32_t      row,
                                        uint32_t      k,
                                        uint32_t      col0,
                                        uint32_t      coln,
                                        float         inv_diag) {
    float * const out_row = X + row * k;

    uint32_t c = col0;
    while (c < col0 + coln) {
        // Dot product of A_row[0..row) with column c of the rows already solved
        float acc = 0.0f;
        uint32_t r = 0;
        while (r < row) {
            acc += A_row[r] * X[r * k + c];
            ++r;
        }

        out_row[c] = (B_row[c] - acc) * inv_diag;
        ++c;
    }
}
// Loads a vector from `src` (unaligned) and zeroes every lane past the first
// n floats.
// NOTE(review): a full vector is loaded before masking, so memory past the n
// requested floats is read - confirm callers always have that slack.
static inline HVX_Vector hvx_load_partial_f32(const float * src, uint32_t n) {
    const HVX_UVector * src_vec = (const HVX_UVector *) src;
    HVX_Vector          loaded  = *src_vec;

    // Predicate covering the first n * sizeof(float) bytes
    HVX_VectorPred keep = Q6_Q_vsetq2_R(n * sizeof(float));

    return Q6_V_vmux_QVV(keep, loaded, Q6_V_vzero());
}
// HVX forward-substitution step for one row and up to one vector of columns.
// For each column c in [col0, col0 + coln):
//   X[row][c] = (B_row[c] - sum_{t < row} A_row[t] * X[t][c]) * inv_diag
// X is addressed as a row-major matrix with k columns. When coln equals
// VLEN_FP32 whole vectors are loaded; otherwise loads are masked to coln lanes.
static inline void solve_tri_row_hvx(const float * A_row,
                                     const float * B_row,
                                     float *       X,
                                     uint32_t      row,
                                     uint32_t      k,
                                     uint32_t      col0,
                                     uint32_t      coln,
                                     float         inv_diag) {
    const bool whole_vec = (coln == VLEN_FP32);

    // acc lane c accumulates sum_{t < row} A_row[t] * X[t][col0 + c]
    HVX_Vector acc = Q6_V_vzero();
    for (uint32_t t = 0; t < row; ++t) {
        const float * x_src = X + t * k + col0;

        HVX_Vector x_vec;
        if (whole_vec) {
            x_vec = *((const HVX_UVector *) x_src);
        } else {
            x_vec = hvx_load_partial_f32(x_src, coln);
        }

        const HVX_Vector coef = hvx_vec_splat_f32(A_row[t]);
        const HVX_Vector prod = hvx_vec_mul_f32_f32(x_vec, coef);
        acc                   = hvx_vec_add_f32_f32(acc, prod);
    }

    const float * b_src = B_row + col0;
    float *       x_dst = X + row * k + col0;

    HVX_Vector b_vec;
    if (whole_vec) {
        b_vec = *((const HVX_UVector *) b_src);
    } else {
        b_vec = hvx_load_partial_f32(b_src, coln);
    }

    const HVX_Vector scale    = hvx_vec_splat_f32(inv_diag);
    const HVX_Vector residual = hvx_vec_sub_f32_f32(b_vec, acc);
    const HVX_Vector result   = hvx_vec_mul_f32_f32(residual, scale);

    // Store only the coln valid lanes
    hvx_vec_store_u((void *) x_dst, coln * sizeof(float), result);
}
// Batch-level worker: each job is one whole (i02, i03) batch, solved row by row.
// Full vector-wide column blocks go through the HVX row kernel; a trailing
// partial block uses HVX when it has at least 8 columns and the scalar kernel
// otherwise.
//   nth  - number of workers launched (only logged)
//   ith  - index of this worker
//   data - struct htp_solve_tri_context * shared by all workers
static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void * data) {
    struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
    struct htp_ops_context *       octx = sctx->octx;

    const struct htp_tensor * src0 = octx->src[0];  // A
    const struct htp_tensor * src1 = octx->src[1];  // B
    const struct htp_tensor * dst  = octx->dst;     // X

    const uint32_t n    = src0->ne[0];
    const uint32_t k    = src1->ne[0];
    const uint32_t ne02 = src0->ne[2];

    // Columns covered by whole vector blocks, and what is left over
    const uint32_t k_full = k - (k % VLEN_FP32);
    const uint32_t k_tail = k - k_full;

    // Half-open range of batches owned by this worker
    const uint32_t start_batch = sctx->jobs_per_thread * ith;
    const uint32_t end_batch   = MIN(start_batch + sctx->jobs_per_thread, sctx->total_jobs);

    const uint64_t t_begin = HAP_perf_get_qtimer_count();

    for (uint32_t batch = start_batch; batch < end_batch; ++batch) {
        // Split the flat batch index into (i03, i02)
        const uint32_t i03 = batch / ne02;
        const uint32_t i02 = batch % ne02;

        const uint8_t * a_bytes = (const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3];
        const uint8_t * b_bytes = (const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3];
        uint8_t *       x_bytes = (uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3];

        const float * A_batch = (const float *) a_bytes;
        const float * B_batch = (const float *) b_bytes;
        float *       X_batch = (float *) x_bytes;

        for (uint32_t row = 0; row < n; ++row) {
            const float   inv_diag = 1.0f / A_batch[row * n + row];
            const float * A_row    = A_batch + row * n;
            const float * B_row    = B_batch + row * k;

            // Whole vector blocks
            for (uint32_t col0 = 0; col0 < k_full; col0 += VLEN_FP32) {
                solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, VLEN_FP32, inv_diag);
            }

            // Trailing partial block, if any
            if (k_tail == 0) {
                continue;
            }
            if (k_tail >= 8) {
                solve_tri_row_hvx(A_row, B_row, X_batch, row, k, k_full, k_tail, inv_diag);
            } else {
                solve_tri_row_scalar(A_row, B_row, X_batch, row, k, k_full, k_tail, inv_diag);
            }
        }
    }

    const uint64_t t_end = HAP_perf_get_qtimer_count();

    FARF(HIGH, "solve-tri-batch %d/%d: A=(%ux%u) B=(%ux%u) batch %u:%u usec %u\n",
         ith, nth, n, n, k, n, start_batch, end_batch,
         (unsigned) HAP_perf_qtimer_count_to_us(t_end - t_begin));
}
// Chunk-level worker: each job is one (batch, column chunk) pair. All rows of
// the batch are solved for that chunk's columns only; HVX is used when the
// chunk has at least 8 columns, the scalar kernel otherwise.
//   nth  - number of workers launched (only logged)
//   ith  - index of this worker
//   data - struct htp_solve_tri_context * shared by all workers
static void solve_tri_chunk_thread_f32(unsigned int nth, unsigned int ith, void * data) {
    struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
    struct htp_ops_context *       octx = sctx->octx;

    const struct htp_tensor * src0 = octx->src[0];  // A
    const struct htp_tensor * src1 = octx->src[1];  // B
    const struct htp_tensor * dst  = octx->dst;     // X

    const uint32_t n    = src0->ne[0];
    const uint32_t k    = src1->ne[0];
    const uint32_t ne02 = src0->ne[2];

    // Half-open range of jobs owned by this worker
    const uint32_t start_job = sctx->jobs_per_thread * ith;
    const uint32_t end_job   = MIN(start_job + sctx->jobs_per_thread, sctx->total_jobs);

    const uint64_t t_begin = HAP_perf_get_qtimer_count();

    for (uint32_t job = start_job; job < end_job; ++job) {
        // job -> (batch, chunk) -> (i03, i02, first column)
        const uint32_t batch = job / sctx->k_chunks;
        const uint32_t chunk = job % sctx->k_chunks;
        const uint32_t i03   = batch / ne02;
        const uint32_t i02   = batch % ne02;

        const uint32_t col0 = chunk * sctx->col_block;
        const uint32_t coln = MIN(sctx->col_block, k - col0);

        const uint8_t * a_bytes = (const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3];
        const uint8_t * b_bytes = (const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3];
        uint8_t *       x_bytes = (uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3];

        const float * A_batch = (const float *) a_bytes;
        const float * B_batch = (const float *) b_bytes;
        float *       X_batch = (float *) x_bytes;

        if (coln >= 8) {
            for (uint32_t row = 0; row < n; ++row) {
                const float inv_diag = 1.0f / A_batch[row * n + row];
                solve_tri_row_hvx(A_batch + row * n, B_batch + row * k, X_batch, row, k, col0, coln, inv_diag);
            }
        } else {
            for (uint32_t row = 0; row < n; ++row) {
                const float inv_diag = 1.0f / A_batch[row * n + row];
                solve_tri_row_scalar(A_batch + row * n, B_batch + row * k, X_batch, row, k, col0, coln, inv_diag);
            }
        }
    }

    const uint64_t t_end = HAP_perf_get_qtimer_count();

    FARF(HIGH, "solve-tri-chunk %d/%d: A=(%ux%u) B=(%ux%u) job %u:%u usec %u\n",
         ith, nth, n, n, k, n, start_job, end_job,
         (unsigned) HAP_perf_qtimer_count_to_us(t_end - t_begin));
}
// SOLVE_TRI entry point: solves A * X = B by forward substitution
// (left=true, lower=true, uni=false only), one system per (ne[2], ne[3]) batch.
//
// All three tensors must be F32 (else HTP_STATUS_NO_SUPPORT). A must be square,
// A and B must agree on rows and batch dims, and X must have B's shape
// (else HTP_STATUS_INVAL_PARAMS).
//
// Parallelism: with at least as many batches as threads, each job is one
// batch; otherwise each job is one (batch, column chunk) pair.
int op_solve_tri(struct htp_ops_context * octx) {
    const struct htp_tensor * src0 = octx->src[0];  // A
    const struct htp_tensor * src1 = octx->src[1];  // B
    const struct htp_tensor * dst  = octx->dst;     // X

    const bool all_f32 = src0->type == HTP_TYPE_F32 && src1->type == HTP_TYPE_F32 && dst->type == HTP_TYPE_F32;
    if (!all_f32) {
        return HTP_STATUS_NO_SUPPORT;
    }

    const bool a_square      = src0->ne[0] == src0->ne[1];
    const bool rows_match    = src0->ne[1] == src1->ne[1];
    const bool batches_match = src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3];
    const bool x_like_b      = dst->ne[0] == src1->ne[0] && dst->ne[1] == src1->ne[1] &&
                               dst->ne[2] == src1->ne[2] && dst->ne[3] == src1->ne[3];
    if (!(a_square && rows_match && batches_match && x_like_b)) {
        return HTP_STATUS_INVAL_PARAMS;
    }

    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
        return HTP_STATUS_OK;
    }

    const uint32_t k             = src1->ne[0];
    const uint32_t col_block     = VLEN_FP32;
    const uint32_t k_chunks      = (k + col_block - 1) / col_block;
    const uint32_t total_batches = src0->ne[2] * src0->ne[3];
    const bool     batched       = total_batches >= (uint32_t) octx->n_threads;

    FARF(HIGH, "solve-tri: (%ux%ux%ux%u) x (%ux%ux%ux%u) -> (%ux%ux%ux%u) : batched %d\n",
         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
         src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], batched);

    // Pick job granularity, thread count and worker for the chosen mode
    uint32_t total_jobs;
    uint32_t n_threads;
    void (*worker)(unsigned int, unsigned int, void *);

    if (batched) {
        // Batch-level parallelism
        total_jobs = total_batches;
        n_threads  = MIN((uint32_t) octx->n_threads, total_batches);
        worker     = solve_tri_batch_thread_f32;
    } else {
        // Chunk-level parallelism
        total_jobs = total_batches * k_chunks;
        n_threads  = MIN((uint32_t) octx->n_threads, MAX(total_jobs, 1));
        worker     = solve_tri_chunk_thread_f32;
    }

    struct htp_solve_tri_context sctx = {
        .octx            = octx,
        .jobs_per_thread = (total_jobs + n_threads - 1) / n_threads,
        .total_jobs      = total_jobs,
        .k_chunks        = k_chunks,
        .col_block       = col_block,
    };

    worker_pool_run_func(octx->ctx->worker_pool, worker, &sctx, n_threads);

    return HTP_STATUS_OK;
}
+1 -3
View File
@@ -8,7 +8,7 @@ CatalogFile = libggml-htp.cat
PnpLockDown = 1
[DestinationDirs]
Drivers_Dir = 13
Drivers_Dir = 6
[SourceDisksNames]
1 = %DiskId%
@@ -18,7 +18,6 @@ libggml-htp-v68.so = 1
libggml-htp-v69.so = 1
libggml-htp-v73.so = 1
libggml-htp-v75.so = 1
libggml-htp-v79.so = 1
libggml-htp-v81.so = 1
[ControlFlags]
@@ -32,7 +31,6 @@ libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v79.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v81.so,,,0x10 ;COPYFLG_NO_OVERWRITE
[Strings]
+13 -12
View File
@@ -814,7 +814,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
}
// print MTL GPU family:
GGML_LOG_INFO("%s: GPU name: %s (%s)\n", __func__, dev->props.name, dev->props.desc);
GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
// determine max supported GPU family
// https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
@@ -931,13 +931,13 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
}
struct ggml_metal_event {
void * obj; // id<MTLSharedEvent>
void * obj; // id<MTLEvent>
atomic_int value;
};
void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
@@ -945,7 +945,7 @@ void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t
}
void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
@@ -953,7 +953,7 @@ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cm
}
ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
id<MTLSharedEvent> event = [dev->mtl_device newSharedEvent];
id<MTLEvent> event = [dev->mtl_device newEvent];
ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
@@ -964,7 +964,7 @@ ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
}
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
id<MTLSharedEvent> event = ev->obj;
id<MTLEvent> event = ev->obj;
[event release];
free(ev);
@@ -973,13 +973,14 @@ void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev
}
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
id<MTLSharedEvent> event = ev->obj;
const bool res = [event waitUntilSignaledValue:atomic_load_explicit(&ev->value, memory_order_relaxed) timeoutMS:60000];
if (!res) {
GGML_ABORT("%s: failed to wait for event\n", __func__);
}
@autoreleasepool {
id<MTLEvent> event = ev->obj;
GGML_UNUSED(dev);
id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
[cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
[cmd_buf commit];
[cmd_buf waitUntilCompleted];
}
}
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
-4
View File
@@ -918,10 +918,6 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
static std::vector<ggml_backend_device_ptr> devs;
if (!initialized) {
// workaround macOS limitation (kIOGPUCommandBufferCallbackErrorImpactingInteractivity) until proper fix becomes possible
// ref: https://github.com/ggml-org/llama.cpp/issues/20141#issuecomment-4272947703
setenv("AGX_RELAX_CDM_CTXSTORE_TIMEOUT", "1", true);
static ggml_backend_metal_reg_ptr reg_ctx(ggml_backend_metal_reg_init());
for (int i = 0; i < g_devices; ++i) {
+5 -15
View File
@@ -19,6 +19,7 @@
#include <iomanip>
#include <map>
#include <memory>
#include <mutex>
#include <openvino/core/dimension.hpp>
#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp>
@@ -206,22 +207,8 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
break;
}
case GGML_OP_ROPE: {
const int mode = node->op_params[2];
switch (mode) {
case GGML_ROPE_TYPE_NEOX: {
op_case = 0x00010000;
break;
}
case GGML_ROPE_TYPE_IMROPE: {
op_case = 0x00020000;
break;
}
default:
op_case = 0x00000000;
break;
}
if (node->src[0]->op == GGML_OP_VIEW) {
op_case = (op_case | 0x00000002);
op_case = 2;
}
break;
}
@@ -586,6 +573,9 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
}
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
static std::mutex weights_mutex;
std::lock_guard<std::mutex> lock(weights_mutex);
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
auto * nodes = cgraph->nodes;
auto n_nodes = cgraph->n_nodes;
+11 -18
View File
@@ -6,7 +6,6 @@
#include <cstring>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
#include <openvino/runtime/properties.hpp>
#include <optional>
ov::Core & ov_singleton_core() {
@@ -43,13 +42,11 @@ void ggml_openvino_device_config::init() {
{"NPUW_DQ", "YES" },
{"NPUW_DQ_FULL", "NO" },
};
if (cache_dir && strlen(cache_dir) > 0) {
if (cache_dir) {
compile_config["NPUW_CACHE_DIR"] = cache_dir;
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
}
} else if (cache_dir && strlen(cache_dir) > 0) {
compile_config.insert(ov::cache_dir(cache_dir));
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
} else if (cache_dir) {
ov_singleton_core().set_property(ov::cache_dir(cache_dir));
}
// Initialize remote context with queue sharing for GPU
@@ -262,12 +259,10 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
// For symmetric quantization, no zp needed (weights stored as signed)
if (layout.is_symmetric) {
layout.zp_size = 0;
} else {
layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
}
// For symmetric quantization, we only need one zp value (not one per block)
// Zero points are stored in U4 or U8 format matching the weight type
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
@@ -318,12 +313,10 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
// Scales: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
// For symmetric quantization, no zp needed (weights stored as signed)
if (layout.is_symmetric) {
layout.zp_size = 0;
} else {
layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
}
// Zero points: U4 or U8 matching weight type
// For symmetric quantization, we only need one zp value (not one per block)
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
// Layout in buffer: [weights | scales | zp] with alignment
layout.weights_offset = 0;
+13 -29
View File
@@ -145,18 +145,13 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
return ctx->data;
}
static bool is_stateful_enabled() {
static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION");
return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0;
}
static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
// GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
// Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
!is_stateful_enabled()) {
!getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
GGML_ASSERT(ctx->tensor_extras.empty());
auto device = ctx->device;
auto size = ctx->size;
@@ -605,14 +600,6 @@ bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) {
static void ggml_backend_openvino_free(ggml_backend_t backend) {
ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
if (ctx->runtime_context) {
auto r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
if (--r_ctx->backend_count == 0) {
r_ctx->clear_caches();
}
}
delete ctx;
delete backend;
}
@@ -657,12 +644,7 @@ static ggml_guid_t ggml_backend_openvino_guid(void) {
}
static std::shared_ptr<ov_runtime_context> get_ov_runtime_context_ptr() {
static std::shared_ptr<ov_runtime_context> r_ctx = [] {
auto ctx = std::make_shared<ov_runtime_context>();
ctx->device = ggml_openvino_get_device_name();
ctx->stateful = is_stateful_enabled() && !ggml_openvino_is_npu();
return ctx;
}();
static std::shared_ptr<ov_runtime_context> r_ctx = std::make_shared<ov_runtime_context>();
return r_ctx;
}
@@ -687,7 +669,8 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
}
std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
r_ctx->backend_count++;
r_ctx->device = ggml_openvino_get_device_name();
r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu();
ggml_backend_t openvino_backend = new ggml_backend{
/* .guid = */ ggml_backend_openvino_guid(),
@@ -900,7 +883,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
const int32_t * op_params = op->op_params;
const int n_dims = op_params[1];
const int mode = op_params[2];
if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_IMROPE) {
if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode);
return true;
}
@@ -913,6 +896,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
return true;
}
float freq_scale;
float ext_factor;
memcpy(&freq_scale, op_params + 6, sizeof(float));
memcpy(&ext_factor, op_params + 7, sizeof(float));
if (ext_factor != 0.0f) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
return true;
}
if (op->src[0]->op == GGML_OP_VIEW) {
if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
// GGML_LOG_WARN(
@@ -922,12 +913,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
return true;
}
}
if (mode == GGML_ROPE_TYPE_IMROPE &&
(op->src[2] != 0 || ((const float *) op_params)[6] != 1 || ((const float *) op_params)[7] != 0 ||
((const float *) op_params)[8] != 1)) {
// GGML_LOG_WARN("OpenVINO backend does not support IMROPE with freq_factors, freq_scale, ext_factor, and attn_factor\n");
return true;
}
break;
}
default:
@@ -957,7 +942,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
// GGML_OP_SOFT_MAX,
GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
static const std::set<ggml_unary_op> supported_unary_ops{
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_SILU,
};
static const std::set<ggml_glu_op> supported_glu_ops{
+193 -265
View File
@@ -46,7 +46,6 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) {
// Extracts (weight, scales, zp) from Q4_0 tensors.
// Data layout is: |16 bit scale|32 x 4bit weights|.
// When zp_arr is empty (symmetric), weights are stored as signed i4 (value - 8).
void extract_q4_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
@@ -56,32 +55,28 @@ void extract_q4_0_data(const ggml_tensor * tensor,
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
// For Q4_0, zero point is always 8
if (is_scalar_zp) {
zp[0] = 8 | (8 << 4); // Pack two 4-bit values
}
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
// For asymmetric quantization, compute per-block zero points
if (!is_scalar_zp) {
// Pack two 4-bit zero points per byte
if (i % 2 == 0) {
zp[i / 2] = 8; // Lower nibble
} else {
zp[i / 2] |= (8 << 4); // Upper nibble
}
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
});
} else {
// Symmetric: unpack as u4 then convert to i4 by subtracting 8 (XOR each nibble)
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
// Convert u4 to i4: subtract 8 from each nibble. XOR 0x88 flips each nibble by 8.
for (int j = 0; j < 16; ++j) {
weights[i * 16 + j] ^= 0x88;
}
});
}
}
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
});
}
// Extracts (weight, scales, zp) from Q4_1 tensors.
@@ -128,7 +123,6 @@ void extract_q4_1_data(const ggml_tensor * tensor,
// Extracts (weight, scales, zp) from Q8_0 tensors.
// Data layout is: |16 bit scale|32 x 8bit weights|.
// When zp_arr is empty (symmetric), weights are stored as signed i8 directly.
void extract_q8_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
@@ -139,30 +133,29 @@ void extract_q8_0_data(const ggml_tensor * tensor,
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
zp[i] = 128;
for (size_t j = 0; j < weights_per_block; ++j) {
uint8_t x = block_data[j + 2];
x ^= 1 << 7; // Convert int8 to uint8 by flipping sign bit
weights[i * weights_per_block + j] = x;
}
});
} else {
// Symmetric: store original int8 values directly (no unsigned bias)
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
// Copy int8 weights as-is (the tensor element type is i8)
memcpy(weights + i * weights_per_block, block_data + 2, weights_per_block);
});
// For Q8_0, zero point is always 128
if (is_scalar_zp) {
zp[0] = 128;
}
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
// For asymmetric quantization, store per-block zero points
if (!is_scalar_zp) {
zp[i] = 128;
}
for (size_t j = 0; j < weights_per_block; ++j) {
uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
// Original data is in int8_t, so we add a bias of -128 and invert the first bit.
x ^= 1 << 7;
weights[i * weights_per_block + j] = x;
}
});
}
void unpack_256_4(const uint8_t * data, uint8_t * dst) {
@@ -263,62 +256,44 @@ void extract_q6_k_data(const ggml_tensor * tensor,
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
for (size_t j = 0; j < 16; j++) {
scales[j + i * 16] =
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
// For Q6_K, zero point is always 32
if (is_scalar_zp) {
zp[0] = 32;
}
ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
float scale_factor =
static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2
for (size_t j = 0; j < 16; j++) {
scales[j + i * 16] =
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
// For asymmetric quantization, store per-block zero points
if (!is_scalar_zp) {
zp[j + i * 16] = 32;
}
uint8_t * ql = block_data;
uint8_t * qh = block_data + 128;
for (int64_t j = 0; j < 32; ++j) {
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
}
});
} else {
// Symmetric: subtract 32 from each weight to store as signed i8
ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
for (size_t j = 0; j < 16; j++) {
scales[j + i * 16] =
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
}
uint8_t * ql = block_data;
uint8_t * qh = block_data + 128;
auto * signed_weights = reinterpret_cast<int8_t *>(weights);
for (int64_t j = 0; j < 32; ++j) {
signed_weights[i * 256 + j] = static_cast<int8_t>((ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 32] =
static_cast<int8_t>((ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 64] = static_cast<int8_t>((ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 96] =
static_cast<int8_t>((ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 128] =
static_cast<int8_t>((ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 160] =
static_cast<int8_t>((ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 192] =
static_cast<int8_t>((ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 224] =
static_cast<int8_t>((ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4)) - 32;
}
});
}
}
uint8_t * ql = block_data;
uint8_t * qh = block_data + 128;
for (int64_t j = 0; j < 32; ++j) {
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
}
});
}
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
@@ -414,10 +389,11 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
size_t group_size,
bool use_bias) {
ov::Shape orig_shape = weight.get_shape();
bool is_signed = (weight.get_element_type() == ov::element::i8); // Symmetric: signed weights, no ZP
// Expand dimensions for scales and zp/bias
auto scale_shape = scales.get_shape();
auto zp_shape = zp.get_shape();
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
@@ -427,48 +403,37 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
} else {
scale_shape.push_back(1);
scales.set_shape(scale_shape);
if (!is_signed && zp.get_size() > 0) {
auto zp_shape = zp.get_shape();
// For symmetric quantization, zp remains scalar (don't resize)
if (!is_scalar_zp) {
zp_shape.push_back(1);
zp.set_shape(zp_shape);
}
}
// Create graph nodes
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
ov::Output<ov::Node> result;
if (is_signed) {
// Signed path: q * s (no zero point subtraction needed)
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i8, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
if (use_bias && !is_scalar_zp) {
// Bias path: w * s + b (zp tensor holds f16 bias values)
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
} else {
// Unsigned path
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
if (use_bias && zp.get_size() > 0) {
// Bias path: w * s + b (zp tensor holds f16 bias values)
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
auto w_s =
std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
} else {
// Zero point path: (w - zp) * s
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
float zp_value;
if (ov::op::util::get_single_value(zero_point, zp_value)) {
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
}
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
auto w_zp =
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
// Zero point path: (w - zp) * s
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
float zp_value;
if (ov::op::util::get_single_value(zero_point, zp_value)) {
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
}
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
auto w_zp =
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
}
if (packed_shape.size() != 2) {
@@ -487,10 +452,11 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
size_t group_size,
bool use_bias) {
ov::Shape orig_weight_shape = weight.get_shape();
bool is_signed = (weight.get_element_type() == ov::element::i4); // Symmetric: signed weights, no ZP
// Expand dimensions for scales and zp/bias
ov::Shape scale_shape = scales.get_shape();
auto zp_shape = zp.get_shape();
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
// Create INT4 weight tensor
ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
@@ -501,48 +467,36 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
} else {
scale_shape.push_back(1);
scales.set_shape(scale_shape);
if (!is_signed && zp.get_size() > 0) {
auto zp_shape = zp.get_shape();
// For symmetric quantization, zp remains scalar (don't resize)
if (!is_scalar_zp) {
zp_shape.push_back(1);
zp.set_shape(zp_shape);
}
}
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
ov::Output<ov::Node> result;
if (is_signed) {
// Signed path: q * s (no zero point subtraction needed)
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i4, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
if (use_bias && !is_scalar_zp) {
// Bias path: w * s + b (zp tensor holds f16 bias values)
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
} else {
// Unsigned path
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
if (use_bias && zp.get_size() > 0) {
// Bias path: w * s + b (zp tensor holds f16 bias values)
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
auto w_s =
std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
} else {
// Zero point path: (w - zp) * s
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
float zp_value;
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
}
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
auto w_zp =
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
// Zero point path: (w - zp) * s
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
float zp_value;
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
}
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
auto w_zp =
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
}
if (packed_shape.size() != 2) {
@@ -745,32 +699,24 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
// Quantized path (normal extraction or quantized requant)
// Create weight/scale/zp tensors - shared between both paths
// For symmetric quantization, use signed types (i4/i8) and no ZP tensor
ov::element::Type weight_type = layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) :
(layout.is_u4 ? ov::element::u4 : ov::element::u8);
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
if (!layout.is_symmetric) {
ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
result.zp = ov::Tensor(zp_type, scale_shape, buf_base + layout.zp_offset);
}
// else: result.zp remains default-constructed (empty) for symmetric
result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
} else {
result.weights = ov::Tensor(weight_type, node_shape);
result.scales = ov::Tensor(ov::element::f16, scale_shape);
if (!layout.is_symmetric) {
if (use_bias) {
result.zp = ov::Tensor(ov::element::f16, scale_shape);
} else {
ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
result.zp = ov::Tensor(zp_type, scale_shape);
}
if (use_bias && !layout.is_symmetric) {
// bias only has effect for asymmetric quant
result.zp = ov::Tensor(ov::element::f16, zp_shape);
} else {
result.zp = ov::Tensor(weight_type, zp_shape);
}
// else: result.zp remains default-constructed (empty) for symmetric
}
if (layout.is_requant && layout.requant_type.has_value()) {
@@ -795,75 +741,59 @@ void quantize_q4_0(const float * x,
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path
auto * zp = static_cast<uint8_t *>(zp_arr.data());
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
for (int i = 0; i < nb; i++) {
float amax = 0.0f;
float max = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
// For Q4_0, zero point is always 8
if (is_scalar_zp) {
zp[0] = 8 | (8 << 4); // Pack two 4-bit values
}
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
float max = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
const float d = max / -8;
if (d == 0) {
scales[i] = ov::float16(1.0f);
}
const float d = max / -8;
if (d == 0) {
scales[i] = ov::float16(1.0f);
// zp is already set to 8 for symmetric, or set per-block for asymmetric
if (!is_scalar_zp) {
if (i % 2 == 0) {
zp[i / 2] = 8;
} else {
zp[i / 2] |= (8 << 4);
}
memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
continue;
}
const float id = 1.0f / d;
scales[i] = ov::float16(d);
memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
continue;
}
const float id = 1.0f / d;
scales[i] = ov::float16(d);
// For asymmetric quantization, store per-block zero points
if (!is_scalar_zp) {
if (i % 2 == 0) {
zp[i / 2] = 8;
} else {
zp[i / 2] |= (8 << 4);
}
for (int j = 0; j < qk / 2; ++j) {
const float x0 = x[i * qk + 2 * j] * id;
const float x1 = x[i * qk + 2 * j + 1] * id;
const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
}
}
} else {
// Symmetric: produce signed i4 values in [-8, 7]
for (int i = 0; i < nb; i++) {
float amax = 0.0f;
float max = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}
const float d = max / -8;
if (d == 0) {
scales[i] = ov::float16(1.0f);
// i4 value 0 packed: 0x00
memset(weights + i * qk / 2, 0, qk / 2);
continue;
}
const float id = 1.0f / d;
scales[i] = ov::float16(d);
for (int j = 0; j < qk / 2; ++j) {
const float x0 = x[i * qk + 2 * j] * id;
const float x1 = x[i * qk + 2 * j + 1] * id;
// Signed i4: range [-8, 7]. Quantize as round(x*id), then pack as 4-bit two's complement.
int8_t si0 = (int8_t) std::max(-8, std::min(7, (int) roundf(x0)));
int8_t si1 = (int8_t) std::max(-8, std::min(7, (int) roundf(x1)));
weights[i * qk / 2 + j] = (si0 & 0x0F) | ((si1 & 0x0F) << 4);
}
for (int j = 0; j < qk / 2; ++j) {
const float x0 = x[i * qk + 2 * j] * id;
const float x1 = x[i * qk + 2 * j + 1] * id;
const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
}
}
}
@@ -879,42 +809,36 @@ void quantize_q8_0(const float * x,
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
auto * zp = static_cast<uint8_t *>(zp_arr.data());
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
for (int i = 0; i < nb; i++) {
float amax = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
amax = std::max(amax, fabsf(v));
}
const float d = amax / 127.0f;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
zp[i] = 128;
for (int j = 0; j < qk; ++j) {
const float x0 = x[i * qk + j] * id;
const int8_t xi0 = roundf(x0);
weights[i * qk + j] = (uint8_t) (xi0 + 128);
// For Q8_0, zero point is always 128
if (is_scalar_zp) {
zp[0] = 128;
}
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (amax < fabsf(v)) {
amax = fabsf(v);
}
}
} else {
// Symmetric: store signed int8 values directly
auto * signed_weights = reinterpret_cast<int8_t *>(weights);
for (int i = 0; i < nb; i++) {
float amax = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
amax = std::max(amax, fabsf(v));
}
const float d = amax / 127.0f;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
for (int j = 0; j < qk; ++j) {
const float x0 = x[i * qk + j] * id;
signed_weights[i * qk + j] = (int8_t) roundf(x0);
}
const float d = amax / 127.0f;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
// For asymmetric quantization, store per-block zero points
if (!is_scalar_zp) {
zp[i] = 128;
}
for (int j = 0; j < qk; ++j) {
const float x0 = x[i * qk + j] * id;
const int8_t xi0 = roundf(x0);
weights[i * qk + j] = (uint8_t) (xi0 + 128);
}
}
}
@@ -937,8 +861,12 @@ void quantize_q8_1(const float * x,
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
min = std::min(v, min);
max = std::max(v, max);
if (v < min) {
min = v;
}
if (v > max) {
max = v;
}
}
const float d = (max - min) / ((1 << 8) - 1);
+7 -33
View File
@@ -9,17 +9,12 @@
#include <openvino/op/add.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/cos.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/split.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/transpose.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <vector>
@@ -38,12 +33,6 @@ OutputVector translate_rope(const NodeContext & context) {
auto data_node = context.get_input(0).get_node_shared_ptr();
auto output_shape = context.get_output_shape().to_shape();
int32_t * op_params = context.get_output_op_params();
const int mode = (op_case & 0xFFFF0000) >> 16;
op_case = (op_case & 0x0000FFFF);
constexpr int TYPE_NORMAL = 0;
constexpr int TYPE_NEOX = 1;
constexpr int TYPE_IMROPE = 2;
Output<Node> cos_theta_node;
Output<Node> sin_theta_node;
@@ -56,7 +45,7 @@ OutputVector translate_rope(const NodeContext & context) {
if (context.get_input_size() == 3) {
rope_freqs_weight = context.get_input(2).get_node_shared_ptr();
}
auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE);
auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight);
sin_theta_node = sin_cos.first;
cos_theta_node = sin_cos.second;
}
@@ -76,7 +65,11 @@ OutputVector translate_rope(const NodeContext & context) {
}
}
if (mode == TYPE_NORMAL) {
const int mode = op_params[2];
constexpr int ROPE_TYPE_NORMAL = 0;
constexpr int ROPE_TYPE_NEOX = 2;
if (mode == ROPE_TYPE_NORMAL) {
auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
@@ -104,7 +97,7 @@ OutputVector translate_rope(const NodeContext & context) {
auto data_shape = ov::op::v0::Constant::create(
ov::element::i64, {4}, std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
res = std::make_shared<ov::op::v1::Reshape>(stack, data_shape, false);
} else if (mode == TYPE_NEOX) {
} else if (mode == ROPE_TYPE_NEOX) {
auto data_split = std::make_shared<ov::op::v1::Split>(
data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2);
Output<Node> slice_data_node_0 = data_split->outputs()[0];
@@ -119,25 +112,6 @@ OutputVector translate_rope(const NodeContext & context) {
std::make_shared<ov::op::v1::Multiply>(slice_data_node_1, cos_theta_node));
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{first_half_node, second_half_node}, -1);
} else if (mode == TYPE_IMROPE) {
int64_t n_dims = data_node->get_shape()[3];
auto cos_sin_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{1,-1,1,(n_dims >> 1)});
auto cos_reshaped = std::make_shared<ov::op::v1::Reshape>(cos_theta_node, cos_sin_shape, true);
auto sin_reshaped = std::make_shared<ov::op::v1::Reshape>(sin_theta_node, cos_sin_shape, true);
auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3});
auto split_a = std::make_shared<ov::op::v1::Split>(data_node, split_axis, 2);
auto x0 = split_a->output(0);
auto x1 = split_a->output(1);
auto mul_a = std::make_shared<ov::op::v1::Multiply>(x0, cos_reshaped);
auto mul_b = std::make_shared<ov::op::v1::Multiply>(x1, sin_reshaped);
auto sub = std::make_shared<ov::op::v1::Subtract>(mul_a, mul_b);
auto mul_c = std::make_shared<ov::op::v1::Multiply>(x0, sin_reshaped);
auto mul_d = std::make_shared<ov::op::v1::Multiply>(x1, cos_reshaped);
auto add = std::make_shared<ov::op::v1::Add>(mul_c, mul_d);
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{sub, add}, 3);
}
return rename_outputs_with_suffix({res}, context.get_name());
@@ -1,25 +0,0 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <openvino/core/node_output.hpp>
#include <openvino/op/gelu.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Converts a GELU unary op into a single OpenVINO v7 Gelu node.
//
// The op must have exactly one input (enforced by num_inputs_check). The
// result is returned through rename_outputs_with_suffix using the name
// reported by the context.
//
// NOTE(review): v7::Gelu is built with its default approximation mode here;
// confirm that this default matches the GELU variant this op is expected to
// compute.
OutputVector translate_unary_gelu(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto gelu = std::make_shared<ov::op::v7::Gelu>(context.get_input(0));
    return rename_outputs_with_suffix({gelu}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -31,7 +31,6 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
{"GGML_OP_SOFT_MAX", op::translate_soft_max },
{"GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract>},
{"GGML_OP_TRANSPOSE", op::translate_transpose },
{"GGML_UNARY_OP_GELU", op::translate_unary_gelu },
{"GGML_UNARY_OP_SILU", op::translate_unary_silu },
{"GGML_OP_VIEW", op::translate_view },
{"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu },
@@ -21,7 +21,6 @@ GGML_OP_CONVERTER(translate_rms_norm);
GGML_OP_CONVERTER(translate_rope);
GGML_OP_CONVERTER(translate_scale);
GGML_OP_CONVERTER(translate_unary_silu);
GGML_OP_CONVERTER(translate_unary_gelu);
GGML_OP_CONVERTER(translate_soft_max);
GGML_OP_CONVERTER(translate_transpose);
GGML_OP_CONVERTER(translate_view);
@@ -0,0 +1,123 @@
#include "eliminate_zp.h"
#include <openvino/core/graph_util.hpp>
#include <openvino/core/parallel.hpp>
#include <openvino/core/rt_info.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/pass/pattern/op/label.hpp>
#include <openvino/pass/pattern/op/pattern.hpp>
#include <openvino/pass/pattern/op/wrap_type.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace pass {
// Registers a matcher that folds a constant scalar zero point into the
// quantized data constant.
//
// Pattern (the Multiply is the matcher root):
//
//     Multiply(<any scale>,
//              Subtract(Convert(Constant data),
//                       Convert(Constant zero_point)))
//
// The rewrite only fires when zero_point holds exactly one element and
//   * data is u4 and the zero point equals 8          -> new constant is i4
//   * data is u8 and the zero point equals 128 or 32  -> new constant is i8
// In both cases the new constant stores (data - zero_point) element-wise and
// the Subtract is replaced by a Convert of that constant to the Subtract's
// original output element type. The Multiply itself is left untouched.
EliminateZeroPoints::EliminateZeroPoints() {
    namespace pattern = ov::pass::pattern;

    auto m_data_constant = pattern::wrap_type<ov::op::v0::Constant>();
    auto m_data_convert  = pattern::wrap_type<ov::op::v0::Convert>({m_data_constant});
    auto m_zp_constant   = pattern::wrap_type<ov::op::v0::Constant>();
    auto m_zp_convert    = pattern::wrap_type<ov::op::v0::Convert>({m_zp_constant});
    auto m_subtract      = pattern::wrap_type<ov::op::v1::Subtract>({m_data_convert, m_zp_convert});
    auto m_scale         = pattern::any_input();
    auto m_multiply      = pattern::wrap_type<ov::op::v1::Multiply>({m_scale, m_subtract});

    const auto callback = [=](ov::pass::pattern::Matcher & m) {
        const auto & matched = m.get_pattern_value_map();

        const auto mul =
            std::dynamic_pointer_cast<ov::op::v1::Multiply>(matched.at(m_multiply).get_node_shared_ptr());
        const auto sub =
            std::dynamic_pointer_cast<ov::op::v1::Subtract>(matched.at(m_subtract).get_node_shared_ptr());
        const auto weights =
            std::dynamic_pointer_cast<ov::op::v0::Constant>(matched.at(m_data_constant).get_node_shared_ptr());
        const auto zero_point =
            std::dynamic_pointer_cast<ov::op::v0::Constant>(matched.at(m_zp_constant).get_node_shared_ptr());

        if (!(mul && sub && weights && zero_point)) {
            return false;
        }

        // Only a single-element zero point can be folded into the data.
        if (ov::shape_size(zero_point->get_shape()) != 1) {
            return false;
        }

        const auto zp_values = zero_point->cast_vector<int>();
        if (zp_values.empty()) {
            return false;
        }
        const int  zp       = zp_values.front();
        const auto src_type = weights->get_element_type();

        // Pick the signed type that can hold (data - zp); bail out on any
        // other type / zero-point combination.
        ov::element::Type dst_type;
        if (src_type == ov::element::u4 && zp == 8) {
            dst_type = ov::element::i4;
        } else if (src_type == ov::element::u8 && (zp == 128 || zp == 32)) {
            dst_type = ov::element::i8;
        } else {
            return false;
        }

        const auto   shape = weights->get_shape();
        const size_t count = ov::shape_size(shape);

        // TODO improve performance
        // Both accepted source types are read as one uint8_t per element and
        // shifted by the zero point (zp is 8 whenever the source is u4).
        const auto          raw = weights->cast_vector<uint8_t>();
        std::vector<int8_t> shifted(count);
        ov::parallel_for(count, [&](size_t idx) {
            shifted[idx] = static_cast<int8_t>(static_cast<int>(raw[idx]) - zp);
        });

        auto signed_weights = std::make_shared<ov::op::v0::Constant>(dst_type, shape, shifted);
        auto signed_convert =
            std::make_shared<ov::op::v0::Convert>(signed_weights, sub->get_output_element_type(0));

        ov::replace_node(sub, signed_convert);
        return true;
    };

    register_matcher(
        std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
        callback);
}
} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -0,0 +1,17 @@
#include "openvino/pass/matcher_pass.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace pass {
// Matcher pass declared for the ggml frontend. The pattern it matches and the
// rewrite it performs are set up entirely in the constructor, which is
// defined out of line.
class EliminateZeroPoints : public ov::pass::MatcherPass {
public:
// RTTI name under which this pass is registered.
OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints")
// Builds the pattern and registers the matcher callback.
EliminateZeroPoints();
};
} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -1,41 +0,0 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <openvino/core/core_visibility.hpp>
#include <openvino/core/node.hpp>
#include <openvino/core/runtime_attribute.hpp>
namespace ov {
/**
* @brief Holds weightless caching attributes of a single constant.
*
* WeightlessCacheAttribute class represents runtime info attribute that holds
* the values of original size of the constant in bytes and the binary offset of the
* constant's data in the weights file used by the weightless caching mechanism. It's
* not copyable in case the data was changed (the original node was replaced by a new
* one produced during the tranformation pipeline) - in that case weightless caching
* can't be used for that constant.
*/
class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute {
public:
OPENVINO_RTTI("WeightlessCacheAttribute", "0", RuntimeAttribute)
// No default construction: all three values must be supplied explicitly.
WeightlessCacheAttribute() = delete;
// Stores the three arguments unchanged in the same-named public members.
WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype)
: original_size(original_size),
bin_offset(bin_offset),
original_dtype(original_dtype) {}
// Overrides the RuntimeAttribute hook; only declared here, the definition
// is not part of this header.
bool is_copyable() const override;
// Original size of the constant, in bytes.
size_t original_size;
// Binary offset of the constant's data in the weights file.
size_t bin_offset;
// Element type recorded for the constant; presumably its type before any
// transformation changed it (TODO confirm against the producers of this
// attribute).
ov::element::Type original_dtype;
};
} // namespace ov
@@ -3,16 +3,15 @@
#include "ggml-openvino/openvino/node_context.h"
#include "ggml-openvino/openvino/utils.h"
#include "input_model.h"
#include "pass/eliminate_zp.h"
#include "pass/mark_decompression_convert_constant_folding.h"
#include "pass/squeeze_matmul.h"
#include "rt_info/weightless_caching_attributes.hpp"
#include <cstdint>
#include <cstdlib>
#include <map>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/preprocess/pre_post_process.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
@@ -34,6 +33,7 @@
#include <openvino/op/unsqueeze.hpp>
#include <openvino/pass/constant_folding.hpp>
#include <openvino/pass/make_stateful.hpp>
#include <openvino/core/preprocess/pre_post_process.hpp>
namespace ov {
namespace frontend {
@@ -240,31 +240,6 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
resulting_model = std::make_shared<Model>(results, used_params);
apply_transformations(resulting_model);
// Set WeightlessCacheAttribute on large constants to avoid unnecessary memory copies
// in the NPUW plugin. Without this attribute, NPUW's LazyTensor constructor
// (lazy_tensor.cpp, op::Const::Const) will memcpy every constant "in case export
// occurs", doubling memory usage per compile_model call.
//
// The bin_offset field serves as a unique key (not a real file offset) — this is
// the same convention the GPU plugin uses for non-IR models (see
// Plugin::set_weightless_cache_attributes in intel_gpu/src/plugin/plugin.cpp).
// Each constant must have a distinct bin_offset, otherwise GPU's weightless cache
// import will map multiple constants to the same data.
//
// Small constants (< 16 elements) are excluded since they may be introduced by
// optimization patterns and the overhead is negligible.
size_t offset = 0;
for (auto & node : resulting_model->get_ordered_ops()) {
if (auto cnst = ov::as_type_ptr<ov::op::v0::Constant>(node);
cnst && cnst->get_byte_size() / cnst->get_element_type().size() >= 16) {
auto & rt_info = cnst->get_rt_info();
if (rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()) == rt_info.end()) {
rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] =
ov::WeightlessCacheAttribute(cnst->get_byte_size(), offset++, cnst->get_element_type());
}
}
}
return resulting_model;
}
@@ -282,6 +257,7 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
}
if (ggml_model_decoder->is_static()) {
manager.register_pass<pass::EliminateZeroPoints>();
manager.register_pass<pass::SqueezeMatmul>();
}
manager.run_passes(model);
+36 -67
View File
@@ -2,7 +2,6 @@
#include "ggml-impl.h"
#include <cmath>
#include <cstddef>
#include <ctime>
#include <memory>
@@ -14,7 +13,6 @@
#include <openvino/op/gather.hpp>
#include <openvino/op/maximum.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/squeeze.hpp>
@@ -89,11 +87,8 @@ ov::Output<ov::Node> rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], fl
auto ramp_y =
std::make_shared<ov::op::v1::Divide>(std::make_shared<ov::op::v1::Subtract>(dim_ids, corr_low), denom);
auto ramp_clamped = std::make_shared<ov::op::v0::Clamp>(ramp_y, 0.0f, 1.0f);
// rope_yarn_ramp returns (1 - clamp(y)), so invert before scaling
auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
auto ramp_inverted = std::make_shared<ov::op::v1::Subtract>(one, ramp_clamped);
auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor});
auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_inverted, ext_factor_node);
auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_clamped, ext_factor_node);
return ramp_mix;
}
@@ -120,7 +115,6 @@ void ggml_rope_yarn_corr_dims(int n_dims,
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params,
std::shared_ptr<ov::Node> inp_pos,
std::shared_ptr<ov::Node> rope_freqs_weight,
bool imrope,
bool stateful) {
if (stateful) {
inp_pos = std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
@@ -128,13 +122,6 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
auto pos_perm =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
} else if (imrope) {
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
auto pos_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{5}, {0, 0, 0, 4, -1});
inp_pos = std::make_shared<ov::op::v1::Reshape>(inp_pos, pos_shape, true);
auto pos_transpose_shape =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{5}, std::vector<int64_t>{0, 1, 2, 4, 3});
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_transpose_shape);
} else {
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
auto pos_perm =
@@ -149,7 +136,6 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
float beta_fast;
float beta_slow;
const int n_dims = rope_params[1];
const size_t n_dims_half = n_dims >> 1;
const int n_ctx_orig = rope_params[4];
memcpy(&freq_base, rope_params + 5, sizeof(float));
memcpy(&freq_scale, rope_params + 6, sizeof(float));
@@ -160,74 +146,57 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
const float theta_scale = powf(freq_base, -2.0f / n_dims);
std::vector<float> factor(n_dims_half);
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
std::vector<float> factor(n_dims / 2);
factor[0] = 1.0f;
for (size_t i = 1; i < factor.size(); i++) {
factor[i] = theta_scale * factor[i - 1];
}
Output<Node> freq_factors;
if (stateful) {
freq_factors =
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
} else {
freq_factors =
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor);
}
if (rope_freqs_weight) {
freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
}
auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
Output<Node> theta;
float mscale = attn_factor;
if (imrope) {
std::vector<int64_t> gather_indices(n_dims_half);
for (size_t j = 0; j < n_dims_half; j++) {
gather_indices[j] = j % 3;
factor[j] = std::pow(theta_scale, j);
}
auto gather_indices_const =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{n_dims_half}, gather_indices);
auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {4});
inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, gather_indices_const, gather_axis);
auto factor_const = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{n_dims_half}, factor);
theta = std::make_shared<ov::op::v1::Multiply>(inp_pos, factor_const);
if (ext_factor == 0.0f) {
theta = theta_interp;
} else {
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
factor[0] = 1.0f;
for (size_t i = 1; i < factor.size(); i++) {
factor[i] = theta_scale * factor[i - 1];
}
auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
Output<Node> one;
if (stateful) {
freq_factors =
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
} else {
freq_factors =
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor);
}
if (rope_freqs_weight) {
freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
}
auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
if (ext_factor == 0.0f) {
theta = theta_interp;
} else {
auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
Output<Node> one;
if (stateful) {
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
} else {
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
}
auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
}
theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
}
Output<Node> cos_theta = std::make_shared<ov::op::v0::Cos>(theta);
Output<Node> sin_theta = std::make_shared<ov::op::v0::Sin>(theta);
if (!imrope) {
auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
}
auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
return std::make_pair(sin_theta, cos_theta);
}
-1
View File
@@ -67,7 +67,6 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
std::shared_ptr<ov::Node> inp_pos,
std::shared_ptr<ov::Node> rope_freqs_weight = nullptr,
bool imrope = false,
bool stateful = false);
ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);
+45 -102
View File
@@ -81,8 +81,8 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) {
auto & core = ov_singleton_core();
const auto & config = ggml_openvino_get_compile_config();
const auto & device = r_ctx->device;
const auto & stateful = r_ctx->stateful;
auto device = r_ctx->device;
bool stateful = r_ctx->stateful;
static auto is_static = false;
if (is_naive(cgraph)) {
@@ -106,26 +106,14 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
int64_t infer_end_time;
{
std::shared_ptr<decoder_runtime_ctx> entry;
std::lock_guard<std::mutex> lock(r_ctx->ov_compute_mutex);
auto it = r_ctx->decoder_cache.find(key);
cache_hit = it != r_ctx->decoder_cache.end();
ModelParams old_m_params;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
auto it = r_ctx->decoder_cache.find(key);
cache_hit = it != r_ctx->decoder_cache.end();
if (cache_hit) {
entry = it->second;
} else {
auto mutex = std::make_shared<std::mutex>();
entry = std::make_shared<decoder_runtime_ctx>(mutex);
r_ctx->decoder_cache[key] = entry;
}
}
std::lock_guard<std::mutex> lock(*(entry->mutex));
if (cache_hit) {
ggml_decoder = entry->ptr;
ggml_decoder = it->second;
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_dynamically(m_params);
}
@@ -138,10 +126,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
ggml_decoder->update_io(cgraph);
}
ggml_decoder->add_extra_inputs();
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
infer_request = r_ctx->infer_request_cache.at(key);
}
infer_request = r_ctx->infer_request_cache.at(key);
if (stateful) {
const auto * inp_pos = get_inp_pos_tensor(cgraph);
@@ -185,10 +170,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
conversion_end_time = decoder_end_time;
compile_end_time = decoder_end_time;
} else {
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache.erase(key);
}
r_ctx->infer_request_cache.erase(key);
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
@@ -217,7 +199,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
}
compile_end_time = ggml_time_us();
infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
entry->ptr = ggml_decoder;
r_ctx->infer_request_cache[key] = infer_request;
r_ctx->decoder_cache[key] = ggml_decoder;
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
@@ -227,13 +210,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
for (const auto & ov_output : model->get_results()) {
ov_output_names.push_back(ov_output->get_friendly_name());
}
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache[key] = infer_request;
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
}
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
if (stateful) {
const auto * inp_pos = get_inp_pos_tensor(cgraph);
@@ -246,13 +224,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
}
}
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
ov_input_names = r_ctx->ov_input_names_cache[key];
ov_output_names = r_ctx->ov_output_names_cache[key];
}
auto ov_input_names = r_ctx->ov_input_names_cache[key];
auto ov_output_names = r_ctx->ov_output_names_cache[key];
for (size_t i = 0; i < ov_input_names.size(); i++) {
auto param_name = ov_input_names[i];
@@ -333,26 +306,12 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
int64_t compile_end_time;
int64_t infer_end_time;
std::shared_ptr<decoder_runtime_ctx> entry;
auto it = r_ctx->decoder_cache.find(key);
cache_hit = it != r_ctx->decoder_cache.end();
ModelParams old_m_params;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
auto it = r_ctx->decoder_cache.find(key);
cache_hit = it != r_ctx->decoder_cache.end();
if (cache_hit) {
entry = it->second;
} else {
auto mutex = std::make_shared<std::mutex>();
entry = std::make_shared<decoder_runtime_ctx>(mutex);
r_ctx->decoder_cache[key] = entry;
}
}
std::lock_guard<std::mutex> lock(*(entry->mutex));
if (cache_hit) {
ggml_decoder = entry->ptr;
ggml_decoder = it->second;
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_statically(m_params);
}
@@ -366,21 +325,14 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
ggml_decoder->update_io(cgraph);
}
ggml_decoder->add_extra_inputs();
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
infer_request =
is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
}
infer_request = is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
decoder_end_time = ggml_time_us();
conversion_end_time = decoder_end_time;
compile_end_time = decoder_end_time;
} else {
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache.erase(key);
r_ctx->infer_request_cache_prefill.erase(key);
}
r_ctx->infer_request_cache.erase(key);
r_ctx->infer_request_cache_prefill.erase(key);
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
@@ -420,14 +372,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
compiled_model_decode = core.compile_model(model_decode, device, config);
}
auto infer_request_prefill = std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
auto infer_request_decode = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
r_ctx->infer_request_cache_prefill[key] =
std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
r_ctx->infer_request_cache[key] =
std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
compile_end_time = ggml_time_us();
model = is_prefill ? model_prefill : model_decode;
ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
infer_request = is_prefill ? infer_request_prefill : infer_request_decode;
entry->ptr = ggml_decoder;
infer_request = is_prefill ? r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key];
r_ctx->decoder_cache[key] = ggml_decoder;
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
@@ -437,29 +391,18 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
for (const auto & ov_output : model->get_results()) {
ov_output_names.push_back(ov_output->get_friendly_name());
}
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache_prefill[key] = infer_request_prefill;
r_ctx->infer_request_cache[key] = infer_request_decode;
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
}
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
}
std::vector<std::string> ov_input_names_local;
std::vector<std::string> ov_output_names_local;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
ov_input_names_local = r_ctx->ov_input_names_cache[key];
ov_output_names_local = r_ctx->ov_output_names_cache[key];
}
auto ov_input_names = r_ctx->ov_input_names_cache[key];
auto ov_output_names = r_ctx->ov_output_names_cache[key];
if (is_prefill) {
auto inp_len = inp_pos->ne[0];
for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
for (size_t i = 0; i < ov_input_names_local.size(); i++) {
auto param_name = ov_input_names_local[i];
for (size_t i = 0; i < ov_input_names.size(); i++) {
auto param_name = ov_input_names[i];
auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index);
infer_request->set_input_tensor(i, input_tensor);
@@ -469,8 +412,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
}
}
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
for (size_t i = 0; i < ov_output_names.size(); i++) {
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
infer_request->set_output_tensor(i, output_tensor);
}
@@ -478,16 +421,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
infer_request->infer();
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
for (size_t i = 0; i < ov_output_names.size(); i++) {
const auto output_tensor = infer_request->get_output_tensor(i);
print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
}
}
}
infer_end_time = ggml_time_us();
} else {
for (size_t i = 0; i < ov_input_names_local.size(); i++) {
auto param_name = ov_input_names_local[i];
for (size_t i = 0; i < ov_input_names.size(); i++) {
auto param_name = ov_input_names[i];
auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name);
infer_request->set_input_tensor(i, input_tensor);
@@ -497,8 +440,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
}
}
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
for (size_t i = 0; i < ov_output_names.size(); i++) {
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
infer_request->set_output_tensor(i, output_tensor);
}
@@ -507,9 +450,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
infer_end_time = ggml_time_us();
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
for (size_t i = 0; i < ov_output_names.size(); i++) {
const auto output_tensor = infer_request->get_output_tensor(i);
print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
}
}
}
+3 -23
View File
@@ -3,15 +3,12 @@
#include "ggml-impl.h"
#include <algorithm>
#include <atomic>
#include <cstddef>
#include <memory>
#include <mutex>
#include <openvino/runtime/core.hpp>
#include <openvino/runtime/infer_request.hpp>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
struct graph_key {
@@ -43,17 +40,11 @@ struct graph_key_hash {
}
};
// Cache entry that bundles a decoder with the mutex guarding it.
// The decoder cache stores these entries behind std::shared_ptr, and the
// compute path locks `mutex` before reading or assigning `ptr`.
struct decoder_runtime_ctx {
// Takes shared ownership of `mutex`; `ptr` is default-constructed (null) until assigned.
decoder_runtime_ctx(std::shared_ptr<std::mutex> mutex) : mutex(std::move(mutex)) {}
// Per-entry mutex, shared so it can outlive any single holder of the entry.
std::shared_ptr<std::mutex> mutex;
// Decoder associated with this entry; null until a decoder has been stored.
std::shared_ptr<GgmlOvDecoder> ptr;
};
struct ov_runtime_context {
mutable std::mutex ctx_mutex;
std::mutex ov_compute_mutex;
std::string device;
bool stateful;
std::unordered_map<graph_key, std::shared_ptr<decoder_runtime_ctx>, graph_key_hash> decoder_cache;
std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
@@ -62,22 +53,11 @@ struct ov_runtime_context {
// Simultaneous stateful inference request support to be added.
size_t stateful_kv_size;
std::map<std::string, std::string> kv_state_input_name_map;
std::atomic<int> backend_count;
ov_runtime_context() :
device("CPU"),
stateful(false),
stateful_kv_size(0),
backend_count(0) {}
void clear_caches() {
std::lock_guard<std::mutex> lock(ctx_mutex);
decoder_cache.clear();
infer_request_cache.clear();
infer_request_cache_prefill.clear();
ov_input_names_cache.clear();
ov_output_names_cache.clear();
}
stateful_kv_size(0) {}
};
enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend);
-7
View File
@@ -28,13 +28,6 @@
namespace syclexp = sycl::ext::oneapi::experimental;
#if defined(__INTEL_LLVM_COMPILER) && __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
#include <sycl/ext/oneapi/bfloat16.hpp>
#ifndef GGML_SYCL_HAS_BF16
#define GGML_SYCL_HAS_BF16
#endif
#endif
#if GGML_SYCL_DNNL
#include "dnnl.hpp"
#include "dnnl_sycl.hpp"
+7 -16
View File
@@ -2,6 +2,13 @@
#include "dequantize.hpp"
#include "presets.hpp"
#if defined(__INTEL_LLVM_COMPILER)
#if __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
#include <sycl/ext/oneapi/bfloat16.hpp>
#define GGML_SYCL_HAS_BF16
#endif
#endif
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
const sycl::nd_item<3> &item_ct1) {
@@ -760,22 +767,6 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
}
#ifdef GGML_SYCL_HAS_BF16
// Return the conversion routine that turns data of `type` into bfloat16.
// F32, F16 and BF16 sources are handled; any other type aborts with a fatal
// error. The tensor argument is accepted for signature parity and is unused.
to_bf16_sycl_t ggml_get_to_bf16_sycl(ggml_type type, ggml_tensor * /*dst*/) {
    if (type == GGML_TYPE_F32) {
        return convert_unary_sycl<float>;
    }
    if (type == GGML_TYPE_F16) {
        return convert_unary_sycl<sycl::half>;
    }
    if (type == GGML_TYPE_BF16) {
        return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
    }

    // No converter exists for this source type.
    GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
    return nullptr;
}
#endif
to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
-9
View File
@@ -23,11 +23,6 @@ typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst);
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor * dst);
#ifdef GGML_SYCL_HAS_BF16
typedef to_t_sycl_t<sycl::ext::oneapi::bfloat16> to_bf16_sycl_t;
to_bf16_sycl_t ggml_get_to_bf16_sycl(ggml_type type, ggml_tensor * dst);
#endif
// Nc = Non-contiguous
template <typename T>
using to_t_nc_sycl_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
@@ -40,19 +35,15 @@ template<typename dst_t, typename src_t>
inline dst_t ggml_sycl_cast(src_t x) {
if constexpr (std::is_same_v<dst_t, src_t>) {
return x;
#ifdef GGML_SYCL_HAS_BF16
} else if constexpr (std::is_same_v<dst_t, sycl::ext::oneapi::bfloat16>) {
return sycl::ext::oneapi::bfloat16(float(x));
} else if constexpr (std::is_same_v<src_t, sycl::ext::oneapi::bfloat16>) {
return static_cast<float>(x);
#endif
} else if constexpr (std::is_same_v<src_t, sycl::float2> && std::is_same_v<dst_t, sycl::half2>) {
return x.template convert<sycl::half, sycl::rounding_mode::rte>();
#ifdef GGML_SYCL_HAS_BF16
} else if constexpr (std::is_same_v<src_t, sycl::float2> &&
std::is_same_v<dst_t, sycl::vec<sycl::ext::oneapi::bfloat16, 2>>) {
return {x.x, x.y};
#endif
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
return int32_t(x);
} else {
-3
View File
@@ -29,9 +29,6 @@ public:
static constexpr dt to_dt() {
if constexpr (std::is_same_v<T, float>) return dt::f32;
else if constexpr (std::is_same_v<T, sycl::half>) return dt::f16;
#ifdef GGML_SYCL_HAS_BF16
else if constexpr (std::is_same_v<T, sycl::ext::oneapi::bfloat16>) return dt::bf16;
#endif
else static_assert(0);
}
+2 -79
View File
@@ -2176,31 +2176,6 @@ inline void ggml_sycl_op_mul_mat_sycl(
#else
bool use_fp16 = false;
#endif
#if GGML_SYCL_DNNL && defined(GGML_SYCL_HAS_BF16)
// Fast path for bf16 src0
if (src0->type == GGML_TYPE_BF16 && !g_ggml_sycl_disable_dnn && ggml_is_contiguous(src0) &&
row_diff == src0->ne[1]) {
using bf16_t = sycl::ext::oneapi::bfloat16;
ggml_sycl_pool_alloc<bf16_t> src1_as_bf16(ctx.pool(), src1_ncols*ne10);
if (src1->type != GGML_TYPE_BF16) {
const to_bf16_sycl_t to_bf16_sycl = ggml_get_to_bf16_sycl(src1->type, dst);
GGML_ASSERT(to_bf16_sycl != nullptr);
to_bf16_sycl(src1_ddf_i, src1_as_bf16.get(), src1_ncols*ne10, stream);
} else {
stream->memcpy(src1_as_bf16.get(), src1_ddf_i, src1_ncols*ne10*sizeof(bf16_t));
}
DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10,
src0_dd_i, DnnlGemmWrapper::to_dt<bf16_t>(),
src1_as_bf16.get(), DnnlGemmWrapper::to_dt<bf16_t>(),
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
GGML_UNUSED(dst);
GGML_UNUSED(src1_ddq_i);
GGML_UNUSED(src1_padded_row_size);
return;
}
#endif
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
@@ -3808,51 +3783,6 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
}
}
// Fused MoE TG fast path. Returns false to fall back to the per-expert loop below.
//
// Quantizes src1 to Q8_1 once and hands the whole expert selection to
// ggml_sycl_mul_mat_vec_q_id in a single call, instead of looping over experts.
// Every early `return false` below is a precondition the fused path does not
// handle; the final return value is whatever ggml_sycl_mul_mat_vec_q_id reports
// (false when src0->type has no case there).
static bool ggml_sycl_mul_mat_id_mmvq_fused(
ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst)
{
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
// Only src1 with ne[2] == 1 is handled here.
if (ne12 != 1) return false;
// Activations and output must both be F32.
if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false;
// Row length must match the weights and be a whole number of Q8_1 blocks.
if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
if (!ggml_is_contiguous(src1)) return false;
// Reorder layout not supported; fall back.
const ggml_tensor_extra_gpu * src0_extra =
static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
if (src0_extra && src0_extra->optimized_feature.reorder) return false;
const int64_t n_ids_per_group = ids->ne[0];
// Exactly one row of expert ids is supported.
if (ids->ne[1] != 1) return false;
// src1 is either a single row shared by all selected experts, or one row per selected expert.
if (ne11 != 1 && ne11 != n_ids_per_group) return false;
const queue_ptr stream = ctx.stream();
const int src1_padded_cols = GGML_PAD((int) ne10, MATRIX_ROW_PADDING);
const int n_experts_used = (int) n_ids_per_group;
const int nrows = (int) src0->ne[1];
// Scratch buffer for the Q8_1 copy of src1: ne11 rows of padded width.
// NOTE(review): the quantization below is enqueued before src0->type is known to be
// supported; if the dispatcher returns false this work is discarded — consider
// checking the type first.
ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
(size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
char * src1_ddq = src1_q8_alloc.get();
quantize_row_q8_1_sycl<quantize_q8_1>(
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
src1_padded_cols, stream);
const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
// Stride 0 makes every expert read the same quantized row when src1 has a single row.
const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
// ids->data is passed straight through as the int32 id array the kernel indexes.
return ggml_sycl_mul_mat_vec_q_id(
src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
(float *) dst->data, (int) ne10, nrows, n_experts_used,
/*expert_weight_stride=*/ src0->nb[2],
/*dst_row_stride=*/ dst->nb[1],
src1_row_stride, stream);
}
static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
ggml_tensor *dst) try {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
@@ -3868,12 +3798,6 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
const int64_t n_as = ne02;
const int64_t n_ids = ids->ne[0];
if (ne12 == 1) {
if (ggml_sycl_mul_mat_id_mmvq_fused(ctx, src0, src1, ids, dst)) {
return;
}
}
std::vector<char> ids_host(ggml_nbytes(ids));
const char * ids_dev = (const char *) ids->data;
@@ -3924,9 +3848,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
}
}
} else {
const int64_t n_routed_rows = ids->ne[1] * n_ids;
ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne10);
ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne0);
ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
src1_row.data = src1_contiguous.get();
dst_row.data = dst_contiguous.get();
-151
View File
@@ -1199,154 +1199,3 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
GGML_UNUSED(src1_ddf_i);
GGML_UNUSED(ctx);
}
// Device-side body of the MoE quantized mat-vec: each work-group along dimension 1
// handles one selected expert slot, each (group(2), local_id(1)) pair handles one
// output row, and the lanes along dimension 2 split that row's blocks between them.
// src1_row_stride: 0 for shared src1 (gate/up proj), else per-expert stride (down proj).
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void mul_mat_vec_q_moe(
const void * __restrict__ vx_base, const void * __restrict__ vy_base,
float * __restrict__ dst_base, const int32_t * __restrict__ ids_dev,
const int ncols, const int nrows,
const size_t expert_weight_stride, const size_t dst_row_stride,
const size_t src1_row_stride,
const sycl::nd_item<3> & item_ct1) {
// Slot in the id list handled by this work-group, and the expert id stored in that slot.
const int expert_idx = item_ct1.get_group(1);
const int i02 = ids_dev[expert_idx];
// Weights are addressed by expert id; src1 and dst are addressed by slot. All strides are in bytes.
const char * vx = (const char *) vx_base + (size_t) i02 * expert_weight_stride;
const char * vy = (const char *) vy_base + (size_t) expert_idx * src1_row_stride;
float * dst = (float *) ((char *) dst_base + (size_t) expert_idx * dst_row_stride);
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
// NOTE(review): this early return precedes the sub-group reduction below; it assumes all
// lanes of a sub-group share the same `row` (i.e. a sub-group spans dimension 2 only) — confirm.
if (row >= nrows) {
return;
}
const int blocks_per_row = ncols / qk;
constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi;
// Per-lane partial dot product.
float tmp = 0.0f;
const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy;
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
// Block index into this expert's weights, and the matching first Q8_1 block of src1.
const int ibx = row * blocks_per_row + i;
const int iby = i * (qk / QK8_1);
for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
const int iqs = elem + vdr * (item_ct1.get_local_id(2) % (qi / vdr));
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
}
}
// Sum the partials across the sub-group with XOR-permute exchanges.
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
// Lane 0 along dimension 2 writes the reduced value for this row.
if (item_ct1.get_local_id(2) == 0) {
dst[row] = tmp;
}
}
// Enqueue mul_mat_vec_q_moe on `stream` for one quantization format.
// The launch uses n_experts_used work-groups along dimension 1 and
// ceil(nrows / GGML_SYCL_MMV_Y) work-groups along dimension 2; each work-group
// is 1 x GGML_SYCL_MMV_Y x WARP_SIZE work-items. All pointer and stride
// arguments are forwarded unchanged to the kernel body.
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void launch_mul_mat_vec_q_moe(
const void * vx_base, const void * vy, const int32_t * ids_dev,
float * dst_base, const int ncols, const int nrows, const int n_experts_used,
const size_t expert_weight_stride, const size_t dst_row_stride,
const size_t src1_row_stride,
dpct::queue_ptr stream) {
// Number of work-groups needed to cover all rows, GGML_SYCL_MMV_Y rows per group.
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, (unsigned) n_experts_used, (unsigned) block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
// Kernel arguments are captured by value; the sub-group size is pinned to WARP_SIZE.
[=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_moe<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
vx_base, vy, dst_base, ids_dev, ncols, nrows,
expert_weight_stride, dst_row_stride, src1_row_stride, item);
});
});
}
// Pick the MoE mat-vec kernel instantiation matching the quantization format of
// src0 and enqueue it on `stream`.
// Returns true when a kernel was launched, false when `src0_type` has no case
// here (nothing is enqueued in that situation and the caller should fall back).
bool ggml_sycl_mul_mat_vec_q_id(
    enum ggml_type src0_type,
    const void * vx_base,
    const void * vy,
    const int32_t * ids_dev,
    float * dst_base,
    int ncols,
    int nrows,
    int n_experts_used,
    size_t expert_weight_stride,
    size_t dst_row_stride,
    size_t src1_row_stride,
    dpct::queue_ptr stream) {
// Expands to one `case`: launch the kernel with the given template arguments,
// forwarding every runtime argument unchanged, then report success.
#define GGML_SYCL_MMVQ_ID_CASE(type_id, qk_v, qi_v, block_t, vdr_v, vec_dot_fn)                  \
    case type_id:                                                                               \
        launch_mul_mat_vec_q_moe<qk_v, qi_v, block_t, vdr_v, vec_dot_fn>(                       \
            vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,                       \
            expert_weight_stride, dst_row_stride, src1_row_stride, stream);                     \
        return true

    switch (src0_type) {
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_Q4_0,  QK4_0,    QI4_0,    block_q4_0,  VDR_Q4_0_Q8_1_MMVQ,  vec_dot_q4_0_q8_1);
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_Q4_1,  QK4_1,    QI4_1,    block_q4_1,  VDR_Q4_1_Q8_1_MMVQ,  vec_dot_q4_1_q8_1);
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_Q5_0,  QK5_0,    QI5_0,    block_q5_0,  VDR_Q5_0_Q8_1_MMVQ,  vec_dot_q5_0_q8_1);
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_Q5_1,  QK5_1,    QI5_1,    block_q5_1,  VDR_Q5_1_Q8_1_MMVQ,  vec_dot_q5_1_q8_1);
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_Q8_0,  QK8_0,    QI8_0,    block_q8_0,  VDR_Q8_0_Q8_1_MMVQ,  vec_dot_q8_0_q8_1);
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_Q2_K,  QK_K,     QI2_K,    block_q2_K,  VDR_Q2_K_Q8_1_MMVQ,  vec_dot_q2_K_q8_1);
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_Q3_K,  QK_K,     QI3_K,    block_q3_K,  VDR_Q3_K_Q8_1_MMVQ,  vec_dot_q3_K_q8_1);
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_Q4_K,  QK_K,     QI4_K,    block_q4_K,  VDR_Q4_K_Q8_1_MMVQ,  vec_dot_q4_K_q8_1);
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_Q5_K,  QK_K,     QI5_K,    block_q5_K,  VDR_Q5_K_Q8_1_MMVQ,  vec_dot_q5_K_q8_1);
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_Q6_K,  QK_K,     QI6_K,    block_q6_K,  VDR_Q6_K_Q8_1_MMVQ,  vec_dot_q6_K_q8_1);
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_MXFP4, QK_MXFP4, QI_MXFP4, block_mxfp4, VDR_MXFP4_Q8_1_MMVQ, vec_dot_mxfp4_q8_1);
        GGML_SYCL_MMVQ_ID_CASE(GGML_TYPE_NVFP4, QK_NVFP4, QI_NVFP4, block_nvfp4, VDR_NVFP4_Q8_1_MMVQ, vec_dot_nvfp4_q8_1);
        default:
            return false;
    }

#undef GGML_SYCL_MMVQ_ID_CASE
}
-16
View File
@@ -24,20 +24,4 @@ void ggml_sycl_op_mul_mat_vec_q(
const int64_t src1_ncols, const int64_t src1_padded_row_size,
const dpct::queue_ptr &stream);
// Enqueues the MoE (mul_mat_id) quantized mat-vec for all selected experts in one launch.
// Requires standard (non-reorder) block layout for src0.
// Returns false if src0_type isn't handled; caller should fall back.
bool ggml_sycl_mul_mat_vec_q_id(
enum ggml_type src0_type,
const void * vx_base, // start of stacked expert weights
const void * vy, // pre-quantized src1 (Q8_1)
const int32_t * ids_dev, // device-side int32, length n_experts_used
float * dst_base, // start of the output; one row of nrows floats per selected expert
int ncols, // elements per weight row
int nrows, // weight rows (= output values) per expert
int n_experts_used,
size_t expert_weight_stride, // bytes between experts in vx_base
size_t dst_row_stride, // bytes between dst rows
size_t src1_row_stride, // 0 = shared src1, else per-expert stride in bytes
dpct::queue_ptr stream); // queue the kernel is submitted to
#endif // GGML_SYCL_MMVQ_HPP
+1 -7
View File
@@ -4,11 +4,7 @@
namespace utils {
template<typename T>
static constexpr bool is_arithmetic_v() {
return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half>
#ifdef GGML_SYCL_HAS_BF16
|| std::is_same_v<T, sycl::ext::oneapi::bfloat16>
#endif
;
return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half> || std::is_same_v<T, sycl::ext::oneapi::bfloat16>;
}
}
@@ -185,7 +181,6 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s
stream
);
break;
#ifdef GGML_SYCL_HAS_BF16
case GGML_TYPE_BF16:
set_rows_sycl<TIn, TIdx, sycl::ext::oneapi::bfloat16>(
src0_d, src1_d, (char *)dst->data,
@@ -198,7 +193,6 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s
stream
);
break;
#endif
case GGML_TYPE_Q8_0:
set_rows_sycl_q<TIdx, block_q8_0, QK8_0, cpy_blck_f32_q8_0>(src0_d, src1_d, (block_q8_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
break;
+1 -7
View File
@@ -792,7 +792,6 @@ struct vk_device_struct {
vk_pipeline pipeline_arange_f32;
vk_pipeline pipeline_fill_f32;
vk_pipeline pipeline_fill_f16;
vk_pipeline pipeline_geglu[2];
vk_pipeline pipeline_reglu[2];
@@ -4578,7 +4577,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_fill_f16, "fill_f16", fill_f16_len, fill_f16_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
#define CREATE_GLU(name) \
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
@@ -9846,9 +9844,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
if (dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_fill_f32;
}
if (dst->type == GGML_TYPE_F16) {
return ctx->device->pipeline_fill_f16;
}
return nullptr;
default:
return nullptr;
@@ -15718,9 +15713,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|| (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32)
|| (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16);
case GGML_OP_ARANGE:
return op->type == GGML_TYPE_F32;
case GGML_OP_FILL:
return op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
return op->type == GGML_TYPE_F32;
case GGML_OP_SCALE:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_PAD:
@@ -889,7 +889,6 @@ void process_shaders() {
string_to_spv("add1_f32_f32", "add1.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("arange_f32", "arange.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("fill_f32", "fill.comp", {{"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("fill_f16", "fill.comp", {{"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
string_to_spv("step_f16", "step.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("step_f32", "step.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("round_f16", "round.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+173 -386
View File
@@ -44,9 +44,18 @@
// Matrix-vector multiplication parameters
#define WEBGPU_MUL_MAT_VEC_WG_SIZE 256
#define WEBGPU_MUL_MAT_VEC_FLOAT_OUTPUTS_PER_WG 4
#define WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG 4
#define WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG 4
// Must be multiple of 4 to work with vectorized paths, and must divide
// mul_mat_vec wg size
#define WEBGPU_MUL_MAT_VEC_FLOAT_OUTPUTS_PER_WG 64
#define WEBGPU_MUL_MAT_VEC_FLOAT_TILE_K 256
#define WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG 64
#define WEBGPU_MUL_MAT_VEC_LEGACY_Q_TILE_K 256
// Requires 32 threads per output (wg_size/outputs_per_wg == 32)
#define WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG 8
// Requires at least two (and multiple of 2) k-quant blocks per tile
#define WEBGPU_MUL_MAT_VEC_K_Q_TILE_K 512
// default size for legacy matrix multiplication
#define WEBGPU_MUL_MAT_WG_SIZE 256
@@ -69,7 +78,6 @@ struct ggml_webgpu_shader_lib_context {
bool inplace = false;
bool overlap = false;
bool src_overlap = false;
bool supports_subgroups = false;
bool supports_subgroup_matrix = false;
uint32_t sg_mat_m = 0;
uint32_t sg_mat_n = 0;
@@ -194,28 +202,6 @@ struct ggml_webgpu_row_norm_pipeline_key_hash {
}
};
/** RMS_NORM + MUL **/
// Key distinguishing RMS_NORM + MUL pipeline variants by how the operand
// buffers alias one another.
struct ggml_webgpu_rms_norm_mul_pipeline_key {
    bool inplace;      // rn_src == dst
    bool overlap;      // mul_src == dst
    bool src_overlap;  // rn_src == mul_src

    // Two keys are equal only when all three aliasing flags agree.
    bool operator==(const ggml_webgpu_rms_norm_mul_pipeline_key & other) const {
        if (inplace != other.inplace) {
            return false;
        }
        if (overlap != other.overlap) {
            return false;
        }
        return src_overlap == other.src_overlap;
    }
};
// Hash functor for ggml_webgpu_rms_norm_mul_pipeline_key, so the key can be used
// in unordered containers.
struct ggml_webgpu_rms_norm_mul_pipeline_key_hash {
    size_t operator()(const ggml_webgpu_rms_norm_mul_pipeline_key & key) const {
        size_t h = 0;

        // Fold the flags in declaration order; the order is kept so hash values stay unchanged.
        ggml_webgpu_hash_combine(h, key.inplace);
        ggml_webgpu_hash_combine(h, key.overlap);
        ggml_webgpu_hash_combine(h, key.src_overlap);

        return h;
    }
};
/** Pad **/
struct ggml_webgpu_pad_pipeline_key {
bool circular;
@@ -262,46 +248,6 @@ struct ggml_webgpu_ssm_conv_pipeline_key {
}
};
/** CONV 2D */
// Key distinguishing CONV 2D pipeline variants by the element types of the
// weights, the input and the output.
struct ggml_webgpu_conv2d_pipeline_key {
    ggml_type weight_type;
    ggml_type input_type;
    ggml_type output_type;

    // Keys are equal only when all three element types match.
    bool operator==(const ggml_webgpu_conv2d_pipeline_key & other) const {
        if (weight_type != other.weight_type) {
            return false;
        }
        if (input_type != other.input_type) {
            return false;
        }
        return output_type == other.output_type;
    }
};
// Hash functor for ggml_webgpu_conv2d_pipeline_key, so the key can be used in
// unordered containers.
struct ggml_webgpu_conv2d_pipeline_key_hash {
    size_t operator()(const ggml_webgpu_conv2d_pipeline_key & key) const {
        size_t h = 0;

        // Fold the types in declaration order; the order is kept so hash values stay unchanged.
        ggml_webgpu_hash_combine(h, key.weight_type);
        ggml_webgpu_hash_combine(h, key.input_type);
        ggml_webgpu_hash_combine(h, key.output_type);

        return h;
    }
};
/** Im2Col **/
// Key distinguishing Im2Col pipeline variants by input and output element type.
struct ggml_webgpu_im2col_pipeline_key {
    ggml_type input_type;
    ggml_type output_type;

    // Keys are equal only when both element types match.
    bool operator==(const ggml_webgpu_im2col_pipeline_key & other) const {
        if (input_type != other.input_type) {
            return false;
        }
        return output_type == other.output_type;
    }
};
// Hash functor for ggml_webgpu_im2col_pipeline_key, so the key can be used in
// unordered containers.
struct ggml_webgpu_im2col_pipeline_key_hash {
    size_t operator()(const ggml_webgpu_im2col_pipeline_key & key) const {
        size_t h = 0;

        // Input type first, then output type; the order is kept so hash values stay unchanged.
        ggml_webgpu_hash_combine(h, key.input_type);
        ggml_webgpu_hash_combine(h, key.output_type);

        return h;
    }
};
/** Gated Delta Net **/
struct ggml_webgpu_gated_delta_net_pipeline_key {
int type;
@@ -436,27 +382,19 @@ struct ggml_webgpu_unary_pipeline_key_hash {
/** FlashAttention */
// Selects which FlashAttention shader variant a pipeline is built for. The value is
// stored as a plain uint32_t in the pipeline key and in the decisions struct, which
// is why the enum has an explicit uint32_t underlying type and fixed values.
enum ggml_webgpu_flash_attn_path : uint32_t {
// Default path in ggml_webgpu_flash_attn_decisions.
GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX = 0u,
GGML_WEBGPU_FLASH_ATTN_PATH_TILE = 1u,
GGML_WEBGPU_FLASH_ATTN_PATH_VEC = 2u,
};
struct ggml_webgpu_flash_attn_pipeline_key {
ggml_type kv_type;
uint32_t head_dim_qk;
uint32_t head_dim_v;
bool kv_direct;
bool kv_overlap;
bool has_mask;
bool has_sinks;
bool uses_logit_softcap;
uint32_t path;
bool operator==(const ggml_webgpu_flash_attn_pipeline_key & other) const {
return kv_type == other.kv_type && head_dim_qk == other.head_dim_qk && head_dim_v == other.head_dim_v &&
kv_direct == other.kv_direct && kv_overlap == other.kv_overlap && has_mask == other.has_mask &&
has_sinks == other.has_sinks && uses_logit_softcap == other.uses_logit_softcap && path == other.path;
kv_direct == other.kv_direct && has_mask == other.has_mask && has_sinks == other.has_sinks &&
uses_logit_softcap == other.uses_logit_softcap;
}
};
@@ -467,70 +405,39 @@ struct ggml_webgpu_flash_attn_pipeline_key_hash {
ggml_webgpu_hash_combine(seed, key.head_dim_qk);
ggml_webgpu_hash_combine(seed, key.head_dim_v);
ggml_webgpu_hash_combine(seed, key.kv_direct);
ggml_webgpu_hash_combine(seed, key.kv_overlap);
ggml_webgpu_hash_combine(seed, key.has_mask);
ggml_webgpu_hash_combine(seed, key.has_sinks);
ggml_webgpu_hash_combine(seed, key.uses_logit_softcap);
ggml_webgpu_hash_combine(seed, key.path);
return seed;
}
};
struct ggml_webgpu_flash_attn_decisions {
uint32_t path = GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX;
uint32_t q_tile = 0;
uint32_t kv_tile = 0;
uint32_t wg_size = 0;
bool kv_direct = false;
uint32_t q_tile = 0;
uint32_t kv_tile = 0;
uint32_t wg_size = 0;
};
// Element-count alignment used for K/V checks outside the subgroup-matrix path
// (kv_direct eligibility and the f16 vec4 offset test).
inline constexpr uint32_t GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH = 4u;
// Q tile size assumed for the tile path when sizing workgroup memory.
inline constexpr uint32_t GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE = 4u;
// Choose the `vec_ne` value for a FlashAttention pipeline key.
// Anything other than 1 is returned only for the vec path with F16 K/V and equal
// QK and V head dimensions: 4 for a head dimension of 96, 2 for 64, 192 or 576.
// NOTE(review): the meaning of vec_ne is defined by the shader that consumes it — confirm there.
inline uint32_t ggml_webgpu_flash_attn_pick_vec_ne(const ggml_webgpu_flash_attn_pipeline_key & key) {
    const bool eligible = key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC && key.kv_type == GGML_TYPE_F16 &&
                          key.head_dim_qk == key.head_dim_v;
    if (!eligible) {
        return 1u;
    }

    const uint32_t head_dim = key.head_dim_qk;
    if (head_dim == 96) {
        return 4u;
    }
    if (head_dim == 64 || head_dim == 192 || head_dim == 576) {
        return 2u;
    }
    return 1u;
}
// Launch parameters for the FlashAttention vec variant; both fields default to 0
// until filled in.
// NOTE(review): presumably filled in by the vec-path selection logic — confirm against the caller.
struct ggml_webgpu_flash_attn_vec_decisions {
// KV tile size.
uint32_t kv_tile = 0;
// Workgroup size.
uint32_t wg_size = 0;
};
inline ggml_webgpu_flash_attn_pipeline_key ggml_webgpu_flash_attn_make_pipeline_key(
const ggml_webgpu_shader_lib_context & context,
uint32_t path) {
const ggml_webgpu_shader_lib_context & context) {
const bool has_mask = context.src3 != nullptr;
const bool has_sinks = context.src4 != nullptr;
bool kv_direct = false;
if (path != GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
uint32_t kv_direct_align = GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH;
if (path == GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX) {
kv_direct_align = context.sg_mat_k;
}
kv_direct = (context.src1->type == GGML_TYPE_F16) &&
(context.src0->ne[0] % std::max(1u, kv_direct_align) == 0) &&
(context.src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD == 0);
}
const bool kv_direct = (context.src1->type == GGML_TYPE_F16) && (context.src0->ne[0] % context.sg_mat_k == 0) &&
(context.src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD == 0);
ggml_webgpu_flash_attn_pipeline_key key = {};
key.kv_type = context.src1->type;
key.head_dim_qk = (uint32_t) context.src0->ne[0];
key.head_dim_v = (uint32_t) context.src2->ne[0];
key.kv_direct = kv_direct;
key.kv_overlap = context.src_overlap;
key.has_mask = has_mask;
key.has_sinks = has_sinks;
key.uses_logit_softcap = ggml_get_op_params_f32(context.dst, 2) != 0.0f;
key.path = path;
return key;
}
@@ -593,16 +500,8 @@ inline size_t ggml_webgpu_flash_attn_wg_mem_bytes(uint32_t q_tile,
inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_context & context,
const ggml_webgpu_flash_attn_pipeline_key & key) {
const size_t limit_bytes = context.wg_mem_limit_bytes;
uint32_t q_tile = context.sg_mat_m;
uint32_t kv_granularity = context.sg_mat_n;
if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
q_tile = GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE;
kv_granularity = std::max(1u, context.max_subgroup_size);
} else if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
q_tile = 1u;
kv_granularity = 8u;
}
const size_t limit_bytes = context.wg_mem_limit_bytes;
const size_t q_tile = context.sg_mat_m;
const size_t base_q_bytes = (key.head_dim_qk + key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES +
2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES;
size_t bytes_per_kv = 0;
@@ -615,90 +514,23 @@ inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_
bytes_per_kv += q_tile;
bytes_per_kv *= GGML_WEBGPU_F16_SIZE_BYTES;
const uint32_t max_kv_tile = (limit_bytes - base_q_bytes) / bytes_per_kv;
return (max_kv_tile / kv_granularity) * kv_granularity;
return (max_kv_tile / context.sg_mat_n) * context.sg_mat_n;
}
inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
const ggml_webgpu_shader_lib_context & context,
size_t storage_offset_alignment) {
ggml_webgpu_flash_attn_decisions decisions = {};
const size_t alignment = std::max<size_t>(1u, storage_offset_alignment);
const auto * K = context.src1;
const auto * V = context.src2;
GGML_ASSERT(K != nullptr);
GGML_ASSERT(V != nullptr);
inline uint32_t ggml_webgpu_flash_attn_vec_get_kv_tile(const ggml_webgpu_shader_lib_context & context) {
const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
uint32_t kv_tile = std::max(context.sg_mat_n, std::min(32u, min_kv_tile));
kv_tile = (kv_tile / context.sg_mat_n) * context.sg_mat_n;
const auto flash_attn_tensor_offset = [](const ggml_tensor * tensor) -> size_t {
constexpr uintptr_t ptr_base_addr = 0x1000u;
const ggml_tensor * base = tensor->view_src != nullptr ? tensor->view_src : tensor;
return reinterpret_cast<uintptr_t>(base->data) - ptr_base_addr + tensor->view_offs;
};
const uint32_t k_offset_elems =
(uint32_t) ((flash_attn_tensor_offset(K) & (alignment - 1)) / ggml_type_size(K->type));
const uint32_t v_offset_elems =
(uint32_t) ((flash_attn_tensor_offset(V) & (alignment - 1)) / ggml_type_size(V->type));
const bool f16_vec4_aligned = (k_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u) &&
(v_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u);
const bool kv_vec_type_supported =
K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0;
const bool use_vec = context.supports_subgroups && (context.src0->ne[1] < 20) && (context.src0->ne[0] % 32 == 0) &&
(context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
kv_vec_type_supported && (K->type != GGML_TYPE_F16 || f16_vec4_aligned) &&
(context.src2->type == K->type);
const bool use_tile = context.supports_subgroups && !context.supports_subgroup_matrix && K->type == GGML_TYPE_F16 &&
V->type == GGML_TYPE_F16 && f16_vec4_aligned &&
(context.src0->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
(context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) && !use_vec;
decisions.path = use_vec ? GGML_WEBGPU_FLASH_ATTN_PATH_VEC :
use_tile ? GGML_WEBGPU_FLASH_ATTN_PATH_TILE :
GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX;
const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context, decisions.path);
decisions.kv_direct = key.kv_direct;
if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
decisions.q_tile = 1u;
decisions.kv_tile = std::max(8u, std::min(32u, min_kv_tile));
decisions.kv_tile = (decisions.kv_tile / 8u) * 8u;
decisions.wg_size = std::max(1u, std::min<uint32_t>(32u, context.max_subgroup_size));
if (decisions.kv_direct) {
decisions.kv_tile = std::min(decisions.kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
while (GGML_WEBGPU_KV_SEQ_PAD % decisions.kv_tile != 0) {
decisions.kv_tile -= 8u;
}
}
return decisions;
}
decisions.q_tile =
decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ? GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE : context.sg_mat_m;
decisions.kv_tile = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ?
std::min(64u, ggml_webgpu_flash_attn_max_kv_tile(context, key)) :
std::min(ggml_webgpu_flash_attn_max_kv_tile(context, key),
context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
decisions.wg_size = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ?
GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE :
std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE);
if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
const uint32_t tile_kv_granularity = std::max(1u, context.max_subgroup_size);
decisions.kv_tile =
std::max(tile_kv_granularity, (decisions.kv_tile / tile_kv_granularity) * tile_kv_granularity);
}
if (decisions.kv_direct) {
GGML_ASSERT(decisions.kv_tile <= GGML_WEBGPU_KV_SEQ_PAD);
while (GGML_WEBGPU_KV_SEQ_PAD % decisions.kv_tile != 0) {
decisions.kv_tile -= decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ?
std::max(1u, context.max_subgroup_size) :
context.sg_mat_n;
if (key.kv_direct) {
kv_tile = std::min(kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
while (GGML_WEBGPU_KV_SEQ_PAD % kv_tile != 0) {
kv_tile -= context.sg_mat_n;
}
}
return decisions;
return kv_tile;
}
/** Matrix Multiplication **/
@@ -743,6 +575,7 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key_hash {
// Tuning values recorded alongside a compiled mul_mat_vec pipeline.
// The fields carry no default initialisers; the pipeline builder assigns each one.
struct ggml_webgpu_mul_mat_vec_shader_decisions {
// Workgroup size; the same value is emitted to the shader as the WG_SIZE define.
uint32_t wg_size;
// K-dimension tile length; emitted to the shader as the TILE_K define.
uint32_t tile_k;
// Number of outputs handled per workgroup; emitted as the OUTPUTS_PER_WG define.
uint32_t outputs_per_wg;
// 4 when the vectorized shader variant is selected, otherwise 1.
uint32_t vec_size;
};
@@ -910,17 +743,16 @@ class ggml_webgpu_shader_lib {
std::unordered_map<int, webgpu_pipeline> cumsum_pipelines; // key is fixed, no variants yet
std::unordered_map<ggml_webgpu_row_norm_pipeline_key, webgpu_pipeline, ggml_webgpu_row_norm_pipeline_key_hash>
row_norm_pipelines; // op/inplace
std::unordered_map<ggml_webgpu_get_rows_pipeline_key, webgpu_pipeline, ggml_webgpu_get_rows_pipeline_key_hash>
get_rows_pipelines; // src_type, vectorized
get_rows_pipelines; // src_type, vectorized
std::unordered_map<ggml_webgpu_unary_pipeline_key, webgpu_pipeline, ggml_webgpu_unary_pipeline_key_hash>
unary_pipelines; // type/op/inplace
unary_pipelines; // type/op/inplace
std::unordered_map<ggml_webgpu_scale_pipeline_key, webgpu_pipeline, ggml_webgpu_scale_pipeline_key_hash>
scale_pipelines; // inplace
scale_pipelines; // inplace
std::unordered_map<ggml_webgpu_solve_tri_pipeline_key, webgpu_pipeline, ggml_webgpu_solve_tri_pipeline_key_hash>
solve_tri_pipelines; // type
solve_tri_pipelines; // type
std::unordered_map<ggml_webgpu_ssm_conv_pipeline_key, webgpu_pipeline, ggml_webgpu_ssm_conv_pipeline_key_hash>
ssm_conv_pipelines; // type/vectorized
ssm_conv_pipelines; // type/vectorized
std::unordered_map<ggml_webgpu_gated_delta_net_pipeline_key,
webgpu_pipeline,
ggml_webgpu_gated_delta_net_pipeline_key_hash>
@@ -935,6 +767,8 @@ class ggml_webgpu_shader_lib {
repeat_pipelines; // type
std::unordered_map<ggml_webgpu_flash_attn_pipeline_key, webgpu_pipeline, ggml_webgpu_flash_attn_pipeline_key_hash>
flash_attn_pipelines;
std::unordered_map<ggml_webgpu_flash_attn_pipeline_key, webgpu_pipeline, ggml_webgpu_flash_attn_pipeline_key_hash>
flash_attn_vec_pipelines;
std::unordered_map<ggml_webgpu_flash_attn_vec_reduce_pipeline_key,
webgpu_pipeline,
ggml_webgpu_flash_attn_vec_reduce_pipeline_key_hash>
@@ -964,15 +798,6 @@ class ggml_webgpu_shader_lib {
rope_pipelines;
std::unordered_map<ggml_webgpu_soft_max_pipeline_key, webgpu_pipeline, ggml_webgpu_soft_max_pipeline_key_hash>
soft_max_pipelines;
std::unordered_map<ggml_webgpu_conv2d_pipeline_key, webgpu_pipeline, ggml_webgpu_conv2d_pipeline_key_hash>
conv2d_pipelines;
std::unordered_map<ggml_webgpu_im2col_pipeline_key, webgpu_pipeline, ggml_webgpu_im2col_pipeline_key_hash>
im2col_pipelines;
std::unordered_map<ggml_webgpu_rms_norm_mul_pipeline_key,
webgpu_pipeline,
ggml_webgpu_rms_norm_mul_pipeline_key_hash>
rms_norm_mul_pipelines;
public:
// Builds a shader library bound to `device`; the handle is kept in the `device` member, which the
// pipeline getters pass to ggml_webgpu_create_pipeline.
ggml_webgpu_shader_lib(wgpu::Device device) { this->device = device; }
@@ -1501,7 +1326,7 @@ class ggml_webgpu_shader_lib {
ggml_webgpu_mul_mat_vec_pipeline_key key = {};
key.src0_type = context.src0->type;
key.src1_type = context.src1->type;
key.vectorized = (context.src0->ne[0] % 4 == 0 &&
key.vectorized = (context.src0->ne[0] % 4 == 0 && context.dst->ne[0] % 4 == 0 &&
(context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
1 :
0;
@@ -1512,8 +1337,7 @@ class ggml_webgpu_shader_lib {
}
std::vector<std::string> defines;
std::string variant = "mul_mat_vec";
const char * shader_src = wgsl_mul_mat_vec;
std::string variant = "mul_mat_vec";
// src0 type (matrix row)
switch (context.src0->type) {
@@ -1562,25 +1386,25 @@ class ggml_webgpu_shader_lib {
defines.push_back(key.vectorized ? "VEC" : "SCALAR");
uint32_t wg_size = WEBGPU_MUL_MAT_VEC_WG_SIZE;
uint32_t tile_k = WEBGPU_MUL_MAT_VEC_FLOAT_TILE_K;
uint32_t outputs_per_wg = WEBGPU_MUL_MAT_VEC_FLOAT_OUTPUTS_PER_WG;
if (key.src0_type >= GGML_TYPE_Q2_K) {
tile_k = WEBGPU_MUL_MAT_VEC_K_Q_TILE_K;
outputs_per_wg = WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG;
} else if (key.src0_type >= GGML_TYPE_Q4_0) {
tile_k = WEBGPU_MUL_MAT_VEC_LEGACY_Q_TILE_K;
outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG;
}
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
defines.push_back(std::string("TILE_K=") + std::to_string(tile_k));
defines.push_back(std::string("OUTPUTS_PER_WG=") + std::to_string(outputs_per_wg));
defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
variant += context.supports_subgroups ? "_sg_reduce" : "_wg_reduce";
if (key.vectorized) {
variant += "_vectorized";
}
auto processed = preprocessor.preprocess(shader_src, defines);
auto processed = preprocessor.preprocess(wgsl_mul_mat_vec, defines);
auto decisions = std::make_shared<ggml_webgpu_mul_mat_vec_shader_decisions>();
decisions->wg_size = wg_size;
decisions->tile_k = tile_k;
decisions->outputs_per_wg = outputs_per_wg;
decisions->vec_size = key.vectorized ? 4 : 1;
@@ -1989,43 +1813,6 @@ class ggml_webgpu_shader_lib {
return unary_pipelines[key];
}
// Returns the RMS_NORM_MUL pipeline matching the aliasing flags in `context`, compiling and
// caching it on first use.
//
// The cache key is (inplace, overlap, src_overlap). At most one aliasing define is emitted, with
// precedence inplace > overlap > src_overlap. The workgroup size is context.max_wg_size and is
// also stored in the pipeline's generic decisions object.
webgpu_pipeline get_rms_norm_mul_pipeline(const ggml_webgpu_shader_lib_context & context) {
    ggml_webgpu_rms_norm_mul_pipeline_key key = {};
    key.inplace     = context.inplace;
    key.overlap     = context.overlap;
    key.src_overlap = context.src_overlap;

    const auto cached = rms_norm_mul_pipelines.find(key);
    if (cached != rms_norm_mul_pipelines.end()) {
        return cached->second;
    }

    std::vector<std::string> defines;
    std::string              variant = "RMS_NORM_MUL";

    // Records one aliasing mode: the preprocessor define and the matching variant-name suffix.
    const auto select_mode = [&defines, &variant](const char * define, const char * suffix) {
        defines.push_back(define);
        variant += suffix;
    };

    if (key.inplace) {
        select_mode("INPLACE", "_inplace");
    } else if (key.overlap) {
        select_mode("OVERLAP", "_overlap");
    } else if (key.src_overlap) {
        select_mode("SRC_OVERLAP", "_src_overlap");
    }

    defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));

    auto shader_code = preprocessor.preprocess(wgsl_rms_norm_mul, defines);

    auto generic_decisions     = std::make_shared<ggml_webgpu_generic_shader_decisions>();
    generic_decisions->wg_size = context.max_wg_size;

    webgpu_pipeline built = ggml_webgpu_create_pipeline(device, shader_code, variant);
    built.context         = generic_decisions;

    auto & slot = rms_norm_mul_pipelines[key];
    slot        = built;
    return slot;
}
webgpu_pipeline get_binary_pipeline(const ggml_webgpu_shader_lib_context & context) {
ggml_webgpu_binary_pipeline_key key = {};
key.type = context.dst->type;
@@ -2156,19 +1943,14 @@ class ggml_webgpu_shader_lib {
return repeat_pipelines[key];
}
webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_shader_lib_context & context,
size_t storage_offset_alignment) {
const ggml_webgpu_flash_attn_decisions decisions =
ggml_webgpu_flash_attn_get_decisions(context, storage_offset_alignment);
ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context, decisions.path);
auto it = flash_attn_pipelines.find(key);
webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_shader_lib_context & context) {
const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
auto it = flash_attn_pipelines.find(key);
if (it != flash_attn_pipelines.end()) {
return it->second;
}
std::vector<std::string> defines;
std::string variant = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC ? "flash_attn_vec" :
decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ? "flash_attn_tile" :
"flash_attn";
std::string variant = "flash_attn";
switch (key.kv_type) {
case GGML_TYPE_F32:
@@ -2190,12 +1972,7 @@ class ggml_webgpu_shader_lib {
if (key.has_mask) {
defines.push_back("MASK");
if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
defines.push_back("BLK");
variant += "_mask_blk";
} else {
variant += "_mask";
}
variant += "_mask";
}
if (key.has_sinks) {
defines.push_back("SINKS");
@@ -2209,9 +1986,88 @@ class ggml_webgpu_shader_lib {
defines.push_back("KV_DIRECT");
variant += "_kvdirect";
}
if (key.kv_overlap) {
defines.push_back("KV_OVERLAP");
variant += "_kv_overlap";
defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(key.head_dim_qk));
variant += std::string("_hsqk") + std::to_string(key.head_dim_qk);
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v));
variant += std::string("_hsv") + std::to_string(key.head_dim_v);
defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
auto decisions = std::make_shared<ggml_webgpu_flash_attn_decisions>();
decisions->q_tile = context.sg_mat_m;
const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
uint32_t kv_tile = std::min(min_kv_tile, context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
if (key.kv_direct) {
kv_tile = std::min(kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
while (GGML_WEBGPU_KV_SEQ_PAD % kv_tile != 0) {
kv_tile -= context.sg_mat_n;
}
}
decisions->kv_tile = kv_tile;
decisions->wg_size = std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE);
defines.push_back(std::string("Q_TILE=") + std::to_string(decisions->q_tile));
defines.push_back(std::string("KV_TILE=") + std::to_string(decisions->kv_tile));
defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions->wg_size));
webgpu_pipeline pipeline =
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn, defines), variant);
pipeline.context = decisions;
flash_attn_pipelines[key] = pipeline;
return flash_attn_pipelines[key];
}
webgpu_pipeline get_flash_attn_vec_pipeline(const ggml_webgpu_shader_lib_context & context) {
const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
auto it = flash_attn_vec_pipelines.find(key);
if (it != flash_attn_vec_pipelines.end()) {
return it->second;
}
std::vector<std::string> defines;
std::string variant = "flash_attn_vec";
switch (key.kv_type) {
case GGML_TYPE_F32:
defines.push_back("KV_F32");
break;
case GGML_TYPE_F16:
defines.push_back("KV_F16");
break;
case GGML_TYPE_Q4_0:
defines.push_back("KV_Q4_0");
break;
case GGML_TYPE_Q8_0:
defines.push_back("KV_Q8_0");
break;
default:
GGML_ABORT("Unsupported KV type for flash attention shader");
}
variant += std::string("_") + ggml_type_name(key.kv_type);
if (key.has_mask) {
defines.push_back("MASK");
defines.push_back("BLK");
variant += "_mask_blk";
}
if (key.has_sinks) {
defines.push_back("SINKS");
variant += "_sinks";
}
if (key.uses_logit_softcap) {
defines.push_back("LOGIT_SOFTCAP");
variant += "_lgsc";
}
if (key.kv_direct) {
defines.push_back("KV_DIRECT");
variant += "_kvdirect";
}
defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(key.head_dim_qk));
@@ -2220,37 +2076,46 @@ class ggml_webgpu_shader_lib {
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v));
variant += std::string("_hsv") + std::to_string(key.head_dim_v);
const char * shader_src = wgsl_flash_attn;
if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
defines.push_back("KV_GRANULARITY=8");
defines.push_back(std::string("VEC_NE=") + std::to_string(ggml_webgpu_flash_attn_pick_vec_ne(key)) + "u");
shader_src = wgsl_flash_attn_vec_split;
} else if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
shader_src = wgsl_flash_attn_tile;
defines.push_back("MAX_SUBGROUP_SIZE=" + std::to_string(context.max_subgroup_size));
defines.push_back("KV_STAGE_STRIDE=" + std::to_string(std::max(key.head_dim_qk, key.head_dim_v)));
variant += "_tile";
} else {
defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
defines.push_back("Q_TILE=1");
auto decisions = std::make_shared<ggml_webgpu_flash_attn_vec_decisions>();
decisions->kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(context);
decisions->wg_size = std::max(1u, std::min<uint32_t>(32u, context.max_subgroup_size));
uint32_t vec_ne = 1u;
// Keep conservative defaults unless this is the f16 vec-split shape family.
if (key.kv_type == GGML_TYPE_F16 && key.head_dim_qk == key.head_dim_v) {
switch (key.head_dim_qk) {
case 64:
case 192:
case 576:
vec_ne = 2u;
break;
case 96:
vec_ne = 4u;
break;
default:
break;
}
}
auto pipeline_decisions = std::make_shared<ggml_webgpu_flash_attn_decisions>(decisions);
defines.push_back(std::string("Q_TILE=") + std::to_string(decisions.q_tile));
defines.push_back(std::string("KV_TILE=") + std::to_string(decisions.kv_tile));
defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions.wg_size));
defines.push_back(std::string("KV_TILE=") + std::to_string(decisions->kv_tile));
defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions->wg_size));
defines.push_back(std::string("VEC_NE=") + std::to_string(vec_ne) + "u");
webgpu_pipeline pipeline =
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(shader_src, defines), variant);
pipeline.context = pipeline_decisions;
flash_attn_pipelines[key] = pipeline;
return flash_attn_pipelines[key];
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn_vec_split, defines), variant);
pipeline.context = decisions;
flash_attn_vec_pipelines[key] = pipeline;
return flash_attn_vec_pipelines[key];
}
webgpu_pipeline get_flash_attn_blk_pipeline(const ggml_webgpu_shader_lib_context & context, uint32_t kv_tile) {
webgpu_pipeline get_flash_attn_blk_pipeline(const ggml_webgpu_shader_lib_context & context) {
ggml_webgpu_flash_attn_blk_pipeline_key key = {};
key.kv_tile = kv_tile;
key.kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(context);
auto it = flash_attn_blk_pipelines.find(key);
if (it != flash_attn_blk_pipelines.end()) {
return it->second;
@@ -2525,84 +2390,6 @@ class ggml_webgpu_shader_lib {
return soft_max_pipelines[key];
}
// Returns the CONV_2D pipeline for the (weight, input, output) type triple taken from
// context.src0, context.src1 and context.dst, compiling and caching it on first use.
//
// Only F32 and F16 are accepted for each of the three tensors; any other type aborts.
// The workgroup size is context.max_wg_size and is stored in the pipeline's decisions object.
webgpu_pipeline get_conv2d_pipeline(const ggml_webgpu_shader_lib_context & context) {
    ggml_webgpu_conv2d_pipeline_key key = {};
    key.weight_type                     = context.src0->type;
    key.input_type                      = context.src1->type;
    key.output_type                     = context.dst->type;

    const auto cached = conv2d_pipelines.find(key);
    if (cached != conv2d_pipelines.end()) {
        return cached->second;
    }

    // Maps a tensor type to the suffix appended to the WEIGHT/INPUT/OUTPUT define prefix.
    const auto type_suffix = [](ggml_type type) -> const char * {
        switch (type) {
            case GGML_TYPE_F32:
                return "_F32";
            case GGML_TYPE_F16:
                return "_F16";
            default:
                GGML_ABORT("Unsupported type for CONV_2D shader");
        }
    };

    std::vector<std::string> defines;
    defines.push_back(std::string("WEIGHT") + type_suffix(key.weight_type));
    defines.push_back(std::string("INPUT") + type_suffix(key.input_type));
    defines.push_back(std::string("OUTPUT") + type_suffix(key.output_type));
    defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));

    const std::string variant     = "conv_2d";
    auto              shader_code = preprocessor.preprocess(wgsl_conv2d, defines);

    auto generic_decisions     = std::make_shared<ggml_webgpu_generic_shader_decisions>();
    generic_decisions->wg_size = context.max_wg_size;

    webgpu_pipeline built = ggml_webgpu_create_pipeline(device, shader_code, variant);
    built.context         = generic_decisions;

    auto & slot = conv2d_pipelines[key];
    slot        = built;
    return slot;
}
// Returns the IM2COL pipeline for the (input, output) type pair taken from context.src1 and
// context.dst, compiling and caching it on first use.
//
// Only F32 and F16 are accepted for either tensor; any other type aborts.
// The workgroup size is context.max_wg_size and is stored in the pipeline's decisions object.
webgpu_pipeline get_im2col_pipeline(const ggml_webgpu_shader_lib_context & context) {
    ggml_webgpu_im2col_pipeline_key key = {};
    key.input_type                      = context.src1->type;
    key.output_type                     = context.dst->type;

    const auto cached = im2col_pipelines.find(key);
    if (cached != im2col_pipelines.end()) {
        return cached->second;
    }

    // Maps a tensor type to the suffix appended to the INPUT/OUTPUT define prefix.
    const auto type_suffix = [](ggml_type type) -> const char * {
        switch (type) {
            case GGML_TYPE_F32:
                return "_F32";
            case GGML_TYPE_F16:
                return "_F16";
            default:
                GGML_ABORT("Unsupported type for IM2COL shader");
        }
    };

    std::vector<std::string> defines;
    defines.push_back(std::string("INPUT") + type_suffix(key.input_type));
    defines.push_back(std::string("OUTPUT") + type_suffix(key.output_type));
    defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));

    const std::string variant     = "im2col";
    auto              shader_code = preprocessor.preprocess(wgsl_im2col, defines);

    auto generic_decisions     = std::make_shared<ggml_webgpu_generic_shader_decisions>();
    generic_decisions->wg_size = context.max_wg_size;

    webgpu_pipeline built = ggml_webgpu_create_pipeline(device, shader_code, variant);
    built.context         = generic_decisions;

    auto & slot = im2col_pipelines[key];
    slot        = built;
    return slot;
}
private:
static webgpu_pipeline ggml_webgpu_create_pipeline(wgpu::Device & device,
std::string shader_code,
+112 -559
View File
@@ -8,7 +8,6 @@
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "ggml-webgpu-shader-lib.hpp"
#include "ggml.h"
#ifdef __EMSCRIPTEN__
# include <emscripten/emscripten.h>
@@ -182,7 +181,6 @@ struct webgpu_dispatch_desc {
struct webgpu_capabilities {
wgpu::Limits limits;
bool supports_subgroups = false;
bool supports_subgroup_matrix = false;
uint32_t sg_mat_m = 0;
@@ -212,7 +210,6 @@ struct webgpu_global_context_struct {
wgpu::Buffer memset_params_buf;
webgpu_pipeline memset_pipeline;
// TODO: We should rework the CPU profiling time handling to make it more useful. ref: https://github.com/ggml-org/llama.cpp/pull/22050
#ifdef GGML_WEBGPU_CPU_PROFILE
// Profiling: labeled CPU time in ms (total)
std::unordered_map<std::string, double> cpu_time_ms;
@@ -220,6 +217,11 @@ struct webgpu_global_context_struct {
std::unordered_map<std::string, double> cpu_detail_ms;
#endif
#ifdef GGML_WEBGPU_GPU_PROFILE
// Profiling: per-shader GPU time in ms
std::unordered_map<std::string, double> shader_gpu_time_ms;
#endif
#ifdef GGML_WEBGPU_DEBUG
wgpu::Buffer debug_host_buf;
wgpu::Buffer debug_dev_buf;
@@ -265,12 +267,10 @@ struct webgpu_context_struct {
size_t memset_bytes_per_thread;
#ifdef GGML_WEBGPU_GPU_PROFILE
// Profiling: per-shader GPU time in ms
std::unordered_map<std::string, double> shader_gpu_time_ms;
wgpu::Buffer profile_timestamp_dev_buf;
wgpu::Buffer profile_timestamp_host_buf;
wgpu::QuerySet profile_timestamp_query_set;
uint32_t profile_timestamp_query_count = 0;
wgpu::Buffer profile_timestamp_dev_buf;
wgpu::Buffer profile_timestamp_host_buf;
wgpu::QuerySet profile_timestamp_query_set;
uint32_t profile_timestamp_query_count = 0;
#endif
~webgpu_context_struct() {
@@ -389,6 +389,23 @@ static size_t ggml_webgpu_tensor_misalignment(webgpu_context & ctx, const ggml_t
return offset & (ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment - 1);
}
// Decides whether flash attention for (Q, K, V) should use the "vec" shader path.
//
// The vec path is chosen only when all of the following hold:
//   - Q->ne[1] < 20, Q->ne[0] is a multiple of 32 and V->ne[0] is a multiple of 4;
//   - K is F16, Q4_0 or Q8_0, and V has the same type as K;
//   - for F16 K/V, the element offset of each tensor inside its
//     minStorageBufferOffsetAlignment window is a multiple of 4.
static bool ggml_webgpu_flash_attn_use_vec(webgpu_global_context & global_ctx,
                                           const ggml_tensor *     Q,
                                           const ggml_tensor *     K,
                                           const ggml_tensor *     V) {
    const size_t alignment  = global_ctx->capabilities.limits.minStorageBufferOffsetAlignment;
    const size_t align_mask = alignment - 1;

    // Offset of a tensor within its alignment window, expressed in units of ggml_type_size.
    const auto window_offset_elems = [align_mask](const ggml_tensor * t) -> uint32_t {
        return (uint32_t) ((ggml_webgpu_tensor_offset(t) & align_mask) / ggml_type_size(t->type));
    };
    const uint32_t k_elems = window_offset_elems(K);
    const uint32_t v_elems = window_offset_elems(V);

    const bool offsets_vec4_aligned = (k_elems % 4u == 0u) && (v_elems % 4u == 0u);

    const bool k_is_f16 = K->type == GGML_TYPE_F16;
    const bool k_type_supported =
        k_is_f16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0;
    const bool shapes_ok = (Q->ne[1] < 20) && (Q->ne[0] % 32 == 0) && (V->ne[0] % 4 == 0);

    if (!shapes_ok || !k_type_supported) {
        return false;
    }
    if (k_is_f16 && !offsets_vec4_aligned) {
        return false;
    }
    return V->type == K->type;
}
static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, const ggml_tensor * t) {
size_t offset = ggml_webgpu_tensor_offset(t);
return offset & ~(ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment - 1);
@@ -695,12 +712,12 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) {
#ifdef GGML_WEBGPU_GPU_PROFILE
std::cout << "\n[ggml_webgpu gpu profiling summary]\n";
double total_gpu = 0.0;
for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) {
for (const auto & kv : ctx->webgpu_ctx->global_ctx->shader_gpu_time_ms) {
total_gpu += kv.second;
}
std::cout << "ggml_webgpu: total gpu time (all shaders): " << total_gpu << " ms\n";
std::cout << "\nggml_webgpu: gpu breakdown:\n";
for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) {
for (const auto & kv : ctx->webgpu_ctx->global_ctx->shader_gpu_time_ms) {
double pct = (total_gpu > 0.0) ? (kv.second / total_gpu * 100.0) : 0.0;
std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << std::fixed << std::setprecision(2)
<< pct << "%)\n";
@@ -905,170 +922,6 @@ static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context & ctx,
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
}
// Encodes a CONV_2D node: src0 is bound at slot 0, src1 at slot 1 and dst at slot 2.
//
// The uniform parameter block is laid out, in order, as:
//   [0..2]   misalignment of src0, src1, dst (in units of ggml_type_size)
//   [3..14]  nb[0..3] of src0, src1, dst (in units of ggml_type_size)
//   [15..17] src0->ne[0..2]
//   [18..19] src1->ne[0..1]
//   [20..23] dst->ne[0..3]
//   [24..29] op params 0..5 of dst (named s0, s1, p0, p1, d0, d1 by the original code)
// Dispatch uses one invocation per dst element, folded into a 2D workgroup grid.
static webgpu_encoded_op ggml_webgpu_conv_2d(webgpu_context & ctx,
                                             ggml_tensor *    src0,
                                             ggml_tensor *    src1,
                                             ggml_tensor *    dst) {
    // Misalignment of the tensor's binding, converted from bytes to type-size units.
    const auto offset_elems = [&ctx](ggml_tensor * t) -> uint32_t {
        return (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, t) / ggml_type_size(t->type));
    };
    // Byte stride along `dim`, converted to type-size units.
    const auto stride_elems = [](ggml_tensor * t, int dim) -> uint32_t {
        return (uint32_t) (t->nb[dim] / ggml_type_size(t->type));
    };

    ggml_tensor * const tensors[] = { src0, src1, dst };

    std::vector<uint32_t> params;
    params.reserve(30);

    for (ggml_tensor * t : tensors) {
        params.push_back(offset_elems(t));
    }
    for (ggml_tensor * t : tensors) {
        for (int dim = 0; dim < 4; ++dim) {
            params.push_back(stride_elems(t, dim));
        }
    }

    const int64_t extents[] = {
        src0->ne[0], src0->ne[1], src0->ne[2],             //
        src1->ne[0], src1->ne[1],                          //
        dst->ne[0],  dst->ne[1],  dst->ne[2],  dst->ne[3],
    };
    for (int64_t extent : extents) {
        params.push_back((uint32_t) extent);
    }

    // Op params 0..5 are forwarded verbatim, reinterpreted as unsigned.
    for (uint32_t i = 0; i < 6; ++i) {
        params.push_back((uint32_t) ggml_get_op_params_i32(dst, i));
    }

    std::vector<wgpu::BindGroupEntry> entries;
    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0));
    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1));
    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst));

    const auto & limits = ctx->global_ctx->capabilities.limits;

    ggml_webgpu_shader_lib_context shader_lib_ctx = {};
    shader_lib_ctx.src0                           = src0;
    shader_lib_ctx.src1                           = src1;
    shader_lib_ctx.dst                            = dst;
    shader_lib_ctx.max_wg_size                    = limits.maxComputeInvocationsPerWorkgroup;

    webgpu_pipeline pipeline = ctx->shader_lib->get_conv2d_pipeline(shader_lib_ctx);
    auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());

    // Spread the workgroups over x first, spilling into y once the per-dimension limit is hit.
    const uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
    const uint32_t wg_x     = std::min(limits.maxComputeWorkgroupsPerDimension, total_wg);
    const uint32_t wg_y     = CEIL_DIV(total_wg, wg_x);

    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
}
// Encodes an IM2COL node: src1 is bound at slot 0 and dst at slot 1. src0 contributes only its
// extents (KW/KH/IC) and is passed to the shader library for pipeline selection.
//
// Op param 6 selects the 2D layout when it equals 1. In the 1D layout the height-related values
// (KH, IH, OH) are 1, the row strides si1/so2 are 0, and the remaining extents and strides are
// read one dimension lower than in the 2D layout.
//
// The uniform parameter block is laid out, in order, as:
//   src1 misalignment, dst misalignment, si0..si3, so0..so3,
//   KW, KH, IC, IW, IH, N, OW, OH, then op params 0..5 (s0, s1, p0, p1, d0, d1).
static webgpu_encoded_op ggml_webgpu_im2col(webgpu_context & ctx,
                                            ggml_tensor *    src0,
                                            ggml_tensor *    src1,
                                            ggml_tensor *    dst) {
    const bool is_2D = ggml_get_op_params_i32(dst, 6) == 1;

    // Byte strides converted to type-size units for the input (src1) and output (dst).
    const auto in_stride = [src1](int dim) -> uint32_t {
        return (uint32_t) (src1->nb[dim] / ggml_type_size(src1->type));
    };
    const auto out_stride = [dst](int dim) -> uint32_t {
        return (uint32_t) (dst->nb[dim] / ggml_type_size(dst->type));
    };

    // Values shared by both layouts.
    const uint32_t KW  = (uint32_t) src0->ne[0];
    const uint32_t IW  = (uint32_t) src1->ne[0];
    const uint32_t OW  = (uint32_t) dst->ne[1];
    const uint32_t si0 = in_stride(0);
    const uint32_t so0 = out_stride(0);
    const uint32_t so1 = out_stride(1);

    // Layout-dependent values, initialised to the 1D defaults where those are constants.
    uint32_t KH  = 1;
    uint32_t IH  = 1;
    uint32_t OH  = 1;
    uint32_t si1 = 0;
    uint32_t so2 = 0;
    uint32_t IC;
    uint32_t N;
    uint32_t si2;
    uint32_t si3;
    uint32_t so3;

    if (is_2D) {
        KH  = (uint32_t) src0->ne[1];
        IC  = (uint32_t) src0->ne[2];
        IH  = (uint32_t) src1->ne[1];
        N   = (uint32_t) src1->ne[3];
        OH  = (uint32_t) dst->ne[2];
        si1 = in_stride(1);
        si2 = in_stride(2);
        si3 = in_stride(3);
        so2 = out_stride(2);
        so3 = out_stride(3);
    } else {
        IC  = (uint32_t) src0->ne[1];
        N   = (uint32_t) src1->ne[2];
        si2 = in_stride(1);
        si3 = in_stride(2);
        so3 = out_stride(2);
    }

    std::vector<uint32_t> params = {
        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
        si0, si1, si2, si3,
        so0, so1, so2, so3,
        KW, KH, IC, IW, IH, N, OW, OH,
    };
    // Op params 0..5 are forwarded verbatim, reinterpreted as unsigned.
    for (uint32_t i = 0; i < 6; ++i) {
        params.push_back((uint32_t) ggml_get_op_params_i32(dst, i));
    }

    std::vector<wgpu::BindGroupEntry> entries;
    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src1));
    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst));

    const auto & limits = ctx->global_ctx->capabilities.limits;

    ggml_webgpu_shader_lib_context shader_lib_ctx = {};
    shader_lib_ctx.src0                           = src0;
    shader_lib_ctx.src1                           = src1;
    shader_lib_ctx.dst                            = dst;
    shader_lib_ctx.max_wg_size                    = limits.maxComputeInvocationsPerWorkgroup;

    webgpu_pipeline pipeline = ctx->shader_lib->get_im2col_pipeline(shader_lib_ctx);
    auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());

    // Spread the workgroups over x first, spilling into y once the per-dimension limit is hit.
    const uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
    const uint32_t wg_x     = std::min(limits.maxComputeWorkgroupsPerDimension, total_wg);
    const uint32_t wg_y     = CEIL_DIV(total_wg, wg_x);

    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
}
static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context & ctx,
ggml_tensor * src0,
ggml_tensor * src1,
@@ -1311,11 +1164,14 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q6_K:
use_fast = true;
break;
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q2_K:
use_fast = true;
// we don't have fast mat-vec for these types, but we do have (semi) fast mat-mat
use_fast = !is_vec;
break;
default:
break;
@@ -1326,12 +1182,10 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
}
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
shader_lib_ctx.src0 = src0;
shader_lib_ctx.src1 = src1;
shader_lib_ctx.dst = dst;
shader_lib_ctx.src0 = src0;
shader_lib_ctx.src1 = src1;
shader_lib_ctx.dst = dst;
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups;
shader_lib_ctx.supports_subgroup_matrix = ctx->global_ctx->capabilities.supports_subgroup_matrix;
shader_lib_ctx.sg_mat_m = ctx->global_ctx->capabilities.sg_mat_m;
shader_lib_ctx.sg_mat_n = ctx->global_ctx->capabilities.sg_mat_n;
@@ -1433,8 +1287,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
// Get or create pipeline
webgpu_pipeline gather_pipeline;
webgpu_pipeline main_pipeline;
webgpu_pipeline gather_pipeline, main_pipeline;
std::vector<webgpu_dispatch_desc> dispatches;
@@ -1550,6 +1403,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
return ggml_backend_webgpu_build_multi(ctx, dispatches);
}
#ifndef __EMSCRIPTEN__
static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
ggml_tensor * Q,
ggml_tensor * K,
@@ -1567,29 +1421,13 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
const int has_mask = (mask != nullptr);
const int has_sinks = (sinks != nullptr);
const bool kv_overlap = ggml_webgpu_tensor_overlap(K, V) && K->type == V->type;
uint32_t offset_k = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, K) / ggml_type_size(K->type));
uint32_t offset_v = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, V) / ggml_type_size(V->type));
size_t kv_bind_offset = 0;
size_t kv_bind_size = 0;
if (kv_overlap) {
const size_t k_bind_offset = ggml_webgpu_tensor_align_offset(ctx, K);
const size_t v_bind_offset = ggml_webgpu_tensor_align_offset(ctx, V);
const size_t k_bind_end = k_bind_offset + ggml_webgpu_tensor_binding_size(ctx, K);
const size_t v_bind_end = v_bind_offset + ggml_webgpu_tensor_binding_size(ctx, V);
kv_bind_offset = std::min(k_bind_offset, v_bind_offset);
kv_bind_size = std::max(k_bind_end, v_bind_end) - kv_bind_offset;
offset_k = (uint32_t) ((ggml_webgpu_tensor_offset(K) - kv_bind_offset) / ggml_type_size(K->type));
offset_v = (uint32_t) ((ggml_webgpu_tensor_offset(V) - kv_bind_offset) / ggml_type_size(V->type));
}
const int has_mask = (mask != nullptr);
const int has_sinks = (sinks != nullptr);
std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, Q) / ggml_type_size(Q->type)),
offset_k,
offset_v,
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, K) / ggml_type_size(K->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, V) / ggml_type_size(V->type)),
has_mask ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mask) / ggml_type_size(mask->type)) : 0,
has_sinks ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, sinks) / ggml_type_size(sinks->type)) : 0,
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
@@ -1617,15 +1455,10 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
};
std::vector<wgpu::BindGroupEntry> entries = {
ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, Q),
ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, K),
ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, V),
};
if (kv_overlap) {
entries.push_back(
ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), kv_bind_offset, kv_bind_size));
} else {
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, K));
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, V));
}
uint32_t binding_index = kv_overlap ? 2u : 3u;
uint32_t binding_index = 3;
if (has_mask) {
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, binding_index++, mask));
}
@@ -1641,25 +1474,25 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
shader_lib_ctx.src3 = mask;
shader_lib_ctx.src4 = sinks;
shader_lib_ctx.dst = dst;
shader_lib_ctx.src_overlap = kv_overlap;
shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups;
shader_lib_ctx.supports_subgroup_matrix = ctx->global_ctx->capabilities.supports_subgroup_matrix;
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
shader_lib_ctx.wg_mem_limit_bytes = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
shader_lib_ctx.sg_mat_m = ctx->global_ctx->capabilities.sg_mat_m;
shader_lib_ctx.sg_mat_n = ctx->global_ctx->capabilities.sg_mat_n;
shader_lib_ctx.sg_mat_k = ctx->global_ctx->capabilities.sg_mat_k;
shader_lib_ctx.max_subgroup_size = ctx->global_ctx->capabilities.max_subgroup_size;
webgpu_pipeline pipeline = ctx->shader_lib->get_flash_attn_pipeline(
shader_lib_ctx, ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
auto * decisions = static_cast<ggml_webgpu_flash_attn_decisions *>(pipeline.context.get());
const bool use_vec = ggml_webgpu_flash_attn_use_vec(ctx->global_ctx, Q, K, V);
webgpu_pipeline pipeline = use_vec ? ctx->shader_lib->get_flash_attn_vec_pipeline(shader_lib_ctx) :
ctx->shader_lib->get_flash_attn_pipeline(shader_lib_ctx);
if (decisions->path != GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
if (!use_vec) {
auto * decisions = static_cast<ggml_webgpu_flash_attn_decisions *>(pipeline.context.get());
uint32_t wg_per_head = CEIL_DIV(Q->ne[1], decisions->q_tile);
uint32_t wg_x = wg_per_head * Q->ne[2] * Q->ne[3]; // wg per head * number of heads * number of batches
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
}
auto * decisions = static_cast<ggml_webgpu_flash_attn_vec_decisions *>(pipeline.context.get());
wgpu::Buffer blk_buf = {};
uint64_t blk_size_bytes = 0;
uint32_t blk_nblk0 = 0;
@@ -1698,12 +1531,10 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
tmp_bind_size = tmp_size_bytes;
scratch_offset = ROUNDUP_POW2(scratch_offset + tmp_size_bytes, align_bytes);
} else {
// nwg==1 writes final dst directly in vec-split; bind tmp to a tiny non-overlapping scratch region.
tmp_size_bytes = WEBGPU_STORAGE_BUF_BINDING_MULT;
// nwg==1 writes final dst directly in vec-split; keep tmp binding valid without extra allocation.
tmp_buf = ggml_webgpu_tensor_buf(dst);
tmp_bind_offset = scratch_offset;
tmp_bind_size = tmp_size_bytes;
scratch_offset = ROUNDUP_POW2(scratch_offset + tmp_size_bytes, align_bytes);
tmp_bind_offset = ggml_webgpu_tensor_align_offset(ctx, dst);
tmp_bind_size = ggml_webgpu_tensor_binding_size(ctx, dst);
}
webgpu_pipeline blk_pipeline;
@@ -1718,7 +1549,7 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
const uint64_t blk_elems = (uint64_t) blk_nblk0 * blk_nblk1 * blk_batch_count;
blk_size_bytes = ROUNDUP_POW2(blk_elems * sizeof(uint32_t), WEBGPU_STORAGE_BUF_BINDING_MULT);
const ggml_webgpu_shader_lib_context blk_shader_ctx = shader_lib_ctx;
blk_pipeline = ctx->shader_lib->get_flash_attn_blk_pipeline(blk_shader_ctx, decisions->kv_tile);
blk_pipeline = ctx->shader_lib->get_flash_attn_blk_pipeline(blk_shader_ctx);
blk_params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mask) / ggml_type_size(mask->type)), // offset_mask
@@ -1750,19 +1581,12 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
std::vector<wgpu::BindGroupEntry> split_entries = {
ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(Q), ggml_webgpu_tensor_align_offset(ctx, Q),
ggml_webgpu_tensor_binding_size(ctx, Q)),
ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), ggml_webgpu_tensor_align_offset(ctx, K),
ggml_webgpu_tensor_binding_size(ctx, K)),
ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(V), ggml_webgpu_tensor_align_offset(ctx, V),
ggml_webgpu_tensor_binding_size(ctx, V)),
};
if (kv_overlap) {
split_entries.push_back(
ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), kv_bind_offset, kv_bind_size));
} else {
split_entries.push_back(ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K),
ggml_webgpu_tensor_align_offset(ctx, K),
ggml_webgpu_tensor_binding_size(ctx, K)));
split_entries.push_back(ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(V),
ggml_webgpu_tensor_align_offset(ctx, V),
ggml_webgpu_tensor_binding_size(ctx, V)));
}
uint32_t split_binding_index = kv_overlap ? 2u : 3u;
uint32_t split_binding_index = 3;
if (has_mask) {
split_entries.push_back(ggml_webgpu_make_bind_group_entry(split_binding_index++, ggml_webgpu_tensor_buf(mask),
ggml_webgpu_tensor_align_offset(ctx, mask),
@@ -1832,6 +1656,7 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
return ggml_backend_webgpu_build_multi(ctx, dispatches);
}
#endif // __EMSCRIPTEN__
static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
bool is_unary = dst->op == GGML_OP_UNARY;
@@ -2066,96 +1891,6 @@ static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context & ctx, ggml_tensor *
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
}
// Encodes a single fused kernel for an RMS_NORM node followed by a MUL node.
//
// Parameters:
//   rn_src   - input of the RMS_NORM node
//   rn_dst   - the RMS_NORM node itself (its op params supply epsilon)
//   mul_src0 - first operand of the MUL node
//   mul_src1 - second operand of the MUL node
//   dst      - the MUL node, i.e. the final output
//
// One of mul_src0 / mul_src1 must be the RMS_NORM output; the other one is the
// tensor that the normalized rows are multiplied with (`mul_src`).
// The function aborts if neither operand matches rn_dst.
//
// The return type is std::optional, but every non-aborting path returns a value.
static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context & ctx,
ggml_tensor * rn_src,
ggml_tensor * rn_dst,
ggml_tensor * mul_src0,
ggml_tensor * mul_src1,
ggml_tensor * dst) {
// Pick the MUL operand that is NOT the RMS_NORM result.
ggml_tensor * mul_src;
if (ggml_webgpu_tensor_equal(rn_dst, mul_src0)) {
mul_src = mul_src1;
} else if (ggml_webgpu_tensor_equal(rn_dst, mul_src1)) {
mul_src = mul_src0;
} else {
GGML_ABORT("rms_norm must be equal to the one of mul_src0 and mul_src1");
}
// overlap:     the non-normalized MUL operand is the same tensor as dst
// inplace:     the RMS_NORM input is the same tensor as dst
// src_overlap: the RMS_NORM input and the other MUL operand overlap
bool overlap = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) ||
(ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst));
bool inplace = ggml_webgpu_tensor_equal(rn_src, dst);
bool src_overlap = ggml_webgpu_tensor_overlap(rn_src, mul_src);
// Element offsets of each source relative to the start of the merged binding;
// they stay 0 unless the two sources overlap.
uint32_t offset_merged_rn_src = 0;
uint32_t offset_merged_mul_src = 0;
size_t rn_src_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, rn_src);
size_t mul_src_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, mul_src);
if (src_overlap) {
// The merged binding starts at the lower of the two aligned offsets.
size_t min_offset = std::min(rn_src_webgpu_tensor_align_offset, mul_src_webgpu_tensor_align_offset);
offset_merged_rn_src =
(uint32_t) ((rn_src_webgpu_tensor_align_offset - min_offset) / ggml_type_size(rn_src->type));
offset_merged_mul_src =
(uint32_t) ((mul_src_webgpu_tensor_align_offset - min_offset) / ggml_type_size(mul_src->type));
}
// Uniform parameters, in order: misalignment offsets (rn_src, mul_src), merged-binding offsets
// (rn_src, mul_src), dst misalignment offset, strides 1..3 in elements for rn_src / mul_src / dst,
// the four extents of mul_src, the four extents of dst, and epsilon.
// NOTE(review): this order presumably has to match the shader's params struct, which is not visible here.
std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, rn_src) / ggml_type_size(rn_src->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mul_src) / ggml_type_size(mul_src->type)),
offset_merged_rn_src,
offset_merged_mul_src,
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
(uint32_t) (rn_src->nb[1] / ggml_type_size(rn_src->type)),
(uint32_t) (rn_src->nb[2] / ggml_type_size(rn_src->type)),
(uint32_t) (rn_src->nb[3] / ggml_type_size(rn_src->type)),
(uint32_t) (mul_src->nb[1] / ggml_type_size(mul_src->type)),
(uint32_t) (mul_src->nb[2] / ggml_type_size(mul_src->type)),
(uint32_t) (mul_src->nb[3] / ggml_type_size(mul_src->type)),
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
(uint32_t) mul_src->ne[0],
(uint32_t) mul_src->ne[1],
(uint32_t) mul_src->ne[2],
(uint32_t) mul_src->ne[3],
(uint32_t) dst->ne[0],
(uint32_t) dst->ne[1],
(uint32_t) dst->ne[2],
(uint32_t) dst->ne[3],
ggml_webgpu_u32_from_f32(ggml_get_op_params_f32(rn_dst, 0)) // epsilon, treated as f32 in the shader
};
// Three binding layouts:
//   inplace / overlap: rn_src at 0, mul_src at 1 (dst aliases one of them, so it gets no binding)
//   src_overlap:       one merged range covering both sources at 0, dst at 1
//   otherwise:         rn_src at 0, mul_src at 1, dst at 2
// NOTE(review): when (inplace || overlap) and src_overlap are both true, the first layout is used
// while shader_lib_ctx.src_overlap is still set below; confirm the pipeline lookup gives
// inplace/overlap the same precedence.
std::vector<wgpu::BindGroupEntry> entries;
if (inplace || overlap) {
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src));
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src));
} else if (src_overlap) {
// Bind from the lower aligned offset up to the furthest end of either source's binding range.
size_t merged_offset = std::min(rn_src_webgpu_tensor_align_offset, mul_src_webgpu_tensor_align_offset);
size_t merged_end =
std::max(rn_src_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, rn_src),
mul_src_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, mul_src));
entries.push_back(ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(rn_src), merged_offset,
merged_end - merged_offset));
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst));
} else {
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src));
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src));
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst));
}
// The pipeline variant is selected from the three aliasing flags and the workgroup-size limit.
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
shader_lib_ctx.inplace = inplace;
shader_lib_ctx.overlap = overlap;
shader_lib_ctx.src_overlap = src_overlap;
webgpu_pipeline pipeline = ctx->shader_lib->get_rms_norm_mul_pipeline(shader_lib_ctx);
// The number of rows of dst is passed as the workgroup count.
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, ggml_nrows(dst));
}
static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
bool inplace = ggml_webgpu_tensor_equal(src, dst);
@@ -2652,48 +2387,15 @@ static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context & ctx, ggml_tensor
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
}
static bool ggml_webgpu_can_fuse_rms_norm_mul(const struct ggml_cgraph * cgraph, int node_idx) {
if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
return false;
}
// additional constraints specific to this fusion
const ggml_tensor * rms_norm = cgraph->nodes[node_idx];
const ggml_tensor * mul = cgraph->nodes[node_idx + 1];
GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT(rms_norm->type == GGML_TYPE_F32);
// rms_norm only supports f32
if (mul->src[0]->type != GGML_TYPE_F32 || mul->src[1]->type != GGML_TYPE_F32 || mul->type != GGML_TYPE_F32) {
return false;
}
// if rms_norm is the B operand, then we don't handle broadcast
if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm)) {
return false;
}
// rms_norm shader assumes contiguous rows
if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
return false;
}
return true;
}
// Returns the encoded command, or std::nullopt if the operation is a no-op
static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
ggml_cgraph * cgraph,
int node_idx,
int & num_encoded_ops) {
ggml_tensor ** nodes = cgraph->nodes;
ggml_tensor * node = nodes[node_idx];
static std::optional<webgpu_encoded_op> ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
if (ggml_is_empty(node)) {
return std::nullopt;
}
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
return std::nullopt;
}
WEBGPU_LOG_DEBUG("ggml_webgpu_encode(" << node << ", " << ggml_op_name(node->op) << ")");
WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")");
ggml_tensor * src0 = node->src[0];
ggml_tensor * src1 = node->src[1];
@@ -2721,7 +2423,11 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
case GGML_OP_MUL_MAT_ID:
return ggml_webgpu_mul_mat_id(ctx, src0, src1, src2, node);
case GGML_OP_FLASH_ATTN_EXT:
#ifndef __EMSCRIPTEN__
return ggml_webgpu_flash_attn(ctx, src0, src1, src2, node->src[3], node->src[4], node);
#else
return std::nullopt;
#endif
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_MUL:
@@ -2732,13 +2438,6 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
case GGML_OP_REPEAT:
return ggml_webgpu_repeat(ctx, src0, node);
case GGML_OP_RMS_NORM:
if (ggml_webgpu_can_fuse_rms_norm_mul(cgraph, node_idx)) {
num_encoded_ops = 2;
ggml_tensor * mul_node = nodes[node_idx + 1];
return ggml_webgpu_rms_norm_mul(ctx, src0, node, mul_node->src[0], mul_node->src[1], mul_node);
} else {
return ggml_webgpu_row_norm(ctx, src0, node);
}
case GGML_OP_L2_NORM:
return ggml_webgpu_row_norm(ctx, src0, node);
case GGML_OP_ROPE:
@@ -2779,10 +2478,6 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
return ggml_webgpu_sum_rows(ctx, src0, node);
case GGML_OP_CONV_2D:
return ggml_webgpu_conv_2d(ctx, src0, src1, node);
case GGML_OP_IM2COL:
return ggml_webgpu_im2col(ctx, src0, src1, node);
default:
return std::nullopt;
}
@@ -2815,7 +2510,7 @@ static void ggml_backend_webgpu_collect_profile_results(webgpu_context &
for (size_t i = 0; i < pipeline_names.size(); ++i) {
// WebGPU timestamps are in ns; convert to ms.
const double elapsed_ms = double(ts_data[2 * i + 1] - ts_data[2 * i]) * 1e-6;
ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms;
ctx->global_ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms;
}
ctx->profile_timestamp_host_buf.Unmap();
@@ -2851,8 +2546,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
uint32_t num_inflight_batches = 0;
bool contains_set_rows = false;
bool batch_compute_passes = true;
int num_encoded_ops = 1;
int node_idx = 0;
#ifdef GGML_WEBGPU_GPU_PROFILE
ctx->profile_timestamp_query_count = 0;
@@ -2865,11 +2558,11 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
}
while (node_idx < cgraph->n_nodes) {
if (cgraph->nodes[node_idx]->op == GGML_OP_SET_ROWS) {
for (int i = 0; i < cgraph->n_nodes; i++) {
if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) {
contains_set_rows = true;
}
if (auto cmd = ggml_webgpu_encode(ctx, cgraph, node_idx, num_encoded_ops)) {
if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) {
commands.push_back(*cmd);
num_batched_kernels += cmd.value().num_kernels;
#ifdef GGML_WEBGPU_GPU_PROFILE
@@ -2894,9 +2587,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
ctx->param_arena.reset();
commands.clear();
}
node_idx += num_encoded_ops;
num_encoded_ops = 1;
}
if (ctx->active_compute_pass) {
@@ -2926,107 +2616,22 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
return GGML_STATUS_SUCCESS;
}
// Per-event state stored in ggml_backend_event::context by the WebGPU backend.
struct ggml_backend_webgpu_event_context {
// Global WebGPU context; its instance is what event_synchronize waits on.
webgpu_global_context global_ctx;
// Future returned by queue.OnSubmittedWorkDone the last time the event was recorded.
wgpu::Future future;
// Set by event_record, cleared by event_synchronize once the wait has completed.
bool recorded = false;
};
// Allocates a new, not-yet-recorded event for `device`.
// The event keeps a handle to the device's global WebGPU context so it can be waited on later.
// Ownership of both allocations passes to the caller; release them with the matching event_free.
static ggml_backend_event_t ggml_backend_webgpu_device_event_new(ggml_backend_dev_t device) {
    auto * device_state = static_cast<ggml_backend_webgpu_device_context *>(device->context);

    auto * event_state      = new ggml_backend_webgpu_event_context();
    event_state->global_ctx = device_state->webgpu_global_ctx;

    auto * new_event    = new ggml_backend_event;
    new_event->device   = device;
    new_event->context  = event_state;
    return new_event;
}
// Destroys an event: first its WebGPU-specific context, then the event object itself.
// `dev` is not needed for this.
static void ggml_backend_webgpu_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
    GGML_UNUSED(dev);
    auto * event_state = static_cast<ggml_backend_webgpu_event_context *>(event->context);
    delete event_state;
    delete event;
}
// Blocks the calling thread until the work captured by the last event_record has finished.
// Does nothing if the event has not been recorded since the last synchronize.
// Aborts if the wait exceeds WEBGPU_RUNTIME_WAIT_TIMEOUT_NS; other wait statuses are not inspected.
static void ggml_backend_webgpu_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
    GGML_UNUSED(dev);
    auto * event_state = static_cast<ggml_backend_webgpu_event_context *>(event->context);

    if (event_state->recorded) {
        const wgpu::WaitStatus wait_result =
            event_state->global_ctx->instance.WaitAny(event_state->future, WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);
        if (wait_result == wgpu::WaitStatus::TimedOut) {
            GGML_ABORT("ggml_webgpu: event_synchronize timed out after %u ms\n", WEBGPU_RUNTIME_WAIT_TIMEOUT_MS);
        }
        // The future has been consumed; the event must be recorded again before the next wait.
        event_state->recorded = false;
    }
}
// Records `event` on the backend's queue: the event's future is replaced by one obtained from
// queue.OnSubmittedWorkDone, and the event is marked as recorded.
// The completion callback itself is intentionally empty; only the future is used.
static void ggml_backend_webgpu_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
    auto * backend_state = static_cast<ggml_backend_webgpu_context *>(backend->context);
    auto * event_state   = static_cast<ggml_backend_webgpu_event_context *>(event->context);

    event_state->future = backend_state->webgpu_ctx->global_ctx->queue.OnSubmittedWorkDone(
        wgpu::CallbackMode::AllowSpontaneous, [](wgpu::QueueWorkDoneStatus, wgpu::StringView) {});
    event_state->recorded = true;
}
// Backend-level event wait. Implemented as a host-side wait by delegating to the device-level
// event_synchronize; `backend` is not used.
static void ggml_backend_webgpu_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
    GGML_UNUSED(backend);
    // The device-level synchronize does not use its device argument, so nullptr is passed.
    ggml_backend_webgpu_device_event_synchronize(nullptr, event);
}
// Uploads `size` bytes from `data` into `tensor`, starting `offset` bytes into the tensor.
//
// The largest prefix whose length is a multiple of 4 bytes is sent with queue.WriteBuffer.
// Any remaining 1-3 bytes are packed into a 32-bit value and written through
// ggml_backend_webgpu_buffer_memset. `backend` is not used.
static void ggml_backend_webgpu_set_tensor_async(ggml_backend_t backend,
                                                 ggml_tensor *  tensor,
                                                 const void *   data,
                                                 size_t         offset,
                                                 size_t         size) {
    GGML_UNUSED(backend);
    auto * buffer_state = static_cast<ggml_backend_webgpu_buffer_context *>(tensor->buffer->context);

    const size_t dst_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
    const size_t tail_bytes = size % 4;
    const size_t body_bytes = size - tail_bytes;  // size rounded down to a multiple of 4

    // Bulk copy of the 4-byte-aligned part.
    buffer_state->global_ctx->queue.WriteBuffer(buffer_state->buffer, dst_offset, data, body_bytes);

    if (tail_bytes == 0) {
        return;
    }

    // Pack the leftover bytes into the low-address bytes of a zero-initialized word.
    uint32_t        packed_tail = 0;
    auto *          packed_view = reinterpret_cast<uint8_t *>(&packed_tail);
    const auto *    src_bytes   = static_cast<const uint8_t *>(data);
    for (size_t i = 0; i < tail_bytes; ++i) {
        packed_view[i] = src_bytes[body_bytes + i];
    }

    // Write just those leftover bytes via the memset path.
    ggml_backend_webgpu_buffer_memset(buffer_state->global_ctx, buffer_state->buffer, packed_tail,
                                      dst_offset + body_bytes, tail_bytes);
}
// Backend synchronize hook: waits on the queue of the backend's global WebGPU context.
static void ggml_backend_webgpu_synchronize(ggml_backend_t backend) {
    auto * backend_state = static_cast<ggml_backend_webgpu_context *>(backend->context);
    ggml_backend_webgpu_wait_queue(backend_state->webgpu_ctx->global_ctx);
}
static ggml_backend_i ggml_backend_webgpu_i = {
/* .get_name = */ ggml_backend_webgpu_name,
/* .free = */ ggml_backend_webgpu_free,
/* .set_tensor_async = */ ggml_backend_webgpu_set_tensor_async,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_webgpu_synchronize,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_webgpu_graph_compute,
/* .event_record = */ ggml_backend_webgpu_event_record,
/* .event_wait = */ ggml_backend_webgpu_event_wait,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .graph_optimize = */ NULL,
};
@@ -3264,19 +2869,13 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
ctx->webgpu_global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
shader_lib_ctx.wg_mem_limit_bytes =
ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
shader_lib_ctx.supports_subgroups = ctx->webgpu_global_ctx->capabilities.supports_subgroups;
shader_lib_ctx.supports_subgroup_matrix =
ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix;
shader_lib_ctx.sg_mat_m = ctx->webgpu_global_ctx->capabilities.sg_mat_m;
shader_lib_ctx.sg_mat_n = ctx->webgpu_global_ctx->capabilities.sg_mat_n;
shader_lib_ctx.sg_mat_k = ctx->webgpu_global_ctx->capabilities.sg_mat_k;
shader_lib_ctx.max_subgroup_size = ctx->webgpu_global_ctx->capabilities.max_subgroup_size;
const ggml_webgpu_flash_attn_decisions decisions = ggml_webgpu_flash_attn_get_decisions(
shader_lib_ctx, ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
const uint32_t kv_tile = decisions.kv_tile;
if (ggml_webgpu_flash_attn_use_vec(ctx->webgpu_global_ctx, Q, K, V)) {
const uint32_t kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(shader_lib_ctx);
const uint32_t vec_nwg_cap = std::max(
1u, std::min<uint32_t>(32u, ctx->webgpu_global_ctx->capabilities.max_subgroup_size));
@@ -3296,8 +2895,6 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
const size_t tmp_size_bytes = ROUNDUP_POW2(
(tmp_data_elems + tmp_stats_elems) * sizeof(float), WEBGPU_STORAGE_BUF_BINDING_MULT);
res += tmp_size_bytes + align;
} else {
res += WEBGPU_STORAGE_BUF_BINDING_MULT + align;
}
if (mask != nullptr) {
const uint32_t blk_nblk0 = CEIL_DIV((uint32_t) K->ne[1], kv_tile);
@@ -3443,15 +3040,13 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
// we require f16 support
GGML_ASSERT(ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
ctx->webgpu_global_ctx->capabilities.supports_subgroups =
ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups);
bool valid_subgroup_matrix_config = false;
#ifndef __EMSCRIPTEN__
// Accept f16 subgroup matrix configurations (square or non-square).
// NVIDIA GPUs typically report square configs (e.g. 16x16x16),
// while Intel Xe2 GPUs report non-square configs (e.g. 8x16x16).
// The shaders are already parameterized to handle any M/N/K dimensions.
bool valid_subgroup_matrix_config = false;
if (ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
for (size_t i = 0; i < subgroup_matrix_configs.configCount; i++) {
const wgpu::SubgroupMatrixConfig config = subgroup_matrix_configs.configs[i];
@@ -3465,8 +3060,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
}
}
}
#endif
ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix = valid_subgroup_matrix_config;
#endif
// For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
// Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
@@ -3477,14 +3072,11 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
#ifndef __EMSCRIPTEN__
required_features.push_back(wgpu::FeatureName::ImplicitDeviceSynchronization);
if (ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix) {
required_features.push_back(wgpu::FeatureName::Subgroups);
required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
}
#endif
if (ctx->webgpu_global_ctx->capabilities.supports_subgroups) {
required_features.push_back(wgpu::FeatureName::Subgroups);
}
#ifdef GGML_WEBGPU_GPU_PROFILE
required_features.push_back(wgpu::FeatureName::TimestampQuery);
#endif
@@ -3514,12 +3106,12 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
// Enable Dawn-specific toggles to increase native performance
// TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
// only for native performance?
const char * const deviceEnabledToggles[] = { "disable_robustness", "disable_workgroup_init",
"disable_polyfills_on_integer_div_and_mod" };
const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init",
"disable_polyfills_on_integer_div_and_mod" };
const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
wgpu::DawnTogglesDescriptor deviceTogglesDesc;
deviceTogglesDesc.enabledToggles = deviceEnabledToggles;
deviceTogglesDesc.enabledToggleCount = 3;
deviceTogglesDesc.enabledToggleCount = 4;
deviceTogglesDesc.disabledToggles = deviceDisabledToggles;
deviceTogglesDesc.disabledToggleCount = 1;
@@ -3797,63 +3389,33 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
break;
case GGML_OP_FLASH_ATTN_EXT:
{
#ifndef __EMSCRIPTEN__
if (!ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix) {
break;
}
// Head dimensions must be divisible by subgroup matrix dimensions
if (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k != 0 ||
src2->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_n != 0) {
break;
}
// Head dimensions must fit in workgroup memory with minimum tile sizes
size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
const bool has_mask = op->src[3] != nullptr;
const bool kv_direct = src1->type == GGML_TYPE_F16 &&
(src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k) == 0 &&
(src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD) == 0;
const size_t min_bytes = ggml_webgpu_flash_attn_wg_mem_bytes(
ctx->webgpu_global_ctx->capabilities.sg_mat_m, ctx->webgpu_global_ctx->capabilities.sg_mat_n,
(uint32_t) src0->ne[0], (uint32_t) src2->ne[0], has_mask, kv_direct);
if (min_bytes > limit_bytes) {
break;
}
supports_op = src0->type == GGML_TYPE_F32 &&
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 ||
src1->type == GGML_TYPE_Q4_0 || src1->type == GGML_TYPE_Q8_0) &&
src2->type == src1->type && op->type == GGML_TYPE_F32;
if (!supports_op) {
break;
}
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
shader_lib_ctx.src0 = src0;
shader_lib_ctx.src1 = src1;
shader_lib_ctx.src2 = src2;
shader_lib_ctx.src3 = op->src[3];
shader_lib_ctx.src4 = op->src[4];
shader_lib_ctx.dst = const_cast<ggml_tensor *>(op);
shader_lib_ctx.supports_subgroups = ctx->webgpu_global_ctx->capabilities.supports_subgroups;
shader_lib_ctx.supports_subgroup_matrix = ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix;
shader_lib_ctx.wg_mem_limit_bytes =
ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
shader_lib_ctx.sg_mat_m = ctx->webgpu_global_ctx->capabilities.sg_mat_m;
shader_lib_ctx.sg_mat_n = ctx->webgpu_global_ctx->capabilities.sg_mat_n;
shader_lib_ctx.sg_mat_k = ctx->webgpu_global_ctx->capabilities.sg_mat_k;
shader_lib_ctx.max_subgroup_size = ctx->webgpu_global_ctx->capabilities.max_subgroup_size;
const ggml_webgpu_flash_attn_decisions decisions = ggml_webgpu_flash_attn_get_decisions(
shader_lib_ctx, ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
const size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
const bool has_mask = op->src[3] != nullptr;
if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
const size_t min_bytes =
ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0],
(uint32_t) src2->ne[0], has_mask, decisions.kv_direct);
if (min_bytes > limit_bytes) {
supports_op = false;
}
break;
}
if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
const size_t min_bytes =
ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0],
(uint32_t) src2->ne[0], has_mask, decisions.kv_direct);
if (min_bytes > limit_bytes) {
supports_op = false;
}
break;
}
if (!ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix) {
supports_op = false;
break;
}
const size_t min_bytes =
ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0],
(uint32_t) src2->ne[0], has_mask, decisions.kv_direct);
if (min_bytes > limit_bytes) {
supports_op = false;
}
#endif
break;
}
case GGML_OP_RMS_NORM:
@@ -3929,15 +3491,6 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_OP_SOLVE_TRI:
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
break;
case GGML_OP_CONV_2D:
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
break;
case GGML_OP_IM2COL:
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
break;
case GGML_OP_SSM_CONV:
supports_op = op->type == GGML_TYPE_F32;
break;
@@ -4031,9 +3584,9 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = {
/* .supports_op = */ ggml_backend_webgpu_device_supports_op,
/* .supports_buft = */ ggml_backend_webgpu_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ ggml_backend_webgpu_device_event_new,
/* .event_free = */ ggml_backend_webgpu_device_event_free,
/* .event_synchronize = */ ggml_backend_webgpu_device_event_synchronize,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
/* End GGML Backend Device Interface */
@@ -45,13 +45,6 @@ fn load_u16_at_src0(byte_offset: u32) -> u32 {
return (word >> shift) & 0xFFFFu;
}
// Returns the 32-bit word of src0 that contains `byte_offset`, i.e. the low two bits of the
// offset are ignored and no sub-word shifting is applied.
// Callers pick the 16-bit half they need themselves (& 0xFFFFu or >> 16u).
// Used by the k-quant paths for better performance.
fn load_u32_at_src0_aligned(byte_offset: u32) -> u32 {
    // Dropping the low two bits and dividing by 4 is the same as shifting right by 2.
    let word_idx = byte_offset >> 2u;
    return src0[word_idx];
}
fn load_u32_at_src0(byte_offset: u32) -> u32 {
let word_idx = byte_offset / 4u;
let shift = (byte_offset & 0x3u) * 8u;
@@ -1,165 +0,0 @@
#include "common_decls.tmpl"
enable f16;
// Storage buffers of the shader. The element type of each buffer is chosen at preprocessing
// time by a pair of macros; there is no #else fallback, so one macro of each pair is expected
// to be defined. All three buffers are declared read_write.

// Binding 0: weights, f32 (WEIGHT_F32) or f16 (WEIGHT_F16).
@group(0) @binding(0)
#if defined(WEIGHT_F32)
var<storage, read_write> weights: array<f32>;
#elif defined(WEIGHT_F16)
var<storage, read_write> weights: array<f16>;
#endif
// Binding 1: input, f32 (INPUT_F32) or f16 (INPUT_F16).
@group(0) @binding(1)
#if defined(INPUT_F32)
var<storage, read_write> input: array<f32>;
#elif defined(INPUT_F16)
var<storage, read_write> input: array<f16>;
#endif
// Binding 2: output, f32 (OUTPUT_F32) or f16 (OUTPUT_F16).
@group(0) @binding(2)
#if defined(OUTPUT_F32)
var<storage, read_write> output: array<f32>;
#elif defined(OUTPUT_F16)
var<storage, read_write> output: array<f16>;
#endif
// Uniform parameters for the 2D convolution entry point in this file.
// All offsets and strides are used directly as array indices, i.e. they are in
// elements of the corresponding buffer, not bytes.
struct Params {
// base index added to every access into weights / input / output
offset_w: u32,
offset_i: u32,
offset_o: u32,
// element strides
// weights: sw0 scales kw, sw1 scales kh, sw2 scales ic, sw3 scales oc
// input:   si0 scales iw, si1 scales ih, si2 scales ic, si3 scales n
// output:  so0 scales ow, so1 scales oh, so2 scales oc, so3 scales n
sw0: u32, sw1: u32, sw2: u32, sw3: u32,
si0: u32, si1: u32, si2: u32, si3: u32,
so0: u32, so1: u32, so2: u32, so3: u32,
// kernel dimensions
KW: u32, KH: u32, IC: u32,
// input dimensions
IW: u32, IH: u32,
// output dimensions
OW: u32, OH: u32, OC_out: u32, N_out: u32,
// stride
// (s0 applies along the width axis, s1 along the height axis; same for p* and d*)
s0: u32, s1: u32,
// padding
p0: u32, p1: u32,
// dilation
d0: u32, d1: u32,
};
// Binding 3: the parameter block above.
@group(0) @binding(3)
var<uniform> params: Params;
// Read one kernel weight as f32.
// A single conversion covers both storage types: f32() is the identity when the
// buffer already holds f32 and widens the value when it holds f16.
fn load_weight(idx: u32) -> f32 {
    return f32(weights[idx]);
}
// Read one input value as f32.
// A single conversion covers both storage types: f32() is the identity when the
// buffer already holds f32 and widens the value when it holds f16.
fn load_input(idx: u32) -> f32 {
    return f32(input[idx]);
}
// Write one f32 result to output[idx], narrowing to f16 when the output buffer
// was declared with OUTPUT_F16. The branch taken matches the one that declared
// the `output` binding.
fn store_output(idx: u32, val: f32) {
#if defined(OUTPUT_F32)
output[idx] = val;
#elif defined(OUTPUT_F16)
output[idx] = f16(val);
#endif
}
// Divide x by y, rounding up.
// Assumes y != 0 and that x + y - 1 does not wrap past the u32 range.
fn ceil_div_u32(x: u32, y: u32) -> u32 {
    let biased = x + (y - 1u);
    return biased / y;
}
// Smallest kernel index k for which base + k * step is non-negative.
// A non-negative base is already in range, so the answer is 0; otherwise it is
// the distance below zero divided by the step, rounded up.
fn first_valid_k(base: i32, step: u32) -> u32 {
    if (base < 0) {
        return ceil_div_u32(u32(-base), step);
    }
    return 0u;
}
// Exclusive upper bound of the usable kernel indices: the first k for which
// base + k * step reaches `limit`, capped at k_max. Valid k lie in [0, result).
// Returns 0 when base is already at or beyond the limit.
fn end_valid_k(base: i32, step: u32, limit: u32, k_max: u32) -> u32 {
    let span = i32(limit) - base;
    if (span > 0) {
        return min(k_max, ceil_div_u32(u32(span), step));
    }
    return 0u;
}
// 2D convolution: each invocation produces exactly one output element.
@compute @workgroup_size(WG_SIZE)
fn main(
    @builtin(global_invocation_id) gid: vec3<u32>,
    @builtin(num_workgroups) num_wg: vec3<u32>
) {
    // Linearize the (x, y) dispatch: every y step covers one full row of x invocations.
    let invocations_per_row = num_wg.x * u32(WG_SIZE);
    let flat_idx = gid.y * invocations_per_row + gid.x;

    let total_outputs = params.OW * params.OH * params.OC_out * params.N_out;
    if (flat_idx >= total_outputs) {
        return;
    }

    // Kernel layout: [KW, KH, IC, ..]
    // Input layout:  [IW, IH, .., ..]
    // Output layout: [OW, OH, OC, N]
    // Peel the flat index apart from the slowest dimension (N) to the fastest (OW).
    let plane_size = params.OH * params.OW;
    let batch_size = params.OC_out * plane_size;
    let n = flat_idx / batch_size;
    let within_batch = flat_idx % batch_size;
    let oc = within_batch / plane_size;
    let within_plane = within_batch % plane_size;
    let oh = within_plane / params.OW;
    let ow = within_plane % params.OW;

    // Input coordinate touched by kernel tap 0; negative when it falls in the padding.
    let iw_origin = i32(ow * params.s0) - i32(params.p0);
    let ih_origin = i32(oh * params.s1) - i32(params.p1);

    // Clip the kernel window to the taps that land inside the input, once up front.
    let kw_lo = first_valid_k(iw_origin, params.d0);
    let kw_hi = end_valid_k(iw_origin, params.d0, params.IW, params.KW);
    let kh_lo = first_valid_k(ih_origin, params.d1);
    let kh_hi = end_valid_k(ih_origin, params.d1, params.IH, params.KH);

    let dst_idx = params.offset_o + ow * params.so0 + oh * params.so1 + oc * params.so2 + n * params.so3;

    // When the whole receptive field is out of bounds the accumulator stays at zero.
    var acc: f32 = 0.0;
    let window_nonempty = kw_lo < kw_hi && kh_lo < kh_hi;
    if (window_nonempty) {
        let w_oc_base = params.offset_w + oc * params.sw3;
        let in_n_base = params.offset_i + n * params.si3;

        for (var ic: u32 = 0u; ic < params.IC; ic++) {
            let w_ic_base = ic * params.sw2 + w_oc_base;
            let in_ic_base = ic * params.si2 + in_n_base;

            for (var kh: u32 = kh_lo; kh < kh_hi; kh++) {
                let ih = u32(ih_origin + i32(kh * params.d1));
                let w_row = w_ic_base + kh * params.sw1;
                let in_row = in_ic_base + ih * params.si1;

                for (var kw: u32 = kw_lo; kw < kw_hi; kw++) {
                    let iw = u32(iw_origin + i32(kw * params.d0));
                    acc += load_weight(w_row + kw * params.sw0) * load_input(in_row + iw * params.si0);
                }
            }
        }
    }

    store_output(dst_idx, acc);
}
@@ -138,55 +138,26 @@ struct Params {
};
@group(0) @binding(0) var<storage, read_write> Q: array<f32>;
#ifdef KV_OVERLAP
@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
#define V K
#else
@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
@group(0) @binding(2) var<storage, read_write> V: array<KV_TYPE>;
#endif
#if defined(MASK) && defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 5
#define PARAMS_BINDING 6
#endif
#elif defined(MASK)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
#define DST_BINDING 3
#define PARAMS_BINDING 4
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
#elif defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 3
#define PARAMS_BINDING 4
#else
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
#else
#ifdef KV_OVERLAP
#define DST_BINDING 2
#define PARAMS_BINDING 3
#else
#define DST_BINDING 3
#define PARAMS_BINDING 4
#endif
#endif
@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<vec4<f32>>;
@group(0) @binding(PARAMS_BINDING) var<uniform> params: Params;
@@ -1,330 +0,0 @@
enable f16;
enable subgroups;
// Compile-time tile configuration for this flash-attention shader.
// NOTE(review): these look like default values that the host-side shader
// generator may override — confirm before relying on the numbers.
// HEAD_DIM_QK / HEAD_DIM_V: elements per Q/K row and per V/output row.
#define HEAD_DIM_QK 64
#define HEAD_DIM_V 64
// Row stride (in f16 elements) of the K/V staging array kv_shmem.
#define KV_STAGE_STRIDE 64
// Query rows handled by one workgroup.
#define Q_TILE 4
// K/V rows staged in workgroup memory per loop iteration.
#define KV_TILE 64
// Invocations per workgroup.
#define WG_SIZE 128
// Uniform parameters for the flash-attention entry point in this file.
// Offsets and strides are consumed as element indices: Q, mask and sinks are
// indexed directly, while K, V and dst indices are divided by 4 because those
// buffers are declared as arrays of vec4.
struct Params {
// base index of each tensor inside its buffer
offset_q: u32,
offset_k: u32,
offset_v: u32,
offset_mask: u32,
offset_sinks: u32,
offset_dst: u32,
// problem size: heads per batch, query rows, key/value rows
n_heads: u32,
seq_len_q: u32,
seq_len_kv: u32,
// strides: *1 steps one row, *2 steps one head, *3 steps one batch
stride_q1: u32,
stride_q2: u32,
stride_q3: u32,
stride_k1: u32,
stride_k2: u32,
stride_k3: u32,
stride_v1: u32,
stride_v2: u32,
stride_v3: u32,
// batch stride of the mask
stride_mask3: u32,
// number of query heads sharing one K/V head (K/V head = head_idx / q_per_kv)
q_per_kv: u32,
// multiplied into Q when the Q tile is loaded
scale: f32,
// a positive value enables the per-head slope applied to mask values
max_bias: f32,
// multiplier applied to tanh(score) when LOGIT_SOFTCAP is defined
logit_softcap: f32,
// inputs to the per-head slope: m0 is used for heads below n_head_log2, m1 otherwise
n_head_log2: f32,
m0: f32,
m1: f32,
};
// Buffer bindings. Binding numbers are assigned densely, so the slots used by
// mask, sinks, dst and params depend on which optional inputs are compiled in:
//   - KV_OVERLAP: V is an alias of K (no separate V binding), which shifts
//     every later binding down by one.
//   - MASK / SINKS: each adds one binding after K/V when defined.
// DST_BINDING and PARAMS_BINDING record the resulting slots for dst and params.
@group(0) @binding(0) var<storage, read_write> Q: array<f32>;
#ifdef KV_OVERLAP
@group(0) @binding(1) var<storage, read_write> K: array<vec4<f16>>;
#define V K
#else
@group(0) @binding(1) var<storage, read_write> K: array<vec4<f16>>;
@group(0) @binding(2) var<storage, read_write> V: array<vec4<f16>>;
#endif
// mask and sinks both present
#if defined(MASK) && defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 5
#define PARAMS_BINDING 6
#endif
// mask only
#elif defined(MASK)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
#define DST_BINDING 3
#define PARAMS_BINDING 4
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
// sinks only
#elif defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 3
#define PARAMS_BINDING 4
#else
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
// neither mask nor sinks
#else
#ifdef KV_OVERLAP
#define DST_BINDING 2
#define PARAMS_BINDING 3
#else
#define DST_BINDING 3
#define PARAMS_BINDING 4
#endif
#endif
// Output (written as vec4<f32>) and the uniform parameter block.
@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<vec4<f32>>;
@group(0) @binding(PARAMS_BINDING) var<uniform> params: Params;
// Large negative sentinel: the starting value of the running row max and of
// score slots that have not been filled.
const FLOAT_MIN: f32 = -1.0e9;
// Number of 4-element chunks in one Q/K row and in one V/output row.
const Q_CHUNKS: u32 = HEAD_DIM_QK / 4u;
const V_CHUNKS: u32 = HEAD_DIM_V / 4u;
// Per-lane private register counts: ceil(KV_TILE / MAX_SUBGROUP_SIZE) scores and
// ceil(V_CHUNKS / MAX_SUBGROUP_SIZE) output vec4s.
// NOTE(review): MAX_SUBGROUP_SIZE is not defined in this file; presumably it is
// supplied by the host when the shader is generated — confirm.
const SCORE_REGS_PER_LANE: u32 = (KV_TILE + MAX_SUBGROUP_SIZE - 1u) / MAX_SUBGROUP_SIZE;
const OUT_REGS_PER_LANE: u32 = (V_CHUNKS + MAX_SUBGROUP_SIZE - 1u) / MAX_SUBGROUP_SIZE;
// Q tile, already multiplied by params.scale, stored as f16 (Q_TILE rows).
var<workgroup> q_shmem: array<f16, Q_TILE * HEAD_DIM_QK>;
// Staging area reused within each iteration: first the K tile, then the V tile.
var<workgroup> kv_shmem: array<f16, KV_TILE * KV_STAGE_STRIDE>;
// Unnormalized softmax weights exp(score - max), one row of KV_TILE per query row.
var<workgroup> p_shmem: array<f32, Q_TILE * KV_TILE>;
// Flash-attention entry point.
// One workgroup handles Q_TILE consecutive query rows of a single head in a
// single batch; within the workgroup, subgroup `subgroup_id` owns query row
// q_row_start + subgroup_id. K and V are streamed through workgroup memory in
// tiles of KV_TILE rows while each subgroup keeps a running max, running exp
// sum and an output accumulator, rescaling them whenever the max grows.
@compute @workgroup_size(WG_SIZE)
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(subgroup_id) subgroup_id: u32,
@builtin(subgroup_size) subgroup_size: u32,
@builtin(num_subgroups) num_subgroups: u32,
@builtin(subgroup_invocation_id) sg_inv_id: u32) {
// Each query row needs its own subgroup, so give up if there are fewer
// subgroups than rows in the tile (or the reported subgroup size is zero).
if (subgroup_size == 0u || num_subgroups < Q_TILE) {
return;
}
// Decompose the flat workgroup index into (batch, head, row-tile-in-head).
let wg_per_head = (params.seq_len_q + Q_TILE - 1u) / Q_TILE;
let wg_per_batch = wg_per_head * params.n_heads;
// dst is indexed as [batch][query row][head][HEAD_DIM_V], with no gaps.
let dst2_stride = HEAD_DIM_V * params.n_heads;
let dst3_stride = dst2_stride * params.seq_len_q;
let batch_idx = wg_id.x / wg_per_batch;
let q_batch_offset = params.offset_q + batch_idx * params.stride_q3;
let k_batch_offset = params.offset_k + batch_idx * params.stride_k3;
let v_batch_offset = params.offset_v + batch_idx * params.stride_v3;
let dst_batch_offset = params.offset_dst + batch_idx * dst3_stride;
let wg_in_batch = wg_id.x % wg_per_batch;
let head_idx = wg_in_batch / wg_per_head;
let q_head_offset = q_batch_offset + head_idx * params.stride_q2;
// q_per_kv query heads share one K/V head.
let k_head_idx = head_idx / params.q_per_kv;
let v_head_offset = v_batch_offset + k_head_idx * params.stride_v2;
let k_head_offset = k_batch_offset + k_head_idx * params.stride_k2;
let wg_in_head = wg_in_batch % wg_per_head;
let q_row_start = wg_in_head * Q_TILE;
let global_q_row = q_row_start + subgroup_id;
// Subgroups beyond Q_TILE, or rows past the end of the sequence, do no row
// work but still take part in the cooperative loads and barriers below.
let row_active = subgroup_id < Q_TILE && global_q_row < params.seq_len_q;
#ifdef MASK
// Mask rows are seq_len_kv elements apart; this points at the tile's first row.
let mask_global_offset = params.offset_mask + batch_idx * params.stride_mask3 + q_row_start * params.seq_len_kv;
#endif
let dst_global_offset = dst_batch_offset + q_row_start * dst2_stride + head_idx * HEAD_DIM_V;
// Per-head slope multiplied into mask values; 1.0 unless max_bias > 0.
let head = f32(head_idx);
let slope = select(1.0,
select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
pow(params.m0, head + 1.0),
head < params.n_head_log2),
params.max_bias > 0.0);
// Cooperative load of the Q tile: pre-multiplied by scale, converted to f16,
// and zero-filled for rows past seq_len_q.
for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
let q_tile_row = elem_idx / HEAD_DIM_QK;
let q_col = elem_idx % HEAD_DIM_QK;
let head_q_row = q_row_start + q_tile_row;
let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
q_shmem[elem_idx] = f16(select(
0.0,
Q[global_q_row_offset + q_col] * params.scale,
head_q_row < params.seq_len_q));
}
workgroupBarrier();
// Running softmax state for this subgroup's row, kept in private variables.
var row_max = FLOAT_MIN;
var exp_sum = 0.0;
// Output accumulator: lane L holds vec4 chunks L, L + subgroup_size, ...
var out_regs: array<vec4<f32>, OUT_REGS_PER_LANE>;
for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) {
out_regs[reg_idx] = vec4<f32>(0.0);
}
let q_base = subgroup_id * HEAD_DIM_QK;
let subgroup_p_offset = subgroup_id * KV_TILE;
// Main loop over K/V in tiles of KV_TILE rows.
for (var kv_tile = 0u; kv_tile < params.seq_len_kv; kv_tile += KV_TILE) {
// The last tile may be partial.
let kv_count = min(KV_TILE, params.seq_len_kv - kv_tile);
// Register slots actually needed at the runtime subgroup size.
let score_slots = min(SCORE_REGS_PER_LANE, (kv_count + subgroup_size - 1u) / subgroup_size);
let out_slots = min(OUT_REGS_PER_LANE, (V_CHUNKS + subgroup_size - 1u) / subgroup_size);
var local_scores: array<f32, SCORE_REGS_PER_LANE>;
for (var slot = 0u; slot < SCORE_REGS_PER_LANE; slot += 1u) {
local_scores[slot] = FLOAT_MIN;
}
// Stage the K tile in workgroup memory, one vec4 per invocation per step.
for (var vec_idx_local = local_id.x; vec_idx_local < kv_count * Q_CHUNKS; vec_idx_local += WG_SIZE) {
let kv_local = vec_idx_local / Q_CHUNKS;
let chunk = vec_idx_local % Q_CHUNKS;
let global_k_row = kv_tile + kv_local;
// K is an array of vec4, so the element index is divided by 4.
let k_vec_index = (k_head_offset + global_k_row * params.stride_k1 + chunk * 4u) >> 2u;
let k4 = K[k_vec_index];
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
kv_shmem[kv_off + 0u] = k4.x;
kv_shmem[kv_off + 1u] = k4.y;
kv_shmem[kv_off + 2u] = k4.z;
kv_shmem[kv_off + 3u] = k4.w;
}
workgroupBarrier();
// Scores: each lane computes dot(Q row, K row) for the K rows it owns
// (sg_inv_id, sg_inv_id + subgroup_size, ...).
var local_max = FLOAT_MIN;
if (row_active) {
for (var slot = 0u; slot < score_slots; slot += 1u) {
let kv_local = sg_inv_id + slot * subgroup_size;
if (kv_local >= kv_count) {
continue;
}
let global_k_row = kv_tile + kv_local;
var dot_val = 0.0;
for (var chunk = 0u; chunk < Q_CHUNKS; chunk += 1u) {
let q_off = q_base + chunk * 4u;
let qv = vec4<f32>(
f32(q_shmem[q_off + 0u]),
f32(q_shmem[q_off + 1u]),
f32(q_shmem[q_off + 2u]),
f32(q_shmem[q_off + 3u]));
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
let kv = vec4<f32>(
f32(kv_shmem[kv_off + 0u]),
f32(kv_shmem[kv_off + 1u]),
f32(kv_shmem[kv_off + 2u]),
f32(kv_shmem[kv_off + 3u]));
dot_val += dot(qv, kv);
}
#ifdef LOGIT_SOFTCAP
dot_val = params.logit_softcap * tanh(dot_val);
#endif
#ifdef MASK
// subgroup_id selects this row's mask row within the tile.
let mask_idx = mask_global_offset + subgroup_id * params.seq_len_kv + global_k_row;
dot_val += slope * f32(mask[mask_idx]);
#endif
local_scores[slot] = dot_val;
local_max = max(local_max, dot_val);
}
}
// Online softmax update: fold this tile's max into the running max and
// rescale what has been accumulated so far by exp(old_max - new_max).
let tile_max = subgroupMax(local_max);
let new_max = max(row_max, tile_max);
let cur_exp = exp(row_max - new_max);
exp_sum *= cur_exp;
for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) {
out_regs[reg_idx] *= cur_exp;
}
// Publish exp(score - new_max) for this row so every lane can read all of
// the tile's weights during the P * V step.
var local_sum = 0.0;
for (var slot = 0u; slot < score_slots; slot += 1u) {
let kv_local = sg_inv_id + slot * subgroup_size;
if (row_active && kv_local < kv_count) {
let p = exp(local_scores[slot] - new_max);
p_shmem[subgroup_p_offset + kv_local] = p;
local_sum += p;
}
}
// All K reads must finish before kv_shmem is overwritten with V.
workgroupBarrier();
// Stage the V tile into the same workgroup array.
for (var vec_idx_local = local_id.x; vec_idx_local < kv_count * V_CHUNKS; vec_idx_local += WG_SIZE) {
let kv_local = vec_idx_local / V_CHUNKS;
let chunk = vec_idx_local % V_CHUNKS;
let global_v_row = kv_tile + kv_local;
let v_vec_index = (v_head_offset + global_v_row * params.stride_v1 + chunk * 4u) >> 2u;
let v4 = V[v_vec_index];
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
kv_shmem[kv_off + 0u] = v4.x;
kv_shmem[kv_off + 1u] = v4.y;
kv_shmem[kv_off + 2u] = v4.z;
kv_shmem[kv_off + 3u] = v4.w;
}
workgroupBarrier();
let tile_sum = subgroupAdd(local_sum);
exp_sum += tile_sum;
row_max = new_max;
// O += P * V: each lane accumulates the output chunks it owns over every
// K/V row of the tile.
if (row_active) {
for (var reg_idx = 0u; reg_idx < out_slots; reg_idx += 1u) {
let chunk = sg_inv_id + reg_idx * subgroup_size;
if (chunk >= V_CHUNKS) {
continue;
}
var acc = out_regs[reg_idx];
for (var kv_local = 0u; kv_local < kv_count; kv_local += 1u) {
let p = p_shmem[subgroup_p_offset + kv_local];
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
let v4 = vec4<f32>(
f32(kv_shmem[kv_off + 0u]),
f32(kv_shmem[kv_off + 1u]),
f32(kv_shmem[kv_off + 2u]),
f32(kv_shmem[kv_off + 3u]));
acc += p * v4;
}
out_regs[reg_idx] = acc;
}
}
// V and P reads must finish before the next iteration overwrites them.
workgroupBarrier();
}
#ifdef SINKS
// Fold the per-head sink score into the softmax as one extra term that adds
// to the denominator only (nothing is added to the output accumulator).
if (row_active) {
let sink_score = sinks[params.offset_sinks + head_idx];
let sink_max = max(row_max, sink_score);
let sink_scale = exp(row_max - sink_max);
for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) {
out_regs[reg_idx] *= sink_scale;
}
exp_sum = exp_sum * sink_scale + exp(sink_score - sink_max);
row_max = sink_max;
}
#endif
// Normalize by the exp sum and write the row; a zero sum yields zeros.
if (row_active) {
let inv_exp_sum = select(0.0, 1.0 / exp_sum, exp_sum != 0.0);
let row_base = dst_global_offset + subgroup_id * dst2_stride;
let out_slots = min(OUT_REGS_PER_LANE, (V_CHUNKS + subgroup_size - 1u) / subgroup_size);
for (var reg_idx = 0u; reg_idx < out_slots; reg_idx += 1u) {
let chunk = sg_inv_id + reg_idx * subgroup_size;
if (chunk >= V_CHUNKS) {
continue;
}
// dst is an array of vec4, so the element index is divided by 4.
let dst_vec_index = (row_base + chunk * 4u) >> 2u;
dst[dst_vec_index] = out_regs[reg_idx] * inv_exp_sum;
}
}
}
@@ -15,7 +15,7 @@ struct Params {
nblk1: u32,
};
@group(0) @binding(0) var<storage, read_write> mask: array<f16>;
@group(0) @binding(0) var<storage, read> mask: array<f16>;
@group(0) @binding(1) var<storage, read_write> blk: array<u32>;
@group(0) @binding(2) var<uniform> params: Params;
@@ -1,6 +1,8 @@
diagnostic(off, chromium.subgroup_matrix_uniformity);
diagnostic(off, subgroup_uniformity);
enable f16;
enable subgroups;
enable chromium_experimental_subgroup_matrix;
#ifdef KV_F32
#define KV_TYPE f32
@@ -11,14 +13,19 @@ enable subgroups;
#define HEAD_DIM_QK 64
#define HEAD_DIM_V 64
#define KV_GRANULARITY 8
#define SG_MAT_M 8
#define SG_MAT_N 8
#define SG_MAT_K 8
#define Q_TILE SG_MAT_M
#define KV_TILE 16
#define WG_SIZE 64
#ifndef VEC_NE
#define VEC_NE 4u
#endif
#define KV_BLOCKS (KV_TILE / KV_GRANULARITY)
#define KV_BLOCKS (KV_TILE / SG_MAT_N)
#define BLOCK_SIZE 32
#define BLOCKS_K ((HEAD_DIM_QK + BLOCK_SIZE - 1) / BLOCK_SIZE)
@@ -90,14 +97,6 @@ struct Params {
};
@group(0) @binding(0) var<storage, read_write> Q: array<f32>;
#ifdef KV_OVERLAP
#if defined(KV_Q4_0) || defined(KV_Q8_0)
@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
#else
@group(0) @binding(1) var<storage, read_write> K: array<vec4<KV_TYPE>>;
#endif
#define V K
#else
#if defined(KV_Q4_0) || defined(KV_Q8_0)
@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
#else
@@ -108,22 +107,7 @@ struct Params {
#else
@group(0) @binding(2) var<storage, read_write> V: array<vec4<KV_TYPE>>;
#endif
#endif
#if defined(MASK) && defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#ifdef BLK
#define BLK_BINDING 4
#define TMP_BINDING 5
#define DST_BINDING 6
#define PARAMS_BINDING 7
#else
#define TMP_BINDING 4
#define DST_BINDING 5
#define PARAMS_BINDING 6
#endif
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
#ifdef BLK
@@ -136,21 +120,7 @@ struct Params {
#define DST_BINDING 6
#define PARAMS_BINDING 7
#endif
#endif
#elif defined(MASK)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
#ifdef BLK
#define BLK_BINDING 3
#define TMP_BINDING 4
#define DST_BINDING 5
#define PARAMS_BINDING 6
#else
#define TMP_BINDING 3
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
#ifdef BLK
#define BLK_BINDING 4
@@ -162,30 +132,16 @@ struct Params {
#define DST_BINDING 5
#define PARAMS_BINDING 6
#endif
#endif
#elif defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> sinks: array<f32>;
#define TMP_BINDING 3
#define DST_BINDING 4
#define PARAMS_BINDING 5
#else
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#define TMP_BINDING 4
#define DST_BINDING 5
#define PARAMS_BINDING 6
#endif
#else
#ifdef KV_OVERLAP
#define TMP_BINDING 2
#define DST_BINDING 3
#define PARAMS_BINDING 4
#else
#define TMP_BINDING 3
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
#endif
#ifdef BLK
@group(0) @binding(BLK_BINDING) var<storage, read_write> blk: array<u32>;
@@ -197,7 +153,7 @@ struct Params {
// A very large negative float (not the smallest positive float); used as the initial running max and for out-of-range scores.
const FLOAT_MIN: f32 = -1.0e9;
var<workgroup> q_shmem: array<f16, HEAD_DIM_QK>;
var<workgroup> q_shmem: array<f16, Q_TILE * HEAD_DIM_QK>;
#ifndef KV_DIRECT
const kv_shmem_size = KV_TILE * max(HEAD_DIM_QK, HEAD_DIM_V);
@@ -205,27 +161,31 @@ const kv_shmem_size = KV_TILE * max(HEAD_DIM_QK, HEAD_DIM_V);
var<workgroup> kv_shmem: array<f16, kv_shmem_size>;
#endif
var<workgroup> o_shmem: array<f16, HEAD_DIM_V>;
var<workgroup> o_shmem: array<f16, Q_TILE * HEAD_DIM_V>;
#ifdef MASK
// storage for mask values
var<workgroup> mask_shmem: array<f16, KV_TILE>;
var<workgroup> mask_shmem: array<f16, Q_TILE * KV_TILE>;
#endif
// note that we reuse the same storage for both since we only need one at a time
var<workgroup> inter_shmem: array<f16, KV_TILE>;
var<workgroup> inter_shmem: array<f16, Q_TILE * KV_TILE>;
// Storage for row max and exp sum during online softmax
fn calc_softmax_term(kv_idx: u32, slope: f32, has_bias: bool, apply_mask: bool) -> f32 {
var<workgroup> row_max_shmem: array<f32, Q_TILE>;
var<workgroup> exp_sum_shmem: array<f32, Q_TILE>;
var<workgroup> blk_state_wg: u32;
fn calc_softmax_term(kv_idx: u32, q_tile_row: u32, slope: f32, has_bias: bool, apply_mask: bool) -> f32 {
var v = select(FLOAT_MIN,
f32(inter_shmem[kv_idx]) * params.scale,
f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]) * params.scale,
kv_idx < KV_TILE);
#ifdef LOGIT_SOFTCAP
v = params.logit_softcap * tanh(v);
#endif
#ifdef MASK
if (apply_mask) {
var mask_val = select(0.0, f32(mask_shmem[kv_idx]), kv_idx < KV_TILE);
var mask_val = select(0.0,f32(mask_shmem[q_tile_row * KV_TILE + kv_idx]), kv_idx < KV_TILE);
v += select(mask_val, slope * mask_val, has_bias);
}
#endif
@@ -239,17 +199,19 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
@builtin(subgroup_size) subgroup_size: u32,
@builtin(num_subgroups) num_subgroups: u32,
@builtin(subgroup_invocation_id) sg_inv_id: u32) {
// Vec path processes exactly one query row per workgroup, so subgroup 0 can
// keep the running softmax state in private storage.
var row_max = FLOAT_MIN;
var exp_sum = 0.0;
for (var i = local_id.x; i < HEAD_DIM_V; i += WG_SIZE) {
// initialize row max for online softmax
for (var i = local_id.x; i < Q_TILE; i += WG_SIZE) {
row_max_shmem[i] = FLOAT_MIN;
exp_sum_shmem[i] = 0.0;
}
for (var i = local_id.x; i < Q_TILE * HEAD_DIM_V; i += WG_SIZE) {
o_shmem[i] = 0.0;
}
// workgroups per head/batch
let wg_per_head = params.seq_len_q;
let wg_per_head = (params.seq_len_q + Q_TILE - 1u) / Q_TILE;
let wg_per_batch = wg_per_head * params.n_heads;
let dst2_stride = HEAD_DIM_V * params.n_heads;
@@ -273,9 +235,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let k_head_offset = k_batch_offset + k_head_idx * params.stride_k2;
let v_head_offset = v_batch_offset + v_head_idx * params.stride_v2;
// Vec path handles one Q row per workgroup.
// starting Q row for this workgroup
let wg_in_head = wg_in_batch % wg_per_head;
let q_row_start = wg_in_head;
let q_row_start = wg_in_head * Q_TILE;
#ifdef MASK
// mask offset
@@ -286,18 +248,21 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let has_bias = params.max_bias > 0.0;
let slope = select(1.0, select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), pow(params.m0, head + 1.0), head < params.n_head_log2), has_bias);
// load the single Q row into shared memory
for (var elem_idx = local_id.x; elem_idx < HEAD_DIM_QK; elem_idx += WG_SIZE) {
let global_q_row_offset = q_head_offset + q_row_start * params.stride_q1;
// load q tile into shared memory
for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
let q_row = elem_idx / HEAD_DIM_QK;
let q_col = elem_idx % HEAD_DIM_QK;
let head_q_row = q_row_start + q_row;
let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
q_shmem[elem_idx] = f16(select(
0.0,
Q[global_q_row_offset + elem_idx],
q_row_start < params.seq_len_q));
Q[global_q_row_offset + q_col],
head_q_row < params.seq_len_q && q_col < HEAD_DIM_QK));
}
for (var kv_tile = iwg * KV_TILE; kv_tile < params.seq_len_kv; kv_tile += KV_TILE * params.nwg) {
#ifdef BLK
let q_blk = q_row_start;
let q_blk = q_row_start / Q_TILE;
let kv_blk = kv_tile / KV_TILE;
let blk_batch = select(0u, batch_idx, params.stride_mask3 > 0u);
let blk_idx = params.blk_base + (blk_batch * params.blk_nblk1 + q_blk) * params.blk_nblk0 + kv_blk;
@@ -305,9 +270,13 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
#else
let blk_state_local = 1u;
#endif
let blk_state = blk_state_local;
if (local_id.x == 0u) {
blk_state_wg = blk_state_local;
}
workgroupBarrier();
let blk_state = blk_state_wg;
let skip_tile = blk_state == 0u;
for (var elem_idx = local_id.x; elem_idx < KV_TILE; elem_idx += WG_SIZE) {
for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
inter_shmem[elem_idx] = f16(0.0);
}
@@ -391,14 +360,20 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let num_of_threads = subgroup_size / VEC_NE;
let tx = sg_inv_id % num_of_threads;
let ty = sg_inv_id / num_of_threads;
if (subgroup_id == 0u && q_row_start < params.seq_len_q) {
for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) {
let global_q_row = q_row_start + q_tile_row;
if (global_q_row >= params.seq_len_q) {
continue;
}
let local_q_row_offset = q_tile_row * HEAD_DIM_QK;
for (var kv_base : u32 = 0u; kv_base < KV_TILE; kv_base += VEC_NE) {
let kv_idx = kv_base + ty;
var partial_sum: f32 = 0.0;
let kv_valid = kv_idx < KV_TILE && (kv_tile + kv_idx) < params.seq_len_kv;
if (kv_valid) {
for (var i = tx; i < (HEAD_DIM_QK / 4u); i += num_of_threads) {
let q_off = i * 4u;
let q_off = local_q_row_offset + i * 4u;
let qv = vec4<f32>(
f32(q_shmem[q_off + 0u]),
@@ -435,7 +410,8 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let sum_bcast = subgroupShuffle(sum, num_of_threads * ty);
if (tx == 0u && kv_valid) {
inter_shmem[kv_idx] = f16(sum_bcast);
let dst_idx = q_tile_row * KV_TILE + kv_idx;
inter_shmem[dst_idx] = f16(sum_bcast);
}
}
}
@@ -446,10 +422,13 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let apply_mask = !skip_tile && (blk_state != 2u);
if (apply_mask) {
// load mask tile into shared memory for this KV block
for (var elem_idx = local_id.x; elem_idx < KV_TILE; elem_idx += WG_SIZE) {
let global_k_col = kv_tile + elem_idx;
let mask_in_bounds = q_row_start < params.seq_len_q && global_k_col < params.seq_len_kv;
let mask_idx = mask_global_offset + global_k_col;
for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
let mask_row = elem_idx / KV_TILE;
let mask_col = elem_idx % KV_TILE;
let global_q_row = q_row_start + mask_row;
let global_k_col = kv_tile + mask_col;
let mask_in_bounds = global_q_row < params.seq_len_q && global_k_col < params.seq_len_kv;
let mask_idx = mask_global_offset + mask_row * params.seq_len_kv + global_k_col;
mask_shmem[elem_idx] = select(0.0, mask[mask_idx], mask_in_bounds);
}
}
@@ -460,40 +439,50 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
workgroupBarrier();
// online softmax
if (!skip_tile && subgroup_id == 0u && q_row_start < params.seq_len_q) {
var prev_max = row_max;
var final_max = prev_max;
// pass 1: compute final max across the full KV tile in chunks
for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
let kv_idx = kv_offset + sg_inv_id;
let kv_valid = kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE;
let softmax_term = select(FLOAT_MIN,
calc_softmax_term(kv_idx, slope, has_bias, apply_mask),
kv_valid);
final_max = subgroupMax(max(final_max, softmax_term));
}
var total_exp_term: f32 = 0.0;
// pass 2: compute exp sum and write P using final_max
for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
let kv_idx = kv_offset + sg_inv_id;
let softmax_term = calc_softmax_term(kv_idx, slope, has_bias, apply_mask);
let cur_p = select(0.0,
exp(softmax_term - final_max),
kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE);
total_exp_term += subgroupAdd(cur_p);
if (kv_idx < KV_TILE) {
inter_shmem[kv_idx] = f16(cur_p);
if (!skip_tile) {
for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) {
let global_q_row = q_row_start + q_tile_row;
if (global_q_row >= params.seq_len_q) {
break;
}
}
let cur_exp = exp(prev_max - final_max);
var prev_max = row_max_shmem[q_tile_row];
var final_max = prev_max;
// pass 1: compute final max across the full KV tile in chunks
for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
let kv_idx = kv_offset + sg_inv_id;
let kv_valid = kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE;
let softmax_term = select(FLOAT_MIN,
calc_softmax_term(kv_idx, q_tile_row, slope, has_bias, apply_mask),
kv_valid);
final_max = subgroupMax(max(final_max, softmax_term));
}
row_max = final_max;
exp_sum = exp_sum * cur_exp + total_exp_term;
var total_exp_term: f32 = 0.0;
// pass 2: compute exp sum and write P using final_max
for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
let kv_idx = kv_offset + sg_inv_id;
let softmax_term = calc_softmax_term(kv_idx, q_tile_row, slope, has_bias, apply_mask);
let cur_p = select(0.0,
exp(softmax_term - final_max),
kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE);
total_exp_term += subgroupAdd(cur_p);
if (kv_idx < KV_TILE) {
inter_shmem[kv_idx + q_tile_row * KV_TILE] = f16(cur_p);
}
}
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
o_shmem[elem_idx] = f16(f32(o_shmem[elem_idx]) * cur_exp);
let cur_exp = exp(prev_max - final_max);
if (sg_inv_id == 0) {
row_max_shmem[q_tile_row] = final_max;
exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * cur_exp + total_exp_term;
}
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
let idx = q_tile_row * HEAD_DIM_V + elem_idx;
o_shmem[idx] = f16(f32(o_shmem[idx]) * cur_exp);
}
}
}
@@ -573,13 +562,15 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
workgroupBarrier();
if (!skip_tile) {
// we have P (KV_TILE) in inter_shmem and V (KV_TILE x head_dim_v) in kv_shmem
// we have P (Q_TILE x KV_TILE) in inter_shmem and V (KV_TILE x head_dim_v) in kv_shmem
// we want to compute O += P * V across the full KV tile
let ne_threads : u32 = VEC_NE;
let nl_threads = max(1u, subgroup_size / ne_threads);
let tx_pv = sg_inv_id % nl_threads;
let ty_pv = sg_inv_id / nl_threads;
if (subgroup_id == 0u && q_row_start < params.seq_len_q) {
for (var q_tile_row = subgroup_id;
q_tile_row < Q_TILE;
q_tile_row += num_subgroups) {
for (var vec_col = tx_pv; vec_col < (HEAD_DIM_V / 4u); vec_col += nl_threads) {
var lo = vec4<f32>(0.0, 0.0, 0.0, 0.0);
for (var cc = 0u; cc < KV_TILE / ne_threads; cc += 1u) {
@@ -589,7 +580,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
continue;
}
let p = f32(inter_shmem[kv_idx]);
let p = f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]);
#ifdef KV_DIRECT
let v_idx = v_head_offset + v_row * params.stride_v1 + vec_col * 4u;
let v4 = vec4<f32>(V[v_idx >> 2u]);
@@ -630,10 +621,11 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
if (ty_pv == 0u) {
let elem_base = vec_col * 4u;
o_shmem[elem_base + 0u] = f16(f32(o_shmem[elem_base + 0u]) + lo_x);
o_shmem[elem_base + 1u] = f16(f32(o_shmem[elem_base + 1u]) + lo_y);
o_shmem[elem_base + 2u] = f16(f32(o_shmem[elem_base + 2u]) + lo_z);
o_shmem[elem_base + 3u] = f16(f32(o_shmem[elem_base + 3u]) + lo_w);
let o_base_idx = q_tile_row * HEAD_DIM_V + elem_base;
o_shmem[o_base_idx + 0u] = f16(f32(o_shmem[o_base_idx + 0u]) + lo_x);
o_shmem[o_base_idx + 1u] = f16(f32(o_shmem[o_base_idx + 1u]) + lo_y);
o_shmem[o_base_idx + 2u] = f16(f32(o_shmem[o_base_idx + 2u]) + lo_z);
o_shmem[o_base_idx + 3u] = f16(f32(o_shmem[o_base_idx + 3u]) + lo_w);
}
}
}
@@ -645,46 +637,70 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
#ifdef SINKS
// Sinks are global terms and must be applied exactly once across split workgroups.
if (iwg == 0u && subgroup_id == 0u && q_row_start < params.seq_len_q) {
var prev_max = row_max;
if (iwg == 0u) {
for (var q_tile_row = subgroup_id;
q_tile_row < Q_TILE;
q_tile_row += num_subgroups) {
let global_q_row = q_row_start + q_tile_row;
if (global_q_row >= params.seq_len_q) {
break;
}
// for non-sink threads, exp(FLOAT_MIN) effectively zeroes out their contribution to the sum
let sink_val = select(FLOAT_MIN, sinks[params.offset_sinks + head_idx], sg_inv_id == 0u);
let new_max = subgroupMax(max(prev_max, sink_val));
let max_exp = exp(prev_max - new_max);
let sink_exp = exp(sink_val - new_max);
var prev_max = row_max_shmem[q_tile_row];
let sink_exp_sum = subgroupAdd(sink_exp);
// for non-sink threads, exp(FLOAT_MIN) effectively zeroes out their contribution to the sum
let sink_val = select(FLOAT_MIN, sinks[params.offset_sinks + head_idx], sg_inv_id == 0);
let new_max = subgroupMax(max(prev_max, sink_val));
let max_exp = exp(prev_max - new_max);
let sink_exp = exp(sink_val - new_max);
row_max = new_max;
exp_sum = exp_sum * max_exp + sink_exp_sum;
let sink_exp_sum = subgroupAdd(sink_exp);
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
o_shmem[elem_idx] = f16(f32(o_shmem[elem_idx]) * max_exp);
if (sg_inv_id == 0) {
row_max_shmem[q_tile_row] = new_max;
exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * max_exp + sink_exp_sum;
}
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
let idx = q_tile_row * HEAD_DIM_V + elem_idx;
o_shmem[idx] = f16(f32(o_shmem[idx]) * max_exp);
}
}
workgroupBarrier();
}
workgroupBarrier();
#endif
let rows_per_batch = params.n_heads * params.seq_len_q;
if (subgroup_id == 0u && q_row_start < params.seq_len_q) {
for (var q_tile_row = subgroup_id;
q_tile_row < Q_TILE;
q_tile_row += num_subgroups) {
let global_q_row = q_row_start + q_tile_row;
if (global_q_row >= params.seq_len_q) { break; }
if (params.nwg == 1u) {
let exp_sum = exp_sum_shmem[q_tile_row];
let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0.0);
let row_base: u32 = params.offset_dst + batch_idx * dst3_stride + q_row_start * dst2_stride +
head_idx * HEAD_DIM_V;
let row_base: u32 =
params.offset_dst + batch_idx * dst3_stride + global_q_row * dst2_stride + head_idx * HEAD_DIM_V;
for (var elem_base = sg_inv_id * 4u; elem_base < HEAD_DIM_V; elem_base += subgroup_size * 4u) {
let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u);
let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u);
let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u);
let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u);
let v = vec4<f32>(
f32(o_shmem[elem_base + 0u]) * scale,
f32(o_shmem[elem_base + 1u]) * scale,
f32(o_shmem[elem_base + 2u]) * scale,
f32(o_shmem[elem_base + 3u]) * scale
f32(o_shmem[i0]) * scale,
f32(o_shmem[i1]) * scale,
f32(o_shmem[i2]) * scale,
f32(o_shmem[i3]) * scale
);
let dst_vec_index: u32 = (row_base + elem_base) >> 2u;
dst[dst_vec_index] = v;
}
} else {
let rid = batch_idx * rows_per_batch + head_idx * params.seq_len_q + q_row_start;
let rid = batch_idx * rows_per_batch + head_idx * params.seq_len_q + global_q_row;
let tmp_row_data_base = params.tmp_data_base + rid * (HEAD_DIM_V * params.nwg) + iwg * HEAD_DIM_V;
let tmp_row_stats_base = params.tmp_stats_base + rid * (2u * params.nwg) + 2u * iwg;
@@ -692,16 +708,21 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
elem_base < HEAD_DIM_V;
elem_base += subgroup_size * 4u) {
let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u);
let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u);
let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u);
let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u);
let tbase = tmp_row_data_base + elem_base;
tmp[tbase + 0u] = f32(o_shmem[elem_base + 0u]);
tmp[tbase + 1u] = f32(o_shmem[elem_base + 1u]);
tmp[tbase + 2u] = f32(o_shmem[elem_base + 2u]);
tmp[tbase + 3u] = f32(o_shmem[elem_base + 3u]);
tmp[tbase + 0u] = f32(o_shmem[i0]);
tmp[tbase + 1u] = f32(o_shmem[i1]);
tmp[tbase + 2u] = f32(o_shmem[i2]);
tmp[tbase + 3u] = f32(o_shmem[i3]);
}
if (sg_inv_id == 0u) {
tmp[tmp_row_stats_base + 0u] = exp_sum;
tmp[tmp_row_stats_base + 1u] = row_max;
tmp[tmp_row_stats_base + 0u] = exp_sum_shmem[q_tile_row];
tmp[tmp_row_stats_base + 1u] = row_max_shmem[q_tile_row];
}
}
}
@@ -1,101 +0,0 @@
#include "common_decls.tmpl"
enable f16;
@group(0) @binding(0)
#if defined(INPUT_F32)
var<storage, read_write> input: array<f32>;
#elif defined(INPUT_F16)
var<storage, read_write> input: array<f16>;
#endif
@group(0) @binding(1)
#if defined(OUTPUT_F32)
var<storage, read_write> output: array<f32>;
#elif defined(OUTPUT_F16)
var<storage, read_write> output: array<f16>;
#endif
// Uniform parameters for the patch-gather kernel below. Each output element
// (k, ow, oh, n) is copied from input element (iw, ih, ic, n), or zero-filled
// when the source position lies outside the input.
struct Params {
    // constant offsets added to every input / output index
    offset_i: u32,
    offset_o: u32,

    // element strides
    // si0..si3 multiply (iw, ih, ic, n) when indexing the input
    si0: u32, si1: u32, si2: u32, si3: u32,
    // so0..so3 multiply (k, ow, oh, n) when indexing the output
    so0: u32, so1: u32, so2: u32, so3: u32,

    // K = KW * KH * IC is the length of one gathered patch
    KW: u32, KH: u32, IC: u32,
    // IW / IH bound the valid source coordinates; N is the outermost count
    IW: u32, IH: u32, N: u32,
    // M = OW * OH is the number of output positions per n
    OW: u32, OH: u32,

    // stride
    // (s0 applies along the width axis, s1 along the height axis; same for p* and d*)
    s0: u32, s1: u32,
    // padding
    p0: u32, p1: u32,
    // dilation
    d0: u32, d1: u32,
}
@group(0) @binding(2)
var<uniform> params: Params;
// Read element `idx` of `input` as f32. The f16 variant widens the value;
// the f32 variant returns it unchanged.
// NOTE(review): when neither INPUT_F32 nor INPUT_F16 is defined the body is
// empty — presumably the host always defines exactly one; confirm.
fn load_input(idx: u32) -> f32 {
#if defined(INPUT_F32)
    return input[idx];
#elif defined(INPUT_F16)
    return f32(input[idx]);
#endif
}
// Write `val` to element `idx` of `output`, narrowing to f16 when the output
// buffer is declared as f16.
fn store_output(idx: u32, val: f32) {
#if defined(OUTPUT_F32)
    output[idx] = val;
#elif defined(OUTPUT_F16)
    output[idx] = f16(val);
#endif
}
// One invocation per output element. The flat output index is decoded into
// (n, oh, ow, ic, kh, kw); the matching input element is copied, or 0.0 is
// written when the source coordinate falls outside [0, IW) x [0, IH).
@compute @workgroup_size(WG_SIZE)
fn main(
    @builtin(global_invocation_id) gid: vec3<u32>,
    @builtin(num_workgroups) num_wg: vec3<u32>
) {
    // Flatten the 2D dispatch: one full row of the grid spans num_wg.x * WG_SIZE invocations.
    let row_span = num_wg.x * u32(WG_SIZE);
    let flat_idx = gid.x + row_span * gid.y;

    let K = params.KW * params.KH * params.IC;
    let M = params.OW * params.OH;
    let per_n = K * M;

    // Surplus invocations of the last workgroup(s) have nothing to do.
    if (flat_idx >= per_n * params.N) {
        return;
    }

    // flat_idx = (n * M + m) * K + k
    let n = flat_idx / per_n;
    let within_n = flat_idx % per_n;
    let m = within_n / K;
    let k = within_n % K;

    // m = oh * OW + ow
    let oh = m / params.OW;
    let ow = m % params.OW;

    // k = (ic * KH + kh) * KW + kw
    let kw = k % params.KW;
    let k_rest = k / params.KW;
    let kh = k_rest % params.KH;
    let ic = k_rest / params.KH;

    // Source coordinates are signed: padding can move them below zero.
    let src_x = i32(ow * params.s0 + kw * params.d0) - i32(params.p0);
    let src_y = i32(oh * params.s1 + kh * params.d1) - i32(params.p1);

    let out_idx = params.offset_o + k * params.so0 + ow * params.so1 + oh * params.so2 + n * params.so3;

    let inside = src_x >= 0 && src_x < i32(params.IW) && src_y >= 0 && src_y < i32(params.IH);
    if (!inside) {
        store_output(out_idx, 0.0);
        return;
    }

    let in_idx = params.offset_i + u32(src_x) * params.si0 + u32(src_y) * params.si1 + ic * params.si2 + n * params.si3;
    store_output(out_idx, load_input(in_idx));
}
File diff suppressed because it is too large Load Diff
@@ -1,154 +0,0 @@
// Four buffer layouts, selected at preprocessing time. Every variant defines
// update(), which stores scale * <rn value> * <mul value> at dst_offset; they
// differ only in which bindings are read and which one is written.
#ifdef OVERLAP

// Result is written into mul_src.
@group(0) @binding(0)
var<storage, read_write> rn_src: array<f32>;

@group(0) @binding(1)
var<storage, read_write> mul_src: array<f32>;

@group(0) @binding(2)
var<uniform> params: Params;

fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
    mul_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
}

#elif INPLACE

// Result is written into rn_src.
@group(0) @binding(0)
var<storage, read_write> rn_src: array<f32>;

@group(0) @binding(1)
var<storage, read_write> mul_src: array<f32>;

@group(0) @binding(2)
var<uniform> params: Params;

fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
    rn_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
}

#elif SRC_OVERLAP

// Both operands are read from one binding (merged_src); result goes to dst.
@group(0) @binding(0)
var<storage, read_write> merged_src: array<f32>;

@group(0) @binding(1)
var<storage, read_write> dst: array<f32>;

@group(0) @binding(2)
var<uniform> params: Params;

fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
    dst[dst_offset] = scale * merged_src[rn_src_offset] * merged_src[mul_src_offset];
}

#else

// Default: three separate bindings, with params moved to binding 3.
@group(0) @binding(0)
var<storage, read_write> rn_src: array<f32>;

@group(0) @binding(1)
var<storage, read_write> mul_src: array<f32>;

@group(0) @binding(2)
var<storage, read_write> dst: array<f32>;

@group(0) @binding(3)
var<uniform> params: Params;

fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
    dst[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
}

#endif
// Uniform parameters read by main().
struct Params {
    // base offsets of the two operands; the offset_merged_* values are added on top
    // when computing each operand's row start
    offset_rn_src: u32,
    offset_mul_src: u32,
    offset_merged_rn_src: u32,
    offset_merged_mul_src: u32,
    offset_dst: u32,

    // strides applied to the row coordinates (i1, i2, i3) of each tensor
    stride_rn_src1: u32,
    stride_rn_src2: u32,
    stride_rn_src3: u32,

    stride_mul_src1: u32,
    stride_mul_src2: u32,
    stride_mul_src3: u32,

    stride_dst1: u32,
    stride_dst2: u32,
    stride_dst3: u32,

    // extents of the multiplier; coordinates are taken modulo these, so the
    // multiplier repeats along any axis where it is smaller than the output
    mul_src_ne0: u32,
    mul_src_ne1: u32,
    mul_src_ne2: u32,
    mul_src_ne3: u32,

    // extents of the normalized tensor: ne0 is the row length, ne1..ne3 index rows
    ne0: u32,
    ne1: u32,
    ne2: u32,
    ne3: u32,

    // added to the mean of squares before the square root
    eps: f32
};

// one partial sum of squares per invocation, reduced in main()
var<workgroup> scratch: array<f32, WG_SIZE>;
// Computes, for one row, scale = 1 / sqrt(mean(x^2) + eps) and stores
// scale * x * mul for every column via update().
@compute @workgroup_size(WG_SIZE)
fn main(@builtin(workgroup_id) wid: vec3<u32>,
        @builtin(local_invocation_id) lid: vec3<u32>) {

    // one workgroup per row: wid.x selects the row, and the WG_SIZE invocations
    // stride across its columns
    var i = wid.x;
    let i3 = i / (params.ne2 * params.ne1);
    i = i % (params.ne2 * params.ne1);
    let i2 = i / params.ne1;
    let i1 = i % params.ne1;
    let i_rn_src_row = params.offset_rn_src + params.offset_merged_rn_src + i3 * params.stride_rn_src3 + i2 * params.stride_rn_src2 + i1 * params.stride_rn_src1;
    // the multiplier coordinates wrap, so a smaller multiplier repeats across the output
    let i_mul_src_row = params.offset_mul_src + params.offset_merged_mul_src + (i3 % params.mul_src_ne3) * params.stride_mul_src3 + (i2 % params.mul_src_ne2) * params.stride_mul_src2 + (i1 % params.mul_src_ne1) * params.stride_mul_src1;
    let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;

    // number of columns each invocation handles (ceil(ne0 / WG_SIZE))
    let elems = (params.ne0 + WG_SIZE - 1) / WG_SIZE;

    var sum = 0.0f;
    var col = lid.x;
    for (var j: u32 = 0; j < elems; j++) {
        if (col >= params.ne0) {
            break;
        }
#ifdef SRC_OVERLAP
        let x = merged_src[i_rn_src_row + col];
#else
        let x = rn_src[i_rn_src_row + col];
#endif
        // Square by multiplication: pow() with a negative base is not
        // well-defined in WGSL, and inputs here are signed.
        sum += x * x;
        col += WG_SIZE;
    }

    // Tree reduction of the per-invocation partial sums.
    // NOTE: the halving scheme only folds in every lane when WG_SIZE is a power of two.
    scratch[lid.x] = sum;
    workgroupBarrier();
    var offset: u32 = WG_SIZE / 2;
    while (offset > 0) {
        if (lid.x < offset) {
            scratch[lid.x] += scratch[lid.x + offset];
        }
        offset = offset / 2;
        workgroupBarrier();
    }
    sum = scratch[0];

    let scale = 1.0/sqrt(sum/f32(params.ne0) + params.eps);

    col = lid.x;
    for (var j: u32 = 0; j < elems; j++) {
        if (col >= params.ne0) {
            break;
        }
        update(i_rn_src_row + col, i_dst_row + col, scale, i_mul_src_row + col % params.mul_src_ne0);
        col += WG_SIZE;
    }
}
+20 -20
View File
@@ -7656,7 +7656,7 @@ size_t ggml_quantize_chunk(
int64_t nrows,
int64_t n_per_row,
const float * imatrix) {
const int64_t n = nrows * n_per_row;
const int64_t n = (int64_t) nrows * n_per_row;
if (ggml_quantize_requires_imatrix(type)) {
GGML_ASSERT(imatrix != NULL);
@@ -7673,21 +7673,21 @@ size_t ggml_quantize_chunk(
size_t result = 0;
switch (type) {
case GGML_TYPE_Q1_0: result = quantize_q1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0: result = quantize_q4_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_1: result = quantize_q4_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_0: result = quantize_q5_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_1: result = quantize_q5_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_0: result = quantize_q8_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_MXFP4: result = quantize_mxfp4 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_NVFP4: result = quantize_nvfp4 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q2_K: result = quantize_q2_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_K: result = quantize_q3_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_K: result = quantize_q4_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_K: result = quantize_q5_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q6_K: result = quantize_q6_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ1_0: result = quantize_tq1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ2_0: result = quantize_tq2_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q1_0: result = quantize_q1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_MXFP4: result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_NVFP4: result = quantize_nvfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
@@ -7752,9 +7752,9 @@ struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
}
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}
-20
View File
@@ -197,7 +197,6 @@ class Keys:
FREQ_BASE_SWA = "{arch}.rope.freq_base_swa"
SCALING_TYPE = "{arch}.rope.scaling.type"
SCALING_FACTOR = "{arch}.rope.scaling.factor"
SCALING_ALPHA = "{arch}.rope.scaling.alpha"
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
@@ -472,7 +471,6 @@ class MODEL_ARCH(IntEnum):
ERNIE4_5_MOE = auto()
HUNYUAN_MOE = auto()
HUNYUAN_DENSE = auto()
HUNYUAN_VL = auto()
SMOLLM3 = auto()
GPT_OSS = auto()
LFM2 = auto()
@@ -959,7 +957,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.FALCON_H1: "falcon-h1",
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense",
MODEL_ARCH.HUNYUAN_VL: "hunyuan_vl",
MODEL_ARCH.SMOLLM3: "smollm3",
MODEL_ARCH.GPT_OSS: "gpt-oss",
MODEL_ARCH.LFM2: "lfm2",
@@ -3492,22 +3489,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.HUNYUAN_VL: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.SMOLLM3: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -4157,7 +4138,6 @@ class VisionProjectorType:
YOUTUVL = "youtuvl"
NEMOTRON_V2_VL = "nemotron_v2_vl"
HUNYUANOCR = "hunyuanocr"
HUNYUANVL = "hunyuanvl"
# Items here are (block size, type size)
-3
View File
@@ -973,9 +973,6 @@ class GGUFWriter:
def add_rope_scaling_factor(self, value: float) -> None:
self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
def add_rope_scaling_alpha(self, value: float) -> None:
self.add_float32(Keys.Rope.SCALING_ALPHA.format(arch=self.arch), value)
def add_rope_scaling_attn_factors(self, value: float) -> None:
self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
+24
View File
@@ -511,6 +511,27 @@ extern "C" {
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
enum llama_params_fit_status {
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
};
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
// - returns LLAMA_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified
// with the exception of the context size which is modified if and only if equal to 0
LLAMA_API enum llama_params_fit_status llama_params_fit(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams,
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
size_t * margins, // margins of memory to leave per device in bytes
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
LLAMA_API int64_t llama_time_us(void);
LLAMA_API size_t llama_max_devices(void);
@@ -1525,6 +1546,9 @@ extern "C" {
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
// print a breakdown of per-device memory use via LLAMA_LOG:
LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
//
// training
//
-991
View File
@@ -1,991 +0,0 @@
#!/usr/bin/env python3
"""
Test parallel tool-calling capability via chat completions endpoint.
Only run this against models that actually support parallel tool calls; this
script does not attempt to toggle that setting on the server. Each scenario is
explicitly worded so that a capable model SHOULD emit multiple tool calls in a
single assistant turn (either the same tool N times, or several different
tools at once).
Each test case contains:
- tools: list of tool definitions (OpenAI-compatible)
- messages: initial conversation messages
- mock_tool_responses: dict mapping tool_name -> callable(arguments) -> str (JSON)
- expected_parallel: dict describing what constitutes a successful parallel turn
    {"min_parallel": int,                      # minimum tool_calls in one turn (default 2)
     "require_same_tool": Optional[str],       # all parallel calls must be this tool
     "require_distinct_tools": Optional[int],  # >= N distinct tool names in one turn
     "min_distinct_args_key": Optional[str],   # parallel calls must carry distinct
                                               # values for this argument key
     "min_distinct_args_count": int}           # how many distinct values are required
                                               # (defaults to min_parallel)
- validate: callable(turns, all_tool_calls, final_content) -> (passed, reason)
"""
import argparse
import json
import requests
import sys
# ---------------------------------------------------------------------------
# Color / formatting helpers
# ---------------------------------------------------------------------------
# ANSI escape sequences used to style terminal output. RESET clears all
# attributes; BOLD and DIM change intensity; the rest select a foreground colour.
RESET = "\x1b[0m"
BOLD = "\x1b[1m"
DIM = "\x1b[2m"
CYAN = "\x1b[36m"
YELLOW = "\x1b[33m"
GREEN = "\x1b[32m"
RED = "\x1b[31m"
BLUE = "\x1b[34m"
WHITE = "\x1b[97m"
MAGENTA = "\x1b[35m"
def _print(text="", end="\n"):
    """Write ``text`` followed by ``end`` to stdout and flush straight away."""
    stream = sys.stdout
    stream.write(text + end)
    stream.flush()
def print_header(title):
    """Print a three-line banner: rule, padded ``title``, rule.

    The title is right-padded with spaces up to 58 columns.
    """
    # NOTE(review): the repeated string is empty as written, so the rule lines
    # carry only colour codes — confirm the intended rule character.
    rule = "" * 60
    padding = " " * max(0, 58 - len(title))
    _print(f"\n{BOLD}{CYAN}{rule}{RESET}")
    _print(f"{BOLD}{CYAN}{WHITE}{title}{CYAN}{padding}{RESET}")
    _print(f"{BOLD}{CYAN}{rule}{RESET}")
def print_turn_banner(turn_idx, n_calls):
    """Announce a tool-calling turn; turns with two or more calls are highlighted."""
    if n_calls >= 2:
        tint = MAGENTA
    else:
        tint = DIM
    banner = f"\n {BOLD}{tint}▶ turn {turn_idx}{n_calls} tool call(s){RESET}"
    _print(banner)
def print_tool_call(name, args):
    """Print one tool invocation as ``name(<JSON-encoded args>)``."""
    encoded = json.dumps(args)
    line = f" {BOLD}{YELLOW}{name}{RESET}{DIM}({encoded}){RESET}"
    _print(line)
def print_tool_result(result):
    """Print at most the first 140 characters of a tool result string."""
    head = result[:140]
    # NOTE(review): both branches yield an empty suffix as written — confirm
    # whether a truncation marker was intended for long results.
    tail = "" if len(result) > 140 else ""
    _print(f" {DIM}{BLUE}{head + tail}{RESET}")
def print_model_output(text):
    """Echo model text to stdout verbatim (no newline appended) and flush."""
    stream = sys.stdout
    stream.write(text)
    stream.flush()
def print_pass(reason):
    """Print a green PASS line followed by ``reason``."""
    line = f"\n{BOLD}{GREEN}✔ PASS{RESET} {reason}"
    _print(line)
def print_fail(reason):
    """Print a red FAIL line followed by ``reason``."""
    line = f"\n{BOLD}{RED}✘ FAIL{RESET} {reason}"
    _print(line)
def print_info(msg):
    """Print ``msg`` in dim text."""
    line = f"{DIM}{msg}{RESET}"
    _print(line)
def print_warn(msg):
    """Print ``msg`` in bold yellow."""
    line = f"{BOLD}{YELLOW}{msg}{RESET}"
    _print(line)
# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------
def _merge_tool_call_delta(tool_calls, fragment):
    """Fold one streamed ``tool_calls`` fragment into ``tool_calls`` in place.

    Fragments are addressed by ``index``; their string fields are concatenated
    onto the accumulated call. Missing or ``null`` fields contribute nothing.
    """
    idx = fragment.get("index") or 0
    while len(tool_calls) <= idx:
        tool_calls.append(
            {
                "id": "",
                "type": "function",
                "function": {"name": "", "arguments": ""},
            }
        )
    slot = tool_calls[idx]
    slot["id"] += fragment.get("id") or ""
    fn = fragment.get("function") or {}
    slot["function"]["name"] += fn.get("name") or ""
    slot["function"]["arguments"] += fn.get("arguments") or ""


def chat_completion(url, messages, tools=None, stream=False):
    """POST one chat-completion request and return the assistant's reply.

    Args:
        url: Full URL of the chat completions endpoint.
        messages: Conversation so far, as a list of message dicts.
        tools: Optional list of tool definitions. When given, ``tool_choice``
            is set to ``"auto"``.
        stream: When true, read the reply as ``data: ...`` event lines and
            echo content deltas to stdout as they arrive.

    Returns:
        ``{"content": str, "tool_calls": list}``, plus ``"reasoning_content"``
        when the server sent any. ``None`` if the request failed or the
        non-streamed body was not valid JSON (reported via ``print_fail``).
    """
    payload = {
        "messages": messages,
        "stream": stream,
        "max_tokens": 4096,
    }
    if tools:
        payload["tools"] = tools
        payload["tool_choice"] = "auto"

    try:
        response = requests.post(url, json=payload, stream=stream)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        body = e.response.content if (e.response is not None) else b""
        print_fail(f"Request error: {e} | body: {body}")
        return None

    full_content = ""
    reasoning_content = ""
    tool_calls: list[dict] = []

    # Always release the connection, including when the stream loop stops early
    # at "[DONE]".
    try:
        if stream:
            for line in response.iter_lines():
                if not line:
                    continue
                decoded = line.decode("utf-8")
                if not decoded.startswith("data: "):
                    continue
                data_str = decoded[6:]
                if data_str == "[DONE]":
                    break
                try:
                    data = json.loads(data_str)
                except json.JSONDecodeError:
                    continue

                choices = data.get("choices") or []
                if not choices:
                    continue

                # Fields may be present but null; treat null like absent.
                delta = choices[0].get("delta") or {}
                if delta.get("reasoning_content"):
                    reasoning_content += delta["reasoning_content"]
                if delta.get("content"):
                    full_content += delta["content"]
                    print_model_output(delta["content"])
                for fragment in delta.get("tool_calls") or []:
                    _merge_tool_call_delta(tool_calls, fragment)
        else:
            try:
                data = response.json()
            except ValueError as e:
                print_fail(f"Invalid JSON in response: {e}")
                return None
            choices = data.get("choices") or []
            if choices:
                msg = choices[0].get("message") or {}
                full_content = msg.get("content") or ""
                reasoning_content = msg.get("reasoning_content") or ""
                tool_calls = msg.get("tool_calls") or []
                if full_content:
                    print_model_output(full_content)
    finally:
        response.close()

    result = {"content": full_content, "tool_calls": tool_calls}
    if reasoning_content:
        result["reasoning_content"] = reasoning_content
    return result
def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6):
    """
    Drive the multi-turn tool-call loop, but record each turn's tool calls
    separately so parallelism can be validated.

    Every tool call is answered from ``mock_tool_responses`` (tool name ->
    callable taking the decoded argument dict and returning a string). Unknown
    tools get a JSON error payload.

    Returns (turns, all_tool_calls, final_content) where `turns` is a list
    of dicts: {"index": int, "tool_calls": [...], "content": str}.
    ``final_content`` is None when a request failed or ``max_turns`` was
    reached while the model was still calling tools.
    """
    msgs = list(messages)
    turns: list[dict] = []
    all_tool_calls: list[dict] = []

    for turn_idx in range(max_turns):
        result = chat_completion(url, msgs, tools=tools, stream=stream)
        if result is None:
            return turns, all_tool_calls, None

        tcs = result.get("tool_calls") or []
        content = result.get("content") or ""

        turns.append(
            {"index": turn_idx, "tool_calls": list(tcs), "content": content}
        )

        if not tcs:
            if content:
                _print(f"\n{DIM}{'·' * 60}{RESET}")
                _print(f"{DIM} model response:{RESET}\n")
            return turns, all_tool_calls, content

        print_turn_banner(turn_idx, len(tcs))
        all_tool_calls.extend(tcs)

        assistant_msg: dict = {
            "role": "assistant",
            "content": content,
            "tool_calls": tcs,
        }
        reasoning = result.get("reasoning_content")
        if reasoning:
            assistant_msg["reasoning_content"] = reasoning
        msgs.append(assistant_msg)

        for tc in tcs:
            tool_name = tc["function"]["name"]
            raw_args = tc["function"]["arguments"]
            if isinstance(raw_args, dict):
                # Already decoded by the server; use as-is.
                args = raw_args
            else:
                try:
                    args = json.loads(raw_args)
                except (json.JSONDecodeError, TypeError):
                    # Malformed or missing arguments: fall back to no arguments.
                    args = {}
            # The mocks call args.get(...), so valid-but-non-object JSON
            # (a list, a number, ...) must not reach them.
            if not isinstance(args, dict):
                args = {}

            print_tool_call(tool_name, args)

            mock_fn = mock_tool_responses.get(tool_name)
            if mock_fn:
                tool_result = mock_fn(args)
            else:
                tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"})

            print_tool_result(tool_result)

            msgs.append(
                {
                    "role": "tool",
                    "tool_call_id": tc.get("id", ""),
                    "content": tool_result,
                }
            )

    return turns, all_tool_calls, None
# ---------------------------------------------------------------------------
# Parallelism helpers
# ---------------------------------------------------------------------------
def _best_parallel_turn(turns):
    """Pick the turn with the most tool calls (earliest wins ties), or None if no turn has any."""
    best = None
    for turn in turns:
        calls = turn["tool_calls"]
        if not calls:
            continue
        if best is None or len(calls) > len(best["tool_calls"]):
            best = turn
    return best
def _distinct_tool_names(turn):
    """Collect the set of function names called in ``turn``."""
    names = set()
    for call in turn["tool_calls"]:
        names.add(call["function"]["name"])
    return names
def _distinct_arg_values(turn, key):
    """Return the distinct values passed for argument ``key`` across a turn.

    Strings are compared after stripping whitespace and lower-casing. Lists
    and dicts are represented by their canonical JSON text so they can live
    in a set. Calls whose arguments are missing, are not valid JSON, or do
    not decode to an object are skipped, as are calls where ``key`` is absent
    or null.
    """
    values = set()
    for tc in turn["tool_calls"]:
        try:
            args = json.loads(tc["function"]["arguments"])
        except (json.JSONDecodeError, TypeError):
            # TypeError: arguments was None or some other non-string.
            continue
        if not isinstance(args, dict):
            continue
        v = args.get(key)
        if v is None:
            continue
        if isinstance(v, str):
            v = v.strip().lower()
        elif isinstance(v, (list, dict)):
            # Unhashable; use a stable textual form instead.
            v = json.dumps(v, sort_keys=True)
        values.add(v)
    return values
def _parallel_constraint_failure(turn, expected, min_parallel):
    """Return why ``turn`` violates the tool/argument constraints, or None if it meets them."""
    names = [tc["function"]["name"] for tc in turn["tool_calls"]]

    require_same = expected.get("require_same_tool")
    if require_same is not None and any(n != require_same for n in names):
        return f"Parallel turn mixed tools; expected all {require_same!r}, got {names}"

    require_distinct = expected.get("require_distinct_tools")
    if require_distinct is not None:
        distinct = set(names)
        if len(distinct) < require_distinct:
            return (
                f"Parallel turn had only {len(distinct)} distinct tool names "
                f"({distinct}); need >= {require_distinct}"
            )

    distinct_key = expected.get("min_distinct_args_key")
    if distinct_key is not None:
        distinct_count = expected.get("min_distinct_args_count", min_parallel)
        values = _distinct_arg_values(turn, distinct_key)
        if len(values) < distinct_count:
            return (
                f"Parallel turn had only {len(values)} distinct {distinct_key!r} "
                f"values ({values}); need >= {distinct_count}"
            )

    return None


def _check_parallel(turns, expected):
    """
    Check that at least one turn satisfies the parallel-call expectations.

    Turns are tried from most to fewest tool calls, so a large turn that breaks
    a constraint no longer hides a smaller turn that meets all of them. When no
    turn qualifies, the reason reported is that of the largest turn.

    Returns (ok, reason).
    """
    tool_turns = [t for t in turns if t["tool_calls"]]
    if not tool_turns:
        return False, "No tool calls were made at all"

    # sorted() is stable even with reverse=True, so equally sized turns keep
    # conversation order and the earliest largest turn is tried first.
    candidates = sorted(tool_turns, key=lambda t: len(t["tool_calls"]), reverse=True)

    min_parallel = expected.get("min_parallel", 2)
    if len(candidates[0]["tool_calls"]) < min_parallel:
        by_turn = [len(t["tool_calls"]) for t in turns]
        return False, (
            f"No turn had >= {min_parallel} parallel tool calls "
            f"(per-turn counts: {by_turn})"
        )

    first_failure = None
    for turn in candidates:
        if len(turn["tool_calls"]) < min_parallel:
            break  # every remaining turn is smaller still
        failure = _parallel_constraint_failure(turn, expected, min_parallel)
        if failure is None:
            n_distinct = len({tc["function"]["name"] for tc in turn["tool_calls"]})
            return True, (
                f"Parallel turn had {len(turn['tool_calls'])} calls across "
                f"{n_distinct} distinct tool(s)"
            )
        if first_failure is None:
            first_failure = failure

    return False, first_failure
# ---------------------------------------------------------------------------
# Test case runner
# ---------------------------------------------------------------------------
def run_test(url, test_case, stream):
    """Run one scenario end to end, print PASS/FAIL, and return whether it passed.

    A scenario passes only if the server answered, some turn met the
    ``expected_parallel`` criteria, and the scenario's own ``validate``
    callable accepted the outcome.
    """
    mode = "stream" if stream else "non-stream"
    print_header(f"{test_case['name']} [{mode}]")

    turns, all_tool_calls, final_content = run_agentic_loop(
        url,
        messages=test_case["messages"],
        tools=test_case["tools"],
        mock_tool_responses=test_case["mock_tool_responses"],
        stream=stream,
    )

    # An empty turn list means the very first request failed.
    if not turns:
        print_fail("No response from server.")
        return False

    parallel_ok, parallel_reason = _check_parallel(turns, test_case["expected_parallel"])
    if not parallel_ok:
        print_fail(parallel_reason)
        return False

    passed, reason = test_case["validate"](turns, all_tool_calls, final_content)
    if not passed:
        print_fail(reason)
    else:
        print_pass(f"{parallel_reason}; {reason}")
    return passed
# ---------------------------------------------------------------------------
# Test case definitions
# ---------------------------------------------------------------------------
# ---- Test 1: Multi-file read (same tool, multiple distinct paths) ----
# Tool schema offered to the model for the multi-file scenario: a single
# `read_file` function taking one required string argument, `path`.
_FILE_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "read_file",
            "description": (
                "Read the full contents of a file from the local filesystem. "
                "Call this tool in parallel when asked to read several files — "
                "each path needs its own call."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Absolute or repo-relative path to a file",
                    },
                },
                "required": ["path"],
            },
        },
    },
]

# Canned file bodies served by _read_file_mock, keyed by repo-relative path.
_FILE_CONTENTS = {
    "config/database.yml": "host: db.internal\nport: 5432\nuser: svc_app\n",
    "config/redis.yml": "host: cache.internal\nport: 6379\ndb: 0\n",
    "config/queue.yml": "broker: rabbitmq.internal\nport: 5672\nvhost: prod\n",
    "config/auth.yml": "provider: oidc\nissuer: https://auth.internal\n",
}
def _read_file_mock(args):
    """Mock for the ``read_file`` tool.

    Looks ``args["path"]`` up in ``_FILE_CONTENTS``: first with leading dots
    and slashes removed, then by suffix so absolute paths ending in a known
    key also resolve.

    Returns a JSON string, ``{"path", "content"}`` on a hit and
    ``{"path", "error": "not found"}`` otherwise. A missing or non-string
    path counts as not found.
    """
    path = args.get("path", "")
    content = None
    if isinstance(path, str):
        # lstrip takes a character set: this drops any leading run of "." and
        # "/" ("./", "/", "../", ...), not just a literal "./" prefix.
        content = _FILE_CONTENTS.get(path.lstrip("./"))
        if content is None:
            for known, text in _FILE_CONTENTS.items():
                if path.endswith(known):
                    content = text
                    break
    if content is None:
        return json.dumps({"path": path, "error": "not found"})
    return json.dumps({"path": path, "content": content})
# Scenario 1: the model must issue four read_file calls, each with a different
# `path`, within a single assistant turn.
MULTIFILE_READ_TEST = {
    "name": "Parallel multi-file read (same tool, 4 distinct paths)",
    "tools": _FILE_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "Please read all four of these config files so I can review them "
                "together: config/database.yml, config/redis.yml, config/queue.yml, "
                "and config/auth.yml. Call read_file for every path in parallel in "
                "a single batch — do NOT read them one by one sequentially across "
                "turns. After you have all four, give me a one-line summary of each."
            ),
        }
    ],
    "mock_tool_responses": {"read_file": _read_file_mock},
    "expected_parallel": {
        "min_parallel": 4,
        "require_same_tool": "read_file",
        "min_distinct_args_key": "path",
        "min_distinct_args_count": 4,
    },
    # Wrapped in a lambda because _validate_multifile is defined further down;
    # the name is only resolved when the validator is actually called.
    "validate": lambda turns, tcs, content: _validate_multifile(turns, tcs, content),
}
def _validate_multifile(turns, tcs, content):
    """Pass when the model produced a non-empty final answer; returns (passed, reason)."""
    del turns  # accepted for signature compatibility, not inspected
    if content:
        return True, f"{len(tcs)} total read_file calls; content length={len(content)}"
    return False, "No final summary produced"
# ---- Test 2: Batch TODO marking (same tool, N calls in one turn) ----
# Tool schema for the TODO scenario: `mark_todo_complete` takes a required
# `todo_id` string and an optional `note` string.
_TODO_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "mark_todo_complete",
            "description": (
                "Mark a single TODO item as complete by ID. When the user wants "
                "several items marked at once, call this tool in parallel — "
                "one call per item — rather than sequentially across turns."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "todo_id": {
                        "type": "string",
                        "description": "Identifier of the TODO item",
                    },
                    "note": {
                        "type": "string",
                        "description": "Optional completion note",
                    },
                },
                "required": ["todo_id"],
            },
        },
    },
]

# Known TODO ids and their titles, consulted by _mark_todo_mock.
_TODO_DB = {
    "T-101": "Draft onboarding doc",
    "T-102": "Update dependency lockfile",
    "T-103": "Fix flaky login test",
    "T-104": "Rotate service credentials",
    "T-105": "Archive Q4 reports",
}
def _mark_todo_mock(args):
    """Mock for the ``mark_todo_complete`` tool.

    Returns a JSON string: ``{"todo_id", "title", "status": "done"}`` when
    ``args["todo_id"]`` is a known id, otherwise
    ``{"todo_id", "error": "unknown id"}``. A non-string id (which may be
    unhashable) is reported as unknown instead of raising.
    """
    tid = args.get("todo_id", "")
    if isinstance(tid, str) and tid in _TODO_DB:
        return json.dumps({"todo_id": tid, "title": _TODO_DB[tid], "status": "done"})
    return json.dumps({"todo_id": tid, "error": "unknown id"})
# Scenario 2: the model must issue five mark_todo_complete calls, each with a
# different `todo_id`, within a single assistant turn.
TODO_BATCH_TEST = {
    "name": "Batch TODO completion (same tool, 5 IDs in one turn)",
    "tools": _TODO_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "I finished every item on today's list. Please mark all of the "
                "following TODOs as complete, in one parallel batch: T-101, T-102, "
                "T-103, T-104, T-105. Don't mark them one at a time across separate "
                "turns — issue all five mark_todo_complete calls at once. Afterwards "
                "confirm which ones succeeded."
            ),
        }
    ],
    "mock_tool_responses": {"mark_todo_complete": _mark_todo_mock},
    "expected_parallel": {
        "min_parallel": 5,
        "require_same_tool": "mark_todo_complete",
        "min_distinct_args_key": "todo_id",
        "min_distinct_args_count": 5,
    },
    # Wrapped in a lambda because _validate_todo is defined further down;
    # the name is only resolved when the validator is actually called.
    "validate": lambda turns, tcs, content: _validate_todo(turns, tcs, content),
}
def _validate_todo(turns, tcs, content):
    """Validate the batch-TODO scenario.

    Passes when the model produced any final confirmation text.

    Args:
        turns: Ignored; accepted so every validator shares one signature.
        tcs: Sized collection of tool calls; only its length is reported.
        content: Final assistant text (may be empty or ``None``).

    Returns:
        A ``(passed, message)`` tuple.
    """
    del turns  # unused by this validator
    return (
        (True, f"{len(tcs)} total mark_todo_complete calls")
        if content
        else (False, "No confirmation summary produced")
    )
# ---- Test 3: Multi-city weather (same tool, N parallel locations) ----
# Tool schema for the multi-city weather scenario: a single function,
# get_weather, with a required "city" and an optional "units" enum.
_WEATHER_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            # The description itself asks for one parallel call per city.
            "description": (
                "Fetch current weather for ONE city. When the user asks about "
                "several cities, call this tool in parallel — one call per city — "
                "instead of sequentially."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name"},
                    "units": {
                        "type": "string",
                        "enum": ["metric", "imperial"],
                        "default": "metric",
                    },
                },
                "required": ["city"],
            },
        },
    },
]
# Canned weather records, keyed by lower-cased city name.
_WEATHER_DB = {
    "tokyo": {"city": "Tokyo", "temp_c": 18.4, "condition": "partly cloudy", "humidity": 64},
    "london": {"city": "London", "temp_c": 9.1, "condition": "overcast", "humidity": 81},
    "new york": {"city": "New York", "temp_c": 12.7, "condition": "clear", "humidity": 55},
    "paris": {"city": "Paris", "temp_c": 11.3, "condition": "light rain", "humidity": 78},
}


def _weather_mock(args):
    """Return the canned JSON reply for one ``get_weather`` call.

    The city is matched case-insensitively after trimming whitespace; anything
    starting with "new york" (e.g. "New York, NY") maps to the New York record.

    Args:
        args: Decoded tool-call arguments; ``city`` is read, other keys are
            ignored.

    Returns:
        A JSON string with the weather record, or
        ``{"city": <value as given>, "error": "unknown city"}`` when the city
        is missing, unknown, or not a string.
    """
    requested = args.get("city", "")
    # Model-generated arguments are not guaranteed to match the schema: a
    # null or numeric "city" must yield an error reply, not an AttributeError.
    city = requested.strip().lower() if isinstance(requested, str) else ""
    if city.startswith("new york"):
        city = "new york"
    record = _WEATHER_DB.get(city)
    if record is not None:
        return json.dumps(record)
    return json.dumps({"city": requested, "error": "unknown city"})
# Scenario: a single user message asks for get_weather on four cities in one
# turn, followed by a warmest-to-coolest ranking.
MULTI_WEATHER_TEST = {
    "name": "Parallel multi-city weather (same tool, 4 cities)",
    "tools": _WEATHER_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "I'm comparing today's weather across four cities for a travel "
                "decision: Tokyo, London, New York, and Paris. Please call "
                "get_weather for all four in parallel in a single turn — don't "
                "fetch them one at a time. Then rank them from warmest to coolest."
            ),
        }
    ],
    # Maps a tool name to the callable that produces its canned reply.
    "mock_tool_responses": {"get_weather": _weather_mock},
    # Parallelism expectations for this scenario. How these keys are enforced
    # is up to the test runner, which is not part of this block.
    "expected_parallel": {
        "min_parallel": 4,
        "require_same_tool": "get_weather",
        "min_distinct_args_key": "city",
        "min_distinct_args_count": 4,
    },
    # The lambda defers the lookup of _validate_weather until the validator is
    # actually called, so the function may be defined after this dict.
    "validate": lambda turns, tcs, content: _validate_weather(turns, tcs, content),
}
def _validate_weather(turns, tcs, content):
    """Validate the multi-city weather scenario.

    Passes when the final text contains at least one ranking-related keyword
    ("warmest", "rank", "hot" or "cool"), matched case-insensitively as a
    plain substring.

    Args:
        turns: Ignored; accepted so every validator shares one signature.
        tcs: Sized collection of tool calls; only its length is reported.
        content: Final assistant text (may be empty or ``None``).

    Returns:
        A ``(passed, message)`` tuple.
    """
    del turns  # unused by this validator
    ranking_keywords = ("warmest", "rank", "hot", "cool")
    lowered = content.lower() if content else ""
    if any(keyword in lowered for keyword in ranking_keywords):
        return True, f"{len(tcs)} total get_weather calls; ranking produced"
    return False, f"Final content missing a ranking: {content!r}"
# ---- Test 4: Trip planning (different tools, parallel in one turn) ----
# Tool schemas for the trip-planning scenario: three different functions
# (search_flights, search_hotels, search_restaurants).
_TRIP_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "search_flights",
            "description": "Search one-way flights between two airports on a given date.",
            "parameters": {
                "type": "object",
                "properties": {
                    "from_airport": {"type": "string", "description": "IATA code, e.g. SFO"},
                    "to_airport": {"type": "string", "description": "IATA code, e.g. JFK"},
                    "date": {"type": "string", "description": "YYYY-MM-DD"},
                },
                "required": ["from_airport", "to_airport", "date"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_hotels",
            "description": "Search hotels in a city for a date range.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string"},
                    "check_in": {"type": "string", "description": "YYYY-MM-DD"},
                    "check_out": {"type": "string", "description": "YYYY-MM-DD"},
                    # Optional: not listed under "required" below.
                    "max_price": {"type": "integer"},
                },
                "required": ["city", "check_in", "check_out"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_restaurants",
            "description": "Search restaurants in a city by cuisine.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string"},
                    # Optional: not listed under "required" below.
                    "cuisine": {"type": "string"},
                },
                "required": ["city"],
            },
        },
    },
]
# Fixed payload returned for every search_flights call.
_FLIGHTS_RESULT = {
    "results": [
        {"flight": "UA 1552", "depart": "08:15", "arrive": "16:45", "price": 389},
        {"flight": "AA 20", "depart": "10:00", "arrive": "18:35", "price": 412},
    ]
}
# Fixed payload returned for every search_hotels call.
_HOTELS_RESULT = {
    "results": [
        {"name": "Midtown Grand", "nightly_rate": 245, "rating": 4.3},
        {"name": "Harbour Boutique", "nightly_rate": 312, "rating": 4.6},
    ]
}
# Fixed payload returned for every search_restaurants call.
_RESTAURANTS_RESULT = {
    "results": [
        {"name": "Trattoria Nona", "cuisine": "italian", "rating": 4.5},
        {"name": "Osteria Blu", "cuisine": "italian", "rating": 4.4},
    ]
}
# Scenario: a single user message asks for three different tools
# (flights, hotels, restaurants) to be called together in one turn.
TRIP_PLAN_TEST = {
    "name": "Trip planning (3 different tools in parallel)",
    "tools": _TRIP_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "I'm flying from SFO to JFK on 2026-06-12 and staying four nights "
                "(check out 2026-06-16). I'd also like some Italian restaurant "
                "suggestions in New York. Please call search_flights, search_hotels, "
                "and search_restaurants in parallel — all three in a single turn, "
                "since they don't depend on each other. Then give me a concise "
                "travel summary."
            ),
        }
    ],
    # Each mock ignores its arguments and returns the same canned payload.
    "mock_tool_responses": {
        "search_flights": lambda _: json.dumps(_FLIGHTS_RESULT),
        "search_hotels": lambda _: json.dumps(_HOTELS_RESULT),
        "search_restaurants": lambda _: json.dumps(_RESTAURANTS_RESULT),
    },
    # Parallelism expectations for this scenario. How these keys are enforced
    # is up to the test runner, which is not part of this block.
    "expected_parallel": {
        "min_parallel": 3,
        "require_distinct_tools": 3,
    },
    # The lambda defers the lookup of _validate_trip until the validator is
    # actually called, so the function may be defined after this dict.
    "validate": lambda turns, tcs, content: _validate_trip(turns, tcs, content),
}
def _validate_trip(turns, tcs, content):
    """Validate the trip-planning scenario.

    Passes when all three search tools were called at least once and the
    model produced some final text.

    Args:
        turns: Ignored; accepted so every validator shares one signature.
        tcs: Iterable of tool calls, each indexable as
            ``tc["function"]["name"]``.
        content: Final assistant text (may be empty or ``None``).

    Returns:
        A ``(passed, message)`` tuple.
    """
    del turns  # unused by this validator
    invoked = {call["function"]["name"] for call in tcs}
    required = {"search_flights", "search_hotels", "search_restaurants"}
    missing = required.difference(invoked)
    if missing:
        return False, f"Missing tool calls: {missing}"
    if content:
        return True, f"All three tools called; summary length={len(content)}"
    return False, "No travel summary produced"
# ---- Test 5: Portfolio check (same tool, parallel tickers) ----
# Tool schema for the portfolio scenario: a single function, get_stock_quote,
# with one required "symbol" argument.
_STOCK_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_stock_quote",
            # The description itself asks for one parallel call per symbol.
            "description": (
                "Get the latest quote for ONE ticker. When the user asks about "
                "multiple tickers, call this tool in parallel — one per symbol — "
                "rather than sequentially."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "symbol": {"type": "string", "description": "Ticker symbol"},
                },
                "required": ["symbol"],
            },
        },
    },
]
# Canned quotes, keyed by upper-case ticker symbol. NVDA carries the largest
# percentage change, which the portfolio scenario asks the model to identify.
_STOCK_DB = {
    "AAPL": {"symbol": "AAPL", "price": 218.45, "change_pct": "+0.8%"},
    "MSFT": {"symbol": "MSFT", "price": 421.10, "change_pct": "+1.2%"},
    "GOOGL": {"symbol": "GOOGL", "price": 175.22, "change_pct": "-0.3%"},
    "AMZN": {"symbol": "AMZN", "price": 189.76, "change_pct": "+0.5%"},
    "NVDA": {"symbol": "NVDA", "price": 140.88, "change_pct": "+2.4%"},
}


def _stock_mock(args):
    """Return the canned JSON reply for one ``get_stock_quote`` call.

    The symbol is trimmed and upper-cased before lookup.

    Args:
        args: Decoded tool-call arguments; ``symbol`` is read, other keys are
            ignored.

    Returns:
        A JSON string with the quote, or
        ``{"symbol": ..., "error": "unknown ticker"}`` when the symbol is
        missing, unknown, or not a string. The error echoes the normalised
        symbol for strings and the raw value otherwise.
    """
    requested = args.get("symbol", "")
    # Model-generated arguments are not guaranteed to match the schema: a
    # null or numeric "symbol" must yield an error reply, not an AttributeError.
    if not isinstance(requested, str):
        return json.dumps({"symbol": requested, "error": "unknown ticker"})
    sym = requested.strip().upper()
    quote = _STOCK_DB.get(sym)
    if quote is not None:
        return json.dumps(quote)
    return json.dumps({"symbol": sym, "error": "unknown ticker"})
# Scenario: a single user message asks for five get_stock_quote calls in one
# turn, then for the ticker with the biggest percentage change.
PORTFOLIO_TEST = {
    "name": "Portfolio check (same tool, 5 tickers in parallel)",
    "tools": _STOCK_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "Pull the latest quote for every ticker in my portfolio — AAPL, "
                "MSFT, GOOGL, AMZN, and NVDA — in a single parallel batch. These "
                "lookups are independent, so please don't chain them across turns. "
                "Once you have all five, tell me which ticker had the biggest "
                "percentage change today."
            ),
        }
    ],
    # Maps a tool name to the callable that produces its canned reply.
    "mock_tool_responses": {"get_stock_quote": _stock_mock},
    # Parallelism expectations for this scenario. How these keys are enforced
    # is up to the test runner, which is not part of this block.
    "expected_parallel": {
        "min_parallel": 5,
        "require_same_tool": "get_stock_quote",
        "min_distinct_args_key": "symbol",
        "min_distinct_args_count": 5,
    },
    # The lambda defers the lookup of _validate_portfolio until the validator
    # is actually called, so the function may be defined after this dict.
    "validate": lambda turns, tcs, content: _validate_portfolio(turns, tcs, content),
}
def _validate_portfolio(turns, tcs, content):
    """Validate the portfolio scenario.

    Passes when the final text mentions NVDA (case-insensitive). Note that
    this only checks for the mention, not for the reasoning around it.

    Args:
        turns: Ignored; accepted so every validator shares one signature.
        tcs: Sized collection of tool calls; only its length is reported.
        content: Final assistant text (may be empty or ``None``).

    Returns:
        A ``(passed, message)`` tuple.
    """
    del turns  # unused by this validator
    # A case-insensitive match already covers the upper-case spelling.
    mentions_nvda = bool(content) and "nvda" in content.lower()
    if mentions_nvda:
        return True, f"{len(tcs)} total quotes pulled"
    return False, f"Expected NVDA to be identified as the biggest mover: {content!r}"
# ---- Test 6: Mixed — translate + dictionary in parallel for the same word ----
# Tool schemas for the language-toolkit scenario: three different functions
# (translate_text, get_definition, get_synonyms).
_LANG_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "translate_text",
            "description": "Translate a short text into a target language.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {"type": "string"},
                    "target_language": {"type": "string",
                                        "description": "ISO 639-1 language code, e.g. 'es'"},
                },
                "required": ["text", "target_language"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_definition",
            "description": "Get the English dictionary definition of a word.",
            "parameters": {
                "type": "object",
                "properties": {
                    "word": {"type": "string"},
                },
                "required": ["word"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_synonyms",
            "description": "Get English synonyms for a word.",
            "parameters": {
                "type": "object",
                "properties": {
                    "word": {"type": "string"},
                },
                "required": ["word"],
            },
        },
    },
]
def _translate_mock(args):
    """Return the canned JSON reply for one ``translate_text`` call.

    No real translation happens: the "translation" is the source text
    prefixed with the target language code in square brackets.

    Args:
        args: Decoded tool-call arguments; ``text`` and ``target_language``
            are read, each defaulting to an empty string.

    Returns:
        A JSON string with ``source``, ``target_language`` and ``translation``.
    """
    source_text = args.get("text", "")
    target = args.get("target_language", "")
    payload = {
        "source": source_text,
        "target_language": target,
        "translation": f"[{target}] {source_text}",
    }
    return json.dumps(payload)
def _definition_mock(args):
    """Return the canned JSON reply for one ``get_definition`` call.

    Args:
        args: Decoded tool-call arguments; ``word`` is read and defaults to
            an empty string.

    Returns:
        A JSON string with ``word`` and a placeholder ``definition`` that
        embeds the word's ``repr``.
    """
    word = args.get("word", "")
    payload = {"word": word}
    payload["definition"] = f"A standard dictionary definition of {word!r}."
    return json.dumps(payload)
def _synonyms_mock(args):
    """Return the canned JSON reply for one ``get_synonyms`` call.

    Args:
        args: Decoded tool-call arguments; ``word`` is read and defaults to
            an empty string.

    Returns:
        A JSON string with ``word`` and a fixed three-item ``synonyms`` list
        that does not depend on the word.
    """
    placeholder_synonyms = ["synonym_a", "synonym_b", "synonym_c"]
    reply = {"word": args.get("word", ""), "synonyms": placeholder_synonyms}
    return json.dumps(reply)
# Scenario: a single user message asks for three different look-ups
# (translation, definition, synonyms) on one word, all in one turn.
LANG_TOOLKIT_TEST = {
    "name": "Language toolkit (translate + definition + synonyms in parallel)",
    "tools": _LANG_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "For the English word 'resilient', I need three independent "
                "look-ups at once: (a) translate it into Spanish, (b) fetch its "
                "dictionary definition, and (c) list its synonyms. These three "
                "calls don't depend on each other — please issue them in parallel "
                "in a single turn. Then present the combined results as a short "
                "language note."
            ),
        }
    ],
    # Maps each tool name to the callable that produces its canned reply.
    "mock_tool_responses": {
        "translate_text": _translate_mock,
        "get_definition": _definition_mock,
        "get_synonyms": _synonyms_mock,
    },
    # Parallelism expectations for this scenario. How these keys are enforced
    # is up to the test runner, which is not part of this block.
    "expected_parallel": {
        "min_parallel": 3,
        "require_distinct_tools": 3,
    },
    # The lambda defers the lookup of _validate_lang until the validator is
    # actually called, so the function may be defined after this dict.
    "validate": lambda turns, tcs, content: _validate_lang(turns, tcs, content),
}
def _validate_lang(turns, tcs, content):
    """Validate the language-toolkit scenario.

    Passes when all three look-up tools were called at least once and the
    model produced some final text.

    Args:
        turns: Ignored; accepted so every validator shares one signature.
        tcs: Iterable of tool calls, each indexable as
            ``tc["function"]["name"]``.
        content: Final assistant text (may be empty or ``None``).

    Returns:
        A ``(passed, message)`` tuple.
    """
    del turns  # unused by this validator
    invoked = {call["function"]["name"] for call in tcs}
    required = {"translate_text", "get_definition", "get_synonyms"}
    missing = required.difference(invoked)
    if missing:
        return False, f"Missing tool calls: {missing}"
    if content:
        return True, f"All three lookup tools called; note length={len(content)}"
    return False, "No language note produced"
# ---------------------------------------------------------------------------
# All test cases
# ---------------------------------------------------------------------------
# Every scenario, in the order main() runs them; the --test option filters
# this list by name substring.
ALL_TEST_CASES = [
    MULTIFILE_READ_TEST,
    TODO_BATCH_TEST,
    MULTI_WEATHER_TEST,
    TRIP_PLAN_TEST,
    PORTFOLIO_TEST,
    LANG_TOOLKIT_TEST,
]
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main():
    """Run the parallel tool-calling scenarios against a llama-server.

    Parses the command line, runs every selected test case once per selected
    mode (non-streaming and/or streaming), prints a summary, and exits.

    Exit status:
        0 if every run passed, 1 if any run failed or ``--test`` matched no
        case, 2 (from argparse) on invalid options, including the
        contradictory combination ``--no-stream --stream-only``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Test llama-server parallel tool-calling capability. Run this only "
            "against models configured for parallel tool calls — this script "
            "does not configure that itself."
        )
    )
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", default=8080, type=int)
    # The two mode switches contradict each other. Accepting both used to
    # leave no mode selected, so nothing ran and "0/0 passed" exited with 0.
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--no-stream", action="store_true", help="Disable streaming mode tests"
    )
    mode_group.add_argument(
        "--stream-only", action="store_true", help="Only run streaming mode tests"
    )
    parser.add_argument(
        "--test",
        help="Run only the test whose name contains this substring (case-insensitive)",
    )
    args = parser.parse_args()

    url = f"http://{args.host}:{args.port}/v1/chat/completions"
    print_info(f"Testing server at {url}")
    print_warn(
        "This script expects the target model to emit multiple tool calls in a "
        "single assistant turn. Run it only against parallel-tool-capable models."
    )

    # False = non-streaming run, True = streaming run.
    modes: list[bool] = []
    if not args.stream_only:
        modes.append(False)
    if not args.no_stream:
        modes.append(True)

    cases: list[dict] = ALL_TEST_CASES
    if args.test:
        name_filter = args.test.lower()
        cases = [c for c in cases if name_filter in str(c["name"]).lower()]
        if not cases:
            print_fail(f"No test cases matched '{args.test}'")
            sys.exit(1)

    total = 0
    passed = 0
    for stream in modes:
        for case in cases:
            total += 1
            if run_test(url, case, stream=stream):
                passed += 1

    color = GREEN if passed == total else RED
    # Horizontal rule framing the summary line (an empty string repeated 60
    # times would print nothing).
    rule = "─" * 60
    _print(f"\n{BOLD}{color}{rule}{RESET}")
    _print(f"{BOLD}{color} Results: {passed}/{total} passed{RESET}")
    _print(f"{BOLD}{color}{rule}{RESET}\n")
    sys.exit(0 if passed == total else 1)
# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Some files were not shown because too many files have changed in this diff Show More