Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 35df147d80 | |||
| c1891fd6eb |
@@ -1,4 +1,4 @@
|
||||
ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
|
||||
ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
|
||||
|
||||
## Build Image
|
||||
|
||||
|
||||
@@ -2,19 +2,7 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
|
||||
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
|
||||
ARG IGC_VERSION=v2.30.1
|
||||
ARG IGC_VERSION_FULL=2_2.30.1+20950
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
|
||||
ARG IGDGMM_VERSION=22.9.0
|
||||
|
||||
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
|
||||
ARG NPU_DRIVER_VERSION=v1.32.0
|
||||
ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
|
||||
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
|
||||
|
||||
# Optional proxy build arguments
|
||||
# Optional proxy build arguments - empty by default
|
||||
ARG http_proxy=
|
||||
ARG https_proxy=
|
||||
|
||||
@@ -90,47 +78,13 @@ ARG http_proxy
|
||||
ARG https_proxy
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
|
||||
&& apt-get install -y libgomp1 libtbb12 curl \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
# Install GPU drivers
|
||||
ARG IGC_VERSION
|
||||
ARG IGC_VERSION_FULL
|
||||
ARG COMPUTE_RUNTIME_VERSION
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL
|
||||
ARG IGDGMM_VERSION
|
||||
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& dpkg --install *.deb \
|
||||
&& rm -rf /tmp/neo/
|
||||
|
||||
# Install NPU drivers
|
||||
ARG NPU_DRIVER_VERSION
|
||||
ARG NPU_DRIVER_FULL
|
||||
ARG LIBZE1_VERSION
|
||||
RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
|
||||
&& wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
|
||||
&& tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
|
||||
&& dpkg --install *.deb \
|
||||
&& rm -rf /tmp/npu/
|
||||
|
||||
RUN cd /tmp \
|
||||
&& wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
|
||||
&& dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
|
||||
&& rm libze1_${LIBZE1_VERSION}_amd64.deb
|
||||
|
||||
COPY --from=build /app/lib/ /app/
|
||||
|
||||
### Full (all binaries)
|
||||
|
||||
@@ -1,120 +0,0 @@
|
||||
name: CI (openvino)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-openvino.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-openvino.yml',
|
||||
'ggml/src/ggml-openvino/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-openvino:
|
||||
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
|
||||
|
||||
concurrency:
|
||||
group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
|
||||
cancel-in-progress: false
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- variant: cpu
|
||||
runner: '"ubuntu-24.04"'
|
||||
openvino_device: "CPU"
|
||||
- variant: gpu
|
||||
runner: '["self-hosted","Linux","Intel","OpenVINO"]'
|
||||
openvino_device: "GPU"
|
||||
|
||||
runs-on: ${{ fromJSON(matrix.runner) }}
|
||||
|
||||
env:
|
||||
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
|
||||
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
|
||||
|
||||
- name: Use OpenVINO Toolkit Cache
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: actions/cache@v5
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
|
||||
version_full: ${{ env.OPENVINO_VERSION_FULL }}
|
||||
|
||||
- name: Install OpenVINO dependencies
|
||||
run: |
|
||||
cd ./openvino_toolkit
|
||||
chmod +x ./install_dependencies/install_openvino_dependencies.sh
|
||||
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON
|
||||
time cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# TODO: fix and re-enable the `test-llama-archs` test below
|
||||
run: |
|
||||
cd ${{ github.workspace }}
|
||||
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
|
||||
export GGML_OPENVINO_DEVICE=GPU
|
||||
fi
|
||||
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
|
||||
@@ -265,10 +265,6 @@ jobs:
|
||||
ggml-ci-intel-openvino-gpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
concurrency:
|
||||
group: openvino-gpu-${{ github.head_ref || github.ref }}
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
|
||||
@@ -1,142 +0,0 @@
|
||||
name: CI (sycl)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-sycl.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-sycl.yml',
|
||||
'ggml/src/ggml-sycl/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
ubuntu-24-sycl:
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
windows-latest-sycl:
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
@@ -555,6 +555,186 @@ jobs:
|
||||
-DGGML_MUSA=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-sycl:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
|
||||
|
||||
- name: install oneAPI dpcpp compiler
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
|
||||
|
||||
- name: install oneAPI MKL library
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt install intel-oneapi-mkl-devel
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-22-sycl
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-sycl-fp16:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
|
||||
|
||||
- name: install oneAPI dpcpp compiler
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
|
||||
|
||||
- name: install oneAPI MKL library
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt install intel-oneapi-mkl-devel
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-22-sycl-fp16
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DGGML_SYCL_F16=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-24-openvino:
|
||||
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- variant: cpu
|
||||
runner: '"ubuntu-24.04"'
|
||||
openvino_device: "CPU"
|
||||
- variant: gpu
|
||||
runner: '["self-hosted","Linux","X64","Intel"]'
|
||||
openvino_device: "GPU"
|
||||
|
||||
runs-on: ${{ fromJSON(matrix.runner) }}
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
|
||||
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
|
||||
|
||||
- name: Use OpenVINO Toolkit Cache
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: actions/cache@v5
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
|
||||
version_full: ${{ env.OPENVINO_VERSION_FULL }}
|
||||
|
||||
- name: Install OpenVINO dependencies
|
||||
run: |
|
||||
cd ./openvino_toolkit
|
||||
chmod +x ./install_dependencies/install_openvino_dependencies.sh
|
||||
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON
|
||||
time cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# TODO: fix and re-enable the `test-llama-archs` test below
|
||||
run: |
|
||||
cd ${{ github.workspace }}
|
||||
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
|
||||
export GGML_OPENVINO_DEVICE=GPU
|
||||
fi
|
||||
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
|
||||
|
||||
windows-latest:
|
||||
runs-on: windows-2025
|
||||
@@ -763,6 +943,39 @@ jobs:
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
windows-latest-sycl:
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Install
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
windows-latest-hip:
|
||||
runs-on: windows-2022
|
||||
|
||||
@@ -598,29 +598,15 @@ jobs:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
@@ -628,6 +614,10 @@ jobs:
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
@@ -680,82 +670,6 @@ jobs:
|
||||
path: llama-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
ubuntu-24-sycl:
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
|
||||
ubuntu-22-rocm:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -1131,7 +1045,6 @@ jobs:
|
||||
- ubuntu-cpu
|
||||
- ubuntu-vulkan
|
||||
- ubuntu-24-openvino
|
||||
- ubuntu-24-sycl
|
||||
- android-arm64
|
||||
- macOS-cpu
|
||||
- ios-xcode-build
|
||||
@@ -1220,8 +1133,6 @@ jobs:
|
||||
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
|
||||
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
||||
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
|
||||
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
|
||||
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
|
||||
|
||||
**Android:**
|
||||
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
|
||||
|
||||
@@ -145,5 +145,3 @@ poetry.toml
|
||||
/.windsurf/
|
||||
# emscripten
|
||||
a.out.*
|
||||
|
||||
AGENTS.local.md
|
||||
|
||||
@@ -23,7 +23,6 @@
|
||||
/ci/ @ggerganov
|
||||
/cmake/ @ggerganov
|
||||
/common/ @ggml-org/llama-common
|
||||
/common/fit.* @JohannesGaessler
|
||||
/common/jinja/ @CISC
|
||||
/common/ngram-map.* @srogmann
|
||||
/convert_*.py @CISC
|
||||
|
||||
@@ -73,8 +73,6 @@ add_library(${TARGET}
|
||||
debug.h
|
||||
download.cpp
|
||||
download.h
|
||||
fit.cpp
|
||||
fit.h
|
||||
hf-cache.cpp
|
||||
hf-cache.h
|
||||
http.h
|
||||
|
||||
+2
-27
@@ -2426,20 +2426,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_ARG_FIT"));
|
||||
add_opt(common_arg(
|
||||
{ "-fitp", "--fit-print" }, "[on|off]",
|
||||
string_format("print the estimated required memory ('on' or 'off', default: '%s')", params.fit_params_print ? "on" : "off"),
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (is_truthy(value)) {
|
||||
params.fit_params_print = true;
|
||||
} else if (is_falsey(value)) {
|
||||
params.fit_params_print = false;
|
||||
} else {
|
||||
throw std::runtime_error(
|
||||
string_format("error: unknown value for --fit-print: '%s'\n", value.c_str()));
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_FIT_PARAMS}).set_env("LLAMA_ARG_FIT_ESTIMATE"));
|
||||
add_opt(common_arg(
|
||||
{ "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
|
||||
string_format("target margin per device for --fit, comma-separated list of values, "
|
||||
@@ -3122,14 +3108,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
|
||||
[](common_params & params, int value) {
|
||||
if (value < -1) { throw std::invalid_argument("invalid value"); }
|
||||
params.sampling.reasoning_budget_tokens = value;
|
||||
params.reasoning_budget = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
|
||||
add_opt(common_arg(
|
||||
{"--reasoning-budget-message"}, "MESSAGE",
|
||||
"message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.reasoning_budget_message = value;
|
||||
params.reasoning_budget_message = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
|
||||
add_opt(common_arg(
|
||||
@@ -3902,17 +3888,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
|
||||
add_opt(common_arg(
|
||||
{"--spec-default"},
|
||||
string_format("enable default speculative decoding config"),
|
||||
[](common_params & params) {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
|
||||
params.speculative.ngram_size_n = 24;
|
||||
params.speculative.n_min = 48;
|
||||
params.speculative.n_max = 64;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
|
||||
return ctx_arg;
|
||||
}
|
||||
|
||||
|
||||
+55
-40
@@ -397,25 +397,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
|
||||
return render_message_to_json(msgs, c);
|
||||
}
|
||||
|
||||
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
|
||||
if (tools.empty()) {
|
||||
return json();
|
||||
}
|
||||
|
||||
auto result = json::array();
|
||||
for (const auto & tool : tools) {
|
||||
result.push_back({
|
||||
{ "type", "function" },
|
||||
{ "function", {
|
||||
{ "name", tool.name },
|
||||
{ "description", tool.description },
|
||||
{ "parameters", json::parse(tool.parameters) },
|
||||
}},
|
||||
});
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
|
||||
std::vector<common_chat_tool> result;
|
||||
|
||||
@@ -451,6 +432,56 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
|
||||
return result;
|
||||
}
|
||||
|
||||
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
|
||||
if (tools.empty()) {
|
||||
return json();
|
||||
}
|
||||
|
||||
auto result = json::array();
|
||||
for (const auto & tool : tools) {
|
||||
result.push_back({
|
||||
{ "type", "function" },
|
||||
{ "function",
|
||||
{
|
||||
{ "name", tool.name },
|
||||
{ "description", tool.description },
|
||||
{ "parameters", json::parse(tool.parameters) },
|
||||
} },
|
||||
});
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
|
||||
json delta = json::object();
|
||||
if (!diff.reasoning_content_delta.empty()) {
|
||||
delta["reasoning_content"] = diff.reasoning_content_delta;
|
||||
}
|
||||
if (!diff.content_delta.empty()) {
|
||||
delta["content"] = diff.content_delta;
|
||||
}
|
||||
if (diff.tool_call_index != std::string::npos) {
|
||||
json tool_call;
|
||||
tool_call["index"] = diff.tool_call_index;
|
||||
if (!diff.tool_call_delta.id.empty()) {
|
||||
tool_call["id"] = diff.tool_call_delta.id;
|
||||
tool_call["type"] = "function";
|
||||
}
|
||||
if (!diff.tool_call_delta.name.empty() || !diff.tool_call_delta.arguments.empty()) {
|
||||
json function = json::object();
|
||||
if (!diff.tool_call_delta.name.empty()) {
|
||||
function["name"] = diff.tool_call_delta.name;
|
||||
}
|
||||
if (!diff.tool_call_delta.arguments.empty()) {
|
||||
function["arguments"] = diff.tool_call_delta.arguments;
|
||||
}
|
||||
tool_call["function"] = function;
|
||||
}
|
||||
delta["tool_calls"] = json::array({ tool_call });
|
||||
}
|
||||
return delta;
|
||||
}
|
||||
|
||||
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
|
||||
if (use_jinja) {
|
||||
try {
|
||||
@@ -544,26 +575,6 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
|
||||
return tmpls->has_explicit_template;
|
||||
}
|
||||
|
||||
// LFM2 format detection: the template wraps the tool list in
// <|tool_list_start|>[...]<|tool_list_end|> and each tool call in
// <|tool_call_start|>[...]<|tool_call_end|>.
//
// Returns true only when BOTH tool-list markers occur somewhere in `src`;
// the tool-call markers are not checked.
static bool is_lfm2_template(const std::string & src) {
    return src.find("<|tool_list_start|>") != std::string::npos &&
           src.find("<|tool_list_end|>")   != std::string::npos;
}
|
||||
|
||||
// Returns the system/user prompt pair to use for speech-to-text (ASR) requests.
//
// Default preset: empty system prompt, user prompt "Transcribe audio to text".
// When the default chat template is detected as LFM2 (see is_lfm2_template),
// the instruction moves to the system prompt ("Perform ASR.") and the user
// prompt is left empty.
//
// `chat_templates` may be null, and so may its `template_default`; in both
// cases the default preset is returned.
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
    common_chat_prompt_preset asr_preset;
    asr_preset.system = "";
    asr_preset.user   = "Transcribe audio to text";

    if (chat_templates && chat_templates->template_default && is_lfm2_template(chat_templates->template_default->source())) {
        asr_preset.system = "Perform ASR.";
        asr_preset.user   = "";
    }

    return asr_preset;
}
|
||||
|
||||
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
|
||||
if (!variant.empty()) {
|
||||
if (variant == "tool_use") {
|
||||
@@ -2073,7 +2084,10 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
return common_chat_params_init_kimi_k2(tmpl, params);
|
||||
}
|
||||
|
||||
if (is_lfm2_template(src)) {
|
||||
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
|
||||
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
|
||||
if (src.find("<|tool_list_start|>") != std::string::npos &&
|
||||
src.find("<|tool_list_end|>") != std::string::npos) {
|
||||
LOG_DBG("Using specialized template: LFM2\n");
|
||||
return common_chat_params_init_lfm2(tmpl, params);
|
||||
}
|
||||
@@ -2382,3 +2396,4 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
|
||||
GGML_ASSERT(chat_templates->template_default != nullptr);
|
||||
return chat_templates->template_default->caps.to_map();
|
||||
}
|
||||
|
||||
|
||||
+3
-10
@@ -256,13 +256,14 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
|
||||
// Parses a JSON array of messages in OpenAI's chat completion API format.
|
||||
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
|
||||
|
||||
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
|
||||
|
||||
// DEPRECATED: only used in tests
|
||||
nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
|
||||
|
||||
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
|
||||
nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
|
||||
|
||||
nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
|
||||
|
||||
// get template caps, useful for reporting to server /props endpoint
|
||||
std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
|
||||
|
||||
@@ -274,11 +275,3 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
const common_chat_template & tmpl,
|
||||
const std::string & src,
|
||||
autoparser::generation_params & params);
|
||||
|
||||
// specialized per-task preset: a pair of prompts for one task (e.g. ASR)
struct common_chat_prompt_preset {
    std::string system; // system prompt text, may be empty
    std::string user;   // user prompt text, may be empty
};
|
||||
|
||||
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
|
||||
|
||||
+1
-2
@@ -3,7 +3,6 @@
|
||||
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "fit.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "sampling.h"
|
||||
@@ -1148,7 +1147,7 @@ common_init_result::common_init_result(common_params & params) :
|
||||
|
||||
if (params.fit_params) {
|
||||
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
|
||||
common_fit_params(params.model.path.c_str(), &mparams, &cparams,
|
||||
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split,
|
||||
params.tensor_buft_overrides.data(),
|
||||
params.fit_params_target.data(),
|
||||
|
||||
+7
-7
@@ -274,7 +274,6 @@ struct common_params_sampling {
|
||||
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
|
||||
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
|
||||
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
|
||||
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
|
||||
|
||||
bool backend_sampling = false;
|
||||
|
||||
@@ -421,12 +420,11 @@ struct common_params {
|
||||
// offload params
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||
bool fit_params_print = false; // print the estimated required memory to run the model
|
||||
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||
|
||||
// margin per device in bytes for fitting parameters to free memory:
|
||||
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
|
||||
@@ -582,6 +580,8 @@ struct common_params {
|
||||
bool force_pure_content_parser = false;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
|
||||
int reasoning_budget = -1;
|
||||
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
|
||||
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
|
||||
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
|
||||
|
||||
|
||||
-951
@@ -1,951 +0,0 @@
|
||||
#include "fit.h"
|
||||
|
||||
#include "log.h"
|
||||
|
||||
#include "../src/llama-ext.h"
|
||||
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <stdexcept>
|
||||
#include <cinttypes>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// this enum is only used in common_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
// enum to identify part of a layer for distributing its tensors:
// (values are cumulative: each step names one more group of tensors than the previous one)
enum common_layer_fraction_t {
    LAYER_FRACTION_NONE = 0, // nothing
    LAYER_FRACTION_ATTN = 1, // attention
    LAYER_FRACTION_UP   = 2, // attention + up
    LAYER_FRACTION_GATE = 3, // attention + up + gate
    LAYER_FRACTION_MOE  = 4, // everything but sparse MoE weights
};
|
||||
|
||||
// Exception type thrown by the parameter-fitting code when fitting has to be aborted.
// It is a distinct type so it can be told apart from other std::runtime_error
// failures; the constructors are inherited from std::runtime_error unchanged.
class common_params_fit_exception : public std::runtime_error {
    using std::runtime_error::runtime_error;
};
|
||||
|
||||
static std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
ggml_log_level log_level) {
|
||||
struct user_data_t {
|
||||
struct {
|
||||
ggml_log_callback callback;
|
||||
void * user_data;
|
||||
} original_logger;
|
||||
ggml_log_level min_level; // prints below this log level go to debug log
|
||||
};
|
||||
user_data_t ud;
|
||||
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
||||
ud.min_level = log_level;
|
||||
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
||||
const user_data_t * ud = (const user_data_t *) user_data;
|
||||
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
||||
}, &ud);
|
||||
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
mparams_copy.no_alloc = true;
|
||||
mparams_copy.use_mmap = false;
|
||||
mparams_copy.use_mlock = false;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
||||
if (model == nullptr) {
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to load model");
|
||||
}
|
||||
|
||||
llama_context * ctx = llama_init_from_model(model, *cparams);
|
||||
if (ctx == nullptr) {
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to create llama_context from model");
|
||||
}
|
||||
|
||||
const size_t nd = llama_model_n_devices(model);
|
||||
std::vector<llama_device_memory_data> ret(nd + 1);
|
||||
|
||||
llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
|
||||
|
||||
for (const auto & [buft, mb] : memory_breakdown) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
ret.back().mb.model += mb.model;
|
||||
ret.back().mb.context += mb.context;
|
||||
ret.back().mb.compute += mb.compute;
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (!dev) {
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
if (dev == llama_model_get_device(model, i)) {
|
||||
ret[i].mb.model += mb.model;
|
||||
ret[i].mb.context += mb.context;
|
||||
ret[i].mb.compute += mb.compute;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error("no CPU backend found");
|
||||
}
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
||||
ret.back().free = free;
|
||||
ret.back().total = total;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
|
||||
|
||||
// devices can return 0 bytes for free and total memory if they do not
|
||||
// have any to report. in this case, we will use the host memory as a fallback
|
||||
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
||||
if (free == 0 && total == 0) {
|
||||
free = ret.back().free;
|
||||
total = ret.back().total;
|
||||
}
|
||||
ret[i].free = free;
|
||||
ret[i].total = total;
|
||||
}
|
||||
|
||||
devs.clear();
|
||||
for (int i = 0; i < llama_model_n_devices(model); i++) {
|
||||
devs.push_back(llama_model_get_device(model, i));
|
||||
}
|
||||
|
||||
hp_ngl = llama_model_n_layer(model);
|
||||
hp_n_ctx_train = llama_model_n_ctx_train(model);
|
||||
hp_n_expert = llama_model_n_expert(model);
|
||||
|
||||
common_memory_breakdown_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void common_params_fit_impl(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
|
||||
throw common_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
|
||||
}
|
||||
constexpr int64_t MiB = 1024*1024;
|
||||
typedef std::vector<llama_device_memory_data> dmds_t;
|
||||
const llama_model_params default_mparams = llama_model_default_params();
|
||||
|
||||
std::vector<ggml_backend_dev_t> devs;
|
||||
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
||||
|
||||
LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
|
||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
margins.reserve(nd);
|
||||
if (nd == 0) {
|
||||
margins.push_back(margins_s[0]);
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
margins.push_back(margins_s[id]);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> dev_names;
|
||||
{
|
||||
dev_names.reserve(nd);
|
||||
size_t max_length = 0;
|
||||
for (const auto & dev : devs) {
|
||||
std::string name = ggml_backend_dev_name(dev);
|
||||
name += " (";
|
||||
name += ggml_backend_dev_description(dev);
|
||||
name += ")";
|
||||
dev_names.push_back(name);
|
||||
max_length = std::max(max_length, name.length());
|
||||
}
|
||||
for (std::string & dn : dev_names) {
|
||||
dn.insert(dn.end(), max_length - dn.length(), ' ');
|
||||
}
|
||||
}
|
||||
|
||||
int64_t sum_free = 0;
|
||||
int64_t sum_projected_free = 0;
|
||||
int64_t sum_projected_used = 0;
|
||||
int64_t sum_projected_model = 0;
|
||||
std::vector<int64_t> projected_free_per_device;
|
||||
projected_free_per_device.reserve(nd);
|
||||
|
||||
if (nd == 0) {
|
||||
sum_projected_used = dmds_full.back().mb.total();
|
||||
sum_free = dmds_full.back().total;
|
||||
sum_projected_free = sum_free - sum_projected_used;
|
||||
LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (sum_projected_free >= margins[0]) {
|
||||
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
|
||||
__func__, sum_projected_free/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
if (nd > 1) {
|
||||
LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
projected_free_per_device.push_back(projected_free);
|
||||
|
||||
sum_free += dmd.free;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LOG_INF("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
}
|
||||
}
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bool changes_needed = false;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (projected_free_per_device[id] < margins[id]) {
|
||||
changes_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// step 2: try reducing memory use by reducing the context size
|
||||
|
||||
{
|
||||
int64_t global_surplus = sum_projected_free;
|
||||
if (nd == 0) {
|
||||
global_surplus -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus -= margins[id];
|
||||
}
|
||||
}
|
||||
if (global_surplus < 0) {
|
||||
if (nd <= 1) {
|
||||
LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
||||
__func__, margins[0]/MiB, -global_surplus/MiB);
|
||||
} else {
|
||||
LOG_INF(
|
||||
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, -global_surplus/MiB);
|
||||
}
|
||||
if (cparams->n_ctx == 0) {
|
||||
if (hp_nct > n_ctx_min) {
|
||||
int64_t sum_used_target = sum_free;
|
||||
if (nd == 0) {
|
||||
sum_used_target -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_used_target -= margins[id];
|
||||
}
|
||||
}
|
||||
if (nd > 1) {
|
||||
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
||||
// - for dense models only whole layers can be assigned to devices
|
||||
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
|
||||
// - on average we expect a waste of 0.5 layers/tensors per device
|
||||
// - use slightly more than the expected average for nd devices to be safe
|
||||
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
|
||||
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
|
||||
}
|
||||
|
||||
int64_t sum_projected_used_min_ctx = 0;
|
||||
cparams->n_ctx = n_ctx_min;
|
||||
const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
if (nd == 0) {
|
||||
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
|
||||
}
|
||||
}
|
||||
if (sum_used_target > sum_projected_used_min_ctx) {
|
||||
// linear interpolation between minimum and maximum context size:
|
||||
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
|
||||
/ (sum_projected_used - sum_projected_used_min_ctx);
|
||||
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
|
||||
|
||||
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
|
||||
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
|
||||
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
if (nd <= 1) {
|
||||
LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
|
||||
return;
|
||||
}
|
||||
LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
|
||||
} else {
|
||||
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
|
||||
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
}
|
||||
} else {
|
||||
if (n_ctx_min == UINT32_MAX) {
|
||||
LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
|
||||
} else {
|
||||
LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
||||
__func__, hp_nct, n_ctx_min);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nd == 0) {
|
||||
throw common_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
|
||||
}
|
||||
|
||||
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
|
||||
throw common_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
|
||||
}
|
||||
if (nd > 1) {
|
||||
if (!tensor_split) {
|
||||
throw common_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
|
||||
}
|
||||
if (mparams->tensor_split) {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (mparams->tensor_split[id] != 0.0f) {
|
||||
throw common_params_fit_exception("model_params::tensor_split already set by user, abort");
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
throw common_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
|
||||
}
|
||||
}
|
||||
if (!tensor_buft_overrides) {
|
||||
throw common_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
|
||||
}
|
||||
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
|
||||
throw common_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
|
||||
}
|
||||
|
||||
// step 3: iteratively fill the back to front with "dense" layers
|
||||
// - for a dense model simply fill full layers, giving each device a contiguous slice of the model
|
||||
// - for a MoE model, same as dense model but with all MoE tensors in system memory
|
||||
|
||||
// utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
|
||||
auto get_overflow_pattern = [&](const size_t il, const common_layer_fraction_t lf) -> const char * {
|
||||
constexpr size_t n_strings = 1000;
|
||||
if (il >= n_strings) {
|
||||
throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
|
||||
}
|
||||
switch (lf) {
|
||||
case LAYER_FRACTION_ATTN: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_UP: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_GATE: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_MOE: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
};
|
||||
|
||||
struct ngl_t {
|
||||
uint32_t n_layer = 0; // number of total layers
|
||||
uint32_t n_part = 0; // number of partial layers, <= n_layer
|
||||
|
||||
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
|
||||
common_layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
|
||||
|
||||
uint32_t n_full() const {
|
||||
assert(n_layer >= n_part);
|
||||
return n_layer - n_part;
|
||||
}
|
||||
};
|
||||
|
||||
const size_t ntbo = llama_max_tensor_buft_overrides();
|
||||
|
||||
// utility function to set n_gpu_layers and tensor_split
|
||||
auto set_ngl_tensor_split_tbo = [&](
|
||||
const std::vector<ngl_t> & ngl_per_device,
|
||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
|
||||
llama_model_params & mparams) {
|
||||
mparams.n_gpu_layers = 0;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
|
||||
if (nd > 1) {
|
||||
tensor_split[id] = ngl_per_device[id].n_layer;
|
||||
}
|
||||
}
|
||||
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
|
||||
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
|
||||
|
||||
mparams.tensor_split = tensor_split;
|
||||
|
||||
size_t itbo = 0;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
il0 += ngl_per_device[id].n_full();
|
||||
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
|
||||
if (itbo + 1 >= ntbo) {
|
||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
||||
tensor_buft_overrides[itbo].buft = nullptr;
|
||||
itbo++;
|
||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
||||
throw common_params_fit_exception("llama_max_tensor_buft_overrides() == "
|
||||
+ std::to_string(ntbo) + " is insufficient for model");
|
||||
}
|
||||
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
|
||||
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
|
||||
itbo++;
|
||||
}
|
||||
il0 += ngl_per_device[id].n_part;
|
||||
}
|
||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
||||
tensor_buft_overrides[itbo].buft = nullptr;
|
||||
itbo++;
|
||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
||||
};
|
||||
|
||||
// utility function that returns the memory use per device for given numbers of layers per device
|
||||
auto get_memory_for_layers = [&](
|
||||
const char * func_name,
|
||||
const std::vector<ngl_t> & ngl_per_device,
|
||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
|
||||
|
||||
const dmds_t dmd_nl = common_get_device_memory_data(
|
||||
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
LOG_INF("%s: memory for test allocation by device:\n", func_name);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const ngl_t & n = ngl_per_device[id];
|
||||
LOG_INF(
|
||||
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
|
||||
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
|
||||
}
|
||||
|
||||
std::vector<int64_t> ret;
|
||||
ret.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
ret.push_back(dmd_nl[id].mb.total());
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
|
||||
int64_t global_surplus_cpu_moe = 0;
|
||||
if (hp_nex > 0) {
|
||||
const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
|
||||
ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
|
||||
tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
|
||||
tensor_buft_overrides[1] = {nullptr, nullptr};
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
|
||||
LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
|
||||
const dmds_t dmds_cpu_moe = common_get_device_memory_data(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
|
||||
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
|
||||
}
|
||||
|
||||
if (global_surplus_cpu_moe > 0) {
|
||||
LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
|
||||
__func__, global_surplus_cpu_moe/MiB);
|
||||
} else {
|
||||
LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
|
||||
__func__, -global_surplus_cpu_moe/MiB);
|
||||
}
|
||||
|
||||
// reset
|
||||
tensor_buft_overrides[0] = {nullptr, nullptr};
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
}
|
||||
|
||||
std::vector<int64_t> targets; // maximum acceptable memory use per device
|
||||
targets.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
targets.push_back(dmds_full[id].free - margins[id]);
|
||||
LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
||||
}
|
||||
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
|
||||
overflow_bufts.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
|
||||
}
|
||||
|
||||
std::vector<ngl_t> ngl_per_device(nd);
|
||||
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
|
||||
|
||||
// optimize the number of layers per device using the method of false position:
|
||||
// - ngl_per_device has 0 layers for each device, lower bound
|
||||
// - try a "high" configuration where a device is given all unassigned layers
|
||||
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
|
||||
// - check memory use of our guess, replace either the low or high bound
|
||||
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
|
||||
// - the last device has the output layer, which cannot be a partial layer
|
||||
if (hp_nex == 0) {
|
||||
LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
|
||||
} else {
|
||||
LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
|
||||
}
|
||||
for (int id = nd - 1; id >= 0; id--) {
|
||||
uint32_t n_unassigned = hp_ngl + 1;
|
||||
for (size_t jd = id + 1; jd < nd; ++jd) {
|
||||
assert(n_unassigned >= ngl_per_device[jd].n_layer);
|
||||
n_unassigned -= ngl_per_device[jd].n_layer;
|
||||
}
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
ngl_per_device_high[id].n_layer = n_unassigned;
|
||||
if (hp_nex > 0) {
|
||||
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
|
||||
}
|
||||
if (ngl_per_device_high[id].n_layer > 0) {
|
||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
|
||||
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
step_size = std::min(step_size, delta - 1);
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
ngl_per_device_test[id].n_layer += step_size;
|
||||
if (hp_nex) {
|
||||
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
|
||||
step_size - 1 : step_size; // the first layer is the output layer which must always be full
|
||||
}
|
||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
|
||||
if (mem_test[id] <= targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
|
||||
}
|
||||
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
}
|
||||
} else {
|
||||
assert(ngl_per_device_high[id].n_layer == n_unassigned);
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
||||
return;
|
||||
}
|
||||
|
||||
// step 4: for a MoE model where all dense tensors fit,
|
||||
// convert the dense-only layers in the back to full layers in the front until all devices are full
|
||||
// essentially the same procedure as for the dense-only layers except front-to-back
|
||||
// also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
|
||||
|
||||
size_t id_dense_start = nd;
|
||||
for (int id = nd - 1; id >= 0; id--) {
|
||||
if (ngl_per_device[id].n_layer > 0) {
|
||||
id_dense_start = id;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
assert(id_dense_start < nd);
|
||||
|
||||
LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
|
||||
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
for (size_t jd = id_dense_start; jd < nd; jd++) {
|
||||
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
|
||||
ngl_per_device_high[id].n_layer += n_layer_move;
|
||||
ngl_per_device_high[jd].n_layer -= n_layer_move;
|
||||
ngl_per_device_high[jd].n_part = 0;
|
||||
}
|
||||
size_t id_dense_start_high = nd - 1;
|
||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
||||
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
||||
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
step_size = std::min(step_size, delta - 1);
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
size_t id_dense_start_test = id_dense_start;
|
||||
uint32_t n_converted_test = 0;
|
||||
for (;id_dense_start_test < nd; id_dense_start_test++) {
|
||||
const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
|
||||
ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
|
||||
ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
|
||||
ngl_per_device_test[id].n_layer += n_convert_jd;
|
||||
n_converted_test += n_convert_jd;
|
||||
|
||||
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
|
||||
if (mem_test[id] <= targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
id_dense_start_high = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
|
||||
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
|
||||
}
|
||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
||||
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
||||
}
|
||||
} else {
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
id_dense_start = id_dense_start_high;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
|
||||
// try to fit at least part of one more layer
|
||||
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
size_t id_dense_start_test = id_dense_start;
|
||||
ngl_per_device_test[id_dense_start_test].n_layer--;
|
||||
ngl_per_device_test[id_dense_start_test].n_part--;
|
||||
ngl_per_device_test[id].n_layer++;
|
||||
ngl_per_device_test[id].n_part++;
|
||||
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
|
||||
id_dense_start_test++;
|
||||
}
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
|
||||
if (id < nd - 1) {
|
||||
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
|
||||
}
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
|
||||
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
} else {
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
|
||||
// print info for devices that were not changed during the conversion from dense only to full layers:
|
||||
for (size_t id = id_dense_start + 1; id < nd; id++) {
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
||||
}
|
||||
|
||||
enum common_params_fit_status common_fit_params(
|
||||
const char * path_model,
|
||||
llama_model_params * mparams,
|
||||
llama_context_params * cparams,
|
||||
float * tensor_split,
|
||||
llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t * margins,
|
||||
uint32_t n_ctx_min,
|
||||
ggml_log_level log_level) {
|
||||
const int64_t t0_us = llama_time_us();
|
||||
common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
|
||||
try {
|
||||
common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
|
||||
LOG_INF("%s: successfully fit params to free device memory\n", __func__);
|
||||
} catch (const common_params_fit_exception & e) {
|
||||
LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
||||
status = COMMON_PARAMS_FIT_STATUS_FAILURE;
|
||||
} catch (const std::runtime_error & e) {
|
||||
LOG_ERR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
|
||||
status = COMMON_PARAMS_FIT_STATUS_ERROR;
|
||||
}
|
||||
const int64_t t1_us = llama_time_us();
|
||||
LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
|
||||
return status;
|
||||
}
|
||||
|
||||
void common_memory_breakdown_print(const struct llama_context * ctx) {
|
||||
//const auto & devices = ctx->get_model().devices;
|
||||
const auto * model = llama_get_model(ctx);
|
||||
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
for (int i = 0; i < llama_model_n_devices(model); i++) {
|
||||
devices.push_back(llama_model_get_device(model, i));
|
||||
}
|
||||
|
||||
llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
|
||||
|
||||
std::vector<std::array<std::string, 9>> table_data;
|
||||
table_data.reserve(devices.size());
|
||||
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
|
||||
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
|
||||
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
|
||||
|
||||
table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
|
||||
|
||||
constexpr size_t MiB = 1024 * 1024;
|
||||
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
|
||||
|
||||
// track seen buffer types to avoid double counting:
|
||||
std::set<ggml_backend_buffer_type_t> seen_buffer_types;
|
||||
|
||||
// accumulative memory breakdown for each device and for host:
|
||||
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
|
||||
llama_memory_breakdown_data mb_host;
|
||||
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
mb_host.model += mb.model;
|
||||
mb_host.context += mb.context;
|
||||
mb_host.compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (dev) {
|
||||
int i_dev = -1;
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (devices[i] == dev) {
|
||||
i_dev = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i_dev != -1) {
|
||||
mb_dev[i_dev].model += mb.model;
|
||||
mb_dev[i_dev].context += mb.context;
|
||||
mb_dev[i_dev].compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// print memory breakdown for each device:
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
ggml_backend_dev_t dev = devices[i];
|
||||
llama_memory_breakdown_data mb = mb_dev[i];
|
||||
|
||||
const std::string name = ggml_backend_dev_name(dev);
|
||||
std::string desc = ggml_backend_dev_description(dev);
|
||||
for (const std::string & prefix : desc_prefixes_strip) {
|
||||
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
|
||||
desc = desc.substr(prefix.length());
|
||||
}
|
||||
}
|
||||
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
const size_t unaccounted = total - self - free;
|
||||
|
||||
table_data.push_back({
|
||||
template_gpu,
|
||||
" - " + name + " (" + desc + ")",
|
||||
std::to_string(total / MiB),
|
||||
std::to_string(free / MiB),
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
std::to_string(unaccounted / MiB)});
|
||||
}
|
||||
|
||||
// print memory breakdown for host:
|
||||
{
|
||||
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - Host",
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb_host.model / MiB),
|
||||
std::to_string(mb_host.context / MiB),
|
||||
std::to_string(mb_host.compute / MiB),
|
||||
""}); // unaccounted
|
||||
}
|
||||
|
||||
// print memory breakdown for all remaining buffer types:
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (seen_buffer_types.count(buft) == 1) {
|
||||
continue;
|
||||
}
|
||||
const std::string name = ggml_backend_buft_name(buft);
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - " + name,
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
""}); // unaccounted
|
||||
seen_buffer_types.insert(buft);
|
||||
}
|
||||
|
||||
for (size_t j = 1; j < table_data[0].size(); j++) {
|
||||
size_t max_len = 0;
|
||||
for (const auto & td : table_data) {
|
||||
max_len = std::max(max_len, td[j].length());
|
||||
}
|
||||
for (auto & td : table_data) {
|
||||
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
|
||||
}
|
||||
}
|
||||
for (const auto & td : table_data) {
|
||||
LOG_INF(td[0].c_str(),
|
||||
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
|
||||
td[6].c_str(), td[7].c_str(), td[8].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void common_fit_print(
|
||||
const char * path_model,
|
||||
llama_model_params * mparams,
|
||||
llama_context_params * cparams) {
|
||||
std::vector<ggml_backend_dev_t> devs;
|
||||
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
|
||||
GGML_ASSERT(dmd.size() == devs.size() + 1);
|
||||
|
||||
for (size_t id = 0; id < devs.size(); id++) {
|
||||
printf("%s ", ggml_backend_dev_name(devs[id]));
|
||||
printf("%zu ", dmd[id].mb.model/1024/1024);
|
||||
printf("%zu ", dmd[id].mb.context/1024/1024);
|
||||
printf("%zu ", dmd[id].mb.compute/1024/1024);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Host ");
|
||||
printf("%zu ", dmd.back().mb.model/1024/1024);
|
||||
printf("%zu ", dmd.back().mb.context/1024/1024);
|
||||
printf("%zu ", dmd.back().mb.compute/1024/1024);
|
||||
printf("\n");
|
||||
}
|
||||
@@ -1,32 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
enum common_params_fit_status {
|
||||
COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
||||
COMMON_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
|
||||
COMMON_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
|
||||
};
|
||||
|
||||
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
|
||||
// - returns a common_params_fit_status: SUCCESS if the parameters could be modified to fit device memory, FAILURE if not, ERROR if a hard error occurred
|
||||
// - this function is NOT thread safe because it modifies the global llama logger state
|
||||
// - only parameters that have the same value as in llama_default_model_params are modified
|
||||
// with the exception of the context size which is modified if and only if equal to 0
|
||||
enum common_params_fit_status common_fit_params(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams,
|
||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
||||
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
||||
size_t * margins, // margins of memory to leave per device in bytes
|
||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
||||
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
||||
|
||||
// print estimated memory to stdout
|
||||
void common_fit_print(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams);
|
||||
|
||||
void common_memory_breakdown_print(const struct llama_context * ctx);
|
||||
+2
-4
@@ -1,12 +1,10 @@
|
||||
#include "sampling.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "fit.h"
|
||||
#include "ggml.h"
|
||||
#include "log.h"
|
||||
#include "reasoning-budget.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <climits>
|
||||
@@ -513,7 +511,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
|
||||
LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
|
||||
LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
|
||||
|
||||
common_memory_breakdown_print(ctx);
|
||||
llama_memory_breakdown_print(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -749,7 +749,6 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
|
||||
|
||||
mod.reset();
|
||||
n_low = 0;
|
||||
i_last = 0;
|
||||
}
|
||||
} else {
|
||||
n_low = 0;
|
||||
|
||||
+12
-102
@@ -746,12 +746,7 @@ class ModelBase:
|
||||
|
||||
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
|
||||
with open(quant_config_file, "r", encoding="utf-8") as f:
|
||||
hf_quant_config = json.load(f)
|
||||
quant_config = hf_quant_config.get("quantization") or {}
|
||||
producer = hf_quant_config.get("producer") or {}
|
||||
producer_name = (producer.get("name") or "").lower()
|
||||
if quant_method is None:
|
||||
self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
|
||||
quant_config = json.load(f).get("quantization") or {}
|
||||
quant_algo = quant_config.get("quant_algo", quant_algo)
|
||||
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
|
||||
|
||||
@@ -11860,7 +11855,7 @@ class LLaDAMoEModel(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanDenseV1ForCausalLM")
|
||||
@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
|
||||
class HunYuanModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
|
||||
|
||||
@@ -11999,58 +11994,28 @@ class HunYuanModel(TextModel):
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
|
||||
class HunyuanVLVisionModel(MmprojModel):
|
||||
# Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
|
||||
# "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
|
||||
# Each variant maps to a different projector type in clip.cpp so image
|
||||
# preprocessing follows the correct code path.
|
||||
|
||||
class HunyuanOCRVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
# HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
|
||||
# HunyuanOCR uses max_image_size instead of image_size
|
||||
if "image_size" not in self.hparams_vision:
|
||||
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
|
||||
|
||||
@staticmethod
|
||||
def is_ocr_variant(hparams: dict) -> bool:
|
||||
"""Return True for HunyuanOCR, False for HunyuanVL.
|
||||
|
||||
The projector's output dim must equal the text model's hidden_size by
|
||||
construction (that's what "projector" means). HunyuanOCR pairs a 1B text
|
||||
backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
|
||||
ViT -> LLM projection dim is a hard architectural signature, not a
|
||||
magic number.
|
||||
"""
|
||||
vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
|
||||
return vision_out == 1024
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
assert self.hparams_vision is not None
|
||||
vcfg = self.hparams_vision
|
||||
|
||||
if self.is_ocr_variant(self.global_config):
|
||||
# --- HunyuanOCR ---
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
return
|
||||
|
||||
# --- HunyuanVL ---
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
|
||||
self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
|
||||
self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
|
||||
self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
|
||||
hparams = self.hparams_vision
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if not name.startswith("vit."):
|
||||
return
|
||||
return # skip text tensors
|
||||
# strip CLS token (row 0) from position embeddings so resize_position_embeddings works
|
||||
if "position_embedding" in name:
|
||||
data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
|
||||
@@ -12058,66 +12023,11 @@ class HunyuanVLVisionModel(MmprojModel):
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
|
||||
# Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
|
||||
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
|
||||
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanVLTextModel(HunYuanModel):
    # Text-model converter for checkpoints declaring "HunYuanVLForConditionalGeneration".
    # That HF architecture name is shared by HunyuanOCR and HunyuanVL: HunyuanOCR keeps the
    # HunYuan-Dense text backbone (standard RoPE), HunyuanVL uses a new LLM arch with XD-RoPE.
    # The GGUF architecture is therefore chosen per instance from the config.
    model_arch = gguf.MODEL_ARCH.HUNYUAN_VL

    @staticmethod
    def _is_ocr_config(hparams: dict) -> bool:
        # The OCR variant is recognized by a ViT projector output size of 1024 (HunyuanVL: 3072).
        # Keep in sync with HunyuanVLVisionModel.is_ocr_variant.
        vision_cfg = hparams.get("vision_config") or {}
        out_hidden_size = int(vision_cfg.get("out_hidden_size", 0))
        return out_hidden_size == 1024

    def __init__(self, dir_model: Path, *args, **kwargs):
        # the arch has to be decided before the base-class constructor runs, so look at the
        # hparams passed in by the caller or, if there are none, load them from the model dir
        raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
        self.model_arch = (
            gguf.MODEL_ARCH.HUNYUAN_DENSE
            if self._is_ocr_config(raw_hparams)
            else gguf.MODEL_ARCH.HUNYUAN_VL
        )
        super().__init__(dir_model, *args, **kwargs)

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        # XD-RoPE metadata is only written for the HunyuanVL backbone with rope_type "xdrope";
        # for HunyuanOCR (HunYuan-Dense arch) the parent class has already handled standard rope
        if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL or self.rope_parameters.get("rope_type") != "xdrope":
            return

        rope = self.rope_parameters
        writer = self.gguf_writer

        # HunyuanVL defaults; the C++ side later derives
        #   freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
        writer.add_rope_freq_base(float(rope["rope_theta"]))
        writer.add_rope_scaling_alpha(float(rope["alpha"]))
        writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
        writer.add_rope_scaling_factor(float(rope.get("factor", 1)))

        # the same value is written as original context length and as context length
        ctx_len = int(self.hparams["max_position_embeddings"])
        writer.add_rope_scaling_orig_ctx_len(ctx_len)
        writer.add_context_length(ctx_len)

        writer.add_rope_dimension_sections(list(rope["xdrope_section"]))

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # vision tensors ("vit." prefix) are written by HunyuanVLVisionModel, so nothing is
        # yielded for them here; everything else goes through the parent implementation
        if not name.startswith("vit."):
            yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("SmolLM3ForCausalLM")
|
||||
class SmolLM3Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.SMOLLM3
|
||||
|
||||
@@ -244,6 +244,7 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
|
||||
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
|
||||
- `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
|
||||
- `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
|
||||
- For Intel GPU and NPU detection in containers, the GPU and NPU user-space drivers/libraries must be present inside the image. They will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)
|
||||
|
||||
> [!NOTE]
|
||||
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
|
||||
@@ -273,6 +274,8 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
|
||||
Run llama.cpp with OpenVINO backend Docker container.
|
||||
Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.
|
||||
|
||||
> [!NOTE]
|
||||
> Intel GPU and NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).
|
||||
|
||||
```bash
|
||||
# Run Docker container
|
||||
|
||||
@@ -31,8 +31,6 @@ SYCL cross-platform capabilities enable support for other vendor GPUs as well.
|
||||
|
||||
## Recommended Release
|
||||
|
||||
### Windows
|
||||
|
||||
The following releases are verified and recommended:
|
||||
|
||||
|Commit ID|Tag|Release|Verified Platform| Update date|
|
||||
@@ -41,13 +39,6 @@ The following releases are verified and recommended:
|
||||
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|
||||
|
||||
### Ubuntu 24.04
|
||||
|
||||
The release packages for Ubuntu 24.04 x64 (FP32/FP16) only include the binary files of the llama.cpp SYCL backend. They require the target machine to have pre-installed Intel GPU drivers and oneAPI packages that are the same version as the build package. To get the version and installation info, refer to release.yml: ubuntu-24-sycl -> Download & Install oneAPI.
|
||||
|
||||
It is recommended to use them with Intel Docker.
|
||||
|
||||
The FP32 and FP16 packages differ in accuracy and performance on LLMs. Please choose between them according to your own test results.
|
||||
|
||||
## News
|
||||
|
||||
@@ -238,7 +229,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|
||||
|
||||
|Verified release|
|
||||
|-|
|
||||
|2025.3.3 |
|
||||
|2025.2.1|
|
||||
|2025.1|
|
||||
|2024.1|
|
||||
|
||||
@@ -8,24 +8,8 @@
|
||||
#include <clocale>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cinttypes>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
struct spec_checkpoint {
|
||||
int64_t n_tokens = 0;
|
||||
|
||||
std::vector<uint8_t> data;
|
||||
|
||||
size_t size() const {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return data.empty();
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
@@ -62,14 +46,6 @@ int main(int argc, char ** argv) {
|
||||
model_tgt = llama_init_tgt->model();
|
||||
ctx_tgt = llama_init_tgt->context();
|
||||
|
||||
// check if the context supports partial sequence removal
|
||||
const auto ctx_seq_rm = common_context_can_seq_rm(ctx_tgt);
|
||||
const bool use_ckpt = (ctx_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
|
||||
|
||||
if (use_ckpt) {
|
||||
LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
|
||||
|
||||
// load the draft model
|
||||
@@ -143,7 +119,7 @@ int main(int argc, char ** argv) {
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
|
||||
// target model sampling context
|
||||
common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling));
|
||||
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
|
||||
|
||||
// eval the prompt
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
|
||||
@@ -166,61 +142,21 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
|
||||
|
||||
size_t n_draft = 0;
|
||||
|
||||
llama_tokens draft;
|
||||
spec_checkpoint spec_ckpt;
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
const auto t_dec_start = ggml_time_us();
|
||||
|
||||
while (true) {
|
||||
// generate or reuse draft tokens
|
||||
// optionally, generate draft tokens that can be appended to the target batch
|
||||
//
|
||||
// this is the most important part of the speculation. the more probable tokens that are provided here
|
||||
// the better the performance will be. in theory, this computation can be performed asynchronously and even
|
||||
// offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
|
||||
// from a cache or lookup tables.
|
||||
//
|
||||
if (draft.empty()) {
|
||||
// generate a new draft
|
||||
draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
|
||||
llama_tokens draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
|
||||
|
||||
if ((int) draft.size() > params_spec.n_max) {
|
||||
LOG_WRN("draft size %zu exceeds max %d, truncating\n", draft.size(), params_spec.n_max);
|
||||
draft.resize(params_spec.n_max);
|
||||
}
|
||||
|
||||
if ((int) draft.size() < params_spec.n_min) {
|
||||
LOG_DBG("ignoring small draft: %zu < %d\n", draft.size(), params_spec.n_min);
|
||||
draft.clear();
|
||||
}
|
||||
|
||||
// save the original draft size
|
||||
n_draft = draft.size();
|
||||
|
||||
// save a checkpoint of the target context before evaluating the draft
|
||||
// this allows us to restore the state if partial draft acceptance occurs
|
||||
if (!draft.empty() && use_ckpt) {
|
||||
const size_t ckpt_size = llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
spec_ckpt.data.resize(ckpt_size);
|
||||
|
||||
const size_t n = llama_state_seq_get_data_ext(ctx_tgt, spec_ckpt.data.data(), ckpt_size, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
GGML_ASSERT(n == ckpt_size);
|
||||
|
||||
spec_ckpt.n_tokens = (int64_t) prompt_tgt.size();
|
||||
LOG_DBG("created speculative checkpoint (n_tokens = %" PRId64 ", size = %.3f MiB)\n",
|
||||
spec_ckpt.n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
|
||||
}
|
||||
} else {
|
||||
// we have a previous (partial) draft to reuse from checkpoint restoration
|
||||
if (use_ckpt) {
|
||||
GGML_ASSERT(!spec_ckpt.empty());
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(n_draft > 0);
|
||||
//LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
|
||||
|
||||
// always have a token to evaluate from before - id_last
|
||||
common_batch_clear(batch_tgt);
|
||||
@@ -242,12 +178,6 @@ int main(int argc, char ** argv) {
|
||||
llama_decode(ctx_tgt, batch_tgt);
|
||||
}
|
||||
|
||||
// only save the sampler state if we use checkpoints
|
||||
common_sampler_ptr smpl_save;
|
||||
if (use_ckpt) {
|
||||
smpl_save.reset(common_sampler_clone(smpl.get()));
|
||||
}
|
||||
|
||||
// sample from the full target batch and return the accepted tokens based on the target sampler
|
||||
//
|
||||
// for each token to be accepted, the sampler would have to sample that same token
|
||||
@@ -255,38 +185,14 @@ int main(int argc, char ** argv) {
|
||||
// available logits from the batch and sample the next token until we run out of logits or the sampler
|
||||
// disagrees with the draft
|
||||
//
|
||||
auto ids = common_sampler_sample_and_accept_n(smpl.get(), ctx_tgt, draft);
|
||||
const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
|
||||
|
||||
//LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());
|
||||
|
||||
GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
|
||||
|
||||
// check for partial draft acceptance:
|
||||
// if the context doesn't support partial sequence removal, restore the checkpoint
|
||||
// and make the accepted tokens the new partial draft for the next iteration
|
||||
if (use_ckpt && ids.size() - 1 < draft.size()) {
|
||||
LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size());
|
||||
|
||||
draft = std::move(ids);
|
||||
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx_tgt, spec_ckpt.data.data(), spec_ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
GGML_ASSERT(n == spec_ckpt.size());
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, spec_ckpt.n_tokens, -1);
|
||||
|
||||
prompt_tgt.resize(spec_ckpt.n_tokens);
|
||||
smpl = std::move(smpl_save);
|
||||
|
||||
n_past = (int) prompt_tgt.size();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
common_speculative_accept(spec, ids.size() - 1);
|
||||
|
||||
// full acceptance: consume the draft and commit accepted tokens
|
||||
n_past += ids.size() - 1;
|
||||
n_drafted += n_draft; // note: we ignore the discarded small drafts
|
||||
n_drafted += draft.size(); // note: we ignore the discarded small drafts
|
||||
n_accept += ids.size() - 1;
|
||||
n_predict += ids.size();
|
||||
|
||||
@@ -316,9 +222,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
|
||||
|
||||
// clear the draft since it has been consumed
|
||||
draft.clear();
|
||||
|
||||
{
|
||||
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
|
||||
|
||||
@@ -351,10 +254,11 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_INF("\n");
|
||||
LOG_INF("target:\n\n");
|
||||
common_perf_print(ctx_tgt, smpl.get());
|
||||
common_perf_print(ctx_tgt, smpl);
|
||||
|
||||
llama_batch_free(batch_tgt);
|
||||
|
||||
common_sampler_free(smpl);
|
||||
common_speculative_free(spec);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
+3
-3
@@ -4,8 +4,8 @@ project("ggml" C CXX ASM)
|
||||
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 10)
|
||||
set(GGML_VERSION_PATCH 0)
|
||||
set(GGML_VERSION_MINOR 9)
|
||||
set(GGML_VERSION_PATCH 11)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||
@@ -213,7 +213,7 @@ set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
|
||||
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
|
||||
|
||||
option(GGML_HIP "ggml: use HIP" OFF)
|
||||
option(GGML_HIP_GRAPHS "ggml: use HIP graph" ON)
|
||||
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
||||
option(GGML_HIP_RCCL "ggml: use ROCm Collective Comm. Library" OFF)
|
||||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||
|
||||
@@ -473,7 +473,7 @@ target_link_libraries(ggml-base PRIVATE Threads::Threads)
|
||||
find_library(MATH_LIBRARY m)
|
||||
if (MATH_LIBRARY)
|
||||
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
|
||||
target_link_libraries(ggml-base PRIVATE m)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
+80
-180
@@ -1133,7 +1133,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
|
||||
if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) {
|
||||
t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);
|
||||
if (t_ij->view_offs > 0 && split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
|
||||
GGML_ASSERT(tensor->ne[split_dim] != 0);
|
||||
GGML_ASSERT(ne[split_dim] != 0 && tensor->ne[split_dim] != 0);
|
||||
const int split_dim_view_src = ggml_backend_meta_get_split_state(tensor->view_src, /*assume_sync =*/ true).axis;
|
||||
GGML_ASSERT(split_dim_view_src >= 0 && split_dim_view_src < GGML_MAX_DIMS);
|
||||
|
||||
@@ -1170,28 +1170,6 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
|
||||
|
||||
simple_tensors.push_back(t_ij);
|
||||
}
|
||||
|
||||
// If one of the sources has a zero-sized slice, disable the computation:
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (tensor->src[i] == nullptr || !ggml_backend_buffer_is_meta(tensor->src[i]->buffer)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const ggml_backend_meta_split_state split_state_src = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
|
||||
if (split_state_src.axis < 0 || split_state_src.axis >= GGML_MAX_DIMS) {
|
||||
continue;
|
||||
}
|
||||
for (size_t j = 0; j < n_simple_bufs; j++) {
|
||||
int64_t ne_sum = 0;
|
||||
for (size_t s = 0; s < split_state_src.n_segments; s++) {
|
||||
ne_sum += split_state_src.ne[s*n_simple_bufs + j];
|
||||
}
|
||||
if (ne_sum == 0) {
|
||||
simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buf_ctx->simple_tensors[tensor] = simple_tensors;
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
@@ -1464,20 +1442,17 @@ struct ggml_backend_meta_context {
|
||||
struct backend_config {
|
||||
ggml_backend_t backend;
|
||||
|
||||
std::vector<cgraph_config> cgraphs;
|
||||
std::vector<ggml_tensor *> nodes;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
std::vector<cgraph_config> cgraphs;
|
||||
std::vector<ggml_tensor *> nodes;
|
||||
ggml_backend_buffer_ptr buf;
|
||||
|
||||
backend_config(ggml_backend_t backend, const size_t n_reduce_steps) : backend(backend) {
|
||||
bufs.resize(n_reduce_steps);
|
||||
}
|
||||
backend_config(ggml_backend_t backend) : backend(backend) {}
|
||||
};
|
||||
std::string name;
|
||||
std::vector<backend_config> backend_configs;
|
||||
ggml_context_ptr ctx;
|
||||
std::vector<ggml_cgraph *> cgraphs_aux;
|
||||
std::vector<ggml_tensor *> nodes_aux;
|
||||
size_t n_reduce_steps;
|
||||
int max_nnodes = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
size_t max_subgraphs = 0;
|
||||
@@ -1489,7 +1464,6 @@ struct ggml_backend_meta_context {
|
||||
|
||||
ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
|
||||
const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
|
||||
n_reduce_steps = std::ceil(std::log2(n_devs));
|
||||
name = "Meta(";
|
||||
std::vector<ggml_backend_t> simple_backends;
|
||||
backend_configs.reserve(n_devs);
|
||||
@@ -1501,7 +1475,7 @@ struct ggml_backend_meta_context {
|
||||
}
|
||||
name += ggml_backend_dev_name(simple_dev);
|
||||
simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
|
||||
backend_configs.emplace_back(simple_backends.back(), n_reduce_steps);
|
||||
backend_configs.emplace_back(simple_backends.back());
|
||||
}
|
||||
name += ")";
|
||||
|
||||
@@ -1531,6 +1505,10 @@ struct ggml_backend_meta_context {
|
||||
ggml_backend_free(bc.backend);
|
||||
}
|
||||
}
|
||||
|
||||
size_t n_reduce_steps() const {
|
||||
return std::ceil(std::log2(backend_configs.size()));
|
||||
}
|
||||
};
|
||||
|
||||
static const char * ggml_backend_meta_get_name(ggml_backend_t backend) {
|
||||
@@ -1683,36 +1661,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
|
||||
ggml_tensor * node = cgraph->nodes[id];
|
||||
int32_t n_used = ggml_node_get_use_count(cgraph, id);
|
||||
|
||||
// Skip MIRRORED nodes that don't consume node
|
||||
auto skip_unrelated = [&]() {
|
||||
while (id + 1 < cgraph->n_nodes) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (ggml_backend_meta_get_split_state(next, false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
break;
|
||||
}
|
||||
bool safe = true;
|
||||
for (int s = 0; s < GGML_MAX_SRC; s++) {
|
||||
if (next->src[s] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (next->src[s] == node) {
|
||||
safe = false;
|
||||
break;
|
||||
}
|
||||
if (ggml_backend_meta_get_split_state(next->src[s], false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
safe = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!safe) {
|
||||
break;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
};
|
||||
|
||||
skip_unrelated();
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
@@ -1727,12 +1675,10 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
// Chain of MULs with MIRRORED src[1]
|
||||
while (true) {
|
||||
skip_unrelated();
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_MUL && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
@@ -1740,8 +1686,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1810,17 +1754,16 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
if (max_tmp_size > backend_ctx->max_tmp_size) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i = 0; i < backend_ctx->n_reduce_steps; i++) {
|
||||
bcj.bufs[i].reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
|
||||
}
|
||||
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
|
||||
}
|
||||
backend_ctx->max_tmp_size = max_tmp_size;
|
||||
}
|
||||
|
||||
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
|
||||
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
|
||||
const size_t n_nodes_per_device = 3 * backend_ctx->n_reduce_steps; // tmp + ADD (+zeroing) graph per step and device
|
||||
const size_t n_cgraphs_per_device = 2 * backend_ctx->n_reduce_steps; // ADD ( + zeroing) graph per step and device
|
||||
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
|
||||
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
|
||||
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
|
||||
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
|
||||
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
|
||||
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
|
||||
@@ -1869,6 +1812,11 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
size_t iga = 0; // i graph aux
|
||||
size_t ina = 0; // i node aux
|
||||
|
||||
// FIXME usage_counts
|
||||
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
|
||||
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
|
||||
return ret;
|
||||
};
|
||||
auto get_node_aux = [&](ggml_tensor * t) -> ggml_tensor * {
|
||||
ggml_tensor * ret = backend_ctx->nodes_aux[ina++];
|
||||
memset(ret, 0, sizeof(ggml_tensor));
|
||||
@@ -1880,110 +1828,75 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
auto set_tmp_data = [&](ggml_tensor * tensor, const size_t j, const size_t i_buf) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
ggml_backend_buffer_ptr & buf_ptr = bcj.bufs[i_buf];
|
||||
if (!buf_ptr || ggml_backend_buffer_get_size(buf_ptr.get()) < backend_ctx->max_tmp_size) {
|
||||
buf_ptr.reset(ggml_backend_alloc_buffer(bcj.backend, backend_ctx->max_tmp_size));
|
||||
}
|
||||
tensor->buffer = buf_ptr.get();
|
||||
tensor->data = ggml_backend_buffer_get_base(buf_ptr.get());
|
||||
};
|
||||
// FIXME usage_counts
|
||||
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
|
||||
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
|
||||
return ret;
|
||||
};
|
||||
|
||||
// Preferentially use backend-specific allreduce_tensor_async (e.g. NCCL for CUDA), use a generic fallback if unavailable:
|
||||
auto allreduce_fallback = [&](size_t i) -> ggml_status {
|
||||
std::vector<ggml_cgraph *> step_cgraphs(n_backends, nullptr);
|
||||
|
||||
// Zero out nodes that were disabled due to having a zero-sized slice:
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
ggml_tensor * node = bcj.cgraphs[i].cgraph_main->nodes[bcj.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
if (node->flags & GGML_TENSOR_FLAG_COMPUTE) {
|
||||
continue;
|
||||
}
|
||||
ggml_tensor * node_zero = get_node_aux(node);
|
||||
node_zero->op = GGML_OP_SCALE; // FIXME 0.0f * NaN == NaN
|
||||
node_zero->src[0] = node;
|
||||
ggml_set_op_params_f32(node_zero, 0, 0.0f);
|
||||
node_zero->data = node->data;
|
||||
node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
|
||||
step_cgraphs[j] = get_cgraph_aux();
|
||||
step_cgraphs[j]->nodes[0] = node_zero;
|
||||
step_cgraphs[j]->n_nodes = 1;
|
||||
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, step_cgraphs[j]);
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
}
|
||||
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
|
||||
|
||||
auto push_data = [&](const size_t j_src, const size_t j_dst, const size_t i_buf) {
|
||||
assert(step_cgraphs[j_dst] == nullptr);
|
||||
auto & bcj_src = backend_ctx->backend_configs[j_src];
|
||||
auto & bcj_dst = backend_ctx->backend_configs[j_dst];
|
||||
|
||||
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
GGML_ASSERT(ggml_is_contiguous(node_src));
|
||||
GGML_ASSERT(ggml_is_contiguous(node_dst));
|
||||
|
||||
ggml_tensor * node_tmp = get_node_aux(node_dst);
|
||||
set_tmp_data(node_tmp, j_dst, i_buf);
|
||||
|
||||
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_tmp);
|
||||
|
||||
ggml_tensor * node_red = get_node_aux(node_dst);
|
||||
node_red->view_src = node_dst->view_src == nullptr ? node_dst : node_dst->view_src;
|
||||
node_red->view_offs = node_dst->view_offs;
|
||||
node_red->op = GGML_OP_ADD;
|
||||
node_red->src[0] = node_dst;
|
||||
node_red->src[1] = node_tmp;
|
||||
node_red->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
ggml_backend_view_init(node_red);
|
||||
|
||||
ggml_cgraph * cgraph_aux = get_cgraph_aux();
|
||||
cgraph_aux->nodes[0] = node_red;
|
||||
cgraph_aux->n_nodes = 1;
|
||||
step_cgraphs[j_dst] = cgraph_aux;
|
||||
};
|
||||
|
||||
size_t offset_j = n_backends/2;
|
||||
while ((offset_j & (offset_j - 1)) != 0) {
|
||||
offset_j--;
|
||||
}
|
||||
const size_t offset_j_max = offset_j;
|
||||
size_t i_buf = 0;
|
||||
|
||||
// If n_backends is not a power of 2, fold in the excess prior to butterfly reduction:
|
||||
for (size_t j_src = 2*offset_j_max; j_src < n_backends; j_src++) {
|
||||
const size_t j_dst = j_src - 2*offset_j_max;
|
||||
push_data(j_src, j_dst, i_buf);
|
||||
const ggml_status status = ggml_backend_graph_compute_async(backend_ctx->backend_configs[j_dst].backend, step_cgraphs[j_dst]);
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
i_buf = 1;
|
||||
}
|
||||
|
||||
// Butterfly reduction:
|
||||
for (; offset_j >= 1; offset_j /= 2) {
|
||||
for (size_t offset_j = 1; offset_j < n_backends; offset_j *= 2) {
|
||||
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
|
||||
|
||||
for (size_t j = 0; j < 2*offset_j_max; j++) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
const size_t j_other = j ^ offset_j;
|
||||
if (j_other >= n_backends) {
|
||||
if (j_other > j) {
|
||||
continue;
|
||||
}
|
||||
push_data(j, j_other, i_buf);
|
||||
|
||||
auto & bcj1 = backend_ctx->backend_configs[j];
|
||||
auto & bcj2 = backend_ctx->backend_configs[j_other];
|
||||
|
||||
ggml_tensor * node1 = bcj1.cgraphs[i].cgraph_main->nodes[bcj1.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_tensor * node2 = bcj2.cgraphs[i].cgraph_main->nodes[bcj2.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
GGML_ASSERT(ggml_is_contiguous(node1));
|
||||
GGML_ASSERT(ggml_is_contiguous(node2));
|
||||
|
||||
// Tmp tensors to receive P2P copies
|
||||
ggml_tensor * node_tmp_1 = get_node_aux(node1);
|
||||
node_tmp_1->buffer = bcj1.buf.get();
|
||||
node_tmp_1->data = ggml_backend_buffer_get_base(bcj1.buf.get());
|
||||
|
||||
ggml_tensor * node_tmp_2 = get_node_aux(node2);
|
||||
node_tmp_2->buffer = bcj2.buf.get();
|
||||
node_tmp_2->data = ggml_backend_buffer_get_base(bcj2.buf.get());
|
||||
|
||||
// 2 P2P copies: exchange full buffers
|
||||
ggml_backend_tensor_copy_async(bcj1.backend, bcj2.backend, node1, node_tmp_2);
|
||||
ggml_backend_tensor_copy_async(bcj2.backend, bcj1.backend, node2, node_tmp_1);
|
||||
|
||||
// Local ADD: node1 += tmp1 (in-place via view)
|
||||
ggml_tensor * node_red_1 = get_node_aux(node1);
|
||||
node_red_1->view_src = node1->view_src == nullptr ? node1 : node1->view_src;
|
||||
node_red_1->view_offs = node1->view_offs;
|
||||
node_red_1->op = GGML_OP_ADD;
|
||||
node_red_1->src[0] = node1;
|
||||
node_red_1->src[1] = node_tmp_1;
|
||||
node_red_1->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
ggml_backend_view_init(node_red_1);
|
||||
|
||||
// Local ADD: node2 += tmp2 (in-place via view)
|
||||
ggml_tensor * node_red_2 = get_node_aux(node2);
|
||||
node_red_2->view_src = node2->view_src == nullptr ? node2 : node2->view_src;
|
||||
node_red_2->view_offs = node2->view_offs;
|
||||
node_red_2->op = GGML_OP_ADD;
|
||||
node_red_2->src[0] = node2;
|
||||
node_red_2->src[1] = node_tmp_2;
|
||||
node_red_2->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
ggml_backend_view_init(node_red_2);
|
||||
|
||||
// Build 1-node cgraphs for the ADD ops
|
||||
ggml_cgraph * cgraph_aux_1 = get_cgraph_aux();
|
||||
cgraph_aux_1->nodes[0] = node_red_1;
|
||||
cgraph_aux_1->n_nodes = 1;
|
||||
step_cgraphs[j] = cgraph_aux_1;
|
||||
|
||||
ggml_cgraph * cgraph_aux_2 = get_cgraph_aux();
|
||||
cgraph_aux_2->nodes[0] = node_red_2;
|
||||
cgraph_aux_2->n_nodes = 1;
|
||||
step_cgraphs[j_other] = cgraph_aux_2;
|
||||
}
|
||||
|
||||
for (size_t j = 0; j < 2*offset_j_max; j++) {
|
||||
// Execute local ADDs for this step
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
if (step_cgraphs[j] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
@@ -1993,20 +1906,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
return status;
|
||||
}
|
||||
}
|
||||
i_buf++;
|
||||
}
|
||||
assert(i_buf == backend_ctx->n_reduce_steps);
|
||||
|
||||
// If n_backends is not a power of 2, copy back the reduced tensors to the excess:
|
||||
for (size_t j = 2*offset_j_max; j < n_backends; j++) {
|
||||
auto & bcj_src = backend_ctx->backend_configs[j - 2*offset_j_max];
|
||||
auto & bcj_dst = backend_ctx->backend_configs[j];
|
||||
|
||||
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_dst);
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
|
||||
@@ -83,6 +83,7 @@
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
|
||||
@@ -151,6 +151,8 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const block_q1_0 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv = vdupq_n_f32(0.0f);
|
||||
|
||||
@@ -210,13 +212,31 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv);
|
||||
sumf = vaddvq_f32(sumv);
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
// Scalar fallback
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
// Process 4 Q8_0 blocks
|
||||
for (int k = 0; k < 4; k++) {
|
||||
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const int bit_index = k * QK8_0 + j;
|
||||
const int byte_index = bit_index / 8;
|
||||
const int bit_offset = bit_index % 8;
|
||||
|
||||
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
|
||||
sumi += xi * y[i*4 + k].qs[j];
|
||||
}
|
||||
sumf += d0 * d1 * sumi;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -274,18 +274,6 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
|
||||
}
|
||||
#endif
|
||||
#elif defined(__SSSE3__)
|
||||
static inline __m128i bytes_from_bits_16(const uint8_t * x) {
|
||||
uint16_t x16;
|
||||
memcpy(&x16, x, sizeof(uint16_t));
|
||||
|
||||
const __m128i shuf_mask = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
|
||||
__m128i bytes = _mm_shuffle_epi8(_mm_set1_epi16((short) x16), shuf_mask);
|
||||
const __m128i bit_mask = _mm_set_epi64x(0x7fbfdfeff7fbfdfe, 0x7fbfdfeff7fbfdfe);
|
||||
bytes = _mm_or_si128(bytes, bit_mask);
|
||||
|
||||
return _mm_cmpeq_epi8(bytes, _mm_set1_epi64x(-1));
|
||||
}
|
||||
|
||||
// horizontally add 4x4 floats
|
||||
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
|
||||
__m128 res_0 =_mm_hadd_ps(a, b);
|
||||
@@ -552,152 +540,6 @@ static inline __m128i get_scale_shuffle(int i) {
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK1_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_q1_0 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
#if defined(__AVX2__)
|
||||
const __m256i ones_8 = _mm256_set1_epi8(1);
|
||||
const __m256i ones_16 = _mm256_set1_epi16(1);
|
||||
const __m256i byte_shuf = _mm256_setr_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
|
||||
const __m256i bit_masks = _mm256_setr_epi8(
|
||||
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128,
|
||||
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128);
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
|
||||
const uint32_t * GGML_RESTRICT qs32 = (const uint32_t *) x[ib].qs;
|
||||
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
|
||||
|
||||
__m256 acc_block;
|
||||
{
|
||||
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[0].qs);
|
||||
const __m256i sm = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[0]), byte_shuf), bit_masks), zero);
|
||||
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
|
||||
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
|
||||
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), _mm256_cvtepi32_ps(s32));
|
||||
}
|
||||
for (int K = 1; K < 4; ++K) {
|
||||
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[K].qs);
|
||||
const __m256i sm = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[K]), byte_shuf), bit_masks), zero);
|
||||
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
|
||||
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
|
||||
acc_block = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), _mm256_cvtepi32_ps(s32), acc_block);
|
||||
}
|
||||
acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__AVX__)
|
||||
const __m128i ones_8 = _mm_set1_epi8(1);
|
||||
const __m128i ones_16 = _mm_set1_epi16(1);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
|
||||
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
|
||||
__m256 acc_block;
|
||||
{
|
||||
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[0]);
|
||||
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
|
||||
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
|
||||
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[0]);
|
||||
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[16]);
|
||||
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
|
||||
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
|
||||
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
|
||||
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
|
||||
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
|
||||
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
|
||||
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
|
||||
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
|
||||
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
|
||||
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), q);
|
||||
}
|
||||
for(int K = 1; K < 4; ++K) {
|
||||
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[(K) * 4]);
|
||||
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
|
||||
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
|
||||
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[0]);
|
||||
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[16]);
|
||||
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
|
||||
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
|
||||
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
|
||||
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
|
||||
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
|
||||
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
|
||||
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
|
||||
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
|
||||
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
|
||||
acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(K)].d)), q));
|
||||
}
|
||||
#undef Q1_AVX_BLOCK
|
||||
|
||||
acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__SSSE3__)
|
||||
const __m128i ones_8 = _mm_set1_epi8(1);
|
||||
const __m128i ones_16 = _mm_set1_epi16(1);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128 acc_0 = _mm_setzero_ps();
|
||||
__m128 acc_1 = _mm_setzero_ps();
|
||||
__m128 acc_2 = _mm_setzero_ps();
|
||||
__m128 acc_3 = _mm_setzero_ps();
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const __m128 d0 = _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
|
||||
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
|
||||
|
||||
#define Q1_SSSE3_BLOCK(QS_OFF, Y_IDX, ACC) \
|
||||
{ \
|
||||
const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 0]); \
|
||||
const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 2]); \
|
||||
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[0]); \
|
||||
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[16]); \
|
||||
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero); \
|
||||
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero); \
|
||||
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0); \
|
||||
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1); \
|
||||
const __m128i sum_0 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_0), ones_16); \
|
||||
const __m128i sum_1 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_1), ones_16); \
|
||||
const __m128 q = _mm_cvtepi32_ps(_mm_add_epi32(sum_0, sum_1)); \
|
||||
(ACC) = _mm_add_ps((ACC), _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(Y_IDX)].d))), q)); \
|
||||
}
|
||||
Q1_SSSE3_BLOCK(0, 0, acc_0)
|
||||
Q1_SSSE3_BLOCK(4, 1, acc_1)
|
||||
Q1_SSSE3_BLOCK(8, 2, acc_2)
|
||||
Q1_SSSE3_BLOCK(12, 3, acc_3)
|
||||
#undef Q1_SSSE3_BLOCK
|
||||
}
|
||||
|
||||
*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
@@ -137,28 +137,22 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||
float sumf = 0.0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
float sumi = 0.0f;
|
||||
|
||||
for (int k = 0; k < 4; k++) {
|
||||
const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
|
||||
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
|
||||
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
|
||||
|
||||
int sumi_block = 0;
|
||||
|
||||
const uint8_t * GGML_RESTRICT bits = &x[i].qs[k * 4];
|
||||
const int8_t * GGML_RESTRICT qy = yb->qs;
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const int bit_index = k * QK8_0 + j;
|
||||
const int byte_index = bit_index / 8;
|
||||
const int bit_offset = bit_index % 8;
|
||||
|
||||
for (int b = 0; b < 4; ++b, qy += 8) {
|
||||
const unsigned mask = bits[b];
|
||||
sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
|
||||
+ ((mask & 0x02) ? qy[1] : -qy[1])
|
||||
+ ((mask & 0x04) ? qy[2] : -qy[2])
|
||||
+ ((mask & 0x08) ? qy[3] : -qy[3])
|
||||
+ ((mask & 0x10) ? qy[4] : -qy[4])
|
||||
+ ((mask & 0x20) ? qy[5] : -qy[5])
|
||||
+ ((mask & 0x40) ? qy[6] : -qy[6])
|
||||
+ ((mask & 0x80) ? qy[7] : -qy[7]);
|
||||
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
|
||||
sumi_block += xi * y[i*4 + k].qs[j];
|
||||
}
|
||||
|
||||
sumi += d1 * sumi_block;
|
||||
|
||||
@@ -368,21 +368,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
}
|
||||
|
||||
~ggml_cuda_pool_leg() {
|
||||
clear_pool();
|
||||
GGML_ASSERT(pool_size == 0);
|
||||
}
|
||||
|
||||
void clear_pool() {
|
||||
ggml_cuda_set_device(device);
|
||||
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
||||
ggml_cuda_buffer & b = buffer_pool[i];
|
||||
if (b.ptr != nullptr) {
|
||||
CUDA_CHECK(cudaFree(b.ptr));
|
||||
pool_size -= b.size;
|
||||
b.ptr = nullptr;
|
||||
b.size = 0;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(pool_size == 0);
|
||||
}
|
||||
|
||||
void * alloc(size_t size, size_t * actual_size) override {
|
||||
@@ -427,20 +421,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
ggml_cuda_set_device(device);
|
||||
cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
|
||||
if (err == cudaErrorMemoryAllocation) {
|
||||
(void)cudaGetLastError();
|
||||
const size_t cached_bytes = pool_size;
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, flushing %.2f MiB of cached buffers and retrying\n",
|
||||
device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
clear_pool();
|
||||
err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
|
||||
if (err == cudaSuccess) {
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
|
||||
}
|
||||
}
|
||||
CUDA_CHECK(err);
|
||||
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
@@ -1222,13 +1203,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
// For small tensors, simply reduce them as FP32.
|
||||
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
|
||||
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
if ((tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, ggml_nbytes(tensors[i]), cuda_ctx->stream()));
|
||||
}
|
||||
}
|
||||
NCCL_CHECK(ncclGroupStart());
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
|
||||
@@ -1250,11 +1224,7 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
tmp[i].alloc(ne);
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
if (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) {
|
||||
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
|
||||
} else {
|
||||
CUDA_CHECK(cudaMemsetAsync(tmp[i].get(), 0, ne * sizeof(nv_bfloat16), cuda_ctx->stream()));
|
||||
}
|
||||
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -3592,30 +3562,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
||||
return true;
|
||||
}
|
||||
|
||||
if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_SQR
|
||||
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_RELU) {
|
||||
const ggml_tensor * unary = cgraph->nodes[node_idx];
|
||||
const ggml_tensor * sqr = cgraph->nodes[node_idx+1];
|
||||
|
||||
if (ggml_get_unary_op(unary) != GGML_UNARY_OP_RELU) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (unary->type != GGML_TYPE_F32 && unary->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (unary->type != sqr->type) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_is_contiguous(unary->src[0])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
|
||||
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
|
||||
const ggml_tensor *scale = cgraph->nodes[node_idx];
|
||||
@@ -4124,12 +4070,6 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) {
|
||||
ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i+1]);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
|
||||
i += 2;
|
||||
ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
|
||||
|
||||
@@ -65,11 +65,6 @@ static __device__ __forceinline__ float op_sqr(float x) {
|
||||
return x * x;
|
||||
}
|
||||
|
||||
// Fused ReLU + square for a single element: returns max(x, 0)^2.
static __device__ __forceinline__ float op_relu_sqr(float x) {
    const float rectified = fmaxf(x, 0.0f);

    return rectified * rectified;
}
|
||||
|
||||
static __device__ __forceinline__ float op_sqrt(float x) {
|
||||
return sqrtf(x);
|
||||
}
|
||||
@@ -620,21 +615,3 @@ void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary
|
||||
GGML_ABORT("Unsupported unary op for fused unary+mul");
|
||||
}
|
||||
}
|
||||
|
||||
/* fused relu + sqr */
|
||||
|
||||
// Runs the fused ReLU+square element-wise kernel.
//
// Reads the input of `relu_node` (relu_node->src[0]) and writes op_relu_sqr(x)
// for every element into the data buffer of `sqr_node`, on the stream returned
// by ctx.stream().
//
// Aborts (GGML_ASSERT) unless the input is contiguous, is F32 or F16, and has
// the same type as `sqr_node`.
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node) {
    const ggml_tensor * input  = relu_node->src[0];
    cudaStream_t        stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous(input));
    GGML_ASSERT(input->type == GGML_TYPE_F32 || input->type == GGML_TYPE_F16);
    GGML_ASSERT(input->type == sqr_node->type);

    const int n_elements = ggml_nelements(input);

    switch (input->type) {
        case GGML_TYPE_F16:
            unary_cuda<op_relu_sqr>((const half *)input->data, (half *)sqr_node->data, n_elements, stream);
            break;
        default:
            // Only F32 can reach here because of the type assert above.
            unary_cuda<op_relu_sqr>((const float *)input->data, (float *)sqr_node->data, n_elements, stream);
            break;
    }
}
|
||||
|
||||
@@ -91,8 +91,6 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node);
|
||||
|
||||
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node);
|
||||
|
||||
__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
|
||||
return x / (1.0f + expf(-x));
|
||||
}
|
||||
|
||||
Vendored
-1
@@ -58,7 +58,6 @@
|
||||
#define cudaDeviceProp hipDeviceProp_t
|
||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||
#define cudaError_t hipError_t
|
||||
#define cudaErrorMemoryAllocation hipErrorOutOfMemory
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
|
||||
Vendored
-1
@@ -42,7 +42,6 @@
|
||||
#define cudaDeviceProp musaDeviceProp
|
||||
#define cudaDeviceSynchronize musaDeviceSynchronize
|
||||
#define cudaError_t musaError_t
|
||||
#define cudaErrorMemoryAllocation musaErrorMemoryAllocation
|
||||
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
||||
|
||||
@@ -2596,29 +2596,6 @@ static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * se
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
// diag only supports F32 currently
|
||||
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Input must have ne[1] == 1 (vector input)
|
||||
if (src0->ne[1] != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Output must be square in first two dimensions
|
||||
if (dst->ne[0] != dst->ne[1] || dst->ne[0] != src0->ne[0]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
return true;
|
||||
}
|
||||
|
||||
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
|
||||
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
||||
return sess->c_name();
|
||||
@@ -2655,8 +2632,6 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
case GGML_OP_ROPE: return HTP_OP_ROPE;
|
||||
case GGML_OP_REPEAT: return HTP_OP_REPEAT;
|
||||
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
|
||||
case GGML_OP_FILL: return HTP_OP_FILL;
|
||||
case GGML_OP_DIAG: return HTP_OP_DIAG;
|
||||
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(t)) {
|
||||
@@ -3054,17 +3029,6 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
|
||||
|
||||
@@ -3195,14 +3159,6 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_cumsum(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_FILL:
|
||||
supp = ggml_hexagon_supported_fill(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_DIAG:
|
||||
supp = ggml_hexagon_supported_diag(sess, op);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -34,8 +34,6 @@ add_library(${HTP_LIB} SHARED
|
||||
argsort-ops.c
|
||||
ssm-conv.c
|
||||
cumsum-ops.c
|
||||
fill-ops.c
|
||||
diag-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
|
||||
@@ -1,216 +0,0 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_perf.h>
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hvx-types.h"
|
||||
#include "hex-utils.h"
|
||||
#include "hvx-copy.h"
|
||||
#include "hex-dma.h"
|
||||
|
||||
#define htp_diag_tensors_preamble \
|
||||
const struct htp_tensor * restrict src0 = octx->src[0]; \
|
||||
const struct htp_tensor * restrict dst = octx->dst; \
|
||||
\
|
||||
const uint32_t ne02 = src0->ne[2]; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
\
|
||||
const uint32_t nb02 = src0->nb[2]; \
|
||||
const uint32_t nb03 = src0->nb[3]; \
|
||||
\
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3];
|
||||
|
||||
struct htp_diag_context {
|
||||
struct htp_ops_context * octx;
|
||||
size_t src_batch_size;
|
||||
size_t dst_row_size;
|
||||
size_t src_batch_size_aligned;
|
||||
size_t dst_row_size_aligned;
|
||||
uint32_t batches_per_thread;
|
||||
uint32_t total_batches;
|
||||
};
|
||||
|
||||
#define htp_diag_preamble \
|
||||
struct htp_diag_context * dctx = (struct htp_diag_context *) data; \
|
||||
struct htp_ops_context * octx = dctx->octx; \
|
||||
htp_diag_tensors_preamble;
|
||||
|
||||
// Produces one row of a diagonal matrix: `n` floats written to `dst`, all zero
// except element `row_idx`, which receives src[row_idx].
static inline void hvx_diag_row_f32(const float * restrict src, float * restrict dst,
                                    uint32_t row_idx, uint32_t n) {
    // src and dst are restrict-qualified, so reading the value up front is
    // equivalent to reading it after the row has been cleared.
    const float diag_value = src[row_idx];

    // NOTE(review): the `_a` suffix suggests this splat expects an aligned
    // destination - confirm that holds for the non-DMA caller writing to dst rows.
    hvx_splat_f32_a((uint8_t *) dst, 0.0f, n);

    dst[row_idx] = diag_value;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per thread worker: DMA src fetch, compute in VTCM, DMA dst writeback
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void diag_thread_f32_dma(unsigned int nth, unsigned int ith, void * data) {
|
||||
htp_diag_preamble;
|
||||
dma_queue * dma_queue = octx->ctx->dma[ith];
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
const uint32_t ib0 = dctx->batches_per_thread * ith;
|
||||
const uint32_t ib1 = MIN(ib0 + dctx->batches_per_thread, dctx->total_batches);
|
||||
|
||||
if (ib0 >= ib1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t src_batch_size = dctx->src_batch_size;
|
||||
const size_t dst_row_size = dctx->dst_row_size;
|
||||
const size_t src_batch_size_aligned = dctx->src_batch_size_aligned;
|
||||
const size_t dst_row_size_aligned = dctx->dst_row_size_aligned;
|
||||
|
||||
const uint8_t * src_data = (const uint8_t *) src0->data;
|
||||
uint8_t * dst_data = (uint8_t *) dst->data;
|
||||
|
||||
// 1 src buffer + 1 dst row buffer per thread in VTCM
|
||||
uint8_t * src_spad = octx->src0_spad.data + (ith * src_batch_size_aligned);
|
||||
uint8_t * dst_spad = octx->dst_spad.data + (ith * dst_row_size_aligned);
|
||||
|
||||
for (uint32_t ib = ib0; ib < ib1; ib++) {
|
||||
const uint32_t i3 = ib / ne02;
|
||||
const uint32_t i2 = ib % ne02;
|
||||
|
||||
const uint8_t * src_batch = src_data + i3 * nb03 + i2 * nb02;
|
||||
|
||||
// Fetch source vector into VTCM
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue,
|
||||
dma_make_ptr(src_spad, src_batch),
|
||||
src_batch_size_aligned, src_batch_size, 1);
|
||||
dma_queue_flush(dma_queue);
|
||||
|
||||
const float * src_spad_f32 = (const float *) src_spad;
|
||||
float * dst_spad_f32 = (float *) dst_spad;
|
||||
|
||||
for (uint32_t i1 = 0; i1 < ne1; i1++) {
|
||||
// Compute row in VTCM
|
||||
hvx_diag_row_f32(src_spad_f32, dst_spad_f32, i1, ne0);
|
||||
|
||||
// Write completed row back to DDR
|
||||
uint8_t * dst_row = dst_data + i3 * nb3 + i2 * nb2 + i1 * nb1;
|
||||
dma_queue_push_vtcm_to_ddr(dma_queue,
|
||||
dma_make_ptr(dst_row, dst_spad),
|
||||
dst_row_size, dst_row_size_aligned, 1);
|
||||
dma_queue_flush(dma_queue);
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "diag-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
|
||||
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ib0, ib1,
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per thread worker: Direct HVX (no DMA)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void diag_thread_f32(unsigned int nth, unsigned int ith, void * data) {
|
||||
htp_diag_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
const uint8_t * src_data = (const uint8_t *) src0->data;
|
||||
uint8_t * dst_data = (uint8_t *) dst->data;
|
||||
|
||||
const uint32_t ib0 = dctx->batches_per_thread * ith;
|
||||
const uint32_t ib1 = MIN(ib0 + dctx->batches_per_thread, dctx->total_batches);
|
||||
|
||||
for (uint32_t ib = ib0; ib < ib1; ib++) {
|
||||
const uint32_t i3 = ib / ne02;
|
||||
const uint32_t i2 = ib % ne02;
|
||||
|
||||
const float * restrict src_batch = (const float *)(src_data + i3 * nb03 + i2 * nb02);
|
||||
|
||||
for (uint32_t i1 = 0; i1 < ne1; i1++) {
|
||||
float * restrict dst_row = (float *)(dst_data + i3 * nb3 + i2 * nb2 + i1 * nb1);
|
||||
hvx_diag_row_f32(src_batch, dst_row, i1, ne0);
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "diag-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
|
||||
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ib0, ib1,
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
int op_diag_f32(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
const uint32_t total_batches = src0->ne[2] * src0->ne[3];
|
||||
const uint32_t n_threads = MIN(octx->n_threads, total_batches);
|
||||
|
||||
const size_t src_batch_size = src0->ne[0] * sizeof(float);
|
||||
const size_t dst_row_size = dst->ne[0] * sizeof(float);
|
||||
const size_t src_batch_size_aligned = hex_round_up(src_batch_size, VLEN);
|
||||
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
|
||||
|
||||
// 1 src buffer + 1 dst row buffer per thread
|
||||
const size_t spad_per_thread = src_batch_size_aligned + dst_row_size_aligned;
|
||||
|
||||
octx->src0_spad.size_per_thread = src_batch_size_aligned;
|
||||
octx->dst_spad.size_per_thread = dst_row_size_aligned;
|
||||
|
||||
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
|
||||
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
|
||||
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL;
|
||||
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->dst_spad.src = NULL;
|
||||
|
||||
struct htp_diag_context dctx = {
|
||||
.octx = octx,
|
||||
.src_batch_size = src_batch_size,
|
||||
.dst_row_size = dst_row_size,
|
||||
.src_batch_size_aligned = src_batch_size_aligned,
|
||||
.dst_row_size_aligned = dst_row_size_aligned,
|
||||
.batches_per_thread = (total_batches + n_threads - 1) / n_threads,
|
||||
.total_batches = total_batches,
|
||||
};
|
||||
|
||||
if (octx->ctx->vtcm_size < spad_per_thread * n_threads) {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, diag_thread_f32, &dctx, n_threads);
|
||||
} else {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, diag_thread_f32_dma, &dctx, n_threads);
|
||||
}
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
int op_diag(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
int err = HTP_STATUS_OK;
|
||||
|
||||
switch (dst->type) {
|
||||
case HTP_TYPE_F32:
|
||||
err = op_diag_f32(octx);
|
||||
break;
|
||||
default:
|
||||
err = HTP_STATUS_NO_SUPPORT;
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
@@ -1,123 +0,0 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-variable"
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_perf.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "hvx-copy.h"
|
||||
#include "hvx-utils.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
|
||||
// ggml op_params layout for FILL:
|
||||
// op_params[0] (as float) - the scalar fill value
|
||||
|
||||
#define fill_preamble \
|
||||
const struct htp_tensor * dst = octx->dst; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
const uint32_t ne2 = dst->ne[2]; \
|
||||
const uint32_t ne3 = dst->ne[3]; \
|
||||
\
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3]; \
|
||||
\
|
||||
const uint32_t nr = ne1 * ne2 * ne3;
|
||||
|
||||
struct htp_fill_context {
|
||||
struct htp_ops_context * octx;
|
||||
uint32_t nrows_per_thread;
|
||||
uint32_t total_rows; // ne1 * ne2 * ne3
|
||||
bool opt_path;
|
||||
HVX_Vector splat_vec;
|
||||
uint32_t elem_size;
|
||||
};
|
||||
|
||||
static void fill_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_fill_context * fctx = (const struct htp_fill_context *) data;
|
||||
struct htp_ops_context * octx = fctx->octx;
|
||||
fill_preamble;
|
||||
|
||||
// Parallelise over the flat row index spanning ne1*ne2*ne3
|
||||
const uint32_t ir0 = fctx->nrows_per_thread * ith;
|
||||
const uint32_t ir1 = MIN(ir0 + fctx->nrows_per_thread, fctx->total_rows);
|
||||
|
||||
uint64_t t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
if (fctx->opt_path) {
|
||||
// Opt path: tensor is fully contiguous, treat as flat array
|
||||
const uint32_t elem_start = ir0 * ne0;
|
||||
const uint32_t elem_end = ir1 * ne0;
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + elem_start * fctx->elem_size;
|
||||
hvx_splat_u(dst_ptr, fctx->splat_vec, elem_end - elem_start, fctx->elem_size);
|
||||
} else {
|
||||
// Non-contiguous path: must respect strides
|
||||
for (uint32_t ir = ir0; ir < ir1; ++ir) {
|
||||
const uint32_t i1 = ir % ne1;
|
||||
const uint32_t i2 = (ir / ne1) % ne2;
|
||||
const uint32_t i3 = ir / (ne1 * ne2);
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i1*nb1 + i2*nb2 + i3*nb3;
|
||||
hvx_splat_u(dst_ptr, fctx->splat_vec, ne0, fctx->elem_size);
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t t2 = HAP_perf_get_qtimer_count();
|
||||
FARF(HIGH, "fill %u/%u: rows %u:%u usec %u\n",
|
||||
ith, nth, ir0, ir1, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
int op_fill(struct htp_ops_context * octx) {
|
||||
fill_preamble;
|
||||
|
||||
if (dst->type != HTP_TYPE_F32 && dst->type != HTP_TYPE_F16) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
// nr = ne1*ne2*ne3 (flat row count across all outer dims); parallelise over it.
|
||||
const uint32_t n_threads = MIN(nr, octx->n_threads);
|
||||
|
||||
// Optimize if fully contiguous: skip stride arithmetic, treat as flat array
|
||||
const bool opt_path = (nb2 == nb1 * ne1) && (nb3 == nb2 * ne2);
|
||||
|
||||
FARF(HIGH, "fill: (%ux%ux%ux%u) type=%u opt=%d\n",
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->type, (int) opt_path);
|
||||
|
||||
float val_f32 = 0.f;
|
||||
memcpy(&val_f32, &octx->op_params[0], sizeof(float));
|
||||
|
||||
struct htp_fill_context fctx = {
|
||||
.octx = octx,
|
||||
.nrows_per_thread = (nr + n_threads - 1) / n_threads,
|
||||
.total_rows = nr,
|
||||
.opt_path = opt_path,
|
||||
};
|
||||
|
||||
switch (dst->type) {
|
||||
case HTP_TYPE_F32:
|
||||
fctx.splat_vec = hvx_vec_splat_f32(val_f32);
|
||||
fctx.elem_size = sizeof(float);
|
||||
break;
|
||||
case HTP_TYPE_F16:
|
||||
fctx.splat_vec = hvx_vec_splat_f16((_Float16) val_f32);
|
||||
fctx.elem_size = sizeof(_Float16);
|
||||
break;
|
||||
default:
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, fill_thread, &fctx, n_threads);
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
@@ -98,7 +98,5 @@ int op_repeat(struct htp_ops_context * octx);
|
||||
int op_argsort(struct htp_ops_context * octx);
|
||||
int op_ssm_conv(struct htp_ops_context * octx);
|
||||
int op_cumsum(struct htp_ops_context * octx);
|
||||
int op_fill(struct htp_ops_context * octx);
|
||||
int op_diag(struct htp_ops_context * octx);
|
||||
|
||||
#endif /* HTP_CTX_H */
|
||||
|
||||
@@ -80,8 +80,6 @@ enum htp_op_code {
|
||||
HTP_OP_SSM_CONV,
|
||||
HTP_OP_REPEAT,
|
||||
HTP_OP_CUMSUM,
|
||||
HTP_OP_FILL,
|
||||
HTP_OP_DIAG,
|
||||
|
||||
HTP_OP_INVALID
|
||||
};
|
||||
|
||||
@@ -514,12 +514,6 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_CUMSUM:
|
||||
return op_cumsum(octx);
|
||||
|
||||
case HTP_OP_FILL:
|
||||
return op_fill(octx);
|
||||
|
||||
case HTP_OP_DIAG:
|
||||
return op_diag(octx);
|
||||
|
||||
case HTP_OP_INVALID:
|
||||
break;
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@ libggml-htp-v68.so = 1
|
||||
libggml-htp-v69.so = 1
|
||||
libggml-htp-v73.so = 1
|
||||
libggml-htp-v75.so = 1
|
||||
libggml-htp-v79.so = 1
|
||||
libggml-htp-v81.so = 1
|
||||
|
||||
[ControlFlags]
|
||||
@@ -32,7 +31,6 @@ libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v79.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v81.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
|
||||
[Strings]
|
||||
|
||||
@@ -931,13 +931,13 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
|
||||
}
|
||||
|
||||
struct ggml_metal_event {
|
||||
void * obj; // id<MTLSharedEvent>
|
||||
void * obj; // id<MTLEvent>
|
||||
|
||||
atomic_int value;
|
||||
};
|
||||
|
||||
void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
|
||||
id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
|
||||
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
|
||||
|
||||
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
|
||||
|
||||
@@ -945,7 +945,7 @@ void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t
|
||||
}
|
||||
|
||||
void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
|
||||
id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
|
||||
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
|
||||
|
||||
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
|
||||
|
||||
@@ -953,7 +953,7 @@ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cm
|
||||
}
|
||||
|
||||
ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
|
||||
id<MTLSharedEvent> event = [dev->mtl_device newSharedEvent];
|
||||
id<MTLEvent> event = [dev->mtl_device newEvent];
|
||||
|
||||
ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
|
||||
|
||||
@@ -964,7 +964,7 @@ ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
|
||||
}
|
||||
|
||||
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
|
||||
id<MTLSharedEvent> event = ev->obj;
|
||||
id<MTLEvent> event = ev->obj;
|
||||
[event release];
|
||||
|
||||
free(ev);
|
||||
@@ -973,13 +973,14 @@ void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev
|
||||
}
|
||||
|
||||
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
|
||||
id<MTLSharedEvent> event = ev->obj;
|
||||
const bool res = [event waitUntilSignaledValue:atomic_load_explicit(&ev->value, memory_order_relaxed) timeoutMS:60000];
|
||||
if (!res) {
|
||||
GGML_ABORT("%s: failed to wait for event\n", __func__);
|
||||
}
|
||||
@autoreleasepool {
|
||||
id<MTLEvent> event = ev->obj;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
|
||||
[cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
|
||||
[cmd_buf commit];
|
||||
[cmd_buf waitUntilCompleted];
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
|
||||
|
||||
@@ -918,10 +918,6 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
|
||||
static std::vector<ggml_backend_device_ptr> devs;
|
||||
|
||||
if (!initialized) {
|
||||
// workaround macOS limitation (kIOGPUCommandBufferCallbackErrorImpactingInteractivity) until proper fix becomes possible
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/20141#issuecomment-4272947703
|
||||
setenv("AGX_RELAX_CDM_CTXSTORE_TIMEOUT", "1", true);
|
||||
|
||||
static ggml_backend_metal_reg_ptr reg_ctx(ggml_backend_metal_reg_init());
|
||||
|
||||
for (int i = 0; i < g_devices; ++i) {
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include <iomanip>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <openvino/core/dimension.hpp>
|
||||
#include <openvino/core/except.hpp>
|
||||
#include <openvino/core/node.hpp>
|
||||
@@ -206,22 +207,8 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
|
||||
break;
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
const int mode = node->op_params[2];
|
||||
switch (mode) {
|
||||
case GGML_ROPE_TYPE_NEOX: {
|
||||
op_case = 0x00010000;
|
||||
break;
|
||||
}
|
||||
case GGML_ROPE_TYPE_IMROPE: {
|
||||
op_case = 0x00020000;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
op_case = 0x00000000;
|
||||
break;
|
||||
}
|
||||
if (node->src[0]->op == GGML_OP_VIEW) {
|
||||
op_case = (op_case | 0x00000002);
|
||||
op_case = 2;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -586,6 +573,9 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
|
||||
}
|
||||
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
|
||||
static std::mutex weights_mutex;
|
||||
std::lock_guard<std::mutex> lock(weights_mutex);
|
||||
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
auto * nodes = cgraph->nodes;
|
||||
auto n_nodes = cgraph->n_nodes;
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
#include <cstring>
|
||||
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
|
||||
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
|
||||
#include <openvino/runtime/properties.hpp>
|
||||
#include <optional>
|
||||
|
||||
ov::Core & ov_singleton_core() {
|
||||
@@ -43,13 +42,11 @@ void ggml_openvino_device_config::init() {
|
||||
{"NPUW_DQ", "YES" },
|
||||
{"NPUW_DQ_FULL", "NO" },
|
||||
};
|
||||
if (cache_dir && strlen(cache_dir) > 0) {
|
||||
if (cache_dir) {
|
||||
compile_config["NPUW_CACHE_DIR"] = cache_dir;
|
||||
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
|
||||
}
|
||||
} else if (cache_dir && strlen(cache_dir) > 0) {
|
||||
compile_config.insert(ov::cache_dir(cache_dir));
|
||||
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
|
||||
} else if (cache_dir) {
|
||||
ov_singleton_core().set_property(ov::cache_dir(cache_dir));
|
||||
}
|
||||
|
||||
// Initialize remote context with queue sharing for GPU
|
||||
@@ -262,12 +259,10 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
|
||||
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
|
||||
int64_t n_blocks = n_elements / layout.weights_per_block;
|
||||
layout.scales_size = n_blocks * sizeof(uint16_t);
|
||||
// For symmetric quantization, no zp needed (weights stored as signed)
|
||||
if (layout.is_symmetric) {
|
||||
layout.zp_size = 0;
|
||||
} else {
|
||||
layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
|
||||
}
|
||||
// For symmetric quantization, we only need one zp value (not one per block)
|
||||
// Zero points are stored in U4 or U8 format matching the weight type
|
||||
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
|
||||
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
|
||||
|
||||
layout.weights_offset = 0;
|
||||
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
|
||||
@@ -318,12 +313,10 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
|
||||
// Scales: F16 per block
|
||||
int64_t n_blocks = n_elements / layout.weights_per_block;
|
||||
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
|
||||
// For symmetric quantization, no zp needed (weights stored as signed)
|
||||
if (layout.is_symmetric) {
|
||||
layout.zp_size = 0;
|
||||
} else {
|
||||
layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
|
||||
}
|
||||
// Zero points: U4 or U8 matching weight type
|
||||
// For symmetric quantization, we only need one zp value (not one per block)
|
||||
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
|
||||
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
|
||||
|
||||
// Layout in buffer: [weights | scales | zp] with alignment
|
||||
layout.weights_offset = 0;
|
||||
|
||||
@@ -145,18 +145,13 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
|
||||
return ctx->data;
|
||||
}
|
||||
|
||||
static bool is_stateful_enabled() {
|
||||
static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION");
|
||||
return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0;
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
// GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
|
||||
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
|
||||
|
||||
// Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
|
||||
if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
|
||||
!is_stateful_enabled()) {
|
||||
!getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
|
||||
GGML_ASSERT(ctx->tensor_extras.empty());
|
||||
auto device = ctx->device;
|
||||
auto size = ctx->size;
|
||||
@@ -605,14 +600,6 @@ bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) {
|
||||
|
||||
static void ggml_backend_openvino_free(ggml_backend_t backend) {
|
||||
ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
|
||||
|
||||
if (ctx->runtime_context) {
|
||||
auto r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
|
||||
if (--r_ctx->backend_count == 0) {
|
||||
r_ctx->clear_caches();
|
||||
}
|
||||
}
|
||||
|
||||
delete ctx;
|
||||
delete backend;
|
||||
}
|
||||
@@ -657,12 +644,7 @@ static ggml_guid_t ggml_backend_openvino_guid(void) {
|
||||
}
|
||||
|
||||
static std::shared_ptr<ov_runtime_context> get_ov_runtime_context_ptr() {
|
||||
static std::shared_ptr<ov_runtime_context> r_ctx = [] {
|
||||
auto ctx = std::make_shared<ov_runtime_context>();
|
||||
ctx->device = ggml_openvino_get_device_name();
|
||||
ctx->stateful = is_stateful_enabled() && !ggml_openvino_is_npu();
|
||||
return ctx;
|
||||
}();
|
||||
static std::shared_ptr<ov_runtime_context> r_ctx = std::make_shared<ov_runtime_context>();
|
||||
return r_ctx;
|
||||
}
|
||||
|
||||
@@ -687,7 +669,8 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
|
||||
}
|
||||
|
||||
std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
|
||||
r_ctx->backend_count++;
|
||||
r_ctx->device = ggml_openvino_get_device_name();
|
||||
r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu();
|
||||
|
||||
ggml_backend_t openvino_backend = new ggml_backend{
|
||||
/* .guid = */ ggml_backend_openvino_guid(),
|
||||
@@ -900,7 +883,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
|
||||
const int32_t * op_params = op->op_params;
|
||||
const int n_dims = op_params[1];
|
||||
const int mode = op_params[2];
|
||||
if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_IMROPE) {
|
||||
if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
|
||||
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode);
|
||||
return true;
|
||||
}
|
||||
@@ -913,6 +896,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
|
||||
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
|
||||
return true;
|
||||
}
|
||||
float freq_scale;
|
||||
float ext_factor;
|
||||
memcpy(&freq_scale, op_params + 6, sizeof(float));
|
||||
memcpy(&ext_factor, op_params + 7, sizeof(float));
|
||||
if (ext_factor != 0.0f) {
|
||||
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
|
||||
return true;
|
||||
}
|
||||
if (op->src[0]->op == GGML_OP_VIEW) {
|
||||
if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
|
||||
// GGML_LOG_WARN(
|
||||
@@ -922,12 +913,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (mode == GGML_ROPE_TYPE_IMROPE &&
|
||||
(op->src[2] != 0 || ((const float *) op_params)[6] != 1 || ((const float *) op_params)[7] != 0 ||
|
||||
((const float *) op_params)[8] != 1)) {
|
||||
// GGML_LOG_WARN("OpenVINO backend does not support IMROPE with freq_factors, freq_scale, ext_factor, and attn_factor\n");
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@@ -957,7 +942,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
|
||||
// GGML_OP_SOFT_MAX,
|
||||
GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
|
||||
static const std::set<ggml_unary_op> supported_unary_ops{
|
||||
GGML_UNARY_OP_GELU,
|
||||
GGML_UNARY_OP_SILU,
|
||||
};
|
||||
static const std::set<ggml_glu_op> supported_glu_ops{
|
||||
|
||||
@@ -46,7 +46,6 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) {
|
||||
|
||||
// Extracts (weight, scales, zp) from Q4_0 tensors.
|
||||
// Data layout is: |16 bit scale|32 x 4bit weights|.
|
||||
// When zp_arr is empty (symmetric), weights are stored as signed i4 (value - 8).
|
||||
void extract_q4_0_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
@@ -56,32 +55,28 @@ void extract_q4_0_data(const ggml_tensor * tensor,
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
|
||||
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
|
||||
if (!is_symmetric) {
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
|
||||
// For Q4_0, zero point is always 8
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 8 | (8 << 4); // Pack two 4-bit values
|
||||
}
|
||||
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
|
||||
// For asymmetric quantization, compute per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
// Pack two 4-bit zero points per byte
|
||||
if (i % 2 == 0) {
|
||||
zp[i / 2] = 8; // Lower nibble
|
||||
} else {
|
||||
zp[i / 2] |= (8 << 4); // Upper nibble
|
||||
}
|
||||
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
|
||||
});
|
||||
} else {
|
||||
// Symmetric: unpack as u4 then convert to i4 by subtracting 8 (XOR each nibble)
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
|
||||
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
|
||||
// Convert u4 to i4: subtract 8 from each nibble. XOR 0x88 flips each nibble by 8.
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
weights[i * 16 + j] ^= 0x88;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
|
||||
});
|
||||
}
|
||||
|
||||
// Extracts (weight, scales, zp) from Q4_1 tensors.
|
||||
@@ -128,7 +123,6 @@ void extract_q4_1_data(const ggml_tensor * tensor,
|
||||
|
||||
// Extracts (weight, scales, zp) from Q8_0 tensors.
|
||||
// Data layout is: |16 bit scale|32 x 8bit weights|.
|
||||
// When zp_arr is empty (symmetric), weights are stored as signed i8 directly.
|
||||
void extract_q8_0_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
@@ -139,30 +133,29 @@ void extract_q8_0_data(const ggml_tensor * tensor,
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
|
||||
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
|
||||
if (!is_symmetric) {
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
|
||||
zp[i] = 128;
|
||||
for (size_t j = 0; j < weights_per_block; ++j) {
|
||||
uint8_t x = block_data[j + 2];
|
||||
x ^= 1 << 7; // Convert int8 to uint8 by flipping sign bit
|
||||
weights[i * weights_per_block + j] = x;
|
||||
}
|
||||
});
|
||||
} else {
|
||||
// Symmetric: store original int8 values directly (no unsigned bias)
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
|
||||
// Copy int8 weights as-is (the tensor element type is i8)
|
||||
memcpy(weights + i * weights_per_block, block_data + 2, weights_per_block);
|
||||
});
|
||||
// For Q8_0, zero point is always 128
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 128;
|
||||
}
|
||||
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
zp[i] = 128;
|
||||
}
|
||||
for (size_t j = 0; j < weights_per_block; ++j) {
|
||||
uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
|
||||
// Original data is in int8_t, so we add a bias of -128 and invert the first bit.
|
||||
x ^= 1 << 7;
|
||||
weights[i * weights_per_block + j] = x;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void unpack_256_4(const uint8_t * data, uint8_t * dst) {
|
||||
@@ -263,62 +256,44 @@ void extract_q6_k_data(const ggml_tensor * tensor,
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
|
||||
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
|
||||
if (!is_symmetric) {
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
|
||||
for (size_t j = 0; j < 16; j++) {
|
||||
scales[j + i * 16] =
|
||||
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
|
||||
// For Q6_K, zero point is always 32
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 32;
|
||||
}
|
||||
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
|
||||
float scale_factor =
|
||||
static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2
|
||||
|
||||
for (size_t j = 0; j < 16; j++) {
|
||||
scales[j + i * 16] =
|
||||
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
zp[j + i * 16] = 32;
|
||||
}
|
||||
uint8_t * ql = block_data;
|
||||
uint8_t * qh = block_data + 128;
|
||||
for (int64_t j = 0; j < 32; ++j) {
|
||||
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
|
||||
weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
|
||||
weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
|
||||
weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
|
||||
weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
|
||||
weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
|
||||
weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
|
||||
weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
// Symmetric: subtract 32 from each weight to store as signed i8
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
|
||||
for (size_t j = 0; j < 16; j++) {
|
||||
scales[j + i * 16] =
|
||||
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
|
||||
}
|
||||
uint8_t * ql = block_data;
|
||||
uint8_t * qh = block_data + 128;
|
||||
auto * signed_weights = reinterpret_cast<int8_t *>(weights);
|
||||
for (int64_t j = 0; j < 32; ++j) {
|
||||
signed_weights[i * 256 + j] = static_cast<int8_t>((ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 32] =
|
||||
static_cast<int8_t>((ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 64] = static_cast<int8_t>((ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 96] =
|
||||
static_cast<int8_t>((ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 128] =
|
||||
static_cast<int8_t>((ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 160] =
|
||||
static_cast<int8_t>((ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 192] =
|
||||
static_cast<int8_t>((ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 224] =
|
||||
static_cast<int8_t>((ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4)) - 32;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t * ql = block_data;
|
||||
uint8_t * qh = block_data + 128;
|
||||
|
||||
for (int64_t j = 0; j < 32; ++j) {
|
||||
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
|
||||
weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
|
||||
weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
|
||||
weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
|
||||
weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
|
||||
weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
|
||||
weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
|
||||
weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
|
||||
@@ -414,10 +389,11 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
|
||||
size_t group_size,
|
||||
bool use_bias) {
|
||||
ov::Shape orig_shape = weight.get_shape();
|
||||
bool is_signed = (weight.get_element_type() == ov::element::i8); // Symmetric: signed weights, no ZP
|
||||
|
||||
// Expand dimensions for scales and zp/bias
|
||||
auto scale_shape = scales.get_shape();
|
||||
auto zp_shape = zp.get_shape();
|
||||
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
|
||||
|
||||
ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
|
||||
|
||||
@@ -427,48 +403,37 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
|
||||
} else {
|
||||
scale_shape.push_back(1);
|
||||
scales.set_shape(scale_shape);
|
||||
if (!is_signed && zp.get_size() > 0) {
|
||||
auto zp_shape = zp.get_shape();
|
||||
// For symmetric quantization, zp remains scalar (don't resize)
|
||||
if (!is_scalar_zp) {
|
||||
zp_shape.push_back(1);
|
||||
zp.set_shape(zp_shape);
|
||||
}
|
||||
}
|
||||
|
||||
// Create graph nodes
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
|
||||
ov::Output<ov::Node> result;
|
||||
if (is_signed) {
|
||||
// Signed path: q * s (no zero point subtraction needed)
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i8, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
if (use_bias && !is_scalar_zp) {
|
||||
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
} else {
|
||||
// Unsigned path
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
|
||||
if (use_bias && zp.get_size() > 0) {
|
||||
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
auto w_s =
|
||||
std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
} else {
|
||||
// Zero point path: (w - zp) * s
|
||||
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_point, zp_value)) {
|
||||
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
|
||||
}
|
||||
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
|
||||
auto w_zp =
|
||||
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
// Zero point path: (w - zp) * s
|
||||
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_point, zp_value)) {
|
||||
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
|
||||
}
|
||||
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
|
||||
auto w_zp =
|
||||
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
}
|
||||
|
||||
if (packed_shape.size() != 2) {
|
||||
@@ -487,10 +452,11 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
|
||||
size_t group_size,
|
||||
bool use_bias) {
|
||||
ov::Shape orig_weight_shape = weight.get_shape();
|
||||
bool is_signed = (weight.get_element_type() == ov::element::i4); // Symmetric: signed weights, no ZP
|
||||
|
||||
// Expand dimensions for scales and zp/bias
|
||||
ov::Shape scale_shape = scales.get_shape();
|
||||
auto zp_shape = zp.get_shape();
|
||||
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
|
||||
|
||||
// Create INT4 weight tensor
|
||||
ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
|
||||
@@ -501,48 +467,36 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
|
||||
} else {
|
||||
scale_shape.push_back(1);
|
||||
scales.set_shape(scale_shape);
|
||||
if (!is_signed && zp.get_size() > 0) {
|
||||
auto zp_shape = zp.get_shape();
|
||||
// For symmetric quantization, zp remains scalar (don't resize)
|
||||
if (!is_scalar_zp) {
|
||||
zp_shape.push_back(1);
|
||||
zp.set_shape(zp_shape);
|
||||
}
|
||||
}
|
||||
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
||||
|
||||
ov::Output<ov::Node> result;
|
||||
if (is_signed) {
|
||||
// Signed path: q * s (no zero point subtraction needed)
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i4, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
if (use_bias && !is_scalar_zp) {
|
||||
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
} else {
|
||||
// Unsigned path
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
|
||||
if (use_bias && zp.get_size() > 0) {
|
||||
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
auto w_s =
|
||||
std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
} else {
|
||||
// Zero point path: (w - zp) * s
|
||||
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
|
||||
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
|
||||
}
|
||||
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
|
||||
auto w_zp =
|
||||
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
// Zero point path: (w - zp) * s
|
||||
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
|
||||
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
|
||||
}
|
||||
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
|
||||
auto w_zp =
|
||||
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
}
|
||||
|
||||
if (packed_shape.size() != 2) {
|
||||
@@ -745,32 +699,24 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
|
||||
|
||||
// Quantized path (normal extraction or quantized requant)
|
||||
// Create weight/scale/zp tensors - shared between both paths
|
||||
// For symmetric quantization, use signed types (i4/i8) and no ZP tensor
|
||||
ov::element::Type weight_type = layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) :
|
||||
(layout.is_u4 ? ov::element::u4 : ov::element::u8);
|
||||
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
|
||||
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
|
||||
ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
|
||||
|
||||
if (output_base_ptr) {
|
||||
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
|
||||
result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
|
||||
result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
|
||||
if (!layout.is_symmetric) {
|
||||
ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
|
||||
result.zp = ov::Tensor(zp_type, scale_shape, buf_base + layout.zp_offset);
|
||||
}
|
||||
// else: result.zp remains default-constructed (empty) for symmetric
|
||||
result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
|
||||
} else {
|
||||
result.weights = ov::Tensor(weight_type, node_shape);
|
||||
result.scales = ov::Tensor(ov::element::f16, scale_shape);
|
||||
if (!layout.is_symmetric) {
|
||||
if (use_bias) {
|
||||
result.zp = ov::Tensor(ov::element::f16, scale_shape);
|
||||
} else {
|
||||
ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
|
||||
result.zp = ov::Tensor(zp_type, scale_shape);
|
||||
}
|
||||
if (use_bias && !layout.is_symmetric) {
|
||||
// bias only has effect for asymmetric quant
|
||||
result.zp = ov::Tensor(ov::element::f16, zp_shape);
|
||||
} else {
|
||||
result.zp = ov::Tensor(weight_type, zp_shape);
|
||||
}
|
||||
// else: result.zp remains default-constructed (empty) for symmetric
|
||||
}
|
||||
|
||||
if (layout.is_requant && layout.requant_type.has_value()) {
|
||||
@@ -795,75 +741,59 @@ void quantize_q4_0(const float * x,
|
||||
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
|
||||
if (!is_symmetric) {
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f;
|
||||
float max = 0.0f;
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
max = v;
|
||||
}
|
||||
// For Q4_0, zero point is always 8
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 8 | (8 << 4); // Pack two 4-bit values
|
||||
}
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
max = v;
|
||||
}
|
||||
const float d = max / -8;
|
||||
if (d == 0) {
|
||||
scales[i] = ov::float16(1.0f);
|
||||
}
|
||||
|
||||
const float d = max / -8;
|
||||
|
||||
if (d == 0) {
|
||||
scales[i] = ov::float16(1.0f);
|
||||
// zp is already set to 8 for symmetric, or set per-block for asymmetric
|
||||
if (!is_scalar_zp) {
|
||||
if (i % 2 == 0) {
|
||||
zp[i / 2] = 8;
|
||||
} else {
|
||||
zp[i / 2] |= (8 << 4);
|
||||
}
|
||||
memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
|
||||
continue;
|
||||
}
|
||||
const float id = 1.0f / d;
|
||||
scales[i] = ov::float16(d);
|
||||
memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
|
||||
continue;
|
||||
}
|
||||
|
||||
const float id = 1.0f / d;
|
||||
scales[i] = ov::float16(d);
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
if (i % 2 == 0) {
|
||||
zp[i / 2] = 8;
|
||||
} else {
|
||||
zp[i / 2] |= (8 << 4);
|
||||
}
|
||||
for (int j = 0; j < qk / 2; ++j) {
|
||||
const float x0 = x[i * qk + 2 * j] * id;
|
||||
const float x1 = x[i * qk + 2 * j + 1] * id;
|
||||
const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
|
||||
const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
|
||||
weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Symmetric: produce signed i4 values in [-8, 7]
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f;
|
||||
float max = 0.0f;
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
const float d = max / -8;
|
||||
if (d == 0) {
|
||||
scales[i] = ov::float16(1.0f);
|
||||
// i4 value 0 packed: 0x00
|
||||
memset(weights + i * qk / 2, 0, qk / 2);
|
||||
continue;
|
||||
}
|
||||
const float id = 1.0f / d;
|
||||
scales[i] = ov::float16(d);
|
||||
for (int j = 0; j < qk / 2; ++j) {
|
||||
const float x0 = x[i * qk + 2 * j] * id;
|
||||
const float x1 = x[i * qk + 2 * j + 1] * id;
|
||||
// Signed i4: range [-8, 7]. Quantize as round(x*id), then pack as 4-bit two's complement.
|
||||
int8_t si0 = (int8_t) std::max(-8, std::min(7, (int) roundf(x0)));
|
||||
int8_t si1 = (int8_t) std::max(-8, std::min(7, (int) roundf(x1)));
|
||||
weights[i * qk / 2 + j] = (si0 & 0x0F) | ((si1 & 0x0F) << 4);
|
||||
}
|
||||
|
||||
for (int j = 0; j < qk / 2; ++j) {
|
||||
const float x0 = x[i * qk + 2 * j] * id;
|
||||
const float x1 = x[i * qk + 2 * j + 1] * id;
|
||||
const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
|
||||
const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
|
||||
weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -879,42 +809,36 @@ void quantize_q8_0(const float * x,
|
||||
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
|
||||
if (!is_symmetric) {
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f;
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
amax = std::max(amax, fabsf(v));
|
||||
}
|
||||
const float d = amax / 127.0f;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
scales[i] = ov::float16(d);
|
||||
zp[i] = 128;
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
const float x0 = x[i * qk + j] * id;
|
||||
const int8_t xi0 = roundf(x0);
|
||||
weights[i * qk + j] = (uint8_t) (xi0 + 128);
|
||||
// For Q8_0, zero point is always 128
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 128;
|
||||
}
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Symmetric: store signed int8 values directly
|
||||
auto * signed_weights = reinterpret_cast<int8_t *>(weights);
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f;
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
amax = std::max(amax, fabsf(v));
|
||||
}
|
||||
const float d = amax / 127.0f;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
scales[i] = ov::float16(d);
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
const float x0 = x[i * qk + j] * id;
|
||||
signed_weights[i * qk + j] = (int8_t) roundf(x0);
|
||||
}
|
||||
|
||||
const float d = amax / 127.0f;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
scales[i] = ov::float16(d);
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
zp[i] = 128;
|
||||
}
|
||||
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
const float x0 = x[i * qk + j] * id;
|
||||
const int8_t xi0 = roundf(x0);
|
||||
weights[i * qk + j] = (uint8_t) (xi0 + 128);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -937,8 +861,12 @@ void quantize_q8_1(const float * x,
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
min = std::min(v, min);
|
||||
max = std::max(v, max);
|
||||
if (v < min) {
|
||||
min = v;
|
||||
}
|
||||
if (v > max) {
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = (max - min) / ((1 << 8) - 1);
|
||||
|
||||
@@ -9,17 +9,12 @@
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/cos.hpp>
|
||||
#include <openvino/op/gather.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/shape_of.hpp>
|
||||
#include <openvino/op/sin.hpp>
|
||||
#include <openvino/op/slice.hpp>
|
||||
#include <openvino/op/split.hpp>
|
||||
#include <openvino/op/subtract.hpp>
|
||||
#include <openvino/op/transpose.hpp>
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <vector>
|
||||
|
||||
@@ -38,12 +33,6 @@ OutputVector translate_rope(const NodeContext & context) {
|
||||
auto data_node = context.get_input(0).get_node_shared_ptr();
|
||||
auto output_shape = context.get_output_shape().to_shape();
|
||||
int32_t * op_params = context.get_output_op_params();
|
||||
const int mode = (op_case & 0xFFFF0000) >> 16;
|
||||
op_case = (op_case & 0x0000FFFF);
|
||||
|
||||
constexpr int TYPE_NORMAL = 0;
|
||||
constexpr int TYPE_NEOX = 1;
|
||||
constexpr int TYPE_IMROPE = 2;
|
||||
|
||||
Output<Node> cos_theta_node;
|
||||
Output<Node> sin_theta_node;
|
||||
@@ -56,7 +45,7 @@ OutputVector translate_rope(const NodeContext & context) {
|
||||
if (context.get_input_size() == 3) {
|
||||
rope_freqs_weight = context.get_input(2).get_node_shared_ptr();
|
||||
}
|
||||
auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE);
|
||||
auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight);
|
||||
sin_theta_node = sin_cos.first;
|
||||
cos_theta_node = sin_cos.second;
|
||||
}
|
||||
@@ -76,7 +65,11 @@ OutputVector translate_rope(const NodeContext & context) {
|
||||
}
|
||||
}
|
||||
|
||||
if (mode == TYPE_NORMAL) {
|
||||
const int mode = op_params[2];
|
||||
constexpr int ROPE_TYPE_NORMAL = 0;
|
||||
constexpr int ROPE_TYPE_NEOX = 2;
|
||||
|
||||
if (mode == ROPE_TYPE_NORMAL) {
|
||||
auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
|
||||
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
|
||||
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
|
||||
@@ -104,7 +97,7 @@ OutputVector translate_rope(const NodeContext & context) {
|
||||
auto data_shape = ov::op::v0::Constant::create(
|
||||
ov::element::i64, {4}, std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
|
||||
res = std::make_shared<ov::op::v1::Reshape>(stack, data_shape, false);
|
||||
} else if (mode == TYPE_NEOX) {
|
||||
} else if (mode == ROPE_TYPE_NEOX) {
|
||||
auto data_split = std::make_shared<ov::op::v1::Split>(
|
||||
data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2);
|
||||
Output<Node> slice_data_node_0 = data_split->outputs()[0];
|
||||
@@ -119,25 +112,6 @@ OutputVector translate_rope(const NodeContext & context) {
|
||||
std::make_shared<ov::op::v1::Multiply>(slice_data_node_1, cos_theta_node));
|
||||
|
||||
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{first_half_node, second_half_node}, -1);
|
||||
} else if (mode == TYPE_IMROPE) {
|
||||
int64_t n_dims = data_node->get_shape()[3];
|
||||
auto cos_sin_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{1,-1,1,(n_dims >> 1)});
|
||||
auto cos_reshaped = std::make_shared<ov::op::v1::Reshape>(cos_theta_node, cos_sin_shape, true);
|
||||
auto sin_reshaped = std::make_shared<ov::op::v1::Reshape>(sin_theta_node, cos_sin_shape, true);
|
||||
|
||||
auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3});
|
||||
auto split_a = std::make_shared<ov::op::v1::Split>(data_node, split_axis, 2);
|
||||
auto x0 = split_a->output(0);
|
||||
auto x1 = split_a->output(1);
|
||||
auto mul_a = std::make_shared<ov::op::v1::Multiply>(x0, cos_reshaped);
|
||||
auto mul_b = std::make_shared<ov::op::v1::Multiply>(x1, sin_reshaped);
|
||||
auto sub = std::make_shared<ov::op::v1::Subtract>(mul_a, mul_b);
|
||||
|
||||
auto mul_c = std::make_shared<ov::op::v1::Multiply>(x0, sin_reshaped);
|
||||
auto mul_d = std::make_shared<ov::op::v1::Multiply>(x1, cos_reshaped);
|
||||
auto add = std::make_shared<ov::op::v1::Add>(mul_c, mul_d);
|
||||
|
||||
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{sub, add}, 3);
|
||||
}
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
#include "../node_context.h"
|
||||
#include "../op_table.h"
|
||||
#include "../utils.h"
|
||||
|
||||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/op/gelu.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
// Converts a GGML unary GELU node into one OpenVINO opset-7 Gelu operation.
//
// context: decoder view of the GGML node being translated.
// Returns the Gelu output after rename_outputs_with_suffix() has been applied with the
// node name taken from the context.
//
// NOTE(review): Gelu is built with its default approximation mode - confirm that this is
// the formula the GGML side expects.
OutputVector translate_unary_gelu(const NodeContext & context) {
    // Validate the number of inputs (bounds passed: 1 and 1).
    num_inputs_check(context, 1, 1);

    auto gelu = std::make_shared<ov::op::v7::Gelu>(context.get_input(0));
    return rename_outputs_with_suffix({gelu}, context.get_name());
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
|
||||
@@ -31,7 +31,6 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
|
||||
{"GGML_OP_SOFT_MAX", op::translate_soft_max },
|
||||
{"GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract>},
|
||||
{"GGML_OP_TRANSPOSE", op::translate_transpose },
|
||||
{"GGML_UNARY_OP_GELU", op::translate_unary_gelu },
|
||||
{"GGML_UNARY_OP_SILU", op::translate_unary_silu },
|
||||
{"GGML_OP_VIEW", op::translate_view },
|
||||
{"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu },
|
||||
|
||||
@@ -21,7 +21,6 @@ GGML_OP_CONVERTER(translate_rms_norm);
|
||||
GGML_OP_CONVERTER(translate_rope);
|
||||
GGML_OP_CONVERTER(translate_scale);
|
||||
GGML_OP_CONVERTER(translate_unary_silu);
|
||||
GGML_OP_CONVERTER(translate_unary_gelu);
|
||||
GGML_OP_CONVERTER(translate_soft_max);
|
||||
GGML_OP_CONVERTER(translate_transpose);
|
||||
GGML_OP_CONVERTER(translate_view);
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
#include "eliminate_zp.h"
|
||||
|
||||
#include <openvino/core/graph_util.hpp>
|
||||
#include <openvino/core/parallel.hpp>
|
||||
#include <openvino/core/rt_info.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/subtract.hpp>
|
||||
#include <openvino/pass/pattern/op/label.hpp>
|
||||
#include <openvino/pass/pattern/op/pattern.hpp>
|
||||
#include <openvino/pass/pattern/op/wrap_type.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace pass {
|
||||
|
||||
// Registers a matcher that folds a constant single-element zero point into the quantized
// weight constant it is subtracted from.
//
// Matched pattern:
//     Multiply(<any scale>,
//              Subtract(Convert(Constant data), Convert(Constant zero_point)))
//
// The rewrite only happens when zero_point holds exactly one element and either
//   * data is u4 and the zero point is 8 (q4_0), or
//   * data is u8 and the zero point is 128 (q8_0) or 32 (q6_k).
// In that case the Subtract is replaced by Convert(Constant(data - zero_point)), where the
// new constant is i4 for u4 data and i8 for u8 data, and the Convert produces the element
// type the Subtract used to produce. Every other match is left untouched (callback
// returns false).
EliminateZeroPoints::EliminateZeroPoints() {
    namespace pattern = ov::pass::pattern;
    using Constant = ov::op::v0::Constant;
    using Convert = ov::op::v0::Convert;

    auto m_data_constant = pattern::wrap_type<Constant>();
    auto m_data_convert = pattern::wrap_type<Convert>({m_data_constant});

    auto m_zp_constant = pattern::wrap_type<Constant>();
    auto m_zp_convert = pattern::wrap_type<Convert>({m_zp_constant});

    auto m_subtract = pattern::wrap_type<ov::op::v1::Subtract>({m_data_convert, m_zp_convert});
    auto m_scale = pattern::any_input();
    auto m_multiply = pattern::wrap_type<ov::op::v1::Multiply>({m_scale, m_subtract});

    const auto callback = [=](pattern::Matcher & m) {
        const auto & matched = m.get_pattern_value_map();

        const auto multiply =
            std::dynamic_pointer_cast<ov::op::v1::Multiply>(matched.at(m_multiply).get_node_shared_ptr());
        const auto subtract =
            std::dynamic_pointer_cast<ov::op::v1::Subtract>(matched.at(m_subtract).get_node_shared_ptr());
        const auto weights = std::dynamic_pointer_cast<Constant>(matched.at(m_data_constant).get_node_shared_ptr());
        const auto zero_point = std::dynamic_pointer_cast<Constant>(matched.at(m_zp_constant).get_node_shared_ptr());

        if (!(multiply && subtract && weights && zero_point)) {
            return false;
        }

        // Only a single-element zero point can be folded into the weights.
        if (ov::shape_size(zero_point->get_shape()) != 1) {
            return false;
        }

        const auto zp_values = zero_point->cast_vector<int>();
        if (zp_values.empty()) {
            return false;
        }
        const int zp = zp_values.front();

        // Pick the signed type that replaces the unsigned storage type; bail out on any
        // type / zero-point combination that is not explicitly supported.
        const auto source_type = weights->get_element_type();
        ov::element::Type signed_type;
        if (source_type == ov::element::u4) {
            if (zp != 8) {
                return false;
            }
            signed_type = ov::element::i4;
        } else if (source_type == ov::element::u8) {
            if (zp != 128 && zp != 32) {
                return false;
            }
            signed_type = ov::element::i8;
        } else {
            return false;
        }

        const auto shape = weights->get_shape();
        const size_t count = ov::shape_size(shape);

        // TODO improve performance
        // Both supported source types go through the same path: read the values as
        // uint8_t, shift each one down by the zero point, and let the Constant constructor
        // store them in the signed target type.
        const auto unsigned_values = weights->cast_vector<uint8_t>();
        std::vector<int8_t> shifted_values(count);
        ov::parallel_for(count, [&](size_t i) {
            shifted_values[i] = static_cast<int8_t>(static_cast<int>(unsigned_values[i]) - zp);
        });

        auto signed_constant = std::make_shared<Constant>(signed_type, shape, shifted_values);
        auto signed_convert = std::make_shared<Convert>(signed_constant, subtract->get_output_element_type(0));
        ov::replace_node(subtract, signed_convert);

        return true;
    };

    register_matcher(
        std::make_shared<pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
        callback);
}
|
||||
|
||||
} // namespace pass
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
|
||||
@@ -0,0 +1,17 @@
|
||||
// Include guard: without it, including this header twice in one translation unit
// redefines EliminateZeroPoints and fails to compile.
#pragma once

#include "openvino/pass/matcher_pass.hpp"

namespace ov {
namespace frontend {
namespace ggml {
namespace pass {

// Matcher pass for the pattern
//     Multiply(scale, Subtract(Convert(Constant data), Convert(Constant zero_point)))
// with a single-element zero point. For the supported combinations (u4 data with zero
// point 8; u8 data with zero point 128 or 32) the Subtract is replaced by a Convert of a
// signed (i4 / i8) constant that already has the zero point subtracted.
class EliminateZeroPoints : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints")

    // Builds the pattern and registers the rewrite callback.
    EliminateZeroPoints();
};

} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov
|
||||
@@ -1,41 +0,0 @@
|
||||
// Copyright (C) 2018-2026 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <openvino/core/core_visibility.hpp>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/runtime_attribute.hpp>
|
||||
|
||||
namespace ov {
|
||||
|
||||
/**
 * @brief Runtime-info attribute carrying the weightless-caching data of one constant.
 *
 * The attribute records the constant's original size in bytes, the binary offset of the
 * constant's data in the weights file used by the weightless caching mechanism, and the
 * constant's original element type. It is not copyable in case the data was changed (the
 * original node was replaced by a new one produced during the transformation pipeline) -
 * in that case weightless caching can't be used for that constant.
 */
class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute {
public:
    OPENVINO_RTTI("WeightlessCacheAttribute", "0", RuntimeAttribute)

    // All three values are mandatory, so there is no default constructor.
    WeightlessCacheAttribute() = delete;

    // Stores the given values in the members of the same name.
    WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype)
        : original_size(original_size),
          bin_offset(bin_offset),
          original_dtype(original_dtype) {}

    // Overrides RuntimeAttribute::is_copyable(); only declared here, the definition is
    // not part of this header.
    bool is_copyable() const override;

    size_t original_size;              // original size of the constant, in bytes
    size_t bin_offset;                 // binary offset of the constant's data in the weights file
    ov::element::Type original_dtype;  // original element type of the constant
};
|
||||
|
||||
} // namespace ov
|
||||
@@ -3,16 +3,15 @@
|
||||
#include "ggml-openvino/openvino/node_context.h"
|
||||
#include "ggml-openvino/openvino/utils.h"
|
||||
#include "input_model.h"
|
||||
#include "pass/eliminate_zp.h"
|
||||
#include "pass/mark_decompression_convert_constant_folding.h"
|
||||
#include "pass/squeeze_matmul.h"
|
||||
#include "rt_info/weightless_caching_attributes.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/preprocess/pre_post_process.hpp>
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/broadcast.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
@@ -34,6 +33,7 @@
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <openvino/pass/constant_folding.hpp>
|
||||
#include <openvino/pass/make_stateful.hpp>
|
||||
#include <openvino/core/preprocess/pre_post_process.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
@@ -240,31 +240,6 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
|
||||
resulting_model = std::make_shared<Model>(results, used_params);
|
||||
|
||||
apply_transformations(resulting_model);
|
||||
|
||||
// Set WeightlessCacheAttribute on large constants to avoid unnecessary memory copies
|
||||
// in the NPUW plugin. Without this attribute, NPUW's LazyTensor constructor
|
||||
// (lazy_tensor.cpp, op::Const::Const) will memcpy every constant "in case export
|
||||
// occurs", doubling memory usage per compile_model call.
|
||||
//
|
||||
// The bin_offset field serves as a unique key (not a real file offset) — this is
|
||||
// the same convention the GPU plugin uses for non-IR models (see
|
||||
// Plugin::set_weightless_cache_attributes in intel_gpu/src/plugin/plugin.cpp).
|
||||
// Each constant must have a distinct bin_offset, otherwise GPU's weightless cache
|
||||
// import will map multiple constants to the same data.
|
||||
//
|
||||
// Small constants (< 16 elements) are excluded since they may be introduced by
|
||||
// optimization patterns and the overhead is negligible.
|
||||
size_t offset = 0;
|
||||
for (auto & node : resulting_model->get_ordered_ops()) {
|
||||
if (auto cnst = ov::as_type_ptr<ov::op::v0::Constant>(node);
|
||||
cnst && cnst->get_byte_size() / cnst->get_element_type().size() >= 16) {
|
||||
auto & rt_info = cnst->get_rt_info();
|
||||
if (rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()) == rt_info.end()) {
|
||||
rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] =
|
||||
ov::WeightlessCacheAttribute(cnst->get_byte_size(), offset++, cnst->get_element_type());
|
||||
}
|
||||
}
|
||||
}
|
||||
return resulting_model;
|
||||
}
|
||||
|
||||
@@ -282,6 +257,7 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
|
||||
}
|
||||
|
||||
if (ggml_model_decoder->is_static()) {
|
||||
manager.register_pass<pass::EliminateZeroPoints>();
|
||||
manager.register_pass<pass::SqueezeMatmul>();
|
||||
}
|
||||
manager.run_passes(model);
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <ctime>
|
||||
#include <memory>
|
||||
@@ -14,7 +13,6 @@
|
||||
#include <openvino/op/gather.hpp>
|
||||
#include <openvino/op/maximum.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/shape_of.hpp>
|
||||
#include <openvino/op/sin.hpp>
|
||||
#include <openvino/op/squeeze.hpp>
|
||||
@@ -89,11 +87,8 @@ ov::Output<ov::Node> rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], fl
|
||||
auto ramp_y =
|
||||
std::make_shared<ov::op::v1::Divide>(std::make_shared<ov::op::v1::Subtract>(dim_ids, corr_low), denom);
|
||||
auto ramp_clamped = std::make_shared<ov::op::v0::Clamp>(ramp_y, 0.0f, 1.0f);
|
||||
// rope_yarn_ramp returns (1 - clamp(y)), so invert before scaling
|
||||
auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
|
||||
auto ramp_inverted = std::make_shared<ov::op::v1::Subtract>(one, ramp_clamped);
|
||||
auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor});
|
||||
auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_inverted, ext_factor_node);
|
||||
auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_clamped, ext_factor_node);
|
||||
return ramp_mix;
|
||||
}
|
||||
|
||||
@@ -120,7 +115,6 @@ void ggml_rope_yarn_corr_dims(int n_dims,
|
||||
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params,
|
||||
std::shared_ptr<ov::Node> inp_pos,
|
||||
std::shared_ptr<ov::Node> rope_freqs_weight,
|
||||
bool imrope,
|
||||
bool stateful) {
|
||||
if (stateful) {
|
||||
inp_pos = std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
|
||||
@@ -128,13 +122,6 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
|
||||
auto pos_perm =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
|
||||
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
|
||||
} else if (imrope) {
|
||||
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
|
||||
auto pos_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{5}, {0, 0, 0, 4, -1});
|
||||
inp_pos = std::make_shared<ov::op::v1::Reshape>(inp_pos, pos_shape, true);
|
||||
auto pos_transpose_shape =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{5}, std::vector<int64_t>{0, 1, 2, 4, 3});
|
||||
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_transpose_shape);
|
||||
} else {
|
||||
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
|
||||
auto pos_perm =
|
||||
@@ -149,7 +136,6 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
const int n_dims = rope_params[1];
|
||||
const size_t n_dims_half = n_dims >> 1;
|
||||
const int n_ctx_orig = rope_params[4];
|
||||
memcpy(&freq_base, rope_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, rope_params + 6, sizeof(float));
|
||||
@@ -160,74 +146,57 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
std::vector<float> factor(n_dims_half);
|
||||
float corr_dims[2];
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
||||
|
||||
std::vector<float> factor(n_dims / 2);
|
||||
factor[0] = 1.0f;
|
||||
for (size_t i = 1; i < factor.size(); i++) {
|
||||
factor[i] = theta_scale * factor[i - 1];
|
||||
}
|
||||
|
||||
Output<Node> freq_factors;
|
||||
if (stateful) {
|
||||
freq_factors =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
|
||||
} else {
|
||||
freq_factors =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor);
|
||||
}
|
||||
if (rope_freqs_weight) {
|
||||
freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
|
||||
}
|
||||
|
||||
auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
|
||||
auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
|
||||
theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
|
||||
|
||||
Output<Node> theta;
|
||||
float mscale = attn_factor;
|
||||
if (imrope) {
|
||||
std::vector<int64_t> gather_indices(n_dims_half);
|
||||
for (size_t j = 0; j < n_dims_half; j++) {
|
||||
gather_indices[j] = j % 3;
|
||||
factor[j] = std::pow(theta_scale, j);
|
||||
}
|
||||
auto gather_indices_const =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{n_dims_half}, gather_indices);
|
||||
auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {4});
|
||||
inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, gather_indices_const, gather_axis);
|
||||
auto factor_const = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{n_dims_half}, factor);
|
||||
theta = std::make_shared<ov::op::v1::Multiply>(inp_pos, factor_const);
|
||||
if (ext_factor == 0.0f) {
|
||||
theta = theta_interp;
|
||||
} else {
|
||||
float corr_dims[2];
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
||||
factor[0] = 1.0f;
|
||||
for (size_t i = 1; i < factor.size(); i++) {
|
||||
factor[i] = theta_scale * factor[i - 1];
|
||||
}
|
||||
auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
|
||||
Output<Node> one;
|
||||
if (stateful) {
|
||||
freq_factors =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
|
||||
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
|
||||
} else {
|
||||
freq_factors =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor);
|
||||
}
|
||||
if (rope_freqs_weight) {
|
||||
freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
|
||||
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
|
||||
}
|
||||
auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
|
||||
|
||||
auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
|
||||
auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
|
||||
theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
|
||||
|
||||
if (ext_factor == 0.0f) {
|
||||
theta = theta_interp;
|
||||
} else {
|
||||
auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
|
||||
Output<Node> one;
|
||||
if (stateful) {
|
||||
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
|
||||
} else {
|
||||
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
|
||||
}
|
||||
auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
|
||||
|
||||
theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
|
||||
std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
|
||||
mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
|
||||
}
|
||||
theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
|
||||
std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
|
||||
mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
|
||||
}
|
||||
|
||||
Output<Node> cos_theta = std::make_shared<ov::op::v0::Cos>(theta);
|
||||
Output<Node> sin_theta = std::make_shared<ov::op::v0::Sin>(theta);
|
||||
|
||||
if (!imrope) {
|
||||
auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
|
||||
|
||||
cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
|
||||
sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
|
||||
}
|
||||
auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
|
||||
|
||||
cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
|
||||
sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
|
||||
return std::make_pair(sin_theta, cos_theta);
|
||||
}
|
||||
|
||||
|
||||
@@ -67,7 +67,6 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::
|
||||
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
|
||||
std::shared_ptr<ov::Node> inp_pos,
|
||||
std::shared_ptr<ov::Node> rope_freqs_weight = nullptr,
|
||||
bool imrope = false,
|
||||
bool stateful = false);
|
||||
|
||||
ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);
|
||||
|
||||
@@ -81,8 +81,8 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
|
||||
enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) {
|
||||
auto & core = ov_singleton_core();
|
||||
const auto & config = ggml_openvino_get_compile_config();
|
||||
const auto & device = r_ctx->device;
|
||||
const auto & stateful = r_ctx->stateful;
|
||||
auto device = r_ctx->device;
|
||||
bool stateful = r_ctx->stateful;
|
||||
static auto is_static = false;
|
||||
|
||||
if (is_naive(cgraph)) {
|
||||
@@ -106,26 +106,14 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
int64_t infer_end_time;
|
||||
|
||||
{
|
||||
std::shared_ptr<decoder_runtime_ctx> entry;
|
||||
std::lock_guard<std::mutex> lock(r_ctx->ov_compute_mutex);
|
||||
|
||||
auto it = r_ctx->decoder_cache.find(key);
|
||||
|
||||
cache_hit = it != r_ctx->decoder_cache.end();
|
||||
ModelParams old_m_params;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
auto it = r_ctx->decoder_cache.find(key);
|
||||
cache_hit = it != r_ctx->decoder_cache.end();
|
||||
if (cache_hit) {
|
||||
entry = it->second;
|
||||
} else {
|
||||
auto mutex = std::make_shared<std::mutex>();
|
||||
entry = std::make_shared<decoder_runtime_ctx>(mutex);
|
||||
r_ctx->decoder_cache[key] = entry;
|
||||
}
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(*(entry->mutex));
|
||||
|
||||
if (cache_hit) {
|
||||
ggml_decoder = entry->ptr;
|
||||
ggml_decoder = it->second;
|
||||
old_m_params = ggml_decoder->get_model_params();
|
||||
cache_hit = old_m_params.can_reuse_dynamically(m_params);
|
||||
}
|
||||
@@ -138,10 +126,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
ggml_decoder->update_io(cgraph);
|
||||
}
|
||||
ggml_decoder->add_extra_inputs();
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
infer_request = r_ctx->infer_request_cache.at(key);
|
||||
}
|
||||
infer_request = r_ctx->infer_request_cache.at(key);
|
||||
|
||||
if (stateful) {
|
||||
const auto * inp_pos = get_inp_pos_tensor(cgraph);
|
||||
@@ -185,10 +170,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
conversion_end_time = decoder_end_time;
|
||||
compile_end_time = decoder_end_time;
|
||||
} else {
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
r_ctx->infer_request_cache.erase(key);
|
||||
}
|
||||
r_ctx->infer_request_cache.erase(key);
|
||||
|
||||
std::shared_ptr<ov::Model> model;
|
||||
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
|
||||
@@ -217,7 +199,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
}
|
||||
compile_end_time = ggml_time_us();
|
||||
infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
|
||||
entry->ptr = ggml_decoder;
|
||||
r_ctx->infer_request_cache[key] = infer_request;
|
||||
r_ctx->decoder_cache[key] = ggml_decoder;
|
||||
|
||||
std::vector<std::string> ov_input_names;
|
||||
std::vector<std::string> ov_output_names;
|
||||
@@ -227,13 +210,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
for (const auto & ov_output : model->get_results()) {
|
||||
ov_output_names.push_back(ov_output->get_friendly_name());
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
r_ctx->infer_request_cache[key] = infer_request;
|
||||
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
|
||||
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
|
||||
}
|
||||
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
|
||||
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
|
||||
|
||||
if (stateful) {
|
||||
const auto * inp_pos = get_inp_pos_tensor(cgraph);
|
||||
@@ -246,13 +224,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> ov_input_names;
|
||||
std::vector<std::string> ov_output_names;
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
ov_input_names = r_ctx->ov_input_names_cache[key];
|
||||
ov_output_names = r_ctx->ov_output_names_cache[key];
|
||||
}
|
||||
auto ov_input_names = r_ctx->ov_input_names_cache[key];
|
||||
auto ov_output_names = r_ctx->ov_output_names_cache[key];
|
||||
|
||||
for (size_t i = 0; i < ov_input_names.size(); i++) {
|
||||
auto param_name = ov_input_names[i];
|
||||
@@ -333,26 +306,12 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
int64_t compile_end_time;
|
||||
int64_t infer_end_time;
|
||||
|
||||
std::shared_ptr<decoder_runtime_ctx> entry;
|
||||
auto it = r_ctx->decoder_cache.find(key);
|
||||
|
||||
cache_hit = it != r_ctx->decoder_cache.end();
|
||||
ModelParams old_m_params;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
auto it = r_ctx->decoder_cache.find(key);
|
||||
cache_hit = it != r_ctx->decoder_cache.end();
|
||||
if (cache_hit) {
|
||||
entry = it->second;
|
||||
} else {
|
||||
auto mutex = std::make_shared<std::mutex>();
|
||||
entry = std::make_shared<decoder_runtime_ctx>(mutex);
|
||||
r_ctx->decoder_cache[key] = entry;
|
||||
}
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(*(entry->mutex));
|
||||
|
||||
if (cache_hit) {
|
||||
ggml_decoder = entry->ptr;
|
||||
ggml_decoder = it->second;
|
||||
old_m_params = ggml_decoder->get_model_params();
|
||||
cache_hit = old_m_params.can_reuse_statically(m_params);
|
||||
}
|
||||
@@ -366,21 +325,14 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
ggml_decoder->update_io(cgraph);
|
||||
}
|
||||
ggml_decoder->add_extra_inputs();
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
infer_request =
|
||||
is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
|
||||
}
|
||||
infer_request = is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
|
||||
|
||||
decoder_end_time = ggml_time_us();
|
||||
conversion_end_time = decoder_end_time;
|
||||
compile_end_time = decoder_end_time;
|
||||
} else {
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
r_ctx->infer_request_cache.erase(key);
|
||||
r_ctx->infer_request_cache_prefill.erase(key);
|
||||
}
|
||||
r_ctx->infer_request_cache.erase(key);
|
||||
r_ctx->infer_request_cache_prefill.erase(key);
|
||||
|
||||
std::shared_ptr<ov::Model> model;
|
||||
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
|
||||
@@ -420,14 +372,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
compiled_model_decode = core.compile_model(model_decode, device, config);
|
||||
}
|
||||
|
||||
auto infer_request_prefill = std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
|
||||
auto infer_request_decode = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
|
||||
r_ctx->infer_request_cache_prefill[key] =
|
||||
std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
|
||||
r_ctx->infer_request_cache[key] =
|
||||
std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
|
||||
compile_end_time = ggml_time_us();
|
||||
|
||||
model = is_prefill ? model_prefill : model_decode;
|
||||
ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
|
||||
infer_request = is_prefill ? infer_request_prefill : infer_request_decode;
|
||||
entry->ptr = ggml_decoder;
|
||||
infer_request = is_prefill ? r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key];
|
||||
r_ctx->decoder_cache[key] = ggml_decoder;
|
||||
|
||||
std::vector<std::string> ov_input_names;
|
||||
std::vector<std::string> ov_output_names;
|
||||
@@ -437,29 +391,18 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
for (const auto & ov_output : model->get_results()) {
|
||||
ov_output_names.push_back(ov_output->get_friendly_name());
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
r_ctx->infer_request_cache_prefill[key] = infer_request_prefill;
|
||||
r_ctx->infer_request_cache[key] = infer_request_decode;
|
||||
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
|
||||
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
|
||||
}
|
||||
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
|
||||
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
|
||||
}
|
||||
|
||||
std::vector<std::string> ov_input_names_local;
|
||||
std::vector<std::string> ov_output_names_local;
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
ov_input_names_local = r_ctx->ov_input_names_cache[key];
|
||||
ov_output_names_local = r_ctx->ov_output_names_cache[key];
|
||||
}
|
||||
auto ov_input_names = r_ctx->ov_input_names_cache[key];
|
||||
auto ov_output_names = r_ctx->ov_output_names_cache[key];
|
||||
|
||||
if (is_prefill) {
|
||||
auto inp_len = inp_pos->ne[0];
|
||||
for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
|
||||
for (size_t i = 0; i < ov_input_names_local.size(); i++) {
|
||||
auto param_name = ov_input_names_local[i];
|
||||
for (size_t i = 0; i < ov_input_names.size(); i++) {
|
||||
auto param_name = ov_input_names[i];
|
||||
auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index);
|
||||
infer_request->set_input_tensor(i, input_tensor);
|
||||
|
||||
@@ -469,8 +412,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
|
||||
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
|
||||
auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
|
||||
infer_request->set_output_tensor(i, output_tensor);
|
||||
}
|
||||
@@ -478,16 +421,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
infer_request->infer();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
|
||||
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
const auto output_tensor = infer_request->get_output_tensor(i);
|
||||
print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
|
||||
print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
|
||||
}
|
||||
}
|
||||
}
|
||||
infer_end_time = ggml_time_us();
|
||||
} else {
|
||||
for (size_t i = 0; i < ov_input_names_local.size(); i++) {
|
||||
auto param_name = ov_input_names_local[i];
|
||||
for (size_t i = 0; i < ov_input_names.size(); i++) {
|
||||
auto param_name = ov_input_names[i];
|
||||
auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name);
|
||||
infer_request->set_input_tensor(i, input_tensor);
|
||||
|
||||
@@ -497,8 +440,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
|
||||
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
|
||||
auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
|
||||
infer_request->set_output_tensor(i, output_tensor);
|
||||
}
|
||||
@@ -507,9 +450,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
infer_end_time = ggml_time_us();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
|
||||
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
const auto output_tensor = infer_request->get_output_tensor(i);
|
||||
print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
|
||||
print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,15 +3,12 @@
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <cstddef>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <openvino/runtime/core.hpp>
|
||||
#include <openvino/runtime/infer_request.hpp>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
struct graph_key {
|
||||
@@ -43,17 +40,11 @@ struct graph_key_hash {
|
||||
}
|
||||
};
|
||||
|
||||
struct decoder_runtime_ctx {
|
||||
decoder_runtime_ctx(std::shared_ptr<std::mutex> mutex) : mutex(std::move(mutex)) {}
|
||||
std::shared_ptr<std::mutex> mutex;
|
||||
std::shared_ptr<GgmlOvDecoder> ptr;
|
||||
};
|
||||
|
||||
struct ov_runtime_context {
|
||||
mutable std::mutex ctx_mutex;
|
||||
std::mutex ov_compute_mutex;
|
||||
std::string device;
|
||||
bool stateful;
|
||||
std::unordered_map<graph_key, std::shared_ptr<decoder_runtime_ctx>, graph_key_hash> decoder_cache;
|
||||
std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
|
||||
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
|
||||
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
|
||||
std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
|
||||
@@ -62,22 +53,11 @@ struct ov_runtime_context {
|
||||
// Simultaneous stateful inference request support to be added.
|
||||
size_t stateful_kv_size;
|
||||
std::map<std::string, std::string> kv_state_input_name_map;
|
||||
std::atomic<int> backend_count;
|
||||
|
||||
ov_runtime_context() :
|
||||
device("CPU"),
|
||||
stateful(false),
|
||||
stateful_kv_size(0),
|
||||
backend_count(0) {}
|
||||
|
||||
void clear_caches() {
|
||||
std::lock_guard<std::mutex> lock(ctx_mutex);
|
||||
decoder_cache.clear();
|
||||
infer_request_cache.clear();
|
||||
infer_request_cache_prefill.clear();
|
||||
ov_input_names_cache.clear();
|
||||
ov_output_names_cache.clear();
|
||||
}
|
||||
stateful_kv_size(0) {}
|
||||
};
|
||||
|
||||
enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend);
|
||||
|
||||
@@ -28,13 +28,6 @@
|
||||
|
||||
namespace syclexp = sycl::ext::oneapi::experimental;
|
||||
|
||||
#if defined(__INTEL_LLVM_COMPILER) && __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
|
||||
#include <sycl/ext/oneapi/bfloat16.hpp>
|
||||
#ifndef GGML_SYCL_HAS_BF16
|
||||
#define GGML_SYCL_HAS_BF16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if GGML_SYCL_DNNL
|
||||
#include "dnnl.hpp"
|
||||
#include "dnnl_sycl.hpp"
|
||||
|
||||
@@ -2,6 +2,13 @@
|
||||
#include "dequantize.hpp"
|
||||
#include "presets.hpp"
|
||||
|
||||
#if defined(__INTEL_LLVM_COMPILER)
|
||||
#if __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
|
||||
#include <sycl/ext/oneapi/bfloat16.hpp>
|
||||
#define GGML_SYCL_HAS_BF16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
@@ -760,22 +767,6 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
}
|
||||
|
||||
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
// Picks the convert_unary_sycl instantiation that converts a buffer of `type`
// into bfloat16.
//
// type : element type of the source buffer (F32, F16 or BF16 are handled).
// dst  : unused.
//
// Returns the matching converter. Any other type aborts via GGML_ABORT; the
// trailing `return nullptr` follows the abort and is presumably never reached.
to_bf16_sycl_t ggml_get_to_bf16_sycl(ggml_type type, ggml_tensor * /*dst*/) {
    if (type == GGML_TYPE_F32) {
        return convert_unary_sycl<float>;
    }
    if (type == GGML_TYPE_F16) {
        return convert_unary_sycl<sycl::half>;
    }
    if (type == GGML_TYPE_BF16) {
        return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
    }

    GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
    return nullptr;
}
|
||||
#endif
|
||||
|
||||
to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
|
||||
@@ -23,11 +23,6 @@ typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst);
|
||||
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor * dst);
|
||||
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
typedef to_t_sycl_t<sycl::ext::oneapi::bfloat16> to_bf16_sycl_t;
|
||||
to_bf16_sycl_t ggml_get_to_bf16_sycl(ggml_type type, ggml_tensor * dst);
|
||||
#endif
|
||||
|
||||
// Nc = Non-contiguous
|
||||
template <typename T>
|
||||
using to_t_nc_sycl_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
|
||||
@@ -40,19 +35,15 @@ template<typename dst_t, typename src_t>
|
||||
inline dst_t ggml_sycl_cast(src_t x) {
|
||||
if constexpr (std::is_same_v<dst_t, src_t>) {
|
||||
return x;
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
} else if constexpr (std::is_same_v<dst_t, sycl::ext::oneapi::bfloat16>) {
|
||||
return sycl::ext::oneapi::bfloat16(float(x));
|
||||
} else if constexpr (std::is_same_v<src_t, sycl::ext::oneapi::bfloat16>) {
|
||||
return static_cast<float>(x);
|
||||
#endif
|
||||
} else if constexpr (std::is_same_v<src_t, sycl::float2> && std::is_same_v<dst_t, sycl::half2>) {
|
||||
return x.template convert<sycl::half, sycl::rounding_mode::rte>();
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
} else if constexpr (std::is_same_v<src_t, sycl::float2> &&
|
||||
std::is_same_v<dst_t, sycl::vec<sycl::ext::oneapi::bfloat16, 2>>) {
|
||||
return {x.x, x.y};
|
||||
#endif
|
||||
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
|
||||
return int32_t(x);
|
||||
} else {
|
||||
|
||||
@@ -29,9 +29,6 @@ public:
|
||||
static constexpr dt to_dt() {
|
||||
if constexpr (std::is_same_v<T, float>) return dt::f32;
|
||||
else if constexpr (std::is_same_v<T, sycl::half>) return dt::f16;
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
else if constexpr (std::is_same_v<T, sycl::ext::oneapi::bfloat16>) return dt::bf16;
|
||||
#endif
|
||||
else static_assert(0);
|
||||
}
|
||||
|
||||
|
||||
@@ -2176,31 +2176,6 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
#else
|
||||
bool use_fp16 = false;
|
||||
#endif
|
||||
|
||||
#if GGML_SYCL_DNNL && defined(GGML_SYCL_HAS_BF16)
|
||||
// Fast path for bf16 src0
|
||||
if (src0->type == GGML_TYPE_BF16 && !g_ggml_sycl_disable_dnn && ggml_is_contiguous(src0) &&
|
||||
row_diff == src0->ne[1]) {
|
||||
using bf16_t = sycl::ext::oneapi::bfloat16;
|
||||
ggml_sycl_pool_alloc<bf16_t> src1_as_bf16(ctx.pool(), src1_ncols*ne10);
|
||||
if (src1->type != GGML_TYPE_BF16) {
|
||||
const to_bf16_sycl_t to_bf16_sycl = ggml_get_to_bf16_sycl(src1->type, dst);
|
||||
GGML_ASSERT(to_bf16_sycl != nullptr);
|
||||
to_bf16_sycl(src1_ddf_i, src1_as_bf16.get(), src1_ncols*ne10, stream);
|
||||
} else {
|
||||
stream->memcpy(src1_as_bf16.get(), src1_ddf_i, src1_ncols*ne10*sizeof(bf16_t));
|
||||
}
|
||||
DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10,
|
||||
src0_dd_i, DnnlGemmWrapper::to_dt<bf16_t>(),
|
||||
src1_as_bf16.get(), DnnlGemmWrapper::to_dt<bf16_t>(),
|
||||
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
||||
GGML_UNUSED(dst);
|
||||
GGML_UNUSED(src1_ddq_i);
|
||||
GGML_UNUSED(src1_padded_row_size);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
|
||||
row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
|
||||
@@ -3808,51 +3783,6 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
|
||||
}
|
||||
}
|
||||
|
||||
// Fused MoE TG fast path. Returns false to fall back to the per-expert loop below.
|
||||
static bool ggml_sycl_mul_mat_id_mmvq_fused(
|
||||
ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
|
||||
const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst)
|
||||
{
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
if (ne12 != 1) return false;
|
||||
if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false;
|
||||
if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
|
||||
if (!ggml_is_contiguous(src1)) return false;
|
||||
|
||||
// Reorder layout not supported; fall back.
|
||||
const ggml_tensor_extra_gpu * src0_extra =
|
||||
static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
|
||||
if (src0_extra && src0_extra->optimized_feature.reorder) return false;
|
||||
|
||||
const int64_t n_ids_per_group = ids->ne[0];
|
||||
if (ids->ne[1] != 1) return false;
|
||||
if (ne11 != 1 && ne11 != n_ids_per_group) return false;
|
||||
|
||||
const queue_ptr stream = ctx.stream();
|
||||
const int src1_padded_cols = GGML_PAD((int) ne10, MATRIX_ROW_PADDING);
|
||||
const int n_experts_used = (int) n_ids_per_group;
|
||||
const int nrows = (int) src0->ne[1];
|
||||
|
||||
ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
|
||||
(size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
|
||||
char * src1_ddq = src1_q8_alloc.get();
|
||||
quantize_row_q8_1_sycl<quantize_q8_1>(
|
||||
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
|
||||
src1_padded_cols, stream);
|
||||
|
||||
const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
|
||||
const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
|
||||
|
||||
return ggml_sycl_mul_mat_vec_q_id(
|
||||
src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
|
||||
(float *) dst->data, (int) ne10, nrows, n_experts_used,
|
||||
/*expert_weight_stride=*/ src0->nb[2],
|
||||
/*dst_row_stride=*/ dst->nb[1],
|
||||
src1_row_stride, stream);
|
||||
}
|
||||
|
||||
static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
ggml_tensor *dst) try {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
|
||||
@@ -3868,12 +3798,6 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
const int64_t n_as = ne02;
|
||||
const int64_t n_ids = ids->ne[0];
|
||||
|
||||
if (ne12 == 1) {
|
||||
if (ggml_sycl_mul_mat_id_mmvq_fused(ctx, src0, src1, ids, dst)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<char> ids_host(ggml_nbytes(ids));
|
||||
const char * ids_dev = (const char *) ids->data;
|
||||
|
||||
@@ -3924,9 +3848,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const int64_t n_routed_rows = ids->ne[1] * n_ids;
|
||||
ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne10);
|
||||
ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne0);
|
||||
ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
|
||||
ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
|
||||
|
||||
src1_row.data = src1_contiguous.get();
|
||||
dst_row.data = dst_contiguous.get();
|
||||
|
||||
@@ -1199,154 +1199,3 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||
GGML_UNUSED(src1_ddf_i);
|
||||
GGML_UNUSED(ctx);
|
||||
}
|
||||
|
||||
// src1_row_stride: 0 for shared src1 (gate/up proj), else per-expert stride (down proj).
|
||||
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
|
||||
static void mul_mat_vec_q_moe(
|
||||
const void * __restrict__ vx_base, const void * __restrict__ vy_base,
|
||||
float * __restrict__ dst_base, const int32_t * __restrict__ ids_dev,
|
||||
const int ncols, const int nrows,
|
||||
const size_t expert_weight_stride, const size_t dst_row_stride,
|
||||
const size_t src1_row_stride,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
|
||||
const int expert_idx = item_ct1.get_group(1);
|
||||
const int i02 = ids_dev[expert_idx];
|
||||
|
||||
const char * vx = (const char *) vx_base + (size_t) i02 * expert_weight_stride;
|
||||
const char * vy = (const char *) vy_base + (size_t) expert_idx * src1_row_stride;
|
||||
float * dst = (float *) ((char *) dst_base + (size_t) expert_idx * dst_row_stride);
|
||||
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
|
||||
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int blocks_per_row = ncols / qk;
|
||||
constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi;
|
||||
|
||||
float tmp = 0.0f;
|
||||
|
||||
const block_q_t * x = (const block_q_t *) vx;
|
||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||
|
||||
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
|
||||
const int ibx = row * blocks_per_row + i;
|
||||
const int iby = i * (qk / QK8_1);
|
||||
|
||||
for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
|
||||
const int iqs = elem + vdr * (item_ct1.get_local_id(2) % (qi / vdr));
|
||||
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
|
||||
if (item_ct1.get_local_id(2) == 0) {
|
||||
dst[row] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
|
||||
static void launch_mul_mat_vec_q_moe(
|
||||
const void * vx_base, const void * vy, const int32_t * ids_dev,
|
||||
float * dst_base, const int ncols, const int nrows, const int n_experts_used,
|
||||
const size_t expert_weight_stride, const size_t dst_row_stride,
|
||||
const size_t src1_row_stride,
|
||||
dpct::queue_ptr stream) {
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, (unsigned) n_experts_used, (unsigned) block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_moe<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
|
||||
vx_base, vy, dst_base, ids_dev, ncols, nrows,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, item);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
bool ggml_sycl_mul_mat_vec_q_id(
|
||||
enum ggml_type src0_type,
|
||||
const void * vx_base,
|
||||
const void * vy,
|
||||
const int32_t * ids_dev,
|
||||
float * dst_base,
|
||||
int ncols,
|
||||
int nrows,
|
||||
int n_experts_used,
|
||||
size_t expert_weight_stride,
|
||||
size_t dst_row_stride,
|
||||
size_t src1_row_stride,
|
||||
dpct::queue_ptr stream) {
|
||||
switch (src0_type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
launch_mul_mat_vec_q_moe<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q4_1:
|
||||
launch_mul_mat_vec_q_moe<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q5_0:
|
||||
launch_mul_mat_vec_q_moe<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q5_1:
|
||||
launch_mul_mat_vec_q_moe<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q8_0:
|
||||
launch_mul_mat_vec_q_moe<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q2_K:
|
||||
launch_mul_mat_vec_q_moe<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q3_K:
|
||||
launch_mul_mat_vec_q_moe<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q4_K:
|
||||
launch_mul_mat_vec_q_moe<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q5_K:
|
||||
launch_mul_mat_vec_q_moe<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q6_K:
|
||||
launch_mul_mat_vec_q_moe<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_MXFP4:
|
||||
launch_mul_mat_vec_q_moe<QK_MXFP4, QI_MXFP4, block_mxfp4, VDR_MXFP4_Q8_1_MMVQ, vec_dot_mxfp4_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_NVFP4:
|
||||
launch_mul_mat_vec_q_moe<QK_NVFP4, QI_NVFP4, block_nvfp4, VDR_NVFP4_Q8_1_MMVQ, vec_dot_nvfp4_q8_1>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,20 +24,4 @@ void ggml_sycl_op_mul_mat_vec_q(
|
||||
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
||||
const dpct::queue_ptr &stream);
|
||||
|
||||
// Requires standard (non-reorder) block layout for src0.
|
||||
// Returns false if src0_type isn't handled; caller should fall back.
|
||||
bool ggml_sycl_mul_mat_vec_q_id(
|
||||
enum ggml_type src0_type,
|
||||
const void * vx_base, // start of stacked expert weights
|
||||
const void * vy, // pre-quantized src1 (Q8_1)
|
||||
const int32_t * ids_dev, // device-side int32, length n_experts_used
|
||||
float * dst_base,
|
||||
int ncols,
|
||||
int nrows,
|
||||
int n_experts_used,
|
||||
size_t expert_weight_stride, // bytes between experts in vx_base
|
||||
size_t dst_row_stride, // bytes between dst rows
|
||||
size_t src1_row_stride, // 0 = shared src1, else per-expert stride in bytes
|
||||
dpct::queue_ptr stream);
|
||||
|
||||
#endif // GGML_SYCL_MMVQ_HPP
|
||||
|
||||
@@ -4,11 +4,7 @@
|
||||
namespace utils {
|
||||
template<typename T>
|
||||
static constexpr bool is_arithmetic_v() {
|
||||
return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half>
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
|| std::is_same_v<T, sycl::ext::oneapi::bfloat16>
|
||||
#endif
|
||||
;
|
||||
return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half> || std::is_same_v<T, sycl::ext::oneapi::bfloat16>;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -185,7 +181,6 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s
|
||||
stream
|
||||
);
|
||||
break;
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
case GGML_TYPE_BF16:
|
||||
set_rows_sycl<TIn, TIdx, sycl::ext::oneapi::bfloat16>(
|
||||
src0_d, src1_d, (char *)dst->data,
|
||||
@@ -198,7 +193,6 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s
|
||||
stream
|
||||
);
|
||||
break;
|
||||
#endif
|
||||
case GGML_TYPE_Q8_0:
|
||||
set_rows_sycl_q<TIdx, block_q8_0, QK8_0, cpy_blck_f32_q8_0>(src0_d, src1_d, (block_q8_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
|
||||
@@ -792,7 +792,6 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_arange_f32;
|
||||
|
||||
vk_pipeline pipeline_fill_f32;
|
||||
vk_pipeline pipeline_fill_f16;
|
||||
|
||||
vk_pipeline pipeline_geglu[2];
|
||||
vk_pipeline pipeline_reglu[2];
|
||||
@@ -4578,7 +4577,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_fill_f16, "fill_f16", fill_f16_len, fill_f16_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
#define CREATE_GLU(name) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
@@ -9846,9 +9844,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_fill_f32;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
return ctx->device->pipeline_fill_f16;
|
||||
}
|
||||
return nullptr;
|
||||
default:
|
||||
return nullptr;
|
||||
@@ -15718,9 +15713,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
|| (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32)
|
||||
|| (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16);
|
||||
case GGML_OP_ARANGE:
|
||||
return op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_FILL:
|
||||
return op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
|
||||
return op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SCALE:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_PAD:
|
||||
|
||||
@@ -889,7 +889,6 @@ void process_shaders() {
|
||||
string_to_spv("add1_f32_f32", "add1.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("arange_f32", "arange.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("fill_f32", "fill.comp", {{"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("fill_f16", "fill.comp", {{"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("step_f16", "step.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("step_f32", "step.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("round_f16", "round.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
|
||||
@@ -44,9 +44,18 @@
|
||||
// Matrix-vector multiplication parameters
|
||||
#define WEBGPU_MUL_MAT_VEC_WG_SIZE 256
|
||||
|
||||
#define WEBGPU_MUL_MAT_VEC_FLOAT_OUTPUTS_PER_WG 4
|
||||
#define WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG 4
|
||||
#define WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG 4
|
||||
// Must be multiple of 4 to work with vectorized paths, and must divide
|
||||
// mul_mat_vec wg size
|
||||
#define WEBGPU_MUL_MAT_VEC_FLOAT_OUTPUTS_PER_WG 64
|
||||
#define WEBGPU_MUL_MAT_VEC_FLOAT_TILE_K 256
|
||||
|
||||
#define WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG 64
|
||||
#define WEBGPU_MUL_MAT_VEC_LEGACY_Q_TILE_K 256
|
||||
|
||||
// Requires 32 threads per output (wg_size/outputs_per_wg == 32)
|
||||
#define WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG 8
|
||||
// Requires at least two (and multiple of 2) k-quant blocks per tile
|
||||
#define WEBGPU_MUL_MAT_VEC_K_Q_TILE_K 512
|
||||
|
||||
// default size for legacy matrix multiplication
|
||||
#define WEBGPU_MUL_MAT_WG_SIZE 256
|
||||
@@ -69,7 +78,6 @@ struct ggml_webgpu_shader_lib_context {
|
||||
bool inplace = false;
|
||||
bool overlap = false;
|
||||
bool src_overlap = false;
|
||||
bool supports_subgroups = false;
|
||||
bool supports_subgroup_matrix = false;
|
||||
uint32_t sg_mat_m = 0;
|
||||
uint32_t sg_mat_n = 0;
|
||||
@@ -194,26 +202,6 @@ struct ggml_webgpu_row_norm_pipeline_key_hash {
|
||||
}
|
||||
};
|
||||
|
||||
/** RMS_NORM + MUL **/
|
||||
|
||||
struct ggml_webgpu_rms_norm_mul_pipeline_key {
|
||||
bool inplace;
|
||||
bool src_overlap;
|
||||
|
||||
bool operator==(const ggml_webgpu_rms_norm_mul_pipeline_key & other) const {
|
||||
return inplace == other.inplace && src_overlap == other.src_overlap;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_rms_norm_mul_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_rms_norm_mul_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.inplace);
|
||||
ggml_webgpu_hash_combine(seed, key.src_overlap);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
/** Pad **/
|
||||
struct ggml_webgpu_pad_pipeline_key {
|
||||
bool circular;
|
||||
@@ -260,46 +248,6 @@ struct ggml_webgpu_ssm_conv_pipeline_key {
|
||||
}
|
||||
};
|
||||
|
||||
/** CONV 2D */
|
||||
struct ggml_webgpu_conv2d_pipeline_key {
|
||||
ggml_type weight_type;
|
||||
ggml_type input_type;
|
||||
ggml_type output_type;
|
||||
|
||||
bool operator==(const ggml_webgpu_conv2d_pipeline_key & other) const {
|
||||
return weight_type == other.weight_type && input_type == other.input_type && output_type == other.output_type;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_conv2d_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_conv2d_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.weight_type);
|
||||
ggml_webgpu_hash_combine(seed, key.input_type);
|
||||
ggml_webgpu_hash_combine(seed, key.output_type);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
/** Im2Col **/
|
||||
struct ggml_webgpu_im2col_pipeline_key {
|
||||
ggml_type input_type;
|
||||
ggml_type output_type;
|
||||
|
||||
bool operator==(const ggml_webgpu_im2col_pipeline_key & other) const {
|
||||
return input_type == other.input_type && output_type == other.output_type;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_im2col_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_im2col_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.input_type);
|
||||
ggml_webgpu_hash_combine(seed, key.output_type);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
/** Gated Delta Net **/
|
||||
struct ggml_webgpu_gated_delta_net_pipeline_key {
|
||||
int type;
|
||||
@@ -556,7 +504,7 @@ inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_
|
||||
const size_t q_tile = context.sg_mat_m;
|
||||
const size_t base_q_bytes = (key.head_dim_qk + key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES +
|
||||
2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES;
|
||||
size_t bytes_per_kv = 0;
|
||||
size_t bytes_per_kv = 0;
|
||||
if (!key.kv_direct) {
|
||||
bytes_per_kv += std::max(key.head_dim_qk, key.head_dim_v);
|
||||
}
|
||||
@@ -627,6 +575,7 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key_hash {
|
||||
|
||||
struct ggml_webgpu_mul_mat_vec_shader_decisions {
|
||||
uint32_t wg_size;
|
||||
uint32_t tile_k;
|
||||
uint32_t outputs_per_wg;
|
||||
uint32_t vec_size;
|
||||
};
|
||||
@@ -794,17 +743,16 @@ class ggml_webgpu_shader_lib {
|
||||
std::unordered_map<int, webgpu_pipeline> cumsum_pipelines; // key is fixed, no variants yet
|
||||
std::unordered_map<ggml_webgpu_row_norm_pipeline_key, webgpu_pipeline, ggml_webgpu_row_norm_pipeline_key_hash>
|
||||
row_norm_pipelines; // op/inplace
|
||||
|
||||
std::unordered_map<ggml_webgpu_get_rows_pipeline_key, webgpu_pipeline, ggml_webgpu_get_rows_pipeline_key_hash>
|
||||
get_rows_pipelines; // src_type, vectorized
|
||||
get_rows_pipelines; // src_type, vectorized
|
||||
std::unordered_map<ggml_webgpu_unary_pipeline_key, webgpu_pipeline, ggml_webgpu_unary_pipeline_key_hash>
|
||||
unary_pipelines; // type/op/inplace
|
||||
unary_pipelines; // type/op/inplace
|
||||
std::unordered_map<ggml_webgpu_scale_pipeline_key, webgpu_pipeline, ggml_webgpu_scale_pipeline_key_hash>
|
||||
scale_pipelines; // inplace
|
||||
scale_pipelines; // inplace
|
||||
std::unordered_map<ggml_webgpu_solve_tri_pipeline_key, webgpu_pipeline, ggml_webgpu_solve_tri_pipeline_key_hash>
|
||||
solve_tri_pipelines; // type
|
||||
solve_tri_pipelines; // type
|
||||
std::unordered_map<ggml_webgpu_ssm_conv_pipeline_key, webgpu_pipeline, ggml_webgpu_ssm_conv_pipeline_key_hash>
|
||||
ssm_conv_pipelines; // type/vectorized
|
||||
ssm_conv_pipelines; // type/vectorized
|
||||
std::unordered_map<ggml_webgpu_gated_delta_net_pipeline_key,
|
||||
webgpu_pipeline,
|
||||
ggml_webgpu_gated_delta_net_pipeline_key_hash>
|
||||
@@ -850,15 +798,6 @@ class ggml_webgpu_shader_lib {
|
||||
rope_pipelines;
|
||||
std::unordered_map<ggml_webgpu_soft_max_pipeline_key, webgpu_pipeline, ggml_webgpu_soft_max_pipeline_key_hash>
|
||||
soft_max_pipelines;
|
||||
std::unordered_map<ggml_webgpu_conv2d_pipeline_key, webgpu_pipeline, ggml_webgpu_conv2d_pipeline_key_hash>
|
||||
conv2d_pipelines;
|
||||
std::unordered_map<ggml_webgpu_im2col_pipeline_key, webgpu_pipeline, ggml_webgpu_im2col_pipeline_key_hash>
|
||||
im2col_pipelines;
|
||||
|
||||
std::unordered_map<ggml_webgpu_rms_norm_mul_pipeline_key,
|
||||
webgpu_pipeline,
|
||||
ggml_webgpu_rms_norm_mul_pipeline_key_hash>
|
||||
rms_norm_mul_pipelines;
|
||||
|
||||
public:
|
||||
ggml_webgpu_shader_lib(wgpu::Device device) { this->device = device; }
|
||||
@@ -1387,7 +1326,7 @@ class ggml_webgpu_shader_lib {
|
||||
ggml_webgpu_mul_mat_vec_pipeline_key key = {};
|
||||
key.src0_type = context.src0->type;
|
||||
key.src1_type = context.src1->type;
|
||||
key.vectorized = (context.src0->ne[0] % 4 == 0 &&
|
||||
key.vectorized = (context.src0->ne[0] % 4 == 0 && context.dst->ne[0] % 4 == 0 &&
|
||||
(context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
|
||||
1 :
|
||||
0;
|
||||
@@ -1398,8 +1337,7 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "mul_mat_vec";
|
||||
const char * shader_src = wgsl_mul_mat_vec;
|
||||
std::string variant = "mul_mat_vec";
|
||||
|
||||
// src0 type (matrix row)
|
||||
switch (context.src0->type) {
|
||||
@@ -1448,25 +1386,25 @@ class ggml_webgpu_shader_lib {
|
||||
defines.push_back(key.vectorized ? "VEC" : "SCALAR");
|
||||
|
||||
uint32_t wg_size = WEBGPU_MUL_MAT_VEC_WG_SIZE;
|
||||
uint32_t tile_k = WEBGPU_MUL_MAT_VEC_FLOAT_TILE_K;
|
||||
uint32_t outputs_per_wg = WEBGPU_MUL_MAT_VEC_FLOAT_OUTPUTS_PER_WG;
|
||||
|
||||
if (key.src0_type >= GGML_TYPE_Q2_K) {
|
||||
tile_k = WEBGPU_MUL_MAT_VEC_K_Q_TILE_K;
|
||||
outputs_per_wg = WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG;
|
||||
} else if (key.src0_type >= GGML_TYPE_Q4_0) {
|
||||
tile_k = WEBGPU_MUL_MAT_VEC_LEGACY_Q_TILE_K;
|
||||
outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG;
|
||||
}
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
|
||||
defines.push_back(std::string("TILE_K=") + std::to_string(tile_k));
|
||||
defines.push_back(std::string("OUTPUTS_PER_WG=") + std::to_string(outputs_per_wg));
|
||||
defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
|
||||
variant += context.supports_subgroups ? "_sg_reduce" : "_wg_reduce";
|
||||
if (key.vectorized) {
|
||||
variant += "_vectorized";
|
||||
}
|
||||
|
||||
auto processed = preprocessor.preprocess(shader_src, defines);
|
||||
auto processed = preprocessor.preprocess(wgsl_mul_mat_vec, defines);
|
||||
auto decisions = std::make_shared<ggml_webgpu_mul_mat_vec_shader_decisions>();
|
||||
decisions->wg_size = wg_size;
|
||||
decisions->tile_k = tile_k;
|
||||
decisions->outputs_per_wg = outputs_per_wg;
|
||||
decisions->vec_size = key.vectorized ? 4 : 1;
|
||||
|
||||
@@ -1875,39 +1813,6 @@ class ggml_webgpu_shader_lib {
|
||||
return unary_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_rms_norm_mul_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_rms_norm_mul_pipeline_key key = {};
|
||||
key.inplace = context.inplace;
|
||||
key.src_overlap = context.src_overlap;
|
||||
|
||||
auto it = rms_norm_mul_pipelines.find(key);
|
||||
if (it != rms_norm_mul_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
std::vector<std::string> defines;
|
||||
std::string op_name = "RMS_NORM_MUL";
|
||||
std::string variant = op_name;
|
||||
|
||||
if (key.inplace) {
|
||||
defines.push_back("INPLACE");
|
||||
variant += "_inplace";
|
||||
} else if (key.src_overlap) {
|
||||
defines.push_back("SRC_OVERLAP");
|
||||
variant += "_src_overlap";
|
||||
}
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
|
||||
|
||||
auto processed = preprocessor.preprocess(wgsl_rms_norm_mul, defines);
|
||||
auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
|
||||
decisions->wg_size = context.max_wg_size;
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
|
||||
pipeline.context = decisions;
|
||||
rms_norm_mul_pipelines[key] = pipeline;
|
||||
return rms_norm_mul_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_binary_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_binary_pipeline_key key = {};
|
||||
key.type = context.dst->type;
|
||||
@@ -2485,84 +2390,6 @@ class ggml_webgpu_shader_lib {
|
||||
return soft_max_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_conv2d_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_conv2d_pipeline_key key = {};
|
||||
key.weight_type = context.src0->type;
|
||||
key.input_type = context.src1->type;
|
||||
key.output_type = context.dst->type;
|
||||
|
||||
auto it = conv2d_pipelines.find(key);
|
||||
if (it != conv2d_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "conv_2d";
|
||||
|
||||
auto push_type_defines = [&](const char * prefix, ggml_type type) {
|
||||
std::string s_prefix = prefix;
|
||||
if (type == GGML_TYPE_F32) {
|
||||
defines.push_back(s_prefix + "_F32");
|
||||
} else if (type == GGML_TYPE_F16) {
|
||||
defines.push_back(s_prefix + "_F16");
|
||||
} else {
|
||||
GGML_ABORT("Unsupported type for CONV_2D shader");
|
||||
}
|
||||
};
|
||||
|
||||
push_type_defines("WEIGHT", key.weight_type);
|
||||
push_type_defines("INPUT", key.input_type);
|
||||
push_type_defines("OUTPUT", key.output_type);
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
|
||||
|
||||
auto processed = preprocessor.preprocess(wgsl_conv2d, defines);
|
||||
auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
|
||||
decisions->wg_size = context.max_wg_size;
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
|
||||
pipeline.context = decisions;
|
||||
conv2d_pipelines[key] = pipeline;
|
||||
return conv2d_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_im2col_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_im2col_pipeline_key key = {};
|
||||
key.input_type = context.src1->type;
|
||||
key.output_type = context.dst->type;
|
||||
|
||||
auto it = im2col_pipelines.find(key);
|
||||
if (it != im2col_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "im2col";
|
||||
|
||||
auto push_type_defines = [&](const char * prefix, ggml_type type) {
|
||||
std::string s_prefix = prefix;
|
||||
if (type == GGML_TYPE_F32) {
|
||||
defines.push_back(s_prefix + "_F32");
|
||||
} else if (type == GGML_TYPE_F16) {
|
||||
defines.push_back(s_prefix + "_F16");
|
||||
} else {
|
||||
GGML_ABORT("Unsupported type for IM2COL shader");
|
||||
}
|
||||
};
|
||||
|
||||
push_type_defines("INPUT", key.input_type);
|
||||
push_type_defines("OUTPUT", key.output_type);
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
|
||||
|
||||
auto processed = preprocessor.preprocess(wgsl_im2col, defines);
|
||||
auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
|
||||
decisions->wg_size = context.max_wg_size;
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
|
||||
pipeline.context = decisions;
|
||||
im2col_pipelines[key] = pipeline;
|
||||
return im2col_pipelines[key];
|
||||
}
|
||||
|
||||
private:
|
||||
static webgpu_pipeline ggml_webgpu_create_pipeline(wgpu::Device & device,
|
||||
std::string shader_code,
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-webgpu-shader-lib.hpp"
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __EMSCRIPTEN__
|
||||
# include <emscripten/emscripten.h>
|
||||
@@ -182,7 +181,6 @@ struct webgpu_dispatch_desc {
|
||||
|
||||
struct webgpu_capabilities {
|
||||
wgpu::Limits limits;
|
||||
bool supports_subgroups = false;
|
||||
bool supports_subgroup_matrix = false;
|
||||
|
||||
uint32_t sg_mat_m = 0;
|
||||
@@ -212,7 +210,6 @@ struct webgpu_global_context_struct {
|
||||
wgpu::Buffer memset_params_buf;
|
||||
webgpu_pipeline memset_pipeline;
|
||||
|
||||
// TODO: We should rework the CPU profiling time handling to make it more useful. ref: https://github.com/ggml-org/llama.cpp/pull/22050
|
||||
#ifdef GGML_WEBGPU_CPU_PROFILE
|
||||
// Profiling: labeled CPU time in ms (total)
|
||||
std::unordered_map<std::string, double> cpu_time_ms;
|
||||
@@ -220,6 +217,11 @@ struct webgpu_global_context_struct {
|
||||
std::unordered_map<std::string, double> cpu_detail_ms;
|
||||
#endif
|
||||
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
// Profiling: per-shader GPU time in ms
|
||||
std::unordered_map<std::string, double> shader_gpu_time_ms;
|
||||
#endif
|
||||
|
||||
#ifdef GGML_WEBGPU_DEBUG
|
||||
wgpu::Buffer debug_host_buf;
|
||||
wgpu::Buffer debug_dev_buf;
|
||||
@@ -265,12 +267,10 @@ struct webgpu_context_struct {
|
||||
size_t memset_bytes_per_thread;
|
||||
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
// Profiling: per-shader GPU time in ms
|
||||
std::unordered_map<std::string, double> shader_gpu_time_ms;
|
||||
wgpu::Buffer profile_timestamp_dev_buf;
|
||||
wgpu::Buffer profile_timestamp_host_buf;
|
||||
wgpu::QuerySet profile_timestamp_query_set;
|
||||
uint32_t profile_timestamp_query_count = 0;
|
||||
wgpu::Buffer profile_timestamp_dev_buf;
|
||||
wgpu::Buffer profile_timestamp_host_buf;
|
||||
wgpu::QuerySet profile_timestamp_query_set;
|
||||
uint32_t profile_timestamp_query_count = 0;
|
||||
#endif
|
||||
|
||||
~webgpu_context_struct() {
|
||||
@@ -712,12 +712,12 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) {
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
std::cout << "\n[ggml_webgpu gpu profiling summary]\n";
|
||||
double total_gpu = 0.0;
|
||||
for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) {
|
||||
for (const auto & kv : ctx->webgpu_ctx->global_ctx->shader_gpu_time_ms) {
|
||||
total_gpu += kv.second;
|
||||
}
|
||||
std::cout << "ggml_webgpu: total gpu time (all shaders): " << total_gpu << " ms\n";
|
||||
std::cout << "\nggml_webgpu: gpu breakdown:\n";
|
||||
for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) {
|
||||
for (const auto & kv : ctx->webgpu_ctx->global_ctx->shader_gpu_time_ms) {
|
||||
double pct = (total_gpu > 0.0) ? (kv.second / total_gpu * 100.0) : 0.0;
|
||||
std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << std::fixed << std::setprecision(2)
|
||||
<< pct << "%)\n";
|
||||
@@ -922,170 +922,6 @@ static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context & ctx,
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_conv_2d(webgpu_context & ctx,
|
||||
ggml_tensor * src0,
|
||||
ggml_tensor * src1,
|
||||
ggml_tensor * dst) {
|
||||
const int32_t s0 = ggml_get_op_params_i32(dst, 0);
|
||||
const int32_t s1 = ggml_get_op_params_i32(dst, 1);
|
||||
const int32_t p0 = ggml_get_op_params_i32(dst, 2);
|
||||
const int32_t p1 = ggml_get_op_params_i32(dst, 3);
|
||||
const int32_t d0 = ggml_get_op_params_i32(dst, 4);
|
||||
const int32_t d1 = ggml_get_op_params_i32(dst, 5);
|
||||
|
||||
std::vector<uint32_t> params = {
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
||||
|
||||
(uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
|
||||
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
|
||||
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
|
||||
(uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
|
||||
|
||||
(uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
|
||||
|
||||
(uint32_t) (dst->nb[0] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
|
||||
|
||||
(uint32_t) src0->ne[0],
|
||||
(uint32_t) src0->ne[1],
|
||||
(uint32_t) src0->ne[2],
|
||||
|
||||
(uint32_t) src1->ne[0],
|
||||
(uint32_t) src1->ne[1],
|
||||
|
||||
(uint32_t) dst->ne[0],
|
||||
(uint32_t) dst->ne[1],
|
||||
(uint32_t) dst->ne[2],
|
||||
(uint32_t) dst->ne[3],
|
||||
|
||||
(uint32_t) s0,
|
||||
(uint32_t) s1,
|
||||
(uint32_t) p0,
|
||||
(uint32_t) p1,
|
||||
(uint32_t) d0,
|
||||
(uint32_t) d1,
|
||||
};
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> entries = {
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0),
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1),
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst),
|
||||
};
|
||||
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
|
||||
shader_lib_ctx.src0 = src0;
|
||||
shader_lib_ctx.src1 = src1;
|
||||
shader_lib_ctx.dst = dst;
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
|
||||
webgpu_pipeline pipeline = ctx->shader_lib->get_conv2d_pipeline(shader_lib_ctx);
|
||||
|
||||
auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
|
||||
|
||||
uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
|
||||
uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
|
||||
uint32_t wg_y = CEIL_DIV(total_wg, wg_x);
|
||||
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_im2col(webgpu_context & ctx,
|
||||
ggml_tensor * src0,
|
||||
ggml_tensor * src1,
|
||||
ggml_tensor * dst) {
|
||||
const int32_t s0 = ggml_get_op_params_i32(dst, 0);
|
||||
const int32_t s1 = ggml_get_op_params_i32(dst, 1);
|
||||
const int32_t p0 = ggml_get_op_params_i32(dst, 2);
|
||||
const int32_t p1 = ggml_get_op_params_i32(dst, 3);
|
||||
const int32_t d0 = ggml_get_op_params_i32(dst, 4);
|
||||
const int32_t d1 = ggml_get_op_params_i32(dst, 5);
|
||||
const bool is_2D = ggml_get_op_params_i32(dst, 6) == 1;
|
||||
|
||||
const uint32_t KW = src0->ne[0];
|
||||
const uint32_t KH = is_2D ? src0->ne[1] : 1;
|
||||
const uint32_t IC = is_2D ? src0->ne[2] : src0->ne[1];
|
||||
|
||||
const uint32_t IW = src1->ne[0];
|
||||
const uint32_t IH = is_2D ? src1->ne[1] : 1;
|
||||
const uint32_t N = is_2D ? src1->ne[3] : src1->ne[2];
|
||||
|
||||
const uint32_t OW = dst->ne[1];
|
||||
const uint32_t OH = is_2D ? dst->ne[2] : 1;
|
||||
|
||||
const uint32_t si0 = (uint32_t) (src1->nb[0] / ggml_type_size(src1->type));
|
||||
const uint32_t si1 = is_2D ? (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)) : 0;
|
||||
const uint32_t si2 = is_2D ? (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)) :
|
||||
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type));
|
||||
const uint32_t si3 = is_2D ? (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)) :
|
||||
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type));
|
||||
|
||||
const uint32_t so0 = (uint32_t) (dst->nb[0] / ggml_type_size(dst->type));
|
||||
const uint32_t so1 = (uint32_t) (dst->nb[1] / ggml_type_size(dst->type));
|
||||
const uint32_t so2 = is_2D ? (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)) : 0;
|
||||
const uint32_t so3 = is_2D ? (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)) :
|
||||
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type));
|
||||
|
||||
std::vector<uint32_t> params = {
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
||||
|
||||
si0,
|
||||
si1,
|
||||
si2,
|
||||
si3,
|
||||
so0,
|
||||
so1,
|
||||
so2,
|
||||
so3,
|
||||
|
||||
KW,
|
||||
KH,
|
||||
IC,
|
||||
|
||||
IW,
|
||||
IH,
|
||||
N,
|
||||
|
||||
OW,
|
||||
OH,
|
||||
|
||||
(uint32_t) s0,
|
||||
(uint32_t) s1,
|
||||
(uint32_t) p0,
|
||||
(uint32_t) p1,
|
||||
(uint32_t) d0,
|
||||
(uint32_t) d1,
|
||||
};
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> entries = {
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src1),
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst),
|
||||
};
|
||||
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
|
||||
shader_lib_ctx.src0 = src0;
|
||||
shader_lib_ctx.src1 = src1;
|
||||
shader_lib_ctx.dst = dst;
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
|
||||
webgpu_pipeline pipeline = ctx->shader_lib->get_im2col_pipeline(shader_lib_ctx);
|
||||
|
||||
auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
|
||||
|
||||
uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
|
||||
uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
|
||||
uint32_t wg_y = CEIL_DIV(total_wg, wg_x);
|
||||
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context & ctx,
|
||||
ggml_tensor * src0,
|
||||
ggml_tensor * src1,
|
||||
@@ -1328,11 +1164,14 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q8_1:
|
||||
case GGML_TYPE_Q6_K:
|
||||
use_fast = true;
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q2_K:
|
||||
use_fast = true;
|
||||
// we don't have fast mat-vec for these types, but we do have (semi) fast mat-mat
|
||||
use_fast = !is_vec;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -1343,12 +1182,10 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
}
|
||||
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
|
||||
|
||||
shader_lib_ctx.src0 = src0;
|
||||
shader_lib_ctx.src1 = src1;
|
||||
shader_lib_ctx.dst = dst;
|
||||
shader_lib_ctx.src0 = src0;
|
||||
shader_lib_ctx.src1 = src1;
|
||||
shader_lib_ctx.dst = dst;
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups;
|
||||
shader_lib_ctx.supports_subgroup_matrix = ctx->global_ctx->capabilities.supports_subgroup_matrix;
|
||||
shader_lib_ctx.sg_mat_m = ctx->global_ctx->capabilities.sg_mat_m;
|
||||
shader_lib_ctx.sg_mat_n = ctx->global_ctx->capabilities.sg_mat_n;
|
||||
@@ -1450,8 +1287,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
|
||||
// Get or create pipeline
|
||||
webgpu_pipeline gather_pipeline;
|
||||
webgpu_pipeline main_pipeline;
|
||||
webgpu_pipeline gather_pipeline, main_pipeline;
|
||||
|
||||
std::vector<webgpu_dispatch_desc> dispatches;
|
||||
|
||||
@@ -2055,94 +1891,6 @@ static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context & ctx, ggml_tensor *
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
|
||||
}
|
||||
|
||||
static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context & ctx,
|
||||
ggml_tensor * rn_src,
|
||||
ggml_tensor * rn_dst,
|
||||
ggml_tensor * mul_src0,
|
||||
ggml_tensor * mul_src1,
|
||||
ggml_tensor * dst) {
|
||||
ggml_tensor * mul_src;
|
||||
|
||||
if (ggml_webgpu_tensor_equal(rn_dst, mul_src0)) {
|
||||
mul_src = mul_src1;
|
||||
} else if (ggml_webgpu_tensor_equal(rn_dst, mul_src1)) {
|
||||
mul_src = mul_src0;
|
||||
} else {
|
||||
GGML_ABORT("rms_norm must be equal to the one of mul_src0 and mul_src1");
|
||||
}
|
||||
|
||||
bool inplace = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) ||
|
||||
(ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst));
|
||||
bool src_overlap = ggml_webgpu_tensor_overlap(rn_src, mul_src);
|
||||
|
||||
uint32_t offset_merged_rn_src = 0;
|
||||
uint32_t offset_merged_mul_src = 0;
|
||||
size_t rn_src_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, rn_src);
|
||||
size_t mul_src_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, mul_src);
|
||||
|
||||
if (src_overlap) {
|
||||
size_t min_offset = std::min(rn_src_webgpu_tensor_align_offset, mul_src_webgpu_tensor_align_offset);
|
||||
offset_merged_rn_src =
|
||||
(uint32_t) ((rn_src_webgpu_tensor_align_offset - min_offset) / ggml_type_size(rn_src->type));
|
||||
offset_merged_mul_src =
|
||||
(uint32_t) ((mul_src_webgpu_tensor_align_offset - min_offset) / ggml_type_size(mul_src->type));
|
||||
}
|
||||
|
||||
std::vector<uint32_t> params = {
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, rn_src) / ggml_type_size(rn_src->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mul_src) / ggml_type_size(mul_src->type)),
|
||||
offset_merged_rn_src,
|
||||
offset_merged_mul_src,
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
||||
(uint32_t) (rn_src->nb[1] / ggml_type_size(rn_src->type)),
|
||||
(uint32_t) (rn_src->nb[2] / ggml_type_size(rn_src->type)),
|
||||
(uint32_t) (rn_src->nb[3] / ggml_type_size(rn_src->type)),
|
||||
(uint32_t) (mul_src->nb[1] / ggml_type_size(mul_src->type)),
|
||||
(uint32_t) (mul_src->nb[2] / ggml_type_size(mul_src->type)),
|
||||
(uint32_t) (mul_src->nb[3] / ggml_type_size(mul_src->type)),
|
||||
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
|
||||
(uint32_t) mul_src->ne[0],
|
||||
(uint32_t) mul_src->ne[1],
|
||||
(uint32_t) mul_src->ne[2],
|
||||
(uint32_t) mul_src->ne[3],
|
||||
(uint32_t) dst->ne[0],
|
||||
(uint32_t) dst->ne[1],
|
||||
(uint32_t) dst->ne[2],
|
||||
(uint32_t) dst->ne[3],
|
||||
ggml_webgpu_u32_from_f32(ggml_get_op_params_f32(rn_dst, 0)) // epsilon, treated as f32 in the shader
|
||||
};
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> entries;
|
||||
|
||||
if (inplace) {
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src));
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src));
|
||||
} else if (src_overlap) {
|
||||
size_t merged_offset = std::min(rn_src_webgpu_tensor_align_offset, mul_src_webgpu_tensor_align_offset);
|
||||
size_t merged_end =
|
||||
std::max(rn_src_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, rn_src),
|
||||
mul_src_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, mul_src));
|
||||
entries.push_back(ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(rn_src), merged_offset,
|
||||
merged_end - merged_offset));
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst));
|
||||
} else {
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src));
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src));
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst));
|
||||
}
|
||||
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
shader_lib_ctx.inplace = inplace;
|
||||
shader_lib_ctx.src_overlap = src_overlap;
|
||||
|
||||
webgpu_pipeline pipeline = ctx->shader_lib->get_rms_norm_mul_pipeline(shader_lib_ctx);
|
||||
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, ggml_nrows(dst));
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
|
||||
bool inplace = ggml_webgpu_tensor_equal(src, dst);
|
||||
|
||||
@@ -2639,48 +2387,15 @@ static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context & ctx, ggml_tensor
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
|
||||
}
|
||||
|
||||
static bool ggml_webgpu_can_fuse_rms_norm_mul(const struct ggml_cgraph * cgraph, int node_idx) {
|
||||
if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// additional constraints specific to this fusion
|
||||
const ggml_tensor * rms_norm = cgraph->nodes[node_idx];
|
||||
const ggml_tensor * mul = cgraph->nodes[node_idx + 1];
|
||||
|
||||
GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(rms_norm->type == GGML_TYPE_F32);
|
||||
// rms_norm only supports f32
|
||||
if (mul->src[0]->type != GGML_TYPE_F32 || mul->src[1]->type != GGML_TYPE_F32 || mul->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
// if rms_norm is the B operand, then we don't handle broadcast
|
||||
if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm)) {
|
||||
return false;
|
||||
}
|
||||
// rms_norm shader assumes contiguous rows
|
||||
if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns the encoded command, or std::nullopt if the operation is a no-op
|
||||
static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
|
||||
ggml_cgraph * cgraph,
|
||||
int node_idx,
|
||||
int & num_encoded_ops) {
|
||||
ggml_tensor ** nodes = cgraph->nodes;
|
||||
ggml_tensor * node = nodes[node_idx];
|
||||
|
||||
static std::optional<webgpu_encoded_op> ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
|
||||
if (ggml_is_empty(node)) {
|
||||
return std::nullopt;
|
||||
}
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
return std::nullopt;
|
||||
}
|
||||
WEBGPU_LOG_DEBUG("ggml_webgpu_encode(" << node << ", " << ggml_op_name(node->op) << ")");
|
||||
WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")");
|
||||
|
||||
ggml_tensor * src0 = node->src[0];
|
||||
ggml_tensor * src1 = node->src[1];
|
||||
@@ -2723,13 +2438,6 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
|
||||
case GGML_OP_REPEAT:
|
||||
return ggml_webgpu_repeat(ctx, src0, node);
|
||||
case GGML_OP_RMS_NORM:
|
||||
if (ggml_webgpu_can_fuse_rms_norm_mul(cgraph, node_idx)) {
|
||||
num_encoded_ops = 2;
|
||||
ggml_tensor * mul_node = nodes[node_idx + 1];
|
||||
return ggml_webgpu_rms_norm_mul(ctx, src0, node, mul_node->src[0], mul_node->src[1], mul_node);
|
||||
} else {
|
||||
return ggml_webgpu_row_norm(ctx, src0, node);
|
||||
}
|
||||
case GGML_OP_L2_NORM:
|
||||
return ggml_webgpu_row_norm(ctx, src0, node);
|
||||
case GGML_OP_ROPE:
|
||||
@@ -2770,10 +2478,6 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
return ggml_webgpu_sum_rows(ctx, src0, node);
|
||||
case GGML_OP_CONV_2D:
|
||||
return ggml_webgpu_conv_2d(ctx, src0, src1, node);
|
||||
case GGML_OP_IM2COL:
|
||||
return ggml_webgpu_im2col(ctx, src0, src1, node);
|
||||
default:
|
||||
return std::nullopt;
|
||||
}
|
||||
@@ -2806,7 +2510,7 @@ static void ggml_backend_webgpu_collect_profile_results(webgpu_context &
|
||||
for (size_t i = 0; i < pipeline_names.size(); ++i) {
|
||||
// WebGPU timestamps are in ns; convert to ms.
|
||||
const double elapsed_ms = double(ts_data[2 * i + 1] - ts_data[2 * i]) * 1e-6;
|
||||
ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms;
|
||||
ctx->global_ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms;
|
||||
}
|
||||
|
||||
ctx->profile_timestamp_host_buf.Unmap();
|
||||
@@ -2842,8 +2546,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
||||
uint32_t num_inflight_batches = 0;
|
||||
bool contains_set_rows = false;
|
||||
bool batch_compute_passes = true;
|
||||
int num_encoded_ops = 1;
|
||||
int node_idx = 0;
|
||||
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
ctx->profile_timestamp_query_count = 0;
|
||||
@@ -2856,11 +2558,11 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
||||
ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
|
||||
}
|
||||
|
||||
while (node_idx < cgraph->n_nodes) {
|
||||
if (cgraph->nodes[node_idx]->op == GGML_OP_SET_ROWS) {
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) {
|
||||
contains_set_rows = true;
|
||||
}
|
||||
if (auto cmd = ggml_webgpu_encode(ctx, cgraph, node_idx, num_encoded_ops)) {
|
||||
if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) {
|
||||
commands.push_back(*cmd);
|
||||
num_batched_kernels += cmd.value().num_kernels;
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
@@ -2885,9 +2587,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
||||
ctx->param_arena.reset();
|
||||
commands.clear();
|
||||
}
|
||||
|
||||
node_idx += num_encoded_ops;
|
||||
num_encoded_ops = 1;
|
||||
}
|
||||
|
||||
if (ctx->active_compute_pass) {
|
||||
@@ -2917,107 +2616,22 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
struct ggml_backend_webgpu_event_context {
|
||||
webgpu_global_context global_ctx;
|
||||
wgpu::Future future;
|
||||
bool recorded = false;
|
||||
};
|
||||
|
||||
static ggml_backend_event_t ggml_backend_webgpu_device_event_new(ggml_backend_dev_t device) {
|
||||
ggml_backend_webgpu_device_context * dev_ctx = (ggml_backend_webgpu_device_context *) device->context;
|
||||
|
||||
auto * event_ctx = new ggml_backend_webgpu_event_context();
|
||||
event_ctx->global_ctx = dev_ctx->webgpu_global_ctx;
|
||||
|
||||
auto * event = new ggml_backend_event;
|
||||
event->device = device;
|
||||
event->context = event_ctx;
|
||||
return event;
|
||||
}
|
||||
|
||||
static void ggml_backend_webgpu_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
|
||||
GGML_UNUSED(dev);
|
||||
delete static_cast<ggml_backend_webgpu_event_context *>(event->context);
|
||||
delete event;
|
||||
}
|
||||
|
||||
static void ggml_backend_webgpu_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
|
||||
GGML_UNUSED(dev);
|
||||
ggml_backend_webgpu_event_context * event_ctx = (ggml_backend_webgpu_event_context *) event->context;
|
||||
if (!event_ctx->recorded) {
|
||||
return;
|
||||
}
|
||||
wgpu::WaitStatus status =
|
||||
event_ctx->global_ctx->instance.WaitAny(event_ctx->future, WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);
|
||||
if (status == wgpu::WaitStatus::TimedOut) {
|
||||
GGML_ABORT("ggml_webgpu: event_synchronize timed out after %u ms\n", WEBGPU_RUNTIME_WAIT_TIMEOUT_MS);
|
||||
}
|
||||
event_ctx->recorded = false;
|
||||
}
|
||||
|
||||
static void ggml_backend_webgpu_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
ggml_backend_webgpu_context * backend_ctx = (ggml_backend_webgpu_context *) backend->context;
|
||||
ggml_backend_webgpu_event_context * event_ctx = (ggml_backend_webgpu_event_context *) event->context;
|
||||
|
||||
event_ctx->future = backend_ctx->webgpu_ctx->global_ctx->queue.OnSubmittedWorkDone(
|
||||
wgpu::CallbackMode::AllowSpontaneous, [](wgpu::QueueWorkDoneStatus, wgpu::StringView) {});
|
||||
event_ctx->recorded = true;
|
||||
}
|
||||
|
||||
static void ggml_backend_webgpu_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
GGML_UNUSED(backend);
|
||||
ggml_backend_webgpu_device_event_synchronize(nullptr, event);
|
||||
}
|
||||
|
||||
static void ggml_backend_webgpu_set_tensor_async(ggml_backend_t backend,
|
||||
ggml_tensor * tensor,
|
||||
const void * data,
|
||||
size_t offset,
|
||||
size_t size) {
|
||||
GGML_UNUSED(backend);
|
||||
auto * buf_ctx = (ggml_backend_webgpu_buffer_context *) tensor->buffer->context;
|
||||
size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
|
||||
|
||||
// Write aligned portion
|
||||
buf_ctx->global_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4);
|
||||
|
||||
if (size % 4 != 0) {
|
||||
// If size is not a multiple of 4, we need to memset the remaining bytes
|
||||
size_t remaining_size = size % 4;
|
||||
|
||||
// pack the remaining bytes into a uint32_t
|
||||
uint32_t val32 = 0;
|
||||
|
||||
for (size_t i = 0; i < remaining_size; i++) {
|
||||
((uint8_t *) &val32)[i] = ((const uint8_t *) data)[size - remaining_size + i];
|
||||
}
|
||||
// memset the remaining bytes
|
||||
ggml_backend_webgpu_buffer_memset(buf_ctx->global_ctx, buf_ctx->buffer, val32,
|
||||
total_offset + (size - remaining_size), remaining_size);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_backend_webgpu_synchronize(ggml_backend_t backend) {
|
||||
ggml_backend_webgpu_context * backend_ctx = (ggml_backend_webgpu_context *) backend->context;
|
||||
ggml_backend_webgpu_wait_queue(backend_ctx->webgpu_ctx->global_ctx);
|
||||
}
|
||||
|
||||
static ggml_backend_i ggml_backend_webgpu_i = {
|
||||
/* .get_name = */ ggml_backend_webgpu_name,
|
||||
/* .free = */ ggml_backend_webgpu_free,
|
||||
/* .set_tensor_async = */ ggml_backend_webgpu_set_tensor_async,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .get_tensor_2d_async = */ NULL,
|
||||
/* .set_tensor_2d_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
/* .synchronize = */ ggml_backend_webgpu_synchronize,
|
||||
/* .synchronize = */ NULL,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_webgpu_graph_compute,
|
||||
/* .event_record = */ ggml_backend_webgpu_event_record,
|
||||
/* .event_wait = */ ggml_backend_webgpu_event_wait,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .graph_optimize = */ NULL,
|
||||
};
|
||||
|
||||
@@ -3426,8 +3040,6 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
||||
ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
|
||||
// we require f16 support
|
||||
GGML_ASSERT(ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
|
||||
ctx->webgpu_global_ctx->capabilities.supports_subgroups =
|
||||
ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups);
|
||||
|
||||
#ifndef __EMSCRIPTEN__
|
||||
// Accept f16 subgroup matrix configurations (square or non-square).
|
||||
@@ -3460,14 +3072,11 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
||||
#ifndef __EMSCRIPTEN__
|
||||
required_features.push_back(wgpu::FeatureName::ImplicitDeviceSynchronization);
|
||||
if (ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix) {
|
||||
required_features.push_back(wgpu::FeatureName::Subgroups);
|
||||
required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ctx->webgpu_global_ctx->capabilities.supports_subgroups) {
|
||||
required_features.push_back(wgpu::FeatureName::Subgroups);
|
||||
}
|
||||
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
required_features.push_back(wgpu::FeatureName::TimestampQuery);
|
||||
#endif
|
||||
@@ -3882,15 +3491,6 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
|
||||
break;
|
||||
case GGML_OP_CONV_2D:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
|
||||
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
|
||||
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
|
||||
break;
|
||||
case GGML_OP_IM2COL:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
|
||||
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
break;
|
||||
case GGML_OP_SSM_CONV:
|
||||
supports_op = op->type == GGML_TYPE_F32;
|
||||
break;
|
||||
@@ -3984,9 +3584,9 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = {
|
||||
/* .supports_op = */ ggml_backend_webgpu_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_webgpu_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ ggml_backend_webgpu_device_event_new,
|
||||
/* .event_free = */ ggml_backend_webgpu_device_event_free,
|
||||
/* .event_synchronize = */ ggml_backend_webgpu_device_event_synchronize,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
/* End GGML Backend Device Interface */
|
||||
|
||||
@@ -45,13 +45,6 @@ fn load_u16_at_src0(byte_offset: u32) -> u32 {
|
||||
return (word >> shift) & 0xFFFFu;
|
||||
}
|
||||
|
||||
// Always reads the 4-byte-aligned word containing byte_offset.
|
||||
// Caller extracts the 16-bit half it needs via & 0xFFFFu or >> 16u.
|
||||
// this is used in k-quants for better performance
|
||||
fn load_u32_at_src0_aligned(byte_offset: u32) -> u32 {
|
||||
return src0[(byte_offset & ~3u) / 4u];
|
||||
}
|
||||
|
||||
fn load_u32_at_src0(byte_offset: u32) -> u32 {
|
||||
let word_idx = byte_offset / 4u;
|
||||
let shift = (byte_offset & 0x3u) * 8u;
|
||||
|
||||
@@ -1,165 +0,0 @@
|
||||
#include "common_decls.tmpl"
|
||||
enable f16;
|
||||
|
||||
@group(0) @binding(0)
|
||||
#if defined(WEIGHT_F32)
|
||||
var<storage, read_write> weights: array<f32>;
|
||||
#elif defined(WEIGHT_F16)
|
||||
var<storage, read_write> weights: array<f16>;
|
||||
#endif
|
||||
|
||||
@group(0) @binding(1)
|
||||
#if defined(INPUT_F32)
|
||||
var<storage, read_write> input: array<f32>;
|
||||
#elif defined(INPUT_F16)
|
||||
var<storage, read_write> input: array<f16>;
|
||||
#endif
|
||||
|
||||
@group(0) @binding(2)
|
||||
#if defined(OUTPUT_F32)
|
||||
var<storage, read_write> output: array<f32>;
|
||||
#elif defined(OUTPUT_F16)
|
||||
var<storage, read_write> output: array<f16>;
|
||||
#endif
|
||||
|
||||
struct Params {
|
||||
offset_w: u32,
|
||||
offset_i: u32,
|
||||
offset_o: u32,
|
||||
|
||||
// element strides
|
||||
sw0: u32, sw1: u32, sw2: u32, sw3: u32,
|
||||
si0: u32, si1: u32, si2: u32, si3: u32,
|
||||
so0: u32, so1: u32, so2: u32, so3: u32,
|
||||
|
||||
// kernel dimensions
|
||||
KW: u32, KH: u32, IC: u32,
|
||||
// input dimensions
|
||||
IW: u32, IH: u32,
|
||||
// output dimensions
|
||||
OW: u32, OH: u32, OC_out: u32, N_out: u32,
|
||||
|
||||
// stride
|
||||
s0: u32, s1: u32,
|
||||
// padding
|
||||
p0: u32, p1: u32,
|
||||
// dilation
|
||||
d0: u32, d1: u32,
|
||||
};
|
||||
|
||||
@group(0) @binding(3)
|
||||
var<uniform> params: Params;
|
||||
|
||||
fn load_weight(idx: u32) -> f32 {
|
||||
#if defined(WEIGHT_F32)
|
||||
return weights[idx];
|
||||
#elif defined(WEIGHT_F16)
|
||||
return f32(weights[idx]);
|
||||
#endif
|
||||
}
|
||||
|
||||
fn load_input(idx: u32) -> f32 {
|
||||
#if defined(INPUT_F32)
|
||||
return input[idx];
|
||||
#elif defined(INPUT_F16)
|
||||
return f32(input[idx]);
|
||||
#endif
|
||||
}
|
||||
|
||||
fn store_output(idx: u32, val: f32) {
|
||||
#if defined(OUTPUT_F32)
|
||||
output[idx] = val;
|
||||
#elif defined(OUTPUT_F16)
|
||||
output[idx] = f16(val);
|
||||
#endif
|
||||
}
|
||||
|
||||
fn ceil_div_u32(x: u32, y: u32) -> u32 {
|
||||
return (x + y - 1) / y;
|
||||
}
|
||||
|
||||
// returns the first valid kernel index k such that base + k * step >= 0
|
||||
fn first_valid_k(base: i32, step: u32) -> u32 {
|
||||
if (base >= 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return ceil_div_u32(u32(-base), step);
|
||||
}
|
||||
|
||||
// returns the first invalid kernel index k such that base + k * step >= limit so valid k are in [0, end_valid_k)
|
||||
fn end_valid_k(base: i32, step: u32, limit: u32, k_max: u32) -> u32 {
|
||||
let remaining = i32(limit) - base;
|
||||
if (remaining <= 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return min(k_max, ceil_div_u32(u32(remaining), step));
|
||||
}
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(
|
||||
@builtin(global_invocation_id) gid: vec3<u32>,
|
||||
@builtin(num_workgroups) num_wg: vec3<u32>
|
||||
) {
|
||||
|
||||
let threads_per_group = u32(WG_SIZE);
|
||||
let i_out = gid.x + (num_wg.x * threads_per_group) * gid.y;
|
||||
let n_out = params.OW * params.OH * params.OC_out * params.N_out;
|
||||
|
||||
var sum: f32 = 0.0;
|
||||
if (i_out >= n_out) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Kernel layout: [KW, KH, IC, ..]
|
||||
// Input layout: [IW, IH, .., ..]
|
||||
// Output layout: [OW, OH, OC, N]
|
||||
|
||||
var i = i_out;
|
||||
let n = i / (params.OC_out * params.OH * params.OW);
|
||||
i = i % (params.OC_out * params.OH * params.OW);
|
||||
let oc = i / (params.OH * params.OW);
|
||||
i = i % (params.OH * params.OW);
|
||||
let oh = i / params.OW;
|
||||
let ow = i % params.OW;
|
||||
|
||||
let ow_base = i32(ow * params.s0) - i32(params.p0);
|
||||
let oh_base = i32(oh * params.s1) - i32(params.p1);
|
||||
|
||||
// clip the valid kernel window once
|
||||
let kw_begin = first_valid_k(ow_base, params.d0);
|
||||
let kw_end = end_valid_k(ow_base, params.d0, params.IW, params.KW);
|
||||
let kh_begin = first_valid_k(oh_base, params.d1);
|
||||
let kh_end = end_valid_k(oh_base, params.d1, params.IH, params.KH);
|
||||
|
||||
// entire receptive field is out of bounds
|
||||
if (kw_begin >= kw_end || kh_begin >= kh_end) {
|
||||
let out_idx = params.offset_o + ow * params.so0 + oh * params.so1 + oc * params.so2 + n * params.so3;
|
||||
store_output(out_idx, 0.0);
|
||||
return;
|
||||
}
|
||||
|
||||
let weight_oc_base = params.offset_w + oc * params.sw3;
|
||||
let input_n_base = params.offset_i + n * params.si3;
|
||||
|
||||
for (var ic: u32 = 0; ic < params.IC; ic += 1) {
|
||||
let w_base_ic = ic * params.sw2 + weight_oc_base;
|
||||
let in_base = ic * params.si2 + input_n_base;
|
||||
|
||||
for (var kh: u32 = kh_begin; kh < kh_end; kh += 1) {
|
||||
let ih = u32(oh_base + i32(kh * params.d1));
|
||||
let w_row_base = w_base_ic + kh * params.sw1;
|
||||
let in_row_base = in_base + ih * params.si1;
|
||||
for (var kw: u32 = kw_begin; kw < kw_end; kw += 1) {
|
||||
let iw = u32(ow_base + i32(kw * params.d0));
|
||||
let w_idx = w_row_base + kw * params.sw0;
|
||||
let in_idx = in_row_base + iw * params.si0;
|
||||
sum += load_weight(w_idx) * load_input(in_idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let out_idx = params.offset_o + ow * params.so0 + oh * params.so1 + oc * params.so2 + n * params.so3;
|
||||
store_output(out_idx, sum);
|
||||
}
|
||||
@@ -1,101 +0,0 @@
|
||||
#include "common_decls.tmpl"
|
||||
enable f16;
|
||||
|
||||
@group(0) @binding(0)
|
||||
#if defined(INPUT_F32)
|
||||
var<storage, read_write> input: array<f32>;
|
||||
#elif defined(INPUT_F16)
|
||||
var<storage, read_write> input: array<f16>;
|
||||
#endif
|
||||
|
||||
@group(0) @binding(1)
|
||||
#if defined(OUTPUT_F32)
|
||||
var<storage, read_write> output: array<f32>;
|
||||
#elif defined(OUTPUT_F16)
|
||||
var<storage, read_write> output: array<f16>;
|
||||
#endif
|
||||
|
||||
struct Params {
|
||||
offset_i: u32,
|
||||
offset_o: u32,
|
||||
|
||||
// element strides
|
||||
si0: u32, si1: u32, si2: u32, si3: u32,
|
||||
so0: u32, so1: u32, so2: u32, so3: u32,
|
||||
|
||||
KW: u32, KH: u32, IC: u32,
|
||||
IW: u32, IH: u32, N: u32,
|
||||
OW: u32, OH: u32,
|
||||
|
||||
// stride
|
||||
s0: u32, s1: u32,
|
||||
// padding
|
||||
p0: u32, p1: u32,
|
||||
// dilation
|
||||
d0: u32, d1: u32,
|
||||
}
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
fn load_input(idx: u32) -> f32 {
|
||||
#if defined(INPUT_F32)
|
||||
return input[idx];
|
||||
#elif defined(INPUT_F16)
|
||||
return f32(input[idx]);
|
||||
#endif
|
||||
}
|
||||
|
||||
fn store_output(idx: u32, val: f32) {
|
||||
#if defined(OUTPUT_F32)
|
||||
output[idx] = val;
|
||||
#elif defined(OUTPUT_F16)
|
||||
output[idx] = f16(val);
|
||||
#endif
|
||||
}
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(
|
||||
@builtin(global_invocation_id) gid: vec3<u32>,
|
||||
@builtin(num_workgroups) num_wg: vec3<u32>
|
||||
) {
|
||||
|
||||
let threads_per_group = u32(WG_SIZE);
|
||||
let i_out = gid.x + (num_wg.x * threads_per_group) * gid.y;
|
||||
let K = params.KW * params.KH * params.IC;
|
||||
let M = params.OW * params.OH;
|
||||
let total = K * M * params.N;
|
||||
|
||||
if (i_out >= total) {
|
||||
return;
|
||||
}
|
||||
|
||||
// decode (k, m, n)
|
||||
var i = i_out;
|
||||
let n = i / (K * M);
|
||||
i = i % (K * M);
|
||||
let m = i / K;
|
||||
let k = i % K;
|
||||
|
||||
// decode (oh, ow)
|
||||
let oh = m / params.OW;
|
||||
let ow = m % params.OW;
|
||||
|
||||
// decode (kw, kh, ic)
|
||||
let kw = k % params.KW;
|
||||
let tmp = k / params.KW;
|
||||
let kh = tmp % params.KH;
|
||||
let ic = tmp / params.KH;
|
||||
|
||||
let iw_i32 = i32(ow * params.s0 + kw * params.d0) - i32(params.p0);
|
||||
let ih_i32 = i32(oh * params.s1 + kh * params.d1) - i32(params.p1);
|
||||
|
||||
if (iw_i32 >= 0 && iw_i32 < i32(params.IW) && ih_i32 >= 0 && ih_i32 < i32(params.IH)) {
|
||||
let iw = u32(iw_i32);
|
||||
let ih = u32(ih_i32);
|
||||
let in_idx = params.offset_i + iw * params.si0 + ih * params.si1 + ic * params.si2 + n * params.si3;
|
||||
store_output(params.offset_o + k * params.so0 + ow * params.so1 + oh * params.so2 + n * params.so3, load_input(in_idx));
|
||||
} else {
|
||||
store_output(params.offset_o + k * params.so0 + ow * params.so1 + oh * params.so2 + n * params.so3, 0.0);
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,139 +0,0 @@
|
||||
#ifdef INPLACE
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> rn_src: array<f32>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> mul_src: array<f32>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
|
||||
mul_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
|
||||
}
|
||||
|
||||
#elif SRC_OVERLAP
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> merged_src: array<f32>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> dst: array<f32>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
|
||||
dst[dst_offset] = scale * merged_src[rn_src_offset] * merged_src[mul_src_offset];
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> rn_src: array<f32>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> mul_src: array<f32>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<storage, read_write> dst: array<f32>;
|
||||
|
||||
@group(0) @binding(3)
|
||||
var<uniform> params: Params;
|
||||
|
||||
fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
|
||||
dst[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
struct Params {
|
||||
offset_rn_src: u32,
|
||||
offset_mul_src: u32,
|
||||
offset_merged_rn_src: u32,
|
||||
offset_merged_mul_src: u32,
|
||||
offset_dst: u32,
|
||||
|
||||
stride_rn_src1: u32,
|
||||
stride_rn_src2: u32,
|
||||
stride_rn_src3: u32,
|
||||
|
||||
stride_mul_src1: u32,
|
||||
stride_mul_src2: u32,
|
||||
stride_mul_src3: u32,
|
||||
|
||||
stride_dst1: u32,
|
||||
stride_dst2: u32,
|
||||
stride_dst3: u32,
|
||||
|
||||
mul_src_ne0: u32,
|
||||
mul_src_ne1: u32,
|
||||
mul_src_ne2: u32,
|
||||
mul_src_ne3: u32,
|
||||
|
||||
ne0: u32,
|
||||
ne1: u32,
|
||||
ne2: u32,
|
||||
ne3: u32,
|
||||
|
||||
eps: f32
|
||||
};
|
||||
|
||||
var<workgroup> scratch: array<f32, WG_SIZE>;
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(workgroup_id) wid: vec3<u32>,
|
||||
@builtin(local_invocation_id) lid: vec3<u32>) {
|
||||
|
||||
// one thread per row
|
||||
var i = wid.x;
|
||||
let i3 = i / (params.ne2 * params.ne1);
|
||||
i = i % (params.ne2 * params.ne1);
|
||||
let i2 = i / params.ne1;
|
||||
let i1 = i % params.ne1;
|
||||
let i_rn_src_row = params.offset_rn_src + params.offset_merged_rn_src + i3 * params.stride_rn_src3 + i2 * params.stride_rn_src2 + i1 * params.stride_rn_src1;
|
||||
let i_mul_src_row = params.offset_mul_src + params.offset_merged_mul_src + (i3 % params.mul_src_ne3) * params.stride_mul_src3 + (i2 % params.mul_src_ne2) * params.stride_mul_src2 + (i1 % params.mul_src_ne1) * params.stride_mul_src1;
|
||||
let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;
|
||||
|
||||
let elems = (params.ne0 + WG_SIZE - 1) / WG_SIZE;
|
||||
|
||||
var sum = 0.0f;
|
||||
var col = lid.x;
|
||||
for (var j: u32 = 0; j < elems; j++) {
|
||||
if (col >= params.ne0) {
|
||||
break;
|
||||
}
|
||||
#ifdef SRC_OVERLAP
|
||||
sum += pow(merged_src[i_rn_src_row + col], 2.0);
|
||||
#else
|
||||
sum += pow(rn_src[i_rn_src_row + col], 2.0);
|
||||
#endif
|
||||
col += WG_SIZE;
|
||||
}
|
||||
|
||||
scratch[lid.x] = sum;
|
||||
|
||||
workgroupBarrier();
|
||||
|
||||
var offset: u32 = WG_SIZE / 2;
|
||||
while (offset > 0) {
|
||||
if (lid.x < offset) {
|
||||
scratch[lid.x] += scratch[lid.x + offset];
|
||||
}
|
||||
offset = offset / 2;
|
||||
workgroupBarrier();
|
||||
}
|
||||
sum = scratch[0];
|
||||
|
||||
let scale = 1.0/sqrt(sum/f32(params.ne0) + params.eps);
|
||||
|
||||
col = lid.x;
|
||||
for (var j: u32 = 0; j < elems; j++) {
|
||||
if (col >= params.ne0) {
|
||||
break;
|
||||
}
|
||||
update(i_rn_src_row + col, i_dst_row + col, scale, i_mul_src_row + col % params.mul_src_ne0);
|
||||
col += WG_SIZE;
|
||||
}
|
||||
}
|
||||
@@ -197,7 +197,6 @@ class Keys:
|
||||
FREQ_BASE_SWA = "{arch}.rope.freq_base_swa"
|
||||
SCALING_TYPE = "{arch}.rope.scaling.type"
|
||||
SCALING_FACTOR = "{arch}.rope.scaling.factor"
|
||||
SCALING_ALPHA = "{arch}.rope.scaling.alpha"
|
||||
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
|
||||
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
|
||||
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
||||
@@ -472,7 +471,6 @@ class MODEL_ARCH(IntEnum):
|
||||
ERNIE4_5_MOE = auto()
|
||||
HUNYUAN_MOE = auto()
|
||||
HUNYUAN_DENSE = auto()
|
||||
HUNYUAN_VL = auto()
|
||||
SMOLLM3 = auto()
|
||||
GPT_OSS = auto()
|
||||
LFM2 = auto()
|
||||
@@ -959,7 +957,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.FALCON_H1: "falcon-h1",
|
||||
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
|
||||
MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense",
|
||||
MODEL_ARCH.HUNYUAN_VL: "hunyuan_vl",
|
||||
MODEL_ARCH.SMOLLM3: "smollm3",
|
||||
MODEL_ARCH.GPT_OSS: "gpt-oss",
|
||||
MODEL_ARCH.LFM2: "lfm2",
|
||||
@@ -3492,22 +3489,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.HUNYUAN_VL: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.SMOLLM3: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
@@ -4157,7 +4138,6 @@ class VisionProjectorType:
|
||||
YOUTUVL = "youtuvl"
|
||||
NEMOTRON_V2_VL = "nemotron_v2_vl"
|
||||
HUNYUANOCR = "hunyuanocr"
|
||||
HUNYUANVL = "hunyuanvl"
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
||||
@@ -973,9 +973,6 @@ class GGUFWriter:
|
||||
def add_rope_scaling_factor(self, value: float) -> None:
    """Write ``value`` as a float32 under this architecture's RoPE scaling-factor key."""
    key = Keys.Rope.SCALING_FACTOR.format(arch=self.arch)
    self.add_float32(key, value)
|
||||
|
||||
def add_rope_scaling_alpha(self, value: float) -> None:
    """Write ``value`` as a float32 under this architecture's RoPE scaling-alpha key."""
    key = Keys.Rope.SCALING_ALPHA.format(arch=self.arch)
    self.add_float32(key, value)
|
||||
|
||||
def add_rope_scaling_attn_factors(self, value: float) -> None:
    """Write ``value`` as a float32 under this architecture's RoPE attention-factor key."""
    key = Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch)
    self.add_float32(key, value)
|
||||
|
||||
|
||||
@@ -511,6 +511,27 @@ extern "C" {
|
||||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
enum llama_params_fit_status {
|
||||
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
||||
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
|
||||
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
|
||||
};
|
||||
|
||||
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
|
||||
// - returns LLAMA_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory
|
||||
// - this function is NOT thread safe because it modifies the global llama logger state
|
||||
// - only parameters that have the same value as in llama_default_model_params are modified
|
||||
// with the exception of the context size which is modified if and only if equal to 0
|
||||
LLAMA_API enum llama_params_fit_status llama_params_fit(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams,
|
||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
||||
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
||||
size_t * margins, // margins of memory to leave per device in bytes
|
||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
||||
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
||||
|
||||
LLAMA_API int64_t llama_time_us(void);
|
||||
|
||||
LLAMA_API size_t llama_max_devices(void);
|
||||
@@ -1525,6 +1546,9 @@ extern "C" {
|
||||
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
||||
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
||||
|
||||
// print a breakdown of per-device memory use via LLAMA_LOG:
|
||||
LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// training
|
||||
//
|
||||
|
||||
@@ -1,991 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test parallel tool-calling capability via chat completions endpoint.
|
||||
|
||||
Only run this against models that actually support parallel tool calls — this
|
||||
script does not attempt to toggle that setting on the server. Each scenario is
|
||||
explicitly worded so that a capable model SHOULD emit multiple tool calls in a
|
||||
single assistant turn (either the same tool N times, or several different
|
||||
tools at once).
|
||||
|
||||
Each test case contains:
|
||||
- tools: list of tool definitions (OpenAI-compatible)
|
||||
- messages: initial conversation messages
|
||||
- mock_tool_responses: dict mapping tool_name -> callable(arguments) -> str (JSON)
|
||||
- expected_parallel: dict describing what constitutes a successful parallel turn
|
||||
{"min_parallel": int, # minimum tool_calls in one turn
|
||||
"require_same_tool": Optional[str], # all parallel calls must be this tool
|
||||
"require_distinct_tools": Optional[int], # >= N distinct tool names in one turn
|
||||
"min_distinct_args_key": Optional[str]} # parallel calls must span this
|
||||
# many distinct values of this arg key
|
||||
- validate: callable(turns, all_tool_calls, final_content) -> (passed, reason)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import requests
|
||||
import sys
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Color / formatting helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
RESET = "\x1b[0m"
|
||||
BOLD = "\x1b[1m"
|
||||
DIM = "\x1b[2m"
|
||||
CYAN = "\x1b[36m"
|
||||
YELLOW = "\x1b[33m"
|
||||
GREEN = "\x1b[32m"
|
||||
RED = "\x1b[31m"
|
||||
BLUE = "\x1b[34m"
|
||||
WHITE = "\x1b[97m"
|
||||
MAGENTA = "\x1b[35m"
|
||||
|
||||
|
||||
def _print(text="", end="\n"):
|
||||
sys.stdout.write(text + end)
|
||||
sys.stdout.flush()
|
||||
|
||||
|
||||
def print_header(title):
    """Print ``title`` inside a three-line box made of box-drawing characters.

    The box interior is 60 columns wide and widens when the title plus one
    space of padding on each side would not fit, so the right border always
    lines up with the top and bottom bars.

    Args:
        title: Text to show in the box. Width is measured with ``len``, so
            alignment assumes one terminal column per character.
    """
    inner = max(60, len(title) + 2)
    bar = "─" * inner
    # The middle row is: one leading space + title + padding == ``inner`` columns.
    padding = " " * (inner - 1 - len(title))
    _print(f"\n{BOLD}{CYAN}┌{bar}┐{RESET}")
    _print(f"{BOLD}{CYAN}│ {WHITE}{title}{CYAN}{padding}│{RESET}")
    _print(f"{BOLD}{CYAN}└{bar}┘{RESET}")
|
||||
|
||||
|
||||
def print_turn_banner(turn_idx, n_calls):
    """Print a one-line banner for a turn, highlighted when it has 2+ tool calls."""
    if n_calls >= 2:
        color = MAGENTA
    else:
        color = DIM
    banner = f"\n {BOLD}{color}▶ turn {turn_idx} — {n_calls} tool call(s){RESET}"
    _print(banner)
|
||||
|
||||
|
||||
def print_tool_call(name, args):
    """Print a tool invocation as ``name(<args as JSON>)``."""
    rendered = json.dumps(args)
    line = f" {BOLD}{YELLOW}⚙ {name}{RESET}{DIM}({rendered}){RESET}"
    _print(line)
|
||||
|
||||
|
||||
def print_tool_result(result):
    """Print a tool result, truncated to 140 characters with an ellipsis."""
    limit = 140
    suffix = "…" if len(result) > limit else ""
    preview = result[:limit] + suffix
    _print(f" {DIM}{BLUE}↳ {preview}{RESET}")
|
||||
|
||||
|
||||
def print_model_output(text):
    """Write ``text`` to stdout verbatim (no newline added) and flush."""
    out = sys.stdout
    out.write(text)
    out.flush()
|
||||
|
||||
|
||||
def print_pass(reason):
    """Print a green PASS marker followed by ``reason``."""
    line = f"\n{BOLD}{GREEN}✔ PASS{RESET} {reason}"
    _print(line)
|
||||
|
||||
|
||||
def print_fail(reason):
    """Print a red FAIL marker followed by ``reason``."""
    line = f"\n{BOLD}{RED}✘ FAIL{RESET} {reason}"
    _print(line)
|
||||
|
||||
|
||||
def print_info(msg):
    """Print ``msg`` in dim text."""
    line = f"{DIM}{msg}{RESET}"
    _print(line)
|
||||
|
||||
|
||||
def print_warn(msg):
    """Print ``msg`` as a bold yellow warning."""
    line = f"{BOLD}{YELLOW}⚠ {msg}{RESET}"
    _print(line)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTTP helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def chat_completion(url, messages, tools=None, stream=False):
    """POST one chat-completion request and collect the assistant's reply.

    Args:
        url: Full URL of the chat-completions endpoint.
        messages: Conversation so far, as a list of message dicts.
        tools: Optional list of tool definitions. When truthy they are sent
            together with ``tool_choice="auto"``.
        stream: When true, request a streamed response and assemble the reply
            from the ``data: ...`` lines; content is echoed as it arrives.

    Returns:
        A dict with ``"content"`` (str) and ``"tool_calls"`` (list), plus
        ``"reasoning_content"`` when it is non-empty. Returns ``None`` when
        the request fails or a non-stream body is not valid JSON; the failure
        is reported through ``print_fail``.
    """
    payload = {
        "messages": messages,
        "stream": stream,
        "max_tokens": 4096,
    }
    if tools:
        payload["tools"] = tools
        payload["tool_choice"] = "auto"

    try:
        response = requests.post(url, json=payload, stream=stream)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        body = e.response.content if (e.response is not None) else b""
        print_fail(f"Request error: {e} | body: {body}")
        return None

    full_content = ""
    reasoning_content = ""
    tool_calls: list[dict] = []

    # Closing the response releases the connection even when we stop reading
    # a stream early (on "[DONE]") or an exception escapes the loop.
    with response:
        if stream:
            for line in response.iter_lines():
                if not line:
                    continue
                decoded = line.decode("utf-8")
                if not decoded.startswith("data: "):
                    continue
                data_str = decoded[6:]
                if data_str == "[DONE]":
                    break
                try:
                    data = json.loads(data_str)
                except json.JSONDecodeError:
                    continue
                choices = data.get("choices", [])
                if not choices:
                    continue
                # ``or`` fallbacks below tolerate explicit JSON nulls in a chunk.
                delta = choices[0].get("delta") or {}
                if delta.get("reasoning_content"):
                    reasoning_content += delta["reasoning_content"]
                if delta.get("content"):
                    full_content += delta["content"]
                    print_model_output(delta["content"])
                for tc in delta.get("tool_calls") or []:
                    idx = tc.get("index") or 0
                    # Grow the list so that slot ``idx`` exists before appending fragments.
                    while len(tool_calls) <= idx:
                        tool_calls.append(
                            {
                                "id": "",
                                "type": "function",
                                "function": {"name": "", "arguments": ""},
                            }
                        )
                    slot = tool_calls[idx]
                    slot["id"] += tc.get("id") or ""
                    fn = tc.get("function") or {}
                    slot["function"]["name"] += fn.get("name") or ""
                    slot["function"]["arguments"] += fn.get("arguments") or ""
        else:
            try:
                data = response.json()
            except ValueError as e:
                print_fail(f"Invalid JSON in response: {e}")
                return None
            choices = data.get("choices", [])
            if choices:
                msg = choices[0].get("message") or {}
                full_content = msg.get("content") or ""
                reasoning_content = msg.get("reasoning_content") or ""
                tool_calls = msg.get("tool_calls") or []
                if full_content:
                    print_model_output(full_content)

    result = {"content": full_content, "tool_calls": tool_calls}
    if reasoning_content:
        result["reasoning_content"] = reasoning_content
    return result
|
||||
|
||||
|
||||
def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6):
    """Drive the multi-turn tool-call loop, recording each turn separately.

    Each turn sends the conversation to the server, answers every requested
    tool call with the matching mock, and appends the results to the
    conversation. The loop stops when the model replies without tool calls,
    when a request fails, or after ``max_turns`` turns.

    Args:
        url: Chat-completions endpoint URL.
        messages: Initial conversation; copied, not mutated.
        tools: Tool definitions passed to every request.
        mock_tool_responses: Mapping of tool name -> callable(args dict) -> str.
        stream: Whether to use streaming requests.
        max_turns: Upper bound on the number of requests.

    Returns:
        ``(turns, all_tool_calls, final_content)`` where ``turns`` is a list
        of dicts ``{"index": int, "tool_calls": [...], "content": str}``.
        ``final_content`` is ``None`` when a request failed or the turn limit
        was reached before the model produced a tool-free reply.
    """
    msgs = list(messages)
    turns: list[dict] = []
    all_tool_calls: list[dict] = []

    for turn_idx in range(max_turns):
        result = chat_completion(url, msgs, tools=tools, stream=stream)
        if result is None:
            return turns, all_tool_calls, None

        tcs = result.get("tool_calls") or []
        content = result.get("content") or ""

        turns.append(
            {"index": turn_idx, "tool_calls": list(tcs), "content": content}
        )

        if not tcs:
            if content:
                _print(f"\n{DIM}{'·' * 60}{RESET}")
                _print(f"{DIM} model response:{RESET}\n")
            return turns, all_tool_calls, content

        print_turn_banner(turn_idx, len(tcs))
        all_tool_calls.extend(tcs)

        assistant_msg: dict = {
            "role": "assistant",
            "content": content,
            "tool_calls": tcs,
        }
        reasoning = result.get("reasoning_content")
        if reasoning:
            assistant_msg["reasoning_content"] = reasoning
        msgs.append(assistant_msg)

        for tc in tcs:
            tool_name = tc["function"]["name"]
            raw_args = tc["function"]["arguments"]
            if isinstance(raw_args, dict):
                # Already decoded; use as-is.
                args = raw_args
            else:
                try:
                    args = json.loads(raw_args)
                except (json.JSONDecodeError, TypeError):
                    args = {}
                # Mocks call ``args.get``; anything but an object becomes empty args.
                if not isinstance(args, dict):
                    args = {}

            print_tool_call(tool_name, args)

            mock_fn = mock_tool_responses.get(tool_name)
            if mock_fn:
                tool_result = mock_fn(args)
            else:
                tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"})

            print_tool_result(tool_result)

            msgs.append(
                {
                    "role": "tool",
                    "tool_call_id": tc.get("id", ""),
                    "content": tool_result,
                }
            )

    return turns, all_tool_calls, None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parallelism helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _best_parallel_turn(turns):
|
||||
"""Return the turn (dict) with the most tool calls, or None if no tools."""
|
||||
tool_turns = [t for t in turns if t["tool_calls"]]
|
||||
if not tool_turns:
|
||||
return None
|
||||
return max(tool_turns, key=lambda t: len(t["tool_calls"]))
|
||||
|
||||
|
||||
def _distinct_tool_names(turn):
|
||||
return {tc["function"]["name"] for tc in turn["tool_calls"]}
|
||||
|
||||
|
||||
def _distinct_arg_values(turn, key):
|
||||
values = set()
|
||||
for tc in turn["tool_calls"]:
|
||||
try:
|
||||
args = json.loads(tc["function"]["arguments"])
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
v = args.get(key)
|
||||
if v is not None:
|
||||
if isinstance(v, str):
|
||||
values.add(v.strip().lower())
|
||||
else:
|
||||
values.add(v)
|
||||
return values
|
||||
|
||||
|
||||
def _turn_meets_expectations(turn, turns, expected):
    """Check one turn against the parallel-call expectations; returns (ok, reason)."""
    min_parallel = expected.get("min_parallel", 2)
    if len(turn["tool_calls"]) < min_parallel:
        by_turn = [len(t["tool_calls"]) for t in turns]
        return False, (
            f"No turn had >= {min_parallel} parallel tool calls "
            f"(per-turn counts: {by_turn})"
        )

    require_same = expected.get("require_same_tool")
    if require_same is not None:
        names = [tc["function"]["name"] for tc in turn["tool_calls"]]
        if any(n != require_same for n in names):
            return False, (
                f"Parallel turn mixed tools; expected all {require_same!r}, got {names}"
            )

    require_distinct = expected.get("require_distinct_tools")
    if require_distinct is not None:
        distinct = _distinct_tool_names(turn)
        if len(distinct) < require_distinct:
            return False, (
                f"Parallel turn had only {len(distinct)} distinct tool names "
                f"({distinct}); need >= {require_distinct}"
            )

    distinct_key = expected.get("min_distinct_args_key")
    distinct_count = expected.get("min_distinct_args_count", min_parallel)
    if distinct_key is not None:
        values = _distinct_arg_values(turn, distinct_key)
        if len(values) < distinct_count:
            return False, (
                f"Parallel turn had only {len(values)} distinct {distinct_key!r} "
                f"values ({values}); need >= {distinct_count}"
            )

    return True, (
        f"Parallel turn had {len(turn['tool_calls'])} calls across "
        f"{len(_distinct_tool_names(turn))} distinct tool(s)"
    )


def _check_parallel(turns, expected):
    """Check that at least one turn satisfies the parallel-call expectations.

    The turn with the most tool calls is evaluated first. If it fails, every
    other turn that made tool calls is tried, so a smaller turn that meets all
    requirements still counts as a pass. When no turn qualifies, the reason
    reported is the one for the largest turn.

    Args:
        turns: List of turn dicts as produced by ``run_agentic_loop``.
        expected: Dict with optional keys ``min_parallel`` (default 2),
            ``require_same_tool``, ``require_distinct_tools``,
            ``min_distinct_args_key`` and ``min_distinct_args_count``
            (defaults to ``min_parallel``).

    Returns:
        ``(ok, reason)``.
    """
    best = _best_parallel_turn(turns)
    if best is None:
        return False, "No tool calls were made at all"

    ok, reason = _turn_meets_expectations(best, turns, expected)
    if ok:
        return True, reason

    for turn in turns:
        if turn is best or not turn["tool_calls"]:
            continue
        other_ok, other_reason = _turn_meets_expectations(turn, turns, expected)
        if other_ok:
            return True, other_reason

    return False, reason
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test case runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def run_test(url, test_case, stream):
    """Run one test case and print PASS/FAIL.

    Runs the agentic loop, then the parallel-call check, then the test case's
    own ``validate`` callable. Returns False when the server gave no response
    or the parallel check failed; otherwise returns ``validate``'s verdict.
    """
    name = test_case["name"]
    mode = "stream" if stream else "non-stream"
    print_header(f"{name} [{mode}]")

    turns, all_tool_calls, final_content = run_agentic_loop(
        url,
        messages=test_case["messages"],
        tools=test_case["tools"],
        mock_tool_responses=test_case["mock_tool_responses"],
        stream=stream,
    )
    if not turns:
        print_fail("No response from server.")
        return False

    parallel_ok, parallel_reason = _check_parallel(turns, test_case["expected_parallel"])
    if not parallel_ok:
        print_fail(parallel_reason)
        return False

    validator = test_case["validate"]
    passed, reason = validator(turns, all_tool_calls, final_content)
    if not passed:
        print_fail(reason)
    else:
        print_pass(f"{parallel_reason}; {reason}")
    return passed
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test case definitions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# ---- Test 1: Multi-file read (same tool, multiple distinct paths) ----
|
||||
|
||||
_FILE_TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "read_file",
|
||||
"description": (
|
||||
"Read the full contents of a file from the local filesystem. "
|
||||
"Call this tool in parallel when asked to read several files — "
|
||||
"each path needs its own call."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Absolute or repo-relative path to a file",
|
||||
},
|
||||
},
|
||||
"required": ["path"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
_FILE_CONTENTS = {
|
||||
"config/database.yml": "host: db.internal\nport: 5432\nuser: svc_app\n",
|
||||
"config/redis.yml": "host: cache.internal\nport: 6379\ndb: 0\n",
|
||||
"config/queue.yml": "broker: rabbitmq.internal\nport: 5672\nvhost: prod\n",
|
||||
"config/auth.yml": "provider: oidc\nissuer: https://auth.internal\n",
|
||||
}
|
||||
|
||||
|
||||
def _read_file_mock(args):
|
||||
path = args.get("path", "")
|
||||
norm = path.lstrip("./").lstrip("/")
|
||||
content = _FILE_CONTENTS.get(norm)
|
||||
if content is None:
|
||||
for k, v in _FILE_CONTENTS.items():
|
||||
if path.endswith(k):
|
||||
content = v
|
||||
break
|
||||
if content is None:
|
||||
return json.dumps({"path": path, "error": "not found"})
|
||||
return json.dumps({"path": path, "content": content})
|
||||
|
||||
|
||||
MULTIFILE_READ_TEST = {
|
||||
"name": "Parallel multi-file read (same tool, 4 distinct paths)",
|
||||
"tools": _FILE_TOOLS,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"Please read all four of these config files so I can review them "
|
||||
"together: config/database.yml, config/redis.yml, config/queue.yml, "
|
||||
"and config/auth.yml. Call read_file for every path in parallel in "
|
||||
"a single batch — do NOT read them one by one sequentially across "
|
||||
"turns. After you have all four, give me a one-line summary of each."
|
||||
),
|
||||
}
|
||||
],
|
||||
"mock_tool_responses": {"read_file": _read_file_mock},
|
||||
"expected_parallel": {
|
||||
"min_parallel": 4,
|
||||
"require_same_tool": "read_file",
|
||||
"min_distinct_args_key": "path",
|
||||
"min_distinct_args_count": 4,
|
||||
},
|
||||
"validate": lambda turns, tcs, content: _validate_multifile(turns, tcs, content),
|
||||
}
|
||||
|
||||
|
||||
def _validate_multifile(turns, tcs, content):
|
||||
del turns
|
||||
if not content:
|
||||
return False, "No final summary produced"
|
||||
return True, f"{len(tcs)} total read_file calls; content length={len(content)}"
|
||||
|
||||
|
||||
# ---- Test 2: Batch TODO marking (same tool, N calls in one turn) ----
|
||||
|
||||
_TODO_TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "mark_todo_complete",
|
||||
"description": (
|
||||
"Mark a single TODO item as complete by ID. When the user wants "
|
||||
"several items marked at once, call this tool in parallel — "
|
||||
"one call per item — rather than sequentially across turns."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"todo_id": {
|
||||
"type": "string",
|
||||
"description": "Identifier of the TODO item",
|
||||
},
|
||||
"note": {
|
||||
"type": "string",
|
||||
"description": "Optional completion note",
|
||||
},
|
||||
},
|
||||
"required": ["todo_id"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
_TODO_DB = {
|
||||
"T-101": "Draft onboarding doc",
|
||||
"T-102": "Update dependency lockfile",
|
||||
"T-103": "Fix flaky login test",
|
||||
"T-104": "Rotate service credentials",
|
||||
"T-105": "Archive Q4 reports",
|
||||
}
|
||||
|
||||
|
||||
def _mark_todo_mock(args):
|
||||
tid = args.get("todo_id", "")
|
||||
if tid in _TODO_DB:
|
||||
return json.dumps({"todo_id": tid, "title": _TODO_DB[tid], "status": "done"})
|
||||
return json.dumps({"todo_id": tid, "error": "unknown id"})
|
||||
|
||||
|
||||
TODO_BATCH_TEST = {
|
||||
"name": "Batch TODO completion (same tool, 5 IDs in one turn)",
|
||||
"tools": _TODO_TOOLS,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"I finished every item on today's list. Please mark all of the "
|
||||
"following TODOs as complete, in one parallel batch: T-101, T-102, "
|
||||
"T-103, T-104, T-105. Don't mark them one at a time across separate "
|
||||
"turns — issue all five mark_todo_complete calls at once. Afterwards "
|
||||
"confirm which ones succeeded."
|
||||
),
|
||||
}
|
||||
],
|
||||
"mock_tool_responses": {"mark_todo_complete": _mark_todo_mock},
|
||||
"expected_parallel": {
|
||||
"min_parallel": 5,
|
||||
"require_same_tool": "mark_todo_complete",
|
||||
"min_distinct_args_key": "todo_id",
|
||||
"min_distinct_args_count": 5,
|
||||
},
|
||||
"validate": lambda turns, tcs, content: _validate_todo(turns, tcs, content),
|
||||
}
|
||||
|
||||
|
||||
def _validate_todo(turns, tcs, content):
|
||||
del turns
|
||||
if not content:
|
||||
return False, "No confirmation summary produced"
|
||||
return True, f"{len(tcs)} total mark_todo_complete calls"
|
||||
|
||||
|
||||
# ---- Test 3: Multi-city weather (same tool, N parallel locations) ----
|
||||
|
||||
_WEATHER_TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": (
|
||||
"Fetch current weather for ONE city. When the user asks about "
|
||||
"several cities, call this tool in parallel — one call per city — "
|
||||
"instead of sequentially."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {"type": "string", "description": "City name"},
|
||||
"units": {
|
||||
"type": "string",
|
||||
"enum": ["metric", "imperial"],
|
||||
"default": "metric",
|
||||
},
|
||||
},
|
||||
"required": ["city"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
_WEATHER_DB = {
|
||||
"tokyo": {"city": "Tokyo", "temp_c": 18.4, "condition": "partly cloudy", "humidity": 64},
|
||||
"london": {"city": "London", "temp_c": 9.1, "condition": "overcast", "humidity": 81},
|
||||
"new york": {"city": "New York", "temp_c": 12.7, "condition": "clear", "humidity": 55},
|
||||
"paris": {"city": "Paris", "temp_c": 11.3, "condition": "light rain", "humidity": 78},
|
||||
}
|
||||
|
||||
|
||||
def _weather_mock(args):
|
||||
city = args.get("city", "").strip().lower()
|
||||
if city.startswith("new york"):
|
||||
city = "new york"
|
||||
if city in _WEATHER_DB:
|
||||
return json.dumps(_WEATHER_DB[city])
|
||||
return json.dumps({"city": args.get("city", ""), "error": "unknown city"})
|
||||
|
||||
|
||||
MULTI_WEATHER_TEST = {
|
||||
"name": "Parallel multi-city weather (same tool, 4 cities)",
|
||||
"tools": _WEATHER_TOOLS,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"I'm comparing today's weather across four cities for a travel "
|
||||
"decision: Tokyo, London, New York, and Paris. Please call "
|
||||
"get_weather for all four in parallel in a single turn — don't "
|
||||
"fetch them one at a time. Then rank them from warmest to coolest."
|
||||
),
|
||||
}
|
||||
],
|
||||
"mock_tool_responses": {"get_weather": _weather_mock},
|
||||
"expected_parallel": {
|
||||
"min_parallel": 4,
|
||||
"require_same_tool": "get_weather",
|
||||
"min_distinct_args_key": "city",
|
||||
"min_distinct_args_count": 4,
|
||||
},
|
||||
"validate": lambda turns, tcs, content: _validate_weather(turns, tcs, content),
|
||||
}
|
||||
|
||||
|
||||
def _validate_weather(turns, tcs, content):
|
||||
del turns
|
||||
if not content or not any(
|
||||
kw in content.lower() for kw in ("warmest", "rank", "hot", "cool")
|
||||
):
|
||||
return False, f"Final content missing a ranking: {content!r}"
|
||||
return True, f"{len(tcs)} total get_weather calls; ranking produced"
|
||||
|
||||
|
||||
# ---- Test 4: Trip planning (different tools, parallel in one turn) ----
|
||||
|
||||
_TRIP_TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "search_flights",
|
||||
"description": "Search one-way flights between two airports on a given date.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from_airport": {"type": "string", "description": "IATA code, e.g. SFO"},
|
||||
"to_airport": {"type": "string", "description": "IATA code, e.g. JFK"},
|
||||
"date": {"type": "string", "description": "YYYY-MM-DD"},
|
||||
},
|
||||
"required": ["from_airport", "to_airport", "date"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "search_hotels",
|
||||
"description": "Search hotels in a city for a date range.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {"type": "string"},
|
||||
"check_in": {"type": "string", "description": "YYYY-MM-DD"},
|
||||
"check_out": {"type": "string", "description": "YYYY-MM-DD"},
|
||||
"max_price": {"type": "integer"},
|
||||
},
|
||||
"required": ["city", "check_in", "check_out"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "search_restaurants",
|
||||
"description": "Search restaurants in a city by cuisine.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {"type": "string"},
|
||||
"cuisine": {"type": "string"},
|
||||
},
|
||||
"required": ["city"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
_FLIGHTS_RESULT = {
|
||||
"results": [
|
||||
{"flight": "UA 1552", "depart": "08:15", "arrive": "16:45", "price": 389},
|
||||
{"flight": "AA 20", "depart": "10:00", "arrive": "18:35", "price": 412},
|
||||
]
|
||||
}
|
||||
_HOTELS_RESULT = {
|
||||
"results": [
|
||||
{"name": "Midtown Grand", "nightly_rate": 245, "rating": 4.3},
|
||||
{"name": "Harbour Boutique", "nightly_rate": 312, "rating": 4.6},
|
||||
]
|
||||
}
|
||||
_RESTAURANTS_RESULT = {
|
||||
"results": [
|
||||
{"name": "Trattoria Nona", "cuisine": "italian", "rating": 4.5},
|
||||
{"name": "Osteria Blu", "cuisine": "italian", "rating": 4.4},
|
||||
]
|
||||
}
|
||||
|
||||
TRIP_PLAN_TEST = {
|
||||
"name": "Trip planning (3 different tools in parallel)",
|
||||
"tools": _TRIP_TOOLS,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"I'm flying from SFO to JFK on 2026-06-12 and staying four nights "
|
||||
"(check out 2026-06-16). I'd also like some Italian restaurant "
|
||||
"suggestions in New York. Please call search_flights, search_hotels, "
|
||||
"and search_restaurants in parallel — all three in a single turn, "
|
||||
"since they don't depend on each other. Then give me a concise "
|
||||
"travel summary."
|
||||
),
|
||||
}
|
||||
],
|
||||
"mock_tool_responses": {
|
||||
"search_flights": lambda _: json.dumps(_FLIGHTS_RESULT),
|
||||
"search_hotels": lambda _: json.dumps(_HOTELS_RESULT),
|
||||
"search_restaurants": lambda _: json.dumps(_RESTAURANTS_RESULT),
|
||||
},
|
||||
"expected_parallel": {
|
||||
"min_parallel": 3,
|
||||
"require_distinct_tools": 3,
|
||||
},
|
||||
"validate": lambda turns, tcs, content: _validate_trip(turns, tcs, content),
|
||||
}
|
||||
|
||||
|
||||
def _validate_trip(turns, tcs, content):
|
||||
del turns
|
||||
names = {tc["function"]["name"] for tc in tcs}
|
||||
required = {"search_flights", "search_hotels", "search_restaurants"}
|
||||
missing = required - names
|
||||
if missing:
|
||||
return False, f"Missing tool calls: {missing}"
|
||||
if not content:
|
||||
return False, "No travel summary produced"
|
||||
return True, f"All three tools called; summary length={len(content)}"
|
||||
|
||||
|
||||
# ---- Test 5: Portfolio check (same tool, parallel tickers) ----
|
||||
|
||||
_STOCK_TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_stock_quote",
|
||||
"description": (
|
||||
"Get the latest quote for ONE ticker. When the user asks about "
|
||||
"multiple tickers, call this tool in parallel — one per symbol — "
|
||||
"rather than sequentially."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"symbol": {"type": "string", "description": "Ticker symbol"},
|
||||
},
|
||||
"required": ["symbol"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
_STOCK_DB = {
|
||||
"AAPL": {"symbol": "AAPL", "price": 218.45, "change_pct": "+0.8%"},
|
||||
"MSFT": {"symbol": "MSFT", "price": 421.10, "change_pct": "+1.2%"},
|
||||
"GOOGL":{"symbol": "GOOGL","price": 175.22, "change_pct": "-0.3%"},
|
||||
"AMZN": {"symbol": "AMZN", "price": 189.76, "change_pct": "+0.5%"},
|
||||
"NVDA": {"symbol": "NVDA", "price": 140.88, "change_pct": "+2.4%"},
|
||||
}
|
||||
|
||||
|
||||
def _stock_mock(args):
|
||||
sym = args.get("symbol", "").strip().upper()
|
||||
if sym in _STOCK_DB:
|
||||
return json.dumps(_STOCK_DB[sym])
|
||||
return json.dumps({"symbol": sym, "error": "unknown ticker"})
|
||||
|
||||
|
||||
PORTFOLIO_TEST = {
|
||||
"name": "Portfolio check (same tool, 5 tickers in parallel)",
|
||||
"tools": _STOCK_TOOLS,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"Pull the latest quote for every ticker in my portfolio — AAPL, "
|
||||
"MSFT, GOOGL, AMZN, and NVDA — in a single parallel batch. These "
|
||||
"lookups are independent, so please don't chain them across turns. "
|
||||
"Once you have all five, tell me which ticker had the biggest "
|
||||
"percentage change today."
|
||||
),
|
||||
}
|
||||
],
|
||||
"mock_tool_responses": {"get_stock_quote": _stock_mock},
|
||||
"expected_parallel": {
|
||||
"min_parallel": 5,
|
||||
"require_same_tool": "get_stock_quote",
|
||||
"min_distinct_args_key": "symbol",
|
||||
"min_distinct_args_count": 5,
|
||||
},
|
||||
"validate": lambda turns, tcs, content: _validate_portfolio(turns, tcs, content),
|
||||
}
|
||||
|
||||
|
||||
def _validate_portfolio(turns, tcs, content):
|
||||
del turns
|
||||
if not content or ("nvda" not in content.lower() and "NVDA" not in content):
|
||||
return False, f"Expected NVDA to be identified as the biggest mover: {content!r}"
|
||||
return True, f"{len(tcs)} total quotes pulled"
|
||||
|
||||
|
||||
# ---- Test 6: Mixed — translate + dictionary in parallel for the same word ----
|
||||
|
||||
_LANG_TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "translate_text",
|
||||
"description": "Translate a short text into a target language.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {"type": "string"},
|
||||
"target_language": {"type": "string",
|
||||
"description": "ISO 639-1 language code, e.g. 'es'"},
|
||||
},
|
||||
"required": ["text", "target_language"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_definition",
|
||||
"description": "Get the English dictionary definition of a word.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"word": {"type": "string"},
|
||||
},
|
||||
"required": ["word"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_synonyms",
|
||||
"description": "Get English synonyms for a word.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"word": {"type": "string"},
|
||||
},
|
||||
"required": ["word"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def _translate_mock(args):
|
||||
t = args.get("text", "")
|
||||
lang = args.get("target_language", "")
|
||||
return json.dumps({"source": t, "target_language": lang, "translation": f"[{lang}] {t}"})
|
||||
|
||||
|
||||
def _definition_mock(args):
|
||||
w = args.get("word", "")
|
||||
return json.dumps({
|
||||
"word": w,
|
||||
"definition": f"A standard dictionary definition of {w!r}.",
|
||||
})
|
||||
|
||||
|
||||
def _synonyms_mock(args):
|
||||
w = args.get("word", "")
|
||||
return json.dumps({
|
||||
"word": w,
|
||||
"synonyms": ["synonym_a", "synonym_b", "synonym_c"],
|
||||
})
|
||||
|
||||
|
||||
LANG_TOOLKIT_TEST = {
|
||||
"name": "Language toolkit (translate + definition + synonyms in parallel)",
|
||||
"tools": _LANG_TOOLS,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"For the English word 'resilient', I need three independent "
|
||||
"look-ups at once: (a) translate it into Spanish, (b) fetch its "
|
||||
"dictionary definition, and (c) list its synonyms. These three "
|
||||
"calls don't depend on each other — please issue them in parallel "
|
||||
"in a single turn. Then present the combined results as a short "
|
||||
"language note."
|
||||
),
|
||||
}
|
||||
],
|
||||
"mock_tool_responses": {
|
||||
"translate_text": _translate_mock,
|
||||
"get_definition": _definition_mock,
|
||||
"get_synonyms": _synonyms_mock,
|
||||
},
|
||||
"expected_parallel": {
|
||||
"min_parallel": 3,
|
||||
"require_distinct_tools": 3,
|
||||
},
|
||||
"validate": lambda turns, tcs, content: _validate_lang(turns, tcs, content),
|
||||
}
|
||||
|
||||
|
||||
def _validate_lang(turns, tcs, content):
|
||||
del turns
|
||||
names = {tc["function"]["name"] for tc in tcs}
|
||||
required = {"translate_text", "get_definition", "get_synonyms"}
|
||||
missing = required - names
|
||||
if missing:
|
||||
return False, f"Missing tool calls: {missing}"
|
||||
if not content:
|
||||
return False, "No language note produced"
|
||||
return True, f"All three lookup tools called; note length={len(content)}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# All test cases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Registry consumed by main(); cases run in the order listed here.
ALL_TEST_CASES = [
    MULTIFILE_READ_TEST, TODO_BATCH_TEST, MULTI_WEATHER_TEST,
    TRIP_PLAN_TEST, PORTFOLIO_TEST, LANG_TOOLKIT_TEST,
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the parallel tool-calling tests.

    Parses the options, selects the streaming modes and test cases to run,
    executes every (mode, case) combination through ``run_test`` and prints a
    summary. The process exits with status 0 only when every executed test
    passed, and with status 1 otherwise (including when ``--test`` matches no
    case).
    """
    parser = argparse.ArgumentParser(
        description=(
            "Test llama-server parallel tool-calling capability. Run this only "
            "against models configured for parallel tool calls — this script "
            "does not configure that itself."
        )
    )
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", default=8080, type=int)
    # The two flags contradict each other. Accepting both used to leave the
    # list of modes empty, so the run printed "0/0 passed" and exited 0 without
    # executing anything. argparse now rejects the combination up front.
    stream_flags = parser.add_mutually_exclusive_group()
    stream_flags.add_argument(
        "--no-stream", action="store_true", help="Disable streaming mode tests"
    )
    stream_flags.add_argument(
        "--stream-only", action="store_true", help="Only run streaming mode tests"
    )
    parser.add_argument(
        "--test",
        help="Run only the test whose name contains this substring (case-insensitive)",
    )
    args = parser.parse_args()

    url = f"http://{args.host}:{args.port}/v1/chat/completions"
    print_info(f"Testing server at {url}")
    print_warn(
        "This script expects the target model to emit multiple tool calls in a "
        "single assistant turn. Run it only against parallel-tool-capable models."
    )

    # False = non-streaming request, True = streaming request.
    modes: list[bool] = []
    if not args.stream_only:
        modes.append(False)
    if not args.no_stream:
        modes.append(True)

    cases: list[dict] = ALL_TEST_CASES
    if args.test:
        name_filter = args.test.lower()
        cases = [c for c in cases if name_filter in str(c["name"]).lower()]
        if not cases:
            print_fail(f"No test cases matched '{args.test}'")
            sys.exit(1)

    total = 0
    passed = 0
    for stream in modes:
        for case in cases:
            total += 1
            if run_test(url, case, stream=stream):
                passed += 1

    color = GREEN if passed == total else RED
    _print(f"\n{BOLD}{color}{'─' * 60}{RESET}")
    _print(f"{BOLD}{color} Results: {passed}/{total} passed{RESET}")
    _print(f"{BOLD}{color}{'─' * 60}{RESET}\n")
    sys.exit(0 if passed == total else 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,980 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test structured output capability via chat completions endpoint.
|
||||
|
||||
Each test case contains:
|
||||
- response_format: OpenAI-compatible response_format specification
|
||||
(json_schema only — llama.cpp does not support json_object)
|
||||
- messages: initial conversation messages
|
||||
- tools (optional): tool definitions (for mixed tool + structured tests)
|
||||
- mock_tool_responses (optional): dict mapping tool_name -> callable(arguments) -> str (JSON)
|
||||
- apply_stage: "always" to apply response_format to every request,
|
||||
"after_tools" to run the tool loop plain, then request a
|
||||
structured summary in a follow-up user turn.
|
||||
- followup (optional, for after_tools): user message appended before the
|
||||
final structured call.
|
||||
- validate: callable(parsed_json, tool_calls_history, raw_content) -> (passed: bool, reason: str)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import requests
|
||||
import sys
|
||||
from typing import Any, cast
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Color / formatting helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# ANSI SGR escape sequences used by the print helpers below.

# Text attributes
RESET = "\x1b[0m"
BOLD = "\x1b[1m"
DIM = "\x1b[2m"

# Foreground colours
RED = "\x1b[31m"
GREEN = "\x1b[32m"
YELLOW = "\x1b[33m"
BLUE = "\x1b[34m"
MAGENTA = "\x1b[35m"
CYAN = "\x1b[36m"
WHITE = "\x1b[97m"
|
||||
|
||||
|
||||
def _print(text="", end="\n"):
|
||||
sys.stdout.write(text + end)
|
||||
sys.stdout.flush()
|
||||
|
||||
|
||||
def print_header(title):
    """Print ``title`` inside a three-line box drawn with box characters.

    The box interior is 60 columns wide. Titles longer than the interior are
    printed in full and simply push the right border outwards.
    """
    width = 60
    bar = "─" * width
    # The title row is one leading space + title + padding. Padding is sized so
    # the row is exactly as wide as the bar; the previous ``58 - len(title)``
    # left the right border one column short of the corners.
    pad = " " * max(0, width - 1 - len(title))
    _print(f"\n{BOLD}{CYAN}┌{bar}┐{RESET}")
    _print(f"{BOLD}{CYAN}│ {WHITE}{title}{CYAN}{pad}│{RESET}")
    _print(f"{BOLD}{CYAN}└{bar}┘{RESET}")
|
||||
|
||||
|
||||
def print_tool_call(name, args):
    """Print a one-line notice for a tool call: the tool name and its JSON-encoded arguments."""
    encoded = json.dumps(args)
    notice = f"\n {BOLD}{YELLOW}⚙ tool call{RESET} {CYAN}{name}{RESET}{DIM}({encoded}){RESET}"
    _print(notice)
|
||||
|
||||
|
||||
def print_tool_result(result):
    """Print a dimmed preview of a tool result, cut to 160 characters with an ellipsis if longer."""
    limit = 160
    if len(result) > limit:
        preview = result[:limit] + "…"
    else:
        preview = result[:limit] + ""
    _print(f" {DIM}{BLUE}↳ result{RESET} {DIM}{preview}{RESET}")
|
||||
|
||||
|
||||
def print_model_output(text):
    """Write model output to stdout verbatim (no newline appended) and flush."""
    out = sys.stdout
    out.write(text)
    out.flush()
|
||||
|
||||
|
||||
def print_pass(reason):
    """Print a green PASS marker followed by ``reason``."""
    marker = f"{BOLD}{GREEN}✔ PASS{RESET}"
    _print(f"\n{marker} {reason}")
|
||||
|
||||
|
||||
def print_fail(reason):
    """Print a red FAIL marker followed by ``reason``."""
    marker = f"{BOLD}{RED}✘ FAIL{RESET}"
    _print(f"\n{marker} {reason}")
|
||||
|
||||
|
||||
def print_info(msg):
    """Print an informational message in dim text."""
    dimmed = f"{DIM}{msg}{RESET}"
    _print(dimmed)
|
||||
|
||||
|
||||
def print_schema_note(label, rf):
    """Print a one-line note describing the ``response_format`` in use.

    Shows ``label``, the format's ``type`` (``?`` when absent) and, for
    ``json_schema`` formats that carry one, the schema name.
    """
    kind = rf.get("type", "?")
    schema_name = rf.get("json_schema", {}).get("name", "") if kind == "json_schema" else ""
    suffix = (" / " + schema_name) if schema_name else ""
    _print(f"{DIM}{MAGENTA} ⟐ response_format [{label}]: {kind}{suffix}{RESET}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTTP helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _merge_tool_call_delta(tool_calls, tc):
    """Fold one streamed tool-call fragment ``tc`` into ``tool_calls`` in place."""
    idx = tc.get("index") or 0
    while len(tool_calls) <= idx:
        tool_calls.append(
            {
                "id": "",
                "type": "function",
                "function": {"name": "", "arguments": ""},
            }
        )
    slot = tool_calls[idx]
    # A fragment may carry an explicit null for a field it does not update;
    # treat null exactly like an absent key instead of concatenating None.
    slot["id"] += tc.get("id") or ""
    fn = tc.get("function") or {}
    slot["function"]["name"] += fn.get("name") or ""
    slot["function"]["arguments"] += fn.get("arguments") or ""


def chat_completion(url, messages, tools=None, response_format=None, stream=False):
    """Send one chat-completions request and collect the assistant's reply.

    Args:
        url: Full URL of the chat-completions endpoint.
        messages: Conversation so far, sent as the ``messages`` field.
        tools: Optional tool definitions; when truthy they are sent together
            with ``tool_choice="auto"``.
        response_format: Optional ``response_format`` object; sent only when
            not ``None``.
        stream: When true, request a streamed reply, echo content to stdout
            as it arrives, and reassemble tool calls from the streamed
            fragments.

    Returns:
        ``None`` when the request fails or the reply cannot be read (a FAIL
        line is printed). Otherwise a dict with ``"content"`` (str),
        ``"tool_calls"`` (list) and, only when non-empty,
        ``"reasoning_content"`` (str).
    """
    payload = {
        "messages": messages,
        "stream": stream,
        "max_tokens": 4096,
    }
    if tools:
        payload["tools"] = tools
        payload["tool_choice"] = "auto"
    if response_format is not None:
        payload["response_format"] = response_format

    try:
        response = requests.post(url, json=payload, stream=stream)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        body = e.response.content if (e.response is not None) else b""
        print_fail(f"Request error: {e} | body: {body}")
        return None

    full_content = ""
    reasoning_content = ""
    tool_calls: list[dict] = []

    try:
        if stream:
            for line in response.iter_lines():
                if not line:
                    continue
                decoded = line.decode("utf-8")
                if not decoded.startswith("data: "):
                    continue
                data_str = decoded[6:]
                if data_str == "[DONE]":
                    break
                try:
                    data = json.loads(data_str)
                except json.JSONDecodeError:
                    # Skip lines that are not JSON rather than aborting the stream.
                    continue
                choices = data.get("choices", [])
                if not choices:
                    continue
                delta = choices[0].get("delta") or {}
                if delta.get("reasoning_content"):
                    reasoning_content += delta["reasoning_content"]
                if delta.get("content"):
                    full_content += delta["content"]
                    print_model_output(delta["content"])
                # "tool_calls" may be present but null; iterate over [] then.
                for tc in delta.get("tool_calls") or []:
                    _merge_tool_call_delta(tool_calls, tc)
        else:
            data = response.json()
            choices = data.get("choices", [])
            if choices:
                msg = choices[0].get("message", {})
                full_content = msg.get("content") or ""
                reasoning_content = msg.get("reasoning_content") or ""
                tool_calls = msg.get("tool_calls") or []
                if full_content:
                    print_model_output(full_content)
    except (requests.exceptions.RequestException, ValueError) as e:
        # Transport errors while reading the body, undecodable bytes, or a
        # non-JSON body: report a failed request instead of crashing the run.
        print_fail(f"Could not read response: {e}")
        return None
    finally:
        # Release the connection even when the stream is abandoned early.
        response.close()

    result = {"content": full_content, "tool_calls": tool_calls}
    if reasoning_content:
        result["reasoning_content"] = reasoning_content
    return result
|
||||
|
||||
|
||||
def run_tool_loop(
    url, messages, tools, mock_tool_responses, stream, response_format=None,
    max_turns=6,
):
    """
    Drive the tool-call loop. If response_format is provided it is applied to
    every request. Returns (all_tool_calls, final_messages, final_content).

    Each turn sends the conversation; when the reply contains tool calls, the
    assistant message and one ``tool`` message per call (produced by the
    matching mock, or an "Unknown tool" error object) are appended and the
    loop continues. The loop ends when a reply has no tool calls (its content
    is returned, possibly empty), when a request fails, or after ``max_turns``
    turns; in the last two cases ``final_content`` is ``None``.

    The input ``messages`` list is copied, not mutated.
    """
    msgs = list(messages)
    all_tool_calls: list[dict] = []

    for _ in range(max_turns):
        result = chat_completion(
            url, msgs, tools=tools, response_format=response_format, stream=stream
        )
        if result is None:
            return all_tool_calls, msgs, None

        tcs = result.get("tool_calls") or []
        content = result.get("content") or ""

        if not tcs:
            if content:
                _print(f"\n{DIM}{'·' * 60}{RESET}")
            return all_tool_calls, msgs, content

        all_tool_calls.extend(tcs)

        assistant_msg: dict = {
            "role": "assistant",
            "content": content,
            "tool_calls": tcs,
        }
        reasoning = result.get("reasoning_content")
        if reasoning:
            assistant_msg["reasoning_content"] = reasoning
        msgs.append(assistant_msg)

        for tc in tcs:
            tool_name = tc["function"]["name"]
            try:
                args = json.loads(tc["function"]["arguments"])
            except (TypeError, ValueError):
                # TypeError: arguments is None / not a string.
                # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
                args = {}
            if not isinstance(args, dict):
                # Mocks call args.get(...); valid-but-non-object JSON such as
                # "null" or "[]" would otherwise crash them.
                args = {}

            print_tool_call(tool_name, args)

            mock_fn = mock_tool_responses.get(tool_name) if mock_tool_responses else None
            if mock_fn:
                tool_result = mock_fn(args)
            else:
                tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"})

            print_tool_result(tool_result)

            msgs.append(
                {
                    "role": "tool",
                    "tool_call_id": tc.get("id", ""),
                    "content": tool_result,
                }
            )

    return all_tool_calls, msgs, None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test case runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _try_parse_json(text):
|
||||
"""Attempt to parse text as JSON, trimming common markdown fences."""
|
||||
if text is None:
|
||||
return None
|
||||
stripped = text.strip()
|
||||
if stripped.startswith("```"):
|
||||
lines = stripped.splitlines()
|
||||
if lines and lines[0].startswith("```"):
|
||||
lines = lines[1:]
|
||||
if lines and lines[-1].strip().startswith("```"):
|
||||
lines = lines[:-1]
|
||||
stripped = "\n".join(lines).strip()
|
||||
try:
|
||||
return json.loads(stripped)
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
|
||||
|
||||
def run_test(url, test_case, stream):
    """Run one structured-output test case and report PASS/FAIL.

    With ``apply_stage == "always"`` the response_format is sent on every
    request of the tool loop. With ``"after_tools"`` the tool loop runs
    without it, then a follow-up user message is appended and a final request
    (without tools) is made with the response_format applied. The final
    content must parse as JSON and satisfy the case's ``validate`` callable.

    Returns:
        ``True`` when the test passed, ``False`` otherwise.
    """
    stage = test_case.get("apply_stage", "always")
    transport = "stream" if stream else "non-stream"
    print_header(f"{test_case['name']} [{transport}] ({stage})")

    schema = test_case["response_format"]
    print_schema_note(stage, schema)

    tool_defs = test_case.get("tools")
    mock_fns = test_case.get("mock_tool_responses") or {}

    if stage not in ("always", "after_tools"):
        print_fail(f"Unknown apply_stage: {stage}")
        return False

    if stage == "always":
        tool_history, _unused_msgs, answer = run_tool_loop(
            url,
            messages=list(test_case["messages"]),
            tools=tool_defs,
            mock_tool_responses=mock_fns,
            stream=stream,
            response_format=schema,
        )
    else:
        # Phase 1: plain tool loop, no response_format applied yet.
        tool_history, convo, interim = run_tool_loop(
            url,
            messages=list(test_case["messages"]),
            tools=tool_defs,
            mock_tool_responses=mock_fns,
            stream=stream,
            response_format=None,
        )
        if interim:
            convo.append({"role": "assistant", "content": interim})
        default_followup = (
            "Now output the answer strictly as JSON matching the provided schema. "
            "Do not include commentary."
        )
        convo.append(
            {"role": "user", "content": test_case.get("followup", default_followup)}
        )

        # Phase 2: request final structured output. Tools are not passed so the
        # model focuses on producing the schema-constrained answer.
        _print(f"\n{DIM}{MAGENTA} ⟐ follow-up turn with response_format applied{RESET}")
        reply = chat_completion(
            url, convo, tools=None, response_format=schema, stream=stream
        )
        answer = reply["content"] if reply else None

    if answer is None:
        print_fail("No final content from server.")
        return False

    parsed = _try_parse_json(answer)
    if parsed is None:
        print_fail(f"Final content is not valid JSON: {answer[:200]!r}")
        return False

    ok, why = test_case["validate"](parsed, tool_history, answer)
    report = print_pass if ok else print_fail
    report(why)
    return ok
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test case definitions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# ---- Test 1: Book metadata extraction (always / json_schema) ----
|
||||
|
||||
_BOOK_SCHEMA = {
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "book_metadata",
|
||||
"strict": True,
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"additionalProperties": False,
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"author": {"type": "string"},
|
||||
"year": {"type": "integer"},
|
||||
"genre": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"fiction",
|
||||
"non-fiction",
|
||||
"fantasy",
|
||||
"sci-fi",
|
||||
"mystery",
|
||||
"biography",
|
||||
"history",
|
||||
"other",
|
||||
],
|
||||
},
|
||||
"page_count": {"type": "integer"},
|
||||
},
|
||||
"required": ["title", "author", "year", "genre", "page_count"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Test 1: extract book metadata from a one-sentence description; the schema is
# applied on every request and no tools are involved.
BOOK_TEST_CASE = {
    "name": "Book metadata extraction (json_schema, always)",
    "response_format": _BOOK_SCHEMA,
    "apply_stage": "always",
    "messages": [
        {
            "role": "user",
            "content": (
                "Extract book metadata from this description: "
                "'Dune is a 1965 science fiction epic by Frank Herbert, spanning roughly "
                "688 pages in its first edition, set on the desert planet Arrakis.' "
                "Return the data as JSON."
            ),
        }
    ],
    # Lambda defers the lookup of _validate_book, which is defined below.
    "validate": lambda parsed, tcs, raw: _validate_book(parsed),
}
|
||||
|
||||
|
||||
def _validate_book(parsed):
|
||||
required = {"title", "author", "year", "genre", "page_count"}
|
||||
missing = required - parsed.keys()
|
||||
if missing:
|
||||
return False, f"Missing fields: {missing}"
|
||||
if not isinstance(parsed["title"], str) or not parsed["title"]:
|
||||
return False, "title must be a non-empty string"
|
||||
if not isinstance(parsed["author"], str) or "herbert" not in parsed["author"].lower():
|
||||
return False, f"author unexpected: {parsed['author']!r}"
|
||||
if not isinstance(parsed["year"], int) or parsed["year"] != 1965:
|
||||
return False, f"year should be 1965, got {parsed['year']!r}"
|
||||
if parsed["genre"] not in {
|
||||
"fiction", "non-fiction", "fantasy", "sci-fi", "mystery",
|
||||
"biography", "history", "other",
|
||||
}:
|
||||
return False, f"genre not in enum: {parsed['genre']!r}"
|
||||
if not isinstance(parsed["page_count"], int) or parsed["page_count"] <= 0:
|
||||
return False, f"page_count should be positive int: {parsed['page_count']!r}"
|
||||
return True, f"Book: {parsed['title']} ({parsed['year']}) / {parsed['genre']}"
|
||||
|
||||
|
||||
# ---- Test 2: Sentiment classification (always / enum-constrained) ----
|
||||
|
||||
_SENTIMENT_SCHEMA = {
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "sentiment_analysis",
|
||||
"strict": True,
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"additionalProperties": False,
|
||||
"properties": {
|
||||
"sentiment": {
|
||||
"type": "string",
|
||||
"enum": ["positive", "negative", "neutral"],
|
||||
},
|
||||
"confidence": {"type": "number"},
|
||||
"keywords": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"minItems": 1,
|
||||
"maxItems": 5,
|
||||
},
|
||||
},
|
||||
"required": ["sentiment", "confidence", "keywords"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Test 2: classify a clearly positive product review; schema applied on every
# request, no tools.
SENTIMENT_TEST_CASE = {
    "name": "Sentiment analysis with enum and array",
    "response_format": _SENTIMENT_SCHEMA,
    "apply_stage": "always",
    "messages": [
        {
            "role": "user",
            "content": (
                "Analyse the sentiment of this review and return JSON with the "
                "detected sentiment label, a confidence score between 0 and 1, "
                "and up to five keyword strings that drove the classification:\n\n"
                "'This product completely exceeded my expectations. The build "
                "quality is phenomenal, it arrived a day early, and customer "
                "support was delightful when I had a setup question.'"
            ),
        }
    ],
    # Lambda defers the lookup of _validate_sentiment, which is defined below.
    "validate": lambda parsed, tcs, raw: _validate_sentiment(parsed),
}
|
||||
|
||||
|
||||
def _validate_sentiment(parsed):
|
||||
if parsed.get("sentiment") not in {"positive", "negative", "neutral"}:
|
||||
return False, f"sentiment not in enum: {parsed.get('sentiment')!r}"
|
||||
if parsed["sentiment"] != "positive":
|
||||
return False, f"expected positive sentiment, got {parsed['sentiment']}"
|
||||
conf = parsed.get("confidence")
|
||||
if not isinstance(conf, (int, float)) or not (0.0 <= conf <= 1.0):
|
||||
return False, f"confidence not in [0,1]: {conf!r}"
|
||||
kws = parsed.get("keywords")
|
||||
if not isinstance(kws, list) or not (1 <= len(kws) <= 5):
|
||||
return False, f"keywords length out of range: {kws!r}"
|
||||
if not all(isinstance(k, str) and k for k in kws):
|
||||
return False, f"keywords must be non-empty strings: {kws!r}"
|
||||
return True, f"sentiment={parsed['sentiment']} conf={conf} kws={kws}"
|
||||
|
||||
|
||||
# ---- Test 3: Nested recipe schema (always) ----
|
||||
|
||||
_RECIPE_SCHEMA = {
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "recipe",
|
||||
"strict": True,
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"additionalProperties": False,
|
||||
"properties": {
|
||||
"name": {"type": "string"},
|
||||
"servings": {"type": "integer"},
|
||||
"ingredients": {
|
||||
"type": "array",
|
||||
"minItems": 2,
|
||||
"items": {
|
||||
"type": "object",
|
||||
"additionalProperties": False,
|
||||
"properties": {
|
||||
"item": {"type": "string"},
|
||||
"quantity": {"type": "string"},
|
||||
},
|
||||
"required": ["item", "quantity"],
|
||||
},
|
||||
},
|
||||
"steps": {
|
||||
"type": "array",
|
||||
"minItems": 2,
|
||||
"items": {"type": "string"},
|
||||
},
|
||||
"prep_time_minutes": {"type": "integer"},
|
||||
},
|
||||
"required": ["name", "servings", "ingredients", "steps", "prep_time_minutes"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Test 3: ask for a small recipe as nested JSON; schema applied on every
# request, no tools.
RECIPE_TEST_CASE = {
    "name": "Nested recipe with arrays of objects",
    "response_format": _RECIPE_SCHEMA,
    "apply_stage": "always",
    "messages": [
        {
            "role": "user",
            "content": (
                "Give me a simple 4-serving scrambled eggs recipe as structured JSON. "
                "Include the recipe name, servings, ingredients (each with item and "
                "quantity), preparation steps, and total prep time in minutes."
            ),
        }
    ],
    # Lambda defers the lookup of _validate_recipe, which is defined below.
    "validate": lambda parsed, tcs, raw: _validate_recipe(parsed),
}
|
||||
|
||||
|
||||
def _validate_recipe(parsed):
|
||||
required = {"name", "servings", "ingredients", "steps", "prep_time_minutes"}
|
||||
missing = required - parsed.keys()
|
||||
if missing:
|
||||
return False, f"Missing fields: {missing}"
|
||||
if not isinstance(parsed["name"], str) or not parsed["name"]:
|
||||
return False, "name must be a non-empty string"
|
||||
if not isinstance(parsed["servings"], int) or parsed["servings"] <= 0:
|
||||
return False, f"servings must be positive int: {parsed['servings']!r}"
|
||||
ings = parsed["ingredients"]
|
||||
if not isinstance(ings, list) or len(ings) < 2:
|
||||
return False, f"ingredients must be array of >=2: got {ings!r}"
|
||||
for i, ing in enumerate(ings):
|
||||
if not isinstance(ing, dict):
|
||||
return False, f"ingredient[{i}] is not an object: {ing!r}"
|
||||
ing_d = cast(dict[str, Any], ing)
|
||||
item_val = ing_d.get("item")
|
||||
qty_val = ing_d.get("quantity")
|
||||
if item_val is None or qty_val is None:
|
||||
return False, f"ingredient[{i}] missing item/quantity: {ing!r}"
|
||||
if not isinstance(item_val, str) or not isinstance(qty_val, str):
|
||||
return False, f"ingredient[{i}] fields must be strings: {ing!r}"
|
||||
steps = parsed["steps"]
|
||||
if not isinstance(steps, list) or len(steps) < 2:
|
||||
return False, f"steps must be array of >=2 strings: got {steps!r}"
|
||||
if not all(isinstance(s, str) and s for s in steps):
|
||||
return False, "all steps must be non-empty strings"
|
||||
pt = parsed["prep_time_minutes"]
|
||||
if not isinstance(pt, int) or pt <= 0:
|
||||
return False, f"prep_time_minutes must be positive int: {pt!r}"
|
||||
return True, f"recipe '{parsed['name']}' with {len(ings)} ingredients, {len(steps)} steps"
|
||||
|
||||
|
||||
# ---- Test 4: Tool call -> structured product comparison (after_tools) ----
|
||||
|
||||
_SHOP_TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "search_products",
|
||||
"description": "Search a product catalogue by keyword.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {"type": "string"},
|
||||
},
|
||||
"required": ["query"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_product_details",
|
||||
"description": "Get detailed specs for a product by ID.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"product_id": {"type": "string"},
|
||||
},
|
||||
"required": ["product_id"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
_SHOP_SEARCH_RESULT = {
|
||||
"results": [
|
||||
{"product_id": "LAP-001", "title": "AeroBook 13 Pro", "price": 1399.0, "rating": 4.7},
|
||||
{"product_id": "LAP-002", "title": "QuantumSlim 14", "price": 1199.0, "rating": 4.4},
|
||||
{"product_id": "LAP-003", "title": "NimbusWork Ultra 15", "price": 999.0, "rating": 4.2},
|
||||
],
|
||||
}
|
||||
_SHOP_PRODUCT_DETAILS = {
|
||||
"LAP-001": {
|
||||
"product_id": "LAP-001",
|
||||
"title": "AeroBook 13 Pro",
|
||||
"cpu": "M-series 10-core",
|
||||
"ram_gb": 16,
|
||||
"storage_gb": 512,
|
||||
"battery_hours": 18,
|
||||
"weight_kg": 1.24,
|
||||
"price": 1399.0,
|
||||
},
|
||||
"LAP-002": {
|
||||
"product_id": "LAP-002",
|
||||
"title": "QuantumSlim 14",
|
||||
"cpu": "Core i7 12-core",
|
||||
"ram_gb": 16,
|
||||
"storage_gb": 512,
|
||||
"battery_hours": 12,
|
||||
"weight_kg": 1.35,
|
||||
"price": 1199.0,
|
||||
},
|
||||
"LAP-003": {
|
||||
"product_id": "LAP-003",
|
||||
"title": "NimbusWork Ultra 15",
|
||||
"cpu": "Ryzen 7 8-core",
|
||||
"ram_gb": 16,
|
||||
"storage_gb": 1024,
|
||||
"battery_hours": 10,
|
||||
"weight_kg": 1.70,
|
||||
"price": 999.0,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _shop_details_mock(args):
    """Mock handler for ``get_product_details``.

    Returns the canned spec sheet for ``args["product_id"]`` as a JSON string,
    or a JSON error object when the ID is not in the catalogue (a missing
    argument is looked up as the empty string).
    """
    product_id = args.get("product_id", "")
    if product_id not in _SHOP_PRODUCT_DETAILS:
        return json.dumps({"error": f"unknown product_id: {product_id}"})
    return json.dumps(_SHOP_PRODUCT_DETAILS[product_id])
|
||||
|
||||
|
||||
_SHOP_COMPARISON_SCHEMA = {
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "laptop_comparison",
|
||||
"strict": True,
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"additionalProperties": False,
|
||||
"properties": {
|
||||
"recommendation": {"type": "string"},
|
||||
"ranked_candidates": {
|
||||
"type": "array",
|
||||
"minItems": 2,
|
||||
"items": {
|
||||
"type": "object",
|
||||
"additionalProperties": False,
|
||||
"properties": {
|
||||
"product_id": {"type": "string"},
|
||||
"title": {"type": "string"},
|
||||
"score": {"type": "number"},
|
||||
"reason": {"type": "string"},
|
||||
},
|
||||
"required": ["product_id", "title", "score", "reason"],
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": ["recommendation", "ranked_candidates"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Test 4: the model first gathers data through the shop tools (no schema
# applied), then a follow-up turn requests the schema-constrained comparison.
SHOP_COMPARISON_TEST_CASE = {
    "name": "Tool calls then structured laptop comparison (after_tools)",
    "response_format": _SHOP_COMPARISON_SCHEMA,
    "apply_stage": "after_tools",
    "tools": _SHOP_TOOLS,
    "mock_tool_responses": {
        # The search mock ignores its arguments and always returns the same list.
        "search_products": lambda _: json.dumps(_SHOP_SEARCH_RESULT),
        "get_product_details": _shop_details_mock,
    },
    "messages": [
        {
            "role": "user",
            "content": (
                "I need a lightweight laptop for travel. Please search the catalogue "
                "for 'ultraportable laptop', then fetch detailed specs for at least two "
                "of the top candidates. Once you've gathered the data I'll ask you to "
                "produce a structured comparison."
            ),
        }
    ],
    "followup": (
        "Thanks. Now produce the final comparison strictly as JSON matching the "
        "laptop_comparison schema: your single best recommendation (the product_id), "
        "and a ranked_candidates array of at least two laptops, each with "
        "product_id, title, a numeric score, and a short reason."
    ),
    # Lambda defers the lookup of _validate_shop_comparison, defined below.
    "validate": lambda parsed, tcs, raw: _validate_shop_comparison(parsed, tcs),
}
|
||||
|
||||
|
||||
def _validate_shop_comparison(parsed, tcs):
    """Validate the laptop-comparison test: tool usage plus the final JSON.

    Args:
        parsed: The decoded JSON value of the final structured reply.
        tcs: All tool calls recorded during the tool loop.

    Returns:
        ``(passed, reason)``. Requires that both ``search_products`` and
        ``get_product_details`` were called, that ``recommendation`` is a
        string, and that ``ranked_candidates`` holds at least two objects with
        a catalogue ``product_id``, a ``title``, a numeric ``score`` and a
        ``reason``. Malformed input yields a failure tuple rather than an
        exception.
    """
    names = [tc["function"]["name"] for tc in tcs]
    if "search_products" not in names:
        return False, f"expected search_products tool call, got {names}"
    if "get_product_details" not in names:
        return False, f"expected get_product_details tool call, got {names}"

    # json.loads may return a non-object; fail the test instead of raising on
    # ``.get`` below.
    if not isinstance(parsed, dict):
        return False, f"expected a JSON object, got {type(parsed).__name__}"
    if "recommendation" not in parsed or not isinstance(parsed["recommendation"], str):
        return False, f"recommendation missing or not a string: {parsed!r}"

    cands = parsed.get("ranked_candidates")
    if not isinstance(cands, list) or len(cands) < 2:
        return False, f"ranked_candidates must be >=2: {cands!r}"

    valid_ids = set(_SHOP_PRODUCT_DETAILS.keys())
    candidate_pids: list = []
    for i, c in enumerate(cands):
        if not isinstance(c, dict):
            return False, f"candidate[{i}] not an object: {c!r}"
        c_d = cast(dict[str, Any], c)
        pid = c_d.get("product_id")
        title = c_d.get("title")
        score = c_d.get("score")
        reason = c_d.get("reason")
        for k, v in (("product_id", pid), ("title", title),
                     ("score", score), ("reason", reason)):
            if v is None:
                return False, f"candidate[{i}] missing {k}: {c!r}"
        # isinstance first: a list/dict product_id is unhashable and would
        # raise TypeError on the set membership test.
        if not isinstance(pid, str) or pid not in valid_ids:
            return False, f"candidate[{i}].product_id not in catalogue: {pid!r}"
        # bool is a subclass of int; JSON ``true`` is not a numeric score.
        if isinstance(score, bool) or not isinstance(score, (int, float)):
            return False, f"candidate[{i}].score not numeric: {score!r}"
        candidate_pids.append(pid)

    recommendation = parsed["recommendation"]
    # Every candidate id was already checked against the catalogue, so this
    # effectively requires the recommendation to be a catalogue id; it does
    # not have to be one of the ranked candidates.
    if recommendation not in valid_ids and recommendation not in candidate_pids:
        return False, f"recommendation {recommendation!r} not in candidates"
    return True, (
        f"tools={names}; recommended={parsed['recommendation']}; "
        f"{len(cands)} ranked candidates"
    )
|
||||
|
||||
|
||||
# ---- Test 5: Multi-step research then structured report (after_tools) ----
|
||||
|
||||
_RESEARCH_TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_country_stats",
|
||||
"description": "Fetch basic statistics for a country (population, GDP, capital).",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"country": {"type": "string"},
|
||||
},
|
||||
"required": ["country"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_climate_info",
|
||||
"description": "Fetch climate information for a country.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"country": {"type": "string"},
|
||||
},
|
||||
"required": ["country"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
_COUNTRY_STATS = {
|
||||
"norway": {
|
||||
"country": "Norway",
|
||||
"capital": "Oslo",
|
||||
"population": 5_480_000,
|
||||
"gdp_usd_trillion": 0.48,
|
||||
"currency": "NOK",
|
||||
}
|
||||
}
|
||||
_CLIMATE_INFO = {
|
||||
"norway": {
|
||||
"country": "Norway",
|
||||
"climate_zone": "subarctic / temperate coastal",
|
||||
"avg_winter_temp_c": -4.5,
|
||||
"avg_summer_temp_c": 16.0,
|
||||
"annual_precipitation_mm": 1400,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def _country_stats_mock(args):
    """Mock handler for ``get_country_stats``.

    Looks the country up by its stripped, lower-cased name and returns the
    canned statistics as a JSON string, or a JSON error object (quoting the
    normalised name) when the country is not known.
    """
    key = args.get("country", "").strip().lower()
    if key not in _COUNTRY_STATS:
        return json.dumps({"error": f"unknown country: {key}"})
    return json.dumps(_COUNTRY_STATS[key])
|
||||
|
||||
|
||||
def _climate_info_mock(args):
    """Mock handler for ``get_climate_info``.

    Looks the country up by its stripped, lower-cased name and returns the
    canned climate record as a JSON string, or a JSON error object (quoting
    the normalised name) when the country is not known.
    """
    key = args.get("country", "").strip().lower()
    if key not in _CLIMATE_INFO:
        return json.dumps({"error": f"unknown country: {key}"})
    return json.dumps(_CLIMATE_INFO[key])
|
||||
|
||||
|
||||
_RESEARCH_REPORT_SCHEMA = {
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "country_report",
|
||||
"strict": True,
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"additionalProperties": False,
|
||||
"properties": {
|
||||
"country": {"type": "string"},
|
||||
"capital": {"type": "string"},
|
||||
"population": {"type": "integer"},
|
||||
"climate_summary": {"type": "string"},
|
||||
"highlights": {
|
||||
"type": "array",
|
||||
"minItems": 2,
|
||||
"maxItems": 5,
|
||||
"items": {"type": "string"},
|
||||
},
|
||||
"suitable_for_tourism": {"type": "boolean"},
|
||||
},
|
||||
"required": [
|
||||
"country", "capital", "population",
|
||||
"climate_summary", "highlights", "suitable_for_tourism",
|
||||
],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Test 5: the model gathers Norway data through both research tools (no schema
# applied), then a follow-up turn requests the schema-constrained report.
COUNTRY_REPORT_TEST_CASE = {
    "name": "Research pipeline then structured country report (after_tools)",
    "response_format": _RESEARCH_REPORT_SCHEMA,
    "apply_stage": "after_tools",
    "tools": _RESEARCH_TOOLS,
    "mock_tool_responses": {
        "get_country_stats": _country_stats_mock,
        "get_climate_info": _climate_info_mock,
    },
    "messages": [
        {
            "role": "user",
            "content": (
                "I'm preparing a short briefing on Norway. Please call the "
                "get_country_stats and get_climate_info tools to gather data "
                "first. Afterwards I'll ask for a structured summary."
            ),
        }
    ],
    "followup": (
        "Based on the tool results, produce the briefing as JSON matching the "
        "country_report schema. Populate every required field and provide between "
        "two and five highlights."
    ),
    # Lambda defers the lookup of _validate_country_report, defined below.
    "validate": lambda parsed, tcs, raw: _validate_country_report(parsed, tcs),
}
|
||||
|
||||
|
||||
def _validate_country_report(parsed, tcs):
|
||||
names = [tc["function"]["name"] for tc in tcs]
|
||||
for required_tool in ("get_country_stats", "get_climate_info"):
|
||||
if required_tool not in names:
|
||||
return False, f"missing tool call {required_tool!r}: got {names}"
|
||||
required = {
|
||||
"country", "capital", "population",
|
||||
"climate_summary", "highlights", "suitable_for_tourism",
|
||||
}
|
||||
missing = required - parsed.keys()
|
||||
if missing:
|
||||
return False, f"missing report fields: {missing}"
|
||||
if "norway" not in parsed["country"].lower():
|
||||
return False, f"country should reference Norway: {parsed['country']!r}"
|
||||
if "oslo" not in parsed["capital"].lower():
|
||||
return False, f"capital should be Oslo: {parsed['capital']!r}"
|
||||
if not isinstance(parsed["population"], int) or parsed["population"] < 1_000_000:
|
||||
return False, f"population implausible: {parsed['population']!r}"
|
||||
if not isinstance(parsed["climate_summary"], str) or not parsed["climate_summary"]:
|
||||
return False, "climate_summary must be a non-empty string"
|
||||
hls = parsed["highlights"]
|
||||
if not isinstance(hls, list) or not (2 <= len(hls) <= 5):
|
||||
return False, f"highlights length out of range: {hls!r}"
|
||||
if not all(isinstance(h, str) and h for h in hls):
|
||||
return False, "each highlight must be a non-empty string"
|
||||
if not isinstance(parsed["suitable_for_tourism"], bool):
|
||||
return False, f"suitable_for_tourism must be bool: {parsed['suitable_for_tourism']!r}"
|
||||
return True, (
|
||||
f"tools={names}; report for {parsed['country']} "
|
||||
f"(pop {parsed['population']}, {len(hls)} highlights)"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# All test cases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Registry of every scenario, in the order main() runs them.
ALL_TEST_CASES = [
    BOOK_TEST_CASE, SENTIMENT_TEST_CASE, RECIPE_TEST_CASE,
    SHOP_COMPARISON_TEST_CASE, COUNTRY_REPORT_TEST_CASE,
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: run the test cases against a llama-server.

    Parses ``--host``/``--port`` to build the chat-completions URL, selects
    the modes to exercise (non-streaming and/or streaming) and optionally
    filters the cases by a case-insensitive name substring. Every selected
    case is run once per mode, a summary is printed, and the process exits
    with status 0 if all runs passed and 1 otherwise (also 1 when ``--test``
    matches no case).
    """
    parser = argparse.ArgumentParser(
        description="Test llama-server structured-output capability."
    )
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", default=8080, type=int)
    # --no-stream and --stream-only contradict each other: combined they would
    # select no mode at all, run zero tests and still exit successfully.
    # Let argparse reject the combination up front.
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--no-stream", action="store_true", help="Disable streaming mode tests"
    )
    mode_group.add_argument(
        "--stream-only", action="store_true", help="Only run streaming mode tests"
    )
    parser.add_argument(
        "--test",
        help="Run only the test whose name contains this substring (case-insensitive)",
    )
    args = parser.parse_args()

    url = f"http://{args.host}:{args.port}/v1/chat/completions"
    print_info(f"Testing server at {url}")

    # Each entry is the `stream` flag passed to run_test.
    modes: list[bool] = []
    if not args.stream_only:
        modes.append(False)
    if not args.no_stream:
        modes.append(True)

    cases: list[dict] = ALL_TEST_CASES
    if args.test:
        name_filter = args.test.lower()
        cases = [c for c in cases if name_filter in str(c["name"]).lower()]
        if not cases:
            print_fail(f"No test cases matched '{args.test}'")
            sys.exit(1)

    total = 0
    passed = 0
    for stream in modes:
        for case in cases:
            total += 1
            if run_test(url, case, stream=stream):
                passed += 1

    color = GREEN if passed == total else RED
    _print(f"\n{BOLD}{color}{'─' * 60}{RESET}")
    _print(f"{BOLD}{color}  Results: {passed}/{total} passed{RESET}")
    _print(f"{BOLD}{color}{'─' * 60}{RESET}\n")
    sys.exit(0 if passed == total else 1)
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -1 +1 @@
|
||||
1c40d85a4dcfcd62176f649b8682433bb1a6caef
|
||||
49f84a924f6ea4fc2ef73dbbd8cc4d734b54bd6d
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
HTTPLIB_VERSION = "refs/tags/v0.43.1"
|
||||
HTTPLIB_VERSION = "refs/tags/v0.42.0"
|
||||
|
||||
vendor = {
|
||||
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
|
||||
|
||||
@@ -109,7 +109,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
|
||||
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
|
||||
{ LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
|
||||
{ LLM_ARCH_HUNYUAN_VL, "hunyuan_vl" },
|
||||
{ LLM_ARCH_SMOLLM3, "smollm3" },
|
||||
{ LLM_ARCH_OPENAI_MOE, "gpt-oss" },
|
||||
{ LLM_ARCH_LFM2, "lfm2" },
|
||||
@@ -251,7 +250,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
|
||||
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
|
||||
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
|
||||
{ LLM_KV_ROPE_SCALING_ALPHA, "%s.rope.scaling.alpha" },
|
||||
{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
|
||||
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
|
||||
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
|
||||
|
||||
@@ -113,7 +113,6 @@ enum llm_arch {
|
||||
LLM_ARCH_ERNIE4_5_MOE,
|
||||
LLM_ARCH_HUNYUAN_MOE,
|
||||
LLM_ARCH_HUNYUAN_DENSE,
|
||||
LLM_ARCH_HUNYUAN_VL,
|
||||
LLM_ARCH_SMOLLM3,
|
||||
LLM_ARCH_OPENAI_MOE,
|
||||
LLM_ARCH_LFM2,
|
||||
@@ -255,7 +254,6 @@ enum llm_kv {
|
||||
LLM_KV_ROPE_SCALE_LINEAR,
|
||||
LLM_KV_ROPE_SCALING_TYPE,
|
||||
LLM_KV_ROPE_SCALING_FACTOR,
|
||||
LLM_KV_ROPE_SCALING_ALPHA,
|
||||
LLM_KV_ROPE_SCALING_ATTN_FACTOR,
|
||||
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
|
||||
LLM_KV_ROPE_SCALING_FINETUNED,
|
||||
|
||||
+137
-9
@@ -2636,7 +2636,7 @@ void llama_context::perf_reset() {
|
||||
n_reused = 0;
|
||||
}
|
||||
|
||||
llama_memory_breakdown llama_context::memory_breakdown() const {
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
|
||||
for (const auto & [buft, size] : model.memory_breakdown()) {
|
||||
ret[buft].model += size;
|
||||
@@ -3493,6 +3493,142 @@ void llama_perf_context_reset(llama_context * ctx) {
|
||||
ctx->perf_reset();
|
||||
}
|
||||
|
||||
void llama_memory_breakdown_print(const struct llama_context * ctx) {
|
||||
const auto & devices = ctx->get_model().devices;
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
||||
|
||||
std::vector<std::array<std::string, 9>> table_data;
|
||||
table_data.reserve(devices.size());
|
||||
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
|
||||
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
|
||||
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
|
||||
|
||||
table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
|
||||
|
||||
constexpr size_t MiB = 1024 * 1024;
|
||||
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
|
||||
|
||||
// track seen buffer types to avoid double counting:
|
||||
std::set<ggml_backend_buffer_type_t> seen_buffer_types;
|
||||
|
||||
// accumulative memory breakdown for each device and for host:
|
||||
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
|
||||
llama_memory_breakdown_data mb_host;
|
||||
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
mb_host.model += mb.model;
|
||||
mb_host.context += mb.context;
|
||||
mb_host.compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (dev) {
|
||||
int i_dev = -1;
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (devices[i].dev == dev) {
|
||||
i_dev = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i_dev != -1) {
|
||||
mb_dev[i_dev].model += mb.model;
|
||||
mb_dev[i_dev].context += mb.context;
|
||||
mb_dev[i_dev].compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// print memory breakdown for each device:
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
ggml_backend_dev_t dev = devices[i].dev;
|
||||
llama_memory_breakdown_data mb = mb_dev[i];
|
||||
|
||||
const std::string name = ggml_backend_dev_name(dev);
|
||||
std::string desc = ggml_backend_dev_description(dev);
|
||||
for (const std::string & prefix : desc_prefixes_strip) {
|
||||
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
|
||||
desc = desc.substr(prefix.length());
|
||||
}
|
||||
}
|
||||
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
const size_t unaccounted = total - self - free;
|
||||
|
||||
table_data.push_back({
|
||||
template_gpu,
|
||||
" - " + name + " (" + desc + ")",
|
||||
std::to_string(total / MiB),
|
||||
std::to_string(free / MiB),
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
std::to_string(unaccounted / MiB)});
|
||||
}
|
||||
|
||||
// print memory breakdown for host:
|
||||
{
|
||||
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - Host",
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb_host.model / MiB),
|
||||
std::to_string(mb_host.context / MiB),
|
||||
std::to_string(mb_host.compute / MiB),
|
||||
""}); // unaccounted
|
||||
}
|
||||
|
||||
// print memory breakdown for all remaining buffer types:
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (seen_buffer_types.count(buft) == 1) {
|
||||
continue;
|
||||
}
|
||||
const std::string name = ggml_backend_buft_name(buft);
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - " + name,
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
""}); // unaccounted
|
||||
seen_buffer_types.insert(buft);
|
||||
}
|
||||
|
||||
for (size_t j = 1; j < table_data[0].size(); j++) {
|
||||
size_t max_len = 0;
|
||||
for (const auto & td : table_data) {
|
||||
max_len = std::max(max_len, td[j].length());
|
||||
}
|
||||
for (auto & td : table_data) {
|
||||
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
|
||||
}
|
||||
}
|
||||
for (const auto & td : table_data) {
|
||||
LLAMA_LOG_INFO(td[0].c_str(),
|
||||
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
|
||||
td[6].c_str(), td[7].c_str(), td[8].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// training
|
||||
//
|
||||
@@ -3523,11 +3659,3 @@ void llama_opt_epoch(
|
||||
callback_train,
|
||||
callback_eval);
|
||||
}
|
||||
|
||||
//
|
||||
// ext
|
||||
//
|
||||
|
||||
llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx) {
|
||||
return ctx->memory_breakdown();
|
||||
}
|
||||
|
||||
+12
-2
@@ -1,7 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-ext.h"
|
||||
#include "llama-cparams.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-adapter.h"
|
||||
@@ -23,6 +22,17 @@ class llama_io_write_i;
|
||||
struct llama_memory_i;
|
||||
struct llama_memory_context_i;
|
||||
|
||||
// Byte counts of physical memory used within a single buffer type,
// split by what the memory was allocated for.
struct llama_memory_breakdown_data {
    size_t model   = 0; // bytes holding the model
    size_t context = 0; // bytes holding the context
    size_t compute = 0; // bytes holding temporary compute buffers

    // sum of all three categories, in bytes
    size_t total() const {
        const size_t persistent = model + context;
        return persistent + compute;
    }
};
|
||||
|
||||
struct llama_context {
|
||||
// init scheduler and compute buffers, reserve worst-case graphs
|
||||
llama_context(
|
||||
@@ -162,7 +172,7 @@ struct llama_context {
|
||||
llama_perf_context_data perf_get_data() const;
|
||||
void perf_reset();
|
||||
|
||||
llama_memory_breakdown memory_breakdown() const;
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
|
||||
|
||||
//
|
||||
// training
|
||||
|
||||
+1
-35
@@ -1,12 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
// this is a staging header for new llama.cpp API
|
||||
// breaking changes and C++ are allowed. everything here should be considered WIP
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
|
||||
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
|
||||
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
||||
@@ -18,6 +14,7 @@ LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
||||
// Get the default ggml_type for a given ftype.
|
||||
LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype);
|
||||
|
||||
// Quantization state.
|
||||
struct quantize_state_impl;
|
||||
|
||||
LLAMA_API quantize_state_impl * llama_quant_init(
|
||||
@@ -57,34 +54,3 @@ LLAMA_API void llama_quant_compute_types(
|
||||
ggml_tensor ** tensors,
|
||||
ggml_type * result_types,
|
||||
size_t n_tensors);
|
||||
|
||||
//
|
||||
// device memory querying
|
||||
//
|
||||
|
||||
// "memory" as in physical memory for a buffer type, in bytes
|
||||
struct llama_memory_breakdown_data {
|
||||
size_t model = 0; // memory allocated for the model
|
||||
size_t context = 0; // memory allocated for the context
|
||||
size_t compute = 0; // memory allocated for temporary compute buffers
|
||||
|
||||
size_t total() const {
|
||||
return model + context + compute;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_device_memory_data {
|
||||
int64_t total;
|
||||
int64_t free;
|
||||
llama_memory_breakdown_data mb;
|
||||
};
|
||||
|
||||
// TODO: convert to C-style data structure
|
||||
using llama_memory_breakdown = std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data>;
|
||||
|
||||
LLAMA_API int32_t llama_model_n_expert (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);
|
||||
|
||||
LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);
|
||||
|
||||
LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
|
||||
|
||||
@@ -116,7 +116,6 @@ struct llama_hparams {
|
||||
float rope_freq_base_train_swa = 10000.0f;
|
||||
float rope_freq_scale_train;
|
||||
float rope_freq_scale_train_swa = 1.0f;
|
||||
float rope_scaling_alpha = 0.0f; // NTK-aware alpha for XDRoPE
|
||||
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
float rope_yarn_log_mul = 0.0f;
|
||||
|
||||
+3
-53
@@ -1,7 +1,6 @@
|
||||
#include "llama-model.h"
|
||||
|
||||
#include "llama-arch.h"
|
||||
#include "llama-ext.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-mmap.h"
|
||||
@@ -78,23 +77,11 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
const ggml_tensor * tensor_axis_0;
|
||||
|
||||
uint32_t il;
|
||||
size_t rotation; // when assigning tensor slices, rotate how the rounding is done for more even allocation
|
||||
size_t rotation;
|
||||
};
|
||||
|
||||
auto get_tensor_config_impl = [&](
|
||||
const ggml_backend_meta_split_axis axis, const std::string & suffix = "", const std::string & suffix_fallback = "") -> tensor_config {
|
||||
// the layers in a tensor can be inhomogeneous, if the pattern is cleanly divided by the number of GPUs there can be aliasing effects,
|
||||
// count only the same type of previous layers to avoid this
|
||||
auto get_il_eff = [&](const size_t il){
|
||||
size_t ret = 0;
|
||||
const bool il_is_recurrent = hparams.is_recurrent(il);
|
||||
const bool il_is_swa = hparams.is_swa(il);
|
||||
for (size_t il_prev = 0; il_prev < il; il_prev++) {
|
||||
ret += hparams.is_recurrent(il_prev) == il_is_recurrent && hparams.is_swa(il_prev) == il_is_swa;
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
|
||||
uint32_t il;
|
||||
std::string prefix;
|
||||
size_t rotation;
|
||||
@@ -103,13 +90,13 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
GGML_ASSERT(length_prefix != std::string::npos);
|
||||
prefix = tensor_name.substr(0, length_prefix + 1);
|
||||
il = std::stoull(tensor_name.substr(4, length_prefix));
|
||||
rotation = get_il_eff(il) % ud->n_devices;
|
||||
rotation = il % ud->n_devices;
|
||||
} else if (tensor_name.substr(0, 6) == "cache_") {
|
||||
const size_t layer_index_start = tensor_name.find("_l", 6);
|
||||
GGML_ASSERT(layer_index_start != std::string::npos);
|
||||
il = std::stoull(tensor_name.substr(layer_index_start + 2));
|
||||
prefix = "blk." + std::to_string(il) + ".";
|
||||
rotation = get_il_eff(il) % ud->n_devices;
|
||||
rotation = il % ud->n_devices;
|
||||
} else {
|
||||
il = 0;
|
||||
rotation = hparams.n_layer % ud->n_devices;
|
||||
@@ -737,13 +724,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);
|
||||
|
||||
if (arch == LLM_ARCH_HUNYUAN_VL || arch == LLM_ARCH_HUNYUAN_DENSE) {
|
||||
if (hparams.n_expert <= 1) {
|
||||
hparams.n_expert = 0;
|
||||
hparams.n_expert_used = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
|
||||
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd);
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl);
|
||||
@@ -822,7 +802,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_ALPHA, hparams.rope_scaling_alpha, false);
|
||||
|
||||
// non-transformer models do not have attention heads
|
||||
if (hparams.n_head() > 0) {
|
||||
@@ -2600,18 +2579,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_HUNYUAN_VL:
|
||||
case LLM_ARCH_HUNYUAN_DENSE:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
|
||||
|
||||
// XDRoPE / NTK-aware scaling: base = rope_theta * alpha^(dim / (dim - 2))
|
||||
if (hparams.rope_scaling_alpha > 0.0f) {
|
||||
const int dim = hparams.n_embd_head_k();
|
||||
hparams.rope_freq_base_train = hparams.rope_freq_base_train
|
||||
* powf(hparams.rope_scaling_alpha, (float)dim / (float)(dim - 2));
|
||||
}
|
||||
|
||||
switch (hparams.n_embd) {
|
||||
case 1024: type = LLM_TYPE_0_5B; break;
|
||||
@@ -6964,7 +6934,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_HUNYUAN_VL:
|
||||
case LLM_ARCH_HUNYUAN_DENSE:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
@@ -8985,7 +8954,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_HUNYUAN_VL:
|
||||
case LLM_ARCH_HUNYUAN_DENSE:
|
||||
{
|
||||
llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
|
||||
@@ -9335,9 +9303,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GLM4_MOE:
|
||||
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
case LLM_ARCH_HUNYUAN_VL:
|
||||
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
// all model arches should be listed explicitly here
|
||||
case LLM_ARCH_UNKNOWN:
|
||||
GGML_ABORT("unknown architecture");
|
||||
@@ -9472,18 +9437,3 @@ bool llama_model_is_diffusion(const llama_model * model) {
|
||||
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
|
||||
return model->tensors_by_name;
|
||||
}
|
||||
|
||||
int32_t llama_model_n_expert(const struct llama_model * model) {
|
||||
return model->hparams.n_expert;
|
||||
}
|
||||
|
||||
int32_t llama_model_n_devices(const struct llama_model * model) {
|
||||
return (int32_t)model->devices.size();
|
||||
}
|
||||
|
||||
ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i) {
|
||||
if (i < 0 || i >= (int)model->devices.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
return model->devices[i].dev;
|
||||
}
|
||||
|
||||
+760
@@ -46,6 +46,766 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
struct llama_device_memory_data {
|
||||
int64_t total;
|
||||
int64_t free;
|
||||
llama_memory_breakdown_data mb;
|
||||
};
|
||||
|
||||
static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
||||
const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
|
||||
std::vector<llama_device> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
|
||||
const ggml_log_level log_level) {
|
||||
struct user_data_t {
|
||||
struct {
|
||||
ggml_log_callback callback;
|
||||
void * user_data;
|
||||
} original_logger;
|
||||
ggml_log_level min_level; // prints below this log level go to debug log
|
||||
};
|
||||
user_data_t ud;
|
||||
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
||||
ud.min_level = log_level;
|
||||
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
||||
const user_data_t * ud = (const user_data_t *) user_data;
|
||||
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
||||
}, &ud);
|
||||
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
mparams_copy.no_alloc = true;
|
||||
mparams_copy.use_mmap = false;
|
||||
mparams_copy.use_mlock = false;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
||||
if (model == nullptr) {
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to load model");
|
||||
}
|
||||
|
||||
llama_context * ctx = llama_init_from_model(model, *cparams);
|
||||
if (ctx == nullptr) {
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to create llama_context from model");
|
||||
}
|
||||
|
||||
const size_t nd = model->n_devices();
|
||||
std::vector<llama_device_memory_data> ret(nd + 1);
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
||||
|
||||
for (const auto & [buft, mb] : memory_breakdown) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
ret.back().mb.model += mb.model;
|
||||
ret.back().mb.context += mb.context;
|
||||
ret.back().mb.compute += mb.compute;
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (!dev) {
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
if (model->devices[i].dev == dev) {
|
||||
ret[i].mb.model += mb.model;
|
||||
ret[i].mb.context += mb.context;
|
||||
ret[i].mb.compute += mb.compute;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
||||
ret.back().free = free;
|
||||
ret.back().total = total;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(model->devices[i].dev, &free, &total);
|
||||
|
||||
// devices can return 0 bytes for free and total memory if they do not
|
||||
// have any to report. in this case, we will use the host memory as a fallback
|
||||
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
||||
if (free == 0 && total == 0) {
|
||||
free = ret.back().free;
|
||||
total = ret.back().total;
|
||||
}
|
||||
ret[i].free = free;
|
||||
ret[i].total = total;
|
||||
}
|
||||
|
||||
devs = model->devices;
|
||||
hp_ngl = model->hparams.n_layer;
|
||||
hp_n_ctx_train = model->hparams.n_ctx_train;
|
||||
hp_n_expert = model->hparams.n_expert;
|
||||
|
||||
llama_memory_breakdown_print(ctx); // goes to debug log
|
||||
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// enum to identify part of a layer for distributing its tensors:
|
||||
// Identifies which part of a layer is selected when distributing its tensors.
// The enumerators are cumulative and number consecutively from 0.
enum layer_fraction_t {
    LAYER_FRACTION_NONE, // 0: nothing
    LAYER_FRACTION_ATTN, // 1: attention
    LAYER_FRACTION_UP,   // 2: attention + up
    LAYER_FRACTION_GATE, // 3: attention + up + gate
    LAYER_FRACTION_MOE,  // 4: everything but sparse MoE weights
};
|
||||
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
|
||||
|
||||
// Dedicated exception type for parameter fitting; it adds nothing to
// std::runtime_error beyond a distinct type and reuses its constructors.
class llama_params_fit_exception : public std::runtime_error {
  public:
    using std::runtime_error::runtime_error;
};
|
||||
|
||||
static void llama_params_fit_impl(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
|
||||
throw llama_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
|
||||
}
|
||||
constexpr int64_t MiB = 1024*1024;
|
||||
typedef std::vector<llama_device_memory_data> dmds_t;
|
||||
const llama_model_params default_mparams = llama_model_default_params();
|
||||
|
||||
std::vector<llama_device> devs;
|
||||
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
|
||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
margins.reserve(nd);
|
||||
if (nd == 0) {
|
||||
margins.push_back(margins_s[0]);
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
margins.push_back(margins_s[id]);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> dev_names;
|
||||
{
|
||||
dev_names.reserve(nd);
|
||||
size_t max_length = 0;
|
||||
for (const llama_device & dev : devs) {
|
||||
std::string name = ggml_backend_dev_name(dev.dev);
|
||||
name += " (";
|
||||
name += ggml_backend_dev_description(dev.dev);
|
||||
name += ")";
|
||||
dev_names.push_back(name);
|
||||
max_length = std::max(max_length, name.length());
|
||||
}
|
||||
for (std::string & dn : dev_names) {
|
||||
dn.insert(dn.end(), max_length - dn.length(), ' ');
|
||||
}
|
||||
}
|
||||
|
||||
int64_t sum_free = 0;
|
||||
int64_t sum_projected_free = 0;
|
||||
int64_t sum_projected_used = 0;
|
||||
int64_t sum_projected_model = 0;
|
||||
std::vector<int64_t> projected_free_per_device;
|
||||
projected_free_per_device.reserve(nd);
|
||||
|
||||
if (nd == 0) {
|
||||
sum_projected_used = dmds_full.back().mb.total();
|
||||
sum_free = dmds_full.back().total;
|
||||
sum_projected_free = sum_free - sum_projected_used;
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (sum_projected_free >= margins[0]) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
|
||||
__func__, sum_projected_free/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
projected_free_per_device.push_back(projected_free);
|
||||
|
||||
sum_free += dmd.free;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
}
|
||||
}
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bool changes_needed = false;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (projected_free_per_device[id] < margins[id]) {
|
||||
changes_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// step 2: try reducing memory use by reducing the context size
|
||||
|
||||
{
|
||||
int64_t global_surplus = sum_projected_free;
|
||||
if (nd == 0) {
|
||||
global_surplus -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus -= margins[id];
|
||||
}
|
||||
}
|
||||
if (global_surplus < 0) {
|
||||
if (nd <= 1) {
|
||||
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
||||
__func__, margins[0]/MiB, -global_surplus/MiB);
|
||||
} else {
|
||||
LLAMA_LOG_INFO(
|
||||
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, -global_surplus/MiB);
|
||||
}
|
||||
if (cparams->n_ctx == 0) {
|
||||
if (hp_nct > n_ctx_min) {
|
||||
int64_t sum_used_target = sum_free;
|
||||
if (nd == 0) {
|
||||
sum_used_target -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_used_target -= margins[id];
|
||||
}
|
||||
}
|
||||
if (nd > 1) {
|
||||
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
||||
// - for dense models only whole layers can be assigned to devices
|
||||
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
|
||||
// - on average we expect a waste of 0.5 layers/tensors per device
|
||||
// - use slightly more than the expected average for nd devices to be safe
|
||||
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
|
||||
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
|
||||
}
|
||||
|
||||
int64_t sum_projected_used_min_ctx = 0;
|
||||
cparams->n_ctx = n_ctx_min;
|
||||
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
if (nd == 0) {
|
||||
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
|
||||
}
|
||||
}
|
||||
if (sum_used_target > sum_projected_used_min_ctx) {
|
||||
// linear interpolation between minimum and maximum context size:
|
||||
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
|
||||
/ (sum_projected_used - sum_projected_used_min_ctx);
|
||||
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
|
||||
|
||||
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
|
||||
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
|
||||
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
if (nd <= 1) {
|
||||
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
|
||||
return;
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
|
||||
} else {
|
||||
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
|
||||
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
}
|
||||
} else {
|
||||
if (n_ctx_min == UINT32_MAX) {
|
||||
LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
||||
__func__, hp_nct, n_ctx_min);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nd == 0) {
|
||||
throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
|
||||
}
|
||||
|
||||
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
|
||||
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
|
||||
}
|
||||
if (nd > 1) {
|
||||
if (!tensor_split) {
|
||||
throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
|
||||
}
|
||||
if (mparams->tensor_split) {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (mparams->tensor_split[id] != 0.0f) {
|
||||
throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
|
||||
}
|
||||
}
|
||||
if (!tensor_buft_overrides) {
|
||||
throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
|
||||
}
|
||||
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
|
||||
throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
|
||||
}
|
||||
|
||||
// step 3: iteratively fill the back to front with "dense" layers
|
||||
// - for a dense model simply fill full layers, giving each device a contiguous slice of the model
|
||||
// - for a MoE model, same as dense model but with all MoE tensors in system memory
|
||||
|
||||
// utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
|
||||
auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
    // Returns a regex (as a C string) selecting the tensors of layer `il` that overflow for layer fraction `lf`.
    // The strings are built on first use and stored in function-local static tables (one table per fraction),
    // so the returned pointer refers to static storage.
    // Throws std::runtime_error if `il` does not fit into the tables.
    constexpr size_t n_strings = 1000;
    if (!(il < n_strings)) {
        throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
    }

    using pattern_table = std::array<std::string, n_strings>;

    // fill the slot for layer `il` the first time it is requested, then hand out the cached string
    const auto lookup = [il](pattern_table & table, const char * tensor_regex) -> const char * {
        std::string & slot = table[il];
        if (slot.empty()) {
            slot = "blk\\." + std::to_string(il) + tensor_regex;
        }
        return slot.c_str();
    };

    switch (lf) {
        case LAYER_FRACTION_ATTN: {
            static pattern_table patterns_attn;
            return lookup(patterns_attn, "\\.ffn_(gate|up|gate_up|down).*");
        }
        case LAYER_FRACTION_UP: {
            static pattern_table patterns_up;
            return lookup(patterns_up, "\\.ffn_(gate|gate_up|down).*");
        }
        case LAYER_FRACTION_GATE: {
            static pattern_table patterns_gate;
            return lookup(patterns_gate, "\\.ffn_down.*");
        }
        case LAYER_FRACTION_MOE: {
            static pattern_table patterns_moe;
            return lookup(patterns_moe, "\\.ffn_(up|down|gate_up|gate)_(ch|)exps");
        }
        default:
            GGML_ABORT("fatal error");
    }
};
|
||||
|
||||
struct ngl_t {
|
||||
uint32_t n_layer = 0; // number of total layers
|
||||
uint32_t n_part = 0; // number of partial layers, <= n_layer
|
||||
|
||||
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
|
||||
layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
|
||||
|
||||
uint32_t n_full() const {
|
||||
assert(n_layer >= n_part);
|
||||
return n_layer - n_part;
|
||||
}
|
||||
};
|
||||
|
||||
const size_t ntbo = llama_max_tensor_buft_overrides();
|
||||
|
||||
// utility function to set n_gpu_layers and tensor_split
|
||||
auto set_ngl_tensor_split_tbo = [&](
|
||||
const std::vector<ngl_t> & ngl_per_device,
|
||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
|
||||
llama_model_params & mparams) {
|
||||
mparams.n_gpu_layers = 0;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
|
||||
if (nd > 1) {
|
||||
tensor_split[id] = ngl_per_device[id].n_layer;
|
||||
}
|
||||
}
|
||||
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
|
||||
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
|
||||
|
||||
mparams.tensor_split = tensor_split;
|
||||
|
||||
size_t itbo = 0;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
il0 += ngl_per_device[id].n_full();
|
||||
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
|
||||
if (itbo + 1 >= ntbo) {
|
||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
||||
tensor_buft_overrides[itbo].buft = nullptr;
|
||||
itbo++;
|
||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
||||
throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
|
||||
+ std::to_string(ntbo) + " is insufficient for model");
|
||||
}
|
||||
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
|
||||
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
|
||||
itbo++;
|
||||
}
|
||||
il0 += ngl_per_device[id].n_part;
|
||||
}
|
||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
||||
tensor_buft_overrides[itbo].buft = nullptr;
|
||||
itbo++;
|
||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
||||
};
|
||||
|
||||
// utility function that returns the memory use per device for given numbers of layers per device
|
||||
auto get_memory_for_layers = [&](
|
||||
const char * func_name,
|
||||
const std::vector<ngl_t> & ngl_per_device,
|
||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
|
||||
|
||||
const dmds_t dmd_nl = llama_get_device_memory_data(
|
||||
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const ngl_t & n = ngl_per_device[id];
|
||||
LLAMA_LOG_DEBUG(
|
||||
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
|
||||
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
|
||||
}
|
||||
|
||||
std::vector<int64_t> ret;
|
||||
ret.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
ret.push_back(dmd_nl[id].mb.total());
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
|
||||
int64_t global_surplus_cpu_moe = 0;
|
||||
if (hp_nex > 0) {
|
||||
const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
|
||||
ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
|
||||
tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
|
||||
tensor_buft_overrides[1] = {nullptr, nullptr};
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
|
||||
const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
|
||||
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
|
||||
}
|
||||
|
||||
if (global_surplus_cpu_moe > 0) {
|
||||
LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
|
||||
__func__, global_surplus_cpu_moe/MiB);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
|
||||
__func__, -global_surplus_cpu_moe/MiB);
|
||||
}
|
||||
|
||||
// reset
|
||||
tensor_buft_overrides[0] = {nullptr, nullptr};
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
}
|
||||
|
||||
std::vector<int64_t> targets; // maximum acceptable memory use per device
|
||||
targets.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
targets.push_back(dmds_full[id].free - margins[id]);
|
||||
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
||||
}
|
||||
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
|
||||
overflow_bufts.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
|
||||
}
|
||||
|
||||
std::vector<ngl_t> ngl_per_device(nd);
|
||||
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
|
||||
|
||||
// optimize the number of layers per device using the method of false position:
|
||||
// - ngl_per_device has 0 layers for each device, lower bound
|
||||
// - try a "high" configuration where a device is given all unassigned layers
|
||||
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
|
||||
// - check memory use of our guess, replace either the low or high bound
|
||||
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
|
||||
// - the last device has the output layer, which cannot be a partial layer
|
||||
if (hp_nex == 0) {
|
||||
LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
|
||||
}
|
||||
for (int id = nd - 1; id >= 0; id--) {
|
||||
uint32_t n_unassigned = hp_ngl + 1;
|
||||
for (size_t jd = id + 1; jd < nd; ++jd) {
|
||||
assert(n_unassigned >= ngl_per_device[jd].n_layer);
|
||||
n_unassigned -= ngl_per_device[jd].n_layer;
|
||||
}
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
ngl_per_device_high[id].n_layer = n_unassigned;
|
||||
if (hp_nex > 0) {
|
||||
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
|
||||
}
|
||||
if (ngl_per_device_high[id].n_layer > 0) {
|
||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
|
||||
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
step_size = std::min(step_size, delta - 1);
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
ngl_per_device_test[id].n_layer += step_size;
|
||||
if (hp_nex) {
|
||||
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
|
||||
step_size - 1 : step_size; // the first layer is the output layer which must always be full
|
||||
}
|
||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
|
||||
if (mem_test[id] <= targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
|
||||
}
|
||||
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
}
|
||||
} else {
|
||||
assert(ngl_per_device_high[id].n_layer == n_unassigned);
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LLAMA_LOG_INFO(
|
||||
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
||||
return;
|
||||
}
|
||||
|
||||
// step 4: for a MoE model where all dense tensors fit,
|
||||
// convert the dense-only layers in the back to full layers in the front until all devices are full
|
||||
// essentially the same procedure as for the dense-only layers except front-to-back
|
||||
// also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
|
||||
|
||||
size_t id_dense_start = nd;
|
||||
for (int id = nd - 1; id >= 0; id--) {
|
||||
if (ngl_per_device[id].n_layer > 0) {
|
||||
id_dense_start = id;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
assert(id_dense_start < nd);
|
||||
|
||||
LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
|
||||
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
for (size_t jd = id_dense_start; jd < nd; jd++) {
|
||||
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
|
||||
ngl_per_device_high[id].n_layer += n_layer_move;
|
||||
ngl_per_device_high[jd].n_layer -= n_layer_move;
|
||||
ngl_per_device_high[jd].n_part = 0;
|
||||
}
|
||||
size_t id_dense_start_high = nd - 1;
|
||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
||||
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
||||
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
step_size = std::min(step_size, delta - 1);
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
size_t id_dense_start_test = id_dense_start;
|
||||
uint32_t n_converted_test = 0;
|
||||
for (;id_dense_start_test < nd; id_dense_start_test++) {
|
||||
const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
|
||||
ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
|
||||
ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
|
||||
ngl_per_device_test[id].n_layer += n_convert_jd;
|
||||
n_converted_test += n_convert_jd;
|
||||
|
||||
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
|
||||
if (mem_test[id] <= targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
id_dense_start_high = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
|
||||
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
|
||||
}
|
||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
||||
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
||||
}
|
||||
} else {
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
id_dense_start = id_dense_start_high;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
|
||||
// try to fit at least part of one more layer
|
||||
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
size_t id_dense_start_test = id_dense_start;
|
||||
ngl_per_device_test[id_dense_start_test].n_layer--;
|
||||
ngl_per_device_test[id_dense_start_test].n_part--;
|
||||
ngl_per_device_test[id].n_layer++;
|
||||
ngl_per_device_test[id].n_part++;
|
||||
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
|
||||
id_dense_start_test++;
|
||||
}
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
|
||||
if (id < nd - 1) {
|
||||
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1].dev);
|
||||
}
|
||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
|
||||
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
|
||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
} else {
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
|
||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LLAMA_LOG_INFO(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
|
||||
// print info for devices that were not changed during the conversion from dense only to full layers:
|
||||
for (size_t id = id_dense_start + 1; id < nd; id++) {
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LLAMA_LOG_INFO(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
||||
}
|
||||
|
||||
enum llama_params_fit_status llama_params_fit(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
const int64_t t0_us = llama_time_us();
|
||||
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
|
||||
try {
|
||||
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
|
||||
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
|
||||
} catch (const llama_params_fit_exception & e) {
|
||||
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
||||
status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
|
||||
} catch (const std::runtime_error & e) {
|
||||
LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
|
||||
status = LLAMA_PARAMS_FIT_STATUS_ERROR;
|
||||
}
|
||||
const int64_t t1_us = llama_time_us();
|
||||
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
|
||||
return status;
|
||||
}
|
||||
|
||||
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
|
||||
struct llama_sampler_chain_params result = {
|
||||
/*.no_perf =*/ true,
|
||||
|
||||
@@ -6,11 +6,6 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
const bool use_mrope = hparams.use_mrope();
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
@@ -42,36 +37,22 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
if (use_mrope) {
|
||||
Qcur = ggml_rope_multi(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_multi(
|
||||
ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
} else {
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
}
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = build_norm(Kcur,
|
||||
model.layers[il].attn_k_norm, nullptr,
|
||||
LLM_NORM_RMS, il);
|
||||
|
||||
@@ -155,8 +155,6 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
|
||||
llama_build_and_test(test-grammar-integration.cpp)
|
||||
llama_build_and_test(test-llama-grammar.cpp)
|
||||
llama_build_and_test(test-chat.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
|
||||
target_include_directories(test-chat PRIVATE ${PROJECT_SOURCE_DIR}/tools/server)
|
||||
target_link_libraries(test-chat PRIVATE server-context)
|
||||
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
|
||||
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
|
||||
|
||||
@@ -3522,40 +3522,6 @@ struct test_add_rms_norm : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_UNARY(RELU) + GGML_OP_SQR (fused operation)
|
||||
struct test_relu_sqr : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
return "RELU_SQR";
|
||||
}
|
||||
|
||||
bool run_whole_graph() override { return true; }
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR2(type, ne);
|
||||
}
|
||||
|
||||
test_relu_sqr(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {128, 2, 2, 2})
|
||||
: type(type), ne(ne) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
ggml_tensor * r = ggml_relu(ctx, a);
|
||||
ggml_set_name(r, "relu");
|
||||
|
||||
ggml_tensor * out = ggml_sqr(ctx, r);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_SSM_CONV
|
||||
struct test_ssm_conv : public test_case {
|
||||
const ggml_type type;
|
||||
@@ -7345,12 +7311,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
}
|
||||
|
||||
// fused relu + sqr (squared ReLU)
|
||||
for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
test_cases.emplace_back(new test_relu_sqr(type, { 128, 2, 2, 2 }));
|
||||
test_cases.emplace_back(new test_relu_sqr(type, { 5, 7, 11, 13 }));
|
||||
}
|
||||
|
||||
// glu ops
|
||||
for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
for (int v : {0, 1}) {
|
||||
|
||||
+2
-223
@@ -7,7 +7,6 @@
|
||||
//
|
||||
#include "../src/llama-grammar.h"
|
||||
#include "../src/unicode.h"
|
||||
#include "../tools/server/server-chat.h"
|
||||
#include "chat-auto-parser.h"
|
||||
#include "chat.h"
|
||||
#include "common.h"
|
||||
@@ -1515,117 +1514,6 @@ static void test_tools_oaicompat_json_conversion() {
|
||||
common_chat_tools_to_json_oaicompat({ special_function_tool }).dump(2));
|
||||
}
|
||||
|
||||
static void test_convert_responses_to_chatcmpl() {
|
||||
LOG_DBG("%s\n", __func__);
|
||||
|
||||
// Test basic conversion with input messages (user/assistant alternating)
|
||||
{
|
||||
json input = json::parse(R"({
|
||||
"input": [
|
||||
{
|
||||
"type": "message",
|
||||
"role": "user",
|
||||
"content": "hi wassup"
|
||||
},
|
||||
{
|
||||
"type": "message",
|
||||
"role": "assistant",
|
||||
"content": "Hey! 👋 Not much, just here ready to chat. What's up with you? Anything I can help you with today?"
|
||||
},
|
||||
{
|
||||
"type": "message",
|
||||
"role": "user",
|
||||
"content": "hi"
|
||||
}
|
||||
],
|
||||
"model": "gpt-5-mini",
|
||||
"stream": false,
|
||||
"text": {},
|
||||
"reasoning": {
|
||||
"effort": "medium"
|
||||
}
|
||||
})");
|
||||
|
||||
json result = server_chat_convert_responses_to_chatcmpl(input);
|
||||
|
||||
// Verify messages were converted correctly
|
||||
assert_equals(true, result.contains("messages"));
|
||||
assert_equals(true, result.at("messages").is_array());
|
||||
assert_equals((size_t)3, result.at("messages").size());
|
||||
|
||||
// Check first message (user)
|
||||
const auto & msg0 = result.at("messages")[0];
|
||||
assert_equals(std::string("user"), msg0.at("role").get<std::string>());
|
||||
assert_equals(true, msg0.at("content").is_array());
|
||||
assert_equals(std::string("text"), msg0.at("content")[0].at("type").get<std::string>());
|
||||
assert_equals(std::string("hi wassup"), msg0.at("content")[0].at("text").get<std::string>());
|
||||
|
||||
// Check second message (assistant)
|
||||
const auto & msg1 = result.at("messages")[1];
|
||||
assert_equals(std::string("assistant"), msg1.at("role").get<std::string>());
|
||||
assert_equals(true, msg1.at("content").is_array());
|
||||
assert_equals(std::string("text"), msg1.at("content")[0].at("type").get<std::string>());
|
||||
assert_equals(std::string("Hey! 👋 Not much, just here ready to chat. What's up with you? Anything I can help you with today?"), msg1.at("content")[0].at("text").get<std::string>());
|
||||
|
||||
// Check third message (user)
|
||||
const auto & msg2 = result.at("messages")[2];
|
||||
assert_equals(std::string("user"), msg2.at("role").get<std::string>());
|
||||
assert_equals(true, msg2.at("content").is_array());
|
||||
assert_equals(std::string("text"), msg2.at("content")[0].at("type").get<std::string>());
|
||||
assert_equals(std::string("hi"), msg2.at("content")[0].at("text").get<std::string>());
|
||||
|
||||
// Verify other fields preserved
|
||||
assert_equals(std::string("gpt-5-mini"), result.at("model").get<std::string>());
|
||||
assert_equals(false, result.at("stream").get<bool>());
|
||||
}
|
||||
|
||||
// Test string input
|
||||
{
|
||||
json input = json::parse(R"({
|
||||
"input": "Hello, world!",
|
||||
"model": "test-model"
|
||||
})");
|
||||
|
||||
json result = server_chat_convert_responses_to_chatcmpl(input);
|
||||
|
||||
assert_equals((size_t)1, result.at("messages").size());
|
||||
const auto & msg = result.at("messages")[0];
|
||||
assert_equals(std::string("user"), msg.at("role").get<std::string>());
|
||||
assert_equals(std::string("Hello, world!"), msg.at("content").get<std::string>());
|
||||
}
|
||||
|
||||
// Test with instructions (system message)
|
||||
{
|
||||
json input = json::parse(R"({
|
||||
"input": "Hello",
|
||||
"instructions": "You are a helpful assistant.",
|
||||
"model": "test-model"
|
||||
})");
|
||||
|
||||
json result = server_chat_convert_responses_to_chatcmpl(input);
|
||||
|
||||
assert_equals((size_t)2, result.at("messages").size());
|
||||
const auto & sys_msg = result.at("messages")[0];
|
||||
assert_equals(std::string("system"), sys_msg.at("role").get<std::string>());
|
||||
assert_equals(std::string("You are a helpful assistant."), sys_msg.at("content").get<std::string>());
|
||||
}
|
||||
|
||||
// Test with max_output_tokens conversion
|
||||
{
|
||||
json input = json::parse(R"({
|
||||
"input": "Hello",
|
||||
"model": "test-model",
|
||||
"max_output_tokens": 100
|
||||
})");
|
||||
|
||||
json result = server_chat_convert_responses_to_chatcmpl(input);
|
||||
|
||||
assert_equals(true, result.contains("max_tokens"));
|
||||
assert_equals(false, result.contains("max_output_tokens"));
|
||||
assert_equals(100, result.at("max_tokens").get<int>());
|
||||
}
|
||||
}
|
||||
|
||||
static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
LOG_DBG("%s\n", __func__);
|
||||
|
||||
@@ -3707,51 +3595,6 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.run();
|
||||
}
|
||||
|
||||
// Reka Edge
|
||||
{
|
||||
auto tst = peg_tester("models/templates/Reka-Edge.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?")
|
||||
.enable_thinking(false)
|
||||
.expect(message_assist)
|
||||
.run();
|
||||
tst.test("I'm\nthinking</think>\n\nHello, world!\nWhat's up?")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
|
||||
.expect(message_assist_thoughts)
|
||||
.run();
|
||||
tst.test("<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n</tool_call>")
|
||||
.enable_thinking(false)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.run();
|
||||
tst.test("Hello, world!\nWhat's up?\n<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n</tool_call>")
|
||||
.enable_thinking(false)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_content)
|
||||
.run();
|
||||
tst.test("I'm\nthinking</think>\n\n<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n</tool_call>")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_thoughts)
|
||||
.run();
|
||||
tst.test("<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n</tool_call>\n<tool_call>\n{\"name\": \"special_function_with_opt\", \"arguments\": {\"arg1\": 1, \"arg2\": 2}}\n</tool_call>")
|
||||
.enable_thinking(false)
|
||||
.parallel_tool_calls(true)
|
||||
.tools({ special_function_tool, special_function_tool_with_optional_param })
|
||||
.expect_tool_calls({
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
|
||||
})
|
||||
.run();
|
||||
tst.test("<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg")
|
||||
.enable_thinking(false)
|
||||
.tools({ special_function_tool })
|
||||
.is_partial(true)
|
||||
.expect(message_assist_call_cutoff_args)
|
||||
.run();
|
||||
}
|
||||
|
||||
// Apriel 1.5
|
||||
{
|
||||
auto tst = peg_tester("models/templates/unsloth-Apriel-1.5.jinja", detailed_debug);
|
||||
@@ -4234,55 +4077,6 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
}
|
||||
}
|
||||
|
||||
static void test_reka_edge_common_path() {
|
||||
auto tmpls = read_templates("models/templates/Reka-Edge.jinja");
|
||||
|
||||
{
|
||||
common_chat_templates_inputs inputs;
|
||||
common_chat_msg system_msg;
|
||||
system_msg.role = "system";
|
||||
system_msg.content = "Use tools when needed.";
|
||||
|
||||
common_chat_msg tool_call_msg = simple_assist_msg("", "", "special_function", "{\"arg1\": 1}");
|
||||
|
||||
common_chat_msg tool_msg;
|
||||
tool_msg.role = "tool";
|
||||
tool_msg.tool_name = "special_function";
|
||||
tool_msg.tool_call_id = "call0";
|
||||
tool_msg.content = "Sunny";
|
||||
|
||||
inputs.messages = { system_msg, message_user, tool_call_msg, tool_msg, message_user };
|
||||
inputs.tools = { special_function_tool };
|
||||
inputs.enable_thinking = true;
|
||||
inputs.add_generation_prompt = true;
|
||||
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
|
||||
if (params.prompt.find("<tool_response>\nSunny\n</tool_response><sep>") == std::string::npos) {
|
||||
throw std::runtime_error("Reka Edge prompt did not render tool response history");
|
||||
}
|
||||
if (params.prompt.rfind("assistant: <think>\n") == std::string::npos) {
|
||||
throw std::runtime_error("Reka Edge prompt did not render thinking generation prompt");
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = {
|
||||
message_user,
|
||||
simple_assist_msg("The first point is")
|
||||
};
|
||||
inputs.add_generation_prompt = false;
|
||||
inputs.enable_thinking = false;
|
||||
inputs.chat_template_kwargs["continue_final_message"] = "true";
|
||||
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
if (string_ends_with(params.prompt, "<sep>")) {
|
||||
throw std::runtime_error("Reka Edge continue_final_message unexpectedly closed the assistant turn");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test the developer role to system workaround with a simple mock template
|
||||
static void test_developer_role_to_system_workaround() {
|
||||
LOG_DBG("%s\n", __func__);
|
||||
@@ -4403,7 +4197,7 @@ int main(int argc, char ** argv) {
|
||||
bool detailed_debug = false;
|
||||
bool only_run_filtered = false;
|
||||
|
||||
// Check for --template and --detailed flags
|
||||
// Check for --template flag
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
if (arg == "--template" && i + 1 < argc) {
|
||||
@@ -4428,20 +4222,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
#ifndef _WIN32
|
||||
// Check if any argument is a .jinja file (for template format detection mode)
|
||||
bool has_jinja_files = false;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
if (arg == "--detailed") {
|
||||
continue;
|
||||
}
|
||||
if (arg.size() >= 6 && arg.rfind(".jinja") == arg.size() - 6) {
|
||||
has_jinja_files = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (has_jinja_files) {
|
||||
if (argc > 1) {
|
||||
common_chat_templates_inputs inputs;
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
@@ -4474,9 +4255,7 @@ int main(int argc, char ** argv) {
|
||||
test_msg_diffs_compute();
|
||||
test_msgs_oaicompat_json_conversion();
|
||||
test_tools_oaicompat_json_conversion();
|
||||
test_convert_responses_to_chatcmpl();
|
||||
test_developer_role_to_system_workaround();
|
||||
test_reka_edge_common_path();
|
||||
test_template_output_peg_parsers(detailed_debug);
|
||||
std::cout << "\n[chat] All tests passed!" << '\n';
|
||||
}
|
||||
|
||||
+4
-7
@@ -2,7 +2,6 @@
|
||||
#include "common.h"
|
||||
#include "arg.h"
|
||||
#include "console.h"
|
||||
#include "fit.h"
|
||||
// #include "log.h"
|
||||
|
||||
#include "server-common.h"
|
||||
@@ -77,8 +76,8 @@ struct cli_context {
|
||||
// defaults.return_progress = true; // TODO: show progress
|
||||
|
||||
verbose_prompt = params.verbose_prompt;
|
||||
reasoning_budget = params.sampling.reasoning_budget_tokens;
|
||||
reasoning_budget_message = params.sampling.reasoning_budget_message;
|
||||
reasoning_budget = params.reasoning_budget;
|
||||
reasoning_budget_message = params.reasoning_budget_message;
|
||||
}
|
||||
|
||||
std::string generate_completion(result_timings & out_timings) {
|
||||
@@ -207,8 +206,6 @@ struct cli_context {
|
||||
auto meta = ctx_server.get_meta();
|
||||
auto & chat_params = meta.chat_params;
|
||||
|
||||
auto caps = common_chat_templates_get_caps(chat_params.tmpls.get());
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
|
||||
inputs.tools = {}; // TODO
|
||||
@@ -216,7 +213,7 @@ struct cli_context {
|
||||
inputs.json_schema = ""; // TODO
|
||||
inputs.grammar = ""; // TODO
|
||||
inputs.use_jinja = chat_params.use_jinja;
|
||||
inputs.parallel_tool_calls = caps["supports_parallel_tool_calls"];
|
||||
inputs.parallel_tool_calls = false;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
inputs.force_pure_content = chat_params.force_pure_content;
|
||||
@@ -650,7 +647,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// bump the log level to display timings
|
||||
common_log_set_verbosity_thold(LOG_LEVEL_INFO);
|
||||
common_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context());
|
||||
llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context());
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
#include "llama.h"
|
||||
#include "../src/llama-ext.h"
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "fit.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <cinttypes>
|
||||
#include <thread>
|
||||
|
||||
using namespace std::chrono_literals;
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
@@ -17,58 +19,49 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FIT_PARAMS)) {
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
auto mparams = common_model_params_to_llama(params);
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
if (!params.fit_params_print) {
|
||||
const common_params_fit_status status = common_fit_params(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
|
||||
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||
if (status != COMMON_PARAMS_FIT_STATUS_SUCCESS) {
|
||||
LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
LOG_INF("%s: printing fitted CLI arguments to stdout...\n", __func__);
|
||||
common_log_flush(common_log_main());
|
||||
printf("-c %" PRIu32 " -ngl %" PRIi32, cparams.n_ctx, mparams.n_gpu_layers);
|
||||
|
||||
size_t nd = llama_max_devices();
|
||||
while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) {
|
||||
nd--;
|
||||
}
|
||||
if (nd > 1) {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (id == 0) {
|
||||
printf(" -ts ");
|
||||
}
|
||||
printf("%s%" PRIu32, id > 0 ? "," : "", uint32_t(mparams.tensor_split[id]));
|
||||
}
|
||||
}
|
||||
|
||||
const size_t ntbo = llama_max_tensor_buft_overrides();
|
||||
bool any_tbo = false;
|
||||
for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) {
|
||||
if (itbo == 0) {
|
||||
printf(" -ot \"");
|
||||
}
|
||||
printf("%s%s=%s", itbo > 0 ? "," : "", mparams.tensor_buft_overrides[itbo].pattern, ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft));
|
||||
any_tbo = true;
|
||||
}
|
||||
printf("%s\n", any_tbo ? "\"" : "");
|
||||
} else {
|
||||
LOG_INF("%s: printing estimated memory in MiB to stdout (device, model, context, compute) ...\n", __func__);
|
||||
common_log_flush(common_log_main());
|
||||
|
||||
common_fit_print(params.model.path.c_str(), &mparams, &cparams);
|
||||
const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
|
||||
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||
if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
|
||||
LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
LOG_INF("%s: printing fitted CLI arguments to stdout...\n", __func__);
|
||||
common_log_flush(common_log_main());
|
||||
printf("-c %" PRIu32 " -ngl %" PRIi32, cparams.n_ctx, mparams.n_gpu_layers);
|
||||
|
||||
size_t nd = llama_max_devices();
|
||||
while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) {
|
||||
nd--;
|
||||
}
|
||||
if (nd > 1) {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (id == 0) {
|
||||
printf(" -ts ");
|
||||
}
|
||||
printf("%s%" PRIu32, id > 0 ? "," : "", uint32_t(mparams.tensor_split[id]));
|
||||
}
|
||||
}
|
||||
|
||||
const size_t ntbo = llama_max_tensor_buft_overrides();
|
||||
bool any_tbo = false;
|
||||
for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) {
|
||||
if (itbo == 0) {
|
||||
printf(" -ot \"");
|
||||
}
|
||||
printf("%s%s=%s", itbo > 0 ? "," : "", mparams.tensor_buft_overrides[itbo].pattern, ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft));
|
||||
any_tbo = true;
|
||||
}
|
||||
printf("%s\n", any_tbo ? "\"" : "");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -22,7 +22,6 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "download.h"
|
||||
#include "fit.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -2226,7 +2225,7 @@ int main(int argc, char ** argv) {
|
||||
prev_inst = nullptr;
|
||||
}
|
||||
|
||||
// use default n_gpu_layers and n_ctx so common_fit_params can adjust them
|
||||
// use default n_gpu_layers and n_ctx so llama_params_fit can adjust them
|
||||
mparams.n_gpu_layers = llama_model_default_params().n_gpu_layers;
|
||||
mparams.tensor_split = fit_tensor_split.data();
|
||||
mparams.tensor_buft_overrides = fit_overrides.data();
|
||||
@@ -2237,7 +2236,7 @@ int main(int argc, char ** argv) {
|
||||
uint32_t n_ctx_needed = inst.n_prompt + inst.n_gen + inst.n_depth;
|
||||
cparams.n_ctx = std::max(cparams.n_ctx, n_ctx_needed);
|
||||
|
||||
common_fit_params(inst.model.c_str(), &mparams, &cparams,
|
||||
llama_params_fit(inst.model.c_str(), &mparams, &cparams,
|
||||
fit_tensor_split.data(),
|
||||
fit_overrides.data(),
|
||||
margins.data(),
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user