Compare commits
48 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 35df147d80 | |||
| c1891fd6eb | |||
| 81df3f7cfa | |||
| de71b5f81c | |||
| 788fcbc5dd | |||
| 9d49acb2a7 | |||
| e365e658f0 | |||
| 4eac5b4509 | |||
| d5b780a676 | |||
| 471540ae8a | |||
| 19124078be | |||
| bcdcc1044f | |||
| 037bfe38d0 | |||
| 8685e7b075 | |||
| 09b4efa95f | |||
| 455d8e4be8 | |||
| 91fef95362 | |||
| 9e5647affa | |||
| 4f02d47339 | |||
| 23b8cc4991 | |||
| 59accc8863 | |||
| 83d58e02fc | |||
| 89a5474f0e | |||
| fd1c0ec3f0 | |||
| 45cac7ca70 | |||
| b94050e896 | |||
| a279d0f0f4 | |||
| 268d61e178 | |||
| 6990e2f1f7 | |||
| fcc7508759 | |||
| 5e6c0e18b6 | |||
| 30dce2cf29 | |||
| 089dd41fe3 | |||
| 85dde8dc4a | |||
| 4fbdabdc61 | |||
| e45dbdece8 | |||
| 4adac43f6f | |||
| 9db77a020c | |||
| f772f6e434 | |||
| b572d1ecd6 | |||
| 03b3d07798 | |||
| 3f7c29d318 | |||
| ae2d34899e | |||
| 1e796eb41f | |||
| 5637536517 | |||
| 90fb96a7b3 | |||
| 82677a6ede | |||
| 8612ed18b7 |
@@ -18,6 +18,7 @@
|
||||
vulkan-loader,
|
||||
openssl,
|
||||
shaderc,
|
||||
spirv-headers,
|
||||
useBlas ?
|
||||
builtins.all (x: !x) [
|
||||
useCuda
|
||||
@@ -145,6 +146,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
ninja
|
||||
pkg-config
|
||||
git
|
||||
spirv-headers
|
||||
]
|
||||
++ optionals useCuda [
|
||||
cudaPackages.cuda_nvcc
|
||||
|
||||
@@ -51,7 +51,7 @@ jobs:
|
||||
distribution: zulu
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3
|
||||
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
|
||||
@@ -246,6 +246,7 @@ jobs:
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
glslc \
|
||||
spirv-headers \
|
||||
gcc-14-loongarch64-linux-gnu \
|
||||
g++-14-loongarch64-linux-gnu \
|
||||
libvulkan-dev:loong64
|
||||
|
||||
@@ -47,22 +47,10 @@ jobs:
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
|
||||
# Install necessary packages
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential wget git-lfs
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
|
||||
if ! which rustc; then
|
||||
# Install Rust stable version
|
||||
sudo apt-get install -y rustup
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
fi
|
||||
|
||||
git lfs install
|
||||
|
||||
- name: GCC version check
|
||||
@@ -74,12 +62,12 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# FIXME: Enable when ggml-org/ccache-action works on riscv64
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanytizer }}-${{ matrix.build_type }}
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
with:
|
||||
key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
@@ -97,6 +97,36 @@ jobs:
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: investigate slight precision issues in some operations for test-backend-ops on the WebGPU backend.
|
||||
#ggml-ci-nvidia-webgpu:
|
||||
# runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
|
||||
# - name: Dawn Dependency
|
||||
# id: dawn-depends
|
||||
# run: |
|
||||
# DAWN_VERSION="v20260317.182325"
|
||||
# DAWN_OWNER="google"
|
||||
# DAWN_REPO="dawn"
|
||||
# DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
|
||||
# echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
# curl -L -o artifact.tar.gz \
|
||||
# "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
# mkdir dawn
|
||||
# tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_WEBGPU=1 \
|
||||
# GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
# GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
|
||||
# bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMX-compatible machine
|
||||
#ggml-ci-cpu-amx:
|
||||
# runs-on: [self-hosted, Linux, CPU, AMX]
|
||||
|
||||
+58
-17
@@ -267,6 +267,56 @@ jobs:
|
||||
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
|
||||
./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
android-arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
NDK_VERSION: "29.0.14206865"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: android-arm64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: temurin
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Install NDK
|
||||
run: |
|
||||
sdkmanager "ndk;${{ env.NDK_VERSION }}"
|
||||
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-latest-rpc:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -1001,22 +1051,14 @@ jobs:
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
|
||||
# Install necessary packages
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential libssl-dev wget git-lfs
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libssl-dev
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
|
||||
if ! which rustc; then
|
||||
# Install Rust stable version
|
||||
sudo apt-get install -y rustup
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
fi
|
||||
|
||||
git lfs install
|
||||
|
||||
- name: Check environment
|
||||
@@ -1032,13 +1074,12 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# FIXME: Enable when ggml-org/ccache-action works on riscv64
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ubuntu-cpu-riscv64-native
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
with:
|
||||
key: ubuntu-cpu-riscv64-native
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
@@ -236,6 +236,75 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
|
||||
android-arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
NDK_VERSION: "29.0.14206865"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: android-arm64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: temurin
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Install NDK
|
||||
run: |
|
||||
sdkmanager "ndk;${{ env.NDK_VERSION }}"
|
||||
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz
|
||||
name: llama-bin-android-arm64.tar.gz
|
||||
|
||||
ubuntu-24-openvino:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
@@ -618,6 +687,11 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
@@ -971,6 +1045,7 @@ jobs:
|
||||
- ubuntu-cpu
|
||||
- ubuntu-vulkan
|
||||
- ubuntu-24-openvino
|
||||
- android-arm64
|
||||
- macOS-cpu
|
||||
- ios-xcode-build
|
||||
- openEuler-cann
|
||||
@@ -1059,6 +1134,9 @@ jobs:
|
||||
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
||||
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
|
||||
|
||||
**Android:**
|
||||
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
|
||||
|
||||
**Windows:**
|
||||
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
|
||||
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
|
||||
|
||||
+5
-1
@@ -225,7 +225,7 @@ foreach(FILE_PATH ${EXTRA_LICENSES})
|
||||
endforeach()
|
||||
|
||||
if (LLAMA_BUILD_COMMON)
|
||||
license_generate(common)
|
||||
license_generate(llama-common)
|
||||
endif()
|
||||
|
||||
#
|
||||
@@ -249,6 +249,10 @@ set_target_properties(llama
|
||||
|
||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||
|
||||
if (LLAMA_BUILD_COMMON)
|
||||
install(TARGETS llama-common LIBRARY)
|
||||
endif()
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
|
||||
|
||||
+17
-1
@@ -1,5 +1,21 @@
|
||||
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
|
||||
# multiplie collaborators per item can be specified
|
||||
# multiple collaborators per item can be specified
|
||||
#
|
||||
# ggml-org/ci : CISC, danbev, ggerganov, netrunnereve, ngxson, taronaeo
|
||||
# ggml-org/ggml-cann : hipudding
|
||||
# ggml-org/ggml-cuda : JohannesGaessler, am17an, IMbackK, ORippler
|
||||
# ggml-org/ggml-hexagon : lhez, max-krasnyansky
|
||||
# ggml-org/ggml-metal : ggerganov
|
||||
# ggml-org/ggml-opencl : lhez, max-krasnyansky
|
||||
# ggml-org/ggml-rpc : rgerganov
|
||||
# ggml-org/ggml-sycl : arthw
|
||||
# ggml-org/ggml-vulkan : 0cc4m, jeffbolznv
|
||||
# ggml-org/ggml-webgpu : reeselevine
|
||||
# ggml-org/ggml-zdnn : taronaeo
|
||||
# ggml-org/llama-common : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
|
||||
# ggml-org/llama-mtmd : ngxson
|
||||
# ggml-org/llama-server : ggerganov, ngxson, allozaur, angt, ServeurpersoCom
|
||||
# ggml-org/llama-webui : allozaur
|
||||
|
||||
/.devops/*.Dockerfile @ngxson
|
||||
/.github/actions/ @ggml-org/ci
|
||||
|
||||
+27
-10
@@ -1,9 +1,11 @@
|
||||
# common
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
llama_add_compile_flags()
|
||||
|
||||
#
|
||||
# llama-common-base
|
||||
#
|
||||
|
||||
# Build info header
|
||||
|
||||
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
|
||||
@@ -33,17 +35,25 @@ endif()
|
||||
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
|
||||
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
|
||||
set(TARGET build_info)
|
||||
add_library(${TARGET} OBJECT ${OUTPUT_FILE})
|
||||
set(TARGET llama-common-base)
|
||||
add_library(${TARGET} STATIC ${OUTPUT_FILE})
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
|
||||
set(TARGET common)
|
||||
#
|
||||
# llama-common
|
||||
#
|
||||
|
||||
add_library(${TARGET} STATIC
|
||||
set(TARGET llama-common)
|
||||
|
||||
add_library(${TARGET}
|
||||
arg.cpp
|
||||
arg.h
|
||||
base64.hpp
|
||||
@@ -106,17 +116,24 @@ add_library(${TARGET} STATIC
|
||||
jinja/caps.h
|
||||
)
|
||||
|
||||
set_target_properties(${TARGET} PROPERTIES
|
||||
VERSION ${LLAMA_INSTALL_VERSION}
|
||||
SOVERSION 0
|
||||
MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
|
||||
)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC . ../vendor)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
# TODO: make fine-grained exports in the future
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
endif()
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
build_info
|
||||
cpp-httplib
|
||||
)
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common-base)
|
||||
target_link_libraries(${TARGET} PRIVATE cpp-httplib)
|
||||
|
||||
if (LLAMA_LLGUIDANCE)
|
||||
include(ExternalProject)
|
||||
|
||||
+8
-7
@@ -1,5 +1,6 @@
|
||||
#include "arg.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "chat.h"
|
||||
#include "common.h"
|
||||
#include "download.h"
|
||||
@@ -291,7 +292,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
|
||||
hf_tag = "default";
|
||||
}
|
||||
|
||||
std::string model_endpoint = get_model_endpoint();
|
||||
std::string model_endpoint = common_get_model_endpoint();
|
||||
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
|
||||
|
||||
// prepare local path for caching
|
||||
@@ -1044,8 +1045,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--version"},
|
||||
"show version and build info",
|
||||
[](common_params &) {
|
||||
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
fprintf(stderr, "version: %d (%s)\n", llama_build_number(), llama_commit());
|
||||
fprintf(stderr, "built with %s for %s\n", llama_compiler(), llama_build_target());
|
||||
exit(0);
|
||||
}
|
||||
));
|
||||
@@ -1315,13 +1316,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
|
||||
add_opt(common_arg(
|
||||
{"--clear-idle"},
|
||||
{"--no-clear-idle"},
|
||||
{"--cache-idle-slots"},
|
||||
{"--no-cache-idle-slots"},
|
||||
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
|
||||
[](common_params & params, bool value) {
|
||||
params.clear_idle = value;
|
||||
params.cache_idle_slots = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--context-shift"},
|
||||
{"--no-context-shift"},
|
||||
|
||||
@@ -1,4 +1,35 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
|
||||
int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
|
||||
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
|
||||
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
|
||||
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
|
||||
char const * LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
|
||||
char const * LLAMA_COMPILER = "@BUILD_COMPILER@";
|
||||
char const * LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
|
||||
|
||||
int llama_build_number(void) {
|
||||
return LLAMA_BUILD_NUMBER;
|
||||
}
|
||||
|
||||
const char * llama_commit(void) {
|
||||
return LLAMA_COMMIT;
|
||||
}
|
||||
|
||||
const char * llama_compiler(void) {
|
||||
return LLAMA_COMPILER;
|
||||
}
|
||||
|
||||
const char * llama_build_target(void) {
|
||||
return LLAMA_BUILD_TARGET;
|
||||
}
|
||||
|
||||
const char * llama_build_info(void) {
|
||||
static std::string s = "b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT;
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
void llama_print_build_info(void) {
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, llama_build_number(), llama_commit());
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, llama_compiler(), llama_build_target());
|
||||
}
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
int llama_build_number(void);
|
||||
|
||||
const char * llama_commit(void);
|
||||
const char * llama_compiler(void);
|
||||
|
||||
const char * llama_build_target(void);
|
||||
const char * llama_build_info(void);
|
||||
|
||||
void llama_print_build_info(void);
|
||||
@@ -443,14 +443,14 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
||||
if (!format.per_call_start.empty()) {
|
||||
auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
|
||||
if (inputs.parallel_tool_calls) {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call) + p.space());
|
||||
} else {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call);
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.space());
|
||||
}
|
||||
if (!format.section_start.empty()) {
|
||||
tool_calls = p.trigger_rule("tool-calls",
|
||||
p.literal(format.section_start) + p.space() + tool_calls + p.space() +
|
||||
(format.section_end.empty() ? p.end() : p.literal(format.section_end)));
|
||||
(format.section_end.empty() ? p.end() : p.literal(format.section_end) + p.space()));
|
||||
}
|
||||
} else {
|
||||
std::string separator = ", "; // Default
|
||||
|
||||
+1
-1
@@ -2334,7 +2334,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
||||
? input
|
||||
: params.generation_prompt + input;
|
||||
|
||||
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
|
||||
//LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
|
||||
|
||||
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
|
||||
if (params.debug) {
|
||||
|
||||
+39
-2
@@ -1,6 +1,7 @@
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
@@ -372,7 +373,7 @@ void common_init() {
|
||||
const char * build_type = " (debug)";
|
||||
#endif
|
||||
|
||||
LOG_DBG("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
|
||||
LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
|
||||
}
|
||||
|
||||
std::string common_params_get_system_info(const common_params & params) {
|
||||
@@ -1381,7 +1382,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {
|
||||
|
||||
common_init_result::~common_init_result() = default;
|
||||
|
||||
std::string get_model_endpoint() {
|
||||
std::string common_get_model_endpoint() {
|
||||
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
|
||||
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
|
||||
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
|
||||
@@ -1396,6 +1397,42 @@ std::string get_model_endpoint() {
|
||||
return model_endpoint;
|
||||
}
|
||||
|
||||
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
|
||||
auto * mem = llama_get_memory(ctx);
|
||||
if (mem == nullptr) {
|
||||
return COMMON_CONTEXT_SEQ_RM_TYPE_NO;
|
||||
}
|
||||
|
||||
common_context_seq_rm_type res = COMMON_CONTEXT_SEQ_RM_TYPE_PART;
|
||||
|
||||
llama_memory_clear(mem, true);
|
||||
|
||||
// eval 2 tokens to check if the context is compatible
|
||||
std::vector<llama_token> tmp;
|
||||
tmp.push_back(0);
|
||||
tmp.push_back(0);
|
||||
|
||||
int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||
if (ret != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
|
||||
res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
|
||||
goto done;
|
||||
}
|
||||
|
||||
// try to remove the last tokens
|
||||
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
|
||||
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
|
||||
res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
|
||||
goto done;
|
||||
}
|
||||
|
||||
done:
|
||||
llama_memory_clear(mem, true);
|
||||
llama_synchronize(ctx);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
|
||||
std::vector<llama_adapter_lora *> loras;
|
||||
std::vector<float> scales;
|
||||
|
||||
+24
-21
@@ -2,15 +2,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "ggml.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
@@ -27,11 +27,6 @@
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
struct common_time_meas {
|
||||
common_time_meas(int64_t & t_acc, bool disable = false);
|
||||
~common_time_meas();
|
||||
@@ -53,14 +48,6 @@ struct common_adapter_lora_info {
|
||||
|
||||
using llama_tokens = std::vector<llama_token>;
|
||||
|
||||
// build info
|
||||
extern int LLAMA_BUILD_NUMBER;
|
||||
extern const char * LLAMA_COMMIT;
|
||||
extern const char * LLAMA_COMPILER;
|
||||
extern const char * LLAMA_BUILD_TARGET;
|
||||
|
||||
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
||||
|
||||
struct common_control_vector_load_info;
|
||||
|
||||
//
|
||||
@@ -315,15 +302,15 @@ struct common_params_speculative {
|
||||
// general-purpose speculative decoding parameters
|
||||
|
||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
// ngram-based speculative decoding
|
||||
|
||||
uint16_t ngram_size_n = 12; // ngram size for lookup
|
||||
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
|
||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
uint16_t ngram_size_n = 12; // ngram size for lookup
|
||||
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
|
||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
|
||||
std::shared_ptr<common_ngram_mod> ngram_mod;
|
||||
|
||||
@@ -579,7 +566,7 @@ struct common_params {
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
bool cache_prompt = true; // whether to enable prompt caching
|
||||
bool clear_idle = true; // save and clear idle slots upon starting a new task
|
||||
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
|
||||
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
|
||||
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
|
||||
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
||||
@@ -859,7 +846,23 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
|
||||
|
||||
std::string get_model_endpoint();
|
||||
// model endpoint from env
|
||||
std::string common_get_model_endpoint();
|
||||
|
||||
//
|
||||
// Context utils
|
||||
//
|
||||
|
||||
enum common_context_seq_rm_type {
|
||||
COMMON_CONTEXT_SEQ_RM_TYPE_NO = 0, // seq_rm not supported (e.g. no memory module)
|
||||
COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences
|
||||
COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only
|
||||
};
|
||||
|
||||
// check if the llama_context can remove sequences
|
||||
// note: clears the memory of the context
|
||||
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);
|
||||
|
||||
|
||||
//
|
||||
// Batch utils
|
||||
|
||||
+3
-2
@@ -1,5 +1,6 @@
|
||||
#include "arg.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "download.h"
|
||||
@@ -303,7 +304,7 @@ static int common_download_file_single_online(const std::string & url,
|
||||
headers.emplace(h.first, h.second);
|
||||
}
|
||||
if (headers.find("User-Agent") == headers.end()) {
|
||||
headers.emplace("User-Agent", "llama-cpp/" + build_info);
|
||||
headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
|
||||
}
|
||||
if (!opts.bearer_token.empty()) {
|
||||
headers.emplace("Authorization", "Bearer " + opts.bearer_token);
|
||||
@@ -441,7 +442,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
|
||||
headers.emplace(h.first, h.second);
|
||||
}
|
||||
if (headers.find("User-Agent") == headers.end()) {
|
||||
headers.emplace("User-Agent", "llama-cpp/" + build_info);
|
||||
headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
|
||||
}
|
||||
|
||||
if (params.timeout > 0) {
|
||||
|
||||
+4
-3
@@ -1,5 +1,6 @@
|
||||
#include "hf-cache.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "http.h"
|
||||
@@ -200,7 +201,7 @@ static nl::json api_get(const std::string & url,
|
||||
auto [cli, parts] = common_http_client(url);
|
||||
|
||||
httplib::Headers headers = {
|
||||
{"User-Agent", "llama-cpp/" + build_info},
|
||||
{"User-Agent", "llama-cpp/" + std::string(llama_build_info())},
|
||||
{"Accept", "application/json"}
|
||||
};
|
||||
|
||||
@@ -229,7 +230,7 @@ static nl::json api_get(const std::string & url,
|
||||
static std::string get_repo_commit(const std::string & repo_id,
|
||||
const std::string & token) {
|
||||
try {
|
||||
auto endpoint = get_model_endpoint();
|
||||
auto endpoint = common_get_model_endpoint();
|
||||
auto json = api_get(endpoint + "api/models/" + repo_id + "/refs", token);
|
||||
|
||||
if (!json.is_object() ||
|
||||
@@ -307,7 +308,7 @@ hf_files get_repo_files(const std::string & repo_id,
|
||||
hf_files files;
|
||||
|
||||
try {
|
||||
auto endpoint = get_model_endpoint();
|
||||
auto endpoint = common_get_model_endpoint();
|
||||
auto json = api_get(endpoint + "api/models/" + repo_id + "/tree/" + commit + "?recursive=true", token);
|
||||
|
||||
if (!json.is_array()) {
|
||||
|
||||
@@ -23,6 +23,10 @@
|
||||
|
||||
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
|
||||
|
||||
int common_log_get_verbosity_thold(void) {
|
||||
return common_log_verbosity_thold;
|
||||
}
|
||||
|
||||
void common_log_set_verbosity_thold(int verbosity) {
|
||||
common_log_verbosity_thold = verbosity;
|
||||
}
|
||||
|
||||
+2
-2
@@ -38,7 +38,7 @@ enum log_colors {
|
||||
|
||||
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
|
||||
// set via common_log_set_verbosity()
|
||||
extern int common_log_verbosity_thold;
|
||||
int common_log_get_verbosity_thold(void);
|
||||
|
||||
void common_log_set_verbosity_thold(int verbosity); // not thread-safe
|
||||
|
||||
@@ -98,7 +98,7 @@ void common_log_flush (struct common_log * log); // f
|
||||
|
||||
#define LOG_TMPL(level, verbosity, ...) \
|
||||
do { \
|
||||
if ((verbosity) <= common_log_verbosity_thold) { \
|
||||
if ((verbosity) <= common_log_get_verbosity_thold()) { \
|
||||
common_log_add(common_log_main(), (level), __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -208,7 +208,7 @@ void common_ngram_map_begin(
|
||||
count_keys, count_keys_del, count_values_del, count_map_entries_upd);
|
||||
}
|
||||
|
||||
map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
|
||||
map.idx_last_check = size_begin;
|
||||
map.size_last_begin = size_begin;
|
||||
}
|
||||
|
||||
@@ -231,7 +231,7 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
|
||||
}
|
||||
|
||||
if (map.idx_last_check > cur_len) {
|
||||
if (map.idx_last_check > cur_len) {
|
||||
// Should not happen because of common_ngram_map_begin().
|
||||
GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
|
||||
}
|
||||
@@ -386,7 +386,7 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
|
||||
curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
|
||||
|
||||
map.last_draft_created = false;
|
||||
map.last_draft_created = true;
|
||||
map.last_draft_key_idx = key_offset;
|
||||
map.last_draft_value_idx = 0; // value 0 is used for simple mode
|
||||
return;
|
||||
@@ -524,7 +524,7 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
|
||||
struct common_ngram_map_value & curr_value = curr_key.values[val_idx]; // value used for draft generation.
|
||||
|
||||
// update the value statistics
|
||||
LOG_INF("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
|
||||
LOG_DBG("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
|
||||
n_accepted, curr_value.n_accepted);
|
||||
curr_value.n_accepted = n_accepted;
|
||||
}
|
||||
|
||||
+134
-52
@@ -13,6 +13,7 @@
|
||||
#include <cstring>
|
||||
#include <iomanip>
|
||||
#include <map>
|
||||
#include <cinttypes>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
@@ -144,10 +145,28 @@ struct common_speculative_state {
|
||||
virtual void accept(uint16_t n_accepted) = 0;
|
||||
};
|
||||
|
||||
struct common_speculative_checkpoint {
|
||||
llama_pos pos_min = 0;
|
||||
llama_pos pos_max = 0;
|
||||
|
||||
int64_t n_tokens = 0;
|
||||
|
||||
std::vector<uint8_t> data;
|
||||
|
||||
size_t size() const {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
size_t ckpt_size = 0;
|
||||
};
|
||||
|
||||
struct common_speculative_state_draft : public common_speculative_state {
|
||||
llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
|
||||
llama_context * ctx_dft;
|
||||
|
||||
bool use_ckpt = false;
|
||||
struct common_speculative_checkpoint ckpt;
|
||||
|
||||
common_sampler * smpl;
|
||||
|
||||
llama_batch batch;
|
||||
@@ -160,10 +179,12 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
enum common_speculative_type type,
|
||||
llama_context * ctx_tgt,
|
||||
llama_context * ctx_dft,
|
||||
const std::vector<std::pair<std::string, std::string>> & replacements)
|
||||
const std::vector<std::pair<std::string, std::string>> & replacements,
|
||||
bool use_ckpt)
|
||||
: common_speculative_state(type)
|
||||
, ctx_tgt(ctx_tgt)
|
||||
, ctx_dft(ctx_dft)
|
||||
, use_ckpt(use_ckpt)
|
||||
{
|
||||
batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
||||
smpl = nullptr;
|
||||
@@ -218,7 +239,48 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
}
|
||||
|
||||
void begin(const llama_tokens & prompt) override {
|
||||
GGML_UNUSED(prompt);
|
||||
if (use_ckpt && ckpt.size() > 0) {
|
||||
// delete checkpoint
|
||||
LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
|
||||
__func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
|
||||
ckpt.pos_min = 0;
|
||||
ckpt.pos_max = 0;
|
||||
ckpt.n_tokens = 0;
|
||||
ckpt.ckpt_size = 0;
|
||||
ckpt.data.clear();
|
||||
}
|
||||
}
|
||||
|
||||
size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
|
||||
int slot_id = 0;
|
||||
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
|
||||
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
|
||||
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
|
||||
ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
|
||||
ckpt.data.resize(checkpoint_size);
|
||||
|
||||
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
if (n != checkpoint_size) {
|
||||
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
|
||||
}
|
||||
|
||||
LOG_DBG("%s: pos_min = %d, pos_max = %d, size = %.3f MiB\n", __func__,
|
||||
ckpt.pos_min, ckpt.pos_max, (float) ckpt.data.size() / 1024 / 1024);
|
||||
return n;
|
||||
}
|
||||
|
||||
size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
|
||||
int slot_id = 0;
|
||||
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
if (n != ckpt_size_part_expected) {
|
||||
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
|
||||
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
|
||||
}
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
void draft(
|
||||
@@ -236,8 +298,8 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
|
||||
auto * mem_dft = llama_get_memory(ctx_dft);
|
||||
|
||||
int reuse_i = 0;
|
||||
int reuse_n = 0;
|
||||
int reuse_i = 0; // index of part to be reused in prompt_dft
|
||||
int reuse_n = 0; // length of part to be reused in prompt_dft
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
|
||||
|
||||
@@ -287,18 +349,26 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
|
||||
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
|
||||
__func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
|
||||
if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) {
|
||||
LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
|
||||
__func__, reuse_i, reuse_n);
|
||||
reuse_i = 0;
|
||||
reuse_n = 0;
|
||||
}
|
||||
|
||||
result.clear();
|
||||
result.reserve(params.n_max);
|
||||
|
||||
if (reuse_n == 0) {
|
||||
bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
|
||||
if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
|
||||
llama_memory_clear(mem_dft, false);
|
||||
prompt_dft.clear();
|
||||
} else {
|
||||
// this happens when a previous draft has been discarded (for example, due to being too small), but the
|
||||
// target model agreed with it. in this case, we simply pass back the previous results to save compute
|
||||
if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
|
||||
if (reuse_i + reuse_n < (int64_t) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
|
||||
for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
|
||||
result.push_back(prompt_dft[i]);
|
||||
|
||||
@@ -310,19 +380,50 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
return;
|
||||
}
|
||||
|
||||
bool do_restore = false;
|
||||
if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
|
||||
// This can happen after a partial acceptance (speculative decoding with checkpoints)
|
||||
LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
|
||||
__func__, prompt_dft.size(), prompt_cur.size());
|
||||
prompt_dft.resize(prompt_cur.size());
|
||||
do_restore = true;
|
||||
}
|
||||
|
||||
if (reuse_i > 0) {
|
||||
llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
|
||||
bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
|
||||
if (!is_removed) {
|
||||
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
|
||||
}
|
||||
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
|
||||
|
||||
prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
|
||||
}
|
||||
|
||||
if (reuse_n < (int) prompt_dft.size()) {
|
||||
llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
|
||||
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
|
||||
if (reuse_n < (int) prompt_dft.size() || do_restore) {
|
||||
if (use_ckpt) {
|
||||
if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
|
||||
LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
|
||||
__func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
|
||||
}
|
||||
draft_restore_checkpoint(ckpt.ckpt_size);
|
||||
reuse_n = ckpt.n_tokens;
|
||||
prompt_dft.resize(reuse_n);
|
||||
needs_ckpt = false;
|
||||
} else {
|
||||
bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
|
||||
if (!is_removed) {
|
||||
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
|
||||
__func__, reuse_n, prompt_dft.size());
|
||||
}
|
||||
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (needs_ckpt) {
|
||||
ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
|
||||
}
|
||||
|
||||
// prepare a batch to evaluate any new tokens in the prompt
|
||||
common_batch_clear(batch);
|
||||
|
||||
@@ -337,7 +438,11 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
if (batch.n_tokens > 0) {
|
||||
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
|
||||
|
||||
llama_decode(ctx_dft, batch);
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0 && ret != 1) {
|
||||
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
|
||||
__func__, ret, prompt_cur.size());
|
||||
}
|
||||
}
|
||||
|
||||
const llama_pos n_past = prompt_dft.size();
|
||||
@@ -351,7 +456,11 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
|
||||
LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
|
||||
|
||||
llama_decode(ctx_dft, batch);
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0 && ret != 1) {
|
||||
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
|
||||
__func__, ret, prompt_cur.size(), prompt_dft.size());
|
||||
}
|
||||
|
||||
common_sampler_reset(smpl);
|
||||
|
||||
@@ -387,7 +496,11 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
|
||||
|
||||
// evaluate the drafted tokens on the draft model
|
||||
llama_decode(ctx_dft, batch);
|
||||
ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode[%d] returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
|
||||
__func__, i, ret, prompt_cur.size(), prompt_dft.size());
|
||||
}
|
||||
|
||||
prompt_dft.push_back(id);
|
||||
}
|
||||
@@ -739,6 +852,7 @@ struct common_speculative_state_ngram_cache : public common_speculative_state {
|
||||
|
||||
struct common_speculative {
|
||||
std::vector<std::unique_ptr<common_speculative_state>> impls; // list of implementations to use and their states
|
||||
|
||||
common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
|
||||
};
|
||||
|
||||
@@ -798,42 +912,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
||||
return it->second;
|
||||
}
|
||||
|
||||
bool common_speculative_is_compat(llama_context * ctx_tgt) {
|
||||
auto * mem = llama_get_memory(ctx_tgt);
|
||||
if (mem == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool res = true;
|
||||
|
||||
llama_memory_clear(mem, true);
|
||||
|
||||
// eval 2 tokens to check if the context is compatible
|
||||
std::vector<llama_token> tmp;
|
||||
tmp.push_back(0);
|
||||
tmp.push_back(0);
|
||||
|
||||
int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||
if (ret != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
|
||||
res = false;
|
||||
goto done;
|
||||
}
|
||||
|
||||
// try to remove the last tokens
|
||||
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
|
||||
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
|
||||
res = false;
|
||||
goto done;
|
||||
}
|
||||
|
||||
done:
|
||||
llama_memory_clear(mem, true);
|
||||
llama_synchronize(ctx_tgt);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// initialization of the speculative decoding system
|
||||
//
|
||||
common_speculative * common_speculative_init(
|
||||
@@ -908,10 +986,13 @@ common_speculative * common_speculative_init(
|
||||
case COMMON_SPECULATIVE_TYPE_NONE:
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT: {
|
||||
const bool use_ckpt = common_context_can_seq_rm(ctx_dft) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
|
||||
|
||||
impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
|
||||
/* .ctx_tgt = */ ctx_tgt,
|
||||
/* .ctx_dft = */ ctx_dft,
|
||||
/* .replacements = */ params.replacements
|
||||
/* .replacements = */ params.replacements,
|
||||
/* .use_ckpt = */ use_ckpt
|
||||
));
|
||||
break;
|
||||
}
|
||||
@@ -966,7 +1047,8 @@ common_speculative * common_speculative_init(
|
||||
}
|
||||
|
||||
auto * result = new common_speculative {
|
||||
/* .impls = */ std::move(impls)
|
||||
/* .impls = */ std::move(impls),
|
||||
/* .curr_impl = */ nullptr,
|
||||
};
|
||||
|
||||
return result;
|
||||
|
||||
@@ -14,10 +14,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
||||
// convert type to string
|
||||
std::string common_speculative_type_to_str(enum common_speculative_type type);
|
||||
|
||||
// check if the llama_context is compatible for speculative decoding
|
||||
// note: clears the memory of the context
|
||||
bool common_speculative_is_compat(llama_context * ctx_tgt);
|
||||
|
||||
common_speculative * common_speculative_init(
|
||||
common_params_speculative & params,
|
||||
llama_context * ctx_tgt);
|
||||
@@ -39,3 +35,9 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
|
||||
|
||||
// print statistics about the speculative decoding
|
||||
void common_speculative_print_stats(const common_speculative * spec);
|
||||
|
||||
struct common_speculative_deleter {
|
||||
void operator()(common_speculative * s) { common_speculative_free(s); }
|
||||
};
|
||||
|
||||
typedef std::unique_ptr<common_speculative, common_speculative_deleter> common_speculative_ptr;
|
||||
|
||||
+71
-6
@@ -1850,20 +1850,28 @@ class TextModel(ModelBase):
|
||||
with open(module_path, encoding="utf-8") as f:
|
||||
modules = json.load(f)
|
||||
for mod in modules:
|
||||
if mod["type"] == "sentence_transformers.models.Pooling":
|
||||
if mod["type"].endswith("Pooling"):
|
||||
pooling_path = mod["path"]
|
||||
break
|
||||
|
||||
mode_mapping = {
|
||||
"mean": gguf.PoolingType.MEAN,
|
||||
"cls": gguf.PoolingType.CLS,
|
||||
"lasttoken": gguf.PoolingType.LAST,
|
||||
}
|
||||
|
||||
# get pooling type
|
||||
if pooling_path is not None:
|
||||
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
|
||||
pooling = json.load(f)
|
||||
if pooling["pooling_mode_mean_tokens"]:
|
||||
if pooling.get("pooling_mode_mean_tokens"):
|
||||
pooling_type = gguf.PoolingType.MEAN
|
||||
elif pooling["pooling_mode_cls_token"]:
|
||||
elif pooling.get("pooling_mode_cls_token"):
|
||||
pooling_type = gguf.PoolingType.CLS
|
||||
elif pooling["pooling_mode_lasttoken"]:
|
||||
elif pooling.get("pooling_mode_lasttoken"):
|
||||
pooling_type = gguf.PoolingType.LAST
|
||||
elif (pooling_mode := pooling.get("pooling_mode")) in mode_mapping:
|
||||
pooling_type = mode_mapping[pooling_mode]
|
||||
else:
|
||||
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
|
||||
self.gguf_writer.add_pooling_type(pooling_type)
|
||||
@@ -7180,7 +7188,7 @@ class EmbeddingGemma(Gemma3Model):
|
||||
with open(modules_file, encoding="utf-8") as modules_json_file:
|
||||
mods = json.load(modules_json_file)
|
||||
for mod in mods:
|
||||
if mod["type"] == "sentence_transformers.models.Dense":
|
||||
if mod["type"].endswith("Dense"):
|
||||
mod_path = mod["path"]
|
||||
# check if model.safetensors file for Dense layer exists
|
||||
model_tensors_file = self.dir_model / mod_path / "model.safetensors"
|
||||
@@ -10893,7 +10901,64 @@ class NemotronHModel(GraniteHybridModel):
|
||||
self.gguf_writer.add_moe_latent_size(latent_size)
|
||||
|
||||
def set_vocab(self):
|
||||
super().set_vocab()
|
||||
# The NemotronH config uses pattern characters (e.g. '-') that may not
|
||||
# be supported by the installed transformers version. AutoTokenizer
|
||||
# internally calls AutoConfig which triggers this parsing failure.
|
||||
# Using trust_remote_code=True to load the model's own config class.
|
||||
tokens: list[str] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
|
||||
# Pad vocab size (from Mamba2Model/GraniteHybridModel)
|
||||
self.hparams["pad_vocab_size_multiple"] = 8 # Setting this here since GraniteHybridModel.set_vocab() isn't being invoked now.
|
||||
# From Mamba2Model.set_vocab():
|
||||
vocab_size = self.hparams["vocab_size"]
|
||||
pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
|
||||
# ref: https://stackoverflow.com/a/17511341/22827863
|
||||
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
|
||||
self.hparams["vocab_size"] = vocab_size
|
||||
|
||||
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
|
||||
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
|
||||
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
|
||||
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
else:
|
||||
token: str = reverse_vocab[i]
|
||||
if token in added_vocab:
|
||||
if not added_tokens_decoder[i].normalized:
|
||||
previous_token = token
|
||||
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
|
||||
if previous_token != token:
|
||||
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
|
||||
|
||||
if added_tokens_decoder[i].special or self.does_token_look_special(token):
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
tokens.append(token)
|
||||
|
||||
# From TextModel.set_vocab_gpt2():
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
# The tokenizer _does_ add a BOS token (via post_processor type
|
||||
# TemplateProcessing) but does not set add_bos_token to true in the
|
||||
|
||||
+8
-8
@@ -22,13 +22,13 @@ Legend:
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
@@ -46,7 +46,7 @@ Legend:
|
||||
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
@@ -84,10 +84,10 @@ Legend:
|
||||
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROLL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
@@ -116,6 +116,6 @@ Legend:
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
|
||||
+2930
-848
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-batched)
|
||||
add_executable(${TARGET} batched.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-convert-llama2c-to-ggml)
|
||||
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-debug)
|
||||
add_executable(${TARGET} debug.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-diffusion-cli)
|
||||
add_executable(${TARGET} diffusion-cli.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-embedding)
|
||||
add_executable(${TARGET} embedding.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
set(TARGET llama-eval-callback)
|
||||
add_executable(${TARGET} eval-callback.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_BUILD_TESTS)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-gen-docs)
|
||||
add_executable(${TARGET} gen-docs.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-idle)
|
||||
add_executable(${TARGET} idle.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -51,6 +51,6 @@ target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE
|
||||
|
||||
target_link_libraries(${CMAKE_PROJECT_NAME}
|
||||
llama
|
||||
common
|
||||
llama-common
|
||||
android
|
||||
log)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-lookahead)
|
||||
add_executable(${TARGET} lookahead.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,23 +1,23 @@
|
||||
set(TARGET llama-lookup)
|
||||
add_executable(${TARGET} lookup.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-lookup-create)
|
||||
add_executable(${TARGET} lookup-create.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-lookup-merge)
|
||||
add_executable(${TARGET} lookup-merge.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-lookup-stats)
|
||||
add_executable(${TARGET} lookup-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-parallel)
|
||||
add_executable(${TARGET} parallel.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-passkey)
|
||||
add_executable(${TARGET} passkey.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-retrieval)
|
||||
add_executable(${TARGET} retrieval.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-speculative-simple)
|
||||
add_executable(${TARGET} speculative-simple.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-speculative)
|
||||
add_executable(${TARGET} speculative.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -5,5 +5,5 @@
|
||||
set(TARGET llama-ls-sycl-device)
|
||||
add_executable(${TARGET} ls-sycl-device.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-finetune)
|
||||
add_executable(${TARGET} finetune.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -1,11 +1,5 @@
|
||||
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
|
||||
|
||||
# ref: https://cmake.org/cmake/help/latest/policy/CMP0194.html
|
||||
# MSVC is not a valid assembler for the ASM language.
|
||||
# Set to NEW to avoid a warning on CMake 4.1+ with MSVC.
|
||||
if (POLICY CMP0194)
|
||||
cmake_policy(SET CMP0194 NEW)
|
||||
endif()
|
||||
project("ggml" C CXX ASM)
|
||||
|
||||
### GGML Version
|
||||
|
||||
+199
-148
@@ -1270,7 +1270,45 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor));
|
||||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
|
||||
GGML_ASSERT(split_state.n_segments == 1);
|
||||
|
||||
if (split_state.n_segments != 1) {
|
||||
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
size_t offset_data = 0;
|
||||
std::vector<size_t> simple_offsets(n_bufs, 0);
|
||||
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
const int64_t blck_size = ggml_blck_size(tensor->type);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
|
||||
tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*tensor->ne[1] == size);
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
|
||||
tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*tensor->ne[2] == size);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (split_state.axis) {
|
||||
case GGML_BACKEND_SPLIT_AXIS_0:
|
||||
@@ -1418,6 +1456,8 @@ struct ggml_backend_meta_context {
|
||||
int max_nnodes = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
size_t max_subgraphs = 0;
|
||||
size_t n_subgraphs = 0;
|
||||
uint64_t uid = 0;
|
||||
|
||||
void * comm_ctx = nullptr;
|
||||
ggml_backend_comm_allreduce_tensor_t comm_allreduce = nullptr;
|
||||
@@ -1578,6 +1618,9 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
const size_t n_backends = ggml_backend_meta_n_backends(backend);
|
||||
ggml_backend_meta_context * backend_ctx = (ggml_backend_meta_context *) backend->context;
|
||||
|
||||
// If the previous cgraph had a defined UID it can be used to skip rebuilding the subgraphs per simple backend.
|
||||
const bool needs_rebuild = (cgraph->uid == 0) || (cgraph->uid != backend_ctx->uid);
|
||||
|
||||
bool max_nnodes_raised = false;
|
||||
if (cgraph->n_nodes > backend_ctx->max_nnodes) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
@@ -1587,173 +1630,181 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
}
|
||||
backend_ctx->max_nnodes = cgraph->n_nodes;
|
||||
max_nnodes_raised = true;
|
||||
assert(needs_rebuild);
|
||||
}
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
|
||||
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
|
||||
bcj.nodes[i] = node;
|
||||
continue;
|
||||
if (needs_rebuild) {
|
||||
size_t n_subgraphs = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
|
||||
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
|
||||
bcj.nodes[i] = node;
|
||||
continue;
|
||||
}
|
||||
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
|
||||
GGML_ASSERT(bcj.nodes[i]);
|
||||
}
|
||||
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
|
||||
GGML_ASSERT(bcj.nodes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
size_t n_subgraphs = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
{
|
||||
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
|
||||
auto get_i_delayed = [&](const int i) -> int {
|
||||
int id = i; // i_delayed
|
||||
int idr = i; // i_delayed return, last safe return value
|
||||
{
|
||||
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
|
||||
auto get_i_delayed = [&](const int i) -> int {
|
||||
int id = i; // i_delayed
|
||||
int idr = i; // i_delayed return, last safe return value
|
||||
|
||||
ggml_tensor * node = cgraph->nodes[id];
|
||||
int32_t n_used = ggml_node_get_use_count(cgraph, id);
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
|
||||
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
ggml_tensor * node = cgraph->nodes[id];
|
||||
int32_t n_used = ggml_node_get_use_count(cgraph, id);
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
|
||||
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_MUL && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
for (int32_t k = 0; k < n_used; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
|
||||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
|
||||
ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_MUL && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
|
||||
for (int32_t k = 0; k < n_used - 2; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
idr = id;
|
||||
return idr;
|
||||
}
|
||||
for (int32_t k = 0; k < n_used; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
|
||||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
|
||||
ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
for (int32_t k = 0; k < n_used - 2; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
idr = id;
|
||||
return idr;
|
||||
};
|
||||
};
|
||||
|
||||
int i_start = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
continue;
|
||||
}
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
|
||||
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
|
||||
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
|
||||
}
|
||||
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
|
||||
if (!new_subgraph) {
|
||||
continue;
|
||||
}
|
||||
int i_start = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
continue;
|
||||
}
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
|
||||
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
|
||||
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
|
||||
}
|
||||
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
|
||||
if (!new_subgraph) {
|
||||
continue;
|
||||
}
|
||||
|
||||
i = get_i_delayed(i);
|
||||
i = get_i_delayed(i);
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
bcj.cgraphs[n_subgraphs].offset = i_start;
|
||||
}
|
||||
n_subgraphs++;
|
||||
i_start = i + 1;
|
||||
}
|
||||
GGML_ASSERT(i_start == cgraph->n_nodes);
|
||||
}
|
||||
|
||||
backend_ctx->uid = cgraph->uid;
|
||||
backend_ctx->n_subgraphs = n_subgraphs;
|
||||
|
||||
if (max_tmp_size > backend_ctx->max_tmp_size) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
bcj.cgraphs[n_subgraphs].offset = i_start;
|
||||
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
|
||||
}
|
||||
n_subgraphs++;
|
||||
i_start = i + 1;
|
||||
backend_ctx->max_tmp_size = max_tmp_size;
|
||||
}
|
||||
GGML_ASSERT(i_start == cgraph->n_nodes);
|
||||
}
|
||||
|
||||
if (max_tmp_size > backend_ctx->max_tmp_size) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
|
||||
}
|
||||
backend_ctx->max_tmp_size = max_tmp_size;
|
||||
}
|
||||
|
||||
|
||||
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
|
||||
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
|
||||
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
|
||||
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
|
||||
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
|
||||
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
|
||||
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
|
||||
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
backend_ctx->ctx.reset(ggml_init(params));
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i = 0; i < n_subgraphs; i++) {
|
||||
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
|
||||
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
|
||||
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
|
||||
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
|
||||
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
|
||||
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
|
||||
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
|
||||
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
|
||||
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
backend_ctx->ctx.reset(ggml_init(params));
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i = 0; i < n_subgraphs; i++) {
|
||||
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
|
||||
}
|
||||
}
|
||||
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
|
||||
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
|
||||
}
|
||||
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
|
||||
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
|
||||
}
|
||||
}
|
||||
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
|
||||
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
|
||||
}
|
||||
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
|
||||
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
|
||||
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
|
||||
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
|
||||
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
|
||||
cgraph_ij->n_nodes = i_node_stop - i_node_start;
|
||||
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
|
||||
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
|
||||
ggml_tensor * node_ij = bcj.nodes[i_node];
|
||||
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
|
||||
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
|
||||
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
|
||||
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
|
||||
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
|
||||
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
|
||||
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
|
||||
cgraph_ij->n_nodes = i_node_stop - i_node_start;
|
||||
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
|
||||
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
|
||||
ggml_tensor * node_ij = bcj.nodes[i_node];
|
||||
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
|
||||
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
|
||||
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
|
||||
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
|
||||
}
|
||||
cgraph_ij->uid = ggml_graph_next_uid();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1860,7 +1911,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
};
|
||||
|
||||
|
||||
for (size_t i = 0; i < n_subgraphs; i++) {
|
||||
for (size_t i = 0; i < backend_ctx->n_subgraphs; i++) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, bcj.cgraphs[i].cgraph_main);
|
||||
@@ -1869,7 +1920,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
}
|
||||
}
|
||||
|
||||
if (n_backends > 1 && i < n_subgraphs - 1) {
|
||||
if (n_backends > 1 && i < backend_ctx->n_subgraphs - 1) {
|
||||
bool backend_allreduce_success = false;
|
||||
if (backend_ctx->comm_ctx) {
|
||||
std::vector<ggml_tensor *> nodes;
|
||||
|
||||
@@ -1030,6 +1030,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
|
||||
GGML_ABORT("%s: failed to initialize context\n", __func__);
|
||||
}
|
||||
|
||||
graph->uid = ggml_graph_next_uid();
|
||||
|
||||
// pass 1: assign backends to ops with pre-allocated inputs
|
||||
for (int i = 0; i < graph->n_leafs; i++) {
|
||||
struct ggml_tensor * leaf = graph->leafs[i];
|
||||
@@ -1477,6 +1479,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
|
||||
assert(graph_copy->size > graph_copy->n_leafs);
|
||||
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
||||
}
|
||||
|
||||
// set ids for all splits
|
||||
for (int i = 0; i < sched->n_splits; ++i) {
|
||||
sched->splits[i].graph.uid = ggml_graph_next_uid();
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -109,6 +109,96 @@ static void simd_gemm(
|
||||
C += N;
|
||||
}
|
||||
}
|
||||
#elif defined(GGML_SIMD) && defined(__riscv_v_intrinsic)
|
||||
// RM accumulators + 1 B vector = RM + 1 <= 8 => RM <= 7
|
||||
// Microkernel: C[RM x vl] += A[RM x K] * B[K x N]
|
||||
template <int RM>
|
||||
static inline void rvv_simd_gemm_ukernel(
|
||||
float * GGML_RESTRICT C,
|
||||
const float * GGML_RESTRICT A,
|
||||
const float * GGML_RESTRICT B,
|
||||
int K, int N, size_t vl)
|
||||
{
|
||||
static_assert(RM >= 1 && RM <= 7, "RM must be 1..7 for LMUL=4");
|
||||
|
||||
vfloat32m4_t acc_0 = __riscv_vle32_v_f32m4(C + 0 * N, vl);
|
||||
vfloat32m4_t acc_1, acc_2, acc_3, acc_4, acc_5, acc_6;
|
||||
if constexpr (RM > 1) acc_1 = __riscv_vle32_v_f32m4(C + 1 * N, vl);
|
||||
if constexpr (RM > 2) acc_2 = __riscv_vle32_v_f32m4(C + 2 * N, vl);
|
||||
if constexpr (RM > 3) acc_3 = __riscv_vle32_v_f32m4(C + 3 * N, vl);
|
||||
if constexpr (RM > 4) acc_4 = __riscv_vle32_v_f32m4(C + 4 * N, vl);
|
||||
if constexpr (RM > 5) acc_5 = __riscv_vle32_v_f32m4(C + 5 * N, vl);
|
||||
if constexpr (RM > 6) acc_6 = __riscv_vle32_v_f32m4(C + 6 * N, vl);
|
||||
|
||||
for (int kk = 0; kk < K; kk++) {
|
||||
vfloat32m4_t b_0 = __riscv_vle32_v_f32m4(B + kk * N, vl);
|
||||
|
||||
acc_0 = __riscv_vfmacc_vf_f32m4(acc_0, A[0 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 1) acc_1 = __riscv_vfmacc_vf_f32m4(acc_1, A[1 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 2) acc_2 = __riscv_vfmacc_vf_f32m4(acc_2, A[2 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 3) acc_3 = __riscv_vfmacc_vf_f32m4(acc_3, A[3 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 4) acc_4 = __riscv_vfmacc_vf_f32m4(acc_4, A[4 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 5) acc_5 = __riscv_vfmacc_vf_f32m4(acc_5, A[5 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 6) acc_6 = __riscv_vfmacc_vf_f32m4(acc_6, A[6 * K + kk], b_0, vl);
|
||||
}
|
||||
|
||||
__riscv_vse32_v_f32m4(C + 0 * N, acc_0, vl);
|
||||
if constexpr (RM > 1) __riscv_vse32_v_f32m4(C + 1 * N, acc_1, vl);
|
||||
if constexpr (RM > 2) __riscv_vse32_v_f32m4(C + 2 * N, acc_2, vl);
|
||||
if constexpr (RM > 3) __riscv_vse32_v_f32m4(C + 3 * N, acc_3, vl);
|
||||
if constexpr (RM > 4) __riscv_vse32_v_f32m4(C + 4 * N, acc_4, vl);
|
||||
if constexpr (RM > 5) __riscv_vse32_v_f32m4(C + 5 * N, acc_5, vl);
|
||||
if constexpr (RM > 6) __riscv_vse32_v_f32m4(C + 6 * N, acc_6, vl);
|
||||
}
|
||||
|
||||
template <int RM>
|
||||
static inline void rvv_simd_gemm_dispatch_tail(
|
||||
float * GGML_RESTRICT C,
|
||||
const float * GGML_RESTRICT A,
|
||||
const float * GGML_RESTRICT B,
|
||||
int K, int N, int KN, int remaining_rows)
|
||||
{
|
||||
if constexpr (RM > 0) {
|
||||
if (remaining_rows == RM) {
|
||||
int64_t jj = 0;
|
||||
for (; jj + KN <= N; jj += KN) {
|
||||
rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, KN);
|
||||
}
|
||||
if (jj < N) {
|
||||
rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, N - jj);
|
||||
}
|
||||
} else {
|
||||
rvv_simd_gemm_dispatch_tail<RM - 1>(C, A, B, K, N, KN, remaining_rows);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr int GEMM_RM = 7;
|
||||
|
||||
// C[M x N] += A[M x K] * B[K x N]
|
||||
static void simd_gemm(
|
||||
float * GGML_RESTRICT C,
|
||||
const float * GGML_RESTRICT A,
|
||||
const float * GGML_RESTRICT B,
|
||||
int M, int K, int N)
|
||||
{
|
||||
const int KN = (int)__riscv_vlenb();
|
||||
int64_t ii = 0;
|
||||
for (; ii + GEMM_RM <= M; ii += GEMM_RM) {
|
||||
int64_t jj = 0;
|
||||
for (; jj + KN <= N; jj += KN) {
|
||||
rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, KN);
|
||||
}
|
||||
if (jj < N) {
|
||||
rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, N - jj);
|
||||
}
|
||||
A += GEMM_RM * K;
|
||||
C += GEMM_RM * N;
|
||||
}
|
||||
|
||||
int remaining_rows = M - ii;
|
||||
rvv_simd_gemm_dispatch_tail<GEMM_RM - 1>(C, A, B, K, N, KN, remaining_rows);
|
||||
}
|
||||
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
@@ -269,10 +269,6 @@ static const char * cu_get_error_str(CUresult err) {
|
||||
#define FLASH_ATTN_AVAILABLE
|
||||
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
|
||||
|
||||
#if defined(TURING_MMA_AVAILABLE)
|
||||
#define LDMATRIX_TRANS_AVAILABLE
|
||||
#endif // defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
static bool fp16_available(const int cc) {
|
||||
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL ||
|
||||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PH1);
|
||||
@@ -1186,6 +1182,8 @@ struct ggml_cuda_graph {
|
||||
std::vector<cudaGraphNode_t> nodes;
|
||||
bool disable_due_to_gpu_arch = false;
|
||||
bool warmup_complete = false;
|
||||
uint64_t uid = 0;
|
||||
int64_t last_used_time = 0;
|
||||
struct node_properties {
|
||||
ggml_tensor node;
|
||||
void * node_src_data_ptrs[GGML_MAX_SRC];
|
||||
@@ -1367,12 +1365,28 @@ struct ggml_backend_cuda_context {
|
||||
// when the computation is split across CPU/GPU (e.g., with --n-cpu-moe)
|
||||
std::unordered_map<const void *, std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
|
||||
|
||||
int64_t last_graph_eviction_sweep = 0;
|
||||
|
||||
ggml_cuda_graph * cuda_graph(const void * first_node_ptr) {
|
||||
const int64_t time_now = ggml_time_us();
|
||||
|
||||
// sweep every 5s, evicting cuda graphs unused for >=10s
|
||||
if (time_now - last_graph_eviction_sweep >= 5'000'000) {
|
||||
last_graph_eviction_sweep = time_now;
|
||||
for (auto it = cuda_graphs.begin(); it != cuda_graphs.end(); ) {
|
||||
if (time_now - it->second->last_used_time >= 10'000'000) {
|
||||
it = cuda_graphs.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto it = cuda_graphs.find(first_node_ptr);
|
||||
if (it == cuda_graphs.end()) {
|
||||
cuda_graphs[first_node_ptr] = std::make_unique<ggml_cuda_graph>();
|
||||
return cuda_graphs[first_node_ptr].get();
|
||||
it = cuda_graphs.emplace(first_node_ptr, std::make_unique<ggml_cuda_graph>()).first;
|
||||
}
|
||||
it->second->last_used_time = time_now;
|
||||
return it->second.get();
|
||||
}
|
||||
|
||||
|
||||
@@ -305,12 +305,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
|
||||
const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int D2, const int stride_KV, const int i_sup) {
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
// K/V data is loaded with decreasing granularity for D for better memory bandwidth.
|
||||
// The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
|
||||
// The minimum granularity is 16 bytes.
|
||||
constexpr int h2_per_chunk = 16/sizeof(half2);
|
||||
const int chunks_per_row = D2 / h2_per_chunk;
|
||||
if constexpr (use_cp_async) {
|
||||
static_assert(warp_size == 32, "bad warp_size");
|
||||
static_assert(!oob_check, "OOB check not compatible with cp_async");
|
||||
constexpr int preload = 64;
|
||||
constexpr int h2_per_chunk = 16/sizeof(half2);
|
||||
const int chunks_per_row = D2 / h2_per_chunk;
|
||||
|
||||
const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV);
|
||||
|
||||
@@ -348,11 +349,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
|
||||
// 6: max 1*16= 16 bytes, 8 half
|
||||
ggml_cuda_unroll<6>{}(load);
|
||||
} else {
|
||||
// TODO use ggml_cuda_memcpy_1
|
||||
const half2 zero[4] = {{0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}};
|
||||
auto load = [&] __device__ (const int n) {
|
||||
const int stride_k = warp_size >> n;
|
||||
const int k0_start = stride_k == warp_size ? 0 : D2 - D2 % (2*stride_k);
|
||||
const int k0_stop = D2 - D2 % (1*stride_k);
|
||||
const int stride_k = 32 >> n;
|
||||
const int k0_start = stride_k == 32 ? 0 : chunks_per_row - chunks_per_row % (2*stride_k);
|
||||
const int k0_stop = chunks_per_row - chunks_per_row % (1*stride_k);
|
||||
const int stride_i = warp_size / stride_k;
|
||||
|
||||
if (k0_start == k0_stop) {
|
||||
@@ -371,15 +372,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
|
||||
for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
|
||||
const int k = k0 + (stride_k == warp_size ? threadIdx.x : threadIdx.x % stride_k);
|
||||
|
||||
tile_KV[i*stride_tile + k] = !oob_check || i < i_sup ? KV[i*stride_KV + k] : make_half2(0.0f, 0.0f);
|
||||
ggml_cuda_memcpy_1<16>(tile_KV + i*stride_tile + k*4,
|
||||
!oob_check || i < i_sup ? KV + i*stride_KV + k*h2_per_chunk : zero);
|
||||
}
|
||||
}
|
||||
};
|
||||
// 1: max 32* 4=128 bytes, 64 half
|
||||
// 2: max 16* 4= 64 bytes, 32 half
|
||||
// 3: max 8* 4= 32 bytes, 16 half
|
||||
// 4: max 4* 4= 16 bytes, 8 half
|
||||
ggml_cuda_unroll<4>{}(load);
|
||||
// 1: max 32*16=512 bytes, 256 half
|
||||
// 2: max 16*16=256 bytes, 128 half
|
||||
// 3: max 8*16=128 bytes, 64 half
|
||||
// 4: max 4*16= 64 bytes, 32 half
|
||||
// 5: max 2*16= 32 bytes, 16 half
|
||||
// 6: max 1*16= 16 bytes, 8 half
|
||||
ggml_cuda_unroll<6>{}(load);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -862,11 +866,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
}
|
||||
|
||||
|
||||
#if defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
T_A_VKQ A_identity;
|
||||
make_identity_mat(A_identity);
|
||||
#endif // defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
|
||||
// Calculate VKQ tile, need to use logical rather than physical elements for i0 due to transposition of V:
|
||||
#pragma unroll
|
||||
for (int i0_start = 0; i0_start < DV; i0_start += 2*nbatch_V2) {
|
||||
@@ -897,29 +896,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
const int k0 = k00 + (threadIdx.y % np)*T_A_VKQ::J;
|
||||
|
||||
T_A_VKQ A; // Transposed in SRAM but not in registers, gets transposed on load.
|
||||
#if defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
// MFMA A register layout: A_mat[i=lane%16][k=4*(lane/16)+reg].
|
||||
// Normal load gives A_mat[seq][dv] but we need A_mat[dv][seq] = V^T.
|
||||
// Load with transposed addressing: 4 strided half loads.
|
||||
{
|
||||
const half2 * xs0 = tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2;
|
||||
const half * xs0_h = (const half *) xs0;
|
||||
const int stride_h = stride_tile_V * 2; // stride in half units
|
||||
half * A_h = (half *) A.x;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
A_h[l] = xs0_h[(4*(threadIdx.x / 16) + l) * stride_h + threadIdx.x % 16];
|
||||
}
|
||||
}
|
||||
#else
|
||||
// TODO: Try to transpose tile_V when loading gmem to smem.
|
||||
// Use mma to transpose T_A_VKQ for RDNA.
|
||||
T_A_VKQ A_trans;
|
||||
load_ldmatrix(A_trans, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
|
||||
mma(A, A_trans, A_identity);
|
||||
#endif // defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
if constexpr (T_B_KQ::I == 8) {
|
||||
mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
|
||||
} else {
|
||||
|
||||
@@ -3108,6 +3108,15 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
|
||||
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
|
||||
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
|
||||
|
||||
if (cgraph->uid != 0 &&
|
||||
cgraph->uid == graph->uid) {
|
||||
GGML_LOG_DEBUG("CUDA Graph id %zu reused\n", cgraph->uid);
|
||||
GGML_ASSERT((int)graph->node_props.size() == cgraph->n_nodes);
|
||||
return false;
|
||||
}
|
||||
|
||||
graph->uid = cgraph->uid;
|
||||
|
||||
// Check if the graph size has changed
|
||||
if ((int)graph->node_props.size() != cgraph->n_nodes) {
|
||||
res = true;
|
||||
|
||||
+79
-166
@@ -86,17 +86,12 @@ namespace ggml_cuda_mma {
|
||||
// - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
|
||||
// - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
|
||||
|
||||
static constexpr bool is_i_major(const data_layout dl) {
|
||||
return dl == DATA_LAYOUT_I_MAJOR ||
|
||||
dl == DATA_LAYOUT_I_MAJOR_MIRRORED;
|
||||
}
|
||||
|
||||
static constexpr __device__ data_layout get_input_data_layout() {
|
||||
#if defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(RDNA3) || defined(VOLTA_MMA_AVAILABLE)
|
||||
return DATA_LAYOUT_I_MAJOR_MIRRORED;
|
||||
#else
|
||||
return DATA_LAYOUT_I_MAJOR;
|
||||
#endif // defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(RDNA3) || defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
|
||||
@@ -113,7 +108,6 @@ namespace ggml_cuda_mma {
|
||||
T x[ne] = {0};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 64 && J == 2) return true;
|
||||
if (I == 16 && J == 8) return true;
|
||||
if (I == 32 && J == 4) return true;
|
||||
if (I == 16 && J == 16) return true;
|
||||
@@ -122,7 +116,7 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
return threadIdx.x % 16;
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return threadIdx.x % 16;
|
||||
@@ -139,8 +133,8 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
return (2 * ((threadIdx.x / 16) % 2) + l);
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
return threadIdx.x / 16;
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return 2 * (threadIdx.x / 16) + l;
|
||||
} else if constexpr (I == 32 && J == 4) {
|
||||
@@ -154,7 +148,7 @@ namespace ggml_cuda_mma {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#elif defined(VOLTA_MMA_AVAILABLE)
|
||||
static constexpr int ne = I * J / 32;
|
||||
T x[ne] = {0};
|
||||
|
||||
@@ -283,7 +277,7 @@ namespace ggml_cuda_mma {
|
||||
static constexpr int J = J_;
|
||||
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
|
||||
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
static constexpr int ne = I * J / WARP_SIZE;
|
||||
half2 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
@@ -407,7 +401,7 @@ namespace ggml_cuda_mma {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
};
|
||||
|
||||
template <int I_, int J_>
|
||||
@@ -701,57 +695,12 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
#endif // defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
static __device__ __forceinline__ void make_identity_mat(tile<16, 8, half2> & t) {
|
||||
#if defined(RDNA4)
|
||||
const int row = t.get_i(0);
|
||||
const int left_right = t.get_j(0) / 4;
|
||||
const int up_down = row / 8;
|
||||
const int idx = row % 8;
|
||||
reinterpret_cast<half*>(t.x)[idx] = left_right == up_down ? 1.0f : 0.0f;
|
||||
#else
|
||||
GGML_UNUSED_VARS(t);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(RDNA4)
|
||||
}
|
||||
|
||||
template <int I, int J, typename T, data_layout dl>
|
||||
static __device__ __forceinline__ void load_generic(tile<I, J, T, dl> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
} else {
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
// All wmma layout has contiguous data when i-major.
|
||||
if constexpr (is_i_major(dl)) {
|
||||
// the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes()
|
||||
constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes();
|
||||
if constexpr (sizeof(t.x) > aligned_copy_bytes) {
|
||||
static_assert(sizeof(t.x) % aligned_copy_bytes == 0, "bad type size");
|
||||
constexpr int aligned_copy_count = sizeof(t.x)/aligned_copy_bytes;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < aligned_copy_count; ++i) {
|
||||
ggml_cuda_memcpy_1<aligned_copy_bytes>(t.x + t.ne/aligned_copy_count*i, xs0 + t.get_i(0) * stride + t.get_j(t.ne/aligned_copy_count*i));
|
||||
}
|
||||
} else {
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
#endif // defined(AMD_MFMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -764,26 +713,37 @@ namespace ggml_cuda_mma {
|
||||
: "=r"(xi[0]), "=r"(xi[1])
|
||||
: "l"(xs));
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template <typename T, data_layout dl>
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<16, 4, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
tile<16, 4, T, dl> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
int * xi = (int *) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride;
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
|
||||
: "=r"(xi[0]), "=r"(xi[1])
|
||||
: "l"(xs));
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
#ifdef RDNA3
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 16, "bad ne");
|
||||
ggml_cuda_memcpy_1<8>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
|
||||
ggml_cuda_memcpy_1<8>(t.x + 2, xs0 + t.get_i(0)*stride + 2);
|
||||
#else
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 8, "bad ne");
|
||||
ggml_cuda_memcpy_1<8>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#endif // RDNA3
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
static_assert(sizeof(t.x) == 4, "bad ne");
|
||||
ggml_cuda_memcpy_1<4>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#else
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -796,19 +756,26 @@ namespace ggml_cuda_mma {
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(xi[0]), "=r"(xi[1]), "=r"(xi[2]), "=r"(xi[3])
|
||||
: "l"(xs));
|
||||
#else
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if 1
|
||||
// TODO: more generic handling
|
||||
static_assert(sizeof(T) == 4, "bad type size");
|
||||
#elif defined(VOLTA_MMA_AVAILABLE)
|
||||
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
|
||||
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 4, xs0 + t.get_i(4)*stride + 4);
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
#ifdef RDNA3
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 32, "bad ne");
|
||||
ggml_cuda_memcpy_1<16>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
|
||||
ggml_cuda_memcpy_1<16>(t.x + 4, xs0 + t.get_i(0)*stride + 4);
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
#endif // 1
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 16, "bad ne");
|
||||
ggml_cuda_memcpy_1<16>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#endif // RDNA3
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
static_assert(sizeof(t.x) == 8, "bad ne");
|
||||
ggml_cuda_memcpy_1<8>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -827,23 +794,30 @@ namespace ggml_cuda_mma {
|
||||
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<32, 4, half2> & t, const half2 * __restrict__ xs0, const int stride) {
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
ggml_cuda_memcpy_1<4*sizeof(half2)>(t.x, xs0 + t.get_i(0)*stride);
|
||||
#else
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ void load_ldmatrix_trans(
|
||||
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
int * xi = (int * ) t.x;
|
||||
int * xi = (int *) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.b16 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(xi[0]), "=r"(xi[2]), "=r"(xi[1]), "=r"(xi[3])
|
||||
: "l"(xs));
|
||||
#elif defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
half * xh = (half *) t.x;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
xh[2*l + 0] = ((const half *) xs0)[(2*t.get_j(l) + 0)*(2*stride) + t.get_i(l)];
|
||||
xh[2*l + 1] = ((const half *) xs0)[(2*t.get_j(l) + 1)*(2*stride) + t.get_i(l)];
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
@@ -1218,73 +1192,27 @@ namespace ggml_cuda_mma {
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * acc = (int32x4_t *) D.x;
|
||||
#if defined(CDNA4) || defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0], ((int64_t *) B.x)[0], acc[0], 0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA1)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1], B.x[1], acc[0], 0, 0, 0);
|
||||
#endif // defined(CDNA4) || defined(CDNA3)
|
||||
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
||||
int32x8_t * acc = (int32x8_t *) D.x;
|
||||
|
||||
#if defined(RDNA4)
|
||||
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
int32x2_t * a_vec = (int32x2_t *) A.x;
|
||||
int32x2_t * b_vec = (int32x2_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[1],
|
||||
true,
|
||||
b_vec[1],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[0], true, b_vec[0], acc[0], true);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[1], true, b_vec[1], acc[0], true);
|
||||
#elif defined(RDNA3)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * a_vec = (int32x4_t *) A.x;
|
||||
int32x4_t * b_vec = (int32x4_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
true,
|
||||
a_vec[1],
|
||||
true,
|
||||
b_vec[1],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[0], true, b_vec[0], acc[0], true);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[1], true, b_vec[1], acc[0], true);
|
||||
#endif // RDNA4
|
||||
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
@@ -1297,19 +1225,10 @@ namespace ggml_cuda_mma {
|
||||
using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
|
||||
int32x16_t * acc = (int32x16_t *) D.x;
|
||||
#if defined(CDNA4) || defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0], ((int64_t *) B.x)[0], acc[0], 0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA1)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1], B.x[1], acc[0], 0, 0, 0);
|
||||
#endif // defined(CDNA4) || defined(CDNA3)
|
||||
|
||||
#else
|
||||
@@ -1329,7 +1248,7 @@ namespace ggml_cuda_mma {
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<32, 8, float> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & B) {
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
@@ -1344,12 +1263,12 @@ namespace ggml_cuda_mma {
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<32, 4, half2> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> & B) {
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
@@ -1364,41 +1283,35 @@ namespace ggml_cuda_mma {
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <data_layout dl_d, data_layout dl_ab>
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, int, dl_d> & D, const tile<16, 4, int, dl_ab> & A, const tile<16, 4, int, dl_ab> & B) {
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * acc = (int32x4_t *) D.x;
|
||||
#if defined(CDNA4) || defined(CDNA3)
|
||||
const int64_t xA = uint32_t(A.x[0]);
|
||||
const int64_t xB = uint32_t(B.x[0]);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(xA, xB, acc[0], 0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA1)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
|
||||
#endif // defined(CDNA4) || defined(CDNA3)
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
||||
int32x8_t * acc = (int32x8_t *) D.x;
|
||||
#if defined(RDNA4)
|
||||
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
int32x2_t * a_vec = (int32x2_t *) A.x;
|
||||
int32x2_t * b_vec = (int32x2_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
false
|
||||
);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[0], true, b_vec[0], acc[0], false);
|
||||
#elif defined(RDNA3)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * a_vec = (int32x4_t *) A.x;
|
||||
int32x4_t * b_vec = (int32x4_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
false
|
||||
);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[0], true, b_vec[0], acc[0], false);
|
||||
#endif // RDNA4
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
|
||||
+16
-185
@@ -104,7 +104,7 @@ struct tile_x_sizes {
|
||||
};
|
||||
|
||||
static int get_mmq_x_max_host(const int cc) {
|
||||
return (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
|
||||
return (turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
|
||||
GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
128 : 64;
|
||||
@@ -114,9 +114,9 @@ static int get_mmq_x_max_host(const int cc) {
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_mmq_x_max_device() {
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
return 128;
|
||||
#else // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
#else // defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
return 64;
|
||||
@@ -1054,13 +1054,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
float dB;
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
@@ -1295,13 +1295,13 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float2 dsB = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
|
||||
@@ -1435,57 +1435,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
|
||||
|
||||
const int * x_qs = (const int *) x;
|
||||
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const float * y_df = (const float *) y;
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B[1];
|
||||
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2;
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
tile_C C;
|
||||
mma(C, A[n], B[0]);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4] * dB;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
@@ -1510,13 +1460,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
|
||||
@@ -1742,74 +1692,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
|
||||
|
||||
const int * x_qs = (const int *) x;
|
||||
const half2 * x_dm = (const half2 *) x_qs + MMQ_TILE_NE_K*2;
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const half2 * y_ds = (const half2 *) y;
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B[1];
|
||||
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x/2 : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y/2;
|
||||
const float sB = (k01 >= MMQ_TILE_NE_K * 3/4) ? 0
|
||||
: (((k01/4)%2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).y
|
||||
: __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).x);
|
||||
|
||||
tile_C Cm;
|
||||
if (k01 >= MMQ_TILE_NE_K * 3/4) {
|
||||
tile_A A1;
|
||||
A1.x[0] = 0x01010101;
|
||||
A1.x[1] = 0x01010101;
|
||||
mma(Cm, A1, B[0]);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
tile_C Cd;
|
||||
mma(Cd, A[n], B[0]);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
|
||||
const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/4]);
|
||||
float tmp = Cd.x[l]*dm.x;
|
||||
if (k01 >= MMQ_TILE_NE_K * 3/4) {
|
||||
tmp -= Cm.x[l]*dm.y;
|
||||
}
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*dB;
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] -= dm.y*sB;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
@@ -1834,13 +1717,13 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y;
|
||||
@@ -2573,59 +2456,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
|
||||
|
||||
const int * x_qs = (const int *) x;
|
||||
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
|
||||
const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K;
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const float * y_df = (const float *) y;
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B[1];
|
||||
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2;
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
tile_C C;
|
||||
mma(C, A[n], B[0]);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
|
||||
const int8_t * sc = (const int8_t *) (x_sc + i*MMQ_MMA_TILE_X_K_Q6_K + k00/16);
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * sc[k01/4] * x_df[i*MMQ_MMA_TILE_X_K_Q6_K] * dB;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
@@ -2651,13 +2482,13 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
|
||||
|
||||
Vendored
-1
@@ -33,7 +33,6 @@
|
||||
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
|
||||
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
|
||||
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
|
||||
#define NCCL_CHECK(fn) {ncclResult_t err = fn; if(err != ncclSuccess) { GGML_ABORT("RCCL Failure RCCL returned: %i\n", err); }}
|
||||
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
|
||||
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
|
||||
@@ -648,9 +648,9 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
|
||||
assert(n_cols % HMX_FP16_TILE_N_COLS == 0);
|
||||
assert(k_block % HMX_FP16_TILE_N_COLS == 0);
|
||||
|
||||
int n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
|
||||
int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS;
|
||||
int n_tot_tiles = n_col_tiles * n_k_tiles;
|
||||
size_t n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
|
||||
size_t n_k_tiles = k_block / HMX_FP16_TILE_N_COLS;
|
||||
size_t n_tot_tiles = n_col_tiles * n_k_tiles;
|
||||
|
||||
size_t n_tiles_per_task = hmx_ceil_div(n_tot_tiles, ctx->n_threads);
|
||||
|
||||
@@ -678,9 +678,8 @@ static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict
|
||||
__builtin_assume(n_dot_tiles > 0);
|
||||
|
||||
Q6_bias_mxmem2_A((void *)scales);
|
||||
|
||||
for (int r = 0; r < n_row_tiles; ++r) {
|
||||
for (int c = 0; c < n_col_tiles; ++c) {
|
||||
for (size_t c = 0; c < n_col_tiles; ++c) {
|
||||
Q6_mxclracc_hf();
|
||||
|
||||
const __fp16 *row_tiles = activation + r * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
@@ -738,25 +737,25 @@ static inline void hmx_matmul_job_init(hmx_matmul_job_t * job,
|
||||
|
||||
static void transfer_output_chunk_fp16_to_fp32(float *restrict dst, const __fp16 *restrict vtcm_src, int n_rows, int n_cols, int n) {
|
||||
assert(n_cols % HMX_FP16_TILE_N_COLS == 0);
|
||||
const int n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
|
||||
const size_t tile_row_stride = (n_cols / HMX_FP16_TILE_N_COLS) * HMX_FP16_TILE_N_ELMS;
|
||||
|
||||
const HVX_Vector one = hvx_vec_splat_f16(1.0);
|
||||
|
||||
for (int r = 0; r < n_rows; r += 2) {
|
||||
int r0 = r / HMX_FP16_TILE_N_ROWS;
|
||||
int r1 = r % HMX_FP16_TILE_N_ROWS;
|
||||
for (size_t r = 0; r < n_rows; r += 2) {
|
||||
const size_t r0 = r / HMX_FP16_TILE_N_ROWS;
|
||||
const size_t r1 = (r % HMX_FP16_TILE_N_ROWS) / 2; // index of the row pair within the tile
|
||||
const __fp16 *row_base = vtcm_src + r0 * tile_row_stride;
|
||||
float *output_row_base = dst + r * n; // global memory row base for row r (and r+1)
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int c = 0; c < n_cols; c += HMX_FP16_TILE_N_COLS) {
|
||||
int c0 = c / HMX_FP16_TILE_N_COLS;
|
||||
|
||||
const __fp16 *tile = vtcm_src + (r0 * n_col_tiles + c0) * HMX_FP16_TILE_N_ELMS;
|
||||
|
||||
HVX_Vector v = ((const HVX_Vector *) tile)[r1 / 2];
|
||||
for (size_t c = 0; c < n_cols; c += HMX_FP16_TILE_N_COLS) {
|
||||
const size_t c0 = c / HMX_FP16_TILE_N_COLS;
|
||||
const __fp16 *tile = row_base + c0 * HMX_FP16_TILE_N_ELMS;
|
||||
HVX_Vector v = ((const HVX_Vector *) tile)[r1];
|
||||
HVX_VectorPair vp = Q6_Wqf32_vmpy_VhfVhf(v, one);
|
||||
|
||||
volatile HVX_Vector *pv_out0 = (volatile HVX_Vector *) (dst + (r * n + c + 0));
|
||||
volatile HVX_Vector *pv_out1 = (volatile HVX_Vector *) (dst + (r * n + c + n)); // next row in global memory
|
||||
volatile HVX_Vector *pv_out0 = (volatile HVX_Vector *) (output_row_base + c + 0);
|
||||
volatile HVX_Vector *pv_out1 = (volatile HVX_Vector *) (output_row_base + c + n); // next row in global memory
|
||||
|
||||
*pv_out0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(vp));
|
||||
if (r + 1 < n_rows) {
|
||||
@@ -794,7 +793,7 @@ static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst,
|
||||
assert(n_cols % HMX_FP16_TILE_N_COLS == 0);
|
||||
|
||||
size_t n_tot_chunks = n_rows;
|
||||
size_t n_chunks_per_task = 32; // must be multiple of HMX_FP16_TILE_N_ROWS (32)
|
||||
size_t n_chunks_per_task = HMX_FP16_TILE_N_ROWS; // must be multiple of HMX_FP16_TILE_N_ROWS (32)
|
||||
|
||||
output_transfer_task_state_t state;
|
||||
state.n_tasks = (n_tot_chunks + n_chunks_per_task - 1) / n_chunks_per_task;
|
||||
@@ -926,7 +925,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
|
||||
return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
|
||||
}
|
||||
|
||||
hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0
|
||||
hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16
|
||||
|
||||
FARF(MEDIUM, "%s: grouped path m=%d k=%d n=%d group=%d streams=%d mc=%zu nc=%zu vtcm=%zu/%zu",
|
||||
__func__, params->m, params->k, params->n, group_size, params->ne13,
|
||||
@@ -944,12 +943,15 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
|
||||
const size_t fp16_row_bytes = (size_t) params->k * sizeof(__fp16);
|
||||
const size_t weight_row_bytes = (size_t) params->weight_stride * sizeof(__fp16);
|
||||
|
||||
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
|
||||
|
||||
for (int b3 = 0; b3 < params->ne13; ++b3) {
|
||||
for (int b2_base = 0; b2_base < params->ne12; b2_base += group_size) {
|
||||
const __fp16 *weight_group = hmx_matmul_weight_batch_ptr(params, b2_base, b3);
|
||||
|
||||
for (size_t mr = 0; mr < (size_t) params->m; mr += m_chunk_n_rows) {
|
||||
const size_t n_rows = hex_smin((size_t) params->m - mr, m_chunk_n_rows);
|
||||
const size_t n_row_tiles = hmx_ceil_div((int) n_rows, HMX_FP16_TILE_N_ROWS);
|
||||
|
||||
// Pre-load activations for all heads in the group (once per m_chunk).
|
||||
// When the source is strided (permuted Q), use 2D DMA to gather
|
||||
@@ -987,10 +989,9 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
|
||||
fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_first);
|
||||
}
|
||||
|
||||
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
|
||||
|
||||
for (size_t nc = 0; nc < (size_t) params->n; nc += n_chunk_n_cols) {
|
||||
const size_t n_cols = hex_smin((size_t) params->n - nc, n_chunk_n_cols);
|
||||
const size_t n_col_tiles = hmx_ceil_div((int) n_cols, HMX_FP16_TILE_N_COLS);
|
||||
|
||||
TIMER_START(weight_load);
|
||||
{
|
||||
@@ -1014,11 +1015,9 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
|
||||
for (int g = 0; g < group_size; ++g) {
|
||||
TIMER_START(hmx_core);
|
||||
{
|
||||
const __fp16 *vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride;
|
||||
const int n_row_tiles = hmx_ceil_div((int) n_rows, HMX_FP16_TILE_N_ROWS);
|
||||
const int n_col_tiles = hmx_ceil_div((int) n_cols, HMX_FP16_TILE_N_COLS);
|
||||
core_dot_chunk_fp16(vtcm_output, vtcm_act_g, vtcm_weight, vtcm_scales,
|
||||
n_row_tiles, n_col_tiles, params->k / 32);
|
||||
const __fp16 * vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride;
|
||||
core_dot_chunk_fp16(vtcm_output, vtcm_act_g, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles,
|
||||
params->k / 32);
|
||||
}
|
||||
TIMER_STOP(hmx_core);
|
||||
|
||||
@@ -1030,12 +1029,12 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
|
||||
TIMER_STOP(output_store);
|
||||
}
|
||||
}
|
||||
|
||||
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
|
||||
|
||||
TIMER_STOP(total);
|
||||
|
||||
#if defined(ENABLE_PROFILE_TIMERS)
|
||||
@@ -1103,7 +1102,7 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
|
||||
return -1;
|
||||
}
|
||||
|
||||
hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0
|
||||
hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16
|
||||
|
||||
FARF(MEDIUM, "%s: m=%d k=%d n=%d mc=%zu nc=%zu vtcm=%zu/%zu",
|
||||
__func__, m, k, n, m_chunk_n_rows, n_chunk_n_cols,
|
||||
@@ -1121,7 +1120,8 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
|
||||
|
||||
for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
|
||||
// transfer activation matrix chunk into VTCM
|
||||
size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
|
||||
const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
|
||||
const size_t n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);
|
||||
|
||||
TIMER_START(activation_load);
|
||||
{
|
||||
@@ -1159,7 +1159,8 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
|
||||
}
|
||||
|
||||
for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) {
|
||||
size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
|
||||
const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
|
||||
const size_t n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);
|
||||
|
||||
TIMER_START(weight_load);
|
||||
{
|
||||
@@ -1184,8 +1185,6 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
|
||||
|
||||
TIMER_START(hmx_core);
|
||||
{
|
||||
const int n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);
|
||||
const int n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);
|
||||
core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, k / 32);
|
||||
}
|
||||
TIMER_STOP(hmx_core);
|
||||
@@ -1307,7 +1306,7 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
return -1;
|
||||
}
|
||||
|
||||
hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0
|
||||
hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16
|
||||
|
||||
FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d pipe=%d mc=%zu nc=%zu vtcm=%zu/%zu",
|
||||
__func__, m, k, n, weight_type, use_pipeline,
|
||||
@@ -1330,7 +1329,8 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
|
||||
for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
|
||||
// transfer activation matrix chunk into VTCM
|
||||
size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
|
||||
const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
|
||||
const size_t n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);
|
||||
|
||||
TIMER_START(activation_load);
|
||||
{
|
||||
@@ -1348,7 +1348,8 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
}
|
||||
|
||||
for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) {
|
||||
size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
|
||||
const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
|
||||
const size_t n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);
|
||||
|
||||
TIMER_START(weight_load);
|
||||
{
|
||||
@@ -1373,8 +1374,6 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
|
||||
TIMER_START(hmx_core);
|
||||
{
|
||||
const int n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);
|
||||
const int n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);
|
||||
core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, k / 32);
|
||||
}
|
||||
TIMER_STOP(hmx_core);
|
||||
@@ -1521,14 +1520,16 @@ void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __f
|
||||
|
||||
Q6_bias_mxmem2_A((void *)col_scales);
|
||||
|
||||
for (int i = 0; i < n_row_tiles; ++i) {
|
||||
for (int j = 0; j < n_col_tiles; ++j) {
|
||||
const size_t dot_tile_stride = n_dot_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
for (size_t i = 0; i < n_row_tiles; ++i) {
|
||||
const __fp16 *row_base = a + i * dot_tile_stride;
|
||||
__fp16 *res_base = c + i * n_col_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
for (size_t j = 0; j < n_col_tiles; ++j) {
|
||||
Q6_mxclracc_hf();
|
||||
|
||||
const __fp16 *row_tiles = a + i * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
const __fp16 *col_tiles = b + j * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
|
||||
__fp16 *accum_tile = c + (i * n_col_tiles + j) * HMX_FP16_TILE_N_ELMS;
|
||||
const __fp16 *col_tiles = b + j * dot_tile_stride;
|
||||
const __fp16 *row_tiles = row_base;
|
||||
__fp16 *accum_tile = res_base + j * HMX_FP16_TILE_N_ELMS;
|
||||
if (!zero_init) {
|
||||
Q6_activation_hf_mxmem_RR((unsigned int)accum_tile, 2047);
|
||||
Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047);
|
||||
@@ -1697,7 +1698,7 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
|
||||
v = Q6_V_vror_VR(v, VLEN - 8);
|
||||
}
|
||||
}
|
||||
hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0
|
||||
hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16
|
||||
|
||||
TIMER_DEFINE(fetch);
|
||||
TIMER_DEFINE(act_load);
|
||||
@@ -1715,7 +1716,7 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
|
||||
const int n_col_tiles = hmx_ceil_div(n_blk_sz, HMX_FP16_TILE_N_COLS);
|
||||
|
||||
for (size_t kk = 0; kk < k; kk += K_BLOCK_SIZE) {
|
||||
size_t k_blk_sz = hex_smin(k - kk, K_BLOCK_SIZE);
|
||||
const size_t k_blk_sz = hex_smin(k - kk, K_BLOCK_SIZE);
|
||||
|
||||
TIMER_START(fetch);
|
||||
// fetch activation block into VTCM
|
||||
@@ -1731,13 +1732,13 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
|
||||
}
|
||||
|
||||
// fetch weight block into VTCM (x4x2 sub-block: quants + scales)
|
||||
const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
|
||||
{
|
||||
qweight_fetch_task_state_t s;
|
||||
|
||||
const int blk_start = kk / QK_Q4_0x4x2;
|
||||
const int nb_sub = (k_blk_sz + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2;
|
||||
const int full_qrow = (weight_type == HTP_TYPE_Q8_0) ? k : (k / 2);
|
||||
const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
|
||||
const int scale_blk_size =
|
||||
(weight_type == HTP_TYPE_MXFP4) ? HMX_X4X2_MXFP4_EBLK_SIZE : HMX_X4X2_DBLK_SIZE;
|
||||
|
||||
@@ -1777,7 +1778,6 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
// vtcm_scratch0 is used to store the qweight chunk
|
||||
// worker_pool_run_func already returned, so fetch is done
|
||||
const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, vtcm_scratch0,
|
||||
n_blk_sz, k_blk_sz, sub_row_stride, weight_type);
|
||||
}
|
||||
|
||||
@@ -98,6 +98,8 @@ enum htp_op_code {
|
||||
#define HTP_OP_MAX_VMEM (3221225472u)
|
||||
#endif
|
||||
|
||||
#define HTP_MMAP_MAX_VMEM (2147483648u)
|
||||
|
||||
enum htp_tensor_flags {
|
||||
HTP_TENSOR_COMPUTE = (1U << 0), // Tensor buffer temporal compute data (not weights)
|
||||
HTP_TENSOR_FLUSHED = (1U << 1) // Tensor buffer has been flushed (set by the NPU)
|
||||
|
||||
@@ -118,7 +118,11 @@ AEEResult htp_iface_close(remote_handle64 handle) {
|
||||
// release the mmaps (if any)
|
||||
for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
|
||||
if (ctx->mmap[i].size) {
|
||||
#if __HVX_ARCH__ > 73
|
||||
HAP_munmap2((void *) ctx->mmap[i].base, ctx->mmap[i].size);
|
||||
#else
|
||||
HAP_munmap((void *) ctx->mmap[i].base, ctx->mmap[i].size);
|
||||
#endif
|
||||
ctx->mmap[i].size = 0;
|
||||
ctx->mmap[i].base = NULL;
|
||||
ctx->mmap[i].fd = -1;
|
||||
@@ -173,8 +177,16 @@ AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t
|
||||
struct htp_mmap *m = &ctx->mmap[i];
|
||||
if (!m->size) {
|
||||
FARF(HIGH, "mmap : fd %u size %u pinned %u", fd, size, pinned);
|
||||
|
||||
#if __HVX_ARCH__ > 73
|
||||
void *va = HAP_mmap2(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
|
||||
#else
|
||||
if (size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
|
||||
FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) size);
|
||||
abort(); // can't do much else at this point
|
||||
}
|
||||
|
||||
void *va = HAP_mmap(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
|
||||
#endif
|
||||
if (va == (void*)-1) {
|
||||
FARF(ERROR, "mmap failed : va %p fd %u size %u", va, fd, (uint32_t) size);
|
||||
return AEE_EFAILED;
|
||||
@@ -202,7 +214,11 @@ AEEResult htp_iface_munmap(remote_handle64 handle, int fd) {
|
||||
struct htp_mmap *m = &ctx->mmap[i];
|
||||
if (fd < 0 || m->fd == fd) {
|
||||
FARF(HIGH, "unmmap : base %p fd %u size %u", (void*) m->base, m->fd, (uint32_t) m->size);
|
||||
#if __HVX_ARCH__ > 73
|
||||
HAP_munmap2((void *) m->base, m->size);
|
||||
#else
|
||||
HAP_munmap((void *) m->base, m->size);
|
||||
#endif
|
||||
m->size = 0;
|
||||
m->base = NULL;
|
||||
m->fd = -1;
|
||||
@@ -526,7 +542,11 @@ static inline bool reuse_buf(struct htp_context *ctx, uint32_t *m_reuse, struct
|
||||
static inline void drop_mmap(struct htp_context *ctx, struct htp_mmap *m) {
|
||||
if (m->size && !m->pinned) {
|
||||
FARF(HIGH, "unmap : fd %u base %p size %u pinned %u", m->fd, (void*) m->base, (uint32_t) m->size, m->pinned);
|
||||
#if __HVX_ARCH__ > 73
|
||||
HAP_munmap2((void *) m->base, m->size);
|
||||
#else
|
||||
HAP_munmap((void *) m->base, m->size);
|
||||
#endif
|
||||
m->size = 0;
|
||||
m->base = 0;
|
||||
m->fd = -1;
|
||||
@@ -540,7 +560,16 @@ static inline void mmap_buf(struct htp_context *ctx, struct htp_buf_desc *b) {
|
||||
for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) {
|
||||
struct htp_mmap *m = &ctx->mmap[i];
|
||||
if (!m->size) {
|
||||
#if __HVX_ARCH__ > 73
|
||||
void *va = HAP_mmap2(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
|
||||
#else
|
||||
if (b->size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
|
||||
FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) b->size);
|
||||
abort(); // can't do much else at this point
|
||||
}
|
||||
|
||||
void *va = HAP_mmap(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
|
||||
#endif
|
||||
if (va == (void*)-1) {
|
||||
FARF(ERROR, "mmap failed : va %p fd %u size %u", va, b->fd, (uint32_t) b->size);
|
||||
abort(); // can't do much else at this point
|
||||
|
||||
@@ -30,6 +30,8 @@ extern "C" {
|
||||
|
||||
void ggml_print_backtrace(void);
|
||||
|
||||
uint64_t ggml_graph_next_uid(void);
|
||||
|
||||
#ifndef MIN
|
||||
# define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
@@ -338,6 +340,10 @@ struct ggml_cgraph {
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
|
||||
// an optional identifier that can be utilized to recognize same graphs if two non-zero values match
|
||||
// a value of 0 means it is not set and should be ignored
|
||||
uint64_t uid;
|
||||
};
|
||||
|
||||
// returns a slice of cgraph with nodes [i0, i1)
|
||||
|
||||
@@ -1819,6 +1819,23 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale(ggml_met
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_roll(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_ROLL);
|
||||
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
snprintf(base, 256, "kernel_roll_%s", ggml_type_name(op->src[0]->type));
|
||||
snprintf(name, 256, "%s", base);
|
||||
|
||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||
if (!res.pipeline) {
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_PAD);
|
||||
|
||||
|
||||
@@ -152,6 +152,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_3d
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_roll (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_adamw (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
|
||||
@@ -1138,6 +1138,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_TOP_K:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_ROLL:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
// for new head sizes, add checks here
|
||||
|
||||
@@ -1017,6 +1017,29 @@ typedef struct {
|
||||
int32_t p1;
|
||||
} ggml_metal_kargs_pad_reflect_1d;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int64_t ne02;
|
||||
int64_t ne03;
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int64_t ne0;
|
||||
int64_t ne1;
|
||||
int64_t ne2;
|
||||
int64_t ne3;
|
||||
uint64_t nb0;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
int32_t s0;
|
||||
int32_t s1;
|
||||
int32_t s2;
|
||||
int32_t s3;
|
||||
} ggml_metal_kargs_roll;
|
||||
|
||||
typedef struct {
|
||||
uint64_t nb1;
|
||||
int dim;
|
||||
|
||||
@@ -410,6 +410,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
||||
{
|
||||
n_fuse = ggml_metal_op_pad_reflect_1d(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_ROLL:
|
||||
{
|
||||
n_fuse = ggml_metal_op_roll(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_ARANGE:
|
||||
{
|
||||
n_fuse = ggml_metal_op_arange(ctx, idx);
|
||||
@@ -3945,6 +3949,59 @@ int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int ggml_metal_op_roll(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_tensor * op = ctx->node(idx);
|
||||
|
||||
ggml_metal_library_t lib = ctx->lib;
|
||||
ggml_metal_encoder_t enc = ctx->enc;
|
||||
|
||||
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
|
||||
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
|
||||
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
|
||||
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
|
||||
|
||||
const int32_t s0 = ggml_get_op_params_i32(op, 0);
|
||||
const int32_t s1 = ggml_get_op_params_i32(op, 1);
|
||||
const int32_t s2 = ggml_get_op_params_i32(op, 2);
|
||||
const int32_t s3 = ggml_get_op_params_i32(op, 3);
|
||||
|
||||
ggml_metal_kargs_roll args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.ne3 =*/ ne3,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.s0 =*/ s0,
|
||||
/*.s1 =*/ s1,
|
||||
/*.s2 =*/ s2,
|
||||
/*.s3 =*/ s3
|
||||
};
|
||||
|
||||
auto pipeline = ggml_metal_library_get_pipeline_roll(lib, op);
|
||||
|
||||
const int nth = std::min(1024, ne0);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_tensor * op = ctx->node(idx);
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ int ggml_metal_op_conv_transpose_2d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_upscale (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_pad (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_pad_reflect_1d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_roll (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_arange (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_argmax (ggml_metal_op_t ctx, int idx);
|
||||
|
||||
@@ -5247,6 +5247,40 @@ kernel void kernel_upscale_bicubic_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_roll_f32(
|
||||
constant ggml_metal_kargs_roll & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
device const float * src0_ptr = (device const float *) src0;
|
||||
device float * dst_ptr = (device float *) dst;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
// apply shifts and wrap around
|
||||
int64_t i00 = i0 - args.s0;
|
||||
int64_t i01 = i1 - args.s1;
|
||||
int64_t i02 = i2 - args.s2;
|
||||
int64_t i03 = i3 - args.s3;
|
||||
|
||||
if (i00 < 0) { i00 += args.ne00; } else if (i00 >= args.ne00) { i00 -= args.ne00; }
|
||||
if (i01 < 0) { i01 += args.ne01; } else if (i01 >= args.ne01) { i01 -= args.ne01; }
|
||||
if (i02 < 0) { i02 += args.ne02; } else if (i02 >= args.ne02) { i02 -= args.ne02; }
|
||||
if (i03 < 0) { i03 += args.ne03; } else if (i03 >= args.ne03) { i03 -= args.ne03; }
|
||||
|
||||
int64_t src_idx = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00 + i00;
|
||||
int64_t dst_idx = i3 *args.ne2 *args.ne1 *args.ne0 + i2 *args.ne1 *args.ne0 + i1 *args.ne0 + i0;
|
||||
|
||||
dst_ptr[dst_idx] = src0_ptr[src_idx];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_pad_f32(
|
||||
constant ggml_metal_kargs_pad & args,
|
||||
device const char * src0,
|
||||
|
||||
@@ -121,6 +121,8 @@ set(GGML_OPENCL_KERNELS
|
||||
gemm_noshuffle_q4_k_f32
|
||||
gemv_noshuffle_q6_k_f32
|
||||
gemm_noshuffle_q6_k_f32
|
||||
gemv_noshuffle_q5_k_f32
|
||||
gemm_noshuffle_q5_k_f32
|
||||
mul
|
||||
neg
|
||||
norm
|
||||
|
||||
@@ -542,6 +542,8 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_restore_block_q4_K_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K;
|
||||
cl_kernel kernel_convert_block_q5_K, kernel_restore_block_q5_K;
|
||||
cl_kernel kernel_convert_block_q5_K_noshuffle;
|
||||
cl_kernel kernel_restore_block_q5_K_noshuffle;
|
||||
cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
||||
cl_kernel kernel_mul_mv_q4_1_f32;
|
||||
@@ -730,6 +732,8 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_gemm_noshuffle_q4_k_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q6_K_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q6_K_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q5_k_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q5_k_f32;
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
void free() {
|
||||
@@ -944,6 +948,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_K_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_K_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K_noshuffle", &err), err));
|
||||
@@ -2794,6 +2800,45 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q6_K_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q6_K_f32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_noshuffle_q5_k_f32
|
||||
{
|
||||
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable ";
|
||||
if (backend_ctx->has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemv_noshuffle_q5_k_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemv_noshuffle_q5_k_f32.cl");
|
||||
#endif
|
||||
|
||||
cl_program prog = build_program_from_source(
|
||||
backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q5_k_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q5_k_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_noshuffle_q5_k_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_noshuffle_q5_k_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_noshuffle_q5_k_f32.cl");
|
||||
#endif
|
||||
cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q5_k_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q5_k_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_LOG_CONT("\n");
|
||||
}
|
||||
@@ -5071,115 +5116,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
|
||||
// Transpose weights
|
||||
size_t q_size_bytes = K * M / 4 * sizeof(float);
|
||||
cl_buffer_region region;
|
||||
region.origin = 0;
|
||||
region.size = q_size_bytes;
|
||||
cl_mem qT_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_quant_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_mem q_d_image1D;
|
||||
cl_mem qT_d_image1D;
|
||||
|
||||
cl_image_format img_fmt_1d;
|
||||
cl_image_desc img_desc_1d;
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4 / 4;
|
||||
img_desc_1d.buffer = extra->q;
|
||||
q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4 / 4;
|
||||
img_desc_1d.buffer = qT_d;
|
||||
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
int height_q = M / 4;
|
||||
int width_q = K / 4 / 4;
|
||||
kernel = backend_ctx->kernel_transpose_32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
|
||||
|
||||
size_t local_size_q[3] = {4, 16, 1};
|
||||
size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
// Transpose scales
|
||||
size_t d_size_bytes = M * (K / 32) * 2;
|
||||
region.origin = 0;
|
||||
region.size = d_size_bytes;
|
||||
cl_mem dT_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_scales_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_mem d_d_image1D;
|
||||
cl_mem dT_d_image1D;
|
||||
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_fmt_1d = { CL_R, CL_HALF_FLOAT };
|
||||
img_desc_1d.image_width = M * K / 32;
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.buffer = extra->d;
|
||||
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 32 / 4;
|
||||
img_desc_1d.buffer = dT_d;
|
||||
dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
int height_s = M / 4;
|
||||
int width_s = K / 32;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_16_4x1;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
|
||||
|
||||
size_t local_size_s[3] = {4, 16, 1};
|
||||
size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
// copy transposed buffer contents to original buffers
|
||||
CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
CL_CHECK(clReleaseMemObject(qT_d));
|
||||
CL_CHECK(clReleaseMemObject(dT_d));
|
||||
|
||||
CL_CHECK(clReleaseMemObject(q_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(d_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(qT_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(dT_d_image1D));
|
||||
transpose_2d_as_32b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
|
||||
} // end transpose
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
@@ -5354,7 +5292,17 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
CL_CHECK(err);
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K;
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
kernel = backend_ctx->kernel_convert_block_q5_K_noshuffle;
|
||||
}
|
||||
#else
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K;
|
||||
#endif
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
||||
@@ -5362,6 +5310,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
@@ -5378,6 +5328,21 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
extra->size_dm = size_dm;
|
||||
|
||||
tensor->extra = extra;
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
|
||||
// Transpose q, d, dm as ushort, qh as uchar
|
||||
transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
|
||||
transpose_2d_as_8b (backend_ctx, extra->qh, extra->qh, size_qh, K/8, M);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/256, M);
|
||||
transpose_2d_as_16b(backend_ctx, extra->dm, extra->dm, size_dm, K/256, M);
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q6_K) {
|
||||
@@ -5894,6 +5859,57 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
|
||||
size_t size_q = extra->size_q;
|
||||
size_t size_qh = extra->size_qh;
|
||||
size_t size_d = extra->size_d;
|
||||
size_t size_dm = extra->size_dm;
|
||||
|
||||
static ggml_cl_buffer buf_trans_q;
|
||||
static ggml_cl_buffer buf_trans_qh;
|
||||
static ggml_cl_buffer buf_trans_d;
|
||||
static ggml_cl_buffer buf_trans_dm;
|
||||
|
||||
buf_trans_q.allocate(backend_ctx->context, size_q);
|
||||
buf_trans_qh.allocate(backend_ctx->context, size_qh);
|
||||
buf_trans_d.allocate(backend_ctx->context, size_d);
|
||||
buf_trans_dm.allocate(backend_ctx->context, size_dm);
|
||||
|
||||
// Reverse transpose q, qh, d, dm
|
||||
transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4);
|
||||
transpose_2d_as_8b (backend_ctx, extra->qh, buf_trans_qh.buffer, size_qh, M, K/8);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/256);
|
||||
transpose_2d_as_16b(backend_ctx, extra->dm, buf_trans_dm.buffer, size_dm, M, K/256);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q5_K_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_qh.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_trans_d.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_trans_dm.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, data_device, CL_TRUE, offset,
|
||||
size, data, 0, NULL, NULL));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q5_K;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
|
||||
@@ -5901,6 +5917,8 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
@@ -9831,19 +9849,18 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const enum ggml_type src0t = src0->type;
|
||||
const enum ggml_type src1t = src1->type;
|
||||
|
||||
GGML_ASSERT(src0t == GGML_TYPE_Q8_0);
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_Q8_0);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
GGML_ASSERT(src1->view_offs == 0);
|
||||
GGML_ASSERT(dst->view_offs == 0);
|
||||
|
||||
@@ -9864,148 +9881,112 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_kernel kernel;
|
||||
|
||||
// init CL objects
|
||||
cl_int status;
|
||||
cl_image_format img_fmt_1d;
|
||||
cl_image_desc img_desc_1d;
|
||||
cl_int err;
|
||||
cl_image_format img_fmt;
|
||||
cl_image_desc img_desc;
|
||||
cl_buffer_region region;
|
||||
cl_mem A_image1d;
|
||||
cl_mem B_image1d;
|
||||
cl_mem B_sub_buffer;
|
||||
cl_mem S_image1d;
|
||||
// for B transpose
|
||||
cl_mem B_image1d_trans = nullptr;
|
||||
cl_mem B_d = nullptr;
|
||||
|
||||
cl_mem D_image1d;
|
||||
cl_mem D_sub_buffer;
|
||||
|
||||
int M = ne01;
|
||||
int N = ne1;
|
||||
int K = ne00;
|
||||
|
||||
// create an image for A
|
||||
img_fmt_1d = { CL_R, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4; // Divide by 4 for char -> float
|
||||
img_desc_1d.buffer = extra0_q8_0->q;
|
||||
A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
if (ne1 == 1) {
|
||||
cl_mem q_img = nullptr;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
|
||||
// create an image for Scale
|
||||
img_fmt_1d = { CL_R, CL_HALF_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 32; // Block size is 32
|
||||
img_desc_1d.buffer = extra0_q8_0->d;
|
||||
S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
// image for q
|
||||
img_fmt = { CL_R, CL_UNSIGNED_INT32};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 4;
|
||||
img_desc.buffer = extra0_q8_0->q;
|
||||
CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// create a sub_buffer for B
|
||||
region.origin = (extra1->offset); // + src1->view_offs);
|
||||
region.size = K * N * sizeof(float);
|
||||
B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
// create a sub_buffer for B
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// create an image for B from sub_buffer: RGBA (OCL)
|
||||
img_fmt_1d = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = K * N / 4;
|
||||
img_desc_1d.buffer = B_sub_buffer;
|
||||
B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// Create subbuffer and image1d_buffer for dst
|
||||
region.origin = (extrad->offset); // + dst->view_offs;
|
||||
region.size = M * N * sizeof(float);
|
||||
D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
img_fmt_1d = {CL_R, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * N;
|
||||
img_desc_1d.buffer = D_sub_buffer;
|
||||
D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
size_t local_work_size[3] = {1, 1, 1};
|
||||
size_t global_work_size[3] = {1, 1, 1};
|
||||
|
||||
if (N == 1) {
|
||||
kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32;
|
||||
|
||||
int r2 = 1;
|
||||
int r3 = 1;
|
||||
cl_uint k_arg = 0;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q8_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &extra1->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &extrad->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
|
||||
|
||||
size_t wavesize = backend_ctx->adreno_wave_size;
|
||||
local_work_size[0] = wavesize;
|
||||
local_work_size[1] = 4; // reduce factor
|
||||
local_work_size[2] = 1;
|
||||
size_t local_work_size[] = { wavesize, 4, 1 };
|
||||
size_t global_work_size[] = { CEIL_DIV(M, wavesize)*wavesize, 4, 1 };
|
||||
|
||||
global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize;
|
||||
global_work_size[1] = 4; // reduce factor
|
||||
global_work_size[2] = 1;
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(q_img));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
} else {
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
int padding;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_sub_buf_trans = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
cl_mem b_img_trans = nullptr;
|
||||
|
||||
//how many extra elements beyond multiple of 8
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// pad N to multiple of 8
|
||||
int extra_elements = N % 8;
|
||||
|
||||
//how much padding to add
|
||||
padding = 0;
|
||||
int padding = 0;
|
||||
if (extra_elements > 0){
|
||||
padding = 8 - extra_elements;
|
||||
}
|
||||
|
||||
// Specify the starting offset (in bytes)
|
||||
// subbuffer for transposed activations
|
||||
region.origin = 0;
|
||||
// Specify the size of the sub-buffer (divide by 2 for FP16)
|
||||
region.size = K * (N + padding) * sizeof(float)/2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
B_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_act_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
|
||||
cl_image_desc image_desc_B_d_output = {
|
||||
CL_MEM_OBJECT_IMAGE1D_BUFFER,
|
||||
static_cast<size_t>(K * (N + padding)/4),
|
||||
0, 0, 0, 0, 0, 0, 0, { B_d }
|
||||
};
|
||||
B_image1d_trans = clCreateImage(
|
||||
context,
|
||||
0,
|
||||
&image_format_B_d_output,
|
||||
&image_desc_B_d_output,
|
||||
NULL,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
// image for transposed activations
|
||||
img_fmt = {CL_RGBA, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * (N + padding) / 4;
|
||||
img_desc.buffer = b_sub_buf_trans;
|
||||
CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// transpose activations
|
||||
int height_B = N/4;
|
||||
if (height_B == 0) {
|
||||
height_B = 1;
|
||||
@@ -10014,58 +9995,39 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
int padded_height_B = (N + padding)/4;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_32_16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
|
||||
|
||||
size_t local_size_t[2] = { 1, 16 };
|
||||
size_t global_size_t[2] = {
|
||||
static_cast<size_t>(width_B),
|
||||
static_cast<size_t>(padded_height_B)
|
||||
};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
|
||||
size_t local_work_size_t[2] = { 1, 16 };
|
||||
size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
|
||||
|
||||
// gemm
|
||||
kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4;
|
||||
|
||||
int N_with_padding = N + padding;
|
||||
int padded_N = N + padding;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &K));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &M));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &N_with_padding));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &padded_N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
global_work_size[0] = (size_t)(N + 7) / 8;
|
||||
global_work_size[1] = (size_t)(M + 3) / 4;
|
||||
global_work_size[2] = 1;
|
||||
size_t global_work_size[] = { (size_t)CEIL_DIV(N, 8), (size_t)CEIL_DIV(M, 4), 1 };
|
||||
size_t local_work_size[] = { 2, 128, 1 };
|
||||
|
||||
local_work_size[0] = 2;
|
||||
local_work_size[1] = 128;
|
||||
local_work_size[2] = 1;
|
||||
}
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
// enqueue kernel with profiling
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
// deallocate sub buffers and images
|
||||
CL_CHECK(clReleaseMemObject(A_image1d));
|
||||
CL_CHECK(clReleaseMemObject(B_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(B_image1d));
|
||||
CL_CHECK(clReleaseMemObject(S_image1d));
|
||||
CL_CHECK(clReleaseMemObject(D_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(D_image1d));
|
||||
if (B_image1d_trans) {
|
||||
CL_CHECK(clReleaseMemObject(B_image1d_trans));
|
||||
}
|
||||
if (B_d) {
|
||||
CL_CHECK(clReleaseMemObject(B_d));
|
||||
CL_CHECK(clReleaseMemObject(b_img_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(backend);
|
||||
@@ -10451,6 +10413,201 @@ static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
ggml_tensor_extra_cl_q5_K * extra0_q5_k = (ggml_tensor_extra_cl_q5_K *)src0->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne1 = dst->ne[1];
|
||||
|
||||
GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
|
||||
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_kernel kernel;
|
||||
|
||||
cl_int err;
|
||||
cl_image_format img_fmt;
|
||||
cl_image_desc img_desc;
|
||||
cl_buffer_region region;
|
||||
|
||||
int M = ne01;
|
||||
int N = ne1;
|
||||
int K = ne00;
|
||||
|
||||
cl_uchar mask_d6 = 0x3F;
|
||||
cl_uchar mask_d4 = 0x0F;
|
||||
cl_uchar mask_hi2 = 0xC0;
|
||||
|
||||
if (ne1 == 1) {
|
||||
cl_mem q_img = nullptr;
|
||||
cl_mem qh_img = nullptr;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
|
||||
// image for q (CL_R, CL_UNSIGNED_INT32): width = M*K/2/4
|
||||
img_fmt = {CL_R, CL_UNSIGNED_INT32};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 2 / 4;
|
||||
img_desc.buffer = extra0_q5_k->q;
|
||||
CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// image for qh (CL_R, CL_HALF_FLOAT): width = M*K/16
|
||||
img_fmt = {CL_R, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 16;
|
||||
img_desc.buffer = extra0_q5_k->qh;
|
||||
CL_CHECK((qh_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations (CL_RGBA, CL_FLOAT): width = K*N/4
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
kernel = backend_ctx->kernel_gemv_noshuffle_q5_k_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qh_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_k->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_k->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra0_q5_k->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_uchar), &mask_d6));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_uchar), &mask_d4));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_uchar), &mask_hi2));
|
||||
|
||||
size_t local_work_size[3] = {64, 4, 1};
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(q_img));
|
||||
CL_CHECK(clReleaseMemObject(qh_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
} else {
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_sub_buf_trans = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
cl_mem b_img_trans = nullptr;
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// pad N to multiple of 8
|
||||
int extra_elements = N % 8;
|
||||
int padding = 0;
|
||||
if (extra_elements > 0) {
|
||||
padding = 8 - extra_elements;
|
||||
}
|
||||
|
||||
// subbuffer for transposed activations
|
||||
region.origin = 0;
|
||||
region.size = K * (N + padding) * sizeof(float) / 2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for transposed activations
|
||||
img_fmt = {CL_RGBA, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * (N + padding) / 4;
|
||||
img_desc.buffer = b_sub_buf_trans;
|
||||
CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// transpose activations
|
||||
int height_B = N / 4;
|
||||
if (height_B == 0) height_B = 1;
|
||||
int width_B = K / 4;
|
||||
int padded_height_B = (N + padding) / 4;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_32_16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
|
||||
|
||||
size_t local_work_size_t[2] = {1, 16};
|
||||
size_t global_work_size_t[2] = {(size_t)width_B, (size_t)padded_height_B};
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
|
||||
|
||||
// gemm
|
||||
kernel = backend_ctx->kernel_gemm_noshuffle_q5_k_f32;
|
||||
int padded_N = N + padding;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_k->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_k->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_k->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_k->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra0_q5_k->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &padded_N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_uchar), &mask_d6));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_uchar), &mask_d4));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_uchar), &mask_hi2));
|
||||
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
|
||||
size_t local_work_size[3] = {1, 128, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_img_trans));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(backend);
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(dst);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -10600,6 +10757,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
return;
|
||||
}
|
||||
|
||||
// q5_K x fp32
|
||||
if (src0t == GGML_TYPE_Q5_K && src1t == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_q5_K_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// q4_0 x fp32
|
||||
if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
|
||||
// TODO: remove duplicate definitions of image description + format -- move to top
|
||||
|
||||
@@ -568,7 +568,9 @@ kernel void kernel_convert_block_q5_K(
|
||||
global uchar * dst_qh,
|
||||
global uchar * dst_s,
|
||||
global half * dst_d,
|
||||
global half * dst_dm
|
||||
global half * dst_dm,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_K * b = (global struct block_q5_K *) src0 + get_global_id(0);
|
||||
global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0);
|
||||
@@ -599,7 +601,9 @@ kernel void kernel_restore_block_q5_K(
|
||||
global uchar * src_s,
|
||||
global half * src_d,
|
||||
global half * src_dm,
|
||||
global struct block_q5_K * dst
|
||||
global struct block_q5_K * dst,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_K * b = (global struct block_q5_K *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0);
|
||||
@@ -622,6 +626,92 @@ kernel void kernel_restore_block_q5_K(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_K_noshuffle(
|
||||
global struct block_q5_K * src0,
|
||||
global uchar * dst_q,
|
||||
global uchar * dst_qh,
|
||||
global uchar * dst_s,
|
||||
global half * dst_d,
|
||||
global half * dst_dm,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_K * b = (global struct block_q5_K *) src0 + get_global_id(0);
|
||||
global uchar * q = (global uchar *) dst_q + QK_K/2 * get_global_id(0);
|
||||
global uchar * qh = (global uchar *) dst_qh + QK_K/8 * get_global_id(0);
|
||||
global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE * get_global_id(0);
|
||||
global half * d = (global half *) dst_d + get_global_id(0);
|
||||
global half * dm = (global half *) dst_dm + get_global_id(0);
|
||||
|
||||
*d = b->d;
|
||||
*dm = b->dm;
|
||||
|
||||
for (int i = 0; i < QK_K / 64; ++i) {
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
uchar x0 = b->qs[i*32 + 2*j];
|
||||
uchar x1 = b->qs[i*32 + 2*j + 1];
|
||||
q[i*32 + j] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
|
||||
q[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
|
||||
}
|
||||
}
|
||||
|
||||
for (int l = 0; l < QK_K/8; ++l) {
|
||||
uchar x0 = 0;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
x0 |= ((b->qh[(l%4)*8+i] >> (l/4)) & 0x01) << i;
|
||||
}
|
||||
qh[l] = x0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
||||
s[i] = b->s[i];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q5_K_noshuffle(
|
||||
global uchar * src_q,
|
||||
global uchar * src_qh,
|
||||
global uchar * src_s,
|
||||
global half * src_d,
|
||||
global half * src_dm,
|
||||
global struct block_q5_K * dst,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_K * b = (global struct block_q5_K *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK_K/2 * get_global_id(0);
|
||||
global uchar * qh = (global uchar *) src_qh + QK_K/8 * get_global_id(0);
|
||||
global uchar * s = (global uchar *) src_s + K_SCALE_SIZE * get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
global half * dm = (global half *) src_dm + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
b->dm = *dm;
|
||||
|
||||
for (int i = 0; i < QK_K / 64; ++i) {
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
uchar lo = q[i*32 + j];
|
||||
uchar hi = q[i*32 + j + 16];
|
||||
b->qs[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4));
|
||||
b->qs[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0));
|
||||
}
|
||||
}
|
||||
|
||||
for (int g = 0; g < 4; ++g) {
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
uchar x0 = 0;
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
x0 |= ((qh[4*k+g] >> i) & 0x01) << k;
|
||||
}
|
||||
b->qh[g*8+i] = x0;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
||||
b->s[i] = s[i];
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_q6_K
|
||||
// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).
|
||||
|
||||
@@ -0,0 +1,176 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
|
||||
inline void get_scale_min_k4(
|
||||
int j,
|
||||
global const uchar * q,
|
||||
uchar * d,
|
||||
uchar * m,
|
||||
uchar mask_d6,
|
||||
uchar mask_d4,
|
||||
uchar mask_hi2
|
||||
) {
|
||||
if (j < 4) {
|
||||
*d = q[j] & mask_d6;
|
||||
*m = q[j+4] & mask_d6;
|
||||
} else {
|
||||
*d = (q[j+4] & mask_d4) | ((q[j-4] & mask_hi2) >> 2);
|
||||
*m = ((q[j+4] >> 4) & mask_d4) | ((q[j] & mask_hi2) >> 2);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_128
|
||||
#endif
|
||||
kernel void kernel_gemm_noshuffle_q5_k_f32(
|
||||
global const ushort * src0_q,
|
||||
global const uchar * src0_qh,
|
||||
global const uchar * src0_s,
|
||||
global const half * src0_d,
|
||||
global const half * src0_dm,
|
||||
read_only image1d_buffer_t src1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int m,
|
||||
int n,
|
||||
int k,
|
||||
int n_no_padding,
|
||||
uchar mask_d6,
|
||||
uchar mask_d4,
|
||||
uchar mask_hi2
|
||||
) {
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
int n_4 = n >> 2;
|
||||
int gy = get_global_id(0);
|
||||
int gx = get_global_id(1);
|
||||
int gx_2 = gx << 2;
|
||||
|
||||
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
|
||||
half8 B;
|
||||
half4 dequantized_weights;
|
||||
|
||||
int num_blocks_K = k / QK_K;
|
||||
|
||||
global const ushort * weight_ptr = src0_q + gx_2;
|
||||
global const uchar * qh_ptr = src0_qh + gx_2;
|
||||
global const half * d_ptr = src0_d + gx_2;
|
||||
global const half * dm_ptr = src0_dm + gx_2;
|
||||
|
||||
for (int i = 0; i < k; i += 32) {
|
||||
int sb_idx = i / QK_K;
|
||||
int sub_idx = (i / 32) % 8;
|
||||
|
||||
half4 d = vload4(0, d_ptr + sb_idx * m);
|
||||
half4 dm = vload4(0, dm_ptr + sb_idx * m);
|
||||
|
||||
global const uchar * sc0 = src0_s + (gx_2+0) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
|
||||
global const uchar * sc1 = src0_s + (gx_2+1) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
|
||||
global const uchar * sc2 = src0_s + (gx_2+2) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
|
||||
global const uchar * sc3 = src0_s + (gx_2+3) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
|
||||
|
||||
uchar sv0, mn0, sv1, mn1, sv2, mn2, sv3, mn3;
|
||||
get_scale_min_k4(sub_idx, sc0, &sv0, &mn0, mask_d6, mask_d4, mask_hi2);
|
||||
get_scale_min_k4(sub_idx, sc1, &sv1, &mn1, mask_d6, mask_d4, mask_hi2);
|
||||
get_scale_min_k4(sub_idx, sc2, &sv2, &mn2, mask_d6, mask_d4, mask_hi2);
|
||||
get_scale_min_k4(sub_idx, sc3, &sv3, &mn3, mask_d6, mask_d4, mask_hi2);
|
||||
|
||||
half4 scale = convert_half4(convert_float4(d) * convert_float4((uchar4)(sv0, sv1, sv2, sv3)));
|
||||
half4 mval = convert_half4(convert_float4(dm) * convert_float4((uchar4)(mn0, mn1, mn2, mn3)));
|
||||
|
||||
for (int l = 0; l < 32; l += 4) {
|
||||
int ki = i + l;
|
||||
ushort4 bits4 = vload4(0, weight_ptr + (ki/4) * m);
|
||||
uchar4 qh_bits = vload4(0, qh_ptr + (ki/8) * m);
|
||||
int qh_shift = ki % 8;
|
||||
|
||||
// j=0
|
||||
B.s0123 = read_imageh(src1, gy*2 + (ki+0) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2+1 + (ki+0) * n_4);
|
||||
dequantized_weights.s0 = ((bits4.s0 & 0x000F) | (((qh_bits.s0 >> (qh_shift+0)) & 1) << 4)) * scale.s0 - mval.s0;
|
||||
dequantized_weights.s1 = ((bits4.s1 & 0x000F) | (((qh_bits.s1 >> (qh_shift+0)) & 1) << 4)) * scale.s1 - mval.s1;
|
||||
dequantized_weights.s2 = ((bits4.s2 & 0x000F) | (((qh_bits.s2 >> (qh_shift+0)) & 1) << 4)) * scale.s2 - mval.s2;
|
||||
dequantized_weights.s3 = ((bits4.s3 & 0x000F) | (((qh_bits.s3 >> (qh_shift+0)) & 1) << 4)) * scale.s3 - mval.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=1
|
||||
B.s0123 = read_imageh(src1, gy*2 + (ki+1) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2+1 + (ki+1) * n_4);
|
||||
dequantized_weights.s0 = (((bits4.s0 & 0x00F0) >> 4) | (((qh_bits.s0 >> (qh_shift+1)) & 1) << 4)) * scale.s0 - mval.s0;
|
||||
dequantized_weights.s1 = (((bits4.s1 & 0x00F0) >> 4) | (((qh_bits.s1 >> (qh_shift+1)) & 1) << 4)) * scale.s1 - mval.s1;
|
||||
dequantized_weights.s2 = (((bits4.s2 & 0x00F0) >> 4) | (((qh_bits.s2 >> (qh_shift+1)) & 1) << 4)) * scale.s2 - mval.s2;
|
||||
dequantized_weights.s3 = (((bits4.s3 & 0x00F0) >> 4) | (((qh_bits.s3 >> (qh_shift+1)) & 1) << 4)) * scale.s3 - mval.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=2
|
||||
B.s0123 = read_imageh(src1, gy*2 + (ki+2) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2+1 + (ki+2) * n_4);
|
||||
dequantized_weights.s0 = (((bits4.s0 & 0x0F00) >> 8) | (((qh_bits.s0 >> (qh_shift+2)) & 1) << 4)) * scale.s0 - mval.s0;
|
||||
dequantized_weights.s1 = (((bits4.s1 & 0x0F00) >> 8) | (((qh_bits.s1 >> (qh_shift+2)) & 1) << 4)) * scale.s1 - mval.s1;
|
||||
dequantized_weights.s2 = (((bits4.s2 & 0x0F00) >> 8) | (((qh_bits.s2 >> (qh_shift+2)) & 1) << 4)) * scale.s2 - mval.s2;
|
||||
dequantized_weights.s3 = (((bits4.s3 & 0x0F00) >> 8) | (((qh_bits.s3 >> (qh_shift+2)) & 1) << 4)) * scale.s3 - mval.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=3
|
||||
B.s0123 = read_imageh(src1, gy*2 + (ki+3) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2+1 + (ki+3) * n_4);
|
||||
dequantized_weights.s0 = (((bits4.s0 & 0xF000) >> 12) | (((qh_bits.s0 >> (qh_shift+3)) & 1) << 4)) * scale.s0 - mval.s0;
|
||||
dequantized_weights.s1 = (((bits4.s1 & 0xF000) >> 12) | (((qh_bits.s1 >> (qh_shift+3)) & 1) << 4)) * scale.s1 - mval.s1;
|
||||
dequantized_weights.s2 = (((bits4.s2 & 0xF000) >> 12) | (((qh_bits.s2 >> (qh_shift+3)) & 1) << 4)) * scale.s2 - mval.s2;
|
||||
dequantized_weights.s3 = (((bits4.s3 & 0xF000) >> 12) | (((qh_bits.s3 >> (qh_shift+3)) & 1) << 4)) * scale.s3 - mval.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
}
|
||||
}
|
||||
|
||||
int idx = (gy<<3)*m + (gx<<2);
|
||||
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,326 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#endif
|
||||
|
||||
#define QK_K 256
|
||||
#define NSUBGROUPS 4
|
||||
#define SUBGROUP_SIZE 64
|
||||
|
||||
inline void get_scale_min_k4(
|
||||
int j,
|
||||
global const uchar * q,
|
||||
uchar * d,
|
||||
uchar * m,
|
||||
uchar mask_d6,
|
||||
uchar mask_d4,
|
||||
uchar mask_hi2
|
||||
) {
|
||||
if (j < 4) {
|
||||
*d = q[j] & mask_d6;
|
||||
*m = q[j+4] & mask_d6;
|
||||
} else {
|
||||
*d = (q[j+4] & mask_d4) | ((q[j-4] & mask_hi2) >> 2);
|
||||
*m = ((q[j+4] >> 4) & mask_d4) | ((q[j] & mask_hi2) >> 2);
|
||||
}
|
||||
}
|
||||
|
||||
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, bits1, scale, minv, y) \
|
||||
float shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | ((bits1.s0 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | ((bits1.s1 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 0); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | ((bits1.s2 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | ((bits1.s3 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 1); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, bits1, scale, minv, y) \
|
||||
shared_y = sub_group_broadcast(y.s0, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | ((bits1.s4 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | ((bits1.s5 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 2); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | ((bits1.s6 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | ((bits1.s7 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 3); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, bits1, scale, minv, y) \
|
||||
float8 shared_y; \
|
||||
shared_y = sub_group_broadcast(y, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | ((bits1.s0 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | ((bits1.s1 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | ((bits1.s2 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | ((bits1.s3 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s7; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, bits1, scale, minv, y) \
|
||||
shared_y = sub_group_broadcast(y, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | ((bits1.s4 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | ((bits1.s5 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | ((bits1.s6 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | ((bits1.s7 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s7; \
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_gemv_noshuffle_q5_k_f32(
|
||||
read_only image1d_buffer_t src0_q,
|
||||
read_only image1d_buffer_t src0_qh,
|
||||
global half2 * src0_d,
|
||||
global half2 * src0_m,
|
||||
global uchar * src0_s,
|
||||
read_only image1d_buffer_t src1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
uchar mask_d6,
|
||||
uchar mask_d4,
|
||||
uchar mask_hi2)
|
||||
{
|
||||
uint groupId = get_local_id(1);
|
||||
uint gid = get_global_id(0);
|
||||
ushort slid = get_sub_group_local_id();
|
||||
|
||||
uint K = ne00;
|
||||
uint M = ne01;
|
||||
|
||||
uint LINE_STRIDE_A = M / 2;
|
||||
uint BLOCK_STRIDE_A = NSUBGROUPS * M;
|
||||
|
||||
uint LINE_STRIDE_A_QH = M / 2;
|
||||
uint BLOCK_STRIDE_A_QH = NSUBGROUPS * M / 2;
|
||||
uint scales_per_row = (K / QK_K) * 12;
|
||||
|
||||
private uint4 regA;
|
||||
private ushort4 regH;
|
||||
private half2 regS;
|
||||
private half2 regM;
|
||||
private float8 regB;
|
||||
|
||||
private float2 totalSum = (float2)(0.0f);
|
||||
|
||||
for (uint k = groupId; k < (K / 32); k += NSUBGROUPS) {
|
||||
uint sb = k / 8;
|
||||
uint j = k % 8;
|
||||
|
||||
half2 d = src0_d[gid + sb * LINE_STRIDE_A];
|
||||
half2 dm = src0_m[gid + sb * LINE_STRIDE_A];
|
||||
|
||||
global const uchar * sc0 = src0_s + 2 * gid * scales_per_row + sb * 12;
|
||||
global const uchar * sc1 = src0_s + (2 * gid + 1) * scales_per_row + sb * 12;
|
||||
|
||||
uchar sv0, mn0, sv1, mn1;
|
||||
get_scale_min_k4(j, sc0, &sv0, &mn0, mask_d6, mask_d4, mask_hi2);
|
||||
get_scale_min_k4(j, sc1, &sv1, &mn1, mask_d6, mask_d4, mask_hi2);
|
||||
|
||||
regS = convert_half2(convert_float2(d) * convert_float2((uchar2)(sv0, sv1)));
|
||||
regM = convert_half2(convert_float2(dm) * convert_float2((uchar2)(mn0, mn1)));
|
||||
|
||||
if (slid < 4) {
|
||||
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
|
||||
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
|
||||
}
|
||||
|
||||
regH.s0 = as_ushort(read_imageh(src0_qh, (gid + k * BLOCK_STRIDE_A_QH + LINE_STRIDE_A_QH * 0)).x);
|
||||
regH.s1 = as_ushort(read_imageh(src0_qh, (gid + k * BLOCK_STRIDE_A_QH + LINE_STRIDE_A_QH * 1)).x);
|
||||
regH.s2 = as_ushort(read_imageh(src0_qh, (gid + k * BLOCK_STRIDE_A_QH + LINE_STRIDE_A_QH * 2)).x);
|
||||
regH.s3 = as_ushort(read_imageh(src0_qh, (gid + k * BLOCK_STRIDE_A_QH + LINE_STRIDE_A_QH * 3)).x);
|
||||
|
||||
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
|
||||
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
|
||||
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
|
||||
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), as_uchar8(regH), regS, regM, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), as_uchar8(regH), regS, regM, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
|
||||
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
|
||||
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
|
||||
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
|
||||
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), as_uchar8(regH), regS, regM, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), as_uchar8(regH), regS, regM, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
}
|
||||
|
||||
// reduction in local memory, assumes #wave=4
|
||||
local float2 reduceLM[SUBGROUP_SIZE * 3];
|
||||
if (groupId == 1) {
|
||||
reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 2) {
|
||||
reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 3) {
|
||||
reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid];
|
||||
}
|
||||
|
||||
// 2 outputs per fiber in wave 0
|
||||
if (groupId == 0) {
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
vstore2(totalSum, 0, &(dst[gid * 2]));
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@ message(STATUS "Using RPC backend")
|
||||
|
||||
ggml_add_backend_library(ggml-rpc
|
||||
ggml-rpc.cpp
|
||||
transport.cpp
|
||||
)
|
||||
|
||||
if (WIN32)
|
||||
|
||||
+64
-742
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,683 @@
|
||||
#include "transport.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
# include <winsock2.h>
|
||||
#else
|
||||
# include <arpa/inet.h>
|
||||
# include <sys/socket.h>
|
||||
# include <sys/types.h>
|
||||
# include <netinet/in.h>
|
||||
# include <netinet/tcp.h>
|
||||
# include <netdb.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include <cstdlib>
|
||||
#include <mutex>
|
||||
#include <optional>
|
||||
|
||||
#ifdef GGML_RPC_RDMA
|
||||
# include <infiniband/verbs.h>
|
||||
# include <time.h>
|
||||
# ifndef _WIN32
|
||||
# include <poll.h>
|
||||
# endif
|
||||
#endif // GGML_RPC_RDMA
|
||||
|
||||
#ifdef _WIN32
|
||||
typedef SOCKET sockfd_t;
|
||||
using ssize_t = __int64;
|
||||
#else
|
||||
typedef int sockfd_t;
|
||||
#endif
|
||||
|
||||
static const char * RPC_DEBUG = std::getenv("GGML_RPC_DEBUG");
|
||||
|
||||
#define LOG_DBG(...) \
|
||||
do { if (RPC_DEBUG) GGML_LOG_DEBUG(__VA_ARGS__); } while (0)
|
||||
|
||||
#ifdef GGML_RPC_RDMA
|
||||
static constexpr size_t RDMA_CHUNK = 256 * 1024; // 256 KiB per send/recv (fits default 8 MiB memlock)
|
||||
static constexpr int RDMA_RX_DEPTH = 24; // pre-posted recv ring: 24 × 256 KiB = 6 MiB
|
||||
static constexpr size_t RDMA_GID_SIZE = 16; // RoCE GID / IB GID is always 16 bytes
|
||||
using rdma_gid_t = std::array<uint8_t, RDMA_GID_SIZE>;
|
||||
|
||||
struct rdma_conn {
|
||||
struct ibv_context * ctx = nullptr;
|
||||
struct ibv_pd * pd = nullptr;
|
||||
struct ibv_cq * scq = nullptr; // send completions
|
||||
struct ibv_cq * rcq = nullptr; // recv completions
|
||||
struct ibv_qp * qp = nullptr;
|
||||
|
||||
void * tx_buf = nullptr;
|
||||
struct ibv_mr * tx_mr = nullptr;
|
||||
|
||||
void * rx_buf = nullptr; // RDMA_RX_DEPTH × RDMA_CHUNK contiguous
|
||||
struct ibv_mr * rx_mr = nullptr;
|
||||
int rx_head = 0;
|
||||
|
||||
uint32_t max_inline = 0;
|
||||
|
||||
uint8_t * rx_slot(int i) const {
|
||||
return static_cast<uint8_t *>(rx_buf) + static_cast<size_t>(i) * RDMA_CHUNK;
|
||||
}
|
||||
|
||||
bool post_rx(int i) {
|
||||
struct ibv_sge sge = {};
|
||||
sge.addr = (uintptr_t)rx_slot(i);
|
||||
sge.length = RDMA_CHUNK;
|
||||
sge.lkey = rx_mr->lkey;
|
||||
struct ibv_recv_wr wr = {}, * bad = nullptr;
|
||||
wr.wr_id = (uint64_t)i;
|
||||
wr.sg_list = &sge;
|
||||
wr.num_sge = 1;
|
||||
return ibv_post_recv(qp, &wr, &bad) == 0;
|
||||
}
|
||||
|
||||
~rdma_conn() {
|
||||
if (tx_mr) ibv_dereg_mr(tx_mr);
|
||||
if (rx_mr) ibv_dereg_mr(rx_mr);
|
||||
free(tx_buf);
|
||||
free(rx_buf);
|
||||
if (qp) ibv_destroy_qp(qp);
|
||||
if (scq) ibv_destroy_cq(scq);
|
||||
if (rcq) ibv_destroy_cq(rcq);
|
||||
if (pd) ibv_dealloc_pd(pd);
|
||||
if (ctx) ibv_close_device(ctx);
|
||||
}
|
||||
};
|
||||
|
||||
// Local RDMA parameters captured during the probe phase and later consumed
|
||||
// by rdma_activate() after the remote side's caps arrive via HELLO.
|
||||
struct rdma_local_info {
|
||||
uint32_t qpn = 0;
|
||||
uint32_t psn = 0;
|
||||
uint8_t gid[RDMA_GID_SIZE] = {};
|
||||
uint8_t ib_port = 0;
|
||||
int gid_idx = 0;
|
||||
enum ibv_mtu path_mtu = IBV_MTU_1024;
|
||||
};
|
||||
|
||||
struct rdma_caps {
|
||||
uint32_t qpn;
|
||||
uint32_t psn;
|
||||
uint8_t gid[RDMA_GID_SIZE];
|
||||
};
|
||||
|
||||
static_assert(sizeof(rdma_caps) == RPC_CONN_CAPS_SIZE, "rdma_caps must match conn_caps size");
|
||||
|
||||
#endif // GGML_RPC_RDMA
|
||||
|
||||
struct socket_t::impl {
|
||||
impl(sockfd_t fd) : use_rdma(false), fd(fd) {}
|
||||
~impl();
|
||||
bool send_data(const void * data, size_t size);
|
||||
bool recv_data(void * data, size_t size);
|
||||
void get_caps(uint8_t * local_caps);
|
||||
void update_caps(const uint8_t * remote_caps);
|
||||
|
||||
#ifdef GGML_RPC_RDMA
|
||||
bool tcp_peer_closed();
|
||||
std::optional<rdma_gid_t> rdma_build_target_gid();
|
||||
bool rdma_probe();
|
||||
bool rdma_activate(uint32_t remote_qpn, uint32_t remote_psn, const uint8_t * remote_gid);
|
||||
bool rdma_poll(struct ibv_cq * cq, struct ibv_wc * wc);
|
||||
bool rdma_send(const void * data, size_t size);
|
||||
bool rdma_recv(void * data, size_t size);
|
||||
|
||||
std::unique_ptr<rdma_conn> rdma;
|
||||
rdma_local_info rdma_local = {};
|
||||
#endif // GGML_RPC_RDMA
|
||||
bool use_rdma;
|
||||
sockfd_t fd;
|
||||
};
|
||||
|
||||
socket_t::impl::~impl() {
|
||||
#ifdef GGML_RPC_RDMA
|
||||
rdma.reset();
|
||||
#endif // GGML_RPC_RDMA
|
||||
LOG_DBG("[%s] closing socket %d\n", __func__, this->fd);
|
||||
#ifdef _WIN32
|
||||
if (fd != INVALID_SOCKET) closesocket(this->fd);
|
||||
#else
|
||||
if (fd >= 0) close(this->fd);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef GGML_RPC_RDMA
|
||||
|
||||
bool socket_t::impl::tcp_peer_closed() {
|
||||
if (fd < 0) return false;
|
||||
#ifndef _WIN32
|
||||
struct pollfd pfd = { fd, POLLIN | POLLRDHUP, 0 };
|
||||
int r = poll(&pfd, 1, 0);
|
||||
return r > 0 && (pfd.revents & (POLLHUP | POLLERR | POLLRDHUP));
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Build a RoCE GID-shaped 16-byte target from a TCP socket's local address.
|
||||
// Used to match the socket's local IP against the kernel's GID table so that
|
||||
// a single memcmp handles IPv4, IPv4-mapped IPv6, and native IPv6 uniformly:
|
||||
// AF_INET -> ::ffff:a.b.c.d (bytes 10-11 = 0xff, last 4 = IPv4)
|
||||
// AF_INET6 (IPv4-mapped) -> ::ffff:a.b.c.d (already in GID shape)
|
||||
// AF_INET6 (native v6) -> the 16-byte IPv6 address as-is
|
||||
// Returns std::nullopt on unsupported family or getsockname failure.
|
||||
std::optional<rdma_gid_t> socket_t::impl::rdma_build_target_gid() {
|
||||
sockaddr_storage addr = {};
|
||||
socklen_t addr_len = sizeof(addr);
|
||||
if (getsockname(fd, reinterpret_cast<sockaddr *>(&addr), &addr_len) != 0) {
|
||||
return std::nullopt;
|
||||
}
|
||||
rdma_gid_t target = {};
|
||||
if (addr.ss_family == AF_INET) {
|
||||
const auto * a = reinterpret_cast<const sockaddr_in *>(&addr);
|
||||
target[10] = 0xff;
|
||||
target[11] = 0xff;
|
||||
memcpy(&target[12], &a->sin_addr, 4);
|
||||
return target;
|
||||
}
|
||||
if (addr.ss_family == AF_INET6) {
|
||||
const auto * a = reinterpret_cast<const sockaddr_in6 *>(&addr);
|
||||
memcpy(target.data(), &a->sin6_addr, RDMA_GID_SIZE);
|
||||
return target;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
bool socket_t::impl::rdma_probe() {
|
||||
const char * dev_env = std::getenv("GGML_RDMA_DEV");
|
||||
const char * gid_env = std::getenv("GGML_RDMA_GID");
|
||||
|
||||
auto target_gid = rdma_build_target_gid();
|
||||
if (!target_gid) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const uint8_t ib_port = 1;
|
||||
int num_devs = 0;
|
||||
ibv_device ** devs = ibv_get_device_list(&num_devs);
|
||||
if (!devs || num_devs == 0) return false;
|
||||
|
||||
ibv_context * ibctx = nullptr;
|
||||
const char * matched_dev = nullptr;
|
||||
int gid_idx = gid_env ? atoi(gid_env) : -1;
|
||||
int gid_version = IBV_GID_TYPE_IB; // 0 = unknown/IB
|
||||
|
||||
for (int d = 0; d < num_devs; d++) {
|
||||
const char * dn = ibv_get_device_name(devs[d]);
|
||||
if (dev_env && strcmp(dev_env, dn) != 0) continue;
|
||||
|
||||
ibv_context * ctx = ibv_open_device(devs[d]);
|
||||
if (!ctx) continue;
|
||||
|
||||
ibv_port_attr pa;
|
||||
if (ibv_query_port(ctx, ib_port, &pa) != 0) { ibv_close_device(ctx); continue; }
|
||||
|
||||
int found_gid = gid_idx;
|
||||
int found_version = IBV_GID_TYPE_IB;
|
||||
if (found_gid < 0) {
|
||||
// Find a GID on this port whose bytes equal the local TCP address
|
||||
// (IPv4 or IPv6). Prefer RoCE v2 (UDP/IP, L3-routable) over v1
|
||||
// (raw Ethernet, same-L2 only) so silent hangs on L3-routed paths
|
||||
// are avoided. ibv_query_gid_ex returns gid+type in one call.
|
||||
int v2_idx = -1;
|
||||
int v1_idx = -1;
|
||||
for (int i = 0; i < pa.gid_tbl_len; i++) {
|
||||
ibv_gid_entry entry = {};
|
||||
if (ibv_query_gid_ex(ctx, ib_port, i, &entry, 0) != 0) continue;
|
||||
if (memcmp(entry.gid.raw, target_gid->data(), RDMA_GID_SIZE) != 0) continue;
|
||||
if (entry.gid_type == IBV_GID_TYPE_ROCE_V2 && v2_idx < 0) {
|
||||
v2_idx = i;
|
||||
} else if (entry.gid_type == IBV_GID_TYPE_ROCE_V1 && v1_idx < 0) {
|
||||
v1_idx = i;
|
||||
}
|
||||
}
|
||||
if (v2_idx >= 0) {
|
||||
found_gid = v2_idx;
|
||||
found_version = IBV_GID_TYPE_ROCE_V2;
|
||||
} else if (v1_idx >= 0) {
|
||||
found_gid = v1_idx;
|
||||
found_version = IBV_GID_TYPE_ROCE_V1;
|
||||
}
|
||||
} else {
|
||||
// Explicit GID index from GGML_RDMA_GID — fetch its type for logging.
|
||||
ibv_gid_entry entry = {};
|
||||
if (ibv_query_gid_ex(ctx, ib_port, found_gid, &entry, 0) == 0) {
|
||||
found_version = entry.gid_type;
|
||||
}
|
||||
}
|
||||
if (found_gid >= 0) {
|
||||
ibctx = ctx;
|
||||
gid_idx = found_gid;
|
||||
gid_version = found_version;
|
||||
matched_dev = dn;
|
||||
rdma_local.path_mtu = pa.active_mtu;
|
||||
break;
|
||||
}
|
||||
ibv_close_device(ctx);
|
||||
}
|
||||
ibv_free_device_list(devs);
|
||||
if (!ibctx) return false;
|
||||
|
||||
rdma_local.ib_port = ib_port;
|
||||
rdma_local.gid_idx = gid_idx;
|
||||
|
||||
rdma = std::make_unique<rdma_conn>();
|
||||
rdma->ctx = ibctx;
|
||||
|
||||
rdma->pd = ibv_alloc_pd(ibctx);
|
||||
if (!rdma->pd) return false;
|
||||
|
||||
rdma->scq = ibv_create_cq(ibctx, 16, nullptr, nullptr, 0);
|
||||
rdma->rcq = ibv_create_cq(ibctx, RDMA_RX_DEPTH + 4, nullptr, nullptr, 0);
|
||||
if (!rdma->scq || !rdma->rcq) return false;
|
||||
|
||||
ibv_qp_init_attr qia = {};
|
||||
qia.send_cq = rdma->scq;
|
||||
qia.recv_cq = rdma->rcq;
|
||||
qia.qp_type = IBV_QPT_RC;
|
||||
qia.cap.max_send_wr = 4;
|
||||
qia.cap.max_recv_wr = RDMA_RX_DEPTH + 4;
|
||||
qia.cap.max_send_sge = 1;
|
||||
qia.cap.max_recv_sge = 1;
|
||||
qia.cap.max_inline_data = 256;
|
||||
|
||||
rdma->qp = ibv_create_qp(rdma->pd, &qia);
|
||||
if (!rdma->qp) return false;
|
||||
rdma->max_inline = qia.cap.max_inline_data;
|
||||
|
||||
rdma->tx_buf = aligned_alloc(4096, RDMA_CHUNK);
|
||||
rdma->rx_buf = aligned_alloc(4096, static_cast<size_t>(RDMA_RX_DEPTH) * RDMA_CHUNK);
|
||||
if (!rdma->tx_buf || !rdma->rx_buf) return false;
|
||||
|
||||
rdma->tx_mr = ibv_reg_mr(rdma->pd, rdma->tx_buf, RDMA_CHUNK, IBV_ACCESS_LOCAL_WRITE);
|
||||
rdma->rx_mr = ibv_reg_mr(rdma->pd, rdma->rx_buf, static_cast<size_t>(RDMA_RX_DEPTH) * RDMA_CHUNK,
|
||||
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
|
||||
if (!rdma->tx_mr || !rdma->rx_mr) return false;
|
||||
|
||||
ibv_gid local_gid;
|
||||
if (ibv_query_gid(ibctx, ib_port, gid_idx, &local_gid) != 0) return false;
|
||||
|
||||
rdma_local.qpn = rdma->qp->qp_num;
|
||||
rdma_local.psn = rdma->qp->qp_num & 0xffffff;
|
||||
memcpy(&rdma_local.gid, &local_gid, RDMA_GID_SIZE);
|
||||
|
||||
const char * ver_str = "";
|
||||
if (gid_version == IBV_GID_TYPE_ROCE_V2) {
|
||||
ver_str = " RoCEv2";
|
||||
} else if (gid_version == IBV_GID_TYPE_ROCE_V1) {
|
||||
ver_str = " RoCEv1";
|
||||
}
|
||||
GGML_LOG_INFO("RDMA probed: dev=%s gid=%d%s qpn=%u inline=%u\n",
|
||||
matched_dev, gid_idx, ver_str, rdma_local.qpn, rdma->max_inline);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Phase 2: Given remote QPN/PSN/GID, transition QP: RESET->INIT->pre-post->RTR->RTS.
|
||||
// On success, the connection is live and ready for rdma_send/rdma_recv.
|
||||
bool socket_t::impl::rdma_activate(uint32_t remote_qpn, uint32_t remote_psn, const uint8_t * remote_gid) {
|
||||
// RESET -> INIT
|
||||
{
|
||||
struct ibv_qp_attr a = {};
|
||||
a.qp_state = IBV_QPS_INIT;
|
||||
a.port_num = rdma_local.ib_port;
|
||||
a.pkey_index = 0;
|
||||
a.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE;
|
||||
if (ibv_modify_qp(rdma->qp, &a,
|
||||
IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < RDMA_RX_DEPTH; i++) {
|
||||
if (!rdma->post_rx(i)) return false;
|
||||
}
|
||||
|
||||
// INIT -> RTR
|
||||
{
|
||||
struct ibv_qp_attr a = {};
|
||||
a.qp_state = IBV_QPS_RTR;
|
||||
a.path_mtu = rdma_local.path_mtu;
|
||||
a.dest_qp_num = remote_qpn;
|
||||
a.rq_psn = remote_psn;
|
||||
a.max_dest_rd_atomic = 1;
|
||||
a.min_rnr_timer = 1;
|
||||
a.ah_attr.is_global = 1;
|
||||
memcpy(&a.ah_attr.grh.dgid, remote_gid, RDMA_GID_SIZE);
|
||||
a.ah_attr.grh.hop_limit = 1;
|
||||
a.ah_attr.grh.sgid_index = rdma_local.gid_idx;
|
||||
a.ah_attr.dlid = 0;
|
||||
a.ah_attr.port_num = rdma_local.ib_port;
|
||||
if (ibv_modify_qp(rdma->qp, &a,
|
||||
IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
|
||||
IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER) != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// RTR -> RTS
|
||||
{
|
||||
struct ibv_qp_attr a = {};
|
||||
a.qp_state = IBV_QPS_RTS;
|
||||
a.timeout = 14;
|
||||
a.retry_cnt = 7;
|
||||
a.rnr_retry = 7;
|
||||
a.sq_psn = rdma_local.psn;
|
||||
a.max_rd_atomic = 1;
|
||||
if (ibv_modify_qp(rdma->qp, &a,
|
||||
IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
|
||||
IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC) != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("RDMA activated: qpn=%u->%u mtu=%d rx_depth=%d\n",
|
||||
rdma_local.qpn, remote_qpn, 128 << rdma_local.path_mtu, RDMA_RX_DEPTH);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool socket_t::impl::rdma_poll(struct ibv_cq * cq, struct ibv_wc * wc) {
|
||||
for (uint64_t s = 0; ; s++) {
|
||||
int n = ibv_poll_cq(cq, 1, wc);
|
||||
if (n > 0) {
|
||||
if (wc->status != IBV_WC_SUCCESS) {
|
||||
GGML_LOG_ERROR("RDMA CQ wc error: status=%d (%s) vendor_err=0x%x\n",
|
||||
wc->status, ibv_wc_status_str(wc->status), wc->vendor_err);
|
||||
}
|
||||
return wc->status == IBV_WC_SUCCESS;
|
||||
}
|
||||
if (n < 0) return false;
|
||||
if ((s & 0xFFFFF) == 0 && s > 0) {
|
||||
if (tcp_peer_closed()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool socket_t::impl::rdma_send(const void * data, size_t size) {
|
||||
rdma_conn * c = rdma.get();
|
||||
const uint8_t * src = (const uint8_t *)data;
|
||||
size_t rem = size;
|
||||
while (rem > 0) {
|
||||
size_t chunk = std::min(rem, RDMA_CHUNK);
|
||||
|
||||
struct ibv_sge sge = {};
|
||||
struct ibv_send_wr wr = {}, * bad = nullptr;
|
||||
wr.opcode = IBV_WR_SEND;
|
||||
wr.sg_list = &sge;
|
||||
wr.num_sge = 1;
|
||||
|
||||
if (chunk <= c->max_inline) {
|
||||
sge.addr = (uintptr_t)src;
|
||||
sge.length = chunk;
|
||||
wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
|
||||
} else {
|
||||
memcpy(c->tx_buf, src, chunk);
|
||||
sge.addr = (uintptr_t)c->tx_buf;
|
||||
sge.length = chunk;
|
||||
sge.lkey = c->tx_mr->lkey;
|
||||
wr.send_flags = IBV_SEND_SIGNALED;
|
||||
}
|
||||
|
||||
if (ibv_post_send(c->qp, &wr, &bad) != 0) return false;
|
||||
struct ibv_wc wc;
|
||||
if (!rdma_poll(c->scq, &wc)) return false;
|
||||
|
||||
src += chunk;
|
||||
rem -= chunk;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool socket_t::impl::rdma_recv(void * data, size_t size) {
|
||||
rdma_conn * c = rdma.get();
|
||||
uint8_t * dst = (uint8_t *)data;
|
||||
size_t rem = size;
|
||||
while (rem > 0) {
|
||||
struct ibv_wc wc;
|
||||
if (!rdma_poll(c->rcq, &wc)) return false;
|
||||
|
||||
int slot = (int)wc.wr_id;
|
||||
size_t got = wc.byte_len;
|
||||
memcpy(dst, c->rx_slot(slot), got);
|
||||
|
||||
if (!c->post_rx(slot)) return false;
|
||||
|
||||
dst += got;
|
||||
rem -= got;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif // GGML_RPC_RDMA
|
||||
|
||||
bool socket_t::impl::send_data(const void * data, size_t size) {
|
||||
#ifdef GGML_RPC_RDMA
|
||||
if (use_rdma) {
|
||||
return rdma_send(data, size);
|
||||
}
|
||||
#endif
|
||||
size_t bytes_sent = 0;
|
||||
while (bytes_sent < size) {
|
||||
size_t size_to_send = std::min(size - bytes_sent, MAX_CHUNK_SIZE);
|
||||
ssize_t n = send(fd, (const char *)data + bytes_sent, size_to_send, 0);
|
||||
if (n < 0) {
|
||||
GGML_LOG_ERROR("send failed (bytes_sent=%zu, size_to_send=%zu)\n",
|
||||
bytes_sent, size_to_send);
|
||||
return false;
|
||||
}
|
||||
bytes_sent += (size_t)n;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool socket_t::impl::recv_data(void * data, size_t size) {
|
||||
#ifdef GGML_RPC_RDMA
|
||||
if (use_rdma) {
|
||||
return rdma_recv(data, size);
|
||||
}
|
||||
#endif
|
||||
size_t bytes_recv = 0;
|
||||
while (bytes_recv < size) {
|
||||
size_t size_to_recv = std::min(size - bytes_recv, MAX_CHUNK_SIZE);
|
||||
ssize_t n = recv(fd, (char *)data + bytes_recv, size_to_recv, 0);
|
||||
if (n < 0) {
|
||||
GGML_LOG_ERROR("recv failed (bytes_recv=%zu, size_to_recv=%zu)\n",
|
||||
bytes_recv, size_to_recv);
|
||||
return false;
|
||||
}
|
||||
if (n == 0) {
|
||||
LOG_DBG("recv returned 0 (peer closed?)\n");
|
||||
return false;
|
||||
}
|
||||
bytes_recv += (size_t)n;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void socket_t::impl::get_caps(uint8_t * local_caps) {
|
||||
memset(local_caps, 0, RPC_CONN_CAPS_SIZE);
|
||||
#ifdef GGML_RPC_RDMA
|
||||
rdma_local = {};
|
||||
if (rdma_probe()) {
|
||||
rdma_caps rc = {};
|
||||
rc.qpn = rdma_local.qpn;
|
||||
rc.psn = rdma_local.psn;
|
||||
memcpy(rc.gid, rdma_local.gid, RDMA_GID_SIZE);
|
||||
memcpy(local_caps, &rc, sizeof(rc));
|
||||
} else {
|
||||
rdma.reset();
|
||||
}
|
||||
#endif // GGML_RPC_RDMA
|
||||
}
|
||||
|
||||
void socket_t::impl::update_caps(const uint8_t * remote_caps) {
|
||||
#ifdef GGML_RPC_RDMA
|
||||
if (!rdma) {
|
||||
return;
|
||||
}
|
||||
rdma_caps rc = {};
|
||||
memcpy(&rc, remote_caps, sizeof(rc));
|
||||
if (rc.qpn == 0) {
|
||||
rdma.reset();
|
||||
return;
|
||||
}
|
||||
if (rdma_activate(rc.qpn, rc.psn, rc.gid)) {
|
||||
use_rdma = true;
|
||||
} else {
|
||||
GGML_LOG_ERROR("RDMA activate failed, staying on TCP\n");
|
||||
rdma.reset();
|
||||
}
|
||||
#else
|
||||
(void)remote_caps;
|
||||
#endif // GGML_RPC_RDMA
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
socket_t::socket_t(std::unique_ptr<impl> p) : pimpl(std::move(p)) {}
|
||||
|
||||
socket_t::~socket_t() = default;
|
||||
|
||||
bool socket_t::send_data(const void * data, size_t size) {
|
||||
return pimpl->send_data(data, size);
|
||||
}
|
||||
|
||||
bool socket_t::recv_data(void * data, size_t size) {
|
||||
return pimpl->recv_data(data, size);
|
||||
}
|
||||
|
||||
void socket_t::get_caps(uint8_t * local_caps) {
|
||||
return pimpl->get_caps(local_caps);
|
||||
}
|
||||
|
||||
void socket_t::update_caps(const uint8_t * remote_caps) {
|
||||
return pimpl->update_caps(remote_caps);
|
||||
}
|
||||
|
||||
static bool is_valid_fd(sockfd_t sockfd) {
|
||||
#ifdef _WIN32
|
||||
return sockfd != INVALID_SOCKET;
|
||||
#else
|
||||
return sockfd >= 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
static bool set_no_delay(sockfd_t sockfd) {
|
||||
int flag = 1;
|
||||
// set TCP_NODELAY to disable Nagle's algorithm
|
||||
int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
|
||||
return ret == 0;
|
||||
}
|
||||
|
||||
static bool set_reuse_addr(sockfd_t sockfd) {
|
||||
int flag = 1;
|
||||
int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int));
|
||||
return ret == 0;
|
||||
}
|
||||
|
||||
socket_ptr socket_t::accept() {
|
||||
auto client_socket_fd = ::accept(pimpl->fd, NULL, NULL);
|
||||
if (!is_valid_fd(client_socket_fd)) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!set_no_delay(client_socket_fd)) {
|
||||
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
|
||||
return nullptr;
|
||||
}
|
||||
return socket_ptr(new socket_t(std::make_unique<impl>(client_socket_fd)));
|
||||
}
|
||||
|
||||
socket_ptr socket_t::create_server(const char * host, int port) {
|
||||
auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
|
||||
if (!is_valid_fd(sockfd)) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!set_reuse_addr(sockfd)) {
|
||||
GGML_LOG_ERROR("Failed to set SO_REUSEADDR\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (inet_addr(host) == INADDR_NONE) {
|
||||
GGML_LOG_ERROR("Invalid host address: %s\n", host);
|
||||
return nullptr;
|
||||
}
|
||||
struct sockaddr_in serv_addr;
|
||||
serv_addr.sin_family = AF_INET;
|
||||
serv_addr.sin_addr.s_addr = inet_addr(host);
|
||||
serv_addr.sin_port = htons(port);
|
||||
|
||||
if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) {
|
||||
return nullptr;
|
||||
}
|
||||
if (listen(sockfd, 1) < 0) {
|
||||
return nullptr;
|
||||
}
|
||||
return socket_ptr(new socket_t(std::make_unique<impl>(sockfd)));
|
||||
}
|
||||
|
||||
socket_ptr socket_t::connect(const char * host, int port) {
|
||||
auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
|
||||
if (!is_valid_fd(sockfd)) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!set_no_delay(sockfd)) {
|
||||
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
|
||||
return nullptr;
|
||||
}
|
||||
struct sockaddr_in addr;
|
||||
addr.sin_family = AF_INET;
|
||||
addr.sin_port = htons(port);
|
||||
struct hostent * server = gethostbyname(host);
|
||||
if (server == NULL) {
|
||||
GGML_LOG_ERROR("Cannot resolve host '%s'\n", host);
|
||||
return nullptr;
|
||||
}
|
||||
memcpy(&addr.sin_addr.s_addr, server->h_addr, server->h_length);
|
||||
if (::connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
|
||||
return nullptr;
|
||||
}
|
||||
return socket_ptr(new socket_t(std::make_unique<impl>(sockfd)));
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
static std::mutex g_rpc_transport_mu;
|
||||
static bool g_rpc_transport_wsa_started = false;
|
||||
#endif
|
||||
|
||||
bool rpc_transport_init() {
|
||||
#ifdef _WIN32
|
||||
std::lock_guard<std::mutex> lock(g_rpc_transport_mu);
|
||||
if (g_rpc_transport_wsa_started) {
|
||||
return true;
|
||||
}
|
||||
WSADATA wsaData;
|
||||
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
||||
if (res != 0) {
|
||||
return false;
|
||||
}
|
||||
g_rpc_transport_wsa_started = true;
|
||||
return true;
|
||||
#else
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
void rpc_transport_shutdown() {
|
||||
#ifdef _WIN32
|
||||
std::lock_guard<std::mutex> lock(g_rpc_transport_mu);
|
||||
if (!g_rpc_transport_wsa_started) {
|
||||
return;
|
||||
}
|
||||
WSACleanup();
|
||||
g_rpc_transport_wsa_started = false;
|
||||
#endif
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
struct socket_t;
|
||||
typedef std::shared_ptr<socket_t> socket_ptr;
|
||||
|
||||
static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
|
||||
static constexpr size_t RPC_CONN_CAPS_SIZE = 24;
|
||||
|
||||
struct socket_t {
|
||||
~socket_t();
|
||||
|
||||
bool send_data(const void * data, size_t size);
|
||||
bool recv_data(void * data, size_t size);
|
||||
|
||||
socket_ptr accept();
|
||||
|
||||
void get_caps(uint8_t * local_caps);
|
||||
void update_caps(const uint8_t * remote_caps);
|
||||
|
||||
static socket_ptr create_server(const char * host, int port);
|
||||
static socket_ptr connect(const char * host, int port);
|
||||
|
||||
private:
|
||||
struct impl;
|
||||
explicit socket_t(std::unique_ptr<impl> p);
|
||||
std::unique_ptr<impl> pimpl;
|
||||
};
|
||||
|
||||
bool rpc_transport_init();
|
||||
void rpc_transport_shutdown();
|
||||
+12
-12
@@ -537,9 +537,9 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
|
||||
static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
|
||||
const int nrows, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK4_0 == 0);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
// Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
@@ -682,9 +682,9 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
|
||||
const int nrows, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK8_0 == 0);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
// Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
@@ -798,9 +798,9 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy,
|
||||
const int nrows, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
// Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
@@ -842,9 +842,9 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
|
||||
const int nrows, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
// Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
|
||||
constexpr size_t num_subgroups = WARP_SIZE;
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
@@ -390,12 +390,11 @@ struct ggml_webgpu_flash_attn_pipeline_key {
|
||||
bool has_mask;
|
||||
bool has_sinks;
|
||||
bool uses_logit_softcap;
|
||||
bool use_vec;
|
||||
|
||||
bool operator==(const ggml_webgpu_flash_attn_pipeline_key & other) const {
|
||||
return kv_type == other.kv_type && head_dim_qk == other.head_dim_qk && head_dim_v == other.head_dim_v &&
|
||||
kv_direct == other.kv_direct && has_mask == other.has_mask && has_sinks == other.has_sinks &&
|
||||
uses_logit_softcap == other.uses_logit_softcap && use_vec == other.use_vec;
|
||||
uses_logit_softcap == other.uses_logit_softcap;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -409,47 +408,37 @@ struct ggml_webgpu_flash_attn_pipeline_key_hash {
|
||||
ggml_webgpu_hash_combine(seed, key.has_mask);
|
||||
ggml_webgpu_hash_combine(seed, key.has_sinks);
|
||||
ggml_webgpu_hash_combine(seed, key.uses_logit_softcap);
|
||||
ggml_webgpu_hash_combine(seed, key.use_vec);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_flash_attn_shader_lib_context {
|
||||
ggml_webgpu_flash_attn_pipeline_key key;
|
||||
uint32_t sg_mat_m;
|
||||
uint32_t sg_mat_n;
|
||||
uint32_t sg_mat_k;
|
||||
size_t wg_mem_limit_bytes;
|
||||
uint32_t max_subgroup_size;
|
||||
};
|
||||
|
||||
struct ggml_webgpu_flash_attn_shader_decisions {
|
||||
struct ggml_webgpu_flash_attn_decisions {
|
||||
uint32_t q_tile = 0;
|
||||
uint32_t kv_tile = 0;
|
||||
uint32_t wg_size = 0;
|
||||
};
|
||||
|
||||
inline uint32_t ggml_webgpu_flash_attn_pick_vec_ne(const ggml_webgpu_flash_attn_pipeline_key & key) {
|
||||
// Keep conservative defaults unless this is the f16 vec-split shape family.
|
||||
if (key.kv_type != GGML_TYPE_F16 || key.head_dim_qk != key.head_dim_v) {
|
||||
return 1u;
|
||||
}
|
||||
struct ggml_webgpu_flash_attn_vec_decisions {
|
||||
uint32_t kv_tile = 0;
|
||||
uint32_t wg_size = 0;
|
||||
};
|
||||
|
||||
// Head-dim specializations used by the tuned vec f16 path.
|
||||
switch (key.head_dim_qk) {
|
||||
case 64:
|
||||
return 2u;
|
||||
case 96:
|
||||
return 4u;
|
||||
case 128:
|
||||
return 1u;
|
||||
case 192:
|
||||
return 2u;
|
||||
case 576:
|
||||
return 2u;
|
||||
default:
|
||||
return 1u;
|
||||
}
|
||||
inline ggml_webgpu_flash_attn_pipeline_key ggml_webgpu_flash_attn_make_pipeline_key(
|
||||
const ggml_webgpu_shader_lib_context & context) {
|
||||
const bool has_mask = context.src3 != nullptr;
|
||||
const bool has_sinks = context.src4 != nullptr;
|
||||
const bool kv_direct = (context.src1->type == GGML_TYPE_F16) && (context.src0->ne[0] % context.sg_mat_k == 0) &&
|
||||
(context.src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD == 0);
|
||||
|
||||
ggml_webgpu_flash_attn_pipeline_key key = {};
|
||||
key.kv_type = context.src1->type;
|
||||
key.head_dim_qk = (uint32_t) context.src0->ne[0];
|
||||
key.head_dim_v = (uint32_t) context.src2->ne[0];
|
||||
key.kv_direct = kv_direct;
|
||||
key.has_mask = has_mask;
|
||||
key.has_sinks = has_sinks;
|
||||
key.uses_logit_softcap = ggml_get_op_params_f32(context.dst, 2) != 0.0f;
|
||||
return key;
|
||||
}
|
||||
|
||||
struct ggml_webgpu_flash_attn_vec_reduce_pipeline_key {
|
||||
@@ -471,79 +460,20 @@ inline bool operator==(const ggml_webgpu_flash_attn_vec_reduce_pipeline_key & lh
|
||||
return lhs.head_dim_v == rhs.head_dim_v && lhs.wg_size == rhs.wg_size;
|
||||
}
|
||||
|
||||
struct ggml_webgpu_flash_attn_vec_reduce_shader_lib_context {
|
||||
ggml_webgpu_flash_attn_vec_reduce_pipeline_key key;
|
||||
uint32_t max_wg_size;
|
||||
};
|
||||
|
||||
inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_flash_attn_vec_reduce_shader(
|
||||
pre_wgsl::Preprocessor & preprocessor,
|
||||
const char * shader_src,
|
||||
const ggml_webgpu_flash_attn_vec_reduce_shader_lib_context & context) {
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "flash_attn_vec_reduce";
|
||||
|
||||
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(context.key.head_dim_v));
|
||||
variant += std::string("_hsv") + std::to_string(context.key.head_dim_v);
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
|
||||
variant += std::string("_wg") + std::to_string(context.max_wg_size);
|
||||
|
||||
ggml_webgpu_processed_shader result;
|
||||
result.wgsl = preprocessor.preprocess(shader_src, defines);
|
||||
result.variant = variant;
|
||||
return result;
|
||||
}
|
||||
|
||||
struct ggml_webgpu_flash_attn_blk_pipeline_key {
|
||||
uint32_t q_tile;
|
||||
uint32_t kv_tile;
|
||||
|
||||
bool operator==(const ggml_webgpu_flash_attn_blk_pipeline_key & other) const {
|
||||
return q_tile == other.q_tile && kv_tile == other.kv_tile;
|
||||
}
|
||||
bool operator==(const ggml_webgpu_flash_attn_blk_pipeline_key & other) const { return kv_tile == other.kv_tile; }
|
||||
};
|
||||
|
||||
struct ggml_webgpu_flash_attn_blk_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_flash_attn_blk_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.q_tile);
|
||||
ggml_webgpu_hash_combine(seed, key.kv_tile);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_flash_attn_blk_shader_lib_context {
|
||||
ggml_webgpu_flash_attn_blk_pipeline_key key;
|
||||
uint32_t max_wg_size;
|
||||
};
|
||||
|
||||
inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_flash_attn_blk_shader(
|
||||
pre_wgsl::Preprocessor & preprocessor,
|
||||
const char * shader_src,
|
||||
const ggml_webgpu_flash_attn_blk_shader_lib_context & context) {
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "flash_attn_vec_blk";
|
||||
|
||||
defines.push_back(std::string("Q_TILE=") + std::to_string(context.key.q_tile));
|
||||
variant += std::string("_qt") + std::to_string(context.key.q_tile);
|
||||
|
||||
defines.push_back(std::string("KV_TILE=") + std::to_string(context.key.kv_tile));
|
||||
variant += std::string("_kvt") + std::to_string(context.key.kv_tile);
|
||||
|
||||
uint32_t wg_size = 1;
|
||||
while ((wg_size << 1) <= context.max_wg_size) {
|
||||
wg_size <<= 1;
|
||||
}
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
|
||||
variant += std::string("_wg") + std::to_string(wg_size);
|
||||
|
||||
ggml_webgpu_processed_shader result;
|
||||
result.wgsl = preprocessor.preprocess(shader_src, defines);
|
||||
result.variant = variant;
|
||||
return result;
|
||||
}
|
||||
|
||||
// This is exposed because it's necessary in supports_op
|
||||
inline size_t ggml_webgpu_flash_attn_wg_mem_bytes(uint32_t q_tile,
|
||||
uint32_t kv_tile,
|
||||
@@ -568,6 +498,41 @@ inline size_t ggml_webgpu_flash_attn_wg_mem_bytes(uint32_t q_tile,
|
||||
return f16_elems * GGML_WEBGPU_F16_SIZE_BYTES + f32_elems * GGML_WEBGPU_F32_SIZE_BYTES;
|
||||
}
|
||||
|
||||
inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_context & context,
|
||||
const ggml_webgpu_flash_attn_pipeline_key & key) {
|
||||
const size_t limit_bytes = context.wg_mem_limit_bytes;
|
||||
const size_t q_tile = context.sg_mat_m;
|
||||
const size_t base_q_bytes = (key.head_dim_qk + key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES +
|
||||
2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES;
|
||||
size_t bytes_per_kv = 0;
|
||||
if (!key.kv_direct) {
|
||||
bytes_per_kv += std::max(key.head_dim_qk, key.head_dim_v);
|
||||
}
|
||||
if (key.has_mask) {
|
||||
bytes_per_kv += q_tile;
|
||||
}
|
||||
bytes_per_kv += q_tile;
|
||||
bytes_per_kv *= GGML_WEBGPU_F16_SIZE_BYTES;
|
||||
const uint32_t max_kv_tile = (limit_bytes - base_q_bytes) / bytes_per_kv;
|
||||
return (max_kv_tile / context.sg_mat_n) * context.sg_mat_n;
|
||||
}
|
||||
|
||||
inline uint32_t ggml_webgpu_flash_attn_vec_get_kv_tile(const ggml_webgpu_shader_lib_context & context) {
|
||||
const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
|
||||
const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
|
||||
uint32_t kv_tile = std::max(context.sg_mat_n, std::min(32u, min_kv_tile));
|
||||
kv_tile = (kv_tile / context.sg_mat_n) * context.sg_mat_n;
|
||||
|
||||
if (key.kv_direct) {
|
||||
kv_tile = std::min(kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
|
||||
while (GGML_WEBGPU_KV_SEQ_PAD % kv_tile != 0) {
|
||||
kv_tile -= context.sg_mat_n;
|
||||
}
|
||||
}
|
||||
|
||||
return kv_tile;
|
||||
}
|
||||
|
||||
/** Matrix Multiplication **/
|
||||
|
||||
struct ggml_webgpu_legacy_mul_mat_pipeline_key {
|
||||
@@ -802,6 +767,8 @@ class ggml_webgpu_shader_lib {
|
||||
repeat_pipelines; // type
|
||||
std::unordered_map<ggml_webgpu_flash_attn_pipeline_key, webgpu_pipeline, ggml_webgpu_flash_attn_pipeline_key_hash>
|
||||
flash_attn_pipelines;
|
||||
std::unordered_map<ggml_webgpu_flash_attn_pipeline_key, webgpu_pipeline, ggml_webgpu_flash_attn_pipeline_key_hash>
|
||||
flash_attn_vec_pipelines;
|
||||
std::unordered_map<ggml_webgpu_flash_attn_vec_reduce_pipeline_key,
|
||||
webgpu_pipeline,
|
||||
ggml_webgpu_flash_attn_vec_reduce_pipeline_key_hash>
|
||||
@@ -849,10 +816,9 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_row_norm_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_row_norm_pipeline_key key = {
|
||||
.op = context.dst->op,
|
||||
.inplace = context.inplace,
|
||||
};
|
||||
ggml_webgpu_row_norm_pipeline_key key = {};
|
||||
key.op = context.dst->op;
|
||||
key.inplace = context.inplace;
|
||||
|
||||
auto it = row_norm_pipelines.find(key);
|
||||
if (it != row_norm_pipelines.end()) {
|
||||
@@ -908,9 +874,10 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_set_rows_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_set_rows_pipeline_key key = { .dst_type = context.dst->type,
|
||||
.vec4 = context.src0->ne[0] % 4 == 0,
|
||||
.i64_idx = context.src1->type == GGML_TYPE_I64 };
|
||||
ggml_webgpu_set_rows_pipeline_key key = {};
|
||||
key.dst_type = context.dst->type;
|
||||
key.vec4 = context.src0->ne[0] % 4 == 0;
|
||||
key.i64_idx = context.src1->type == GGML_TYPE_I64;
|
||||
|
||||
auto it = set_rows_pipelines.find(key);
|
||||
if (it != set_rows_pipelines.end()) {
|
||||
@@ -955,7 +922,9 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_set_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_set_pipeline_key key = { .type = context.dst->type, .inplace = context.inplace };
|
||||
ggml_webgpu_set_pipeline_key key = {};
|
||||
key.type = context.dst->type;
|
||||
key.inplace = context.inplace;
|
||||
|
||||
auto it = set_pipelines.find(key);
|
||||
if (it != set_pipelines.end()) {
|
||||
@@ -1062,10 +1031,9 @@ class ggml_webgpu_shader_lib {
|
||||
|
||||
webgpu_pipeline get_get_rows_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
const bool vectorized = context.src0->type == GGML_TYPE_F32 && context.dst->ne[0] % 4 == 0;
|
||||
ggml_webgpu_get_rows_pipeline_key key = {
|
||||
.src_type = context.src0->type,
|
||||
.vectorized = (int) vectorized,
|
||||
};
|
||||
ggml_webgpu_get_rows_pipeline_key key = {};
|
||||
key.src_type = context.src0->type;
|
||||
key.vectorized = (int) vectorized;
|
||||
|
||||
auto it = get_rows_pipelines.find(key);
|
||||
if (it != get_rows_pipelines.end()) {
|
||||
@@ -1115,8 +1083,7 @@ class ggml_webgpu_shader_lib {
|
||||
std::string type_upper = type_str;
|
||||
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
|
||||
|
||||
switch (key.src_type)
|
||||
{
|
||||
switch (key.src_type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
@@ -1136,9 +1103,9 @@ class ggml_webgpu_shader_lib {
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
defines.push_back(std::string("SRC_TYPE=") + type_str);
|
||||
}
|
||||
{
|
||||
defines.push_back(std::string("SRC_TYPE=") + type_str);
|
||||
}
|
||||
}
|
||||
|
||||
defines.push_back("BYTE_HELPERS");
|
||||
@@ -1181,7 +1148,8 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_scale_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_scale_pipeline_key key = { .inplace = context.inplace };
|
||||
ggml_webgpu_scale_pipeline_key key = {};
|
||||
key.inplace = context.inplace;
|
||||
|
||||
auto it = scale_pipelines.find(key);
|
||||
if (it != scale_pipelines.end()) {
|
||||
@@ -1208,11 +1176,10 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_solve_tri_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_solve_tri_pipeline_key key = {
|
||||
.type = context.dst->type,
|
||||
.n = (int) context.src0->ne[0],
|
||||
.k = (int) context.src1->ne[0],
|
||||
};
|
||||
ggml_webgpu_solve_tri_pipeline_key key = {};
|
||||
key.type = context.dst->type;
|
||||
key.n = (int) context.src0->ne[0];
|
||||
key.k = (int) context.src1->ne[0];
|
||||
|
||||
auto it = solve_tri_pipelines.find(key);
|
||||
if (it != solve_tri_pipelines.end()) {
|
||||
@@ -1250,10 +1217,9 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_ssm_conv_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_ssm_conv_pipeline_key key = {
|
||||
.type = context.dst->type,
|
||||
.vectorized = context.src1->ne[0] == 4,
|
||||
};
|
||||
ggml_webgpu_ssm_conv_pipeline_key key = {};
|
||||
key.type = context.dst->type;
|
||||
key.vectorized = context.src1->ne[0] == 4;
|
||||
|
||||
auto it = ssm_conv_pipelines.find(key);
|
||||
if (it != ssm_conv_pipelines.end()) {
|
||||
@@ -1293,11 +1259,10 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_gated_delta_net_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_gated_delta_net_pipeline_key key = {
|
||||
.type = context.dst->type,
|
||||
.s_v = (int) context.src2->ne[0],
|
||||
.kda = context.src3->ne[0] == context.src2->ne[0],
|
||||
};
|
||||
ggml_webgpu_gated_delta_net_pipeline_key key = {};
|
||||
key.type = context.dst->type;
|
||||
key.s_v = (int) context.src2->ne[0];
|
||||
key.kda = context.src3->ne[0] == context.src2->ne[0];
|
||||
|
||||
auto it = gated_delta_net_pipelines.find(key);
|
||||
if (it != gated_delta_net_pipelines.end()) {
|
||||
@@ -1330,7 +1295,8 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_pad_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_pad_pipeline_key key = { .circular = ggml_get_op_params_i32(context.dst, 8) != 0 };
|
||||
ggml_webgpu_pad_pipeline_key key = {};
|
||||
key.circular = ggml_get_op_params_i32(context.dst, 8) != 0;
|
||||
|
||||
auto it = pad_pipelines.find(key);
|
||||
if (it != pad_pipelines.end()) {
|
||||
@@ -1357,15 +1323,13 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_mul_mat_vec_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_mul_mat_vec_pipeline_key key = {
|
||||
.src0_type = context.src0->type,
|
||||
.src1_type = context.src1->type,
|
||||
// Quantized mat-vec path currently runs scalar; only allow vectorization when both inputs are float
|
||||
.vectorized = (context.src0->ne[0] % 4 == 0 && context.dst->ne[0] % 4 == 0 &&
|
||||
(context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
|
||||
1 :
|
||||
0,
|
||||
};
|
||||
ggml_webgpu_mul_mat_vec_pipeline_key key = {};
|
||||
key.src0_type = context.src0->type;
|
||||
key.src1_type = context.src1->type;
|
||||
key.vectorized = (context.src0->ne[0] % 4 == 0 && context.dst->ne[0] % 4 == 0 &&
|
||||
(context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
|
||||
1 :
|
||||
0;
|
||||
|
||||
auto it = mul_mat_vec_pipelines.find(key);
|
||||
if (it != mul_mat_vec_pipelines.end()) {
|
||||
@@ -1451,15 +1415,14 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_mul_mat_fast_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_mul_mat_pipeline_key key = {
|
||||
.src0_type = context.src0->type,
|
||||
.src1_type = context.src1->type,
|
||||
.vectorized = (context.src0->ne[0] % 4 == 0 && context.dst->ne[0] % 4 == 0 && context.dst->ne[1] % 4 == 0 &&
|
||||
(context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
|
||||
1 :
|
||||
0,
|
||||
.use_subgroup_matrix = context.supports_subgroup_matrix
|
||||
};
|
||||
ggml_webgpu_mul_mat_pipeline_key key = {};
|
||||
key.src0_type = context.src0->type;
|
||||
key.src1_type = context.src1->type;
|
||||
key.vectorized = (context.src0->ne[0] % 4 == 0 && context.dst->ne[0] % 4 == 0 && context.dst->ne[1] % 4 == 0 &&
|
||||
(context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
|
||||
1 :
|
||||
0;
|
||||
key.use_subgroup_matrix = context.supports_subgroup_matrix;
|
||||
|
||||
auto it = mul_mat_fast_pipelines.find(key);
|
||||
if (it != mul_mat_fast_pipelines.end()) {
|
||||
@@ -1578,8 +1541,9 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_mul_mat_legacy_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_legacy_mul_mat_pipeline_key key = { .src0_type = context.src0->type,
|
||||
.src1_type = context.src1->type };
|
||||
ggml_webgpu_legacy_mul_mat_pipeline_key key = {};
|
||||
key.src0_type = context.src0->type;
|
||||
key.src1_type = context.src1->type;
|
||||
|
||||
auto it = mul_mat_legacy_pipelines.find(key);
|
||||
if (it != mul_mat_legacy_pipelines.end()) {
|
||||
@@ -1621,8 +1585,7 @@ class ggml_webgpu_shader_lib {
|
||||
std::string type_upper = src0_name;
|
||||
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
|
||||
|
||||
switch (context.src0->type)
|
||||
{
|
||||
switch (context.src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
@@ -1642,9 +1605,9 @@ class ggml_webgpu_shader_lib {
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
defines.push_back(std::string("SRC0_TYPE=") + src0_name);
|
||||
}
|
||||
{
|
||||
defines.push_back(std::string("SRC0_TYPE=") + src0_name);
|
||||
}
|
||||
}
|
||||
|
||||
defines.push_back("BYTE_HELPERS");
|
||||
@@ -1689,10 +1652,9 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_mul_mat_id_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_mul_mat_id_pipeline_key key = {
|
||||
.src0_type = context.src0->type,
|
||||
.src1_type = context.src1->type,
|
||||
};
|
||||
ggml_webgpu_mul_mat_id_pipeline_key key = {};
|
||||
key.src0_type = context.src0->type;
|
||||
key.src1_type = context.src1->type;
|
||||
|
||||
auto it = mul_mat_id_pipelines.find(key);
|
||||
if (it != mul_mat_id_pipelines.end()) {
|
||||
@@ -1782,13 +1744,12 @@ class ggml_webgpu_shader_lib {
|
||||
webgpu_pipeline get_unary_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
const bool is_unary = context.dst->op == GGML_OP_UNARY;
|
||||
const int op = is_unary ? (int) ggml_get_unary_op(context.dst) : context.dst->op;
|
||||
ggml_webgpu_unary_pipeline_key key = {
|
||||
.type = context.dst->type,
|
||||
.op = op,
|
||||
.is_unary = is_unary,
|
||||
.inplace = context.inplace,
|
||||
.ttype = (ggml_tri_type) ggml_get_op_params_i32(context.dst, 0),
|
||||
};
|
||||
ggml_webgpu_unary_pipeline_key key = {};
|
||||
key.type = context.dst->type;
|
||||
key.op = op;
|
||||
key.is_unary = is_unary;
|
||||
key.inplace = context.inplace;
|
||||
key.ttype = (ggml_tri_type) ggml_get_op_params_i32(context.dst, 0);
|
||||
|
||||
auto it = unary_pipelines.find(key);
|
||||
if (it != unary_pipelines.end()) {
|
||||
@@ -1853,13 +1814,12 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_binary_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_binary_pipeline_key key = {
|
||||
.type = context.dst->type,
|
||||
.op = context.dst->op,
|
||||
.inplace = context.inplace,
|
||||
.overlap = context.overlap,
|
||||
.src_overlap = context.src_overlap,
|
||||
};
|
||||
ggml_webgpu_binary_pipeline_key key = {};
|
||||
key.type = context.dst->type;
|
||||
key.op = context.dst->op;
|
||||
key.inplace = context.inplace;
|
||||
key.overlap = context.overlap;
|
||||
key.src_overlap = context.src_overlap;
|
||||
|
||||
auto it = binary_pipelines.find(key);
|
||||
if (it != binary_pipelines.end()) {
|
||||
@@ -1908,9 +1868,8 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_concat_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_concat_pipeline_key key = {
|
||||
.type = context.dst->type,
|
||||
};
|
||||
ggml_webgpu_concat_pipeline_key key = {};
|
||||
key.type = context.dst->type;
|
||||
|
||||
auto it = concat_pipelines.find(key);
|
||||
if (it != concat_pipelines.end()) {
|
||||
@@ -1945,9 +1904,8 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_repeat_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_repeat_pipeline_key key = {
|
||||
.type = context.dst->type,
|
||||
};
|
||||
ggml_webgpu_repeat_pipeline_key key = {};
|
||||
key.type = context.dst->type;
|
||||
|
||||
auto it = repeat_pipelines.find(key);
|
||||
if (it != repeat_pipelines.end()) {
|
||||
@@ -1985,16 +1943,16 @@ class ggml_webgpu_shader_lib {
|
||||
return repeat_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_flash_attn_shader_lib_context & context) {
|
||||
auto it = flash_attn_pipelines.find(context.key);
|
||||
webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
|
||||
auto it = flash_attn_pipelines.find(key);
|
||||
if (it != flash_attn_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "flash_attn";
|
||||
|
||||
switch (context.key.kv_type) {
|
||||
switch (key.kv_type) {
|
||||
case GGML_TYPE_F32:
|
||||
defines.push_back("KV_F32");
|
||||
break;
|
||||
@@ -2010,111 +1968,206 @@ class ggml_webgpu_shader_lib {
|
||||
default:
|
||||
GGML_ABORT("Unsupported KV type for flash attention shader");
|
||||
}
|
||||
variant += std::string("_") + ggml_type_name(context.key.kv_type);
|
||||
variant += std::string("_") + ggml_type_name(key.kv_type);
|
||||
|
||||
if (context.key.has_mask) {
|
||||
if (key.has_mask) {
|
||||
defines.push_back("MASK");
|
||||
variant += "_mask";
|
||||
}
|
||||
if (context.key.has_sinks) {
|
||||
if (key.has_sinks) {
|
||||
defines.push_back("SINKS");
|
||||
variant += "_sinks";
|
||||
}
|
||||
if (context.key.uses_logit_softcap) {
|
||||
if (key.uses_logit_softcap) {
|
||||
defines.push_back("LOGIT_SOFTCAP");
|
||||
variant += "_lgsc";
|
||||
}
|
||||
if (context.key.kv_direct) {
|
||||
if (key.kv_direct) {
|
||||
defines.push_back("KV_DIRECT");
|
||||
variant += "_kvdirect";
|
||||
}
|
||||
if (context.key.has_mask && context.key.use_vec) {
|
||||
defines.push_back("BLK");
|
||||
variant += "_blk";
|
||||
}
|
||||
|
||||
defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(context.key.head_dim_qk));
|
||||
variant += std::string("_hsqk") + std::to_string(context.key.head_dim_qk);
|
||||
defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(key.head_dim_qk));
|
||||
variant += std::string("_hsqk") + std::to_string(key.head_dim_qk);
|
||||
|
||||
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(context.key.head_dim_v));
|
||||
variant += std::string("_hsv") + std::to_string(context.key.head_dim_v);
|
||||
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v));
|
||||
variant += std::string("_hsv") + std::to_string(key.head_dim_v);
|
||||
|
||||
defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
|
||||
defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
|
||||
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
|
||||
|
||||
uint32_t q_tile = context.sg_mat_m;
|
||||
uint32_t kv_tile = std::min(ggml_webgpu_flash_attn_max_kv_tile(context),
|
||||
context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
|
||||
if (context.key.use_vec) {
|
||||
q_tile = 1;
|
||||
kv_tile = std::max(context.sg_mat_n, std::min(32u, ggml_webgpu_flash_attn_max_kv_tile(context)));
|
||||
kv_tile = (kv_tile / context.sg_mat_n) * context.sg_mat_n;
|
||||
const uint32_t vec_ne = ggml_webgpu_flash_attn_pick_vec_ne(context.key);
|
||||
defines.push_back(std::string("VEC_NE=") + std::to_string(vec_ne) + "u");
|
||||
}
|
||||
if (context.key.kv_direct) {
|
||||
GGML_ASSERT(kv_tile <= GGML_WEBGPU_KV_SEQ_PAD);
|
||||
auto decisions = std::make_shared<ggml_webgpu_flash_attn_decisions>();
|
||||
decisions->q_tile = context.sg_mat_m;
|
||||
|
||||
const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
|
||||
uint32_t kv_tile = std::min(min_kv_tile, context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
|
||||
|
||||
if (key.kv_direct) {
|
||||
kv_tile = std::min(kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
|
||||
while (GGML_WEBGPU_KV_SEQ_PAD % kv_tile != 0) {
|
||||
kv_tile -= context.sg_mat_n;
|
||||
}
|
||||
}
|
||||
|
||||
defines.push_back(std::string("Q_TILE=") + std::to_string(q_tile));
|
||||
defines.push_back(std::string("KV_TILE=") + std::to_string(kv_tile));
|
||||
decisions->kv_tile = kv_tile;
|
||||
decisions->wg_size = std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE);
|
||||
|
||||
uint32_t wg_size = 0;
|
||||
if (context.key.use_vec) {
|
||||
wg_size = std::max(1u, std::min<uint32_t>(32u, context.max_subgroup_size));
|
||||
} else {
|
||||
wg_size = std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE);
|
||||
}
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
|
||||
defines.push_back(std::string("Q_TILE=") + std::to_string(decisions->q_tile));
|
||||
defines.push_back(std::string("KV_TILE=") + std::to_string(decisions->kv_tile));
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions->wg_size));
|
||||
|
||||
const char * shader_src = context.key.use_vec ? wgsl_flash_attn_vec_split : wgsl_flash_attn;
|
||||
webgpu_pipeline pipeline =
|
||||
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(shader_src, defines), variant);
|
||||
auto decisions = std::make_shared<ggml_webgpu_flash_attn_shader_decisions>();
|
||||
decisions->q_tile = q_tile;
|
||||
decisions->kv_tile = kv_tile;
|
||||
decisions->wg_size = wg_size;
|
||||
pipeline.context = decisions;
|
||||
flash_attn_pipelines[context.key] = pipeline;
|
||||
return flash_attn_pipelines[context.key];
|
||||
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn, defines), variant);
|
||||
pipeline.context = decisions;
|
||||
flash_attn_pipelines[key] = pipeline;
|
||||
return flash_attn_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_flash_attn_blk_pipeline(const ggml_webgpu_flash_attn_blk_shader_lib_context & context) {
|
||||
auto it = flash_attn_blk_pipelines.find(context.key);
|
||||
webgpu_pipeline get_flash_attn_vec_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
|
||||
auto it = flash_attn_vec_pipelines.find(key);
|
||||
if (it != flash_attn_vec_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "flash_attn_vec";
|
||||
|
||||
switch (key.kv_type) {
|
||||
case GGML_TYPE_F32:
|
||||
defines.push_back("KV_F32");
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
defines.push_back("KV_F16");
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
defines.push_back("KV_Q4_0");
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
defines.push_back("KV_Q8_0");
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported KV type for flash attention shader");
|
||||
}
|
||||
variant += std::string("_") + ggml_type_name(key.kv_type);
|
||||
|
||||
if (key.has_mask) {
|
||||
defines.push_back("MASK");
|
||||
defines.push_back("BLK");
|
||||
variant += "_mask_blk";
|
||||
}
|
||||
if (key.has_sinks) {
|
||||
defines.push_back("SINKS");
|
||||
variant += "_sinks";
|
||||
}
|
||||
if (key.uses_logit_softcap) {
|
||||
defines.push_back("LOGIT_SOFTCAP");
|
||||
variant += "_lgsc";
|
||||
}
|
||||
if (key.kv_direct) {
|
||||
defines.push_back("KV_DIRECT");
|
||||
variant += "_kvdirect";
|
||||
}
|
||||
|
||||
defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(key.head_dim_qk));
|
||||
variant += std::string("_hsqk") + std::to_string(key.head_dim_qk);
|
||||
|
||||
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v));
|
||||
variant += std::string("_hsv") + std::to_string(key.head_dim_v);
|
||||
|
||||
defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
|
||||
defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
|
||||
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
|
||||
defines.push_back("Q_TILE=1");
|
||||
|
||||
auto decisions = std::make_shared<ggml_webgpu_flash_attn_vec_decisions>();
|
||||
decisions->kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(context);
|
||||
decisions->wg_size = std::max(1u, std::min<uint32_t>(32u, context.max_subgroup_size));
|
||||
uint32_t vec_ne = 1u;
|
||||
|
||||
// Keep conservative defaults unless this is the f16 vec-split shape family.
|
||||
if (key.kv_type == GGML_TYPE_F16 && key.head_dim_qk == key.head_dim_v) {
|
||||
switch (key.head_dim_qk) {
|
||||
case 64:
|
||||
case 192:
|
||||
case 576:
|
||||
vec_ne = 2u;
|
||||
break;
|
||||
case 96:
|
||||
vec_ne = 4u;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
defines.push_back(std::string("KV_TILE=") + std::to_string(decisions->kv_tile));
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions->wg_size));
|
||||
defines.push_back(std::string("VEC_NE=") + std::to_string(vec_ne) + "u");
|
||||
|
||||
webgpu_pipeline pipeline =
|
||||
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn_vec_split, defines), variant);
|
||||
pipeline.context = decisions;
|
||||
flash_attn_vec_pipelines[key] = pipeline;
|
||||
return flash_attn_vec_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_flash_attn_blk_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_flash_attn_blk_pipeline_key key = {};
|
||||
key.kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(context);
|
||||
auto it = flash_attn_blk_pipelines.find(key);
|
||||
if (it != flash_attn_blk_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_flash_attn_blk_shader(preprocessor, wgsl_flash_attn_vec_blk, context);
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed.wgsl, processed.variant);
|
||||
flash_attn_blk_pipelines[context.key] = pipeline;
|
||||
return flash_attn_blk_pipelines[context.key];
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "flash_attn_vec_blk";
|
||||
|
||||
defines.push_back(std::string("KV_TILE=") + std::to_string(key.kv_tile));
|
||||
variant += std::string("_kvt") + std::to_string(key.kv_tile);
|
||||
|
||||
uint32_t wg_size = 1;
|
||||
while ((wg_size << 1) <= context.max_wg_size) {
|
||||
wg_size <<= 1;
|
||||
}
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
|
||||
variant += std::string("_wg") + std::to_string(wg_size);
|
||||
|
||||
webgpu_pipeline pipeline =
|
||||
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn_vec_blk, defines), variant);
|
||||
flash_attn_blk_pipelines[key] = pipeline;
|
||||
return flash_attn_blk_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_flash_attn_vec_reduce_pipeline(
|
||||
const ggml_webgpu_flash_attn_vec_reduce_shader_lib_context & context) {
|
||||
auto it = flash_attn_vec_reduce_pipelines.find(context.key);
|
||||
webgpu_pipeline get_flash_attn_vec_reduce_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_flash_attn_vec_reduce_pipeline_key key = {};
|
||||
key.head_dim_v = (uint32_t) context.src2->ne[0];
|
||||
key.wg_size = context.max_wg_size;
|
||||
auto it = flash_attn_vec_reduce_pipelines.find(key);
|
||||
if (it != flash_attn_vec_reduce_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_flash_attn_vec_reduce_shader(preprocessor, wgsl_flash_attn_vec_reduce, context);
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed.wgsl, processed.variant);
|
||||
flash_attn_vec_reduce_pipelines[context.key] = pipeline;
|
||||
return flash_attn_vec_reduce_pipelines[context.key];
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "flash_attn_vec_reduce";
|
||||
|
||||
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v));
|
||||
variant += std::string("_hsv") + std::to_string(key.head_dim_v);
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
|
||||
variant += std::string("_wg") + std::to_string(context.max_wg_size);
|
||||
|
||||
webgpu_pipeline pipeline =
|
||||
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn_vec_reduce, defines), variant);
|
||||
flash_attn_vec_reduce_pipelines[key] = pipeline;
|
||||
return flash_attn_vec_reduce_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_cpy_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_cpy_pipeline_key key = {
|
||||
.src_type = context.src0->type,
|
||||
.dst_type = context.dst->type,
|
||||
};
|
||||
ggml_webgpu_cpy_pipeline_key key = {};
|
||||
key.src_type = context.src0->type;
|
||||
key.dst_type = context.dst->type;
|
||||
|
||||
auto it = cpy_pipelines.find(key);
|
||||
if (it != cpy_pipelines.end()) {
|
||||
@@ -2166,11 +2219,10 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_glu_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_glu_pipeline_key key = {
|
||||
.glu_op = ggml_get_glu_op(context.dst),
|
||||
.type = context.dst->type,
|
||||
.split = (context.src1 != nullptr),
|
||||
};
|
||||
ggml_webgpu_glu_pipeline_key key = {};
|
||||
key.glu_op = ggml_get_glu_op(context.dst);
|
||||
key.type = context.dst->type;
|
||||
key.split = (context.src1 != nullptr);
|
||||
|
||||
auto it = glu_pipelines.find(key);
|
||||
if (it != glu_pipelines.end()) {
|
||||
@@ -2239,11 +2291,10 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_rope_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_rope_pipeline_key key = {
|
||||
.type = context.dst->type,
|
||||
.inplace = context.inplace,
|
||||
.has_ff = (context.src2 != nullptr),
|
||||
};
|
||||
ggml_webgpu_rope_pipeline_key key = {};
|
||||
key.type = context.dst->type;
|
||||
key.inplace = context.inplace;
|
||||
key.has_ff = (context.src2 != nullptr);
|
||||
|
||||
auto it = rope_pipelines.find(key);
|
||||
if (it != rope_pipelines.end()) {
|
||||
@@ -2288,12 +2339,11 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
|
||||
webgpu_pipeline get_soft_max_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_soft_max_pipeline_key key = {
|
||||
.mask_type = context.src1 ? context.src1->type : GGML_TYPE_F32,
|
||||
.has_mask = (context.src1 != nullptr),
|
||||
.has_sink = (context.src2 != nullptr),
|
||||
.inplace = context.inplace,
|
||||
};
|
||||
ggml_webgpu_soft_max_pipeline_key key = {};
|
||||
key.mask_type = context.src1 ? context.src1->type : GGML_TYPE_F32;
|
||||
key.has_mask = (context.src1 != nullptr);
|
||||
key.has_sink = (context.src2 != nullptr);
|
||||
key.inplace = context.inplace;
|
||||
|
||||
auto it = soft_max_pipelines.find(key);
|
||||
if (it != soft_max_pipelines.end()) {
|
||||
@@ -2359,25 +2409,6 @@ class ggml_webgpu_shader_lib {
|
||||
pipeline_desc.layout = nullptr; // nullptr means auto layout
|
||||
return { device.CreateComputePipeline(&pipeline_desc), label };
|
||||
}
|
||||
|
||||
static uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_flash_attn_shader_lib_context & context) {
|
||||
const size_t limit_bytes = context.wg_mem_limit_bytes;
|
||||
const size_t q_tile = context.sg_mat_m;
|
||||
const size_t base_q_bytes =
|
||||
(context.key.head_dim_qk + context.key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES +
|
||||
2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES;
|
||||
size_t bytes_per_kv = 0;
|
||||
if (!context.key.kv_direct) {
|
||||
bytes_per_kv += std::max(context.key.head_dim_qk, context.key.head_dim_v);
|
||||
}
|
||||
if (context.key.has_mask) {
|
||||
bytes_per_kv += q_tile;
|
||||
}
|
||||
bytes_per_kv += q_tile;
|
||||
bytes_per_kv *= GGML_WEBGPU_F16_SIZE_BYTES;
|
||||
const uint32_t max_kv_tile = (limit_bytes - base_q_bytes) / bytes_per_kv;
|
||||
return (max_kv_tile / context.sg_mat_n) * context.sg_mat_n;
|
||||
}
|
||||
};
|
||||
|
||||
#endif // GGML_WEBGPU_SHADER_LIB_HPP
|
||||
|
||||
+931
-1320
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,6 @@
|
||||
diagnostic(off, subgroup_uniformity);
|
||||
enable f16;
|
||||
|
||||
#define Q_TILE 1
|
||||
#define KV_TILE 32
|
||||
#define WG_SIZE 32
|
||||
|
||||
@@ -11,7 +10,7 @@ struct Params {
|
||||
seq_len_kv: u32,
|
||||
stride_mask3: u32,
|
||||
// Number of KV blocks and Q blocks per batch.
|
||||
// nblk0 = ceil(seq_len_kv / KV_TILE), nblk1 = ceil(seq_len_q / Q_TILE).
|
||||
// nblk0 = ceil(seq_len_kv / KV_TILE), nblk1 = seq_len_q.
|
||||
nblk0: u32,
|
||||
nblk1: u32,
|
||||
};
|
||||
@@ -40,7 +39,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
return;
|
||||
}
|
||||
|
||||
let q_start = q_blk * Q_TILE;
|
||||
let q_start = q_blk;
|
||||
let k_start = kv_blk * KV_TILE;
|
||||
|
||||
let mask_batch = select(0u, batch_idx, params.stride_mask3 > 0u);
|
||||
@@ -54,11 +53,8 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
var local_max = -MASK_MAX;
|
||||
var local_any = 0u;
|
||||
|
||||
for (var q_rel = 0u; q_rel < Q_TILE; q_rel += 1u) {
|
||||
let q_row = q_start + q_rel;
|
||||
if (q_row >= params.seq_len_q) {
|
||||
continue;
|
||||
}
|
||||
let q_row = q_start;
|
||||
if (q_row < params.seq_len_q) {
|
||||
let row_base = mask_batch_base + q_row * params.seq_len_kv;
|
||||
for (var k_rel = local_id.x; k_rel < KV_TILE; k_rel += WG_SIZE) {
|
||||
let k_col = k_start + k_rel;
|
||||
|
||||
@@ -53,6 +53,16 @@
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
uint64_t ggml_graph_next_uid(void) {
|
||||
#ifdef _MSC_VER
|
||||
static volatile long long counter = 1;
|
||||
return (uint64_t) _InterlockedIncrement64(&counter) - 1;
|
||||
#else
|
||||
static uint64_t counter = 1;
|
||||
return __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Needed for ggml_fp32_to_bf16_row()
|
||||
#if defined(__AVX512BF16__)
|
||||
#if defined(_MSC_VER)
|
||||
@@ -7098,6 +7108,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
|
||||
/*.use_counts =*/ use_counts_ptr,
|
||||
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
|
||||
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
|
||||
/*.uid =*/ 0,
|
||||
};
|
||||
|
||||
ggml_hash_set_reset(&cgraph->visited_hash_set);
|
||||
@@ -7125,6 +7136,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
|
||||
/*.use_counts =*/ cgraph0->use_counts,
|
||||
/*.visited_hash_set =*/ cgraph0->visited_hash_set,
|
||||
/*.order =*/ cgraph0->order,
|
||||
/*.uid =*/ 0
|
||||
};
|
||||
|
||||
return cgraph;
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
set(TARGET llama-vdot)
|
||||
add_executable(${TARGET} vdot.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-q8dot)
|
||||
add_executable(${TARGET} q8dot.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
HTTPLIB_VERSION = "refs/tags/v0.40.0"
|
||||
HTTPLIB_VERSION = "refs/tags/v0.42.0"
|
||||
|
||||
vendor = {
|
||||
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
|
||||
|
||||
+3
-113
@@ -6,6 +6,8 @@ llama_add_compile_flags()
|
||||
|
||||
# llama
|
||||
|
||||
file(GLOB LLAMA_MODELS_SOURCES "models/*.cpp")
|
||||
|
||||
add_library(llama
|
||||
../include/llama.h
|
||||
llama.cpp
|
||||
@@ -36,119 +38,7 @@ add_library(llama
|
||||
unicode-data.cpp
|
||||
unicode.cpp
|
||||
unicode.h
|
||||
models/afmoe.cpp
|
||||
models/apertus.cpp
|
||||
models/arcee.cpp
|
||||
models/arctic.cpp
|
||||
models/arwkv7.cpp
|
||||
models/baichuan.cpp
|
||||
models/bailingmoe.cpp
|
||||
models/bailingmoe2.cpp
|
||||
models/bert.cpp
|
||||
models/bitnet.cpp
|
||||
models/bloom.cpp
|
||||
models/chameleon.cpp
|
||||
models/chatglm.cpp
|
||||
models/codeshell.cpp
|
||||
models/cogvlm.cpp
|
||||
models/cohere2-iswa.cpp
|
||||
models/command-r.cpp
|
||||
models/dbrx.cpp
|
||||
models/deci.cpp
|
||||
models/deepseek.cpp
|
||||
models/deepseek2.cpp
|
||||
models/delta-net-base.cpp
|
||||
models/dots1.cpp
|
||||
models/dream.cpp
|
||||
models/ernie4-5-moe.cpp
|
||||
models/ernie4-5.cpp
|
||||
models/eurobert.cpp
|
||||
models/exaone-moe.cpp
|
||||
models/exaone.cpp
|
||||
models/exaone4.cpp
|
||||
models/falcon-h1.cpp
|
||||
models/falcon.cpp
|
||||
models/gemma-embedding.cpp
|
||||
models/gemma.cpp
|
||||
models/gemma2-iswa.cpp
|
||||
models/gemma3.cpp
|
||||
models/gemma3n-iswa.cpp
|
||||
models/gemma4-iswa.cpp
|
||||
models/glm4-moe.cpp
|
||||
models/glm4.cpp
|
||||
models/gpt2.cpp
|
||||
models/gptneox.cpp
|
||||
models/granite-hybrid.cpp
|
||||
models/granite.cpp
|
||||
models/grok.cpp
|
||||
models/grovemoe.cpp
|
||||
models/hunyuan-dense.cpp
|
||||
models/hunyuan-moe.cpp
|
||||
models/internlm2.cpp
|
||||
models/jais.cpp
|
||||
models/jais2.cpp
|
||||
models/jamba.cpp
|
||||
models/kimi-linear.cpp
|
||||
models/lfm2.cpp
|
||||
models/llada-moe.cpp
|
||||
models/llada.cpp
|
||||
models/llama-iswa.cpp
|
||||
models/llama.cpp
|
||||
models/maincoder.cpp
|
||||
models/mamba-base.cpp
|
||||
models/mamba.cpp
|
||||
models/mimo2-iswa.cpp
|
||||
models/minicpm3.cpp
|
||||
models/minimax-m2.cpp
|
||||
models/mistral3.cpp
|
||||
models/modern-bert.cpp
|
||||
models/mpt.cpp
|
||||
models/nemotron-h.cpp
|
||||
models/nemotron.cpp
|
||||
models/neo-bert.cpp
|
||||
models/olmo.cpp
|
||||
models/olmo2.cpp
|
||||
models/olmoe.cpp
|
||||
models/openai-moe-iswa.cpp
|
||||
models/openelm.cpp
|
||||
models/orion.cpp
|
||||
models/paddleocr.cpp
|
||||
models/pangu-embedded.cpp
|
||||
models/phi2.cpp
|
||||
models/phi3.cpp
|
||||
models/plamo.cpp
|
||||
models/plamo2.cpp
|
||||
models/plamo3.cpp
|
||||
models/plm.cpp
|
||||
models/qwen.cpp
|
||||
models/qwen2.cpp
|
||||
models/qwen2moe.cpp
|
||||
models/qwen2vl.cpp
|
||||
models/qwen3.cpp
|
||||
models/qwen35.cpp
|
||||
models/qwen35moe.cpp
|
||||
models/qwen3moe.cpp
|
||||
models/qwen3next.cpp
|
||||
models/qwen3vl-moe.cpp
|
||||
models/qwen3vl.cpp
|
||||
models/refact.cpp
|
||||
models/rnd1.cpp
|
||||
models/rwkv6-base.cpp
|
||||
models/rwkv6.cpp
|
||||
models/rwkv6qwen2.cpp
|
||||
models/rwkv7-base.cpp
|
||||
models/rwkv7.cpp
|
||||
models/seed-oss.cpp
|
||||
models/smallthinker.cpp
|
||||
models/smollm3.cpp
|
||||
models/stablelm.cpp
|
||||
models/starcoder.cpp
|
||||
models/starcoder2.cpp
|
||||
models/step35-iswa.cpp
|
||||
models/t5-dec.cpp
|
||||
models/t5-enc.cpp
|
||||
models/wavtokenizer-dec.cpp
|
||||
models/xverse.cpp
|
||||
${LLAMA_MODELS_SOURCES}
|
||||
)
|
||||
|
||||
set_target_properties(llama PROPERTIES
|
||||
|
||||
+99
-5
@@ -1,6 +1,7 @@
|
||||
#include "llama-graph.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-model.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-cparams.h"
|
||||
|
||||
@@ -1059,6 +1060,84 @@ ggml_tensor * llm_graph_context::build_norm(
|
||||
return cur;
|
||||
}
|
||||
|
||||
|
||||
llm_graph_qkv llm_graph_context::build_qkv(
|
||||
const llama_layer & layer,
|
||||
ggml_tensor * cur,
|
||||
int64_t n_embd_head,
|
||||
int64_t n_head,
|
||||
int64_t n_head_kv,
|
||||
int il) const {
|
||||
const int64_t n_embd_q = n_embd_head * n_head;
|
||||
const int64_t n_embd_kv = n_embd_head * n_head_kv;
|
||||
|
||||
ggml_tensor * Qcur, * Kcur, * Vcur;
|
||||
|
||||
if (layer.wqkv) {
|
||||
// fused QKV path
|
||||
ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur, layer.wqkv_s);
|
||||
cb(qkv, "wqkv", il);
|
||||
if (layer.wqkv_b) {
|
||||
qkv = ggml_add(ctx0, qkv, layer.wqkv_b);
|
||||
cb(qkv, "wqkv_b", il);
|
||||
}
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
qkv = ggml_clamp(ctx0, qkv, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||
cb(qkv, "wqkv_clamped", il);
|
||||
}
|
||||
Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens,
|
||||
ggml_row_size(qkv->type, n_embd_head), qkv->nb[1], 0);
|
||||
Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
|
||||
ggml_row_size(qkv->type, n_embd_head), qkv->nb[1],
|
||||
ggml_row_size(qkv->type, n_embd_q));
|
||||
Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
|
||||
ggml_row_size(qkv->type, n_embd_head), qkv->nb[1],
|
||||
ggml_row_size(qkv->type, n_embd_q + n_embd_kv));
|
||||
} else {
|
||||
// separate Q/K/V path
|
||||
Qcur = build_lora_mm(layer.wq, cur, layer.wq_s);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (layer.wq_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.wq_b);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||
cb(Qcur, "Qcur_clamped", il);
|
||||
}
|
||||
Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (layer.wk_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.wk_b);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||
cb(Kcur, "Kcur_clamped", il);
|
||||
}
|
||||
Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (layer.wv_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.wv_b);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||
cb(Vcur, "Vcur_clamped", il);
|
||||
}
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
return { Qcur, Kcur, Vcur };
|
||||
}
|
||||
|
||||
|
||||
ggml_tensor * llm_graph_context::build_ffn(
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * up,
|
||||
@@ -2011,6 +2090,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_no_cache * inp,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * wo_s,
|
||||
ggml_tensor * q_cur,
|
||||
ggml_tensor * k_cur,
|
||||
ggml_tensor * v_cur,
|
||||
@@ -2044,7 +2124,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
cur = build_lora_mm(wo, cur, wo_s);
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
@@ -2095,6 +2175,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_kv * inp,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * wo_s,
|
||||
ggml_tensor * q_cur,
|
||||
ggml_tensor * k_cur,
|
||||
ggml_tensor * v_cur,
|
||||
@@ -2146,10 +2227,15 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
}
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
|
||||
// GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
|
||||
cur = build_lora_mm(wo, cur);
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
if (wo_s) {
|
||||
cur = ggml_mul(ctx0, cur, wo_s);
|
||||
}
|
||||
} else {
|
||||
cur = build_lora_mm(wo, cur, wo_s);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2193,6 +2279,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_k * inp,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * wo_s,
|
||||
ggml_tensor * q_cur,
|
||||
ggml_tensor * k_cur,
|
||||
ggml_tensor * v_cur,
|
||||
@@ -2227,10 +2314,15 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
|
||||
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
|
||||
cur = build_lora_mm(wo, cur);
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
if (wo_s) {
|
||||
cur = ggml_mul(ctx0, cur, wo_s);
|
||||
}
|
||||
} else {
|
||||
cur = build_lora_mm(wo, cur, wo_s);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2245,6 +2337,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_kv_iswa * inp,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * wo_s,
|
||||
ggml_tensor * q_cur,
|
||||
ggml_tensor * k_cur,
|
||||
ggml_tensor * v_cur,
|
||||
@@ -2313,7 +2406,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
}
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
cur = build_lora_mm(wo, cur, wo_s);
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
@@ -2344,6 +2437,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_cross * inp,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * wo_s,
|
||||
ggml_tensor * q_cur,
|
||||
ggml_tensor * k_cur,
|
||||
ggml_tensor * v_cur,
|
||||
@@ -2368,7 +2462,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
cur = build_lora_mm(wo, cur, wo_s);
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
|
||||
@@ -17,6 +17,7 @@ struct ggml_context;
|
||||
struct ggml_tensor;
|
||||
|
||||
struct llama_cparams;
|
||||
struct llama_layer;
|
||||
|
||||
struct llama_memory_context_i;
|
||||
|
||||
@@ -707,6 +708,12 @@ using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
|
||||
// used in build_rs to properly order writes and avoid unnecessary copies
|
||||
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
|
||||
|
||||
struct llm_graph_qkv {
|
||||
ggml_tensor * q; // [n_embd_head, n_head, n_tokens]
|
||||
ggml_tensor * k; // [n_embd_head, n_head_kv, n_tokens]
|
||||
ggml_tensor * v; // [n_embd_head, n_head_kv, n_tokens]
|
||||
};
|
||||
|
||||
struct llm_graph_context {
|
||||
const llm_arch arch;
|
||||
|
||||
@@ -793,6 +800,17 @@ struct llm_graph_context {
|
||||
llm_norm_type type,
|
||||
int il) const;
|
||||
|
||||
|
||||
// compute Q, K, V projections with optional bias and reshape
|
||||
// supports both fused wqkv and separate wq/wk/wv paths
|
||||
llm_graph_qkv build_qkv(
|
||||
const llama_layer & layer,
|
||||
ggml_tensor * cur,
|
||||
int64_t n_embd_head,
|
||||
int64_t n_head,
|
||||
int64_t n_head_kv,
|
||||
int il) const;
|
||||
|
||||
ggml_tensor * build_ffn(
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * up,
|
||||
@@ -892,6 +910,7 @@ struct llm_graph_context {
|
||||
llm_graph_input_attn_no_cache * inp,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * wo_s,
|
||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
||||
@@ -907,6 +926,7 @@ struct llm_graph_context {
|
||||
llm_graph_input_attn_kv * inp,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * wo_s,
|
||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
||||
@@ -922,6 +942,7 @@ struct llm_graph_context {
|
||||
llm_graph_input_attn_k * inp,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * wo_s,
|
||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
||||
@@ -938,6 +959,7 @@ struct llm_graph_context {
|
||||
llm_graph_input_attn_kv_iswa * inp,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * wo_s,
|
||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
|
||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
|
||||
@@ -953,6 +975,7 @@ struct llm_graph_context {
|
||||
llm_graph_input_attn_cross * inp,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * wo_s,
|
||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
||||
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
||||
|
||||
+233
-457
File diff suppressed because it is too large
Load Diff
+4
-7
@@ -84,6 +84,7 @@ enum llm_type {
|
||||
LLM_TYPE_26B,
|
||||
LLM_TYPE_27B,
|
||||
LLM_TYPE_30B,
|
||||
LLM_TYPE_31B,
|
||||
LLM_TYPE_32B,
|
||||
LLM_TYPE_34B,
|
||||
LLM_TYPE_35B,
|
||||
@@ -118,6 +119,7 @@ enum llm_type {
|
||||
LLM_TYPE_16B_A1B,
|
||||
LLM_TYPE_21B_A3B, // Ernie MoE small
|
||||
LLM_TYPE_24B_A2B, // lfm2moe
|
||||
LLM_TYPE_26B_A4B, // Gemma4
|
||||
LLM_TYPE_30B_A3B,
|
||||
LLM_TYPE_31B_A3_5B,
|
||||
LLM_TYPE_35B_A3B, // Qwen3.5
|
||||
@@ -244,6 +246,8 @@ struct llama_layer {
|
||||
struct ggml_tensor * wkv_b = nullptr;
|
||||
struct ggml_tensor * wk_b = nullptr;
|
||||
struct ggml_tensor * wv_b = nullptr;
|
||||
struct ggml_tensor * wqkv_b = nullptr;
|
||||
struct ggml_tensor * wo_b = nullptr;
|
||||
struct ggml_tensor * wq_cross = nullptr;
|
||||
struct ggml_tensor * wk_cross = nullptr;
|
||||
struct ggml_tensor * wv_cross = nullptr;
|
||||
@@ -254,13 +258,6 @@ struct llama_layer {
|
||||
struct ggml_tensor * wo_enc = nullptr;
|
||||
struct ggml_tensor * wqkv_gate = nullptr;
|
||||
|
||||
// attention bias
|
||||
struct ggml_tensor * bq = nullptr;
|
||||
struct ggml_tensor * bk = nullptr;
|
||||
struct ggml_tensor * bv = nullptr;
|
||||
struct ggml_tensor * bo = nullptr;
|
||||
struct ggml_tensor * bqkv = nullptr;
|
||||
|
||||
// relative position bias
|
||||
struct ggml_tensor * attn_rel_b = nullptr;
|
||||
struct ggml_tensor * attn_rel_b_enc = nullptr;
|
||||
|
||||
+99
-58
@@ -91,12 +91,16 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
||||
throw std::runtime_error("failed to create llama_context from model");
|
||||
}
|
||||
|
||||
std::vector<llama_device_memory_data> ret(model->devices.size());
|
||||
const size_t nd = model->n_devices();
|
||||
std::vector<llama_device_memory_data> ret(nd + 1);
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
||||
|
||||
for (const auto & [buft, mb] : memory_breakdown) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
ret.back().mb.model += mb.model;
|
||||
ret.back().mb.context += mb.context;
|
||||
ret.back().mb.compute += mb.compute;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -104,7 +108,7 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
||||
if (!dev) {
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 0; i < ret.size(); i++) {
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
if (model->devices[i].dev == dev) {
|
||||
ret[i].mb.model += mb.model;
|
||||
ret[i].mb.context += mb.context;
|
||||
@@ -113,7 +117,19 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < ret.size(); i++) {
|
||||
|
||||
{
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
||||
ret.back().free = free;
|
||||
ret.back().total = total;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(model->devices[i].dev, &free, &total);
|
||||
@@ -122,11 +138,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
||||
// have any to report. in this case, we will use the host memory as a fallback
|
||||
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
||||
if (free == 0 && total == 0) {
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
||||
free = ret.back().free;
|
||||
total = ret.back().total;
|
||||
}
|
||||
ret[i].free = free;
|
||||
ret[i].total = total;
|
||||
@@ -180,15 +193,15 @@ static void llama_params_fit_impl(
|
||||
LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
if (nd == 0) {
|
||||
LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
margins.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
margins.push_back(margins_s[id]);
|
||||
if (nd == 0) {
|
||||
margins.push_back(margins_s[0]);
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
margins.push_back(margins_s[id]);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> dev_names;
|
||||
@@ -215,46 +228,59 @@ static void llama_params_fit_impl(
|
||||
std::vector<int64_t> projected_free_per_device;
|
||||
projected_free_per_device.reserve(nd);
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
projected_free_per_device.push_back(projected_free);
|
||||
|
||||
sum_free += dmd.free;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
}
|
||||
}
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
if (nd == 0) {
|
||||
sum_projected_used = dmds_full.back().mb.total();
|
||||
sum_free = dmds_full.back().total;
|
||||
sum_projected_free = sum_free - sum_projected_used;
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (sum_projected_free >= margins[0]) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
|
||||
__func__, sum_projected_free/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bool changes_needed = false;
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (projected_free_per_device[id] < margins[id]) {
|
||||
changes_needed = true;
|
||||
break;
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
projected_free_per_device.push_back(projected_free);
|
||||
|
||||
sum_free += dmd.free;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
return;
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bool changes_needed = false;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (projected_free_per_device[id] < margins[id]) {
|
||||
changes_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -262,11 +288,15 @@ static void llama_params_fit_impl(
|
||||
|
||||
{
|
||||
int64_t global_surplus = sum_projected_free;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus -= margins[id];
|
||||
if (nd == 0) {
|
||||
global_surplus -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus -= margins[id];
|
||||
}
|
||||
}
|
||||
if (global_surplus < 0) {
|
||||
if (nd == 1) {
|
||||
if (nd <= 1) {
|
||||
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
||||
__func__, margins[0]/MiB, -global_surplus/MiB);
|
||||
} else {
|
||||
@@ -277,8 +307,12 @@ static void llama_params_fit_impl(
|
||||
if (cparams->n_ctx == 0) {
|
||||
if (hp_nct > n_ctx_min) {
|
||||
int64_t sum_used_target = sum_free;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_used_target -= margins[id];
|
||||
if (nd == 0) {
|
||||
sum_used_target -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_used_target -= margins[id];
|
||||
}
|
||||
}
|
||||
if (nd > 1) {
|
||||
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
||||
@@ -293,8 +327,12 @@ static void llama_params_fit_impl(
|
||||
int64_t sum_projected_used_min_ctx = 0;
|
||||
cparams->n_ctx = n_ctx_min;
|
||||
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
for (const auto & dmd : dmds_min_ctx) {
|
||||
sum_projected_used_min_ctx += dmd.mb.total();
|
||||
if (nd == 0) {
|
||||
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
|
||||
}
|
||||
}
|
||||
if (sum_used_target > sum_projected_used_min_ctx) {
|
||||
// linear interpolation between minimum and maximum context size:
|
||||
@@ -306,7 +344,7 @@ static void llama_params_fit_impl(
|
||||
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
|
||||
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
if (nd == 1) {
|
||||
if (nd <= 1) {
|
||||
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
|
||||
return;
|
||||
}
|
||||
@@ -329,6 +367,9 @@ static void llama_params_fit_impl(
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nd == 0) {
|
||||
throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
|
||||
}
|
||||
|
||||
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
|
||||
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
|
||||
@@ -476,8 +517,8 @@ static void llama_params_fit_impl(
|
||||
|
||||
std::vector<int64_t> ret;
|
||||
ret.reserve(nd);
|
||||
for (const llama_device_memory_data & dmd : dmd_nl) {
|
||||
ret.push_back(dmd.mb.total());
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
ret.push_back(dmd_nl[id].mb.total());
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
|
||||
+4
-15
@@ -41,22 +41,13 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
|
||||
{
|
||||
ggml_tensor * attn_inp = cur; // save input for gate computation
|
||||
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
// compute gate from input
|
||||
ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp);
|
||||
cb(gate, "attn_gate_proj", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
// Q/K normalization
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||
@@ -77,10 +68,8 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
|
||||
cb(Kcur, "Kcur_rope", il);
|
||||
}
|
||||
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
NULL, NULL, // wo will be applied after gating
|
||||
NULL, NULL, NULL, // wo will be applied after gating
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
|
||||
@@ -91,7 +80,7 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
|
||||
cb(cur, "attn_gated", il);
|
||||
|
||||
// now apply output projection
|
||||
cur = build_lora_mm(model.layers[il].wo, cur);
|
||||
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
|
||||
cb(cur, "attn_o_proj", il);
|
||||
}
|
||||
|
||||
|
||||
+3
-15
@@ -1,7 +1,5 @@
|
||||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
@@ -32,25 +30,15 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
@@ -62,7 +50,7 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
|
||||
cb(Vcur, "Vcur_pos", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
||||
+3
-26
@@ -1,6 +1,5 @@
|
||||
#include "models.h"
|
||||
|
||||
|
||||
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
@@ -36,30 +35,8 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
@@ -78,7 +55,7 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
||||
+3
-13
@@ -30,18 +30,8 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
@@ -60,7 +50,7 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL,
|
||||
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
||||
+3
-14
@@ -1,6 +1,5 @@
|
||||
#include "models.h"
|
||||
|
||||
|
||||
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
@@ -29,18 +28,8 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
|
||||
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
switch (model.type) {
|
||||
case LLM_TYPE_7B:
|
||||
@@ -67,7 +56,7 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL,
|
||||
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
||||
@@ -28,30 +28,8 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head_k, n_head, n_head_kv, il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
@@ -70,7 +48,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
@@ -29,15 +28,8 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
|
||||
|
||||
// self_attention
|
||||
{
|
||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
|
||||
cur->nb[1], 0 * sizeof(float) * (n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
||||
cur->nb[1], 1 * sizeof(float) * (n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
||||
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
@@ -56,7 +48,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
||||
+3
-31
@@ -2,7 +2,6 @@
|
||||
|
||||
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
@@ -39,35 +38,8 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
|
||||
ggml_tensor * cur = inpL;
|
||||
|
||||
{
|
||||
ggml_tensor * Qcur;
|
||||
ggml_tensor * Kcur;
|
||||
ggml_tensor * Vcur;
|
||||
|
||||
// self-attention
|
||||
if (model.layers[il].wqkv) {
|
||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
if (model.layers[il].bqkv) {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1],
|
||||
0 * sizeof(float) * (n_embd));
|
||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
||||
cur->nb[1], 1 * sizeof(float) * (n_embd));
|
||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
||||
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
|
||||
} else {
|
||||
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
|
||||
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
|
||||
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
}
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
if (model.layers[il].attn_q_norm) {
|
||||
Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head * n_head, n_tokens);
|
||||
@@ -100,7 +72,7 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
+5
-30
@@ -28,33 +28,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
// B1.K
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
// B1.V
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
@@ -73,7 +48,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
NULL, NULL,
|
||||
NULL, NULL, NULL,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
|
||||
cur = build_norm(cur,
|
||||
@@ -82,8 +57,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
||||
cb(cur, "attn_sub_norm", il);
|
||||
|
||||
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
|
||||
if (model.layers[il].bo) {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
||||
if (model.layers[il].wo_b) {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].wo_b);
|
||||
}
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
||||
+3
-15
@@ -2,7 +2,6 @@
|
||||
|
||||
llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
@@ -30,22 +29,11 @@ llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_para
|
||||
|
||||
// self-attention
|
||||
{
|
||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user