Compare commits

...

16 Commits

Author SHA1 Message Date
Joe Todd 18133cab40 Revert "use the correct SYCL context for host USM allocations"
Manually reverting:
https://github.com/ggerganov/llama.cpp/pull/7858

Signed-off-by: Joe Todd <joe.todd@codeplay.com>
2024-06-13 12:08:27 +01:00
Joe Todd abd7c7b8c2 Formatting
Signed-off-by: Joe Todd <joe.todd@codeplay.com>
2024-06-13 10:36:05 +01:00
Joe Todd 0c0f3f0000 [SYCL] Update unsupported ops
Rope is only supported for contiguous input data.

Concat currently only supports dim=2

Signed-off-by: Joe Todd <joe.todd@codeplay.com>
2024-06-13 10:33:34 +01:00
Joe Todd 9b81b57239 [SYCL] unify rope norm/neox
As per: https://github.com/ggerganov/llama.cpp/pull/7634

Signed-off-by: Joe Todd <joe.todd@codeplay.com>
2024-06-13 10:30:43 +01:00
Georgi Gerganov a9cae48003 tests : add non-cont unary tests (#7857)
* tests : add non-cont unary tests

* ggml : update unary asserts and "supports_op"

ggml-ci
2024-06-12 16:00:22 +03:00
Georgi Gerganov bfaa676b08 ggml : improve ggml_is_contiguous logic (#7856)
* ggml : improve ggml_is_contiguous logic

ggml-ci

* ggml : support more contiguous cases

ggml-ci
2024-06-12 15:24:20 +03:00
Georgi Gerganov 704a35b183 server : restore numeric prompts (#7883) 2024-06-12 14:42:29 +03:00
Meng, Hengyu dcf752707d update intel docker oneapi-basekit to 2024.1.1-devel-ubuntu22.04 (#7894)
In addition this reverts a workaround we had to do to workaround the upstream issue with expired intel GPG package keys in 2024.0.1-devel-ubuntu22.04
2024-06-12 19:05:35 +10:00
Patrice Ferlet f2b5764beb Fix a typo and add Fedora 40 pacakge to install for Vulkan (#7794) [no ci]
Fix "appropiate" to "appropriate" and add Fedora 40 packages to install to compile with Vulkan support
2024-06-12 11:18:16 +10:00
k.h.lai 73bac2b11d vulkan: select only one device for single gpu with multiple drivers (#7582) 2024-06-11 21:26:05 +02:00
0cc4m ef52d1d16a Update Vulkan RoPE implementation (#7818)
* Update Vulkan RoPE implementation

* Return nullptr on alloc_buffer when allocation fails, instead of throwing an exception

Minor fixes

* Fix segfault when running out of VRAM

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-06-11 21:20:29 +02:00
Deven Mistry 14f83526cd fix broken link in pr template (#7880) [no ci]
* fix broken link in pr template

* Update pull_request_template.md [no ci]

---------

Co-authored-by: Brian <mofosyne@gmail.com>
2024-06-12 02:18:58 +10:00
Brian 6fe42d073f github: move PR template to .github/ root (#7868) 2024-06-11 17:43:41 +03:00
Johannes Gäßler 148995e5e5 llama-bench: more compact markdown tables (#7879) 2024-06-11 14:45:40 +02:00
Georgi Gerganov 4bfe50f741 tests : check the Python version (#7872)
ggml-ci
2024-06-11 10:10:20 +03:00
Johannes Gäßler bdcb8f4222 CUDA: int8 tensor cores for MMQ (q4_K, q5_K, q6_K) (#7860) 2024-06-11 08:26:07 +02:00
20 changed files with 2008 additions and 1506 deletions
+1 -9
View File
@@ -1,15 +1,7 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
+1 -17
View File
@@ -1,15 +1,7 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
@@ -27,14 +19,6 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
@@ -2,4 +2,4 @@
- [ ] Review Complexity : Low
- [ ] Review Complexity : Medium
- [ ] Review Complexity : High
- [ ] I have read the [contributing guidelines](CONTRIBUTING.md)
- [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+3 -1
View File
@@ -576,7 +576,9 @@ Building the program with BLAS support may lead to some performance improvements
vulkaninfo
```
Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
Alternatively your package manager might be able to provide the appropriate libraries.
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
Then, build llama.cpp using the cmake command below:
+21
View File
@@ -1033,6 +1033,27 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") {
return 3;
}
if (field == "n_threads") {
return 7;
}
if (field == "n_batch") {
return 7;
}
if (field == "n_ubatch") {
return 8;
}
if (field == "type_k" || field == "type_v") {
return 6;
}
if (field == "split_mode") {
return 5;
}
if (field == "flash_attn") {
return 2;
}
if (field == "use_mmap") {
return 4;
}
if (field == "test") {
return 13;
}
+12 -7
View File
@@ -147,7 +147,7 @@ struct server_slot {
int32_t n_prompt_tokens = 0;
int32_t n_prompt_tokens_processed = 0;
std::string prompt;
json prompt; // can be either a string, array of strings or array of token ids
// when a task is submitted, we first tokenize the prompt and store it here
std::vector<llama_token> prompt_tokens;
@@ -822,8 +822,13 @@ struct server_context {
continue;
}
// skip the slot if it does not contains prompt
if (!slot.prompt.is_string()) {
continue;
}
// current slot's prompt
std::string slot_prompt = slot.prompt;
std::string slot_prompt = slot.prompt.get<std::string>();
// length of the current slot's prompt
int slot_prompt_len = slot_prompt.size();
@@ -957,12 +962,12 @@ struct server_context {
return false;
}
if (prompt->is_string()) {
slot.prompt = prompt->get<std::string>();
} else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) {
slot.prompt = prompt->at(0).get<std::string>();
if ((prompt->is_string()) ||
(prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
(prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) {
slot.prompt = *prompt;
} else {
send_error(task, "\"prompt\" must be a string or an array of strings", ERROR_TYPE_INVALID_REQUEST);
send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
return false;
}
}
+1 -1
View File
@@ -886,7 +886,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free(*buffers[i]);
ggml_backend_buffer_free((*buffers)[i]);
}
free(*buffers);
return false;
+1 -1
View File
@@ -2740,7 +2740,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_TANH:
return true;
return ggml_is_contiguous(op->src[0]);
default:
return false;
}
+66
View File
@@ -1,5 +1,27 @@
#include "common.cuh"
struct mma_int_A_I16K4 {
static constexpr int I = 16;
static constexpr int K = 4;
static constexpr int ne = 2;
int x[ne] = {0};
static __device__ __forceinline__ int get_i(const int l) {
const int ret = (l%2) * (I/2) + threadIdx.x / K;
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < I);
return ret;
}
static __device__ __forceinline__ int get_k(const int /* l */) {
const int ret = threadIdx.x % K;
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < K);
return ret;
}
};
struct mma_int_A_I16K8 {
static constexpr int I = 16;
static constexpr int K = 8;
@@ -22,6 +44,28 @@ struct mma_int_A_I16K8 {
}
};
struct mma_int_B_J8K4 {
static constexpr int J = 8;
static constexpr int K = 4;
static constexpr int ne = 1;
int x[ne] = {0};
static __device__ __forceinline__ int get_j(const int /* l */) {
const int ret = threadIdx.x / K;
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < J);
return ret;
}
static __device__ __forceinline__ int get_k(const int /* l */) {
const int ret = threadIdx.x % K;
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < K);
return ret;
}
};
struct mma_int_B_J8K8 {
static constexpr int J = 8;
static constexpr int K = 8;
@@ -65,6 +109,28 @@ struct mma_int_C_I16J8 {
return ret;
}
__device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) {
#ifdef INT8_MMA_AVAILABLE
#if __CUDA_ARCH__ >= CC_AMPERE
asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));
#else
// On Turing m16n8k16 mma is not available, use 2x m8n8k16 mma instead:
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[0]), "+r"(x[1])
: "r"(mma_A.x[0]), "r"(mma_B.x[0]));
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[1]), "r"(mma_B.x[0]));
#endif // __CUDA_ARCH__ >= CC_AMPERE
#else
GGML_UNUSED(mma_A);
GGML_UNUSED(mma_B);
NO_DEVICE_CODE;
#endif // INT8_MMA_AVAILABLE
}
__device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) {
#ifdef INT8_MMA_AVAILABLE
#if __CUDA_ARCH__ >= CC_AMPERE
+294 -6
View File
@@ -1089,7 +1089,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
@@ -1115,6 +1115,97 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
}
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mma(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
typedef mma_int_A_I16K8 mma_A;
typedef mma_int_B_J8K8 mma_B;
typedef mma_int_C_I16J8 mma_C;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
const int i0 = threadIdx.y*mma_A::I;
static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");
mma_A A[2];
int scA[mma_C::ne/2][2];
int mA[mma_C::ne/2][2];
half2 dmA[mma_C::ne/2];
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q4_K_Q8_1_MMQ; kvdr += 4) {
#pragma unroll
for (int l = 0; l < mma_A::ne; ++l) {
const int i = i0 + mma_A::get_i(l);
const int k = k0 + mma_A::get_k(l);
A[kvdr/4].x[l] = (x_ql[i*(WARP_SIZE + 1) + k] >> kvdr) & 0x0F0F0F0F;
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
const uint8_t * m = sc + 8;
scA[l][kvdr/4] = sc[kvdr/4];
mA[l][kvdr/4] = m[kvdr/4];
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
dmA[l] = x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K + k0/QI5_K];
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += mma_int_B_J8K8::J) {
float tmpd[mma_C::ne] = {0.0f};
float tmpm[mma_C::ne] = {0.0f};
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q5_K_Q8_1_MMQ; kvdr += 4) {
mma_C C;
mma_B B;
half2 dsB[mma_C::ne/2];
#pragma unroll
for (int l = 0; l < mma_B::ne; ++l) {
const int j = j0 + mma_B::get_j(l);
const int k = (2*k0 + 2*kvdr + mma_B::get_k(l)) % WARP_SIZE;
B.x[l] = y_qs[j*MMQ_TILE_Y_K + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int j = j0 + mma_C::get_j(l);
dsB[l] = y_ds[j*MMQ_TILE_Y_K + ((2*k0 + 2*kvdr)/QI8_1) % (WARP_SIZE/QI8_1)];
}
C.mma_K8(A[kvdr/4], B);
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
tmpd[l] += (C.x[l]*scA[l/2][kvdr/4]) * __low2float(dsB[l%2]);
tmpm[l] += mA[l/2][kvdr/4] * __high2float(dsB[l%2]);
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
sum[(j0/mma_B::J)*mma_C::ne + l] += __low2float(dmA[l/2])*tmpd[l] - __high2float(dmA[l/2])*tmpm[l];
}
}
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
@@ -1188,7 +1279,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
@@ -1214,6 +1305,97 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
}
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mma(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
typedef mma_int_A_I16K8 mma_A;
typedef mma_int_B_J8K8 mma_B;
typedef mma_int_C_I16J8 mma_C;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
const int i0 = threadIdx.y*mma_A::I;
static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");
mma_A A[2];
int scA[mma_C::ne/2][2];
int mA[mma_C::ne/2][2];
half2 dmA[mma_C::ne/2];
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q5_K_Q8_1_MMQ; kvdr += 4) {
#pragma unroll
for (int l = 0; l < mma_A::ne; ++l) {
const int i = i0 + mma_A::get_i(l);
const int k = QR5_K*k0 + QR5_K*kvdr + mma_A::get_k(l);
A[kvdr/4].x[l] = x_ql[i*(QR5_K*WARP_SIZE + 1) + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
const uint8_t * m = sc + 8;
scA[l][kvdr/4] = sc[kvdr/4];
mA[l][kvdr/4] = m[kvdr/4];
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
dmA[l] = x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K + k0/QI5_K];
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += mma_int_B_J8K8::J) {
float tmpd[mma_C::ne] = {0.0f};
float tmpm[mma_C::ne] = {0.0f};
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q5_K_Q8_1_MMQ; kvdr += 4) {
mma_C C;
mma_B B;
half2 dsB[mma_C::ne/2];
#pragma unroll
for (int l = 0; l < mma_B::ne; ++l) {
const int j = j0 + mma_B::get_j(l);
const int k = (2*k0 + 2*kvdr + mma_B::get_k(l)) % WARP_SIZE;
B.x[l] = y_qs[j*MMQ_TILE_Y_K + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int j = j0 + mma_C::get_j(l);
dsB[l] = y_ds[j*MMQ_TILE_Y_K + ((2*k0 + 2*kvdr)/QI8_1) % (WARP_SIZE/QI8_1)];
}
C.mma_K8(A[kvdr/4], B);
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
tmpd[l] += (C.x[l]*scA[l/2][kvdr/4]) * __low2float(dsB[l%2]);
tmpm[l] += mA[l/2][kvdr/4] * __high2float(dsB[l%2]);
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
sum[(j0/mma_B::J)*mma_C::ne + l] += __low2float(dmA[l/2])*tmpd[l] - __high2float(dmA[l/2])*tmpm[l];
}
}
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
@@ -1280,7 +1462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
@@ -1307,6 +1489,97 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
}
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
typedef mma_int_A_I16K4 mma_A;
typedef mma_int_B_J8K4 mma_B;
typedef mma_int_C_I16J8 mma_C;
const float * x_df = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
const int i0 = threadIdx.y*mma_A::I;
static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");
mma_A A[4];
int scA[mma_C::ne/2][4];
float dA[mma_C::ne/2];
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q6_K_Q8_1_MMQ; kvdr += 4) {
#pragma unroll
for (int l = 0; l < mma_A::ne; ++l) {
const int i = i0 + mma_A::get_i(l);
const int k = QR6_K*k0 + QR6_K*kvdr + mma_A::get_k(l);
A[kvdr/2 + 0].x[l] = x_ql[i*(QR6_K*WARP_SIZE + 1) + k + 0];
A[kvdr/2 + 1].x[l] = x_ql[i*(QR6_K*WARP_SIZE + 1) + k + mma_A::K];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]);
scA[l][kvdr/2 + 0] = sc[kvdr/2 + 0];
scA[l][kvdr/2 + 1] = sc[kvdr/2 + 1];
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
dA[l] = x_df[i*(WARP_SIZE/QI6_K) + i/QI6_K + k0/QI6_K];
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += mma_int_B_J8K8::J) {
float tmp[mma_C::ne] = {0.0f};
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q6_K_Q8_1_MMQ; kvdr += 4) {
mma_C C[2];
mma_B B[2];
float dB[mma_C::ne/2];
#pragma unroll
for (int l = 0; l < mma_B::ne; ++l) {
const int j = j0 + mma_B::get_j(l);
const int k = (2*k0 + 2*kvdr + mma_B::get_k(l)) % WARP_SIZE;
B[0].x[l] = y_qs[j*MMQ_TILE_Y_K + k + 0];
B[1].x[l] = y_qs[j*MMQ_TILE_Y_K + k + mma_B::K];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int j = j0 + mma_C::get_j(l);
dB[l] = y_df[j*MMQ_TILE_Y_K + ((2*k0 + 2*kvdr)/QI8_1) % (WARP_SIZE/QI8_1)];
}
C[0].mma_K4(A[kvdr/2 + 0], B[0]);
C[1].mma_K4(A[kvdr/2 + 1], B[1]);
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
tmp[l] += (C[0].x[l]*scA[l/2][kvdr/2 + 0] + C[1].x[l]*scA[l/2][kvdr/2 + 1])*dB[l%2];
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
sum[(j0/mma_B::J)*mma_C::ne + l] += tmp[l]*dA[l/2];
}
}
}
template<int mmq_x, int mmq_y, int nwarps, bool need_check>
static __device__ __forceinline__ void mmq_write_back_dp4a(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1) {
#pragma unroll
@@ -1448,24 +1721,39 @@ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
#ifdef INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
};
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
static constexpr int vdr = VDR_Q5_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
#ifdef INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
};
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
static constexpr int vdr = VDR_Q6_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
#ifdef INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
};
static int mmq_need_sum(const ggml_type type_x) {
+20
View File
@@ -148,6 +148,8 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -160,6 +162,8 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -172,6 +176,8 @@ void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -184,6 +190,8 @@ void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -196,6 +204,8 @@ void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -208,6 +218,8 @@ void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -220,6 +232,8 @@ void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -232,6 +246,8 @@ void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -244,6 +260,8 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -259,6 +277,8 @@ void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
+1 -1
View File
@@ -1340,7 +1340,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
return true;
return ggml_is_contiguous(op->src[0]);
default:
;
}
+1 -1
View File
@@ -744,7 +744,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
return true;
return ggml_is_contiguous(op->src[0]);
default:
return false;
}
+94 -127
View File
@@ -8826,7 +8826,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
}
struct rope_corr_dims {
float v[4];
float v[2];
};
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
@@ -8850,29 +8850,38 @@ static void rope_yarn(
}
// rope == RoPE == rotary positional embedding
template<typename T, bool has_pos>
static void rope(
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
float ext_factor, float attn_factor, rope_corr_dims corr_dims
,
template<typename T, bool has_ff>
static void rope_norm(
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors,
const sycl::nd_item<3> &item_ct1) {
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
item_ct1.get_local_id(1));
if (col >= ncols) {
if (i0 >= ne0) {
return;
}
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
const int i = row*ncols + col;
if (i0 >= n_dims) {
const int i = row*ne0 + i0;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
return;
}
const int i = row*ne0 + i0;
const int i2 = row/p_delta_rows;
const int p = has_pos ? pos[i2] : 0;
const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols);
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + 1];
@@ -8881,25 +8890,25 @@ static void rope(
dst[i + 1] = x0*sin_theta + x1*cos_theta;
}
template<typename T, bool has_pos, bool has_freq_facs>
static void rope_neox(
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
template <typename T, bool has_ff>
static void rope_neox(const T *x, T *dst, int ne0, int n_dims,
const int32_t *pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor,
rope_corr_dims corr_dims, float theta_scale,
const float *freq_factors,
const sycl::nd_item<3> &item_ct1) {
const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
item_ct1.get_local_id(1));
if (col >= ncols) {
if (i0 >= ne0) {
return;
}
const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
const int ib = col / n_dims;
const int ic = col % n_dims;
if (ib > 0) {
const int i = row*ncols + ib*n_dims + ic;
if (i0 >= n_dims) {
const int i = row*ne0 + i0;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
@@ -8907,19 +8916,14 @@ static void rope_neox(
return;
}
const int i = row*ncols + ib*n_dims + ic/2;
const int i = row*ne0 + i0/2;
const int i2 = row/p_delta_rows;
float cur_rot = inv_ndims * ic - ib;
const int p = has_pos ? pos[i2] : 0;
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
const float theta_base =
p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + n_dims/2];
@@ -12375,15 +12379,18 @@ static void clamp_f32_sycl(const float *x, float *dst, const float min,
}
template <typename T>
static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
static void rope_norm_sycl(const T *x, T *dst, int ne0, int n_dims, int nr,
const int32_t *pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor,
rope_corr_dims corr_dims, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % 2 == 0);
rope_corr_dims corr_dims, const float * freq_factors, dpct::queue_ptr stream) {
GGML_ASSERT(ne0 % 2 == 0);
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
const sycl::range<3> block_nums(1, num_blocks_x, nrows);
if (pos == nullptr) {
const int n_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
const sycl::range<3> block_nums(1, n_blocks_x, nr);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) {
/*
DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
@@ -12395,8 +12402,8 @@ static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows,
freq_base, ext_factor, attn_factor, corr_dims,
rope_norm<T, false>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
item_ct1);
});
} else {
@@ -12411,70 +12418,46 @@ static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows,
freq_base, ext_factor, attn_factor, corr_dims,
rope_norm<T, true>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
item_ct1);
});
}
}
template <typename T>
static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
static void rope_neox_sycl(const T *x, T *dst, int ne0, int n_dims, int nr,
const int32_t *pos, float freq_scale,
int p_delta_rows, float freq_base, float ext_factor,
float attn_factor, rope_corr_dims corr_dims,
const float * freq_factors, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % 2 == 0);
GGML_ASSERT(ne0 % 2 == 0);
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
const sycl::range<3> block_nums(1, num_blocks_x, nrows);
const int n_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
const sycl::range<3> block_nums(1, n_blocks_x, nr);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float inv_ndims = -1.0f / n_dims;
if (pos == nullptr) {
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
if (freq_factors == nullptr) {
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
p_delta_rows, ext_factor, attn_factor,
corr_dims, theta_scale, inv_ndims, freq_factors,
item_ct1);
});
} else {
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
p_delta_rows, ext_factor, attn_factor,
corr_dims, theta_scale, inv_ndims, freq_factors,
item_ct1);
});
}
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
if (freq_factors == nullptr) {
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
rope_neox<T, false>(x, dst, ne0, n_dims, pos, freq_scale,
p_delta_rows, ext_factor, attn_factor,
corr_dims, theta_scale, freq_factors,
item_ct1);
});
} else {
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
if (freq_factors == nullptr) {
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
p_delta_rows, ext_factor, attn_factor,
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
});
} else {
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
p_delta_rows, ext_factor, attn_factor,
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
});
}
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
rope_neox<T, true>(x, dst, ne0, n_dims, pos, freq_scale,
p_delta_rows, ext_factor, attn_factor,
corr_dims, theta_scale, freq_factors,
item_ct1);
});
}
}
@@ -13089,12 +13072,9 @@ void *ggml_sycl_host_malloc(size_t size) try {
return nullptr;
}
ggml_sycl_set_device(g_main_device);
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
void * ptr = nullptr;
dpct::err0 err = CHECK_TRY_ERROR(
ptr = (void *)sycl::malloc_host(size, *main_stream));
ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
if (err != 0) {
// clear the error
@@ -13115,9 +13095,7 @@ catch (sycl::exception const &exc) {
}
void ggml_sycl_host_free(void *ptr) try {
ggml_sycl_set_device(g_main_device);
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *main_stream)));
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -14005,8 +13983,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne2 = dst->ne[2];
const int64_t nrows = ggml_nrows(src0);
const int64_t nr = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -14023,27 +14000,13 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
const float * freq_factors = nullptr;
const int32_t * pos = nullptr;
if ((mode & 1) == 0) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(src1->ne[0] == ne2);
pos = (const int32_t *) src1_dd;
}
const bool is_neox = mode & 2;
#pragma message("TODO: update rope NORM mode to match NEOX mode")
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
const int32_t * pos = (const int32_t *) src1_dd;
if (is_neox) {
pos = (const int32_t *) src1_dd;
if (src2 != nullptr) {
freq_factors = (const float *) src2->data;
}
} else {
GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
const float * freq_factors = nullptr;
if (src2 != nullptr) {
freq_factors = (const float *) src2->data;
}
rope_corr_dims corr_dims;
@@ -14053,12 +14016,12 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
if (is_neox) {
if (src0->type == GGML_TYPE_F32) {
rope_neox_sycl(
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, main_stream
);
} else if (src0->type == GGML_TYPE_F16) {
rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
ne00, n_dims, nrows, pos, freq_scale, ne01,
ne00, n_dims, nr, pos, freq_scale, ne01,
freq_base, ext_factor, attn_factor, corr_dims,
freq_factors, main_stream);
} else {
@@ -14066,14 +14029,14 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
}
} else {
if (src0->type == GGML_TYPE_F32) {
rope_sycl(
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, main_stream
rope_norm_sycl(
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, main_stream
);
} else if (src0->type == GGML_TYPE_F16) {
rope_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
nrows, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, main_stream);
rope_norm_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, main_stream);
} else {
GGML_ASSERT(false);
}
@@ -17190,7 +17153,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_TANH:
return true;
return ggml_is_contiguous(op->src[0]);
default:
return false;
}
@@ -17267,7 +17230,12 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
int dim = op->op_params[0];
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16 && dim == 2;
} break;
case GGML_OP_ROPE:
{
return ggml_is_contiguous(op->src[0]);
} break;
case GGML_OP_DUP:
case GGML_OP_NONE:
@@ -17287,7 +17255,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
case GGML_OP_IM2COL:
case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS:
+1239 -1138
View File
File diff suppressed because it is too large Load Diff
+113 -64
View File
@@ -1,5 +1,5 @@
#include "ggml-vulkan.h"
#include <vulkan/vulkan_core.h>
#ifdef GGML_VULKAN_RUN_TESTS
#include <chrono>
#endif
@@ -9,12 +9,13 @@
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <tuple>
#include <vector>
#include <sstream>
#include <utility>
#include <memory>
#include <limits>
#include <map>
#include "ggml.h"
#include "ggml-backend-impl.h"
@@ -150,7 +151,7 @@ struct vk_device {
vk_pipeline pipeline_relu_f32;
vk_pipeline pipeline_diag_mask_inf_f32;
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
vk_pipeline pipeline_argsort_f32;
vk_pipeline pipeline_sum_rows_f32;
@@ -283,26 +284,15 @@ struct vk_op_diag_mask_push_constants {
struct vk_op_rope_push_constants {
uint32_t ncols;
uint32_t n_dims;
float freq_scale;
uint32_t p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[4];
};
struct vk_op_rope_neox_push_constants {
uint32_t ncols;
uint32_t ndims;
float freq_scale;
uint32_t p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[4];
float corr_dims[2];
float theta_scale;
float inv_ndims;
uint32_t has_freq_facs;
uint32_t has_ff;
};
struct vk_op_soft_max_push_constants {
@@ -1534,11 +1524,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
@@ -1566,8 +1556,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
vk::PhysicalDeviceProperties2 props2;
vk::PhysicalDeviceMaintenance3Properties props3;
vk::PhysicalDeviceSubgroupProperties subgroup_props;
vk::PhysicalDeviceDriverProperties driver_props;
props2.pNext = &props3;
props3.pNext = &subgroup_props;
subgroup_props.pNext = &driver_props;
physical_device.getProperties2(&props2);
const size_t subgroup_size = subgroup_props.subgroupSize;
@@ -1611,7 +1603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
fp16 = fp16 && vk12_features.shaderFloat16;
std::string device_name = props2.properties.deviceName.data();
std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1707,7 +1699,78 @@ void ggml_vk_instance_init() {
vk::PhysicalDeviceProperties props = devices[i].getProperties();
if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
vk_instance.device_indices.push_back(i);
// Check if there are two physical devices corresponding to the same GPU
auto old_device = std::find_if(
vk_instance.device_indices.begin(),
vk_instance.device_indices.end(),
[&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
);
if (old_device == vk_instance.device_indices.end()) {
vk_instance.device_indices.push_back(i);
} else {
// There can be two physical devices corresponding to the same GPU if there are 2 different drivers
// This can cause error when splitting layers aross the devices, need to keep only 1
#ifdef GGML_VULKAN_DEBUG
std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
#endif
vk::PhysicalDeviceProperties2 old_prop;
vk::PhysicalDeviceDriverProperties old_driver;
old_prop.pNext = &old_driver;
devices[*old_device].getProperties2(&old_prop);
vk::PhysicalDeviceProperties2 new_prop;
vk::PhysicalDeviceDriverProperties new_driver;
new_prop.pNext = &new_driver;
devices[i].getProperties2(&new_prop);
std::map<vk::DriverId, int> driver_priorities {};
int old_priority = std::numeric_limits<int>::max();
int new_priority = std::numeric_limits<int>::max();
// Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
// Smaller number -> higher priority
switch (old_prop.properties.vendorID) {
case VK_VENDOR_ID_AMD:
driver_priorities[vk::DriverId::eMesaRadv] = 1;
driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
driver_priorities[vk::DriverId::eAmdProprietary] = 3;
break;
case VK_VENDOR_ID_INTEL:
driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
break;
case VK_VENDOR_ID_NVIDIA:
driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
driver_priorities[vk::DriverId::eMesaNvk] = 2;
#endif
break;
}
if (driver_priorities.count(old_driver.driverID)) {
old_priority = driver_priorities[old_driver.driverID];
}
if (driver_priorities.count(new_driver.driverID)) {
new_priority = driver_priorities[new_driver.driverID];
}
if (new_priority < old_priority) {
auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
vk_instance.device_indices.push_back(i);
#ifdef GGML_VULKAN_DEBUG
std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
#endif
}
#ifdef GGML_VULKAN_DEBUG
else {
std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
}
#endif
}
}
}
@@ -3905,10 +3968,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
}
} else {
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_rope_f32;
return ctx->device->pipeline_rope_norm_f32;
}
if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
return ctx->device->pipeline_rope_f16;
return ctx->device->pipeline_rope_norm_f16;
}
}
return nullptr;
@@ -4152,24 +4215,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else if (op == GGML_OP_ROPE) {
const int mode = ((int32_t *) dst->op_params)[2];
const bool is_neox = mode & 2;
if (is_neox) {
// Empty src2 is possible in rope, but the shader needs a buffer
vk_subbuffer subbuf_z;
if (use_src2) {
subbuf_z = { d_Z, z_buf_offset, z_sz };
} else {
subbuf_z = { d_X, 0, d_X->size };
}
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
// Empty src2 is possible in rope, but the shader needs a buffer
vk_subbuffer subbuf_z;
if (use_src2) {
subbuf_z = { d_Z, z_buf_offset, z_sz };
} else {
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
subbuf_z = { d_X, 0, d_X->size };
}
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else if (use_src2) {
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4391,7 +4446,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
// const int mode = ((int32_t *) dst->op_params)[2];
// const int n_ctx = ((int32_t *) dst->op_params)[3];
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
const float freq_base = ((float *) dst->op_params)[5];
@@ -4401,28 +4456,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
const float beta_fast = ((float *) dst->op_params)[9];
const float beta_slow = ((float *) dst->op_params)[10];
const bool is_neox = mode & 2;
#pragma message("TODO: update rope NORM mode to match NEOX mode")
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
if (is_neox) {
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float inv_ndims = -1.0f / n_dims;
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
src2 != nullptr,
});
} else {
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
(uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
});
}
const float theta_scale = powf(freq_base, -2.0f/n_dims);
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
src2 != nullptr,
});
}
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -6070,7 +6113,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
#endif
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
vk_buffer dev_buffer = nullptr;
try {
dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
} catch (const vk::SystemError& e) {
return nullptr;
}
ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
@@ -6390,7 +6439,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
return true;
return ggml_is_contiguous(op->src[0]);
default:
return false;
}
@@ -6466,7 +6515,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
// return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
// } break;
case GGML_OP_ROPE:
return true;
return ggml_is_contiguous(op->src[0]);
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
+80 -92
View File
@@ -3212,35 +3212,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1];
}
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
size_t next_nb = ggml_type_size(tensor->type);
if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
return false;
}
next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
if (tensor->ne[i] != 1) {
if (i > n) {
if (tensor->nb[i] != next_nb) {
return false;
}
next_nb *= tensor->ne[i];
} else {
// this dimension does not need to be contiguous
next_nb = tensor->ne[i]*tensor->nb[i];
}
}
}
return true;
}
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
return ggml_is_contiguous_0(tensor);
}
GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
return ggml_is_contiguous(tensor);
return ggml_is_contiguous_n(tensor, 0);
}
GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
return ggml_is_contiguous_n(tensor, 1);
}
GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
return ggml_is_contiguous_n(tensor, 2);
}
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
@@ -3272,20 +3279,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
(t0->ne[0] == t1->ne[0] ) &&
(t0->ne[1] == t1->ne[1] ) &&
(t0->ne[2] == t1->ne[2] ) &&
(t0->ne[3] == t1->ne[3] );
(t0->ne[0] == t1->ne[0]) &&
(t0->ne[1] == t1->ne[1]) &&
(t0->ne[2] == t1->ne[2]) &&
(t0->ne[3] == t1->ne[3]);
}
bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
(t0->nb[0] == t1->nb[0] ) &&
(t0->nb[1] == t1->nb[1] ) &&
(t0->nb[2] == t1->nb[2] ) &&
(t0->nb[3] == t1->nb[3] );
(t0->nb[0] == t1->nb[0]) &&
(t0->nb[1] == t1->nb[1]) &&
(t0->nb[2] == t1->nb[2]) &&
(t0->nb[3] == t1->nb[3]);
}
// check if t1 can be represented as a repeatition of t0
@@ -4078,32 +4085,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
switch (tensor->type) {
case GGML_TYPE_I8:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
return ((int8_t *)(tensor->data))[i];
}
case GGML_TYPE_I16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
return ((int16_t *)(tensor->data))[i];
}
case GGML_TYPE_I32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
return ((int32_t *)(tensor->data))[i];
}
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
}
case GGML_TYPE_BF16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
}
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
return ((float *)(tensor->data))[i];
}
default:
@@ -4125,32 +4126,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
switch (tensor->type) {
case GGML_TYPE_I8:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
((int8_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_I16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
((int16_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_I32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
((int32_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_BF16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
} break;
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
((float *)(tensor->data))[i] = value;
} break;
default:
@@ -7343,13 +7338,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
}
// gmml_unary
// ggml_unary
static struct ggml_tensor * ggml_unary_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op,
bool inplace) {
GGML_ASSERT(ggml_is_contiguous_1(a));
bool is_node = false;
if (!inplace && (a->grad)) {
@@ -11014,6 +11011,8 @@ static void ggml_compute_forward_abs_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11023,9 +11022,6 @@ static void ggml_compute_forward_abs_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_abs_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11060,6 +11056,8 @@ static void ggml_compute_forward_sgn_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11069,9 +11067,6 @@ static void ggml_compute_forward_sgn_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_sgn_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11106,6 +11101,8 @@ static void ggml_compute_forward_neg_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11115,9 +11112,6 @@ static void ggml_compute_forward_neg_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_neg_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11152,6 +11146,8 @@ static void ggml_compute_forward_step_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11161,9 +11157,6 @@ static void ggml_compute_forward_step_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_step_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11198,6 +11191,8 @@ static void ggml_compute_forward_tanh_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11207,9 +11202,6 @@ static void ggml_compute_forward_tanh_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_tanh_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11244,6 +11236,8 @@ static void ggml_compute_forward_elu_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11253,9 +11247,6 @@ static void ggml_compute_forward_elu_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_elu_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11290,6 +11281,8 @@ static void ggml_compute_forward_relu_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11299,9 +11292,6 @@ static void ggml_compute_forward_relu_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_relu_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11336,6 +11326,8 @@ static void ggml_compute_forward_sigmoid_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11345,9 +11337,6 @@ static void ggml_compute_forward_sigmoid_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_sigmoid_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11381,9 +11370,9 @@ static void ggml_compute_forward_gelu_f32(
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
@@ -11444,9 +11433,9 @@ static void ggml_compute_forward_gelu_quick_f32(
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
@@ -11507,9 +11496,9 @@ static void ggml_compute_forward_silu_f32(
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
@@ -11570,6 +11559,8 @@ static void ggml_compute_forward_leaky_relu_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11619,11 +11610,11 @@ static void ggml_compute_forward_silu_back_f32(
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * grad = dst->src[1];
GGML_ASSERT(ggml_is_contiguous_1(grad));
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, grad));
assert(ggml_is_contiguous_1(grad));
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
assert(ggml_are_same_shape(src0, grad));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
@@ -11685,6 +11676,8 @@ static void ggml_compute_forward_hardswish_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11694,9 +11687,6 @@ static void ggml_compute_forward_hardswish_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardswish_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11728,6 +11718,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11737,9 +11729,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardsigmoid_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16686,7 +16675,10 @@ static void ggml_compute_forward_map_unary_f32(
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
@@ -16695,9 +16687,6 @@ static void ggml_compute_forward_map_unary_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert( dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
fun(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16735,6 +16724,9 @@ static void ggml_compute_forward_map_binary_f32(
const struct ggml_tensor * src1 = dst->src[1];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(src1));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -16744,10 +16736,6 @@ static void ggml_compute_forward_map_binary_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert( dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
assert(src1->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
fun(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
+37 -29
View File
@@ -2400,7 +2400,7 @@ void main() {
"""
# ROPE
rope_src = """
rope_norm_src = """
#version 450
#extension GL_EXT_shader_16bit_storage : require
@@ -2408,17 +2408,21 @@ rope_src = """
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) readonly buffer Y {int data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
layout (binding = 1) readonly buffer Y {int data_pos[];};
layout (binding = 2) readonly buffer Z {float data_ff[];};
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
layout (push_constant) uniform parameter {
uint ncols;
uint n_dims;
float freq_scale;
uint p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[4];
float corr_dims[2];
float theta_scale;
uint has_ff;
} p;
float rope_yarn_ramp(const float low, const float high, const uint i0) {
@@ -2450,14 +2454,24 @@ void main() {
return;
}
if (col >= p.n_dims) {
const uint i = row*p.ncols + col;
data_d[i + 0] = data_a[i + 0];
data_d[i + 1] = data_a[i + 1];
return;
}
const uint i = row*p.ncols + col;
const uint i2 = row/p.p_delta_rows;
const int pos = data_b[i2];
const float theta_base = pos * pow(p.freq_base, -float(col)/p.ncols);
const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f);
const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base, col, cos_theta, sin_theta);
rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta);
const float x0 = float(data_a[i + 0]);
const float x1 = float(data_a[i + 1]);
@@ -2475,22 +2489,21 @@ rope_neox_src = """
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) readonly buffer Y {int data_b[];};
layout (binding = 2) readonly buffer Z {float data_freq_factors[];};
layout (binding = 1) readonly buffer Y {int data_pos[];};
layout (binding = 2) readonly buffer Z {float data_ff[];};
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
layout (push_constant) uniform parameter {
uint ncols;
uint ndims;
uint n_dims;
float freq_scale;
uint p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[4];
float corr_dims[2];
float theta_scale;
float inv_ndims;
uint has_freq_facs;
uint has_ff;
} p;
float rope_yarn_ramp(const float low, const float high, const uint i0) {
@@ -2522,11 +2535,8 @@ void main() {
return;
}
const uint ib = col / p.ndims;
const uint ic = col % p.ndims;
if (ib > 0) {
const uint i = row*p.ncols + ib*p.ndims + ic;
if (col >= p.n_dims) {
const uint i = row*p.ncols + col;
data_d[i + 0] = data_a[i + 0];
data_d[i + 1] = data_a[i + 1];
@@ -2534,29 +2544,27 @@ void main() {
return;
}
const uint i = row*p.ncols + ib*p.ndims + ic/2;
const uint i = row*p.ncols + col/2;
const uint i2 = row/p.p_delta_rows;
const int pos = data_b[i2];
const float freq_factor = p.has_freq_facs != 0 ? data_freq_factors[ic/2] : 1.0f;
const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f) / freq_factor;
const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f);
const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f;
float cos_theta, sin_theta;
rope_yarn(theta_base, ic, cos_theta, sin_theta);
rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta);
const float x0 = float(data_a[i + 0]);
const float x1 = float(data_a[i + p.ndims/2]);
const float x1 = float(data_a[i + p.n_dims/2]);
data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
data_d[i + p.ndims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
data_d[i + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
}
"""
argsort_src = """
#version 450
#extension GL_EXT_shader_16bit_storage : require
#define BLOCK_SIZE 1024
#define ASC 0
@@ -3039,8 +3047,8 @@ async def main():
tasks.append(string_to_spv("soft_max_f32", f"{soft_max_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "C_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("soft_max_f32_f16", f"{soft_max_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float16_t", "C_TYPE": "float16_t", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("rope_norm_f32", rope_norm_src, {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_norm_f16", rope_norm_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
+20 -9
View File
@@ -642,20 +642,29 @@ struct test_case {
struct test_unary : public test_case {
const ggml_unary_op op;
const ggml_type type;
const std::array<int64_t, 4> ne;
const std::array<int64_t, 4> ne_a;
int v; // view (1 : non-contiguous a)
std::string vars() override {
return VARS_TO_STR2(type, ne);
return VARS_TO_STR3(type, ne_a, v);
}
test_unary(ggml_unary_op op,
ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {128, 10, 10, 10})
: op(op), type(type), ne(ne) {}
std::array<int64_t, 4> ne_a = {128, 10, 10, 10},
int v = 0)
: op(op), type(type), ne_a(ne_a), v(v) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * in = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * out = ggml_unary(ctx, in, op);
ggml_tensor * a;
if (v & 1) {
auto ne = ne_a; ne[0] *= 3;
a = ggml_new_tensor(ctx, type, 4, ne.data());
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
} else {
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
}
ggml_tensor * out = ggml_unary(ctx, a, op);
return out;
}
@@ -2016,9 +2025,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
};
// unary ops
for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
test_cases.emplace_back(new test_unary((ggml_unary_op) op));
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }));
for (int v : {0, 1}) {
for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 10, 10, 10 }, v));
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }, v));
}
}
test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
+2 -2
View File
@@ -870,7 +870,7 @@ int main() {
}
});
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python --version") == 0)) {
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
test_all("Python", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
@@ -878,7 +878,7 @@ int main() {
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Python not found, skipping Python JSON schema -> grammar tests.\n\033[0m");
fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
}
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {