Compare commits
30 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0b047287fe | |||
| efbada936f | |||
| f3c3e0e9a0 | |||
| 5755a100cd | |||
| 1e5ad35d56 | |||
| 65d7a8bbf0 | |||
| 00d56b11c3 | |||
| 5757c4dcb1 | |||
| e20b83930c | |||
| fd89556567 | |||
| 60489932ec | |||
| 4a4f819cb6 | |||
| 046e284437 | |||
| 66001722aa | |||
| c5703e03a5 | |||
| b46812de78 | |||
| 49956041ee | |||
| 9f5f0e689c | |||
| f9cd456ea5 | |||
| 5d6f18a638 | |||
| 29debb3a6a | |||
| 9dcf835528 | |||
| 58e68df0f9 | |||
| 9b2925e1e0 | |||
| a8fd165fec | |||
| 6d57a49a70 | |||
| 3e941b813b | |||
| f3e8d149ce | |||
| 1d72d87349 | |||
| 6a2a2513dc |
@@ -33,10 +33,10 @@ RUN mkdir -p /app/full \
|
||||
|
||||
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
|
||||
|
||||
ARG IGC_VERSION=v2.30.1
|
||||
ARG IGC_VERSION_FULL=2_2.30.1+20950
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
|
||||
ARG IGC_VERSION=v2.32.7
|
||||
ARG IGC_VERSION_FULL=2_2.32.7+21184
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.14.37833.4
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.14.37833.4-0
|
||||
ARG IGDGMM_VERSION=22.9.0
|
||||
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
|
||||
|
||||
@@ -103,6 +103,7 @@ let
|
||||
vulkan-headers
|
||||
vulkan-loader
|
||||
shaderc
|
||||
spirv-headers
|
||||
];
|
||||
in
|
||||
|
||||
@@ -146,7 +147,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
ninja
|
||||
pkg-config
|
||||
git
|
||||
spirv-headers
|
||||
]
|
||||
++ optionals useCuda [
|
||||
cudaPackages.cuda_nvcc
|
||||
|
||||
@@ -110,6 +110,7 @@ uv.lock
|
||||
|
||||
# Nix
|
||||
|
||||
flake.lock
|
||||
/result
|
||||
|
||||
# Test binaries
|
||||
|
||||
@@ -369,9 +369,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
||||
arguments.name_suffix) +
|
||||
arguments.value_prefix +
|
||||
(schema_info.resolves_to_string(param_schema) ?
|
||||
p.tool_arg_string_value(p.schema(until_suffix,
|
||||
"tool-" + name + "-arg-" + param_name + "-schema",
|
||||
param_schema, true)) :
|
||||
p.tool_arg_string_value(until_suffix) :
|
||||
p.tool_arg_json_value(p.schema(
|
||||
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
|
||||
p.space()) +
|
||||
|
||||
@@ -158,8 +158,6 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
|
||||
for (size_t i = 0; i < cur_p->size; i++) {
|
||||
if (cur_p->data[i].id != forced) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
} else {
|
||||
cur_p->data[i].logit = +INFINITY; // force the token
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+58
-3
@@ -710,7 +710,7 @@ class ModelBase:
|
||||
self._repack_nvfp4(name, weight, scale, scale2, input_scale)
|
||||
|
||||
# Flush any remaining experts (fallback if n_experts was unknown)
|
||||
for bid, proj_type in expert_blocks.keys():
|
||||
for bid, proj_type in list(expert_blocks.keys()):
|
||||
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)
|
||||
|
||||
# Remove consumed tensors so get_tensors/modify_tensors won't see them
|
||||
@@ -718,7 +718,7 @@ class ModelBase:
|
||||
self.model_tensors.pop(name, None)
|
||||
|
||||
# Remove any remaining unused auxiliary tensors
|
||||
for name in self.model_tensors.keys():
|
||||
for name in list(self.model_tensors.keys()):
|
||||
if name.endswith((".k_scale", ".v_scale")):
|
||||
del self.model_tensors[name]
|
||||
|
||||
@@ -1570,6 +1570,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
|
||||
# ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
|
||||
res = "f2llmv2"
|
||||
if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
|
||||
# ref: https://huggingface.co/sarvamai/sarvam-30b
|
||||
res = "sarvam-moe"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -7988,13 +7991,37 @@ class Gemma4Model(Gemma3Model):
|
||||
rope_freqs_full = torch.tensor(values, dtype=torch.float32)
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full)
|
||||
|
||||
def _generate_nvfp4_tensors(self):
    """Fold Gemma-4 per-expert router scales into the NVFP4 down_proj scales.

    Gemma-4 stores a per-layer ``router.per_expert_scale`` tensor ([n_expert])
    that scales each expert's contribution. That is mathematically equivalent
    to a per-expert scalar on the down_proj output, which is where
    ffn_down_exps_s is applied at inference. Each expert's
    ``down_proj.weight_scale_2`` generator is therefore wrapped so it yields
    the original scale multiplied by that expert's router scale, and the
    router tensor is removed from ``self.model_tensors``. The regular NVFP4
    path of the parent class then runs on the adjusted scales.
    """
    expert_count = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0

    # Snapshot the matching names first: model_tensors is mutated in the loop.
    router_scale_names = [n for n in self.model_tensors if n.endswith(".router.per_expert_scale")]

    for name in router_scale_names:
        layer_match = re.search(r"\.layers\.(\d+)\.", name)
        if layer_match is None:
            continue

        bid = layer_match.group(1)
        # Everything up to and including ".layers.<bid>."
        prefix = name[: layer_match.end()]
        w2_targets = [f"{prefix}experts.{e}.down_proj.weight_scale_2" for e in range(expert_count)]

        n_present = sum(1 for w2 in w2_targets if w2 in self.model_tensors)
        if n_present == 0:
            # Layer is not NVFP4-quantized (or expert count unknown): leave it alone.
            continue
        assert n_present == len(w2_targets), f"layer {bid}: partial NVFP4 quantization across experts"

        per_expert = self.model_tensors.pop(name)
        for idx, w2 in enumerate(w2_targets):
            base_scale = self.model_tensors[w2]
            # Bind the current values as defaults to avoid late-binding closures.
            self.model_tensors[w2] = lambda s=base_scale, r=per_expert, i=idx: s() * r()[i]

    super()._generate_nvfp4_tensors()
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
|
||||
if name.endswith("per_dim_scale") or name.endswith("layer_scalar"):
|
||||
name = name + ".weight"
|
||||
if ".experts." in name and not name.endswith(".weight"):
|
||||
if ".experts." in name and not name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")):
|
||||
name += ".weight"
|
||||
|
||||
return super().filter_tensors((name, gen))
|
||||
@@ -11567,6 +11594,34 @@ class BailingMoeV2Model(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM")
class SarvamMoEModel(BailingMoeV2Model):
    """Converter for Sarvam-MoE checkpoints.

    Sarvam-MoE shares the BailingMoeV2 architecture; the only differences are:
      - full rotary (no partial_rotary_factor)
      - expert bias is zero-mean normalized at load time
    """
    model_arch = gguf.MODEL_ARCH.BAILINGMOE2

    def set_gguf_parameters(self):
        """Write the parent's parameters, then record the full rotary dimension."""
        super().set_gguf_parameters()
        hp = self.hparams
        head_dim = hp.get("head_dim")
        if head_dim is None:
            # No explicit head_dim: derive it from the hidden size and head count.
            head_dim = hp["hidden_size"] // hp["num_attention_heads"]
        # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim
        self.gguf_writer.add_rope_dimension_count(head_dim)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        """Wrap ``.expert_bias`` generators so they yield zero-mean tensors.

        All other tensors are forwarded to the parent filter unchanged.
        """
        name, gen = item
        if name.endswith(".expert_bias"):
            raw_gen = gen

            def centered():
                # Sarvam normalizes expert bias to zero mean
                bias = raw_gen()
                return bias - bias.mean()

            gen = centered
        return super().filter_tensors((name, gen))
|
||||
|
||||
|
||||
@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
|
||||
class GroveMoeModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.GROVEMOE
|
||||
|
||||
@@ -155,6 +155,7 @@ models = [
|
||||
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
|
||||
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
|
||||
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
|
||||
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
|
||||
@@ -737,6 +737,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Enables querying the GPU's free memory via sycl::aspect::ext_intel_free_memory.<br>Recommended when using --split-mode = layer |
|
||||
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allows device memory allocations larger than 4GB.|
|
||||
|
||||
## Compile-time Flags
|
||||
|
||||
Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spot.
|
||||
|
||||
| Name | Function |
|
||||
|-----------------|----------------------------------------------------------------------------------|
|
||||
| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
|
||||
|
||||
## Design Rule
|
||||
|
||||
- Open to all contributors.
|
||||
|
||||
@@ -111,14 +111,14 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
|
||||
echo "Use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use single GPU only
|
||||
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
|
||||
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
|
||||
else
|
||||
echo "Use all Intel GPUs, including iGPU & dGPU"
|
||||
GPUS_SETTING="-sm ${SPLIT_MODE}"
|
||||
fi
|
||||
|
||||
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
|
||||
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000"
|
||||
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000
|
||||
|
||||
|
||||
|
||||
Generated
-58
@@ -1,58 +0,0 @@
|
||||
{
|
||||
"nodes": {
|
||||
"flake-parts": {
|
||||
"inputs": {
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1730504689,
|
||||
"narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "506278e768c2a08bec68eb62932193e341f55c90",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1732014248,
|
||||
"narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1730504152,
|
||||
"narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
|
||||
},
|
||||
"original": {
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-parts": "flake-parts",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
+1
-1
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 11)
|
||||
set(GGML_VERSION_PATCH 0)
|
||||
set(GGML_VERSION_PATCH 1)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||
|
||||
@@ -169,7 +169,7 @@ extern "C" {
|
||||
// device type
|
||||
enum ggml_backend_dev_type type;
|
||||
// device id
|
||||
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
|
||||
// for PCI devices, this should be the lower-case PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:c1:00.0")
|
||||
// if the id is unknown, this should be NULL
|
||||
const char * device_id;
|
||||
// device capabilities
|
||||
|
||||
@@ -965,7 +965,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
||||
}
|
||||
if (sched->debug > 1) {
|
||||
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
||||
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_name(node->op), node->name,
|
||||
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_desc(node), node->name,
|
||||
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
|
||||
graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)], node->flags & GGML_TENSOR_FLAG_COMPUTE ? 1 : 0);
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
|
||||
@@ -0,0 +1,968 @@
|
||||
#include "allreduce.cuh"
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
#include "convert.cuh"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// CUDA AllReduce for tensor-parallel inference across two GPUs.
|
||||
//
|
||||
// Provides an in-place sum reduction over matching tensors on two CUDA
|
||||
// devices in the same process. Used by the tensor-split path alongside
|
||||
// NCCL; targets setups without NVLink, where data is exchanged between the
|
||||
// GPUs by staging it through pinned host memory over PCIe.
|
||||
//
|
||||
// Two reduction strategies are selected per call by tensor size:
|
||||
//
|
||||
// * Chunked kernel path (small reductions): a single CUDA kernel both
|
||||
// stages data through pinned host memory and performs the local sum.
|
||||
// Cross-GPU synchronization happens *inside the kernel* (busy-wait on
|
||||
// a host-memory flag), which keeps launch overhead low for the
|
||||
// latency-sensitive token-generation case.
|
||||
//
|
||||
// * Copy-engine path (large reductions): the transfer is split into
|
||||
// D2H + H2D cudaMemcpyAsync chunks driven by the GPU's copy engine,
|
||||
// followed by a small device-side add kernel. Cross-GPU
|
||||
// synchronization happens *outside the kernel*, via CUDA events
|
||||
// between streams. This keeps the compute engine free while large
|
||||
// transfers are in flight, which matters for prefill-sized tensors.
|
||||
// Reductions larger than the per-call inner cap are processed by an
|
||||
// outer chunker that issues sequential inner calls.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cross-GPU signal mechanism
|
||||
//
|
||||
// One int per (slot, rank) pair in pinned host memory. Each AR call writes a
|
||||
// strictly increasing token (= the AR call number) into its own arrival int.
|
||||
// The peer spins until its read of the other's arrival int equals the token
|
||||
// it expects for this call -- a mismatch means the peer hasn't arrived yet.
|
||||
// Tokens never repeat over realistic call rates (32-bit int wraps in tens of
|
||||
// days at thousands of ARs/sec), so arrival ints don't need to be reset
|
||||
// between calls; we initialize once at pipeline init and let the values
|
||||
// accumulate.
|
||||
//
|
||||
// There is exactly one writer (the owning GPU) and one reader (the peer), so
|
||||
// we don't need atomics. A volatile store paired with __threadfence_system()
|
||||
// provides the release ordering that makes the D2H writes visible system-wide
|
||||
// before the arrival token is observed.
|
||||
//
|
||||
// atomicAdd_system() requires hostNativeAtomicSupported, which is unavailable
|
||||
// on PCIe-attached consumer GPUs without NVLink, so the volatile path is the
|
||||
// portable choice.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Publish `token` to the arrival int at `p` with a volatile store.
// The store itself carries no fence; callers pair it with
// __threadfence_system() where ordering is required.
static __device__ __forceinline__ void ggml_cuda_ar_signal_set(int * p, int token) {
    volatile int * slot = p;
    *slot = token;
}
|
||||
// Read the arrival int at `p` with a volatile load, so that each call
// re-reads memory (needed by the polling loop in the chunked kernel).
static __device__ __forceinline__ int ggml_cuda_ar_signal_get(const int * p) {
    const volatile int * slot = p;
    return *slot;
}
|
||||
|
||||
// Byte spacing between adjacent arrival ints. 64 bytes (one cache line)
|
||||
// ensures each GPU/block's arrival slot lives on its own line, preventing
|
||||
// false-sharing stalls on the polling GPU.
|
||||
static constexpr size_t GGML_CUDA_AR_ARRIVAL_STRIDE = 64;
|
||||
|
||||
// Number of blocks the chunked kernel launches with. Each block stripes a
|
||||
// disjoint slice of the data and synchronizes through its own arrival-token
|
||||
// slot so multiple SMs can pump PCIe stores in parallel.
|
||||
static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS = 8;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Chunked kernel AllReduce -- 2 GPUs, supports float, half, and bfloat16.
|
||||
//
|
||||
// Both GPUs run this kernel simultaneously on independent streams. sendbuf
|
||||
// and recvbuf live in T_dst (the caller's tensor type); host_mine / host_other
|
||||
// carry data in T_wire (the on-wire type, possibly narrower than T_dst -- e.g.
|
||||
// T_dst=F32 with T_wire=BF16 halves the bytes pushed across PCIe). When
|
||||
// T_dst == T_wire the casts below are no-ops.
|
||||
//
|
||||
// Each GPU runs three phases:
|
||||
//
|
||||
// Phase 1 (all threads): cast sendbuf (T_dst) -> T_wire and store as
|
||||
// single-instruction-width vectors into host_mine.
|
||||
// __threadfence_system() commits these writes to host
|
||||
// memory.
|
||||
// Phase 2 (thread 0): write token to arrival_mine; spin until
|
||||
// arrival_other == token.
|
||||
// Phase 3 (all threads): read T_wire vectors from host_other, cast
|
||||
// each element to T_dst, and sum with the local
|
||||
// sendbuf value (also rounded through T_wire so that
|
||||
// both GPUs truncate identically -- this guarantees
|
||||
// bit-equivalent results across the two devices).
|
||||
//
|
||||
// Multi-block: blocks stripe vectors across (gridDim.x * blockDim.x) global
|
||||
// threads to keep multiple SMs issuing PCIe stores in parallel. Each block
|
||||
// has its own arrival-token slot (offset by blockIdx.x * ARRIVAL_STRIDE);
|
||||
// thread 0 of each block signals/spins on that slot independently of other
|
||||
// blocks. Tail elements (the leftover < ELEMS_PER_VEC at the end) are
|
||||
// handled only by block 0 to avoid cross-block writes to the same slots.
|
||||
// ---------------------------------------------------------------------------
|
||||
template <typename T_dst, typename T_wire>
|
||||
static __global__ void ggml_cuda_ar_kernel(
|
||||
const T_dst * sendbuf,
|
||||
T_dst * recvbuf,
|
||||
T_wire * __restrict__ host_mine,
|
||||
const T_wire * __restrict__ host_other,
|
||||
int count,
|
||||
int * arrival_mine,
|
||||
int * arrival_other,
|
||||
int token) {
|
||||
|
||||
// Vector unit for the wire type, sized to the arch's widest single-instruction
|
||||
// copy (16 B on Volta+). Each phase-1 iter writes one vector to host memory;
|
||||
// each phase-3 iter reads one and produces ELEMS_PER_VEC sums.
|
||||
constexpr int ELEMS_PER_VEC = ggml_cuda_get_max_cpy_bytes() / sizeof(T_wire);
|
||||
constexpr int ARRIVAL_INTS = (int)(GGML_CUDA_AR_ARRIVAL_STRIDE / sizeof(int));
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int nt = blockDim.x;
|
||||
const int bid = blockIdx.x;
|
||||
const int gtid = bid * nt + tid;
|
||||
const int gnt = gridDim.x * nt;
|
||||
const int count_vec = count / ELEMS_PER_VEC;
|
||||
const int tail = count_vec * ELEMS_PER_VEC;
|
||||
|
||||
// Phase 1: cast sendbuf (T_dst) -> host_mine (T_wire) and store as vectors.
|
||||
{
|
||||
for (int i = gtid; i < count_vec; i += gnt) {
|
||||
const int off = i * ELEMS_PER_VEC;
|
||||
T_wire wire[ELEMS_PER_VEC];
|
||||
#pragma unroll
|
||||
for (int k = 0; k < ELEMS_PER_VEC; ++k) {
|
||||
wire[k] = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
|
||||
}
|
||||
ggml_cuda_memcpy_1<sizeof(wire)>(&host_mine[off], wire);
|
||||
}
|
||||
if (bid == 0 && tid < count - tail) {
|
||||
host_mine[tail + tid] = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
|
||||
}
|
||||
}
|
||||
|
||||
// Commit this block's host writes before signalling.
|
||||
__threadfence_system();
|
||||
__syncthreads();
|
||||
|
||||
// Phase 2: thread 0 of each block signals on its own arrival slot, then
|
||||
// spins for the matching slot from peer. Per-block tokens mean blocks
|
||||
// proceed independently -- no inter-block barrier needed.
|
||||
if (tid == 0) {
|
||||
int * my_slot = arrival_mine + bid * ARRIVAL_INTS;
|
||||
const int * other_slot = arrival_other + bid * ARRIVAL_INTS;
|
||||
|
||||
ggml_cuda_ar_signal_set(my_slot, token);
|
||||
__threadfence_system(); // make our signal visible system-wide
|
||||
|
||||
while (ggml_cuda_ar_signal_get(other_slot) != token) {
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
__nanosleep(100);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Acquire peer's host_other writes (this block's stripe of them).
|
||||
__threadfence_system();
|
||||
|
||||
// Phase 3: read peer's T_wire vector, cast both sides through T_wire for
|
||||
// bit-equivalence, sum in T_dst precision, and write back to recvbuf.
|
||||
{
|
||||
for (int i = gtid; i < count_vec; i += gnt) {
|
||||
const int off = i * ELEMS_PER_VEC;
|
||||
T_wire wire[ELEMS_PER_VEC];
|
||||
ggml_cuda_memcpy_1<sizeof(wire)>(wire, &host_other[off]);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < ELEMS_PER_VEC; ++k) {
|
||||
const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
|
||||
recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);
|
||||
}
|
||||
}
|
||||
if (bid == 0 && tid < count - tail) {
|
||||
const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
|
||||
recvbuf[tail + tid] =
|
||||
ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Combined load-convert-add kernel. The peer's contribution arrives as T_src
|
||||
// (which may be a lower-precision type than T_dst when the BF16 round-trip is
|
||||
// active). For bit-equivalence between the two GPUs, dst is first rounded
|
||||
// through T_src's precision via ggml_cuda_cast -- peer already truncated its
|
||||
// own value the same way before sending -- so both sides perform identical
|
||||
// arithmetic. When T_dst == T_src the round-trip cast is a no-op.
|
||||
template <typename T_dst, typename T_src>
|
||||
static __global__ void ggml_cuda_ar_add_kernel(
|
||||
T_dst * __restrict__ dst,
|
||||
const T_src * __restrict__ src,
|
||||
int count) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int nt = gridDim.x * blockDim.x;
|
||||
for (int i = tid; i < count; i += nt) {
|
||||
const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
|
||||
dst[i] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(src[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Pipeline structure
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Number of slots in the event / arrival ring. Two slots is sufficient:
|
||||
// lockstep guarantees the two GPUs are at most one AR (or chunk) apart, so
|
||||
// slot[N%2] is always safe to reuse -- peer has already consumed slot[N%2]
|
||||
// from AR N-2 by the time we get to AR N. acquire_slot's
|
||||
// cudaEventSynchronize on ev.ker for both devices makes that consumption
|
||||
// explicit before we overwrite host_buf[slot] for the new AR.
|
||||
static constexpr int GGML_CUDA_AR_POOL_SIZE = 2;
|
||||
|
||||
// Maximum chunk size (bytes per GPU) handled by one chunked kernel launch.
|
||||
// Larger tensors are reduced by issuing multiple chunked launches.
|
||||
static constexpr size_t GGML_CUDA_AR_MAX_BYTES = 1024 * 1024; // 1 MB
|
||||
|
||||
// Copy-engine path: largest tensor accepted on this path; sets host_large /
|
||||
// dev_tmp allocation size.
|
||||
static constexpr size_t GGML_CUDA_AR_COPY_MAX_BYTES = 32 * 1024 * 1024; // 32 MB
|
||||
|
||||
// AR wire size at which the copy-engine path takes over from the chunked-
|
||||
// kernel path. Override via GGML_CUDA_AR_COPY_THRESHOLD.
|
||||
static constexpr size_t GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT = 1024 * 1024; // 1 MB
|
||||
// Per-call CE chunk-size heuristic: chunk_bytes = clamp(nbytes / 4, MIN, MAX).
|
||||
// The /4 keeps ~4 chunks in flight at any moment (good D2H/H2D overlap with
|
||||
// the peer); the clamps cover the cases where nbytes/4 is too small (per-
|
||||
// memcpy fixed cost dominates) or too large (chunk-level pipelining stalls).
|
||||
// Env var GGML_CUDA_AR_COPY_CHUNK_BYTES can override with a fixed value.
|
||||
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN = 512 * 1024; // 512 KB
|
||||
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX = 2 * 1024 * 1024; // 2 MB
|
||||
// Absolute floor that an env-var override is allowed to set; this caps the
|
||||
// per-slot copy-event array. 256 KB -> up to 128 chunks per 32 MB tensor.
|
||||
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN = 256 * 1024;
|
||||
static constexpr int GGML_CUDA_AR_COPY_MAX_CHUNKS =
|
||||
static_cast<int>((GGML_CUDA_AR_COPY_MAX_BYTES + GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN - 1) /
|
||||
GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
|
||||
|
||||
struct ggml_cuda_ar_event_slot {
|
||||
cudaEvent_t app = nullptr; // upstream computation complete
|
||||
cudaEvent_t cpy[GGML_CUDA_AR_COPY_MAX_CHUNKS] = {}; // copy-engine D2H chunks complete
|
||||
cudaEvent_t h2d = nullptr; // copy-engine H2Ds complete (handoff AR stream -> compute stream)
|
||||
cudaEvent_t ker = nullptr; // AllReduce kernel complete
|
||||
};
|
||||
|
||||
// Mapped pinned host allocation: cudaHostAlloc + cudaHostGetDevicePointer
|
||||
// in one place, with the host handle preserved for cudaFreeHost. Used where
|
||||
// the CPU never touches the buffer -- only the device reads/writes via the
|
||||
// mapped device pointer. Required on systems where cudaDevAttrCanUseHost-
|
||||
// PointerForRegisteredMem is 0 and the host pointer can't be used as a
|
||||
// device pointer.
|
||||
struct ggml_cuda_ar_host_mapping {
|
||||
uint8_t * host = nullptr; // cudaFreeHost handle; also the H-side ptr for cudaMemcpyAsync
|
||||
uint8_t * dev = nullptr; // device-side pointer for kernels / cudaMemset
|
||||
|
||||
cudaError_t alloc(size_t bytes) {
|
||||
cudaError_t rc = cudaHostAlloc(reinterpret_cast<void **>(&host), bytes,
|
||||
cudaHostAllocPortable | cudaHostAllocMapped);
|
||||
if (rc != cudaSuccess) {
|
||||
host = nullptr;
|
||||
return rc;
|
||||
}
|
||||
rc = cudaHostGetDevicePointer(reinterpret_cast<void **>(&dev), host, 0);
|
||||
if (rc != cudaSuccess) {
|
||||
cudaFreeHost(host);
|
||||
host = nullptr;
|
||||
dev = nullptr;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
void free() {
|
||||
if (host) {
|
||||
cudaFreeHost(host);
|
||||
host = nullptr;
|
||||
dev = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_cuda_ar_pipeline {
|
||||
int n_devices;
|
||||
int devices[GGML_CUDA_MAX_DEVICES];
|
||||
size_t buf_bytes; // bytes per device in host_buf[]
|
||||
size_t copy_bytes; // bytes per device in host_large[] / dev_tmp[]
|
||||
size_t copy_threshold;
|
||||
size_t copy_chunk_bytes;
|
||||
size_t bf16_threshold; // tensors >= this size (bytes) are reduced via FP32->BF16 round-trip; 0 disables
|
||||
uint64_t call_count;
|
||||
|
||||
// Per-device resources.
|
||||
ggml_cuda_ar_host_mapping host_buf[GGML_CUDA_MAX_DEVICES]; // pinned staging (chunked kernel)
|
||||
ggml_cuda_ar_host_mapping host_large[GGML_CUDA_MAX_DEVICES]; // pinned staging (copy-engine)
|
||||
char * dev_tmp[GGML_CUDA_MAX_DEVICES]; // device scratch for copy-engine path
|
||||
cudaStream_t streams[GGML_CUDA_MAX_DEVICES]; // non-blocking
|
||||
ggml_cuda_ar_event_slot ev_pool[GGML_CUDA_MAX_DEVICES][GGML_CUDA_AR_POOL_SIZE];
|
||||
|
||||
// Copy-engine: per-device "I finished reading my peer's host_large"
|
||||
// event. Indexed by RECORDER device. Recorded same-device on streams[i]
|
||||
// after stage 2's last H2D from host_large[peer]. Waited cross-device
|
||||
// by peer's stage-1 stream before the next AR overwrites host_large[peer].
|
||||
cudaEvent_t host_large_read_done[GGML_CUDA_MAX_DEVICES];
|
||||
bool host_large_read_done_valid;
|
||||
|
||||
// Copy-engine: per-device "my add_kernel is done with dev_tmp" event.
|
||||
// Recorded on the compute stream after each add_kernel; the AR stream
|
||||
// waits on it before the next copy_impl's H2D overwrites dev_tmp. Lets us
|
||||
// single-buffer dev_tmp despite add_kernel running on a separate stream.
|
||||
cudaEvent_t dev_tmp_kernel_done[GGML_CUDA_MAX_DEVICES];
|
||||
bool dev_tmp_kernel_done_valid;
|
||||
|
||||
// Arrival ring: ARRIVAL_STRIDE bytes between adjacent ints. Mapped pinned
|
||||
// memory; CPU never reads/writes -- only the kernel and cudaMemset.
|
||||
// Use ggml_cuda_ar_arrival_ptr() to index.
|
||||
ggml_cuda_ar_host_mapping arrival;
|
||||
};
|
||||
|
||||
// Base pointer for the (slot, rank) per-block token block. The kernel adds
|
||||
// blockIdx.x * (ARRIVAL_STRIDE/sizeof(int)) internally to land on its own slot.
|
||||
static int * ggml_cuda_ar_arrival_ptr(const ggml_cuda_ar_pipeline * p, int slot, int rank) {
|
||||
const size_t offset = ((size_t)slot * p->n_devices + rank) *
|
||||
GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
|
||||
return reinterpret_cast<int *>(p->arrival.dev + offset);
|
||||
}
|
||||
|
||||
// Read environment variable `name` as an unsigned base-10 integer.
//
// Returns `default_value` when the variable is unset, empty, or not a clean
// non-negative decimal number. Trailing spaces/tabs are tolerated; any other
// trailing characters (e.g. "1MB") or a minus sign cause the default to be
// used instead of a silently truncated or wrapped-around value.
static uint64_t ggml_cuda_ar_env_u64(const char * name, uint64_t default_value) {
    const char * value = getenv(name);
    if (value == nullptr || value[0] == '\0') {
        return default_value;
    }

    char * end = nullptr;
    const unsigned long long parsed = strtoull(value, &end, 10);
    if (end == value) {
        return default_value; // no digits at all
    }

    // Reject trailing garbage: strtoull stops at the first non-digit, so
    // "1MB" would otherwise be accepted as 1.
    while (*end == ' ' || *end == '\t') {
        ++end;
    }
    if (*end != '\0') {
        return default_value;
    }

    // strtoull accepts a leading '-' and returns the negated value as an
    // unsigned number. With trailing characters already ruled out, any '-'
    // left in the string must be that sign.
    if (strchr(value, '-') != nullptr) {
        return default_value;
    }

    return (uint64_t) parsed;
}
|
||||
|
||||
// Result of ggml_cuda_ar_acquire_slot().
struct ggml_cuda_ar_slot_info {
    int slot;  // index into the event / arrival ring
    int token; // arrival token for this call
};
|
||||
|
||||
// Reserve the next ring slot and arrival token for one AllReduce call.
//
// The slot is the pre-increment call counter modulo GGML_CUDA_AR_POOL_SIZE;
// the token is the post-increment counter narrowed to int. Once the ring has
// wrapped, this blocks the host until every device's `ker` event for the
// slot has completed, so the slot is not reused while still in flight.
static ggml_cuda_ar_slot_info ggml_cuda_ar_acquire_slot(ggml_cuda_ar_pipeline * p) {
    const uint64_t call_index = p->call_count++;
    const int      slot       = static_cast<int>(call_index % GGML_CUDA_AR_POOL_SIZE);

    if (call_index >= GGML_CUDA_AR_POOL_SIZE) {
        for (int dev = 0; dev < p->n_devices; ++dev) {
            ggml_cuda_set_device(p->devices[dev]);
            CUDA_CHECK(cudaEventSynchronize(p->ev_pool[dev][slot].ker));
        }
    }

    ggml_cuda_ar_slot_info info;
    info.slot  = slot;
    info.token = (int) p->call_count;
    return info;
}
|
||||
|
||||
// Per-AR copy-engine chunk size: env-var override if set, else heuristic
|
||||
// (clamp(nbytes/4, HEURISTIC_MIN, HEURISTIC_MAX)).
|
||||
static size_t ggml_cuda_ar_chunk_bytes(const ggml_cuda_ar_pipeline * p, size_t nbytes) {
|
||||
if (p->copy_chunk_bytes > 0) {
|
||||
return p->copy_chunk_bytes;
|
||||
}
|
||||
return std::min(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX,
|
||||
std::max(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN, nbytes / 4));
|
||||
}
|
||||
|
||||
// Orders the AR stream of `rank` after everything currently enqueued on the
// caller's compute stream: records the slot's `app` event on the compute stream
// and makes p->streams[rank] wait on it. Nothing blocks on the host.
static void ggml_cuda_ar_wait_for_compute(
        ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot) {
    cudaEvent_t compute_done = p->ev_pool[rank][slot].app;

    CUDA_CHECK(cudaEventRecord(compute_done, cuda_ctx->stream()));
    CUDA_CHECK(cudaStreamWaitEvent(p->streams[rank], compute_done));
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Init / free
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Creates the internal AllReduce pipeline for `devices` (CUDA device IDs in rank
// order).
//
// Returns nullptr when the configuration is unsupported (n_devices != 2, or a
// device with compute capability below GGML_CUDA_CC_VOLTA) or when creating any
// stream, event or buffer fails. On a resource failure everything created so
// far is released and the CUDA runtime's last-error state is cleared, so the
// failed call cannot be picked up by a later, unrelated
// CUDA_CHECK(cudaGetLastError()) on the fallback path.
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * devices, size_t n_devices) {
    if (n_devices != 2) {
        GGML_LOG_DEBUG("%s: internal AllReduce only supports n_devices=2 (got %zu); "
                       "falling back\n", __func__, n_devices);
        return nullptr;
    }

    // The chunked kernel uses __nanosleep, which is sm70+ (Volta+).
    for (size_t i = 0; i < n_devices; ++i) {
        const int cc = ggml_cuda_info().devices[devices[i]].cc;
        if (cc < GGML_CUDA_CC_VOLTA) {
            GGML_LOG_DEBUG("%s: internal AllReduce requires compute capability >= %d "
                           "(device %d has cc=%d); falling back\n",
                           __func__, GGML_CUDA_CC_VOLTA, devices[i], cc);
            return nullptr;
        }
    }

    auto * p = new ggml_cuda_ar_pipeline{};
    p->n_devices      = n_devices;
    p->copy_bytes     = GGML_CUDA_AR_COPY_MAX_BYTES;
    p->copy_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_THRESHOLD", GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT);
    // 0 = use the per-call heuristic (default). Non-zero env value forces a
    // fixed chunk size for diagnostics, with a floor at COPY_CHUNK_BYTES_MIN.
    p->copy_chunk_bytes = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_CHUNK_BYTES", 0);
    if (p->copy_chunk_bytes > 0 && p->copy_chunk_bytes < GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN) {
        GGML_LOG_WARN("%s: GGML_CUDA_AR_COPY_CHUNK_BYTES=%zu below minimum %zu; clamping\n",
                      __func__, p->copy_chunk_bytes, GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
        p->copy_chunk_bytes = GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN;
    }
    // Default 1: BF16 round-trip is always on for F32 inputs (any non-zero
    // ne). Set GGML_CUDA_AR_BF16_THRESHOLD=0 to disable, or to a larger
    // byte threshold to opt out for small tensors.
    p->bf16_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_BF16_THRESHOLD", 1);
    for (size_t i = 0; i < n_devices; ++i) {
        p->devices[i] = devices[i];
    }

    // Shared failure path. The error messages stay at the call sites so that
    // __func__ still names this function rather than the lambda.
    const auto fail = [p]() -> ggml_cuda_ar_pipeline * {
        ggml_cuda_ar_pipeline_free(p);
        // cudaGetLastError() returns and resets the latched error; without this
        // the failure would surface at the next cudaGetLastError() check made
        // by whatever path the caller falls back to.
        (void) cudaGetLastError();
        return nullptr;
    };

    // Per-device streams and event pools.
    for (size_t i = 0; i < n_devices; ++i) {
        ggml_cuda_set_device(p->devices[i]);

        cudaStream_t stream = nullptr;
        if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess) {
            GGML_LOG_ERROR("%s: cudaStreamCreateWithFlags failed for device %d\n",
                           __func__, p->devices[i]);
            return fail();
        }
        p->streams[i] = stream;

        for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
            bool ok =
                cudaEventCreateWithFlags(&p->ev_pool[i][s].app, cudaEventDisableTiming) == cudaSuccess &&
                cudaEventCreateWithFlags(&p->ev_pool[i][s].h2d, cudaEventDisableTiming) == cudaSuccess &&
                cudaEventCreateWithFlags(&p->ev_pool[i][s].ker, cudaEventDisableTiming) == cudaSuccess;
            for (int c = 0; ok && c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
                ok = cudaEventCreateWithFlags(&p->ev_pool[i][s].cpy[c], cudaEventDisableTiming) == cudaSuccess;
            }
            if (!ok) {
                GGML_LOG_ERROR("%s: cudaEventCreate failed for device %d slot %d\n",
                               __func__, p->devices[i], s);
                return fail();
            }
        }

        if (cudaEventCreateWithFlags(&p->host_large_read_done[i], cudaEventDisableTiming) != cudaSuccess) {
            GGML_LOG_ERROR("%s: cudaEventCreate for host_large_read_done failed for device %d\n",
                           __func__, p->devices[i]);
            return fail();
        }
        if (cudaEventCreateWithFlags(&p->dev_tmp_kernel_done[i], cudaEventDisableTiming) != cudaSuccess) {
            GGML_LOG_ERROR("%s: cudaEventCreate for dev_tmp_kernel_done failed for device %d\n",
                           __func__, p->devices[i]);
            return fail();
        }
    }

    // Arrival ring: cache-line padded so each GPU's int is on its own line.
    const size_t arrival_bytes =
        (size_t) GGML_CUDA_AR_POOL_SIZE * n_devices *
        GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
    if (p->arrival.alloc(arrival_bytes) != cudaSuccess) {
        GGML_LOG_ERROR("%s: alloc for arrival ring failed (%zu bytes)\n",
                       __func__, arrival_bytes);
        return fail();
    }
    ggml_cuda_set_device(p->devices[0]);
    if (cudaMemset(p->arrival.dev, 0, arrival_bytes) != cudaSuccess) {
        GGML_LOG_ERROR("%s: cudaMemset for arrival ring failed (%zu bytes)\n",
                       __func__, arrival_bytes);
        return fail();
    }

    // Per-device pinned staging buffers -- POOL_SIZE-deep ring so the chunked-
    // kernel can write the next slot's data while the peer is still reading
    // the previous slot's. Indexed by (slot * buf_bytes) at the call site.
    p->buf_bytes = GGML_CUDA_AR_MAX_BYTES;
    const size_t host_buf_total = (size_t) GGML_CUDA_AR_POOL_SIZE * p->buf_bytes;
    for (size_t i = 0; i < n_devices; ++i) {
        if (p->host_buf[i].alloc(host_buf_total) != cudaSuccess) {
            GGML_LOG_ERROR("%s: alloc for staging failed (%zu bytes)\n",
                           __func__, host_buf_total);
            return fail();
        }
    }

    // Copy-engine path: pinned host staging + device scratch, sized for the
    // largest tensor we accept on this path (GGML_CUDA_AR_COPY_MAX_BYTES).
    // dev_tmp is single-buffered; cross-AR safety is enforced by an explicit
    // cross-stream wait in copy_impl on the prior AR's add_kernel-done event.
    for (size_t i = 0; i < n_devices; ++i) {
        ggml_cuda_set_device(p->devices[i]);
        if (p->host_large[i].alloc(p->copy_bytes) != cudaSuccess) {
            GGML_LOG_ERROR("%s: alloc for large staging failed (%zu bytes)\n",
                           __func__, p->copy_bytes);
            return fail();
        }
        if (cudaMalloc(reinterpret_cast<void **>(&p->dev_tmp[i]), p->copy_bytes) != cudaSuccess) {
            GGML_LOG_ERROR("%s: cudaMalloc for copy scratch failed (%zu bytes) on device %d\n",
                           __func__, p->copy_bytes, p->devices[i]);
            return fail();
        }
    }

    GGML_LOG_INFO("%s: initialized AllReduce pipeline: %zu GPUs, "
                  "%zu KB chunked kernel staging + %zu MB copy-engine staging per GPU\n",
                  __func__, n_devices, p->buf_bytes >> 10, p->copy_bytes >> 20);

    return p;
}
|
||||
|
||||
// Releases every resource owned by the pipeline and deletes it. Accepts nullptr
// and partially initialized pipelines (members that were never created are
// skipped), which is how ggml_cuda_ar_pipeline_init() uses it on its failure
// paths. CUDA errors during teardown are intentionally ignored.
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) {
    if (!p) {
        return;
    }

    // Drain all in-flight work before tearing down resources. Synchronizing
    // only the AR streams is not enough: the chunked kernel and the copy-engine
    // add kernel are launched on the caller's compute stream and still
    // reference host_buf / arrival / dev_tmp, so wait for the whole device.
    for (int i = 0; i < p->n_devices; ++i) {
        ggml_cuda_set_device(p->devices[i]);
        cudaDeviceSynchronize();
    }

    for (int i = 0; i < p->n_devices; ++i) {
        p->host_buf[i].free();
        p->host_large[i].free();

        // Everything below belongs to device i.
        ggml_cuda_set_device(p->devices[i]);

        if (p->dev_tmp[i]) {
            cudaFree(p->dev_tmp[i]);
        }

        for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
            ggml_cuda_ar_event_slot & ev = p->ev_pool[i][s];
            if (ev.app) { cudaEventDestroy(ev.app); }
            for (int c = 0; c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
                if (ev.cpy[c]) { cudaEventDestroy(ev.cpy[c]); }
            }
            if (ev.h2d) { cudaEventDestroy(ev.h2d); }
            if (ev.ker) { cudaEventDestroy(ev.ker); }
        }

        if (p->host_large_read_done[i]) {
            cudaEventDestroy(p->host_large_read_done[i]);
        }
        if (p->dev_tmp_kernel_done[i]) {
            cudaEventDestroy(p->dev_tmp_kernel_done[i]);
        }
        if (p->streams[i]) {
            cudaStreamDestroy(p->streams[i]);
        }
    }

    p->arrival.free();
    delete p;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Dispatch
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Asymmetric copy_impl: data sent over PCIe in T_src precision (one element of
|
||||
// nbytes per ne element); accumulated locally into a T_dst buffer. When
|
||||
// T_src == T_dst this is the original homogeneous reduction. When they differ
|
||||
// (e.g. BF16 wire / F32 accumulator) the add kernel rounds dst through T_src
|
||||
// for bit-equivalence between GPUs and we skip the otherwise-needed
|
||||
// post-conversion entirely.
|
||||
template <typename T_src, typename T_dst>
|
||||
static bool ggml_cuda_ar_allreduce_copy_impl(
|
||||
ggml_cuda_ar_pipeline * p,
|
||||
ggml_backend_t * backends,
|
||||
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
|
||||
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
|
||||
const bool compute[GGML_CUDA_MAX_DEVICES],
|
||||
int64_t ne,
|
||||
size_t nbytes) {
|
||||
GGML_ASSERT(p->n_devices == 2);
|
||||
GGML_ASSERT(nbytes <= p->copy_bytes);
|
||||
GGML_ASSERT(ne <= std::numeric_limits<int>::max());
|
||||
|
||||
const size_t chunk_bytes = ggml_cuda_ar_chunk_bytes(p, nbytes);
|
||||
GGML_ASSERT(chunk_bytes > 0);
|
||||
|
||||
const int slot = ggml_cuda_ar_acquire_slot(p).slot;
|
||||
const size_t copy_chunks = (nbytes + chunk_bytes - 1) / chunk_bytes;
|
||||
GGML_ASSERT(copy_chunks <= GGML_CUDA_AR_COPY_MAX_CHUNKS);
|
||||
|
||||
ggml_backend_cuda_context * cuda_ctx[2] = {};
|
||||
|
||||
// Stage 1: both GPUs copy their local contribution to pinned host memory.
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cuda_ctx[i] = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
|
||||
GGML_ASSERT(cuda_ctx[i]->device == p->devices[i]);
|
||||
|
||||
ggml_cuda_ar_wait_for_compute(p, cuda_ctx[i], i, slot);
|
||||
|
||||
// Wait for peer's H2D from our host_large[i] (recorded in the
|
||||
// previous AR's stage 2) to complete before we overwrite host_large[i].
|
||||
// host_large_read_done[peer] = peer finished reading host_large[i].
|
||||
// No-op on the first AR -- no prior record exists.
|
||||
if (p->host_large_read_done_valid) {
|
||||
const int peer = 1 - i;
|
||||
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->host_large_read_done[peer]));
|
||||
}
|
||||
|
||||
if (!compute[i]) {
|
||||
CUDA_CHECK(cudaMemsetAsync(src_buf[i], 0, nbytes, p->streams[i]));
|
||||
}
|
||||
|
||||
for (size_t c = 0; c < copy_chunks; ++c) {
|
||||
const size_t offset = c * chunk_bytes;
|
||||
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
|
||||
(nbytes - offset) : chunk_bytes;
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
p->host_large[i].host + offset, reinterpret_cast<char *>(src_buf[i]) + offset, this_bytes,
|
||||
cudaMemcpyDeviceToHost, p->streams[i]));
|
||||
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].cpy[c], p->streams[i]));
|
||||
}
|
||||
}
|
||||
|
||||
// Stage 2: each GPU waits for each peer D2H chunk, pulls that chunk back to
|
||||
// local device scratch (dev_tmp), then performs one device-local add over
|
||||
// the assembled peer tensor. The H2Ds run on the AR stream (copy engine)
|
||||
// and the add_kernel runs on the caller's compute stream, so the AR stream
|
||||
// stays pure-copy and avoids an in-stream copy->compute engine switch every
|
||||
// AR. dev_tmp is single-buffered: the AR stream waits cross-stream on the
|
||||
// prior AR's add_kernel-done event before overwriting it.
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
const int peer = 1 - i;
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
|
||||
// Wait for the previous AR's add_kernel (on the compute stream) to
|
||||
// finish reading dev_tmp before our H2D overwrites it. No-op on the
|
||||
// first copy_impl call.
|
||||
if (p->dev_tmp_kernel_done_valid) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->dev_tmp_kernel_done[i]));
|
||||
}
|
||||
|
||||
for (size_t c = 0; c < copy_chunks; ++c) {
|
||||
const size_t offset = c * chunk_bytes;
|
||||
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
|
||||
(nbytes - offset) : chunk_bytes;
|
||||
|
||||
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->ev_pool[peer][slot].cpy[c]));
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
p->dev_tmp[i] + offset, p->host_large[peer].host + offset, this_bytes,
|
||||
cudaMemcpyHostToDevice, p->streams[i]));
|
||||
}
|
||||
|
||||
// Mark our reads of host_large[peer] complete so peer's next AR can
|
||||
// safely overwrite it.
|
||||
CUDA_CHECK(cudaEventRecord(p->host_large_read_done[i], p->streams[i]));
|
||||
|
||||
// Hand off from AR stream (copy engine) to compute stream: compute
|
||||
// stream waits for all H2Ds to finish, then runs the add_kernel.
|
||||
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].h2d, p->streams[i]));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx[i]->stream(), p->ev_pool[i][slot].h2d));
|
||||
|
||||
const int block_size = 256;
|
||||
int n_blocks = (int) ((ne + block_size - 1) / block_size);
|
||||
if (n_blocks > 1024) {
|
||||
n_blocks = 1024;
|
||||
}
|
||||
ggml_cuda_ar_add_kernel<T_dst, T_src><<<n_blocks, block_size, 0, cuda_ctx[i]->stream()>>>(
|
||||
dst_buf[i],
|
||||
reinterpret_cast<const T_src *>(p->dev_tmp[i]),
|
||||
(int) ne);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
// Record dev_tmp-released on the compute stream so the next copy_impl
|
||||
// can wait for the kernel to finish before overwriting dev_tmp. Also
|
||||
// record AR-done as ev.ker for acquire_slot's pool-wraparound sync.
|
||||
CUDA_CHECK(cudaEventRecord(p->dev_tmp_kernel_done[i], cuda_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, cuda_ctx[i]->stream()));
|
||||
}
|
||||
p->host_large_read_done_valid = true;
|
||||
p->dev_tmp_kernel_done_valid = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Outer-level chunker: copy_impl handles up to copy_bytes per call (limited by
|
||||
// the host_large / dev_tmp allocation size). When the full AR exceeds that,
|
||||
// slice the tensor into copy_bytes-sized pieces and call copy_impl repeatedly.
|
||||
// Each slice goes through its own stage 1 -> stage 2 cycle and acquires its own
|
||||
// slot, so cross-AR fences and pool wraparound work the same way as for any
|
||||
// other sequence of small ARs.
|
||||
template <typename T_src, typename T_dst>
|
||||
static bool ggml_cuda_ar_allreduce_copy_outer(
|
||||
ggml_cuda_ar_pipeline * p,
|
||||
ggml_backend_t * backends,
|
||||
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
|
||||
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
|
||||
const bool compute[GGML_CUDA_MAX_DEVICES],
|
||||
int64_t ne) {
|
||||
const int64_t outer_max_elems = (int64_t) (p->copy_bytes / sizeof(T_src));
|
||||
GGML_ASSERT(outer_max_elems > 0);
|
||||
|
||||
bool ok = true;
|
||||
for (int64_t outer_start = 0; outer_start < ne && ok; outer_start += outer_max_elems) {
|
||||
const int64_t outer_ne = std::min(outer_max_elems, ne - outer_start);
|
||||
const size_t outer_nbytes = (size_t) outer_ne * sizeof(T_src);
|
||||
|
||||
T_src * src[GGML_CUDA_MAX_DEVICES] = {};
|
||||
T_dst * dst[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < p->n_devices; ++i) {
|
||||
src[i] = src_buf[i] + outer_start;
|
||||
dst[i] = dst_buf[i] + outer_start;
|
||||
}
|
||||
ok = ggml_cuda_ar_allreduce_copy_impl<T_src, T_dst>(
|
||||
p, backends, src, dst, compute, outer_ne, outer_nbytes);
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
// In-place sum AllReduce across tensors[0..1] (exactly two devices).
//
// Chooses between two paths from the wire type's byte count:
//   - copy-engine path (nbytes >= p->copy_threshold, threshold non-zero):
//     D2H/H2D staging through pinned host memory plus a device-local add;
//   - chunked kernel path (otherwise): a single kernel per chunk on the
//     caller's compute stream, exchanging data through the host_buf ring.
// F32 inputs of at least p->bf16_threshold bytes travel as BF16 on the wire.
// Work is only enqueued; the function returns true once that is done.
bool ggml_cuda_ar_allreduce(
        ggml_cuda_ar_pipeline * p,
        ggml_backend_t * backends,
        ggml_tensor ** tensors) {
    GGML_ASSERT(p != nullptr);

    const int n = p->n_devices;
    GGML_ASSERT(n == 2);

    const ggml_type input_type = tensors[0]->type;
    GGML_ASSERT(input_type == GGML_TYPE_F32 || input_type == GGML_TYPE_F16 || input_type == GGML_TYPE_BF16);

    const int64_t ne = ggml_nelements(tensors[0]);
    GGML_ASSERT(ne > 0);

    const size_t input_nbytes = ggml_nbytes(tensors[0]);

    // BF16 round-trip: F32 inputs >= bf16_threshold are converted to BF16 for
    // the reduction (chunked or copy-engine), halving on-wire bytes. Matches
    // NCCL's behaviour. The pre-conversion zeroes inactive shards so the
    // inner paths see them as already-prepared compute tensors.
    const bool use_bf16 =
        input_type == GGML_TYPE_F32 &&
        p->bf16_threshold > 0 &&
        input_nbytes >= p->bf16_threshold;

    // Wire type and the byte count of the whole tensor at wire precision.
    const ggml_type kernel_type = use_bf16 ? GGML_TYPE_BF16 : input_type;
    const size_t    type_size   = ggml_type_size(kernel_type);
    GGML_ASSERT(p->buf_bytes >= type_size);
    const size_t nbytes = (size_t) ne * type_size;

    bool compute_flag[GGML_CUDA_MAX_DEVICES] = {};
    for (int i = 0; i < n; ++i) {
        compute_flag[i] = (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) != 0;
    }

    // Path selection uses the working type's actual byte count. There is no
    // upper bound: copy_outer slices reductions larger than copy_bytes into
    // copy_bytes-sized pieces.
    const bool use_copy_engine =
        p->copy_threshold > 0 &&
        nbytes >= p->copy_threshold;

    // BF16 inactive-shard zeroing: with use_bf16 both the combined kernel
    // (chunked path) and the combined add kernel (copy-engine path) accumulate
    // straight into the F32 tensor data, so an inactive shard's accumulator
    // has to start at zero.
    if (use_bf16) {
        for (int i = 0; i < n; ++i) {
            if (compute_flag[i]) {
                continue;
            }
            auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
            GGML_ASSERT(cuda_ctx->device == p->devices[i]);
            ggml_cuda_set_device(p->devices[i]);
            CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, (size_t) ne * sizeof(float), cuda_ctx->stream()));
        }
    }

    // F32 -> BF16 pre-conversion into bf16_tmp happens ONLY for copy-engine +
    // use_bf16; the chunked path's combined kernel converts inline while
    // writing to host_buf. The pool allocations live until this function
    // returns.
    ggml_cuda_pool_alloc<nv_bfloat16> bf16_tmp[GGML_CUDA_MAX_DEVICES];

    if (use_copy_engine && use_bf16) {
        to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
        for (int i = 0; i < n; ++i) {
            auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
            GGML_ASSERT(cuda_ctx->device == p->devices[i]);
            bf16_tmp[i].pool = &cuda_ctx->pool();
            bf16_tmp[i].alloc(ne);
            ggml_cuda_set_device(p->devices[i]);
            if (compute_flag[i]) {
                to_bf16(tensors[i]->data, bf16_tmp[i].get(), ne, cuda_ctx->stream());
                CUDA_CHECK(cudaGetLastError());
            } else {
                // Inactive shard: its wire buffer is all zeros.
                CUDA_CHECK(cudaMemsetAsync(bf16_tmp[i].get(), 0, nbytes, cuda_ctx->stream()));
            }
        }
    }

    bool ok = true;

    if (use_copy_engine) {
        // After the up-front BF16 conversion the tmp buffers already hold the
        // (possibly zeroed-for-inactive) data, so the inner path can treat
        // every shard as compute.
        bool inner_compute[GGML_CUDA_MAX_DEVICES];
        for (int i = 0; i < n; ++i) {
            inner_compute[i] = use_bf16 || compute_flag[i];
        }

        // Dispatch into copy_outer with explicit src/dst types. With use_bf16
        // the wire type is BF16 (src = bf16_tmp) and the accumulator is F32
        // (dst = tensors[i]->data); the combined add kernel rounds dst through
        // BF16 for bit-equivalence and writes F32 directly, so no
        // post-conversion is needed. Otherwise src == dst (same native type).
        if (use_bf16) {
            GGML_ASSERT(kernel_type == GGML_TYPE_BF16);
            nv_bfloat16 * wire[GGML_CUDA_MAX_DEVICES] = {};
            float *       acc[GGML_CUDA_MAX_DEVICES]  = {};
            for (int i = 0; i < n; ++i) {
                wire[i] = bf16_tmp[i].get();
                acc[i]  = static_cast<float *>(tensors[i]->data);
            }
            ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, float>(
                p, backends, wire, acc, inner_compute, ne);
        } else {
            switch (kernel_type) {
                case GGML_TYPE_F32: {
                    float * buf[GGML_CUDA_MAX_DEVICES] = {};
                    for (int i = 0; i < n; ++i) {
                        buf[i] = static_cast<float *>(tensors[i]->data);
                    }
                    ok = ggml_cuda_ar_allreduce_copy_outer<float, float>(
                        p, backends, buf, buf, inner_compute, ne);
                    break;
                }
                case GGML_TYPE_BF16: {
                    nv_bfloat16 * buf[GGML_CUDA_MAX_DEVICES] = {};
                    for (int i = 0; i < n; ++i) {
                        buf[i] = static_cast<nv_bfloat16 *>(tensors[i]->data);
                    }
                    ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, nv_bfloat16>(
                        p, backends, buf, buf, inner_compute, ne);
                    break;
                }
                case GGML_TYPE_F16: {
                    half * buf[GGML_CUDA_MAX_DEVICES] = {};
                    for (int i = 0; i < n; ++i) {
                        buf[i] = static_cast<half *>(tensors[i]->data);
                    }
                    ok = ggml_cuda_ar_allreduce_copy_outer<half, half>(
                        p, backends, buf, buf, inner_compute, ne);
                    break;
                }
                default:
                    GGML_ASSERT(false);
            }
        }

        return ok;
    }

    // ---- chunked kernel path ----

    // host_buf carries T_wire-typed data; max_chunk_elems is the count that
    // fits in one host_buf slot at the wire size.
    const size_t max_chunk_elems = p->buf_bytes / type_size;
    const size_t input_type_size = ggml_type_size(input_type);

// Launches the chunked kernel for device `i` of the current chunk. Expands in
// the innermost loop below and picks up data/stream/slot/peer/token from there.
#define LAUNCH_AR_KERNEL(T_dst, T_wire)                                                                   \
    ggml_cuda_ar_kernel<T_dst, T_wire><<<dim3(GGML_CUDA_AR_KERNEL_BLOCKS), dim3(256), 0, stream>>>(       \
        reinterpret_cast<const T_dst *>(data),                                                            \
        reinterpret_cast<T_dst *>(data),                                                                  \
        reinterpret_cast<T_wire *>(p->host_buf[i].dev + (size_t) slot * p->buf_bytes),                    \
        reinterpret_cast<const T_wire *>(p->host_buf[peer].dev + (size_t) slot * p->buf_bytes),           \
        static_cast<int>(chunk_elems),                                                                    \
        ggml_cuda_ar_arrival_ptr(p, slot, i),                                                             \
        ggml_cuda_ar_arrival_ptr(p, slot, peer),                                                          \
        token)

    // The chunked path runs entirely on the caller's compute stream: since AR
    // is a barrier here, same-stream ordering subsumes any cross-stream event
    // handshake that the copy-engine path needs, and skips the cross-stream
    // scheduling overhead that was hurting the small-tensor (tg) latency on the
    // AR-stream variant. Only ev.ker is still recorded at end-of-AR for
    // acquire_slot's pool-wraparound check.
    for (int64_t chunk_start = 0; chunk_start < ne; chunk_start += (int64_t) max_chunk_elems) {
        const size_t chunk_elems     = std::min((size_t) (ne - chunk_start), max_chunk_elems);
        const size_t chunk_dst_bytes = chunk_elems * input_type_size;

        const auto [slot, token] = ggml_cuda_ar_acquire_slot(p);
        const bool last_chunk    = chunk_start + (int64_t) chunk_elems == ne;

        for (int i = 0; i < n; ++i) {
            const int peer = 1 - i; // valid for n == 2 only
            ggml_cuda_set_device(p->devices[i]);
            auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
            GGML_ASSERT(cuda_ctx->device == p->devices[i]);
            cudaStream_t stream = cuda_ctx->stream();

            // Start of this chunk inside the tensor, in input-type bytes.
            char * data = static_cast<char *>(tensors[i]->data) + chunk_start * (int64_t) input_type_size;

            // Match NCCL/meta-backend semantics: inactive shards contribute
            // zeros. On the BF16 path the F32 tensor data was already zeroed
            // up-front, so per-chunk zeroing isn't needed.
            if (!compute_flag[i] && !use_bf16) {
                CUDA_CHECK(cudaMemsetAsync(data, 0, chunk_dst_bytes, stream));
            }

            if (use_bf16) {
                GGML_ASSERT(input_type == GGML_TYPE_F32);
                LAUNCH_AR_KERNEL(float, nv_bfloat16);
            } else {
                switch (input_type) {
                    case GGML_TYPE_F32:  LAUNCH_AR_KERNEL(float, float);             break;
                    case GGML_TYPE_F16:  LAUNCH_AR_KERNEL(half, half);               break;
                    case GGML_TYPE_BF16: LAUNCH_AR_KERNEL(nv_bfloat16, nv_bfloat16); break;
                    default: GGML_ASSERT(false);
                }
            }
            CUDA_CHECK(cudaGetLastError());

            if (last_chunk) {
                CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, stream));
            }
        }
    }

#undef LAUNCH_AR_KERNEL

    return ok;
}
|
||||
|
||||
#else // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
|
||||
|
||||
// HIP and MUSA lack the host-mapped pinned-memory APIs (cudaHostAllocPortable
|
||||
// / cudaHostAllocMapped / cudaHostGetDevicePointer) and __nanosleep that this
|
||||
// implementation relies on, so the internal AllReduce is a CUDA-only feature.
|
||||
// The dispatcher in ggml-cuda.cu treats a nullptr pipeline as "init failed"
|
||||
// and silently falls back to the meta backend's generic AllReduce.
|
||||
// HIP/MUSA stub: the internal AllReduce is unavailable in these builds, so no
// pipeline is ever created. Both arguments are ignored.
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * /*devices*/, size_t /*n_devices*/) {
    return nullptr;
}
|
||||
// HIP/MUSA stub: init never hands out a pipeline in these builds, so there is
// nothing to release.
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * /*pipeline*/) {
}
|
||||
// HIP/MUSA stub: never enqueues any work and always reports failure.
bool ggml_cuda_ar_allreduce(
        ggml_cuda_ar_pipeline * /*pipeline*/,
        ggml_backend_t * /*backends*/,
        ggml_tensor ** /*tensors*/) {
    return false;
}
|
||||
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
@@ -0,0 +1,29 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.cuh"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
// Opaque pipeline context -- owns all pinned buffers, streams, and events.
|
||||
struct ggml_cuda_ar_pipeline;
|
||||
|
||||
// Allocate a pipeline for n_devices GPUs.
|
||||
// devices[] holds the CUDA device IDs in rank order.
|
||||
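// Returns nullptr when the configuration is unsupported (n_devices != 2, a
// device below Volta, or a HIP/MUSA build) or when an allocation fails.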
|
||||
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
|
||||
const int * devices, size_t n_devices);
|
||||
|
||||
// Release all resources owned by the pipeline.
|
||||
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline);
|
||||
|
||||
// Execute an in-place AllReduce (sum) across tensors[0..n_devices-1].
|
||||
// tensors[i] must live on the device managed by backends[i] and be
|
||||
// contiguous F32, F16, or BF16.
|
||||
// Preconditions are checked by the CUDA comm dispatcher before calling this.
|
||||
// Returns true once the reduction work has been enqueued successfully.
|
||||
bool ggml_cuda_ar_allreduce(
|
||||
ggml_cuda_ar_pipeline * pipeline,
|
||||
ggml_backend_t * backends,
|
||||
ggml_tensor ** tensors);
|
||||
|
||||
@@ -61,6 +61,11 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 128, 2, 64, 64, 64, 64, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 64, 128, 2, 64, 64, 64, 64, 2, true);
|
||||
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 8, 64, 4, 64, 96, 64, 64, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 16, 64, 4, 32, 96, 64, 64, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 32, 128, 2, 32, 96, 64, 64, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 64, 128, 2, 32, 96, 64, 64, 2, true);
|
||||
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 8, 64, 4, 64, 128, 128, 128, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 64, 4, 32, 128, 128, 128, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 32, 128, 128, 128, 2, true);
|
||||
@@ -1561,6 +1566,10 @@ static __global__ void flash_attn_ext_f16(
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
if (DKQ == 192 && ncols2 != 8 && ncols2 != 16) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
#ifdef VOLTA_MMA_AVAILABLE
|
||||
if (ncols1*ncols2 < 32) {
|
||||
NO_DEVICE_CODE;
|
||||
|
||||
@@ -34,6 +34,10 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
GGML_ASSERT(V->ne[0] == K->ne[0]);
|
||||
ggml_cuda_flash_attn_ext_tile_case<128, 128>(ctx, dst);
|
||||
} break;
|
||||
case 192: {
|
||||
GGML_ASSERT(V->ne[0] == 128);
|
||||
ggml_cuda_flash_attn_ext_tile_case<192, 128>(ctx, dst);
|
||||
} break;
|
||||
case 256: {
|
||||
GGML_ASSERT(V->ne[0] == K->ne[0]);
|
||||
ggml_cuda_flash_attn_ext_tile_case<256, 256>(ctx, dst);
|
||||
|
||||
@@ -62,6 +62,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 16, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 64, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 64, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 64, 64)
|
||||
@@ -124,6 +130,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 16, 128, 3, 32, 128)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 128, 3, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 128, 3, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 256, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2, 32, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 128, 3, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 3, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 32, 256)
|
||||
@@ -193,6 +205,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 2, 64, 32)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 256, 2, 128, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2, 32, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 256, 2, 128, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 256, 2, 64, 128)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 64, 128)
|
||||
@@ -264,6 +282,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 3, 128, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 3, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 64, 8, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 128, 6, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 128, 6, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 5, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 3, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 64, 8, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 6, 32, 256)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 128, 6, 32, 256)
|
||||
@@ -1250,7 +1274,20 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (DKQ <= 512 && DKQ != 320) {
|
||||
if constexpr (DKQ == 192) {
|
||||
// MiMo-V2.5 / V2.5-Pro / V2-Flash: gqa_ratio is 8 (SWA) or 16 (full attn)
|
||||
if (use_gqa_opt && gqa_ratio % 16 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
if (use_gqa_opt && gqa_ratio % 8 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("flash-attn tile (192/128): expected GQA ratio multiple of 8");
|
||||
}
|
||||
|
||||
if constexpr (DKQ <= 512 && DKQ != 320 && DKQ != 192) {
|
||||
if (use_gqa_opt && gqa_ratio % 8 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
@@ -1303,6 +1340,7 @@ extern DECL_FATTN_TILE_CASE( 80, 80);
|
||||
extern DECL_FATTN_TILE_CASE( 96, 96);
|
||||
extern DECL_FATTN_TILE_CASE(112, 112);
|
||||
extern DECL_FATTN_TILE_CASE(128, 128);
|
||||
extern DECL_FATTN_TILE_CASE(192, 128);
|
||||
extern DECL_FATTN_TILE_CASE(256, 256);
|
||||
extern DECL_FATTN_TILE_CASE(320, 256);
|
||||
extern DECL_FATTN_TILE_CASE(512, 512);
|
||||
|
||||
@@ -139,6 +139,22 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
|
||||
GGML_ASSERT(V->ne[0] == 128);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<128, 128>(ctx, dst);
|
||||
break;
|
||||
case 192: {
|
||||
// MiMo-V2.5 / V2.5-Pro / V2-Flash: gqa_ratio is 8 (SWA) or 16 (full attn)
|
||||
GGML_ASSERT(V->ne[0] == 128);
|
||||
float max_bias = 0.0f;
|
||||
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
|
||||
const bool use_gqa_opt = mask && max_bias == 0.0f;
|
||||
GGML_ASSERT(use_gqa_opt);
|
||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
if (gqa_ratio % 16 == 0) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, 128, 16>(ctx, dst);
|
||||
} else {
|
||||
GGML_ASSERT(gqa_ratio % 8 == 0);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, 128, 8>(ctx, dst);
|
||||
}
|
||||
} break;
|
||||
case 256:
|
||||
GGML_ASSERT(V->ne[0] == 256);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
|
||||
@@ -368,6 +384,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
break;
|
||||
case 192:
|
||||
if (V->ne[0] != 128 || !gqa_opt_applies) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
if (gqa_ratio % 8 != 0) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
break;
|
||||
case 320:
|
||||
if (V->ne[0] != 256 || !gqa_opt_applies) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
@@ -425,7 +449,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
}
|
||||
|
||||
// For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
|
||||
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
|
||||
// 192 satisfies % 64 == 0 but has no vec instance (DKQ != DV); force it onto the MMA path.
|
||||
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && Q->ne[0] != 192 && K->ne[1] % FATTN_KQ_STRIDE == 0;
|
||||
|
||||
// If Turing tensor cores are available, use them:
|
||||
if (turing_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
|
||||
@@ -454,7 +479,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
|
||||
if (volta_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
|
||||
int gqa_ratio_eff = 1;
|
||||
const int ncols2_max = Q->ne[0] == 576 ? 16 : 8;
|
||||
const int ncols2_max = (Q->ne[0] == 576 || Q->ne[0] == 192) ? 16 : 8;
|
||||
while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
|
||||
gqa_ratio_eff *= 2;
|
||||
}
|
||||
@@ -468,7 +493,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
}
|
||||
|
||||
// Use the WMMA kernel if possible:
|
||||
if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 512 && Q->ne[0] != 576) {
|
||||
if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 512 && Q->ne[0] != 576) {
|
||||
if (can_use_vector_kernel && Q->ne[1] <= 2) {
|
||||
return BEST_FATTN_KERNEL_VEC;
|
||||
}
|
||||
@@ -501,7 +526,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
}
|
||||
|
||||
// Use MFMA flash attention for CDNA (MI100+):
|
||||
if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) {
|
||||
if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) {
|
||||
const int64_t eff_nq = Q->ne[1] * (gqa_opt_applies ? gqa_ratio : 1);
|
||||
// MMA vs tile crossover benchmarked on MI300X @ d32768:
|
||||
// hsk=64 (gqa=4): MMA wins at eff >= 128 (+11%)
|
||||
|
||||
+243
-59
@@ -2,6 +2,7 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-cuda/allreduce.cuh"
|
||||
#include "ggml-cuda/common.cuh"
|
||||
#include "ggml-cuda/acc.cuh"
|
||||
#include "ggml-cuda/add-id.cuh"
|
||||
@@ -39,6 +40,7 @@
|
||||
#include "ggml-cuda/rope.cuh"
|
||||
#include "ggml-cuda/roll.cuh"
|
||||
#include "ggml-cuda/scale.cuh"
|
||||
#include "ggml-cuda/snake.cuh"
|
||||
#include "ggml-cuda/softcap.cuh"
|
||||
#include "ggml-cuda/softmax.cuh"
|
||||
#include "ggml-cuda/ssm-conv.cuh"
|
||||
@@ -85,6 +87,9 @@
|
||||
|
||||
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||
|
||||
#define GGML_LOG_WARN_ONCE(str) \
|
||||
{ static std::once_flag warn_flag; std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); }
|
||||
|
||||
[[noreturn]]
|
||||
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
||||
int id = -1; // in case cudaGetDevice fails
|
||||
@@ -1138,70 +1143,46 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
|
||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
// Communication context for multi-GPU AllReduce during tensor parallelism.
|
||||
//
|
||||
// Created once per meta backend instance. Resources for the selected mode
|
||||
// (NCCL communicators or the internal AllReduce pipeline) are initialised
|
||||
// eagerly during comm_init so any init failure surfaces at startup rather
|
||||
// than mid-run.
|
||||
struct ggml_backend_cuda_comm_context {
|
||||
using try_allreduce_fn = bool(*)(ggml_backend_cuda_comm_context *, struct ggml_tensor **);
|
||||
|
||||
std::vector<ggml_backend_t> backends;
|
||||
std::vector<ncclComm_t> comms;
|
||||
std::vector<int> dev_ids;
|
||||
|
||||
// Set by the init chain (comm_init_{nccl, internal, none}) to one of
|
||||
// try_allreduce_{nccl, internal, butterfly}. nccl needs `comms`,
|
||||
// internal needs `ar_pipeline`, butterfly needs nothing. Per-call
|
||||
// failures return false; the meta backend's generic implementation then
|
||||
// handles that call.
|
||||
try_allreduce_fn try_allreduce = nullptr;
|
||||
|
||||
ggml_cuda_ar_pipeline * ar_pipeline = nullptr;
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
std::vector<ncclComm_t> comms;
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
~ggml_backend_cuda_comm_context() {
|
||||
#ifdef GGML_USE_NCCL
|
||||
for (ncclComm_t comm : comms) {
|
||||
NCCL_CHECK(ncclCommDestroy(comm));
|
||||
}
|
||||
#endif // GGML_USE_NCCL
|
||||
ggml_cuda_ar_pipeline_free(ar_pipeline);
|
||||
}
|
||||
};
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return;
|
||||
}
|
||||
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
|
||||
delete comm_ctx;
|
||||
#else
|
||||
GGML_UNUSED(comm_ctx_v);
|
||||
#endif // GGML_USE_NCCL
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
if (!ggml_backend_is_cuda(backends[i])) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
|
||||
std::vector<int> dev_ids;
|
||||
ret->backends.reserve(n_backends);
|
||||
dev_ids.reserve(n_backends);
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
ret->backends.push_back(backends[i]);
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
|
||||
dev_ids.push_back(cuda_ctx->device);
|
||||
}
|
||||
|
||||
ret->comms.resize(n_backends);
|
||||
NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
|
||||
return ret;
|
||||
#else
|
||||
// If NCCL is installed it is used by default for optimal performance.
|
||||
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
|
||||
// RCCL is disabled by default, users are explicitly opting in.
|
||||
// Therefore print no warning for RCCL.
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
static bool warning_printed = false;
|
||||
if (!warning_printed) {
|
||||
GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
|
||||
warning_printed = true;
|
||||
}
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
GGML_UNUSED_VARS(backends, n_backends);
|
||||
return nullptr;
|
||||
#endif // GGML_USE_NCCL
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
// AllReduce via NCCL. Reduces as FP32 for small tensors and BF16 for large
|
||||
// tensors (bandwidth-bound), then converts back to FP32.
|
||||
static bool ggml_backend_cuda_comm_allreduce_nccl(
|
||||
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
|
||||
const int64_t ne = ggml_nelements(tensors[0]);
|
||||
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
|
||||
// This then causes a crash in this function
|
||||
@@ -1209,8 +1190,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_ASSERT(comm_ctx_v != nullptr);
|
||||
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
|
||||
const size_t n_backends = comm_ctx->backends.size();
|
||||
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
@@ -1235,7 +1214,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
|
||||
}
|
||||
NCCL_CHECK(ncclGroupEnd());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1274,10 +1252,184 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
}
|
||||
|
||||
return true;
|
||||
#else
|
||||
GGML_UNUSED_VARS(comm_ctx_v, tensors);
|
||||
return false;
|
||||
}
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
// Run the internal AR pipeline. Returns false on unsupported / failed input
|
||||
// -- the caller decides whether to abort (env-forced) or fall back silently.
|
||||
static bool ggml_backend_cuda_comm_allreduce_internal(
|
||||
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
|
||||
GGML_ASSERT(comm_ctx->ar_pipeline != nullptr);
|
||||
|
||||
const size_t n_backends = comm_ctx->backends.size();
|
||||
GGML_ASSERT(n_backends == 2);
|
||||
GGML_ASSERT(tensors[0] != nullptr);
|
||||
|
||||
const int64_t ne = ggml_nelements(tensors[0]);
|
||||
const ggml_type type = tensors[0]->type;
|
||||
|
||||
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
|
||||
GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ne == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
if (tensors[i] == nullptr) {
|
||||
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] is null\n", __func__, i);
|
||||
return false;
|
||||
}
|
||||
if (ggml_nelements(tensors[i]) != ne || tensors[i]->type != type) {
|
||||
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] ne=%" PRId64 " type=%d expected ne=%" PRId64 " type=%d\n",
|
||||
__func__, i, ggml_nelements(tensors[i]), (int) tensors[i]->type, ne, (int) type);
|
||||
return false;
|
||||
}
|
||||
if (!ggml_is_contiguously_allocated(tensors[i])) {
|
||||
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n",
|
||||
__func__, i, ne, ggml_nbytes(tensors[i]),
|
||||
(size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type);
|
||||
return false;
|
||||
}
|
||||
if (((uintptr_t) tensors[i]->data & 0xF) != 0) {
|
||||
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n",
|
||||
__func__, i, tensors[i]->data, (int) type, ne);
|
||||
return false;
|
||||
}
|
||||
GGML_ASSERT((ggml_nbytes(tensors[i]) & 0xF) == 0);
|
||||
}
|
||||
|
||||
return ggml_cuda_ar_allreduce(comm_ctx->ar_pipeline, comm_ctx->backends.data(), tensors);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per-call dispatch -- three variants, one per backend. Each is set as
|
||||
// comm_ctx->try_allreduce by the matching init step. Per-call failure
|
||||
// returns false; the meta backend's generic implementation handles that call.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
// try_allreduce variant for the NCCL mode: forwards the call unchanged to
// ggml_backend_cuda_comm_allreduce_nccl and passes its result back.
static bool ggml_backend_cuda_comm_try_allreduce_nccl(
        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
    const bool reduced = ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors);
    return reduced;
}
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
// try_allreduce variant for the internal AllReduce pipeline: forwards the call
// unchanged to ggml_backend_cuda_comm_allreduce_internal and passes its result back.
static bool ggml_backend_cuda_comm_try_allreduce_internal(
        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
    const bool reduced = ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors);
    return reduced;
}
|
||||
|
||||
// try_allreduce variant used when no CUDA-side AllReduce is set up. It ignores
// both arguments and always reports "not handled" by returning false.
static bool ggml_backend_cuda_comm_try_allreduce_butterfly(
        ggml_backend_cuda_comm_context * /*comm_ctx*/, struct ggml_tensor ** /*tensors*/) {
    constexpr bool handled = false;
    return handled;
}
|
||||
|
||||
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return;
|
||||
}
|
||||
delete static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Init -- chained nccl -> internal -> none. Each step tries to bring up its
|
||||
// resource; on failure it warns and recurses into the next step.
|
||||
// ---------------------------------------------------------------------------
|
||||
// Last step of the init chain: acquires no resources and installs the
// butterfly dispatch function, which declines every call.
static void ggml_backend_cuda_comm_init_none(ggml_backend_cuda_comm_context * ret) {
    ggml_backend_cuda_comm_context & ctx = *ret;
    ctx.try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
}
|
||||
|
||||
// Init step for the internal AllReduce pipeline.
//
// Creates the pipeline for the devices in ret->dev_ids. On success the internal
// dispatch function is installed. On failure the pending CUDA error is
// consumed, a warning is logged, and the chain continues with
// ggml_backend_cuda_comm_init_none.
static void ggml_backend_cuda_comm_init_internal(ggml_backend_cuda_comm_context * ret) {
    ret->ar_pipeline = ggml_cuda_ar_pipeline_init(ret->dev_ids.data(), ret->dev_ids.size());

    if (!ret->ar_pipeline) {
        // Read (and thereby reset) the CUDA last-error state left by the failed init.
        (void) cudaGetLastError();
        GGML_LOG_WARN("internal AllReduce init failed (n_devices != 2?); "
                      "falling back to meta-backend butterfly\n");
        ggml_backend_cuda_comm_init_none(ret);
        return;
    }

    ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal;
}
|
||||
|
||||
// Init step for NCCL.
//
// With GGML_USE_NCCL: creates one communicator per device in ret->dev_ids. On
// success the NCCL dispatch function is installed and the function returns.
// On failure `comms` is emptied and a warning with the NCCL error string is logged.
// Without GGML_USE_NCCL: logs a hint to rebuild with NCCL, except on HIP builds.
// In every non-success case the chain continues with the internal pipeline.
static void ggml_backend_cuda_comm_init_nccl(ggml_backend_cuda_comm_context * ret) {
#if defined(GGML_USE_NCCL)
    const size_t n_devices = ret->dev_ids.size();
    ret->comms.resize(n_devices);

    const ncclResult_t status = ncclCommInitAll(ret->comms.data(), (int) n_devices, ret->dev_ids.data());
    if (status == ncclSuccess) {
        ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
        return;
    }

    ret->comms.clear();
    GGML_LOG_WARN("NCCL init failed (%s); falling back to internal AllReduce\n",
                  ncclGetErrorString(status));
#elif !defined(GGML_USE_HIP)
    GGML_LOG_WARN("NCCL not compiled in; falling back to internal AllReduce. "
                  "Recompile with -DGGML_CUDA_NCCL=ON for best multi-GPU performance.\n");
#endif // GGML_USE_NCCL

    ggml_backend_cuda_comm_init_internal(ret);
}
|
||||
|
||||
// Top-level init. Picks one of the three init paths based on
|
||||
// GGML_CUDA_ALLREDUCE (or the platform default) and lets the chain handle
|
||||
// any fallback. Unrecognised env values warn and fall through to the
|
||||
// platform default.
|
||||
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
if (!ggml_backend_is_cuda(backends[i])) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto * ret = new ggml_backend_cuda_comm_context;
|
||||
ret->backends.assign(backends, backends + n_backends);
|
||||
ret->dev_ids.reserve(n_backends);
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
ret->dev_ids.push_back(static_cast<ggml_backend_cuda_context *>(backends[i]->context)->device);
|
||||
}
|
||||
|
||||
const char * env = getenv("GGML_CUDA_ALLREDUCE");
|
||||
if (!env) {
|
||||
// Platform default: Linux uses NCCL, otherwise (generally Windows) internal
|
||||
#if defined(__linux__)
|
||||
ggml_backend_cuda_comm_init_nccl(ret);
|
||||
#else
|
||||
ggml_backend_cuda_comm_init_internal(ret);
|
||||
#endif // defined(__linux__)
|
||||
} else {
|
||||
std::string env_str(env);
|
||||
if (env_str == "nccl") {
|
||||
ggml_backend_cuda_comm_init_nccl(ret);
|
||||
} else if (env_str == "internal") {
|
||||
ggml_backend_cuda_comm_init_internal(ret);
|
||||
} else if (env_str == "none") {
|
||||
ggml_backend_cuda_comm_init_none(ret);
|
||||
} else {
|
||||
GGML_LOG_WARN("unknown GGML_CUDA_ALLREDUCE value: %s\n", env);
|
||||
ggml_backend_cuda_comm_init_none(ret);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Top-level dispatch -- calls the function pointer chosen by comm_init.
|
||||
// Returns false to let the meta-backend's butterfly run.
|
||||
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return false;
|
||||
}
|
||||
auto * comm_ctx = static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
|
||||
return comm_ctx->try_allreduce(comm_ctx, tensors);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
|
||||
@@ -3757,6 +3909,35 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
|
||||
return 2;
|
||||
}
|
||||
|
||||
// Snake activation: y = x + sin(a*x)^2 * inv_b
|
||||
// Naive 5-op decomposition emitted by frontends: mul -> sin -> sqr -> mul -> add
|
||||
if (ggml_can_fuse_subgraph(cgraph, i,
|
||||
{ GGML_OP_MUL, GGML_OP_SIN, GGML_OP_SQR, GGML_OP_MUL, GGML_OP_ADD },
|
||||
{ i + 4 })) {
|
||||
const ggml_tensor * mul0 = cgraph->nodes[i];
|
||||
const ggml_tensor * sqr = cgraph->nodes[i + 2];
|
||||
const ggml_tensor * mul1 = cgraph->nodes[i + 3];
|
||||
ggml_tensor * add = cgraph->nodes[i + 4];
|
||||
|
||||
// x carries the full activation shape, a is the broadcast operand
|
||||
const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
|
||||
const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
|
||||
|
||||
// mul1 reads sqr and inv_b in either operand order
|
||||
const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
|
||||
|
||||
// closure check: the trailing add must read the same x as the leading mul
|
||||
const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0];
|
||||
|
||||
const bool type_ok = (x->type == GGML_TYPE_F32 || x->type == GGML_TYPE_F16 || x->type == GGML_TYPE_BF16);
|
||||
const bool shape_ok = ggml_are_same_shape(a, inv_b) && a->ne[0] == 1 && a->ne[1] == x->ne[1];
|
||||
|
||||
if (type_ok && shape_ok && x_in_add == x && add->type == x->type) {
|
||||
ggml_cuda_op_snake_fused(*cuda_ctx, x, a, inv_b, add);
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
// multi-(add or mul)
|
||||
if (node->op == GGML_OP_ADD || node->op == GGML_OP_MUL) {
|
||||
int n_fuse = 0;
|
||||
@@ -5434,6 +5615,9 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
char pci_bus_id[32] = {};
|
||||
CUDA_CHECK(cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), i));
|
||||
dev_ctx->pci_bus_id = pci_bus_id;
|
||||
for (char & c : dev_ctx->pci_bus_id) {
|
||||
c = std::tolower(c);
|
||||
}
|
||||
dev_ctx->op_offload_min_batch_size = min_batch_size;
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
#include "snake.cuh"
|
||||
#include "convert.cuh"
|
||||
|
||||
// Fused Snake activation: y = x + sin^2(a * x) * inv_b
|
||||
// x: [T, C] (T contiguous), a: [1, C], inv_b: [1, C]
|
||||
// Supports F32, F16, BF16 data with F32 compute.
|
||||
|
||||
template <typename T>
|
||||
static __global__ void snake_kernel(
|
||||
const T * __restrict__ x,
|
||||
const float * __restrict__ a,
|
||||
const float * __restrict__ inv_b,
|
||||
T * __restrict__ dst,
|
||||
const int total,
|
||||
const uint3 T_len_fastdiv) {
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx >= total) return;
|
||||
|
||||
const int c = (int) fastdiv((uint32_t) idx, T_len_fastdiv);
|
||||
|
||||
const float xi = ggml_cuda_cast<float>(x[idx]);
|
||||
const float s = sinf(a[c] * xi);
|
||||
dst[idx] = ggml_cuda_cast<T>(xi + s * s * inv_b[c]);
|
||||
}
|
||||
|
||||
// Internal launcher with explicit x/a/inv_b/dst tensors.
|
||||
// Shared by the public op (reads dst->src) and the fusion path (explicit args).
|
||||
static void launch_snake(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * x,
|
||||
const ggml_tensor * a,
|
||||
const ggml_tensor * inv_b,
|
||||
ggml_tensor * dst) {
|
||||
const float * a_d = (const float *)a->data;
|
||||
const float * inv_b_d = (const float *)inv_b->data;
|
||||
|
||||
const int T = (int)x->ne[0];
|
||||
const int C = (int)x->ne[1];
|
||||
const int total = T * C;
|
||||
const uint3 T_len_fastdiv = init_fastdiv_values((uint64_t) T);
|
||||
|
||||
const int block_size = 256;
|
||||
const int grid_size = (total + block_size - 1) / block_size;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
switch (x->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
snake_kernel<<<grid_size, block_size, 0, stream>>>(
|
||||
(const float *)x->data, a_d, inv_b_d, (float *)dst->data, total, T_len_fastdiv);
|
||||
} break;
|
||||
case GGML_TYPE_F16: {
|
||||
snake_kernel<<<grid_size, block_size, 0, stream>>>(
|
||||
(const half *)x->data, a_d, inv_b_d, (half *)dst->data, total, T_len_fastdiv);
|
||||
} break;
|
||||
case GGML_TYPE_BF16: {
|
||||
snake_kernel<<<grid_size, block_size, 0, stream>>>(
|
||||
(const nv_bfloat16 *)x->data, a_d, inv_b_d, (nv_bfloat16 *)dst->data, total, T_len_fastdiv);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("snake: unsupported type");
|
||||
}
|
||||
}
|
||||
|
||||
// Fusion entry: caller supplies x/a/inv_b explicitly from the matched
|
||||
// mul -> sin -> sqr -> mul -> add pattern. The dst is the trailing add output.
|
||||
void ggml_cuda_op_snake_fused(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * x,
|
||||
const ggml_tensor * a,
|
||||
const ggml_tensor * inv_b,
|
||||
ggml_tensor * dst) {
|
||||
launch_snake(ctx, x, a, inv_b, dst);
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
#include "common.cuh"
|
||||
|
||||
// Fusion entry point. Caller supplies x/a/inv_b explicitly.
|
||||
void ggml_cuda_op_snake_fused(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * x,
|
||||
const ggml_tensor * a,
|
||||
const ggml_tensor * inv_b,
|
||||
ggml_tensor * dst);
|
||||
@@ -2,4 +2,5 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 1, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
|
||||
|
||||
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 1, 8);
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 2, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
|
||||
|
||||
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 2, 8);
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 4, 16);
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
|
||||
|
||||
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 4, 8);
|
||||
|
||||
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(192, 128, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(512, 512, 8, 8);
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-tile.cuh"
|
||||
|
||||
DECL_FATTN_TILE_CASE(192, 128);
|
||||
@@ -3,7 +3,10 @@
|
||||
from glob import glob
|
||||
import os
|
||||
|
||||
HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 320, 512, 576]
|
||||
HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 192, 256, 320, 512, 576]
|
||||
|
||||
# DKQ -> DV override for asymmetric head dims.
|
||||
HEAD_SIZES_V_OVERRIDE = {576: 512, 320: 256, 192: 128}
|
||||
|
||||
TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_BF16"]
|
||||
|
||||
@@ -62,7 +65,7 @@ for filename in glob("*.cu"):
|
||||
os.remove(filename)
|
||||
|
||||
for head_size_kq in HEAD_SIZES_KQ:
|
||||
head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
|
||||
head_size_v = HEAD_SIZES_V_OVERRIDE.get(head_size_kq, head_size_kq)
|
||||
with open(f"fattn-tile-instance-dkq{head_size_kq}-dv{head_size_v}.cu", "w") as f:
|
||||
f.write(SOURCE_FATTN_TILE.format(head_size_kq=head_size_kq, head_size_v=head_size_v))
|
||||
|
||||
@@ -85,15 +88,17 @@ for ncols in [8, 16, 32, 64]:
|
||||
if head_size_kq == 72:
|
||||
continue
|
||||
# Skip compilation of unused ncols2 values for niche head sizes:
|
||||
if head_size_kq == 192 and ncols2 not in (8, 16): # MiMo-V2.5
|
||||
continue
|
||||
if head_size_kq == 320 and ncols2 != 32: # Mistral Small 4
|
||||
continue
|
||||
if head_size_kq == 512 and ncols2 not in (4, 8): # Gemma 4
|
||||
continue
|
||||
if head_size_kq == 576 and ncols2 not in (4, 16, 32): # Deepseek, GLM 4.7 Flash
|
||||
continue
|
||||
if head_size_kq not in (320, 576) and ncols2 in (16, 32):
|
||||
if head_size_kq not in (192, 320, 576) and ncols2 in (16, 32):
|
||||
continue
|
||||
head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
|
||||
head_size_v = HEAD_SIZES_V_OVERRIDE.get(head_size_kq, head_size_kq)
|
||||
f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))
|
||||
|
||||
for type in TYPES_MMQ:
|
||||
|
||||
@@ -2261,6 +2261,58 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * q = op->src[0];
|
||||
const struct ggml_tensor * k = op->src[1];
|
||||
const struct ggml_tensor * v = op->src[2];
|
||||
const struct ggml_tensor * g = op->src[3];
|
||||
const struct ggml_tensor * beta = op->src[4];
|
||||
const struct ggml_tensor * state = op->src[5];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
if (!q || !k || !v || !g || !beta || !state) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (q->type != GGML_TYPE_F32 || k->type != GGML_TYPE_F32 || v->type != GGML_TYPE_F32 ||
|
||||
g->type != GGML_TYPE_F32 || beta->type != GGML_TYPE_F32 || state->type != GGML_TYPE_F32 ||
|
||||
dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_is_contiguous_rows(q) || !ggml_is_contiguous_rows(k) || !ggml_is_contiguous_rows(v) ||
|
||||
!ggml_is_contiguous(g) || !ggml_is_contiguous(beta) || !ggml_is_contiguous(state) ||
|
||||
!ggml_is_contiguous(dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int64_t S_v = v->ne[0];
|
||||
const int64_t H = v->ne[1];
|
||||
const int64_t n_tokens = v->ne[2];
|
||||
const int64_t n_seqs = v->ne[3];
|
||||
|
||||
if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
|
||||
return false;
|
||||
}
|
||||
if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] <= 0 || k->ne[1] <= 0 ||
|
||||
q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] <= 0 || k->ne[3] <= 0 ||
|
||||
(n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
|
||||
return false;
|
||||
}
|
||||
if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
|
||||
return false;
|
||||
}
|
||||
if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
|
||||
return false;
|
||||
}
|
||||
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
@@ -2420,8 +2472,8 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: add support for non-contiguous elements within a row
|
||||
if (!ggml_is_contiguous_rows(src0) || !ggml_is_contiguous_rows(dst)) {
|
||||
// dst must be contiguous; src0 may be non-contiguous
|
||||
if (!ggml_is_contiguous(dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2777,32 +2829,34 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
|
||||
|
||||
static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
switch (t->op) {
|
||||
case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
|
||||
case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT;
|
||||
case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID;
|
||||
case GGML_OP_MUL: return HTP_OP_MUL;
|
||||
case GGML_OP_ADD: return HTP_OP_ADD;
|
||||
case GGML_OP_ADD_ID: return HTP_OP_ADD_ID;
|
||||
case GGML_OP_SUB: return HTP_OP_SUB;
|
||||
case GGML_OP_DIV: return HTP_OP_DIV;
|
||||
case GGML_OP_CPY: return HTP_OP_CPY;
|
||||
case GGML_OP_CONT: return HTP_OP_CPY;
|
||||
case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS;
|
||||
case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS;
|
||||
case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS;
|
||||
case GGML_OP_ARGSORT: return HTP_OP_ARGSORT;
|
||||
case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
|
||||
case GGML_OP_SCALE: return HTP_OP_SCALE;
|
||||
case GGML_OP_SQR: return HTP_OP_SQR;
|
||||
case GGML_OP_SQRT: return HTP_OP_SQRT;
|
||||
case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX;
|
||||
case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV;
|
||||
case GGML_OP_ROPE: return HTP_OP_ROPE;
|
||||
case GGML_OP_REPEAT: return HTP_OP_REPEAT;
|
||||
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
|
||||
case GGML_OP_FILL: return HTP_OP_FILL;
|
||||
case GGML_OP_DIAG: return HTP_OP_DIAG;
|
||||
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
|
||||
case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
|
||||
case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT;
|
||||
case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID;
|
||||
case GGML_OP_MUL: return HTP_OP_MUL;
|
||||
case GGML_OP_ADD: return HTP_OP_ADD;
|
||||
case GGML_OP_ADD_ID: return HTP_OP_ADD_ID;
|
||||
case GGML_OP_SUB: return HTP_OP_SUB;
|
||||
case GGML_OP_DIV: return HTP_OP_DIV;
|
||||
case GGML_OP_CPY: return HTP_OP_CPY;
|
||||
case GGML_OP_CONT: return HTP_OP_CPY;
|
||||
case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS;
|
||||
case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS;
|
||||
case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS;
|
||||
case GGML_OP_ARGSORT: return HTP_OP_ARGSORT;
|
||||
case GGML_OP_L2_NORM: return HTP_OP_L2_NORM;
|
||||
case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
|
||||
case GGML_OP_SCALE: return HTP_OP_SCALE;
|
||||
case GGML_OP_SQR: return HTP_OP_SQR;
|
||||
case GGML_OP_SQRT: return HTP_OP_SQRT;
|
||||
case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX;
|
||||
case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV;
|
||||
case GGML_OP_GATED_DELTA_NET: return HTP_OP_GATED_DELTA_NET;
|
||||
case GGML_OP_ROPE: return HTP_OP_ROPE;
|
||||
case GGML_OP_REPEAT: return HTP_OP_REPEAT;
|
||||
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
|
||||
case GGML_OP_FILL: return HTP_OP_FILL;
|
||||
case GGML_OP_DIAG: return HTP_OP_DIAG;
|
||||
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(t)) {
|
||||
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
|
||||
@@ -3253,6 +3307,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_add_id(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_L2_NORM:
|
||||
supp = ggml_hexagon_supported_unary(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SCALE:
|
||||
supp = ggml_hexagon_supported_unary(sess, op);
|
||||
@@ -3336,6 +3394,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_ssm_conv(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_GATED_DELTA_NET:
|
||||
supp = ggml_hexagon_supported_gated_delta_net(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_CUMSUM:
|
||||
supp = ggml_hexagon_supported_cumsum(sess, op);
|
||||
break;
|
||||
|
||||
@@ -37,6 +37,7 @@ add_library(${HTP_LIB} SHARED
|
||||
fill-ops.c
|
||||
diag-ops.c
|
||||
solve-tri-ops.c
|
||||
gated-delta-net-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
|
||||
@@ -0,0 +1,955 @@
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "hvx-utils.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
#define HTP_GDN_MAX_SV 128
|
||||
|
||||
// Per-op bookkeeping for the gated-delta-net kernels in this file.
// NOTE(review): the code that fills and reads these fields is outside this
// view; the field notes below are inferred from names only - confirm at the
// use sites.
struct htp_gdn_context {
    struct htp_ops_context * octx;                   // op context this work belongs to
    uint32_t                 rows_per_thread;        // presumably rows assigned to each worker thread
    size_t                   state_bytes;            // presumably size in bytes of one state buffer
    bool                     use_vtcm;               // presumably selects the VTCM-backed path
    uint8_t *                vtcm_state_base;        // presumably base of the VTCM state scratch area
    size_t                   vtcm_state_per_thread;  // presumably per-thread stride within that area
};
|
||||
|
||||
static inline float gdn_mul_dot_f32(float * restrict dst, const float * restrict mul,
|
||||
const float * restrict dot, uint32_t n) {
|
||||
HVX_Vector acc = Q6_V_vzero();
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vm = hvx_vmem(mul + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm);
|
||||
hvx_vmemu(dst + i * epv) = out;
|
||||
acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vm = hvx_vmem(mul + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm);
|
||||
hvx_vec_store_u(dst + off, tail * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
|
||||
acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
|
||||
}
|
||||
|
||||
return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
|
||||
}
|
||||
|
||||
static inline float gdn_mul_scalar_dot_f32(float * restrict dst, float mul,
|
||||
const float * restrict dot, uint32_t n) {
|
||||
HVX_Vector acc = Q6_V_vzero();
|
||||
const HVX_Vector vmul = hvx_vec_splat_f32(mul);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul);
|
||||
hvx_vmemu(dst + i * epv) = out;
|
||||
acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul);
|
||||
hvx_vec_store_u(dst + off, tail * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
|
||||
acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
|
||||
}
|
||||
|
||||
return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
|
||||
}
|
||||
|
||||
static inline float gdn_add_scaled_dot_f32(float * restrict dst, const float * restrict src,
|
||||
float scale, const float * restrict dot, uint32_t n) {
|
||||
HVX_Vector acc = Q6_V_vzero();
|
||||
const HVX_Vector vscale = hvx_vec_splat_f32(scale);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vs = hvx_vmem(src + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
|
||||
hvx_vmemu(dst + i * epv) = out;
|
||||
acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vs = hvx_vmem(src + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
|
||||
hvx_vec_store_u(dst + off, tail * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
|
||||
acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
|
||||
}
|
||||
|
||||
return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
|
||||
}
|
||||
|
||||
static inline void gdn_mul_dot4_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, const float * restrict mul,
|
||||
const float * restrict dot, uint32_t n, float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vm = hvx_vmem(mul + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vm);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vm);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vm);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vm);
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vm = hvx_vmem(mul + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vm);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vm);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vm);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
|
||||
}
|
||||
|
||||
static inline void gdn_mul_scalar_dot4_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, float mul,
|
||||
const float * restrict dot, uint32_t n, float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
const HVX_Vector vmul = hvx_vec_splat_f32(mul);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vmul);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vmul);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vmul);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vmul);
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vmul);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vmul);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vmul);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
|
||||
}
|
||||
|
||||
static inline void gdn_add_scaled_dot4_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, const float * restrict src,
|
||||
const float * restrict scale, const float * restrict dot, uint32_t n,
|
||||
float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
const HVX_Vector scale0 = hvx_vec_splat_f32(scale[0]);
|
||||
const HVX_Vector scale1 = hvx_vec_splat_f32(scale[1]);
|
||||
const HVX_Vector scale2 = hvx_vec_splat_f32(scale[2]);
|
||||
const HVX_Vector scale3 = hvx_vec_splat_f32(scale[3]);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vs = hvx_vmem(src + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + i * epv), hvx_vec_mul_f32_f32(vs, scale0));
|
||||
HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + i * epv), hvx_vec_mul_f32_f32(vs, scale1));
|
||||
HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + i * epv), hvx_vec_mul_f32_f32(vs, scale2));
|
||||
HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + i * epv), hvx_vec_mul_f32_f32(vs, scale3));
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vs = hvx_vmem(src + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0));
|
||||
HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + off), hvx_vec_mul_f32_f32(vs, scale1));
|
||||
HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + off), hvx_vec_mul_f32_f32(vs, scale2));
|
||||
HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + off), hvx_vec_mul_f32_f32(vs, scale3));
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
|
||||
}
|
||||
|
||||
static inline void gdn_mul_dot8_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, float * restrict dst4,
|
||||
float * restrict dst5, float * restrict dst6, float * restrict dst7,
|
||||
const float * restrict mul, const float * restrict dot, uint32_t n,
|
||||
float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
HVX_Vector acc4 = Q6_V_vzero();
|
||||
HVX_Vector acc5 = Q6_V_vzero();
|
||||
HVX_Vector acc6 = Q6_V_vzero();
|
||||
HVX_Vector acc7 = Q6_V_vzero();
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vm = hvx_vmem(mul + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vm);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vm);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vm);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vm);
|
||||
HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + i * epv), vm);
|
||||
HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + i * epv), vm);
|
||||
HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + i * epv), vm);
|
||||
HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + i * epv), vm);
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
hvx_vmemu(dst4 + i * epv) = out4;
|
||||
hvx_vmemu(dst5 + i * epv) = out5;
|
||||
hvx_vmemu(dst6 + i * epv) = out6;
|
||||
hvx_vmemu(dst7 + i * epv) = out7;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vm = hvx_vmem(mul + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vm);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vm);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vm);
|
||||
HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + off), vm);
|
||||
HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + off), vm);
|
||||
HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vm);
|
||||
HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vm);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
|
||||
hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
|
||||
hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
|
||||
}
|
||||
|
||||
static inline void gdn_mul_scalar_dot8_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, float * restrict dst4,
|
||||
float * restrict dst5, float * restrict dst6, float * restrict dst7,
|
||||
float mul, const float * restrict dot, uint32_t n, float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
HVX_Vector acc4 = Q6_V_vzero();
|
||||
HVX_Vector acc5 = Q6_V_vzero();
|
||||
HVX_Vector acc6 = Q6_V_vzero();
|
||||
HVX_Vector acc7 = Q6_V_vzero();
|
||||
const HVX_Vector vmul = hvx_vec_splat_f32(mul);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vmul);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vmul);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vmul);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vmul);
|
||||
HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + i * epv), vmul);
|
||||
HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + i * epv), vmul);
|
||||
HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + i * epv), vmul);
|
||||
HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + i * epv), vmul);
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
hvx_vmemu(dst4 + i * epv) = out4;
|
||||
hvx_vmemu(dst5 + i * epv) = out5;
|
||||
hvx_vmemu(dst6 + i * epv) = out6;
|
||||
hvx_vmemu(dst7 + i * epv) = out7;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul);
|
||||
HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vmul);
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vmul);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vmul);
|
||||
HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + off), vmul);
|
||||
HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + off), vmul);
|
||||
HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vmul);
|
||||
HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vmul);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
|
||||
hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
|
||||
hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
|
||||
}
|
||||
|
||||
static inline void gdn_add_scaled_dot8_f32(float * restrict dst0, float * restrict dst1,
|
||||
float * restrict dst2, float * restrict dst3, float * restrict dst4,
|
||||
float * restrict dst5, float * restrict dst6, float * restrict dst7,
|
||||
const float * restrict src, const float * restrict scale,
|
||||
const float * restrict dot, uint32_t n, float * restrict sums) {
|
||||
HVX_Vector acc0 = Q6_V_vzero();
|
||||
HVX_Vector acc1 = Q6_V_vzero();
|
||||
HVX_Vector acc2 = Q6_V_vzero();
|
||||
HVX_Vector acc3 = Q6_V_vzero();
|
||||
HVX_Vector acc4 = Q6_V_vzero();
|
||||
HVX_Vector acc5 = Q6_V_vzero();
|
||||
HVX_Vector acc6 = Q6_V_vzero();
|
||||
HVX_Vector acc7 = Q6_V_vzero();
|
||||
const HVX_Vector scale0 = hvx_vec_splat_f32(scale[0]);
|
||||
const HVX_Vector scale1 = hvx_vec_splat_f32(scale[1]);
|
||||
const HVX_Vector scale2 = hvx_vec_splat_f32(scale[2]);
|
||||
const HVX_Vector scale3 = hvx_vec_splat_f32(scale[3]);
|
||||
const HVX_Vector scale4 = hvx_vec_splat_f32(scale[4]);
|
||||
const HVX_Vector scale5 = hvx_vec_splat_f32(scale[5]);
|
||||
const HVX_Vector scale6 = hvx_vec_splat_f32(scale[6]);
|
||||
const HVX_Vector scale7 = hvx_vec_splat_f32(scale[7]);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vs = hvx_vmem(src + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + i * epv), hvx_vec_mul_f32_f32(vs, scale0));
|
||||
HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + i * epv), hvx_vec_mul_f32_f32(vs, scale1));
|
||||
HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + i * epv), hvx_vec_mul_f32_f32(vs, scale2));
|
||||
HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + i * epv), hvx_vec_mul_f32_f32(vs, scale3));
|
||||
HVX_Vector out4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + i * epv), hvx_vec_mul_f32_f32(vs, scale4));
|
||||
HVX_Vector out5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + i * epv), hvx_vec_mul_f32_f32(vs, scale5));
|
||||
HVX_Vector out6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + i * epv), hvx_vec_mul_f32_f32(vs, scale6));
|
||||
HVX_Vector out7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + i * epv), hvx_vec_mul_f32_f32(vs, scale7));
|
||||
|
||||
hvx_vmemu(dst0 + i * epv) = out0;
|
||||
hvx_vmemu(dst1 + i * epv) = out1;
|
||||
hvx_vmemu(dst2 + i * epv) = out2;
|
||||
hvx_vmemu(dst3 + i * epv) = out3;
|
||||
hvx_vmemu(dst4 + i * epv) = out4;
|
||||
hvx_vmemu(dst5 + i * epv) = out5;
|
||||
hvx_vmemu(dst6 + i * epv) = out6;
|
||||
hvx_vmemu(dst7 + i * epv) = out7;
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vs = hvx_vmem(src + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0));
|
||||
HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + off), hvx_vec_mul_f32_f32(vs, scale1));
|
||||
HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + off), hvx_vec_mul_f32_f32(vs, scale2));
|
||||
HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + off), hvx_vec_mul_f32_f32(vs, scale3));
|
||||
HVX_Vector out4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + off), hvx_vec_mul_f32_f32(vs, scale4));
|
||||
HVX_Vector out5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + off), hvx_vec_mul_f32_f32(vs, scale5));
|
||||
HVX_Vector out6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + off), hvx_vec_mul_f32_f32(vs, scale6));
|
||||
HVX_Vector out7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + off), hvx_vec_mul_f32_f32(vs, scale7));
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
|
||||
acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
|
||||
acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
|
||||
acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
|
||||
}
|
||||
|
||||
HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
|
||||
HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
|
||||
hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
|
||||
hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
|
||||
}
|
||||
|
||||
static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_gdn_context * gctx = (struct htp_gdn_context *) data;
|
||||
struct htp_ops_context * octx = gctx->octx;
|
||||
|
||||
const struct htp_tensor * q = octx->src[0];
|
||||
const struct htp_tensor * k = octx->src[1];
|
||||
const struct htp_tensor * v = octx->src[2];
|
||||
const struct htp_tensor * g = octx->src[3];
|
||||
const struct htp_tensor * beta = octx->src[4];
|
||||
const struct htp_tensor * state = octx->src[5];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
const uint32_t S_v = v->ne[0];
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_tokens = v->ne[2];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
|
||||
const uint32_t total_rows = H * n_seqs;
|
||||
if (ith >= total_rows) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint32_t rq3 = n_seqs / q->ne[3];
|
||||
const uint32_t rk3 = n_seqs / k->ne[3];
|
||||
const float scale = 1.0f / sqrtf((float) S_v);
|
||||
|
||||
float * dst_base = (float *) (uintptr_t) dst->data;
|
||||
float * state_out_base = dst_base + (uint64_t) S_v * H * n_tokens * n_seqs;
|
||||
const float * state_in_base = (const float *) (uintptr_t) state->data;
|
||||
|
||||
const bool kda = (g->ne[0] == S_v);
|
||||
float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_sums[4] __attribute__((aligned(128)));
|
||||
|
||||
for (uint32_t ir = ith; ir < total_rows; ir += nth) {
|
||||
const uint32_t iv1 = ir % H;
|
||||
const uint32_t iv3 = ir / H;
|
||||
|
||||
const uint32_t iq1 = iv1 % q->ne[1];
|
||||
const uint32_t ik1 = iv1 % k->ne[1];
|
||||
const uint32_t iq3 = iv3 / rq3;
|
||||
const uint32_t ik3 = iv3 / rk3;
|
||||
|
||||
float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
|
||||
memcpy(s_out, s_in, gctx->state_bytes);
|
||||
float * s_work = s_out;
|
||||
|
||||
float * attn_data = dst_base + ((uint64_t) iv3 * n_tokens * H + iv1) * S_v;
|
||||
|
||||
for (uint32_t t = 0; t < n_tokens; ++t) {
|
||||
const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data +
|
||||
(uint64_t) iq3 * q->nb[3] + (uint64_t) t * q->nb[2] + (uint64_t) iq1 * q->nb[1]);
|
||||
const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data +
|
||||
(uint64_t) ik3 * k->nb[3] + (uint64_t) t * k->nb[2] + (uint64_t) ik1 * k->nb[1]);
|
||||
const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data +
|
||||
(uint64_t) iv3 * v->nb[3] + (uint64_t) t * v->nb[2] + (uint64_t) iv1 * v->nb[1]);
|
||||
const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data +
|
||||
(uint64_t) iv3 * g->nb[3] + (uint64_t) t * g->nb[2] + (uint64_t) iv1 * g->nb[1]);
|
||||
const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +
|
||||
(uint64_t) iv3 * beta->nb[3] + (uint64_t) t * beta->nb[2] + (uint64_t) iv1 * beta->nb[1]);
|
||||
|
||||
memcpy(local_q, q_t, (size_t) S_v * sizeof(float));
|
||||
memcpy(local_k, k_t, (size_t) S_v * sizeof(float));
|
||||
|
||||
if (kda) {
|
||||
hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);
|
||||
|
||||
uint32_t j = 0;
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
}
|
||||
} else {
|
||||
const float gate = expf(g_t[0]);
|
||||
uint32_t j = 0;
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
}
|
||||
}
|
||||
|
||||
attn_data += (uint64_t) S_v * H;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_gdn_context * gctx = (struct htp_gdn_context *) data;
|
||||
struct htp_ops_context * octx = gctx->octx;
|
||||
|
||||
const struct htp_tensor * q = octx->src[0];
|
||||
const struct htp_tensor * k = octx->src[1];
|
||||
const struct htp_tensor * v = octx->src[2];
|
||||
const struct htp_tensor * g = octx->src[3];
|
||||
const struct htp_tensor * beta = octx->src[4];
|
||||
const struct htp_tensor * state = octx->src[5];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
const uint32_t S_v = v->ne[0];
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
|
||||
const uint32_t total_rows = H * n_seqs;
|
||||
if (ith >= total_rows) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint32_t rq3 = n_seqs / q->ne[3];
|
||||
const uint32_t rk3 = n_seqs / k->ne[3];
|
||||
const float scale = 1.0f / sqrtf((float) S_v);
|
||||
|
||||
float * dst_base = (float *) (uintptr_t) dst->data;
|
||||
float * state_out_base = dst_base + (uint64_t) S_v * H * n_seqs;
|
||||
const float * state_in_base = (const float *) (uintptr_t) state->data;
|
||||
|
||||
const bool kda = (g->ne[0] == S_v);
|
||||
float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_sums[8] __attribute__((aligned(128)));
|
||||
|
||||
dma_queue * dma = octx->ctx->dma[ith];
|
||||
|
||||
uint8_t * spad = NULL;
|
||||
if (gctx->use_vtcm) {
|
||||
spad = gctx->vtcm_state_base + gctx->vtcm_state_per_thread * ith;
|
||||
}
|
||||
|
||||
for (uint32_t ir = ith; ir < total_rows; ir += nth) {
|
||||
const uint32_t iv1 = ir % H;
|
||||
const uint32_t iv3 = ir / H;
|
||||
|
||||
const uint32_t iq1 = iv1 % q->ne[1];
|
||||
const uint32_t ik1 = iv1 % k->ne[1];
|
||||
const uint32_t iq3 = iv3 / rq3;
|
||||
const uint32_t ik3 = iv3 / rk3;
|
||||
|
||||
float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
float * s_work;
|
||||
|
||||
if (spad) {
|
||||
dma_queue_push(dma, dma_make_ptr(spad, s_in),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), S_v);
|
||||
dma_queue_pop(dma);
|
||||
s_work = (float *) spad;
|
||||
} else {
|
||||
s_work = s_out;
|
||||
memcpy(s_work, s_in, gctx->state_bytes);
|
||||
}
|
||||
|
||||
float * attn_data = dst_base + ((uint64_t) iv3 * H + iv1) * S_v;
|
||||
|
||||
const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data +
|
||||
(uint64_t) iq3 * q->nb[3] + (uint64_t) iq1 * q->nb[1]);
|
||||
const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data +
|
||||
(uint64_t) ik3 * k->nb[3] + (uint64_t) ik1 * k->nb[1]);
|
||||
const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data +
|
||||
(uint64_t) iv3 * v->nb[3] + (uint64_t) iv1 * v->nb[1]);
|
||||
const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data +
|
||||
(uint64_t) iv3 * g->nb[3] + (uint64_t) iv1 * g->nb[1]);
|
||||
const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +
|
||||
(uint64_t) iv3 * beta->nb[3] + (uint64_t) iv1 * beta->nb[1]);
|
||||
|
||||
memcpy(local_q, q_t, (size_t) S_v * sizeof(float));
|
||||
memcpy(local_k, k_t, (size_t) S_v * sizeof(float));
|
||||
|
||||
if (kda) {
|
||||
hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);
|
||||
|
||||
uint32_t j = 0;
|
||||
for (; j + 8 <= S_v; j += 8) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
float * row4 = s_work + (uint64_t) (j + 4) * S_v;
|
||||
float * row5 = s_work + (uint64_t) (j + 5) * S_v;
|
||||
float * row6 = s_work + (uint64_t) (j + 6) * S_v;
|
||||
float * row7 = s_work + (uint64_t) (j + 7) * S_v;
|
||||
gdn_mul_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[8] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
}
|
||||
} else {
|
||||
const float gate = expf(g_t[0]);
|
||||
uint32_t j = 0;
|
||||
for (; j + 8 <= S_v; j += 8) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
float * row4 = s_work + (uint64_t) (j + 4) * S_v;
|
||||
float * row5 = s_work + (uint64_t) (j + 5) * S_v;
|
||||
float * row6 = s_work + (uint64_t) (j + 6) * S_v;
|
||||
float * row7 = s_work + (uint64_t) (j + 7) * S_v;
|
||||
gdn_mul_scalar_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[8] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
}
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
}
|
||||
}
|
||||
|
||||
if (spad) {
|
||||
dma_queue_push(dma, dma_make_ptr(s_out, spad),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), S_v);
|
||||
dma_queue_pop(dma);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int op_gated_delta_net(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * q = octx->src[0];
|
||||
const struct htp_tensor * k = octx->src[1];
|
||||
const struct htp_tensor * v = octx->src[2];
|
||||
const struct htp_tensor * g = octx->src[3];
|
||||
const struct htp_tensor * beta = octx->src[4];
|
||||
const struct htp_tensor * state = octx->src[5];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
if (!q || !k || !v || !g || !beta || !state || !dst) {
|
||||
return HTP_STATUS_INVAL_PARAMS;
|
||||
}
|
||||
|
||||
if (q->type != HTP_TYPE_F32 || k->type != HTP_TYPE_F32 || v->type != HTP_TYPE_F32 ||
|
||||
g->type != HTP_TYPE_F32 || beta->type != HTP_TYPE_F32 || state->type != HTP_TYPE_F32 ||
|
||||
dst->type != HTP_TYPE_F32) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
const uint32_t S_v = v->ne[0];
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_tokens = v->ne[2];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
|
||||
if (S_v == 0 || S_v > HTP_GDN_MAX_SV || H == 0 || n_tokens == 0 || n_seqs == 0) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] == 0 || k->ne[1] == 0 ||
|
||||
q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] == 0 || k->ne[3] == 0 ||
|
||||
(n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if (state->ne[0] * state->ne[1] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
struct htp_gdn_context gctx;
|
||||
gctx.octx = octx;
|
||||
gctx.rows_per_thread = (H * n_seqs + octx->n_threads - 1) / octx->n_threads;
|
||||
gctx.state_bytes = (size_t) S_v * S_v * sizeof(float);
|
||||
|
||||
size_t state_aligned = (size_t) S_v * S_v * sizeof(float);
|
||||
state_aligned = (state_aligned + 127) & ~(size_t)127;
|
||||
|
||||
gctx.use_vtcm = false;
|
||||
gctx.vtcm_state_base = NULL;
|
||||
gctx.vtcm_state_per_thread = 0;
|
||||
|
||||
if (n_tokens == 1 && octx->ctx->vtcm_base) {
|
||||
size_t vtcm_total = state_aligned * octx->n_threads;
|
||||
if (octx->ctx->vtcm_size >= vtcm_total) {
|
||||
gctx.use_vtcm = true;
|
||||
gctx.vtcm_state_base = octx->ctx->vtcm_base;
|
||||
gctx.vtcm_state_per_thread = state_aligned;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_tokens == 1) {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_tg_thread, &gctx, octx->n_threads);
|
||||
} else {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_pp_thread, &gctx, octx->n_threads);
|
||||
}
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
@@ -106,5 +106,6 @@ int op_cumsum(struct htp_ops_context * octx);
|
||||
int op_fill(struct htp_ops_context * octx);
|
||||
int op_diag(struct htp_ops_context * octx);
|
||||
int op_solve_tri(struct htp_ops_context * octx);
|
||||
int op_gated_delta_net(struct htp_ops_context * octx);
|
||||
|
||||
#endif /* HTP_CTX_H */
|
||||
|
||||
@@ -83,6 +83,9 @@ enum htp_op_code {
|
||||
HTP_OP_FILL,
|
||||
HTP_OP_DIAG,
|
||||
HTP_OP_SOLVE_TRI,
|
||||
HTP_OP_L2_NORM,
|
||||
HTP_OP_GATED_DELTA_NET,
|
||||
|
||||
HTP_OP_INVALID
|
||||
};
|
||||
|
||||
|
||||
@@ -542,6 +542,7 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_UNARY_SIGMOID:
|
||||
case HTP_OP_UNARY_NEG:
|
||||
case HTP_OP_UNARY_EXP:
|
||||
case HTP_OP_L2_NORM:
|
||||
return op_unary(octx);
|
||||
|
||||
case HTP_OP_UNARY_SILU:
|
||||
@@ -593,6 +594,9 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_SOLVE_TRI:
|
||||
return op_solve_tri(octx);
|
||||
|
||||
case HTP_OP_GATED_DELTA_NET:
|
||||
return op_gated_delta_net(octx);
|
||||
|
||||
case HTP_OP_INVALID:
|
||||
break;
|
||||
|
||||
|
||||
@@ -298,6 +298,81 @@ static void softplus_f32(const float * restrict src,
|
||||
}
|
||||
}
|
||||
|
||||
// --- L2_NORM HVX kernel ---
|
||||
// Computes y[i] = x[i] / fmax(sqrt(sum(x[j]^2)), epsilon) for each row.
|
||||
// scale = 1/fmax(sqrt(sum), epsilon) is computed entirely in HVX registers
|
||||
// using rsqrt + inverse to avoid scalar extraction.
|
||||
static void hvx_fast_l2_norm_f32(const uint8_t * restrict src,
|
||||
uint8_t * restrict dst,
|
||||
uint8_t * restrict pad,
|
||||
const int num_elems,
|
||||
float epsilon) {
|
||||
(void)pad;
|
||||
|
||||
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
|
||||
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
|
||||
|
||||
HVX_Vector sum_v = hvx_vec_splat_f32(0.0f);
|
||||
|
||||
const int nvec = num_elems / VLEN_FP32;
|
||||
const int nloe = num_elems % VLEN_FP32;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < nvec; i++) {
|
||||
HVX_Vector v1 = v_src[i];
|
||||
HVX_Vector sq = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
|
||||
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, sq);
|
||||
}
|
||||
|
||||
// Include tail elements in the sum-of-squares using a predicate mask
|
||||
if (nloe > 0) {
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
|
||||
HVX_Vector sq = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
|
||||
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, sq);
|
||||
}
|
||||
|
||||
// Compute scale = 1/fmax(sqrt(sum), epsilon) entirely in HVX registers.
|
||||
// hvx_vec_rsqrt_f32 + hvx_vec_inverse_f32 avoids scalar extraction.
|
||||
HVX_Vector sum_sf = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
|
||||
HVX_Vector rsqrt_v = hvx_vec_rsqrt_f32(sum_sf); // 1/sqrt(sum)
|
||||
HVX_Vector sqrt_v = hvx_vec_inverse_f32(rsqrt_v); // sqrt(sum)
|
||||
HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon);
|
||||
HVX_Vector denom_v = Q6_Vsf_vmax_VsfVsf(sqrt_v, epsilon_v); // fmax(sqrt(sum), epsilon)
|
||||
HVX_Vector scale_v = hvx_vec_inverse_f32(denom_v); // 1/fmax(sqrt(sum), epsilon)
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < nvec; i++) {
|
||||
HVX_Vector v1 = v_src[i];
|
||||
v_dst[i] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(v1, scale_v));
|
||||
}
|
||||
|
||||
if (nloe > 0) {
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
|
||||
HVX_Vector result = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(v1, scale_v));
|
||||
hvx_vec_store_a(&v_dst[nvec], nloe * 4, result);
|
||||
}
|
||||
}
|
||||
|
||||
// Applies the L2-norm kernel to each of num_rows rows.
//
//   src       - first input row
//   dst       - first output row
//   spad      - forwarded to the row kernel (which ignores it)
//   num_rows  - number of rows to process
//   row_elems - f32 elements per row
//   row_size  - distance between consecutive rows, in bytes (same for src and dst)
//   op_params - the first 4 bytes hold epsilon as a float
static void l2_norm_f32(const float * restrict src,
                        float * restrict       dst,
                        uint8_t * restrict     spad,
                        const uint32_t         num_rows,
                        const uint32_t         row_elems,
                        const size_t           row_size,
                        int32_t *              op_params) {
    // epsilon is stored bit-for-bit in the integer parameter block.
    float epsilon = 0.f;
    memcpy(&epsilon, op_params, sizeof(float));

    const uint8_t * in_row  = (const uint8_t *) src;
    uint8_t *       out_row = (uint8_t *) dst;

    for (uint32_t remaining = num_rows; remaining > 0; --remaining) {
        hvx_fast_l2_norm_f32(in_row, out_row, spad, row_elems, epsilon);
        in_row += row_size;
        out_row += row_size;
    }
}
|
||||
|
||||
static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_unary_context * uctx = (const struct htp_unary_context *) data;
|
||||
struct htp_ops_context * octx = uctx->octx;
|
||||
@@ -402,6 +477,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
||||
case HTP_OP_UNARY_SOFTPLUS:
|
||||
softplus_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_L2_NORM:
|
||||
l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -469,6 +547,9 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
||||
case HTP_OP_UNARY_SOFTPLUS:
|
||||
op_type = "softplus-f32";
|
||||
break;
|
||||
case HTP_OP_L2_NORM:
|
||||
op_type = "l2norm-f32";
|
||||
break;
|
||||
|
||||
default:
|
||||
FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
|
||||
|
||||
@@ -102,6 +102,8 @@ set(GGML_OPENCL_KERNELS
|
||||
mul_mv_id_q8_0_f32_flat
|
||||
mul_mv_id_mxfp4_f32
|
||||
mul_mv_id_mxfp4_f32_flat
|
||||
gemm_moe_q4_0_f32_ns
|
||||
gemv_moe_q4_0_f32_ns
|
||||
gemm_moe_mxfp4_f32
|
||||
gemv_moe_mxfp4_f32
|
||||
gemm_moe_mxfp4_f32_ns
|
||||
|
||||
@@ -542,6 +542,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_mul_mm_f16_f32_kq;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
|
||||
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
|
||||
cl_kernel kernel_convert_block_q4_0_trans4_ns, kernel_restore_block_q4_0_trans4_ns;
|
||||
cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1;
|
||||
cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
|
||||
cl_kernel kernel_convert_block_mxfp4_trans4_ns, kernel_restore_block_mxfp4_trans4_ns;
|
||||
@@ -600,6 +601,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_conv_2d_f16_f32;
|
||||
cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4;
|
||||
cl_kernel kernel_timestep_embedding;
|
||||
cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
|
||||
cl_kernel kernel_gemv_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns;
|
||||
cl_kernel kernel_moe_reorder_b;
|
||||
@@ -950,6 +952,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1", &err), err));
|
||||
@@ -2884,6 +2888,40 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_moe_q4_0_f32_ns
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemv_moe_q4_0_f32_ns.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemv_moe_q4_0_f32_ns.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemv_moe_q4_0_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q4_0_f32_ns", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_moe_q4_0_f32_ns
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_moe_q4_0_f32_ns.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_moe_q4_0_f32_ns.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemm_moe_q4_0_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q4_0_f32_ns", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_moe_mxfp4_f32_ns
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -3657,11 +3695,14 @@ struct ggml_tensor_extra_cl_q4_0 {
|
||||
CL_CHECK(clReleaseMemObject(d));
|
||||
d = nullptr;
|
||||
}
|
||||
if (q_img != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(q_img));
|
||||
q_img = nullptr;
|
||||
}
|
||||
// Currently, q_img and d_img are only initialized when SMALL_ALLOC is
|
||||
// enabled. They point to the images in ggml_backend_opencl_buffer_context.
|
||||
// So, there is no need to release them here.
|
||||
// TODO: initialize them for non SMALL_PATH path, or remove them.
|
||||
q_img = nullptr;
|
||||
d_img = nullptr;
|
||||
size_q = 0;
|
||||
size_d = 0;
|
||||
@@ -4926,17 +4967,53 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
//cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Adreno moe q4_0 kernel needs special transpose and unshuffling
|
||||
if (use_adreno_moe_kernels(backend_ctx, tensor)) {
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0_trans4_ns;
|
||||
|
||||
int ne00 = tensor->ne[0];
|
||||
int ne01 = tensor->ne[1];
|
||||
int ne02 = tensor->ne[2];
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
|
||||
|
||||
size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
|
||||
size_t local_work_size[3] = {64, 2, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
// Create image for Q
|
||||
cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32};
|
||||
cl_image_desc img_desc_q = {
|
||||
CL_MEM_OBJECT_IMAGE1D_BUFFER,
|
||||
static_cast<size_t>(ggml_nelements(tensor) / 8),
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
{ extra->q }
|
||||
};
|
||||
extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
|
||||
tensor->extra = extra;
|
||||
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
|
||||
|
||||
// The optimized kernels need weights in natural order, so unshuffle.
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
|
||||
}
|
||||
#else
|
||||
#else
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
|
||||
@@ -4952,7 +5029,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
tensor->extra = extra;
|
||||
|
||||
// transpose the weights and scales
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Only do transpose for large, non batched matrix
|
||||
// TODO: use preallocated images instead of sub-buffer then image
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
@@ -4966,10 +5043,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
// Transpose d as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
return;
|
||||
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q4_1) {
|
||||
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
||||
@@ -5689,6 +5764,36 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_moe_kernels(backend_ctx, tensor)) {
|
||||
cl_int err;
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0_trans4_ns;
|
||||
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
int ne00 = tensor->ne[0];
|
||||
int ne01 = tensor->ne[1];
|
||||
int ne02 = tensor->ne[2];
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
|
||||
|
||||
size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
|
||||
size_t local_work_size[3] = {64, 2, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clEnqueueReadBuffer(
|
||||
queue, data_device, CL_TRUE, offset,
|
||||
size, data, 0, NULL, NULL));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
ggml_cl_buffer buf_trans_q;
|
||||
ggml_cl_buffer buf_trans_d;
|
||||
@@ -12811,6 +12916,179 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
// subgroup mat vec
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0: {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_moe_kernels(backend_ctx, src0)) {
|
||||
cl_int status;
|
||||
|
||||
size_t local_size[3] = {64, 2, 1};
|
||||
size_t global_size[3] = {64, 2, 1};
|
||||
|
||||
if (ne12 == 1) { // for gemv
|
||||
kernel = backend_ctx->kernel_gemv_moe_q4_0_f32_ns;
|
||||
|
||||
cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
|
||||
|
||||
// create a sub_buffer for src2
|
||||
cl_buffer_region region;
|
||||
region.origin = offset2;
|
||||
region.size = ne20 * ne21 * sizeof(int);
|
||||
buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// set thread grid
|
||||
global_size[0] = static_cast<size_t>(ne01);
|
||||
global_size[1] = 4;
|
||||
global_size[2] = static_cast<size_t>(ne20);
|
||||
local_size[1] = 4;
|
||||
|
||||
// create a sub_buffer for src1
|
||||
region.origin = offset1;
|
||||
region.size = ne10 * ne11 * ne12 * sizeof(float);
|
||||
src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// create image for src1
|
||||
cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
|
||||
cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
|
||||
buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Set kernel args
|
||||
int arg_idx = 0;
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11));
|
||||
|
||||
// launch kernel
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
|
||||
|
||||
// deallocate sub buffers and images
|
||||
CL_CHECK(clReleaseMemObject(src1_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(buf_src1_image));
|
||||
CL_CHECK(clReleaseMemObject(buf_src2));
|
||||
|
||||
} else { // for gemm
|
||||
kernel = backend_ctx->kernel_gemm_moe_q4_0_f32_ns;
|
||||
|
||||
// Reorder router if called from test-backend-ops or when new router is generated.
|
||||
// Otherwise reuse the reordered result from previous mul_mat_id call.
|
||||
if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) {
|
||||
moe_router_reoerder(backend, src2, ne20);
|
||||
backend_ctx->toggle_reorder = false;
|
||||
}
|
||||
|
||||
cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image;
|
||||
cl_mem buf_src2, buf_src2_emap;
|
||||
|
||||
cl_buffer_region region;
|
||||
region.origin = 0;
|
||||
region.size = sizeof(int) * max_post_router_tile * n_tile_size;
|
||||
buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
region.origin = 0;
|
||||
region.size = sizeof(short) * max_post_router_tile;
|
||||
buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Reorder activations
|
||||
// create a sub_buffer for src1
|
||||
region.origin = offset1;
|
||||
region.size = ne10 * ne11 * ne12 * sizeof(float);
|
||||
sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Create image for reordered src1
|
||||
// Use pre-allocated placeholder
|
||||
region.origin = 0;
|
||||
region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float);
|
||||
backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size);
|
||||
buf_src1_reordered = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_act_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
cl_image_format image_format_buf_src1;
|
||||
cl_image_desc image_desc_buf_src1;
|
||||
image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
|
||||
image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
|
||||
image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
unsigned short map_ratio = ne20 / ne11;
|
||||
GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n");
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer)));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size));
|
||||
|
||||
size_t reorder_b_local_size[3] = {256, 1, 1};
|
||||
size_t reorder_b_global_size[3] = {static_cast<size_t>(((ne00 / 4) + 255) / 256 * 256), static_cast<size_t>(max_post_router_tile * n_tile_size), 1};
|
||||
|
||||
// Dispatch reorder kernel
|
||||
backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst);
|
||||
|
||||
// MoE kernel prepare
|
||||
// Create sub buffer for dst
|
||||
region.origin = offsetd;
|
||||
region.size = ne0 * ne1 * ne2 * sizeof(float);
|
||||
sub_buf_dst = clCreateSubBuffer(
|
||||
extrad->data_device,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
// Create image for dst
|
||||
cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT};
|
||||
cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}};
|
||||
buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Set kernel args
|
||||
int arg_idx = 0;
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->q_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer)));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));
|
||||
|
||||
// set thread grid
|
||||
global_size[1] = static_cast<size_t>((ne01 + 63) / 64);
|
||||
global_size[2] = static_cast<size_t>(max_post_router_tile);
|
||||
local_size[1] = 1;
|
||||
local_size[2] = 1;
|
||||
|
||||
// Dispatch kernel
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
|
||||
|
||||
clReleaseMemObject(sub_buf_src1_pre);
|
||||
clReleaseMemObject(buf_src1_reordered);
|
||||
clReleaseMemObject(image_src1_reordered);
|
||||
clReleaseMemObject(buf_src2);
|
||||
clReleaseMemObject(buf_src2_emap);
|
||||
clReleaseMemObject(sub_buf_dst);
|
||||
clReleaseMemObject(buf_dst_image);
|
||||
}
|
||||
return;
|
||||
} // fallback to generic Q4_0 MoE kernel
|
||||
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
|
||||
@@ -190,6 +190,92 @@ kernel void kernel_restore_block_q4_0_noshuffle(
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
// kernel_convert_block_q4_0_trans4_ns
// Split block_q4_0 (AOS) into separate scale / quant arrays (SOA), transposing
// both so that consecutive rows are adjacent in memory, and regrouping the
// nibbles of every block.
//
// One work-item converts one block:
//   get_global_id(0) -> i01, row inside the expert matrix   [0, ne01)
//   get_global_id(1) -> i00, block index along the row      [0, ne00 / QK4_0)
//   get_global_id(2) -> i02, expert (matrix) index
//
// Output layout, with ne00_blk = ne00 / QK4_0:
//   dst_d[(i02 * ne00_blk + i00) * ne01 + i01]              block scale
//   dst_q[((i02 * ne00_blk + i00) * 4 + w) * ne01 + i01]    32-bit word w (0..3) of the block
//------------------------------------------------------------------------------
kernel void kernel_convert_block_q4_0_trans4_ns(
    global struct block_q4_0 * src0,
    __global uint * dst_q,
    __global half * dst_d,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk = ne00 / QK4_0;

    // The launch grid may be padded up to a multiple of the work-group size;
    // padding work-items must not touch memory.
    if (i01 >= ne01 || i00 >= ne00_blk) {
        return;
    }

    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    global struct block_q4_0 * b = src0 + src_blk_offset;
    dst_d[dst_blk_offset] = b->d;

    // Fetch the 16 quant bytes of the block in one vector load.
    ushort8 pre_block  = ((global ushort8 *)(&(b->qs[0])))[0];
    ushort8 post_block = (ushort8)(0);

    uchar * pre_block_ptr  = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&post_block);

    // Unshuffle: for every source byte pair (2i, 2i+1) the two low nibbles are
    // packed into output byte i and the two high nibbles into output byte i + 8.
    // NOTE(review): presumably this yields natural element order for the ggml
    // block_q4_0 packing -- confirm against the struct definition.
    for (int i = 0; i < QK4_0 / 4; ++i) {
        uchar x0 = pre_block_ptr[2*i + 0];
        uchar x1 = pre_block_ptr[2*i + 1];

        post_block_ptr[i + 0        ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        post_block_ptr[i + QK4_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    uint4 q_block = as_uint4(post_block);

    // The four words of a block are stored ne01 elements apart (transposed).
    uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    dst_q[offset]            = q_block.x;
    dst_q[offset + ne01]     = q_block.y;
    dst_q[offset + ne01 * 2] = q_block.z;
    dst_q[offset + ne01 * 3] = q_block.w;
}
|
||||
|
||||
//------------------------------------------------------------------------------
// kernel_restore_block_q4_0_trans4_ns
// Inverse of the trans4_ns conversion: gather the transposed scale / quant
// arrays (SOA) back into block_q4_0 (AOS) and re-interleave the nibbles.
//
// One work-item restores one block:
//   get_global_id(0) -> i01, row inside the expert matrix   [0, ne01)
//   get_global_id(1) -> i00, block index along the row      [0, ne00 / QK4_0)
//   get_global_id(2) -> i02, expert (matrix) index
//------------------------------------------------------------------------------
kernel void kernel_restore_block_q4_0_trans4_ns(
    __global uint * src_q,
    __global half * src_d,
    __global struct block_q4_0 * dst0,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk = ne00 / QK4_0;

    // The launch grid may be padded up to a multiple of the work-group size;
    // padding work-items must not touch memory.
    if (i01 >= ne01 || i00 >= ne00_blk) {
        return;
    }

    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint src_d_offset   = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    __global struct block_q4_0 * b = dst0 + dst_blk_offset;
    b->d = src_d[src_d_offset];

    // collect transposed quantization parts for a block:
    // the four 32-bit words of a block are stored ne01 elements apart
    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    uint4 q_block;
    q_block.x = src_q[src_q_offset];
    q_block.y = src_q[src_q_offset + ne01];
    q_block.z = src_q[src_q_offset + ne01 * 2];
    q_block.w = src_q[src_q_offset + ne01 * 3];

    ushort8 post_block = as_ushort8(q_block);
    ushort8 pre_block  = (ushort8)(0);

    uchar * pre_block_ptr  = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&post_block);

    // Re-shuffle: bytes i and i + 8 of the stored block are split into nibbles
    // and recombined into bytes 2i (low nibbles) and 2i + 1 (high nibbles).
    for (int i = 0; i < QK4_0 / 4; ++i) {
        uchar x0 = post_block_ptr[i + 0];
        uchar x1 = post_block_ptr[i + QK4_0 / 4];

        pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    // Write the 16 quant bytes back in one vector store.
    ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_q4_1
|
||||
// Convert the block_q4_1 format to 2 separate arrays (AOS -> SOA).
|
||||
|
||||
@@ -0,0 +1,252 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
|
||||
|
||||
#define TILESIZE_K 16
|
||||
#define TILESIZE_M 64
|
||||
#define TILESIZE_N 32
|
||||
|
||||
|
||||
// Extract 4-bit field `n` (0 = least significant) of a 16-bit word and remove
// the Q4_0 bias of 8. The result is a signed int in [-8, 7].
#define q4_0_nibble_m8(w, n) ((((w) >> (4 * (n))) & 0xF) - 8)

// Dequantize 16 Q4_0 values held in the four 16-bit lanes of `q4` into the
// sixteen half lanes of `a_f16`:
//   a_f16 lane (4 * w + n) = (nibble n of q4.s<w> - 8) * scale
// Statement macro; `q4` is expanded once per lane, so pass an expression
// without side effects.
#define dequantize_q4_0(q4, a_f16, scale) \
    (a_f16).s0 = (half)q4_0_nibble_m8((q4).s0, 0) * (scale); \
    (a_f16).s1 = (half)q4_0_nibble_m8((q4).s0, 1) * (scale); \
    (a_f16).s2 = (half)q4_0_nibble_m8((q4).s0, 2) * (scale); \
    (a_f16).s3 = (half)q4_0_nibble_m8((q4).s0, 3) * (scale); \
    (a_f16).s4 = (half)q4_0_nibble_m8((q4).s1, 0) * (scale); \
    (a_f16).s5 = (half)q4_0_nibble_m8((q4).s1, 1) * (scale); \
    (a_f16).s6 = (half)q4_0_nibble_m8((q4).s1, 2) * (scale); \
    (a_f16).s7 = (half)q4_0_nibble_m8((q4).s1, 3) * (scale); \
    (a_f16).s8 = (half)q4_0_nibble_m8((q4).s2, 0) * (scale); \
    (a_f16).s9 = (half)q4_0_nibble_m8((q4).s2, 1) * (scale); \
    (a_f16).sa = (half)q4_0_nibble_m8((q4).s2, 2) * (scale); \
    (a_f16).sb = (half)q4_0_nibble_m8((q4).s2, 3) * (scale); \
    (a_f16).sc = (half)q4_0_nibble_m8((q4).s3, 0) * (scale); \
    (a_f16).sd = (half)q4_0_nibble_m8((q4).s3, 1) * (scale); \
    (a_f16).se = (half)q4_0_nibble_m8((q4).s3, 2) * (scale); \
    (a_f16).sf = (half)q4_0_nibble_m8((q4).s3, 3) * (scale);
|
||||
|
||||
|
||||
// Helper: acc.s<j> OP dot(a4, b_lm[base + j]) for j = 0..15, where OP is `=`
// or `+=`. Uses the half16 variable `acc` of the calling scope.
#define dotx16_rows_(OP, a4, b_lm, base) \
    acc.s0 OP dot(a4, b_lm[(base) +  0]); \
    acc.s1 OP dot(a4, b_lm[(base) +  1]); \
    acc.s2 OP dot(a4, b_lm[(base) +  2]); \
    acc.s3 OP dot(a4, b_lm[(base) +  3]); \
    acc.s4 OP dot(a4, b_lm[(base) +  4]); \
    acc.s5 OP dot(a4, b_lm[(base) +  5]); \
    acc.s6 OP dot(a4, b_lm[(base) +  6]); \
    acc.s7 OP dot(a4, b_lm[(base) +  7]); \
    acc.s8 OP dot(a4, b_lm[(base) +  8]); \
    acc.s9 OP dot(a4, b_lm[(base) +  9]); \
    acc.sa OP dot(a4, b_lm[(base) + 10]); \
    acc.sb OP dot(a4, b_lm[(base) + 11]); \
    acc.sc OP dot(a4, b_lm[(base) + 12]); \
    acc.sd OP dot(a4, b_lm[(base) + 13]); \
    acc.se OP dot(a4, b_lm[(base) + 14]); \
    acc.sf OP dot(a4, b_lm[(base) + 15]);

// Helper: widen the half partial sums in `acc` to float and add them to c_reg.
#define dotx16_flush_(c_reg) \
    c_reg.lo += convert_float8(acc.lo); \
    c_reg.hi += convert_float8(acc.hi);

// Accumulate 16 outputs into the float16 `c_reg`. For output j the 16 values
// of `a_reg` are multiplied with the half4 entries
//   b_lm[lm_offset + j], b_lm[lm_offset + 32 + j],
//   b_lm[lm_offset + 64 + j], b_lm[lm_offset + 96 + j].
// Partial sums stay in half for 8 products at a time and are then added to the
// float accumulator. Requires a half16 variable named `acc` in the calling
// scope, which is overwritten.
#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
    dotx16_rows_(=,  a_reg.s0123, b_lm, lm_offset +  0) \
    dotx16_rows_(+=, a_reg.s4567, b_lm, lm_offset + 32) \
    dotx16_flush_(c_reg) \
    dotx16_rows_(=,  a_reg.s89ab, b_lm, lm_offset + 64) \
    dotx16_rows_(+=, a_reg.scdef, b_lm, lm_offset + 96) \
    dotx16_flush_(c_reg)
|
||||
|
||||
|
||||
// MoE GEMM: Q4_0 weights in the transposed trans4_ns layout times reordered
// f32 activations, with the results scattered through the post-router index.
//
// Indexing used below:
//   get_local_id(0)  : one weight row inside a TILESIZE_M tile
//   get_global_id(1) : M tile index
//   get_global_id(2) : N tile index; tiles >= total_tiles[0] exit immediately
//
// Arguments:
//   src0_q      quantized weights, read one 32-bit texel at a time
//   src0_d      block scales
//   src1        activations, read as float4 texels
//   src2        TILESIZE_N output column indices per N tile; 0xFFFFFFFF marks padding
//   src2_emap   expert id of every N tile
//   dst         output image, one float per texel
//   total_tiles total_tiles[0] is the number of valid N tiles
//   ne00, ne01  K and M extent of one expert matrix
//
// NOTE(review): the early return for rows >= ne01 happens before the barriers
// and before the shared_b tile is filled cooperatively, so a partially filled
// M tile looks unsupported -- confirm that callers guarantee ne01 % TILESIZE_M == 0.
__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair
kernel void kernel_gemm_moe_q4_0_f32_ns(
        __read_only  image1d_buffer_t src0_q,
        __global half * src0_d,
        __read_only  image1d_buffer_t src1,
        __global uint * src2,
        __global ushort * src2_emap,
        __write_only image1d_buffer_t dst,
        __global int * total_tiles,
        uint ne00,
        uint ne01
) {
    const uint tile_m = get_global_id(1);
    const uint tile_n = get_global_id(2);

    // Boundary checks: row outside the matrix, or tile beyond the valid count
    if ((get_global_id(0) + tile_m * TILESIZE_M) >= ne01) {
        return;
    }
    if (tile_n >= total_tiles[0]) {
        return;
    }

    __private half16  a_tile;
    __private float32 c_tile = (float32)(0);
    __local   half4   b_tile[128];

    const ushort expert = src2_emap[tile_n];

    const uint row0 = tile_m * TILESIZE_M;
    const uint col0 = tile_n * TILESIZE_N;

    const uint lid = get_local_id(0);

    // Each work-item fetches two float4 of B per sub-block (columns lid/4 and
    // lid/4 + 16, K chunk lid%4) and stores them at the matching local slots.
    const uint b_src_lo = ((lid & 3) << 2) + (lid >> 2) * ne00;
    const uint b_src_hi = b_src_lo + (16 * ne00);
    const uint b_dst_lo = (lid & 3) * 32 + (lid >> 2);
    const uint b_dst_hi = b_dst_lo + 16;

    // Offsets that do not change along K
    const uint expert_elems = expert * ne00 * ne01;
    const uint expert_q     = expert_elems >> 3;   // in 32-bit quant words
    const uint expert_d     = expert_elems >> 5;   // in blocks
    const uint b_col        = col0 * ne00;

    half16 acc; // scratch used by dotx16_reduce8

    // Walk the K axis one Q4_0 block (32 elements) at a time; every block is
    // processed as two sub-blocks of TILESIZE_K elements sharing one scale.
    for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
        // Scale of the current block for this row
        uint d_idx = row0 + ((ne01 * step) >> 5) + expert_d + get_global_id(0);
        const half scale = src0_d[d_idx];

        #pragma unroll
        for (uint sub = 0; sub < 2; ++sub) {
            const uint k     = step + sub * TILESIZE_K;
            const uint q_idx = row0 + ((ne01 * k) >> 3) + expert_q + lid;
            const uint b_idx = b_col + k;

            // 16 quants (two 32-bit words, stored ne01 apart)
            uint2 q4x16;
            q4x16.x = read_imageui(src0_q, q_idx).x;
            q4x16.y = read_imageui(src0_q, q_idx + ne01).x;

            // This work-item's share (8 floats) of the 16x32 tile of B
            float8 b_f32;
            b_f32.lo = read_imagef(src1, (b_idx + b_src_lo) / 4);
            b_f32.hi = read_imagef(src1, (b_idx + b_src_hi) / 4);

            // Convert to half and publish in local memory for the sub-group
            const half8 b_f16 = convert_half8(b_f32);
            b_tile[b_dst_lo] = b_f16.lo;
            b_tile[b_dst_hi] = b_f16.hi;

            dequantize_q4_0(as_ushort4(q4x16), a_tile, scale);

            sub_group_barrier(CLK_LOCAL_MEM_FENCE);

            // 32 dot products of length 16; half partial sums are flushed to
            // float after every 8 products for better precision
            dotx16_reduce8(a_tile, b_tile, c_tile.lo, 0);
            dotx16_reduce8(a_tile, b_tile, c_tile.hi, 16);
        }
    }

    // Load the post-router indices of this tile and share them in local memory.
    // Padding entries are redirected to the first column of the tile.
    __local uint out_idx[TILESIZE_N];

    if (get_local_id(0) < TILESIZE_N) {
        uint idx = src2[tile_n * TILESIZE_N + get_local_id(0)];
        if (idx == 0xFFFFFFFF) {
            idx = src2[tile_n * TILESIZE_N + 0];
        }
        out_idx[get_local_id(0)] = idx * ne01;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Scatter results back to their original position in the output grid
    uint m_offset = row0 + get_local_id(0);

#define GEMM_MOE_SCATTER_(n, comp) write_imagef(dst, out_idx[n] + m_offset, (c_tile.comp))
    GEMM_MOE_SCATTER_( 1, s1);
    GEMM_MOE_SCATTER_( 2, s2);
    GEMM_MOE_SCATTER_( 3, s3);
    GEMM_MOE_SCATTER_( 4, s4);
    GEMM_MOE_SCATTER_( 5, s5);
    GEMM_MOE_SCATTER_( 6, s6);
    GEMM_MOE_SCATTER_( 7, s7);
    GEMM_MOE_SCATTER_( 8, s8);
    GEMM_MOE_SCATTER_( 9, s9);
    GEMM_MOE_SCATTER_(10, sa);
    GEMM_MOE_SCATTER_(11, sb);
    GEMM_MOE_SCATTER_(12, sc);
    GEMM_MOE_SCATTER_(13, sd);
    GEMM_MOE_SCATTER_(14, se);
    GEMM_MOE_SCATTER_(15, sf);
    GEMM_MOE_SCATTER_(16, sg);
    GEMM_MOE_SCATTER_(17, sh);
    GEMM_MOE_SCATTER_(18, si);
    GEMM_MOE_SCATTER_(19, sj);
    GEMM_MOE_SCATTER_(20, sk);
    GEMM_MOE_SCATTER_(21, sl);
    GEMM_MOE_SCATTER_(22, sm);
    GEMM_MOE_SCATTER_(23, sn);
    GEMM_MOE_SCATTER_(24, so);
    GEMM_MOE_SCATTER_(25, sp);
    GEMM_MOE_SCATTER_(26, sq);
    GEMM_MOE_SCATTER_(27, sr);
    GEMM_MOE_SCATTER_(28, ss);
    GEMM_MOE_SCATTER_(29, st);
    GEMM_MOE_SCATTER_(30, su);
    GEMM_MOE_SCATTER_(31, sv);

    // Padding columns were written to the slot of the tile's first output;
    // write the real first column last so that it overrides them.
    barrier(CLK_GLOBAL_MEM_FENCE);
    GEMM_MOE_SCATTER_( 0, s0);
#undef GEMM_MOE_SCATTER_
}
|
||||
@@ -0,0 +1,116 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
|
||||
#define QK_Q4_0 32
|
||||
#define N_SIMDGROUP 4
|
||||
#define SIMDGROUP_WIDTH 64
|
||||
|
||||
// Expand the eight 4-bit fields packed in two 16-bit words into eight floats,
// removing the Q4_0 bias of 8. Lanes 0..3 come from q4x8.s0 and lanes 4..7
// from q4x8.s1, least significant nibble first. The block scale is NOT applied.
static inline float8 q4_0_to_fp32_packed8(ushort2 q4x8) {
    const int w0 = q4x8.s0;
    const int w1 = q4x8.s1;

    return (float8)(
        (float)(( w0        & 0xF) - 8),
        (float)(((w0 >>  4) & 0xF) - 8),
        (float)(((w0 >>  8) & 0xF) - 8),
        (float)(((w0 >> 12) & 0xF) - 8),
        (float)(( w1        & 0xF) - 8),
        (float)(((w1 >>  4) & 0xF) - 8),
        (float)(((w1 >>  8) & 0xF) - 8),
        (float)(((w1 >> 12) & 0xF) - 8));
}
|
||||
|
||||
|
||||
// MoE GEMV: Q4_0 weights in the transposed trans4_ns layout times one f32
// activation vector per routed expert.
//
// Indexing used below:
//   get_global_id(0) : output row (i01)
//   get_local_id(1)  : K slice; slice s handles blocks s, s + N_SIMDGROUP, ...
//   get_global_id(2) : routed slot (i20); src2[i20] is the expert to use
//
// Arguments:
//   src0_q      quantized weights, four 32-bit words per block stored ne01 apart
//   src0_d      block scales
//   src1        activations, read as float4 texels
//   src2        expert id per routed slot
//   dst/offsetd output buffer and its byte offset
//   ne00, ne01  K and M extent of one expert matrix
//   ne11        number of activation rows; slot i20 reads row i20 % ne11
//
// The final reduction assumes N_SIMDGROUP (4) K slices and indexes local
// memory with the sub-group lane id, SIMDGROUP_WIDTH entries per slice.
__attribute__((qcom_reqd_sub_group_size("half")))
__kernel void kernel_gemv_moe_q4_0_f32_ns(
        __global uint * src0_q,
        __global half * src0_d,
        __read_only image1d_buffer_t src1,
        __global uint * src2,
        __global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne11
) {
    const uint row  = get_global_id(0);
    const uint slot = get_global_id(2);
    const uint wave = get_local_id(1);
    const uint lane = get_sub_group_local_id();

    const uint act_row = slot % ne11;

    const uint expert      = src2[slot];
    const uint expert_base = expert * ne00 * ne01 / 32;   // in blocks

    __private float sum = 0.0f; // partial sum of one output per work-item

    // Walk K in block granularity; every slice skips N_SIMDGROUP blocks per step
    for (uint blk = wave; blk < (ne00 / QK_Q4_0); blk += N_SIMDGROUP) {

        // Gather the four words of this row's block
        const uint q_idx = expert_base * 4 + blk * ne01 * 4 + row;

        uint4 q;
        q.s0 = src0_q[q_idx];
        q.s1 = src0_q[q_idx + ne01];
        q.s2 = src0_q[q_idx + ne01 * 2];
        q.s3 = src0_q[q_idx + ne01 * 3];

        // First of the eight float4 activation texels matching this block
        const uint y_idx = act_row * ne00 / 4 + blk * 8;

        float8 w = q4_0_to_fp32_packed8(as_ushort2(q.s0));
        float4 acc = read_imagef(src1, (y_idx + 0)) * w.lo;
        acc += read_imagef(src1, (y_idx + 1)) * w.hi;

        w = q4_0_to_fp32_packed8(as_ushort2(q.s1));
        acc += read_imagef(src1, (y_idx + 2)) * w.lo;
        acc += read_imagef(src1, (y_idx + 3)) * w.hi;

        w = q4_0_to_fp32_packed8(as_ushort2(q.s2));
        acc += read_imagef(src1, (y_idx + 4)) * w.lo;
        acc += read_imagef(src1, (y_idx + 5)) * w.hi;

        w = q4_0_to_fp32_packed8(as_ushort2(q.s3));
        acc += read_imagef(src1, (y_idx + 6)) * w.lo;
        acc += read_imagef(src1, (y_idx + 7)) * w.hi;

        // Apply the block scale once to the horizontally reduced product
        const half scale = src0_d[blk * ne01 + row + expert_base];
        sum += (float)(scale) * ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
    }

    // Reduction in local memory: slices 1..3 publish, slice 0 accumulates
    __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];

    if (wave >= 1 && wave < N_SIMDGROUP) {
        reduceLM[SIMDGROUP_WIDTH * (wave - 1) + lane] = sum;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // One output per work-item of slice 0
    if (wave == 0) {
        sum += reduceLM[SIMDGROUP_WIDTH * 0 + lane];
        sum += reduceLM[SIMDGROUP_WIDTH * 1 + lane];
        sum += reduceLM[SIMDGROUP_WIDTH * 2 + lane];

        dst = dst + (offsetd >> 2);
        dst[row + slot * ne01] = sum;
    }
}
|
||||
@@ -135,7 +135,11 @@ endif()
|
||||
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
|
||||
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
|
||||
if (NOT GGML_SYCL_DEVICE_ARCH)
|
||||
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
|
||||
else()
|
||||
message(STATUS "Skipping -ze-intel-greater-than-4GB-buffer-required for spir64_gen AOT")
|
||||
endif()
|
||||
|
||||
# Link against Intel oneMKL
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
@@ -160,7 +164,15 @@ if (GGML_SYCL_HOST_MEM_FALLBACK)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL_DEVICE_ARCH)
|
||||
target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
|
||||
target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
|
||||
message(STATUS "GGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} (AOT via spir64_gen)")
|
||||
target_compile_options(
|
||||
ggml-sycl PRIVATE
|
||||
-fsycl-targets=spir64_gen
|
||||
"SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""
|
||||
)
|
||||
target_link_options(
|
||||
ggml-sycl PRIVATE
|
||||
-fsycl-targets=spir64_gen
|
||||
"SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "presets.hpp"
|
||||
#include "type.hpp"
|
||||
#include "sycl_hw.hpp"
|
||||
#include "fattn-buffers.hpp"
|
||||
|
||||
namespace syclexp = sycl::ext::oneapi::experimental;
|
||||
|
||||
@@ -404,12 +405,16 @@ struct ggml_backend_sycl_context {
|
||||
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
|
||||
std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;
|
||||
|
||||
std::unique_ptr<ggml_sycl_fattn_kv_buffers> fattn_bufs[GGML_SYCL_MAX_DEVICES];
|
||||
|
||||
std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
|
||||
|
||||
static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
|
||||
|
||||
static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);
|
||||
|
||||
static std::unique_ptr<ggml_sycl_fattn_kv_buffers> new_fattn_kv_buffers(queue_ptr qptr, int device);
|
||||
|
||||
ggml_sycl_pool & pool(int device) {
|
||||
if (pools[device] == nullptr) {
|
||||
pools[device] = new_pool_for_device(stream(device,0), device);
|
||||
@@ -421,6 +426,17 @@ struct ggml_backend_sycl_context {
|
||||
return pool(device);
|
||||
}
|
||||
|
||||
// Returns the flash-attention K/V buffer set for `device`, creating it on first use
// with that device's stream 0.
ggml_sycl_fattn_kv_buffers & fattn_buffers(int device) {
    auto & slot = fattn_bufs[device];
    if (!slot) {
        slot = new_fattn_kv_buffers(stream(device, 0), device);
    }
    return *slot;
}
|
||||
|
||||
// Convenience overload: flash-attention K/V buffers of this context's own device.
ggml_sycl_fattn_kv_buffers & fattn_buffers() {
    const int own_device = device;
    return fattn_buffers(own_device);
}
|
||||
|
||||
#ifdef GGML_SYCL_GRAPH
|
||||
std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
|
||||
#endif
|
||||
|
||||
@@ -252,6 +252,23 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_q5_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(K_SCALE_SIZE), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q5_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::queue_ptr stream) {
|
||||
@@ -643,7 +660,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
||||
return dequantize_row_q4_K_sycl;
|
||||
}
|
||||
case GGML_TYPE_Q5_K:
|
||||
return dequantize_row_q5_K_sycl;
|
||||
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
return dequantize_row_q5_K_sycl_reorder;
|
||||
} else {
|
||||
return dequantize_row_q5_K_sycl;
|
||||
}
|
||||
case GGML_TYPE_Q6_K:
|
||||
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
return dequantize_row_q6_K_sycl_reorder;
|
||||
@@ -718,7 +739,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
return dequantize_row_q4_K_sycl;
|
||||
}
|
||||
case GGML_TYPE_Q5_K:
|
||||
return dequantize_row_q5_K_sycl;
|
||||
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
return dequantize_row_q5_K_sycl_reorder;
|
||||
} else {
|
||||
return dequantize_row_q5_K_sycl;
|
||||
}
|
||||
case GGML_TYPE_Q6_K:
|
||||
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
return dequantize_row_q6_K_sycl_reorder;
|
||||
|
||||
@@ -537,6 +537,63 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_block_q5_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
uint8_t * scales_local, const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
|
||||
const int64_t ib = item_ct1.get_group(2);
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 64 threads
|
||||
const int64_t tid = item_ct1.get_local_id(2);
|
||||
const int64_t il = tid / 16; // 0...3
|
||||
const int64_t ir = tid % 16; // 0...15
|
||||
const int64_t is = 2 * il;
|
||||
|
||||
dst_t * y = yy + ib * QK_K + 64 * il + 2 * ir;
|
||||
|
||||
const uint8_t * base = static_cast<const uint8_t *>(vx);
|
||||
|
||||
// Reordered layout: [qs (QK_K/2 per block)] [qh (QK_K/8 per block)] [scales (K_SCALE_SIZE per block)] [dm (half2 per block)]
|
||||
const size_t qs_offset = ib * (QK_K / 2);
|
||||
const size_t qh_offset = n_blocks * (QK_K / 2) + ib * (QK_K / 8);
|
||||
const size_t scales_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + ib * K_SCALE_SIZE;
|
||||
const size_t dm_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + n_blocks * K_SCALE_SIZE + ib * sizeof(ggml_half2);
|
||||
|
||||
const uint8_t * qs_ptr = base + qs_offset;
|
||||
const uint8_t * qh_ptr = base + qh_offset;
|
||||
const uint8_t * scales_ptr = base + scales_offset;
|
||||
const ggml_half2 dm_values = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
|
||||
|
||||
const float dall = dm_values.x();
|
||||
const float dmin = dm_values.y();
|
||||
|
||||
const uint8_t * ql = qs_ptr + 32 * il + 2 * ir;
|
||||
const uint8_t * qh = qh_ptr + 2 * ir;
|
||||
|
||||
if (tid < K_SCALE_SIZE) {
|
||||
scales_local[tid] = scales_ptr[tid];
|
||||
}
|
||||
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
uint8_t sc, m;
|
||||
get_scale_min_k4(is + 0, scales_local, sc, m);
|
||||
const float d1 = dall * sc; const float m1 = dmin * m;
|
||||
get_scale_min_k4(is + 1, scales_local, sc, m);
|
||||
const float d2 = dall * sc; const float m2 = dmin * m;
|
||||
|
||||
uint8_t hm = 1 << (2 * il);
|
||||
y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
|
||||
y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
|
||||
hm <<= 1;
|
||||
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
||||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||
#else
|
||||
GGML_UNUSED(ib); GGML_UNUSED(tid); GGML_UNUSED(yy); GGML_UNUSED(scales_local); GGML_UNUSED(n_blocks);
|
||||
GGML_ABORT("Q5_K reorder dequantize not supported for QK_K != 256");
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2025 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
// Makes sure the buffer can hold at least `n_elems` sycl::half values and returns its
// device pointer.
//
// If the current allocation is already large enough it is returned unchanged (this may
// be nullptr when nothing has been allocated and nothing is requested). Otherwise the
// old allocation is released and a new one, rounded up to a whole number of CHUNK_SIZE
// chunks, is made. The previous contents are NOT copied over. Aborts if the device
// allocation fails.
sycl::half * ggml_sycl_fattn_kv_buffers::kv_buffer::ensure_half(size_t n_elems) {
    const size_t need_bytes = n_elems * sizeof(sycl::half);

    if (capacity >= need_bytes) {
        return ptr;
    }

    if (ptr) {
        // Drain the queue first so no in-flight work still references the old allocation.
        SYCL_CHECK(CHECK_TRY_ERROR(qptr->wait()));
        SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
        ptr      = nullptr;
        capacity = 0;
    }

    // Round the request up to the next multiple of CHUNK_SIZE (need_bytes > 0 here).
    const size_t n_chunks = need_bytes / CHUNK_SIZE + (need_bytes % CHUNK_SIZE != 0 ? 1 : 0);
    const size_t cap      = n_chunks * CHUNK_SIZE;

    void * dev_ptr = nullptr;
    SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = sycl::malloc_device(cap, *qptr)));

    if (!dev_ptr) {
        // %zu is the portable conversion for size_t (%lu is wrong where long is 32-bit).
        GGML_LOG_ERROR("%s: can't allocate %zu Bytes of memory on device\n", __func__, cap);
        GGML_ABORT("fattn buffer alloc failed");
    }

    ptr      = static_cast<sycl::half *>(dev_ptr);
    capacity = cap;
    return ptr;
}
|
||||
|
||||
// Releases the device allocation, if any. With DEBUG_SYCL_POOL defined the final
// capacity is logged first.
ggml_sycl_fattn_kv_buffers::kv_buffer::~kv_buffer() {
#ifdef DEBUG_SYCL_POOL
    GGML_LOG_INFO("ggml_sycl_fattn_kv_buffer[%d]: %.2f MiB\n", device, capacity / 1024.0 / 1024.0);
#endif
    if (ptr == nullptr) {
        return;  // nothing was ever allocated
    }
    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
}
|
||||
@@ -0,0 +1,63 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2025 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_FATTN_BUFFERS_HPP
|
||||
#define GGML_SYCL_FATTN_BUFFERS_HPP
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
typedef sycl::queue *queue_ptr;
|
||||
|
||||
struct ggml_sycl_fattn_kv_buffers {
|
||||
// buffers grow in chunks of this size
|
||||
static constexpr size_t CHUNK_SIZE = 16ull << 20; // 16 MiB
|
||||
|
||||
struct kv_buffer {
|
||||
kv_buffer(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}
|
||||
~kv_buffer();
|
||||
|
||||
kv_buffer(const kv_buffer &) = delete;
|
||||
kv_buffer & operator=(const kv_buffer &) = delete;
|
||||
|
||||
sycl::half * ensure_half(size_t n_elems);
|
||||
|
||||
private:
|
||||
sycl::half * ptr = nullptr;
|
||||
size_t capacity = 0;
|
||||
queue_ptr qptr = nullptr;
|
||||
[[maybe_unused]] int device = 0;
|
||||
};
|
||||
|
||||
kv_buffer K;
|
||||
kv_buffer V;
|
||||
|
||||
ggml_sycl_fattn_kv_buffers(queue_ptr qptr, int device) : K(qptr, device), V(qptr, device) {}
|
||||
|
||||
ggml_sycl_fattn_kv_buffers(const ggml_sycl_fattn_kv_buffers &) = delete;
|
||||
ggml_sycl_fattn_kv_buffers & operator=(const ggml_sycl_fattn_kv_buffers &) = delete;
|
||||
};
|
||||
|
||||
/**
|
||||
* Imitates `ggml_sycl_pool_alloc` to keep the code calling alloc unchanged.
|
||||
*/
|
||||
struct ggml_sycl_fattn_alloc {
|
||||
ggml_sycl_fattn_kv_buffers::kv_buffer & buf;
|
||||
sycl::half * ptr = nullptr;
|
||||
|
||||
explicit ggml_sycl_fattn_alloc(ggml_sycl_fattn_kv_buffers::kv_buffer & buf_) : buf(buf_) {}
|
||||
|
||||
sycl::half * alloc(size_t n_elems) {
|
||||
ptr = buf.ensure_half(n_elems);
|
||||
return ptr;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
@@ -5,6 +5,7 @@
|
||||
#include "common.hpp"
|
||||
#include "convert.hpp"
|
||||
#include "vecdotq.hpp"
|
||||
#include "fattn-buffers.hpp"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
@@ -918,12 +919,13 @@ void launch_fattn(
|
||||
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
|
||||
|
||||
ggml_sycl_pool & pool = ctx.pool();
|
||||
ggml_sycl_fattn_kv_buffers & fbuf = ctx.fattn_buffers();
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
const int id = ggml_sycl_get_device();
|
||||
const int nsm = ggml_sycl_info().devices[id].nsm;
|
||||
|
||||
ggml_sycl_pool_alloc<sycl::half> K_f16(pool);
|
||||
ggml_sycl_pool_alloc<sycl::half> V_f16(pool);
|
||||
ggml_sycl_fattn_alloc K_f16(fbuf.K);
|
||||
ggml_sycl_fattn_alloc V_f16(fbuf.V);
|
||||
ggml_sycl_pool_alloc<int> KV_max(pool);
|
||||
ggml_sycl_pool_alloc<float> dst_tmp(pool);
|
||||
ggml_sycl_pool_alloc<sycl::float2> dst_tmp_meta(pool);
|
||||
|
||||
@@ -183,6 +183,10 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::half *)dst->src[0]->data,
|
||||
src1_i32, (float *)dst->data, ctx.stream());
|
||||
break;
|
||||
case GGML_TYPE_BF16:
|
||||
get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::ext::oneapi::bfloat16 *)dst->src[0]->data,
|
||||
src1_i32, (float *)dst->data, ctx.stream());
|
||||
break;
|
||||
case GGML_TYPE_F32:
|
||||
get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
|
||||
src1_i32, (float *)dst->data, ctx.stream());
|
||||
|
||||
@@ -1286,6 +1286,23 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
||||
explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
|
||||
|
||||
~ggml_sycl_pool_leg() {
|
||||
#ifdef DEBUG_SYCL_POOL
|
||||
int n_cached = 0;
|
||||
size_t bytes_cached = 0;
|
||||
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
||||
if (buffer_pool[i].ptr != nullptr) {
|
||||
++n_cached;
|
||||
bytes_cached += buffer_pool[i].size;
|
||||
}
|
||||
}
|
||||
GGML_LOG_INFO("%s: %d buffers, cached = %.2f MiB\n", __func__,
|
||||
n_cached, bytes_cached / 1024.0 / 1024.0);
|
||||
const auto slots = format_slots_in_alloc_order();
|
||||
if (!slots.empty()) {
|
||||
GGML_LOG_INFO("%s: slots MiB: %s\n", __func__, slots.c_str());
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
||||
ggml_sycl_buffer & b = buffer_pool[i];
|
||||
if (b.ptr != nullptr) {
|
||||
@@ -1296,6 +1313,26 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
||||
GGML_ASSERT(pool_size == 0);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_SYCL_POOL
|
||||
std::string format_slots_in_alloc_order() const {
|
||||
std::string line;
|
||||
char buf[32];
|
||||
bool first = true;
|
||||
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
||||
if (buffer_pool[i].ptr == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (!first) {
|
||||
line += '/';
|
||||
}
|
||||
first = false;
|
||||
snprintf(buf, sizeof(buf), "%.2f", buffer_pool[i].size / 1024.0 / 1024.0);
|
||||
line += buf;
|
||||
}
|
||||
return line;
|
||||
}
|
||||
#endif
|
||||
|
||||
void * alloc(size_t size, size_t * actual_size) override {
|
||||
#ifdef DEBUG_sycl_MALLOC
|
||||
int nnz = 0;
|
||||
@@ -1459,6 +1496,10 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(q
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
|
||||
}
|
||||
|
||||
// Factory for the flash-attention K/V buffer set bound to the given queue and device.
std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
    return std::make_unique<ggml_sycl_fattn_kv_buffers>(qptr, device);
}
|
||||
|
||||
// TBD pool with virtual memory management
|
||||
// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
|
||||
|
||||
@@ -3303,6 +3344,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
return !g_ggml_sycl_prioritize_dmmv;
|
||||
default:
|
||||
@@ -3325,6 +3367,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
return true;
|
||||
default:
|
||||
@@ -3541,6 +3584,54 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(size % sizeof(block_q5_K) == 0);
|
||||
GGML_ASSERT(offset % sizeof(block_q5_K) == 0);
|
||||
|
||||
const int nblocks = size / sizeof(block_q5_K);
|
||||
|
||||
sycl_reorder_temp_buffer tmp(stream, size);
|
||||
if (!tmp) {
|
||||
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
||||
return false;
|
||||
}
|
||||
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
||||
|
||||
sycl::event copy_event;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
copy_event.wait();
|
||||
}
|
||||
|
||||
auto * qs_ptr = data_device;
|
||||
auto * qh_ptr = qs_ptr + (QK_K / 2) * nblocks;
|
||||
auto * scales_ptr = qh_ptr + (QK_K / 8) * nblocks;
|
||||
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
|
||||
|
||||
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
|
||||
const block_q5_K * x = (const block_q5_K *) tmp_buf;
|
||||
const int ib = i;
|
||||
|
||||
for (int j = 0; j < QK_K / 2; ++j) {
|
||||
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
|
||||
}
|
||||
|
||||
for (int j = 0; j < QK_K / 8; ++j) {
|
||||
qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
|
||||
}
|
||||
|
||||
for (int j = 0; j < K_SCALE_SIZE; ++j) {
|
||||
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
|
||||
}
|
||||
|
||||
dm_ptr[ib] = x[ib].dm;
|
||||
});
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
reorder_event.wait_and_throw();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(size % sizeof(block_q6_K) == 0);
|
||||
GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
|
||||
@@ -3607,6 +3698,8 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
||||
return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
|
||||
case GGML_TYPE_Q4_K:
|
||||
return reorder_qw_q4_k(data_device, size, 0, stream);
|
||||
case GGML_TYPE_Q5_K:
|
||||
return reorder_qw_q5_k(data_device, size, 0, stream);
|
||||
case GGML_TYPE_Q6_K:
|
||||
return reorder_qw_q6_k(data_device, size, 0, stream);
|
||||
default:
|
||||
@@ -4922,6 +5015,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
{
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
@@ -5104,11 +5198,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_ACC:
|
||||
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||
case GGML_OP_PAD:
|
||||
// TODO: add circular padding support for syscl, see https://github.com/ggml-org/llama.cpp/pull/16985
|
||||
if (ggml_get_op_params_i32(op, 8) != 0) {
|
||||
return false;
|
||||
}
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
return true;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
|
||||
@@ -839,6 +839,26 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
}
|
||||
}
|
||||
|
||||
static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
|
||||
const int nrows, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K>>(vx, vy, dst, ncols,
|
||||
nrows, nd_item);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
|
||||
const int nrows, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
@@ -1125,6 +1145,7 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl\n");
|
||||
reorder_mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
} else {
|
||||
GGML_SYCL_DEBUG("Calling mul_mat_vec_q8_0_q8_1_sycl\n");
|
||||
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
@@ -1145,7 +1166,14 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q5_k_q8_1_sycl\n");
|
||||
reorder_mul_mat_vec_q5_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
} else {
|
||||
GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_K_q8_1_sycl\n");
|
||||
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
||||
|
||||
+27
-27
@@ -13,7 +13,8 @@
|
||||
//#include "common.hpp"
|
||||
#include "pad.hpp"
|
||||
|
||||
static void pad_f32(const float * src, float * dst,
|
||||
static void pad_f32(const float * src, size_t s00, size_t s01, size_t s02, size_t s03,
|
||||
float * dst,
|
||||
const int lp0, const int rp0, const int lp1, const int rp1,
|
||||
const int lp2, const int rp2, const int lp3, const int rp3,
|
||||
const int ne0, const int ne1, const int ne2, const int ne3,
|
||||
@@ -27,7 +28,6 @@ static void pad_f32(const float * src, float * dst,
|
||||
return;
|
||||
}
|
||||
|
||||
// operation
|
||||
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
|
||||
if ((i0 >= lp0 && i0 < ne0 - rp0) &&
|
||||
(i1 >= lp1 && i1 < ne1 - rp1) &&
|
||||
@@ -37,12 +37,8 @@ static void pad_f32(const float * src, float * dst,
|
||||
const int64_t i01 = i1 - lp1;
|
||||
const int64_t i02 = i2 - lp2;
|
||||
const int64_t i03 = i3 - lp3;
|
||||
const int64_t ne02 = ne2 - lp2 - rp2;
|
||||
const int64_t ne01 = ne1 - lp1 - rp1;
|
||||
const int64_t ne00 = ne0 - lp0 - rp0;
|
||||
|
||||
const int64_t src_idx = i03 * (ne00 * ne01 * ne02) +
|
||||
i02 * (ne00 * ne01) + i01 * ne00 + i00;
|
||||
const int64_t src_idx = i03 * s03 + i02 * s02 + i01 * s01 + i00 * s00;
|
||||
|
||||
dst[dst_idx] = src[src_idx];
|
||||
} else {
|
||||
@@ -50,20 +46,19 @@ static void pad_f32(const float * src, float * dst,
|
||||
}
|
||||
}
|
||||
|
||||
static void pad_f32_sycl(const float *src, float *dst, const int lp0,
|
||||
const int rp0, const int lp1, const int rp1,
|
||||
const int lp2, const int rp2, const int lp3,
|
||||
const int rp3, const int ne0, const int ne1,
|
||||
const int ne2, const int ne3,
|
||||
static void pad_f32_sycl(const float * src, size_t s00, size_t s01, size_t s02, size_t s03,
|
||||
float * dst, const int lp0, const int rp0, const int lp1, const int rp1,
|
||||
const int lp2, const int rp2, const int lp3, const int rp3,
|
||||
const int ne0, const int ne1, const int ne2, const int ne3,
|
||||
dpct::queue_ptr stream) {
|
||||
int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
|
||||
dpct::dim3 gridDim(num_blocks, ne1, ne2 * ne3);
|
||||
sycl::range<3> grid(ne2 * ne3, ne1, num_blocks);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
|
||||
sycl::nd_range<3>(grid * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
pad_f32(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1,
|
||||
ne2, ne3, item_ct1);
|
||||
pad_f32(src, s00, s01, s02, s03, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
|
||||
ne0, ne1, ne2, ne3, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -71,22 +66,27 @@ void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
dpct::queue_ptr stream = ctx.stream();
|
||||
dpct::queue_ptr stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
const int32_t lp0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t rp0 = ((const int32_t*)(dst->op_params))[1];
|
||||
const int32_t lp1 = ((const int32_t*)(dst->op_params))[2];
|
||||
const int32_t rp1 = ((const int32_t*)(dst->op_params))[3];
|
||||
const int32_t lp2 = ((const int32_t*)(dst->op_params))[4];
|
||||
const int32_t rp2 = ((const int32_t*)(dst->op_params))[5];
|
||||
const int32_t lp3 = ((const int32_t*)(dst->op_params))[6];
|
||||
const int32_t rp3 = ((const int32_t*)(dst->op_params))[7];
|
||||
const size_t ts = ggml_type_size(src0->type);
|
||||
const size_t s00 = src0->nb[0] / ts;
|
||||
const size_t s01 = src0->nb[1] / ts;
|
||||
const size_t s02 = src0->nb[2] / ts;
|
||||
const size_t s03 = src0->nb[3] / ts;
|
||||
|
||||
pad_f32_sycl(src0_d, dst_d,
|
||||
const int32_t lp0 = ((const int32_t *)(dst->op_params))[0];
|
||||
const int32_t rp0 = ((const int32_t *)(dst->op_params))[1];
|
||||
const int32_t lp1 = ((const int32_t *)(dst->op_params))[2];
|
||||
const int32_t rp1 = ((const int32_t *)(dst->op_params))[3];
|
||||
const int32_t lp2 = ((const int32_t *)(dst->op_params))[4];
|
||||
const int32_t rp2 = ((const int32_t *)(dst->op_params))[5];
|
||||
const int32_t lp3 = ((const int32_t *)(dst->op_params))[6];
|
||||
const int32_t rp3 = ((const int32_t *)(dst->op_params))[7];
|
||||
|
||||
pad_f32_sycl(src0_d, s00, s01, s02, s03, dst_d,
|
||||
lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
|
||||
}
|
||||
|
||||
@@ -79,6 +79,31 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
|
||||
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
||||
};
|
||||
|
||||
template <> struct block_q_t<GGML_TYPE_Q5_K> {
|
||||
struct traits {
|
||||
static constexpr uint32_t qk = QK_K;
|
||||
static constexpr uint32_t qi = QI5_K;
|
||||
static constexpr uint32_t qr = QR5_K;
|
||||
static constexpr uint32_t vdr_mmvq = 2;
|
||||
};
|
||||
|
||||
// Reordered layout: [qs (QK_K/2 per block)] [qh (QK_K/8 per block)] [scales] [dm]
|
||||
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
|
||||
auto qs_offset = block_index * (QK_K / 2);
|
||||
auto qh_offset = n_blocks * (QK_K / 2) + block_index * (QK_K / 8);
|
||||
return { qs_offset, qh_offset };
|
||||
}
|
||||
|
||||
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
||||
auto nblocks = (nrows * (ncols / QK_K));
|
||||
auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 8);
|
||||
return { total_qs_bytes + block_index * K_SCALE_SIZE,
|
||||
total_qs_bytes + nblocks * K_SCALE_SIZE + block_index * sizeof(ggml_half2) };
|
||||
}
|
||||
|
||||
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
||||
};
|
||||
|
||||
template <> struct block_q_t<GGML_TYPE_Q6_K> {
|
||||
struct traits {
|
||||
static constexpr uint32_t qk = QK_K;
|
||||
|
||||
@@ -357,38 +357,31 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0> {
|
||||
using q8_0_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q8_0>;
|
||||
using q8_0_traits = typename q8_0_block::traits;
|
||||
|
||||
__dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int * v, const int * u, const float & d8_0, const sycl::half2 & ds8) {
|
||||
int sumi = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
|
||||
// Q8_0 values are signed int8, no nibble extraction needed
|
||||
// Direct dp4a: each int packs 4 int8 values
|
||||
sumi = dpct::dp4a(v[i], u[i], sumi);
|
||||
}
|
||||
|
||||
const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
|
||||
|
||||
// Q8_0 has no bias term (values are signed), so just scale
|
||||
return d8_0 * sumi * ds8f.x();
|
||||
}
|
||||
|
||||
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
|
||||
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
|
||||
const sycl::half2 * q8_1_ds, const int & iqs) {
|
||||
const int8_t * bq8_0 = static_cast<const int8_t *>(vbq) + ibx_offset.first;
|
||||
const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
|
||||
int v[q8_0_traits::vdr_mmvq];
|
||||
int u[q8_0_traits::vdr_mmvq];
|
||||
const uint8_t * base = static_cast<const uint8_t *>(vbq);
|
||||
const int8_t * qs = reinterpret_cast<const int8_t *>(base + ibx_offset.first);
|
||||
const ggml_half d = *reinterpret_cast<const ggml_half *>(base + d_offset.first);
|
||||
|
||||
int v[q8_0_traits::vdr_mmvq];
|
||||
int u[q8_0_traits::vdr_mmvq];
|
||||
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
|
||||
v[i] = get_int_from_int8(bq8_0, iqs + i);
|
||||
v[i] = get_int_from_int8(qs, iqs + i);
|
||||
u[i] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
|
||||
}
|
||||
|
||||
return vec_dot_q8_0_q8_1_impl(v, u, d, *q8_1_ds);
|
||||
};
|
||||
int sumi = 0;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
|
||||
sumi = dpct::dp4a(v[i], u[i], sumi);
|
||||
}
|
||||
|
||||
const sycl::half2 ds_values = *q8_1_ds;
|
||||
return static_cast<float>(d) * static_cast<float>(ds_values[0]) * sumi;
|
||||
}
|
||||
};
|
||||
|
||||
static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
|
||||
@@ -481,6 +474,65 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K> {
|
||||
static constexpr ggml_type gtype = GGML_TYPE_Q5_K;
|
||||
|
||||
using q5_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q5_K>;
|
||||
using q5_k_traits = typename q5_k_block::traits;
|
||||
|
||||
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
|
||||
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
|
||||
const sycl::half2 * q8_1_ds, const int & iqs) {
|
||||
const uint8_t * base = static_cast<const uint8_t *>(vbq);
|
||||
const uint8_t * qs = base + ibx_offset.first; // low 4 bits
|
||||
const uint8_t * qh_base = base + ibx_offset.second; // high bit
|
||||
const uint8_t * scs = base + d_offset.first;
|
||||
const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);
|
||||
|
||||
const int bq8_offset = QR5_K * ((iqs / 2) / (QI8_1 / 2));
|
||||
const int * ql_ptr = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
|
||||
const int * qh_ptr = (const int *) (qh_base + 4 * ((iqs / 2) % 4));
|
||||
const uint16_t * scales = (const uint16_t *) scs;
|
||||
|
||||
int vl[2];
|
||||
int vh[2];
|
||||
int u[2 * QR5_K];
|
||||
float d8[QR5_K];
|
||||
|
||||
vl[0] = ql_ptr[0];
|
||||
vl[1] = ql_ptr[4];
|
||||
|
||||
vh[0] = qh_ptr[0] >> bq8_offset;
|
||||
vh[1] = qh_ptr[4] >> bq8_offset;
|
||||
|
||||
uint16_t aux[2];
|
||||
const int j = (QR5_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
|
||||
if (j < 2) {
|
||||
aux[0] = scales[j + 0] & 0x3f3f;
|
||||
aux[1] = scales[j + 2] & 0x3f3f;
|
||||
} else {
|
||||
aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
|
||||
aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
|
||||
}
|
||||
|
||||
const uint8_t * sc = (const uint8_t *) aux;
|
||||
const uint8_t * m = sc + 2;
|
||||
|
||||
for (int i = 0; i < QR5_K; ++i) {
|
||||
const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
|
||||
sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);
|
||||
|
||||
d8[i] = ds_values[0];
|
||||
|
||||
const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);
|
||||
u[2 * i + 0] = q8[0];
|
||||
u[2 * i + 1] = q8[4];
|
||||
}
|
||||
|
||||
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, *dms, d8);
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
|
||||
static constexpr ggml_type gtype = GGML_TYPE_Q6_K;
|
||||
|
||||
|
||||
@@ -2149,11 +2149,11 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
|
||||
// Patch SPIR-V to enable RTE rounding for FP16, avoiding the need for
|
||||
// separate shader variants compiled with -DRTE16.
|
||||
std::vector<uint32_t> spv;
|
||||
std::vector<uint32_t> spirv;
|
||||
if (device->float_controls_rte_fp16) {
|
||||
const uint32_t* spv_words = reinterpret_cast<const uint32_t *>(spv_data);
|
||||
size_t word_count = spv_size / sizeof(uint32_t);
|
||||
spv.assign(spv_words, spv_words + word_count);
|
||||
spirv.assign(spv_words, spv_words + word_count);
|
||||
|
||||
// Find insertion points respecting SPIR-V layout order:
|
||||
// Header(5) -> OpCapability -> OpExtension -> ... -> OpEntryPoint -> OpExecutionMode -> ...
|
||||
@@ -2163,9 +2163,9 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
size_t exec_insert_pos = pos;
|
||||
uint32_t entry_point_id = 0;
|
||||
|
||||
while (pos < spv.size()) {
|
||||
uint32_t opcode = spv[pos] & spv::OpCodeMask;
|
||||
uint32_t len = spv[pos] >> spv::WordCountShift;
|
||||
while (pos < spirv.size()) {
|
||||
uint32_t opcode = spirv[pos] & spv::OpCodeMask;
|
||||
uint32_t len = spirv[pos] >> spv::WordCountShift;
|
||||
if (len == 0) break;
|
||||
|
||||
if (opcode == spv::OpCapability) {
|
||||
@@ -2174,7 +2174,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
} else if (opcode == spv::OpExtension) {
|
||||
ext_insert_pos = pos + len;
|
||||
} else if (opcode == spv::OpEntryPoint) {
|
||||
entry_point_id = spv[pos + 2];
|
||||
entry_point_id = spirv[pos + 2];
|
||||
exec_insert_pos = pos + len;
|
||||
} else if (opcode == spv::OpExecutionMode || opcode == spv::OpExecutionModeId) {
|
||||
exec_insert_pos = pos + len;
|
||||
@@ -2189,7 +2189,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
|
||||
// OpExecutionMode %entrypoint RoundingModeRTE 16
|
||||
uint32_t exec_mode[] = { (4u << spv::WordCountShift) | spv::OpExecutionMode, entry_point_id, spv::ExecutionModeRoundingModeRTE, 16 };
|
||||
spv.insert(spv.begin() + exec_insert_pos, std::begin(exec_mode), std::end(exec_mode));
|
||||
spirv.insert(spirv.begin() + exec_insert_pos, std::begin(exec_mode), std::end(exec_mode));
|
||||
|
||||
// OpExtension "SPV_KHR_float_controls"
|
||||
const char ext_str[] = "SPV_KHR_float_controls";
|
||||
@@ -2197,13 +2197,13 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
std::vector<uint32_t> extension(1 + ext_str_words, 0);
|
||||
extension[0] = (uint32_t)((1 + ext_str_words) << spv::WordCountShift) | spv::OpExtension;
|
||||
memcpy(&extension[1], ext_str, sizeof(ext_str));
|
||||
spv.insert(spv.begin() + ext_insert_pos, extension.begin(), extension.end());
|
||||
spirv.insert(spirv.begin() + ext_insert_pos, extension.begin(), extension.end());
|
||||
|
||||
// OpCapability RoundingModeRTE
|
||||
uint32_t capability[] = { (2u << spv::WordCountShift) | spv::OpCapability, spv::CapabilityRoundingModeRTE };
|
||||
spv.insert(spv.begin() + cap_insert_pos, std::begin(capability), std::end(capability));
|
||||
spirv.insert(spirv.begin() + cap_insert_pos, std::begin(capability), std::end(capability));
|
||||
|
||||
shader_module_create_info = vk::ShaderModuleCreateInfo({}, spv.size() * sizeof(uint32_t), spv.data());
|
||||
shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
|
||||
}
|
||||
|
||||
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
|
||||
|
||||
@@ -2443,6 +2443,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_UP_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_POST_NORM,
|
||||
|
||||
@@ -1 +1 @@
|
||||
ac6f7b44f60fde0091f0b3d99afde48f8c99b13a
|
||||
628249b398293fc8d2fa81a449ae2920a02c6523
|
||||
|
||||
@@ -1131,10 +1131,6 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
|
||||
}
|
||||
|
||||
// for differentiating model types
|
||||
uint32_t n_vocab = 0;
|
||||
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
|
||||
|
||||
// for classifier models
|
||||
ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
|
||||
if (!classifier_labels.empty()) {
|
||||
|
||||
@@ -503,6 +503,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
};
|
||||
byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE:
|
||||
// Sarvam uses SPM-style BPE (same shape as Gemma4): spaces replaced with U+2581
|
||||
// by the normalizer, BPE merges over the whole text on raw UTF-8.
|
||||
regex_exprs = {
|
||||
"[^\\n]+|[\\n]+",
|
||||
};
|
||||
byte_encode = false;
|
||||
break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
regex_exprs = {
|
||||
@@ -2005,6 +2013,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "gemma4") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
|
||||
escape_whitespaces = true;
|
||||
} else if (
|
||||
tokenizer_pre == "sarvam-moe") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE;
|
||||
escape_whitespaces = true;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "jina-v1-en" ||
|
||||
tokenizer_pre == "jina-v2-code" ||
|
||||
|
||||
@@ -59,6 +59,7 @@ enum llama_vocab_pre_type {
|
||||
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
||||
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
|
||||
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
|
||||
};
|
||||
|
||||
struct LLM_KV;
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#include "models.h"
|
||||
|
||||
void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
uint32_t n_vocab = 0;
|
||||
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
|
||||
|
||||
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
|
||||
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
|
||||
|
||||
+11
-5
@@ -110,7 +110,13 @@ void llama_model_gemma4::load_arch_tensors(llama_model_loader &) {
|
||||
layer.ffn_post_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_2, "weight", i), {n_embd}, 0);
|
||||
|
||||
// MoE FFN
|
||||
layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", i), {n_embd, n_ff_exp * 2, n_expert}, 0);
|
||||
layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", i), {n_embd, n_ff_exp * 2, n_expert}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
if (layer.ffn_gate_up_exps == nullptr) {
|
||||
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
||||
}
|
||||
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
|
||||
// per-expert scale will be loaded as down_exps_s at the end of the current switch case
|
||||
@@ -286,8 +292,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
|
||||
|
||||
cur_moe = build_moe_ffn(cur_moe,
|
||||
nullptr, // gate_inp
|
||||
nullptr, // up_exps
|
||||
nullptr, // gate_exps
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr, // exp_probs_b (not used for gemma4)
|
||||
n_expert, n_expert_used,
|
||||
@@ -296,8 +302,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il, logits,
|
||||
model.layers[il].ffn_gate_up_exps,
|
||||
nullptr, // up_exps_s
|
||||
nullptr, // gate_exps_s
|
||||
model.layers[il].ffn_up_exps_s,
|
||||
model.layers[il].ffn_gate_exps_s,
|
||||
model.layers[il].ffn_down_exps_s);
|
||||
cur_moe = build_norm(cur_moe,
|
||||
model.layers[il].ffn_post_norm_2, nullptr,
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#include "models.h"
|
||||
|
||||
void llama_model_llama::load_arch_hparams(llama_model_loader & ml) {
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
uint32_t n_vocab = 0;
|
||||
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
|
||||
@@ -3556,6 +3556,73 @@ struct test_relu_sqr : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// SNAKE activation fusion: y = x + sin(a*x)^2 * inv_b
|
||||
// CUDA backend matches the naive 5-op chain (mul, sin, sqr, mul, add)
|
||||
// and dispatches a single fused kernel.
|
||||
struct test_snake_fuse : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 2> ne; // [T, C]
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
return "SNAKE_FUSE";
|
||||
}
|
||||
|
||||
bool run_whole_graph() override { return true; }
|
||||
|
||||
double max_nmse_err() override {
|
||||
// BF16 epsilon ~ 7.8e-3, F16 epsilon ~ 9.7e-4: relax tolerance to match
|
||||
// the natural roundoff drift between the naive CPU chain and the fused
|
||||
// CUDA kernel. F32 keeps the default tight bound.
|
||||
switch (type) {
|
||||
case GGML_TYPE_BF16: return 5e-3;
|
||||
case GGML_TYPE_F16: return 5e-5;
|
||||
default: return 1e-7;
|
||||
}
|
||||
}
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR2(type, ne);
|
||||
}
|
||||
|
||||
test_snake_fuse(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 2> ne = {256, 192})
|
||||
: type(type), ne(ne) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * x = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]);
|
||||
ggml_set_name(x, "x");
|
||||
|
||||
ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, ne[1]);
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
ggml_tensor * inv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, ne[1]);
|
||||
ggml_set_name(inv_b, "inv_b");
|
||||
|
||||
// exact 5-op chain that BigVGAN / Vocos frontends emit
|
||||
ggml_tensor * ax = ggml_mul(ctx, x, a);
|
||||
ggml_tensor * sin_ax = ggml_sin(ctx, ax);
|
||||
ggml_tensor * sin_sq = ggml_sqr(ctx, sin_ax);
|
||||
ggml_tensor * scaled = ggml_mul(ctx, sin_sq, inv_b);
|
||||
ggml_tensor * out = ggml_add(ctx, x, scaled);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
// x in [-pi, pi] to exercise sin periodicity, params in default [-1, 1]
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
const std::string name = ggml_get_name(t);
|
||||
if (name == "x") {
|
||||
init_tensor_uniform(t, -3.14159f, 3.14159f);
|
||||
} else {
|
||||
init_tensor_uniform(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_SSM_CONV
|
||||
struct test_ssm_conv : public test_case {
|
||||
const ggml_type type;
|
||||
@@ -7489,6 +7556,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_relu_sqr(type, { 5, 7, 11, 13 }));
|
||||
}
|
||||
|
||||
// SNAKE activation fusion: x + sin(a*x)^2 * inv_b
|
||||
for (ggml_type type : { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
|
||||
test_cases.emplace_back(new test_snake_fuse(type, { 5, 7})); // primes sub-block
|
||||
test_cases.emplace_back(new test_snake_fuse(type, { 33, 32})); // boundary
|
||||
test_cases.emplace_back(new test_snake_fuse(type, {1025, 13})); // large prime, grid-stride
|
||||
test_cases.emplace_back(new test_snake_fuse(type, { 128, 16})); // power-of-two
|
||||
test_cases.emplace_back(new test_snake_fuse(type, { 256, 192})); // BigVGAN-ish
|
||||
}
|
||||
|
||||
// glu ops
|
||||
for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
for (int v : {0, 1}) {
|
||||
@@ -8785,8 +8861,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
if (nh == 1 && hsk != 320 && hsk != 576) continue;
|
||||
for (int nr3 : { 1, 3, }) {
|
||||
if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes
|
||||
for (int nr2 : { 1, 4, 12, 20, 32 }) {
|
||||
for (int nr2 : { 1, 4, 8, 12, 16, 20, 32 }) {
|
||||
if (nr2 == 8 && hsk != 192) continue;
|
||||
if (nr2 == 12 && hsk != 128) continue;
|
||||
if (nr2 == 16 && hsk != 192) continue;
|
||||
if (nr2 == 20 && (nh != 1 || hsk != 576)) continue;
|
||||
if (nr2 == 32 && (nh != 1 || hsk != 320)) continue;
|
||||
//for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) {
|
||||
@@ -9014,6 +9092,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 1, 1}));
|
||||
test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));
|
||||
|
||||
// SNAKE activation fusion at BigVGAN scale (T=7680 = 24 kHz x 320 ms, C=192)
|
||||
test_cases.emplace_back(new test_snake_fuse(GGML_TYPE_F32, {7680, 192}));
|
||||
test_cases.emplace_back(new test_snake_fuse(GGML_TYPE_F16, {7680, 192}));
|
||||
test_cases.emplace_back(new test_snake_fuse(GGML_TYPE_BF16, {7680, 192}));
|
||||
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));
|
||||
|
||||
|
||||
@@ -70,20 +70,20 @@ static void test_reasoning_budget(
|
||||
llama_sampler_apply(sampler, &cur_p);
|
||||
|
||||
// Check if forcing is active (all logits except one should be -INFINITY)
|
||||
size_t not_neg_inf = 0;
|
||||
llama_token not_neg_inf_token = -1;
|
||||
size_t finite_count = 0;
|
||||
llama_token finite_token = -1;
|
||||
for (size_t j = 0; j < cur.size(); j++) {
|
||||
if (std::isfinite(cur[j].logit) || cur[j].logit > 0) { // +INFINITY
|
||||
not_neg_inf++;
|
||||
not_neg_inf_token = cur[j].id;
|
||||
if (std::isfinite(cur[j].logit)) {
|
||||
finite_count++;
|
||||
finite_token = cur[j].id;
|
||||
}
|
||||
}
|
||||
|
||||
llama_sampler_accept(sampler, sequence[i]);
|
||||
|
||||
fprintf(stderr, " i=%zu: token=%d, not_neg_inf_count=%zu, not_neg_inf_token=%d\n", i, (int)sequence[i], not_neg_inf, (int)not_neg_inf_token);
|
||||
fprintf(stderr, " i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token);
|
||||
|
||||
if (not_neg_inf == 1) {
|
||||
if (finite_count == 1) {
|
||||
if (actual_force_start == SIZE_MAX) {
|
||||
actual_force_start = i;
|
||||
}
|
||||
|
||||
@@ -1651,6 +1651,7 @@ Note:
|
||||
2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follow:
|
||||
- If a model is running but updated or removed from the source, it will be unloaded
|
||||
- If a model is not running, it will be added or updated according to the source
|
||||
3. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance.
|
||||
|
||||
The `status` object can be:
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
+3538
-3507
File diff suppressed because it is too large
Load Diff
@@ -3926,22 +3926,7 @@ void server_routes::init_routes() {
|
||||
}},
|
||||
{"object", "list"},
|
||||
{"data", {
|
||||
{
|
||||
{"id", meta->model_name},
|
||||
{"aliases", meta->model_aliases},
|
||||
{"tags", meta->model_tags},
|
||||
{"object", "model"},
|
||||
{"created", std::time(0)},
|
||||
{"owned_by", "llamacpp"},
|
||||
{"meta", {
|
||||
{"vocab_type", meta->model_vocab_type},
|
||||
{"n_vocab", meta->model_vocab_n_tokens},
|
||||
{"n_ctx_train", meta->model_n_ctx_train},
|
||||
{"n_embd", meta->model_n_embd_inp},
|
||||
{"n_params", meta->model_n_params},
|
||||
{"size", meta->model_size},
|
||||
}},
|
||||
},
|
||||
get_model_info(),
|
||||
}}
|
||||
};
|
||||
|
||||
@@ -4155,6 +4140,26 @@ void server_routes::init_routes() {
|
||||
};
|
||||
}
|
||||
|
||||
json server_routes::get_model_info() const {
|
||||
return json {
|
||||
{"id", meta->model_name},
|
||||
{"aliases", meta->model_aliases},
|
||||
{"tags", meta->model_tags},
|
||||
{"object", "model"},
|
||||
{"created", std::time(0)},
|
||||
{"owned_by", "llamacpp"},
|
||||
{"meta", {
|
||||
{"vocab_type", meta->model_vocab_type},
|
||||
{"n_vocab", meta->model_vocab_n_tokens},
|
||||
{"n_ctx", meta->slot_n_ctx},
|
||||
{"n_ctx_train", meta->model_n_ctx_train},
|
||||
{"n_embd", meta->model_n_embd_inp},
|
||||
{"n_params", meta->model_n_params},
|
||||
{"size", meta->model_size},
|
||||
}},
|
||||
};
|
||||
}
|
||||
|
||||
std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const server_http_req & req, int id_slot) {
|
||||
auto res = create_response();
|
||||
const json request_data = json::parse(req.body);
|
||||
|
||||
@@ -122,6 +122,10 @@ struct server_routes {
|
||||
server_http_context::handler_t post_rerank;
|
||||
server_http_context::handler_t get_lora_adapters;
|
||||
server_http_context::handler_t post_lora_adapters;
|
||||
|
||||
// to be used in router mode
|
||||
json get_model_info() const;
|
||||
|
||||
private:
|
||||
std::unique_ptr<server_res_generator> handle_completions_impl(
|
||||
const server_http_req & req,
|
||||
|
||||
@@ -4,7 +4,9 @@
|
||||
|
||||
#include <cpp-httplib/httplib.h>
|
||||
|
||||
#include <cstdlib>
|
||||
#include <functional>
|
||||
#include <future>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
||||
@@ -51,11 +53,51 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
|
||||
SRV_DBG("response: %s\n", res.body.c_str());
|
||||
}
|
||||
|
||||
// For Google Cloud Platform deployment compatibility
|
||||
struct gcp_params {
|
||||
bool enabled;
|
||||
std::string path_health;
|
||||
std::string path_predict;
|
||||
int port;
|
||||
|
||||
// Ref: https://docs.cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#aip-variables
|
||||
gcp_params() {
|
||||
enabled = getenv("AIP_MODE", "") == "PREDICTION";
|
||||
path_health = getenv("AIP_HEALTH_ROUTE", "", true); // default: using the route defined in server.cpp
|
||||
path_predict = getenv("AIP_PREDICT_ROUTE", "/predict", true);
|
||||
port = std::stoi(getenv("AIP_HTTP_PORT", "8080"));
|
||||
}
|
||||
|
||||
static std::string getenv(const char * name, const std::string & default_value, bool ensure_leading_slash = false) {
|
||||
const char * value = std::getenv(name);
|
||||
if (value == nullptr || value[0] == '\0') {
|
||||
return default_value;
|
||||
}
|
||||
std::string val = value;
|
||||
if (ensure_leading_slash && !val.empty() && val[0] != '/') {
|
||||
val.insert(val.begin(), '/');
|
||||
}
|
||||
return val;
|
||||
}
|
||||
};
|
||||
|
||||
bool server_http_context::init(const common_params & params) {
|
||||
const gcp_params gcp;
|
||||
|
||||
path_prefix = params.api_prefix;
|
||||
port = params.port;
|
||||
hostname = params.hostname;
|
||||
|
||||
if (gcp.enabled) {
|
||||
LOG_INF("%s: Google Cloud Platform compat: health route = %s, predict route = %s, port = %d\n", __func__, gcp.path_health.c_str(), gcp.path_predict.c_str(), gcp.port);
|
||||
|
||||
if (port != gcp.port) {
|
||||
LOG_WRN("%s: Google Cloud Platform compat: overriding server port %d with AIP_HTTP_PORT %d\n", __func__, port, gcp.port);
|
||||
}
|
||||
|
||||
port = gcp.port;
|
||||
}
|
||||
|
||||
auto & srv = pimpl->srv;
|
||||
|
||||
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
|
||||
@@ -420,6 +462,7 @@ static void process_handler_response(server_http_req_ptr && request, server_http
|
||||
}
|
||||
|
||||
void server_http_context::get(const std::string & path, const server_http_context::handler_t & handler) const {
|
||||
handlers.emplace(path, handler);
|
||||
pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
|
||||
server_http_req_ptr request = std::make_unique<server_http_req>(server_http_req{
|
||||
get_params(req),
|
||||
@@ -436,6 +479,7 @@ void server_http_context::get(const std::string & path, const server_http_contex
|
||||
}
|
||||
|
||||
void server_http_context::post(const std::string & path, const server_http_context::handler_t & handler) const {
|
||||
handlers.emplace(path, handler);
|
||||
pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
|
||||
std::string body = req.body;
|
||||
std::map<std::string, uploaded_file> files;
|
||||
@@ -481,3 +525,176 @@ void server_http_context::post(const std::string & path, const server_http_conte
|
||||
});
|
||||
}
|
||||
|
||||
//
|
||||
// Vertex AI Prediction protocol (AIP_PREDICT_ROUTE)
|
||||
// https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements
|
||||
//
|
||||
|
||||
// Derives the camelCase @requestFormat alias for a registered path.
|
||||
// e.g. "/v1/chat/completions" -> "chatCompletions", "/apply-template" -> "applyTemplate"
|
||||
static std::string path_to_gcp_format(const std::string & path) {
|
||||
std::string s = path;
|
||||
if (s.size() > 3 && s[0] == '/' && s[1] == 'v' && s[2] == '1') {
|
||||
s = s.substr(3);
|
||||
}
|
||||
if (!s.empty() && s[0] == '/') {
|
||||
s = s.substr(1);
|
||||
}
|
||||
std::string result;
|
||||
bool cap = false;
|
||||
for (unsigned char c : s) {
|
||||
if (c == ':') break; // stop before path parameters
|
||||
if (c == '/' || c == '-' || c == '_') {
|
||||
cap = true;
|
||||
} else {
|
||||
result += cap ? (char)std::toupper(c) : (char)c;
|
||||
cap = false;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static json parse_gcp_predict_response(const server_http_res_ptr & res) {
|
||||
if (res == nullptr) {
|
||||
throw std::runtime_error("empty response from internal handler");
|
||||
}
|
||||
if (res->is_stream()) {
|
||||
throw std::invalid_argument("predict route does not support streaming responses");
|
||||
}
|
||||
if (res->data.empty()) {
|
||||
return nullptr;
|
||||
}
|
||||
try {
|
||||
return json::parse(res->data);
|
||||
} catch (...) {
|
||||
return res->data;
|
||||
}
|
||||
}
|
||||
|
||||
void server_http_context::register_gcp_compat() {
|
||||
const gcp_params gcp;
|
||||
|
||||
if (!gcp.enabled) {
|
||||
// do nothing
|
||||
return;
|
||||
}
|
||||
|
||||
if (handlers.count(gcp.path_predict)) {
|
||||
LOG_ERR("%s: AIP_PREDICT_ROUTE=%s conflicts with an existing llama-server route\n", __func__, gcp.path_predict.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// camelCase alias -> canonical path (first registration wins on collision)
|
||||
// e.g. "chatCompletions" -> "/v1/chat/completions"
|
||||
std::unordered_map<std::string, std::string> alias_to_path;
|
||||
for (const auto & [path, _] : handlers) {
|
||||
alias_to_path.emplace(path_to_gcp_format(path), path);
|
||||
}
|
||||
|
||||
if (!gcp.path_health.empty()) {
|
||||
auto health_handler = handlers.find("/health");
|
||||
GGML_ASSERT(health_handler != handlers.end());
|
||||
get(gcp.path_health, health_handler->second);
|
||||
}
|
||||
|
||||
post(gcp.path_predict, [this, alias_to_path = std::move(alias_to_path)](const server_http_req & req) -> server_http_res_ptr {
|
||||
static const auto build_error = [](const std::string & message, error_type type) -> json {
|
||||
return json {{"error", format_error_response(message, type)}};
|
||||
};
|
||||
|
||||
json data;
|
||||
try {
|
||||
data = json::parse(req.body);
|
||||
} catch (const std::exception & e) {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
res->status = 400;
|
||||
res->data = safe_json_to_str({{"error", format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)}});
|
||||
return res;
|
||||
}
|
||||
if (!data.is_object()) {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
res->status = 400;
|
||||
res->data = safe_json_to_str({{"error", format_error_response("request body must be a JSON object", ERROR_TYPE_INVALID_REQUEST)}});
|
||||
return res;
|
||||
}
|
||||
if (!data.contains("instances") || !data.at("instances").is_array()) {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
res->status = 400;
|
||||
res->data = safe_json_to_str({{"error", format_error_response("request body must include an array field named instances", ERROR_TYPE_INVALID_REQUEST)}});
|
||||
return res;
|
||||
}
|
||||
|
||||
const json & instances = data.at("instances");
|
||||
static const size_t MAX_INSTANCES = 128;
|
||||
if (instances.size() > MAX_INSTANCES) {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
res->status = 400;
|
||||
res->data = safe_json_to_str({{"error", format_error_response("instances array exceeds maximum size of " + std::to_string(MAX_INSTANCES), ERROR_TYPE_INVALID_REQUEST)}});
|
||||
return res;
|
||||
}
|
||||
|
||||
std::vector<std::future<json>> futures;
|
||||
futures.reserve(instances.size());
|
||||
|
||||
for (const auto & instance : instances) {
|
||||
futures.push_back(std::async(std::launch::async, [this, &req, &alias_to_path, instance]() -> json {
|
||||
if (!instance.is_object()) {
|
||||
return build_error("each instance must be a JSON object", ERROR_TYPE_INVALID_REQUEST);
|
||||
}
|
||||
if (!instance.contains("@requestFormat") || !instance.at("@requestFormat").is_string()) {
|
||||
return build_error("each instance must include a string @requestFormat", ERROR_TYPE_INVALID_REQUEST);
|
||||
}
|
||||
|
||||
try {
|
||||
json payload = instance;
|
||||
const std::string format = payload.at("@requestFormat").get<std::string>();
|
||||
payload.erase("@requestFormat");
|
||||
|
||||
if (payload.contains("stream")) {
|
||||
LOG_WRN("%s: ignoring client-provided stream field in instance, streaming is not supported in predict route\n", __func__);
|
||||
payload["stream"] = false;
|
||||
}
|
||||
|
||||
// accept both camelCase aliases (e.g. "chatCompletions") and direct paths
|
||||
std::string dispatch_path;
|
||||
auto it_alias = alias_to_path.find(format);
|
||||
if (it_alias != alias_to_path.end()) {
|
||||
dispatch_path = it_alias->second;
|
||||
} else if (handlers.count(format)) {
|
||||
dispatch_path = format;
|
||||
} else {
|
||||
return build_error("no handler registered for @requestFormat: " + format, ERROR_TYPE_INVALID_REQUEST);
|
||||
}
|
||||
|
||||
const server_http_req internal_req {
|
||||
req.params,
|
||||
req.headers,
|
||||
path_prefix + dispatch_path,
|
||||
req.query_string,
|
||||
payload.dump(),
|
||||
{},
|
||||
req.should_stop,
|
||||
};
|
||||
|
||||
server_http_res_ptr internal_res = handlers.at(dispatch_path)(internal_req);
|
||||
return parse_gcp_predict_response(internal_res);
|
||||
} catch (const std::invalid_argument & e) {
|
||||
return build_error(e.what(), ERROR_TYPE_INVALID_REQUEST);
|
||||
} catch (const std::exception & e) {
|
||||
return build_error(e.what(), ERROR_TYPE_SERVER);
|
||||
} catch (...) {
|
||||
return build_error("unknown error", ERROR_TYPE_SERVER);
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
json predictions = json::array();
|
||||
for (auto & future : futures) {
|
||||
predictions.push_back(future.get());
|
||||
}
|
||||
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
res->data = safe_json_to_str({{"predictions", predictions}});
|
||||
return res;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -67,6 +67,10 @@ struct server_http_context {
|
||||
std::thread thread; // server thread
|
||||
std::atomic<bool> is_ready = false;
|
||||
|
||||
// note: the handler should never throw exceptions
|
||||
using handler_t = std::function<server_http_res_ptr(const server_http_req & req)>;
|
||||
mutable std::unordered_map<std::string, handler_t> handlers;
|
||||
|
||||
std::string path_prefix;
|
||||
std::string hostname;
|
||||
int port;
|
||||
@@ -78,12 +82,13 @@ struct server_http_context {
|
||||
bool start();
|
||||
void stop() const;
|
||||
|
||||
// note: the handler should never throw exceptions
|
||||
using handler_t = std::function<server_http_res_ptr(const server_http_req & req)>;
|
||||
|
||||
void get(const std::string & path, const handler_t & handler) const;
|
||||
void post(const std::string & path, const handler_t & handler) const;
|
||||
|
||||
// Register the Google Cloud Platform (Vertex AI) compat (AIP_PREDICT_ROUTE env var, or /predict)
|
||||
// Must be called AFTER all other API routes are registered
|
||||
void register_gcp_compat();
|
||||
|
||||
// for debugging
|
||||
std::string listening_address;
|
||||
};
|
||||
|
||||
@@ -44,6 +44,7 @@ extern char **environ;
|
||||
#define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit"
|
||||
#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
|
||||
#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
|
||||
#define CMD_CHILD_TO_ROUTER_INFO "cmd_child_to_router:info:" // followed by json string
|
||||
|
||||
// address for child process, this is needed because router may run on 0.0.0.0
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
|
||||
@@ -718,10 +719,11 @@ void server_models::load(const std::string & name) {
|
||||
|
||||
// prepare new instance info
|
||||
instance_t inst;
|
||||
inst.meta = meta;
|
||||
inst.meta.port = get_free_port();
|
||||
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
|
||||
inst.meta.last_used = ggml_time_ms();
|
||||
inst.meta = meta;
|
||||
inst.meta.port = get_free_port();
|
||||
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
|
||||
inst.meta.loaded_info = json{};
|
||||
inst.meta.last_used = ggml_time_ms();
|
||||
|
||||
if (inst.meta.port <= 0) {
|
||||
throw std::runtime_error("failed to get a port number");
|
||||
@@ -767,12 +769,14 @@ void server_models::load(const std::string & name) {
|
||||
// read stdout/stderr and forward to main server log
|
||||
// also handle status report from child process
|
||||
if (stdout_file) {
|
||||
char buffer[4096];
|
||||
char buffer[128 * 1024]; // large buffer for storing info
|
||||
while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
|
||||
LOG("[%5d] %s", port, buffer);
|
||||
std::string str(buffer);
|
||||
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
|
||||
this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
|
||||
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
|
||||
this->update_loaded_info(name, str);
|
||||
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
|
||||
this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
|
||||
}
|
||||
@@ -916,6 +920,29 @@ void server_models::update_status(const std::string & name, server_model_status
|
||||
cv.notify_all();
|
||||
}
|
||||
|
||||
void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
|
||||
if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
|
||||
SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
json info;
|
||||
try {
|
||||
info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
|
||||
} catch (const std::exception & e) {
|
||||
SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
|
||||
return;
|
||||
}
|
||||
|
||||
std::unique_lock<std::mutex> lk(mutex);
|
||||
auto it = mapping.find(name);
|
||||
if (it != mapping.end()) {
|
||||
auto & meta = it->second.meta;
|
||||
meta.loaded_info = info;
|
||||
}
|
||||
cv.notify_all();
|
||||
}
|
||||
|
||||
void server_models::wait_until_loading_finished(const std::string & name) {
|
||||
std::unique_lock<std::mutex> lk(mutex);
|
||||
cv.wait(lk, [this, &name]() {
|
||||
@@ -994,12 +1021,14 @@ bool server_models::is_child_server() {
|
||||
return router_port != nullptr;
|
||||
}
|
||||
|
||||
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler) {
|
||||
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
|
||||
// send a notification to the router server that a model instance is ready
|
||||
common_log_pause(common_log_main());
|
||||
fflush(stdout);
|
||||
fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
|
||||
fflush(stdout);
|
||||
fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
|
||||
fflush(stdout);
|
||||
common_log_resume(common_log_main());
|
||||
|
||||
// setup thread for monitoring stdin
|
||||
@@ -1176,7 +1205,8 @@ void server_models_routes::init_routes() {
|
||||
status["exit_code"] = meta.exit_code;
|
||||
status["failed"] = true;
|
||||
}
|
||||
models_json.push_back(json {
|
||||
|
||||
json model_info = json {
|
||||
{"id", meta.name},
|
||||
{"aliases", meta.aliases},
|
||||
{"tags", meta.tags},
|
||||
@@ -1185,7 +1215,17 @@ void server_models_routes::init_routes() {
|
||||
{"created", t}, // for OAI-compat
|
||||
{"status", status},
|
||||
// TODO: add other fields, may require reading GGUF metadata
|
||||
});
|
||||
};
|
||||
|
||||
// merge with loaded_info from the child process if available
|
||||
if (meta.is_running()) {
|
||||
for (auto it = meta.loaded_info.begin(); it != meta.loaded_info.end(); ++it) {
|
||||
if (!model_info.contains(it.key())) {
|
||||
model_info[it.key()] = it.value();
|
||||
}
|
||||
}
|
||||
}
|
||||
models_json.push_back(model_info);
|
||||
}
|
||||
res_ok(res, {
|
||||
{"data", models_json},
|
||||
|
||||
@@ -63,6 +63,7 @@ struct server_model_meta {
|
||||
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
|
||||
int64_t last_used = 0; // for LRU unloading
|
||||
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
|
||||
json loaded_info; // info to be reflected via /v1/models endpoint
|
||||
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
|
||||
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
|
||||
|
||||
@@ -145,6 +146,7 @@ public:
|
||||
|
||||
// update the status of a model instance (thread-safe)
|
||||
void update_status(const std::string & name, server_model_status status, int exit_code);
|
||||
void update_loaded_info(const std::string & name, std::string & raw_info);
|
||||
|
||||
// wait until the model instance is fully loaded (thread-safe)
|
||||
// return when the model no longer in "loading" state
|
||||
@@ -163,7 +165,7 @@ public:
|
||||
|
||||
// notify the router server that a model instance is ready
|
||||
// return the monitoring thread (to be joined by the caller)
|
||||
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler);
|
||||
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);
|
||||
|
||||
// notify the router server that the sleeping state has changed
|
||||
static void notify_router_sleeping_state(bool sleeping);
|
||||
|
||||
@@ -204,6 +204,10 @@ int main(int argc, char ** argv) {
|
||||
// Save & load slots
|
||||
ctx_http.get ("/slots", ex_wrapper(routes.get_slots));
|
||||
ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots));
|
||||
|
||||
// Google Cloud Platform (Vertex AI) compat
|
||||
ctx_http.register_gcp_compat();
|
||||
|
||||
// CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP)
|
||||
if (params.webui_mcp_proxy) {
|
||||
SRV_WRN("%s", "-----------------\n");
|
||||
@@ -334,7 +338,8 @@ int main(int argc, char ** argv) {
|
||||
// optionally, notify router server that this instance is ready
|
||||
std::thread monitor_thread;
|
||||
if (server_models::is_child_server()) {
|
||||
monitor_thread = server_models::setup_child_server(shutdown_handler);
|
||||
json model_info = routes.get_model_info();
|
||||
monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
|
||||
}
|
||||
|
||||
// this call blocks the main thread until queue_tasks.terminate() is called
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
import pytest
|
||||
from utils import *
|
||||
|
||||
server: ServerProcess
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
server.gcp_compat = True
|
||||
|
||||
|
||||
def test_gcp_predict_camel_case():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/predict", data={
|
||||
"instances": [
|
||||
{
|
||||
"@requestFormat": "chatCompletions",
|
||||
"max_tokens": 8,
|
||||
"messages": [
|
||||
{"role": "user", "content": "What is the meaning of life?"},
|
||||
],
|
||||
}
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "predictions" in res.body
|
||||
assert len(res.body["predictions"]) == 1
|
||||
prediction = res.body["predictions"][0]
|
||||
assert "choices" in prediction
|
||||
assert len(prediction["choices"]) == 1
|
||||
assert prediction["choices"][0]["message"]["role"] == "assistant"
|
||||
assert len(prediction["choices"][0]["message"]["content"]) > 0
|
||||
|
||||
|
||||
def test_gcp_predict_multiple_instances():
|
||||
global server
|
||||
server.n_slots = 2
|
||||
server.start()
|
||||
res = server.make_request("POST", "/predict", data={
|
||||
"instances": [
|
||||
{
|
||||
"@requestFormat": "chatCompletions",
|
||||
"max_tokens": 8,
|
||||
"messages": [{"role": "user", "content": "Say hello"}],
|
||||
},
|
||||
{
|
||||
"@requestFormat": "chatCompletions",
|
||||
"max_tokens": 8,
|
||||
"messages": [{"role": "user", "content": "Say world"}],
|
||||
},
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body["predictions"]) == 2
|
||||
for prediction in res.body["predictions"]:
|
||||
assert "choices" in prediction
|
||||
assert len(prediction["choices"][0]["message"]["content"]) > 0
|
||||
@@ -108,6 +108,7 @@ class ServerProcess:
|
||||
no_cache_idle_slots: bool = False
|
||||
log_path: str | None = None
|
||||
webui_mcp_proxy: bool = False
|
||||
gcp_compat: bool = False
|
||||
|
||||
# session variables
|
||||
process: subprocess.Popen | None = None
|
||||
@@ -122,6 +123,9 @@ class ServerProcess:
|
||||
self.external_server = "DEBUG_EXTERNAL" in os.environ
|
||||
|
||||
def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None:
|
||||
env = {**os.environ}
|
||||
if "LLAMA_CACHE" not in os.environ:
|
||||
env["LLAMA_CACHE"] = "tmp"
|
||||
if self.external_server:
|
||||
print(f"[external_server]: Assuming external server running on {self.server_host}:{self.server_port}")
|
||||
return
|
||||
@@ -248,6 +252,8 @@ class ServerProcess:
|
||||
server_args.append("--no-cache-idle-slots")
|
||||
if self.webui_mcp_proxy:
|
||||
server_args.append("--webui-mcp-proxy")
|
||||
if self.gcp_compat:
|
||||
env["AIP_MODE"] = "PREDICTION"
|
||||
|
||||
args = [str(arg) for arg in [server_path, *server_args]]
|
||||
print(f"tests: starting server with: {' '.join(args)}")
|
||||
@@ -268,7 +274,7 @@ class ServerProcess:
|
||||
creationflags=flags,
|
||||
stdout=self._log,
|
||||
stderr=self._log if self._log != sys.stdout else sys.stdout,
|
||||
env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None,
|
||||
env=env,
|
||||
)
|
||||
server_instances.add(self)
|
||||
|
||||
|
||||
Generated
+18
-18
@@ -2307,9 +2307,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@sveltejs/kit": {
|
||||
"version": "2.57.1",
|
||||
"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.57.1.tgz",
|
||||
"integrity": "sha512-VRdSbB96cI1EnRh09CqmnQqP/YJvET5buj8S6k7CxaJqBJD4bw4fRKDjcarAj/eX9k2eHifQfDH8NtOh+ZxxPw==",
|
||||
"version": "2.59.1",
|
||||
"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz",
|
||||
"integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
@@ -3640,9 +3640,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/bits-ui": {
|
||||
"version": "2.18.0",
|
||||
"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.18.0.tgz",
|
||||
"integrity": "sha512-GLOBZRVy3hxNHIQ2MpD/+5aK9KcBFZRhUJtZ1UDABXdlVR4K6zFpgt4T+Rwuhf2sQzlc6yK1q/DprHPjwT4Pjw==",
|
||||
"version": "2.18.1",
|
||||
"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.18.1.tgz",
|
||||
"integrity": "sha512-KkemzKFH4T3gt3H+P86JcnAWExjByv/6vlwjm/BoCwTPHu03yiCdxbghdJLvFReQTe0acCAiRcKfmixxD6XvlA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
@@ -4856,9 +4856,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/express-rate-limit": {
|
||||
"version": "8.3.2",
|
||||
"resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.3.2.tgz",
|
||||
"integrity": "sha512-77VmFeJkO0/rvimEDuUC5H30oqUC4EyOhyGccfqoLebB0oiEYfM7nwPrsDsBL1gsTpwfzX8SFy2MT3TDyRq+bg==",
|
||||
"version": "8.5.0",
|
||||
"resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.0.tgz",
|
||||
"integrity": "sha512-XKhFohWaSBdVJNTi5TaHziqnPkv04I9UQV6q1Wy7Ui6GGQZVW12ojDFwqer14EvCXxjvPG0CyWXx7cAXpALB4Q==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ip-address": "10.1.0"
|
||||
@@ -7943,9 +7943,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/postcss": {
|
||||
"version": "8.5.6",
|
||||
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz",
|
||||
"integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==",
|
||||
"version": "8.5.14",
|
||||
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.14.tgz",
|
||||
"integrity": "sha512-SoSL4+OSEtR99LHFZQiJLkT59C5B1amGO1NzTwj7TT1qCUgUO6hxOvzkOYxD+vMrXBM3XJIKzokoERdqQq/Zmg==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
{
|
||||
@@ -10084,9 +10084,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/uuid": {
|
||||
"version": "13.0.0",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-13.0.0.tgz",
|
||||
"integrity": "sha512-XQegIaBTVUjSHliKqcnFqYypAd4S+WCYt5NIeRs6w/UAry7z8Y9j5ZwRRL4kzq9U3sD6v+85er9FvkEaBpji2w==",
|
||||
"version": "13.0.2",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-13.0.2.tgz",
|
||||
"integrity": "sha512-vzi9uRZ926x4XV73S/4qQaTwPXM2JBj6/6lI/byHH1jOpCzb0zDbfytgA9LcN/hzb2l7WQSQnxITOVx5un/wGw==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
"https://github.com/sponsors/broofa",
|
||||
@@ -10302,9 +10302,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/vite-plugin-devtools-json/node_modules/uuid": {
|
||||
"version": "11.1.0",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-11.1.0.tgz",
|
||||
"integrity": "sha512-0/A9rDy9P7cJ+8w1c9WD9V//9Wj15Ce2MPz8Ri6032usz+NfePxx5AcN3bN+r6ZL6jEo066/yNYB3tn4pQEx+A==",
|
||||
"version": "11.1.1",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-11.1.1.tgz",
|
||||
"integrity": "sha512-vIYxrBCC/N/K+Js3qSN88go7kIfNPssr/hHCesKCQNAjmgvYS2oqr69kIufEG+O4+PfezOH4EbIeHCfFov8ZgQ==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
"https://github.com/sponsors/broofa",
|
||||
|
||||
+2
-1
@@ -8,6 +8,7 @@
|
||||
import { HealthCheckStatus } from '$lib/enums';
|
||||
import type { MCPServerSettingsEntry } from '$lib/types';
|
||||
import { goto } from '$app/navigation';
|
||||
import { ROUTES } from '$lib/constants/routes';
|
||||
|
||||
interface Props {
|
||||
onMcpSettingsClick?: () => void;
|
||||
@@ -52,7 +53,7 @@
|
||||
function handleMcpSettingsClick() {
|
||||
onMcpSettingsClick?.();
|
||||
|
||||
goto(`${hasMcpServers ? '' : '?add'}#/settings/mcp`);
|
||||
goto(`${hasMcpServers ? '' : '?add'}${ROUTES.MCP_SERVERS}`);
|
||||
}
|
||||
</script>
|
||||
|
||||
|
||||
+7
-2
@@ -12,6 +12,8 @@
|
||||
import { useAttachmentMenu } from '$lib/hooks/use-attachment-menu.svelte';
|
||||
import { AttachmentMenuItemId } from '$lib/enums';
|
||||
import { PencilRuler } from '@lucide/svelte';
|
||||
import { ROUTES, SETTINGS_SECTION_SLUGS } from '$lib/constants/routes';
|
||||
import { RouterService } from '$lib/services/router.service';
|
||||
|
||||
interface Props {
|
||||
class?: string;
|
||||
@@ -146,13 +148,16 @@
|
||||
|
||||
<div class="my-2 border-t"></div>
|
||||
|
||||
<a href="#/settings/mcp" class="flex items-center gap-3 px-3 py-2">
|
||||
<a href={ROUTES.MCP_SERVERS} class="flex items-center gap-3 px-3 py-2">
|
||||
<McpLogo class="inline h-4 w-4" />
|
||||
|
||||
<span class="text-sm">MCP Servers</span>
|
||||
</a>
|
||||
|
||||
<a href="#/settings/chat/tools" class="flex items-center gap-3 px-3 py-2">
|
||||
<a
|
||||
href={RouterService.settings(SETTINGS_SECTION_SLUGS.TOOLS)}
|
||||
class="flex items-center gap-3 px-3 py-2"
|
||||
>
|
||||
<PencilRuler class="inline h-4 w-4" />
|
||||
|
||||
<span class="text-sm">Tools</span>
|
||||
|
||||
+2
-1
@@ -13,6 +13,7 @@
|
||||
import { conversationsStore } from '$lib/stores/conversations.svelte';
|
||||
import { getFileTypeCategory } from '$lib/utils';
|
||||
import { goto } from '$app/navigation';
|
||||
import { ROUTES } from '$lib/constants/routes';
|
||||
|
||||
interface Props {
|
||||
canSend?: boolean;
|
||||
@@ -100,7 +101,7 @@
|
||||
{onSystemPromptClick}
|
||||
{onMcpPromptClick}
|
||||
{onMcpResourcesClick}
|
||||
onMcpSettingsClick={() => goto('#/settings/mcp')}
|
||||
onMcpSettingsClick={() => goto(ROUTES.MCP_SERVERS)}
|
||||
/>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
+4
-3
@@ -17,6 +17,7 @@
|
||||
import { parseFilesToMessageExtras } from '$lib/utils/browser-only';
|
||||
import { deriveAgenticSections } from '$lib/utils';
|
||||
import type { DatabaseMessageExtraMcpPrompt } from '$lib/types';
|
||||
import { ROUTES } from '$lib/constants/routes';
|
||||
|
||||
interface Props {
|
||||
class?: string;
|
||||
@@ -182,7 +183,7 @@
|
||||
const conversationDeleted = await chatStore.removeSystemPromptPlaceholder(message.id);
|
||||
|
||||
if (conversationDeleted) {
|
||||
goto(`#/`);
|
||||
goto(ROUTES.START);
|
||||
}
|
||||
|
||||
return;
|
||||
@@ -205,7 +206,7 @@
|
||||
const conversationDeleted = await chatStore.removeSystemPromptPlaceholder(message.id);
|
||||
|
||||
if (conversationDeleted) {
|
||||
goto(`#/`);
|
||||
goto(ROUTES.START);
|
||||
}
|
||||
} else {
|
||||
chatActions.delete(message);
|
||||
@@ -271,7 +272,7 @@
|
||||
const conversationDeleted = await chatStore.removeSystemPromptPlaceholder(message.id);
|
||||
isEditing = false;
|
||||
if (conversationDeleted) {
|
||||
goto(`#/`);
|
||||
goto(ROUTES.START);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
+1
-6
@@ -85,10 +85,6 @@
|
||||
editCtx.setUploadedFiles([...editCtx.editedUploadedFiles, ...processed]);
|
||||
}
|
||||
|
||||
function handleUploadedFilesChange(files: ChatUploadedFile[]) {
|
||||
editCtx.setUploadedFiles(files);
|
||||
}
|
||||
|
||||
$effect(() => {
|
||||
chatStore.setEditModeActive(handleFilesAdd);
|
||||
|
||||
@@ -104,7 +100,7 @@
|
||||
<ChatForm
|
||||
value={editCtx.editedContent}
|
||||
attachments={editCtx.editedExtras}
|
||||
uploadedFiles={editCtx.editedUploadedFiles}
|
||||
bind:uploadedFiles={editCtx.editedUploadedFiles}
|
||||
placeholder="Edit your message..."
|
||||
showMcpPromptButton
|
||||
showAddButton={editCtx.messageRole === MessageRole.USER}
|
||||
@@ -112,7 +108,6 @@
|
||||
onValueChange={editCtx.setContent}
|
||||
onAttachmentRemove={handleAttachmentRemove}
|
||||
onUploadedFileRemove={handleUploadedFileRemove}
|
||||
onUploadedFilesChange={handleUploadedFilesChange}
|
||||
onFilesAdd={handleFilesAdd}
|
||||
onSubmit={handleSubmit}
|
||||
/>
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
<script lang="ts">
|
||||
import * as AlertDialog from '$lib/components/ui/alert-dialog';
|
||||
import { Checkbox } from '$lib/components/ui/checkbox';
|
||||
import Label from '$lib/components/ui/label/label.svelte';
|
||||
import { Shield, ShieldOff } from '@lucide/svelte';
|
||||
|
||||
let {
|
||||
open = $bindable(),
|
||||
includeSensitiveData = $bindable(false),
|
||||
onCancel,
|
||||
onConfirm
|
||||
}: {
|
||||
open: boolean;
|
||||
includeSensitiveData: boolean;
|
||||
onCancel: () => void;
|
||||
onConfirm: () => void;
|
||||
} = $props();
|
||||
|
||||
function handleOpenChange(newOpen: boolean) {
|
||||
if (!newOpen) {
|
||||
onCancel();
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
<AlertDialog.Root {open} onOpenChange={handleOpenChange}>
|
||||
<AlertDialog.Content>
|
||||
<AlertDialog.Header>
|
||||
<AlertDialog.Title class="flex items-center gap-2">
|
||||
{#if includeSensitiveData}
|
||||
<ShieldOff class="h-5 w-5 text-destructive" />
|
||||
{:else}
|
||||
<Shield class="h-5 w-5 text-destructive" />
|
||||
{/if}
|
||||
Export Settings
|
||||
</AlertDialog.Title>
|
||||
|
||||
<AlertDialog.Description>
|
||||
{#if includeSensitiveData}
|
||||
<p class="text-amber-500">
|
||||
Warning: This export will include sensitive data such as API keys and MCP server custom
|
||||
headers (e.g., authorization tokens). Do not share this file with anyone you don't
|
||||
trust.
|
||||
</p>
|
||||
{:else}
|
||||
<p>
|
||||
Sensitive data (API keys, MCP server custom headers) will not be included in the export
|
||||
to protect your credentials.
|
||||
</p>
|
||||
{/if}
|
||||
</AlertDialog.Description>
|
||||
</AlertDialog.Header>
|
||||
|
||||
<div class="flex items-center gap-2 py-2">
|
||||
<Checkbox id="include-sensitive" bind:checked={includeSensitiveData} />
|
||||
|
||||
<Label
|
||||
for="include-sensitive"
|
||||
class="text-sm leading-none peer-disabled:cursor-not-allowed peer-disabled:opacity-70"
|
||||
>
|
||||
{#if includeSensitiveData}
|
||||
<span class="text-destructive">Include sensitive data (not recommended)</span>
|
||||
{:else}
|
||||
<span>Include sensitive data</span>
|
||||
{/if}
|
||||
</Label>
|
||||
</div>
|
||||
|
||||
<AlertDialog.Footer>
|
||||
<AlertDialog.Cancel onclick={onCancel}>Cancel</AlertDialog.Cancel>
|
||||
<AlertDialog.Action
|
||||
onclick={onConfirm}
|
||||
class="bg-destructive text-white hover:bg-destructive/80"
|
||||
>
|
||||
{#if includeSensitiveData}
|
||||
Export Anyway
|
||||
{:else}
|
||||
Export Without Sensitive Data
|
||||
{/if}
|
||||
</AlertDialog.Action>
|
||||
</AlertDialog.Footer>
|
||||
</AlertDialog.Content>
|
||||
</AlertDialog.Root>
|
||||
@@ -18,6 +18,37 @@
|
||||
*/
|
||||
export { default as DialogMcpServerAddNew } from './DialogMcpServerAddNew.svelte';
|
||||
|
||||
/**
|
||||
* **DialogExportSettings** - Settings export dialog with sensitive data warning
|
||||
*
|
||||
* Dialog for exporting settings with an option to include or exclude
|
||||
* sensitive data (API keys, MCP server custom headers). Defaults to excluding
|
||||
* sensitive data for security. User must explicitly opt-in to include them.
|
||||
*
|
||||
* **Architecture:**
|
||||
* - Uses ShadCN AlertDialog
|
||||
* - Checkbox to toggle sensitive data inclusion (defaults to false)
|
||||
* - Warning icon and message when sensitive data is included
|
||||
* - Destructive variant for the action button when exporting with sensitive data
|
||||
*
|
||||
* **Features:**
|
||||
* - Secure default: sensitive data excluded by default
|
||||
* - User must explicitly opt-in to include sensitive data
|
||||
* - Visual warning (ShieldOff icon) when sensitive data is included
|
||||
* - Different action text based on sensitive data state
|
||||
*
|
||||
* @example
|
||||
* ```svelte
|
||||
* <DialogExportSettings
|
||||
* bind:open={showExportSettings}
|
||||
* bind:includeSensitiveData
|
||||
* onConfirm={handleSettingsExport}
|
||||
* onCancel={() => showExportSettings = false}
|
||||
* />
|
||||
* ```
|
||||
*/
|
||||
export { default as DialogExportSettings } from './DialogExportSettings.svelte';
|
||||
|
||||
/**
|
||||
*
|
||||
* CONFIRMATION DIALOGS
|
||||
|
||||
+4
-2
@@ -11,6 +11,8 @@
|
||||
import ScrollArea from '$lib/components/ui/scroll-area/scroll-area.svelte';
|
||||
import * as Sidebar from '$lib/components/ui/sidebar';
|
||||
import Input from '$lib/components/ui/input/input.svelte';
|
||||
import { ROUTES } from '$lib/constants/routes';
|
||||
import { RouterService } from '$lib/services/router.service';
|
||||
import {
|
||||
conversationsStore,
|
||||
conversations,
|
||||
@@ -159,7 +161,7 @@
|
||||
}
|
||||
|
||||
handleMobileSidebarItemClick();
|
||||
await goto(`#/chat/${id}`);
|
||||
await goto(RouterService.chat(id));
|
||||
}
|
||||
|
||||
function handleStopGeneration(id: string) {
|
||||
@@ -171,7 +173,7 @@
|
||||
<ScrollArea class="h-full flex-1">
|
||||
<Sidebar.Header class="gap-4 bg-sidebar/50 p-3 backdrop-blur-lg md:pt-4 md:pb-2">
|
||||
<div class="flex items-center justify-between">
|
||||
<a href="#/" onclick={handleMobileSidebarItemClick}>
|
||||
<a href={ROUTES.START} onclick={handleMobileSidebarItemClick}>
|
||||
<h1 class="inline-flex items-center gap-1 px-2 text-xl font-semibold">{APP_NAME}</h1>
|
||||
</a>
|
||||
|
||||
|
||||
+2
-1
@@ -11,6 +11,7 @@
|
||||
import { DropdownMenuActions } from '$lib/components/app';
|
||||
import * as Tooltip from '$lib/components/ui/tooltip';
|
||||
import { FORK_TREE_DEPTH_PADDING } from '$lib/constants';
|
||||
import { RouterService } from '$lib/services/router.service';
|
||||
import { getAllLoadingChats } from '$lib/stores/chat.svelte';
|
||||
import { conversationsStore } from '$lib/stores/conversations.svelte';
|
||||
import { TruncatedText } from '$lib/components/app';
|
||||
@@ -113,7 +114,7 @@
|
||||
<Tooltip.Root>
|
||||
<Tooltip.Trigger>
|
||||
<a
|
||||
href="#/chat/{conversation.forkedFromConversationId}"
|
||||
href={RouterService.chat(conversation.forkedFromConversationId)}
|
||||
class="flex shrink-0 items-center text-muted-foreground transition-colors hover:text-foreground"
|
||||
>
|
||||
<GitBranch class="h-3.5 w-3.5" />
|
||||
|
||||
@@ -7,6 +7,8 @@
|
||||
import Label from '$lib/components/ui/label/label.svelte';
|
||||
import { serverStore, serverLoading } from '$lib/stores/server.svelte';
|
||||
import { config, settingsStore } from '$lib/stores/settings.svelte';
|
||||
import { SETTINGS_KEYS } from '$lib/constants';
|
||||
import { ROUTES } from '$lib/constants/routes';
|
||||
import { fade, fly, scale } from 'svelte/transition';
|
||||
import { KeyboardKey } from '$lib/enums';
|
||||
|
||||
@@ -63,7 +65,7 @@
|
||||
|
||||
try {
|
||||
// Update the API key in settings first
|
||||
settingsStore.updateConfig('apiKey', apiKeyInput.trim());
|
||||
settingsStore.updateConfig(SETTINGS_KEYS.API_KEY, apiKeyInput.trim());
|
||||
|
||||
// Test the API key by making a real request to the server
|
||||
const response = await fetch(`${base}/props`, {
|
||||
@@ -79,7 +81,7 @@
|
||||
|
||||
// Show success state briefly, then navigate to home
|
||||
setTimeout(() => {
|
||||
goto(`#/`);
|
||||
goto(ROUTES.START);
|
||||
}, 1000);
|
||||
} else {
|
||||
// API key is invalid - User Story A
|
||||
|
||||
+29
-9
@@ -1,11 +1,11 @@
|
||||
<script lang="ts">
|
||||
import SettingsChatFooter from './SettingsChatFooter.svelte';
|
||||
import SettingsChatFields from './SettingsChatFields.svelte';
|
||||
import SettingsChatToolsTab from './SettingsChatToolsTab.svelte';
|
||||
import SettingsChatImportExportTab from './SettingsChatImportExportTab.svelte';
|
||||
import {
|
||||
SettingsChatDesktopSidebar,
|
||||
SettingsChatMobileHeader
|
||||
SettingsChatFields,
|
||||
SettingsChatImportExportTab,
|
||||
SettingsChatMobileHeader,
|
||||
SettingsChatToolsTab,
|
||||
SettingsFooter
|
||||
} from '$lib/components/app/settings';
|
||||
import { config, settingsStore } from '$lib/stores/settings.svelte';
|
||||
import {
|
||||
@@ -15,6 +15,7 @@
|
||||
SETTINGS_SECTION_TITLES,
|
||||
type SettingsSection
|
||||
} from '$lib/constants';
|
||||
import { RouterService } from '$lib/services/router.service';
|
||||
import { setMode } from 'mode-watcher';
|
||||
import { ColorMode } from '$lib/enums/ui';
|
||||
import { fade } from 'svelte/transition';
|
||||
@@ -22,7 +23,8 @@
|
||||
import { page } from '$app/state';
|
||||
import { setChatSettingsConfigContext } from '$lib/contexts';
|
||||
import { settingsReferrer } from '$lib/stores/settings-referrer.svelte';
|
||||
|
||||
import { modelsStore } from '$lib/stores/models.svelte';
|
||||
import { isRouterMode } from '$lib/stores/server.svelte';
|
||||
interface Props {
|
||||
initialSection?: string;
|
||||
getSectionHref?: (section: SettingsSection) => string;
|
||||
@@ -33,14 +35,30 @@
|
||||
let activeSlug = $derived(
|
||||
initialSection ?? (page.params as Record<string, string | undefined>).section ?? 'general'
|
||||
);
|
||||
|
||||
let currentSection = $derived(
|
||||
SETTINGS_CHAT_SECTIONS.find((section) => section.slug === activeSlug) ||
|
||||
SETTINGS_CHAT_SECTIONS[0]
|
||||
);
|
||||
|
||||
let localConfig: SettingsConfigType = $state({ ...config() });
|
||||
|
||||
let mobileHeader: { updateCarousel: () => void } | undefined;
|
||||
|
||||
let fetchInitiated = false;
|
||||
|
||||
$effect(() => {
|
||||
if (isRouterMode() && currentSection.fields && !fetchInitiated) {
|
||||
fetchInitiated = true;
|
||||
|
||||
void modelsStore
|
||||
.fetch()
|
||||
.then(() => modelsStore.fetchRouterModels())
|
||||
.then(() => modelsStore.fetchModalitiesForLoadedModels())
|
||||
.then(() => modelsStore.ensureFirstModelSelected());
|
||||
}
|
||||
});
|
||||
|
||||
function handleThemeChange(newTheme: string) {
|
||||
localConfig.theme = newTheme;
|
||||
setMode(newTheme as ColorMode);
|
||||
@@ -110,13 +128,15 @@
|
||||
<SettingsChatDesktopSidebar
|
||||
sections={SETTINGS_CHAT_SECTIONS}
|
||||
isActive={(section: SettingsSection) => section.slug === activeSlug}
|
||||
getHref={getSectionHref ?? ((section: SettingsSection) => `#/settings/chat/${section.slug}`)}
|
||||
getHref={getSectionHref ??
|
||||
((section: SettingsSection) => RouterService.settings(section.slug))}
|
||||
/>
|
||||
|
||||
<SettingsChatMobileHeader
|
||||
sections={SETTINGS_CHAT_SECTIONS}
|
||||
isActive={(section: SettingsSection) => section.slug === activeSlug}
|
||||
getHref={getSectionHref ?? ((section: SettingsSection) => `#/settings/chat/${section.slug}`)}
|
||||
getHref={getSectionHref ??
|
||||
((section: SettingsSection) => RouterService.settings(section.slug))}
|
||||
bind:this={mobileHeader}
|
||||
/>
|
||||
|
||||
@@ -149,7 +169,7 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<SettingsChatFooter onReset={handleReset} onSave={handleSave} />
|
||||
<SettingsFooter onReset={handleReset} onSave={handleSave} />
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
+20
-13
@@ -9,9 +9,9 @@
|
||||
import { SettingsFieldType } from '$lib/enums/settings';
|
||||
import { settingsStore } from '$lib/stores/settings.svelte';
|
||||
import { serverStore } from '$lib/stores/server.svelte';
|
||||
import { modelsStore, selectedModelName } from '$lib/stores/models.svelte';
|
||||
import { modelsStore, selectedModelName, propsCacheVersion } from '$lib/stores/models.svelte';
|
||||
import { normalizeFloatingPoint } from '$lib/utils/precision';
|
||||
import SettingsChatParameterSourceIndicator from './SettingsChatParameterSourceIndicator.svelte';
|
||||
import { SettingsChatParameterSourceIndicator } from '$lib/components/app/settings';
|
||||
import type { Component } from 'svelte';
|
||||
|
||||
interface Props {
|
||||
@@ -23,13 +23,19 @@
|
||||
|
||||
let { fields, localConfig, onConfigChange, onThemeChange }: Props = $props();
|
||||
|
||||
// server sampling defaults for placeholders
|
||||
let sp = $derived.by(() => {
|
||||
let currentModelParams = $derived.by(() => {
|
||||
propsCacheVersion();
|
||||
|
||||
if (serverStore.isRouterMode) {
|
||||
const m = selectedModelName();
|
||||
if (m) {
|
||||
const p = modelsStore.getModelProps(m);
|
||||
return (p?.default_generation_settings?.params ?? {}) as Record<string, unknown>;
|
||||
const currentModelName = selectedModelName();
|
||||
|
||||
if (currentModelName) {
|
||||
const currentModelProps = modelsStore.getModelProps(currentModelName);
|
||||
|
||||
return (currentModelProps?.default_generation_settings?.params ?? {}) as Record<
|
||||
string,
|
||||
unknown
|
||||
>;
|
||||
}
|
||||
}
|
||||
return (serverStore.defaultParams ?? {}) as Record<string, unknown>;
|
||||
@@ -40,7 +46,7 @@
|
||||
<div class="space-y-2">
|
||||
{#if field.type === SettingsFieldType.INPUT}
|
||||
{@const currentValue = String(localConfig[field.key] ?? '')}
|
||||
{@const serverDefault = sp[field.key]}
|
||||
{@const serverDefault = currentModelParams[field.key]}
|
||||
{@const isCustomRealTime = (() => {
|
||||
if (serverDefault == null) return false;
|
||||
if (currentValue === '') return false;
|
||||
@@ -78,8 +84,8 @@
|
||||
// Update local config immediately for real-time badge feedback
|
||||
onConfigChange(field.key, e.currentTarget.value);
|
||||
}}
|
||||
placeholder={sp[field.key] != null
|
||||
? `Default: ${normalizeFloatingPoint(sp[field.key])}`
|
||||
placeholder={currentModelParams[field.key] != null
|
||||
? `Default: ${normalizeFloatingPoint(currentModelParams[field.key])}`
|
||||
: ''}
|
||||
class="w-full {isCustomRealTime ? 'pr-8' : ''}"
|
||||
/>
|
||||
@@ -133,7 +139,8 @@
|
||||
<Checkbox
|
||||
id="showSystemMessage"
|
||||
checked={Boolean(localConfig.showSystemMessage ?? true)}
|
||||
onCheckedChange={(checked) => onConfigChange('showSystemMessage', Boolean(checked))}
|
||||
onCheckedChange={(checked) =>
|
||||
onConfigChange(SETTINGS_KEYS.SHOW_SYSTEM_MESSAGE, Boolean(checked))}
|
||||
/>
|
||||
|
||||
<Label for="showSystemMessage" class="cursor-pointer text-sm font-normal">
|
||||
@@ -147,7 +154,7 @@
|
||||
opt.value === localConfig[field.key]
|
||||
)}
|
||||
{@const currentValue = localConfig[field.key]}
|
||||
{@const serverDefault = sp[field.key]}
|
||||
{@const serverDefault = currentModelParams[field.key]}
|
||||
{@const isCustomRealTime = (() => {
|
||||
if (serverDefault == null) return false;
|
||||
if (currentValue === '' || currentValue === undefined) return false;
|
||||
|
||||
+62
@@ -0,0 +1,62 @@
|
||||
<script lang="ts">
	import type { Component } from 'svelte';
	import { Button, type ButtonVariant } from '$lib/components/ui/button';

	/** Recap of the most recent action, listed underneath the button. */
	interface ActionSummary {
		show: boolean;
		verb: string;
		items: DatabaseConversation[];
	}

	/**
	 * Props for one import/export section: a heading, a description paragraph,
	 * a single action button with a leading icon, and an optional recap list.
	 */
	interface Props {
		title: string;
		description: string;
		IconComponent: Component;
		buttonText: string;
		onclick: () => void;
		titleClass?: string;
		buttonVariant?: ButtonVariant;
		buttonClass?: string;
		wrapperClass?: string;
		summary?: ActionSummary;
	}

	// Number of conversation names listed before the rest is folded into "... and N more".
	const PREVIEW_LIMIT = 10;

	let {
		title,
		description,
		IconComponent,
		buttonText,
		onclick,
		titleClass,
		buttonVariant,
		buttonClass,
		wrapperClass,
		summary
	}: Props = $props();

	// Fall back to the outline, left-aligned button when the caller does not override it.
	let sectionButtonVariant = $derived(buttonVariant ?? 'outline');
	let sectionButtonClass = $derived(buttonClass ?? 'justify-start justify-self-start md:w-auto');
</script>

<div class="grid gap-1 {wrapperClass ?? ''}">
	<h4 class="mt-0 mb-2 text-sm font-medium {titleClass ?? ''}">{title}</h4>

	<p class="mb-4 text-sm text-muted-foreground">{description}</p>

	<Button class={sectionButtonClass} {onclick} variant={sectionButtonVariant}>
		<IconComponent class="mr-2 h-4 w-4" />

		{buttonText}
	</Button>

	{#if summary?.show && summary.items.length > 0}
		{@const total = summary.items.length}
		{@const overflow = total - PREVIEW_LIMIT}

		<div class="mt-4 grid overflow-x-auto rounded-lg border border-border/50 bg-muted/30 p-4">
			<h5 class="mb-2 text-sm font-medium">
				{summary.verb}
				{total} conversation{total === 1 ? '' : 's'}
			</h5>

			<ul class="space-y-1 text-sm text-muted-foreground">
				{#each summary.items.slice(0, PREVIEW_LIMIT) as conversation (conversation.id)}
					<li class="truncate">• {conversation.name || 'Untitled conversation'}</li>
				{/each}

				{#if overflow > 0}
					<li class="italic">... and {overflow} more</li>
				{/if}
			</ul>
		</div>
	{/if}
</div>
|
||||
+138
-93
@@ -1,21 +1,18 @@
|
||||
<script lang="ts">
|
||||
import type { Component } from 'svelte';
|
||||
import { Download, Upload, Trash2 } from '@lucide/svelte';
|
||||
import { Button, type ButtonVariant } from '$lib/components/ui/button';
|
||||
import { DialogConversationSelection, DialogConfirmation } from '$lib/components/app';
|
||||
import {
|
||||
DialogConversationSelection,
|
||||
DialogConfirmation,
|
||||
DialogExportSettings
|
||||
} from '$lib/components/app';
|
||||
import { createMessageCountMap } from '$lib/utils';
|
||||
import { settingsStore } from '$lib/stores/settings.svelte';
|
||||
import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
|
||||
import { toast } from 'svelte-sonner';
|
||||
import { fade } from 'svelte/transition';
|
||||
import { ConversationSelectionMode, HtmlInputType, FileExtensionText } from '$lib/enums';
|
||||
|
||||
interface SectionOpts {
|
||||
wrapperClass?: string;
|
||||
titleClass?: string;
|
||||
buttonVariant?: ButtonVariant;
|
||||
buttonClass?: string;
|
||||
summary?: { show: boolean; verb: string; items: DatabaseConversation[] };
|
||||
}
|
||||
import SettingsChatImportExportSection from './SettingsChatImportExportSection.svelte';
|
||||
import SettingsGroup from '$lib/components/app/settings/SettingsGroup.svelte';
|
||||
|
||||
let exportedConversations = $state<DatabaseConversation[]>([]);
|
||||
let importedConversations = $state<DatabaseConversation[]>([]);
|
||||
@@ -33,6 +30,82 @@
|
||||
// Delete functionality state
|
||||
let showDeleteDialog = $state(false);
|
||||
|
||||
// Settings import/export state
|
||||
let showSettingsExportSummary = $state(false);
|
||||
let showSettingsImportSummary = $state(false);
|
||||
let showSettingsExportDialog = $state(false);
|
||||
let includeSensitiveData = $state(false);
|
||||
|
||||
/**
 * Click handler for the "Export settings" button.
 *
 * Resets the include-sensitive-data flag to `false` and opens the export
 * dialog; the export itself runs in `handleSettingsExportConfirm`.
 */
function handleSettingsExport() {
	// Every opening of the dialog starts with the flag cleared.
	includeSensitiveData = false;
	showSettingsExportDialog = true;
}
|
||||
|
||||
/**
 * Confirm handler of the settings export dialog.
 *
 * Closes the dialog, serialises `settingsStore.exportSettings(includeSensitiveData)`
 * as pretty-printed JSON and triggers a browser download named
 * `llama_settings_<YYYY-MM-DD>.json` through a temporary anchor element.
 * On success the export summary flag is raised, the import one is cleared and
 * a success toast is shown; any failure is logged and reported with an error toast.
 *
 * The temporary anchor and the object URL are released in `finally`, so they
 * are cleaned up even when a step after their creation throws.
 */
function handleSettingsExportConfirm() {
	showSettingsExportDialog = false;

	let url: string | undefined;
	let anchor: HTMLAnchorElement | undefined;

	try {
		const data = settingsStore.exportSettings(includeSensitiveData);
		const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
		url = URL.createObjectURL(blob);

		anchor = document.createElement('a');
		anchor.href = url;
		// ISO timestamp up to the 'T' is the calendar date (UTC).
		anchor.download = `llama_settings_${new Date().toISOString().split('T')[0]}.json`;
		document.body.appendChild(anchor);
		anchor.click();

		showSettingsExportSummary = true;
		showSettingsImportSummary = false;
		toast.success('Settings exported');
	} catch (err) {
		console.error('Failed to export settings:', err);
		toast.error('Failed to export settings');
	} finally {
		// remove() is a no-op when the anchor was never attached.
		anchor?.remove();

		if (url) {
			URL.revokeObjectURL(url);
		}
	}
}
|
||||
|
||||
/** Cancel handler of the settings export dialog: closes the dialog and does nothing else. */
function handleSettingsExportCancel() {
|
||||
showSettingsExportDialog = false;
|
||||
}
|
||||
|
||||
/**
 * Click handler for the "Import settings" button.
 *
 * Creates a transient file input restricted to `FileExtensionText.JSON` and
 * clicks it programmatically. Once a file is chosen, its text is parsed as
 * JSON and handed to `settingsStore.importSettings`. On success the import
 * summary flag is raised, the export one is cleared and a success toast is shown.
 *
 * The payload is rejected with an error toast, and the store is not called,
 * unless it is an object whose `config` property is itself a non-array object.
 * Read or parse failures and failures to open the picker are logged and
 * reported with an error toast.
 */
function handleSettingsImport() {
	try {
		const input = document.createElement('input');
		input.type = HtmlInputType.FILE;
		input.accept = FileExtensionText.JSON;

		input.onchange = async (e) => {
			const file = (e.target as HTMLInputElement)?.files?.[0];
			if (!file) return;

			try {
				const text = await file.text();
				const data = JSON.parse(text);

				// A truthy check alone would let `"config": 1`, a string or an
				// array through to the store; require a real object.
				const hasConfigObject =
					Boolean(data) &&
					typeof data === 'object' &&
					Boolean(data.config) &&
					typeof data.config === 'object' &&
					!Array.isArray(data.config);

				if (!hasConfigObject) {
					toast.error('Invalid settings file: missing config');
					return;
				}

				settingsStore.importSettings(data);

				showSettingsImportSummary = true;
				showSettingsExportSummary = false;
				toast.success('Settings imported successfully');
			} catch (err) {
				console.error('Failed to import settings:', err);
				toast.error('Failed to import settings');
			}
		};

		input.click();
	} catch (err) {
		console.error('Failed to open file picker:', err);
		toast.error('Failed to open file picker');
	}
}
|
||||
|
||||
async function handleExportClick() {
|
||||
try {
|
||||
const allConversations = conversations();
|
||||
@@ -181,94 +254,66 @@
|
||||
}
|
||||
</script>
|
||||
|
||||
{#snippet summaryList(show: boolean, verb: string, items: DatabaseConversation[])}
|
||||
{#if show && items.length > 0}
|
||||
<div class="mt-4 grid overflow-x-auto rounded-lg border border-border/50 bg-muted/30 p-4">
|
||||
<h5 class="mb-2 text-sm font-medium">
|
||||
{verb}
|
||||
{items.length} conversation{items.length === 1 ? '' : 's'}
|
||||
</h5>
|
||||
<div class="space-y-12" in:fade={{ duration: 150 }}>
|
||||
<SettingsGroup title="Conversations">
|
||||
<SettingsChatImportExportSection
|
||||
title="Export"
|
||||
description="Download your conversations as a JSON file. This includes all messages, attachments, and conversation history."
|
||||
IconComponent={Download}
|
||||
buttonText="Export conversations"
|
||||
onclick={handleExportClick}
|
||||
summary={{ show: showExportSummary, verb: 'Exported', items: exportedConversations }}
|
||||
/>
|
||||
|
||||
<ul class="space-y-1 text-sm text-muted-foreground">
|
||||
{#each items.slice(0, 10) as conv (conv.id)}
|
||||
<li class="truncate">• {conv.name || 'Untitled conversation'}</li>
|
||||
{/each}
|
||||
<SettingsChatImportExportSection
|
||||
title="Import"
|
||||
description="Import one or more conversations from a previously exported JSON file. This will merge with your existing conversations."
|
||||
IconComponent={Upload}
|
||||
buttonText="Import conversations"
|
||||
onclick={handleImportClick}
|
||||
summary={{ show: showImportSummary, verb: 'Imported', items: importedConversations }}
|
||||
/>
|
||||
|
||||
{#if items.length > 10}
|
||||
<li class="italic">... and {items.length - 10} more</li>
|
||||
{/if}
|
||||
</ul>
|
||||
</div>
|
||||
{/if}
|
||||
{/snippet}
|
||||
<SettingsChatImportExportSection
|
||||
title="Delete All"
|
||||
description="Permanently delete all conversations and their messages. This action cannot be undone. Consider exporting your conversations first if you want to keep a backup."
|
||||
IconComponent={Trash2}
|
||||
buttonText="Delete all conversations"
|
||||
onclick={handleDeleteAllClick}
|
||||
titleClass="text-destructive"
|
||||
buttonVariant="destructive"
|
||||
buttonClass="text-destructive-foreground justify-start justify-self-start bg-destructive hover:bg-destructive/80 md:w-auto"
|
||||
/>
|
||||
</SettingsGroup>
|
||||
|
||||
{#snippet section(
|
||||
title: string,
|
||||
description: string,
|
||||
IconComponent: Component,
|
||||
buttonText: string,
|
||||
onclick: () => void,
|
||||
opts: SectionOpts
|
||||
)}
|
||||
{@const buttonClass = opts?.buttonClass ?? 'justify-start justify-self-start md:w-auto'}
|
||||
{@const buttonVariant = opts?.buttonVariant ?? 'outline'}
|
||||
<div class="grid gap-1 {opts?.wrapperClass ?? ''}">
|
||||
<h4 class="mt-0 mb-2 text-sm font-medium {opts?.titleClass ?? ''}">{title}</h4>
|
||||
<SettingsGroup title="Settings">
|
||||
<SettingsChatImportExportSection
|
||||
title="Export"
|
||||
description="Export your chat settings and preferences as a JSON file."
|
||||
IconComponent={Download}
|
||||
buttonText="Export settings"
|
||||
onclick={handleSettingsExport}
|
||||
summary={{ show: showSettingsExportSummary, verb: 'Exported', items: [] }}
|
||||
/>
|
||||
|
||||
<p class="mb-4 text-sm text-muted-foreground">{description}</p>
|
||||
|
||||
<Button class={buttonClass} {onclick} variant={buttonVariant}>
|
||||
<IconComponent class="mr-2 h-4 w-4" />
|
||||
|
||||
{buttonText}
|
||||
</Button>
|
||||
|
||||
{#if opts?.summary}
|
||||
{@render summaryList(opts.summary.show, opts.summary.verb, opts.summary.items)}
|
||||
{/if}
|
||||
</div>
|
||||
{/snippet}
|
||||
|
||||
<div class="space-y-6" in:fade={{ duration: 150 }}>
|
||||
<div class="space-y-6">
|
||||
{@render section(
|
||||
'Export Conversations',
|
||||
'Download all your conversations as a JSON file. This includes all messages, attachments, and conversation history.',
|
||||
Download,
|
||||
'Export conversations',
|
||||
handleExportClick,
|
||||
{ summary: { show: showExportSummary, verb: 'Exported', items: exportedConversations } }
|
||||
)}
|
||||
|
||||
{@render section(
|
||||
'Import Conversations',
|
||||
'Import one or more conversations from a previously exported JSON file. This will merge with your existing conversations.',
|
||||
Upload,
|
||||
'Import conversations',
|
||||
handleImportClick,
|
||||
{
|
||||
wrapperClass: 'border-t border-border/30 pt-6',
|
||||
summary: { show: showImportSummary, verb: 'Imported', items: importedConversations }
|
||||
}
|
||||
)}
|
||||
|
||||
{@render section(
|
||||
'Delete All Conversations',
|
||||
'Permanently delete all conversations and their messages. This action cannot be undone. Consider exporting your conversations first if you want to keep a backup.',
|
||||
Trash2,
|
||||
'Delete all conversations',
|
||||
handleDeleteAllClick,
|
||||
{
|
||||
wrapperClass: 'border-t border-border/30 pt-4',
|
||||
titleClass: 'text-destructive',
|
||||
buttonVariant: 'destructive',
|
||||
buttonClass:
|
||||
'text-destructive-foreground justify-start justify-self-start bg-destructive hover:bg-destructive/80 md:w-auto'
|
||||
}
|
||||
)}
|
||||
</div>
|
||||
<SettingsChatImportExportSection
|
||||
title="Import"
|
||||
description="Import chat settings from a previously exported JSON file. This will merge with your existing settings."
|
||||
IconComponent={Upload}
|
||||
buttonText="Import settings"
|
||||
onclick={handleSettingsImport}
|
||||
summary={{ show: showSettingsImportSummary, verb: 'Imported', items: [] }}
|
||||
/>
|
||||
</SettingsGroup>
|
||||
</div>
|
||||
|
||||
<DialogExportSettings
|
||||
bind:open={showSettingsExportDialog}
|
||||
bind:includeSensitiveData
|
||||
onConfirm={handleSettingsExportConfirm}
|
||||
onCancel={handleSettingsExportCancel}
|
||||
/>
|
||||
|
||||
<DialogConversationSelection
|
||||
conversations={availableConversations}
|
||||
{messageCountMap}
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
<script lang="ts">
	import type { Snippet } from 'svelte';

	// Titled wrapper: renders `title` as a heading and the `children` snippet
	// inside a vertically spaced container below it.
	let {
		title,
		children
	}: {
		title: string;
		children: Snippet;
	} = $props();
</script>

<div>
	<h3 class="mb-6 text-base font-semibold">{title}</h3>

	<div class="space-y-8">
		{@render children()}
	</div>
</div>
|
||||
@@ -19,32 +19,6 @@ export { default as SettingsChatDesktopSidebar } from './SettingsChatDesktopSide
|
||||
*/
|
||||
export { default as SettingsChatMobileHeader } from './SettingsChatMobileHeader.svelte';
|
||||
|
||||
/**
|
||||
* Settings Import/Export panel.
|
||||
* Provides UI for importing and exporting chat conversations.
|
||||
*/
|
||||
export { default as SettingsChatImportExportTab } from './SettingsChat/SettingsChatImportExportTab.svelte';
|
||||
|
||||
/**
|
||||
* MCP Servers configuration panel.
|
||||
* Provides UI for managing Model Context Protocol (MCP) server connections.
|
||||
*/
|
||||
export { default as SettingsMcpServers } from './SettingsMcpServers.svelte';
|
||||
|
||||
/**
|
||||
* Footer with save/cancel buttons for settings panel. Positioned at bottom
|
||||
* of settings dialog. Save button commits form state to config store,
|
||||
* cancel button triggers reset and close.
|
||||
*/
|
||||
export { default as SettingsChatFooter } from './SettingsChat/SettingsChatFooter.svelte';
|
||||
|
||||
/**
|
||||
* Form fields renderer for individual settings. Generates appropriate input
|
||||
* components based on field type (text, number, select, checkbox, textarea).
|
||||
* Handles validation, help text display, and parameter source indicators.
|
||||
*/
|
||||
export { default as SettingsChatFields } from './SettingsChat/SettingsChatFields.svelte';
|
||||
|
||||
/**
|
||||
* Badge indicating parameter source for sampling settings. Shows one of:
|
||||
* - **Custom**: User has explicitly set this value (orange badge)
|
||||
@@ -54,6 +28,44 @@ export { default as SettingsChatFields } from './SettingsChat/SettingsChatFields
|
||||
*/
|
||||
export { default as SettingsChatParameterSourceIndicator } from './SettingsChat/SettingsChatParameterSourceIndicator.svelte';
|
||||
|
||||
/**
|
||||
* Section wrapper for settings panels. Displays a title heading with
|
||||
* child content in a structured layout.
|
||||
*/
|
||||
export { default as SettingsGroup } from './SettingsGroup.svelte';
|
||||
|
||||
/**
|
||||
* Footer with save/cancel buttons for settings panel. Positioned at bottom
|
||||
* of settings dialog. Save button commits form state to config store,
|
||||
* cancel button triggers reset and close.
|
||||
*/
|
||||
export { default as SettingsFooter } from './SettingsFooter.svelte';
|
||||
|
||||
/**
|
||||
* Settings Import/Export panel.
|
||||
* Provides UI for importing and exporting chat conversations and chat settings.
|
||||
*/
|
||||
export { default as SettingsChatImportExportTab } from './SettingsChat/SettingsChatImportExportTab.svelte';
|
||||
|
||||
/**
|
||||
* Section wrapper for import/export sections. Displays a title, description,
|
||||
* icon button, and optional summary of recent actions.
|
||||
*/
|
||||
export { default as SettingsChatImportExportSection } from './SettingsChat/SettingsChatImportExportSection.svelte';
|
||||
|
||||
/**
|
||||
* MCP Servers configuration panel.
|
||||
* Provides UI for managing Model Context Protocol (MCP) server connections.
|
||||
*/
|
||||
export { default as SettingsMcpServers } from './SettingsMcpServers.svelte';
|
||||
|
||||
/**
|
||||
* Form fields renderer for individual settings. Generates appropriate input
|
||||
* components based on field type (text, number, select, checkbox, textarea).
|
||||
* Handles validation, help text display, and parameter source indicators.
|
||||
*/
|
||||
export { default as SettingsChatFields } from './SettingsChat/SettingsChatFields.svelte';
|
||||
|
||||
/**
|
||||
* **SettingsChatToolsTab** - Tools configuration tab for chat settings
|
||||
*
|
||||
|
||||
@@ -29,10 +29,9 @@ export * from './message-export';
|
||||
export * from './model-id';
|
||||
export * from './precision';
|
||||
export * from './processing-info';
|
||||
export * from './settings-config';
|
||||
export * from './settings-fields';
|
||||
export * from './routes';
|
||||
export * from './settings-keys';
|
||||
export * from './settings-sections';
|
||||
export * from './settings-registry';
|
||||
export * from './supported-file-types';
|
||||
export * from './table-html-restorer';
|
||||
export * from './title-generation';
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
/** Name of the query-string parameter that `ROUTES.NEW_CHAT` sets to `true`. */
export const NEW_CHAT_PARAM = 'new_chat';
|
||||
|
||||
/** Settings section slugs — used for routes and navigation. */
|
||||
// `as const` keeps every slug typed as its exact string literal.
export const SETTINGS_SECTION_SLUGS = {
|
||||
GENERAL: 'general',
|
||||
DISPLAY: 'display',
|
||||
SAMPLING: 'sampling',
|
||||
PENALTIES: 'penalties',
|
||||
AGENTIC: 'agentic',
|
||||
DEVELOPER: 'developer',
|
||||
TOOLS: 'tools',
|
||||
IMPORT_EXPORT: 'import-export'
|
||||
} as const;
|
||||
|
||||
/**
 * Static route strings. Every path lives in the URL hash (`#/...`);
 * `NEW_CHAT` additionally puts a `?new_chat=true` query string before the hash.
 */
export const ROUTES = {
|
||||
/** Root — start of the app. */
|
||||
START: '#/',
|
||||
/** New chat — root with new chat query param. */
|
||||
NEW_CHAT: `?${NEW_CHAT_PARAM}=true#/`,
|
||||
/** Chat base — for dynamic chat URLs use RouterService. */
|
||||
CHAT: '#/chat',
|
||||
/** MCP servers. */
|
||||
MCP_SERVERS: '#/mcp-servers',
|
||||
/** Settings base — for dynamic settings URLs use RouterService. */
|
||||
SETTINGS: '#/settings'
|
||||
} as const;
|
||||
@@ -1,170 +0,0 @@
|
||||
import { ColorMode } from '$lib/enums/ui';
|
||||
import { Monitor, Moon, Sun } from '@lucide/svelte';
|
||||
import { TITLE } from './title-generation';
|
||||
|
||||
export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean | undefined> = {
|
||||
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value.
|
||||
// Do not use nested objects, keep it single level. Prefix the key if you need to group them.
|
||||
apiKey: '',
|
||||
systemMessage: '',
|
||||
showSystemMessage: true,
|
||||
theme: ColorMode.SYSTEM,
|
||||
showThoughtInProgress: true,
|
||||
disableReasoningParsing: false,
|
||||
excludeReasoningFromContext: false,
|
||||
showRawOutputSwitch: false,
|
||||
keepStatsVisible: false,
|
||||
showMessageStats: true,
|
||||
askForTitleConfirmation: false,
|
||||
titleGenerationUseFirstLine: false,
|
||||
titleGenerationUseLLM: false,
|
||||
titleGenerationPrompt: TITLE.DEFAULT_PROMPT,
|
||||
pasteLongTextToFileLen: 2500,
|
||||
copyTextAttachmentsAsPlainText: false,
|
||||
pdfAsImage: false,
|
||||
disableAutoScroll: false,
|
||||
renderUserContentAsMarkdown: false,
|
||||
alwaysShowSidebarOnDesktop: false,
|
||||
autoShowSidebarOnNewChat: true,
|
||||
sendOnEnter: true,
|
||||
autoMicOnEmpty: false,
|
||||
fullHeightCodeBlocks: false,
|
||||
showRawModelNames: false,
|
||||
mcpServers: '[]',
|
||||
mcpServerUsageStats: '{}', // JSON object: { [serverId]: usageCount }
|
||||
agenticMaxTurns: 10,
|
||||
agenticMaxToolPreviewLines: 25,
|
||||
showToolCallInProgress: false,
|
||||
alwaysShowAgenticTurns: false,
|
||||
// sampling params: empty means "use server default"
|
||||
// the server / preset is the source of truth
|
||||
// empty values are shown as placeholders from /props in the UI
|
||||
// and are NOT sent in API requests, letting the server decide
|
||||
samplers: '',
|
||||
backend_sampling: false,
|
||||
temperature: undefined,
|
||||
dynatemp_range: undefined,
|
||||
dynatemp_exponent: undefined,
|
||||
top_k: undefined,
|
||||
top_p: undefined,
|
||||
min_p: undefined,
|
||||
xtc_probability: undefined,
|
||||
xtc_threshold: undefined,
|
||||
typ_p: undefined,
|
||||
repeat_last_n: undefined,
|
||||
repeat_penalty: undefined,
|
||||
presence_penalty: undefined,
|
||||
frequency_penalty: undefined,
|
||||
dry_multiplier: undefined,
|
||||
dry_base: undefined,
|
||||
dry_allowed_length: undefined,
|
||||
dry_penalty_last_n: undefined,
|
||||
max_tokens: undefined,
|
||||
custom: '', // custom json-stringified object
|
||||
preEncodeConversation: false,
|
||||
// experimental features
|
||||
pyInterpreterEnabled: false,
|
||||
enableContinueGeneration: false
|
||||
};
|
||||
|
||||
export const SETTING_CONFIG_INFO: Record<string, string> = {
|
||||
apiKey: 'Set the API Key if you are using <code>--api-key</code> option for the server.',
|
||||
systemMessage: 'The starting message that defines how model should behave.',
|
||||
showSystemMessage: 'Display the system message at the top of each conversation.',
|
||||
theme:
|
||||
'Choose the color theme for the interface. You can choose between System (follows your device settings), Light, or Dark.',
|
||||
pasteLongTextToFileLen:
|
||||
'On pasting long text, it will be converted to a file. You can control the file length by setting the value of this parameter. Value 0 means disable.',
|
||||
copyTextAttachmentsAsPlainText:
|
||||
'When copying a message with text attachments, combine them into a single plain text string instead of a special format that can be pasted back as attachments.',
|
||||
samplers:
|
||||
'The order at which samplers are applied, in simplified way. Default is "top_k;typ_p;top_p;min_p;temperature": top_k->typ_p->top_p->min_p->temperature',
|
||||
backend_sampling:
|
||||
'Enable backend-based samplers. When enabled, supported samplers run on the accelerator backend for faster sampling.',
|
||||
temperature:
|
||||
'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
|
||||
dynatemp_range:
|
||||
'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
|
||||
dynatemp_exponent:
|
||||
'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
|
||||
top_k: 'Keeps only k top tokens.',
|
||||
top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
|
||||
min_p:
|
||||
'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
|
||||
xtc_probability:
|
||||
'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
|
||||
xtc_threshold:
|
||||
'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
|
||||
typ_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
|
||||
repeat_last_n: 'Last n tokens to consider for penalizing repetition',
|
||||
repeat_penalty: 'Controls the repetition of token sequences in the generated text',
|
||||
presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
|
||||
frequency_penalty: 'Limits tokens based on how often they appear in the output.',
|
||||
dry_multiplier:
|
||||
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
|
||||
dry_base:
|
||||
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
|
||||
dry_allowed_length:
|
||||
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
|
||||
dry_penalty_last_n:
|
||||
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
|
||||
max_tokens: 'The maximum number of token per output. Use -1 for infinite (no limit).',
|
||||
custom: 'Custom JSON parameters to send to the API. Must be valid JSON format.',
|
||||
showThoughtInProgress: 'Expand thought process by default when generating messages.',
|
||||
disableReasoningParsing:
|
||||
'Send reasoning_format=none so the server returns thinking tokens inline instead of extracting them into a separate field.',
|
||||
excludeReasoningFromContext:
|
||||
'Strip thinking from previous messages before sending. When off, thinking is sent back via the reasoning_content field so the model sees its own chain-of-thought across turns.',
|
||||
showRawOutputSwitch:
|
||||
'Show toggle button to display messages as plain text instead of Markdown-formatted content',
|
||||
keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
|
||||
showMessageStats:
|
||||
'Display generation statistics (tokens/second, token count, duration) below each assistant message.',
|
||||
askForTitleConfirmation:
|
||||
'Ask for confirmation before automatically changing conversation title when editing the first message.',
|
||||
titleGenerationUseFirstLine:
|
||||
'Use only the first non-empty line of the prompt to generate the conversation title.',
|
||||
titleGenerationUseLLM:
|
||||
'Use the LLM to automatically generate conversation titles based on the first message exchange.',
|
||||
titleGenerationPrompt:
|
||||
'Optional template for the title generation prompt. Use {{USER}} for the user message and {{ASSISTANT}} for the assistant message.',
|
||||
pdfAsImage:
|
||||
'Parse PDF as image instead of text. Automatically falls back to text processing for non-vision models.',
|
||||
disableAutoScroll:
|
||||
'Disable automatic scrolling while messages stream so you can control the viewport position manually.',
|
||||
renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.',
|
||||
alwaysShowSidebarOnDesktop:
|
||||
'Always keep the sidebar visible on desktop instead of auto-hiding it.',
|
||||
autoShowSidebarOnNewChat:
|
||||
'Automatically show sidebar when starting a new chat. Disable to keep the sidebar hidden until you click on it.',
|
||||
sendOnEnter:
|
||||
'Use Enter to send messages and Shift + Enter for new lines. When disabled, use Ctrl/Cmd + Enter.',
|
||||
autoMicOnEmpty:
|
||||
'Automatically show microphone button instead of send button when textarea is empty for models with audio modality support.',
|
||||
fullHeightCodeBlocks:
|
||||
'Always display code blocks at their full natural height, overriding any height limits.',
|
||||
showRawModelNames:
|
||||
'Display full raw model identifiers (e.g. "ggml-org/GLM-4.7-Flash-GGUF:Q8_0") instead of parsed names with badges.',
|
||||
mcpServers:
|
||||
'Configure MCP servers as a JSON list. Use the form in the MCP Client settings section to edit.',
|
||||
mcpServerUsageStats:
|
||||
'Usage statistics for MCP servers. Tracks how many times tools from each server have been used.',
|
||||
agenticMaxTurns:
|
||||
'Maximum number of tool execution cycles before stopping (prevents infinite loops).',
|
||||
agenticMaxToolPreviewLines:
|
||||
'Number of lines shown in tool output previews (last N lines). Only these previews and the final LLM response persist after the agentic loop completes.',
|
||||
showToolCallInProgress:
|
||||
'Automatically expand tool call details while executing and keep them expanded after completion.',
|
||||
pyInterpreterEnabled:
|
||||
'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.',
|
||||
preEncodeConversation:
|
||||
'After each response, re-submit the conversation to pre-fill the server KV cache. Makes the next turn faster since the prompt is already encoded while you read the response.',
|
||||
enableContinueGeneration:
|
||||
'Enable "Continue" button for assistant messages. Currently works only with non-reasoning models.'
|
||||
};
|
||||
|
||||
export const SETTINGS_COLOR_MODES_CONFIG = [
|
||||
{ value: ColorMode.SYSTEM, label: 'System', icon: Monitor },
|
||||
{ value: ColorMode.LIGHT, label: 'Light', icon: Sun },
|
||||
{ value: ColorMode.DARK, label: 'Dark', icon: Moon }
|
||||
];
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user