From 4c66df50ca11394f3a24a5746dcfd13403bba7ca Mon Sep 17 00:00:00 2001 From: shahondin1624 Date: Wed, 27 May 2026 08:08:12 +0200 Subject: [PATCH] hip: fix HIP graph capture crash for FA quantized KV f16 dequant The HIP branch in launch_fattn used raw hipMalloc / hipFree / hipStreamSynchronize(main_stream) for the K/V f16 dequant temp buffers (introduced to avoid pool retention OOM). These three calls are illegal during HIP graph capture and abort cudaStreamEndCapture with hipErrorStreamCaptureUnsupported, manifesting as the "ROCm error" at ggml-cuda.cu:104 when running models like Qwen3.6-27B-Dense and Qwen3.6-35B-A3B-Q8 with -fa 1 on gfx1151. Workaround was GGML_CUDA_DISABLE_GRAPHS=1. Probe cudaStreamIsCapturing on entry; when a capture is in progress use ggml_cuda_pool_alloc (legal in capture). Outside capture, behavior is unchanged so the OOM-avoidance the raw-alloc branch was added for is preserved. Also: ggml_cuda_error wrote only via GGML_LOG_ERROR, which llama-bench silences with llama_null_log_callback, so the actual hipError was invisible. Mirror the message to stderr with fflush so failures stay diagnosable from bench. Expand the inline CUDA_CHECK around cudaStreamEndCapture / cudaGraphInstantiate / cudaGraphLaunch to print which graph step failed plus the cgraph's first/last op for context. Co-Authored-By: Claude Opus 4.7 --- ggml/src/ggml-cuda/fattn-common.cuh | 27 ++++++++++++++++---- ggml/src/ggml-cuda/ggml-cuda.cu | 38 ++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 861d844670..e47a68e2c1 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -1303,22 +1303,39 @@ void launch_fattn( // For quantized KV dequant, this means the f16 temp buffer stays allocated, // consuming more VRAM than the quantized KV compression saves — causing OOM. // Using raw alloc+free ensures the memory is released after the kernel completes. + // + // Caveat: hipMalloc/hipFree/hipStreamSynchronize are ILLEGAL during HIP graph + // capture and abort cudaStreamEndCapture with hipErrorStreamCaptureUnsupported. + // While capturing we must fall back to the pool (the captured graph reuses + // the same temp buffer across launches, so OOM risk is bounded to a single + // peak-sized allocation that lives for the lifetime of the cuda_graph). + cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone; + (void)cudaStreamIsCapturing(main_stream, &capture_status); + const bool is_capturing = (capture_status != cudaStreamCaptureStatusNone); + struct hip_f16_alloc { half * ptr = nullptr; cudaStream_t stream; - hip_f16_alloc(cudaStream_t s) : stream(s) {} + bool use_pool = false; + ggml_cuda_pool_alloc pool_holder; + + hip_f16_alloc(cudaStream_t s, ggml_cuda_pool & p, bool use_pool_) : stream(s), use_pool(use_pool_), pool_holder(p) {} ~hip_f16_alloc() { - if (ptr) { + if (!use_pool && ptr) { cudaStreamSynchronize(stream); cudaFree(ptr); } } void alloc(size_t nelements) { - CUDA_CHECK(cudaMalloc(&ptr, nelements * sizeof(half))); + if (use_pool) { + ptr = pool_holder.alloc(nelements); + } else { + CUDA_CHECK(cudaMalloc(&ptr, nelements * sizeof(half))); + } } }; - hip_f16_alloc K_f16(main_stream); - hip_f16_alloc V_f16(main_stream); + hip_f16_alloc K_f16(main_stream, pool, is_capturing); + hip_f16_alloc V_f16(main_stream, pool, is_capturing); #else ggml_cuda_pool_alloc K_f16(pool); ggml_cuda_pool_alloc V_f16(pool); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 07d236940e..0022287fb0 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -100,6 +100,12 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg); GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_LOG_ERROR(" %s\n", stmt); + // Also write to stderr directly: llama-bench installs a null log callback + // that swallows GGML_LOG_ERROR output, leaving "ROCm error" with no diagnosis. + fprintf(stderr, GGML_CUDA_NAME " error: %s\n", msg); + fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line); + fprintf(stderr, " %s\n", stmt); + fflush(stderr); // abort with GGML_ABORT to get a stack trace GGML_ABORT(GGML_CUDA_NAME " error"); } @@ -4471,7 +4477,21 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud graph->graph = nullptr; } - CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &graph->graph)); + { + cudaError_t end_capture_err = cudaStreamEndCapture(cuda_ctx->stream(), &graph->graph); + if (end_capture_err != cudaSuccess) { + fprintf(stderr, "[ggml-cuda-graph] cudaStreamEndCapture failed: %s\n", + cudaGetErrorString(end_capture_err)); + fprintf(stderr, "[ggml-cuda-graph] cgraph had %d nodes; first op = %s (%s), last op = %s (%s)\n", + cgraph->n_nodes, + cgraph->n_nodes > 0 ? cgraph->nodes[0]->name : "", + cgraph->n_nodes > 0 ? ggml_op_name(cgraph->nodes[0]->op) : "", + cgraph->n_nodes > 0 ? cgraph->nodes[cgraph->n_nodes-1]->name : "", + cgraph->n_nodes > 0 ? ggml_op_name(cgraph->nodes[cgraph->n_nodes-1]->op) : ""); + fflush(stderr); + CUDA_CHECK(end_capture_err); + } + } graph_evaluated_or_captured = true; // CUDA graph has been captured std::lock_guard lock(ggml_cuda_lock); @@ -4486,13 +4506,25 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud if (use_cuda_graph) { ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); if (graph->instance == nullptr) { // Create executable graph from captured graph. - CUDA_CHECK(cudaGraphInstantiate(&graph->instance, graph->graph, NULL, NULL, 0)); + cudaError_t inst_err = cudaGraphInstantiate(&graph->instance, graph->graph, NULL, NULL, 0); + if (inst_err != cudaSuccess) { + fprintf(stderr, "[ggml-cuda-graph] cudaGraphInstantiate failed: %s\n", cudaGetErrorString(inst_err)); + fflush(stderr); + CUDA_CHECK(inst_err); + } } if (cuda_graph_update_required) { // Update graph executable ggml_cuda_graph_update_executable(cuda_ctx, graph_key); } // Launch graph - CUDA_CHECK(cudaGraphLaunch(graph->instance, cuda_ctx->stream())); + { + cudaError_t launch_err = cudaGraphLaunch(graph->instance, cuda_ctx->stream()); + if (launch_err != cudaSuccess) { + fprintf(stderr, "[ggml-cuda-graph] cudaGraphLaunch failed: %s\n", cudaGetErrorString(launch_err)); + fflush(stderr); + CUDA_CHECK(launch_err); + } + } #else GGML_UNUSED(graph_key); graph_evaluated_or_captured = true;