From 4c66df50ca11394f3a24a5746dcfd13403bba7ca Mon Sep 17 00:00:00 2001
From: shahondin1624 <throwaway278598@protonmail.com>
Date: Wed, 27 May 2026 08:08:12 +0200
Subject: [PATCH] hip: fix HIP graph capture crash for FA quantized KV f16
 dequant

The HIP branch in launch_fattn used raw hipMalloc / hipFree /
hipStreamSynchronize(main_stream) for the K/V f16 dequant temp buffers
(introduced to avoid pool retention OOM). These three calls are illegal
during HIP graph capture and abort cudaStreamEndCapture with
hipErrorStreamCaptureUnsupported, manifesting as the "ROCm error" at
ggml-cuda.cu:104 when running models like Qwen3.6-27B-Dense and
Qwen3.6-35B-A3B-Q8 with -fa 1 on gfx1151. Workaround was
GGML_CUDA_DISABLE_GRAPHS=1.

Probe cudaStreamIsCapturing on entry; when a capture is in progress
use ggml_cuda_pool_alloc<half> (legal in capture). Outside capture,
behavior is unchanged so the OOM-avoidance the raw-alloc branch was
added for is preserved.

Also: ggml_cuda_error wrote only via GGML_LOG_ERROR, which llama-bench
silences with llama_null_log_callback, so the actual hipError was
invisible. Mirror the message to stderr with fflush so failures stay
diagnosable from bench. Expand the inline CUDA_CHECK around
cudaStreamEndCapture / cudaGraphInstantiate / cudaGraphLaunch to print
which graph step failed plus the cgraph's first/last op for context.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 ggml/src/ggml-cuda/fattn-common.cuh | 27 ++++++++++++++++----
 ggml/src/ggml-cuda/ggml-cuda.cu     | 38 ++++++++++++++++++++++++++---
 2 files changed, 57 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index 861d844670..e47a68e2c1 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -1303,22 +1303,39 @@ void launch_fattn(
     // For quantized KV dequant, this means the f16 temp buffer stays allocated,
     // consuming more VRAM than the quantized KV compression saves — causing OOM.
     // Using raw alloc+free ensures the memory is released after the kernel completes.
+    //
+    // Caveat: hipMalloc/hipFree/hipStreamSynchronize are ILLEGAL during HIP graph
+    // capture and abort cudaStreamEndCapture with hipErrorStreamCaptureUnsupported.
+    // While capturing we must fall back to the pool (the captured graph reuses
+    // the same temp buffer across launches, so OOM risk is bounded to a single
+    // peak-sized allocation that lives for the lifetime of the cuda_graph).
+    cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
+    (void)cudaStreamIsCapturing(main_stream, &capture_status);
+    const bool is_capturing = (capture_status != cudaStreamCaptureStatusNone);
+
     struct hip_f16_alloc {
         half * ptr = nullptr;
         cudaStream_t stream;
-        hip_f16_alloc(cudaStream_t s) : stream(s) {}
+        bool         use_pool = false;
+        ggml_cuda_pool_alloc<half> pool_holder;
+
+        hip_f16_alloc(cudaStream_t s, ggml_cuda_pool & p, bool use_pool_) : stream(s), use_pool(use_pool_), pool_holder(p) {}
         ~hip_f16_alloc() {
-            if (ptr) {
+            if (!use_pool && ptr) {
                 cudaStreamSynchronize(stream);
                 cudaFree(ptr);
             }
         }
         void alloc(size_t nelements) {
-            CUDA_CHECK(cudaMalloc(&ptr, nelements * sizeof(half)));
+            if (use_pool) {
+                ptr = pool_holder.alloc(nelements);
+            } else {
+                CUDA_CHECK(cudaMalloc(&ptr, nelements * sizeof(half)));
+            }
         }
     };
-    hip_f16_alloc K_f16(main_stream);
-    hip_f16_alloc V_f16(main_stream);
+    hip_f16_alloc K_f16(main_stream, pool, is_capturing);
+    hip_f16_alloc V_f16(main_stream, pool, is_capturing);
 #else
     ggml_cuda_pool_alloc<half>   K_f16(pool);
     ggml_cuda_pool_alloc<half>   V_f16(pool);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 07d236940e..0022287fb0 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -100,6 +100,12 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
     GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
     GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
     GGML_LOG_ERROR("  %s\n", stmt);
+    // Also write to stderr directly: llama-bench installs a null log callback
+    // that swallows GGML_LOG_ERROR output, leaving "ROCm error" with no diagnosis.
+    fprintf(stderr, GGML_CUDA_NAME " error: %s\n", msg);
+    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    fprintf(stderr, "  %s\n", stmt);
+    fflush(stderr);
     // abort with GGML_ABORT to get a stack trace
     GGML_ABORT(GGML_CUDA_NAME " error");
 }
@@ -4471,7 +4477,21 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
                 graph->graph = nullptr;
             }
 
-            CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &graph->graph));
+            {
+                cudaError_t end_capture_err = cudaStreamEndCapture(cuda_ctx->stream(), &graph->graph);
+                if (end_capture_err != cudaSuccess) {
+                    fprintf(stderr, "[ggml-cuda-graph] cudaStreamEndCapture failed: %s\n",
+                            cudaGetErrorString(end_capture_err));
+                    fprintf(stderr, "[ggml-cuda-graph] cgraph had %d nodes; first op = %s (%s), last op = %s (%s)\n",
+                            cgraph->n_nodes,
+                            cgraph->n_nodes > 0 ? cgraph->nodes[0]->name : "<none>",
+                            cgraph->n_nodes > 0 ? ggml_op_name(cgraph->nodes[0]->op) : "<none>",
+                            cgraph->n_nodes > 0 ? cgraph->nodes[cgraph->n_nodes-1]->name : "<none>",
+                            cgraph->n_nodes > 0 ? ggml_op_name(cgraph->nodes[cgraph->n_nodes-1]->op) : "<none>");
+                    fflush(stderr);
+                    CUDA_CHECK(end_capture_err);
+                }
+            }
             graph_evaluated_or_captured = true; // CUDA graph has been captured
 
             std::lock_guard<std::mutex> lock(ggml_cuda_lock);
@@ -4486,13 +4506,25 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
     if (use_cuda_graph) {
         ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
         if (graph->instance == nullptr) { // Create executable graph from captured graph.
-            CUDA_CHECK(cudaGraphInstantiate(&graph->instance, graph->graph, NULL, NULL, 0));
+            cudaError_t inst_err = cudaGraphInstantiate(&graph->instance, graph->graph, NULL, NULL, 0);
+            if (inst_err != cudaSuccess) {
+                fprintf(stderr, "[ggml-cuda-graph] cudaGraphInstantiate failed: %s\n", cudaGetErrorString(inst_err));
+                fflush(stderr);
+                CUDA_CHECK(inst_err);
+            }
         }
         if (cuda_graph_update_required) { // Update graph executable
             ggml_cuda_graph_update_executable(cuda_ctx, graph_key);
         }
         // Launch graph
-        CUDA_CHECK(cudaGraphLaunch(graph->instance, cuda_ctx->stream()));
+        {
+            cudaError_t launch_err = cudaGraphLaunch(graph->instance, cuda_ctx->stream());
+            if (launch_err != cudaSuccess) {
+                fprintf(stderr, "[ggml-cuda-graph] cudaGraphLaunch failed: %s\n", cudaGetErrorString(launch_err));
+                fflush(stderr);
+                CUDA_CHECK(launch_err);
+            }
+        }
 #else
         GGML_UNUSED(graph_key);
         graph_evaluated_or_captured = true;