hip: fix HIP graph capture crash for FA quantized KV f16 dequant

The HIP branch in launch_fattn used raw hipMalloc / hipFree / hipStreamSynchronize(main_stream) for the K/V f16 dequant temp buffers (introduced to avoid pool retention OOM). These three calls are illegal during HIP graph capture and abort cudaStreamEndCapture with hipErrorStreamCaptureUnsupported, manifesting as the "ROCm error" at ggml-cuda.cu:104 when running models like Qwen3.6-27B-Dense and Qwen3.6-35B-A3B-Q8 with -fa 1 on gfx1151. Workaround was GGML_CUDA_DISABLE_GRAPHS=1. Probe cudaStreamIsCapturing on entry; when a capture is in progress use ggml_cuda_pool_alloc<half> (legal in capture). Outside capture, behavior is unchanged so the OOM-avoidance the raw-alloc branch was added for is preserved. Also: ggml_cuda_error wrote only via GGML_LOG_ERROR, which llama-bench silences with llama_null_log_callback, so the actual hipError was invisible. Mirror the message to stderr with fflush so failures stay diagnosable from bench. Expand the inline CUDA_CHECK around cudaStreamEndCapture / cudaGraphInstantiate / cudaGraphLaunch to print which graph step failed plus the cgraph's first/last op for context. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 08:08:12 +02:00
parent a581eead32
commit 4c66df50ca
2 changed files with 57 additions and 8 deletions
@@ -1303,22 +1303,39 @@ void launch_fattn(
    // For quantized KV dequant, this means the f16 temp buffer stays allocated,
    // consuming more VRAM than the quantized KV compression saves — causing OOM.
    // Using raw alloc+free ensures the memory is released after the kernel completes.
+    //
+    // Caveat: hipMalloc/hipFree/hipStreamSynchronize are ILLEGAL during HIP graph
+    // capture and abort cudaStreamEndCapture with hipErrorStreamCaptureUnsupported.
+    // While capturing we must fall back to the pool (the captured graph reuses
+    // the same temp buffer across launches, so OOM risk is bounded to a single
+    // peak-sized allocation that lives for the lifetime of the cuda_graph).
+    cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
+    (void)cudaStreamIsCapturing(main_stream, &capture_status);
+    const bool is_capturing = (capture_status != cudaStreamCaptureStatusNone);
+
    struct hip_f16_alloc {
        half * ptr = nullptr;
        cudaStream_t stream;
-        hip_f16_alloc(cudaStream_t s) : stream(s) {}
+        bool         use_pool = false;
+        ggml_cuda_pool_alloc<half> pool_holder;
+
+        hip_f16_alloc(cudaStream_t s, ggml_cuda_pool & p, bool use_pool_) : stream(s), use_pool(use_pool_), pool_holder(p) {}
        ~hip_f16_alloc() {
-            if (ptr) {
+            if (!use_pool && ptr) {
                cudaStreamSynchronize(stream);
                cudaFree(ptr);
            }
        }
        void alloc(size_t nelements) {
-            CUDA_CHECK(cudaMalloc(&ptr, nelements * sizeof(half)));
+            if (use_pool) {
+                ptr = pool_holder.alloc(nelements);
+            } else {
+                CUDA_CHECK(cudaMalloc(&ptr, nelements * sizeof(half)));
+            }
        }
    };
-    hip_f16_alloc K_f16(main_stream);
-    hip_f16_alloc V_f16(main_stream);
+    hip_f16_alloc K_f16(main_stream, pool, is_capturing);
+    hip_f16_alloc V_f16(main_stream, pool, is_capturing);
 #else
    ggml_cuda_pool_alloc<half>   K_f16(pool);
    ggml_cuda_pool_alloc<half>   V_f16(pool);
@@ -100,6 +100,12 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
    GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
    GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
    GGML_LOG_ERROR("  %s\n", stmt);
+    // Also write to stderr directly: llama-bench installs a null log callback
+    // that swallows GGML_LOG_ERROR output, leaving "ROCm error" with no diagnosis.
+    fprintf(stderr, GGML_CUDA_NAME " error: %s\n", msg);
+    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    fprintf(stderr, "  %s\n", stmt);
+    fflush(stderr);
    // abort with GGML_ABORT to get a stack trace
    GGML_ABORT(GGML_CUDA_NAME " error");
 }
@@ -4471,7 +4477,21 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
                graph->graph = nullptr;
            }

-            CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &graph->graph));
+            {
+                cudaError_t end_capture_err = cudaStreamEndCapture(cuda_ctx->stream(), &graph->graph);
+                if (end_capture_err != cudaSuccess) {
+                    fprintf(stderr, "[ggml-cuda-graph] cudaStreamEndCapture failed: %s\n",
+                            cudaGetErrorString(end_capture_err));
+                    fprintf(stderr, "[ggml-cuda-graph] cgraph had %d nodes; first op = %s (%s), last op = %s (%s)\n",
+                            cgraph->n_nodes,
+                            cgraph->n_nodes > 0 ? cgraph->nodes[0]->name : "<none>",
+                            cgraph->n_nodes > 0 ? ggml_op_name(cgraph->nodes[0]->op) : "<none>",
+                            cgraph->n_nodes > 0 ? cgraph->nodes[cgraph->n_nodes-1]->name : "<none>",
+                            cgraph->n_nodes > 0 ? ggml_op_name(cgraph->nodes[cgraph->n_nodes-1]->op) : "<none>");
+                    fflush(stderr);
+                    CUDA_CHECK(end_capture_err);
+                }
+            }
            graph_evaluated_or_captured = true; // CUDA graph has been captured

            std::lock_guard<std::mutex> lock(ggml_cuda_lock);
@@ -4486,13 +4506,25 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
    if (use_cuda_graph) {
        ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
        if (graph->instance == nullptr) { // Create executable graph from captured graph.
-            CUDA_CHECK(cudaGraphInstantiate(&graph->instance, graph->graph, NULL, NULL, 0));
+            cudaError_t inst_err = cudaGraphInstantiate(&graph->instance, graph->graph, NULL, NULL, 0);
+            if (inst_err != cudaSuccess) {
+                fprintf(stderr, "[ggml-cuda-graph] cudaGraphInstantiate failed: %s\n", cudaGetErrorString(inst_err));
+                fflush(stderr);
+                CUDA_CHECK(inst_err);
+            }
        }
        if (cuda_graph_update_required) { // Update graph executable
            ggml_cuda_graph_update_executable(cuda_ctx, graph_key);
        }
        // Launch graph
-        CUDA_CHECK(cudaGraphLaunch(graph->instance, cuda_ctx->stream()));
+        {
+            cudaError_t launch_err = cudaGraphLaunch(graph->instance, cuda_ctx->stream());
+            if (launch_err != cudaSuccess) {
+                fprintf(stderr, "[ggml-cuda-graph] cudaGraphLaunch failed: %s\n", cudaGetErrorString(launch_err));
+                fflush(stderr);
+                CUDA_CHECK(launch_err);
+            }
+        }
 #else
        GGML_UNUSED(graph_key);
        graph_evaluated_or_captured = true;