Executing a CUDA Graph from a CUDA kernel

Question

I’m trying to launch a captured CUDA Graph from inside a regular CUDA kernel (i.e., device-side graph launch).

From the NVIDIA blog on device graph launch, it seems this should be supported on newer GPUs (such as H100) with recent CUDA versions.

To test this, I wrote a minimal example that captures a simple add kernel into a CUDA Graph, instantiates it with cudaGraphInstantiateFlagDeviceLaunch, and then tries to launch it from inside another kernel.

Here’s the code:

#include <cstdio>
#include <cuda_runtime.h>
#include <iostream>

__global__ void add(int* a, int* b, int* out){ 
    if(threadIdx.x==0 && blockIdx.x==0) *out = *a + *b; 
}

__global__ void regularLauncher(cudaGraphExec_t gexec, int* deviceErr){
  if(threadIdx.x==0 && blockIdx.x==0){
    *deviceErr = 999; // mark kernel ran
    cudaError_t e = cudaGraphLaunch(gexec, cudaStreamGraphFireAndForget);
    *deviceErr = (e == cudaSuccess) ? 512 /*test value*/ : (int)e;
  }
}

int main(){
  int device;
  cudaGetDevice(&device);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  printf("Device: %s, Compute Capability: %d.%d
", prop.name, prop.major, prop.minor);

  int *d_a, *d_b, *d_out, *d_err;
  cudaMalloc(&d_a, sizeof(int));
  cudaMalloc(&d_b, sizeof(int));
  cudaMalloc(&d_out, sizeof(int));
  cudaMalloc(&d_err, sizeof(int));

  int ha=2, hb=3;
  cudaMemcpy(d_a, &ha, sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, &hb, sizeof(int), cudaMemcpyHostToDevice);
  cudaMemset(d_out, 0, sizeof(int));
  cudaMemset(d_err, 0, sizeof(int));

  cudaStream_t s; 
  cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
  cudaGraph_t g;
  cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal);
  add<<<1,1,0,s>>>(d_a, d_b, d_out);
  cudaStreamEndCapture(s, &g);

  cudaGraphExec_t exec; 
  cudaGraphInstantiate(&exec, g, cudaGraphInstantiateFlagDeviceLaunch);
  cudaGraphUpload(exec, s);
  cudaStreamSynchronize(s);

  // Verify host launch works
  cudaGraphLaunch(exec, s);
  cudaStreamSynchronize(s);
  printf("Host cudaGraphLaunch error: %s
", cudaGetErrorString(cudaGetLastError()));

  int hostOut = 0;
  cudaMemcpy(&hostOut, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  printf("Host add kernel result = %d
", hostOut);

  // Reset and test device-side launch
  cudaMemset(d_out, 0, sizeof(int));
  regularLauncher<<<1,1,0,s>>>(exec, d_err);
  cudaStreamSynchronize(s);
  printf("Kernel launch error: %s
", cudaGetErrorString(cudaGetLastError()));

  int err=0, out=0;
  cudaMemcpy(&err, d_err, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  printf("device cudaGraphLaunch -> %d
", err);
  printf("add kernel result = %d
", out);

  cudaGraphExecDestroy(exec);
  cudaGraphDestroy(g);
  cudaStreamDestroy(s);
  cudaFree(d_a); cudaFree(d_b); cudaFree(d_out); cudaFree(d_err);
  return 0;
}

Build command:

nvcc -std=c++17 -O3 -rdc=true -arch=sm_90 device_graph_test.cu -o device_graph_test -lcudadevrt

Output:

Device: NVIDIA H100 NVL, Compute Capability: 9.0
Host cudaGraphLaunch error: no error
Host add kernel result = 5
Kernel launch error: operation not supported
device cudaGraphLaunch -> 0
add kernel result = 0

Question:

Why does the host-side graph launch succeed, but the device-side cudaGraphLaunch fails with operation not supported?
Is there an additional requirement or API flag needed to enable device graph launches on H100 with CUDA 12.9?
Am I missing something in how I instantiate or launch the graph from inside the kernel?

Any clarification or working example of launching a CUDA Graph from within a regular kernel would be greatly appreciated.

Robert Crovella · Accepted Answer

From the programming guide here: " Device graphs must be launched from another graph when launched from the device." You are not doing that, you are launching it from a kernel, not another graph.

If you mimic the control flow exactly as indicated in the linked blog, and launch from a launcher graph instead of attempting to use a "launcher kernel" (there is evidently no such entity), I think you will observe successful results.

Example:

# cat t414.cu
#include <cstdio>
#include <cuda_runtime.h>
#include <iostream>

__global__ void add(int* a, int* b, int* out){
    if(threadIdx.x==0 && blockIdx.x==0) *out = *a + *b;
}

__global__ void regularLauncher(cudaGraphExec_t gexec, int* deviceErr){
  if(threadIdx.x==0 && blockIdx.x==0){
    *deviceErr = 999; // mark kernel ran
    cudaError_t e = cudaGraphLaunch(gexec, cudaStreamGraphFireAndForget);
    *deviceErr = (e == cudaSuccess) ? 512 /*test value*/ : (int)e;
  }
}

int main(){
  int device;
  cudaGetDevice(&device);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  printf("Device: %s, Compute Capability: %d.%d
", prop.name, prop.major, prop.minor);

  int *d_a, *d_b, *d_out, *d_err;
  cudaMalloc(&d_a, sizeof(int));
  cudaMalloc(&d_b, sizeof(int));
  cudaMalloc(&d_out, sizeof(int));
  cudaMalloc(&d_err, sizeof(int));

  int ha=2, hb=3;
  cudaMemcpy(d_a, &ha, sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, &hb, sizeof(int), cudaMemcpyHostToDevice);
  cudaMemset(d_out, 0, sizeof(int));
  cudaMemset(d_err, 0, sizeof(int));

  cudaStream_t s;
  cudaStreamCreate(&s);
  cudaGraph_t g;
  cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal);
  add<<<1,1,0,s>>>(d_a, d_b, d_out);
  cudaStreamEndCapture(s, &g);

  cudaGraphExec_t exec;
  cudaGraphInstantiate(&exec, g, cudaGraphInstantiateFlagDeviceLaunch);
  cudaGraphUpload(exec, s);
  cudaStreamSynchronize(s);

  // Verify host launch works
  cudaGraphLaunch(exec, s);
  cudaStreamSynchronize(s);
  printf("Host cudaGraphLaunch error: %s
", cudaGetErrorString(cudaGetLastError()));

  int hostOut = 0;
  cudaMemcpy(&hostOut, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  printf("Host add kernel result = %d
", hostOut);

  // Reset and test device-side launch
  cudaMemset(d_out, 0, sizeof(int));
// changes start here
  cudaGraph_t launcherGraph;
  cudaGraphExec_t launcherExec;
  cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal);
  regularLauncher<<<1,1,0,s>>>(exec, d_err);
  cudaStreamEndCapture(s, &launcherGraph);
  cudaGraphInstantiate(&launcherExec, launcherGraph, cudaGraphInstantiateFlagDeviceLaunch);
  cudaGraphLaunch(launcherExec,s);
// changes end here
  cudaStreamSynchronize(s);
  printf("Kernel launch error: %s
", cudaGetErrorString(cudaGetLastError()));

  int err=0, out=0;
  cudaMemcpy(&err, d_err, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  printf("device cudaGraphLaunch -> %d
", err);
  printf("add kernel result = %d
", out);

  cudaGraphExecDestroy(exec);
  cudaGraphDestroy(g);
  cudaStreamDestroy(s);
  cudaFree(d_a); cudaFree(d_b); cudaFree(d_out); cudaFree(d_err);
  return 0;
}
# nvcc -o t414 t414.cu -rdc=true -lcudadevrt -arch=sm_89 -lineinfo
# compute-sanitizer ./t414
========= COMPUTE-SANITIZER
Device: NVIDIA L4, Compute Capability: 8.9
Host cudaGraphLaunch error: no error
Host add kernel result = 5
Kernel launch error: no error
device cudaGraphLaunch -> 512
add kernel result = 5
========= ERROR SUMMARY: 0 errors
#

Executing a CUDA Graph from a CUDA kernel

Tags:

c++

cuda

nvidia

cuda-graphs

Mohammad Siavashi

1 Answers

Robert Crovella

Recent Activity

Donate For Us

Executing a CUDA Graph from a CUDA kernel

Tags:

c++

cuda

nvidia

cuda-graphs

Mohammad Siavashi

1 Answers

Robert Crovella

Related questions

Recent Activity

Donate For Us