I’m trying to launch a captured CUDA Graph from inside a regular CUDA kernel (i.e., device-side graph launch).
From the NVIDIA blog on device graph launch, it seems this should be supported on newer GPUs (such as H100) with recent CUDA versions.
To test this, I wrote a minimal example that captures a simple add kernel into a CUDA Graph, instantiates it with cudaGraphInstantiateFlagDeviceLaunch, and then tries to launch it from inside another kernel.
Here’s the code:
#include <cstdio>
#include <cuda_runtime.h>
#include <iostream>
__global__ void add(int* a, int* b, int* out){
    if(threadIdx.x==0 && blockIdx.x==0) *out = *a + *b;
}
__global__ void regularLauncher(cudaGraphExec_t gexec, int* deviceErr){
    if(threadIdx.x==0 && blockIdx.x==0){
        *deviceErr = 999; // mark kernel ran
        cudaError_t e = cudaGraphLaunch(gexec, cudaStreamGraphFireAndForget);
        *deviceErr = (e == cudaSuccess) ? 512 /*test value*/ : (int)e;
    }
}
int main(){
    int device;
    cudaGetDevice(&device);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    printf("Device: %s, Compute Capability: %d.%d\n", prop.name, prop.major, prop.minor);
    int *d_a, *d_b, *d_out, *d_err;
    cudaMalloc(&d_a, sizeof(int));
    cudaMalloc(&d_b, sizeof(int));
    cudaMalloc(&d_out, sizeof(int));
    cudaMalloc(&d_err, sizeof(int));
    int ha=2, hb=3;
    cudaMemcpy(d_a, &ha, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &hb, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(d_out, 0, sizeof(int));
    cudaMemset(d_err, 0, sizeof(int));
    cudaStream_t s;
    cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
    cudaGraph_t g;
    cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal);
    add<<<1,1,0,s>>>(d_a, d_b, d_out);
    cudaStreamEndCapture(s, &g);
    cudaGraphExec_t exec;
    cudaGraphInstantiate(&exec, g, cudaGraphInstantiateFlagDeviceLaunch);
    cudaGraphUpload(exec, s);
    cudaStreamSynchronize(s);
    // Verify host launch works
    cudaGraphLaunch(exec, s);
    cudaStreamSynchronize(s);
    printf("Host cudaGraphLaunch error: %s\n", cudaGetErrorString(cudaGetLastError()));
    int hostOut = 0;
    cudaMemcpy(&hostOut, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("Host add kernel result = %d\n", hostOut);
    // Reset and test device-side launch
    cudaMemset(d_out, 0, sizeof(int));
    regularLauncher<<<1,1,0,s>>>(exec, d_err);
    cudaStreamSynchronize(s);
    printf("Kernel launch error: %s\n", cudaGetErrorString(cudaGetLastError()));
    int err=0, out=0;
    cudaMemcpy(&err, d_err, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("device cudaGraphLaunch -> %d\n", err);
    printf("add kernel result = %d\n", out);
    cudaGraphExecDestroy(exec);
    cudaGraphDestroy(g);
    cudaStreamDestroy(s);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_out); cudaFree(d_err);
    return 0;
}
Build command:
nvcc -std=c++17 -O3 -rdc=true -arch=sm_90 device_graph_test.cu -o device_graph_test -lcudadevrt
Output:
Device: NVIDIA H100 NVL, Compute Capability: 9.0
Host cudaGraphLaunch error: no error
Host add kernel result = 5
Kernel launch error: operation not supported
device cudaGraphLaunch -> 0
add kernel result = 0
Question:
Why does the device-side cudaGraphLaunch fail with "operation not supported"? Any clarification or a working example of launching a CUDA Graph from within a regular kernel would be greatly appreciated.
From the programming guide here: "Device graphs must be launched from another graph when launched from the device." You are not doing that; you are launching it from an ordinary kernel, not from another graph.
If you mimic the control flow exactly as indicated in the linked blog, and launch from a launcher graph instead of attempting to use a "launcher kernel" (there is evidently no such entity), I think you will observe successful results.
Example:
# cat t414.cu
#include <cstdio>
#include <cuda_runtime.h>
#include <iostream>
__global__ void add(int* a, int* b, int* out){
    if(threadIdx.x==0 && blockIdx.x==0) *out = *a + *b;
}
__global__ void regularLauncher(cudaGraphExec_t gexec, int* deviceErr){
    if(threadIdx.x==0 && blockIdx.x==0){
        *deviceErr = 999; // mark kernel ran
        cudaError_t e = cudaGraphLaunch(gexec, cudaStreamGraphFireAndForget);
        *deviceErr = (e == cudaSuccess) ? 512 /*test value*/ : (int)e;
    }
}
int main(){
    int device;
    cudaGetDevice(&device);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    printf("Device: %s, Compute Capability: %d.%d\n", prop.name, prop.major, prop.minor);
    int *d_a, *d_b, *d_out, *d_err;
    cudaMalloc(&d_a, sizeof(int));
    cudaMalloc(&d_b, sizeof(int));
    cudaMalloc(&d_out, sizeof(int));
    cudaMalloc(&d_err, sizeof(int));
    int ha=2, hb=3;
    cudaMemcpy(d_a, &ha, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &hb, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(d_out, 0, sizeof(int));
    cudaMemset(d_err, 0, sizeof(int));
    cudaStream_t s;
    cudaStreamCreate(&s);
    cudaGraph_t g;
    cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal);
    add<<<1,1,0,s>>>(d_a, d_b, d_out);
    cudaStreamEndCapture(s, &g);
    cudaGraphExec_t exec;
    cudaGraphInstantiate(&exec, g, cudaGraphInstantiateFlagDeviceLaunch);
    cudaGraphUpload(exec, s);
    cudaStreamSynchronize(s);
    // Verify host launch works
    cudaGraphLaunch(exec, s);
    cudaStreamSynchronize(s);
    printf("Host cudaGraphLaunch error: %s\n", cudaGetErrorString(cudaGetLastError()));
    int hostOut = 0;
    cudaMemcpy(&hostOut, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("Host add kernel result = %d\n", hostOut);
    // Reset and test device-side launch
    cudaMemset(d_out, 0, sizeof(int));
    // changes start here
    cudaGraph_t launcherGraph;
    cudaGraphExec_t launcherExec;
    cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal);
    regularLauncher<<<1,1,0,s>>>(exec, d_err);
    cudaStreamEndCapture(s, &launcherGraph);
    cudaGraphInstantiate(&launcherExec, launcherGraph, cudaGraphInstantiateFlagDeviceLaunch);
    cudaGraphLaunch(launcherExec,s);
    // changes end here
    cudaStreamSynchronize(s);
    printf("Kernel launch error: %s\n", cudaGetErrorString(cudaGetLastError()));
    int err=0, out=0;
    cudaMemcpy(&err, d_err, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("device cudaGraphLaunch -> %d\n", err);
    printf("add kernel result = %d\n", out);
    cudaGraphExecDestroy(exec);
    cudaGraphDestroy(g);
    cudaStreamDestroy(s);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_out); cudaFree(d_err);
    return 0;
}
# nvcc -o t414 t414.cu -rdc=true -lcudadevrt -arch=sm_89 -lineinfo
# compute-sanitizer ./t414
========= COMPUTE-SANITIZER
Device: NVIDIA L4, Compute Capability: 8.9
Host cudaGraphLaunch error: no error
Host add kernel result = 5
Kernel launch error: no error
device cudaGraphLaunch -> 512
add kernel result = 5
========= ERROR SUMMARY: 0 errors
#
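As an aside, the required pattern can be condensed into the short sketch below (error checking omitted; the names launcher and launchFromDevice are hypothetical, and exec is assumed to be the inner graph from the earlier capture, instantiated with cudaGraphInstantiateFlagDeviceLaunch and already uploaded or launched once from the host). The only structural difference from the question's code is that the launcher kernel is wrapped in its own device-launchable graph before being launched:

#include <cuda_runtime.h>

// Runs as a node of a device-launchable graph, so it is allowed to
// fire-and-forget the inner device graph.
__global__ void launcher(cudaGraphExec_t inner){
    if(threadIdx.x==0 && blockIdx.x==0)
        cudaGraphLaunch(inner, cudaStreamGraphFireAndForget);
}

// exec: inner graph, instantiated with cudaGraphInstantiateFlagDeviceLaunch
// and already resident on the device (uploaded or host-launched once).
void launchFromDevice(cudaGraphExec_t exec, cudaStream_t s){
    cudaGraph_t launcherGraph;
    cudaGraphExec_t launcherExec;
    // The launcher kernel must itself be part of a device-launchable graph;
    // launching it as a plain kernel is what returns "operation not supported".
    cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal);
    launcher<<<1,1,0,s>>>(exec);
    cudaStreamEndCapture(s, &launcherGraph);
    cudaGraphInstantiate(&launcherExec, launcherGraph, cudaGraphInstantiateFlagDeviceLaunch);
    cudaGraphLaunch(launcherExec, s);   // host launch of the launcher graph
    cudaStreamSynchronize(s);
    cudaGraphExecDestroy(launcherExec);
    cudaGraphDestroy(launcherGraph);
}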