I'm having a bit of trouble understanding how to send a 2D array to Cuda.  I have a program that parses a  large file with a 30 data points on each line.  I read about 10 rows at a time and then create a matrix for each line and items(so in my example of 10 rows with 30 data points, it would be int list[10][30];  My goal is to send this array to my kernal and have each block process a row(I have gotten this to work perfectly in normal C, but Cuda has been a bit more challenging).
Here's what I'm doing so far but no luck(note: sizeofbucket = rows, and sizeOfBucketsHoldings = items in row...I know I should win a award for odd variable names):
    int list[sizeOfBuckets][sizeOfBucketsHoldings]; //this is created at the start of the file and I can confirmed its filled with the correct data
#define sizeOfBuckets 10 //size of buckets before sending to process list
#define sizeOfBucketsHoldings  30
    //Cuda part
                //define device variables
                int *dev_current_list[sizeOfBuckets][sizeOfBucketsHoldings];
                //time to malloc the 2D array on device
                size_t pitch;
                cudaMallocPitch((int**)&dev_current_list,  (size_t *)&pitch, sizeOfBucketsHoldings * sizeof(int), sizeOfBuckets);
                //copy data from host to device
                cudaMemcpy2D( dev_current_list, pitch, list, sizeOfBuckets * sizeof(int), sizeOfBuckets * sizeof(int), sizeOfBucketsHoldings * sizeof(int),cudaMemcpyHostToDevice );
                process_list<<<count,1>>> (sizeOfBuckets, sizeOfBucketsHoldings, dev_current_list, pitch);
                //free memory of device
                cudaFree( dev_current_list );
    __global__ void process_list(int sizeOfBuckets, int sizeOfBucketsHoldings, int *current_list, int pitch) {
        int tid = blockIdx.x;
        for (int r = 0; r < sizeOfBuckets; ++r) {
            int* row = (int*)((char*)current_list + r * pitch);
            for (int c = 0; c < sizeOfBucketsHoldings; ++c) {
                 int element = row[c];
            }
        }
The error I'm getting is:
main.cu(266): error: argument of type "int *(*)[30]" is incompatible with parameter of type "int *"
1 error detected in the compilation of "/tmp/tmpxft_00003f32_00000000-4_main.cpp1.ii".
line 266 is the kernel call process_list<<<count,1>>> (count, countListItem, dev_current_list, pitch);  I think the problem is I am trying to create my array in my function as int * but how else can I create it? In my pure C code, I use int current_list[num_of_rows][num_items_in_row] which works but I can't get the same outcome to work in Cuda.
My end goal is simple I just want to get each block to process each row(sizeOfBuckets) and then have it loop through all items in that row(sizeOfBucketHoldings).  I orginally just did a normal cudamalloc and cudaMemcpy but it wasn't working so I looked around and found out about MallocPitch and 2dcopy(both of which were not in my cuda by example book) and I have been trying to study examples but they seem to be giving me the same error(I'm currently reading the CUDA_C programming guide found this idea on page22 but still no luck).  Any ideas? or suggestions of where to look? 
Edit: To test this, I just want to add the value of each row together(I copied the logic from the cuda by example array addition example). My kernel:
__global__ void process_list(int sizeOfBuckets, int sizeOfBucketsHoldings, int *current_list, size_t pitch, int *total) {
    //TODO: we need to flip the list as well
    int tid = blockIdx.x;
    for (int c = 0; c < sizeOfBucketsHoldings; ++c) {
        total[tid] = total + current_list[tid][c];
    }
}
Here's how I declare the total array in my main:
int *dev_total;
cudaMalloc( (void**)&dev_total, sizeOfBuckets * sizeof(int) );
You have some mistakes in your code.
This example should help you with memory allocation:
__global__ void process_list(int sizeOfBucketsHoldings, int* total, int* current_list, int pitch)
{
    int tid = blockIdx.x;
    total[tid] = 0;
    for (int c = 0; c < sizeOfBucketsHoldings; ++c)
    {
        total[tid] += *((int*)((char*)current_list + tid * pitch) + c);
    }
}
int main()
{
    size_t sizeOfBuckets         = 10;
    size_t sizeOfBucketsHoldings = 30;
    size_t width = sizeOfBucketsHoldings * sizeof(int);//ned to be in bytes
    size_t height = sizeOfBuckets;
    int* list = new int [sizeOfBuckets * sizeOfBucketsHoldings];// one dimensional
    for (int i = 0; i < sizeOfBuckets; i++)
        for (int j = 0; j < sizeOfBucketsHoldings; j++)
            list[i *sizeOfBucketsHoldings + j] = i;
    size_t pitch_h = sizeOfBucketsHoldings * sizeof(int);// always in bytes
    int* dev_current_list;
    size_t pitch_d;
    cudaMallocPitch((int**)&dev_current_list, &pitch_d, width, height);
    int *test;
    cudaMalloc((void**)&test, sizeOfBuckets * sizeof(int));
    int* h_test = new int[sizeOfBuckets];
    cudaMemcpy2D(dev_current_list, pitch_d, list, pitch_h, width, height, cudaMemcpyHostToDevice);
    process_list<<<10, 1>>>(sizeOfBucketsHoldings, test, dev_current_list, pitch_d);
    cudaDeviceSynchronize();
    cudaMemcpy(h_test, test, sizeOfBuckets * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < sizeOfBuckets; i++)
        printf("%d %d\n", i , h_test[i]);
    return 0;
}
To access your 2D array in kernel you should use pattern base_addr + y * pitch_d + x.
WARNING: the pitvh allways in bytes. You need to cast your pointer to byte*. 
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With