I have a 5000x500 matrix and I want to sort each row separately with cuda. I can use arrayfire but this is just a for loop over the thrust::sort, which should not be efficient.
https://github.com/arrayfire/arrayfire/blob/devel/src/backend/cuda/kernel/sort.hpp
for(dim_type w = 0; w < val.dims[3]; w++) {
            dim_type valW = w * val.strides[3];
            for(dim_type z = 0; z < val.dims[2]; z++) {
                dim_type valWZ = valW + z * val.strides[2];
                for(dim_type y = 0; y < val.dims[1]; y++) {
                    dim_type valOffset = valWZ + y * val.strides[1];
                    if(isAscending) {
                        thrust::sort(val_ptr + valOffset, val_ptr + valOffset + val.dims[0]);
                    } else {
                        thrust::sort(val_ptr + valOffset, val_ptr + valOffset + val.dims[0],
                                     thrust::greater<T>());
                    }
                }
            }
        }
Is there a way to fuse operations in thrust so as to have the sorts run in parallel? Indeed, what I am looking for is a generic way to fuse for loop iterations into.
I can think of 2 possibilities, one of which is suggested already by @JaredHoberock. I don't know of a general methodology to fuse for-loop iterations in thrust, but the second method is the more general approach. My guess is that the first method would be the faster of the two approaches, in this case.
Use a vectorized sort. If the regions to be sorted by your nested for-loops are non-overlapping, you can do a vectorized sort using 2 back-to-back stable-sort operations as discussed here.
Thrust v1.8 (available with CUDA 7 RC, or via direct download from the thrust github repository includes support for nesting thrust algorithms, by including a thrust algorithm call within a custom functor passed to another thrust algorithm.  If you use the thrust::for_each operation to select the individual sorts you need to perform, you can run those sorts with a single thrust algorithm call, by including the thrust::sort operation in the functor you pass to thrust::for_each.
Here's a fully worked comparison between 3 methods:
In each case, we're sorting the same 16000 sets of 1000 ints each.
$ cat t617.cu
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>
#define NSORTS 16000
#define DSIZE 1000
int my_mod_start = 0;
int my_mod(){
  return (my_mod_start++)/DSIZE;
}
bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2){
  return thrust::equal(d1.begin(), d1.end(), d2.begin());
}
struct sort_functor
{
  thrust::device_ptr<int> data;
  int dsize;
  __host__ __device__
  void operator()(int start_idx)
  {
    thrust::sort(thrust::device, data+(dsize*start_idx), data+(dsize*(start_idx+1)));
  }
};
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
int main(){
  cudaDeviceSetLimit(cudaLimitMallocHeapSize, (16*DSIZE*NSORTS));
  thrust::host_vector<int> h_data(DSIZE*NSORTS);
  thrust::generate(h_data.begin(), h_data.end(), rand);
  thrust::device_vector<int> d_data = h_data;
  // first time a loop
  thrust::device_vector<int> d_result1 = d_data;
  thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
  unsigned long long mytime = dtime_usec(0);
  for (int i = 0; i < NSORTS; i++)
    thrust::sort(r1ptr+(i*DSIZE), r1ptr+((i+1)*DSIZE));
  cudaDeviceSynchronize();
  mytime = dtime_usec(mytime);
  std::cout << "loop time: " << mytime/(float)USECPSEC << "s" << std::endl;
  //vectorized sort
  thrust::device_vector<int> d_result2 = d_data;
  thrust::host_vector<int> h_segments(DSIZE*NSORTS);
  thrust::generate(h_segments.begin(), h_segments.end(), my_mod);
  thrust::device_vector<int> d_segments = h_segments;
  mytime = dtime_usec(0);
  thrust::stable_sort_by_key(d_result2.begin(), d_result2.end(), d_segments.begin());
  thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), d_result2.begin());
  cudaDeviceSynchronize();
  mytime = dtime_usec(mytime);
  std::cout << "vectorized time: " << mytime/(float)USECPSEC << "s" << std::endl;
  if (!validate(d_result1, d_result2)) std::cout << "mismatch 1!" << std::endl;
  //nested sort
  thrust::device_vector<int> d_result3 = d_data;
  sort_functor f = {d_result3.data(), DSIZE};
  thrust::device_vector<int> idxs(NSORTS);
  thrust::sequence(idxs.begin(), idxs.end());
  mytime = dtime_usec(0);
  thrust::for_each(idxs.begin(), idxs.end(), f);
  cudaDeviceSynchronize();
  mytime = dtime_usec(mytime);
  std::cout << "nested time: " << mytime/(float)USECPSEC << "s" << std::endl;
  if (!validate(d_result1, d_result3)) std::cout << "mismatch 2!" << std::endl;
  return 0;
}
$ nvcc -arch=sm_20 -std=c++11 -o t617 t617.cu
$ ./t617
loop time: 8.51577s
vectorized time: 0.068802s
nested time: 0.567959s
$
Notes:
-arch=sm_20 to -arch=sm_35 -rdc=true -lcudadevrt
cudaDeviceSetLimit.cudaDeviceSetLimit may need to be increased perhaps by an additional factor of 8.If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With