*another update in code and questions*
I started learning OpenCL about a week ago, and I am trying to port a CUDA program that brute-forces an MD5 hash to recover the original string. I use two files: kernel.cl and main.cpp.
//this is kernel.cl
{...*defining some md5 variables*...}
/*
 * Adds incrementBy to the number stored in ourBrute, where ourBrute holds
 * one base-charSetLen digit per byte with the least significant digit first.
 * Only the first bruteLength digits are touched; any carry that remains
 * after the last digit is discarded.
 */
void IncrementBruteGPU(unsigned char* ourBrute, unsigned int charSetLen, unsigned int bruteLength, unsigned int incrementBy){
    for (int pos = 0; incrementBy > 0 && pos < bruteLength; ++pos)
    {
        /* Current digit plus whatever is still left to add. */
        int sum = incrementBy + ourBrute[pos];
        /* The quotient is carried into the next digit ... */
        incrementBy = sum / charSetLen;
        /* ... and the remainder is the new value of this digit. */
        ourBrute[pos] = sum % charSetLen;
    }
}
// Hashing helper; its body is elided in this listing.
// The crack kernel calls it with a candidate buffer and that buffer's length,
// then compares the four values written through a1, b1, c1 and d1 with the
// four words of the target hash.
void md5_vfy(unsigned char* data, unsigned int length, unsigned int *a1, unsigned int *b1, unsigned int *c1, unsigned int *d1){
{...*some md5 hashing function*...}}
// Size in bytes of the candidate buffers (cudaBrute, correctPass and the
// private copies below).
#define CRACK_MAX_LEN 14
// Number of candidates each work-item tests per launch.
#define CRACK_ITERATIONS 200

// Brute-force search kernel.
//
// Every work-item copies the starting candidate from cudaBrute, advances it
// by its global id and then tests CRACK_ITERATIONS candidates, stepping by
// numThreads between tests. A candidate is a sequence of bruteLength indices
// into cudaCharSet. When the hash of a candidate equals (v1, v2, v3, v4) the
// candidate's characters are written to correctPass, padded with zero bytes.
//
// Parameters:
//   numThreads   stride between two candidates tested by the same work-item
//   charSetLen   number of characters in cudaCharSet
//   bruteLength  length of the candidates, at most CRACK_MAX_LEN
//   v1..v4       the four words of the target hash
//   cudaBrute    starting candidate, CRACK_MAX_LEN bytes
//   cudaCharSet  character set, charSetLen bytes
//   correctPass  result buffer, CRACK_MAX_LEN bytes
__kernel void crack(unsigned int numThreads, unsigned int charSetLen,
                unsigned int bruteLength, unsigned int v1,
                unsigned int v2, unsigned int v3, unsigned int v4,
                __constant unsigned char *cudaBrute, 
                __constant unsigned char *cudaCharSet,
                __global unsigned char *correctPass){
    // A longer candidate would overrun the fixed-size private buffers.
    if(bruteLength > CRACK_MAX_LEN)
    {
        return;
    }

    const unsigned int idx = get_global_id(0);
    unsigned char word[CRACK_MAX_LEN];
    unsigned char ourBrute[CRACK_MAX_LEN];
    unsigned int i;

    for(i = 0; i < CRACK_MAX_LEN; i++)
    {
        ourBrute[i] = cudaBrute[i];
        // Zero the tail so the bytes after the candidate are defined when
        // the whole buffer is copied out on a match.
        word[i] = 0;
    }

    // Give every work-item its own starting candidate.
    IncrementBruteGPU(ourBrute, charSetLen, bruteLength, idx);

    for(int timer = 0; timer < CRACK_ITERATIONS; timer++)
    {
        // Translate the character-set indices into the actual string.
        for(i = 0; i < bruteLength; i++)
        {
            word[i] = cudaCharSet[ourBrute[i]];
        }

        unsigned int c1 = 0, c2 = 0, c3 = 0, c4 = 0;
        md5_vfy(word, bruteLength, &c1, &c2, &c3, &c4);

        if(c1 == v1 && c2 == v2 && c3 == v3 && c4 == v4)
        {
            // word is zero beyond bruteLength, so this copy also writes the
            // terminating zero whenever the candidate is shorter than the
            // buffer, and it never writes past the end of correctPass.
            for(i = 0; i < CRACK_MAX_LEN; i++)
            {
                correctPass[i] = word[i];
            }
        }

        // Skip over the candidates handled by the other work-items.
        IncrementBruteGPU(ourBrute, charSetLen, bruteLength, numThreads);
    }
}
and this is the main:
//just the main, not the entire main.cpp
int main( int argc, char** argv){
int digit=1;
int charSetLen = 0;
char hash[32];
char *strhash[32];
printf("Insert Hash: ");
scanf("%s", strhash);
system("cls");
int numThreads = BLOCKS * THREADS_PER_BLOCK;
unsigned char currentBrute[14];
unsigned char cpuCorrectPass[14];
ZeroFill(currentBrute, 14);
ZeroFill(cpuCorrectPass, 14);
charSetLen = 65;
unsigned char charSet[65];
memcpy(charSet, " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@_", charSetLen);
memcpy(hash, strhash, 32);
//break hash into 4 processes of MD5
unsigned int v1, v2, v3, v4;
md5_to_ints(hash,&v1,&v2,&v3,&v4);
//openCL starts here
cl_platform_id cpPlatform;        // OpenCL platform
cl_device_id device_id;           // device ID
cl_context context;               // context
cl_command_queue queue;           // command queue
cl_program program;               // program
cl_kernel kernel;                 // kernel
cl_int err;
cl_mem correctPass;
cl_mem cudaCharSet;
cl_mem cudaBrute;
size_t globalSize, localSize;
size_t bytes = 14*sizeof(char);
//5 work-groups
localSize = 10;
globalSize = 50;
 // Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
if(err < 0) {
  perror("Couldn't identify a platform");
  exit(1);
} 
// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if(err == CL_DEVICE_NOT_FOUND) {
  err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
}
if(err < 0) {
  perror("Couldn't access any devices");
  exit(1);   
}
// Create a context  
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if(err < 0) {
  perror("Couldn't create a context");
  exit(1);   
}
// Create a command queue 
queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if(err < 0) {
  perror("Couldn't create a command queue");
  exit(1);   
}
// Build the program executable 
program = build_program(context, device_id, PROGRAM_FILE);
 // Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if(err < 0) {
  perror("Couldn't create a kernel");
  exit(1);
}
// Create the input and output arrays in device memory for our calculation
cudaBrute = clCreateBuffer(context, CL_MEM_READ_ONLY, 14, NULL, NULL);
cudaCharSet = clCreateBuffer(context, CL_MEM_READ_ONLY, 95, NULL, NULL);
correctPass = clCreateBuffer(context, CL_MEM_READ_WRITE, 14, NULL, NULL);
// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, correctPass, CL_TRUE, 0,
    bytes, cpuCorrectPass, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, cudaCharSet, CL_TRUE, 0,
    bytes, charSet, 0, NULL, NULL);
// Set the arguments to our compute kernel
err  = clSetKernelArg(kernel, 0, sizeof(unsigned int), &numThreads);
err  |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &charSetLen);
err  |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &digit);
err  |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &v1);
err  |= clSetKernelArg(kernel, 4, sizeof(unsigned int), &v2);
err  |= clSetKernelArg(kernel, 5, sizeof(unsigned int), &v3);
err  |= clSetKernelArg(kernel, 6, sizeof(unsigned int), &v4);
err  |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &cudaBrute);
err  |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &cudaCharSet);
err  |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &correctPass);
bool finished = false;
int ct = 0;
while(true){
do{
    err = clEnqueueWriteBuffer(queue, cudaBrute, CL_TRUE, 0,
        bytes, currentBrute, 0, NULL, NULL);
// Execute the kernel over the entire range of the data set  
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                          0, NULL, NULL);
// Wait for the command queue to get serviced before reading back results
    clFinish(queue);
// Read the results from the device
    clEnqueueReadBuffer(queue, correctPass, CL_TRUE, 0, bytes, cpuCorrectPass, 0, NULL, NULL );
    if(cpuCorrectPass[0] != 0)
    {       
        printf("MD5 Cracked---->\t");
        int k = 0;
        while(cpuCorrectPass[k] != 0)
        {
            printf("%c", cpuCorrectPass[k]);
            k++;
        }
        printf("\n\n");
        return 0;
    }
    finished = BruteIncrement(currentBrute, charSetLen, digit, numThreads * 200);
    if(ct % OUTPUT_INTERVAL == 0)
    {
        printf("STATUS: ");
        int k = 0;
        for(k = 0; k < digit; k++)
            printf("%c",charSet[currentBrute[k]]);
        printf("\n");
    }
    ct++;
} while(!finished);
    digit=digit+1;
}   
// release OpenCL resources
clReleaseMemObject(correctPass);
clReleaseMemObject(cudaCharSet);
clReleaseMemObject(cudaBrute);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;}
The problem with this program is that it never finds the right string. It seems that comparing the brute-forced hashes with the input hash isn't working, even though the CUDA version works perfectly.
Please tell me what prevents this from running correctly. I suspect that either the kernel isn't running at all, or my lack of understanding of reading and writing memory and buffers in OpenCL (or in general) is causing this.
*If you want to see all the files, please ask; I think it would be too long if I posted them here. Thanks in advance, and sorry for the bad formatting.
Your kernel is reading and writing from constant arrays defined at program scope in your OpenCL kernel source code (cudaBrute, cudaCharSet, correctPass). These arrays are not initialised, and the host will never be able to get the output from the kernel. To transfer input data from the host to a kernel and to retrieve results from a kernel, you need to use kernel arguments, not program scope variables.
Your kernel definition should look something like this:
// Kernel skeleton that receives all of its data through kernel arguments:
// seven scalars passed by value followed by three buffers passed as
// __global pointers. The body is left out of this example.
__kernel void crack(unsigned int numThreads, unsigned int charSetLen,
                    unsigned int bruteLength, unsigned int v1,
                    unsigned int v2, unsigned int v3, unsigned int v4,
                    __global uchar *cudaBrute, 
                    __global uchar *cudaCharSet,
                    __global uchar *correctPass)
{
  ...
  (do stuff with the arguments)
  ...
}
To set the arguments from your host code, you would do something like this:
// Set the arguments to our compute kernel
// Indices 0-6 pass scalar values; indices 7-9 pass cl_mem buffer handles.
// The individual status codes are OR-ed together into err.
err  = clSetKernelArg(kernel, 0, sizeof(int), &numThreads);
err  |= clSetKernelArg(kernel, 1, sizeof(int), &charSetLen);
err  |= clSetKernelArg(kernel, 2, sizeof(int), &digit);
err  |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &v1);
err  |= clSetKernelArg(kernel, 4, sizeof(unsigned int), &v2);
err  |= clSetKernelArg(kernel, 5, sizeof(unsigned int), &v3);
err  |= clSetKernelArg(kernel, 6, sizeof(unsigned int), &v4);
err  |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &cudaBrute);
err  |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &cudaCharSet);
err  |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &correctPass);
Notice the second argument, which is the argument index in your kernel definition, and how for the last three arguments we are now passing in the buffer we created with clCreateBuffer.
(EDIT: A couple more issues were found after further debugging)
You are updating the value of digit on the host. In order to pass this updated value to the device for each kernel invocation, you need to re-set the kernel argument. You can do this simply by moving this line to just before your clEnqueueNDRangeKernel call:
err  |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &digit);
When you write data to the cudaCharSet buffer, you need to make sure you are writing the correct amount. Your code currently uses bytes (which is 14), but this should really be charSetLen (which is 65):
err = clEnqueueWriteBuffer(queue, cudaCharSet, CL_TRUE, 0,
                           charSetLen, charSet, 0, NULL, NULL);
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With