Sfoglia il codice sorgente

Initial check in

A set of OpenCL test programs for testing various implementations
Z
Rafael M. Rubio 7 anni fa
parent
commit
971c047810
12 ha cambiato i file con 1316 aggiunte e 0 eliminazioni
  1. 27
    0
      Makefile
  2. 136
    0
      sha256.c
  3. 176
    0
      sha256.cl
  4. 12
    0
      sha256.h
  5. 55
    0
      shamain.cpp
  6. 223
    0
      shatest.cl
  7. 143
    0
      vecadd1.c
  8. 127
    0
      vecadd2.c
  9. 132
    0
      vecadd3.c
  10. 8
    0
      vecadd3.cl
  11. 253
    0
      vecadd4.c
  12. 24
    0
      vecadd4.cl

+ 27
- 0
Makefile Vedi File

@@ -0,0 +1,27 @@
1
+CC=gcc
2
+CPP=g++
3
+INC=
4
+LIBS=-lOpenCL -lm
5
+
6
+all: vecadd1 vecadd2 vecadd3 vecadd4
7
+
8
+vecadd1: vecadd1.c
9
+	$(CC) vecadd1.c -o vecadd1 $(INC) $(LIBS)
10
+
11
+vecadd2: vecadd2.c
12
+	$(CPP) vecadd2.c -o vecadd2 $(INC) $(LIBS)
13
+
14
+vecadd3: vecadd3.c vecadd3.cl
15
+	$(CPP) vecadd3.c -o vecadd3 $(INC) $(LIBS)
16
+
17
+vecadd4: vecadd4.c vecadd4.cl
18
+	$(CPP) vecadd4.c -o vecadd4 $(INC) $(LIBS)
19
+
20
+sha256.o: sha256.c
21
+	$(CPP) -c sha256.c $(INC)
22
+
23
+shamain: shamain.cpp sha256.o
24
+	$(CPP) shamain.cpp -o shamain sha256.o $(INC) $(LIBS) 
25
+
26
+clean: # -f: do not fail when some artifacts were never built (e.g. shamain is not part of "all")
27
+	rm -f vecadd1 vecadd2 vecadd3 vecadd4 sha256.o shamain

+ 136
- 0
sha256.c Vedi File

@@ -0,0 +1,136 @@
1
+#define _CRT_SECURE_NO_WARNINGS
2
+#include "sha256.h"
3
+
4
+static cl_platform_id platform_id = NULL;
5
+static cl_device_id device_id = NULL;  
6
+static cl_uint ret_num_devices;
7
+static cl_uint ret_num_platforms;
8
+static cl_context context;
9
+
10
+static cl_int ret;
11
+
12
+static char* source_str;
13
+static size_t source_size;
14
+
15
+static cl_program program;
16
+static cl_kernel kernel;
17
+static cl_command_queue command_queue;
18
+
19
+
20
+static cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys, data_info;
21
+static cl_uint *partial_hashes;
22
+static cl_uint *res_hashes;
23
+static char *saved_plain;
24
+static unsigned int datai[3];
25
+static int have_full_hashes;
26
+
27
+static size_t kpc = 4;
28
+
29
+static size_t global_work_size=3;
30
+static size_t local_work_size=1;
31
+static size_t string_len;
32
+
33
+void load_source();
34
+void createDevice();
35
+void createkernel();
36
+void create_clobj();
37
+
38
+void crypt_all();
39
+
40
+
41
+void sha256_init(size_t user_kpc) // One-time setup: stores user_kpc in kpc, loads the kernel source, then creates the OpenCL objects.
42
+{
43
+    kpc = user_kpc; // scales the key buffers, which are SHA256_PLAINTEXT_LENGTH * kpc bytes
44
+    load_source();
45
+    createDevice(); // sets context, which createkernel() and create_clobj() use
46
+    createkernel(); // sets kernel and command_queue, which create_clobj() uses
47
+    create_clobj();
48
+}
49
+
50
+void sha256_crypt(char input[], char* output)
51
+{
52
+    // Hashes the NUL-terminated input on the OpenCL device, writes the digest to
53
+    // output as 64 hex digits plus a NUL (output needs 65 bytes), then prints both.
54
+    string_len = strlen(input);
55
+    if (string_len + 1 > SHA256_PLAINTEXT_LENGTH * kpc) { // would overrun saved_plain
56
+        fprintf(stderr, "Input too long for key buffer.\n");
57
+        output[0] = '\0';
58
+        return;
59
+    }
60
+    global_work_size = 3;
61
+    datai[0] = SHA256_PLAINTEXT_LENGTH;
62
+    datai[1] = global_work_size;
63
+    datai[2] = string_len;
64
+    memcpy(saved_plain, input, string_len+1);
65
+    crypt_all();
66
+    // The kernel writes 8 32-bit words. SHA256_RESULT_SIZE (32) is used here as the digest size
67
+    // in bytes; looping up to it as a word count emitted 256 hex digits and overran output.
68
+    for (size_t i = 0; i < SHA256_RESULT_SIZE / sizeof(cl_uint); i++)
69
+        sprintf(output+i*8,"%08x", partial_hashes[i]);
70
+    printf("'%s':\n%s\n", input, output);
71
+}
72
+
73
+void crypt_all() // One hashing pass: uploads datai and saved_plain, launches the kernel, reads buffer_out into partial_hashes.
74
+{
75
+    //printf("%s\n",saved_plain);
76
+    ret = clEnqueueWriteBuffer(command_queue, data_info, CL_TRUE, 0, sizeof(unsigned int) * 3, datai, 0, NULL, NULL);
77
+    ret = clEnqueueWriteBuffer(command_queue, buffer_keys, CL_TRUE, 0, SHA256_PLAINTEXT_LENGTH * kpc, saved_plain, 0, NULL, NULL);
78
+    // printf("%s\n",buffer_keys);
79
+    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
80
+    // NOTE(review): each status code is stored in ret and overwritten by the next call without being checked.
81
+    ret = clFinish(command_queue);
82
+    // read back partial hashes
83
+    ret = clEnqueueReadBuffer(command_queue, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, partial_hashes, 0, NULL, NULL);
84
+    have_full_hashes = 0;
85
+}
86
+
87
+void load_source() // Reads up to MAX_SOURCE_SIZE bytes of sha256.cl into source_str and stores the byte count in source_size.
88
+{
89
+    FILE *fp = fopen("sha256.cl", "r");
90
+    if (!fp) {
91
+        fprintf(stderr, "Failed to load kernel.\n");
92
+        exit(1);
93
+    }
94
+    source_str = (char*)malloc(MAX_SOURCE_SIZE);
95
+    if (!source_str) { // fread() into a NULL buffer would crash; fail with a message instead
96
+        fprintf(stderr, "Failed to allocate kernel source buffer.\n");
97
+        exit(1);
98
+    }
99
+    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
100
+    fclose( fp );
101
+}
102
+
103
+void create_clobj(){ // Creates and maps the CL_MEM_ALLOC_HOST_PTR staging buffers, creates the device buffers, and binds the device buffers as kernel arguments 0-2.
104
+    pinned_saved_keys = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, (SHA256_PLAINTEXT_LENGTH)*kpc, NULL, &ret);
105
+    saved_plain = (char*)clEnqueueMapBuffer(command_queue, pinned_saved_keys, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, (SHA256_PLAINTEXT_LENGTH)*kpc, 0, NULL, NULL, &ret);
106
+    memset(saved_plain, 0, SHA256_PLAINTEXT_LENGTH * kpc);
107
+    res_hashes = (cl_uint *)malloc(sizeof(cl_uint) * SHA256_RESULT_SIZE); // NOTE(review): allocation result is not checked before the memset below
108
+    memset(res_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
109
+    pinned_partial_hashes = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
110
+    partial_hashes = (cl_uint *) clEnqueueMapBuffer(command_queue, pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, 0, NULL, NULL, &ret);
111
+    memset(partial_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE); // NOTE(review): region was mapped with CL_MAP_READ only but is written here - confirm this is valid
112
+    // Device-side buffers passed to the kernel: input keys, output digest words, and the 3-word info block.
113
+    buffer_keys = clCreateBuffer(context, CL_MEM_READ_ONLY, (SHA256_PLAINTEXT_LENGTH) * kpc, NULL, &ret);
114
+    buffer_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
115
+    data_info = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int) * 3, NULL, &ret);
116
+    // Argument indices follow the kernel parameter order (data_info, plain_key, digest); return codes are ignored.
117
+    clSetKernelArg(kernel, 0, sizeof(data_info), (void *) &data_info);
118
+    clSetKernelArg(kernel, 1, sizeof(buffer_keys), (void *) &buffer_keys);
119
+    clSetKernelArg(kernel, 2, sizeof(buffer_out), (void *) &buffer_out);
120
+}
121
+
122
+void createDevice() // Requests one platform and one GPU device on it, then creates a context for that device.
123
+{
124
+    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
125
+    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
126
+    // NOTE(review): ret is not checked here, so a missing platform or GPU device is not reported.
127
+    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
128
+}
129
+
130
+void createkernel() // Creates the program from source_str, builds it for device_id, then creates the kernel and the command queue.
131
+{
132
+    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
133
+    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); // NOTE(review): build status is not checked and no build log is fetched
134
+    kernel = clCreateKernel(program, "sha256_crypt_kernel", &ret); // name of the __kernel function to look up in the built program
135
+    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
136
+}

+ 176
- 0
sha256.cl Vedi File

@@ -0,0 +1,176 @@
1
+#ifndef uint32_t
2
+#define uint32_t unsigned int
3
+#endif
4
+
5
+#define H0 0x6a09e667
6
+#define H1 0xbb67ae85
7
+#define H2 0x3c6ef372
8
+#define H3 0xa54ff53a
9
+#define H4 0x510e527f
10
+#define H5 0x9b05688c
11
+#define H6 0x1f83d9ab
12
+#define H7 0x5be0cd19
13
+
14
+
15
+uint rotr(uint x, int n) { // Rotates x right by n bits when n < 32; for n >= 32 returns x unchanged.
16
+  if (n < 32) return (x >> n) | (x << (32 - n));
17
+  return x;
18
+}
19
+
20
+uint ch(uint x, uint y, uint z) { // SHA-256 Ch: takes each bit from y where x is 1 and from z where x is 0.
21
+  return (x & y) ^ (~x & z);
22
+}
23
+
24
+uint maj(uint x, uint y, uint z) { // SHA-256 Maj: each result bit is the majority of the three input bits.
25
+  return (x & y) ^ (x & z) ^ (y & z);
26
+}
27
+
28
+uint sigma0(uint x) { // XOR of x rotated right by 2, 13 and 22 bits; applied to A in the round loop.
29
+  return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
30
+}
31
+
32
+uint sigma1(uint x) { // XOR of x rotated right by 6, 11 and 25 bits; applied to E in the round loop.
33
+  return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
34
+}
35
+
36
+uint gamma0(uint x) { // XOR of x rotated right by 7 and 18 bits and x shifted right by 3; used on W[t - 15] in the schedule.
37
+  return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3);
38
+}
39
+
40
+uint gamma1(uint x) { // XOR of x rotated right by 17 and 19 bits and x shifted right by 10; used on W[t - 2] in the schedule.
41
+  return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10);
42
+}
43
+
44
+
45
+
46
+__kernel void sha256_crypt_kernel(__global uint *data_info,__global char *plain_key,  __global uint *digest){
47
+  int t, gid, msg_pad;
48
+  int stop, mmod;
49
+  uint i, ulen, item, total;
50
+  uint W[80], temp, A,B,C,D,E,F,G,H,T1,T2;
51
+  uint num_keys = data_info[1];
52
+  int current_pad;
53
+  // Hashes the first data_info[2] bytes of plain_key with SHA-256 and leaves the eight state words in digest[0..7].
54
+ //printf(get_global_id(0));
55
+  // NOTE(review): digest[] is indexed without the work-item id, so every work-item writes the same eight words.
56
+  uint K[64]={
57
+0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
58
+0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
59
+0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
60
+0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
61
+0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
62
+0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
63
+0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
64
+0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
65
+};
66
+
67
+  msg_pad=0;
68
+  // Block count including padding: a final partial block of 56 or more bytes has no room for the length word, so it needs one more block.
69
+  ulen = data_info[2];
70
+  total = (ulen%64>=56?2:1) + ulen/64;
71
+  // Parenthesised on purpose: "c ? 2 : 1 + ulen/64" parses as "c ? 2 : (1 + ulen/64)" and drops blocks when ulen >= 64 and ulen%64 >= 56.
72
+  //printf("ulen: %u total:%u\n", ulen, total);
73
+
74
+  digest[0] = H0;
75
+  digest[1] = H1;
76
+  digest[2] = H2;
77
+  digest[3] = H3;
78
+  digest[4] = H4;
79
+  digest[5] = H5;
80
+  digest[6] = H6;
81
+  digest[7] = H7;
82
+  for(item=0; item<total; item++)
83
+  {
84
+    // Working variables start from the running hash state.
85
+    A = digest[0];
86
+    B = digest[1];
87
+    C = digest[2];
88
+    D = digest[3];
89
+    E = digest[4];
90
+    F = digest[5];
91
+    G = digest[6];
92
+    H = digest[7];
93
+    // Clear the message schedule; words not written below stay zero and act as padding.
94
+#pragma unroll
95
+    for (t = 0; t < 80; t++){
96
+    W[t] = 0x00000000;
97
+    }
98
+    msg_pad=item*64;
99
+    if(ulen > msg_pad)
100
+    {
101
+      current_pad = (ulen-msg_pad)>64?64:(ulen-msg_pad);
102
+    }
103
+    else
104
+    {
105
+      current_pad =-1;
106
+    }
107
+    // current_pad is the number of message bytes in this block, or -1 when the block holds only padding.
108
+   // printf("current_pad: %d\n",current_pad);
109
+    if(current_pad>0)
110
+    {
111
+      i=current_pad;
112
+      // Pack each whole group of 4 message bytes big-endian into one word of W[].
113
+      stop =  i/4;
114
+  //    printf("i:%d, stop: %d msg_pad:%d\n",i,stop, msg_pad);
115
+      for (t = 0 ; t < stop+get_global_id(0) ; t++){ // NOTE(review): bound grows with the work-item id, so ids other than 0 load extra words and place the 0x80 marker later - confirm this is intentional
116
+        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
117
+        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
118
+        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
119
+        W[t] |= (uchar)  plain_key[msg_pad + t * 4 + 3];
120
+   //     printf("W[%u]: %u\n",t,W[t]);
121
+      }
122
+      mmod = i % 4; // leftover bytes share a word with the 0x80 end-of-message marker
123
+      if ( mmod == 3){
124
+        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
125
+        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
126
+        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
127
+        W[t] |=  ((uchar) 0x80) ;
128
+      } else if (mmod == 2) {
129
+        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
130
+        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
131
+        W[t] |=  0x8000 ;
132
+      } else if (mmod == 1) {
133
+        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
134
+        W[t] |=  0x800000 ;
135
+      } else /*if (mmod == 0)*/ {
136
+        W[t] =  0x80000000 ;
137
+      }
138
+      // The bit length goes into the last word only when this block has fewer than 56 message bytes.
139
+      if (current_pad<56)
140
+      {
141
+        W[15] =  ulen*8 ;
142
+   //     printf("ulen avlue 2 :w[15] :%u\n", W[15]);
143
+      }
144
+    }
145
+    else if(current_pad <0)
146
+    {
147
+      if( ulen%64==0)
148
+        W[0]=0x80000000;
149
+      W[15]=ulen*8;
150
+      //printf("ulen avlue 3 :w[15] :%u\n", W[15]);
151
+    }
152
+    // 64 rounds; W[16..63] are expanded from earlier words as the loop reaches them.
153
+    for (t = 0; t < 64; t++) {
154
+      if (t >= 16)
155
+        W[t] = gamma1(W[t - 2]) + W[t - 7] + gamma0(W[t - 15]) + W[t - 16];
156
+      T1 = H + sigma1(E) + ch(E, F, G) + K[t] + W[t];
157
+      T2 = sigma0(A) + maj(A, B, C);
158
+      H = G; G = F; F = E; E = D + T1; D = C; C = B; B = A; A = T1 + T2;
159
+    }
160
+    digest[0] += A;
161
+    digest[1] += B;
162
+    digest[2] += C;
163
+    digest[3] += D;
164
+    digest[4] += E;
165
+    digest[5] += F;
166
+    digest[6] += G;
167
+    digest[7] += H;
168
+
169
+
170
+  }
171
+
172
+
173
+  // printf("hi");
174
+
175
+
176
+}

+ 12
- 0
sha256.h Vedi File

@@ -0,0 +1,12 @@
1
+#include <stdio.h>
2
+#include <stdlib.h>
3
+#include <CL/opencl.h>
4
+#include <string.h>
5
+
6
+#define SHA256_PLAINTEXT_LENGTH 80
7
+#define SHA256_RESULT_SIZE 32
8
+#define MAX_SOURCE_SIZE 16384
9
+
10
+void sha256_init(size_t user_kpc);
11
+void sha256_crypt(char input[], char* output);
12
+

+ 55
- 0
shamain.cpp Vedi File

@@ -0,0 +1,55 @@
1
+#define _CRT_SECURE_NO_WARNINGS
2
+#include "sha256.h"
3
+// #include <stdio.h>
4
+// #include < string.h > 
5
+
6
+
7
+
8
+void crypt_and_print(char input[])
9
+{
10
+  // Builds the candidate string "xqqq" followed by a zero-padded 8-digit
11
+  // counter, hashes it with sha256_crypt() and, if the hex digest contains
12
+  // "00000", prints the digest from the first match onward.
13
+  // The input parameter is currently unused.
14
+
15
+  // 64 hex digits plus the terminating NUL, filled in by sha256_crypt().
16
+  char result[65];
17
+  const char diff[] = "00000";   // marker searched for in the digest
18
+
19
+  // Sized for "xqqq" plus any int printed with %08d plus the terminator. The
20
+  // previous 5-byte array ("xqqq" and its NUL) was overrun by strcat().
21
+  char candidate[32];
22
+
23
+  for (int i = 0; i < 1; i++)
24
+  {
25
+
26
+      // snprintf() does the zero padding and bounds the write in one step,
27
+      // replacing the manual strlen()/strcat() padding loop.
28
+      snprintf(candidate, sizeof candidate, "xqqq%08d", i);
29
+
30
+      // sha256_crypt() itself prints the candidate and its digest.
31
+      sha256_crypt(candidate, result);
32
+
33
+      // Search the whole hex digest for the marker.
34
+      const char *istr = strstr(result, diff);
35
+
36
+      if (istr != NULL) {
37
+          // Print through "%s": the digest is data, not a format string.
38
+          printf("%s", istr);
39
+          break;
40
+      }
41
+
42
+
43
+  }
44
+
45
+}
46
+
47
+int main()
48
+{
49
+  // Set up the OpenCL SHA-256 state with kpc = 2048, then run the search once.
50
+  sha256_init(2048);
51
+
52
+  // crypt_and_print() does not read its argument; it builds its own candidate string.
53
+  crypt_and_print((char*)"");
54
+  return 0;
55
+}

+ 223
- 0
shatest.cl Vedi File

@@ -0,0 +1,223 @@
1
+#ifndef uint32_t
2
+#define uint32_t unsigned int
3
+#endif
4
+
5
+#define H0 0x6a09e667
6
+#define H1 0xbb67ae85
7
+#define H2 0x3c6ef372
8
+#define H3 0xa54ff53a
9
+#define H4 0x510e527f
10
+#define H5 0x9b05688c
11
+#define H6 0x1f83d9ab
12
+#define H7 0x5be0cd19
13
+
14
+uint rotr(uint x, int n) { // Rotates x right by n bits when n < 32; for n >= 32 returns x unchanged.
15
+  if (n < 32) return (x >> n) | (x << (32 - n));
16
+  return x;
17
+}
18
+
19
+uint ch(uint x, uint y, uint z) { // SHA-256 Ch: takes each bit from y where x is 1 and from z where x is 0.
20
+  return (x & y) ^ (~x & z);
21
+}
22
+
23
+uint maj(uint x, uint y, uint z) { // SHA-256 Maj: each result bit is the majority of the three input bits.
24
+  return (x & y) ^ (x & z) ^ (y & z);
25
+}
26
+
27
+uint sigma0(uint x) { // XOR of x rotated right by 2, 13 and 22 bits; applied to A in the round loop.
28
+  return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
29
+}
30
+
31
+uint sigma1(uint x) { // XOR of x rotated right by 6, 11 and 25 bits; applied to E in the round loop.
32
+  return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
33
+}
34
+
35
+uint gamma0(uint x) { // XOR of x rotated right by 7 and 18 bits and x shifted right by 3; used on W[t-15] in the schedule.
36
+  return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3);
37
+}
38
+
39
+uint gamma1(uint x) { // XOR of x rotated right by 17 and 19 bits and x shifted right by 10; used on W[t-2] in the schedule.
40
+  return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10);
41
+}
42
+
43
+__kernel void sha256_crypt_kernel(__global uint *data_info,__global char *plain_key,  __global uint *digest) {
44
+  int t, gid, msg_pad;
45
+  int stop, mmod;
46
+  uint i, ulen, item, total;
47
+  uint W[80], temp, A,B,C,D,E,F,G,H,T1,T2;
48
+  uint num_keys = data_info[1];
49
+  int current_pad;
50
+  uint tacc, wt, wtm2, wtm7, wtm15, wtm16;
51
+  uint tsh1; // short
52
+  uint tsh2; // short
53
+  uchar uch1, uch2, uch3, uch4;
54
+  uchar uch[4];
55
+  // Hashes the first data_info[2] bytes of plain_key with SHA-256 and leaves the eight state words in digest[0..7].
56
+ //printf(get_global_id(0));
57
+  uint K[64]={
58
+0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
59
+0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
60
+0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
61
+0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
62
+0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
63
+0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
64
+0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
65
+0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
66
+};
67
+  // NOTE(review): digest[] is indexed without the work-item id, so every work-item writes the same eight words.
68
+  msg_pad=0;
69
+  // Block count including padding: a final partial block of 56 or more bytes has no room for the length word, so it needs one more block.
70
+  ulen = data_info[2];
71
+  total = (ulen%64>=56?2:1) + ulen/64;
72
+  // Parenthesised on purpose: "c ? 2 : 1 + ulen/64" parses as "c ? 2 : (1 + ulen/64)" and drops blocks when ulen >= 64 and ulen%64 >= 56.
73
+  //printf("ulen: %u total:%u\n", ulen, total);
74
+
75
+  digest[0] = H0;
76
+  digest[1] = H1;
77
+  digest[2] = H2;
78
+  digest[3] = H3;
79
+  digest[4] = H4;
80
+  digest[5] = H5;
81
+  digest[6] = H6;
82
+  digest[7] = H7;
83
+  for(item=0; item<total; item++)
84
+  {
85
+    // Working variables start from the running hash state.
86
+    A = digest[0];
87
+    B = digest[1];
88
+    C = digest[2];
89
+    D = digest[3];
90
+    E = digest[4];
91
+    F = digest[5];
92
+    G = digest[6];
93
+    H = digest[7];
94
+    // Clear the message schedule; words not written below stay zero and act as padding.
95
+#pragma unroll
96
+    for (t = 0; t < 80; t++){
97
+      W[t] = 0x00000000;
98
+    }
99
+    msg_pad=item*64;
100
+    if(ulen > msg_pad)
101
+    {
102
+      current_pad = (ulen-msg_pad)>64?64:(ulen-msg_pad);
103
+    }
104
+    else
105
+    {
106
+      current_pad =-1;
107
+    }
108
+    // current_pad is the number of message bytes in this block, or -1 when the block holds only padding.
109
+    if(current_pad>0)
110
+    {
111
+      i=current_pad;
112
+      // Pack each whole group of 4 message bytes big-endian into one word of W[].
113
+      stop =  i/4;
114
+      for (t = 0 ; t < stop+get_global_id(0) ; t++){ // NOTE(review): bound grows with the work-item id, so ids other than 0 load extra words and place the 0x80 marker later - confirm this is intentional
115
+        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
116
+        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
117
+        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
118
+        W[t] |= (uchar)  plain_key[msg_pad + t * 4 + 3];
119
+      }
120
+      mmod = i % 4; // leftover bytes share a word with the 0x80 end-of-message marker
121
+      if ( mmod == 3){
122
+        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
123
+        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
124
+        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
125
+        W[t] |=  ((uchar) 0x80) ;
126
+      } else if (mmod == 2) {
127
+        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
128
+        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
129
+        W[t] |=  0x8000 ;
130
+      } else if (mmod == 1) {
131
+        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
132
+        W[t] |=  0x800000 ;
133
+      } else { //if (mmod == 0)
134
+        W[t] =  0x80000000 ;
135
+      }
136
+      // The bit length goes into the last word only when this block has fewer than 56 message bytes.
137
+      if (current_pad<56)
138
+      {
139
+        W[15] =  ulen*8 ;
140
+      }
141
+    }
142
+    else if(current_pad <0)
143
+    {
144
+      if( ulen%64==0)
145
+        W[0]=0x80000000;
146
+      W[15]=ulen*8;
147
+    }
148
+    for (t = 0; t < 64; t++) {
149
+      if (t >= 16)
150
+        // W[t] = gamma1(W[t - 2]) + W[t - 7] + gamma0(W[t - 15]) + W[t - 16]; // 64-bit
151
+        // W[t] = (W[t - 2]) + W[t - 7] + (W[t - 15]) + W[t - 16]; // 64-bit
152
+        // W[t] = (W[t - 2] + W[t - 7] + W[t - 15] + W[t - 16])&0xffffffff; // 64-bit
153
+        // W[t] = (W[t - 2] + W[t - 7])&0xffffffff; // 64-bit
154
+        // W[t] = W[t - 2]; // 64-bit // 64-bit
155
+        // W[t] = 0x0 ^ W[t - 2]; // 64-bit
156
+        // W[t] = (uint)W[t-2]; // 64-bit
157
+        // W[t] = 0xffffffff & W[t-2] & 0xffffffff; // 64-bit
158
+        // W[t] = W[t-2]>>16<<16 + W[t-2]&0x0000ffff; // 64-bit
159
+        // W[t] = (uint)(W[t-2]>>16); // 64-bit
160
+        // W[t] = (uint)(W[t-2]/2); // 64-bit
161
+        // W[t] = (int)(W[0]); // 64-bit
162
+        // W[t] = W[t] + W[t]; // 64-bit
163
+        // W[t] = W[t-2]; // 64-bit
164
+        // W[t] = wtm2 + wtm7 + wtm15 + wtm16; // 64-bit
165
+        // W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
166
+        // W[t] = ((uchar)  uch[0]) << 24; // 64-bit
167
+        // W[t] |= ((uchar)  uch[1]) << 16; // 64-bit
168
+        // W[t] |= ((uchar)  uch[2]) << 8; // 64-bit
169
+        // W[t] |= ((uchar)  uch[3]) << 0; // 64-bit
170
+        // wt = W[t]; // This works, but reassignment to W[t] fails as 64-bit
171
+        // wtm2 = W[t-2];
172
+        // wtm7 = W[t-7];
173
+        // wtm15 = W[t-15];
174
+        // wtm16 = W[t-16];
175
+        // tacc = gamma1(wtm2) + wtm7 + gamma0(wtm15) + wtm16; 
176
+        // Only the next statement belongs to the braceless if above; the T1/T2 round update runs for every t despite its indentation.
177
+        W[t] = gamma1(W[t-2]) + W[t-7] + gamma0(W[t-15]) + W[t-16];
178
+        T1 = H + sigma1(E) + ch(E, F, G) + K[t] + W[t];
179
+        T2 = sigma0(A) + maj(A, B, C);
180
+        H = G; G = F; F = E; E = D + T1; D = C; C = B; B = A; A = T1 + T2;
181
+
182
+
183
+/*
184
+        tsh1 = (uint)(tacc >> 16);
185
+        tsh2 = (uint)(tacc & 0xffff);
186
+        uch1 = (uchar)(tsh1>>8);
187
+        uch2 = (uchar)(tsh1&0xff);
188
+        uch3 = (uchar)(tsh2>>8);
189
+        uch4 = (uchar)(tsh2&0xff);
190
+        uch[0]=(uchar)uch1;
191
+        uch[1]=(uchar)uch2;
192
+        uch[2]=(uchar)uch3;
193
+        uch[3]=(uchar)uch4;
194
+        W[t] = t;
195
+        t = (uchar)uch1;
196
+        W[t] = (uint)(t);
197
+        W[0] = (uint)uch1;
198
+        W[t] |= ((uchar)uch2) << 16;
199
+        W[t] |= ((uchar)uch3) << 8;
200
+        W[t] |= (uchar)uch4;
201
+        tmpi = W[t-2];
202
+        W[t] = tmpi;
203
+        W[t] = 0xffffffff << 1;
204
+        W[t-2] = W[t-2];
205
+        W[t] = W[t];
206
+        W[t] = W[t]&0xffffffff;
207
+*/
208
+/*
209
+      T1 = H + sigma1(E) + ch(E, F, G) + K[t] + W[t];
210
+      T2 = sigma0(A) + maj(A, B, C);
211
+      H = G; G = F; F = E; E = D + T1; D = C; C = B; B = A; A = T1 + T2;
212
+*/
213
+    }
214
+    digest[0] += A;
215
+    digest[1] += B;
216
+    digest[2] += C;
217
+    digest[3] += D;
218
+    digest[4] += E;
219
+    digest[5] += F;
220
+    digest[6] += G;
221
+    digest[7] += H;
222
+  }
223
+}

+ 143
- 0
vecadd1.c Vedi File

@@ -0,0 +1,143 @@
1
+#include <stdio.h>
2
+#include <stdlib.h>
3
+#include <math.h>
4
+#include <CL/opencl.h>
5
+ 
6
+// OpenCL kernel. Each work item takes care of one element of c
7
+const char *kernelSource =                                       "\n" \
8
+"#pragma OPENCL EXTENSION cl_khr_fp64 : enable                    \n" \
9
+"__kernel void vecAdd(  __global double *a,                       \n" \
10
+"                       __global double *b,                       \n" \
11
+"                       __global double *c,                       \n" \
12
+"                       const unsigned int n)                    \n" \
13
+"{                                                               \n" \
14
+"    //Get our global thread ID                                  \n" \
15
+"    int id = get_global_id(0);                                  \n" \
16
+"                                                                \n" \
17
+"    //Make sure we do not go out of bounds                      \n" \
18
+"    if (id < n)                                                 \n" \
19
+"        c[id] = a[id] + b[id];                                  \n" \
20
+"}                                                               \n" \
21
+                                                                "\n" ;
22
+ 
23
+int main( int argc, char* argv[] ) // Adds two n-element double vectors on a GPU device and prints the mean of the result.
24
+{
25
+    // Length of vectors
26
+    unsigned int n = 100000;
27
+ 
28
+    // Host input vectors
29
+    double *h_a;
30
+    double *h_b;
31
+    // Host output vector
32
+    double *h_c;
33
+ 
34
+    // Device input buffers
35
+    cl_mem d_a;
36
+    cl_mem d_b;
37
+    // Device output buffer
38
+    cl_mem d_c;
39
+ 
40
+    cl_platform_id cpPlatform;        // OpenCL platform
41
+    cl_device_id device_id;           // device ID
42
+    cl_context context;               // context
43
+    cl_command_queue queue;           // command queue
44
+    cl_program program;               // program
45
+    cl_kernel kernel;                 // kernel
46
+ 
47
+    // Size, in bytes, of each vector
48
+    size_t bytes = n*sizeof(double);
49
+ 
50
+    // Allocate memory for each vector on host
51
+    h_a = (double*)malloc(bytes);
52
+    h_b = (double*)malloc(bytes);
53
+    h_c = (double*)malloc(bytes);
54
+    if (!h_a || !h_b || !h_c) { fprintf(stderr, "Host allocation failed.\n"); return 1; }
55
+    // Initialize vectors on host
56
+    unsigned int i; // same type as n, so the loop comparisons are not signed/unsigned
57
+    for( i = 0; i < n; i++ )
58
+    {
59
+        h_a[i] = sinf(i)*sinf(i);
60
+        h_b[i] = cosf(i)*cosf(i);
61
+    }
62
+ 
63
+    size_t globalSize, localSize;
64
+    cl_int err;
65
+ 
66
+    // Number of work items in each local work group
67
+    localSize = 64;
68
+ 
69
+    // Number of total work items - localSize must be a divisor; integer round-up avoids float precision loss for large n
70
+    globalSize = ((n + localSize - 1) / localSize) * localSize;
71
+ 
72
+    // Bind to platform
73
+    err = clGetPlatformIDs(1, &cpPlatform, NULL);
74
+ 
75
+    // Get ID for the device
76
+    err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
77
+ 
78
+    // Create a context
79
+    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
80
+ 
81
+    // Create a command queue
82
+    queue = clCreateCommandQueue(context, device_id, 0, &err);
83
+ 
84
+    // Create the compute program from the source buffer
85
+    program = clCreateProgramWithSource(context, 1,
86
+                            (const char **) & kernelSource, NULL, &err);
87
+ 
88
+    // Build the program executable
89
+    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
90
+ 
91
+    // Create the compute kernel in the program we wish to run
92
+    kernel = clCreateKernel(program, "vecAdd", &err);
93
+ 
94
+    // Create the input and output arrays in device memory for our calculation
95
+    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
96
+    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
97
+    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
98
+ 
99
+    // Write our data set into the input array in device memory
100
+    err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
101
+                                   bytes, h_a, 0, NULL, NULL);
102
+    err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
103
+                                   bytes, h_b, 0, NULL, NULL);
104
+ 
105
+    // Set the arguments to our compute kernel
106
+    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
107
+    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
108
+    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
109
+    err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
110
+ 
111
+    // Execute the kernel over the entire range of the data set
112
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
113
+                                                              0, NULL, NULL);
114
+ 
115
+    // Wait for the command queue to get serviced before reading back results
116
+    clFinish(queue);
117
+ 
118
+    // Read the results from the device
119
+    clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
120
+                                bytes, h_c, 0, NULL, NULL );
121
+ 
122
+    //Sum up vector c and print result divided by n, this should equal 1 within error
123
+    double sum = 0;
124
+    for(i=0; i<n; i++)
125
+        sum += h_c[i];
126
+    printf("final result: %f\n", sum/n);
127
+ 
128
+    // release OpenCL resources
129
+    clReleaseMemObject(d_a);
130
+    clReleaseMemObject(d_b);
131
+    clReleaseMemObject(d_c);
132
+    clReleaseProgram(program);
133
+    clReleaseKernel(kernel);
134
+    clReleaseCommandQueue(queue);
135
+    clReleaseContext(context);
136
+ 
137
+    //release host memory
138
+    free(h_a);
139
+    free(h_b);
140
+    free(h_c);
141
+ 
142
+    return 0;
143
+}

+ 127
- 0
vecadd2.c Vedi File

@@ -0,0 +1,127 @@
1
+//Includes
2
+#include <stdio.h>
3
+#include <stdlib.h>
4
+#include <iostream>
5
+ 
6
+#ifdef __APPLE__
7
+#include <OpenCL/opencl.h>
8
+#else
9
+#include <CL/cl.h>
10
+#endif
11
+ 
12
+#define DATA_SIZE 10
13
+ 
14
+using namespace std;
15
+ 
16
+const char *ProgramSource = /* Kernel source: output[id] = inputA[id] + 2*inputB[id]. The constant is written 2.0f because an unsuffixed 2.0 is a double constant, which would pull the float arithmetic into double precision. */
17
+"__kernel void add(__global float *inputA, __global float *inputB, __global float *output)\n"\
18
+"{\n"\
19
+"  size_t id = get_global_id(0);\n"\
20
+"  output[id] = inputA[id] + 2.0f*inputB[id];\n"\
21
+"}\n";
22
+ 
23
+int main(void) // Runs the "add" kernel over two DATA_SIZE-element float arrays on a GPU device and prints the results.
24
+{
25
+cl_context context;
26
+cl_context_properties properties[3];
27
+cl_kernel kernel;
28
+cl_command_queue command_queue;
29
+cl_program program;
30
+cl_int err;
31
+cl_uint num_of_platforms=0;
32
+cl_platform_id platform_id;
33
+cl_device_id device_id;
34
+cl_uint num_of_devices=0;
35
+cl_mem inputA, inputB, output;
36
+ 
37
+size_t global;
38
+ 
39
+float inputDataA[DATA_SIZE]={1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
40
+float inputDataB[DATA_SIZE]={1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
41
+float results[DATA_SIZE]={0};
42
+ 
43
+int i;
44
+ 
45
+// retrieve a list of available platforms (only the first one is requested)
46
+if (clGetPlatformIDs(1, &platform_id, &num_of_platforms)!= CL_SUCCESS)
47
+{
48
+printf("Unable to get platform_id\n");
49
+return 1;
50
+}
51
+ 
52
+// try to get a supported GPU device
53
+if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_of_devices) != CL_SUCCESS)
54
+{
55
+printf("Unable to get device_id\n");
56
+return 1;
57
+}
58
+ 
59
+// context properties list - must be terminated with 0
60
+properties[0]= CL_CONTEXT_PLATFORM;
61
+properties[1]= (cl_context_properties) platform_id;
62
+properties[2]= 0;
63
+ 
64
+// create a context with the GPU device (NOTE(review): err is written here and below but never checked)
65
+context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
66
+ 
67
+// create command queue using the context and device
68
+command_queue = clCreateCommandQueue(context, device_id, 0, &err);
69
+ 
70
+// create a program from the kernel source code
71
+program = clCreateProgramWithSource(context,1,(const char **) &ProgramSource, NULL, &err);
72
+ 
73
+// compile the program
74
+if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
75
+{
76
+printf("Error building program\n");
77
+return 1;
78
+}
79
+ 
80
+// specify which kernel from the program to execute
81
+kernel = clCreateKernel(program, "add", &err);
82
+ 
83
+// create buffers for the input and output
84
+ 
85
+inputA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
86
+inputB = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
87
+output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
88
+ 
89
+// load data into the input buffer
90
+clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
91
+clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataB, 0, NULL, NULL);
92
+ 
93
+// set the argument list for the kernel command
94
+clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
95
+clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB);
96
+clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
97
+ 
98
+global=DATA_SIZE;
99
+ 
100
+// enqueue the kernel command for execution (one work-item per element; local size is left NULL)
101
+clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
102
+clFinish(command_queue);
103
+ 
104
+// copy the results from out of the output buffer
105
+clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) *DATA_SIZE, results, 0, NULL, NULL);
106
+ 
107
+// print the results
108
+printf("output: ");
109
+ 
110
+for(i=0;i<DATA_SIZE; i++)
111
+{
112
+printf("%f ",results[i]);
113
+}
114
+printf("\n");
115
+ 
116
+// cleanup - release OpenCL resources
117
+clReleaseMemObject(inputA);
118
+clReleaseMemObject(inputB);
119
+clReleaseMemObject(output);
120
+clReleaseProgram(program);
121
+clReleaseKernel(kernel);
122
+clReleaseCommandQueue(command_queue);
123
+clReleaseContext(context);
124
+ 
125
+return 0;
126
+ 
127
+}

+ 132
- 0
vecadd3.c Vedi File

@@ -0,0 +1,132 @@
1
+#include <stdio.h>
2
+#include <stdlib.h>
3
+ 
4
+#ifdef __APPLE__
5
+#include <OpenCL/opencl.h>
6
+#else
7
+#include <CL/cl.h>
8
+#endif
9
+ 
10
+#define MAX_SOURCE_SIZE (0x100000)
11
+ 
12
+int main(void) {
13
+    printf("started running\n");
14
+
15
+    // Create the two input vectors
16
+    int i;
17
+    const int LIST_SIZE = 1024;
18
+    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
19
+    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
20
+    for(i = 0; i < LIST_SIZE; i++) {
21
+        A[i] = i;
22
+        B[i] = LIST_SIZE - i;
23
+    }
24
+ 
25
+    // Load the kernel source code into the array source_str
26
+    FILE *fp;
27
+    char *source_str;
28
+    size_t source_size;
29
+ 
30
+    fp = fopen("vecadd3.cl", "r");
31
+    if (!fp) {
32
+        fprintf(stderr, "Failed to load kernel.\n");
33
+        exit(1);
34
+    }
35
+    source_str = (char*)malloc(MAX_SOURCE_SIZE);
36
+    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
37
+    fclose( fp );
38
+    printf("kernel loading done\n");
39
+
40
+    // Get platform and device information
41
+    cl_device_id device_id = NULL;   
42
+    cl_uint ret_num_devices;
43
+    cl_uint ret_num_platforms;
44
+    
45
+	
46
+	cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms);
47
+    cl_platform_id *platforms = NULL;
48
+    platforms = (cl_platform_id*)malloc(ret_num_platforms*sizeof(cl_platform_id));
49
+    ret = clGetPlatformIDs(ret_num_platforms, platforms, NULL);
50
+    printf("ret at %d is %d\n", __LINE__, ret);
51
+    ret = clGetDeviceIDs( platforms[0], CL_DEVICE_TYPE_ALL, 1, &device_id, &ret_num_devices);
52
+    printf("ret at %d is %d\n", __LINE__, ret);
53
+
54
+    // Create an OpenCL context
55
+    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
56
+ 	printf("ret at %d is %d\n", __LINE__, ret);
57
+
58
+    // Create a command queue
59
+    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
60
+ 	printf("ret at %d is %d\n", __LINE__, ret);
61
+
62
+    // Create memory buffers on the device for each vector 
63
+    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret);
64
+    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret);
65
+    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(int), NULL, &ret);
66
+ 
67
+    // Copy the lists A and B to their respective memory buffers
68
+    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
69
+    printf("ret at %d is %d\n", __LINE__, ret);
70
+
71
+    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
72
+    printf("ret at %d is %d\n", __LINE__, ret);
73
+ 
74
+    printf("before building\n");
75
+
76
+    // Create a program from the kernel source
77
+    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
78
+    printf("ret at %d is %d\n", __LINE__, ret);
79
+ 
80
+    // Build the program
81
+    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
82
+    printf("ret at %d is %d\n", __LINE__, ret);
83
+    printf("after building\n");
84
+
85
+    // Create the OpenCL kernel
86
+    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
87
+    printf("ret at %d is %d\n", __LINE__, ret);
88
+ 
89
+    // Set the arguments of the kernel
90
+    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
91
+    printf("ret at %d is %d\n", __LINE__, ret);
92
+
93
+    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
94
+    printf("ret at %d is %d\n", __LINE__, ret);
95
+
96
+    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
97
+    printf("ret at %d is %d\n", __LINE__, ret);
98
+
99
+    //added this to fix garbage output problem
100
+    //ret = clSetKernelArg(kernel, 3, sizeof(int), &LIST_SIZE);
101
+ 
102
+    printf("before execution\n");
103
+
104
+    // Execute the OpenCL kernel on the list
105
+    size_t global_item_size = LIST_SIZE; // Process the entire lists
106
+    size_t local_item_size = 12; // Divide work items into groups of 12
107
+    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
108
+    printf("after execution\n");
109
+
110
+    // Read the memory buffer C on the device to the local variable C
111
+    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
112
+    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
113
+    printf("after copying\n");
114
+    // Display the result to the screen
115
+    for(i = 0; i < 8; i++)
116
+        printf("%d + %d = %d\n", A[i], B[i], C[i]);
117
+ 
118
+    // Clean up
119
+    ret = clFlush(command_queue);
120
+    ret = clFinish(command_queue);
121
+    ret = clReleaseKernel(kernel);
122
+    ret = clReleaseProgram(program);
123
+    ret = clReleaseMemObject(a_mem_obj);
124
+    ret = clReleaseMemObject(b_mem_obj);
125
+    ret = clReleaseMemObject(c_mem_obj);
126
+    ret = clReleaseCommandQueue(command_queue);
127
+    ret = clReleaseContext(context);
128
+    free(A);
129
+    free(B);
130
+    free(C);
131
+    return 0;
132
+}

+ 8
- 0
vecadd3.cl Vedi File

@@ -0,0 +1,8 @@
1
/*
 * Element-wise integer addition: each work-item writes the sum of one
 * element of A and the matching element of B into C.
 */
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
    // Position of this work-item along dimension 0 selects its element
    const int gid = get_global_id(0);

    // Store the sum for that element
    C[gid] = A[gid] + B[gid];
}

+ 253
- 0
vecadd4.c Vedi File

@@ -0,0 +1,253 @@
1
+#include <stdio.h>
2
+#include <stdlib.h>
3
+ 
4
+#ifdef __APPLE__
5
+#include <OpenCL/opencl.h>
6
+#else
7
+#include <CL/cl.h>
8
+#endif
9
+ 
10
+#define MAX_SOURCE_SIZE (0x100000)
11
+
12
+const char *clErrorString(cl_int error)
13
+{
14
+switch(error){
15
+    // run-time and JIT compiler errors
16
+    case 0: return "CL_SUCCESS";
17
+    case -1: return "CL_DEVICE_NOT_FOUND";
18
+    case -2: return "CL_DEVICE_NOT_AVAILABLE";
19
+    case -3: return "CL_COMPILER_NOT_AVAILABLE";
20
+    case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
21
+    case -5: return "CL_OUT_OF_RESOURCES";
22
+    case -6: return "CL_OUT_OF_HOST_MEMORY";
23
+    case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
24
+    case -8: return "CL_MEM_COPY_OVERLAP";
25
+    case -9: return "CL_IMAGE_FORMAT_MISMATCH";
26
+    case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
27
+    case -11: return "CL_BUILD_PROGRAM_FAILURE";
28
+    case -12: return "CL_MAP_FAILURE";
29
+    case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
30
+    case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
31
+    case -15: return "CL_COMPILE_PROGRAM_FAILURE";
32
+    case -16: return "CL_LINKER_NOT_AVAILABLE";
33
+    case -17: return "CL_LINK_PROGRAM_FAILURE";
34
+    case -18: return "CL_DEVICE_PARTITION_FAILED";
35
+    case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
36
+
37
+    // compile-time errors
38
+    case -30: return "CL_INVALID_VALUE";
39
+    case -31: return "CL_INVALID_DEVICE_TYPE";
40
+    case -32: return "CL_INVALID_PLATFORM";
41
+    case -33: return "CL_INVALID_DEVICE";
42
+    case -34: return "CL_INVALID_CONTEXT";
43
+    case -35: return "CL_INVALID_QUEUE_PROPERTIES";
44
+    case -36: return "CL_INVALID_COMMAND_QUEUE";
45
+    case -37: return "CL_INVALID_HOST_PTR";
46
+    case -38: return "CL_INVALID_MEM_OBJECT";
47
+    case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
48
+    case -40: return "CL_INVALID_IMAGE_SIZE";
49
+    case -41: return "CL_INVALID_SAMPLER";
50
+    case -42: return "CL_INVALID_BINARY";
51
+    case -43: return "CL_INVALID_BUILD_OPTIONS";
52
+    case -44: return "CL_INVALID_PROGRAM";
53
+    case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
54
+    case -46: return "CL_INVALID_KERNEL_NAME";
55
+    case -47: return "CL_INVALID_KERNEL_DEFINITION";
56
+    case -48: return "CL_INVALID_KERNEL";
57
+    case -49: return "CL_INVALID_ARG_INDEX";
58
+    case -50: return "CL_INVALID_ARG_VALUE";
59
+    case -51: return "CL_INVALID_ARG_SIZE";
60
+    case -52: return "CL_INVALID_KERNEL_ARGS";
61
+    case -53: return "CL_INVALID_WORK_DIMENSION";
62
+    case -54: return "CL_INVALID_WORK_GROUP_SIZE";
63
+    case -55: return "CL_INVALID_WORK_ITEM_SIZE";
64
+    case -56: return "CL_INVALID_GLOBAL_OFFSET";
65
+    case -57: return "CL_INVALID_EVENT_WAIT_LIST";
66
+    case -58: return "CL_INVALID_EVENT";
67
+    case -59: return "CL_INVALID_OPERATION";
68
+    case -60: return "CL_INVALID_GL_OBJECT";
69
+    case -61: return "CL_INVALID_BUFFER_SIZE";
70
+    case -62: return "CL_INVALID_MIP_LEVEL";
71
+    case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
72
+    case -64: return "CL_INVALID_PROPERTY";
73
+    case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
74
+    case -66: return "CL_INVALID_COMPILER_OPTIONS";
75
+    case -67: return "CL_INVALID_LINKER_OPTIONS";
76
+    case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
77
+
78
+    // extension errors
79
+    case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
80
+    case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
81
+    case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
82
+    case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
83
+    case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
84
+    case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
85
+    default: return "Unknown OpenCL error";
86
+    }
87
+} 
88
+
89
+void debug(int ret) {
90
+  if (ret != CL_SUCCESS) {
91
+    printf(clErrorString(ret));
92
+    printf("\n");
93
+  }
94
+} 
95
+
96
+int main(void) {
97
+    printf("start\n");
98
+
99
+    // Create the two input vectors
100
+    int i;
101
+    const int LIST_SIZE = 1024;
102
+    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
103
+    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
104
+    for(i = 0; i < LIST_SIZE; i++) {
105
+        A[i] = i;
106
+        B[i] = LIST_SIZE - i;
107
+    }
108
+ 
109
+    // Load the kernel source code into the array source_str
110
+    FILE *fp;
111
+    char *source_str;
112
+    size_t source_size;
113
+ 
114
+    fp = fopen("vecadd4.cl", "r");
115
+    if (!fp) {
116
+        fprintf(stderr, "Failed to load kernel.\n");
117
+        exit(1);
118
+    }
119
+    source_str = (char*)malloc(MAX_SOURCE_SIZE);
120
+    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
121
+    fclose( fp );
122
+
123
+    // Get platform and device information
124
+    cl_device_id device_id = NULL;   
125
+    cl_uint ret_num_devices;
126
+    cl_uint ret_num_platforms;
127
+    
128
+	
129
+    cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms);
130
+    debug(ret);
131
+    cl_platform_id *platforms = NULL;
132
+    platforms = (cl_platform_id*)malloc(ret_num_platforms*sizeof(cl_platform_id));
133
+    ret = clGetPlatformIDs(ret_num_platforms, platforms, NULL);
134
+    printf("ret at clGetPlatformIDs (%d) is %d\n", __LINE__, ret);
135
+    debug(ret);
136
+    ret = clGetDeviceIDs( platforms[0], CL_DEVICE_TYPE_ALL, 1, &device_id, &ret_num_devices);
137
+    printf("ret at clGetDeviceIDs (%d) is %d\n", __LINE__, ret);
138
+    debug(ret);
139
+  
140
+    // Create an OpenCL context
141
+    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
142
+    printf("ret at clCreateContext (%d) is %d\n", __LINE__, ret);
143
+    debug(ret);
144
+
145
+    // Create a command queue
146
+    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
147
+    printf("ret at clCreateCommandQueue (%d) is %d\n", __LINE__, ret);
148
+    debug(ret);
149
+
150
+    // Create memory buffers on the device for each vector 
151
+    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret);
152
+    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret);
153
+    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(int), NULL, &ret);
154
+ 
155
+    // Copy the lists A and B to their respective memory buffers
156
+    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
157
+    printf("ret at clEnqueueWriteBuffer (%d) is %d\n", __LINE__, ret);
158
+    debug(ret);
159
+
160
+    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
161
+    printf("ret at clEnqueueWriteBuffer (%d) is %d\n", __LINE__, ret);
162
+    debug(ret);
163
+ 
164
+    // Create a program from the kernel source
165
+    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
166
+    printf("ret at clCreateProgramWithSource (%d) is %d\n", __LINE__, ret);
167
+    debug(ret);
168
+ 
169
+    // Build the program
170
+    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
171
+    printf("ret at clBuildProgram (%d) is %d\n", __LINE__, ret);
172
+    if (ret != CL_SUCCESS) {
173
+      printf(clErrorString(ret));
174
+      printf("\n");
175
+      cl_build_status status;
176
+      char * log;
177
+      size_t log_size;
178
+      clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
179
+      log = (char*)malloc(log_size+1);
180
+      clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
181
+      log[log_size-1]=0;  
182
+      printf(log);
183
+      free(log);
184
+    }
185
+
186
+
187
+    // Create the OpenCL kernel
188
+    // __kernel void sha256_crypt_kernel(__global uint *data_info,__global char *plain_key,  __global uint *digest) {
189
+    // cl_kernel kernel = clCreateKernel(program, "sha256_crypt_kernel", &ret);
190
+    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
191
+    printf("ret at clCreateKernel (%d) is %d\n", __LINE__, ret);
192
+    debug(ret);
193
+ 
194
+    // Set the arguments of the kernel
195
+    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
196
+    printf("ret at clSetKernelArg (%d) is %d\n", __LINE__, ret);
197
+    debug(ret);
198
+
199
+    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
200
+    printf("ret at clSetKernelArg (%d) is %d\n", __LINE__, ret);
201
+    debug(ret);
202
+
203
+    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
204
+    printf("ret at clSetKernelArg (%d) is %d\n", __LINE__, ret);
205
+    debug(ret);
206
+
207
+    //added this to fix garbage output problem
208
+    //ret = clSetKernelArg(kernel, 3, sizeof(int), &LIST_SIZE);
209
+ 
210
+    // Execute the OpenCL kernel on the list
211
+    size_t global_item_size = LIST_SIZE; // Process the entire lists
212
+    size_t local_item_size = 8; // Divide work items into groups of 8 (12 ideally, but 1024 isn't) 
213
+    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
214
+    printf("ret at clEnqueueNDRangeKernel (%d) is %d\n", __LINE__, ret);
215
+    if (ret!=CL_SUCCESS) {
216
+        printf(clErrorString(ret));
217
+        printf("\n");
218
+    }
219
+    // Read the memory buffer C on the device to the local variable C
220
+    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
221
+    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
222
+    printf("ret at clEnqueueReadBuffer (%d) is %d\n", __LINE__, ret);
223
+    debug(ret);
224
+
225
+    // Display the result to the screen
226
+    for(i = 0; i < 8; i++)
227
+        printf("%d + %d = %d\n", A[i], B[i], C[i]);
228
+ 
229
+    // Clean up
230
+    ret = clFlush(command_queue);
231
+    debug(ret);
232
+    ret = clFinish(command_queue);
233
+    debug(ret);
234
+    ret = clReleaseKernel(kernel);
235
+    debug(ret);
236
+    ret = clReleaseProgram(program);
237
+    debug(ret);
238
+    ret = clReleaseMemObject(a_mem_obj);
239
+    debug(ret);
240
+    ret = clReleaseMemObject(b_mem_obj);
241
+    debug(ret);
242
+    ret = clReleaseMemObject(c_mem_obj);
243
+    debug(ret);
244
+    ret = clReleaseCommandQueue(command_queue);
245
+    debug(ret);
246
+    ret = clReleaseContext(context);
247
+    debug(ret);
248
+
249
+    free(A);
250
+    free(B);
251
+    free(C);
252
+    return 0;
253
+}

+ 24
- 0
vecadd4.cl Vedi File

@@ -0,0 +1,24 @@
1
/*
 * Element-wise unsigned addition: each work-item stores A[i] + B[i] in C[i].
 * The rest of the body only touches private variables and is there, per the
 * original author's note, to try to break the OpenCL compiler.
 */
__kernel void vector_add(__global const uint *A, __global const uint *B, __global uint *C) {

    // Position of this work-item along dimension 0 selects its element
    int gid = get_global_id(0);

    // The only value written back to a kernel argument
    C[gid] = (A[gid] + B[gid]);

    // Now break the compiler: mixed-width assignments between uchar, short
    // and uint; nothing below is stored to A, B or C
    uint scratch[4];
    uchar byte_val = 0x01;
    short short_val = 0x0101;
    byte_val = 0x23;
    short_val = byte_val;
    for(int k=0;k<4;k++) {
      scratch[k]=byte_val;
    }
    for(int k=0;k<4;k++) {
      scratch[k]=short_val;
      if(k>0) {
        // Rewrites the previous slot from the one just stored
        scratch[k-1]=scratch[k] + byte_val + short_val;
      }
    }
}