Advertisement
Guest User

Untitled

a guest
Dec 14th, 2018
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.26 KB | None | 0 0
  1. #include <iostream>
  2. #include <intrin.h>
  3. #include<omp.h>
  4. #include<chrono>
  5. #define __CL_ENABLE_EXCEPTIONS
  6.  
  7. #include<CL/cl.hpp>
  8. #include <vector>
  9. #include <fstream>
  10. #include <immintrin.h>
  11. #include <cstdlib>
  12. #include <ctime>
  13.  
  14. using namespace std;
  15. using namespace cl;
  16.  
  17. const int n = 64; // image side length in pixels; every buffer below holds n * n * 3 ints (3 channel values per pixel)
  18. const int cnt = 10; // number of timed repetitions averaged for each benchmark in main()
  19.  
  20. void sum_vec(int* x, int* y, int* res) {
  21. for (int i = 0; i < n * n * 3; i += 8) {
  22. _mm256_storeu_si256((__m256i*)&res[i], _mm256_sub_epi32(_mm256_loadu_si256((__m256i*)&x[i]), _mm256_loadu_si256((__m256i*)&y[i])));
  23. }
  24. }
  25. void omp_parallel(int* A, int* B, int* res) {
  26. #pragma omp parallel for
  27. for (int i = 0; i < n; ++i) {
  28. res[i] = A[i] - B[i];
  29. }
  30. }
  31.  
  32. void oCl(cl::Device device, int*A, int*B, int *result) {
  33. vector<Device> contextDevices;
  34. contextDevices.push_back(device);
  35. Context context(contextDevices);
  36.  
  37. CommandQueue queue(context, device);
  38.  
  39. fill_n(result, n*n * 3, 0);
  40.  
  41. Buffer cl_matrix_A = Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, n*n *3 * sizeof(int), A);
  42. Buffer cl_vector_B = Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, n * n * 3 * sizeof(int), B);
  43. Buffer cl_result_vector = Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, n*n * 3 * sizeof(int), result);
  44.  
  45. cl::Program::Sources sources;
  46. ifstream sourceFile("device.cl");
  47. string sourceCode(istreambuf_iterator<char>(sourceFile), (istreambuf_iterator<char>()));
  48. sources.push_back({ sourceCode.c_str(),sourceCode.length() });
  49. cl::Program program = cl::Program(context, sources);
  50. program.build(contextDevices);
  51. cl::Kernel kernel(program, "matrix_min");
  52.  
  53. int iArg = 0;
  54. kernel.setArg(iArg++, cl_result_vector);
  55. kernel.setArg(iArg++, cl_matrix_A);
  56. kernel.setArg(iArg++, cl_vector_B);
  57.  
  58. queue.enqueueNDRangeKernel(kernel, NullRange, NDRange(n*n*3), NDRange(n));
  59. queue.finish();
  60.  
  61. }
  62.  
  63. void generate(int*A, int*B) {
  64. for (int i = 0; i < n * n * 3; i++) {
  65. A[i] = 0 + (rand() % static_cast<int>(255 - 0 + 1));
  66. B[i] = 0 + (rand() % static_cast<int>(255 - 0 + 1));
  67. }
  68. }
  69.  
  70. void minusM(int *A, int *B, int *result) {
  71. for (int i = 0; i < n * n * 3; i++) {
  72. result[i] = A[i] - B[i];
  73. }
  74. }
  75.  
  76.  
  77. int main() {
  78. srand((unsigned int)time(0));
  79. int *mA = new int[n*n * 3];
  80. int *mB = new int[n*n * 3];
  81. int *mRes = new int[n*n * 3];
  82.  
  83. generate(mA, mB);
  84. vector<cl::Platform> platforms;
  85. cl::Platform::get(&platforms);
  86. vector<cl::Device> devices;
  87. chrono::time_point<chrono::system_clock> start, end;
  88. double all_time = 0;
  89. double all_err = 0;
  90.  
  91. /*for (unsigned int iPlatform = 0; iPlatform < platforms.size(); iPlatform++) {
  92. platforms[iPlatform].getDevices(CL_DEVICE_TYPE_ALL, &devices);
  93. for (unsigned int iDevice = 0; iDevice < devices.size(); iDevice++) {
  94. try {
  95. std::cout << devices[iDevice].getInfo<CL_DEVICE_NAME>() << iDevice << " " << iPlatform << " : " << devices[iDevice].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << std::endl;
  96. }
  97. catch (cl::Error error) {
  98. std::cout << error.what() << "(" << error.err() << ")" << std::endl;
  99. }
  100. }
  101. }*/
  102.  
  103. platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
  104.  
  105.  
  106.  
  107. //Platform 0 Device 0
  108. for (int i = 0; i < cnt; i++) {
  109. start = chrono::system_clock::now();
  110. try {
  111. oCl(devices[0], mA, mB, mRes);
  112. }
  113. catch (cl::Error error) {
  114. cout << error.what() << "(" << error.err() << ")" << endl;
  115. }
  116. end = chrono::system_clock::now();
  117. all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
  118. }
  119.  
  120. cout << "GPU: " << all_time / (double)cnt << " milliseconds" << endl;
  121.  
  122. //Device end
  123. /*all_time = 0;
  124. all_err = 0;
  125. //Vector start
  126. for (int i = 0; i < cnt; i++) {
  127. start = chrono::system_clock::now();
  128. sum_vec(mA, mB, mRes);
  129. end = chrono::system_clock::now();
  130. all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
  131. }
  132. cout << "Vectorization: " << all_time / (double)cnt << " microseconds" << endl;
  133. //Vector end*/
  134.  
  135. all_time = 0;
  136. all_err = 0;
  137. //Parallel start
  138. for (int i = 0; i < cnt; i++) {
  139. start = chrono::system_clock::now();
  140. omp_parallel(mA, mB, mRes);
  141. end = chrono::system_clock::now();
  142. all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
  143. }
  144. cout << "Parallel: " << all_time / (double)cnt << " microseconds" << endl;
  145. //Parallel end
  146.  
  147. all_time = 0;
  148. all_err = 0;
  149. //Simple start
  150. for (int i = 0; i < cnt; i++) {
  151. start = chrono::system_clock::now();
  152. minusM(mA, mB, mRes);
  153. end = chrono::system_clock::now();
  154. all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
  155. }
  156. cout << "Simple: " << all_time / (double)cnt << " microseconds" << endl;
  157. //Simple end
  158.  
  159.  
  160. //BMP
  161.  
  162. FILE *f;
  163. unsigned char *img = NULL;
  164. int filesize = 54 + 3 * n*n, r, g, b, x, y;
  165.  
  166. img = (unsigned char *)malloc(3 * n*n);
  167. memset(img, 0, 3 * n*n);
  168.  
  169. for (int i = 0; i < n * 3; i+=3)
  170. {
  171. for (int j = 0; j < n * 3; j+=3)
  172. {
  173. x = i/3; y = ((n * 3 - 1) - j)/3;
  174. r = mRes[i*n + j];
  175. g = mRes[i*n + j + 1];
  176. b = mRes[i*n + j + 2];
  177. if (r > 255) r = 255;
  178. if (g > 255) g = 255;
  179. if (b > 255) b = 255;
  180. img[(x + y * n) * 3 + 2] = (unsigned char)(r);
  181. img[(x + y * n) * 3 + 1] = (unsigned char)(g);
  182. img[(x + y * n) * 3 + 0] = (unsigned char)(b);
  183. }
  184. }
  185.  
  186. unsigned char bmpfileheader[14] = { 'B','M', 0,0,0,0, 0,0, 0,0, 54,0,0,0 };
  187. unsigned char bmpinfoheader[40] = { 40,0,0,0, 0,0,0,0, 0,0,0,0, 1,0, 24,0 };
  188. unsigned char bmppad[3] = { 0,0,0 };
  189.  
  190. bmpfileheader[2] = (unsigned char)(filesize);
  191. bmpfileheader[3] = (unsigned char)(filesize >> 8);
  192. bmpfileheader[4] = (unsigned char)(filesize >> 16);
  193. bmpfileheader[5] = (unsigned char)(filesize >> 24);
  194.  
  195. bmpinfoheader[4] = (unsigned char)(n);
  196. bmpinfoheader[5] = (unsigned char)(n >> 8);
  197. bmpinfoheader[6] = (unsigned char)(n >> 16);
  198. bmpinfoheader[7] = (unsigned char)(n >> 24);
  199. bmpinfoheader[8] = (unsigned char)(n);
  200. bmpinfoheader[9] = (unsigned char)(n >> 8);
  201. bmpinfoheader[10] = (unsigned char)(n >> 16);
  202. bmpinfoheader[11] = (unsigned char)(n >> 24);
  203.  
  204. f = fopen("img.bmp", "wb");
  205. fwrite(bmpfileheader, 1, 14, f);
  206. fwrite(bmpinfoheader, 1, 40, f);
  207. for (int i = 0; i < n; i++)
  208. {
  209. fwrite(img + (n*(n - i - 1) * 3), 3, n, f);
  210. fwrite(bmppad, 1, (4 - (n * 3) % 4) % 4, f);
  211. }
  212.  
  213. free(img);
  214. fclose(f);
  215.  
  216. //BMP END
  217.  
  218. // cor(a, b, c,A,B,RES);
  219. delete[] mA, mB, mRes;
  220.  
  221.  
  222. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement