Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <intrin.h>
- #include<omp.h>
- #include<chrono>
- #define __CL_ENABLE_EXCEPTIONS
- #include<CL/cl.hpp>
- #include <vector>
- #include <fstream>
- #include <immintrin.h>
- #include <cstdlib>
- #include <ctime>
- using namespace std;
- using namespace cl;
- const int n = 64;
- const int cnt = 10;
- void sum_vec(int* x, int* y, int* res) {
- for (int i = 0; i < n * n * 3; i += 8) {
- _mm256_storeu_si256((__m256i*)&res[i], _mm256_sub_epi32(_mm256_loadu_si256((__m256i*)&x[i]), _mm256_loadu_si256((__m256i*)&y[i])));
- }
- }
- void omp_parallel(int* A, int* B, int* res) {
- #pragma omp parallel for
- for (int i = 0; i < n; ++i) {
- res[i] = A[i] - B[i];
- }
- }
- void oCl(cl::Device device, int*A, int*B, int *result) {
- vector<Device> contextDevices;
- contextDevices.push_back(device);
- Context context(contextDevices);
- CommandQueue queue(context, device);
- fill_n(result, n*n * 3, 0);
- Buffer cl_matrix_A = Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, n*n *3 * sizeof(int), A);
- Buffer cl_vector_B = Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, n * n * 3 * sizeof(int), B);
- Buffer cl_result_vector = Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, n*n * 3 * sizeof(int), result);
- cl::Program::Sources sources;
- ifstream sourceFile("device.cl");
- string sourceCode(istreambuf_iterator<char>(sourceFile), (istreambuf_iterator<char>()));
- sources.push_back({ sourceCode.c_str(),sourceCode.length() });
- cl::Program program = cl::Program(context, sources);
- program.build(contextDevices);
- cl::Kernel kernel(program, "matrix_min");
- int iArg = 0;
- kernel.setArg(iArg++, cl_result_vector);
- kernel.setArg(iArg++, cl_matrix_A);
- kernel.setArg(iArg++, cl_vector_B);
- queue.enqueueNDRangeKernel(kernel, NullRange, NDRange(n*n*3), NDRange(n));
- queue.finish();
- }
- void generate(int*A, int*B) {
- for (int i = 0; i < n * n * 3; i++) {
- A[i] = 0 + (rand() % static_cast<int>(255 - 0 + 1));
- B[i] = 0 + (rand() % static_cast<int>(255 - 0 + 1));
- }
- }
- void minusM(int *A, int *B, int *result) {
- for (int i = 0; i < n * n * 3; i++) {
- result[i] = A[i] - B[i];
- }
- }
- int main() {
- srand((unsigned int)time(0));
- int *mA = new int[n*n * 3];
- int *mB = new int[n*n * 3];
- int *mRes = new int[n*n * 3];
- generate(mA, mB);
- vector<cl::Platform> platforms;
- cl::Platform::get(&platforms);
- vector<cl::Device> devices;
- chrono::time_point<chrono::system_clock> start, end;
- double all_time = 0;
- double all_err = 0;
- /*for (unsigned int iPlatform = 0; iPlatform < platforms.size(); iPlatform++) {
- platforms[iPlatform].getDevices(CL_DEVICE_TYPE_ALL, &devices);
- for (unsigned int iDevice = 0; iDevice < devices.size(); iDevice++) {
- try {
- std::cout << devices[iDevice].getInfo<CL_DEVICE_NAME>() << iDevice << " " << iPlatform << " : " << devices[iDevice].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << std::endl;
- }
- catch (cl::Error error) {
- std::cout << error.what() << "(" << error.err() << ")" << std::endl;
- }
- }
- }*/
- platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
- //Platform 0 Device 0
- for (int i = 0; i < cnt; i++) {
- start = chrono::system_clock::now();
- try {
- oCl(devices[0], mA, mB, mRes);
- }
- catch (cl::Error error) {
- cout << error.what() << "(" << error.err() << ")" << endl;
- }
- end = chrono::system_clock::now();
- all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
- }
- cout << "GPU: " << all_time / (double)cnt << " milliseconds" << endl;
- //Device end
- /*all_time = 0;
- all_err = 0;
- //Vector start
- for (int i = 0; i < cnt; i++) {
- start = chrono::system_clock::now();
- sum_vec(mA, mB, mRes);
- end = chrono::system_clock::now();
- all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
- }
- cout << "Vectorization: " << all_time / (double)cnt << " microseconds" << endl;
- //Vector end*/
- all_time = 0;
- all_err = 0;
- //Parallel start
- for (int i = 0; i < cnt; i++) {
- start = chrono::system_clock::now();
- omp_parallel(mA, mB, mRes);
- end = chrono::system_clock::now();
- all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
- }
- cout << "Parallel: " << all_time / (double)cnt << " microseconds" << endl;
- //Parallel end
- all_time = 0;
- all_err = 0;
- //Simple start
- for (int i = 0; i < cnt; i++) {
- start = chrono::system_clock::now();
- minusM(mA, mB, mRes);
- end = chrono::system_clock::now();
- all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
- }
- cout << "Simple: " << all_time / (double)cnt << " microseconds" << endl;
- //Simple end
- //BMP
- FILE *f;
- unsigned char *img = NULL;
- int filesize = 54 + 3 * n*n, r, g, b, x, y;
- img = (unsigned char *)malloc(3 * n*n);
- memset(img, 0, 3 * n*n);
- for (int i = 0; i < n * 3; i+=3)
- {
- for (int j = 0; j < n * 3; j+=3)
- {
- x = i/3; y = ((n * 3 - 1) - j)/3;
- r = mRes[i*n + j];
- g = mRes[i*n + j + 1];
- b = mRes[i*n + j + 2];
- if (r > 255) r = 255;
- if (g > 255) g = 255;
- if (b > 255) b = 255;
- img[(x + y * n) * 3 + 2] = (unsigned char)(r);
- img[(x + y * n) * 3 + 1] = (unsigned char)(g);
- img[(x + y * n) * 3 + 0] = (unsigned char)(b);
- }
- }
- unsigned char bmpfileheader[14] = { 'B','M', 0,0,0,0, 0,0, 0,0, 54,0,0,0 };
- unsigned char bmpinfoheader[40] = { 40,0,0,0, 0,0,0,0, 0,0,0,0, 1,0, 24,0 };
- unsigned char bmppad[3] = { 0,0,0 };
- bmpfileheader[2] = (unsigned char)(filesize);
- bmpfileheader[3] = (unsigned char)(filesize >> 8);
- bmpfileheader[4] = (unsigned char)(filesize >> 16);
- bmpfileheader[5] = (unsigned char)(filesize >> 24);
- bmpinfoheader[4] = (unsigned char)(n);
- bmpinfoheader[5] = (unsigned char)(n >> 8);
- bmpinfoheader[6] = (unsigned char)(n >> 16);
- bmpinfoheader[7] = (unsigned char)(n >> 24);
- bmpinfoheader[8] = (unsigned char)(n);
- bmpinfoheader[9] = (unsigned char)(n >> 8);
- bmpinfoheader[10] = (unsigned char)(n >> 16);
- bmpinfoheader[11] = (unsigned char)(n >> 24);
- f = fopen("img.bmp", "wb");
- fwrite(bmpfileheader, 1, 14, f);
- fwrite(bmpinfoheader, 1, 40, f);
- for (int i = 0; i < n; i++)
- {
- fwrite(img + (n*(n - i - 1) * 3), 3, n, f);
- fwrite(bmppad, 1, (4 - (n * 3) % 4) % 4, f);
- }
- free(img);
- fclose(f);
- //BMP END
- // cor(a, b, c,A,B,RES);
- delete[] mA, mB, mRes;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement