// Untitled

#include <iostream>
#include <intrin.h>
#include<omp.h>
#include<chrono>
#define __CL_ENABLE_EXCEPTIONS

#include<CL/cl.hpp>
#include <vector>
#include <fstream>
#include <immintrin.h>
#include <cstdlib>
#include <ctime>

using namespace std;
using namespace cl;

// Side length of the square data set: every buffer allocated in main() holds
// n * n * 3 ints. Also used as the BMP width/height and as the OpenCL local
// work-group size.
const int n = 64;
// Number of timed repetitions that are averaged for each benchmark in main().
const int cnt = 10;

/**
 * Element-wise difference over n * n * 3 ints using 256-bit integer SIMD.
 *
 * NOTE: despite the name, this computes res[i] = x[i] - y[i]
 * (_mm256_sub_epi32), matching the other implementations in this file.
 *
 * @param x    minuend, at least n * n * 3 ints (no alignment required)
 * @param y    subtrahend, at least n * n * 3 ints (no alignment required)
 * @param res  output, at least n * n * 3 ints (no alignment required)
 */
void sum_vec(int* x, int* y, int* res) {
	const int total = n * n * 3;
	// Largest multiple of 8 not exceeding total: one __m256i holds 8 ints.
	const int vec_end = total - total % 8;

	int i = 0;
	for (; i < vec_end; i += 8) {
		const __m256i lhs = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x + i));
		const __m256i rhs = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(y + i));
		_mm256_storeu_si256(reinterpret_cast<__m256i*>(res + i), _mm256_sub_epi32(lhs, rhs));
	}

	// Scalar tail so that an element count that is not a multiple of 8 never
	// reads or writes past the end of the buffers.
	for (; i < total; ++i) {
		res[i] = x[i] - y[i];
	}
}
/**
 * Element-wise difference res[i] = A[i] - B[i] over n * n * 3 ints, with the
 * iterations distributed across threads by OpenMP.
 *
 * @param A    minuend, at least n * n * 3 ints
 * @param B    subtrahend, at least n * n * 3 ints
 * @param res  output, at least n * n * 3 ints
 */
void omp_parallel(int* A, int* B, int* res) {
	// Cover the whole buffer (n * n * 3), like the scalar and SIMD versions,
	// rather than only the first n elements.
	const int total = n * n * 3;
#pragma omp parallel for
	for (int i = 0; i < total; ++i) {
		res[i] = A[i] - B[i];
	}
}

/**
 * Runs the "matrix_min" kernel from the file device.cl on `device` over
 * n * n * 3 ints and copies the device result back into `result`.
 *
 * A fresh context, queue and program are created (and the kernel source is
 * re-read and rebuilt) on every call.
 *
 * @param device  OpenCL device to execute on
 * @param A       first input, n * n * 3 ints (copied to the device)
 * @param B       second input, n * n * 3 ints (copied to the device)
 * @param result  output, n * n * 3 ints; zeroed first, then overwritten with
 *                the contents of the device result buffer
 * @throws cl::Error on any OpenCL failure, or if device.cl cannot be opened
 *
 * NOTE(review): the kernel is assumed to take (result, A, B) in that order;
 * device.cl is not visible here - confirm against the kernel source.
 */
void oCl(cl::Device device, int*A, int*B, int *result) {
	const int count = n * n * 3;
	const auto bytes = count * sizeof(int);

	// std:: / cl:: are spelled out because both namespaces are pulled in at
	// file scope and the OpenCL wrapper declares some names that mirror std.
	std::vector<cl::Device> contextDevices(1, device);
	cl::Context context(contextDevices);
	cl::CommandQueue queue(context, device);

	std::fill_n(result, count, 0);

	cl::Buffer cl_matrix_A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, A);
	cl::Buffer cl_vector_B(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, B);
	cl::Buffer cl_result_vector(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, bytes, result);

	std::ifstream sourceFile("device.cl");
	if (!sourceFile) {
		// Report a missing kernel file explicitly, using the exception type
		// callers already handle, instead of building an empty program.
		throw cl::Error(CL_INVALID_VALUE, "cannot open device.cl");
	}
	std::string sourceCode((std::istreambuf_iterator<char>(sourceFile)), std::istreambuf_iterator<char>());

	cl::Program::Sources sources;
	sources.push_back({ sourceCode.c_str(), sourceCode.length() });
	cl::Program program(context, sources);
	program.build(contextDevices);
	cl::Kernel kernel(program, "matrix_min");

	kernel.setArg(0, cl_result_vector);
	kernel.setArg(1, cl_matrix_A);
	kernel.setArg(2, cl_vector_B);

	// One work-item per element, in work-groups of n items.
	queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(count), cl::NDRange(n));

	// Blocking read: waits for the kernel on this queue and copies the device
	// result into the caller's buffer. Without it `result` would keep the
	// zeros written above.
	queue.enqueueReadBuffer(cl_result_vector, CL_TRUE, 0, bytes, result);
}

// Fills the first n * n * 3 entries of A and B with rand() values reduced to
// the range [0, 255]. Values are drawn alternately: A[k] first, then B[k].
void generate(int*A, int*B) {
	const int total = n * n * 3;
	const int range = 256;  // 255 - 0 + 1 distinct values
	int idx = 0;
	while (idx < total) {
		A[idx] = rand() % range;
		B[idx] = rand() % range;
		++idx;
	}
}

// Scalar reference implementation: result[k] = A[k] - B[k] for the first
// n * n * 3 elements.
void minusM(int *A, int *B, int *result) {
	const int *lhs = A;
	const int *rhs = B;
	int *out = result;
	int *const stop = result + n * n * 3;
	while (out != stop) {
		*out++ = *lhs++ - *rhs++;
	}
}


int main() {
	srand((unsigned int)time(0));
	int *mA = new int[n*n * 3];
	int *mB = new int[n*n * 3];
	int *mRes = new int[n*n * 3];

	generate(mA, mB);
	vector<cl::Platform> platforms;
	cl::Platform::get(&platforms);
	vector<cl::Device> devices;
	chrono::time_point<chrono::system_clock> start, end;
	double all_time = 0;
	double all_err = 0;

	/*for (unsigned int iPlatform = 0; iPlatform < platforms.size(); iPlatform++) {
		platforms[iPlatform].getDevices(CL_DEVICE_TYPE_ALL, &devices);
		for (unsigned int iDevice = 0; iDevice < devices.size(); iDevice++) {
			try {
				std::cout << devices[iDevice].getInfo<CL_DEVICE_NAME>() << iDevice << " " << iPlatform << " : " << devices[iDevice].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << std::endl;
			}
			catch (cl::Error error) {
				std::cout << error.what() << "(" << error.err() << ")" << std::endl;
			}
		}
	}*/

	platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);


	//Platform 0 Device 0
	for (int i = 0; i < cnt; i++) {
		start = chrono::system_clock::now();
		try {
			oCl(devices[0], mA, mB, mRes);
		}
		catch (cl::Error error) {
			cout << error.what() << "(" << error.err() << ")" << endl;
		}
		end = chrono::system_clock::now();
		all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
	}

	cout << "GPU: " << all_time / (double)cnt << " milliseconds" << endl;

	//Device end
	/*all_time = 0;
	all_err = 0;
	//Vector start
	for (int i = 0; i < cnt; i++) {
		start = chrono::system_clock::now();
		sum_vec(mA, mB, mRes);
		end = chrono::system_clock::now();
		all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
	}
	cout << "Vectorization: " << all_time / (double)cnt << " microseconds" << endl;
	//Vector end*/

	all_time = 0;
	all_err = 0;
	//Parallel start
	for (int i = 0; i < cnt; i++) {
		start = chrono::system_clock::now();
		omp_parallel(mA, mB, mRes);
		end = chrono::system_clock::now();
		all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
	}
	cout << "Parallel: " << all_time / (double)cnt << " microseconds" << endl;
	//Parallel end

	all_time = 0;
	all_err = 0;
	//Simple start
	for (int i = 0; i < cnt; i++) {
		start = chrono::system_clock::now();
		minusM(mA, mB, mRes);
		end = chrono::system_clock::now();
		all_time += chrono::duration_cast<chrono::microseconds>(end - start).count();
	}
	cout << "Simple: " << all_time / (double)cnt << " microseconds" << endl;
	//Simple end


	//BMP

	FILE *f;
	unsigned char *img = NULL;
	int filesize = 54 + 3 * n*n, r, g, b, x, y;

	img = (unsigned char *)malloc(3 * n*n);
	memset(img, 0, 3 * n*n);

	for (int i = 0; i < n * 3; i+=3)
	{
		for (int j = 0; j < n * 3; j+=3)
		{
			x = i/3; y = ((n * 3 - 1) - j)/3;
			r = mRes[i*n + j];
			g = mRes[i*n + j + 1];
			b = mRes[i*n + j + 2];
			if (r > 255) r = 255;
			if (g > 255) g = 255;
			if (b > 255) b = 255;
			img[(x + y * n) * 3 + 2] = (unsigned char)(r);
			img[(x + y * n) * 3 + 1] = (unsigned char)(g);
			img[(x + y * n) * 3 + 0] = (unsigned char)(b);
		}
	}

	unsigned char bmpfileheader[14] = { 'B','M', 0,0,0,0, 0,0, 0,0, 54,0,0,0 };
	unsigned char bmpinfoheader[40] = { 40,0,0,0, 0,0,0,0, 0,0,0,0, 1,0, 24,0 };
	unsigned char bmppad[3] = { 0,0,0 };

	bmpfileheader[2] = (unsigned char)(filesize);
	bmpfileheader[3] = (unsigned char)(filesize >> 8);
	bmpfileheader[4] = (unsigned char)(filesize >> 16);
	bmpfileheader[5] = (unsigned char)(filesize >> 24);

	bmpinfoheader[4] = (unsigned char)(n);
	bmpinfoheader[5] = (unsigned char)(n >> 8);
	bmpinfoheader[6] = (unsigned char)(n >> 16);
	bmpinfoheader[7] = (unsigned char)(n >> 24);
	bmpinfoheader[8] = (unsigned char)(n);
	bmpinfoheader[9] = (unsigned char)(n >> 8);
	bmpinfoheader[10] = (unsigned char)(n >> 16);
	bmpinfoheader[11] = (unsigned char)(n >> 24);

	f = fopen("img.bmp", "wb");
	fwrite(bmpfileheader, 1, 14, f);
	fwrite(bmpinfoheader, 1, 40, f);
	for (int i = 0; i < n; i++)
	{
		fwrite(img + (n*(n - i - 1) * 3), 3, n, f);
		fwrite(bmppad, 1, (4 - (n * 3) % 4) % 4, f);
	}

	free(img);
	fclose(f);

	//BMP END

	// cor(a, b, c,A,B,RES);
	delete[] mA, mB, mRes;


}