Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "stdafx.h"
- #include <amp.h>
- #include <iostream>
- #include <ctime>
- #include <omp.h>
- #include <iomanip>
- #define N 2048
- #define BLOCK 16
- using namespace concurrency;
- std::vector <std::vector <float> > cA(N, std::vector<float>(N));
- std::vector <std::vector <float> > cB(N, std::vector<float>(N)); //Не работает нихуя на AMP
- std::vector <std::vector <float> > cC(N, std::vector<float>(N));
- std::vector <float> A(N * N);
- std::vector <float> B(N * N); //А вот это заебись
- std::vector <float> C(N * N);
- void createMatrix()
- {
- srand(time(0));
- int index = 0;
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++, index++)
- {
- cA[i][j] = rand() % 100 + 1;
- cB[i][j] = rand() % 100 + 1;
- A[index] = cA[i][j];
- B[index] = cB[i][j];
- }
- }
- }
- void Math_CPU()
- {
- float sum;
- for (int l = 0; l < N; l++)
- {
- for (int i = 0; i < N; i++)
- {
- sum = 0;
- for (int j = 0; j < N; j++)
- {
- sum += cA[l][j] * cB[j][i];
- }
- cC[i][l] = sum;
- }
- }
- }
- void Math_OMP()
- {
- float sum;
- #pragma omp parallel for reduction(+:sum)
- for (int l = 0; l < N; l++)
- {
- for (int i = 0; i < N; i++)
- {
- sum = 0;
- for (int j = 0; j < N; j++)
- {
- sum += cA[l][j] * cB[j][i];
- }
- cC[i][l] = sum;
- }
- }
- }
- void Math_AMP()
- {
- array_view<float, 2> Matrix_A(N, N, A), Matrix_B(N, N, B), Matrix_C(N, N, C);
- Matrix_C.discard_data();
- parallel_for_each(
- Matrix_C.extent, [=](index<2> idx) restrict(amp, cpu)
- {
- float sum = 0;
- for (int j = 0; j < N; j++)
- {
- sum += Matrix_A(idx[0], j) * Matrix_B(j, idx[1]);
- }
- Matrix_C[idx] = sum;
- });
- Matrix_C.synchronize();
- }
- void Math_AMP_2()
- {
- array_view<float, 2> Matrix_A(N, N, A), Matrix_B(N, N, B), Matrix_C(N, N, C);
- Matrix_C.discard_data();
- parallel_for_each(
- Matrix_C.extent.tile <BLOCK, BLOCK>(), [=](tiled_index <BLOCK, BLOCK> idx) restrict(amp)
- {
- float sum = 0;
- for (int j = 0; j < N; j += BLOCK)
- {
- tile_static int localA[BLOCK][BLOCK], localB[BLOCK][BLOCK];
- localA[idx.local[0]][idx.local[1]] = Matrix_A(idx.global[0], idx.local[1] + j);
- localB[idx.local[0]][idx.local[1]] = Matrix_B(idx.local[0] + j, idx.global[1]);
- idx.barrier.wait();
- for (int t = 0; t <BLOCK; ++t)
- {
- sum += localA[idx.local[0]][t] * localB[t][idx.local[1]];
- }
- idx.barrier.wait();
- }
- Matrix_C[idx.global] = sum;
- });
- }
- int main()
- {
- createMatrix();
- accelerator chosen_one;
- std::wcout << chosen_one.description << std::endl << std::endl;
- std::cout << "Matrix size: " << N << "*" << N << std::endl;
- int a = rand() % N; // положение элемента матрицы, для теста.
- int b = rand() % N;
- /*
- //1 thread
- unsigned int start_time_CPU = clock();
- Math_CPU();
- unsigned int end_time_CPU = clock();
- unsigned int search_time_CPU = end_time_CPU - start_time_CPU;
- std::cout << "CPU 1 thread " << "time=" << search_time_CPU / 1000.0 << std::endl;
- */
- //OpenMP
- unsigned int start_time_MP = clock();
- Math_OMP();
- unsigned int end_time_MP = clock();
- unsigned int search_time_MP = end_time_MP - start_time_MP;
- std::cout << cC[a][b] << " CPU_OpenMP " << "time=" << search_time_MP / 1000.0 << std::endl;
- //AMP
- unsigned int start_time_AMP = clock();
- Math_AMP();
- unsigned int end_time_AMP = clock();
- unsigned int search_time_AMP = end_time_AMP - start_time_AMP;
- std::cout << C[a + b * N] << " GPU_AMP_ALL " << "time=" << search_time_AMP / 1000.0 << " boost=" << (float)search_time_MP / search_time_AMP << "x" << std::endl;
- //AMP v2
- unsigned int start_time_AMP_2 = clock();
- Math_AMP_2();
- unsigned int end_time_AMP_2 = clock();
- unsigned int search_time_AMP_2 = end_time_AMP_2 - start_time_AMP_2;
- std::cout << C[a + b * N] << " GPU_AMP_block " << "time=" << search_time_AMP_2 / 1000.0 << " boost=" << (float)search_time_MP / search_time_AMP_2 << "x" << std::endl;
- std::cout << std::endl;
- system("pause");
- return 0;
- }
- /*
- AMD Radeon (TM) R9 390 Series
- Matrix size: 2048*2048
- 5.20176e+06 CPU_MAX time=13.226
- 5.20176e+06 GPU_AMP_ALL time=0.063 boost=209.937x
- 5.20176e+06 GPU_AMP_block time=0.055 boost=240.473x
- */
Advertisement
Add Comment
Please, Sign In to add comment