// ZHEGALKIN_2.0

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <fstream>

#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>


using namespace std;
// Problem size. Each vector occupies L consecutive rows of an L-column table:
// the host seeds the first row of every group and the kernel fills the
// remaining L-1 rows.
#define vectors 100000      // number of independent vectors processed
#define L 8                 // columns per row, and rows per vector
// Total number of rows. Parenthesized so that expressions such as `x % N`
// or `x / N` expand with the intended precedence.
#define N (L * vectors)

// Fills in, for every vector, the rows that follow its seed row.
//
// The table A is laid out as `vectors` groups of L consecutive rows. Row
// L*v of group v is expected to be filled in by the caller; this kernel
// writes rows L*v+1 .. L*v+L-1, each one being the XOR of horizontally
// adjacent elements of the row directly above it:
//     A[r][c] = A[r-1][c] ^ A[r-1][c+1]   for c in [0, L-2]
// Column L-1 of the generated rows is never written and keeps whatever the
// caller stored there.
// NOTE(review): going by the file title this is presumably the triangle
// method for Zhegalkin polynomial coefficients - confirm with the author.
//
// Launch layout: 1-D grid of 1-D blocks, no shared memory. Any grid size is
// valid: each thread walks the vectors with a stride equal to the total
// number of launched threads, so a grid smaller than `vectors` still
// processes every vector. Groups are disjoint, so threads never touch each
// other's rows and no synchronization is required.
//
// A must point to device memory holding N rows of L longs.
__global__ void test(long A[N][L])
{
    // size_t arithmetic avoids narrowing the unsigned block/thread product to int.
    const size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t v = (size_t)blockIdx.x * blockDim.x + threadIdx.x; v < vectors; v += stride) {
        // First row to generate: the one right after this vector's seed row.
        const size_t first = L * v + 1;

        for (size_t row = first; row < first + L - 1; row++) {
            for (int col = 0; col < L - 1; col++) {
                A[row][col] = A[row - 1][col] ^ A[row - 1][col + 1];
            }
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Seeds the first row of every L-row group, runs the `test` kernel on the
// device, prints the kernel time in microseconds and writes the last group
// of the result table to D:\out.txt.
//
// Returns 0 on success and 1 if any CUDA call fails (the process exit then
// releases whatever device resources were still held).
int main()
{
    // Static storage: the table is far too large for the stack, and objects
    // with static storage duration start out zero-initialized, so no explicit
    // clearing pass is needed before seeding.
    static long A[N][L];
    long(*d_A)[L] = NULL; // device copy of A (pointer to rows of L longs)

    // Size of the whole table in bytes. Derived from the array itself so it
    // always matches the element type (the elements are long, not float).
    const size_t bytes = sizeof(A);

    cudaEvent_t start, stop;
    if (!cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")) return 1;
    if (!cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) return 1;

    ofstream myfileout;
    myfileout.open("D:\\out.txt");
    if (!myfileout.is_open()) {
        // Best effort: the computation still runs, only the dump is lost.
        cerr << "Warning: could not open D:\\out.txt for writing" << endl;
    }

    // Opened (and therefore created/truncated) as before; it is only written
    // by the disabled input dump below.
    ofstream myfileinput;
    myfileinput.open("D:\\in.txt");

    // Seed row of every group: bits 0, 2, 4 and 7 set, everything else zero.
    for (int i = 0; i < N; i += L) {
        A[i][0] = 1;
        A[i][2] = 1;
        A[i][4] = 1;
        A[i][7] = 1;
    }
    /*
    for (int i = 0; i < N; i++) {
        if (i % 8 == 0) {
            myfileinput << endl;
        }
        myfileinput << endl;
        for (int j = 0; j < L; j++) {
            myfileinput << A[i][j];

        }
    }
    */

    // Allocation
    if (!cudaOk(cudaMalloc((void**)&d_A, bytes), "cudaMalloc")) return 1;

    // Copying from host to device
    if (!cudaOk(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice),
                "cudaMemcpy (host to device)")) return 1;

    // One thread per vector, rounded up to whole blocks.
    const int threadsPerBlock = 256;
    const int blocks = (vectors + threadsPerBlock - 1) / threadsPerBlock;

    // Start record
    if (!cudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)")) return 1;

    // Kernel invocation
    test<<<blocks, threadsPerBlock>>>(d_A);
    // A launch does not return a status; bad configurations surface here.
    if (!cudaOk(cudaGetLastError(), "kernel launch")) return 1;

    if (!cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")) return 1;
    // Waiting on the stop event also surfaces faults raised inside the kernel.
    if (!cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize")) return 1;

    float elapsedTime = 0.0f;
    if (!cudaOk(cudaEventElapsedTime(&elapsedTime, start, stop),
                "cudaEventElapsedTime")) return 1; // milliseconds

    // Copying from device to host
    if (!cudaOk(cudaMemcpy(A, d_A, bytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy (device to host)")) return 1;

    // Milliseconds -> microseconds.
    cout  << "GPU Time [mks] " << elapsedTime*1000 << endl;

    // Dump the last group of L rows, one row per line, with an extra blank
    // line in front of the start of a group.
    for (int i = N-L; i < N; i++) {
        if (i % L == 0) {
            myfileout << endl;
        }
        myfileout << endl;
        for (int j = 0; j < L; j++) {
            myfileout << A[i][j];
        }
    }

    myfileout.close();
    myfileinput.close();

    // Release device resources; failures here are reported but not fatal.
    cudaOk(cudaFree(d_A), "cudaFree");
    cudaOk(cudaEventDestroy(start), "cudaEventDestroy(start)");
    cudaOk(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    return 0;
}