// Untitled

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Side length of the square matrices used in this file: main() declares its
// host arrays as ARRAY_SIZE x ARRAY_SIZE and launches the kernel with
// ARRAY_SIZE threads in a single block.
const int ARRAY_SIZE = 2;
// Build: nvcc -o partone partone.cu

/*
 * matMult: fills d_out with transpose(d_in) * d_in for square
 * ARRAY_SIZE x ARRAY_SIZE matrices, i.e.
 *
 *     d_out[r][c] = sum over k of d_in[k][r] * d_in[k][c]
 *
 * Parameters
 *   d_out  table of ARRAY_SIZE row pointers; d_out[r] must point to
 *          ARRAY_SIZE writable doubles. Both the table and the rows are
 *          dereferenced on the device, so both must be device-accessible.
 *   d_in   table of ARRAY_SIZE row pointers; d_in[r] must point to
 *          ARRAY_SIZE readable doubles, with the same accessibility
 *          requirement as d_out.
 *
 * Launch layout
 *   Thread threadIdx.x produces output row threadIdx.x. Threads with
 *   threadIdx.x >= ARRAY_SIZE return without touching memory. blockIdx is
 *   not consulted, so a single block of at least ARRAY_SIZE threads is the
 *   intended configuration.
 */
__global__ void matMult(double** d_out, double** d_in){
    const int row = threadIdx.x;

    // Guard against launches with more threads than matrix rows.
    if (row >= ARRAY_SIZE) {
        return;
    }

    double* outRow = d_out[row];
    for (int col = 0; col < ARRAY_SIZE; col++){
        // Accumulate in a local so global memory is written once per element.
        double sum = 0.0;
        for (int k = 0; k < ARRAY_SIZE; k++){
            sum += d_in[k][row] * d_in[k][col];
        }
        outRow[col] = sum;
    }
}

/*
 * Terminates the program with a diagnostic on stderr when a CUDA runtime
 * call did not return cudaSuccess.
 *
 *   status  return value of the CUDA call being checked
 *   what    short description of the call, included in the message
 */
static void checkCuda(cudaError_t status, const char* what){
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: builds an ARRAY_SIZE x ARRAY_SIZE input matrix, uploads it,
 * runs matMult on one block of ARRAY_SIZE threads, downloads the output
 * matrix and prints its first 2 x 2 entries.
 *
 * matMult takes double** arguments and indexes them as table[row][col], so
 * each matrix is represented on the device by two allocations:
 *   - a flat buffer holding the ARRAY_SIZE * ARRAY_SIZE doubles, and
 *   - a table of ARRAY_SIZE row pointers into that buffer.
 * Passing a flat buffer of doubles as double** would make the kernel
 * interpret matrix data as addresses.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main (int argc, char** argv) {
    (void) argc;
    (void) argv;

    const size_t ARRAY_BYTES = (size_t) ARRAY_SIZE * ARRAY_SIZE * sizeof(double);
    const size_t TABLE_BYTES = (size_t) ARRAY_SIZE * sizeof(double*);

    double h_in[ARRAY_SIZE][ARRAY_SIZE];
    double h_out[ARRAY_SIZE][ARRAY_SIZE];

    // Deterministic input: h_in[i][j] = i + j.
    // (Reference kept from the original for a random fill in a [min, max] range:
    //  https://ubuntuforums.org/showthread.php?t=1717717&p=10618266#post10618266)
    for (int i = 0; i < ARRAY_SIZE; i++){
        for(int j = 0; j < ARRAY_SIZE; j++){
            h_in[i][j] = i + j;
        }
    }

    // Flat device storage for the matrix elements.
    double* d_in_data = NULL;
    double* d_out_data = NULL;
    // Device-resident row-pointer tables handed to the kernel.
    double** d_in = NULL;
    double** d_out = NULL;

    checkCuda(cudaMalloc((void**) &d_in_data, ARRAY_BYTES), "cudaMalloc(d_in_data)");
    checkCuda(cudaMalloc((void**) &d_out_data, ARRAY_BYTES), "cudaMalloc(d_out_data)");
    checkCuda(cudaMalloc((void**) &d_in, TABLE_BYTES), "cudaMalloc(d_in)");
    checkCuda(cudaMalloc((void**) &d_out, TABLE_BYTES), "cudaMalloc(d_out)");

    // Build the row-pointer tables on the host. The entries are device
    // addresses: they are only computed here, never dereferenced on the host.
    double* h_in_rows[ARRAY_SIZE];
    double* h_out_rows[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++){
        h_in_rows[i] = d_in_data + (size_t) i * ARRAY_SIZE;
        h_out_rows[i] = d_out_data + (size_t) i * ARRAY_SIZE;
    }

    // Transfer the pointer tables and the input matrix to the GPU.
    // Argument order: dest, source, bytes, direction.
    checkCuda(cudaMemcpy(d_in, h_in_rows, TABLE_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_in table)");
    checkCuda(cudaMemcpy(d_out, h_out_rows, TABLE_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_out table)");
    checkCuda(cudaMemcpy(d_in_data, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(h_in -> device)");

    // Zero the output so any element the kernel leaves untouched reads back
    // as 0.0 instead of uninitialized memory (all-zero bytes are 0.0).
    checkCuda(cudaMemset(d_out_data, 0, ARRAY_BYTES), "cudaMemset(d_out_data)");

    // Launch on 1 block of ARRAY_SIZE threads.
    matMult <<< 1, ARRAY_SIZE >>> (d_out, d_in);
    // A launch returns no status: configuration errors are reported by
    // cudaGetLastError(), faults during execution by the next synchronization.
    checkCuda(cudaGetLastError(), "matMult launch");
    checkCuda(cudaDeviceSynchronize(), "matMult execution");

    // Copy the element buffer back, not the pointer table.
    checkCuda(cudaMemcpy(h_out, d_out_data, ARRAY_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(device -> h_out)");

    printf("result (0,0): %lf    ", h_out[0][0]);
    printf("result (0,1): %lf\n", h_out[0][1]);
    printf("result (1,0): %lf    ", h_out[1][0]);
    printf("result (1,1): %lf\n", h_out[1][1]);

    checkCuda(cudaFree(d_in), "cudaFree(d_in)");
    checkCuda(cudaFree(d_out), "cudaFree(d_out)");
    checkCuda(cudaFree(d_in_data), "cudaFree(d_in_data)");
    checkCuda(cudaFree(d_out_data), "cudaFree(d_out_data)");

    return 0;
}