zpragma

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

#define SCALAR double

#define NGAUSS 6
#define NSHAP 6
#define NGEO 6
#define NDOFS 6

#define TEST

#define zero 0.0
#define one 1.0
#define two 2.0
#define half 0.5
#define one_fourth 0.25
#define one_sixth (0.166666666666666667)

#define weight_linear_prism (one_sixth)

omp_lock_t lock;

int pdr_num_int_el_QSS_prism(
    SCALAR *gauss_dat_host, // integration points data of elements having given p
    SCALAR *shape_fun_host, // shape functions on a reference element
    SCALAR *el_data_in,     // data for integration of NR_ELEMS_THIS_KERCALL elements
    SCALAR *el_data_out,    // result of integration of NR_ELEMS_THIS_KERCALL elements
    const int nr_elem_mic,
    const int size_el_out,
    const int size_el_in,
    const int size_shp,
    const int geo_dat_size,
    const int nr_coeff,
    const int one_el_stiff_mat_size,
    const int one_el_load_vec_size);

int main()
{

    omp_init_lock(&lock);
    srand(time(NULL));
    omp_set_num_threads(2);
    FILE *input;

#ifdef TEST
    FILE *output;
#endif

    input = fopen("input_TEST_PRISM_NORMAL.txt", "r");

    int Nr_elems_mic, size_in, size_out, size_shap, geo_dat_size, nr_coeff, one_el_stiff_mat_size, one_el_load_vec_size, i;

    fscanf(input, "%d", &Nr_elems_mic);
    printf("Nr_elems_mic=%d\n", Nr_elems_mic);
    fscanf(input, "%d", &size_in);
    printf("Size_in=%d\n", size_in);
    fscanf(input, "%d", &size_out);
    printf("Size_out=%d\n", size_out);
    fscanf(input, "%d", &size_shap);
    printf("Size_shap=%d\n", size_shap);
    fscanf(input, "%d", &geo_dat_size);
    printf("geo_dat_size=%d\n", geo_dat_size);
    fscanf(input, "%d", &nr_coeff);
    printf("nr_coeff=%d\n", nr_coeff);
    fscanf(input, "%d", &one_el_stiff_mat_size);
    printf("one_el_stiff_mat_size=%d\n", one_el_stiff_mat_size);
    fscanf(input, "%d", &one_el_load_vec_size);
    printf("one_el_load_vec_size=%d\n", one_el_load_vec_size);
    printf("\n\n\n\n");

    SCALAR *gauss_dat_host = (SCALAR *)malloc(1344 * sizeof(SCALAR));
    SCALAR *shape_fun_host;
    SCALAR *el_data_in;
    SCALAR *el_data_out;
#ifdef TEST
    SCALAR *testing;
#endif
    shape_fun_host = (SCALAR *)malloc(size_shap * sizeof(SCALAR));
    el_data_in = (SCALAR *)malloc(size_in * sizeof(SCALAR));
    el_data_out = (SCALAR *)malloc(size_out * sizeof(SCALAR));

#ifdef TEST
    testing = (SCALAR *)malloc(size_out * sizeof(SCALAR));
#endif

    for (i = 0; i < 1344; i++)
        fscanf(input, "%lf", &gauss_dat_host[i]);
    printf("Wczytano punkty Gaussa\n");

    for (i = 0; i < size_shap; i++)
        fscanf(input, "%lf", &shape_fun_host[i]);
    printf("Wczytano funkcje ksztaltu\n");

    for (i = 0; i < size_in; i++)
        fscanf(input, "%lf", &el_data_in[i]);
    printf("Wczytano parametry poczatkowe\n");

    fclose(input);
#ifdef TEST

    output = fopen("output_TEST_PRISM_NORMAL.txt", "r");

    printf("Otwarcie outputu\n");

    for (i = 0; i < size_out; i++)
        fscanf(output, "%lf ", &testing[i]);
    fclose(output);

#endif

    printf("Start!\n");
    double start = omp_get_wtime();
    pdr_num_int_el_QSS_prism(gauss_dat_host, shape_fun_host,
                             el_data_in, el_data_out, Nr_elems_mic,
                             size_out, size_in, size_shap, geo_dat_size, nr_coeff, one_el_stiff_mat_size, one_el_load_vec_size);
    printf("Time: %f\n", omp_get_wtime() - start);
    printf("Koniec!\n");

#ifdef TEST

    SCALAR delta = 0.00001;
    for (i = 0; i < size_out; i++)
    {
        if (fabs(testing[i] - el_data_out[i]) > delta)
        {
            printf("Error in %d element->test=%.10lf != out=%.10lf\n", i, testing[i], el_data_out[i]);
            exit(0);
        }
    }
    printf("Test OK!\n");
#endif
}

int pdr_num_int_el_QSS_prism(
    SCALAR *gauss_dat_host, // integration points data of elements having given p
    SCALAR *shape_fun_host, // shape functions on a reference element
    SCALAR *el_data_in,     // data for integration of NR_ELEMS_THIS_KERCALL elements
    SCALAR *el_data_out,    // result of integration of NR_ELEMS_THIS_KERCALL elements
    const int nr_elem_mic,
    const int size_el_out,
    const int size_el_in,
    const int size_shp,
    const int geo_dat_size,
    const int nr_coeff,
    const int one_el_stiff_mat_size,
    const int one_el_load_vec_size)
{

    int ielem;
    register int offset;

    #pragma omp parallel for private(ielem, offset)
    for (ielem = 0; ielem < nr_elem_mic; ielem++)
    {

        SCALAR tab_fun_u_derx[NSHAP];
        SCALAR tab_fun_u_dery[NSHAP];
        SCALAR tab_fun_u_derz[NSHAP];

        SCALAR stiff_mat[NDOFS * NDOFS];
        SCALAR load_vec[NDOFS];

        int i;
        //-------------------------------------------------------------
        // ******************* READING INPUT DATA *********************

        offset = nr_elem_mic * geo_dat_size + ielem * nr_coeff;

        register SCALAR coeff00 = el_data_in[offset + 0];
        register SCALAR coeff01 = el_data_in[offset + 1];
        register SCALAR coeff02 = el_data_in[offset + 2];
        register SCALAR coeff10 = el_data_in[offset + 3];
        register SCALAR coeff11 = el_data_in[offset + 4];
        register SCALAR coeff12 = el_data_in[offset + 5];
        register SCALAR coeff20 = el_data_in[offset + 6];
        register SCALAR coeff21 = el_data_in[offset + 7];
        register SCALAR coeff22 = el_data_in[offset + 8];
        register SCALAR coeff30 = el_data_in[offset + 9];
        register SCALAR coeff31 = el_data_in[offset + 10];
        register SCALAR coeff32 = el_data_in[offset + 11];
        register SCALAR coeff03 = el_data_in[offset + 12];
        register SCALAR coeff13 = el_data_in[offset + 13];
        register SCALAR coeff23 = el_data_in[offset + 14];
        register SCALAR coeff33 = el_data_in[offset + 15];
        register SCALAR coeff04 = el_data_in[offset + 16];
        register SCALAR coeff14 = el_data_in[offset + 17];
        register SCALAR coeff24 = el_data_in[offset + 18];
        register SCALAR coeff34 = el_data_in[offset + 19];

        // ******* THE END OF: READING INPUT DATA *********************
        //-------------------------------------------------------------

        //-------------------------------------------------------------
        //******************** INITIALIZING SM AND LV ******************//

        #pragma omp parallel for private(i) shared(stiff_mat)
        for (i = 0; i < NDOFS * NDOFS; i++)
            stiff_mat[i] = zero;

        #pragma omp parallel for private(i) shared(load_vec)
        for (i = 0; i < NDOFS; i++)
            load_vec[i] = zero;

        //******************** END OF: INITIALIZING SM AND LV ******************//
        //-------------------------------------------------------------

        //-------------------------------------------------------------
        //************************* LOOP OVER INTEGRATION POINTS ************************//

        // in a loop over gauss points
        int igauss;
        int idof, jdof;
        #pragma omp parallel private(igauss) shared(idof, jdof, load_vec, stiff_mat)
        for (igauss = 0; igauss < NGAUSS; igauss++)
        {

            SCALAR daux = gauss_dat_host[4 * igauss];
            SCALAR faux = gauss_dat_host[4 * igauss + 1];
            SCALAR eaux = gauss_dat_host[4 * igauss + 2];

            SCALAR vol = weight_linear_prism; // vol = weight CONSTANT FOR LINEAR PRISM!!!
                                              //-------------------------------------------------------------
                                              //************************* JACOBIAN TERMS CALCULATIONS *************************//

            SCALAR temp1 = zero;
            SCALAR temp2 = zero;
            SCALAR temp3 = zero;
            SCALAR temp4 = zero;
            SCALAR temp5 = zero;
            SCALAR temp6 = zero;
            SCALAR temp7 = zero;
            SCALAR temp8 = zero;
            SCALAR temp9 = zero;

            { // block to indicate the scope of jac_x registers

                SCALAR jac_0 = zero;
                SCALAR jac_1 = zero;
                SCALAR jac_2 = zero;
                SCALAR jac_3 = zero;
                SCALAR jac_4 = zero;
                SCALAR jac_5 = zero;
                SCALAR jac_6 = zero;
                SCALAR jac_7 = zero;
                SCALAR jac_8 = zero;

                { // block to indicate the scope of jac_data

                    SCALAR jac_data[3 * NGEO];

                    jac_data[0] = -(one - eaux) * half;
                    jac_data[1] = (one - eaux) * half;
                    jac_data[2] = zero;
                    jac_data[3] = -(one + eaux) * half;
                    jac_data[4] = (one + eaux) * half;
                    jac_data[5] = zero;
                    jac_data[6] = -(one - eaux) * half;
                    jac_data[7] = zero;
                    jac_data[8] = (one - eaux) * half;
                    jac_data[9] = -(one + eaux) * half;
                    jac_data[10] = zero;
                    jac_data[11] = (one + eaux) * half;
                    jac_data[12] = -(one - daux - faux) * half;
                    jac_data[13] = -daux * half;
                    jac_data[14] = -faux * half;
                    jac_data[15] = (one - daux - faux) * half;
                    jac_data[16] = daux * half;
                    jac_data[17] = faux * half;

                    /* Jacobian matrix J */

                    offset = ielem * geo_dat_size;

                    #pragma omp parallel for private(i) shared(jac_data, el_data_in) reduction(+ \
                                                                           : temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9)
                    for (i = 0; i < NGEO; i++)
                    {

                        jac_1 = jac_data[i];
                        jac_2 = jac_data[NGEO + i];
                        jac_3 = jac_data[2 * NGEO + i];

                        jac_4 = el_data_in[offset + 3 * i]; //node coor
                        jac_5 = el_data_in[offset + 3 * i + 1];
                        jac_6 = el_data_in[offset + 3 * i + 2];

                        //printf("el_data_in_geo_inside[%d]=%lf\n",offset+3*i,el_data_in[offset+3*i]);  //ok

                        temp1 += jac_4 * jac_1;
                        temp2 += jac_4 * jac_2;
                        temp3 += jac_4 * jac_3;
                        temp4 += jac_5 * jac_1;
                        temp5 += jac_5 * jac_2;
                        temp6 += jac_5 * jac_3;
                        temp7 += jac_6 * jac_1;
                        temp8 += jac_6 * jac_2;
                        temp9 += jac_6 * jac_3;
                    }

                } // the end of scope for jac_data

                jac_0 = (temp5 * temp9 - temp8 * temp6);
                jac_1 = (temp8 * temp3 - temp2 * temp9);
                jac_2 = (temp2 * temp6 - temp3 * temp5);

                daux = temp1 * jac_0 + temp4 * jac_1 + temp7 * jac_2;

                /* Jacobian calculations - |J| and inverse of the Jacobian matrix*/
                vol *= daux; // vol = weight * det J

                faux = one / daux;

                jac_0 *= faux;
                jac_1 *= faux;
                jac_2 *= faux;

                jac_3 = (temp6 * temp7 - temp4 * temp9) * faux;
                jac_4 = (temp1 * temp9 - temp7 * temp3) * faux;
                jac_5 = (temp3 * temp4 - temp1 * temp6) * faux;

                jac_6 = (temp4 * temp8 - temp5 * temp7) * faux;
                jac_7 = (temp2 * temp7 - temp1 * temp8) * faux;
                jac_8 = (temp1 * temp5 - temp2 * temp4) * faux;

                //************* THE END OF: JACOBIAN TERMS CALCULATIONS *************************//
                //-------------------------------------------------------------

                //-------------------------------------------------------------
                //***** SEPARATE COMPUTING OF ALL GLOBAL DERIVATIVES OF ALL SHAPE FUNCTIONS *****//

                #pragma omp parallel for private(idof, temp1, temp2, temp3) default(shared)
                for (idof = 0; idof < NSHAP; idof++)
                {

                    // read proper values of shape functions and their derivatives
                    temp1 = shape_fun_host[igauss * NSHAP + idof];
                    temp2 = shape_fun_host[NSHAP * NGAUSS + igauss * NSHAP + idof];
                    temp3 = shape_fun_host[2 * NSHAP * NGAUSS + igauss * NSHAP + idof];

                    tab_fun_u_derx[idof] = temp1 * jac_0 + temp2 * jac_3 + temp3 * jac_6;
                    tab_fun_u_dery[idof] = temp1 * jac_1 + temp2 * jac_4 + temp3 * jac_7;
                    tab_fun_u_derz[idof] = temp1 * jac_2 + temp2 * jac_5 + temp3 * jac_8;

                } // end loop over shape functions for which global derivatives were computed

            } // the end of block to indicate the scope of jac_x registers

            //*** THE END OF: SEPARATE COMPUTING OF ALL GLOBAL DERIVATIVES OF ALL SHAPE FUNCTIONS ***//
            //-------------------------------------------------------------

            //-------------------------------------------------------------
            //***** SUBSTITUTING ACTUAL COEFFICIENTS FOR SM AND LV CALCULATIONS *****//

            //-------------------------------------------------------------
            //********************* first loop over shape functions ***********************//

            //offset = ielem * (one_el_stiff_mat_size + one_el_load_vec_size); //loop invariant code motion
            #pragma omp parallel for private(idof) shared(igauss) reduction(+ \
                                                                : load_vec[:NSHAP])
            for (idof = 0; idof < NSHAP; idof++)
            {
                { // beginning of using registers for u  (shp_fun_u, fun_u_der.)

                    //-------------------------------------------------------------
                    //****** SUBSTITUTING OR COMPUTING GLOBAL DERIVATIVES OF IDOF SHAPE FUNCTION ******//

                    SCALAR shp_fun_u = shape_fun_host[3 * NSHAP * NGAUSS + igauss * NSHAP + idof];
                    SCALAR fun_u_derx = tab_fun_u_derx[idof];
                    SCALAR fun_u_dery = tab_fun_u_dery[idof];
                    SCALAR fun_u_derz = tab_fun_u_derz[idof];

                    //*** THE END OF: SUBSTITUTING OR COMPUTING GLOBAL DERIVATIVES OF IDOF SHAPE FUNCTION ***//
                    //-------------------------------------------------------------

                    //-------------------------------------------------------------
                    //*** ACTUAL INTERMEDIATE CALCULATIONS FOR IDOF SHAPE FUNCTION ***//

                    load_vec[idof] += (

                                          coeff04 * fun_u_derx +
                                          coeff14 * fun_u_dery +
                                          coeff24 * fun_u_derz +
                                          coeff34 * shp_fun_u

                                          ) *
                                      vol;

                    //*** THE END OF: ACTUAL CALCULATIONS FOR LOAD VECTOR (AND IDOF SHAPE FUNCTION) ***//
                    //-------------------------------------------------------------

                } // the end of using registers for u (shp_fun_u, fun_u_der.)

                //-------------------------------------------------------------
                // ************************* second loop over shape functions ****************************//

                for (jdof = 0; jdof < NSHAP; jdof++)
                {
                    //-------------------------------------------------------------
                    //****** SUBSTITUTING OR COMPUTING GLOBAL DERIVATIVES OF JDOF SHAPE FUNCTION ******//

                    // read proper values of shape functions and their derivatives
                    SCALAR shp_fun_v = shape_fun_host[3 * NSHAP * NGAUSS + igauss * NSHAP + jdof];
                    SCALAR fun_v_derx = tab_fun_u_derx[jdof];
                    SCALAR fun_v_dery = tab_fun_u_dery[jdof];
                    SCALAR fun_v_derz = tab_fun_u_derz[jdof];

                    //*** THE END OF: SUBSTITUTING OR COMPUTING GLOBAL DERIVATIVES OF IDOF SHAPE FUNCTION ***//
                    //-------------------------------------------------------------

                    //-------------------------------------------------------------
                    //********* ACTUAL FINAL CALCULATIONS FOR SM ENTRY  *********//

                    stiff_mat[idof * NDOFS + jdof] += (temp4 * fun_v_derx +
                                                       temp5 * fun_v_dery +
                                                       temp6 * fun_v_derz +
                                                       temp7 * shp_fun_v) *
                                                      vol;

                    //*** THE END OF: ACTUAL FINAL CALCULATIONS FOR SM ENTRY  ***//
                    //-------------------------------------------------------------

                } //jdof

            } //idof

        } //gauss

        //      // ******** THE END OF: loop over integration points ********//
        //      //-------------------------------------------------------------

        offset = ielem * (one_el_stiff_mat_size + one_el_load_vec_size);

        #pragma omp parallel for private(i) shared(offset, stiff_mat, el_data_out)
        for (i = 0; i < NDOFS * NDOFS; i++)
            el_data_out[offset + i] = stiff_mat[i];

        #pragma omp parallel for private(i) shared(offset, load_vec, el_data_out)
        for (i = 0; i < NDOFS; i++)
            el_data_out[offset + one_el_stiff_mat_size + i] = load_vec[i];

    } // the end of loop over elements

    return 1;
}