// Untitled

// Reports a failed CUDA runtime call on std::cerr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, char const* what)
{
    if (status == cudaSuccess)
        return true;

    std::cerr << "SmithWatermanGPU: " << what << " failed: "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

/*
 * Host-side setup for a tiled Smith-Waterman run on the GPU.
 *
 * Steps performed:
 *   1. Copies both input strings and derives a tile shape from B.
 *   2. Pads the copies (via padding(), defined elsewhere) to whole tiles.
 *   3. Allocates the unified-memory work buffers and device-visible copies
 *      of the padded strings.
 *   4. Initialises the semaphore array with the initsemaphor kernel.
 *   5. Prints "<blockNum> <blockSize>" on std::cout and frees every buffer.
 * The threadSolver launch that would do the actual alignment is still
 * disabled, so no score is produced yet.
 *
 * Parameters:
 *   s1, s2  sequences to align; both must be non-empty.
 *   d, e    only referenced by the disabled threadSolver launch
 *           (presumably gap penalties -- confirm against the kernel).
 *   B       target number of matrix cells per tile; must be positive.
 *           The tile edges are chosen so that blockSize_m * blockSize_n is
 *           roughly B while keeping the m : n aspect ratio of the matrix.
 *
 * Errors (invalid arguments or a failing CUDA call) are reported on std::cerr;
 * the function then releases whatever was allocated and returns.
 */
void SmithWatermanGPU(std::string const& s1, std::string const& s2, double const d, double const e, double const B)
{
    // d and e are only consumed by the disabled threadSolver launch below.
    (void)d;
    (void)e;

    // Empty input or a non-positive (or NaN) B would make the tile maths
    // divide by zero, so reject them before doing any work.
    if (s1.empty() || s2.empty() || !(B > 0.0))
    {
        std::cerr << "SmithWatermanGPU: both sequences must be non-empty and B must be positive"
                  << std::endl;
        return;
    }

    //DATA PREPARATION

    // Inputs are const, and padding() modifies its arguments, so work on copies.
    std::string string_m(s1);
    std::string string_n(s2);

    long int m = static_cast<long int>(string_m.length());
    long int n = static_cast<long int>(string_n.length());

    // Aspect ratio in floating point: integer m/n would truncate, and would be
    // 0 whenever m < n.
    double const ratio = static_cast<double>(m) / static_cast<double>(n);
    double const k = sqrt(B / ratio);

    // Tile edges; clamped to at least one cell so the block counts below
    // never divide by zero.
    long int blockSize_n = static_cast<long int>(floor(k));
    if (blockSize_n < 1)
        blockSize_n = 1;
    long int blockSize_m = static_cast<long int>(floor(ratio * k));
    if (blockSize_m < 1)
        blockSize_m = 1;
    long int const blockSize = blockSize_n * blockSize_m;

    // Number of tiles needed in each direction (integer ceiling division).
    long int const blockNum_n = (n + blockSize_n - 1) / blockSize_n;
    long int const blockNum_m = (m + blockSize_m - 1) / blockSize_m;
    long int const blockNum = blockNum_m * blockNum_n;

    // Pad both strings so that their lengths are whole multiples of the tile edges.
    padding(string_m, string_n, blockNum_m * blockSize_m, blockNum_n * blockSize_n);

    // The strings have been padded, so their lengths are measured again.
    m = static_cast<long int>(string_m.length());
    n = static_cast<long int>(string_n.length());

    // One extra row and column on top of the padded lengths.
    size_t const cells = static_cast<size_t>(m + 1) * static_cast<size_t>(n + 1);

    // Every pointer starts out null so the cleanup at the end can free all of
    // them unconditionally (cudaFree on a null pointer is a no-op).
    float *memory = 0;
    float *Gi = 0, *Gd = 0, *F = 0, *E = 0;   // allocated, but not used by any enabled code yet
    char *M = 0;                              // allocated, but not used by any enabled code yet
    char *x1 = 0;                             // device-visible copy of string_m
    char *x2 = 0;                             // device-visible copy of string_n
    int *semaphore = 0;

    // initsemaphor is defined elsewhere and is launched with initThreads threads
    // and a count of blockSize. Size the array for the larger of the two.
    // NOTE(review): confirm against the kernel how many ints it actually writes.
    int const initThreads = 64;
    long int const semaphoreCount = blockSize > initThreads ? blockSize : initThreads;

    bool ok = true;
    ok = ok && cudaSucceeded(cudaMallocManaged(&memory, cells * sizeof(float)), "cudaMallocManaged(memory)");
    ok = ok && cudaSucceeded(cudaMallocManaged(&M, cells * sizeof(char)), "cudaMallocManaged(M)");
    ok = ok && cudaSucceeded(cudaMallocManaged(&Gi, cells * sizeof(float)), "cudaMallocManaged(Gi)");
    ok = ok && cudaSucceeded(cudaMallocManaged(&Gd, cells * sizeof(float)), "cudaMallocManaged(Gd)");
    ok = ok && cudaSucceeded(cudaMallocManaged(&F, cells * sizeof(float)), "cudaMallocManaged(F)");
    ok = ok && cudaSucceeded(cudaMallocManaged(&E, cells * sizeof(float)), "cudaMallocManaged(E)");
    // length + 1 bytes: the characters plus the terminating '\0'.
    ok = ok && cudaSucceeded(cudaMallocManaged(&x1, (static_cast<size_t>(m) + 1) * sizeof(char)), "cudaMallocManaged(x1)");
    ok = ok && cudaSucceeded(cudaMallocManaged(&x2, (static_cast<size_t>(n) + 1) * sizeof(char)), "cudaMallocManaged(x2)");
    ok = ok && cudaSucceeded(cudaMallocManaged(&semaphore, static_cast<size_t>(semaphoreCount) * sizeof(int)), "cudaMallocManaged(semaphore)");

    if (ok)
    {
        // Copy the padded strings into managed memory and terminate them.
        string_m.copy(x1, static_cast<size_t>(m));
        x1[m] = '\0';
        string_n.copy(x2, static_cast<size_t>(n));
        x2[n] = '\0';

        initsemaphor<<<1, initThreads>>>(semaphore, blockSize);
        // A launch reports configuration errors through cudaGetLastError() and
        // execution errors at the next synchronising call.
        ok = cudaSucceeded(cudaGetLastError(), "initsemaphor launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "initsemaphor execution");
    }

    //CALCULATION

    if (ok)
    {
        std::cout<<blockNum<<" "<<blockSize<<std::endl;

        // Solver launch is still disabled:
        //threadSolver<<<1, 64>>>(memory,0,0,n,d,e,5,x1,x2,semaphore);
        //threadSolver(float *memory,long int subM,long int subN, long int const n, float const d, float const e, long int const b_size, char *s1, char *s2, int *semaphore);
    }

    // Release everything, including the string copies and the semaphore array.
    cudaSucceeded(cudaFree(memory), "cudaFree(memory)");
    cudaSucceeded(cudaFree(M), "cudaFree(M)");
    cudaSucceeded(cudaFree(Gi), "cudaFree(Gi)");
    cudaSucceeded(cudaFree(Gd), "cudaFree(Gd)");
    cudaSucceeded(cudaFree(F), "cudaFree(F)");
    cudaSucceeded(cudaFree(E), "cudaFree(E)");
    cudaSucceeded(cudaFree(x1), "cudaFree(x1)");
    cudaSucceeded(cudaFree(x2), "cudaFree(x2)");
    cudaSucceeded(cudaFree(semaphore), "cudaFree(semaphore)");

    return;
}