Untitled

#include "is.h"
#include "math.h"
#include <vector>
#include <stdio.h>
#include <iostream>
#include "vector.h"
#include <tuple>
#include <algorithm>
#include "stopwatch.h"
#include <immintrin.h>

inline int static divup(int a, int b) {
    return (a + b - 1)/b;
}
inline int static roundup(int a, int b) {
    return divup(a, b) * b;
}
inline float hmax8(float8_t vv) {
    float v = 0;
    for (int i = 0; i < 8; ++i) {
        v = std::max(vv[i], v);
    }
    return v;
}

Result segment(int ny, int nx, const float* data) {
    // ppc::stopwatch sw;
    // sw.record();
    int vs = 8;
    int nxp = roundup(nx + vs, vs);
    int nyp = ny + 1;
    std::vector<float> s(nxp*nyp);


    for (int y = 0; y < nyp; y++) {
        for (int x = 0; x < nxp; x++){
            float elem = NAN;
            if (x < nx + 1 && y < ny +1) {
                if (x == 0 || y == 0) {
                    elem = 0;
                } else {
                    float add = data[(x-1)*3 + (y-1)*3*nx];
                    elem = s[(x-1) + y*nxp] + s[x + (y-1)*nxp] - s[(x-1) + (y-1)*nxp] + add;
                }
            }
            s[x + y*nxp] = elem;
        }
    }

    // sw.record();

    // std::cout<<"\ndata:\n";
    // for (int y = 0; y < ny; y++) {
    //     for (int x = 0; x < nx; x++) {
    //             std::cout<<data[x*3 + y*3*nx] << " ";
    //     }
    //     std::cout<<"\n";
    // }

    // std::cout<<"\nprocessed:\n";
    // for (int y = 0; y < nyp; y++) {
    //     for (int x = 0; x < nxp; x++) {
    //             std::cout<<s[x + y*nxp] << " ";
    //     }
    //     std::cout<<"\n";
    // }

    float8_t hval = float8_0;
    int height = 0;
    int width = 0;

    float vp = s[nx + nxp*ny];
    float vp2 = vp * vp;

    #pragma omp parallel
    {
        float8_t tprev = float8_0;
        float8_t th = float8_0;
        int bh = 0;
        int bw = 0;
        #pragma omp for nowait schedule(dynamic, 1)
        for (int h = 1; h < ny + 1; h++) {
            for (int w = 1; w < nx + 1; w++) {
                int ax = w * h;
                int ay = ny*nx - ax;

                if (ax != ny*nx) {

                    float iax = 1.0/ax;
                    float iay = 1.0/ay;
                    //some equation solving happened here :D
                    float a = vp2 * iay;
                    float b = -2*vp*iay;
                    float c = (ay + ax)*iax*iay;

                    int yub = ny - h + 2;
                    int xub = nx - w + 2;
                    for (int y0 = 1; y0 < yub; y0++) {
                        for (int x0 = 1; x0 < xub; x0+=vs) {
                            int y1 = y0 - 1 + h;
                            int x1 = x0 - 1 + w;

                            float8_t x1y1 = _mm256_loadu_ps(s.data() + x1 + y1*nxp);
                            float8_t x1y0 = _mm256_loadu_ps(s.data() + x1 + (y0-1)*nxp);
                            float8_t x0y1 = _mm256_loadu_ps(s.data() + (x0-1) + y1*nxp);
                            float8_t x0y0 = _mm256_loadu_ps(s.data() + (x0-1) + (y0-1)*nxp);

                            float8_t vx =  x1y1 - x0y1 - x1y0 + x0y0;
                            float8_t hv = a + vx*(b + c*vx);

                            th = (hv > th) ? hv : th;
                        }
                    }

                    float8_t d = _mm256_cmp_ps(th, tprev, _CMP_GT_OQ);
                    if (_mm256_movemask_ps(d)) {
                        float hmax = hmax8(th);             //horizontal max
                        tprev = _mm256_set1_ps(hmax);        //vector of hmax
                        bh = h;                             //best height to current
                        bw = w;                             //best width to current
                    }
                }
            }
        }
        #pragma omp critical
        {
            float8_t d = _mm256_cmp_ps(tprev, hval, _CMP_GT_OQ);
            if (_mm256_movemask_ps(d)) {
                hval = tprev;
                height = bh;
                width = bw;
            }
        }
    }

    // sw.record();

    Result best;
    float xval = 0;
    int ax = width * height;
    int ay = ny*nx - ax;
    float iax = 1.0/ax;
    float iay = 1.0/ay;
    int yub = ny - height + 2;
    int xub = nx - width + 2;
    for (int y0 = 1; y0 < yub; y0++) {
        for (int x0 = 1; x0 < xub; x0++) {

            int y1 = y0 - 1 + height;
            int x1 = x0 - 1 + width;

            float x1y1 = s[x1 + y1*nxp];
            float x1y0 = s[x1 + (y0-1)*nxp];
            float x0y1 = s[(x0-1) + y1*nxp];
            float x0y0 = s[(x0-1) + (y0-1)*nxp];


            float vx =  x1y1 - x0y1 - x1y0 + x0y0;
            float vy = vp - vx;
            float xc = vx * iax;
            float yc = vy * iay;

            vx = xc * vx;
            vy = yc * vy;

            float h = vx + vy;

            // std::cout<<"indices: (y1, x1, y0, x0): " << y1 << ", "<< x1 << ", "<<y0 << ", "<< x0 << "\n ";
            // std::cout<<"h:" <<h;
            if (h > xval) {
                xval = h;
                best.y0 = y0 - 1;
                best.x0 = x0 - 1;
                best.y1 = y1;
                best.x1 = x1;
                best.outer[0] = yc;
                best.outer[1] = yc;
                best.outer[2] = yc;
                best.inner[0] = xc;
                best.inner[1] = xc;
                best.inner[2] = xc;
            }
        }
    }

    // sw.record();
    // sw.print();

    return best;
}