blerk

// - first off: style-wise this is just a mockup, modify it as you see fit

// - there's been a change of plans, since further insight into the code (not pasted in the question)
//   taught me that the Y coordinate was needed after all :)

// - but if you're pre-reorganizing the data anyway, this shouldn't be too hard to accomplish either;
//   if need be I can still write the code so that it takes XYZ triples, but that would do some harm locally

__forceinline __m128 function_dot2D(__m128 u0, __m128 v0, __m128 u1, const __m128 &v1)
// Computes four 2D dot products at once and returns them packed in one register.
//
// u0/u1 and v0/v1 each hold two interleaved 2-component vectors (4 vectors per
// operand when taken as a lo/hi register pair). Lane i of the result is the dot
// product of the i-th vector of u with the i-th vector of v.
//
// The last parameter is taken by const reference because VS2008 can't guarantee
// alignment for 4 separate SSE register parameters passed by value.
{
    // component-wise products: first components with first, second with second
    const __m128 productsLo = _mm_mul_ps(u0, v0);
    const __m128 productsHi = _mm_mul_ps(u1, v1);

    // HADDPS (SSE3!) sums each adjacent pair of lanes, i.e. the two products that
    // belong to the same vector, giving 4 single precision results in 1 register
    return _mm_hadd_ps(productsLo, productsHi);
}

__m128 function(
    const float *p,  // 4 XZ pairs
    const float *a,  // 4 XZ pairs
    const float *b,  // 4 XZ pairs
    const float *c,  // 4 XZ pairs
    const float *aY, // 4 Ys
    const float *bY, // 4 Ys
    const float *cY) // 4 Ys
    // all, as discussed, must be aligned to 16 bytes
{
    // load all 4 vertices (X&Z) pairs from each resp. pointer
    const __m128 p0 = _mm_load_ps(p);
    const __m128 p1 = _mm_load_ps(p+4);
    const __m128 a0 = _mm_load_ps(a);
    const __m128 a1 = _mm_load_ps(a+4);
    const __m128 b0 = _mm_load_ps(b);
    const __m128 b1 = _mm_load_ps(b+4);
    const __m128 c0 = _mm_load_ps(c);
    const __m128 c1 = _mm_load_ps(c+4);

    //v0s = c-a
    const __m128 v0_0 = _mm_sub_ps(c0, a0);
    const __m128 v0_1 = _mm_sub_ps(c1, a1);

    //v1s = b-a
    const __m128 v1_0 = _mm_sub_ps(b0, a0);
    const __m128 v1_1 = _mm_sub_ps(b1, a1);

    //v2s = p-a
    const __m128 v2_0 = _mm_sub_ps(p0, a0);
    const __m128 v2_1 = _mm_sub_ps(p1, a1);

    // calculate all 2D dot products, each one with 4 results: 1 for each triangle
    const __m128 dot00 = function_dot2D(v0_0, v0_0, v0_1, v0_1);
    const __m128 dot01 = function_dot2D(v0_0, v1_0, v0_1, v1_1);
    const __m128 dot02 = function_dot2D(v0_0, v2_0, v0_1, v2_1);
    const __m128 dot11 = function_dot2D(v1_0, v1_0, v1_1, v1_1);
    const __m128 dot12 = function_dot2D(v1_0, v2_0, v1_1, v2_1);

    // now for the barycentric calc.
    const __m128 invDenom = _mm_rsqrt_ps(_mm_sub_ps(_mm_mul_ps(dot00, dot11), _mm_mul_ps(dot01, dot01)));
    const __m128 u = _mm_mul_ps(invDenom, _mm_sub_ps(_mm_mul_ps(dot11, dot02), _mm_mul_ps(dot01, dot12)));
    const __m128 v = _mm_mul_ps(invDenom, _mm_sub_ps(_mm_mul_ps(dot00, dot12), _mm_mul_ps(dot01, dot02)));

    // at this point we're gonna deviate (and where things get a bit iffy)
    // wish you had pasted it along in the question ;)

    // memon's "sloppy epsilon(s)"
    // apparently needed to allow to get height of points which are interpolated along the edges of the triangles
    static const __m128 min_eps = _mm_set1_ps(-(1e-4f));
    static const __m128 eps_plus_1 = _mm_set1_ps(1.f+(1e-4f));

    // the original function would test a few things and return a boolean to indicate if 'height' is valid
    // what we'll do for the sake of parallel is perform the same series of tests, and just return 0 in the
    // appropriate component if the test fails, basically how branchless SSE compares work
    // it's easy to change this to FFFFFFFF or whatever if you want to, just go see what would work for the callee

    // this makes a mask for: if (u >= -EPS && v >= -EPS && (u+v) <= 1+EPS)
    const __m128 mask1 = _mm_cmpge_ps(u, min_eps);
    const __m128 mask2 = _mm_cmpge_ps(v, min_eps);
    const __m128 mask3 = _mm_cmple_ps(_mm_add_ps(u, v), eps_plus_1);
    const __m128 mask = _mm_and_ps(mask1, _mm_and_ps(mask2, mask3));

    // now comes the nasty part where we suddenly needed the Ys :)
    const __m128 aYs = _mm_load_ps(aY);
    const __m128 bYs = _mm_load_ps(bY);
    const __m128 cYs = _mm_load_ps(cY);
    const __m128 yv0 = _mm_sub_ps(cYs, aYs);
    const __m128 yv1 = _mm_sub_ps(bYs, aYs);

    // create this: v0[1]*u & v1[1]*v
    // this basically means multiply all y deltas from v0 by U, and the others by V
    const __m128 uyv0 = _mm_mul_ps(u, yv0);
    const __m128 vyv1 = _mm_mul_ps(v, yv1);

    // now we add all point a Y's to it, mask it, et voila
    return _mm_and_ps(mask, _mm_add_ps(aYs, _mm_add_ps(uyv0, vyv1)));
}