Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // - first off: stylewise this is just a mockup, modify it as you see fit
- // - there's been a change of plans, since further insight into the code (not pasted in the question)
- // taught me that the Y coordinate was needed after all :)
- // - but if you're pre-reorganizing data anyway, this shouldn't be too hard to accomplish either
- // if need be I can still write the code so it takes in XYZ pairs but it'd do some harm locally
- __forceinline __m128 function_dot2D(__m128 u0, __m128 v0, __m128 u1, const __m128 &v1)
- // ^ VS2008 can't gaurantee alignment for 4 separate SSE register parameters, hence the const&
- {
- // multiplies all Xs with all Xs, all Zs with all Zs, then adds them per-vertex
- // and stores them in 1 register as 4 single prec. float result
- // also, all hail HADDPS (warning: it's SSE3)
- return _mm_hadd_ps(_mm_mul_ps(u0, v0), _mm_mul_ps(u1, v1));
- }
- __m128 function(
- const float *p, // 4 XZ pairs
- const float *a, // 4 XZ pairs
- const float *b, // 4 XZ pairs
- const float *c, // 4 XZ pairs
- const float *aY, // 4 Ys
- const float *bY, // 4 Ys
- const float *cY) // 4 Ys
- // all, as discussed, must be aligned to 16 bytes
- {
- // load all 4 vertices (X&Z) pairs from each resp. pointer
- const __m128 p0 = _mm_load_ps(p);
- const __m128 p1 = _mm_load_ps(p+4);
- const __m128 a0 = _mm_load_ps(a);
- const __m128 a1 = _mm_load_ps(a+4);
- const __m128 b0 = _mm_load_ps(b);
- const __m128 b1 = _mm_load_ps(b+4);
- const __m128 c0 = _mm_load_ps(c);
- const __m128 c1 = _mm_load_ps(c+4);
- //v0s = c-a
- const __m128 v0_0 = _mm_sub_ps(c0, a0);
- const __m128 v0_1 = _mm_sub_ps(c1, a1);
- //v1s = b-a
- const __m128 v1_0 = _mm_sub_ps(b0, a0);
- const __m128 v1_1 = _mm_sub_ps(b1, a1);
- //v2s = p-a
- const __m128 v2_0 = _mm_sub_ps(p0, a0);
- const __m128 v2_1 = _mm_sub_ps(p1, a1);
- // calculate all 2D dot products, each one with 4 results: 1 for each triangle
- const __m128 dot00 = function_dot2D(v0_0, v0_0, v0_1, v0_1);
- const __m128 dot01 = function_dot2D(v0_0, v1_0, v0_1, v1_1);
- const __m128 dot02 = function_dot2D(v0_0, v2_0, v0_1, v2_1);
- const __m128 dot11 = function_dot2D(v1_0, v1_0, v1_1, v1_1);
- const __m128 dot12 = function_dot2D(v1_0, v2_0, v1_1, v2_1);
- // now for the barycentric calc.
- const __m128 invDenom = _mm_rsqrt_ps(_mm_sub_ps(_mm_mul_ps(dot00, dot11), _mm_mul_ps(dot01, dot01)));
- const __m128 u = _mm_mul_ps(invDenom, _mm_sub_ps(_mm_mul_ps(dot11, dot02), _mm_mul_ps(dot01, dot12)));
- const __m128 v = _mm_mul_ps(invDenom, _mm_sub_ps(_mm_mul_ps(dot00, dot12), _mm_mul_ps(dot01, dot02)));
- // at this point we're gonna deviate (and where things get a bit iffy)
- // wish you had pasted it along in the question ;)
- // memon's "sloppy epsilon(s)"
- // apparently needed to allow to get height of points which are interpolated along the edges of the triangles
- static const __m128 min_eps = _mm_set1_ps(-(1e-4f));
- static const __m128 eps_plus_1 = _mm_set1_ps(1.f+(1e-4f));
- // the original function would test a few things and return a boolean to indicate if 'height' is valid
- // what we'll do for the sake of parallel is perform the same series of tests, and just return 0 in the
- // appropriate component if the test fails, basically how branchless SSE compares work
- // it's easy to change this to FFFFFFFF or whatever if you want to, just go see what would work for the callee
- // this makes a mask for: if (u >= -EPS && v >= -EPS && (u+v) <= 1+EPS)
- const __m128 mask1 = _mm_cmpge_ps(u, min_eps);
- const __m128 mask2 = _mm_cmpge_ps(v, min_eps);
- const __m128 mask3 = _mm_cmple_ps(_mm_add_ps(u, v), eps_plus_1);
- const __m128 mask = _mm_and_ps(mask1, _mm_and_ps(mask2, mask3));
- // now comes the nasty part where we suddenly needed the Ys :)
- const __m128 aYs = _mm_load_ps(aY);
- const __m128 bYs = _mm_load_ps(bY);
- const __m128 cYs = _mm_load_ps(cY);
- const __m128 yv0 = _mm_sub_ps(cYs, aYs);
- const __m128 yv1 = _mm_sub_ps(bYs, aYs);
- // create this: v0[1]*u & v1[1]*v
- // this basically means multiply all y deltas from v0 by U, and the others by V
- const __m128 uyv0 = _mm_mul_ps(u, yv0);
- const __m128 vyv1 = _mm_mul_ps(v, yv1);
- // now we add all point a Y's to it, mask it, et voila
- return _mm_and_ps(mask, _mm_add_ps(aYs, _mm_add_ps(uyv0, vyv1)));
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement