Advertisement
superplek

blerk

Jul 2nd, 2012
501
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 4.21 KB | None | 0 0
  1. // - first off: stylewise this is just a mockup, modify it as you see fit
  2.  
  3. // - there's been a change of plans, since further insight into the code (not pasted in the question)
  4. //   taught me that the Y coordinate was needed after all :)
  5.  
  6. // - but if you're pre-reorganizing data anyway, this shouldn't be too hard to accomplish either
  7. //   if need be I can still write the code so it takes in XYZ triples but it'd do some harm locally
  8.  
  9. __forceinline __m128 function_dot2D(__m128 u0, __m128 v0, __m128 u1, const __m128 &v1)
  10. // ^ VS2008 can't gaurantee alignment for 4 separate SSE register parameters, hence the const&
  11. {
  12.     // multiplies all Xs with all Xs, all Zs with all Zs, then adds them per-vertex
  13.     // and stores them in 1 register as 4 single prec. float result
  14.     // also, all hail HADDPS (warning: it's SSE3)
  15.     return _mm_hadd_ps(_mm_mul_ps(u0, v0), _mm_mul_ps(u1, v1));
  16. }
  17.  
  18. __m128 function(
  19.     const float *p,  // 4 XZ pairs
  20.     const float *a,  // 4 XZ pairs
  21.     const float *b,  // 4 XZ pairs
  22.     const float *c,  // 4 XZ pairs
  23.     const float *aY, // 4 Ys
  24.     const float *bY, // 4 Ys
  25.     const float *cY) // 4 Ys
  26.     // all, as discussed, must be aligned to 16 bytes  
  27. {
  28.     // load all 4 vertices (X&Z) pairs from each resp. pointer
  29.     const __m128 p0 = _mm_load_ps(p);
  30.     const __m128 p1 = _mm_load_ps(p+4);
  31.     const __m128 a0 = _mm_load_ps(a);
  32.     const __m128 a1 = _mm_load_ps(a+4);
  33.     const __m128 b0 = _mm_load_ps(b);
  34.     const __m128 b1 = _mm_load_ps(b+4);
  35.     const __m128 c0 = _mm_load_ps(c);
  36.     const __m128 c1 = _mm_load_ps(c+4);
  37.  
  38.     //v0s = c-a
  39.     const __m128 v0_0 = _mm_sub_ps(c0, a0);
  40.     const __m128 v0_1 = _mm_sub_ps(c1, a1);
  41.  
  42.     //v1s = b-a
  43.     const __m128 v1_0 = _mm_sub_ps(b0, a0);
  44.     const __m128 v1_1 = _mm_sub_ps(b1, a1);
  45.  
  46.     //v2s = p-a
  47.     const __m128 v2_0 = _mm_sub_ps(p0, a0);
  48.     const __m128 v2_1 = _mm_sub_ps(p1, a1);
  49.    
  50.     // calculate all 2D dot products, each one with 4 results: 1 for each triangle
  51.     const __m128 dot00 = function_dot2D(v0_0, v0_0, v0_1, v0_1);
  52.     const __m128 dot01 = function_dot2D(v0_0, v1_0, v0_1, v1_1);
  53.     const __m128 dot02 = function_dot2D(v0_0, v2_0, v0_1, v2_1);
  54.     const __m128 dot11 = function_dot2D(v1_0, v1_0, v1_1, v1_1);
  55.     const __m128 dot12 = function_dot2D(v1_0, v2_0, v1_1, v2_1);
  56.    
  57.     // now for the barycentric calc.
  58.     const __m128 invDenom = _mm_rsqrt_ps(_mm_sub_ps(_mm_mul_ps(dot00, dot11), _mm_mul_ps(dot01, dot01)));
  59.     const __m128 u = _mm_mul_ps(invDenom, _mm_sub_ps(_mm_mul_ps(dot11, dot02), _mm_mul_ps(dot01, dot12)));
  60.     const __m128 v = _mm_mul_ps(invDenom, _mm_sub_ps(_mm_mul_ps(dot00, dot12), _mm_mul_ps(dot01, dot02)));
  61.  
  62.     // at this point we're gonna deviate (and where things get a bit iffy)
  63.     // wish you had pasted it along in the question ;)
  64.  
  65.     // memon's "sloppy epsilon(s)"
  66.     // apparently needed to allow to get height of points which are interpolated along the edges of the triangles
  67.     static const __m128 min_eps = _mm_set1_ps(-(1e-4f));
  68.     static const __m128 eps_plus_1 = _mm_set1_ps(1.f+(1e-4f));
  69.  
  70.     // the original function would test a few things and return a boolean to indicate if 'height' is valid
  71.     // what we'll do for the sake of parallel is perform the same series of tests, and just return 0 in the
  72.     // appropriate component if the test fails, basically how branchless SSE compares work
  73.     // it's easy to change this to FFFFFFFF or whatever if you want to, just go see what would work for the callee
  74.  
  75.     // this makes a mask for: if (u >= -EPS && v >= -EPS && (u+v) <= 1+EPS)
  76.     const __m128 mask1 = _mm_cmpge_ps(u, min_eps);
  77.     const __m128 mask2 = _mm_cmpge_ps(v, min_eps);
  78.     const __m128 mask3 = _mm_cmple_ps(_mm_add_ps(u, v), eps_plus_1);
  79.     const __m128 mask = _mm_and_ps(mask1, _mm_and_ps(mask2, mask3));
  80.  
  81.     // now comes the nasty part where we suddenly needed the Ys :)
  82.     const __m128 aYs = _mm_load_ps(aY);
  83.     const __m128 bYs = _mm_load_ps(bY);
  84.     const __m128 cYs = _mm_load_ps(cY);
  85.     const __m128 yv0 = _mm_sub_ps(cYs, aYs);
  86.     const __m128 yv1 = _mm_sub_ps(bYs, aYs);
  87.    
  88.     // create this: v0[1]*u & v1[1]*v
  89.     // this basically means multiply all y deltas from v0 by U, and the others by V
  90.     const __m128 uyv0 = _mm_mul_ps(u, yv0);
  91.     const __m128 vyv1 = _mm_mul_ps(v, yv1);
  92.    
  93.     // now we add all point a Y's to it, mask it, et voila
  94.     return _mm_and_ps(mask, _mm_add_ps(aYs, _mm_add_ps(uyv0, vyv1)));  
  95. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement