BEGIN_TIMED_BLOCK(DrawRectangleSlowly);

// NOTE(Egor): premultiply color
Color.rgb *= Color.a;

#if 1
// NOTE(Egor): start inverted so the min/max scan below can shrink-wrap the rectangle
rectangle2i FillRect = InvertedHalfMaxRectangle();
#else
rectangle2i FillRect = HalfMaxRectangle();
#endif

// NOTE(Egor): take the corner points of the (possibly) rotated coordinate axes
// to determine the area that bounds the shape
v2 Min = Basis.Origin;
v2 Max = Basis.Origin + Basis.XAxis + Basis.YAxis;
v2 P[4] = {Min, Min + Basis.XAxis, Min + Basis.YAxis, Max};

for(uint32 Index = 0; Index < ArrayCount(P); ++Index) {

  v2 *TestP = P + Index;

  int32 CeilX = CeilReal32ToInt32(TestP->x) + 1;
  int32 FloorX = FloorReal32ToInt32(TestP->x);
  int32 CeilY = CeilReal32ToInt32(TestP->y) + 1;
  int32 FloorY = FloorReal32ToInt32(TestP->y);

  if(FillRect.XMin > FloorX) FillRect.XMin = FloorX;
  if(FillRect.YMin > FloorY) FillRect.YMin = FloorY;
  if(FillRect.XMax < CeilX) FillRect.XMax = CeilX;
  if(FillRect.YMax < CeilY) FillRect.YMax = CeilY;
}

FillRect = Intersect(ClipRect, FillRect);

if(HasArea(FillRect)) {

  // NOTE(Egor): interleaved alternating scanning lines
  if((!Even) == (FillRect.YMin & 1)) {

    FillRect.YMin += 1;
  }

#if 1
  int32 FillWidth = FillRect.XMax - FillRect.XMin;
  int32 FillWidthAlign = FillWidth & 0x3;
  int32 Adjust = (~FillWidthAlign + 1) & 0x3;
  FillWidth += Adjust;
  FillRect.XMin = FillRect.XMax - FillWidth;

  // TODO(Egor): do not use madskills
  __m128i Dummy = _mm_setr_epi32(0, 1, 2, 3);
  __m128i StartupClipMask = _mm_sub_epi32(Dummy, _mm_set1_epi32(Adjust));
  StartupClipMask = _mm_cmplt_epi32(StartupClipMask, _mm_set1_epi32(0));
  StartupClipMask = _mm_andnot_si128(StartupClipMask, _mm_set1_epi8(-1));
#endif
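
  // Worked example of the alignment trick above (illustrative, my reading of the code):
  // (~x + 1) is two's-complement negation, so Adjust = (-FillWidth) & 3, i.e. how many
  // pixels are needed to round the fill width up to a multiple of 4.
  //   e.g. FillWidth = 13 -> FillWidthAlign = 1, Adjust = 3, FillWidth becomes 16,
  //   and XMin is pulled 3 pixels to the left.
  // StartupClipMask then comes out as [0, 0, 0, ~0] (lowest 3 lanes zeroed), so the extra
  // left-hand pixels of the first quad in each row are masked off and never written.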

  real32 InvXAxisLengthSq = 1.0f/LengthSq(Basis.XAxis);
  real32 InvYAxisLengthSq = 1.0f/LengthSq(Basis.YAxis);

  v2 nXAxis = InvXAxisLengthSq * Basis.XAxis;
  v2 nYAxis = InvYAxisLengthSq * Basis.YAxis;

  __m128 nXAxisX_4x = _mm_set1_ps(nXAxis.x);
  __m128 nXAxisY_4x = _mm_set1_ps(nXAxis.y);

  __m128 nYAxisX_4x = _mm_set1_ps(nYAxis.x);
  __m128 nYAxisY_4x = _mm_set1_ps(nYAxis.y);

  real32 Inv255 = 1.0f/255.0f;

  // NOTE(Egor): color modulation values in SIMD
  __m128 ColorR_4x = _mm_set1_ps(Color.r);
  __m128 ColorG_4x = _mm_set1_ps(Color.g);
  __m128 ColorB_4x = _mm_set1_ps(Color.b);
  __m128 ColorA_4x = _mm_set1_ps(Color.a);

  __m128 ColorA_INV_4x = _mm_set1_ps(Color.a*Inv255);

  __m128 One = _mm_set1_ps(1.0f);
  __m128 Two = _mm_set1_ps(2.0f);
  __m128 Zero = _mm_set1_ps(0.0f);
  __m128 Four = _mm_set1_ps(4.0f);
  __m128i MaskFF = _mm_set1_epi32(0xFF);
  __m128i MaskFFFF = _mm_set1_epi32(0xFFFF);
  __m128i MaskFF00FF = _mm_set1_epi32(0x00FF00FF);

  __m128 Squared255 = _mm_set1_ps(255.0f*255.0f);

  __m128 WidthM2_4x = _mm_set1_ps((real32)(Texture->Width - 2));
  __m128 HeightM2_4x = _mm_set1_ps((real32)(Texture->Height - 2));

  int32 TexturePitch = Texture->Pitch;
  __m128i TexturePitch_4x = _mm_set1_epi32(TexturePitch);
  void *TextureMemory = Texture->Memory;
  uint32 RowAdvance = Buffer->Pitch*2;
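  // Row advances by two rows per pass because this routine renders every other
  // scanline (the even/odd interleave set up above; see the Y += 2 loop below).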

  uint8 *Row = (uint8 *)Buffer->Memory + FillRect.YMin*Buffer->Pitch + FillRect.XMin*BITMAP_BYTES_PER_PIXEL;
#if 0
  int32 Align = (uintptr)Row & (16 - 1);
  Row -= Align;
#endif

#define M(a, I) ((real32 *)&(a))[I]
#define Mi(a, I) ((uint32 *)&(a))[I]
#define mm_square(a) _mm_mul_ps(a, a)

#if 0 // NOTE(Egor): disable gamma correction

#undef mm_square
#define mm_square(a) a;
#define _mm_sqrt_ps(a) a
#endif

  __m128 OriginX_4x = _mm_set1_ps(Basis.Origin.x);
  __m128 OriginY_4x = _mm_set1_ps(Basis.Origin.y);

  // NOTE(Egor): relative position of the actual (ON_SCREEN) pixel inside the texture
  // --> we find the vector that points to a pixel in the texture basis
  // NOTE(Egor): get all 4 pixels: (X-OriginX)+3 (X-OriginX)+2 (X-OriginX)+1 (X-OriginX)+0
  // NOTE(Egor): Add3210_m_OriginX has OriginX_4x baked in
  // NOTE(Egor): this is the value PixelPX gets reset to between Y runs
  __m128 Add3210_m_OriginX = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
  Add3210_m_OriginX = _mm_sub_ps(Add3210_m_OriginX, OriginX_4x);
  __m128 XMin_4x = _mm_set1_ps((real32)FillRect.XMin);
  XMin_4x = _mm_add_ps(XMin_4x, Add3210_m_OriginX);

  // NOTE(Egor): get the quad (Y-OriginY) pixel
  __m128 PixelPY = _mm_set1_ps((real32)FillRect.YMin);
  PixelPY = _mm_sub_ps(PixelPY, OriginY_4x);

  BEGIN_TIMED_BLOCK(ProcessPixel);

  for(int32 Y = FillRect.YMin; Y < FillRect.YMax; Y += 2) {

    __m128i ClipMask = StartupClipMask;

#define TEST_PixelPY_x_NXAxisY 1 // NOTE(Egor): looks like it's faster with this disabled ¯\_(-_-)_/¯

#if TEST_PixelPY_x_NXAxisY
    // NOTE(Egor): only changes between Y runs
    __m128 PixelPY_x_NXAxisY = _mm_mul_ps(nXAxisY_4x, PixelPY);
    __m128 PixelPY_x_NYAxisY = _mm_mul_ps(nYAxisY_4x, PixelPY);
#endif

    __m128 PixelPX = XMin_4x;

    uint32 *Pixel = (uint32 *)Row;

    for(int32 XI = FillRect.XMin; XI < FillRect.XMax; XI += 4) {

      IACA_VC64_START;

      // NOTE(Egor): this routine works with _ON_SCREEN_ pixels and maps them to the
      // texels of the actual texture that get drawn to that part of the _SCREEN_

      // NOTE(Egor): inner product that computes the X and Y components of the vector
      // 1. U and V are normalized coefficients, U = X_component/XAxis_length
      // 2. nXAxisX is premultiplied by InvXAxis_Length
      // 3. PixelPY_x_NXAxisY is constant across all X runs
#if TEST_PixelPY_x_NXAxisY

      __m128 U = _mm_add_ps(_mm_mul_ps(nXAxisX_4x, PixelPX), PixelPY_x_NXAxisY);
      __m128 V = _mm_add_ps(_mm_mul_ps(nYAxisX_4x, PixelPX), PixelPY_x_NYAxisY);
#else

      __m128 U = _mm_add_ps(_mm_mul_ps(nXAxisX_4x, PixelPX), _mm_mul_ps(nXAxisY_4x, PixelPY));
      __m128 V = _mm_add_ps(_mm_mul_ps(nYAxisX_4x, PixelPX), _mm_mul_ps(nYAxisY_4x, PixelPY));

#endif

      // NOTE(Egor): generate a write mask that determines whether we should write
      // new color data; if pixels are out of bounds we just mask them out with the
      // old color data (was called OriginalDest -- possibly renamed)
      __m128i WriteMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmple_ps(U, One),
                                                                 _mm_cmpge_ps(U, Zero)),
                                                      _mm_and_ps(_mm_cmple_ps(V, One),
                                                                 _mm_cmpge_ps(V, Zero))));

      WriteMask = _mm_and_si128(WriteMask, ClipMask);
      // TODO(Egor): check later if it helps
      // if(_mm_movemask_epi8(WriteMask))
      {

        __m128i OriginalDest = _mm_loadu_si128((__m128i *)Pixel);

        // NOTE(Egor): clamp U and V to prevent fetching nonexistent texel data;
        // U and V could exceed 1.0f, which would fetch outside the texture buffer
        U = _mm_min_ps(_mm_max_ps(U, Zero), One);
        V = _mm_min_ps(_mm_max_ps(V, Zero), One);

        // NOTE(Egor): multiply by the texture bounds to determine which texel
        // we need to fetch from the texture buffer
        __m128 tX = _mm_mul_ps(U, WidthM2_4x);
        __m128 tY = _mm_mul_ps(V, HeightM2_4x);

        // NOTE(Egor): round with truncation
        __m128i FetchX_4x = _mm_cvttps_epi32(tX);
        __m128i FetchY_4x = _mm_cvttps_epi32(tY);

        // NOTE(Egor): take the fractional part of tX and tY
        __m128 fX = _mm_sub_ps(tX, _mm_cvtepi32_ps(FetchX_4x));
        __m128 fY = _mm_sub_ps(tY, _mm_cvtepi32_ps(FetchY_4x));

        // NOTE(Egor): fetch 4 texels from the texture buffer

        FetchX_4x = _mm_slli_epi32(FetchX_4x, 2);
        FetchY_4x = _mm_mullo_epi32(FetchY_4x, TexturePitch_4x);
        FetchX_4x = _mm_add_epi32(FetchX_4x, FetchY_4x);

        int32 Fetch0 = Mi(FetchX_4x, 0);
        int32 Fetch1 = Mi(FetchX_4x, 1);
        int32 Fetch2 = Mi(FetchX_4x, 2);
        int32 Fetch3 = Mi(FetchX_4x, 3);

        uint8 *TexelPtr0 = (((uint8 *)TextureMemory) + Fetch0);
        uint8 *TexelPtr1 = (((uint8 *)TextureMemory) + Fetch1);
        uint8 *TexelPtr2 = (((uint8 *)TextureMemory) + Fetch2);
        uint8 *TexelPtr3 = (((uint8 *)TextureMemory) + Fetch3);

        __m128i SampleA;
        __m128i SampleB;
        __m128i SampleC;
        __m128i SampleD;

        SampleA = _mm_setr_epi32(*(uint32 *)(TexelPtr0),
                                 *(uint32 *)(TexelPtr1),
                                 *(uint32 *)(TexelPtr2),
                                 *(uint32 *)(TexelPtr3));

        SampleB = _mm_setr_epi32(*(uint32 *)(TexelPtr0 + sizeof(uint32)),
                                 *(uint32 *)(TexelPtr1 + sizeof(uint32)),
                                 *(uint32 *)(TexelPtr2 + sizeof(uint32)),
                                 *(uint32 *)(TexelPtr3 + sizeof(uint32)));

        SampleC = _mm_setr_epi32(*(uint32 *)(TexelPtr0 + TexturePitch),
                                 *(uint32 *)(TexelPtr1 + TexturePitch),
                                 *(uint32 *)(TexelPtr2 + TexturePitch),
                                 *(uint32 *)(TexelPtr3 + TexturePitch));

        SampleD = _mm_setr_epi32(*(uint32 *)(TexelPtr0 + TexturePitch + sizeof(uint32)),
                                 *(uint32 *)(TexelPtr1 + TexturePitch + sizeof(uint32)),
                                 *(uint32 *)(TexelPtr2 + TexturePitch + sizeof(uint32)),
                                 *(uint32 *)(TexelPtr3 + TexturePitch + sizeof(uint32)));

        //////
        __m128i TexelArb = _mm_and_si128(SampleA, MaskFF00FF);
        __m128i TexelAag = _mm_and_si128(_mm_srli_epi32(SampleA, 8), MaskFF00FF);
        TexelArb = _mm_mullo_epi16(TexelArb, TexelArb);
        __m128 TexelAa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelAag, 16));
        TexelAag = _mm_mullo_epi16(TexelAag, TexelAag);
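
        // How the unpack-and-square above works (illustrative note, my reading of the
        // code): each 32-bit texel holds 8-bit channels; masking with 0x00FF00FF leaves
        // R and B (or A and G after the shift) in separate 16-bit lanes. A squared 8-bit
        // value still fits in 16 bits (255*255 = 65025 < 65536), so one _mm_mullo_epi16
        // squares two channels at once -- the cheap SRGB->'linear' approximation (gamma 2).
        // Alpha is pulled out before the squaring because it stays linear.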

        //////
        __m128i TexelCrb = _mm_and_si128(SampleC, MaskFF00FF);
        __m128i TexelCag = _mm_and_si128(_mm_srli_epi32(SampleC, 8), MaskFF00FF);
        TexelCrb = _mm_mullo_epi16(TexelCrb, TexelCrb);
        __m128 TexelCa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelCag, 16));
        TexelCag = _mm_mullo_epi16(TexelCag, TexelCag);

        //////
        __m128i TexelDrb = _mm_and_si128(SampleD, MaskFF00FF);
        __m128i TexelDag = _mm_and_si128(_mm_srli_epi32(SampleD, 8), MaskFF00FF);
        TexelDrb = _mm_mullo_epi16(TexelDrb, TexelDrb);
        __m128 TexelDa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelDag, 16));
        TexelDag = _mm_mullo_epi16(TexelDag, TexelDag);

        // NOTE(Egor): convert texture from SRGB to 'linear' brightness space
        __m128 TexelAr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelArb, 16));
        __m128 TexelAg = _mm_cvtepi32_ps(_mm_and_si128(TexelAag, MaskFFFF));
        __m128 TexelAb = _mm_cvtepi32_ps(_mm_and_si128(TexelArb, MaskFFFF));

        __m128 TexelCr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelCrb, 16));
        __m128 TexelCg = _mm_cvtepi32_ps(_mm_and_si128(TexelCag, MaskFFFF));
        __m128 TexelCb = _mm_cvtepi32_ps(_mm_and_si128(TexelCrb, MaskFFFF));

        __m128 TexelDr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelDrb, 16));
        __m128 TexelDg = _mm_cvtepi32_ps(_mm_and_si128(TexelDag, MaskFFFF));
        __m128 TexelDb = _mm_cvtepi32_ps(_mm_and_si128(TexelDrb, MaskFFFF));

        // NOTE(Egor): convert to linear brightness space
        // Dest = SRGB255ToLinear1(Dest);
#if 0

        __m128 TexelBb = _mm_cvtepi32_ps(_mm_and_si128(SampleB, MaskFF));
        __m128 TexelBg = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(SampleB, 8), MaskFF));
        __m128 TexelBr = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(SampleB, 16), MaskFF));
        __m128 TexelBa = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(SampleB, 24), MaskFF));
        TexelBr = mm_square(TexelBr);
        TexelBg = mm_square(TexelBg);
        TexelBb = mm_square(TexelBb);

        __m128 DestB = _mm_cvtepi32_ps(_mm_and_si128(OriginalDest, MaskFF));
        __m128 DestG = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(OriginalDest, 8), MaskFF));
        __m128 DestR = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(OriginalDest, 16), MaskFF));
        __m128 DestA = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(OriginalDest, 24), MaskFF));
        DestR = mm_square(DestR);
        DestG = mm_square(DestG);
        DestB = mm_square(DestB);

#else

        //////
        __m128i TexelBrb = _mm_and_si128(SampleB, MaskFF00FF);
        __m128i TexelBag = _mm_and_si128(_mm_srli_epi32(SampleB, 8), MaskFF00FF);
        TexelBrb = _mm_mullo_epi16(TexelBrb, TexelBrb);
        __m128 TexelBa = _mm_cvtepi32_ps(_mm_srli_epi32(TexelBag, 16));
        TexelBag = _mm_mullo_epi16(TexelBag, TexelBag);

        __m128 TexelBr = _mm_cvtepi32_ps(_mm_srli_epi32(TexelBrb, 16));
        __m128 TexelBg = _mm_cvtepi32_ps(_mm_and_si128(TexelBag, MaskFFFF));
        __m128 TexelBb = _mm_cvtepi32_ps(_mm_and_si128(TexelBrb, MaskFFFF));

        // NOTE(Egor): load destination color
        __m128i Destrb = _mm_and_si128(OriginalDest, MaskFF00FF);
        __m128i Destag = _mm_and_si128(_mm_srli_epi32(OriginalDest, 8), MaskFF00FF);
        Destrb = _mm_mullo_epi16(Destrb, Destrb);
        __m128 DestA = _mm_cvtepi32_ps(_mm_srli_epi32(Destag, 16));
        Destag = _mm_mullo_epi16(Destag, Destag);

        __m128 DestR = _mm_cvtepi32_ps(_mm_srli_epi32(Destrb, 16));
        __m128 DestG = _mm_cvtepi32_ps(_mm_and_si128(Destag, MaskFFFF));
        __m128 DestB = _mm_cvtepi32_ps(_mm_and_si128(Destrb, MaskFFFF));

#endif

        // NOTE(Egor): compute coefficients for subpixel rendering
        __m128 ifX = _mm_sub_ps(One, fX);
        __m128 ifY = _mm_sub_ps(One, fY);
        __m128 L0 = _mm_mul_ps(ifY, ifX);
        __m128 L1 = _mm_mul_ps(ifY, fX);
        __m128 L2 = _mm_mul_ps(fY, ifX);
        __m128 L3 = _mm_mul_ps(fY, fX);

        // NOTE(Egor): subpixel blending
        // lerp the 4-texel square  |A|B|  into one pixel with weighted coefficients
        //                          |C|D|
        // fX -> fractional part of the X texel coordinate
        // fY -> fractional part of the Y texel coordinate
        __m128 Texelr = _mm_add_ps(_mm_add_ps(_mm_mul_ps(L0, TexelAr),
                                              _mm_mul_ps(L1, TexelBr)),
                                   _mm_add_ps(_mm_mul_ps(L2, TexelCr),
                                              _mm_mul_ps(L3, TexelDr)));

        __m128 Texelg = _mm_add_ps(_mm_add_ps(_mm_mul_ps(L0, TexelAg),
                                              _mm_mul_ps(L1, TexelBg)),
                                   _mm_add_ps(_mm_mul_ps(L2, TexelCg),
                                              _mm_mul_ps(L3, TexelDg)));

        __m128 Texelb = _mm_add_ps(_mm_add_ps(_mm_mul_ps(L0, TexelAb),
                                              _mm_mul_ps(L1, TexelBb)),
                                   _mm_add_ps(_mm_mul_ps(L2, TexelCb),
                                              _mm_mul_ps(L3, TexelDb)));

        __m128 Texela = _mm_add_ps(_mm_add_ps(_mm_mul_ps(L0, TexelAa),
                                              _mm_mul_ps(L1, TexelBa)),
                                   _mm_add_ps(_mm_mul_ps(L2, TexelCa),
                                              _mm_mul_ps(L3, TexelDa)));
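
        // In scalar form the blend above is plain bilinear filtering (one channel shown,
        // illustrative only):
        //   Texel = (1-fY)*((1-fX)*A + fX*B) + fY*((1-fX)*C + fX*D)
        // which expands to exactly L0*A + L1*B + L2*C + L3*D with the weights from above.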

        // NOTE(Egor): modulate by the incoming color
        Texelr = _mm_mul_ps(Texelr, ColorR_4x);
        Texelg = _mm_mul_ps(Texelg, ColorG_4x);
        Texelb = _mm_mul_ps(Texelb, ColorB_4x);
        Texela = _mm_mul_ps(Texela, ColorA_INV_4x);

        // NOTE(Egor): clamp color to the valid range
        Texelr = _mm_min_ps(_mm_max_ps(Texelr, Zero), Squared255);
        Texelg = _mm_min_ps(_mm_max_ps(Texelg, Zero), Squared255);
        Texelb = _mm_min_ps(_mm_max_ps(Texelb, Zero), Squared255);

        // NOTE(Egor): the alpha channel of composited bitmaps is kept in premultiplied
        // alpha mode, for the case where we render two or more bitmaps into an
        // intermediate buffer and blend them with each other

        // NOTE(Egor): destination blend
        // v4 Blended = (1.0f - Texel.a)*Dest + Texel;
        __m128 OneComplementTexelA = _mm_sub_ps(One, Texela);
        __m128 BlendedR = _mm_add_ps(_mm_mul_ps(OneComplementTexelA, DestR), Texelr);
        __m128 BlendedG = _mm_add_ps(_mm_mul_ps(OneComplementTexelA, DestG), Texelg);
        __m128 BlendedB = _mm_add_ps(_mm_mul_ps(OneComplementTexelA, DestB), Texelb);
        __m128 BlendedA = _mm_add_ps(_mm_mul_ps(OneComplementTexelA, DestA), Texela);

        // NOTE(Egor): convert back to gamma space
        // v4 Blended255 = Linear1ToSRGB255(Blended);
        BlendedR = _mm_mul_ps(BlendedR, _mm_rsqrt_ps(BlendedR));
        BlendedG = _mm_mul_ps(BlendedG, _mm_rsqrt_ps(BlendedG));
        BlendedB = _mm_mul_ps(BlendedB, _mm_rsqrt_ps(BlendedB));
        BlendedA = BlendedA;
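
        // x*rsqrt(x) == x/sqrt(x) == sqrt(x), so the three lines above are a fast
        // approximate square root: the gamma-2 'linear'->SRGB re-encode that mirrors
        // the squaring done on load. Alpha is left untouched because it was never squared.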

        // NOTE(Egor): _mm_cvtps_epi32 rounds to nearest
        __m128i IntA = _mm_cvtps_epi32(BlendedA);
        __m128i IntR = _mm_cvtps_epi32(BlendedR);
        __m128i IntG = _mm_cvtps_epi32(BlendedG);
        __m128i IntB = _mm_cvtps_epi32(BlendedB);

        IntA = _mm_slli_epi32(IntA, 24);
        IntR = _mm_slli_epi32(IntR, 16);
        IntG = _mm_slli_epi32(IntG, 8);
        IntB = IntB;

        __m128i Out = _mm_or_si128(_mm_or_si128(IntR, IntG),
                                   _mm_or_si128(IntB, IntA));

        __m128i New = _mm_and_si128(WriteMask, Out);
        __m128i Old = _mm_andnot_si128(WriteMask, OriginalDest);
        __m128i MaskedOut = _mm_or_si128(New, Old);
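
        // The store below is the aligned _mm_store_si128, while the read above used the
        // unaligned _mm_loadu_si128; presumably this relies on Pixel being 16-byte aligned
        // (or on re-enabling the disabled alignment code near the Row setup).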
        _mm_store_si128((__m128i *)Pixel, MaskedOut);
      }

      PixelPX = _mm_add_ps(PixelPX, Four);
      Pixel += 4;
      // NOTE(Egor): only the first quad of a row can be left-clipped, so after it the
      // clip mask is reset to all ones
      ClipMask = _mm_set1_epi32(0xFFFFFFFF);

      IACA_VC64_END;
    }

    // NOTE(Egor): alternating lines
    Row += RowAdvance;
    PixelPY = _mm_add_ps(PixelPY, Two);
  }

  int32 PixelCount = GetClampedArea(FillRect)/2;
  END_TIMED_BLOCK_COUNTED(ProcessPixel, PixelCount);

  END_TIMED_BLOCK(DrawRectangleSlowly);
}


THE_OLD_ONE ///////////////////////////////////////////////////////////////////