Advertisement
Guest User

Untitled

a guest
Nov 21st, 2019
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.76 KB | None | 0 0
  1. static tracy_force_inline uint64_t ProcessRGB( const uint8_t* src )
  2. {
  3. #ifdef __SSE4_1__
  4. __m128i px0 = _mm_loadu_si128(((__m128i*)src) + 0);
  5. __m128i px1 = _mm_loadu_si128(((__m128i*)src) + 1);
  6. __m128i px2 = _mm_loadu_si128(((__m128i*)src) + 2);
  7. __m128i px3 = _mm_loadu_si128(((__m128i*)src) + 3);
  8.  
  9. __m128i smask = _mm_set1_epi32( 0xF8FCF8 );
  10. __m128i sd0 = _mm_and_si128( px0, smask );
  11. __m128i sd1 = _mm_and_si128( px1, smask );
  12. __m128i sd2 = _mm_and_si128( px2, smask );
  13. __m128i sd3 = _mm_and_si128( px3, smask );
  14.  
  15. __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));
  16.  
  17. __m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
  18. __m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
  19. __m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
  20. __m128i sc3 = _mm_cmpeq_epi8(sd3, sc);
  21.  
  22. __m128i sm0 = _mm_and_si128(sc0, sc1);
  23. __m128i sm1 = _mm_and_si128(sc2, sc3);
  24. __m128i sm = _mm_and_si128(sm0, sm1);
  25.  
  26. if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) )
  27. {
  28. return uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
  29. }
  30.  
  31. __m128i min0 = _mm_min_epu8( px0, px1 );
  32. __m128i min1 = _mm_min_epu8( px2, px3 );
  33. __m128i min2 = _mm_min_epu8( min0, min1 );
  34.  
  35. __m128i max0 = _mm_max_epu8( px0, px1 );
  36. __m128i max1 = _mm_max_epu8( px2, px3 );
  37. __m128i max2 = _mm_max_epu8( max0, max1 );
  38.  
  39. __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
  40. __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
  41. __m128i min4 = _mm_min_epu8( min2, min3 );
  42. __m128i max4 = _mm_max_epu8( max2, max3 );
  43.  
  44. __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
  45. __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
  46. __m128i rmin = _mm_min_epu8( min4, min5 );
  47. __m128i rmax = _mm_max_epu8( max4, max5 );
  48.  
  49. __m128i range1 = _mm_subs_epu8( rmax, rmin );
  50. __m128i range2 = _mm_sad_epu8( rmax, rmin );
  51.  
  52. uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1;
  53. __m128i range = _mm_set1_epi16( DivTable[vrange] );
  54.  
  55. __m128i inset1 = _mm_srli_epi16( range1, 4 );
  56. __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) );
  57. __m128i min = _mm_adds_epu8( rmin, inset );
  58. __m128i max = _mm_subs_epu8( rmax, inset );
  59.  
  60. __m128i c0 = _mm_subs_epu8( px0, rmin );
  61. __m128i c1 = _mm_subs_epu8( px1, rmin );
  62. __m128i c2 = _mm_subs_epu8( px2, rmin );
  63. __m128i c3 = _mm_subs_epu8( px3, rmin );
  64.  
  65. __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) );
  66. __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) );
  67. __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) );
  68. __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) );
  69.  
  70. __m128i s0 = _mm_hadd_epi16( is0, is1 );
  71. __m128i s1 = _mm_hadd_epi16( is2, is3 );
  72.  
  73. __m128i m0 = _mm_mulhi_epu16( s0, range );
  74. __m128i m1 = _mm_mulhi_epu16( s1, range );
  75.  
  76. __m128i p0 = _mm_packus_epi16( m0, m1 );
  77.  
  78. __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) );
  79. __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 );
  80. __m128i p3 = _mm_or_si128( p1, p2 );
  81. __m128i p =_mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) );
  82.  
  83. uint32_t vmin = _mm_cvtsi128_si32( min );
  84. uint32_t vmax = _mm_cvtsi128_si32( max );
  85. uint32_t vp = _mm_cvtsi128_si32( p );
  86.  
  87. return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
  88. #else
  89. const auto ref = to565( src[0], src[1], src[2] );
  90. auto stmp = src + 4;
  91. for( int i=1; i<16; i++ )
  92. {
  93. if( to565( stmp[0], stmp[1], stmp[2] ) != ref )
  94. {
  95. break;
  96. }
  97. stmp += 4;
  98. }
  99. if( stmp == src + 64 )
  100. {
  101. return uint64_t( ref ) << 16;
  102. }
  103.  
  104. uint8_t min[3] = { src[0], src[1], src[2] };
  105. uint8_t max[3] = { src[0], src[1], src[2] };
  106. auto tmp = src + 4;
  107. for( int i=1; i<16; i++ )
  108. {
  109. for( int j=0; j<3; j++ )
  110. {
  111. if( tmp[j] < min[j] ) min[j] = tmp[j];
  112. else if( tmp[j] > max[j] ) max[j] = tmp[j];
  113. }
  114. tmp += 4;
  115. }
  116.  
  117. const uint32_t range = DivTable[max[0] - min[0] + max[1] - min[1] + max[2] - min[2]];
  118. const uint32_t rmin = min[0] + min[1] + min[2];
  119. for( int i=0; i<3; i++ )
  120. {
  121. const uint8_t inset = ( max[i] - min[i] ) >> 4;
  122. min[i] += inset;
  123. max[i] -= inset;
  124. }
  125.  
  126. uint32_t data = 0;
  127. for( int i=0; i<16; i++ )
  128. {
  129. const uint32_t c = src[0] + src[1] + src[2] - rmin;
  130. const uint8_t idx = ( c * range ) >> 16;
  131. data |= idx << (i*2);
  132. src += 4;
  133. }
  134.  
  135. return uint64_t( ( uint64_t( to565( min[0], min[1], min[2] ) ) << 16 ) | to565( max[0], max[1], max[2] ) | ( uint64_t( data ) << 32 ) );
  136. #endif
  137. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement