Advertisement
Bisqwit

tblend2.cc

Jun 9th, 2020
1,308
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 6.16 KB | None | 0 0
  1. #include <string>
  2. #include <cstdio>
  3. #include <cstring>
  4. #include <thread>
  5. #include <fcntl.h>
  6.  
  7. /* Compile this using Clang. GCC gets about 6.0 fps, Clang gets slightly more */
  8.  
  9. void YUV444pBlend(const unsigned char* __restrict__ frame1,
  10.                   const unsigned char* __restrict__ frame2,
  11.                   const unsigned char* __restrict__ frame3,
  12.                   const unsigned char* __restrict__ frame4,
  13.                   std::size_t num_bytes,
  14.                   unsigned char* __restrict__ target)
  15. {
  16. #if 0
  17.  
  18.     typedef unsigned long unit;
  19.     const unit* source1 = (const unit*)__builtin_assume_aligned(frame1,32);
  20.     const unit* source2 = (const unit*)__builtin_assume_aligned(frame2,32);
  21.     const unit* source3 = (const unit*)__builtin_assume_aligned(frame3,32);
  22.     const unit* source4 = (const unit*)__builtin_assume_aligned(frame4,32);
  23.     unit* destination = (unit*)__builtin_assume_aligned(target,32);
  24.  
  25.     #pragma omp parallel for simd schedule(static) num_threads(4)
  26.     for(std::size_t p=0; p<num_bytes/sizeof(unit); ++p)
  27.     {
  28.         unit word1 = ((const unit*)__builtin_assume_aligned(source1,32))[p];
  29.         unit word2 = ((const unit*)__builtin_assume_aligned(source2,32))[p];
  30.         unit word3 = ((const unit*)__builtin_assume_aligned(source3,32))[p];
  31.         unit word4 = ((const unit*)__builtin_assume_aligned(source4,32))[p];
  32.         const unit mask = (~(unit)(0)) / 0x1111; // 000F000F...
  33.  
  34.         unit lo1 = word1 & mask, mid1 = (word1>>4) & mask, next1 = (word1>>8) & mask, hi1 = (word1>>12) & mask;
  35.         unit lo2 = word2 & mask, mid2 = (word2>>4) & mask, next2 = (word2>>8) & mask, hi2 = (word2>>12) & mask;
  36.         unit lo3 = word3 & mask, mid3 = (word3>>4) & mask, next3 = (word3>>8) & mask, hi3 = (word3>>12) & mask;
  37.         unit lo4 = word4 & mask, mid4 = (word4>>4) & mask, next4 = (word4>>8) & mask, hi4 = (word4>>12) & mask;
  38.         unit losum   = lo1*23u   + (lo2  +lo3)*54u   + lo4*125u;   // 0#xx
  39.         unit midsum  = mid1*23u  + (mid2 +mid3)*54u  + mid4*125u;  // 0#xx
  40.         unit nextsum = next1*23u + (next2+next3)*54u + next4*125u; // 0#xx
  41.         unit hisum   = hi1*23u   + (hi2  +hi3)*54u   + hi4*125u;   // 0#xx
  42.  
  43.         unit result  = ((losum   & (mask<<8)) >> 8)
  44.                      | ((midsum  & (mask<<8)) >> 4)
  45.                      | ((nextsum & (mask<<8))     )
  46.                      | ((hisum   & (mask<<8)) << 4);
  47.         ((unit*)__builtin_assume_aligned(destination,32))[p] = result;
  48.     }
  49.  
  50.     /* (a*0.3 + b*0.7)*0.3 + (c*0.3 + d*0.7)*0.7
  51.      = a*0.09 + b*0.21 + c*0.21 + d*0.49
  52.      =
  53.      = Approximate these factors, 0.09, 0.21, 0.49 using 23, 54, 125
  54.      */
  55. #else
  56.  
  57.     #pragma omp simd //parallel for simd num_threads(2)
  58.     for(std::size_t p=0; p<num_bytes; ++p)
  59.     {
  60.         unsigned char byte1 = frame1[p], byte2 = frame2[p], byte3 = frame3[p], byte4 = frame4[p];
  61.         unsigned char lo1 = byte1,    lo2 = byte2,    lo3 = byte3,    lo4 = byte4;
  62.         unsigned char hi1 = byte1&0xF0, hi2 = byte2&0xF0, hi3 = byte3&0xF0, hi4 = byte4&0xF0;
  63.         unsigned lo = lo1*23u + (lo2+lo3)*54u + lo4*125u; // 0000x#xx
  64.         unsigned hi = hi1*23u + (hi2+hi3)*54u + hi4*125u; // 000x#xxx
  65.         target[p] = ((hi >> 8)&0xF) + (lo>>8);
  66.     }
  67.  
  68. #endif
  69. }
  70.  
/* Size of one frame in bytes. main() assigns it once (pixel count * 3)
   before any thread is started; the worker lambdas in main() only read it. */
static std::size_t num_bytes;
  72.  
  73. static void pipe_resize(FILE* fp)
  74. {
  75. /*
  76.     sudo sysctl fs.pipe-user-pages-soft=0
  77.     sudo sysctl fs.pipe-max-size=$[1048576*512]
  78.     sudo setcap 'CAP_SYS_RESOURCE=+ep' tblend2
  79.     sudo setcap 'CAP_SYS_RESOURCE=+ep' tblend
  80. */
  81.     int prev_err=0;
  82.     for(unsigned power=41; power>10; --power)
  83.     {
  84.         int r = 0;
  85.         for(int tries=0; tries<4000; ++tries)
  86.         {
  87.             r = fcntl(fileno(fp), F_SETPIPE_SZ, 1ul<<power);
  88.             if(r >= 0) break;
  89.         }
  90.         if(r >= 0)
  91.         {
  92.             std::fprintf(stderr, "Pipe size successfully set to %lu (r=%d)\n", 1ul<<power, r);
  93.             break;
  94.         }
  95.         else
  96.         {
  97.             if(errno != prev_err)
  98.             {
  99.                 std::fprintf(stderr, "Failed to set pipe size to %lu; ", 1ul<<power);
  100.                 std::perror("fcntl");
  101.                 prev_err=errno;
  102.             }
  103.         }
  104.     }
  105.     int s = fcntl(fileno(fp), F_GETPIPE_SZ);
  106.     if(s > 0)
  107.         std::fprintf(stderr, "Pipe size is %d bytes\n", s);
  108.     else
  109.         std::perror("fcntl");
  110. }
  111.  
  112. int main(int argc, char** argv)
  113. {
  114.     std::size_t num_pixels = std::stoi(argv[1]) * std::stoi(argv[2]);
  115.     num_bytes = num_pixels * 3;
  116.  
  117.     std::size_t interval = num_bytes*4;
  118.  
  119.     unsigned char* buffer = new unsigned char[interval];
  120.     unsigned char* outbuf = new unsigned char[num_bytes];
  121.     unsigned char* buffer1 = new unsigned char[interval];
  122.     unsigned char* outbuf1 = new unsigned char[num_bytes];
  123.  
  124.     std::thread writer, processor;
  125.  
  126.     pipe_resize(stdin);
  127.     pipe_resize(stdout);
  128.  
  129.     for(;;)
  130.     {
  131.         std::swap(buffer,buffer1);
  132.  
  133.         std::size_t p = 0;
  134.         while(p < interval)
  135.         {
  136.             std::size_t eat = interval - p;
  137.             int r = std::fread(buffer+p, 1, eat, stdin);
  138.             if(r <= 0) break;
  139.             p += r;
  140.         }
  141.         if(!p) break;
  142.         if(p < interval)
  143.         {
  144.             std::memset(buffer+p, 0, interval-p);
  145.         }
  146.         if(processor.joinable()) processor.join();
  147.  
  148.         processor = std::thread([buffer,&outbuf,&outbuf1,&writer]()
  149.         {
  150.             std::swap(outbuf,outbuf1);
  151.             YUV444pBlend(buffer ,
  152.                          buffer  + num_bytes,
  153.                          buffer  + num_bytes*2,
  154.                          buffer  + num_bytes*3,
  155.                          num_bytes,
  156.                          outbuf);
  157.             if(writer.joinable()) writer.join();
  158.  
  159.             writer = std::thread([outbuf]()
  160.             {
  161.                 std::size_t p = 0;
  162.                 while(p < num_bytes)
  163.                 {
  164.                     std::size_t eat = num_bytes - p;
  165.                     int r = std::fwrite(outbuf + p, 1, eat, stdout);
  166.                     if(r <= 0) break;
  167.                     p += r;
  168.                 }
  169.             });
  170.         });
  171.     }
  172.  
  173.     if(processor.joinable()) processor.join();
  174.     if(writer.joinable()) writer.join();
  175. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement