Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- char a[16]={1,0,0,1 ,0,0,1,0, 0,1,0,0, 0,0,0,1}; /* sample data: 16 bytes, summed below as four consecutive groups of four */
- char sum1 = a[0] + a[1] + a[2] + a[3]; /* bytes 0-3:   1+0+0+1 = 2 */
- char sum2 = a[4] + a[5] + a[6] + a[7]; /* bytes 4-7:   0+0+1+0 = 1 */
- char sum3 = a[8] + a[9] + a[10] + a[11]; /* bytes 8-11:  0+1+0+0 = 1 */
- char sum4 = a[12] + a[13] + a[14] + a[15]; /* bytes 12-15: 0+0+0+1 = 1 */
/*
 * Combine two byte vectors group by group, where a group is 4 consecutive
 * bytes.  Each group of vec3 receives the bytewise AND of the matching groups
 * of vec1 and vec2.  When the four AND bytes of a group sum to zero (the sum
 * is held in a char), the group is overwritten with the bytewise OR instead
 * and the group's counter is incremented.
 *
 * vec1, vec2  inputs, n bytes each.  Read with aligned 128-bit loads, so both
 *             must be 16-byte aligned.
 * vec3        output, n bytes, 16-byte aligned (aligned 128-bit stores).
 * counts      one counter per 4-byte group (n / 4 entries).  Counters are only
 *             incremented, never reset; initial values are up to the caller.
 * n           number of bytes.  Data is consumed in whole 16-byte chunks, so a
 *             partial trailing chunk would be read and written in full; n is
 *             expected to be a multiple of 16.
 */
void myfunc( const char *vec1, const char *vec2, char *vec3, int *counts, int n){
    const __m128i *r1 = (const __m128i*)vec1;
    const __m128i *r2 = (const __m128i*)vec2;
    char *a = vec3;
    char temp[16] __attribute__ ((aligned (16)));
    /* Index of the current group in counts.  The original used k without
     * declaring it; starting at 0 is the assumed intent. */
    int k = 0;

    for ( int i = 0; i < n; i+=16, r1++, r2++, a+=16 ) {
        /* Load each input chunk once, before anything is stored. */
        __m128i x = _mm_load_si128(r1);
        __m128i y = _mm_load_si128(r2);

        _mm_store_si128((__m128i*)a, _mm_and_si128(x, y));
        _mm_store_si128((__m128i*)temp, _mm_or_si128(x, y));

        /* Fix up the four groups of this chunk one after another. */
        for ( int g = 0; g < 16; g += 4, k++ ) {
            char size = a[g]+a[g+1]+a[g+2]+a[g+3];
            if( size == 0 ){
                memcpy(a+g, temp+g, 4*sizeof(char));
                counts[k]++;
            }
        }
    }
}
- for ( int i = 0; i < n; i+=16, r1++, r2++, a+=16, k+=4 ) {
- _mm_store_si128((__m128i*)a, _mm_and_si128(*r1, *r2));
- _mm_store_si128((__m128i*)temp, _mm_or_si128(*r1, *r2));
- __m128i a4 = _mm_load_si128((__m128i*)a);
- __m128i tmp4 = _mm_load_si128((__m128i*)tmp);
- __m128i counts4 = _mm_load_si128((__m128i*)&counts[k]);
- __m128i test = _mm_cmpeq_epi32(_mm_set1_epi32(0), a4);
- a4 = _mm_add_epi32(a4, _mm_and_si128(tmp4,test));
- counts4 = _mm_sub_epi32(counts4, test);
- _mm_store_si128((__m128i*)a, a4);
- _mm_store_si128((__m128i*)counts, counts4);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement