Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /* unoptimized
- double sumDBIndicies(double* input1, IndiciesInput indicies)
- {
- double accumulate = 0;
- for(unsigned int i = 0; i < indicies.nSingleIndicies; i++)
- {
- accumulate += input1[indicies.singleIndicies[i]];
- }
- for(unsigned int i = 0; i < indicies.nRanges; i++)
- {
- for(double* address = indicies.rangeStartPositions[i] + input1; address < indicies.rangeStartPositions[i] + input1 + indicies.rangeLengths[i]; address++)
- {
- accumulate += *address;
- }
- }
- return accumulate;
- }
- */
- double sumDBIndicies(double* input1, IndiciesInput indicies)
- {
- double accumulate = 0;
- for(unsigned int i = 0; i < indicies.nSingleIndicies; i++)
- {
- accumulate += input1[indicies.singleIndicies[i]];
- }
- for(unsigned int i = 0; i < indicies.nRanges; i++)
- {
- unsigned int masked = indicies.rangeLengths[i] & ~0xF;
- double* address = indicies.rangeStartPositions[i] + input1;
- for(; address + 15 < indicies.rangeStartPositions[i] + input1 + masked; address+=16)
- {
- __m128d result1 = _mm_add_pd(_mm_load_pd(address), _mm_load_pd(address+2));
- __m128d result2 = _mm_add_pd(_mm_load_pd(address+4), _mm_load_pd(address+6));
- __m128d result3 = _mm_add_pd(_mm_load_pd(address+8), _mm_load_pd(address+10));
- __m128d result4 = _mm_add_pd(_mm_load_pd(address+12), _mm_load_pd(address+14));
- //now aggregate those
- __m128d result5 = _mm_add_pd(result1, result2);
- __m128d result6 = _mm_add_pd(result3, result4);
- __m128d result7 = _mm_add_pd(result5, result6);
- double output[8];
- double* aligned = (double*)((size_t)(output + 4) & ~0xF);
- _mm_store_pd(aligned, result7);
- accumulate += aligned[0] + aligned[1];
- }
- switch(indicies.rangeLengths[i] % 16)
- {
- case 15: accumulate += address[14];
- case 14: accumulate += address[13];
- case 13: accumulate += address[12];
- case 12: accumulate += address[11];
- case 11: accumulate += address[10];
- case 10: accumulate += address[9];
- case 9: accumulate += address[8];
- case 8: accumulate += address[7];
- case 7: accumulate += address[6];
- case 6: accumulate += address[5];
- case 5: accumulate += address[4];
- case 4: accumulate += address[3];
- case 3: accumulate += address[2];
- case 2: accumulate += address[1];
- case 1: accumulate += address[0];
- case 0: break;
- }
- }
- return accumulate;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement