Ok se nessun altro lo vuole, avrò un trucco, assumendo avx2 disponibile (non verificato perché avx2 non fa per me). Non ho idea se questo sia effettivamente più veloce di arare attraverso 8 iterazioni di bsf su un registro intero a 64 bit. Ho i miei dubbi. Supponiamo che src_address contenga il bit più significativo. src_address + 511 contiene il minimo.
bsf trova il meno significativo 1 bit.
Possiamo isolare il meno significativo 1 bit con la formula
x & (-1) //Hackers Delight 1st ed p11
avx2 utilizza registri a 256 bit quindi dovremo srotolare per superare tutti i 512 bit.
int BSR512(char* src_address) {
//load the data into 2 registers
__m256i data1 = _mm256_loadu_si256((__m256i*)src_address);
__m256i data2 = _mm256_loadu_si256((__m256i*)(src_address + 256));
//zero a register for negate and comparison
//__m256i zeroes = _mm256_set_epi32(0,0,0,0,0,0,0,0); //NO
__m256i _mm256_setzero_si256();
//negate
__m256i negs1 = _mm256_sub_epi32(zeroes, data1);
__m256i negs2 = _mm256_sub_epi32(zeroes, data2);
// x & (-x)
//these should be _mm256_and_si256 to avoid casts -thanks rwong
__m256i lsbs1 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(data1), _mm256_castsi256_ps(negs1)));
__m256i lsbs2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(data2), _mm256_castsi256_ps(negs2)));
// set int32 in a reg to all 1s if we found something
__m256i mask1 = _mm256_cmpeq_epi32(lsbs1, zeroes);
__m256i mask2 = _mm256_cmpeq_epi32(lsbs2, zeroes);
// which of the packed ints was set to all 1s? put in a normal register
int which1 = _mm256_movemask_ps (_mm256_castsi256_ps(mask1));
// isolate the lowest set bit
which1 = (which1 & -which1);
int which2 = _mm256_movemask_ps (_mm256_castsi256_ps(mask2));
which2 = (which2 & -which2);
// do you have to do a dance here? Does the lowest memory address contain
// the highest bit or the lowest bit - ask your data and change this
if(which1 != 0) {
// extract the 32 bit int with the bit set
const char w = which1;
int contains_set_bit = _mm256_extract_epi32(lsbs1, w);
// add number of bits in lower integers unset to bsf
return which1 * 32 + BSF(contains_set_bit);
} else if (which2 != 0) {
// as above but didn't find in the first loop
const char w = which2;
int contains_set_bit _mm256_extract_epi32(lsbs2, w);
return which2 * 32 + BSF(contains_set_bit) + 256;
}
return 0;
}
Che compilato con clang ++ -std = c ++ 11 -Wall -g -march = core-avx2 -O3
objdump -DC a.out restituisce:
0000000000400a90 <BSF512(char*)>:
400a90: c5 fe 6f 07 vmovdqu (%rdi),%ymm0
400a94: c5 f5 ef c9 vpxor %ymm1,%ymm1,%ymm1
400a98: c5 f5 fa d0 vpsubd %ymm0,%ymm1,%ymm2
400a9c: c5 fd db c2 vpand %ymm2,%ymm0,%ymm0
400aa0: c5 fd 76 d1 vpcmpeqd %ymm1,%ymm0,%ymm2
400aa4: c5 fc 50 c2 vmovmskps %ymm2,%eax
400aa8: c4 e2 70 f3 d8 blsi %eax,%ecx
400aad: 74 26 je 400ad5 <BSF512(char*)+0x45>
400aaf: 89 c8 mov %ecx,%eax
400ab1: 83 e0 07 and $0x7,%eax
400ab4: c5 f9 6e c8 vmovd %eax,%xmm1
400ab8: c4 e2 75 36 c0 vpermd %ymm0,%ymm1,%ymm0
400abd: c5 f9 7e c2 vmovd %xmm0,%edx
400ac1: c1 e1 05 shl $0x5,%ecx
400ac4: f3 0f bc c2 tzcnt %edx,%eax
400ac8: ff c0 inc %eax
400aca: 85 d2 test %edx,%edx
400acc: 0f 44 c2 cmove %edx,%eax
400acf: 01 c8 add %ecx,%eax
400ad1: c5 f8 77 vzeroupper
400ad4: c3 retq
400ad5: c5 fe 6f 87 00 01 00 vmovdqu 0x100(%rdi),%ymm0
400adc: 00
400add: c5 f5 fa d0 vpsubd %ymm0,%ymm1,%ymm2
400ae1: c5 fd db c2 vpand %ymm2,%ymm0,%ymm0
400ae5: c5 fd 76 c9 vpcmpeqd %ymm1,%ymm0,%ymm1
400ae9: c5 fc 50 c9 vmovmskps %ymm1,%ecx
400aed: 31 c0 xor %eax,%eax
400aef: c4 e2 70 f3 d9 blsi %ecx,%ecx
400af4: 74 27 je 400b1d <BSF512(char*)+0x8d>
400af6: 89 c8 mov %ecx,%eax
400af8: 83 e0 07 and $0x7,%eax
400afb: c5 f9 6e c8 vmovd %eax,%xmm1
400aff: c4 e2 75 36 c0 vpermd %ymm0,%ymm1,%ymm0
400b04: c5 f9 7e c0 vmovd %xmm0,%eax
400b08: c1 e1 05 shl $0x5,%ecx
400b0b: f3 0f bc d0 tzcnt %eax,%edx
400b0f: ff c2 inc %edx
400b11: 85 c0 test %eax,%eax
400b13: 0f 44 d0 cmove %eax,%edx
400b16: 8d 84 11 00 01 00 00 lea 0x100(%rcx,%rdx,1),%eax
400b1d: c5 f8 77 vzeroupper
400b20: c3 retq
400b21: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
400b28: 00 00 00
400b2b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)