Portál AbcLinuxu, 13. května 2025 23:52
if (x > y) { z = x - y } else { z = y - x }
#define ABS_DIFF(X,Y) ((X > Y) ? (X - Y) : (Y - X))
Řešení dotazu:
/* Return the absolute value of I. */ int abs (int i) { return i < 0 ? -i : i; }na tom mym srotu (i686) to pri pouziti makra a prelozeni s -O1 vychazi podobne jako volani abs, a mezi pouzitim short a int neni taktez temer rozdil. Dost mozny, ze to brzdi ten short. Prace s typem, kterej nema velikost slova muze bejt asi drazsi.
int abs(int i) { int t = i >> (32 - 1); return (i ^ t) - t; }(a presne to isté generuje gcc pre 32 bitový int)
unsigned short my_abs(int i) { int t = i >> (sizeof(int) * CHAR_BIT - 1); /* patent free */ return (i + t) ^ t; /*return (i ^ t) - t;*/ }Jinak ta varianta s minusem je možná patentově chráněna (viz.) :) Např.:
unsigned short x = 5; unsigned short y = 20; unsigned short z = my_abs(x - y);Přetypování my_abs(x - y) typicky zabere jeden procesorový cyklus. Přetypování int na unsigned short je bez výkonové penalizace.
return i < 0 ? -i : i;Všechny tři metody jsou ve výsledném kódu totožné. Doporučuji nepoužívat short, operace s ním jsou dražší než s intem (resp. longem na 64bitech).
function soucet_abs_hodnot(unsigned short *input1, unsigned short *input2, unsigned int size) { int z = 0, i; for (i = 0; i < size; i++) z+= abs(input1[i] - input2[i]); return z; }Takhle přesně vypadá funkce kterou chci zoptimalizovat.
int abs(int i) {
if (i & 0x80000000) return -i; // hex hodnota se samozřejmě liší podle použitých číselných typů
return i;
}
int sim = 0; int i; for (i = 1; i <= input2[0]; i++) sim+= abs((int) input1[i] - (int) input2[i]); return sim;
real 0m16.898s user 0m56.828s
#define MACRO_DIST(X,Y) ((X < Y) ? (Y - X) : (X - Y)) unsigned short sim = 0; for (i = 1; i <= input2[0]; i++) sim += MACRO_DIST(input1[i],input2[i]); return sim;
real 0m20.070s user 1m18.761sData ve formátu float
float sim = 0; int i; for (i = 1; i <= input2[0]; i++) sim+= fabs(input1[i] - input2[i]); return sim;
real 0m12.351s user 0m33.758s
#include <stdio.h> #include <math.h> // Config. #define USE_SSE2 #define USE_SSSE3 // SSE2. #if defined(USE_SSE2) #include <emmintrin.h> #endif // USE_SSE2 // SSSE3. #if defined(USE_SSSE3) #include <tmmintrin.h> #endif // USE_SSE3 #define ABS_C(_Value_) abs(_Value_) int sum_abs_u16( const unsigned short* input1, const unsigned short* input2, size_t size) { size_t i = size; int z = 0; #if defined(USE_SSE2) if (i >= 20) { // Align. while ((intptr_t)input1 & (intptr_t)0xF) { z += ABS_C((int)input1[0] - (int)input2[0]); if (--i == 0) return z; input1++; input2++; } // Counter. __m128i cn = _mm_setzero_si128(); __m128i zn = _mm_setzero_si128(); // Large loop. while (i >= 16) { __m128i r0, r1; __m128i r2, r3; __m128i r4, r5; // Load input1, aligned. r0 = _mm_load_si128(reinterpret_cast<const __m128i*>(input1 + 0)); r3 = _mm_load_si128(reinterpret_cast<const __m128i*>(input1 + 8)); // Load input2, unaligned. r2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input2 + 0)); r4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input2 + 8)); // Get absolute values. r1 = _mm_subs_epu16(r2, r0); r0 = _mm_subs_epu16(r0, r2); r5 = _mm_subs_epu16(r3, r4); r4 = _mm_subs_epu16(r4, r3); r0 = _mm_add_epi16(r0, r1); r4 = _mm_add_epi16(r4, r5); // Unpack to 32-bit and sum. r1 = _mm_unpackhi_epi16(r0, zn); r5 = _mm_unpackhi_epi16(r4, zn); r0 = _mm_unpacklo_epi16(r0, zn); r4 = _mm_unpacklo_epi16(r4, zn); r0 = _mm_add_epi32(r0, r1); r4 = _mm_add_epi32(r4, r5); cn = _mm_add_epi32(cn, r0); cn = _mm_add_epi32(cn, r4); i -= 16; input1 += 16; input2 += 16; } #if defined(USE_SSSE3) cn = _mm_hadd_epi32(cn, cn); cn = _mm_hadd_epi32(cn, cn); z += _mm_cvtsi128_si32(cn); #else cn = _mm_add_epi32(cn, _mm_shuffle_epi32(cn, _MM_SHUFFLE(2, 3, 0, 1))); cn = _mm_add_epi32(cn, _mm_shuffle_epi32(cn, _MM_SHUFFLE(0, 1, 3, 2))); z += _mm_cvtsi128_si32(cn); #endif } #endif // USE_SSE2 // Small loop. while (i > 0) { z += ABS_C((int)input1[0] - (int)input2[0]); i--; input1++; input2++; } return z; } int main(int argc, char* argv[]) { const unsigned short input1[40] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; const unsigned short input2[40] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0 }; int sum = sum_abs_u16(input1, input2, 40); printf("Sum=%d\n", sum); return 0; }
gcc -O3 -mssse3
), tak vygeneruje jeste o kousek lepsi kod (protoze ma funkci abs).
// Small loop.
while (i > 0)
{
z += ABS_C((int)input1[0] - (int)input2[0]);
i--;
input1++;
input2++;
}
Prelozi nejak takhle:
movl 8(%ebp), %ebx
xorl %edx, %edx
xorl %eax, %eax
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
.p2align 4,,7
.p2align 3
.L7:
movdqu (%ebx,%eax), %xmm2
movdqu (%edi,%eax), %xmm3
movdqa %xmm2, %xmm0
movdqa %xmm3, %xmm1
punpcklwd %xmm5, %xmm0
punpcklwd %xmm5, %xmm1
addl $1, %edx
psubd %xmm1, %xmm0
punpckhwd %xmm5, %xmm2
pabsd %xmm0, %xmm0
punpckhwd %xmm5, %xmm3
paddd %xmm4, %xmm0
psubd %xmm3, %xmm2
addl $16, %eax
cmpl %edx, %ecx
pabsd %xmm2, %xmm4
paddd %xmm0, %xmm4
ja .L7
+ nejaka ta omacko okolo.
Tiskni
Sdílej:
ISSN 1214-1267, (c) 1999-2007 Stickfish s.r.o.