b785ec59d161 — Chris Cannam 9 months ago
Apply actual SLEEF SIMD code for -to-polar
M bqvec/Allocators.h +3 -3
@@ 127,9 127,9 @@ T *allocate(size_t count)
 #else /* !MALLOC_IS_ALIGNED */
 
     // That's the "sufficiently aligned" functions dealt with, the
-    // rest need a specific alignment provided to the call. 32-byte
-    // alignment is required for at least OpenMAX
-    static const int alignment = 32;
+    // rest need a specific alignment provided to the call. 64-byte
+    // alignment is enough for 8x8 double operations
+    static const int alignment = 64;
 
 #ifdef HAVE__ALIGNED_MALLOC
     ptr = _aligned_malloc(count * sizeof(T), alignment);

          
M bqvec/VectorOpsComplex.h +58 -20
@@ 41,6 41,8 @@ 
 
 #include <cmath>
 
+#include <iostream>
+
 namespace breakfastquay {
 
 #ifndef NO_COMPLEX_TYPES

          
@@ 422,16 424,6 @@ inline void c_phasor(T *real, T *imag, T
     } else {
         vvsincos((double *)imag, (double *)real, (const double *)&phase, &one);
     }
-#elif defined HAVE_SLEEF
-    if (sizeof(T) == sizeof(float)) {
-        Sleef_float2 out = Sleef_sincosf_u10(float(phase));
-        *imag = out.x;
-        *real = out.y;
-    } else {
-        Sleef_double2 out = Sleef_sincos_u10(double(phase));
-        *imag = out.x;
-        *real = out.y;
-    }
 #elif defined LACK_SINCOS
     if (sizeof(T) == sizeof(float)) {
         *real = cosf(phase);

          
@@ 464,18 456,8 @@ inline void c_phasor(T *real, T *imag, T
 template<typename T>
 inline void c_magphase(T *mag, T *phase, T real, T imag)
 {
-#if defined HAVE_SLEEF
-    if (sizeof(T) == sizeof(float)) {
-        *mag = Sleef_sqrtf_u05(real * real + imag * imag);
-        *phase = Sleef_atan2f_u10(imag, real);
-    } else {
-        *mag = Sleef_sqrt_u35(real * real + imag * imag);
-        *phase = Sleef_atan2_u35(imag, real);
-    }
-#else
     *mag = sqrt(real * real + imag * imag);
     *phase = atan2(imag, real);
-#endif
 }
 
 #if defined USE_APPROXIMATE_ATAN2

          
@@ 684,6 666,62 @@ inline void v_cartesian_interleaved_to_p
     ippsCartToPolar_64fc((const Ipp64fc *)src, mag, phase, count);
 }
 
+#elif defined HAVE_SLEEF
+
+extern void concrete_v_cartesian_to_polar_f
+(float *const BQ_R__, float *const BQ_R__,
+ const float *const BQ_R__, const float *const BQ_R__, const int);
+
+extern void concrete_v_cartesian_interleaved_to_polar_f
+(float *const BQ_R__, float *const BQ_R__,
+ const float *const BQ_R__, const int);
+
+extern void concrete_v_cartesian_to_polar_d
+(double *const BQ_R__, double *const BQ_R__,
+ const double *const BQ_R__, const double *const BQ_R__, const int);
+
+extern void concrete_v_cartesian_interleaved_to_polar_d
+(double *const BQ_R__, double *const BQ_R__,
+ const double *const BQ_R__, const int);
+
+template<>
+inline void v_cartesian_to_polar(float *const BQ_R__ mag,
+                                 float *const BQ_R__ phase,
+                                 const float *const BQ_R__ real,
+                                 const float *const BQ_R__ imag,
+                                 const int count)
+{
+    concrete_v_cartesian_to_polar_f(mag, phase, real, imag, count);
+}
+
+template<>
+inline void v_cartesian_interleaved_to_polar(float *const BQ_R__ mag,
+                                             float *const BQ_R__ phase,
+                                             const float *const BQ_R__ src,
+                                             const int count)
+{
+    concrete_v_cartesian_interleaved_to_polar_f(mag, phase, src, count);
+}
+
+template<>
+inline void v_cartesian_to_polar(double *const BQ_R__ mag,
+                                 double *const BQ_R__ phase,
+                                 const double *const BQ_R__ real,
+                                 const double *const BQ_R__ imag,
+                                 const int count)
+{
+    concrete_v_cartesian_to_polar_d(mag, phase, real, imag, count);
+}
+
+template<>
+inline void v_cartesian_interleaved_to_polar(double *const BQ_R__ mag,
+                                             double *const BQ_R__ phase,
+                                             const double *const BQ_R__ src,
+                                             const int count)
+{
+    concrete_v_cartesian_interleaved_to_polar_d(mag, phase, src, count);
+}
+
 #elif defined HAVE_VDSP
 
 template<>

          
M build/Makefile.inc +2 -2
@@ 15,9 15,9 @@ TIMINGS_OBJECTS	:= $(TIMINGS_SOURCES:.cp
 TEST_SOURCES	:= $(wildcard $(TEST_DIR)/*.cpp)
 TEST_OBJECTS	:= $(TEST_SOURCES:.cpp=.o)
 
-OPTFLAGS := -O3 -ffast-math
+OPTFLAGS ?= -O3 -ffast-math
 
-CXXFLAGS := -std=c++98 -fpic -Wall -Wextra -Werror $(VECTOR_DEFINES) $(ALLOCATOR_DEFINES) -I. $(THIRD_PARTY_INCLUDES) -I$(HEADER_DIR) $(OPTFLAGS)
+CXXFLAGS ?= -std=c++98 -fpic -Wall -Wextra -Werror $(VECTOR_DEFINES) $(ALLOCATOR_DEFINES) -I. $(THIRD_PARTY_INCLUDES) -I$(HEADER_DIR) $(OPTFLAGS)
 
 LIBRARY	:= libbqvec.a
 

          
M build/Makefile.linux.sleef +2 -0
@@ 6,5 6,7 @@ ALLOCATOR_DEFINES 	:= -DHAVE_POSIX_MEMAL
 THIRD_PARTY_INCLUDES	:= -I/usr/local/include
 THIRD_PARTY_LIBS	:= -L/usr/local/lib -lsleef
 
+OPTFLAGS                := -O3 -ffast-math -flto -march=native
+
 include build/Makefile.inc
 

          
M src/VectorOpsComplex.cpp +166 -2
@@ 54,8 54,6 @@ 
 #include <alloca.h>
 #endif
 
-#include <iostream>
-
 using namespace std;
 
 namespace breakfastquay {

          
@@ 312,4 310,170 @@ v_polar_interleaved_to_cartesian_inplace
 
 #endif // NO_COMPLEX_TYPES
 
+#ifdef HAVE_SLEEF
+
+#if defined(__AVX512F__)
+#define BQ_SIMD_n 512
+#define BQ_SIMD_sp __m512
+#define BQ_SIMD_dp __m512d
+#define BQ_ATAN2_sp Sleef_atan2f16_u10
+#define BQ_ATAN2_dp Sleef_atan2d8_u35
+#elif defined(__AVX__)
+#define BQ_SIMD_n 256
+#define BQ_SIMD_sp __m256
+#define BQ_SIMD_dp __m256d
+#define BQ_ATAN2_sp Sleef_atan2f8_u10
+#define BQ_ATAN2_dp Sleef_atan2d4_u35
+#elif defined(__SSE2__)
+#define BQ_SIMD_n 128
+#define BQ_SIMD_sp __m128
+#define BQ_SIMD_dp __m128d
+#define BQ_ATAN2_sp Sleef_atan2f4_u10
+#define BQ_ATAN2_dp Sleef_atan2d2_u35
+#else
+#define BQ_SIMD_n 0
+#endif
+
+void concrete_v_cartesian_to_polar_f(float *const BQ_R__ mag,
+                                     float *const BQ_R__ phase,
+                                     const float *const BQ_R__ real,
+                                     const float *const BQ_R__ imag,
+                                     const int count)
+{
+    int i = 0;
+    const int bytes = BQ_SIMD_n / 8;
+    const int elements = bytes / sizeof(float);
+    if (!BQ_SIMD_n ||
+        ((uintptr_t)real & (bytes - 1)) ||
+        ((uintptr_t)imag & (bytes - 1))) {
+        // No SIMD or unaligned
+        while (i < count) {
+            c_magphase<float>(mag + i, phase + i, real[i], imag[i]);
+            ++i;
+        }
+        return;
+    }
+    while (i + elements < count) {
+        const BQ_SIMD_sp *rp = (const BQ_SIMD_sp *)(real + i);
+        const BQ_SIMD_sp *ip = (const BQ_SIMD_sp *)(imag + i);
+        BQ_SIMD_sp *pp = (BQ_SIMD_sp *)(phase + i);
+        *pp = BQ_ATAN2_sp(*ip, *rp);
+        i += elements;
+    }
+    while (i < count) {
+        phase[i] = atan2f(imag[i], real[i]);
+        ++i;
+    }
+    for (int i = 0; i < count; ++i) {
+        mag[i] = sqrtf(real[i] * real[i] + imag[i] * imag[i]);
+    }
 }
+
+void concrete_v_cartesian_interleaved_to_polar_f(float *const BQ_R__ mag,
+                                                 float *const BQ_R__ phase,
+                                                 const float *const BQ_R__ src,
+                                                 const int count)
+{
+    int i = 0;
+    const int bytes = BQ_SIMD_n / 8;
+    const int elements = bytes / sizeof(float);
+    if (!BQ_SIMD_n) {
+        while (i < count) {
+            c_magphase<float>(mag + i, phase + i, src[i*2], src[i*2+1]);
+            ++i;
+        }
+        return;
+    }
+    while (i + elements < count) {
+        BQ_SIMD_sp rp;
+        BQ_SIMD_sp ip;
+        for (int j = 0; j < elements; ++j) {
+            ((float *)&rp)[j] = src[(i+j) * 2];
+            ((float *)&ip)[j] = src[(i+j) * 2 + 1];
+        }
+        BQ_SIMD_sp *pp = (BQ_SIMD_sp *)(phase + i);
+        *pp = BQ_ATAN2_sp(ip, rp);
+        i += elements;
+    }
+    while (i < count) {
+        phase[i] = atan2f(src[i*2+1], src[i*2]);
+        ++i;
+    }
+    for (int i = 0; i < count; ++i) {
+        mag[i] = sqrtf(src[i*2] * src[i*2] + src[i*2+1] * src[i*2+1]);
+    }
+}
+
+void concrete_v_cartesian_to_polar_d(double *const BQ_R__ mag,
+                                     double *const BQ_R__ phase,
+                                     const double *const BQ_R__ real,
+                                     const double *const BQ_R__ imag,
+                                     const int count)
+{
+    int i = 0;
+    const int bytes = BQ_SIMD_n / 8;
+    const int elements = bytes / sizeof(double);
+    if (!BQ_SIMD_n ||
+        ((uintptr_t)real & (bytes - 1)) ||
+        ((uintptr_t)imag & (bytes - 1))) {
+        // No SIMD or unaligned
+        while (i < count) {
+            c_magphase<double>(mag + i, phase + i, real[i], imag[i]);
+            ++i;
+        }
+        return;
+    }
+    while (i + elements < count) {
+        const BQ_SIMD_dp *rp = (const BQ_SIMD_dp *)(real + i);
+        const BQ_SIMD_dp *ip = (const BQ_SIMD_dp *)(imag + i);
+        BQ_SIMD_dp *pp = (BQ_SIMD_dp *)(phase + i);
+        *pp = BQ_ATAN2_dp(*ip, *rp);
+        i += elements;
+    }
+    while (i < count) {
+        phase[i] = atan2(imag[i], real[i]);
+        ++i;
+    }
+    for (int i = 0; i < count; ++i) {
+        mag[i] = sqrt(real[i] * real[i] + imag[i] * imag[i]);
+    }
+}
+
+void concrete_v_cartesian_interleaved_to_polar_d(double *const BQ_R__ mag,
+                                                 double *const BQ_R__ phase,
+                                                 const double *const BQ_R__ src,
+                                                 const int count)
+{
+    int i = 0;
+    const int bytes = BQ_SIMD_n / 8;
+    const int elements = bytes / sizeof(double);
+    if (!BQ_SIMD_n) {
+        while (i < count) {
+            c_magphase<double>(mag + i, phase + i, src[i*2], src[i*2+1]);
+            ++i;
+        }
+        return;
+    }
+    while (i + elements < count) {
+        BQ_SIMD_dp rp;
+        BQ_SIMD_dp ip;
+        for (int j = 0; j < elements; ++j) {
+            ((double *)&rp)[j] = src[(i+j) * 2];
+            ((double *)&ip)[j] = src[(i+j) * 2 + 1];
+        }
+        BQ_SIMD_dp *pp = (BQ_SIMD_dp *)(phase + i);
+        *pp = BQ_ATAN2_dp(ip, rp);
+        i += elements;
+    }
+    while (i < count) {
+        phase[i] = atan2(src[i*2+1], src[i*2]);
+        ++i;
+    }
+    for (int i = 0; i < count; ++i) {
+        mag[i] = sqrt(src[i*2] * src[i*2] + src[i*2+1] * src[i*2+1]);
+    }
+}
+
+#endif
+
+}

          
M test/Timings.cpp +70 -2
@@ 225,9 225,9 @@ testPolarToCartInterleaved()
 }
 
 bool
-testCartToPolar()
+testCartToPolarComplexTypes()
 {
-    cerr << "testVectorOps: testing v_cartesian_to_polar" << endl;
+    cerr << "testVectorOps: testing v_cartesian_to_polar [complex types]" << endl;
 
     const int iterations = 50000;
     const int N = 1024;

          
@@ 289,12 289,80 @@ testCartToPolar()
     return true;
 }
 
+bool
+testCartToPolar()
+{
+    cerr << "testVectorOps: testing v_cartesian_to_polar" << endl;
+
+    const int iterations = 50000;
+    const int N = 1024;
+
+    bq_complex_element_t *real = allocate<bq_complex_element_t>(N);
+    bq_complex_element_t *imag = allocate<bq_complex_element_t>(N);
+    bq_complex_element_t *mag = allocate<bq_complex_element_t>(N);
+    bq_complex_element_t *phase = allocate<bq_complex_element_t>(N);
+
+    for (int i = 0; i < N; ++i) {
+        real[i] = (drand48() * 2.0) - 1.0;
+        imag[i] = (drand48() * 2.0) - 1.0;
+    }
+
+    float divisor = float(CLOCKS_PER_SEC) / 1000.f;
+    clock_t start = clock();
+
+    double first, last, total = 0;
+    for (int j = 0; j < iterations; ++j) {
+        for (int i = 0; i < N; ++i) {
+            double mag = sqrt(real[i] * real[i] + imag[i] * imag[i]);
+            double phase = atan2(imag[i], real[i]);
+            if (i == 0) first = mag;
+            if (i == N-1) last = phase;
+            total += mag;
+            total += j * phase;
+        }
+    }
+    
+    clock_t end = clock();
+
+    cerr << "naive method: first = " << first << ", last = " << last
+         << ", total = " << total << endl;
+    cerr << "time for naive method: " << float(end - start)/divisor << endl;
+
+    start = clock();
+
+    first = last = total = 0;
+    for (int j = 0; j < iterations; ++j) {
+        v_cartesian_to_polar(mag, phase, real, imag, N);
+        for (int i = 0; i < N; ++i) {
+            if (i == 0) first = mag[i];
+            if (i == N-1) last = phase[i];
+            total += mag[i];
+            total += j * phase[i];
+        }
+    }
+
+    end = clock();
+
+    cerr << "v_cartesian_to_polar: first = " << first << ", last = " << last
+         << ", total = " << total << endl;
+    cerr << "time for v_cartesian_to_polar: " << float(end - start)/divisor << endl;
+
+    deallocate(real);
+    deallocate(imag);
+    deallocate(mag);
+    deallocate(phase);
+    
+    cerr << endl;
+    return true;
+}
+
 int main(int, char **)
 {
     cerr << endl;
     if (!testMultiply()) return 1;
     if (!testPolarToCart()) return 1;
     if (!testPolarToCartInterleaved()) return 1;
+    if (!testCartToPolarComplexTypes()) return 1;
     if (!testCartToPolar()) return 1;
     return 0;
 }