M bqvec/Allocators.h +3 -3
@@ 127,9 127,9 @@ T *allocate(size_t count)
#else /* !MALLOC_IS_ALIGNED */
// That's the "sufficiently aligned" functions dealt with, the
- // rest need a specific alignment provided to the call. 32-byte
- // alignment is required for at least OpenMAX
- static const int alignment = 32;
+ // rest need a specific alignment provided to the call. 64-byte
+ // alignment is enough for 8x8 double operations
+ static const int alignment = 64;
#ifdef HAVE__ALIGNED_MALLOC
ptr = _aligned_malloc(count * sizeof(T), alignment);
M bqvec/VectorOpsComplex.h +58 -20
@@ 41,6 41,8 @@
#include <cmath>
+#include <iostream>
+
namespace breakfastquay {
#ifndef NO_COMPLEX_TYPES
@@ 422,16 424,6 @@ inline void c_phasor(T *real, T *imag, T
} else {
vvsincos((double *)imag, (double *)real, (const double *)&phase, &one);
}
-#elif defined HAVE_SLEEF
- if (sizeof(T) == sizeof(float)) {
- Sleef_float2 out = Sleef_sincosf_u10(float(phase));
- *imag = out.x;
- *real = out.y;
- } else {
- Sleef_double2 out = Sleef_sincos_u10(double(phase));
- *imag = out.x;
- *real = out.y;
- }
#elif defined LACK_SINCOS
if (sizeof(T) == sizeof(float)) {
*real = cosf(phase);
@@ 464,18 456,8 @@ inline void c_phasor(T *real, T *imag, T
template<typename T>
inline void c_magphase(T *mag, T *phase, T real, T imag)
{
-#if defined HAVE_SLEEF
- if (sizeof(T) == sizeof(float)) {
- *mag = Sleef_sqrtf_u05(real * real + imag * imag);
- *phase = Sleef_atan2f_u10(imag, real);
- } else {
- *mag = Sleef_sqrt_u35(real * real + imag * imag);
- *phase = Sleef_atan2_u35(imag, real);
- }
-#else
*mag = sqrt(real * real + imag * imag);
*phase = atan2(imag, real);
-#endif
}
#if defined USE_APPROXIMATE_ATAN2
@@ 684,6 666,62 @@ inline void v_cartesian_interleaved_to_p
ippsCartToPolar_64fc((const Ipp64fc *)src, mag, phase, count);
}
+#elif defined HAVE_SLEEF
+
+extern void concrete_v_cartesian_to_polar_f
+(float *const BQ_R__, float *const BQ_R__,
+ const float *const BQ_R__, const float *const BQ_R__, const int);
+
+extern void concrete_v_cartesian_interleaved_to_polar_f
+(float *const BQ_R__, float *const BQ_R__,
+ const float *const BQ_R__, const int);
+
+extern void concrete_v_cartesian_to_polar_d
+(double *const BQ_R__, double *const BQ_R__,
+ const double *const BQ_R__, const double *const BQ_R__, const int);
+
+extern void concrete_v_cartesian_interleaved_to_polar_d
+(double *const BQ_R__, double *const BQ_R__,
+ const double *const BQ_R__, const int);
+
+template<>
+inline void v_cartesian_to_polar(float *const BQ_R__ mag,
+ float *const BQ_R__ phase,
+ const float *const BQ_R__ real,
+ const float *const BQ_R__ imag,
+ const int count)
+{
+ concrete_v_cartesian_to_polar_f(mag, phase, real, imag, count);
+}
+
+template<>
+inline void v_cartesian_interleaved_to_polar(float *const BQ_R__ mag,
+ float *const BQ_R__ phase,
+ const float *const BQ_R__ src,
+ const int count)
+{
+ concrete_v_cartesian_interleaved_to_polar_f(mag, phase, src, count);
+}
+
+template<>
+inline void v_cartesian_to_polar(double *const BQ_R__ mag,
+ double *const BQ_R__ phase,
+ const double *const BQ_R__ real,
+ const double *const BQ_R__ imag,
+ const int count)
+{
+ concrete_v_cartesian_to_polar_d(mag, phase, real, imag, count);
+}
+
+template<>
+inline void v_cartesian_interleaved_to_polar(double *const BQ_R__ mag,
+ double *const BQ_R__ phase,
+ const double *const BQ_R__ src,
+ const int count)
+{
+ concrete_v_cartesian_interleaved_to_polar_d(mag, phase, src, count);
+}
+
#elif defined HAVE_VDSP
template<>
M build/Makefile.inc +2 -2
@@ 15,9 15,9 @@ TIMINGS_OBJECTS := $(TIMINGS_SOURCES:.cp
TEST_SOURCES := $(wildcard $(TEST_DIR)/*.cpp)
TEST_OBJECTS := $(TEST_SOURCES:.cpp=.o)
-OPTFLAGS := -O3 -ffast-math
+OPTFLAGS ?= -O3 -ffast-math
-CXXFLAGS := -std=c++98 -fpic -Wall -Wextra -Werror $(VECTOR_DEFINES) $(ALLOCATOR_DEFINES) -I. $(THIRD_PARTY_INCLUDES) -I$(HEADER_DIR) $(OPTFLAGS)
+CXXFLAGS ?= -std=c++98 -fpic -Wall -Wextra -Werror $(VECTOR_DEFINES) $(ALLOCATOR_DEFINES) -I. $(THIRD_PARTY_INCLUDES) -I$(HEADER_DIR) $(OPTFLAGS)
LIBRARY := libbqvec.a
M build/Makefile.linux.sleef +2 -0
@@ 6,5 6,7 @@ ALLOCATOR_DEFINES := -DHAVE_POSIX_MEMAL
THIRD_PARTY_INCLUDES := -I/usr/local/include
THIRD_PARTY_LIBS := -L/usr/local/lib -lsleef
+OPTFLAGS := -O3 -ffast-math -flto -march=native
+
include build/Makefile.inc
M src/VectorOpsComplex.cpp +166 -2
@@ 54,8 54,6 @@
#include <alloca.h>
#endif
-#include <iostream>
-
using namespace std;
namespace breakfastquay {
@@ 312,4 310,170 @@ v_polar_interleaved_to_cartesian_inplace
#endif // NO_COMPLEX_TYPES
+#ifdef HAVE_SLEEF
+
+#if defined(__AVX512F__)
+#define BQ_SIMD_n 512
+#define BQ_SIMD_sp __m512
+#define BQ_SIMD_dp __m512d
+#define BQ_ATAN2_sp Sleef_atan2f16_u10
+#define BQ_ATAN2_dp Sleef_atan2d8_u35
+#elif defined(__AVX__)
+#define BQ_SIMD_n 256
+#define BQ_SIMD_sp __m256
+#define BQ_SIMD_dp __m256d
+#define BQ_ATAN2_sp Sleef_atan2f8_u10
+#define BQ_ATAN2_dp Sleef_atan2d4_u35
+#elif defined(__SSE2__)
+#define BQ_SIMD_n 128
+#define BQ_SIMD_sp __m128
+#define BQ_SIMD_dp __m128d
+#define BQ_ATAN2_sp Sleef_atan2f4_u10
+#define BQ_ATAN2_dp Sleef_atan2d2_u35
+#else
+#define BQ_SIMD_n 0
+#endif
+
+void concrete_v_cartesian_to_polar_f(float *const BQ_R__ mag,
+ float *const BQ_R__ phase,
+ const float *const BQ_R__ real,
+ const float *const BQ_R__ imag,
+ const int count)
+{
+ int i = 0;
+ const int bytes = BQ_SIMD_n / 8;
+ const int elements = bytes / sizeof(float);
+ if (!BQ_SIMD_n ||
+ ((uintptr_t)real & (bytes - 1)) ||
+ ((uintptr_t)imag & (bytes - 1))) {
+ // No SIMD or unaligned
+ while (i < count) {
+ c_magphase<float>(mag + i, phase + i, real[i], imag[i]);
+ ++i;
+ }
+ return;
+ }
+ while (i + elements < count) {
+ const BQ_SIMD_sp *rp = (const BQ_SIMD_sp *)(real + i);
+ const BQ_SIMD_sp *ip = (const BQ_SIMD_sp *)(imag + i);
+ BQ_SIMD_sp *pp = (BQ_SIMD_sp *)(phase + i);
+ *pp = BQ_ATAN2_sp(*ip, *rp);
+ i += elements;
+ }
+ while (i < count) {
+ phase[i] = atan2f(imag[i], real[i]);
+ ++i;
+ }
+ for (int i = 0; i < count; ++i) {
+ mag[i] = sqrtf(real[i] * real[i] + imag[i] * imag[i]);
+ }
}
+
+void concrete_v_cartesian_interleaved_to_polar_f(float *const BQ_R__ mag,
+ float *const BQ_R__ phase,
+ const float *const BQ_R__ src,
+ const int count)
+{
+ int i = 0;
+ const int bytes = BQ_SIMD_n / 8;
+ const int elements = bytes / sizeof(float);
+ if (!BQ_SIMD_n) {
+ while (i < count) {
+ c_magphase<float>(mag + i, phase + i, src[i*2], src[i*2+1]);
+ ++i;
+ }
+ return;
+ }
+ while (i + elements < count) {
+ BQ_SIMD_sp rp;
+ BQ_SIMD_sp ip;
+ for (int j = 0; j < elements; ++j) {
+ ((float *)&rp)[j] = src[(i+j) * 2];
+ ((float *)&ip)[j] = src[(i+j) * 2 + 1];
+ }
+ BQ_SIMD_sp *pp = (BQ_SIMD_sp *)(phase + i);
+ *pp = BQ_ATAN2_sp(ip, rp);
+ i += elements;
+ }
+ while (i < count) {
+ phase[i] = atan2f(src[i*2+1], src[i*2]);
+ ++i;
+ }
+ for (int i = 0; i < count; ++i) {
+ mag[i] = sqrtf(src[i*2] * src[i*2] + src[i*2+1] * src[i*2+1]);
+ }
+}
+
+void concrete_v_cartesian_to_polar_d(double *const BQ_R__ mag,
+ double *const BQ_R__ phase,
+ const double *const BQ_R__ real,
+ const double *const BQ_R__ imag,
+ const int count)
+{
+ int i = 0;
+ const int bytes = BQ_SIMD_n / 8;
+ const int elements = bytes / sizeof(double);
+ if (!BQ_SIMD_n ||
+ ((uintptr_t)real & (bytes - 1)) ||
+ ((uintptr_t)imag & (bytes - 1))) {
+ // No SIMD or unaligned
+ while (i < count) {
+ c_magphase<double>(mag + i, phase + i, real[i], imag[i]);
+ ++i;
+ }
+ return;
+ }
+ while (i + elements < count) {
+ const BQ_SIMD_dp *rp = (const BQ_SIMD_dp *)(real + i);
+ const BQ_SIMD_dp *ip = (const BQ_SIMD_dp *)(imag + i);
+ BQ_SIMD_dp *pp = (BQ_SIMD_dp *)(phase + i);
+ *pp = BQ_ATAN2_dp(*ip, *rp);
+ i += elements;
+ }
+ while (i < count) {
+ phase[i] = atan2(imag[i], real[i]);
+ ++i;
+ }
+ for (int i = 0; i < count; ++i) {
+ mag[i] = sqrt(real[i] * real[i] + imag[i] * imag[i]);
+ }
+}
+
+void concrete_v_cartesian_interleaved_to_polar_d(double *const BQ_R__ mag,
+ double *const BQ_R__ phase,
+ const double *const BQ_R__ src,
+ const int count)
+{
+ int i = 0;
+ const int bytes = BQ_SIMD_n / 8;
+ const int elements = bytes / sizeof(double);
+ if (!BQ_SIMD_n) {
+ while (i < count) {
+ c_magphase<double>(mag + i, phase + i, src[i*2], src[i*2+1]);
+ ++i;
+ }
+ return;
+ }
+ while (i + elements < count) {
+ BQ_SIMD_dp rp;
+ BQ_SIMD_dp ip;
+ for (int j = 0; j < elements; ++j) {
+ ((double *)&rp)[j] = src[(i+j) * 2];
+ ((double *)&ip)[j] = src[(i+j) * 2 + 1];
+ }
+ BQ_SIMD_dp *pp = (BQ_SIMD_dp *)(phase + i);
+ *pp = BQ_ATAN2_dp(ip, rp);
+ i += elements;
+ }
+ while (i < count) {
+ phase[i] = atan2(src[i*2+1], src[i*2]);
+ ++i;
+ }
+ for (int i = 0; i < count; ++i) {
+ mag[i] = sqrt(src[i*2] * src[i*2] + src[i*2+1] * src[i*2+1]);
+ }
+}
+
+#endif
+
+}
M test/Timings.cpp +70 -2
@@ 225,9 225,9 @@ testPolarToCartInterleaved()
}
bool
-testCartToPolar()
+testCartToPolarComplexTypes()
{
- cerr << "testVectorOps: testing v_cartesian_to_polar" << endl;
+ cerr << "testVectorOps: testing v_cartesian_to_polar [complex types]" << endl;
const int iterations = 50000;
const int N = 1024;
@@ 289,12 289,80 @@ testCartToPolar()
return true;
}
+bool
+testCartToPolar()
+{
+ cerr << "testVectorOps: testing v_cartesian_to_polar" << endl;
+
+ const int iterations = 50000;
+ const int N = 1024;
+
+ bq_complex_element_t *real = allocate<bq_complex_element_t>(N);
+ bq_complex_element_t *imag = allocate<bq_complex_element_t>(N);
+ bq_complex_element_t *mag = allocate<bq_complex_element_t>(N);
+ bq_complex_element_t *phase = allocate<bq_complex_element_t>(N);
+
+ for (int i = 0; i < N; ++i) {
+ real[i] = (drand48() * 2.0) - 1.0;
+ imag[i] = (drand48() * 2.0) - 1.0;
+ }
+
+ float divisor = float(CLOCKS_PER_SEC) / 1000.f;
+ clock_t start = clock();
+
+ double first, last, total = 0;
+ for (int j = 0; j < iterations; ++j) {
+ for (int i = 0; i < N; ++i) {
+ double mag = sqrt(real[i] * real[i] + imag[i] * imag[i]);
+ double phase = atan2(imag[i], real[i]);
+ if (i == 0) first = mag;
+ if (i == N-1) last = phase;
+ total += mag;
+ total += j * phase;
+ }
+ }
+
+ clock_t end = clock();
+
+ cerr << "naive method: first = " << first << ", last = " << last
+ << ", total = " << total << endl;
+ cerr << "time for naive method: " << float(end - start)/divisor << endl;
+
+ start = clock();
+
+ first = last = total = 0;
+ for (int j = 0; j < iterations; ++j) {
+ v_cartesian_to_polar(mag, phase, real, imag, N);
+ for (int i = 0; i < N; ++i) {
+ if (i == 0) first = mag[i];
+ if (i == N-1) last = phase[i];
+ total += mag[i];
+ total += j * phase[i];
+ }
+ }
+
+ end = clock();
+
+ cerr << "v_cartesian_to_polar: first = " << first << ", last = " << last
+ << ", total = " << total << endl;
+ cerr << "time for v_cartesian_to_polar: " << float(end - start)/divisor << endl;
+
+ deallocate(real);
+ deallocate(imag);
+ deallocate(mag);
+ deallocate(phase);
+
+ cerr << endl;
+ return true;
+}
+
int main(int, char **)
{
cerr << endl;
if (!testMultiply()) return 1;
if (!testPolarToCart()) return 1;
if (!testPolarToCartInterleaved()) return 1;
+ if (!testCartToPolarComplexTypes()) return 1;
if (!testCartToPolar()) return 1;
return 0;
}