# HG changeset patch
# User Chris Cannam
# Date 1660293568 -3600
#      Fri Aug 12 09:39:28 2022 +0100
# Branch sleef
# Node ID 3155878fa6e6cb0cabbc13e615e37edace46a8dd
# Parent  759bf59be4523eeaf2cbb50d7e4b92e9ef00b59b
Wedge in some more SLEEF functions - this is most inelegantly done

diff --git a/bqvec/VectorOpsComplex.h b/bqvec/VectorOpsComplex.h
--- a/bqvec/VectorOpsComplex.h
+++ b/bqvec/VectorOpsComplex.h
@@ -550,6 +550,62 @@
     ippsPolarToCart_64fc(mag, phase, (Ipp64fc *)dst, count);
 }
 
+#elif defined HAVE_SLEEF
+
+extern void concrete_v_polar_to_cartesian_f
+(float *const BQ_R__, float *const BQ_R__,
+ const float *const BQ_R__, const float *const BQ_R__, const int);
+
+extern void concrete_v_polar_to_cartesian_interleaved_f
+(float *const BQ_R__, const float *const BQ_R__,
+ const float *const BQ_R__, const int);
+
+extern void concrete_v_polar_to_cartesian_d
+(double *const BQ_R__, double *const BQ_R__,
+ const double *const BQ_R__, const double *const BQ_R__, const int);
+
+extern void concrete_v_polar_to_cartesian_interleaved_d
+(double *const BQ_R__, const double *const BQ_R__,
+ const double *const BQ_R__, const int);
+
+template<>
+inline void v_polar_to_cartesian(float *const BQ_R__ real,
+                                 float *const BQ_R__ imag,
+                                 const float *const BQ_R__ mag,
+                                 const float *const BQ_R__ phase,
+                                 const int count)
+{
+    concrete_v_polar_to_cartesian_f(real, imag, mag, phase, count);
+}
+
+template<>
+inline void v_polar_to_cartesian(double *const BQ_R__ real,
+                                 double *const BQ_R__ imag,
+                                 const double *const BQ_R__ mag,
+                                 const double *const BQ_R__ phase,
+                                 const int count)
+{
+    concrete_v_polar_to_cartesian_d(real, imag, mag, phase, count);
+}
+
+template<>
+inline void v_polar_to_cartesian_interleaved(float *const BQ_R__ dst,
+                                             const float *const BQ_R__ mag,
+                                             const float *const BQ_R__ phase,
+                                             const int count)
+{
+    concrete_v_polar_to_cartesian_interleaved_f(dst, mag, phase, count);
+}
+
+template<>
+inline void v_polar_to_cartesian_interleaved(double *const BQ_R__ dst,
+                                             const double *const BQ_R__ mag,
+                                             const double *const BQ_R__ phase,
+                                             const int count)
+{
+    concrete_v_polar_to_cartesian_interleaved_d(dst, mag, phase, count);
+}
+
 #elif defined USE_POMMIER_MATHFUN
 
 void v_polar_to_cartesian_pommier(float *const BQ_R__ real,

diff --git a/src/VectorOpsComplex.cpp b/src/VectorOpsComplex.cpp
--- a/src/VectorOpsComplex.cpp
+++ b/src/VectorOpsComplex.cpp
@@ -290,24 +290,180 @@
 #define BQ_SIMD_n 512
 #define BQ_SIMD_sp __m512
 #define BQ_SIMD_dp __m512d
+#define BQ_SIMD_sp_2 Sleef___m512_2
+#define BQ_SIMD_dp_2 Sleef___m512d_2
 #define BQ_ATAN2_sp Sleef_atan2f16_u10
 #define BQ_ATAN2_dp Sleef_atan2d8_u35
+#define BQ_SINCOS_sp Sleef_sincosf16_u10
+#define BQ_SINCOS_dp Sleef_sincosd8_u10
 #elif defined(__AVX__)
 #define BQ_SIMD_n 256
 #define BQ_SIMD_sp __m256
 #define BQ_SIMD_dp __m256d
+#define BQ_SIMD_sp_2 Sleef___m256_2
+#define BQ_SIMD_dp_2 Sleef___m256d_2
 #define BQ_ATAN2_sp Sleef_atan2f8_u10
 #define BQ_ATAN2_dp Sleef_atan2d4_u35
+#define BQ_SINCOS_sp Sleef_sincosf8_u10
+#define BQ_SINCOS_dp Sleef_sincosd4_u10
 #elif defined(__SSE2__)
 #define BQ_SIMD_n 128
 #define BQ_SIMD_sp __m128
 #define BQ_SIMD_dp __m128d
+#define BQ_SIMD_sp_2 Sleef___m128_2
+#define BQ_SIMD_dp_2 Sleef___m128d_2
 #define BQ_ATAN2_sp Sleef_atan2f4_u10
 #define BQ_ATAN2_dp Sleef_atan2d2_u35
+#define BQ_SINCOS_sp Sleef_sincosf4_u10
+#define BQ_SINCOS_dp Sleef_sincosd2_u10
 #else
 #define BQ_SIMD_n 0
 #endif
 
+void concrete_v_polar_to_cartesian_f(float *const BQ_R__ real,
+                                     float *const BQ_R__ imag,
+                                     const float *const BQ_R__ mag,
+                                     const float *const BQ_R__ phase,
+                                     const int count)
+{
+    int i = 0;
+    const int bytes = BQ_SIMD_n / 8;
+    const int elements = bytes / sizeof(float);
+    if (!BQ_SIMD_n ||
+        // alignment check on phase, the pointer loaded as a SIMD vector
+        ((uintptr_t)phase & (bytes - 1))) {
+        // No SIMD or unaligned
+        while (i < count) {
+            c_phasor(real + i, imag + i, phase[i]);
+            real[i] *= mag[i];
+            imag[i] *= mag[i];
+            ++i;
+        }
+        return;
+    }
+    while (i + elements < count) {
+        const BQ_SIMD_sp *pp = (const BQ_SIMD_sp *)(phase + i);
+        BQ_SIMD_sp_2 sc = BQ_SINCOS_sp(*pp);
+        for (int j = 0; j < elements; ++j) {
+            real[i+j] = mag[i+j] * ((float *)&(sc.y))[j];
+            imag[i+j] = mag[i+j] * ((float *)&(sc.x))[j];
+        }
+        i += elements;
+    }
+    while (i < count) {
+        c_phasor(real + i, imag + i, phase[i]);
+        real[i] *= mag[i];
+        imag[i] *= mag[i];
+        ++i;
+    }
+}
+
+void concrete_v_polar_to_cartesian_d(double *const BQ_R__ real,
+                                     double *const BQ_R__ imag,
+                                     const double *const BQ_R__ mag,
+                                     const double *const BQ_R__ phase,
+                                     const int count)
+{
+    int i = 0;
+    const int bytes = BQ_SIMD_n / 8;
+    const int elements = bytes / sizeof(double);
+    if (!BQ_SIMD_n || ((uintptr_t)phase & (bytes - 1))) {
+        // No SIMD or unaligned
+        while (i < count) {
+            c_phasor(real + i, imag + i, phase[i]);
+            real[i] *= mag[i];
+            imag[i] *= mag[i];
+            ++i;
+        }
+        return;
+    }
+    while (i + elements < count) {
+        const BQ_SIMD_dp *pp = (const BQ_SIMD_dp *)(phase + i);
+        BQ_SIMD_dp_2 sc = BQ_SINCOS_dp(*pp);
+        for (int j = 0; j < elements; ++j) {
+            real[i+j] = mag[i+j] * ((double *)&(sc.y))[j];
+            imag[i+j] = mag[i+j] * ((double *)&(sc.x))[j];
+        }
+        i += elements;
+    }
+    while (i < count) {
+        c_phasor(real + i, imag + i, phase[i]);
+        real[i] *= mag[i];
+        imag[i] *= mag[i];
+        ++i;
+    }
+}
+
+void concrete_v_polar_to_cartesian_interleaved_f(float *const BQ_R__ dst,
+                                                 const float *const BQ_R__ mag,
+                                                 const float *const BQ_R__ phase,
+                                                 const int count)
+{
+    int i = 0;
+    const int bytes = BQ_SIMD_n / 8;
+    const int elements = bytes / sizeof(float);
+    if (!BQ_SIMD_n || ((uintptr_t)phase & (bytes - 1))) {
+        // No SIMD or unaligned
+        while (i < count) {
+            c_phasor(dst + i*2, dst + i*2 + 1, phase[i]);
+            dst[i*2] *= mag[i];
+            dst[i*2 + 1] *= mag[i];
+            ++i;
+        }
+        return;
+    }
+    while (i + elements < count) {
+        const BQ_SIMD_sp *pp = (const BQ_SIMD_sp *)(phase + i);
+        BQ_SIMD_sp_2 sc = BQ_SINCOS_sp(*pp);
+        for (int j = 0; j < elements; ++j) {
+            dst[(i+j) * 2] = mag[i+j] * ((float *)&(sc.y))[j];
+            dst[(i+j) * 2 + 1] = mag[i+j] * ((float *)&(sc.x))[j];
+        }
+        i += elements;
+    }
+    while (i < count) {
+        c_phasor(dst + i*2, dst + i*2 + 1, phase[i]);
+        dst[i*2] *= mag[i];
+        dst[i*2 + 1] *= mag[i];
+        ++i;
+    }
+}
+
+void concrete_v_polar_to_cartesian_interleaved_d(double *const BQ_R__ dst,
+                                                 const double *const BQ_R__ mag,
+                                                 const double *const BQ_R__ phase,
+                                                 const int count)
+{
+    int i = 0;
+    const int bytes = BQ_SIMD_n / 8;
+    const int elements = bytes / sizeof(double);
+    if (!BQ_SIMD_n || ((uintptr_t)phase & (bytes - 1))) {
+        // No SIMD or unaligned
+        while (i < count) {
+            c_phasor(dst + i*2, dst + i*2 + 1, phase[i]);
+            dst[i*2] *= mag[i];
+            dst[i*2 + 1] *= mag[i];
+            ++i;
+        }
+        return;
+    }
+    while (i + elements < count) {
+        const BQ_SIMD_dp *pp = (const BQ_SIMD_dp *)(phase + i);
+        BQ_SIMD_dp_2 sc = BQ_SINCOS_dp(*pp);
+        for (int j = 0; j < elements; ++j) {
+            dst[(i+j) * 2] = mag[i+j] * ((double *)&(sc.y))[j];
+            dst[(i+j) * 2 + 1] = mag[i+j] * ((double *)&(sc.x))[j];
+        }
+        i += elements;
+    }
+    while (i < count) {
+        c_phasor(dst + i*2, dst + i*2 + 1, phase[i]);
+        dst[i*2] *= mag[i];
+        dst[i*2 + 1] *= mag[i];
+        ++i;
+    }
+}
+
 void concrete_v_cartesian_to_polar_f(float *const BQ_R__ mag,
                                      float *const BQ_R__ phase,
                                      const float *const BQ_R__ real,
@@ -322,7 +478,7 @@
         ((uintptr_t)imag & (bytes - 1))) {
         // No SIMD or unaligned
         while (i < count) {
-            c_magphase(mag + i, phase + i, real[i], imag[i]);
+            c_magphase(mag + i, phase + i, real[i], imag[i]);
             ++i;
         }
         return;
@@ -353,7 +509,7 @@
     const int elements = bytes / sizeof(float);
     if (!BQ_SIMD_n) {
         while (i < count) {
-            c_magphase(mag + i, phase + i, src[i*2], src[i*2+1]);
+            c_magphase(mag + i, phase + i, src[i*2], src[i*2+1]);
             ++i;
         }
         return;
@@ -392,7 +548,7 @@
         ((uintptr_t)imag & (bytes - 1))) {
         // No SIMD or unaligned
         while (i < count) {
-            c_magphase(mag + i, phase + i, real[i], imag[i]);
+            c_magphase(mag + i, phase + i, real[i], imag[i]);
             ++i;
         }
         return;
@@ -423,7 +579,7 @@
     const int elements = bytes / sizeof(double);
     if (!BQ_SIMD_n) {
         while (i < count) {
-            c_magphase(mag + i, phase + i, src[i*2], src[i*2+1]);
+            c_magphase(mag + i, phase + i, src[i*2], src[i*2+1]);
             ++i;
         }
         return;
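
For reference, the vectorised path added in src/VectorOpsComplex.cpp follows one pattern throughout: load a whole vector of phase values, call the SLEEF sincos for the active SIMD width, then scale the resulting sine and cosine lanes by the magnitudes, with a scalar loop handling unaligned input and the tail. Below is a minimal standalone sketch of that pattern, assuming an AVX build with libsleef installed; the function name is illustrative and is not part of bqvec. Note that SLEEF's Sleef_sincosf8_u10 returns a Sleef___m256_2 whose x member holds the sines and y the cosines, which is why the patch multiplies sc.y into the real parts and sc.x into the imaginary parts.

#include <sleef.h>        // Sleef_sincosf8_u10, Sleef___m256_2
#include <immintrin.h>    // __m256
#include <cstdint>
#include <cmath>

// Illustrative AVX-specific equivalent of the dispatch the patch does
// via the BQ_SIMD_* / BQ_SINCOS_* macros.
void polar_to_cartesian_avx(float *real, float *imag,
                            const float *mag, const float *phase,
                            int count)
{
    int i = 0;
    const int elements = 8;                  // 256 bits / 32-bit float
    if (((uintptr_t)phase & 31) == 0) {      // phase is loaded as a vector
        while (i + elements <= count) {
            __m256 p = *(const __m256 *)(phase + i);
            Sleef___m256_2 sc = Sleef_sincosf8_u10(p); // x = sin, y = cos
            for (int j = 0; j < elements; ++j) {
                real[i+j] = mag[i+j] * ((const float *)&sc.y)[j];
                imag[i+j] = mag[i+j] * ((const float *)&sc.x)[j];
            }
            i += elements;
        }
    }
    for (; i < count; ++i) {                 // scalar tail and unaligned fallback
        real[i] = mag[i] * std::cos(phase[i]);
        imag[i] = mag[i] * std::sin(phase[i]);
    }
}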
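On the header side, the new template specializations mean that existing callers pick up the SLEEF-backed path automatically whenever HAVE_SLEEF is defined; no call sites change. A hypothetical caller might look like this, assuming bqvec's usual breakfastquay namespace (buffer names and contents invented for illustration):

#include <bqvec/VectorOpsComplex.h>  // built with HAVE_SLEEF defined
#include <vector>

void example(int n)
{
    std::vector<float> mag(n, 1.0f), phase(n, 0.5f), re(n), im(n);
    // Resolves to the float specialization above, which forwards to
    // concrete_v_polar_to_cartesian_f; buffers that happen to be
    // unaligned simply take the scalar fallback loop.
    breakfastquay::v_polar_to_cartesian(re.data(), im.data(),
                                        mag.data(), phase.data(), n);
}

Routing through the extern "concrete" functions keeps sleef.h and the SIMD intrinsics out of the public header, at the cost of the per-type duplication the commit message apologises for.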