/**
 * Force the use of SSE only on x86, even if AVX2 or AVX512F are enabled
 * through the baseline, since scatter (AVX512F) and gather are very costly
 * for handling non-contiguous memory access compared with SSE for
 * the small operations that this file covers.
 */
#define NPY_SIMD_FORCE_128
#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
/**********************************************************
 ** Scalars
 **********************************************************/
#if !NPY_SIMD_F32
NPY_FINLINE float c_recip_f32(float a)
{ return 1.0f / a; }
NPY_FINLINE float c_abs_f32(float a)
{
    const float tmp = a > 0 ? a : -a;
    /* add 0 to clear -0.0 */
    return tmp + 0;
}
NPY_FINLINE float c_square_f32(float a)
{ return a * a; }
#endif // !NPY_SIMD_F32

#if !NPY_SIMD_F64
NPY_FINLINE double c_recip_f64(double a)
{ return 1.0 / a; }
NPY_FINLINE double c_abs_f64(double a)
{
    const double tmp = a > 0 ? a : -a;
    /* add 0 to clear -0.0 */
    return tmp + 0;
}
NPY_FINLINE double c_square_f64(double a)
{ return a * a; }
#endif // !NPY_SIMD_F64

/**
 * MSVC (32-bit mode) requires an explicitly contiguous loop
 * in order to use SSE, otherwise it falls back to a software square root
 * that doesn't raise a domain error.
 */
#if defined(_MSC_VER) && defined(_M_IX86) && !NPY_SIMD
    #include <emmintrin.h>
    NPY_FINLINE float c_sqrt_f32(float _a)
    {
        __m128 a = _mm_load_ss(&_a);
        __m128 lower = _mm_sqrt_ss(a);
        return _mm_cvtss_f32(lower);
    }
    NPY_FINLINE double c_sqrt_f64(double _a)
    {
        __m128d a = _mm_load_sd(&_a);
        __m128d lower = _mm_sqrt_pd(a);
        return _mm_cvtsd_f64(lower);
    }
#else
    #define c_sqrt_f32 npy_sqrtf
    #define c_sqrt_f64 npy_sqrt
#endif

#define c_ceil_f32 npy_ceilf
#define c_ceil_f64 npy_ceil
#define c_trunc_f32 npy_truncf
#define c_trunc_f64 npy_trunc
#define c_floor_f32 npy_floorf
#define c_floor_f64 npy_floor
#define c_rint_f32 npy_rintf
#define c_rint_f64 npy_rint

/********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
/** Notes:
 * - Avoid the use of libmath to unify fp/domain errors
 *   for both scalars and vectors among all compilers/architectures.
 * - Use the intrinsic npyv_load_till_* instead of npyv_load_tillz_*
 *   to fill the remaining lanes with 1.0, which avoids a divide-by-zero
 *   fp exception in reciprocal.
 */
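/*
 * Illustrative example (not used by the kernels below, assumes 4 f32 lanes):
 * for a tail of len == 2, npyv_load_till_f32(src, 2, 1) yields
 * {src[0], src[1], 1.0f, 1.0f}, so the reciprocal of the unused lanes is
 * 1.0f/1.0f and no divide-by-zero fp exception is raised, whereas
 * npyv_load_tillz_f32(src, 2) would fill those lanes with 0.0f.
 */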
#define CONTIG  0
#define NCONTIG 1

/**begin repeat
 * #TYPE = FLOAT, DOUBLE#
 * #sfx  = f32, f64#
 * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
 */
#if @VCHK@
/**begin repeat1
 * #kind     = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#
 * #intr     = rint, floor, ceil, trunc, sqrt, abs, square, recip#
 * #repl_0w1 = 0*7, 1#
 */
/**begin repeat2
 * #STYPE  = CONTIG, NCONTIG, CONTIG,  NCONTIG#
 * #DTYPE  = CONTIG, CONTIG,  NCONTIG, NCONTIG#
 * #unroll = 4,      4,       2,       2#
 */
static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_@sfx@ *src = _src;
          npyv_lanetype_@sfx@ *dst = _dst;

    const int vstep = npyv_nlanes_@sfx@;
    const int wstep = vstep * @unroll@;

    // unrolled iterations
    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
        /**begin repeat3
         * #N = 0, 1, 2, 3#
         */
        #if @unroll@ > @N@
        #if @STYPE@ == CONTIG
            npyv_@sfx@ v_src@N@ = npyv_load_@sfx@(src + vstep*@N@);
        #else
            npyv_@sfx@ v_src@N@ = npyv_loadn_@sfx@(src + ssrc*vstep*@N@, ssrc);
        #endif
            npyv_@sfx@ v_unary@N@ = npyv_@intr@_@sfx@(v_src@N@);
        #endif
        /**end repeat3**/

        /**begin repeat3
         * #N = 0, 1, 2, 3#
         */
        #if @unroll@ > @N@
        #if @DTYPE@ == CONTIG
            npyv_store_@sfx@(dst + vstep*@N@, v_unary@N@);
        #else
            npyv_storen_@sfx@(dst + sdst*vstep*@N@, sdst, v_unary@N@);
        #endif
        #endif
        /**end repeat3**/
    }

    // vector-sized iterations
    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
    #if @STYPE@ == CONTIG
        npyv_@sfx@ v_src0 = npyv_load_@sfx@(src);
    #else
        npyv_@sfx@ v_src0 = npyv_loadn_@sfx@(src, ssrc);
    #endif
        npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
    #if @DTYPE@ == CONTIG
        npyv_store_@sfx@(dst, v_unary0);
    #else
        npyv_storen_@sfx@(dst, sdst, v_unary0);
    #endif
    }

    // last partial iteration, if needed
    if (len > 0) {
    #if @STYPE@ == CONTIG
        #if @repl_0w1@
        npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1);
        #else
        npyv_@sfx@ v_src0 = npyv_load_tillz_@sfx@(src, len);
        #endif
    #else
        #if @repl_0w1@
        npyv_@sfx@ v_src0 = npyv_loadn_till_@sfx@(src, ssrc, len, 1);
        #else
        npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len);
        #endif
    #endif
        npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
    #if @DTYPE@ == CONTIG
        npyv_store_till_@sfx@(dst, len, v_unary0);
    #else
        npyv_storen_till_@sfx@(dst, sdst, len, v_unary0);
    #endif
    }

    npyv_cleanup();
}
/**end repeat2**/
/**end repeat1**/
#endif // @VCHK@
/**end repeat**/
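/*
 * Illustrative example of the expansion above (assuming the FLOAT/sqrt case):
 * the combination STYPE=CONTIG, DTYPE=NCONTIG, unroll=2 generates
 *     static void simd_FLOAT_sqrt_CONTIG_NCONTIG
 *     (const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
 * which loads the source contiguously via npyv_load_f32 and stores the result
 * with a stride via npyv_storen_f32(dst, sdst, v), two vectors per unrolled step.
 */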
/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
/**begin repeat
 * #TYPE = FLOAT, DOUBLE#
 * #sfx  = f32, f64#
 * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
 */
/**begin repeat1
 * #kind  = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#
 * #intr  = rint, floor, ceil, trunc, sqrt, abs, square, recip#
 * #clear = 0,    0,     0,    0,     0,    1,   0,      0#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    const char *src = args[0]; char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp len = dimensions[0];
#if @VCHK@
    const int lsize = sizeof(npyv_lanetype_@sfx@);
    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
        goto no_unroll;
    }
    if (!npyv_loadable_stride_@sfx@(src_step) || !npyv_storable_stride_@sfx@(dst_step)) {
        goto no_unroll;
    }
    const npy_intp ssrc = src_step / lsize;
    const npy_intp sdst = dst_step / lsize;
    if (ssrc == 1 && sdst == 1) {
        simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 1, dst, 1, len);
    }
    else if (sdst == 1) {
        simd_@TYPE@_@kind@_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
    }
    else if (ssrc == 1) {
        simd_@TYPE@_@kind@_CONTIG_NCONTIG(src, 1, dst, sdst, len);
    }
    else {
        simd_@TYPE@_@kind@_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
    }
    goto clear;
no_unroll:
#endif // @VCHK@
    for (; len > 0; --len, src += src_step, dst += dst_step) {
    #if @VCHK@
        // to guarantee the same precision and fp/domain errors for both scalars and vectors
        simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 0, dst, 0, 1);
    #else
        const npyv_lanetype_@sfx@ src0 = *(npyv_lanetype_@sfx@*)src;
        *(npyv_lanetype_@sfx@*)dst = c_@intr@_@sfx@(src0);
    #endif
    }
#if @VCHK@
clear:;
#endif
#if @clear@
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}
/**end repeat1**/
/**end repeat**/
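/*
 * Illustrative example of the dispatch above (assuming FLOAT_sqrt, lsize == 4):
 * src_step == 4 and dst_step == 8 give ssrc == 1 and sdst == 2, so
 * simd_FLOAT_sqrt_CONTIG_NCONTIG(src, 1, dst, 2, len) is selected, while
 * overlapping buffers or non-loadable/storable strides take the per-element
 * no_unroll path.
 */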