#ifndef _NPY_SIMD_H_
#define _NPY_SIMD_H_

#include <stdalign.h> /* for alignof until C23 */
/**
 * The NumPy C SIMD vectorization interface "NPYV" provides types and functions
 * intended to simplify the vectorization of code across different platforms.
 * It currently supports the following SIMD extensions: SSE, AVX2, AVX512,
 * VSX and NEON.
 *
 * TODO: Add an independent sphinx doc.
 */
#include "numpy/npy_common.h"
#ifndef __cplusplus
    #include <stdbool.h> /* for bool */
#endif
#include "npy_cpu_dispatch.h"
#include "simd_utils.h"

#ifdef __cplusplus
extern "C" {
#endif
/*
 * clang exhibits an aggressive optimization behaviour, present at -O1 or
 * greater, when the flag `-ftrapping-math` isn't fully supported. When
 * partially loading a vector register for an operation that requires filling
 * the remaining lanes with a certain value (for example, a divide operation
 * needs to fill the remaining lanes with a non-zero integer to avoid the
 * divide-by-zero FP exception), the clang optimizer notices that the entire
 * register is not needed for the store and optimizes out the non-zero fill
 * of the remaining elements. As a workaround we mark the returned register
 * `volatile`, followed by a symmetric operand operation (e.g. `or`), to
 * convince the compiler that the entire vector is needed.
 */
#if defined(__clang__) && !defined(NPY_HAVE_CLANG_FPSTRICT)
    #define NPY_SIMD_GUARD_PARTIAL_LOAD 1
#else
    #define NPY_SIMD_GUARD_PARTIAL_LOAD 0
#endif
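/*
 * A minimal sketch of how this guard is meant to be applied inside a
 * partial-load implementation (an illustration only, not the actual
 * per-extension code; the SSE intrinsics below are assumptions chosen
 * for the example):
 *
 *  __m128i ret = ...;  // partially loaded register, remaining lanes filled
 *  #if NPY_SIMD_GUARD_PARTIAL_LOAD
 *      // `volatile` plus a symmetric `or` convinces clang that the whole
 *      // vector (including the filled lanes) is needed, so the fill of the
 *      // remaining elements isn't optimized out.
 *      volatile __m128i workaround = ret;
 *      ret = _mm_or_si128(workaround, ret);
 *  #endif
 */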
#if defined(_MSC_VER) && defined(_M_IX86)
/*
 * Avoid using any of the following intrinsics with MSVC 32-bit,
 * even if they appear to work on newer versions.
 * They have a bad impact on the generated instructions:
 * sometimes the compiler handles them without respecting 32-bit mode,
 * which leads to crashes due to executing 64-bit instructions, and
 * other times it generates bad emulated instructions.
 */
    #undef _mm512_set1_epi64
    #undef _mm256_set1_epi64x
    #undef _mm_set1_epi64x
    #undef _mm512_setr_epi64x
    #undef _mm256_setr_epi64x
    #undef _mm_setr_epi64x
    #undef _mm512_set_epi64x
    #undef _mm256_set_epi64x
    #undef _mm_set_epi64x
#endif

// lane type by intrin suffix
typedef npy_uint8  npyv_lanetype_u8;
typedef npy_int8   npyv_lanetype_s8;
typedef npy_uint16 npyv_lanetype_u16;
typedef npy_int16  npyv_lanetype_s16;
typedef npy_uint32 npyv_lanetype_u32;
typedef npy_int32  npyv_lanetype_s32;
typedef npy_uint64 npyv_lanetype_u64;
typedef npy_int64  npyv_lanetype_s64;
typedef float      npyv_lanetype_f32;
typedef double     npyv_lanetype_f64;

#if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128)
    #include "avx512/avx512.h"
#elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128)
    #include "avx2/avx2.h"
#elif defined(NPY_HAVE_SSE2)
    #include "sse/sse.h"
#endif

// TODO: Add support for VSX(2.06) and BE Mode for VSX
#if defined(NPY_HAVE_VX) || (defined(NPY_HAVE_VSX2) && defined(__LITTLE_ENDIAN__))
    #include "vec/vec.h"
#endif

#ifdef NPY_HAVE_NEON
    #include "neon/neon.h"
#endif

#ifdef NPY_HAVE_LSX
    #include "lsx/lsx.h"
#endif

#ifndef NPY_SIMD
    /// SIMD width in bits or 0 if there's no SIMD extension available.
    #define NPY_SIMD 0
    /// SIMD width in bytes or 0 if there's no SIMD extension available.
    #define NPY_SIMD_WIDTH 0
    /// 1 if the enabled SIMD extension supports single-precision otherwise 0.
    #define NPY_SIMD_F32 0
    /// 1 if the enabled SIMD extension supports double-precision otherwise 0.
    #define NPY_SIMD_F64 0
    /// 1 if the enabled SIMD extension supports native FMA otherwise 0.
    /// note: fast emulated FMA intrinsics are still provided even when native
    /// FMA isn't supported, but they shouldn't be used if precision matters.
    #define NPY_SIMD_FMA3 0
    /// 1 if the enabled SIMD extension runs in big-endian mode otherwise 0.
    #define NPY_SIMD_BIGENDIAN 0
    /// 1 if the supported comparison intrinsics (lt, le, gt, ge)
    /// raise an FP invalid exception for quiet NaNs.
    #define NPY_SIMD_CMPSIGNAL 0
#endif

// enable emulated mask operations for all SIMD extensions except AVX512
#if !defined(NPY_HAVE_AVX512F) && NPY_SIMD && NPY_SIMD < 512
    #include "emulate_maskop.h"
#endif

// enable the integer divisor generator for all SIMD extensions
#if NPY_SIMD
    #include "intdiv.h"
#endif

/**
 * Some SIMD extensions (currently AVX2 and AVX512F) impose (de facto)
 * a maximum limit on stride sizes when dealing with non-contiguous memory access.
 *
 * Therefore the following functions must be used to check that strides are
 * within the acceptable limit before using any of the non-contiguous
 * load/store intrinsics.
 *
 * For instance:
 *
 *  if (npyv_loadable_stride_f32(steps[0]) && npyv_storable_stride_f32(steps[1])) {
 *      // Strides are now guaranteed to be multiples of the itemsize and within limits.
 *      npy_intp ld_stride = steps[0] / sizeof(float);
 *      npy_intp st_stride = steps[1] / sizeof(float);
 *      for (;;) {
 *          npyv_f32 a = npyv_loadn_f32(ld_pointer, ld_stride);
 *          // ...
 *          npyv_storen_f32(st_pointer, st_stride, a);
 *      }
 *  }
 *  else {
 *      for (;;) {
 *          // fall back to C scalars, using byte steps/strides.
 *      }
 *  }
 */
#ifndef NPY_SIMD_MAXLOAD_STRIDE32
    #define NPY_SIMD_MAXLOAD_STRIDE32 0
#endif
#ifndef NPY_SIMD_MAXSTORE_STRIDE32
    #define NPY_SIMD_MAXSTORE_STRIDE32 0
#endif
#ifndef NPY_SIMD_MAXLOAD_STRIDE64
    #define NPY_SIMD_MAXLOAD_STRIDE64 0
#endif
#ifndef NPY_SIMD_MAXSTORE_STRIDE64
    #define NPY_SIMD_MAXSTORE_STRIDE64 0
#endif

#define NPYV_IMPL_MAXSTRIDE(SFX, MAXLOAD, MAXSTORE)                        \
    NPY_FINLINE int                                                        \
    npyv_loadable_stride_##SFX(npy_intp stride)                            \
    {                                                                      \
        if (alignof(npyv_lanetype_##SFX) != sizeof(npyv_lanetype_##SFX) && \
            stride % sizeof(npyv_lanetype_##SFX) != 0) {                   \
            /* stride not a multiple of the itemsize, cannot handle. */    \
            return 0;                                                      \
        }                                                                  \
        stride = stride / sizeof(npyv_lanetype_##SFX);                     \
        return MAXLOAD > 0 ? llabs(stride) <= MAXLOAD : 1;                 \
    }                                                                      \
    NPY_FINLINE int                                                        \
    npyv_storable_stride_##SFX(npy_intp stride)                            \
    {                                                                      \
        if (alignof(npyv_lanetype_##SFX) != sizeof(npyv_lanetype_##SFX) && \
            stride % sizeof(npyv_lanetype_##SFX) != 0) {                   \
            /* stride not a multiple of the itemsize, cannot handle. */    \
            return 0;                                                      \
        }                                                                  \
        stride = stride / sizeof(npyv_lanetype_##SFX);                     \
        return MAXSTORE > 0 ? llabs(stride) <= MAXSTORE : 1;               \
    }

#if NPY_SIMD
    NPYV_IMPL_MAXSTRIDE(u32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
    NPYV_IMPL_MAXSTRIDE(s32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
    NPYV_IMPL_MAXSTRIDE(f32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
    NPYV_IMPL_MAXSTRIDE(u64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
    NPYV_IMPL_MAXSTRIDE(s64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
#endif
#if NPY_SIMD_F64
    NPYV_IMPL_MAXSTRIDE(f64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
#endif

#ifdef __cplusplus
}
#endif
#endif // _NPY_SIMD_H_
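/*
 * Usage sketch for the integer divisor generator pulled in above via
 * "intdiv.h" (a minimal, hedged illustration; it assumes the
 * npyv_divisor_u32()/npyv_divc_u32() pair provided by the NPYV headers
 * and hypothetical variables `d` and `a`):
 *
 *  #if NPY_SIMD
 *      // precompute the multiplier/shift parameters for a runtime divisor `d`
 *      npyv_u32x3 divisor = npyv_divisor_u32(d);
 *      // divide every lane of `a` by `d` without a hardware division instruction
 *      npyv_u32 quotient = npyv_divc_u32(a, divisor);
 *  #endif
 */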