#ifndef NPY_SIMD #error "Not a standalone header" #endif #include #ifndef _NPY_SIMD_LSX_MISC_H #define _NPY_SIMD_LSX_MISC_H // vector with zero lanes #define npyv_zero_u8() __lsx_vldi(0) #define npyv_zero_s8() __lsx_vldi(0) #define npyv_zero_u16() __lsx_vldi(0) #define npyv_zero_s16() __lsx_vldi(0) #define npyv_zero_u32() __lsx_vldi(0) #define npyv_zero_s32() __lsx_vldi(0) #define npyv_zero_u64() __lsx_vldi(0) #define npyv_zero_s64() __lsx_vldi(0) #define npyv_zero_f32() (__m128)__lsx_vldi(0) #define npyv_zero_f64() (__m128d)__lsx_vldi(0) // vector with a specific value set to all lanes #define npyv_setall_u8(VAL) __lsx_vreplgr2vr_b((unsigned char)(VAL)) #define npyv_setall_s8(VAL) __lsx_vreplgr2vr_b((signed char)(VAL)) #define npyv_setall_u16(VAL) __lsx_vreplgr2vr_h((unsigned short)(VAL)) #define npyv_setall_s16(VAL) __lsx_vreplgr2vr_h((signed short)(VAL)) #define npyv_setall_u32(VAL) __lsx_vreplgr2vr_w((unsigned int)(VAL)) #define npyv_setall_s32(VAL) __lsx_vreplgr2vr_w((signed int)(VAL)) #define npyv_setall_u64(VAL) __lsx_vreplgr2vr_d((unsigned long long)(VAL)) #define npyv_setall_s64(VAL) __lsx_vreplgr2vr_d((long long)(VAL)) #define npyv_setall_f32(VAL) (__m128)(v4f32){VAL, VAL, VAL, VAL} #define npyv_setall_f64(VAL) (__m128d)(v2f64){VAL, VAL} /** * vector with specific values set to each lane and * set a specific value to all remained lanes * * Args that generated by NPYV__SET_FILL_* not going to expand if * _mm_setr_* are defined as macros. */ NPY_FINLINE __m128i npyv__set_u8( npy_uint8 i0, npy_uint8 i1, npy_uint8 i2, npy_uint8 i3, npy_uint8 i4, npy_uint8 i5, npy_uint8 i6, npy_uint8 i7, npy_uint8 i8, npy_uint8 i9, npy_uint8 i10, npy_uint8 i11, npy_uint8 i12, npy_uint8 i13, npy_uint8 i14, npy_uint8 i15) { v16u8 vec = {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15}; return (__m128i)vec; } NPY_FINLINE __m128i npyv__set_s8( npy_int8 i0, npy_int8 i1, npy_int8 i2, npy_int8 i3, npy_int8 i4, npy_int8 i5, npy_int8 i6, npy_int8 i7, npy_int8 i8, npy_int8 i9, npy_int8 i10, npy_int8 i11, npy_int8 i12, npy_int8 i13, npy_int8 i14, npy_int8 i15) { v16i8 vec = {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15}; return (__m128i)vec; } NPY_FINLINE __m128i npyv__set_u16(npy_uint16 i0, npy_uint16 i1, npy_uint16 i2, npy_uint16 i3, npy_uint16 i4, npy_uint16 i5, npy_uint16 i6, npy_uint16 i7) { v8u16 vec = {i0, i1, i2, i3, i4, i5, i6, i7}; return (__m128i)vec; } NPY_FINLINE __m128i npyv__set_s16(npy_int16 i0, npy_int16 i1, npy_int16 i2, npy_int16 i3, npy_int16 i4, npy_int16 i5, npy_int16 i6, npy_int16 i7) { v8i16 vec = {i0, i1, i2, i3, i4, i5, i6, i7}; return (__m128i)vec; } NPY_FINLINE __m128i npyv__set_u32(npy_uint32 i0, npy_uint32 i1, npy_uint32 i2, npy_uint32 i3) { v4u32 vec = {i0, i1, i2, i3}; return (__m128i)vec; } NPY_FINLINE __m128i npyv__set_s32(npy_int32 i0, npy_int32 i1, npy_int32 i2, npy_int32 i3) { v4i32 vec = {i0, i1, i2, i3}; return (__m128i)vec; } NPY_FINLINE __m128i npyv__set_u64(npy_uint64 i0, npy_uint64 i1) { v2u64 vec = {i0, i1}; return (__m128i)vec; } NPY_FINLINE __m128i npyv__set_s64(npy_int64 i0, npy_int64 i1) { v2i64 vec = {i0, i1}; return (__m128i)vec; } NPY_FINLINE __m128 npyv__set_f32(float i0, float i1, float i2, float i3) { __m128 vec = {i0, i1, i2, i3}; return vec; } NPY_FINLINE __m128d npyv__set_f64(double i0, double i1) { __m128d vec = {i0, i1}; return vec; } #define npyv_setf_u8(FILL, ...) npyv__set_u8(NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)) #define npyv_setf_s8(FILL, ...) npyv__set_s8(NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)) #define npyv_setf_u16(FILL, ...) npyv__set_u16(NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)) #define npyv_setf_s16(FILL, ...) npyv__set_s16(NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)) #define npyv_setf_u32(FILL, ...) npyv__set_u32(NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)) #define npyv_setf_s32(FILL, ...) npyv__set_s32(NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)) #define npyv_setf_u64(FILL, ...) npyv__set_u64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)) #define npyv_setf_s64(FILL, ...) npyv__set_s64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)) #define npyv_setf_f32(FILL, ...) npyv__set_f32(NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)) #define npyv_setf_f64(FILL, ...) npyv__set_f64(NPYV__SET_FILL_2(double, FILL, __VA_ARGS__)) // vector with specific values set to each lane and // set zero to all remained lanes #define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__) #define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__) #define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__) #define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__) #define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__) #define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__) #define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__) #define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__) #define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__) #define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__) // Per lane select NPY_FINLINE __m128i npyv_select_u8(__m128i mask, __m128i a, __m128i b) { return __lsx_vbitsel_v(b, a, mask); } NPY_FINLINE __m128 npyv_select_f32(__m128i mask, __m128 a, __m128 b) { return (__m128)__lsx_vbitsel_v((__m128i)b, (__m128i)a, mask); } NPY_FINLINE __m128d npyv_select_f64(__m128i mask, __m128d a, __m128d b) { return (__m128d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, mask); } #define npyv_select_s8 npyv_select_u8 #define npyv_select_u16 npyv_select_u8 #define npyv_select_s16 npyv_select_u8 #define npyv_select_u32 npyv_select_u8 #define npyv_select_s32 npyv_select_u8 #define npyv_select_u64 npyv_select_u8 #define npyv_select_s64 npyv_select_u8 // extract the first vector's lane #define npyv_extract0_u8(A) ((npy_uint8)__lsx_vpickve2gr_bu(A, 0)) #define npyv_extract0_s8(A) ((npy_int8)__lsx_vpickve2gr_b(A, 0)) #define npyv_extract0_u16(A) ((npy_uint16)__lsx_vpickve2gr_hu(A, 0)) #define npyv_extract0_s16(A) ((npy_int16)__lsx_vpickve2gr_h(A, 0)) #define npyv_extract0_u32(A) ((npy_uint32)__lsx_vpickve2gr_wu(A, 0)) #define npyv_extract0_s32(A) ((npy_int32)__lsx_vpickve2gr_w(A, 0)) #define npyv_extract0_u64(A) ((npy_uint64)__lsx_vpickve2gr_du(A, 0)) #define npyv_extract0_s64(A) ((npy_int64)__lsx_vpickve2gr_d(A, 0)) #define npyv_extract0_f32(A) A[0] #define npyv_extract0_f64(A) A[0] // Reinterpret #define npyv_reinterpret_u8_u8(X) X #define npyv_reinterpret_u8_s8(X) X #define npyv_reinterpret_u8_u16(X) X #define npyv_reinterpret_u8_s16(X) X #define npyv_reinterpret_u8_u32(X) X #define npyv_reinterpret_u8_s32(X) X #define npyv_reinterpret_u8_u64(X) X #define npyv_reinterpret_u8_s64(X) X #define npyv_reinterpret_u8_f32(X) (__m128i)X #define npyv_reinterpret_u8_f64(X) (__m128i)X #define npyv_reinterpret_s8_s8(X) X #define npyv_reinterpret_s8_u8(X) X #define npyv_reinterpret_s8_u16(X) X #define npyv_reinterpret_s8_s16(X) X #define npyv_reinterpret_s8_u32(X) X #define npyv_reinterpret_s8_s32(X) X #define npyv_reinterpret_s8_u64(X) X #define npyv_reinterpret_s8_s64(X) X #define npyv_reinterpret_s8_f32(X) (__m128i)X #define npyv_reinterpret_s8_f64(X) (__m128i)X #define npyv_reinterpret_u16_u16(X) X #define npyv_reinterpret_u16_u8(X) X #define npyv_reinterpret_u16_s8(X) X #define npyv_reinterpret_u16_s16(X) X #define npyv_reinterpret_u16_u32(X) X #define npyv_reinterpret_u16_s32(X) X #define npyv_reinterpret_u16_u64(X) X #define npyv_reinterpret_u16_s64(X) X #define npyv_reinterpret_u16_f32(X) (__m128i)X #define npyv_reinterpret_u16_f64(X) (__m128i)X #define npyv_reinterpret_s16_s16(X) X #define npyv_reinterpret_s16_u8(X) X #define npyv_reinterpret_s16_s8(X) X #define npyv_reinterpret_s16_u16(X) X #define npyv_reinterpret_s16_u32(X) X #define npyv_reinterpret_s16_s32(X) X #define npyv_reinterpret_s16_u64(X) X #define npyv_reinterpret_s16_s64(X) X #define npyv_reinterpret_s16_f32(X) (__m128i)X #define npyv_reinterpret_s16_f64(X) (__m128i)X #define npyv_reinterpret_u32_u32(X) X #define npyv_reinterpret_u32_u8(X) X #define npyv_reinterpret_u32_s8(X) X #define npyv_reinterpret_u32_u16(X) X #define npyv_reinterpret_u32_s16(X) X #define npyv_reinterpret_u32_s32(X) X #define npyv_reinterpret_u32_u64(X) X #define npyv_reinterpret_u32_s64(X) X #define npyv_reinterpret_u32_f32(X) (__m128i)X #define npyv_reinterpret_u32_f64(X) (__m128i)X #define npyv_reinterpret_s32_s32(X) X #define npyv_reinterpret_s32_u8(X) X #define npyv_reinterpret_s32_s8(X) X #define npyv_reinterpret_s32_u16(X) X #define npyv_reinterpret_s32_s16(X) X #define npyv_reinterpret_s32_u32(X) X #define npyv_reinterpret_s32_u64(X) X #define npyv_reinterpret_s32_s64(X) X #define npyv_reinterpret_s32_f32(X) (__m128i)X #define npyv_reinterpret_s32_f64(X) (__m128i)X #define npyv_reinterpret_u64_u64(X) X #define npyv_reinterpret_u64_u8(X) X #define npyv_reinterpret_u64_s8(X) X #define npyv_reinterpret_u64_u16(X) X #define npyv_reinterpret_u64_s16(X) X #define npyv_reinterpret_u64_u32(X) X #define npyv_reinterpret_u64_s32(X) X #define npyv_reinterpret_u64_s64(X) X #define npyv_reinterpret_u64_f32(X) (__m128i)X #define npyv_reinterpret_u64_f64(X) (__m128i)X #define npyv_reinterpret_s64_s64(X) X #define npyv_reinterpret_s64_u8(X) X #define npyv_reinterpret_s64_s8(X) X #define npyv_reinterpret_s64_u16(X) X #define npyv_reinterpret_s64_s16(X) X #define npyv_reinterpret_s64_u32(X) X #define npyv_reinterpret_s64_s32(X) X #define npyv_reinterpret_s64_u64(X) X #define npyv_reinterpret_s64_f32(X) (__m128i)X #define npyv_reinterpret_s64_f64(X) (__m128i)X #define npyv_reinterpret_f32_f32(X) X #define npyv_reinterpret_f32_u8(X) (__m128)X #define npyv_reinterpret_f32_s8(X) (__m128)X #define npyv_reinterpret_f32_u16(X) (__m128)X #define npyv_reinterpret_f32_s16(X) (__m128)X #define npyv_reinterpret_f32_u32(X) (__m128)X #define npyv_reinterpret_f32_s32(X) (__m128)X #define npyv_reinterpret_f32_u64(X) (__m128)X #define npyv_reinterpret_f32_s64(X) (__m128)X #define npyv_reinterpret_f32_f64(X) (__m128)X #define npyv_reinterpret_f64_f64(X) X #define npyv_reinterpret_f64_u8(X) (__m128d)X #define npyv_reinterpret_f64_s8(X) (__m128d)X #define npyv_reinterpret_f64_u16(X) (__m128d)X #define npyv_reinterpret_f64_s16(X) (__m128d)X #define npyv_reinterpret_f64_u32(X) (__m128d)X #define npyv_reinterpret_f64_s32(X) (__m128d)X #define npyv_reinterpret_f64_u64(X) (__m128d)X #define npyv_reinterpret_f64_s64(X) (__m128d)X #define npyv_reinterpret_f64_f32(X) (__m128d)X // Only required by AVX2/AVX512 #define npyv_cleanup() ((void)0) #endif