#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_AVX512_MISC_H
#define _NPY_SIMD_AVX512_MISC_H

// set all lanes to zero
#define npyv_zero_u8  _mm512_setzero_si512
#define npyv_zero_s8  _mm512_setzero_si512
#define npyv_zero_u16 _mm512_setzero_si512
#define npyv_zero_s16 _mm512_setzero_si512
#define npyv_zero_u32 _mm512_setzero_si512
#define npyv_zero_s32 _mm512_setzero_si512
#define npyv_zero_u64 _mm512_setzero_si512
#define npyv_zero_s64 _mm512_setzero_si512
#define npyv_zero_f32 _mm512_setzero_ps
#define npyv_zero_f64 _mm512_setzero_pd

// set all lanes to the same value
#define npyv_setall_u8(VAL)  _mm512_set1_epi8((char)VAL)
#define npyv_setall_s8(VAL)  _mm512_set1_epi8((char)VAL)
#define npyv_setall_u16(VAL) _mm512_set1_epi16((short)VAL)
#define npyv_setall_s16(VAL) _mm512_set1_epi16((short)VAL)
#define npyv_setall_u32(VAL) _mm512_set1_epi32((int)VAL)
#define npyv_setall_s32(VAL) _mm512_set1_epi32(VAL)
#define npyv_setall_f32(VAL) _mm512_set1_ps(VAL)
#define npyv_setall_f64(VAL) _mm512_set1_pd(VAL)

NPY_FINLINE __m512i npyv__setr_epi64(
    npy_int64, npy_int64, npy_int64, npy_int64,
    npy_int64, npy_int64, npy_int64, npy_int64
);
NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
    npy_int64 ai = (npy_int64)a;
#if defined(_MSC_VER) && defined(_M_IX86)
    return npyv__setr_epi64(ai, ai, ai, ai, ai, ai, ai, ai);
#else
    return _mm512_set1_epi64(ai);
#endif
}

NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
    return npyv__setr_epi64(a, a, a, a, a, a, a, a);
#else
    return _mm512_set1_epi64(a);
#endif
}
/**
 * vector with specific values set to each lane and
 * a specific value set to all remaining lanes
 *
 * _mm512_set_epi8 and _mm512_set_epi16 are missing in many compilers
 */
NPY_FINLINE __m512i npyv__setr_epi8(
    char i0,  char i1,  char i2,  char i3,  char i4,  char i5,  char i6,  char i7,
    char i8,  char i9,  char i10, char i11, char i12, char i13, char i14, char i15,
    char i16, char i17, char i18, char i19, char i20, char i21, char i22, char i23,
    char i24, char i25, char i26, char i27, char i28, char i29, char i30, char i31,
    char i32, char i33, char i34, char i35, char i36, char i37, char i38, char i39,
    char i40, char i41, char i42, char i43, char i44, char i45, char i46, char i47,
    char i48, char i49, char i50, char i51, char i52, char i53, char i54, char i55,
    char i56, char i57, char i58, char i59, char i60, char i61, char i62, char i63)
{
    const char NPY_DECL_ALIGNED(64) data[64] = {
        i0,  i1,  i2,  i3,  i4,  i5,  i6,  i7,  i8,  i9,  i10, i11, i12, i13, i14, i15,
        i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31,
        i32, i33, i34, i35, i36, i37, i38, i39, i40, i41, i42, i43, i44, i45, i46, i47,
        i48, i49, i50, i51, i52, i53, i54, i55, i56, i57, i58, i59, i60, i61, i62, i63
    };
    return _mm512_load_si512((const void*)data);
}
NPY_FINLINE __m512i npyv__setr_epi16(
    short i0,  short i1,  short i2,  short i3,  short i4,  short i5,  short i6,  short i7,
    short i8,  short i9,  short i10, short i11, short i12, short i13, short i14, short i15,
    short i16, short i17, short i18, short i19, short i20, short i21, short i22, short i23,
    short i24, short i25, short i26, short i27, short i28, short i29, short i30, short i31)
{
    const short NPY_DECL_ALIGNED(64) data[32] = {
        i0,  i1,  i2,  i3,  i4,  i5,  i6,  i7,  i8,  i9,  i10, i11, i12, i13, i14, i15,
        i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31
    };
    return _mm512_load_si512((const void*)data);
}
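// An illustrative sketch (assumed usage, not part of this header's API):
//
//     npyv_u32 ones  = npyv_setall_u32(1);    // broadcast 1 to all 16 lanes
//     npyv_f64 halfs = npyv_setall_f64(0.5);  // broadcast 0.5 to all 8 lanes
//
// npyv__setr_epi8/npyv__setr_epi16 above stage the lanes in a 64-byte
// aligned buffer and issue a single aligned load, since _mm512_set_epi8
// and _mm512_set_epi16 are unavailable on many compilers.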
// Args generated by NPYV__SET_FILL_* will not expand if
// _mm512_setr_* are defined as macros.
NPY_FINLINE __m512i npyv__setr_epi32(
    int i0, int i1, int i2,  int i3,  int i4,  int i5,  int i6,  int i7,
    int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15)
{
    return _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
}
NPY_FINLINE __m512i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3,
                                     npy_int64 i4, npy_int64 i5, npy_int64 i6, npy_int64 i7)
{
#if defined(_MSC_VER) && defined(_M_IX86)
    return _mm512_setr_epi32(
        (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
        (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32),
        (int)i4, (int)(i4 >> 32), (int)i5, (int)(i5 >> 32),
        (int)i6, (int)(i6 >> 32), (int)i7, (int)(i7 >> 32)
    );
#else
    return _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
#endif
}

NPY_FINLINE __m512 npyv__setr_ps(
    float i0, float i1, float i2,  float i3,  float i4,  float i5,  float i6,  float i7,
    float i8, float i9, float i10, float i11, float i12, float i13, float i14, float i15)
{
    return _mm512_setr_ps(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
}
NPY_FINLINE __m512d npyv__setr_pd(double i0, double i1, double i2, double i3,
                                  double i4, double i5, double i6, double i7)
{
    return _mm512_setr_pd(i0, i1, i2, i3, i4, i5, i6, i7);
}
#define npyv_setf_u8(FILL, ...)  npyv__setr_epi8(NPYV__SET_FILL_64(char, FILL, __VA_ARGS__))
#define npyv_setf_s8(FILL, ...)  npyv__setr_epi8(NPYV__SET_FILL_64(char, FILL, __VA_ARGS__))
#define npyv_setf_u16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_32(short, FILL, __VA_ARGS__))
#define npyv_setf_s16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_32(short, FILL, __VA_ARGS__))
#define npyv_setf_u32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_16(int, FILL, __VA_ARGS__))
#define npyv_setf_s32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_16(int, FILL, __VA_ARGS__))
#define npyv_setf_u64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_8(npy_int64, FILL, __VA_ARGS__))
#define npyv_setf_s64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_8(npy_int64, FILL, __VA_ARGS__))
#define npyv_setf_f32(FILL, ...) npyv__setr_ps(NPYV__SET_FILL_16(float, FILL, __VA_ARGS__))
#define npyv_setf_f64(FILL, ...) npyv__setr_pd(NPYV__SET_FILL_8(double, FILL, __VA_ARGS__))

// vector with specific values set to each lane and
// zero set to all remaining lanes
#define npyv_set_u8(...)  npyv_setf_u8(0,  __VA_ARGS__)
#define npyv_set_s8(...)  npyv_setf_s8(0,  __VA_ARGS__)
#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__)
#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__)
#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__)
#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__)
#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__)
#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__)
#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__)
#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__)
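// An illustrative sketch (assumed usage, not defined by this header):
//
//     // lanes 0..2 = 1, 2, 3; the fill value 7 pads the remaining 13 lanes
//     npyv_s32 a = npyv_setf_s32(7, 1, 2, 3);
//     // npyv_set_* behaves the same but pads with zero
//     npyv_s32 b = npyv_set_s32(1, 2, 3);
//
// NPYV__SET_FILL_16 expands the given values plus copies of FILL into the
// sixteen arguments npyv__setr_epi32 expects.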
// per lane select
#ifdef NPY_HAVE_AVX512BW
    #define npyv_select_u8(MASK, A, B)  _mm512_mask_blend_epi8(MASK, B, A)
    #define npyv_select_u16(MASK, A, B) _mm512_mask_blend_epi16(MASK, B, A)
#else
    NPY_FINLINE __m512i npyv_select_u8(__m512i mask, __m512i a, __m512i b)
    { return _mm512_xor_si512(b, _mm512_and_si512(_mm512_xor_si512(b, a), mask)); }
    #define npyv_select_u16 npyv_select_u8
#endif
#define npyv_select_s8  npyv_select_u8
#define npyv_select_s16 npyv_select_u16
#define npyv_select_u32(MASK, A, B) _mm512_mask_blend_epi32(MASK, B, A)
#define npyv_select_s32 npyv_select_u32
#define npyv_select_u64(MASK, A, B) _mm512_mask_blend_epi64(MASK, B, A)
#define npyv_select_s64 npyv_select_u64
#define npyv_select_f32(MASK, A, B) _mm512_mask_blend_ps(MASK, B, A)
#define npyv_select_f64(MASK, A, B) _mm512_mask_blend_pd(MASK, B, A)
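// Note on the fallback path above (a sketch of the idea, assuming the 8/16-bit
// masks are full-width bitwise vectors when AVX512BW is unavailable):
//
//     b ^ ((b ^ a) & mask)
//
// yields the bits of `a` wherever `mask` bits are set and the bits of `b`
// elsewhere: a branchless bitwise blend built from three instructions.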
// extract the first vector's lane
#define npyv_extract0_u8(A)  ((npy_uint8)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
#define npyv_extract0_s8(A)  ((npy_int8)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
#define npyv_extract0_u16(A) ((npy_uint16)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
#define npyv_extract0_s16(A) ((npy_int16)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
#define npyv_extract0_u32(A) ((npy_uint32)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
#define npyv_extract0_s32(A) ((npy_int32)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
#define npyv_extract0_u64(A) ((npy_uint64)npyv128_cvtsi128_si64(_mm512_castsi512_si128(A)))
#define npyv_extract0_s64(A) ((npy_int64)npyv128_cvtsi128_si64(_mm512_castsi512_si128(A)))
#define npyv_extract0_f32(A) _mm_cvtss_f32(_mm512_castps512_ps128(A))
#define npyv_extract0_f64(A) _mm_cvtsd_f64(_mm512_castpd512_pd128(A))

// reinterpret
#define npyv_reinterpret_u8_u8(X)  X
#define npyv_reinterpret_u8_s8(X)  X
#define npyv_reinterpret_u8_u16(X) X
#define npyv_reinterpret_u8_s16(X) X
#define npyv_reinterpret_u8_u32(X) X
#define npyv_reinterpret_u8_s32(X) X
#define npyv_reinterpret_u8_u64(X) X
#define npyv_reinterpret_u8_s64(X) X
#define npyv_reinterpret_u8_f32 _mm512_castps_si512
#define npyv_reinterpret_u8_f64 _mm512_castpd_si512

#define npyv_reinterpret_s8_s8(X)  X
#define npyv_reinterpret_s8_u8(X)  X
#define npyv_reinterpret_s8_u16(X) X
#define npyv_reinterpret_s8_s16(X) X
#define npyv_reinterpret_s8_u32(X) X
#define npyv_reinterpret_s8_s32(X) X
#define npyv_reinterpret_s8_u64(X) X
#define npyv_reinterpret_s8_s64(X) X
#define npyv_reinterpret_s8_f32 _mm512_castps_si512
#define npyv_reinterpret_s8_f64 _mm512_castpd_si512

#define npyv_reinterpret_u16_u16(X) X
#define npyv_reinterpret_u16_u8(X)  X
#define npyv_reinterpret_u16_s8(X)  X
#define npyv_reinterpret_u16_s16(X) X
#define npyv_reinterpret_u16_u32(X) X
#define npyv_reinterpret_u16_s32(X) X
#define npyv_reinterpret_u16_u64(X) X
#define npyv_reinterpret_u16_s64(X) X
#define npyv_reinterpret_u16_f32 _mm512_castps_si512
#define npyv_reinterpret_u16_f64 _mm512_castpd_si512

#define npyv_reinterpret_s16_s16(X) X
#define npyv_reinterpret_s16_u8(X)  X
#define npyv_reinterpret_s16_s8(X)  X
#define npyv_reinterpret_s16_u16(X) X
#define npyv_reinterpret_s16_u32(X) X
#define npyv_reinterpret_s16_s32(X) X
#define npyv_reinterpret_s16_u64(X) X
#define npyv_reinterpret_s16_s64(X) X
#define npyv_reinterpret_s16_f32 _mm512_castps_si512
#define npyv_reinterpret_s16_f64 _mm512_castpd_si512

#define npyv_reinterpret_u32_u32(X) X
#define npyv_reinterpret_u32_u8(X)  X
#define npyv_reinterpret_u32_s8(X)  X
#define npyv_reinterpret_u32_u16(X) X
#define npyv_reinterpret_u32_s16(X) X
#define npyv_reinterpret_u32_s32(X) X
#define npyv_reinterpret_u32_u64(X) X
#define npyv_reinterpret_u32_s64(X) X
#define npyv_reinterpret_u32_f32 _mm512_castps_si512
#define npyv_reinterpret_u32_f64 _mm512_castpd_si512

#define npyv_reinterpret_s32_s32(X) X
#define npyv_reinterpret_s32_u8(X)  X
#define npyv_reinterpret_s32_s8(X)  X
#define npyv_reinterpret_s32_u16(X) X
#define npyv_reinterpret_s32_s16(X) X
#define npyv_reinterpret_s32_u32(X) X
#define npyv_reinterpret_s32_u64(X) X
#define npyv_reinterpret_s32_s64(X) X
#define npyv_reinterpret_s32_f32 _mm512_castps_si512
#define npyv_reinterpret_s32_f64 _mm512_castpd_si512

#define npyv_reinterpret_u64_u64(X) X
#define npyv_reinterpret_u64_u8(X)  X
#define npyv_reinterpret_u64_s8(X)  X
#define npyv_reinterpret_u64_u16(X) X
#define npyv_reinterpret_u64_s16(X) X
#define npyv_reinterpret_u64_u32(X) X
#define npyv_reinterpret_u64_s32(X) X
#define npyv_reinterpret_u64_s64(X) X
#define npyv_reinterpret_u64_f32 _mm512_castps_si512
#define npyv_reinterpret_u64_f64 _mm512_castpd_si512

#define npyv_reinterpret_s64_s64(X) X
#define npyv_reinterpret_s64_u8(X)  X
#define npyv_reinterpret_s64_s8(X)  X
#define npyv_reinterpret_s64_u16(X) X
#define npyv_reinterpret_s64_s16(X) X
#define npyv_reinterpret_s64_u32(X) X
#define npyv_reinterpret_s64_s32(X) X
#define npyv_reinterpret_s64_u64(X) X
#define npyv_reinterpret_s64_f32 _mm512_castps_si512
#define npyv_reinterpret_s64_f64 _mm512_castpd_si512

#define npyv_reinterpret_f32_f32(X) X
#define npyv_reinterpret_f32_u8  _mm512_castsi512_ps
#define npyv_reinterpret_f32_s8  _mm512_castsi512_ps
#define npyv_reinterpret_f32_u16 _mm512_castsi512_ps
#define npyv_reinterpret_f32_s16 _mm512_castsi512_ps
#define npyv_reinterpret_f32_u32 _mm512_castsi512_ps
#define npyv_reinterpret_f32_s32 _mm512_castsi512_ps
#define npyv_reinterpret_f32_u64 _mm512_castsi512_ps
#define npyv_reinterpret_f32_s64 _mm512_castsi512_ps
#define npyv_reinterpret_f32_f64 _mm512_castpd_ps

#define npyv_reinterpret_f64_f64(X) X
#define npyv_reinterpret_f64_u8  _mm512_castsi512_pd
#define npyv_reinterpret_f64_s8  _mm512_castsi512_pd
#define npyv_reinterpret_f64_u16 _mm512_castsi512_pd
#define npyv_reinterpret_f64_s16 _mm512_castsi512_pd
#define npyv_reinterpret_f64_u32 _mm512_castsi512_pd
#define npyv_reinterpret_f64_s32 _mm512_castsi512_pd
#define npyv_reinterpret_f64_u64 _mm512_castsi512_pd
#define npyv_reinterpret_f64_s64 _mm512_castsi512_pd
#define npyv_reinterpret_f64_f32 _mm512_castps_pd

#ifdef NPY_HAVE_AVX512_KNL
    #define npyv_cleanup() ((void)0)
#else
    #define npyv_cleanup _mm256_zeroall
#endif

#endif // _NPY_SIMD_AVX512_MISC_H