#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_NEON_REORDER_H
#define _NPY_SIMD_NEON_REORDER_H

// combine lower part of two vectors
#ifdef __aarch64__
    #define npyv_combinel_u8(A, B)  vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B)))
    #define npyv_combinel_s8(A, B)  vreinterpretq_s8_u64(vzip1q_u64(vreinterpretq_u64_s8(A), vreinterpretq_u64_s8(B)))
    #define npyv_combinel_u16(A, B) vreinterpretq_u16_u64(vzip1q_u64(vreinterpretq_u64_u16(A), vreinterpretq_u64_u16(B)))
    #define npyv_combinel_s16(A, B) vreinterpretq_s16_u64(vzip1q_u64(vreinterpretq_u64_s16(A), vreinterpretq_u64_s16(B)))
    #define npyv_combinel_u32(A, B) vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(A), vreinterpretq_u64_u32(B)))
    #define npyv_combinel_s32(A, B) vreinterpretq_s32_u64(vzip1q_u64(vreinterpretq_u64_s32(A), vreinterpretq_u64_s32(B)))
    #define npyv_combinel_u64       vzip1q_u64
    #define npyv_combinel_s64       vzip1q_s64
    #define npyv_combinel_f32(A, B) vreinterpretq_f32_u64(vzip1q_u64(vreinterpretq_u64_f32(A), vreinterpretq_u64_f32(B)))
    #define npyv_combinel_f64       vzip1q_f64
#else
    #define npyv_combinel_u8(A, B)  vcombine_u8(vget_low_u8(A), vget_low_u8(B))
    #define npyv_combinel_s8(A, B)  vcombine_s8(vget_low_s8(A), vget_low_s8(B))
    #define npyv_combinel_u16(A, B) vcombine_u16(vget_low_u16(A), vget_low_u16(B))
    #define npyv_combinel_s16(A, B) vcombine_s16(vget_low_s16(A), vget_low_s16(B))
    #define npyv_combinel_u32(A, B) vcombine_u32(vget_low_u32(A), vget_low_u32(B))
    #define npyv_combinel_s32(A, B) vcombine_s32(vget_low_s32(A), vget_low_s32(B))
    #define npyv_combinel_u64(A, B) vcombine_u64(vget_low_u64(A), vget_low_u64(B))
    #define npyv_combinel_s64(A, B) vcombine_s64(vget_low_s64(A), vget_low_s64(B))
    #define npyv_combinel_f32(A, B) vcombine_f32(vget_low_f32(A), vget_low_f32(B))
#endif

// combine higher part of two vectors
#ifdef __aarch64__
    #define npyv_combineh_u8(A, B)  vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B)))
    #define npyv_combineh_s8(A, B)  vreinterpretq_s8_u64(vzip2q_u64(vreinterpretq_u64_s8(A), vreinterpretq_u64_s8(B)))
    #define npyv_combineh_u16(A, B) vreinterpretq_u16_u64(vzip2q_u64(vreinterpretq_u64_u16(A), vreinterpretq_u64_u16(B)))
    #define npyv_combineh_s16(A, B) vreinterpretq_s16_u64(vzip2q_u64(vreinterpretq_u64_s16(A), vreinterpretq_u64_s16(B)))
    #define npyv_combineh_u32(A, B) vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(A), vreinterpretq_u64_u32(B)))
    #define npyv_combineh_s32(A, B) vreinterpretq_s32_u64(vzip2q_u64(vreinterpretq_u64_s32(A), vreinterpretq_u64_s32(B)))
    #define npyv_combineh_u64       vzip2q_u64
    #define npyv_combineh_s64       vzip2q_s64
    #define npyv_combineh_f32(A, B) vreinterpretq_f32_u64(vzip2q_u64(vreinterpretq_u64_f32(A), vreinterpretq_u64_f32(B)))
    #define npyv_combineh_f64       vzip2q_f64
#else
    #define npyv_combineh_u8(A, B)  vcombine_u8(vget_high_u8(A), vget_high_u8(B))
    #define npyv_combineh_s8(A, B)  vcombine_s8(vget_high_s8(A), vget_high_s8(B))
    #define npyv_combineh_u16(A, B) vcombine_u16(vget_high_u16(A), vget_high_u16(B))
    #define npyv_combineh_s16(A, B) vcombine_s16(vget_high_s16(A), vget_high_s16(B))
    #define npyv_combineh_u32(A, B) vcombine_u32(vget_high_u32(A), vget_high_u32(B))
    #define npyv_combineh_s32(A, B) vcombine_s32(vget_high_s32(A), vget_high_s32(B))
    #define npyv_combineh_u64(A, B) vcombine_u64(vget_high_u64(A), vget_high_u64(B))
    #define npyv_combineh_s64(A, B) vcombine_s64(vget_high_s64(A), vget_high_s64(B))
    #define npyv_combineh_f32(A, B) vcombine_f32(vget_high_f32(A), vget_high_f32(B))
#endif
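
// Illustrative example (lane notation assumed for clarity; not part of the API):
// given 32-bit lanes a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3},
//   npyv_combinel_u32(a, b) -> {a0, a1, b0, b1}   // low 64-bit halves of a and b
//   npyv_combineh_u32(a, b) -> {a2, a3, b2, b3}   // high 64-bit halves of a and b
// The AArch64 path (vzip1q_u64/vzip2q_u64 on reinterpreted 64-bit lanes) and the
// ARMv7 path (vget_low/vget_high + vcombine) produce the same result.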
// combine two vectors from lower and higher parts of two other vectors
#define NPYV_IMPL_NEON_COMBINE(T_VEC, SFX)                      \
    NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b)  \
    {                                                           \
        T_VEC##x2 r;                                            \
        r.val[0] = NPY_CAT(npyv_combinel_, SFX)(a, b);          \
        r.val[1] = NPY_CAT(npyv_combineh_, SFX)(a, b);          \
        return r;                                               \
    }
NPYV_IMPL_NEON_COMBINE(npyv_u8,  u8)
NPYV_IMPL_NEON_COMBINE(npyv_s8,  s8)
NPYV_IMPL_NEON_COMBINE(npyv_u16, u16)
NPYV_IMPL_NEON_COMBINE(npyv_s16, s16)
NPYV_IMPL_NEON_COMBINE(npyv_u32, u32)
NPYV_IMPL_NEON_COMBINE(npyv_s32, s32)
NPYV_IMPL_NEON_COMBINE(npyv_u64, u64)
NPYV_IMPL_NEON_COMBINE(npyv_s64, s64)
NPYV_IMPL_NEON_COMBINE(npyv_f32, f32)
#ifdef __aarch64__
NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#endif

// interleave & deinterleave two vectors
#ifdef __aarch64__
    #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX)                           \
        NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b)       \
        {                                                            \
            T_VEC##x2 r;                                             \
            r.val[0] = vzip1q_##SFX(a, b);                           \
            r.val[1] = vzip2q_##SFX(a, b);                           \
            return r;                                                \
        }                                                            \
        NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b)     \
        {                                                            \
            T_VEC##x2 r;                                             \
            r.val[0] = vuzp1q_##SFX(a, b);                           \
            r.val[1] = vuzp2q_##SFX(a, b);                           \
            return r;                                                \
        }
#else
    #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX)                           \
        NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b)       \
        { return vzipq_##SFX(a, b); }                                \
        NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b)     \
        { return vuzpq_##SFX(a, b); }
#endif
NPYV_IMPL_NEON_ZIP(npyv_u8,  u8)
NPYV_IMPL_NEON_ZIP(npyv_s8,  s8)
NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
#define npyv_zip_u64   npyv_combine_u64
#define npyv_zip_s64   npyv_combine_s64
#define npyv_zip_f64   npyv_combine_f64
#define npyv_unzip_u64 npyv_combine_u64
#define npyv_unzip_s64 npyv_combine_s64
#define npyv_unzip_f64 npyv_combine_f64

// Reverse elements of each 64-bit lane
#define npyv_rev64_u8  vrev64q_u8
#define npyv_rev64_s8  vrev64q_s8
#define npyv_rev64_u16 vrev64q_u16
#define npyv_rev64_s16 vrev64q_s16
#define npyv_rev64_u32 vrev64q_u32
#define npyv_rev64_s32 vrev64q_s32
#define npyv_rev64_f32 vrev64q_f32
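
// Illustrative example (lane notation assumed for clarity; not part of the API):
// given 32-bit lanes a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3},
//   npyv_zip_u32(a, b).val[0]   -> {a0, b0, a1, b1}
//   npyv_zip_u32(a, b).val[1]   -> {a2, b2, a3, b3}
//   npyv_unzip_u32(a, b).val[0] -> {a0, a2, b0, b2}   // even-indexed lanes
//   npyv_unzip_u32(a, b).val[1] -> {a1, a3, b1, b3}   // odd-indexed lanes
//   npyv_rev64_u32(a)           -> {a1, a0, a3, a2}
// For 64-bit lane types, zip/unzip degenerate to combine, hence the aliases above.
//
// Usage sketch (illustrative only; assumes the npyv_load_u32/npyv_store_u32
// intrinsics provided elsewhere in the universal intrinsics layer):
//
//   NPY_FINLINE void split_even_odd(const npy_uint32 *src,
//                                   npy_uint32 *even, npy_uint32 *odd)
//   {
//       npyv_u32   a = npyv_load_u32(src);      // {s0, s1, s2, s3}
//       npyv_u32   b = npyv_load_u32(src + 4);  // {s4, s5, s6, s7}
//       npyv_u32x2 r = npyv_unzip_u32(a, b);
//       npyv_store_u32(even, r.val[0]);         // {s0, s2, s4, s6}
//       npyv_store_u32(odd,  r.val[1]);         // {s1, s3, s5, s7}
//   }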
// Permuting the elements of each 128-bit lane by immediate index for
// each element.
#ifdef __clang__
    #define npyv_permi128_u32(A, E0, E1, E2, E3) \
        __builtin_shufflevector(A, A, E0, E1, E2, E3)
#elif defined(__GNUC__)
    #define npyv_permi128_u32(A, E0, E1, E2, E3) \
        __builtin_shuffle(A, npyv_set_u32(E0, E1, E2, E3))
#else
    #define npyv_permi128_u32(A, E0, E1, E2, E3)           \
        npyv_set_u32(                                      \
            vgetq_lane_u32(A, E0), vgetq_lane_u32(A, E1),  \
            vgetq_lane_u32(A, E2), vgetq_lane_u32(A, E3)   \
        )
    #define npyv_permi128_s32(A, E0, E1, E2, E3)           \
        npyv_set_s32(                                      \
            vgetq_lane_s32(A, E0), vgetq_lane_s32(A, E1),  \
            vgetq_lane_s32(A, E2), vgetq_lane_s32(A, E3)   \
        )
    #define npyv_permi128_f32(A, E0, E1, E2, E3)           \
        npyv_set_f32(                                      \
            vgetq_lane_f32(A, E0), vgetq_lane_f32(A, E1),  \
            vgetq_lane_f32(A, E2), vgetq_lane_f32(A, E3)   \
        )
#endif

#if defined(__clang__) || defined(__GNUC__)
    #define npyv_permi128_s32 npyv_permi128_u32
    #define npyv_permi128_f32 npyv_permi128_u32
#endif

#ifdef __clang__
    #define npyv_permi128_u64(A, E0, E1) \
        __builtin_shufflevector(A, A, E0, E1)
#elif defined(__GNUC__)
    #define npyv_permi128_u64(A, E0, E1) \
        __builtin_shuffle(A, npyv_set_u64(E0, E1))
#else
    #define npyv_permi128_u64(A, E0, E1)                  \
        npyv_set_u64(                                     \
            vgetq_lane_u64(A, E0), vgetq_lane_u64(A, E1)  \
        )
    #define npyv_permi128_s64(A, E0, E1)                  \
        npyv_set_s64(                                     \
            vgetq_lane_s64(A, E0), vgetq_lane_s64(A, E1)  \
        )
    #define npyv_permi128_f64(A, E0, E1)                  \
        npyv_set_f64(                                     \
            vgetq_lane_f64(A, E0), vgetq_lane_f64(A, E1)  \
        )
#endif

#if defined(__clang__) || defined(__GNUC__)
    #define npyv_permi128_s64 npyv_permi128_u64
    #define npyv_permi128_f64 npyv_permi128_u64
#endif

#if !NPY_SIMD_F64
    #undef npyv_permi128_f64
#endif

#endif // _NPY_SIMD_NEON_REORDER_H
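
// Illustrative example for the npyv_permi128_* macros above (lane notation
// assumed for clarity; not part of the API): with a = {a0, a1, a2, a3} as npyv_u32,
//   npyv_permi128_u32(a, 3, 2, 1, 0) -> {a3, a2, a1, a0}
//   npyv_permi128_u32(a, 0, 0, 2, 2) -> {a0, a0, a2, a2}
// The element indices are expected to be compile-time constants. On builds
// without NPY_SIMD_F64 (e.g. ARMv7), npyv_permi128_f64 is undefined above to
// match the lack of double-precision vector support.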