#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

/**
 * TODO:
 * - Improve the implementation of SIMD complex absolute; the current one is
 *   rather slow and can be optimized by at least avoiding the division and
 *   keeping only the sqrt.
 * - Vectorize reductions.
 * - Add support for ASIMD/VCMLA through universal intrinsics.
 */

//###############################################################################
//## Real Single/Double precision
//###############################################################################
/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
/**begin repeat
 * Float types
 *  #type = npy_float, npy_double#
 *  #TYPE = FLOAT, DOUBLE#
 *  #sfx = f32, f64#
 *  #c = f, #
 *  #C = F, #
 *  #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
 */
/**begin repeat1
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 * # intrin = add, sub, mul, div#
 * # OP = +, -, *, /#
 * # PW = 1, 0, 0, 0#
 * # is_div = 0*3, 1#
 * # is_mul = 0*2, 1, 0#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *src0 = args[0], *src1 = args[1], *dst = args[2];
    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
    // reduce
    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
    #if @PW@
        *((@type@*)src0) @OP@= @TYPE@_pairwise_sum(src1, len, ssrc1);
    #else
        @type@ acc = *((@type@*)src0);
        if (ssrc1 == sizeof(@type@)) {
            for (; len > 0; --len, src1 += sizeof(@type@)) {
                acc @OP@= *(@type@ *)src1;
            }
        } else {
            for (; len > 0; --len, src1 += ssrc1) {
                acc @OP@= *(@type@ *)src1;
            }
        }
        *((@type@*)src0) = acc;
    #endif
        return;
    }
#if @is_div@ && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
    /**
     * The SIMD branch is disabled on armhf (armv7) due to the absence of native
     * SIMD support for single-precision floating-point division. Only scalar
     * division is supported natively, and without hardware for performance and
     * accuracy comparison, it is difficult to evaluate the benefits of the
     * emulated SIMD intrinsic versus native scalar division.
     *
     * The `npyv_div_f32` universal intrinsic emulates the division operation
     * using an approximate reciprocal combined with 3 Newton-Raphson iterations
     * for enhanced precision. However, this approach has limitations:
     *
     * - It can cause unexpected floating-point overflows in special cases, such
     *   as when the divisor is subnormal
     *   (refer: https://github.com/numpy/numpy/issues/25097).
     *
     * - The precision may differ between the emulated SIMD and scalar division
     *   due to non-uniform (non-contiguous) branches in the code, leading to
     *   precision inconsistencies.
     *
     * - Considering the necessity of multiple Newton-Raphson iterations, the
     *   performance gain may not sufficiently offset these drawbacks.
     */
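    /*
     * For illustration only: a minimal scalar sketch of the reciprocal +
     * Newton-Raphson scheme that `npyv_div_f32` emulates. `recip_estimate()`
     * stands in for a hardware seed such as NEON's vrecpe and is hypothetical:
     *
     *     float emulated_div(float a, float d)
     *     {
     *         float x = recip_estimate(d);     // rough ~1/d seed
     *         for (int i = 0; i < 3; i++) {
     *             x = x * (2.0f - d * x);      // Newton-Raphson refinement of 1/d
     *         }
     *         return a * x;                    // a/d approximated as a * (1/d)
     *     }
     *
     * Forming 1/d explicitly is what can overflow when d is subnormal
     * (1/d may exceed FLT_MAX even though a/d itself is representable).
     */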
#elif @VECTOR@
    if (len > npyv_nlanes_@sfx@*2 &&
        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
    ) {
        const int vstep = npyv_nlanes_u8;
        const int wstep = vstep * 2;
        const int hstep = npyv_nlanes_@sfx@;
        const int lstep = hstep * 2;
        // lots of specializations, to squeeze out max performance
        if (ssrc0 == sizeof(@type@) && ssrc0 == ssrc1 && ssrc0 == sdst) {
            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
                npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0);
                npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep));
                npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1);
                npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep));
                npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b0);
                npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b1);
                npyv_store_@sfx@((@type@*)dst, r0);
                npyv_store_@sfx@((@type@*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            #if @is_div@
                // fill the tail lanes with 1.0 to avoid 0.0/0.0 in the unused lanes
                npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
                npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
            #else
                npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
                npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len);
            #endif
                npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
                npyv_store_till_@sfx@((@type@*)dst, len, r);
            }
        }
        else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) {
            npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0));
            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
                npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1);
                npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep));
                npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a, b0);
                npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a, b1);
                npyv_store_@sfx@((@type@*)dst, r0);
                npyv_store_@sfx@((@type@*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            #if @is_div@ || @is_mul@
                npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
            #else
                npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len);
            #endif
                npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
                npyv_store_till_@sfx@((@type@*)dst, len, r);
            }
        }
        else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) {
            npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1));
            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
                npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0);
                npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep));
                npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b);
                npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b);
                npyv_store_@sfx@((@type@*)dst, r0);
                npyv_store_@sfx@((@type@*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            #if @is_mul@
                npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
            #elif @is_div@
                // NaN fill: the broadcast divisor may be 0.0, and NaN/0.0 raises no FP exception
                npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, NPY_NAN@C@);
            #else
                npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
            #endif
                npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
                npyv_store_till_@sfx@((@type@*)dst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
        npyv_cleanup();
        return;
    }
loop_scalar:
#endif
    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
        const @type@ a = *((@type@*)src0);
        const @type@ b = *((@type@*)src1);
        *((@type@*)dst) = a @OP@ b;
    }
}
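
/*
 * Indexed variant (scalar only): for each i, applies value[i] in place to
 * ip1[index[i]], wrapping negative indices by `shape` (steps[3]). This is the
 * style of inner loop used for unbuffered indexed updates (e.g. ufunc.at-like
 * operations); it is kept scalar since the indices may repeat.
 */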
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions,
 npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *ip1 = args[0];
    char *indxp = args[1];
    char *value = args[2];
    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
    npy_intp shape = steps[3];
    npy_intp n = dimensions[0];
    npy_intp i;
    @type@ *indexed;
    for (i = 0; i < n; i++, indxp += isindex, value += isb) {
        npy_intp indx = *(npy_intp *)indxp;
        if (indx < 0) {
            indx += shape;
        }
        indexed = (@type@ *)(ip1 + is1 * indx);
        *indexed = *indexed @OP@ *(@type@ *)value;
    }
    return 0;
}
/**end repeat1**/
/**end repeat**/

//###############################################################################
//## Complex Single/Double precision
//###############################################################################
/********************************************************************************
 ** op intrinsics
 ********************************************************************************/
#if NPY_SIMD_F32
NPY_FINLINE npyv_f32x2
simd_set2_f32(const float *a)
{
    npyv_f32 fill = npyv_reinterpret_f32_u64(npyv_setall_u64(*(npy_uint64*)a));
    npyv_f32x2 r;
    r.val[0] = fill;
    r.val[1] = fill;
    return r;
}

NPY_FINLINE npyv_f32
simd_cconjugate_f32(npyv_f32 x)
{
#if NPY_SIMD_BIGENDIAN
    const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x80000000));
#else
    const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x8000000000000000ULL));
#endif
    return npyv_xor_f32(x, mask);
}

NPY_FINLINE npyv_f32
simd_cmul_f32(npyv_f32 a, npyv_f32 b)
{
    npyv_f32 b_rev = npyv_permi128_f32(b, 1, 0, 3, 2);
    npyv_f32 a_re = npyv_permi128_f32(a, 0, 0, 2, 2);
    npyv_f32 a_im = npyv_permi128_f32(a, 1, 1, 3, 3);
    // a_im * b_im, a_im * b_re
    npyv_f32 ab_iiir = npyv_mul_f32(a_im, b_rev);
    return npyv_muladdsub_f32(a_re, b, ab_iiir);
}

NPY_FINLINE npyv_f32
simd_csquare_f32(npyv_f32 x)
{ return simd_cmul_f32(x, x); }
#endif

#if NPY_SIMD_F64
NPY_FINLINE npyv_f64x2
simd_set2_f64(const double *a)
{
    npyv_f64 r = npyv_setall_f64(a[0]);
    npyv_f64 i = npyv_setall_f64(a[1]);
    return npyv_zip_f64(r, i);
}

NPY_FINLINE npyv_f64
simd_cconjugate_f64(npyv_f64 x)
{
    const npyv_f64 mask = npyv_reinterpret_f64_u64(npyv_set_u64(
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL
    ));
    return npyv_xor_f64(x, mask);
}

NPY_FINLINE npyv_f64
simd_cmul_f64(npyv_f64 a, npyv_f64 b)
{
    npyv_f64 b_rev = npyv_permi128_f64(b, 1, 0);
    npyv_f64 a_re = npyv_permi128_f64(a, 0, 0);
    npyv_f64 a_im = npyv_permi128_f64(a, 1, 1);
    // a_im * b_im, a_im * b_re
    npyv_f64 ab_iiir = npyv_mul_f64(a_im, b_rev);
    return npyv_muladdsub_f64(a_re, b, ab_iiir);
}

NPY_FINLINE npyv_f64
simd_csquare_f64(npyv_f64 x)
{ return simd_cmul_f64(x, x); }
#endif
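
/*
 * Note on simd_cmul_f32/f64 above: with interleaved (re, im) lanes, the product
 * (a_re + i*a_im) * (b_re + i*b_im) is assembled as
 *     real: a_re*b_re - a_im*b_im
 *     imag: a_re*b_im + a_im*b_re
 * a_re/a_im are duplicated within each complex pair, a_im is multiplied by the
 * swapped vector (b_im, b_re), and npyv_muladdsub folds in a_re*b, subtracting
 * in the even (real) lanes and adding in the odd (imaginary) lanes.
 * simd_csquare_f32/f64 reuse this with both operands equal, and the conjugate
 * helpers simply XOR the sign bit of the imaginary lanes.
 */
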
/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
/**begin repeat
 * complex types
 * #TYPE = CFLOAT, CDOUBLE#
 * #ftype = npy_float, npy_double#
 * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
 * #sfx = f32, f64#
 * #c = f, #
 * #C = F, #
 */
/**begin repeat1
 * arithmetic
 * #kind = add, subtract, multiply#
 * #vectorf = npyv_add, npyv_sub, simd_cmul#
 * #OP = +, -, *#
 * #PW = 1, 0, 0#
 * #is_mul = 0*2, 1#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
#if @PW@
    // reduce
    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
        b_ssrc1 % (sizeof(@ftype@)*2) == 0
    ) {
        @ftype@ *rl_im = (@ftype@ *)b_src0;
        @ftype@ rr, ri;
        @TYPE@_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
        rl_im[0] @OP@= rr;
        rl_im[1] @OP@= ri;
        return;
    }
#endif
#if @VECTOR@
    // Certain versions of Apple clang (commonly used in CI images) produce
    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
    // Work around by scalarising.
    #if @is_mul@ \
        && defined(NPY_CPU_AMD64) && defined(__clang__) \
        && defined(__apple_build_version__) \
        && __apple_build_version__ >= 14000000 \
        && __apple_build_version__ < 14030000
        goto loop_scalar;
    #endif  // end affected Apple clang.

    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
        !npyv_loadable_stride_@sfx@(b_ssrc0) ||
        !npyv_loadable_stride_@sfx@(b_ssrc1) ||
        !npyv_storable_stride_@sfx@(b_sdst)  ||
        b_sdst == 0
    ) {
        goto loop_scalar;
    }

    const @ftype@ *src0 = (@ftype@*)b_src0;
    const @ftype@ *src1 = (@ftype@*)b_src1;
          @ftype@ *dst  = (@ftype@*)b_dst;

    const npy_intp ssrc0 = b_ssrc0 / sizeof(@ftype@);
    const npy_intp ssrc1 = b_ssrc1 / sizeof(@ftype@);
    const npy_intp sdst  = b_sdst  / sizeof(@ftype@);

    const int vstep = npyv_nlanes_@sfx@;
    const int wstep = vstep * 2;
    const int hstep = vstep / 2;

    // lots of specializations, to squeeze out max performance
    // contig
    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
            npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
            npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
            npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
            npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
            npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0);
            npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1);
            npyv_store_@sfx@(dst, r0);
            npyv_store_@sfx@(dst + vstep, r1);
        }
        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len);
            npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len);
            npyv_@sfx@ r = @vectorf@_@sfx@(a, b);
            npyv_store2_till_@sfx@(dst, len, r);
        }
    }
    // scalar 0
    else if (ssrc0 == 0) {
        npyv_@sfx@x2 a = simd_set2_@sfx@(src0);
        // contig
        if (ssrc1 == 2 && sdst == ssrc1) {
            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
                npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
                npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
                npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0);
                npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1);
                npyv_store_@sfx@(dst, r0);
                npyv_store_@sfx@(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            #if @is_mul@
                npyv_@sfx@ b = npyv_load2_till_@sfx@(src1, len, 1.0@c@, 1.0@c@);
            #else
                npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len);
            #endif
                npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b);
                npyv_store2_till_@sfx@(dst, len, r);
            }
        }
        // non-contig
        else {
            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
                npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
                npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
                npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0);
                npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1);
                npyv_storen2_@sfx@(dst, sdst, r0);
                npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
            #if @is_mul@
                npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@);
            #else
                npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len);
            #endif
                npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b);
                npyv_storen2_till_@sfx@(dst, sdst, len, r);
            }
        }
    }
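    // Note on the 1.0 fills in the partial-load tails of these scalar-operand
    // specializations: the broadcast operand may be Inf or NaN, so zero-filling
    // the other operand could raise spurious FP exceptions in the dummy lanes of
    // the multiply path (e.g. Inf * 0 -> invalid). Filling with 1.0 keeps those
    // lanes harmless; their results are discarded by the *_till_ stores anyway.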
    // scalar 1
    else if (ssrc1 == 0) {
        npyv_@sfx@x2 b = simd_set2_@sfx@(src1);
        if (ssrc0 == 2 && sdst == ssrc0) {
            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
                npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
                npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
                npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]);
                npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]);
                npyv_store_@sfx@(dst, r0);
                npyv_store_@sfx@(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            #if @is_mul@
                npyv_@sfx@ a = npyv_load2_till_@sfx@(src0, len, 1.0@c@, 1.0@c@);
            #else
                npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len);
            #endif
                npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]);
                npyv_store2_till_@sfx@(dst, len, r);
            }
        }
        // non-contig
        else {
            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
                npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
                npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
                npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]);
                npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]);
                npyv_storen2_@sfx@(dst, sdst, r0);
                npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
            #if @is_mul@
                npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@);
            #else
                npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len);
            #endif
                npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]);
                npyv_storen2_till_@sfx@(dst, sdst, len, r);
            }
        }
    }
    #if @is_mul@
    // non-contig
    else {
        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
                             src1 += ssrc1*vstep, dst += sdst*vstep
        ) {
            npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
            npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
            npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
            npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
            npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0);
            npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1);
            npyv_storen2_@sfx@(dst, sdst, r0);
            npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
        }
        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
                        src1 += ssrc1*hstep, dst += sdst*hstep
        ) {
        #if @is_mul@
            npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@);
            npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@);
        #else
            npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len);
            npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len);
        #endif
            npyv_@sfx@ r = @vectorf@_@sfx@(a, b);
            npyv_storen2_till_@sfx@(dst, sdst, len, r);
        }
    }
    #else /* @is_mul@ */
    else {
        // Only multiply is vectorized for the generic non-contig case.
        goto loop_scalar;
    }
    #endif /* @is_mul@ */
    npyv_cleanup();
    return;
loop_scalar:
#endif
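    // Scalar fallback: operate on the (real, imaginary) pairs directly; the
    // multiply case expands to (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r).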
    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
        const @ftype@ a_r = ((@ftype@ *)b_src0)[0];
        const @ftype@ a_i = ((@ftype@ *)b_src0)[1];
        const @ftype@ b_r = ((@ftype@ *)b_src1)[0];
        const @ftype@ b_i = ((@ftype@ *)b_src1)[1];
    #if @is_mul@
        ((@ftype@ *)b_dst)[0] = a_r*b_r - a_i*b_i;
        ((@ftype@ *)b_dst)[1] = a_r*b_i + a_i*b_r;
    #else
        ((@ftype@ *)b_dst)[0] = a_r @OP@ b_r;
        ((@ftype@ *)b_dst)[1] = a_i @OP@ b_i;
    #endif
    }
}

NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions,
 npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *ip1 = args[0];
    char *indxp = args[1];
    char *value = args[2];
    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
    npy_intp shape = steps[3];
    npy_intp n = dimensions[0];
    npy_intp i;
    @ftype@ *indexed;
    for (i = 0; i < n; i++, indxp += isindex, value += isb) {
        npy_intp indx = *(npy_intp *)indxp;
        if (indx < 0) {
            indx += shape;
        }
        indexed = (@ftype@ *)(ip1 + is1 * indx);
        const @ftype@ b_r = ((@ftype@ *)value)[0];
        const @ftype@ b_i = ((@ftype@ *)value)[1];
    #if @is_mul@
        const @ftype@ a_r = indexed[0];
        const @ftype@ a_i = indexed[1];
        indexed[0] = a_r*b_r - a_i*b_i;
        indexed[1] = a_r*b_i + a_i*b_r;
    #else
        indexed[0] @OP@= b_r;
        indexed[1] @OP@= b_i;
    #endif
    }
    return 0;
}
/**end repeat1**/

/**begin repeat1
 * #kind = conjugate, square#
 * #is_square = 0, 1#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *b_src = args[0], *b_dst = args[1];
    npy_intp b_ssrc = steps[0], b_sdst = steps[1];
#if @VECTOR@
    if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||
        !npyv_loadable_stride_@sfx@(b_ssrc) ||
        !npyv_storable_stride_@sfx@(b_sdst)
    ) {
        goto loop_scalar;
    }
    const @ftype@ *src = (@ftype@*)b_src;
          @ftype@ *dst = (@ftype@*)b_dst;
    const npy_intp ssrc = b_ssrc / sizeof(@ftype@);
    const npy_intp sdst = b_sdst / sizeof(@ftype@);

    const int vstep = npyv_nlanes_@sfx@;
    const int wstep = vstep * 2;
    const int hstep = vstep / 2;

    // contiguous input and output
    if (ssrc == 2 && ssrc == sdst) {
        for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {
            npyv_@sfx@ a0 = npyv_load_@sfx@(src);
            npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
            npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
            npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
            npyv_store_@sfx@(dst, r0);
            npyv_store_@sfx@(dst + vstep, r1);
        }
        for (; len > 0; len -= hstep, src += vstep, dst += vstep) {
            npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len);
            npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
            npyv_store2_till_@sfx@(dst, len, r);
        }
    }
    // contiguous source, strided destination
    else if (ssrc == 2) {
        for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
            npyv_@sfx@ a0 = npyv_load_@sfx@(src);
            npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
            npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
            npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
            npyv_storen2_@sfx@(dst, sdst, r0);
            npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
        }
        for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {
            npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len);
            npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
            npyv_storen2_till_@sfx@(dst, sdst, len, r);
        }
    }
    // strided source, contiguous destination
    else if (sdst == 2) {
        for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
            npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src, ssrc);
            npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
            npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
            npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
            npyv_store_@sfx@(dst, r0);
            npyv_store_@sfx@(dst + vstep, r1);
        }
        for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {
            npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@((@ftype@*)src, ssrc, len);
            npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
            npyv_store2_till_@sfx@(dst, len, r);
        }
    }
    else {
        goto loop_scalar;
    }
    npyv_cleanup();
    return;
loop_scalar:
#endif
    // Scalar fallback: conjugate negates the imaginary part;
    // square is (rl + i*im)^2 = (rl*rl - im*im) + i*(rl*im + im*rl).
    for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {
        const @ftype@ rl = ((@ftype@ *)b_src)[0];
        const @ftype@ im = ((@ftype@ *)b_src)[1];
    #if @is_square@
        ((@ftype@ *)b_dst)[0] = rl*rl - im*im;
        ((@ftype@ *)b_dst)[1] = rl*im + im*rl;
    #else
        ((@ftype@ *)b_dst)[0] = rl;
        ((@ftype@ *)b_dst)[1] = -im;
    #endif
    }
}
/**end repeat1**/
/**end repeat**/