#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "npy_svml.h"
#include "fast_loop_macros.h"

#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
/**begin repeat
 * #sfx = f32, f64#
 * #func_suffix = f16, 8_ha#
 * #len = 32, 64#
 */
/**begin repeat1
 * #func = exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
 * #default_val = 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
 * #fxnan = 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0#
 */
/*
 * Unary SVML kernel: applies the vector routine to `len` elements read from
 * `src` and written to `dst`.
 *
 * `ssrc` and `sdst` are strides counted in elements (the pointers are advanced
 * by stride * lanes per iteration). A stride of 1 selects the contiguous
 * load/store helpers, any other value the strided ones.
 *
 * The final, partial vector goes through the "till" helpers. They are handed
 * the per-function default value (or the zero-filling variant is used when
 * that default is 0) -- presumably so the unused lanes hold an input the
 * routine can evaluate quietly; confirm against the helper documentation.
 */
static void
simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
                  npyv_lanetype_@sfx@ *dst, npy_intp sdst, npy_intp len)
{
    const int vstep = npyv_nlanes_@sfx@;
    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
        npyv_@sfx@ x;
    #if @default_val@
        if (ssrc == 1) {
            x = npyv_load_till_@sfx@(src, len, @default_val@);
        }
        else {
            x = npyv_loadn_till_@sfx@(src, ssrc, len, @default_val@);
        }
    #else
        if (ssrc == 1) {
            x = npyv_load_tillz_@sfx@(src, len);
        }
        else {
            x = npyv_loadn_tillz_@sfx@(src, ssrc, len);
        }
    #endif
    #if @fxnan@ == @len@
        // Workaround, invalid value encountered when x is set to nan:
        // evaluate the routine on a copy whose NaN lanes are replaced by the
        // default value, then put the original NaN back into those lanes.
        npyv_b@len@ nnan_mask = npyv_notnan_@sfx@(x);
        npyv_@sfx@ x_exnan = npyv_select_@sfx@(nnan_mask, x, npyv_setall_@sfx@(@default_val@));
        npyv_@sfx@ out = __svml_@func@@func_suffix@(x_exnan);
        out = npyv_select_@sfx@(nnan_mask, out, x);
    #else
        npyv_@sfx@ out = __svml_@func@@func_suffix@(x);
    #endif
        if (sdst == 1) {
            npyv_store_till_@sfx@(dst, len, out);
        }
        else {
            npyv_storen_till_@sfx@(dst, sdst, len, out);
        }
    }
    npyv_cleanup();
}
/**end repeat1**/
/**end repeat**/

/**begin repeat
 * #sfx = f32, f64#
 * #func_suffix = f16, 8_ha#
 */
/**begin repeat1
 * #func = pow, atan2#
 */
/*
 * Binary SVML kernel: applies the two-operand vector routine to `len`
 * element pairs read from `src1`/`src2` and written to `dst`.
 *
 * All strides are counted in elements; a stride of 1 selects the contiguous
 * load/store helpers, any other value the strided ones. Both operands pass 1
 * as the extra argument of the partial ("till") loads.
 */
static void
simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src1, npy_intp ssrc1,
                  const npyv_lanetype_@sfx@ *src2, npy_intp ssrc2,
                  npyv_lanetype_@sfx@ *dst, npy_intp sdst, npy_intp len)
{
    const int vstep = npyv_nlanes_@sfx@;
    for (; len > 0; len -= vstep, src1 += ssrc1*vstep, src2 += ssrc2*vstep, dst += sdst*vstep) {
        npyv_@sfx@ x1;
        if (ssrc1 == 1) {
            x1 = npyv_load_till_@sfx@(src1, len, 1);
        }
        else {
            x1 = npyv_loadn_till_@sfx@(src1, ssrc1, len, 1);
        }

        npyv_@sfx@ x2;
        if (ssrc2 == 1) {
            x2 = npyv_load_till_@sfx@(src2, len, 1);
        }
        else {
            x2 = npyv_loadn_till_@sfx@(src2, ssrc2, len, 1);
        }

        npyv_@sfx@ out = __svml_@func@@func_suffix@(x1, x2);
        if (sdst == 1) {
            npyv_store_till_@sfx@(dst, len, out);
        }
        else {
            npyv_storen_till_@sfx@(dst, sdst, len, out);
        }
    }
    // Same post-loop cleanup as the unary kernels in this file.
    npyv_cleanup();
}
/**end repeat1**/
/**end repeat**/
#endif

/**begin repeat
 * #TYPE = DOUBLE, FLOAT#
 * #type = npy_double, npy_float#
 * #vsub = , f#
 * #sfx = f64, f32#
 */
/**begin repeat1
 * #func = exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
 * #intrin = exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
 */
/*
 * Unary inner loop: args[0] is the input, args[1] the output, dimensions[0]
 * the element count, and steps[] the per-operand strides (divided by the
 * element size before being handed to the SIMD kernel).
 *
 * The SIMD kernel is used only when it is compiled in, is_mem_overlap()
 * reports no overlap between input and output, and both strides pass the
 * loadable/storable stride checks. Otherwise every element is computed with
 * the scalar npy_math routine.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
    const @type@ *src = (@type@*)args[0];
          @type@ *dst = (@type@*)args[1];
    const npy_intp len = dimensions[0];

    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
        npyv_loadable_stride_@sfx@(steps[0]) &&
        npyv_storable_stride_@sfx@(steps[1]))
    {
        const npy_intp ssrc = steps[0] / sizeof(@type@);
        const npy_intp sdst = steps[1] / sizeof(@type@);
        simd_@intrin@_@sfx@(src, ssrc, dst, sdst, len);
        return;
    }
#endif
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *(@type@ *)op1 = npy_@intrin@@vsub@(in1);
    }
}
/**end repeat1**/
/**end repeat**/

/**begin repeat
 * #TYPE = DOUBLE, FLOAT#
 * #type = npy_double, npy_float#
 * #vsub = , f#
 * #sfx = f64, f32#
 * #sqrt = sqrt, sqrtf#
 */
/**begin repeat1
 * #func = power#
 * #intrin = pow#
 */
/*
 * Binary inner loop for power: args[0] is the base, args[1] the exponent,
 * args[2] the output.
 *
 * Three paths, tried in order:
 *   1. Exponent stride of zero (one exponent shared by every element): the
 *      exponents -1, 0, 0.5, 1 and 2 are replaced by a reciprocal, the
 *      constant 1, a square root, a copy and a multiplication respectively.
 *      Any other exponent leaves the loop on its first iteration and falls
 *      through. With zero elements the loop body never runs and the function
 *      returns immediately.
 *   2. The SVML kernel, when compiled in and the overlap/stride checks pass.
 *   3. The scalar npy_math routine for each element.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    int stride_zero = steps[1]==0;
    if (stride_zero) {
        // The loop variables are declared inside this scope on purpose:
        // BINARY_LOOP further down brings in its own set.
        BINARY_DEFS
        const @type@ in2 = *(@type@ *)ip2;
        int fastop_found = 1;
        BINARY_LOOP_SLIDING {
            const @type@ in1 = *(@type@ *)ip1;
            if (in2 == -1.0) {
                *(@type@ *)op1 = 1.0 / in1;
            }
            else if (in2 == 0.0) {
                *(@type@ *)op1 = 1.0;
            }
            else if (in2 == 0.5) {
                *(@type@ *)op1 = @sqrt@(in1);
            }
            else if (in2 == 1.0) {
                *(@type@ *)op1 = in1;
            }
            else if (in2 == 2.0) {
                *(@type@ *)op1 = in1 * in1;
            }
            else {
                // No shortcut for this exponent; nothing has been written yet
                // because the test fails on the very first element.
                fastop_found = 0;
                break;
            }
        }
        if (fastop_found) {
            return;
        }
    }
#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
    const @type@ *src1 = (@type@*)args[0];
    const @type@ *src2 = (@type@*)args[1];
          @type@ *dst  = (@type@*)args[2];
    const npy_intp len = dimensions[0];

    if (!is_mem_overlap(src1, steps[0], dst, steps[2], len) &&
        !is_mem_overlap(src2, steps[1], dst, steps[2], len) &&
        npyv_loadable_stride_@sfx@(steps[0]) &&
        npyv_loadable_stride_@sfx@(steps[1]) &&
        npyv_storable_stride_@sfx@(steps[2])
    ) {
        const npy_intp ssrc1 = steps[0] / sizeof(@type@);
        const npy_intp ssrc2 = steps[1] / sizeof(@type@);
        const npy_intp sdst  = steps[2] / sizeof(@type@);

        simd_@intrin@_@sfx@(src1, ssrc1, src2, ssrc2, dst, sdst, len);
        return;
    }
#endif
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *(@type@ *)op1 = npy_@intrin@@vsub@(in1, in2);
    }
}
/**end repeat1**/

/**begin repeat1
 * #func = arctan2#
 * #intrin = atan2#
 */
/*
 * Binary inner loop for arctan2: args[0] and args[1] are the two inputs,
 * args[2] the output.
 *
 * Uses the SVML kernel when it is compiled in, is_mem_overlap() reports no
 * overlap between either input and the output, and all three strides pass
 * the loadable/storable stride checks; otherwise computes each element with
 * the scalar npy_math routine.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
    const @type@ *src1 = (@type@*)args[0];
    const @type@ *src2 = (@type@*)args[1];
          @type@ *dst  = (@type@*)args[2];
    const npy_intp len = dimensions[0];

    if (!is_mem_overlap(src1, steps[0], dst, steps[2], len) &&
        !is_mem_overlap(src2, steps[1], dst, steps[2], len) &&
        npyv_loadable_stride_@sfx@(steps[0]) &&
        npyv_loadable_stride_@sfx@(steps[1]) &&
        npyv_storable_stride_@sfx@(steps[2])
    ) {
        const npy_intp ssrc1 = steps[0] / sizeof(@type@);
        const npy_intp ssrc2 = steps[1] / sizeof(@type@);
        const npy_intp sdst  = steps[2] / sizeof(@type@);

        simd_@intrin@_@sfx@(src1, ssrc1, src2, ssrc2, dst, sdst, len);
        return;
    }
#endif
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *(@type@ *)op1 = npy_@intrin@@vsub@(in1, in2);
    }
}
/**end repeat1**/
/**end repeat**/