#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include <float.h>

#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "npy_svml.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"
#include "npy_simd_data.h"

// TODO: tweak & replace raw SIMD with NPYV

/********************************************************************************
 ** Helper functions used in ISA_exp/log_FLOAT
 ********************************************************************************/
#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F)
    /**
     * MSVC's aggressive optimization somehow leads to spurious
     * 'RuntimeWarning: overflow encountered in exp' being raised.
     *
     * The issue is mainly caused by '_mm512_maskz_loadu_ps'; we need to
     * investigate it while moving to NPYV.
     */
    #define SIMD_AVX512F
#elif defined(NPY_HAVE_AVX2) && defined(NPY_HAVE_FMA3)
    #define SIMD_AVX2_FMA3
#endif

#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512_SKX)
    #define SIMD_AVX512_SKX
#endif

#if defined(SIMD_AVX512F) && !(defined(__clang__) && (__clang_major__ < 10 || \
    (__clang_major__ == 10 && __clang_minor__ < 1)))
    #define SIMD_AVX512F_NOCLANG_BUG
#endif

#ifdef SIMD_AVX2_FMA3
NPY_FINLINE __m256
fma_get_full_load_mask_ps(void)
{
    return _mm256_set1_ps(-1.0);
}

NPY_FINLINE __m256i
fma_get_full_load_mask_pd(void)
{
    return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
}

NPY_FINLINE __m256
fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
{
    float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
                          1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
    float* addr = maskint + num_lanes - num_elem;
    return _mm256_loadu_ps(addr);
}

NPY_FINLINE __m256i
fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
{
    npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
    npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
    return _mm256_loadu_si256((__m256i*) addr);
}

NPY_FINLINE __m256
fma_masked_gather_ps(__m256 src,
                     npy_float* addr,
                     __m256i vindex,
                     __m256 mask)
{
    return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}

NPY_FINLINE __m256d
fma_masked_gather_pd(__m256d src,
                     npy_double* addr,
                     __m128i vindex,
                     __m256d mask)
{
    return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
}

NPY_FINLINE __m256
fma_masked_load_ps(__m256 mask, npy_float* addr)
{
    return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}

NPY_FINLINE __m256d
fma_masked_load_pd(__m256i mask, npy_double* addr)
{
    return _mm256_maskload_pd(addr, mask);
}

NPY_FINLINE __m256
fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
{
    return _mm256_blendv_ps(x, val, mask);
}

NPY_FINLINE __m256d
fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
{
    return _mm256_blendv_pd(x, val, mask);
}

NPY_FINLINE __m256
fma_blend(__m256 x, __m256 y, __m256 ymask)
{
    return _mm256_blendv_ps(x, y, ymask);
}

NPY_FINLINE __m256
fma_get_exponent(__m256 x)
{
    /*
     * Special handling of denormals:
     * 1) Multiply denormal elements with 2**100 (0x71800000)
     * 2) Get the 8 bits of unbiased exponent
     * 3) Subtract 100 from exponent of denormals
     */
    __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
    __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
    __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ);

    /*
     * The volatile is probably unnecessary now since we compile clang with
     * `-ftrapping-math`: https://github.com/numpy/numpy/issues/18005
     */
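    /*
     * Zero the normal lanes before the multiply below so that only the
     * denormal lanes are scaled by 2**100; the scaled values are then
     * blended back into x for the denormal lanes only.
     */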
volatile __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask); __m256 temp = _mm256_mul_ps(temp1, two_power_100); x = _mm256_blendv_ps(x, temp, denormal_mask); __m256 exp = _mm256_cvtepi32_ps( _mm256_sub_epi32( _mm256_srli_epi32( _mm256_castps_si256(x), 23),_mm256_set1_epi32(0x7E))); __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f)); return _mm256_blendv_ps(exp, denorm_exp, denormal_mask); } NPY_FINLINE __m256 fma_get_mantissa(__m256 x) { /* * Special handling of denormals: * 1) Multiply denormal elements with 2**100 (0x71800000) * 2) Get the 23 bits of mantissa * 3) Mantissa for denormals is not affected by the multiplication */ __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000)); __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ); __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ); /* * The volatile is probably unnecessary now since we compile clang with * `-ftrapping-math`: https://github.com/numpy/numpy/issues/18005 */ volatile __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask); __m256 temp = _mm256_mul_ps(temp1, two_power_100); x = _mm256_blendv_ps(x, temp, denormal_mask); __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff); __m256i exp_126_bits = _mm256_set1_epi32(126 << 23); return _mm256_castsi256_ps( _mm256_or_si256( _mm256_and_si256( _mm256_castps_si256(x), mantissa_bits), exp_126_bits)); } NPY_FINLINE __m256 fma_scalef_ps(__m256 poly, __m256 quadrant) { /* * Handle denormals (which occur when quadrant <= -125): * 1) This function computes poly*(2^quad) by adding the exponent of poly to quad * 2) When quad <= -125, the output is a denormal and the above logic breaks down * 3) To handle such cases, we split quadrant: -125 + (quadrant + 125) * 4) poly*(2^-125) is computed the usual way * 5) 2^(quad-125) can be computed by: 2 << abs(quad-125) * 6) The final div operation generates the denormal */ __m256 minquadrant = _mm256_set1_ps(-125.0f); __m256 denormal_mask = _mm256_cmp_ps(quadrant, minquadrant, _CMP_LE_OQ); if (_mm256_movemask_ps(denormal_mask) != 0x0000) { __m256 quad_diff = _mm256_sub_ps(quadrant, minquadrant); quad_diff = _mm256_sub_ps(_mm256_setzero_ps(), quad_diff); quad_diff = _mm256_blendv_ps(_mm256_setzero_ps(), quad_diff, denormal_mask); __m256i two_power_diff = _mm256_sllv_epi32( _mm256_set1_epi32(1), _mm256_cvtps_epi32(quad_diff)); quadrant = _mm256_max_ps(quadrant, minquadrant); //keep quadrant >= -126 __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23); poly = _mm256_castsi256_ps( _mm256_add_epi32( _mm256_castps_si256(poly), exponent)); __m256 denorm_poly = _mm256_div_ps(poly, _mm256_cvtepi32_ps(two_power_diff)); return _mm256_blendv_ps(poly, denorm_poly, denormal_mask); } else { __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23); poly = _mm256_castsi256_ps( _mm256_add_epi32( _mm256_castps_si256(poly), exponent)); return poly; } } #endif // SIMD_AVX2_FMA3 #ifdef SIMD_AVX512F NPY_FINLINE __mmask16 avx512_get_full_load_mask_ps(void) { return 0xFFFF; } NPY_FINLINE __mmask8 avx512_get_full_load_mask_pd(void) { return 0xFF; } NPY_FINLINE __mmask16 avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem) { return (0x0001 << num_elem) - 0x0001; } NPY_FINLINE __mmask8 avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem) { return (0x01 << num_elem) - 0x01; } NPY_FINLINE __m512 avx512_masked_gather_ps(__m512 src, npy_float* addr, __m512i vindex, __mmask16 kmask) { 
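    /* Gather one float per lane from addr + 4*vindex[i]; lanes whose kmask
     * bit is clear keep the corresponding value from src. */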
return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4); } NPY_FINLINE __m512d avx512_masked_gather_pd(__m512d src, npy_double* addr, __m256i vindex, __mmask8 kmask) { return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8); } NPY_FINLINE __m512 avx512_masked_load_ps(__mmask16 mask, npy_float* addr) { return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); } NPY_FINLINE __m512d avx512_masked_load_pd(__mmask8 mask, npy_double* addr) { return _mm512_maskz_loadu_pd(mask, (__m512d *)addr); } NPY_FINLINE __m512 avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask) { return _mm512_mask_blend_ps(mask, x, val); } NPY_FINLINE __m512d avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask) { return _mm512_mask_blend_pd(mask, x, val); } NPY_FINLINE __m512 avx512_blend(__m512 x, __m512 y, __mmask16 ymask) { return _mm512_mask_mov_ps(x, ymask, y); } NPY_FINLINE __m512 avx512_get_exponent(__m512 x) { return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f)); } NPY_FINLINE __m512 avx512_get_mantissa(__m512 x) { return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); } NPY_FINLINE __m512 avx512_scalef_ps(__m512 poly, __m512 quadrant) { return _mm512_scalef_ps(poly, quadrant); } NPY_FINLINE __m512d avx512_permute_x4var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3, __m512i index) { __mmask8 lut_mask = _mm512_cmp_epi64_mask( _mm512_and_epi64(_mm512_set1_epi64(0x10ULL), index), _mm512_set1_epi64(0), _MM_CMPINT_GT); __m512d res1 = _mm512_permutex2var_pd(t0, index, t1); __m512d res2 = _mm512_permutex2var_pd(t2, index, t3); return _mm512_mask_blend_pd(lut_mask, res1, res2); } NPY_FINLINE __m512d avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3, __m512d t4, __m512d t5, __m512d t6, __m512d t7, __m512i index) { __mmask8 lut_mask = _mm512_cmp_epi64_mask( _mm512_and_epi64(_mm512_set1_epi64(0x20ULL), index), _mm512_set1_epi64(0), _MM_CMPINT_GT); __m512d res1 = avx512_permute_x4var_pd(t0, t1, t2, t3, index); __m512d res2 = avx512_permute_x4var_pd(t4, t5, t6, t7, index); return _mm512_mask_blend_pd(lut_mask, res1, res2); } #endif // SIMD_AVX512F /******************************************************************************** ** Defining the SIMD kernels ********************************************************************************/ /**begin repeat * #ISA = FMA, AVX512F# * #isa = fma, avx512# * #vtype = __m256, __m512# * #vsize = 256, 512# * #BYTES = 32, 64# * #NUM_LANES = 8, 16# * #mask = __m256, __mmask16# * #vsub = , _mask# * #or_masks =_mm256_or_ps, _mm512_kor# * #and_masks =_mm256_and_ps, _mm512_kand# * #xor_masks =_mm256_xor_ps, _mm512_kxor# * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps# * #mask_to_int = _mm256_movemask_ps, npyv_tobits_b32# * #full_mask= 0xFF, 0xFFFF# * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps# * #cvtps_epi32 = _mm256_cvtps_epi32, # * #CHK = SIMD_AVX2_FMA3, SIMD_AVX512F# */ #ifdef @CHK@ /* * Vectorized Cody-Waite range reduction technique * Performs the reduction step x* = x - y*C in three steps: * 1) x* = x - y*c1 * 2) x* = x - y*c2 * 3) x* = x - y*c3 * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision */ NPY_FINLINE @vtype@ simd_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3) { @vtype@ reduced_x = @fmadd@(y, c1, x); reduced_x = @fmadd@(y, c2, reduced_x); reduced_x = @fmadd@(y, c3, reduced_x); return reduced_x; } /* * Vectorized implementation of exp using AVX2 and AVX512: * 1) if x >= xmax; return INF (overflow) * 2) if x <= xmin; return 0.0f (underflow) 
* 3) Range reduction (using Coyd-Waite): * a) y = x - k*ln(2); k = rint(x/ln(2)); y \in [0, ln(2)] * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q * b) P = 5th order and Q = 2nd order polynomials obtained from Remez's * algorithm (mini-max polynomial approximation) * 5) Compute exp(x) = exp(y) * 2^k * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37) * 7) Max relative error measured across all 32-bit FP's= 2.1264E-07 (for the * same x = 0xc2781e37) */ static void simd_exp_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size, const npy_intp steps) { const npy_intp stride = steps/(npy_intp)sizeof(npy_float); const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float); npy_float xmax = 88.72283935546875f; npy_float xmin = -103.97208404541015625f; /* * Note: while generally indices are npy_intp, we ensure that our maximum index * will fit in an int32 as a precondition for this function via * IS_OUTPUT_BLOCKABLE_UNARY */ npy_int32 indexarr[16]; for (npy_int32 ii = 0; ii < 16; ii++) { indexarr[ii] = ii*stride; } /* Load up frequently used constants */ @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf); @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf); @vtype@ exp_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_EXPf); @vtype@ exp_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_EXPf); @vtype@ exp_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_EXPf); @vtype@ exp_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_EXPf); @vtype@ exp_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_EXPf); @vtype@ exp_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_EXPf); @vtype@ exp_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_EXPf); @vtype@ exp_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_EXPf); @vtype@ exp_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_EXPf); @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf); @vtype@ log2e = _mm@vsize@_set1_ps(NPY_LOG2Ef); @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF); @vtype@ ninf = _mm@vsize@_set1_ps(-1*NPY_INFINITYF); @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f); @vtype@ poly, num_poly, denom_poly, quadrant; @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]); @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask, ninf_mask; @mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes); @mask@ underflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes); @mask@ load_mask = @isa@_get_full_load_mask_ps(); npy_intp num_remaining_elements = array_size; while (num_remaining_elements > 0) { if (num_remaining_elements < num_lanes) { load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, num_lanes); } @vtype@ x; if (stride == 1) { x = @isa@_masked_load_ps(load_mask, ip); } else { x = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask); } nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ); x = @isa@_set_masked_lanes_ps(x, zeros_f, nan_mask); xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ); xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ); inf_mask = _mm@vsize@_cmp_ps@vsub@(x, inf, _CMP_EQ_OQ); ninf_mask = _mm@vsize@_cmp_ps@vsub@(x, ninf, _CMP_EQ_OQ); overflow_mask = @or_masks@(overflow_mask, @xor_masks@(xmax_mask, inf_mask)); underflow_mask = @or_masks@(underflow_mask, @xor_masks@(xmin_mask, ninf_mask)); x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@( @or_masks@(nan_mask, xmin_mask), xmax_mask)); quadrant = _mm@vsize@_mul_ps(x, log2e); /* round to nearest */ quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic); quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic); /* Cody-Waite's range reduction 
algorithm */ x = simd_range_reduction(x, quadrant, codyw_c1, codyw_c2, zeros_f); num_poly = @fmadd@(exp_p5, x, exp_p4); num_poly = @fmadd@(num_poly, x, exp_p3); num_poly = @fmadd@(num_poly, x, exp_p2); num_poly = @fmadd@(num_poly, x, exp_p1); num_poly = @fmadd@(num_poly, x, exp_p0); denom_poly = @fmadd@(exp_q2, x, exp_q1); denom_poly = @fmadd@(denom_poly, x, exp_q0); poly = _mm@vsize@_div_ps(num_poly, denom_poly); /* * compute val = poly * 2^quadrant; which is same as adding the * exponent of quadrant to the exponent of poly. quadrant is an int, * so extracting exponent is simply extracting 8 bits. */ poly = @isa@_scalef_ps(poly, quadrant); /* * elem > xmax; return inf * elem < xmin; return 0.0f * elem = +/- nan, return nan */ poly = @isa@_set_masked_lanes_ps(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); poly = @isa@_set_masked_lanes_ps(poly, inf, xmax_mask); poly = @isa@_set_masked_lanes_ps(poly, zeros_f, xmin_mask); @masked_store@(op, @cvtps_epi32@(load_mask), poly); ip += num_lanes*stride; op += num_lanes; num_remaining_elements -= num_lanes; } if (@mask_to_int@(overflow_mask)) { npy_set_floatstatus_overflow(); } if (@mask_to_int@(underflow_mask)) { npy_set_floatstatus_underflow(); } } /* * Vectorized implementation of log using AVX2 and AVX512 * 1) if x < 0.0f; return -NAN (invalid input) * 2) Range reduction: y = x/2^k; * a) y = normalized mantissa, k is the exponent (0.5 <= y < 1) * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q * b) P = 5th order and Q = 5th order polynomials obtained from Remez's * algorithm (mini-max polynomial approximation) * 5) Compute log(x) = log(y) + k*ln(2) * 6) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945) * 7) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same * x = 0x3f486945) */ static void simd_log_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size, const npy_intp steps) { const npy_intp stride = steps/(npy_intp)sizeof(npy_float); const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float); /* * Note: while generally indices are npy_intp, we ensure that our maximum index * will fit in an int32 as a precondition for this function via * IS_OUTPUT_BLOCKABLE_UNARY */ npy_int32 indexarr[16]; for (npy_int32 ii = 0; ii < 16; ii++) { indexarr[ii] = ii*stride; } /* Load up frequently used constants */ @vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf); @vtype@ log_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_LOGf); @vtype@ log_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_LOGf); @vtype@ log_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_LOGf); @vtype@ log_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_LOGf); @vtype@ log_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_LOGf); @vtype@ log_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_LOGf); @vtype@ log_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_LOGf); @vtype@ log_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_LOGf); @vtype@ log_q3 = _mm@vsize@_set1_ps(NPY_COEFF_Q3_LOGf); @vtype@ log_q4 = _mm@vsize@_set1_ps(NPY_COEFF_Q4_LOGf); @vtype@ log_q5 = _mm@vsize@_set1_ps(NPY_COEFF_Q5_LOGf); @vtype@ loge2 = _mm@vsize@_set1_ps(NPY_LOGE2f); @vtype@ nan = _mm@vsize@_set1_ps(NPY_NANF); @vtype@ neg_nan = _mm@vsize@_set1_ps(-NPY_NANF); @vtype@ neg_inf = _mm@vsize@_set1_ps(-NPY_INFINITYF); @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF); @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f); @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f); @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)indexarr); @vtype@ poly, num_poly, denom_poly, exponent; @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask; @mask@ invalid_mask = 
@isa@_get_partial_load_mask_ps(0, num_lanes); @mask@ divide_by_zero_mask = invalid_mask; @mask@ load_mask = @isa@_get_full_load_mask_ps(); npy_intp num_remaining_elements = array_size; while (num_remaining_elements > 0) { if (num_remaining_elements < num_lanes) { load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, num_lanes); } @vtype@ x_in; if (stride == 1) { x_in = @isa@_masked_load_ps(load_mask, ip); } else { x_in = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask); } negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ); zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ); inf_mask = _mm@vsize@_cmp_ps@vsub@(x_in, inf, _CMP_EQ_OQ); nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, x_in, _CMP_NEQ_UQ); divide_by_zero_mask = @or_masks@(divide_by_zero_mask, @and_masks@(zero_mask, load_mask)); invalid_mask = @or_masks@(invalid_mask, negx_mask); @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zeros_f, negx_mask); /* set x = normalized mantissa */ exponent = @isa@_get_exponent(x); x = @isa@_get_mantissa(x); /* if x < sqrt(2) {exp = exp-1; x = 2*x} */ sqrt2_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(NPY_SQRT1_2f), _CMP_LE_OQ); x = @isa@_blend(x, _mm@vsize@_add_ps(x,x), sqrt2_mask); exponent = @isa@_blend(exponent, _mm@vsize@_sub_ps(exponent,ones_f), sqrt2_mask); /* x = x - 1 */ x = _mm@vsize@_sub_ps(x, ones_f); /* Polynomial approximation for log(1+x) */ num_poly = @fmadd@(log_p5, x, log_p4); num_poly = @fmadd@(num_poly, x, log_p3); num_poly = @fmadd@(num_poly, x, log_p2); num_poly = @fmadd@(num_poly, x, log_p1); num_poly = @fmadd@(num_poly, x, log_p0); denom_poly = @fmadd@(log_q5, x, log_q4); denom_poly = @fmadd@(denom_poly, x, log_q3); denom_poly = @fmadd@(denom_poly, x, log_q2); denom_poly = @fmadd@(denom_poly, x, log_q1); denom_poly = @fmadd@(denom_poly, x, log_q0); poly = _mm@vsize@_div_ps(num_poly, denom_poly); poly = @fmadd@(exponent, loge2, poly); /* * x < 0.0f; return -NAN * x = +/- NAN; return NAN * x = 0.0f; return -INF */ poly = @isa@_set_masked_lanes_ps(poly, nan, nan_mask); poly = @isa@_set_masked_lanes_ps(poly, neg_nan, negx_mask); poly = @isa@_set_masked_lanes_ps(poly, neg_inf, zero_mask); poly = @isa@_set_masked_lanes_ps(poly, inf, inf_mask); @masked_store@(op, @cvtps_epi32@(load_mask), poly); ip += num_lanes*stride; op += num_lanes; num_remaining_elements -= num_lanes; } if (@mask_to_int@(invalid_mask)) { npy_set_floatstatus_invalid(); } if (@mask_to_int@(divide_by_zero_mask)) { npy_set_floatstatus_divbyzero(); } } #endif // @CHK@ /**end repeat**/ #if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) /**begin repeat * #func = exp, log# * #default_val = 0, 1# */ static void simd_@func@_f64(const npyv_lanetype_f64 *src, npy_intp ssrc, npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len) { const int vstep = npyv_nlanes_f64; for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { npyv_f64 x; #if @default_val@ if (ssrc == 1) { x = npyv_load_till_f64(src, len, @default_val@); } else { x = npyv_loadn_till_f64(src, ssrc, len, @default_val@); } #else if (ssrc == 1) { x = npyv_load_tillz_f64(src, len); } else { x = npyv_loadn_tillz_f64(src, ssrc, len); } #endif npyv_f64 out = __svml_@func@8_ha(x); if (sdst == 1) { npyv_store_till_f64(dst, len, out); } else { npyv_storen_till_f64(dst, sdst, len, out); } } npyv_cleanup(); } /**end repeat**/ #else #ifdef SIMD_AVX512F_NOCLANG_BUG /* * Vectorized implementation of exp double using AVX512 * Reference: Tang, P.T.P., "Table-driven implementation of the * exponential 
function in IEEE floating-point * arithmetic," ACM Transactions on Mathematical * Software, vol. 15, pp. 144-157, 1989. * 1) if x > mTH_max or x is INF; return INF (overflow) * 2) if x < mTH_min; return 0.0f (underflow) * 3) if abs(x) < mTH_nearzero; return 1.0f + x * 4) if x is Nan; return Nan * 5) Range reduction: * x = (32m + j)ln2 / 32 + r; r in [-ln2/64, ln2/64] * 6) exp(r) - 1 is approximated by a polynomial function p(r) * exp(x) = 2^m(2^(j/32) + 2^(j/32)p(r)); */ static void AVX512F_exp_DOUBLE(npy_double * op, npy_double * ip, const npy_intp array_size, const npy_intp steps) { npy_intp num_remaining_elements = array_size; const npy_intp stride = steps / (npy_intp)sizeof(npy_double); const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double); npy_int32 indexarr[8]; for (npy_int32 ii = 0; ii < 8; ii++) { indexarr[ii] = ii*stride; } __m512d InvLn2N = _mm512_set1_pd(NPY_INV_LN2_MUL_32); __m512d mShift = _mm512_set1_pd(NPY_RINT_CVT_MAGIC); __m512d mNegL1 = _mm512_set1_pd(NPY_TANG_NEG_L1); __m512d mNegL2 = _mm512_set1_pd(NPY_TANG_NEG_L2); __m512i mMod = _mm512_set1_epi64(0x1f); __m512d mA1 = _mm512_set1_pd(NPY_TANG_A1); __m512d mA2 = _mm512_set1_pd(NPY_TANG_A2); __m512d mA3 = _mm512_set1_pd(NPY_TANG_A3); __m512d mA4 = _mm512_set1_pd(NPY_TANG_A4); __m512d mA5 = _mm512_set1_pd(NPY_TANG_A5); __m512d mTH_nearzero = _mm512_set1_pd(0x1p-54); __m512d mTH_max = _mm512_set1_pd(0x1.62e42fefa39efp+9); __m512d mTH_min = _mm512_set1_pd(-0x1.74910d52d3053p+9); __m512d mTH_inf = _mm512_set1_pd(NPY_INFINITY); __m512d mTH_ninf = _mm512_set1_pd(-NPY_INFINITY); __m512d zeros_d = _mm512_set1_pd(0.0f); __m512d ones_d = _mm512_set1_pd(1.0f); __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]); __m512d mTable_top_0 = _mm512_loadu_pd(&(EXP_Table_top[8*0])); __m512d mTable_top_1 = _mm512_loadu_pd(&(EXP_Table_top[8*1])); __m512d mTable_top_2 = _mm512_loadu_pd(&(EXP_Table_top[8*2])); __m512d mTable_top_3 = _mm512_loadu_pd(&(EXP_Table_top[8*3])); __m512d mTable_tail_0 = _mm512_loadu_pd(&(EXP_Table_tail[8*0])); __m512d mTable_tail_1 = _mm512_loadu_pd(&(EXP_Table_tail[8*1])); __m512d mTable_tail_2 = _mm512_loadu_pd(&(EXP_Table_tail[8*2])); __m512d mTable_tail_3 = _mm512_loadu_pd(&(EXP_Table_tail[8*3])); __mmask8 overflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes); __mmask8 underflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes); __mmask8 load_mask = avx512_get_full_load_mask_pd(); __mmask8 xmin_mask, xmax_mask, inf_mask, ninf_mask, nan_mask, nearzero_mask; while (num_remaining_elements > 0) { if (num_remaining_elements < num_lanes) { load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, num_lanes); } __m512d x; if (1 == stride) { x = avx512_masked_load_pd(load_mask, ip); } else { x = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask); } nan_mask = _mm512_cmp_pd_mask(x, x, _CMP_NEQ_UQ); x = avx512_set_masked_lanes_pd(x, zeros_d, nan_mask); xmax_mask = _mm512_cmp_pd_mask(x, mTH_max, _CMP_GT_OQ); xmin_mask = _mm512_cmp_pd_mask(x, mTH_min, _CMP_LT_OQ); inf_mask = _mm512_cmp_pd_mask(x, mTH_inf, _CMP_EQ_OQ); ninf_mask = _mm512_cmp_pd_mask(x, mTH_ninf, _CMP_EQ_OQ); __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x), _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF)); nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs), mTH_nearzero, _CMP_LT_OQ); nearzero_mask = _mm512_kxor(nearzero_mask, nan_mask); overflow_mask = _mm512_kor(overflow_mask, _mm512_kxor(xmax_mask, inf_mask)); underflow_mask = _mm512_kor(underflow_mask, _mm512_kxor(xmin_mask, ninf_mask)); x = 
avx512_set_masked_lanes_pd(x, zeros_d, _mm512_kor(_mm512_kor(nan_mask, xmin_mask), _mm512_kor(xmax_mask, nearzero_mask))); /* z = x * 32/ln2 */ __m512d z = _mm512_mul_pd(x, InvLn2N); /* round to nearest */ __m512d kd = _mm512_add_pd(z, mShift); __m512i ki = _mm512_castpd_si512(kd); kd = _mm512_sub_pd(kd, mShift); /* r = (x + kd*mNegL1) + kd*mNegL2 */ __m512d r1 = _mm512_fmadd_pd(kd, mNegL1, x); __m512d r2 = _mm512_mul_pd(kd, mNegL2); __m512d r = _mm512_add_pd(r1,r2); /* Polynomial approximation for exp(r) - 1 */ __m512d q = _mm512_fmadd_pd(mA5, r, mA4); q = _mm512_fmadd_pd(q, r, mA3); q = _mm512_fmadd_pd(q, r, mA2); q = _mm512_fmadd_pd(q, r, mA1); q = _mm512_mul_pd(q, r); __m512d p = _mm512_fmadd_pd(r, q, r2); p = _mm512_add_pd(r1, p); /* Get 2^(j/32) from lookup table */ __m512i j = _mm512_and_epi64(ki, mMod); __m512d top = avx512_permute_x4var_pd(mTable_top_0, mTable_top_1, mTable_top_2, mTable_top_3, j); __m512d tail = avx512_permute_x4var_pd(mTable_tail_0, mTable_tail_1, mTable_tail_2, mTable_tail_3, j); /* * s = top + tail; * exp(x) = 2^m * (top + (tail + s * p)); */ __m512d s = _mm512_add_pd(top, tail); __m512d res = _mm512_fmadd_pd(s, p, tail); res = _mm512_add_pd(res, top); res= _mm512_scalef_pd(res, _mm512_div_pd(kd, _mm512_set1_pd(32))); /* return special cases */ res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d), nearzero_mask); res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN), nan_mask); res = avx512_set_masked_lanes_pd(res, mTH_inf, xmax_mask); res = avx512_set_masked_lanes_pd(res, zeros_d, xmin_mask); _mm512_mask_storeu_pd(op, load_mask, res); ip += num_lanes * stride; op += num_lanes; num_remaining_elements -= num_lanes; } /* * Don't count on the compiler for cast between mask and int registers. * On gcc7 with flags -march>=nocona -O3 can cause FP stack overflow * which may lead to putting NaN into certain HW/FP calculations. * * For more details, please check the comments in: * - https://github.com/numpy/numpy/issues/20356 */ if (npyv_tobits_b64(overflow_mask)) { npy_set_floatstatus_overflow(); } if (npyv_tobits_b64(underflow_mask)) { npy_set_floatstatus_underflow(); } } /* * Vectorized implementation of log double using AVX512 * Reference: * [1] Tang, Ping Tak Peter. Table-lookup algorithms for elementary functions * and their error analysis. No. CONF-9106103-1. Argonne National Lab., * IL (USA), 1991. * [2] Tang, Ping-Tak Peter. "Table-driven implementation of the logarithm * function in IEEE floating-point arithmetic." ACM Transactions on * Mathematical Software (TOMS) 16.4 (1990): 378-400. * [3] Muller, Jean-Michel. "Elementary functions: algorithms and * implementation." (2016). * 1) if x = 0; return -INF * 2) if x < 0; return NAN * 3) if x is INF; return INF * 4) if x is NAN; return NAN * 5) if x on (1.0 - 0x1p-4, 1.0 + 0x1.09p-4), calling npy_log() * 6) Range reduction: * log(x) = log(2^m * z) * = mln2 + log(z) * 7) log(z) = log(z / c_k) + log(c_k); * where c_k = 1 + k/64, k = 0,1,...,64 * s.t. |x - c_k| <= 1/128 when x on[1,2]. * 8) r = 2(x - c_k)/(x + c_k) * log(x/c_k) = log((1 + r/2) / (1 - r/2)) * = p(r) * = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...) */ /* LLVM has a bug where AVX-512F intrinsic `_mm512_mask_mul_pd` emits an * unmasked operation with a masked store. This can cause FP exceptions to * occur for the lanes that are suppose to have been masked. * * See https://bugs.llvm.org/show_bug.cgi?id=51988 * * Note, this affects LLVM based compilers like Apple Clang, Clang, and Intel's * ICX. 
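 *
 * The workaround below zeroes the masked-off lanes of the multiplicand
 * before the masked multiply, so an unmasked multiply emitted by LLVM
 * cannot raise spurious FP exceptions on those lanes.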
*/ #if defined(__clang__) #if defined(__apple_build_version__) // Apple Clang #if __apple_build_version__ > 11000000 // Apple Clang after v11 #define WORKAROUND_LLVM__mm512_mask_mul_pd #endif #else // Clang, not Apple Clang #if __clang_major__ > 9 // Clang v9+ #define WORKAROUND_LLVM__mm512_mask_mul_pd #endif #endif #endif static void AVX512F_log_DOUBLE(npy_double * op, npy_double * ip, const npy_intp array_size, const npy_intp steps) { npy_intp num_remaining_elements = array_size; const npy_intp stride = steps / (npy_intp)sizeof(npy_double); const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double); npy_int32 indexarr[8]; for (npy_int32 ii = 0; ii < 8; ii++) { indexarr[ii] = ii*stride; } __m512d zeros_d = _mm512_set1_pd(0.0f); __m512d ones_d = _mm512_set1_pd(1.0f); __m512d mInf = _mm512_set1_pd(NPY_INFINITY); __m512d mInv64 = _mm512_castsi512_pd(_mm512_set1_epi64(0x3f90000000000000)); __m512d mNeg_nan = _mm512_set1_pd(-NPY_NAN); __m512d mNan = _mm512_set1_pd(NPY_NAN); __m512d mNeg_inf = _mm512_set1_pd(-NPY_INFINITY); __m512d mA1 = _mm512_set1_pd(NPY_TANG_LOG_A1); __m512d mA2 = _mm512_set1_pd(NPY_TANG_LOG_A2); __m512d mA3 = _mm512_set1_pd(NPY_TANG_LOG_A3); __m512d mA4 = _mm512_set1_pd(NPY_TANG_LOG_A4); __m512d mLN2HI = _mm512_set1_pd(NPY_TANG_LOG_LN2HI); __m512d mLN2LO = _mm512_set1_pd(NPY_TANG_LOG_LN2LO); __m512d mTo_glibc_min = _mm512_set1_pd(1.0 - 0x1p-4); __m512d mTo_glibc_max = _mm512_set1_pd(1.0 + 0x1.09p-4); __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]); /* Load lookup table data */ /**begin repeat * #i = 0, 1, 2, 3, 4, 5, 6, 7# */ __m512d mLUT_TOP_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*@i@])); __m512d mLUT_TAIL_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*@i@])); /**end repeat**/ __mmask8 load_mask = avx512_get_full_load_mask_pd(); __mmask8 invalid_mask = avx512_get_partial_load_mask_pd(0, num_lanes); __mmask8 divide_by_zero_mask = invalid_mask; __mmask8 inf_mask, nan_mask, zero_mask, negx_mask, denormal_mask, glibc_mask; __m512d x_in; while (num_remaining_elements > 0) { if (num_remaining_elements < num_lanes) { load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, num_lanes); } if (1 == stride) { x_in = avx512_masked_load_pd(load_mask, ip); } else { x_in = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask); } /* call glibc when x on [1.0 - 0x1p-4, 1.0 + 0x1.09p-4] */ __mmask8 m1 = _mm512_cmp_pd_mask(x_in, mTo_glibc_max, _CMP_LT_OQ); __mmask8 m2 = _mm512_cmp_pd_mask(x_in, mTo_glibc_min, _CMP_GT_OQ); glibc_mask = m1 & m2; if (glibc_mask != 0xFF) { zero_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_EQ_OQ); inf_mask = _mm512_cmp_pd_mask(x_in, mInf, _CMP_EQ_OQ); negx_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_LT_OQ); nan_mask = _mm512_cmp_pd_mask(x_in, x_in, _CMP_NEQ_UQ); divide_by_zero_mask = divide_by_zero_mask | (zero_mask & load_mask); invalid_mask = invalid_mask | negx_mask; __m512d x = avx512_set_masked_lanes_pd(x_in, zeros_d, negx_mask); __m512i ix = _mm512_castpd_si512(x); /* Normalize x when it is denormal */ __m512i top12 = _mm512_and_epi64(ix, _mm512_set1_epi64(0xfff0000000000000)); denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0), _CMP_EQ_OQ); denormal_mask = (~zero_mask) & denormal_mask; __m512d masked_x = x; #ifdef WORKAROUND_LLVM__mm512_mask_mul_pd masked_x = avx512_set_masked_lanes_pd(masked_x, zeros_d, (~denormal_mask)); #endif ix = _mm512_castpd_si512(_mm512_mask_mul_pd(x, denormal_mask, masked_x, _mm512_set1_pd(0x1p52))); ix = _mm512_mask_sub_epi64(ix, denormal_mask, ix, _mm512_set1_epi64(52ULL << 52)); /* * x = 
2^k * z; where z in range [1,2] */ __m512i tmp = _mm512_sub_epi64(ix, _mm512_set1_epi64(0x3ff0000000000000)); __m512i i = _mm512_and_epi64(_mm512_srai_epi64(tmp, 52 - 6), _mm512_set1_epi64(0x3fULL)); __m512i ik = _mm512_srai_epi64(tmp, 52); __m512d z = _mm512_castsi512_pd(_mm512_sub_epi64(ix, _mm512_and_epi64(tmp, _mm512_set1_epi64(0xfff0000000000000)))); /* c = i/64 + 1 */ __m256i i_32 = _mm512_cvtepi64_epi32(i); __m512d c = _mm512_fmadd_pd(_mm512_cvtepi32_pd(i_32), mInv64, ones_d); /* u = 2 * (z - c) / (z + c) */ __m512d u = _mm512_div_pd(_mm512_sub_pd(z, c), _mm512_add_pd(z, c)); u = _mm512_mul_pd(_mm512_set1_pd(2.0), u); /* v = u * u */ __m512d v = _mm512_mul_pd(u,u); /* log(z/c) = u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */ __m512d res = _mm512_fmadd_pd(v, mA4, mA3); res = _mm512_fmadd_pd(v, res, mA2); res = _mm512_fmadd_pd(v, res, mA1); res = _mm512_mul_pd(v, res); res = _mm512_fmadd_pd(u, res, u); /* Load lookup table data */ __m512d c_hi = avx512_permute_x8var_pd(mLUT_TOP_0, mLUT_TOP_1, mLUT_TOP_2, mLUT_TOP_3, mLUT_TOP_4, mLUT_TOP_5, mLUT_TOP_6, mLUT_TOP_7, i); __m512d c_lo = avx512_permute_x8var_pd(mLUT_TAIL_0, mLUT_TAIL_1, mLUT_TAIL_2, mLUT_TAIL_3, mLUT_TAIL_4, mLUT_TAIL_5, mLUT_TAIL_6, mLUT_TAIL_7, i); /* * log(x) = k * ln2_hi + c_hi + * k * ln2_lo + c_lo + * log(z/c) */ __m256i ik_32 = _mm512_cvtepi64_epi32(ik); __m512d k = _mm512_cvtepi32_pd(ik_32); __m512d tt = _mm512_fmadd_pd(k, mLN2HI, c_hi); __m512d tt2 = _mm512_fmadd_pd(k, mLN2LO, c_lo); tt = _mm512_add_pd(tt, tt2); res = _mm512_add_pd(tt, res); /* return special cases */ res = avx512_set_masked_lanes_pd(res, mNan, nan_mask); res = avx512_set_masked_lanes_pd(res, mNeg_nan, negx_mask); res = avx512_set_masked_lanes_pd(res, mNeg_inf, zero_mask); res = avx512_set_masked_lanes_pd(res, mInf, inf_mask); _mm512_mask_storeu_pd(op, load_mask, res); } /* call glibc's log func when x around 1.0f. */ if (glibc_mask != 0) { double NPY_DECL_ALIGNED(64) ip_fback[8]; /* Using a mask_store_pd instead of store_pd to prevent a fatal * compiler optimization bug. 
See * https://github.com/numpy/numpy/issues/27745#issuecomment-2498684564 * for details.*/ _mm512_mask_store_pd(ip_fback, avx512_get_full_load_mask_pd(), x_in); for (int ii = 0; ii < 8; ++ii, glibc_mask >>= 1) { if (glibc_mask & 0x01) { op[ii] = npy_log(ip_fback[ii]); } } } ip += num_lanes * stride; op += num_lanes; num_remaining_elements -= num_lanes; } if (npyv_tobits_b64(invalid_mask)) { npy_set_floatstatus_invalid(); } if (npyv_tobits_b64(divide_by_zero_mask)) { npy_set_floatstatus_divbyzero(); } } #undef WORKAROUND_LLVM__mm512_mask_mul_pd #endif // SIMD_AVX512F_NOCLANG_BUG #endif // NPY_CAN_LINK_SVML #ifdef SIMD_AVX512_SKX /**begin repeat * #type = npy_float, npy_double# * #TYPE = FLOAT, DOUBLE# * #num_lanes = 16, 8# * #vsuffix = ps, pd# * #mask = __mmask16, __mmask8# * #vtype1 = __m512, __m512d# * #vtype2 = __m512i, __m256i# * #scale = 4, 8# * #vindextype = __m512i, __m256i# * #vindexsize = 512, 256# * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# * #vtype2_load = _mm512_maskz_loadu_epi32, _mm256_maskz_loadu_epi32# * #vtype2_gather = _mm512_mask_i32gather_epi32, _mm256_mmask_i32gather_epi32# * #vtype2_store = _mm512_mask_storeu_epi32, _mm256_mask_storeu_epi32# * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32# * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256# */ static inline void AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) { const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@); const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(int); const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@); const npy_intp array_size = dimensions[0]; npy_intp num_remaining_elements = array_size; @type@* ip1 = (@type@*) args[0]; int* ip2 = (int*) args[1]; @type@* op = (@type@*) args[2]; @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); /* * Note: while generally indices are npy_intp, we ensure that our maximum index * will fit in an int32 as a precondition for this function via * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP */ npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@]; for (npy_int32 ii = 0; ii < @num_lanes@; ii++) { index_ip1[ii] = ii*stride_ip1; index_ip2[ii] = ii*stride_ip2; index_op[ii] = ii*stride_op; } @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]); @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]); @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]); @vtype1@ zeros_f = _mm512_setzero_@vsuffix@(); @vtype2@ zeros = @setzero@(); while (num_remaining_elements > 0) { if (num_remaining_elements < @num_lanes@) { load_mask = avx512_get_partial_load_mask_@vsuffix@( num_remaining_elements, @num_lanes@); } @vtype1@ x1; @vtype2@ x2; if (stride_ip1 == 1) { x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); } else { x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask); } if (stride_ip2 == 1) { x2 = @vtype2_load@(load_mask, ip2); } else { x2 = @vtype2_gather@(zeros, load_mask, vindex_ip2, ip2, 4); } @vtype1@ out = _mm512_scalef_@vsuffix@(x1, _mm512_cvtepi32_@vsuffix@(x2)); if (stride_op == 1) { _mm512_mask_storeu_@vsuffix@(op, load_mask, out); } else { /* scatter! 
*/ _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@); } ip1 += @num_lanes@*stride_ip1; ip2 += @num_lanes@*stride_ip2; op += @num_lanes@*stride_op; num_remaining_elements -= @num_lanes@; } } static inline void AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) { const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@); const npy_intp stride_op1 = steps[1]/(npy_intp)sizeof(@type@); const npy_intp stride_op2 = steps[2]/(npy_intp)sizeof(int); const npy_intp array_size = dimensions[0]; npy_intp num_remaining_elements = array_size; @type@* ip1 = (@type@*) args[0]; @type@* op1 = (@type@*) args[1]; int* op2 = (int*) args[2]; @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); /* * Note: while generally indices are npy_intp, we ensure that our maximum index * will fit in an int32 as a precondition for this function via * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP */ npy_int32 index_ip1[@num_lanes@], index_op1[@num_lanes@], index_op2[@num_lanes@]; for (npy_int32 ii = 0; ii < @num_lanes@; ii++) { index_ip1[ii] = ii*stride_ip1; index_op1[ii] = ii*stride_op1; index_op2[ii] = ii*stride_op2; } @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]); @vindextype@ vindex_op1 = @vindexload@((@vindextype@*)&index_op1[0]); @vindextype@ vindex_op2 = @vindexload@((@vindextype@*)&index_op2[0]); @vtype1@ zeros_f = _mm512_setzero_@vsuffix@(); while (num_remaining_elements > 0) { if (num_remaining_elements < @num_lanes@) { load_mask = avx512_get_partial_load_mask_@vsuffix@( num_remaining_elements, @num_lanes@); } @vtype1@ x1; if (stride_ip1 == 1) { x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); } else { x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask); } /* * The x86 instructions vpgetmant and vpgetexp do not conform * with NumPy's output for special floating points: NAN, +/-INF, +/-0.0 * We mask these values with spmask to avoid invalid exceptions. 
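 * (the 0b10011111 immediate selects the QNaN, SNaN, +/-0.0 and +/-INF
 * lanes; spmask is its complement via _mm512_knot)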
*/ @mask@ spmask =_mm512_knot(_mm512_fpclass_@vsuffix@_mask( x1, 0b10011111)); @vtype1@ out1 = _mm512_maskz_getmant_@vsuffix@( spmask, x1, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); out1 = _mm512_mask_mov_@vsuffix@(x1, spmask, out1); @vtype2@ out2 = _mm512_cvt@vsuffix@_epi32( _mm512_maskz_add_@vsuffix@(spmask, _mm512_set1_@vsuffix@(1.0), _mm512_maskz_getexp_@vsuffix@(spmask, x1))); if (stride_op1 == 1) { _mm512_mask_storeu_@vsuffix@(op1, load_mask, out1); } else { _mm512_mask_i32scatter_@vsuffix@(op1, load_mask, vindex_op1, out1, @scale@); } if (stride_op2 == 1) { @vtype2_store@(op2, load_mask, out2); } else { @vtype2_scatter@(op2, load_mask, vindex_op2, out2, 4); } ip1 += @num_lanes@*stride_ip1; op1 += @num_lanes@*stride_op1; op2 += @num_lanes@*stride_op2; num_remaining_elements -= @num_lanes@; } } /**end repeat**/ #endif // SIMD_AVX512_SKX /******************************************************************************** ** Defining ufunc inner functions ********************************************************************************/ /**begin repeat * #func = exp, log# * #scalarf = npy_expf, npy_logf# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { #if defined(SIMD_AVX2_FMA3) || defined(SIMD_AVX512F) // third arg in `IS_OUTPUT_BLOCKABLE_UNARY` is dummy // TODO: get ride of this macro during the move to NPYV if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), 64)) { simd_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]); } else { UNARY_LOOP { /* * We use the AVX function to compute exp/log for scalar elements as well. * This is needed to ensure the output of strided and non-strided * cases match. SIMD code handles strided input cases, but not * strided output. 
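 * Each UNARY_LOOP iteration therefore calls the SIMD kernel with
 * array_size == 1 and the input stride, processing one element at a time.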
*/ simd_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]); } } #else UNARY_LOOP { const npy_float in1 = *(npy_float *)ip1; *(npy_float *)op1 = @scalarf@(in1); } #endif } /**end repeat**/ /**begin repeat * #func = exp, log# * #scalar = npy_exp, npy_log# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { #if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) const npy_intp len = dimensions[0]; if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) && npyv_loadable_stride_f64(steps[0]) && npyv_storable_stride_f64(steps[1])) { const npy_double *src = (npy_double*)args[0]; npy_double *dst = (npy_double*)args[1]; const npy_intp ssrc = steps[0] / sizeof(src[0]); const npy_intp sdst = steps[1] / sizeof(src[0]); simd_@func@_f64(src, ssrc, dst, sdst, len); return; } #else #ifdef SIMD_AVX512F_NOCLANG_BUG if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) { AVX512F_@func@_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]); return; } #endif // SIMD_AVX512F_NOCLANG_BUG #endif // NPY_CAN_LINK_SVML UNARY_LOOP { const npy_double in1 = *(npy_double *)ip1; *(npy_double *)op1 = @scalar@(in1); } } /**end repeat**/ /**begin repeat * Float types * #type = npy_float, npy_double# * #TYPE = FLOAT, DOUBLE# * #c = f, # * #C = F, # * #suffix = f32, f64# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_frexp) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { #ifdef SIMD_AVX512_SKX if ((npyv_loadable_stride_@suffix@(steps[0])) && (npyv_storable_stride_@suffix@(steps[1])) && (npyv_storable_stride_@suffix@(steps[2])) && (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) && (!is_mem_overlap(args[0], steps[0], args[1], steps[1], dimensions[0]))) { AVX512_SKX_frexp_@TYPE@(args, dimensions, steps); return; } #endif UNARY_LOOP_TWO_OUT { const @type@ in1 = *(@type@ *)ip1; *((@type@ *)op1) = npy_frexp@c@(in1, (int *)op2); } } NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_ldexp) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { #ifdef SIMD_AVX512_SKX if ((npyv_loadable_stride_@suffix@(steps[0])) && (npyv_storable_stride_@suffix@(steps[1])) && (npyv_storable_stride_@suffix@(steps[2])) && (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) && (!is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) { AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps); return; } #endif BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; const int in2 = *(int *)ip2; *((@type@ *)op1) = npy_ldexp@c@(in1, in2); } } /**end repeat**/
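#if 0
/*
 * Illustrative only -- not compiled into NumPy.  A scalar sketch of the
 * exp kernel documented above (simd_exp_FLOAT): Cody-Waite range reduction,
 * a polynomial approximation on the reduced argument, then scaling by 2^k.
 * The split-ln(2) constants and the plain Taylor polynomial are stand-ins,
 * not NumPy's minimax P/Q coefficients; they only show the structure.
 */
#include <math.h>

static float
sketch_expf(float x)
{
    const float xmax = 88.72283935546875f;       /* exp(x) overflows above this */
    const float xmin = -103.97208404541015625f;  /* exp(x) underflows below this */
    const float log2e = 1.44269504088896341f;    /* 1/ln(2) */
    const float c1 = 0.693359375f;               /* assumed hi part of ln(2) */
    const float c2 = -2.12194440e-4f;            /* assumed lo part of ln(2) */

    if (isnan(x)) {
        return x;
    }
    if (x >= xmax) {
        return INFINITY;    /* the SIMD kernel also raises FE_OVERFLOW */
    }
    if (x <= xmin) {
        return 0.0f;        /* the SIMD kernel also raises FE_UNDERFLOW */
    }

    /* k = rint(x/ln2); r = x - k*ln2 computed in two Cody-Waite steps */
    float k = rintf(x * log2e);
    float r = x - k * c1;
    r = r - k * c2;

    /* stand-in polynomial for exp(r), r in [-ln2/2, ln2/2] */
    float p = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f/6.0f +
                     r * (1.0f/24.0f + r * (1.0f/120.0f)))));

    /* exp(x) = exp(r) * 2^k -- the scalar analogue of the scalef step */
    return ldexpf(p, (int)k);
}
#endif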
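#if 0
/*
 * Illustrative only -- not compiled into NumPy.  A scalar sketch of the
 * log kernel documented above (simd_log_FLOAT): split x into m * 2^k with
 * m kept in [sqrt(1/2), sqrt(2)), then log(x) = k*ln(2) + log(m).
 * log1pf() stands in for the degree-5/5 minimax rational P/Q that the
 * SIMD code evaluates.
 */
#include <math.h>

static float
sketch_logf(float x)
{
    if (isnan(x)) {
        return x;
    }
    if (x == 0.0f) {
        return -INFINITY;   /* the SIMD kernel also raises FE_DIVBYZERO */
    }
    if (x < 0.0f) {
        return NAN;         /* the kernel returns -NaN and raises FE_INVALID */
    }
    if (isinf(x)) {
        return x;
    }

    int k;
    float m = frexpf(x, &k);     /* x = m * 2^k with 0.5 <= m < 1 */
    if (m < 0.70710678f) {       /* keep m in [sqrt(1/2), sqrt(2)) */
        m *= 2.0f;
        k -= 1;
    }

    /* log(m) = log(1 + (m - 1)); the SIMD code evaluates P/Q here */
    return (float)k * 0.69314718f + log1pf(m - 1.0f);
}
#endif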
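#if 0
/*
 * Illustrative only -- not compiled into NumPy.  A scalar sketch of the
 * Tang-style table-driven exp used by AVX512F_exp_DOUBLE:
 *     x = (32*m + j)*ln(2)/32 + r,   exp(x) = 2^m * 2^(j/32) * exp(r).
 * exp2(j/32.0) stands in for the EXP_Table_top/tail lookup, expm1() stands
 * in for the degree-5 polynomial p(r), and a single ln(2)/32 constant is
 * used instead of the split NPY_TANG_NEG_L1/L2 pair.
 */
#include <math.h>

static double
sketch_exp(double x)
{
    const double th_max = 0x1.62e42fefa39efp+9;   /* overflow threshold */
    const double th_min = -0x1.74910d52d3053p+9;  /* underflow threshold */

    if (isnan(x)) {
        return x;
    }
    if (x > th_max) {
        return INFINITY;
    }
    if (x < th_min) {
        return 0.0;
    }

    /* k = rint(x * 32/ln2); j = k mod 32 selects the table entry, m = k/32 */
    double kd = rint(x * (32.0 / 0.6931471805599453));
    int ki = (int)kd;
    int j = ki & 0x1f;
    int m = (ki - j) / 32;

    /* r = x - k*ln(2)/32; the kernel performs this in two Cody-Waite steps */
    double r = x - kd * (0.6931471805599453 / 32.0);

    /* exp(x) = 2^m * 2^(j/32) * (1 + (exp(r) - 1)) */
    return ldexp(exp2(j / 32.0) * (1.0 + expm1(r)), m);
}
#endif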