#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_NEON_OPERATORS_H
#define _NPY_SIMD_NEON_OPERATORS_H

/***************************
 * Shifting
 ***************************/

// left
#define npyv_shl_u16(A, C) vshlq_u16(A, npyv_setall_s16(C))
#define npyv_shl_s16(A, C) vshlq_s16(A, npyv_setall_s16(C))
#define npyv_shl_u32(A, C) vshlq_u32(A, npyv_setall_s32(C))
#define npyv_shl_s32(A, C) vshlq_s32(A, npyv_setall_s32(C))
#define npyv_shl_u64(A, C) vshlq_u64(A, npyv_setall_s64(C))
#define npyv_shl_s64(A, C) vshlq_s64(A, npyv_setall_s64(C))

// left by an immediate constant
#define npyv_shli_u16 vshlq_n_u16
#define npyv_shli_s16 vshlq_n_s16
#define npyv_shli_u32 vshlq_n_u32
#define npyv_shli_s32 vshlq_n_s32
#define npyv_shli_u64 vshlq_n_u64
#define npyv_shli_s64 vshlq_n_s64

// right
#define npyv_shr_u16(A, C) vshlq_u16(A, npyv_setall_s16(-(C)))
#define npyv_shr_s16(A, C) vshlq_s16(A, npyv_setall_s16(-(C)))
#define npyv_shr_u32(A, C) vshlq_u32(A, npyv_setall_s32(-(C)))
#define npyv_shr_s32(A, C) vshlq_s32(A, npyv_setall_s32(-(C)))
#define npyv_shr_u64(A, C) vshlq_u64(A, npyv_setall_s64(-(C)))
#define npyv_shr_s64(A, C) vshlq_s64(A, npyv_setall_s64(-(C)))

// right by an immediate constant
#define npyv_shri_u16 vshrq_n_u16
#define npyv_shri_s16 vshrq_n_s16
#define npyv_shri_u32 vshrq_n_u32
#define npyv_shri_s32 vshrq_n_s32
#define npyv_shri_u64 vshrq_n_u64
#define npyv_shri_s64 vshrq_n_s64

/***************************
 * Logical
 ***************************/

// AND
#define npyv_and_u8  vandq_u8
#define npyv_and_s8  vandq_s8
#define npyv_and_u16 vandq_u16
#define npyv_and_s16 vandq_s16
#define npyv_and_u32 vandq_u32
#define npyv_and_s32 vandq_s32
#define npyv_and_u64 vandq_u64
#define npyv_and_s64 vandq_s64
#define npyv_and_f32(A, B) \
    vreinterpretq_f32_u8(vandq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
#define npyv_and_f64(A, B) \
    vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
#define npyv_and_b8  vandq_u8
#define npyv_and_b16 vandq_u16
#define npyv_and_b32 vandq_u32
#define npyv_and_b64 vandq_u64

// OR
#define npyv_or_u8  vorrq_u8
#define npyv_or_s8  vorrq_s8
#define npyv_or_u16 vorrq_u16
#define npyv_or_s16 vorrq_s16
#define npyv_or_u32 vorrq_u32
#define npyv_or_s32 vorrq_s32
#define npyv_or_u64 vorrq_u64
#define npyv_or_s64 vorrq_s64
#define npyv_or_f32(A, B) \
    vreinterpretq_f32_u8(vorrq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
#define npyv_or_f64(A, B) \
    vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
#define npyv_or_b8  vorrq_u8
#define npyv_or_b16 vorrq_u16
#define npyv_or_b32 vorrq_u32
#define npyv_or_b64 vorrq_u64

// XOR
#define npyv_xor_u8  veorq_u8
#define npyv_xor_s8  veorq_s8
#define npyv_xor_u16 veorq_u16
#define npyv_xor_s16 veorq_s16
#define npyv_xor_u32 veorq_u32
#define npyv_xor_s32 veorq_s32
#define npyv_xor_u64 veorq_u64
#define npyv_xor_s64 veorq_s64
#define npyv_xor_f32(A, B) \
    vreinterpretq_f32_u8(veorq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
#define npyv_xor_f64(A, B) \
    vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
#define npyv_xor_b8  veorq_u8
#define npyv_xor_b16 veorq_u16
#define npyv_xor_b32 veorq_u32
#define npyv_xor_b64 veorq_u64

// NOT
#define npyv_not_u8  vmvnq_u8
#define npyv_not_s8  vmvnq_s8
#define npyv_not_u16 vmvnq_u16
#define npyv_not_s16 vmvnq_s16
#define npyv_not_u32 vmvnq_u32
#define npyv_not_s32 vmvnq_s32
#define npyv_not_u64(A) vreinterpretq_u64_u8(vmvnq_u8(vreinterpretq_u8_u64(A)))
#define npyv_not_s64(A) vreinterpretq_s64_u8(vmvnq_u8(vreinterpretq_u8_s64(A)))
#define npyv_not_f32(A) vreinterpretq_f32_u8(vmvnq_u8(vreinterpretq_u8_f32(A)))
#define npyv_not_f64(A) vreinterpretq_f64_u8(vmvnq_u8(vreinterpretq_u8_f64(A)))
#define npyv_not_b8  vmvnq_u8
#define npyv_not_b16 vmvnq_u16
#define npyv_not_b32 vmvnq_u32
#define npyv_not_b64 npyv_not_u64

// ANDC, ORC and XNOR
#define npyv_andc_u8 vbicq_u8
#define npyv_andc_b8 vbicq_u8
#define npyv_orc_b8  vornq_u8
#define npyv_xnor_b8 vceqq_u8

/***************************
 * Comparison
 ***************************/

// equal
#define npyv_cmpeq_u8  vceqq_u8
#define npyv_cmpeq_s8  vceqq_s8
#define npyv_cmpeq_u16 vceqq_u16
#define npyv_cmpeq_s16 vceqq_s16
#define npyv_cmpeq_u32 vceqq_u32
#define npyv_cmpeq_s32 vceqq_s32
#define npyv_cmpeq_f32 vceqq_f32
#define npyv_cmpeq_f64 vceqq_f64
#ifdef __aarch64__
    #define npyv_cmpeq_u64 vceqq_u64
    #define npyv_cmpeq_s64 vceqq_s64
#else
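    // AArch32/ARMv7 NEON has no 64-bit compare intrinsic. A 64-bit lane is
    // equal only if both of its 32-bit halves compare equal: the low-half
    // result is shifted up and ANDed with the high-half result, then an
    // arithmetic shift right by 32 spreads the combined mask across the lane.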
    NPY_FINLINE uint64x2_t npyv_cmpeq_u64(uint64x2_t a, uint64x2_t b)
    {
        uint64x2_t cmpeq = vreinterpretq_u64_u32(vceqq_u32(
            vreinterpretq_u32_u64(a), vreinterpretq_u32_u64(b)
        ));
        uint64x2_t cmpeq_h = vshlq_n_u64(cmpeq, 32);
        uint64x2_t test = vandq_u64(cmpeq, cmpeq_h);
        return vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_u64(test), 32));
    }
    #define npyv_cmpeq_s64(A, B) \
        npyv_cmpeq_u64(vreinterpretq_u64_s64(A), vreinterpretq_u64_s64(B))
#endif

// not equal
#define npyv_cmpneq_u8(A, B)  vmvnq_u8(vceqq_u8(A, B))
#define npyv_cmpneq_s8(A, B)  vmvnq_u8(vceqq_s8(A, B))
#define npyv_cmpneq_u16(A, B) vmvnq_u16(vceqq_u16(A, B))
#define npyv_cmpneq_s16(A, B) vmvnq_u16(vceqq_s16(A, B))
#define npyv_cmpneq_u32(A, B) vmvnq_u32(vceqq_u32(A, B))
#define npyv_cmpneq_s32(A, B) vmvnq_u32(vceqq_s32(A, B))
#define npyv_cmpneq_u64(A, B) npyv_not_u64(npyv_cmpeq_u64(A, B))
#define npyv_cmpneq_s64(A, B) npyv_not_u64(npyv_cmpeq_s64(A, B))
#define npyv_cmpneq_f32(A, B) vmvnq_u32(vceqq_f32(A, B))
#define npyv_cmpneq_f64(A, B) npyv_not_u64(vceqq_f64(A, B))

// greater than
#define npyv_cmpgt_u8  vcgtq_u8
#define npyv_cmpgt_s8  vcgtq_s8
#define npyv_cmpgt_u16 vcgtq_u16
#define npyv_cmpgt_s16 vcgtq_s16
#define npyv_cmpgt_u32 vcgtq_u32
#define npyv_cmpgt_s32 vcgtq_s32
#define npyv_cmpgt_f32 vcgtq_f32
#define npyv_cmpgt_f64 vcgtq_f64
#ifdef __aarch64__
    #define npyv_cmpgt_u64 vcgtq_u64
    #define npyv_cmpgt_s64 vcgtq_s64
#else
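    // ARMv7 lacks vcgtq_s64/vcgtq_u64. For the signed case: when the sign bits
    // of a and b differ, b's sign bit decides the result; otherwise the sign of
    // (b - a) does. vbslq_s64 selects the deciding bit and an arithmetic shift
    // by 63 broadcasts it across the lane. The unsigned case is mapped onto the
    // signed one by flipping the sign bit of both operands.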
    NPY_FINLINE uint64x2_t npyv_cmpgt_s64(int64x2_t a, int64x2_t b)
    {
        int64x2_t sub = vsubq_s64(b, a);
        uint64x2_t nsame_sbit = vreinterpretq_u64_s64(veorq_s64(a, b));
        int64x2_t test = vbslq_s64(nsame_sbit, b, sub);
        int64x2_t extend_sbit = vshrq_n_s64(test, 63);
        return vreinterpretq_u64_s64(extend_sbit);
    }
    NPY_FINLINE uint64x2_t npyv_cmpgt_u64(uint64x2_t a, uint64x2_t b)
    {
        const uint64x2_t sbit = npyv_setall_u64(0x8000000000000000);
        a = npyv_xor_u64(a, sbit);
        b = npyv_xor_u64(b, sbit);
        return npyv_cmpgt_s64(vreinterpretq_s64_u64(a), vreinterpretq_s64_u64(b));
    }
#endif

// greater than or equal
#define npyv_cmpge_u8  vcgeq_u8
#define npyv_cmpge_s8  vcgeq_s8
#define npyv_cmpge_u16 vcgeq_u16
#define npyv_cmpge_s16 vcgeq_s16
#define npyv_cmpge_u32 vcgeq_u32
#define npyv_cmpge_s32 vcgeq_s32
#define npyv_cmpge_f32 vcgeq_f32
#define npyv_cmpge_f64 vcgeq_f64
#ifdef __aarch64__
    #define npyv_cmpge_u64 vcgeq_u64
    #define npyv_cmpge_s64 vcgeq_s64
#else
    #define npyv_cmpge_u64(A, B) npyv_not_u64(npyv_cmpgt_u64(B, A))
    #define npyv_cmpge_s64(A, B) npyv_not_u64(npyv_cmpgt_s64(B, A))
#endif

// less than
#define npyv_cmplt_u8(A, B)  npyv_cmpgt_u8(B, A)
#define npyv_cmplt_s8(A, B)  npyv_cmpgt_s8(B, A)
#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A)
#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A)
#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A)
#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A)
#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A)
#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A)
#define npyv_cmplt_f32(A, B) npyv_cmpgt_f32(B, A)
#define npyv_cmplt_f64(A, B) npyv_cmpgt_f64(B, A)

// less than or equal
#define npyv_cmple_u8(A, B)  npyv_cmpge_u8(B, A)
#define npyv_cmple_s8(A, B)  npyv_cmpge_s8(B, A)
#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A)
#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A)
#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A)
#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A)
#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A)
#define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A)

// check special cases
NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
{
#if defined(__clang__)
    /**
     * To avoid signaling on qNaN, work around a clang bug with symmetric
     * inputs; see https://github.com/numpy/numpy/issues/22933 for more
     * clarification.
     */
    npyv_b32 ret;
    #if NPY_SIMD_F64
        __asm("fcmeq %0.4s, %1.4s, %1.4s" : "=w" (ret) : "w" (a));
    #else
        __asm("vceq.f32 %q0, %q1, %q1" : "=w" (ret) : "w" (a));
    #endif
    return ret;
#else
    return vceqq_f32(a, a);
#endif
}
#if NPY_SIMD_F64
    NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
    {
    #if defined(__clang__)
        npyv_b64 ret;
        __asm("fcmeq %0.2d, %1.2d, %1.2d" : "=w" (ret) : "w" (a));
        return ret;
    #else
        return vceqq_f64(a, a);
    #endif
    }
#endif

// Test across all vector lanes
// any: returns true if any of the elements is not equal to zero
// all: returns true if all elements are not equal to zero
#if NPY_SIMD_F64
    #define NPYV_IMPL_NEON_ANYALL(LEN)                  \
        NPY_FINLINE bool npyv_any_b##LEN(npyv_b##LEN a) \
        { return vmaxvq_u##LEN(a) != 0; }               \
        NPY_FINLINE bool npyv_all_b##LEN(npyv_b##LEN a) \
        { return vminvq_u##LEN(a) != 0; }
    NPYV_IMPL_NEON_ANYALL(8)
    NPYV_IMPL_NEON_ANYALL(16)
    NPYV_IMPL_NEON_ANYALL(32)
    #undef NPYV_IMPL_NEON_ANYALL

    #define NPYV_IMPL_NEON_ANYALL(SFX, USFX, BSFX)                      \
        NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a)                   \
        { return npyv_any_##BSFX(npyv_reinterpret_##USFX##_##SFX(a)); } \
        NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a)                   \
        { return npyv_all_##BSFX(npyv_reinterpret_##USFX##_##SFX(a)); }
    NPYV_IMPL_NEON_ANYALL(u8,  u8,  b8)
    NPYV_IMPL_NEON_ANYALL(s8,  u8,  b8)
    NPYV_IMPL_NEON_ANYALL(u16, u16, b16)
    NPYV_IMPL_NEON_ANYALL(s16, u16, b16)
    NPYV_IMPL_NEON_ANYALL(u32, u32, b32)
    NPYV_IMPL_NEON_ANYALL(s32, u32, b32)
    #undef NPYV_IMPL_NEON_ANYALL

    NPY_FINLINE bool npyv_any_b64(npyv_b64 a)
    { return vmaxvq_u32(vreinterpretq_u32_u64(a)) != 0; }
    NPY_FINLINE bool npyv_all_b64(npyv_b64 a)
    { return vminvq_u32(vreinterpretq_u32_u64(a)) != 0; }
    #define npyv_any_u64 npyv_any_b64
    NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
    {
        uint32x4_t a32 = vreinterpretq_u32_u64(a);
        a32 = vorrq_u32(a32, vrev64q_u32(a32));
        return vminvq_u32(a32) != 0;
    }
    NPY_FINLINE bool npyv_any_s64(npyv_s64 a)
    { return npyv_any_u64(vreinterpretq_u64_s64(a)); }
    NPY_FINLINE bool npyv_all_s64(npyv_s64 a)
    { return npyv_all_u64(vreinterpretq_u64_s64(a)); }

    #define NPYV_IMPL_NEON_ANYALL(SFX, BSFX)                                 \
        NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a)                        \
        { return !npyv_all_##BSFX(npyv_cmpeq_##SFX(a, npyv_zero_##SFX())); } \
        NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a)                        \
        { return !npyv_any_##BSFX(npyv_cmpeq_##SFX(a, npyv_zero_##SFX())); }
    NPYV_IMPL_NEON_ANYALL(f32, b32)
    NPYV_IMPL_NEON_ANYALL(f64, b64)
    #undef NPYV_IMPL_NEON_ANYALL
#else
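    // ARMv7 has no horizontal reduction intrinsics (vmaxvq/vminvq are
    // AArch64-only), so reduce by extracting both 64-bit lanes and combining
    // them with scalar OR/AND. Boolean lanes are all-ones or all-zeros, which
    // is why "all" on booleans compares the combined value against -1.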
    #define NPYV_IMPL_NEON_ANYALL(LEN)                   \
        NPY_FINLINE bool npyv_any_b##LEN(npyv_b##LEN a)  \
        {                                                \
            int64x2_t a64 = vreinterpretq_s64_u##LEN(a); \
            return (                                     \
                vgetq_lane_s64(a64, 0) |                 \
                vgetq_lane_s64(a64, 1)                   \
            ) != 0;                                      \
        }                                                \
        NPY_FINLINE bool npyv_all_b##LEN(npyv_b##LEN a)  \
        {                                                \
            int64x2_t a64 = vreinterpretq_s64_u##LEN(a); \
            return (                                     \
                vgetq_lane_s64(a64, 0) &                 \
                vgetq_lane_s64(a64, 1)                   \
            ) == -1;                                     \
        }
    NPYV_IMPL_NEON_ANYALL(8)
    NPYV_IMPL_NEON_ANYALL(16)
    NPYV_IMPL_NEON_ANYALL(32)
    NPYV_IMPL_NEON_ANYALL(64)
    #undef NPYV_IMPL_NEON_ANYALL

    #define NPYV_IMPL_NEON_ANYALL(SFX, USFX)              \
        NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a)     \
        {                                                 \
            int64x2_t a64 = vreinterpretq_s64_##SFX(a);   \
            return (                                      \
                vgetq_lane_s64(a64, 0) |                  \
                vgetq_lane_s64(a64, 1)                    \
            ) != 0;                                       \
        }                                                 \
        NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a)     \
        {                                                 \
            npyv_##USFX tz = npyv_cmpeq_##SFX(            \
                a, npyv_zero_##SFX()                      \
            );                                            \
            int64x2_t a64 = vreinterpretq_s64_##USFX(tz); \
            return (                                      \
                vgetq_lane_s64(a64, 0) |                  \
                vgetq_lane_s64(a64, 1)                    \
            ) == 0;                                       \
        }
    NPYV_IMPL_NEON_ANYALL(u8,  u8)
    NPYV_IMPL_NEON_ANYALL(s8,  u8)
    NPYV_IMPL_NEON_ANYALL(u16, u16)
    NPYV_IMPL_NEON_ANYALL(s16, u16)
    NPYV_IMPL_NEON_ANYALL(u32, u32)
    NPYV_IMPL_NEON_ANYALL(s32, u32)
    #undef NPYV_IMPL_NEON_ANYALL

    NPY_FINLINE bool npyv_any_f32(npyv_f32 a)
    {
        uint32x4_t tz = npyv_cmpeq_f32(a, npyv_zero_f32());
        int64x2_t a64 = vreinterpretq_s64_u32(tz);
        return (vgetq_lane_s64(a64, 0) & vgetq_lane_s64(a64, 1)) != -1ll;
    }
    NPY_FINLINE bool npyv_all_f32(npyv_f32 a)
    {
        uint32x4_t tz = npyv_cmpeq_f32(a, npyv_zero_f32());
        int64x2_t a64 = vreinterpretq_s64_u32(tz);
        return (vgetq_lane_s64(a64, 0) | vgetq_lane_s64(a64, 1)) == 0;
    }
    NPY_FINLINE bool npyv_any_s64(npyv_s64 a)
    { return (vgetq_lane_s64(a, 0) | vgetq_lane_s64(a, 1)) != 0; }
    NPY_FINLINE bool npyv_all_s64(npyv_s64 a)
    { return vgetq_lane_s64(a, 0) && vgetq_lane_s64(a, 1); }
    NPY_FINLINE bool npyv_any_u64(npyv_u64 a)
    { return (vgetq_lane_u64(a, 0) | vgetq_lane_u64(a, 1)) != 0; }
    NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
    { return vgetq_lane_u64(a, 0) && vgetq_lane_u64(a, 1); }
#endif // NPY_SIMD_F64

#endif // _NPY_SIMD_NEON_OPERATORS_H