#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_LSX_OPERATORS_H
#define _NPY_SIMD_LSX_OPERATORS_H

/***************************
 * Shifting
 ***************************/

// left
#define npyv_shl_u16(A, C) __lsx_vsll_h(A, npyv_setall_s16(C))
#define npyv_shl_s16(A, C) __lsx_vsll_h(A, npyv_setall_s16(C))
#define npyv_shl_u32(A, C) __lsx_vsll_w(A, npyv_setall_s32(C))
#define npyv_shl_s32(A, C) __lsx_vsll_w(A, npyv_setall_s32(C))
#define npyv_shl_u64(A, C) __lsx_vsll_d(A, npyv_setall_s64(C))
#define npyv_shl_s64(A, C) __lsx_vsll_d(A, npyv_setall_s64(C))

// left by an immediate constant
#define npyv_shli_u16 __lsx_vslli_h
#define npyv_shli_s16 __lsx_vslli_h
#define npyv_shli_u32 __lsx_vslli_w
#define npyv_shli_s32 __lsx_vslli_w
#define npyv_shli_u64 __lsx_vslli_d
#define npyv_shli_s64 __lsx_vslli_d

// right
#define npyv_shr_u16(A, C) __lsx_vsrl_h(A, npyv_setall_u16(C))
#define npyv_shr_s16(A, C) __lsx_vsra_h(A, npyv_setall_u16(C))
#define npyv_shr_u32(A, C) __lsx_vsrl_w(A, npyv_setall_u32(C))
#define npyv_shr_s32(A, C) __lsx_vsra_w(A, npyv_setall_u32(C))
#define npyv_shr_u64(A, C) __lsx_vsrl_d(A, npyv_setall_u64(C))
#define npyv_shr_s64(A, C) __lsx_vsra_d(A, npyv_setall_u64(C))

// right by an immediate constant
#define npyv_shri_u16 __lsx_vsrli_h
#define npyv_shri_s16 __lsx_vsrai_h
#define npyv_shri_u32 __lsx_vsrli_w
#define npyv_shri_s32 __lsx_vsrai_w
#define npyv_shri_u64 __lsx_vsrli_d
#define npyv_shri_s64 __lsx_vsrai_d
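/*
 * Usage sketch (illustrative only, not part of this header). The variable
 * shifts broadcast a scalar count, while the immediate forms (npyv_shli_xx,
 * npyv_shri_xx) require a compile-time constant. Signed right shifts map to
 * vsra and therefore replicate the sign bit:
 *
 *     npyv_s32 v = npyv_setall_s32(-8);
 *     npyv_s32 a = npyv_shr_s32(v, 2);   // arithmetic shift: lanes become -2
 *     npyv_s32 b = npyv_shri_s32(v, 2);  // immediate form, same result
 */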
/***************************
 * Logical
 ***************************/

// AND
#define npyv_and_u8  __lsx_vand_v
#define npyv_and_s8  __lsx_vand_v
#define npyv_and_u16 __lsx_vand_v
#define npyv_and_s16 __lsx_vand_v
#define npyv_and_u32 __lsx_vand_v
#define npyv_and_s32 __lsx_vand_v
#define npyv_and_u64 __lsx_vand_v
#define npyv_and_s64 __lsx_vand_v
#define npyv_and_f32(A, B) \
    (__m128)__lsx_vand_v((__m128i)A, (__m128i)B)
#define npyv_and_f64(A, B) \
    (__m128d)__lsx_vand_v((__m128i)A, (__m128i)B)
#define npyv_and_b8  __lsx_vand_v
#define npyv_and_b16 __lsx_vand_v
#define npyv_and_b32 __lsx_vand_v
#define npyv_and_b64 __lsx_vand_v

// OR
#define npyv_or_u8  __lsx_vor_v
#define npyv_or_s8  __lsx_vor_v
#define npyv_or_u16 __lsx_vor_v
#define npyv_or_s16 __lsx_vor_v
#define npyv_or_u32 __lsx_vor_v
#define npyv_or_s32 __lsx_vor_v
#define npyv_or_u64 __lsx_vor_v
#define npyv_or_s64 __lsx_vor_v
#define npyv_or_f32(A, B) \
    (__m128)__lsx_vor_v((__m128i)A, (__m128i)B)
#define npyv_or_f64(A, B) \
    (__m128d)__lsx_vor_v((__m128i)A, (__m128i)B)
#define npyv_or_b8  __lsx_vor_v
#define npyv_or_b16 __lsx_vor_v
#define npyv_or_b32 __lsx_vor_v
#define npyv_or_b64 __lsx_vor_v

// XOR
#define npyv_xor_u8  __lsx_vxor_v
#define npyv_xor_s8  __lsx_vxor_v
#define npyv_xor_u16 __lsx_vxor_v
#define npyv_xor_s16 __lsx_vxor_v
#define npyv_xor_u32 __lsx_vxor_v
#define npyv_xor_s32 __lsx_vxor_v
#define npyv_xor_u64 __lsx_vxor_v
#define npyv_xor_s64 __lsx_vxor_v
#define npyv_xor_f32(A, B) \
    (__m128)__lsx_vxor_v((__m128i)A, (__m128i)B)
#define npyv_xor_f64(A, B) \
    (__m128d)__lsx_vxor_v((__m128i)A, (__m128i)B)
#define npyv_xor_b8  __lsx_vxor_v
#define npyv_xor_b16 __lsx_vxor_v
#define npyv_xor_b32 __lsx_vxor_v
#define npyv_xor_b64 __lsx_vxor_v

// NOT
#define npyv_not_u8(A) __lsx_vxori_b((__m128i)A, 0xff)
#define npyv_not_s8  npyv_not_u8
#define npyv_not_u16 npyv_not_u8
#define npyv_not_s16 npyv_not_u8
#define npyv_not_u32 npyv_not_u8
#define npyv_not_s32 npyv_not_u8
#define npyv_not_u64 npyv_not_u8
#define npyv_not_s64 npyv_not_u8
#define npyv_not_f32 (__m128)npyv_not_u8
#define npyv_not_f64 (__m128d)npyv_not_u8
#define npyv_not_b8  npyv_not_u8
#define npyv_not_b16 npyv_not_u8
#define npyv_not_b32 npyv_not_u8
#define npyv_not_b64 npyv_not_u8

// ANDC, ORC and XNOR
#define npyv_andc_u8(A, B) __lsx_vandn_v(B, A)
#define npyv_andc_b8(A, B) __lsx_vandn_v(B, A)
#define npyv_orc_b8(A, B)  npyv_or_b8(npyv_not_b8(B), A)
#define npyv_xnor_b8 __lsx_vseq_b

/***************************
 * Comparison
 ***************************/

// Int Equal
#define npyv_cmpeq_u8  __lsx_vseq_b
#define npyv_cmpeq_s8  __lsx_vseq_b
#define npyv_cmpeq_u16 __lsx_vseq_h
#define npyv_cmpeq_s16 __lsx_vseq_h
#define npyv_cmpeq_u32 __lsx_vseq_w
#define npyv_cmpeq_s32 __lsx_vseq_w
#define npyv_cmpeq_u64 __lsx_vseq_d
#define npyv_cmpeq_s64 __lsx_vseq_d

// Int Not Equal
#define npyv_cmpneq_u8(A, B)  npyv_not_u8(npyv_cmpeq_u8(A, B))
#define npyv_cmpneq_u16(A, B) npyv_not_u16(npyv_cmpeq_u16(A, B))
#define npyv_cmpneq_u32(A, B) npyv_not_u32(npyv_cmpeq_u32(A, B))
#define npyv_cmpneq_u64(A, B) npyv_not_u64(npyv_cmpeq_u64(A, B))
#define npyv_cmpneq_s8  npyv_cmpneq_u8
#define npyv_cmpneq_s16 npyv_cmpneq_u16
#define npyv_cmpneq_s32 npyv_cmpneq_u32
#define npyv_cmpneq_s64 npyv_cmpneq_u64

// signed greater than
#define npyv_cmpgt_s8(A, B)  __lsx_vslt_b(B, A)
#define npyv_cmpgt_s16(A, B) __lsx_vslt_h(B, A)
#define npyv_cmpgt_s32(A, B) __lsx_vslt_w(B, A)
#define npyv_cmpgt_s64(A, B) __lsx_vslt_d(B, A)

// signed greater than or equal
#define npyv_cmpge_s8(A, B)  __lsx_vsle_b(B, A)
#define npyv_cmpge_s16(A, B) __lsx_vsle_h(B, A)
#define npyv_cmpge_s32(A, B) __lsx_vsle_w(B, A)
#define npyv_cmpge_s64(A, B) __lsx_vsle_d(B, A)

// unsigned greater than
#define npyv_cmpgt_u8(A, B)  __lsx_vslt_bu(B, A)
#define npyv_cmpgt_u16(A, B) __lsx_vslt_hu(B, A)
#define npyv_cmpgt_u32(A, B) __lsx_vslt_wu(B, A)
#define npyv_cmpgt_u64(A, B) __lsx_vslt_du(B, A)

// unsigned greater than or equal
#define npyv_cmpge_u8(A, B)  __lsx_vsle_bu(B, A)
#define npyv_cmpge_u16(A, B) __lsx_vsle_hu(B, A)
#define npyv_cmpge_u32(A, B) __lsx_vsle_wu(B, A)
#define npyv_cmpge_u64(A, B) __lsx_vsle_du(B, A)

// less than
#define npyv_cmplt_u8  __lsx_vslt_bu
#define npyv_cmplt_s8  __lsx_vslt_b
#define npyv_cmplt_u16 __lsx_vslt_hu
#define npyv_cmplt_s16 __lsx_vslt_h
#define npyv_cmplt_u32 __lsx_vslt_wu
#define npyv_cmplt_s32 __lsx_vslt_w
#define npyv_cmplt_u64 __lsx_vslt_du
#define npyv_cmplt_s64 __lsx_vslt_d

// less than or equal
#define npyv_cmple_u8  __lsx_vsle_bu
#define npyv_cmple_s8  __lsx_vsle_b
#define npyv_cmple_u16 __lsx_vsle_hu
#define npyv_cmple_s16 __lsx_vsle_h
#define npyv_cmple_u32 __lsx_vsle_wu
#define npyv_cmple_s32 __lsx_vsle_w
#define npyv_cmple_u64 __lsx_vsle_du
#define npyv_cmple_s64 __lsx_vsle_d

// precision comparison
#define npyv_cmpeq_f32  __lsx_vfcmp_ceq_s
#define npyv_cmpeq_f64  __lsx_vfcmp_ceq_d
#define npyv_cmpneq_f32 __lsx_vfcmp_cune_s
#define npyv_cmpneq_f64 __lsx_vfcmp_cune_d
#define npyv_cmplt_f32  __lsx_vfcmp_clt_s
#define npyv_cmplt_f64  __lsx_vfcmp_clt_d
#define npyv_cmple_f32  __lsx_vfcmp_cle_s
#define npyv_cmple_f64  __lsx_vfcmp_cle_d
#define npyv_cmpgt_f32(A, B) npyv_cmplt_f32(B, A)
#define npyv_cmpgt_f64(A, B) npyv_cmplt_f64(B, A)
#define npyv_cmpge_f32(A, B) npyv_cmple_f32(B, A)
#define npyv_cmpge_f64(A, B) npyv_cmple_f64(B, A)

// check special cases
NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
{ return __lsx_vfcmp_cor_s(a, a); } // non-NaN lanes become all-ones (0xffffffff)
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
{ return __lsx_vfcmp_cor_d(a, a); }
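/*
 * Usage sketch (illustrative only, not part of this header). Comparisons
 * yield boolean vectors with all-ones lanes for true, so they compose with
 * the logical ops above; e.g. a mask of lanes that are not NaN and positive:
 *
 *     npyv_f32 x  = npyv_setall_f32(2.0f);
 *     npyv_b32 ok = npyv_and_b32(npyv_notnan_f32(x),
 *                                npyv_cmpgt_f32(x, npyv_zero_f32()));
 */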
// Test across all vector lanes
// any: returns true if any of the elements is not equal to zero
// all: returns true if all elements are not equal to zero
#define NPYV_IMPL_LSX_ANYALL(SFX)                         \
    NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a)         \
    { return __lsx_vmsknz_b((__m128i)a)[0] != 0; }        \
    NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a)         \
    { return __lsx_vmsknz_b((__m128i)a)[0] == 0xffff; }
NPYV_IMPL_LSX_ANYALL(b8)
NPYV_IMPL_LSX_ANYALL(b16)
NPYV_IMPL_LSX_ANYALL(b32)
NPYV_IMPL_LSX_ANYALL(b64)
#undef NPYV_IMPL_LSX_ANYALL

#define NPYV_IMPL_LSX_ANYALL(SFX, TSFX, MASK)              \
    NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a)          \
    {                                                      \
        return __lsx_vmsknz_b(a)[0] != 0;                  \
    }                                                      \
    NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a)          \
    {                                                      \
        return __lsx_vmsknz_b(                             \
                   __lsx_vseq_##TSFX(a, npyv_zero_##SFX()) \
               )[0] == 0;                                  \
    }
NPYV_IMPL_LSX_ANYALL(u8,  b, 0xffff)
NPYV_IMPL_LSX_ANYALL(s8,  b, 0xffff)
NPYV_IMPL_LSX_ANYALL(u16, h, 0xffff)
NPYV_IMPL_LSX_ANYALL(s16, h, 0xffff)
NPYV_IMPL_LSX_ANYALL(u32, w, 0xffff)
NPYV_IMPL_LSX_ANYALL(s32, w, 0xffff)
NPYV_IMPL_LSX_ANYALL(u64, d, 0xffff)
NPYV_IMPL_LSX_ANYALL(s64, d, 0xffff)
#undef NPYV_IMPL_LSX_ANYALL

NPY_FINLINE bool npyv_any_f32(npyv_f32 a)
{
    return __lsx_vmsknz_b(__lsx_vfcmp_ceq_s(a, npyv_zero_f32()))[0] != 0xffff;
}
NPY_FINLINE bool npyv_all_f32(npyv_f32 a)
{
    return __lsx_vmsknz_b(__lsx_vfcmp_ceq_s(a, npyv_zero_f32()))[0] == 0;
}
NPY_FINLINE bool npyv_any_f64(npyv_f64 a)
{
    return __lsx_vmsknz_b(__lsx_vfcmp_ceq_d(a, npyv_zero_f64()))[0] != 0xffff;
}
NPY_FINLINE bool npyv_all_f64(npyv_f64 a)
{
    return __lsx_vmsknz_b(__lsx_vfcmp_ceq_d(a, npyv_zero_f64()))[0] == 0;
}

#endif // _NPY_SIMD_LSX_OPERATORS_H
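/*
 * Usage sketch (illustrative only, not part of this header). The any/all
 * helpers reduce a vector to a scalar truth value, e.g. as an early-exit
 * test over float lanes:
 *
 *     npyv_f32 v = npyv_set_f32(0.0f, 1.0f, 0.0f, 0.0f);
 *     bool any = npyv_any_f32(v);  // true: at least one lane is non-zero
 *     bool all = npyv_all_f32(v);  // false: some lanes are zero
 */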