#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_NEON_CVT_H
#define _NPY_SIMD_NEON_CVT_H

// convert boolean vectors to integer vectors
#define npyv_cvt_u8_b8(A)   A
#define npyv_cvt_s8_b8      vreinterpretq_s8_u8
#define npyv_cvt_u16_b16(A) A
#define npyv_cvt_s16_b16    vreinterpretq_s16_u16
#define npyv_cvt_u32_b32(A) A
#define npyv_cvt_s32_b32    vreinterpretq_s32_u32
#define npyv_cvt_u64_b64(A) A
#define npyv_cvt_s64_b64    vreinterpretq_s64_u64
#define npyv_cvt_f32_b32    vreinterpretq_f32_u32
#define npyv_cvt_f64_b64    vreinterpretq_f64_u64

// convert integer vectors to boolean vectors
#define npyv_cvt_b8_u8(BL)   BL
#define npyv_cvt_b8_s8       vreinterpretq_u8_s8
#define npyv_cvt_b16_u16(BL) BL
#define npyv_cvt_b16_s16     vreinterpretq_u16_s16
#define npyv_cvt_b32_u32(BL) BL
#define npyv_cvt_b32_s32     vreinterpretq_u32_s32
#define npyv_cvt_b64_u64(BL) BL
#define npyv_cvt_b64_s64     vreinterpretq_u64_s64
#define npyv_cvt_b32_f32     vreinterpretq_u32_f32
#define npyv_cvt_b64_f64     vreinterpretq_u64_f64

// convert boolean vector to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
    // mask each all-ones boolean lane down to its bit position within a byte
    const npyv_u8 scale = npyv_set_u8(1, 2, 4, 8, 16, 32, 64, 128,
                                      1, 2, 4, 8, 16, 32, 64, 128);
    npyv_u8 seq_scale = vandq_u8(a, scale);
#if defined(__aarch64__)
    // interleave the two halves so the widening u16 sum places the low
    // half's bits in 0..7 and the high half's bits in 8..15
    const npyv_u8 byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
    npyv_u8 v0 = vqtbl1q_u8(seq_scale, byteOrder);
    return vaddlvq_u16(vreinterpretq_u16_u8(v0));
#else
    npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(seq_scale)));
    return vgetq_lane_u64(sumh, 0) + ((int)vgetq_lane_u64(sumh, 1) << 8);
#endif
}
NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
{
    const npyv_u16 scale = npyv_set_u16(1, 2, 4, 8, 16, 32, 64, 128);
    npyv_u16 seq_scale = vandq_u16(a, scale);
#if NPY_SIMD_F64
    return vaddvq_u16(seq_scale);
#else
    npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(seq_scale));
    return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
#endif
}
NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
{
    const npyv_u32 scale = npyv_set_u32(1, 2, 4, 8);
    npyv_u32 seq_scale = vandq_u32(a, scale);
#if NPY_SIMD_F64
    return vaddvq_u32(seq_scale);
#else
    npyv_u64 sumh = vpaddlq_u32(seq_scale);
    return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
#endif
}
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{
    // each boolean lane is all ones or all zeros, so bit 0 of the low lane
    // and bit 1 of the high lane already form the two-bit result
    uint64_t lo = vgetq_lane_u64(a, 0);
    uint64_t hi = vgetq_lane_u64(a, 1);
    return ((hi & 0x2) | (lo & 0x1));
}

// expand each half into the next wider lane type
NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data)
{
    npyv_u16x2 r;
    r.val[0] = vmovl_u8(vget_low_u8(data));
    r.val[1] = vmovl_u8(vget_high_u8(data));
    return r;
}

NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
{
    npyv_u32x2 r;
    r.val[0] = vmovl_u16(vget_low_u16(data));
    r.val[1] = vmovl_u16(vget_high_u16(data));
    return r;
}

// pack two 16-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b)
{
#if defined(__aarch64__)
    return vuzp1q_u8((uint8x16_t)a, (uint8x16_t)b);
#else
    return vcombine_u8(vmovn_u16(a), vmovn_u16(b));
#endif
}

// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d)
{
#if defined(__aarch64__)
    npyv_b16 ab = vuzp1q_u16((uint16x8_t)a, (uint16x8_t)b);
    npyv_b16 cd = vuzp1q_u16((uint16x8_t)c, (uint16x8_t)d);
#else
    npyv_b16 ab = vcombine_u16(vmovn_u32(a), vmovn_u32(b));
    npyv_b16 cd = vcombine_u16(vmovn_u32(c), vmovn_u32(d));
#endif
    return npyv_pack_b8_b16(ab, cd);
}
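// Illustrative sketch (disabled; not part of this header's API): how the
// pack/tobits helpers above are typically combined — four 32-bit comparison
// masks packed into one 8-bit boolean vector, then collapsed into a 16-bit
// lane bitfield. npyv_loada_f32 and npyv_cmpgt_f32 are assumed to be
// provided by the memory/operator headers of this universal-intrinsics
// layer; the function name is hypothetical.
#if 0
NPY_FINLINE npy_uint64
example_gt_zero_bits_f32(const float *src)
{
    const npyv_f32 zero = vdupq_n_f32(0.0f);
    // four consecutive 4-lane loads cover 16 floats
    npyv_b32 m0 = npyv_cmpgt_f32(npyv_loada_f32(src +  0), zero);
    npyv_b32 m1 = npyv_cmpgt_f32(npyv_loada_f32(src +  4), zero);
    npyv_b32 m2 = npyv_cmpgt_f32(npyv_loada_f32(src +  8), zero);
    npyv_b32 m3 = npyv_cmpgt_f32(npyv_loada_f32(src + 12), zero);
    // one byte lane per source element, then one bit per byte lane
    return npyv_tobits_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
}
#endif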
// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h)
{
#if defined(__aarch64__)
    npyv_b32 ab = vuzp1q_u32((uint32x4_t)a, (uint32x4_t)b);
    npyv_b32 cd = vuzp1q_u32((uint32x4_t)c, (uint32x4_t)d);
    npyv_b32 ef = vuzp1q_u32((uint32x4_t)e, (uint32x4_t)f);
    npyv_b32 gh = vuzp1q_u32((uint32x4_t)g, (uint32x4_t)h);
#else
    npyv_b32 ab = vcombine_u32(vmovn_u64(a), vmovn_u64(b));
    npyv_b32 cd = vcombine_u32(vmovn_u64(c), vmovn_u64(d));
    npyv_b32 ef = vcombine_u32(vmovn_u64(e), vmovn_u64(f));
    npyv_b32 gh = vcombine_u32(vmovn_u64(g), vmovn_u64(h));
#endif
    return npyv_pack_b8_b32(ab, cd, ef, gh);
}

// round to nearest integer
#if NPY_SIMD_F64
    #define npyv_round_s32_f32 vcvtnq_s32_f32
    NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
    {
        npyv_s64 lo = vcvtnq_s64_f64(a), hi = vcvtnq_s64_f64(b);
        return vcombine_s32(vmovn_s64(lo), vmovn_s64(hi));
    }
#else
    NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a)
    {
        // halves are rounded away from zero rather than to even; obeying
        // the IEEE round-to-nearest-even rule is very costly on ARMv7,
        // so tests should pass with a +-1 difference
        const npyv_u32 sign = vdupq_n_u32(0x80000000);
        const npyv_f32 half = vdupq_n_f32(0.5f);
        // copy the sign bit of each lane of `a` onto 0.5f
        npyv_f32 sign_half = vbslq_f32(sign, a, half);
        return vcvtq_s32_f32(vaddq_f32(a, sign_half));
    }
#endif

#endif // _NPY_SIMD_NEON_CVT_H
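// Illustrative scalar sketch of the ARMv7 rounding fallback above (disabled,
// and intentionally kept outside the header guard). vbslq_f32 with the
// 0x80000000 mask copies the sign bit of `a` onto 0.5f; copysignf() below
// mirrors that bit-select, so adding the signed half and truncating toward
// zero rounds halves away from zero: 2.5f -> 3 and -2.5f -> -3. The function
// name is hypothetical.
#if 0
#include <math.h> // copysignf
static int example_round_scalar(float a)
{
    float sign_half = copysignf(0.5f, a); // same effect as the vector bit-select
    return (int)(a + sign_half);          // the C cast truncates toward zero,
                                          // matching vcvtq_s32_f32
}
#endif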