/* -*- c -*- */
#define PY_SSIZE_T_CLEAN
#include <Python.h>

#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "npy_config.h"
#include "numpy/npy_common.h"
#include "numpy/arrayobject.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_math.h"
#include "numpy/halffloat.h"
#include "lowlevel_strided_loops.h"


#include "blas_utils.h"
#include "npy_cblas.h"
#include "arraytypes.h" /* For TYPE_dot functions */

#include <assert.h>

/*
 *****************************************************************************
 **                            BASICS                                       **
 *****************************************************************************
 */

#define ABS(x) ((x) < 0 ? -(x) : (x))

#if defined(HAVE_CBLAS)
/*
 * -1 to be conservative, in case blas internally uses a for loop with an
 * inclusive upper bound
 */
#ifndef HAVE_BLAS_ILP64
#define BLAS_MAXSIZE (NPY_MAX_INT - 1)
#else
#define BLAS_MAXSIZE (NPY_MAX_INT64 - 1)
#endif

/*
 * Determine if a 2d matrix can be used by BLAS
 * 1. Strides must not alias or overlap
 * 2. The faster (second) axis must be contiguous
 * 3. The slower (first) axis stride, in unit steps, must be larger than
 *    the faster axis dimension
 */
static inline npy_bool
is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2,
              npy_intp d1, npy_intp d2,  npy_intp itemsize)
{
    npy_intp unit_stride1 = byte_stride1 / itemsize;
    if (byte_stride2 != itemsize) {
        return NPY_FALSE;
    }
    if ((byte_stride1 % itemsize ==0) &&
        (unit_stride1 >= d2) &&
        (unit_stride1 <= BLAS_MAXSIZE))
    {
        return NPY_TRUE;
    }
    return NPY_FALSE;
}

#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
static const npy_cdouble oneD = {1.0, 0.0}, zeroD = {0.0, 0.0};
static const npy_cfloat oneF = {1.0f, 0.0f}, zeroF = {0.0f, 0.0f};
#else
static const npy_cdouble oneD = 1.0, zeroD = 0.0;
static const npy_cfloat oneF = 1.0f, zeroF = 0.0f;
#endif

/**begin repeat
 *
 * #name = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
 * #ctype = npy_float, npy_double, npy_cfloat, npy_cdouble#
 * #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
 * #prefix = s, d, c, z#
 * #step1 = 1.F, 1., &oneF, &oneD#
 * #step0 = 0.F, 0., &zeroF, &zeroD#
 */

static inline void
@name@_matrix_copy(npy_bool transpose,
                   void *_ip, npy_intp is_m, npy_intp is_n,
                   void *_op, npy_intp os_m, npy_intp os_n,
                   npy_intp dm, npy_intp dn)
{

    char *ip = (char *)_ip, *op = (char *)_op;

    npy_intp m, n, ib, ob;
    
    if (transpose) {
        ib = is_m * dm, ob = os_m * dm;

        for (n = 0; n < dn; n++) {
            for (m = 0; m < dm; m++) {
                *(@ctype@ *)op = *(@ctype@ *)ip;
                ip += is_m;
                op += os_m;
            }
            ip += is_n - ib;
            op += os_n - ob;
        }
    
        return;
    }

    ib = is_n * dn, ob = os_n * dn;

    for (m = 0; m < dm; m++) {
        for (n = 0; n < dn; n++) {
            *(@ctype@ *)op = *(@ctype@ *)ip;
            ip += is_n;
            op += os_n;
        }
        ip += is_m - ib;
        op += os_m - ob;
    }
}

static void
@name@_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
            void *ip2, npy_intp is2_n,
            void *op, npy_intp op_m,
            npy_intp m, npy_intp n)
{
    /*
     * Vector matrix multiplication -- Level 2 BLAS
     * arguments
     * ip1: contiguous data, m*n shape
     * ip2: data in c order, n*1 shape
     * op:  data in c order, m shape
     */
    enum CBLAS_ORDER order;
    CBLAS_INT M, N, lda;

    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
    assert (is_blasable2d(is2_n, sizeof(@typ@), n, 1, sizeof(@typ@)));
    M = (CBLAS_INT)m;
    N = (CBLAS_INT)n;

    if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) {
        order = CblasColMajor;
        lda = (CBLAS_INT)(is1_m / sizeof(@typ@));
    }
    else {
        /* If not ColMajor, caller should have ensured we are RowMajor */
        /* will not assert in release mode */
        order = CblasRowMajor;
        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@)));
        lda = (CBLAS_INT)(is1_n / sizeof(@typ@));
    }
    CBLAS_FUNC(cblas_@prefix@gemv)(order, CblasTrans, N, M, @step1@, ip1, lda, ip2,
                                     is2_n / sizeof(@typ@), @step0@, op, op_m / sizeof(@typ@));
}

static void
@name@_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
                           void *ip2, npy_intp is2_n, npy_intp is2_p,
                           void *op, npy_intp os_m, npy_intp os_p,
                           npy_intp m, npy_intp n, npy_intp p)
{
    /*
     * matrix matrix multiplication -- Level 3 BLAS
     */
    enum CBLAS_ORDER order = CblasRowMajor;
    enum CBLAS_TRANSPOSE trans1, trans2;
    CBLAS_INT M, N, P, lda, ldb, ldc;
    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
    M = (CBLAS_INT)m;
    N = (CBLAS_INT)n;
    P = (CBLAS_INT)p;

    assert(is_blasable2d(os_m, os_p, m, p, sizeof(@typ@)));
    ldc = (CBLAS_INT)(os_m / sizeof(@typ@));

    if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) {
        trans1 = CblasNoTrans;
        lda = (CBLAS_INT)(is1_m / sizeof(@typ@));
    }
    else {
        /* If not ColMajor, caller should have ensured we are RowMajor */
        /* will not assert in release mode */
        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@)));
        trans1 = CblasTrans;
        lda = (CBLAS_INT)(is1_n / sizeof(@typ@));
    }

    if (is_blasable2d(is2_n, is2_p, n, p, sizeof(@typ@))) {
        trans2 = CblasNoTrans;
        ldb = (CBLAS_INT)(is2_n / sizeof(@typ@));
    }
    else {
        /* If not ColMajor, caller should have ensured we are RowMajor */
        /* will not assert in release mode */
        assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(@typ@)));
        trans2 = CblasTrans;
        ldb = (CBLAS_INT)(is2_p / sizeof(@typ@));
    }
    /*
     * Use syrk if we have a case of a matrix times its transpose.
     * Otherwise, use gemm for all other cases.
     */
    if (
        (ip1 == ip2) &&
        (m == p) &&
        (is1_m == is2_p) &&
        (is1_n == is2_n) &&
        (trans1 != trans2)
    ) {
        npy_intp i,j;
        if (trans1 == CblasNoTrans) {
            CBLAS_FUNC(cblas_@prefix@syrk)(
                order, CblasUpper, trans1, P, N, @step1@,
                ip1, lda, @step0@, op, ldc);
        }
        else {
            CBLAS_FUNC(cblas_@prefix@syrk)(
                order, CblasUpper, trans1, P, N, @step1@,
                ip1, ldb, @step0@, op, ldc);
        }
        /* Copy the triangle */
        for (i = 0; i < P; i++) {
            for (j = i + 1; j < P; j++) {
                ((@typ@*)op)[j * ldc + i] = ((@typ@*)op)[i * ldc + j];
            }
        }

    }
    else {
        CBLAS_FUNC(cblas_@prefix@gemm)(
            order, trans1, trans2, M, P, N, @step1@, ip1, lda,
            ip2, ldb, @step0@, op, ldc);
    }
}

/**end repeat**/
#endif

/*
 * matmul loops
 * signature is (m?,n),(n,p?)->(m?,p?)
 */

/**begin repeat
 *  #TYPE = LONGDOUBLE,
 *          FLOAT, DOUBLE, HALF,
 *          CFLOAT, CDOUBLE, CLONGDOUBLE,
 *          UBYTE, USHORT, UINT, ULONG, ULONGLONG,
 *          BYTE, SHORT, INT, LONG, LONGLONG#
 *  #typ = npy_longdouble,
 *         npy_float,npy_double,npy_half,
 *         npy_cfloat, npy_cdouble, npy_clongdouble,
 *         npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
 *         npy_byte, npy_short, npy_int, npy_long, npy_longlong#
 * #suff = , , , , f, , l, x*10#
 * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*10#
 * #IS_HALF = 0, 0, 0, 1, 0*13#
 */

static void
@TYPE@_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)

{
    npy_intp m, n, p;
    npy_intp ib1_n, ib2_n, ib2_p, ob_p;
    char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op;

    ib1_n = is1_n * dn;
    ib2_n = is2_n * dn;
    ib2_p = is2_p * dp;
    ob_p  = os_p * dp;

    for (m = 0; m < dm; m++) {
        for (p = 0; p < dp; p++) {
#if @IS_COMPLEX@ == 1
            npy_csetreal@suff@((@typ@ *)op, 0.0);
            npy_csetimag@suff@((@typ@ *)op, 0.0);
#elif @IS_HALF@
            float sum = 0;
#else
            *(@typ@ *)op = 0;
#endif
            for (n = 0; n < dn; n++) {
                @typ@ val1 = (*(@typ@ *)ip1);
                @typ@ val2 = (*(@typ@ *)ip2);
#if @IS_HALF@
                sum += npy_half_to_float(val1) * npy_half_to_float(val2);
#elif @IS_COMPLEX@ == 1
                npy_csetreal@suff@((@typ@ *) op, npy_creal@suff@(*((@typ@ *) op)) + ((npy_creal@suff@(val1) * npy_creal@suff@(val2)) -
                                       (npy_cimag@suff@(val1) * npy_cimag@suff@(val2))));
                npy_csetimag@suff@((@typ@ *) op, npy_cimag@suff@(*((@typ@ *) op)) + ((npy_creal@suff@(val1) * npy_cimag@suff@(val2)) +
                                       (npy_cimag@suff@(val1) * npy_creal@suff@(val2))));
#else
                *(@typ@ *)op += val1 * val2;
#endif
                ip2 += is2_n;
                ip1 += is1_n;
            }
#if @IS_HALF@
            *(@typ@ *)op = npy_float_to_half(sum);
#endif
            ip1 -= ib1_n;
            ip2 -= ib2_n;
            op  +=  os_p;
            ip2 += is2_p;
        }
        op -= ob_p;
        ip2 -= ib2_p;
        ip1 += is1_m;
        op  +=  os_m;
    }
}

/**end repeat**/
static void
BOOL_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)

{
    npy_intp m, n, p;
    npy_intp ib2_p, ob_p;
    char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op;

    ib2_p = is2_p * dp;
    ob_p  = os_p * dp;

    for (m = 0; m < dm; m++) {
        for (p = 0; p < dp; p++) {
            char *ip1tmp = ip1;
            char *ip2tmp = ip2;
            *(npy_bool *)op = NPY_FALSE;
            for (n = 0; n < dn; n++) {
                npy_bool val1 = (*(npy_bool *)ip1tmp);
                npy_bool val2 = (*(npy_bool *)ip2tmp);
                if (val1 != 0 && val2 != 0) {
                    *(npy_bool *)op = NPY_TRUE;
                    break;
                }
                ip2tmp += is2_n;
                ip1tmp += is1_n;
            }
            op  +=  os_p;
            ip2 += is2_p;
        }
        op -= ob_p;
        ip2 -= ib2_p;
        ip1 += is1_m;
        op  +=  os_m;
    }
}

static void
OBJECT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op;

    npy_intp ib1_n = is1_n * dn;
    npy_intp ib2_n = is2_n * dn;
    npy_intp ib2_p = is2_p * dp;
    npy_intp ob_p  = os_p * dp;

    PyObject *product, *sum_of_products = NULL;

    for (npy_intp m = 0; m < dm; m++) {
        for (npy_intp p = 0; p < dp; p++) {
            if ( 0 == dn ) {
                sum_of_products = PyLong_FromLong(0);
                if (sum_of_products == NULL) {
                    return;
                }
            }

            for (npy_intp n = 0; n < dn; n++) {
                PyObject *obj1 = *(PyObject**)ip1, *obj2 = *(PyObject**)ip2;
                if (obj1 == NULL) {
                    obj1 = Py_None;
                }
                if (obj2 == NULL) {
                    obj2 = Py_None;
                }

                product = PyNumber_Multiply(obj1, obj2);
                if (product == NULL) {
                    Py_XDECREF(sum_of_products);
                    return;
                }

                if (n == 0) {
                    sum_of_products = product;
                }
                else {
                    Py_SETREF(sum_of_products, PyNumber_Add(sum_of_products, product));
                    Py_DECREF(product);
                    if (sum_of_products == NULL) {
                        return;
                    }
                }

                ip2 += is2_n;
                ip1 += is1_n;
            }

            *((PyObject **)op) = sum_of_products;
            ip1 -= ib1_n;
            ip2 -= ib2_n;
            op  +=  os_p;
            ip2 += is2_p;
        }
        op -= ob_p;
        ip2 -= ib2_p;
        ip1 += is1_m;
        op  +=  os_m;
    }
}


/**begin repeat
 *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF,
 *          CFLOAT, CDOUBLE, CLONGDOUBLE,
 *          UBYTE, USHORT, UINT, ULONG, ULONGLONG,
 *          BYTE, SHORT, INT, LONG, LONGLONG,
 *          BOOL, OBJECT#
 *  #typ = npy_float,npy_double,npy_longdouble, npy_half,
 *         npy_cfloat, npy_cdouble, npy_clongdouble,
 *         npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
 *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
 *         npy_bool,npy_object#
 * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*12#
 * #USEBLAS = 1, 1, 0, 0, 1, 1, 0*13#
 */


NPY_NO_EXPORT void
@TYPE@_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp dOuter = *dimensions++;
    npy_intp iOuter;
    npy_intp s0 = *steps++;
    npy_intp s1 = *steps++;
    npy_intp s2 = *steps++;
    npy_intp dm = dimensions[0];
    npy_intp dn = dimensions[1];
    npy_intp dp = dimensions[2];
    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
         os_m=steps[4], os_p=steps[5];
#if @USEBLAS@ && defined(HAVE_CBLAS)
    npy_intp sz = sizeof(@typ@);
    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
    npy_bool scalar_out = (dm == 1 && dp == 1);
    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
                                 dp > BLAS_MAXSIZE);
    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
    npy_bool oblasable = o_c_blasable || o_f_blasable;
    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
                              is_blasable2d(is1_n, sz, dn, 1, sz));
    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
                              is_blasable2d(is2_n, sz, dn, 1, sz));
    npy_bool noblas_fallback = too_big_for_blas || any_zero_dim;
    npy_bool matrix_matrix = !noblas_fallback && !special_case;
    npy_bool allocate_buffer = matrix_matrix && (
        !i1blasable || !i2blasable || !oblasable
    );

    uint8_t *tmp_ip12op = NULL;
    void *tmp_ip1 = NULL, *tmp_ip2 = NULL, *tmp_op = NULL;

    if (allocate_buffer){
        npy_intp ip1_size = i1blasable ? 0 : sz * dm * dn,
                 ip2_size = i2blasable ? 0 : sz * dn * dp,
                 op_size = oblasable ? 0 : sz * dm * dp,
                 total_size = ip1_size + ip2_size + op_size;

        tmp_ip12op = (uint8_t*)malloc(total_size);

        if (tmp_ip12op == NULL) {
            PyGILState_STATE gil_state = PyGILState_Ensure();
            PyErr_SetString(
                PyExc_MemoryError, "Out of memory in matmul"
            );
            PyGILState_Release(gil_state);

            return;
        }    
    
        tmp_ip1 = tmp_ip12op;
        tmp_ip2 = tmp_ip12op + ip1_size;
        tmp_op = tmp_ip12op + ip1_size + ip2_size;    
    }

#endif

    for (iOuter = 0; iOuter < dOuter; iOuter++,
                         args[0] += s0, args[1] += s1, args[2] += s2) {
        void *ip1=args[0], *ip2=args[1], *op=args[2];
#if @USEBLAS@ && defined(HAVE_CBLAS)
        /*
         * TODO: refactor this out to a inner_loop_selector, in
         * PyUFunc_MatmulLoopSelector. But that call does not have access to
         * n, m, p and strides.
         */
        if (noblas_fallback) {
            @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
                                       ip2, is2_n, is2_p,
                                       op, os_m, os_p, dm, dn, dp);
        }
        else if (special_case) {
            /* Special case variants that have a 1 in the core dimensions */
            if (scalar_out) {
                /* row @ column, 1,1 output */
                @TYPE@_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
            } else if (scalar_vec){
                /*
                 * 1,1d @ vector or vector @ 1,1d
                 * could use cblas_Xaxy, but that requires 0ing output
                 * and would not be faster (XXX prove it)
                 */
                @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
                                           ip2, is2_n, is2_p,
                                           op, os_m, os_p, dm, dn, dp);
            } else if (vector_matrix) {
                /* vector @ matrix, switch ip1, ip2, p and m */
                @TYPE@_gemv(ip2, is2_p, is2_n, ip1, is1_n,
                            op, os_p, dp, dn);
            } else if  (matrix_vector) {
                /* matrix @ vector */
                @TYPE@_gemv(ip1, is1_m, is1_n, ip2, is2_n,
                            op, os_m, dm, dn);
            } else {
                /* column @ row, 2d output, no blas needed or non-blas-able input */
                @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
                                           ip2, is2_n, is2_p,
                                           op, os_m, os_p, dm, dn, dp);
            }
        } else {
            /* matrix @ matrix 
             * copy if not blasable, see gh-12365 & gh-23588 */
            npy_bool i1_transpose = ABS(is1_m) < ABS(is1_n),
                     i2_transpose = ABS(is2_n) < ABS(is2_p),
                     o_transpose = ABS(os_m) < ABS(os_p);

            npy_intp tmp_is1_m = i1_transpose ? sz : sz*dn,
                     tmp_is1_n = i1_transpose ? sz*dm : sz,
                     tmp_is2_n = i2_transpose ? sz : sz*dp,
                     tmp_is2_p = i2_transpose ? sz*dn : sz,
                     tmp_os_m = o_transpose ? sz : sz*dp,
                     tmp_os_p = o_transpose ? sz*dm : sz;

            if (!i1blasable) {
                @TYPE@_matrix_copy(
                    i1_transpose, ip1, is1_m, is1_n,
                    tmp_ip1, tmp_is1_m, tmp_is1_n,
                    dm, dn
                );
            }
            
            if (!i2blasable) {
                @TYPE@_matrix_copy(
                    i2_transpose, ip2, is2_n, is2_p,
                    tmp_ip2, tmp_is2_n, tmp_is2_p,
                    dn, dp
                );
            }

            void *ip1_ = i1blasable ? ip1 : tmp_ip1,
                 *ip2_ = i2blasable ? ip2 : tmp_ip2,
                 *op_ = oblasable ? op : tmp_op;

            npy_intp is1_m_ = i1blasable ? is1_m : tmp_is1_m,
                     is1_n_ = i1blasable ? is1_n : tmp_is1_n,
                     is2_n_ = i2blasable ? is2_n : tmp_is2_n,
                     is2_p_ = i2blasable ? is2_p : tmp_is2_p,
                     os_m_ = oblasable ? os_m : tmp_os_m,
                     os_p_ = oblasable ? os_p : tmp_os_p;

            /*
             * Use transpose equivalence:
             * matmul(a, b, o) == matmul(b.T, a.T, o.T)
             */
            if (o_transpose) {
                @TYPE@_matmul_matrixmatrix(
                    ip2_, is2_p_, is2_n_,
                    ip1_, is1_n_, is1_m_,
                    op_, os_p_, os_m_,
                    dp, dn, dm
                );
            }
            else {
                @TYPE@_matmul_matrixmatrix(
                    ip1_, is1_m_, is1_n_,
                    ip2_, is2_n_, is2_p_,
                    op_, os_m_, os_p_,
                    dm, dn, dp
                );
            }

            if(!oblasable){
                @TYPE@_matrix_copy(
                    o_transpose, tmp_op, tmp_os_m, tmp_os_p,
                    op, os_m, os_p,
                    dm, dp
                );
            }
        }
#else
        @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
                                   ip2, is2_n, is2_p,
                                   op, os_m, os_p, dm, dn, dp);

#endif
    }
#if @USEBLAS@ && defined(HAVE_CBLAS)
#if NPY_BLAS_CHECK_FPE_SUPPORT
    if (!npy_blas_supports_fpe()) {
        npy_clear_floatstatus_barrier((char*)args);
    }
#endif
    if (allocate_buffer) free(tmp_ip12op);
#endif
}

/**end repeat**/


/*
 * vecdot loops
 * signature is (n),(n)->()
 * We need to redefine complex and object dot from arraytypes.c.src to ensure
 * we take the complex conjugate of the first argument (for cblas, this means
 * using the dotc functions instead of dotu).
 */

/**begin repeat
 *
 * #name = CFLOAT, CDOUBLE, CLONGDOUBLE#
 * #ctype = npy_cfloat, npy_cdouble, npy_clongdouble#
 * #type = npy_float, npy_double, npy_longdouble#
 * #sumtype = npy_double, npy_double, npy_longdouble#
 * #prefix = c, z, 0#
 * #USE_BLAS = 1, 1, 0#
 */
static void
@name@_dotc(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
            char *op, npy_intp n, void *NPY_UNUSED(ignore))
{
#if defined(HAVE_CBLAS) && @USE_BLAS@
    CBLAS_INT is1b = blas_stride(is1, sizeof(@ctype@));
    CBLAS_INT is2b = blas_stride(is2, sizeof(@ctype@));

    if (is1b && is2b) {
        @sumtype@ sum[2] = {0., 0.};  /* at least double for stability */

        while (n > 0) {
            CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
            @type@ tmp[2];

            CBLAS_FUNC(cblas_@prefix@dotc_sub)(
                    (CBLAS_INT)chunk, ip1, is1b, ip2, is2b, tmp);
            sum[0] += (@sumtype@)tmp[0];
            sum[1] += (@sumtype@)tmp[1];
            /* use char strides here */
            ip1 += chunk * is1;
            ip2 += chunk * is2;
            n -= chunk;
        }
        ((@type@ *)op)[0] = (@type@)sum[0];
        ((@type@ *)op)[1] = (@type@)sum[1];
    }
    else
#endif
    {
        @sumtype@ sumr = 0., sumi = 0.;  /* at least double for stability */
        for (npy_intp i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
            const @type@ ip1r = ((@type@ *)ip1)[0];
            const @type@ ip1i = ((@type@ *)ip1)[1];
            const @type@ ip2r = ((@type@ *)ip2)[0];
            const @type@ ip2i = ((@type@ *)ip2)[1];
            /* ip1.conj() * ip2 */
            sumr += (@sumtype@)(ip1r * ip2r + ip1i * ip2i);
            sumi += (@sumtype@)(ip1r * ip2i - ip1i * ip2r);
        }
        ((@type@ *)op)[0] = (@type@)sumr;
        ((@type@ *)op)[1] = (@type@)sumi;
    }
}
/**end repeat**/

static void
OBJECT_dotc(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
            void *NPY_UNUSED(ignore))
{
    npy_intp i;
    PyObject *result = NULL;

    for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
        PyObject *obj1 = *(PyObject**)ip1, *obj2 = *(PyObject**)ip2;
        if (obj1 == NULL) {
            obj1 = Py_None;  /* raise useful error message at conjugate */
        }
        if (obj2 == NULL) {
            obj2 = Py_None;  /* raise at multiply (unless obj1 handles None) */
        }
        PyObject *obj1_conj = PyObject_CallMethod(obj1, "conjugate", NULL);
        if (obj1_conj == NULL) {
            goto fail;
        }
        PyObject *prod = PyNumber_Multiply(obj1_conj, obj2);
        Py_DECREF(obj1_conj);
        if (prod == NULL) {
            goto fail;
        }
        if (i == 0) {
            result = prod;
        }
        else {
            Py_SETREF(result, PyNumber_Add(result, prod));
            Py_DECREF(prod);
            if (result == NULL) {
                goto fail;
            }
        }
    }
    Py_XSETREF(*((PyObject **)op), result);
    return;

  fail:
    Py_XDECREF(result);
    return;
}

/**begin repeat
 *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF,
 *          UBYTE, USHORT, UINT, ULONG, ULONGLONG,
 *          BYTE, SHORT, INT, LONG, LONGLONG, BOOL,
 *          CFLOAT, CDOUBLE, CLONGDOUBLE, OBJECT#
 *  #DOT = dot*15, dotc*4#
 *  #CHECK_PYERR = 0*18, 1#
 *  #CHECK_BLAS = 1*2, 0*13, 1*2, 0*2#
 */
NPY_NO_EXPORT void
@TYPE@_vecdot(char **args, npy_intp const *dimensions, npy_intp const *steps,
              void *NPY_UNUSED(func))
{
    npy_intp n_outer = dimensions[0];
    npy_intp s0=steps[0], s1=steps[1], s2=steps[2];
    npy_intp n_inner = dimensions[1];
    npy_intp is1=steps[3], is2=steps[4];

    for (npy_intp i = 0; i < n_outer; i++, args[0] += s0, args[1] += s1, args[2] += s2) {
        /*
         * TODO: use new API to select inner loop with get_loop and
         * improve error treatment.
         */
        char *ip1=args[0], *ip2=args[1], *op=args[2];
        @TYPE@_@DOT@(ip1, is1, ip2, is2, op, n_inner, NULL);
#if @CHECK_PYERR@
        if (PyErr_Occurred()) {
            return;
        }
#endif
    }
#if @CHECK_BLAS@ && NPY_BLAS_CHECK_FPE_SUPPORT
    if (!npy_blas_supports_fpe()) {
        npy_clear_floatstatus_barrier((char*)args);
    }
#endif
}
/**end repeat**/

#if defined(HAVE_CBLAS)
/*
 * Blas complex vector-matrix product via gemm (gemv cannot conjugate the vector).
 */
/**begin repeat
 *
 * #name = CFLOAT, CDOUBLE#
 * #typ = npy_cfloat, npy_cdouble#
 * #prefix = c, z#
 * #step1 = &oneF, &oneD#
 * #step0 = &zeroF, &zeroD#
 */
static void
@name@_vecmat_via_gemm(void *ip1, npy_intp is1_n,
                       void *ip2, npy_intp is2_n, npy_intp is2_m,
                       void *op, npy_intp os_m,
                       npy_intp n, npy_intp m)
{
    enum CBLAS_ORDER order = CblasRowMajor;
    enum CBLAS_TRANSPOSE trans1, trans2;
    CBLAS_INT N, M, lda, ldb, ldc;
    assert(n <= BLAS_MAXSIZE && m <= BLAS_MAXSIZE);
    N = (CBLAS_INT)n;
    M = (CBLAS_INT)m;

    assert(os_m == sizeof(@typ@));
    ldc = (CBLAS_INT)m;

    assert(is_blasable2d(is1_n, sizeof(@typ@), n, 1, sizeof(@typ@)));
    trans1 = CblasConjTrans;
    lda = (CBLAS_INT)(is1_n / sizeof(@typ@));

    if (is_blasable2d(is2_n, is2_m, n, m, sizeof(@typ@))) {
        trans2 = CblasNoTrans;
        ldb = (CBLAS_INT)(is2_n / sizeof(@typ@));
    }
    else {
        assert(is_blasable2d(is2_m, is2_n, m, n, sizeof(@typ@)));
        trans2 = CblasTrans;
        ldb = (CBLAS_INT)(is2_m / sizeof(@typ@));
    }
    CBLAS_FUNC(cblas_@prefix@gemm)(
        order, trans1, trans2, 1, M, N, @step1@, ip1, lda,
        ip2, ldb, @step0@, op, ldc);
}
/**end repeat**/
#endif

/*
 * matvec loops, using blas gemv if possible, and TYPE_dot implementations otherwise.
 * signature is (m,n),(n)->(m)
 */
/**begin repeat
 *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF,
 *          CFLOAT, CDOUBLE, CLONGDOUBLE,
 *          UBYTE, USHORT, UINT, ULONG, ULONGLONG,
 *          BYTE, SHORT, INT, LONG, LONGLONG,
 *          BOOL, OBJECT#
 *  #typ = npy_float,npy_double,npy_longdouble, npy_half,
 *         npy_cfloat, npy_cdouble, npy_clongdouble,
 *         npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
 *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
 *         npy_bool, npy_object#
 * #USEBLAS = 1, 1, 0, 0, 1, 1, 0*13#
 * #CHECK_PYERR = 0*18, 1#
 */
NPY_NO_EXPORT void
@TYPE@_matvec(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp n_outer = dimensions[0];
    npy_intp s0=steps[0], s1=steps[1], s2=steps[2];
    npy_intp dm = dimensions[1], dn = dimensions[2];
    npy_intp is1_m=steps[3], is1_n=steps[4], is2_n=steps[5], os_m=steps[6];
#if @USEBLAS@ && defined(HAVE_CBLAS)
    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE);
    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sizeof(@typ@));
    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sizeof(@typ@));
    npy_bool i2_blasable = is_blasable2d(is2_n, sizeof(@typ@), dn, 1, sizeof(@typ@));
    npy_bool blasable = ((i1_c_blasable || i1_f_blasable) && i2_blasable
                         && !too_big_for_blas && dn > 1 && dm > 1);
#endif
    for (npy_intp i = 0; i < n_outer; i++,
             args[0] += s0, args[1] += s1, args[2] += s2) {
        char *ip1=args[0], *ip2=args[1], *op=args[2];
#if @USEBLAS@ && defined(HAVE_CBLAS)
        if (blasable) {
            @TYPE@_gemv(ip1, is1_m, is1_n, ip2, is2_n, op, os_m, dm, dn);
            continue;
        }
#endif
        /*
         * Dot the different matrix rows with the vector to get output elements.
         * (no conjugation for complex, unlike vecdot and vecmat)
         */
        for (npy_intp j = 0; j < dm; j++, ip1 += is1_m, op += os_m) {
            @TYPE@_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
#if @CHECK_PYERR@
            if (PyErr_Occurred()) {
                return;
            }
#endif
        }
    }
#if @USEBLAS@ && NPY_BLAS_CHECK_FPE_SUPPORT
    if (!npy_blas_supports_fpe()) {
        npy_clear_floatstatus_barrier((char*)args);
    }
#endif
}
/**end repeat**/

/*
 * vecmat loops, using blas gemv for float and gemm for complex if possible,
 * and TYPE_dot[c] implementations otherwise.
 * Note that we cannot use gemv for complex, since we need to conjugate the vector.
 * signature is (n),(n,m)->(m)
 */
/**begin repeat
 *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF,
 *          CFLOAT, CDOUBLE, CLONGDOUBLE,
 *          UBYTE, USHORT, UINT, ULONG, ULONGLONG,
 *          BYTE, SHORT, INT, LONG, LONGLONG,
 *          BOOL, OBJECT#
 *  #typ = npy_float,npy_double,npy_longdouble, npy_half,
 *         npy_cfloat, npy_cdouble, npy_clongdouble,
 *         npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
 *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
 *         npy_bool, npy_object#
 * #USEBLAS = 1, 1, 0, 0, 1, 1, 0*13#
 * #COMPLEX = 0*4, 1*3, 0*11, 1#
 * #DOT = dot*4, dotc*3, dot*11, dotc#
 * #CHECK_PYERR = 0*18, 1#
 */
NPY_NO_EXPORT void
@TYPE@_vecmat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp n_outer = dimensions[0];
    npy_intp s0=steps[0], s1=steps[1], s2=steps[2];
    npy_intp dn = dimensions[1], dm = dimensions[2];
    npy_intp is1_n=steps[3], is2_n=steps[4], is2_m=steps[5], os_m=steps[6];
#if @USEBLAS@ && defined(HAVE_CBLAS)
    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE);
    npy_bool i1_blasable = is_blasable2d(is1_n, sizeof(@typ@), dn, 1, sizeof(@typ@));
    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_m, dn, dm, sizeof(@typ@));
    npy_bool i2_f_blasable = is_blasable2d(is2_m, is2_n, dm, dn, sizeof(@typ@));
    npy_bool blasable = (i1_blasable && (i2_c_blasable || i2_f_blasable)
                         && !too_big_for_blas && dn > 1 && dm > 1);
#endif
    for (npy_intp i = 0; i < n_outer; i++,
             args[0] += s0, args[1] += s1, args[2] += s2) {
        char *ip1=args[0], *ip2=args[1], *op=args[2];
#if @USEBLAS@ && defined(HAVE_CBLAS)
        if (blasable) {
#if @COMPLEX@
            /* For complex, use gemm so we can conjugate the vector */
            @TYPE@_vecmat_via_gemm(ip1, is1_n, ip2, is2_n, is2_m, op, os_m, dn, dm);
#else
            /* For float, use gemv (hence flipped order) */
            @TYPE@_gemv(ip2, is2_m, is2_n, ip1, is1_n, op, os_m, dm, dn);
#endif
            continue;
        }
#endif
        /* Dot the vector with different matrix columns to get output elements. */
        for (npy_intp j = 0; j < dm; j++, ip2 += is2_m, op += os_m) {
            @TYPE@_@DOT@(ip1, is1_n, ip2, is2_n, op, dn, NULL);
#if @CHECK_PYERR@
            if (PyErr_Occurred()) {
                return;
            }
#endif
        }
    }
#if @USEBLAS@ && NPY_BLAS_CHECK_FPE_SUPPORT
    if (!npy_blas_supports_fpe()) {
        npy_clear_floatstatus_barrier((char*)args);
    }
#endif
}
/**end repeat**/