import itertools
import operator

from packaging import version

import numpy as np

from .common import DLPACK_TYPES, TYPES1, Benchmark, get_squares_

# Names of the NumPy ufuncs that the ``UFunc`` benchmark is parametrized over.
ufuncs = ['abs', 'absolute', 'add', 'arccos', 'arccosh', 'arcsin', 'arcsinh',
          'arctan', 'arctan2', 'arctanh', 'bitwise_and', 'bitwise_count', 'bitwise_not',
          'bitwise_or', 'bitwise_xor', 'cbrt', 'ceil', 'conj', 'conjugate',
          'copysign', 'cos', 'cosh', 'deg2rad', 'degrees', 'divide', 'divmod',
          'equal', 'exp', 'exp2', 'expm1', 'fabs', 'float_power', 'floor',
          'floor_divide', 'fmax', 'fmin', 'fmod', 'frexp', 'gcd', 'greater',
          'greater_equal', 'heaviside', 'hypot', 'invert', 'isfinite',
          'isinf', 'isnan', 'isnat', 'lcm', 'ldexp', 'left_shift', 'less',
          'less_equal', 'log', 'log10', 'log1p', 'log2', 'logaddexp',
          'logaddexp2', 'logical_and', 'logical_not', 'logical_or',
          'logical_xor', 'matmul', 'matvec', 'maximum', 'minimum', 'mod',
          'modf', 'multiply', 'negative', 'nextafter', 'not_equal', 'positive',
          'power', 'rad2deg', 'radians', 'reciprocal', 'remainder',
          'right_shift', 'rint', 'sign', 'signbit', 'sin',
          'sinh', 'spacing', 'sqrt', 'square', 'subtract', 'tan', 'tanh',
          'true_divide', 'trunc', 'vecdot', 'vecmat']
# Names of the NumPy functions that ``ArrayFunctionDispatcher`` is
# parametrized over.
arrayfuncdisp = ['real', 'round']

# Import-time sanity check: every listed name must resolve to an actual
# ``np.ufunc`` instance.  A name that NumPy does not provide resolves to
# None here and therefore fails the check as well.
for name in ufuncs:
    f = getattr(np, name, None)
    if not isinstance(f, np.ufunc):
        raise ValueError(f"Bench target `np.{name}` is not a ufunc")

# Gather every ufunc object reachable through ``dir(np)`` ...
all_ufuncs = (getattr(np, name, None) for name in dir(np))
all_ufuncs = set(filter(lambda f: isinstance(f, np.ufunc), all_ufuncs))
# ... and the ufunc objects that the list above covers.
bench_ufuncs = {getattr(np, name, None) for name in ufuncs}

# Refuse to import when NumPy exposes a ufunc that has no entry in ``ufuncs``,
# so that newly added ufuncs cannot silently go unbenchmarked.
missing_ufuncs = all_ufuncs - bench_ufuncs
if len(missing_ufuncs) > 0:
    missing_ufunc_names = [f.__name__ for f in missing_ufuncs]
    raise NotImplementedError(
        f"Missing benchmarks for ufuncs {missing_ufunc_names!r}")


class ArrayFunctionDispatcher(Benchmark):
    """Time the single-argument NumPy functions named in ``arrayfuncdisp``.

    The function is called once for each array obtained from
    ``get_squares_()``; arrays it rejects with ``TypeError`` are left out.
    """
    timeout = 10
    param_names = ['func']
    params = [arrayfuncdisp]

    def setup(self, ufuncname):
        """Resolve ``np.<ufuncname>`` and collect the inputs it accepts.

        Raises ``NotImplementedError`` when NumPy has no such attribute.
        """
        np.seterr(all='ignore')
        try:
            self.afdn = getattr(np, ufuncname)
        except AttributeError:
            raise NotImplementedError

        # Probe each input once so the timed loop only sees accepted ones.
        self.args = []
        for square in get_squares_().values():
            call_args = (square,)  # always exactly one positional argument
            try:
                self.afdn(*call_args)
            except TypeError:
                pass
            else:
                self.args.append(call_args)

    def time_afdn_types(self, ufuncname):
        """Call the function on every accepted input."""
        [self.afdn(*call_args) for call_args in self.args]


class Broadcast(Benchmark):
    """Time subtracting a 1-D float64 array from a 2-D float64 array."""

    def setup(self):
        """Allocate a (50000, 100) array of ones and a (100,) array of ones."""
        n_rows, n_cols = 50000, 100
        self.d = np.ones((n_rows, n_cols), dtype=np.float64)
        self.e = np.ones((n_cols,), dtype=np.float64)

    def time_broadcast(self):
        """Subtract the (100,) operand from the (50000, 100) operand."""
        self.d - self.e


class At(Benchmark):
    """Time ``ufunc.at`` with ten million values scattered into 1000 slots."""

    def setup(self):
        """Create seeded random values, target indices and the result buffer."""
        n_values, n_slots = 10_000_000, 1000
        rng = np.random.default_rng(1)
        self.vals = rng.random(n_values, dtype=np.float64)
        self.idx = rng.integers(n_slots, size=n_values).astype(np.intp)
        self.res = np.zeros(n_slots, dtype=self.vals.dtype)

    def time_sum_at(self):
        """Accumulate ``vals`` into ``res`` at ``idx`` with ``np.add.at``."""
        np.add.at(self.res, self.idx, self.vals)

    def time_maximum_at(self):
        """Reduce ``vals`` into ``res`` at ``idx`` with ``np.maximum.at``."""
        np.maximum.at(self.res, self.idx, self.vals)


class UFunc(Benchmark):
    """Time every ufunc named in ``ufuncs`` on the ``get_squares_()`` arrays.

    Each array is repeated ``nin`` times to fill all of the ufunc's inputs;
    arrays the ufunc rejects with ``TypeError`` are left out.
    """
    timeout = 10
    param_names = ['ufunc']
    params = [ufuncs]

    def setup(self, ufuncname):
        """Resolve ``np.<ufuncname>`` and collect the inputs it accepts.

        Raises ``NotImplementedError`` when NumPy has no such attribute.
        """
        np.seterr(all='ignore')
        try:
            self.ufn = getattr(np, ufuncname)
        except AttributeError:
            raise NotImplementedError

        # Probe each input once so the timed loop only sees accepted ones.
        self.args = []
        for square in get_squares_().values():
            call_args = (square,) * self.ufn.nin
            try:
                self.ufn(*call_args)
            except TypeError:
                pass
            else:
                self.args.append(call_args)

    def time_ufunc_types(self, ufuncname):
        """Call the ufunc on every accepted input tuple."""
        [self.ufn(*call_args) for call_args in self.args]


class MethodsV0(Benchmark):
    """Time the unary operators (abs, negation, unary plus) on an array."""
    timeout = 10
    param_names = ['methods', 'npdtypes']
    params = [['__abs__', '__neg__', '__pos__'], TYPES1]

    def setup(self, methname, npdtypes):
        """Take the first item of the ``get_squares_()`` entry for the dtype."""
        self.xarg = get_squares_().get(npdtypes)[0]

    def time_ndarray_meth(self, methname, npdtypes):
        """Look the operator up in the ``operator`` module and apply it."""
        unary_op = getattr(operator, methname)
        unary_op(self.xarg)


class NDArrayLRShifts(Benchmark):
    """Time left and right shifts of a 1000-element integer array by 2."""
    timeout = 10
    param_names = ['methods', 'npdtypes']
    params = [
        ['__lshift__', '__rshift__'],
        ['intp', 'int8', 'int16', 'int32', 'int64',
         'uint8', 'uint16', 'uint32', 'uint64'],
    ]

    def setup(self, methname, npdtypes):
        """Fill the array with a single random integer drawn from [0, 9)."""
        ones = np.ones(1000, dtype=getattr(np, npdtypes))
        self.vals = ones * np.random.randint(9)

    def time_ndarray_meth(self, methname, npdtypes):
        """Apply the shift operator from the ``operator`` module."""
        getattr(operator, methname)(self.vals, 2)


class Methods0DBoolComplex(Benchmark):
    """Time ``__bool__`` and ``__complex__`` on a zero-dimensional array."""
    timeout = 10
    param_names = ['methods', 'npdtypes']
    params = [['__bool__', '__complex__'], TYPES1]

    def setup(self, methname, npdtypes):
        """Build a 0-d array holding the value 3 in the requested dtype."""
        self.xarg = np.array(3, dtype=npdtypes)

    def time_ndarray__0d__(self, methname, npdtypes):
        """Fetch the bound dunder method and invoke it without arguments."""
        bound = getattr(self.xarg, methname)
        bound()


class Methods0DFloatInt(Benchmark):
    """Time ``__int__`` and ``__float__`` on a zero-dimensional array.

    Dtypes whose name starts with ``'complex'`` are excluded.
    """
    timeout = 10
    param_names = ['methods', 'npdtypes']
    params = [
        ['__int__', '__float__'],
        [name for name in TYPES1 if not name.startswith('complex')],
    ]

    def setup(self, methname, npdtypes):
        """Build a 0-d array holding the value 3 in the requested dtype."""
        self.xarg = np.array(3, dtype=npdtypes)

    def time_ndarray__0d__(self, methname, npdtypes):
        """Fetch the bound dunder method and invoke it without arguments."""
        bound = getattr(self.xarg, methname)
        bound()


class Methods0DInvert(Benchmark):
    """Time ``__invert__`` on a zero-dimensional signed-integer array."""
    timeout = 10
    param_names = ['npdtypes']
    params = ['int16', 'int32', 'int64']

    def setup(self, npdtypes):
        """Build a 0-d array holding the value 3 in the requested dtype."""
        self.xarg = np.array(3, dtype=npdtypes)

    def time_ndarray__0d__(self, npdtypes):
        """Call the bitwise-invert dunder method directly."""
        self.xarg.__invert__()


class MethodsV1(Benchmark):
    """Benchmark the binary operators that take one other operand.

    The operands are the first two items of the ``get_squares_()`` entry for
    the requested dtype.
    """
    params = [['__add__', '__eq__', '__ge__', '__gt__', '__le__',
               '__lt__', '__matmul__', '__mul__', '__ne__',
               '__pow__', '__sub__', '__truediv__'],
              TYPES1]
    param_names = ['methods', 'npdtypes']
    timeout = 10

    def setup(self, methname, npdtypes):
        """Pick the two operands, scaling the second one for inexact dtypes."""
        values = get_squares_().get(npdtypes)
        self.xargs = [values[0], values[1]]
        if np.issubdtype(npdtypes, np.inexact):
            # avoid overflow in __pow__/__matmul__ for low-precision dtypes.
            # Scale into a new array instead of using `*=`: an in-place
            # multiply writes into the object handed out by get_squares_().
            # NOTE(review): if that helper returns shared/cached data, the
            # old in-place form shrank it by 0.01 on every setup call.
            self.xargs[1] = self.xargs[1] * 0.01

    def time_ndarray_meth(self, methname, npdtypes):
        """Apply the operator from the ``operator`` module to both operands."""
        getattr(operator, methname)(*self.xargs)


class MethodsV1IntOnly(Benchmark):
    """Time the bitwise and/or/xor operators on signed-integer operands."""
    timeout = 10
    param_names = ['methods', 'npdtypes']
    params = [
        ['__and__', '__or__', '__xor__'],
        ['int16', 'int32', 'int64'],
    ]

    def setup(self, methname, npdtypes):
        """Use the first two items of the ``get_squares_()`` entry."""
        square = get_squares_().get(npdtypes)
        lhs, rhs = square[0], square[1]
        self.xargs = [lhs, rhs]

    def time_ndarray_meth(self, methname, npdtypes):
        """Apply the operator from the ``operator`` module to both operands."""
        binary_op = getattr(operator, methname)
        binary_op(*self.xargs)


class MethodsV1NoComplex(Benchmark):
    """Time floor division and modulo on two array operands.

    Dtypes whose name starts with ``'complex'`` are excluded.
    """
    timeout = 10
    param_names = ['methods', 'npdtypes']
    params = [
        ['__floordiv__', '__mod__'],
        [name for name in TYPES1 if not name.startswith('complex')],
    ]

    def setup(self, methname, npdtypes):
        """Use the first two items of the ``get_squares_()`` entry."""
        square = get_squares_().get(npdtypes)
        lhs, rhs = square[0], square[1]
        self.xargs = [lhs, rhs]

    def time_ndarray_meth(self, methname, npdtypes):
        """Apply the operator from the ``operator`` module to both operands."""
        binary_op = getattr(operator, methname)
        binary_op(*self.xargs)


class NDArrayGetItem(Benchmark):
    """Time ``__getitem__`` with several index forms on a small and a big array."""
    params = [
        [0, (0, 0), (-1, 0), [0, -1]],
        ['small', 'big'],
    ]
    param_names = ['margs', 'msize']

    def setup(self, margs, msize):
        """Create a (2, 3) and a (50, 50) array of uniform values in [-1, 1)."""
        small_flat = np.random.uniform(-1, 1, 6)
        self.xs = small_flat.reshape(2, 3)
        big_flat = np.random.uniform(-1, 1, 50 * 50)
        self.xl = big_flat.reshape(50, 50)

    def time_methods_getitem(self, margs, msize):
        """Index the array selected by ``msize`` with ``margs``."""
        if msize == 'small':
            target = self.xs
        elif msize == 'big':
            target = self.xl
        target.__getitem__(margs)


class NDArraySetItem(Benchmark):
    """Benchmark item assignment with several index forms.

    ``margs`` is the index used for the assignment and ``msize`` selects
    either the (2, 3) array or the (100, 100) array created in ``setup``.
    """
    param_names = ['margs', 'msize']
    params = [[0, (0, 0), (-1, 0), [0, -1]],
              ['small', 'big']]

    def setup(self, margs, msize):
        """Create both target arrays with uniform values in [-1, 1)."""
        self.xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
        self.xl = np.random.uniform(-1, 1, 100 * 100).reshape(100, 100)

    def time_methods_setitem(self, margs, msize):
        """Assign the scalar 17 at ``margs`` in the array chosen by ``msize``."""
        if msize == 'small':
            mdat = self.xs
        elif msize == 'big':
            mdat = self.xl
        # The assignment must run for both sizes.  It used to sit inside the
        # 'big' branch, so the 'small' case timed nothing but the branch.
        mdat[margs] = 17


class DLPMethods(Benchmark):
    """Benchmark the DLPack helpers ``__dlpack__`` and ``__dlpack_device__``.

    The benchmarked array is the first item of the ``'int16'`` entry of
    ``get_squares_()``, converted to the requested dtype.
    """
    params = [['__dlpack__', '__dlpack_device__'], DLPACK_TYPES]
    param_names = ['methods', 'npdtypes']
    timeout = 10

    def setup(self, methname, npdtypes):
        """Build the array to export in the dtype named by ``npdtypes``.

        Raises ``NotImplementedError`` for ``'bool'`` when the installed
        NumPy version does not compare greater than 1.25.
        """
        if npdtypes == 'bool' and not (
                version.parse(np.__version__) > version.parse("1.25")):
            raise NotImplementedError("Not supported before v1.25")
        # Honour the dtype parameter.  Previously every non-bool value of
        # `npdtypes` reused the int16 array unchanged, so all dtype variants
        # of this benchmark measured exactly the same thing.
        self.xarg = get_squares_().get('int16')[0].astype(npdtypes)

    def time_ndarray_dlp(self, methname, npdtypes):
        """Fetch the bound DLPack method and invoke it without arguments."""
        meth = getattr(self.xarg, methname)
        meth()


class NDArrayAsType(Benchmark):
    """Time ``astype`` between each pair of dtypes drawn from ``TYPES1``."""
    timeout = 10
    param_names = ['typeconv']
    params = [list(itertools.combinations(TYPES1, 2))]

    def setup(self, typeconv):
        """Fetch the source array; refuse pairs naming the same dtype twice."""
        source, target = typeconv[0], typeconv[1]
        if source == target:
            raise NotImplementedError(
                "Skipping test for converting to the same dtype")
        self.xarg = get_squares_().get(source)

    def time_astype(self, typeconv):
        """Convert the source array to the second dtype of the pair."""
        target = typeconv[1]
        self.xarg.astype(target)


class UFuncSmall(Benchmark):
    """Time a few ufuncs on tiny arrays and on scalars.

    With inputs this small the measured time is dominated by the ufunc call
    overhead rather than by the element-wise computation.
    """
    timeout = 10
    param_names = ['ufunc']
    params = ['abs', 'sqrt', 'cos']

    def setup(self, ufuncname):
        """Resolve ``np.<ufuncname>`` and prepare the small inputs.

        Raises ``NotImplementedError`` when NumPy has no such attribute.
        """
        np.seterr(all='ignore')
        try:
            self.f = getattr(np, ufuncname)
        except AttributeError:
            raise NotImplementedError

        # Scalar inputs: one NumPy scalar and one plain Python float.
        self.python_float = 1.1
        self.float64 = np.float64(1.1)
        # Array inputs: five floats and three integers.
        self.array_int_3 = np.array([1, 2, 3])
        self.array_5 = np.array([1., 2., 10., 3., 4.])

    def time_ufunc_small_array(self, ufuncname):
        """Apply the ufunc to the five-element float array."""
        self.f(self.array_5)

    def time_ufunc_small_array_inplace(self, ufuncname):
        """Apply the ufunc to the float array, writing back into it."""
        self.f(self.array_5, out=self.array_5)

    def time_ufunc_small_int_array(self, ufuncname):
        """Apply the ufunc to the three-element integer array."""
        self.f(self.array_int_3)

    def time_ufunc_numpy_scalar(self, ufuncname):
        """Apply the ufunc to a ``np.float64`` scalar."""
        self.f(self.float64)

    def time_ufunc_python_float(self, ufuncname):
        """Apply the ufunc to a Python ``float``."""
        self.f(self.python_float)


class Custom(Benchmark):
    """Time ``nonzero`` and the bitwise operators on boolean arrays."""

    def setup(self):
        """Create an all-True array of 20000 items and one of 3 items."""
        large, small = 20000, 3
        self.b = np.ones(large, dtype=bool)
        self.b_small = np.ones(small, dtype=bool)

    def time_nonzero(self):
        """Locate the True entries of the large array."""
        np.nonzero(self.b)

    def time_not_bool(self):
        """Invert the large array."""
        ~self.b

    def time_and_bool(self):
        """AND the large array with itself."""
        self.b & self.b

    def time_or_bool(self):
        """OR the large array with itself."""
        self.b | self.b

    def time_and_bool_small(self):
        """AND the three-element array with itself."""
        self.b_small & self.b_small


class CustomInplace(Benchmark):
    """Compare ``out=`` ufunc calls with chained operators that make temporaries.

    Covers int8, int32, float32 and float64 arrays.
    """

    def setup(self):
        """Allocate the four working arrays."""
        self.c = np.ones(500000, dtype=np.int8)
        self.i = np.ones(150000, dtype=np.int32)
        self.f = np.zeros(150000, dtype=np.float32)
        self.d = np.zeros(75000, dtype=np.float64)
        # Write to the zero-filled arrays once so their memory is faulted in
        # before any timing starts.
        self.f *= 1.
        self.d *= 1.

    def time_char_or(self):
        """OR the int8 array with 0 in place, array-first then scalar-first."""
        np.bitwise_or(self.c, 0, out=self.c)
        np.bitwise_or(0, self.c, out=self.c)

    def time_char_or_temp(self):
        """OR the int8 array with 0 on both sides via operators."""
        0 | self.c | 0

    def time_int_or(self):
        """OR the int32 array with 0 in place, array-first then scalar-first."""
        np.bitwise_or(self.i, 0, out=self.i)
        np.bitwise_or(0, self.i, out=self.i)

    def time_int_or_temp(self):
        """OR the int32 array with 0 on both sides via operators."""
        0 | self.i | 0

    def time_float_add(self):
        """Add 1 to the float32 array in place, array-first then scalar-first."""
        np.add(self.f, 1., out=self.f)
        np.add(1., self.f, out=self.f)

    def time_float_add_temp(self):
        """Add 1 to the float32 array on both sides via operators."""
        1. + self.f + 1.

    def time_double_add(self):
        """Add 1 to the float64 array in place, array-first then scalar-first."""
        np.add(self.d, 1., out=self.d)
        np.add(1., self.d, out=self.d)

    def time_double_add_temp(self):
        """Add 1 to the float64 array on both sides via operators."""
        1. + self.d + 1.


class CustomScalar(Benchmark):
    """Time ufuncs combining a float array with the Python integer 1."""
    param_names = ['dtype']
    params = [np.float32, np.float64]

    def setup(self, dtype):
        """Create an array of 20000 ones in the requested dtype."""
        n_items = 20000
        self.d = np.ones(n_items, dtype=dtype)

    def time_add_scalar2(self, dtype):
        """Add 1 to the array."""
        np.add(self.d, 1)

    def time_divide_scalar2(self, dtype):
        """Divide the array by 1."""
        np.divide(self.d, 1)

    def time_divide_scalar2_inplace(self, dtype):
        """Divide the array by 1, writing the result back into it."""
        np.divide(self.d, 1, out=self.d)


class CustomComparison(Benchmark):
    """Time ``<`` between arrays and between an array and a 1-element array."""
    param_names = ['dtype']
    params = (
        np.int8, np.int16, np.int32, np.int64,
        np.uint8, np.uint16, np.uint32, np.uint64,
        np.float32, np.float64, np.bool,
    )

    def setup(self, dtype):
        """Create two 50000-element arrays of ones and a 1-element one."""
        n_items = 50000
        self.x = np.ones(n_items, dtype=dtype)
        self.y = np.ones(n_items, dtype=dtype)
        self.s = np.ones(1, dtype=dtype)

    def time_less_than_binary(self, dtype):
        """Compare the two full-size arrays."""
        self.x < self.y

    def time_less_than_scalar1(self, dtype):
        """Compare with the 1-element array on the left."""
        self.s < self.x

    def time_less_than_scalar2(self, dtype):
        """Compare with the 1-element array on the right."""
        self.x < self.s


class CustomScalarFloorDivideInt(Benchmark):
    """Time floor division of a signed-integer array by a Python integer."""
    param_names = ['dtype', 'divisors']
    params = (
        np._core.sctypes['int'],
        [8, -8, 43, -43],
    )

    def setup(self, dtype, divisor):
        """Draw 10000 random integers from the dtype's ``iinfo`` range."""
        limits = np.iinfo(dtype)
        low, high = limits.min, limits.max
        self.x = np.random.randint(low, high, size=10000, dtype=dtype)

    def time_floor_divide_int(self, dtype, divisor):
        """Floor-divide the array by the scalar divisor."""
        self.x // divisor


class CustomScalarFloorDivideUInt(Benchmark):
    """Time floor division of an unsigned-integer array by a Python integer."""
    param_names = ['dtype', 'divisors']
    params = (
        np._core.sctypes['uint'],
        [8, 43],
    )

    def setup(self, dtype, divisor):
        """Draw 10000 random integers from the dtype's ``iinfo`` range."""
        limits = np.iinfo(dtype)
        low, high = limits.min, limits.max
        self.x = np.random.randint(low, high, size=10000, dtype=dtype)

    def time_floor_divide_uint(self, dtype, divisor):
        """Floor-divide the array by the scalar divisor."""
        self.x // divisor


class CustomArrayFloorDivideInt(Benchmark):
    """Time element-wise floor division between two integer arrays."""
    param_names = ['dtype', 'size']
    params = (
        np._core.sctypes['int'] + np._core.sctypes['uint'],
        [100, 10000, 1000000],
    )

    def setup(self, dtype, size):
        """Draw a random dividend array and a divisor array in [2, 32)."""
        limits = np.iinfo(dtype)
        low, high = limits.min, limits.max
        self.x = np.random.randint(low, high, size=size, dtype=dtype)
        self.y = np.random.randint(2, 32, size=size, dtype=dtype)

    def time_floor_divide_int(self, dtype, size):
        """Floor-divide the dividend array by the divisor array."""
        self.x // self.y


class Scalar(Benchmark):
    """Time addition on zero-dimensional arrays."""

    def setup(self):
        """Create a real 0-d array, a complex 0-d array and a Python complex."""
        self.x = np.asarray(1.0)
        self.y = np.asarray(1.0 + 1j)
        self.z = complex(1.0, 1.0)

    def time_add_scalar(self):
        """Add the real 0-d array to itself."""
        self.x + self.x

    def time_add_scalar_conv(self):
        """Add a Python float to the real 0-d array."""
        self.x + 1.0

    def time_add_scalar_conv_complex(self):
        """Add a Python complex to the complex 0-d array."""
        self.y + self.z


class ArgPack:
    """Bundle positional and keyword arguments for a later call.

    The ``repr`` renders the bundle the way it would appear inside a call's
    parentheses, e.g. ``(1, 'a', out=None)``.
    """
    __slots__ = ['args', 'kwargs']

    def __init__(self, *args, **kwargs):
        self.args, self.kwargs = args, kwargs

    def __repr__(self):
        # Positional values first, then ``name=value`` pairs in dict order.
        pieces = [repr(value) for value in self.args]
        for key, value in self.kwargs.items():
            pieces.append(f'{key}={value!r}')
        return '(' + ', '.join(pieces) + ')'


class ArgParsing(Benchmark):
    """Time argument parsing of ``np.add`` for different call spellings.

    Apart from the ``out`` arguments, the extra arguments are chosen so that
    they do not affect the calculation: ``subok=True`` and ``where=True``
    are the defaults.
    """
    x = np.array(1.)
    y = np.array(2.)
    out = np.array(3.)
    params = [[
        # inputs only
        ArgPack(x, y),
        # output given positionally, by keyword, and as a 1-tuple
        ArgPack(x, y, out),
        ArgPack(x, y, out=out),
        ArgPack(x, y, out=(out,)),
        # default-valued keywords, with and without an output
        ArgPack(x, y, out=out, subok=True, where=True),
        ArgPack(x, y, subok=True),
        ArgPack(x, y, subok=True, where=True),
        ArgPack(x, y, out, subok=True, where=True),
    ]]
    param_names = ['arg_kwarg']

    def time_add_arg_parsing(self, arg_pack):
        """Call ``np.add`` with the packed positional and keyword arguments."""
        np.add(*arg_pack.args, **arg_pack.kwargs)


class ArgParsingReduce(Benchmark):
    """Time argument parsing of ``np.add.reduce`` for different call spellings.

    Apart from the ``out`` arguments, the extra arguments are chosen so that
    they have minimal effect on the calculation.
    """
    a = np.arange(2.)
    out = np.array(0.)
    params = [[
        # array only
        ArgPack(a),
        # axis, then axis and dtype, positionally and by keyword
        ArgPack(a, 0),
        ArgPack(a, axis=0),
        ArgPack(a, 0, None),
        ArgPack(a, axis=0, dtype=None),
        # with an output array
        ArgPack(a, 0, None, out),
        ArgPack(a, axis=0, dtype=None, out=out),
        ArgPack(a, out=out),
    ]]
    param_names = ['arg_kwarg']

    def time_add_reduce_arg_parsing(self, arg_pack):
        """Call ``np.add.reduce`` with the packed arguments."""
        np.add.reduce(*arg_pack.args, **arg_pack.kwargs)

class BinaryBench(Benchmark):
    """Time ``power`` and ``arctan2`` on one-million-element float arrays."""
    param_names = ['dtype']
    params = [np.float32, np.float64]

    def setup(self, dtype):
        """Draw two random arrays in [0, 1) and convert them to ``dtype``."""
        n_items = 1000000
        first = np.random.rand(n_items)
        self.a = first.astype(dtype)
        second = np.random.rand(n_items)
        self.b = second.astype(dtype)

    def time_pow(self, dtype):
        """Raise one array to the power of the other."""
        np.power(self.a, self.b)

    def time_pow_2(self, dtype):
        """Square the array through ``np.power``."""
        np.power(self.a, 2.0)

    def time_pow_half(self, dtype):
        """Raise the array to the power 0.5 through ``np.power``."""
        np.power(self.a, 0.5)

    def time_pow_2_op(self, dtype):
        """Square the array through the ``**`` operator."""
        self.a ** 2

    def time_pow_half_op(self, dtype):
        """Raise the array to the power 0.5 through the ``**`` operator."""
        self.a ** 0.5

    def time_atan2(self, dtype):
        """Compute ``arctan2`` of the two arrays."""
        np.arctan2(self.a, self.b)

class BinaryBenchInteger(Benchmark):
    """Time ``np.power`` on one-million-element integer arrays."""
    param_names = ['dtype']
    params = [np.int32, np.int64]

    def setup(self, dtype):
        """Draw bases from [0, 20) and exponents from [0, 4)."""
        n_items = 1000000
        bases = np.random.randint(20, size=n_items)
        self.a = bases.astype(dtype)
        exponents = np.random.randint(4, size=n_items)
        self.b = exponents.astype(dtype)

    def time_pow(self, dtype):
        """Raise the base array to the exponent array."""
        np.power(self.a, self.b)

    def time_pow_two(self, dtype):
        """Raise the base array to the power 2."""
        np.power(self.a, 2)

    def time_pow_five(self, dtype):
        """Raise the base array to the power 5."""
        np.power(self.a, 5)