import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from pygam import GAM, LinearGAM, LogisticGAM, PoissonGAM, InvGaussGAM, s, f, te
from pygam import intercept

from .exploration import find_mask_words


def get_females_per_year(gender_first_author, year_first_author, years,
                         subset_size=None, rs=42, verbose=False):
    """Returns instances (for training a GAM) for each paper: its year and the
    gender of its author.

    Returns X and y to train a GAM that fits the fraction of female authors
    per year, where X is the year of the paper and y is whether the author of
    that paper was female (1) or male (0). If there are no papers for a given
    year (e.g. the subset of 'bioinformatics' papers in 1970), nothing is
    returned for that year.

    Parameters
    ----------
    gender_first_author : array-like of shape (n_papers,)
        Predicted gender for each paper ('male', 'female' or 'unknown').
    year_first_author : array-like of shape (n_papers,)
        Year of each paper.
    years : array-like
        Years to query.
    subset_size : int, optional
        If not given, all existing instances ('female' or 'male') from the
        years specified in `years` are returned. If given, a subset of
        `subset_size` instances is returned for every year; years with fewer
        than `subset_size` instances are skipped.
    rs : int, default=42
        Random seed for the subset, in case `subset_size` is given.
    verbose : bool, default=False
        If True, print the year being processed.

    Returns
    -------
    X_years : ndarray of shape (n_instances,)
        Individual years.
    y_genders : ndarray of bool
        Individual genders ('female'=True, 'male'=False).
    average_fraction_female : ndarray of shape (n_years,)
        Average fraction of female authors per year.
    average_number_females : ndarray of shape (n_years,)
        Number of female authors per year.
    """
    y_genders = []
    X_years = []
    average_fraction_female = []
    average_number_females = []

    for year in years:
        if verbose:
            print('Year', year)
        genders_year = gender_first_author[year_first_author == year]

        if subset_size is not None:
            # skip years with fewer papers than the requested subset size
            if len(genders_year) < subset_size:
                continue
            np.random.seed(rs)
            subset = np.random.choice(genders_year.shape[0],
                                      size=subset_size, replace=False)
            genders_year = genders_year[subset]

        # keep only papers whose author gender could be predicted
        total_pred_authors = len(np.where(genders_year != 'unknown')[0])
        if total_pred_authors == 0:
            continue
        n_females = len(np.where(genders_year == 'female')[0])
        average_fraction_female.append(n_females / total_pred_authors)
        average_number_females.append(n_females)

        genders_predicted = genders_year[genders_year != 'unknown']
        X_years.append([year] * len(genders_predicted))
        y_genders.append(np.where(genders_predicted == 'female', 1, 0))

    y_genders = np.hstack(y_genders)
    X_years = np.hstack(X_years)
    average_fraction_female = np.array(average_fraction_female)
    average_number_females = np.array(average_number_females)

    return X_years, y_genders, average_fraction_female, average_number_females


def get_knn_overlap_gam(X, labels, date_year, years, class1, other_classes,
                        k=10, subset_size=None, rs=42, verbose=False):
    """Returns instances (for training a GAM) for each paper: its year and its
    kNN overlap with other classes.

    The X values are the individual years and the y values are the kNN overlap
    of each paper with one other class. One y column is returned for each
    class in `other_classes`. For years without any `class1` points, the
    corresponding X and y values are -1. If `subset_size` is not None and a
    year has fewer than `subset_size` points of `class1`, all available points
    are taken; take this into account and filter these values before plotting.

    Parameters
    ----------
    X : array-like of shape (n_papers, n_dims)
        Dataset in which to look for the neighbors.
    labels : array-like of shape (n_papers,)
        Labels of the papers.
    date_year : array-like of shape (n_papers,)
        Years of the papers.
    years : array-like
        Years to query.
    class1 : str
        Main class (label).
    other_classes : list of str
        Other labels for which to calculate the kNN overlap.
    k : int, default=10
        Number of nearest neighbors to query.
    subset_size : int, optional
        If given, only a subset of `subset_size` points is taken for every
        year.
    rs : int, default=42
        Random seed for the selection of the subset.
    verbose : bool, default=False
        If True, print the year being calculated.

    Returns
    -------
    individual_years : ndarray of shape (n_points,)
        Individual years of the instances. n_points is either the number of
        points with label == `class1` or `subset_size`*len(`years`).
    individual_knn_overlap : ndarray of shape (n_points, n_other_classes)
        Individual kNN overlap of the instances, one column per class.
    average_knn_overlap : ndarray of shape (n_years, n_other_classes)
        Average kNN overlap, one column per class.
    """
    # select only labeled papers
    X_labeled = X[labels != 'unlabeled']
    labels_labeled = labels[labels != 'unlabeled']
    date_year_labeled = date_year[labels != 'unlabeled']

    # The query points are themselves part of the fitted data, so sklearn
    # returns each query point as its own first neighbor: query k+1 points
    # and divide by k below. This is fine because the self-neighbor is always
    # class1 and therefore never one of `other_classes`.
    k_to_query = k + 1

    # initialize variables (-1 marks slots that are never filled)
    if subset_size is not None:
        n_points = len(years) * subset_size
    else:
        n_points = X_labeled[labels_labeled == class1].shape[0]
    individual_years = np.ones(n_points) * -1
    individual_knn_overlap = np.ones((n_points, len(other_classes))) * -1
    average_knn_overlap = np.zeros((len(years), len(other_classes)))

    # fit the neighbor index once; X_labeled does not change across years
    nbrs = NearestNeighbors(n_neighbors=k_to_query, n_jobs=-1).fit(X_labeled)

    # loop over years
    n = 0
    for i in range(len(years)):
        if verbose:
            print('year', years[i])

        # select the class1 subset from a particular year
        X_class1 = X_labeled[(labels_labeled == class1)
                             & (date_year_labeled == years[i])]

        # no class1 papers from that year: mark the year and move on
        if X_class1.shape[0] == 0:
            average_knn_overlap[i, :] = -1
            continue

        # subsample the class1 papers (all of them if fewer than subset_size)
        if subset_size is not None:
            np.random.seed(rs)
            subset = np.random.choice(
                X_class1.shape[0],
                size=min(subset_size, X_class1.shape[0]),
                replace=False)
            X_class1 = X_class1[subset]

        # get the nearest neighbors of the (subsampled) class1 papers;
        # ind.shape == (n_queried_points, k_to_query)
        dist, ind = nbrs.kneighbors(X=X_class1, return_distance=True)

        # count papers of each other class among the neighbors
        for j in range(ind.shape[0]):
            individual_years[n + j] = years[i]
            for l, class_l in enumerate(other_classes):
                n_class_l = len(np.where(labels_labeled[ind[j]] == class_l)[0])
                individual_knn_overlap[n + j, l] = n_class_l / k
                average_knn_overlap[i, l] += n_class_l
        n += ind.shape[0]

        # normalize the average kNN overlap
        average_knn_overlap[i, :] = average_knn_overlap[i, :] / ind.shape[0] / k

    return individual_years, individual_knn_overlap, average_knn_overlap

def get_ml_per_year(years, abstracts, date_year, verbose=False):
    """Returns instances (for training a GAM) for each paper: its year and
    whether its abstract contains 'machine learning'.

    Returns X and y to train a GAM that fits the fraction of ML papers per
    year, where X is the year of the paper and y is whether its abstract
    contains 'machine learning'. If there are no papers for a given year
    (e.g. the subset of 'bioinformatics' papers in 1970), nothing is returned
    for that year.

    Parameters
    ----------
    years : array-like
        Years to query.
    abstracts : array-like of str
        Corpus of abstracts.
    date_year : array-like
        Years of the papers.
    verbose : bool, default=False
        If True, print the year being queried.

    Returns
    -------
    X_years : ndarray of shape (n_instances,)
        Individual years.
    y_ml : ndarray of bool
        Whether each abstract contains 'machine learning'.
    fraction_ml : ndarray of shape (n_years,)
        Fraction of papers with ML in their abstract per year.
    number_papers : ndarray of shape (n_years,)
        Number of papers per year.
    """
    X_years = []
    y_ml = []
    fraction_ml = []
    number_papers = []

    for year in years:
        if verbose:
            print('Year', year)
        abstracts_year = abstracts[date_year == year]
        if abstracts_year.shape[0] == 0:
            continue
        # boolean mask: which abstracts mention 'machine learning'?
        mask_ml = find_mask_words(abstracts_year, 'machine learning',
                                  verbose=False)
        X_years.append([year] * len(mask_ml))
        y_ml.append(mask_ml)
        fraction_ml.append(np.mean(mask_ml))
        number_papers.append(len(abstracts_year))

    X_years = np.hstack(X_years)
    y_ml = np.hstack(y_ml)
    fraction_ml = np.hstack(fraction_ml)
    number_papers = np.hstack(number_papers)

    return X_years, y_ml, fraction_ml, number_papers


def train_logistic_gam(X_train, y_train, verbose=False):
    """Trains a logistic GAM with the parameter values chosen for our
    experiment.

    Parameters
    ----------
    X_train : array-like of shape (n_instances, 1)
        Training features (years).
    y_train : array-like of shape (n_instances,)
        Binary training targets.
    verbose : bool, default=False
        If True, print gam.summary().

    Returns
    -------
    gam
        Trained GAM model.
    """
    n_features = 1  # number of features used in the model
    lams = np.logspace(-5, 5, 20) * n_features
    splines = 12  # number of splines we will use

    gam = LogisticGAM(s(0, n_splines=splines))
    gam.gridsearch(X_train, y_train, lam=lams)
    if verbose:
        gam.summary()
    return gam


def train_linear_gam(X_train, y_train, verbose=False):
    """Trains a linear GAM with the parameter values chosen for our
    experiment.

    Parameters
    ----------
    X_train : array-like of shape (n_instances, 1)
        Training features (years).
    y_train : array-like of shape (n_instances,)
        Training targets.
    verbose : bool, default=False
        If True, print gam.summary().

    Returns
    -------
    gam
        Trained GAM model.
    """
    n_features = 1  # number of features used in the model
    lams = np.logspace(-5, 5, 20) * n_features
    splines = 6  # number of splines we will use

    gam = LinearGAM(s(0, n_splines=splines))
    gam.gridsearch(X_train, y_train, lam=lams)
    if verbose:
        gam.summary()
    return gam


def get_plot_gam(gam, gam_type):
    """Gets everything needed from a trained GAM model to produce the plot.

    Parameters
    ----------
    gam
        Trained GAM model.
    gam_type : str, {'linear', 'logistic'}
        Type of the trained GAM.

    Returns
    -------
    XX
        Generated X coordinates.
    pdep
        Predicted GAM values for the `XX` coordinates.
    confi : ndarray of shape (n_XX, 2)
        Confidence intervals.
    intercept
        Intercept value.
    """
    XX = gam.generate_X_grid(term=0)
    if gam_type == 'logistic':
        pdep = gam.predict_proba(XX)
    elif gam_type == 'linear':
        pdep = gam.predict(XX)
    else:
        raise ValueError("gam_type must be 'linear' or 'logistic'")
    confi = gam.confidence_intervals(XX)
    intercept = gam.coef_[-1]
    return XX, pdep, confi, intercept
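
# ---------------------------------------------------------------------------
# Minimal usage sketches for the instance-builders above. The data and label
# names here are synthetic placeholders (not the project's real inputs),
# included only to show the expected shapes and how the outputs feed the GAM
# helpers in this module.
# ---------------------------------------------------------------------------

def _example_females_per_year():
    # Hypothetical sketch: random gender predictions for 300 papers.
    rng = np.random.default_rng(0)
    genders = rng.choice(np.array(['female', 'male', 'unknown']), size=300)
    pub_years = rng.choice(np.arange(2000, 2005), size=300)
    X_years, y_genders, frac_female, n_female = get_females_per_year(
        genders, pub_years, np.arange(2000, 2005))
    # X_years/y_genders are flat instance arrays, e.g. for
    # train_logistic_gam(X_years.reshape(-1, 1), y_genders);
    # frac_female/n_female are per-year summaries for plotting.
    return X_years, y_genders, frac_female, n_female


def _example_knn_overlap():
    # Hypothetical sketch: 2-D points with placeholder labels.
    rng = np.random.default_rng(0)
    X = rng.normal(size=(400, 2))
    labels = rng.choice(np.array(['nsc', 'ml', 'unlabeled']), size=400)
    date_year = rng.choice(np.arange(2000, 2005), size=400)
    ind_years, ind_overlap, avg_overlap = get_knn_overlap_gam(
        X, labels, date_year, np.arange(2000, 2005),
        class1='nsc', other_classes=['ml'], k=10)
    # Slots with ind_years == -1 were never filled; mask them out before
    # training a GAM or plotting.
    mask = ind_years != -1
    return ind_years[mask], ind_overlap[mask], avg_overlap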
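
# ---------------------------------------------------------------------------
# End-to-end sketch on synthetic data: train the logistic GAM and unpack the
# plotting quantities from get_plot_gam. Everything here is a hypothetical
# example; matplotlib is only needed for this sketch, not by the module.
# ---------------------------------------------------------------------------

def _example_train_and_plot():
    import matplotlib.pyplot as plt
    rng = np.random.default_rng(0)
    # synthetic instances: 500 papers whose positive rate drifts upward
    X = rng.integers(1990, 2020, size=500).reshape(-1, 1)
    p = (X[:, 0] - 1990) / 60 + 0.2
    y = (rng.random(500) < p).astype(int)

    gam = train_logistic_gam(X, y)
    XX, pdep, confi, intercept = get_plot_gam(gam, gam_type='logistic')

    # fitted trend with its confidence band
    plt.plot(XX[:, 0], pdep)
    plt.fill_between(XX[:, 0], confi[:, 0], confi[:, 1], alpha=0.3)
    plt.xlabel('year')
    plt.ylabel('fraction')
    plt.show()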