import pandas as pd import numpy as np import scipy as sp from scipy import sparse from scipy.spatial.distance import cdist from sklearn.metrics import pairwise_distances def print_tfidf_top_words(tfidf_features, vocabulary_swap, mask, n=5): """Prints top TF-IDF words. Prints the `n` words with the highest mean TF-IDF score in the region defined by the `mask`. Parameters ---------- tfidf_features : sparse matrix TF-IDF features. vocabulary_swap : dict Reverse dictionary of the TF-IDF vocabulary, {index : word}. mask : array of bool Mask of the selected region. n : int, default=5 Number of words to be printed. See Also -------- print_tfidf_threshold_words : Prints the most frequent TF-IDF words above a certain threshold. """ tfidf_features_reduced=tfidf_features[mask,:] print('There are {} papers.'.format(tfidf_features_reduced.shape[0])) mean_tfidf_values = np.array(np.mean(tfidf_features_reduced, axis=0)).flatten() sorted_tfidf_indeces = np.flip(np.argsort(mean_tfidf_values)) sorted_tfidf_values = mean_tfidf_values[sorted_tfidf_indeces] sorted_tfidf_columns = np.arange(tfidf_features_reduced.shape[1])[sorted_tfidf_indeces] words=np.vectorize(vocabulary_swap.get)(sorted_tfidf_columns[:n]) print('TF-IDF: ', sorted_tfidf_values[:n]) print('Words: ',words) def print_tfidf_threshold_words(tfidf_features, vocabulary_swap, mask, n, several_th = False): """Prints the most relevant TF-IDF words. Prints the most frequent `n` words with TF-IDF values above a certain threshold (th=0.10), in the region defined by the `mask`. Parameters ---------- tfidf_features : sparse matrix TF-IDF features. vocabulary_swap : dict Reverse dictionary of the TF-IDF vocabulary, {index : word}. mask : array of bool Mask of the selected region. n : int, default=5 Number of words to be printed. several_th : bool, default = False If True, the words are printed for several values of threshold (th=[0.05, 0.10, 0.15]). See Also -------- print_tfidf_top_words : Prints the TF-IDF words with the highest mean TF-IDF score. Notes ----- All words/tfidf-elements above a certain threshold are selected. The goal of this threshold is to eliminate frequent low-content words (like stopwords) which tent to have a relatively low tf-idf score (below 0.05). Then, from all the words with tf-idf scores above a certain threshold, the most frequent words are selected, to ensure that the words printed appear in a majority of the papers of the chosen island, and thus provide a good overview of the common topic of the island. If we were just to select the words with the highest tf-idf score, we would get mostly very rare words, sometimes even numeric terms (a specific cell culture, a specifig drug name/code...). These words don't provide a good overview of the topic of the abstracs. Therefore, we choose a threshold with a lower value of the tf-idf score. The most informative thresholds are around 0.05-0.15. After the threshold, by choosing the most frequent words, we ensure that the words printed occur in many of the papers of the area that we have selected. It is also important selecting a tf-idf threshold instead of a fixed value, since a word may have different tf-idf scores in different papers. """ tfidf_features_reduced=tfidf_features[mask,:] print('There are {} papers.'.format(tfidf_features_reduced.shape[0])) tfidf_nonzero=sp.sparse.find(tfidf_features_reduced) nonzero_elem=np.array(list(zip(tfidf_nonzero[0], tfidf_nonzero[1], tfidf_nonzero[2]))) if several_th == False: threshold = [0.1] if several_th == True: threshold=np.arange(0.05,0.20,0.05) for i, th in enumerate(threshold): mask=nonzero_elem[:,2]>=th greater_elem=nonzero_elem[mask] rows=greater_elem[:,0] unique_rows=np.unique(rows) print('In this threshold there are {} papers.'.format(len(unique_rows))) columns=greater_elem[:,1] unique_columns,count_columns=np.unique(columns,return_counts=True) sorted_counts_indeces=np.flip(np.argsort(count_columns)) sorted_columns=np.take_along_axis(unique_columns,sorted_counts_indeces,0) sorted_counts=np.flip(np.sort(count_columns)) print('For threshold = {:.2f}'.format(th),sorted_counts[:n]) if sorted_columns.size > 0: words=np.vectorize(vocabulary_swap.get)(sorted_columns[:n]) print('For threshold = {:.2f}'.format(th),words) def find_and_print_NN(serie_titles, dataset, paper_index, k=10, print_abstracts=False, serie_abstracts=None): """Prints the titles of the nearest neighbors. Finds and prints the titles of the `k`-NN of one particular paper. If `print_abstracts` = True and `serie_abstracts` is given, it also prints the abstracts. It doesn't support sparse datasets Parameters ---------- serie_titles : pandas series Titles. dataset : array-like Data in which the neighbors are searched. paper_index : int Index of the paper to query. k : int, default=10. Number of nearest neighbors to search. print_abstracts : bool, optional If True along with a given `serie_abstracts`, it will also print the abstracts. - serie_abstracts : pandas series, default=None Abstracts. If given along with `print_abstracts`=True, it will also print the abstracts. Returns ------- indeces_k1nn : array-like of int Indeces of the the `k`+1 nearest neighbors (the first index corresponds to the point queried). See Also -------- find_and_print_NN_sparse : analogous function for sparse datasets. """ dataset_paper=dataset[paper_index,:] print(dataset.shape) print(dataset_paper.shape) d = cdist(dataset,dataset_paper.reshape(1,-1)) indeces_k1nn=np.argsort(d.flatten())[:k+1] #print for i in range(0,len(indeces_k1nn)): print('Neighbor {:}:'.format(i),serie_titles.iloc[indeces_k1nn].tolist()[i]) #print(serie_titles.iloc[indeces_k1nn].tolist()[i]) if print_abstracts == True: print(serie_abstracts.iloc[indeces_k1nn].tolist()[i]) print('----------------------------------------------------------') return indeces_k1nn def find_and_print_NN_sparse(serie_titles, dataset, paper_index, k=10, print_abstracts=False,serie_abstracts=None): """Prints the titles of the nearest neighbors. Finds and prints the titles of the `k`-NN of one particular paper. If `print_abstracts` = True and `serie_abstracts` is given, it also prints the abstracts. It supports sparse datasets Parameters ---------- serie_titles : pandas series Titles. dataset : {array-like, sparse matrix} Data in which the neighbors are searched. paper_index : int Index of the paper to query. k : int, default=10. Number of nearest neighbors to search. print_abstracts : bool, optional If True along with a given `serie_abstracts`, it will also print the abstracts. - serie_abstracts : pandas series, default=None Abstracts. If given along with `print_abstracts`=True, it will also print the abstracts. Returns ------- indeces_k1nn : array-like of int Indeces of the the `k`+1 nearest neighbors (the first index corresponds to the point queried). """ dataset_paper=dataset[paper_index,:] print(dataset.shape) print(dataset_paper.shape) d = pairwise_distances(dataset,dataset_paper.reshape(1,-1)) indeces_k1nn=np.argsort(d.flatten())[:k+1] #print for i in range(0,len(indeces_k1nn)): print('Neighbor {:}:'.format(i),serie_titles.iloc[indeces_k1nn].tolist()[i]) if print_abstracts==True: print(serie_abstracts.iloc[indeces_k1nn].tolist()[i]) print('----------------------------------------------------------') return indeces_k1nn def find_mask_words(abstracts, word, verbose=True): """ Creates a mask for abstracts containing a certain word. Creates several masks of the size of `abstracts` for instances containing the words in `words`. Also it prints how many instances contain each word, in its capitalized, uncapitalized versions, and total. Parameters ---------- abstracts : pandas dataframe of str All texts (in this case abstracts). words : str str of the word/phrase to be queried. verbose : bool, optional If True, prints the number of times the word appears in its different forms in the abstracts collection. Returns ------- mask : array-like of bool Mask. """ sub1=' '+word sub2=word.capitalize() indexes1= abstracts.str.find(sub1) indexes2= abstracts.str.find(sub2) mask = (indexes1!=-1) | (indexes2!=-1) if verbose == True: print(f"Number of papers with uncapitalized word '{word}': ", len(np.where(indexes1!=-1)[0])) print(f"Number of papers with capitalized word '{word}': ", len(np.where(indexes2!=-1)[0])) print(f"Number of total papers with word '{word}': ", len(np.where(mask)[0])) return mask def print_numbers_names(names_first_author, gender_first_author, names_last_author, gender_last_author): """Prints some statistics on available, predicted and female (first/last) author names. Returns the absolute numbers and percentages of available and predicted names, and female authors for both first and last authors. Parameters ---------- names_first_author : ndarray Names of first authors. gender_first_author : ndarray Genders of first authors. names_last_author : ndarray Names of last authors. gender_last_author : ndarray Genders of last authors. """ assert names_first_author.shape[0] == names_last_author.shape[0] total = names_first_author.shape[0] print('Number of available first author names: ', len(np.where(names_first_author != '')[0])) print('Number of predicted first author names: ', len(np.where(gender_first_author != 'unknown')[0])) print('Number of female first author names: ', len(np.where(gender_first_author == 'female')[0])) print('% of available first author names: ', len(np.where(names_first_author != '')[0])/total*100) #print('% of predicted first author names out of total: ', len(np.where(gender_first_author != 'unknown')[0])/total*100) print('% of predicted first author names: ', len(np.where(gender_first_author != 'unknown')[0])/len(np.where(names_first_author != '')[0])*100) print('% of female first author names: ', len(np.where(gender_first_author == 'female')[0])/len(np.where(gender_first_author != 'unknown')[0])*100) print('Number of available last author names: ', len(np.where(names_last_author != '')[0])) print('Number of predicted last author names: ', len(np.where(gender_last_author != 'unknown')[0])) print('Number of female last author names: ', len(np.where(gender_last_author == 'female')[0])) print('% of available last author names: ', len(np.where(names_last_author != '')[0])/total*100) print('% of predicted last author names: ', len(np.where(gender_last_author != 'unknown')[0])/len(np.where(names_first_author != '')[0])*100) print('% of female last author names: ', len(np.where(gender_last_author == 'female')[0])/len(np.where(gender_last_author != 'unknown')[0])*100) def print_numbers_names_label(label, names_first_author, gender_first_author, names_last_author, gender_last_author, colors_new, colors_new_legend): """Prints some statistics on available, predicted and female (first/last) author names for a given label Returns the absolute numbers and percentages of available and predicted names, and female authors for both first and last authors, for a given label. Parameters ---------- label : str Chosen label. names_first_author : ndarray Names of first authors. gender_first_author : ndarray Genders of first authors. names_last_author : ndarray Names of last authors. gender_last_author : ndarray Genders of last authors. colors_new : array Colors/labels. colors_new_legend : dict Legend mapping the colors to the labels. See Also -------- print_numbers_names """ names_first_author = names_first_author[colors_new == colors_new_legend[label]] gender_first_author = gender_first_author[colors_new == colors_new_legend[label]] names_last_author = names_last_author[colors_new == colors_new_legend[label]] gender_last_author = gender_last_author[colors_new == colors_new_legend[label]] print_numbers_names(names_first_author, gender_first_author, names_last_author, gender_last_author)