In [12]:
%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

import scipy as sp
from scipy import sparse
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from openTSNE import TSNE
from openTSNE import affinity, initialization, TSNEEmbedding
from openTSNE.affinity import Affinities

import time
import pickle

import memory_profiler

%load_ext memory_profiler

from pathlib import Path

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [13]:
import black
import jupyter_black

jupyter_black.load(line_length=79)

In [26]:
%load_ext autoreload
%autoreload 2

from pubmed_landscape_src.metrics import knn_accuracy_ls

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Output directories for saved variables and figures (relative paths).
variables_path = Path("../../results/variables/bert-models")
figures_path = Path("../../results/figures/bert-models")
# NOTE(review): absolute, machine-specific input path — the notebook only runs
# where this location exists.
berenslab_data_path = Path("/gpfs01/berens/data/data/pubmed_processed")

# Import

In [5]:
# Load the pickled DataFrame of labeled papers, renumber its rows from 0, and
# extract the abstract texts as a Python list.
# NOTE: unpickling can execute arbitrary code — only read trusted files.
df = pd.read_pickle(
    berenslab_data_path / "df_labeled_papers_subset"
).reset_index(drop=True)
abstracts = df["AbstractText"].tolist()

# TF-IDF Vectorizer

In [6]:
%%time
%%memit

# TfidfVectorizer: fit on all abstracts and build the TF-IDF feature matrix.
# sublinear_tf=True replaces the raw term frequency tf with 1 + log(tf).
corpus = abstracts
vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_features_1M = vectorizer.fit_transform(corpus)

peak memory: 5592.64 MiB, increment: 2881.86 MiB
CPU times: user 1min 53s, sys: 3.82 s, total: 1min 57s
Wall time: 1min 57s


In [7]:
# Sanity check: the shape is (number of abstracts, vocabulary size).
print(tfidf_features_1M.shape)

(1000000, 758111)


In [8]:
# save results (the ".npz" suffix is appended, giving tfidf_features_1M.npz)
sp.sparse.save_npz(variables_path / "tfidf_features_1M", tfidf_features_1M)

# Truncated SVD

In [4]:
# load results (disabled) — uncomment to reload the saved TF-IDF matrix from
# disk instead of recomputing it
#tfidf_features_1M = sp.sparse.load_npz(variables_path / "tfidf_features_1M.npz")

tcmalloc: large alloc 18038185984 bytes == 0x5af2000 @ 
tcmalloc: large alloc 18038185984 bytes == 0x43a030000 @ 


In [9]:
%%time
%%memit

# TruncatedSVD: project the TF-IDF matrix onto 300 components using the
# ARPACK solver, with a fixed seed.
svd = TruncatedSVD(n_components=300, random_state=42, algorithm="arpack")
svd_data_1M = svd.fit_transform(tfidf_features_1M)

# save results (np.save appends ".npy", giving svd_data_1M.npy)
np.save(variables_path / "svd_data_1M", svd_data_1M)

tcmalloc: large alloc 3645005824 bytes == 0x14d300000 @ 
tcmalloc: large alloc 3645005824 bytes == 0x231a66000 @ 
tcmalloc: large alloc 1819467776 bytes == 0x14d300000 @ 
tcmalloc: large alloc 1819467776 bytes == 0x1b9a2e000 @ 
tcmalloc: large alloc 2400002048 bytes == 0x231a66000 @ 
tcmalloc: large alloc 2400002048 bytes == 0x2c0b38000 @ 
tcmalloc: large alloc 2400002048 bytes == 0x231a66000 @ 


peak memory: 17312.98 MiB, increment: 11717.75 MiB
CPU times: user 3h 29min 38s, sys: 1h 36min 51s, total: 5h 6min 29s
Wall time: 23min 53s


# t-SNE

In [6]:
# Reload the saved SVD features so this section can start from disk.
svd_data_1M = np.load(variables_path / "svd_data_1M.npy")

tcmalloc: large alloc 2400002048 bytes == 0x74930000 @ 


In [7]:
def run_tsne(
    embeddings, model_name, variables_path, rs=42, save_intermediates=True
):
    """Compute a t-SNE embedding with openTSNE and save it to disk.

    The pipeline is: uniform affinities over the 10 nearest neighbors, PCA
    initialization, then three optimization phases (early exaggeration,
    exaggeration annealing, final optimization without exaggeration).

    Parameters
    ----------
    embeddings : array-like
        Input features, passed unchanged to ``affinity.Uniform`` and
        ``initialization.pca``.
    model_name : str
        Suffix used in the names of all files written by this function.
    variables_path : pathlib.Path
        Directory the output files are written to (joined with ``/``).
    rs : int, default=42
        Random seed forwarded to the affinity computation, the PCA
        initialization and the ``TSNEEmbedding``. Previously this argument
        was ignored and 42 was always used; the default reproduces that.
    save_intermediates : bool, default=True
        If truthy, also save the affinity matrix as
        ``affinities_P_<model_name>.npz`` and the initialization as
        ``initialization_<model_name>.npy``.

    Returns
    -------
    numpy.ndarray
        The final embedding, converted with ``np.array``. It is also saved
        as ``tsne_<model_name>.npy`` in ``variables_path``.
    """
    # affinities
    affinities = affinity.Uniform(
        embeddings,
        k_neighbors=10,
        n_jobs=-1,
        verbose=1,
        random_state=rs,
    )

    # initialization
    init = initialization.pca(embeddings, random_state=rs)

    if save_intermediates:
        affinities_name = "affinities_P_" + model_name
        sp.sparse.save_npz(variables_path / affinities_name, affinities.P)

        initialization_name = "initialization_" + model_name
        np.save(variables_path / initialization_name, init)

    # t-SNE optimization
    embedding = TSNEEmbedding(
        init, affinities, n_jobs=-1, random_state=rs, verbose=True
    )

    ## early exaggeration
    embedding = embedding.optimize(
        n_iter=125, exaggeration=12, momentum=0.5, n_jobs=-1, verbose=True
    )

    ## exaggeration annealing: one iteration per value, decreasing linearly
    ## from 12 down to 1 over 125 steps
    for exaggeration in np.linspace(12, 1, 125):
        embedding = embedding.optimize(
            n_iter=1,
            exaggeration=exaggeration,
            momentum=0.8,
            n_jobs=-1,
            verbose=True,
        )

    ## final optimization without exaggeration
    embedding = embedding.optimize(
        n_iter=500, exaggeration=1, momentum=0.8, n_jobs=-1, verbose=True
    )

    tsne = np.array(embedding)

    # save
    tsne_name = "tsne_" + model_name
    np.save(variables_path / tsne_name, tsne)

    return tsne

In [8]:
%%time
# Embed the SVD features; run_tsne also writes the result and, by default,
# the affinities and initialization into variables_path.
tsne_tfidf = run_tsne(svd_data_1M, "tfidf", variables_path=variables_path)

===> Finding 10 nearest neighbors using Annoy approximate search using euclidean distance...


tcmalloc: large alloc 1275691008 bytes == 0x13e1da000 @ 
tcmalloc: large alloc 1658404864 bytes == 0x1d0812000 @ 
tcmalloc: large alloc 2155921408 bytes == 0x233da6000 @ 


   --> Time elapsed: 228.07 seconds


tcmalloc: large alloc 2400002048 bytes == 0x2b45b2000 @ 
tcmalloc: large alloc 2400002048 bytes == 0x343684000 @ 


===> Running optimization with exaggeration=12.00, lr=83333.33 for 125 iterations...
Iteration   50, KL divergence 10.8997, 50 iterations in 22.4524 sec
Iteration  100, KL divergence 10.1305, 50 iterations in 21.9770 sec
   --> Time elapsed: 55.65 seconds
===> Running optimization with exaggeration=12.00, lr=83333.33 for 1 iterations...
   --> Time elapsed: 0.44 seconds
===> Running optimization with exaggeration=11.91, lr=83333.33 for 1 iterations...
   --> Time elapsed: 0.33 seconds
===> Running optimization with exaggeration=11.82, lr=83333.33 for 1 iterations...
   --> Time elapsed: 0.33 seconds
===> Running optimization with exaggeration=11.73, lr=83333.33 for 1 iterations...
   --> Time elapsed: 0.33 seconds
===> Running optimization with exaggeration=11.65, lr=83333.33 for 1 iterations...
   --> Time elapsed: 0.44 seconds
===> Running optimization with exaggeration=11.56, lr=83333.33 for 1 iterations...
   --> Time elapsed: 0.44 seconds
===> Running optimization with exaggeratio

# kNN accuracies: RERUN

## Import

In [16]:
# Reload the pickled DataFrame of labeled papers with a fresh 0-based index
# and take the "Colors" column as a NumPy array.
# NOTE: unpickling can execute arbitrary code — only read trusted files.
df = pd.read_pickle(
    berenslab_data_path / "df_labeled_papers_subset"
).reset_index(drop=True)
colors = df["Colors"].to_numpy()

In [17]:
# Reload the sparse TF-IDF matrix saved earlier in this notebook.
tfidf_features_1M = sp.sparse.load_npz(
    variables_path / "tfidf_features_1M.npz"
)

In [18]:
# Reload the 300-component SVD features saved earlier in this notebook.
svd_data_1M = np.load(variables_path / "svd_data_1M.npy")

tcmalloc: large alloc 2400002048 bytes == 0x233da6000 @ 


In [20]:
# Reload the t-SNE embedding written by run_tsne(..., "tfidf", ...).
tsne_tfidf = np.load(variables_path / "tsne_tfidf.npy")

## Run

In [27]:
%%time
# Score the TF-IDF features against the "Colors" labels.
# NOTE(review): knn_accuracy_ls comes from pubmed_landscape_src.metrics; its
# evaluation protocol (k, train/test split) is not visible here — confirm there.
knn_accuracy_tfidf_features_1M = knn_accuracy_ls(tfidf_features_1M, colors)

CPU times: user 49min 33s, sys: 3.82 s, total: 49min 37s
Wall time: 6min 17s


In [28]:
# Value returned by knn_accuracy_ls for the TF-IDF features.
print(knn_accuracy_tfidf_features_1M)

0.61


In [29]:
%%time
# Score the SVD features against the "Colors" labels.
# NOTE(review): knn_accuracy_ls is an imported project helper; its protocol is
# not visible in this notebook.
knn_accuracy_svd_data_1M = knn_accuracy_ls(svd_data_1M, colors)

CPU times: user 35min 54s, sys: 40min 16s, total: 1h 16min 10s
Wall time: 5min 41s


In [30]:
# Value returned by knn_accuracy_ls for the SVD features.
print(knn_accuracy_svd_data_1M)

0.5475


In [31]:
%%time
# Score the t-SNE embedding against the "Colors" labels.
# NOTE(review): knn_accuracy_ls is an imported project helper; its protocol is
# not visible in this notebook.
knn_accuracy_tsne_tfidf = knn_accuracy_ls(tsne_tfidf, colors)

CPU times: user 11min 13s, sys: 19min 48s, total: 31min 2s
Wall time: 4min


In [32]:
# Value returned by knn_accuracy_ls for the t-SNE embedding.
print(knn_accuracy_tsne_tfidf)

0.499
