In [1]:
import pandas as pd
import random
import numpy as np
from random import randint

import torch
from transformers import AutoTokenizer, AutoModel

import time

import memory_profiler

%load_ext memory_profiler

from pathlib import Path

In [2]:
import black
import jupyter_black

# Activate jupyter_black for this notebook with a line length of 79.
jupyter_black.load(line_length=79)

In [3]:
%load_ext autoreload
%autoreload 2

from pubmed_landscape_src.metrics import knn_accuracy_ls
from pubmed_landscape_src.data import generate_embeddings

In [4]:
# Locations used throughout the notebook: the two result folders are
# relative to the notebook, the processed PubMed data is an absolute path.
variables_path, figures_path = (
    Path("../../results/variables"),
    Path("../../results/figures"),
)
berenslab_data_path = Path("/gpfs01/berens/data/data/pubmed_processed")

# Import

In [5]:
# Load the labelled-papers subset, renumber its rows from 0 and collect
# the abstract texts as a plain Python list.
# NOTE: `read_pickle` can execute arbitrary code; only use trusted files.
papers_file = berenslab_data_path / "df_labeled_papers_subset"
df = pd.read_pickle(papers_file).reset_index(drop=True)
abstracts = df["AbstractText"].to_list()

# Obtain embeddings

In [6]:
# random seed
# `random.seed()` returns None, so store the seed value itself in
# `random_state` and use it to seed every RNG this notebook imports
# (Python's `random`, PyTorch and NumPy).
random_state = 42
random.seed(random_state)
torch.manual_seed(random_state)
np.random.seed(random_state)

In [7]:
# Choose the compute device: the GPU when CUDA is available, otherwise
# the CPU. (Use "cuda:0" instead of "cuda" if the plain name fails.)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("running on device: {}".format(device))

running on device: cuda


In [8]:
# Fetch the pretrained tokenizer and encoder from the same checkpoint.
checkpoint_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModel.from_pretrained(checkpoint_name)

print("model: BERT")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model: BERT


In [9]:
# Move the model to the device selected above ("cuda" or "cpu").
model = model.to(device)

In [None]:
%%capture cap
%%time
%%memit

# Embed every abstract and keep the three vectors returned by
# `generate_embeddings` (named cls / sep / av by the caller below) in
# three (n_abstracts, EMBEDDING_DIM) matrices, with periodic checkpoints.
EMBEDDING_DIM = 768  # width of each embedding row
CHECKPOINT_EVERY = 50000  # number of abstracts between intermediate saves

# Create the output folder up front so a missing directory is handled
# immediately rather than surfacing at the first checkpoint.
embeddings_dir = berenslab_data_path / "embeddings/embeddings_BERT"
embeddings_dir.mkdir(parents=True, exist_ok=True)

n_abstracts = len(abstracts)
# `np.empty` leaves memory uninitialised: in an intermediate checkpoint
# only the rows processed so far hold real embeddings.
embeddings_av = np.empty([n_abstracts, EMBEDDING_DIM])
embeddings_sep = np.empty([n_abstracts, EMBEDDING_DIM])
embeddings_cls = np.empty([n_abstracts, EMBEDDING_DIM])

# Save order (cls, sep, av) matches the file names written below.
embedding_matrices = {
    "cls": embeddings_cls,
    "sep": embeddings_sep,
    "av": embeddings_av,
}

for i, abstr in enumerate(abstracts):
    # Record the index currently being processed so an interrupted run
    # can be located afterwards.
    np.save(variables_path / "experiment_iter", i)

    embd_cls, embd_sep, embd_av = generate_embeddings(
        abstr, tokenizer, model, device
    )

    embeddings_cls[i] = embd_cls
    embeddings_sep[i] = embd_sep
    embeddings_av[i] = embd_av

    # Periodic checkpoint. `i > 0` skips the very first iteration, where
    # the arrays contain a single valid row and writing them is wasted I/O.
    if i > 0 and i % CHECKPOINT_EVERY == 0:
        for pooling, matrix in embedding_matrices.items():
            np.save(
                embeddings_dir / f"embeddings_{pooling}_interm",
                matrix,
            )

# Final, complete embedding matrices.
for pooling, matrix in embedding_matrices.items():
    np.save(embeddings_dir / f"embeddings_{pooling}", matrix)

tcmalloc: large alloc 6144000000 bytes == 0x7f8f2fca0000 @ 
tcmalloc: large alloc 6144000000 bytes == 0x7f8dc1940000 @ 
tcmalloc: large alloc 6144000000 bytes == 0x7f8c535e0000 @ 


In [None]:
# Store the stdout captured in `cap` as a text log.
batches_log = variables_path / "verbose_batches_BERT.txt"
with batches_log.open("w") as log_file:
    log_file.write(cap.stdout)

# kNN accuracies 

In [None]:
# Labels for the kNN evaluation, read from the `df` loaded earlier.
# On a fresh kernel, uncomment the next two lines to rebuild `df` first:
# df = pd.read_pickle(berenslab_data_path / "df_labeled_papers_subset")
# df = df.reset_index(drop=True)
labels = df["Colors"].to_list()

## CLS

In [None]:
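# Optional: uncomment to reload the saved CLS embeddings from disk when the kernel no longer holds them.
#embeddings_cls = np.load(berenslab_data_path / 'embeddings/embeddings_BERT/embeddings_cls.npy', allow_pickle=True, fix_imports=True)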

In [15]:
# Sanity check: expect one 768-dimensional row per abstract.
embeddings_cls.shape

(1000000, 768)

In [None]:
%%capture cap
%%time
# Score the CLS embeddings against the labels with the project's
# `knn_accuracy_ls` helper; anything it prints is captured in `cap`.
knn_accuracy_BERT_cls = knn_accuracy_ls(embeddings_cls, labels)

In [None]:
# Store the stdout captured during the CLS kNN run as a text log.
cls_knn_log = variables_path / "verbose_knn_accuracy_BERT_cls.txt"
with cls_knn_log.open("w") as log_file:
    log_file.write(cap.stdout)

In [27]:
# Show the kNN accuracy obtained with the CLS embeddings.
print(knn_accuracy_BERT_cls)

0.5038


In [None]:
# Persist the CLS kNN accuracy next to the other result variables.
cls_accuracy_file = variables_path / "knn_accuracy_BERT_cls"
np.save(cls_accuracy_file, knn_accuracy_BERT_cls)

## SEP

In [None]:
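# Optional: uncomment to reload the saved SEP embeddings from disk when the kernel no longer holds them.
#embeddings_sep = np.load(berenslab_data_path / 'embeddings/embeddings_BERT/embeddings_sep.npy', allow_pickle=True, fix_imports=True)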

In [35]:
# Sanity check: expect one 768-dimensional row per abstract.
embeddings_sep.shape

(1000000, 768)

In [None]:
%%capture cap
%%time
# Score the SEP embeddings against the labels with the project's
# `knn_accuracy_ls` helper; anything it prints is captured in `cap`.
knn_accuracy_BERT_sep = knn_accuracy_ls(embeddings_sep, labels)

In [None]:
# Store the stdout captured during the SEP kNN run as a text log.
sep_knn_log = variables_path / "verbose_knn_accuracy_BERT_sep.txt"
with sep_knn_log.open("w") as log_file:
    log_file.write(cap.stdout)

In [26]:
# Show the kNN accuracy obtained with the SEP embeddings.
print(knn_accuracy_BERT_sep)

0.5338


In [None]:
# Persist the SEP kNN accuracy next to the other result variables.
sep_accuracy_file = variables_path / "knn_accuracy_BERT_sep"
np.save(sep_accuracy_file, knn_accuracy_BERT_sep)

## Average

In [None]:
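# Optional: uncomment to reload the saved average embeddings from disk when the kernel no longer holds them.
#embeddings_av = np.load(berenslab_data_path / 'embeddings/embeddings_BERT/embeddings_av.npy', allow_pickle=True, fix_imports=True)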

In [39]:
# Sanity check: expect one 768-dimensional row per abstract.
embeddings_av.shape

(1000000, 768)

In [None]:
%%capture cap
%%time
# Score the averaged embeddings against the labels with the project's
# `knn_accuracy_ls` helper; anything it prints is captured in `cap`.
knn_accuracy_BERT_av = knn_accuracy_ls(embeddings_av, labels)

In [None]:
# Store the stdout captured during the average-embedding kNN run as a text log.
av_knn_log = variables_path / "verbose_knn_accuracy_BERT_av.txt"
with av_knn_log.open("w") as log_file:
    log_file.write(cap.stdout)

In [25]:
# Show the kNN accuracy obtained with the averaged embeddings.
print(knn_accuracy_BERT_av)

0.5709


In [None]:
# Persist the average-embedding kNN accuracy next to the other result variables.
av_accuracy_file = variables_path / "knn_accuracy_BERT_av"
np.save(av_accuracy_file, knn_accuracy_BERT_av)