In [2]:
import pandas as pd
import random
import numpy as np
from random import randint

import torch
from transformers import AutoTokenizer, AutoModel

import time

import memory_profiler

%load_ext memory_profiler

from pathlib import Path

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [3]:
import black
import jupyter_black

jupyter_black.load(line_length=79)

In [4]:
%load_ext autoreload
%autoreload 2

from pubmed_landscape_src.metrics import knn_accuracy_ls
from pubmed_landscape_src.data import generate_embeddings

In [5]:
# Relative locations where this notebook stores variables and figures.
results_root = Path("../../results")
variables_path = results_root / "variables"
figures_path = results_root / "figures"

# Absolute path to the processed PubMed data directory.
berenslab_data_path = Path("/gpfs01/berens/data/data/pubmed_processed")

# Import

In [6]:
# Read the pickled frame of labeled papers and renumber its rows from 0,
# discarding the stored index.
df = pd.read_pickle(
    berenslab_data_path / "df_labeled_papers_subset"
).reset_index(drop=True)

# Abstract texts as a plain Python list, in row order.
abstracts = df.loc[:, "AbstractText"].tolist()

# Obtain embeddings

In [7]:
# random seed
# random.seed() returns None, so the seed value itself is kept in
# `random_state` and each library's global RNG is seeded from it.
random_state = 42
random.seed(random_state)
np.random.seed(random_state)
# Trailing semicolon suppresses the Generator repr as cell output.
torch.manual_seed(random_state);

In [8]:
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
# (Try "cuda:0" instead of "cuda" if the generic name does not work.)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"running on device: {device}")

running on device: cuda


In [9]:
# Tokenizer and model are both loaded from the same pretrained checkpoint.
checkpoint_name = "malteos/scincl"
model = AutoModel.from_pretrained(checkpoint_name)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

print("model: SciNCL")

model: SciNCL


In [10]:
# set device
# Move the model to `device` ("cuda" or "cpu", chosen in an earlier cell)
# and rebind `model` to the returned object.
model = model.to(device)

In [None]:
%%capture cap
%%time
%%memit

embeddings_av = np.empty([len(abstracts), 768])
embeddings_sep = np.empty([len(abstracts), 768])
embeddings_cls = np.empty([len(abstracts), 768])

for i, abstr in enumerate(abstracts):
    np.save(variables_path / "experiment_iter", i)

    embd_cls, embd_sep, embd_av = generate_embeddings(
        abstr, tokenizer, model, device
    )

    embeddings_cls[i] = embd_cls
    embeddings_sep[i] = embd_sep
    embeddings_av[i] = embd_av

    if (i % 50000) == 0:
        np.save(
            berenslab_data_path
            / "embeddings/embeddings_malteos/embeddings_cls_interm",
            embeddings_cls,
        )
        np.save(
            berenslab_data_path
            / "embeddings/embeddings_malteos/embeddings_sep_interm",
            embeddings_sep,
        )
        np.save(
            berenslab_data_path
            / "embeddings/embeddings_malteos/embeddings_av_interm",
            embeddings_av,
        )

np.save(
    berenslab_data_path / "embeddings/embeddings_malteos/embeddings_cls",
    embeddings_cls,
)
np.save(
    berenslab_data_path / "embeddings/embeddings_malteos/embeddings_sep",
    embeddings_sep,
)
np.save(
    berenslab_data_path / "embeddings/embeddings_malteos/embeddings_av",
    embeddings_av,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tcmalloc: large alloc 6144000000 bytes == 0x7f3103280000 @ 
tcmalloc: large alloc 6144000000 bytes == 0x7f2f94f20000 @ 
tcmalloc: large alloc 6144000000 bytes == 0x7f2e26bc0000 @ 


In [None]:
# Persist the stdout held by `cap` (filled by a cell run with
# `%%capture cap`) as a text file next to the other saved variables.
log_path = variables_path / "verbose_batches_malteos.txt"
with log_path.open("w") as log_file:
    log_file.write(cap.stdout)

# kNN accuracies (RERUN)

In [16]:
# Read the pickled frame of labeled papers and renumber its rows from 0,
# discarding the stored index.
df = pd.read_pickle(
    berenslab_data_path / "df_labeled_papers_subset"
).reset_index(drop=True)

# Values of the "Colors" column as a plain Python list, in row order;
# they are used as the labels for the kNN accuracy below.
labels = df.loc[:, "Colors"].tolist()

## CLS

In [None]:
# Load the CLS embeddings from the directory the embedding cell saves them
# to ("embeddings/embeddings_malteos"). The file holds a plain numeric
# array, so np.load's pickle support is left at its default (disabled).
embeddings_cls = np.load(
    berenslab_data_path / "embeddings/embeddings_malteos/embeddings_cls.npy"
)

In [15]:
# Sanity check: display the shape of the loaded CLS embedding array.
embeddings_cls.shape

(1000000, 768)

In [17]:
%%time
knn_accuracy_malteos_cls = knn_accuracy_ls(embeddings_cls, labels)

tcmalloc: large alloc 6082560000 bytes == 0x7f2c5b4f6000 @ 


In [18]:
# Write the stdout currently held by `cap` (filled by the most recent cell
# run with `%%capture cap`) to the CLS log file.
log_path = variables_path / "verbose_knn_accuracy_malteos_cls.txt"
with log_path.open("w") as log_file:
    log_file.write(cap.stdout)

In [33]:
# Show the kNN accuracy computed on the CLS embeddings.
print(knn_accuracy_malteos_cls)

0.6459


In [20]:
# Store the CLS kNN accuracy under the variables directory
# (np.save appends the ".npy" extension).
np.save(
    file=variables_path / "knn_accuracy_malteos_cls",
    arr=knn_accuracy_malteos_cls,
)

## SEP

In [None]:
# Load the SEP embeddings from the directory the embedding cell saves them
# to ("embeddings/embeddings_malteos"). The file holds a plain numeric
# array, so np.load's pickle support is left at its default (disabled).
embeddings_sep = np.load(
    berenslab_data_path / "embeddings/embeddings_malteos/embeddings_sep.npy"
)

In [35]:
# Sanity check: display the shape of the loaded SEP embedding array.
embeddings_sep.shape

(1000000, 768)

In [21]:
%%time
knn_accuracy_malteos_sep = knn_accuracy_ls(embeddings_sep, labels)

tcmalloc: large alloc 6082560000 bytes == 0x7f2c1b94a000 @ 


In [22]:
# Write the stdout currently held by `cap` (filled by the most recent cell
# run with `%%capture cap`) to the SEP log file.
log_path = variables_path / "verbose_knn_accuracy_malteos_sep.txt"
with log_path.open("w") as log_file:
    log_file.write(cap.stdout)

In [32]:
# Show the kNN accuracy computed on the SEP embeddings.
print(knn_accuracy_malteos_sep)

0.6462


In [24]:
# Store the SEP kNN accuracy under the variables directory
# (np.save appends the ".npy" extension).
np.save(
    file=variables_path / "knn_accuracy_malteos_sep",
    arr=knn_accuracy_malteos_sep,
)

## Average

In [None]:
# Load the averaged embeddings from the directory the embedding cell saves
# them to ("embeddings/embeddings_malteos"). The file holds a plain numeric
# array, so np.load's pickle support is left at its default (disabled).
embeddings_av = np.load(
    berenslab_data_path / "embeddings/embeddings_malteos/embeddings_av.npy"
)

In [39]:
# Sanity check: display the shape of the loaded averaged embedding array.
embeddings_av.shape

(1000000, 768)

In [25]:
%%time
knn_accuracy_malteos_av = knn_accuracy_ls(embeddings_av, labels)

tcmalloc: large alloc 6082560000 bytes == 0x7f2c1b94a000 @ 


In [26]:
# Write the stdout currently held by `cap` (filled by the most recent cell
# run with `%%capture cap`) to the averaged-embedding log file.
log_path = variables_path / "verbose_knn_accuracy_malteos_av.txt"
with log_path.open("w") as log_file:
    log_file.write(cap.stdout)

In [31]:
# Show the kNN accuracy computed on the averaged embeddings.
print(knn_accuracy_malteos_av)

0.6588


In [28]:
# Store the averaged-embedding kNN accuracy under the variables directory
# (np.save appends the ".npy" extension).
np.save(
    file=variables_path / "knn_accuracy_malteos_av",
    arr=knn_accuracy_malteos_av,
)