In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import sparse
from scipy.sparse import csr_matrix

import time
import pickle

import memory_profiler
%load_ext memory_profiler

from pathlib import Path

In [2]:
# Notebook code-formatter setup.
# NOTE(review): `black` is imported but not referenced directly in the cells
# visible here — presumably only needed as a dependency of jupyter_black.
import black
import jupyter_black

# Configure the formatter with a 79-column line limit.
jupyter_black.load(line_length=79)

In [8]:
# Enable IPython's autoreload extension in mode 2 so edits to project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# NOTE(review): only `automatic_coloring` is imported here, yet the keyword
# colouring cell further down calls `improved_coloring`, which is not defined
# or imported in the visible cells — confirm which helper is intended.
from pubmed_landscape_src.plotting import automatic_coloring

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Directory the cells below read cached arrays from and write results to
# (relative to the notebook's working directory).
variables_path = Path("../results/variables")
# Figure output directory; not referenced in the cells visible here.
figures_path = Path("../results/figures")
# NOTE(review): hardcoded absolute cluster path — only valid on the machine
# that mounts it, and not referenced in the cells visible here.
berenslab_data_path = Path("/gpfs01/berens/data/data/pubmed_processed")

# Import

In [9]:
# Load the saved t-SNE array from the "bert" subfolder of variables_path.
# Later cells index columns 0 and 1 and use shape[0] as the total paper count;
# presumably this is one 2-D coordinate per paper — TODO confirm.
tsne_reparsed = np.load(variables_path / "bert/tsne_reparsed.npy")

In [10]:
# Horizontally mirrored copy of the embedding: column 0 is negated and
# column 1 is kept unchanged, giving an (n_rows, 2) array.
tsne = np.array([-tsne_reparsed[:, 0], tsne_reparsed[:, 1]]).T

In [11]:
# Load a saved Covid mask array.
# NOTE(review): `mask_covid` is not referenced in the cells visible here; the
# statistics below use `mask_covid_2` instead — confirm this load is needed.
mask_covid = np.load(variables_path / "mask_reparsed_covid.npy")

In [12]:
# Load the Covid mask used by every count and island mask further down.
# It is summed to count Covid-19 papers and AND-ed with per-row t-SNE
# comparisons, so it is presumably a boolean array with one entry per paper
# — TODO confirm.
mask_covid_2 = np.load(variables_path / "mask_covid_2.npy")

In [13]:
# Load a saved mask array named after the TF-IDF Covid island.
# NOTE(review): not referenced in the cells visible here — confirm this load
# is still needed.
mask_covid_tfidf_island = np.load(
    variables_path / "mask_covid_tfidf_island.npy"
)

In [30]:
# Load the saved year array; the "Numbers" section compares it against 2020
# and 2022 to count papers published in that range.
date_year = np.load(variables_path / "date_year_reparsed.npy")

# Colors based on title keywords

## Import

In [1419]:
# Read the pickled metadata table; its "Title" column is what the keyword
# colouring cell below consumes.
# NOTE: unpickling executes arbitrary code — only load files you produced.
clean_df_updated_reparsed_filtered_with_authors_ISSN = pd.read_pickle(
    variables_path / "clean_df_updated_reparsed_filtered_with_authors_ISSN"
)

## Colors

In [15]:
# Title keywords and the hex colour assigned to each, kept side by side so a
# keyword can never drift out of alignment with its colour.
_covid_keyword_palette = [
    ("Cancer", "#ffbe00"),
    ("Children", "#1CE6FF"),
    ("Treatment", "#FF34FF"),
    ("Mental", "#FF4A46"),
    ("Mortality", "#008941"),
    ("Transmission", "#006FA6"),
    ("Psychological", "#A30059"),
    ("Workers", "#7A4900"),
    ("Pneumonia", "#0000A6"),
    ("Pediatric", "#63FFAC"),
    ("Healthcare", "#B79762"),
    ("Vaccine", "#004D43"),
    ("Outbreak", "#5A0007"),
    ("Implications", "#1B4400"),
    ("Strategies", "#4FC601"),
    ("Clinical", "#3B5DFF"),
    ("Epidemic", "#4A3B53"),
    ("Symptoms", "#FF2F80"),
    ("Respiratory", "#6B7900"),
    ("Therapy", "#00C2A0"),
    ("Surgery", "#FFAA92"),
    ("Population", "#FF90C9"),
    ("Social", "#B903AA"),
    ("Anxiety", "#D16100"),
    ("Students", "#7B4F4B"),
    ("Antibody", "#0AA6D8"),
    ("Immune", "#00846F"),
]

# Parallel lists in the same order as the palette above.
covid_words = [keyword for keyword, _hex in _covid_keyword_palette]
covid_colors = [_hex for _keyword, _hex in _covid_keyword_palette]

In [21]:
# Keyword -> hex colour lookup built from the two parallel lists.
# Must run before the colouring cell, which rebinds `covid_colors` to its
# own result.
covid_legend = {
    keyword: hex_color
    for keyword, hex_color in zip(covid_words, covid_colors)
}

In [66]:
%%time
# Colour the titles of the papers selected by `mask_covid_island` using the
# keyword legend (recorded run: roughly 13 minutes wall time).
# NOTE(review): `mask_covid_island` is only defined in a later cell
# ("mask island 1"), so this cell fails under Restart & Run All.
# NOTE(review): `improved_coloring` is not defined or imported in the visible
# cells (only `automatic_coloring` is) — confirm where it comes from.
# NOTE: this rebinds `covid_colors` from the list of legend hex codes to the
# colouring result, so the legend cell must not be re-run afterwards.
covid_titles = clean_df_updated_reparsed_filtered_with_authors_ISSN['Title'][mask_covid_island]
_, covid_colors = improved_coloring(covid_titles, covid_legend)

CPU times: user 12min 12s, sys: 41.1 s, total: 12min 53s
Wall time: 12min 53s


In [1424]:
# Save the colouring result as an array; np.save appends the ".npy" suffix
# to the given name.
np.save(variables_path / "covid_colors", covid_colors)

# Save the keyword -> colour legend. The context manager closes the file
# handle even if pickling raises, which the manual open()/close() pair did
# not guarantee.
with open(variables_path / "covid_legend.pkl", "wb") as f:
    pickle.dump(covid_legend, f)

# Numbers

In [33]:
# Headline counts: Covid-19 papers overall, as a share of every row in the
# embedding, and as a share of papers dated 2020-2022.
n_covid_papers = np.sum(mask_covid_2)

print("Total number of Covid-19 papers:", n_covid_papers)
print(
    "Percentage of Covid-19 papers:",
    n_covid_papers / tsne_reparsed.shape[0] * 100,
)

published_2020_2022 = (date_year >= 2020) & (date_year <= 2022)
print(
    "Percentage of Covid-19 papers out of papers published in 2020-2022:",
    n_covid_papers / np.sum(published_2020_2022) * 100,
)

Total number of Covid-19 papers: 132802
Percentage of Covid-19 papers: 0.6419540632711611
Percentage of Covid-19 papers out of papers published in 2020-2022: 5.159451055107391


In [20]:
# Covid papers that fall inside the rectangular "island" region of the
# mirrored embedding.
tsne = np.vstack((-tsne_reparsed[:, 0], tsne_reparsed[:, 1])).T

# Bounding box of the island (exclusive on every side).
left, right = -15, 15
bottom, top = -85, -40

inside_x = (left < tsne[:, 0]) & (tsne[:, 0] < right)
inside_y = (bottom < tsne[:, 1]) & (tsne[:, 1] < top)

mask_covid_island = mask_covid_2 & inside_x & inside_y

In [23]:
# How many Covid-19 papers land in the island, in absolute terms and as a
# share of all Covid-19 papers.
n_covid_in_island = np.sum(mask_covid_island)

print(
    "Number of Covid-19 papers grouped in the Covid island:",
    n_covid_in_island,
)
print(
    "Percentage of Covid-19 papers grouped in the Covid island:",
    n_covid_in_island / np.sum(mask_covid_2) * 100,
)

Number of Covid-19 papers grouped in the Covid island: 78732
Percentage of Covid-19 papers grouped in the Covid island: 59.28525172813662


In [36]:
# Every paper (Covid or not) inside the same rectangular island region of
# the mirrored embedding.
tsne = np.vstack((-tsne_reparsed[:, 0], tsne_reparsed[:, 1])).T

# Bounding box of the island (exclusive on every side).
left, right = -15, 15
bottom, top = -85, -40

x_coord = tsne[:, 0]
y_coord = tsne[:, 1]

mask_island = (
    (left < x_coord) & (x_coord < right) & (bottom < y_coord) & (y_coord < top)
)

In [39]:
# Total number of papers inside the island bounding box.
n_island_papers = np.sum(mask_island)
print("Number of total papers in the island:", n_island_papers)

# The recorded output of this cell contains a second line that no code in
# the cell produced; restore it so a re-run reproduces the output.
# Non-covid papers are island papers not flagged by mask_covid_2.
n_island_covid = np.sum(mask_island & mask_covid_2)
print(
    "Percentage of non-covid papers in the covid island:",
    (n_island_papers - n_island_covid) / n_island_papers * 100,
)

Number of total papers in the island: 249187
Percentage of non-covid papers in the covid island: 68.40445127554808


In [45]:
# Share of entries in the colouring result equal to "lightgrey"
# (presumably the fallback colour for titles matching no keyword — confirm
# against the colouring helper).
is_unlabeled = covid_colors == "lightgrey"
print(
    "Percentage of unlabeled papers in the covid island:",
    np.sum(is_unlabeled) / covid_colors.shape[0] * 100,
)

Percentage of unlabeled papers in the covid island: 54.43529949702789
