In [2]:
%load_ext rpy2.ipython

In [80]:
%R require(gender)
%R require(tibble)

R[write to console]: Loading required package: tibble



0
1


In this notebook we use the `gender` R package. You can install it in R with the command `install.packages("gender")`.

In [81]:
import pickle
import time
from pathlib import Path

import memory_profiler
import numpy as np
import pandas as pd

%load_ext memory_profiler

In [9]:
# Enable automatic Black formatting of notebook cells.
import black
import jupyter_black

# Format cells with a maximum line length of 79 characters.
jupyter_black.load(line_length=79)

In [18]:
# Directory the intermediate arrays / dataframes are read from and written to.
variables_path = Path('../results/variables')
# Directory for figures. NOTE(review): not referenced in the cells visible here.
figures_path = Path('../results/figures')
# NOTE(review): absolute, machine-specific path; not referenced in the cells
# visible here -- confirm whether it is still needed.
berenslab_data_path = Path('/gpfs01/berens/data/data/pubmed_processed')

# Import

In [379]:
# Years used for the 'Years' column below (presumably one entry per paper).
date_year = np.load(variables_path / "date_year_reparsed.npy")

In [380]:
# Sanity check on the number of loaded entries.
date_year.shape

(20687150,)

In [381]:
# Names of the first and last authors, used for the 'Names' column below.
# NOTE(review): the "filtered" preprocessing happens outside this notebook.
filtered_name_first_author = np.load(variables_path / 'filtered_name_first_author.npy')
filtered_name_last_author = np.load(variables_path / 'filtered_name_last_author.npy')

tcmalloc: large alloc 2399715328 bytes == 0x3008aa000 @ 
tcmalloc: large alloc 2316967936 bytes == 0x1535f6000 @ 


In [382]:
# Must match date_year.shape: both arrays become columns of one dataframe.
filtered_name_first_author.shape

(20687150,)

# Gender prediction

## First author

In [385]:
# Initialise the dataframe that will hold year, name and gender for each row.
# Once the 'Gender' column is filled in below, it is saved as
# `year_name_gender_first_author_df`.

df_subset = pd.DataFrame({'Years' : date_year, 'Names' : filtered_name_first_author})
df_subset

Unnamed: 0,Years,Names
0,1975.0,
1,1975.0,
2,1975.0,
3,1975.0,
4,1975.0,
...,...,...
20687145,2021.0,Katharina
20687146,2021.0,Katja
20687147,2021.0,Indira
20687148,2021.0,Giuseppe


In [386]:
%%time

# I predict the gender of those names sorting the df by year.
# The problem is that the returned df does not have the same dimensions as the original, since not all the papers have names.
# Therefore, the next two steps are necessary

gender_prediction_df = pd.DataFrame()
unique_years = np.unique(df_subset.Years)

for year in unique_years:
    print('Year: ', year)
    
    df_grouped_year = df_subset.groupby("Years").get_group(year)
    df_names = df_grouped_year.Names
    
    #cases for years outside interval
    if year <= 1930:
        %R library(gender)
        %R -i df_names -o result result = gender(df_names, years = 1930, method = "ssa")
        
    if year >= 2012:
        %R library(gender)
        %R -i df_names -o result result = gender(df_names, years = 2012, method = "ssa")
        
    if (year < 2012) & (year > 1930):
        %R library(gender)
        %R -i df_names -i year -o result result = gender(df_names, years = year, method = "ssa")


    gender_prediction_df = pd.concat([gender_prediction_df, result], ignore_index=True)
    
gender_prediction_df

Year:  1808.0
Year:  1881.0
Year:  1891.0
Year:  1896.0
Year:  1897.0
Year:  1898.0
Year:  1899.0
Year:  1900.0
Year:  1901.0
Year:  1902.0
Year:  1903.0
Year:  1905.0
Year:  1906.0
Year:  1907.0
Year:  1908.0
Year:  1909.0
Year:  1910.0
Year:  1911.0
Year:  1912.0
Year:  1913.0
Year:  1914.0
Year:  1915.0
Year:  1916.0
Year:  1917.0
Year:  1918.0
Year:  1919.0
Year:  1920.0
Year:  1921.0
Year:  1922.0
Year:  1923.0
Year:  1924.0
Year:  1925.0
Year:  1926.0
Year:  1927.0
Year:  1928.0
Year:  1929.0
Year:  1930.0
Year:  1931.0
Year:  1932.0
Year:  1933.0
Year:  1934.0
Year:  1935.0
Year:  1936.0
Year:  1937.0
Year:  1938.0
Year:  1939.0
Year:  1940.0
Year:  1941.0
Year:  1942.0
Year:  1943.0
Year:  1944.0
Year:  1945.0
Year:  1946.0
Year:  1947.0
Year:  1948.0
Year:  1949.0
Year:  1950.0
Year:  1951.0
Year:  1952.0
Year:  1953.0
Year:  1954.0
Year:  1955.0
Year:  1956.0
Year:  1957.0
Year:  1958.0
Year:  1959.0
Year:  1960.0
Year:  1961.0
Year:  1962.0
Year:  1963.0
Year:  1964.0
Year: 

Unnamed: 0,name,proportion_male,proportion_female,gender,year_min,year_max
0,Chalmers,1.0000,0.0000,male,1930.0,1930.0
1,Edmund,0.9940,0.0060,male,1930.0,1930.0
2,Leigh,0.7342,0.2658,male,1930.0,1930.0
3,Mary,0.0056,0.9944,female,1932.0,1932.0
4,Ben,0.9866,0.0134,male,1933.0,1933.0
...,...,...,...,...,...,...
8363111,Zoe,0.0023,0.9977,female,2012.0,2012.0
8363112,Zoe,0.0023,0.9977,female,2012.0,2012.0
8363113,Zofia,0.0000,1.0000,female,2012.0,2012.0
8363114,Zoie,0.0000,1.0000,female,2012.0,2012.0


In [391]:
# Save the per-name gender predictions for the first authors.
gender_prediction_df.to_pickle(variables_path / "gender_prediction_first_author_df")

In [392]:
%%time

# In here I am creating a dictionary that has a map name-gender for every year, based on the predictions from the dataframe above (because they are year dependent). 
# I do some tricks to create mappings for also years that were not predicted, and years outside the available `ssa' intervals (<1930, >2012)

unique_predicted_years = np.unique(gender_prediction_df["year_min"])
gender_maps_years = {}
for year in unique_years: 
    
    if year >= 2012:
        eff_year = 2012
    if year <= 1930:
        eff_year = 1930
    if (year < 2012) & (year > 1930):
        eff_year = year
        
    if eff_year not in unique_predicted_years:
        closest_year = unique_predicted_years[(unique_predicted_years - eff_year).argmin()]
        
        gender_prediction_grouped_year = gender_prediction_df.groupby("year_min").get_group(closest_year)
        
    else:
        gender_prediction_grouped_year = gender_prediction_df.groupby("year_min").get_group(eff_year)
    
    gender_map = dict(zip(gender_prediction_grouped_year.name, gender_prediction_grouped_year.gender))
    
    gender_maps_years[year]=gender_map
    
gender_maps_years.keys()

CPU times: user 51.7 s, sys: 1.14 s, total: 52.9 s
Wall time: 53 s


dict_keys([1808.0, 1881.0, 1891.0, 1896.0, 1897.0, 1898.0, 1899.0, 1900.0, 1901.0, 1902.0, 1903.0, 1905.0, 1906.0, 1907.0, 1908.0, 1909.0, 1910.0, 1911.0, 1912.0, 1913.0, 1914.0, 1915.0, 1916.0, 1917.0, 1918.0, 1919.0, 1920.0, 1921.0, 1922.0, 1923.0, 1924.0, 1925.0, 1926.0, 1927.0, 1928.0, 1929.0, 1930.0, 1931.0, 1932.0, 1933.0, 1934.0, 1935.0, 1936.0, 1937.0, 1938.0, 1939.0, 1940.0, 1941.0, 1942.0, 1943.0, 1944.0, 1945.0, 1946.0, 1947.0, 1948.0, 1949.0, 1950.0, 1951.0, 1952.0, 1953.0, 1954.0, 1955.0, 1956.0, 1957.0, 1958.0, 1959.0, 1960.0, 1961.0, 1962.0, 1963.0, 1964.0, 1965.0, 1966.0, 1967.0, 1968.0, 1969.0, 1970.0, 1971.0, 1972.0, 1973.0, 1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0, 1982.0, 1983.0, 1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0, 1990.0, 1991.0, 1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0, 2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0, 2006.0, 2007.0, 2008.0, 2009.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.

In [393]:
# Save the per-year name -> gender maps for the first authors.
# The context manager closes the file even if pickling fails.
with open(variables_path / "gender_maps_years_first_author.pkl", "wb") as f:
    pickle.dump(gender_maps_years, f)

In [394]:
%%time
# Now, using the name-gender maps created above, I map the names in the original dataframe (df_subset) to their respective genders, saving them in a new column 'Gender'

# here I add a 'Gender' column to the df_subset
df_subset["Gender"] = ['unknown']*df_subset.shape[0]

for year in unique_years:    
    print(year)

    df_subset_year = df_subset.groupby("Years").get_group(year).Names.apply(lambda x: np.vectorize(gender_maps_years[year].get)(x))
    df_subset_year.rename("Gender", inplace=True)
    df_subset.update(df_subset_year)

df_subset

1808.0
1881.0
1891.0
1896.0
1897.0
1898.0
1899.0
1900.0
1901.0
1902.0
1903.0
1905.0
1906.0
1907.0
1908.0
1909.0
1910.0
1911.0
1912.0
1913.0
1914.0
1915.0
1916.0
1917.0
1918.0
1919.0
1920.0
1921.0
1922.0
1923.0
1924.0
1925.0
1926.0
1927.0
1928.0
1929.0
1930.0
1931.0
1932.0
1933.0
1934.0
1935.0
1936.0
1937.0
1938.0
1939.0
1940.0
1941.0
1942.0
1943.0
1944.0
1945.0
1946.0
1947.0
1948.0
1949.0
1950.0
1951.0
1952.0
1953.0
1954.0
1955.0
1956.0
1957.0
1958.0
1959.0
1960.0
1961.0
1962.0
1963.0
1964.0
1965.0
1966.0
1967.0
1968.0
1969.0
1970.0
1971.0
1972.0
1973.0
1974.0
1975.0
1976.0
1977.0
1978.0
1979.0
1980.0
1981.0
1982.0
1983.0
1984.0
1985.0
1986.0
1987.0
1988.0
1989.0
1990.0
1991.0
1992.0
1993.0
1994.0
1995.0
1996.0
1997.0
1998.0
1999.0
2000.0
2001.0
2002.0
2003.0
2004.0
2005.0
2006.0
2007.0
2008.0
2009.0
2010.0
2011.0
2012.0
2013.0
2014.0
2015.0
2016.0
2017.0
2018.0
2019.0
2020.0
2021.0
2022.0
CPU times: user 11min 28s, sys: 2.05 s, total: 11min 30s
Wall time: 11min 31s


Unnamed: 0,Years,Names,Gender
0,1975.0,,unknown
1,1975.0,,unknown
2,1975.0,,unknown
3,1975.0,,unknown
4,1975.0,,unknown
...,...,...,...
20687145,2021.0,Katharina,female
20687146,2021.0,Katja,female
20687147,2021.0,Indira,female
20687148,2021.0,Giuseppe,male


In [399]:
# Count the rows that received a prediction; this should equal the number of
# rows in the prediction dataframe built above.
int((df_subset.Gender != 'unknown').sum())

8363116

In [401]:
# Save the year / name / gender dataframe for the first authors.
df_subset.to_pickle(variables_path / "year_name_gender_first_author_df")

## Last author

In [367]:
# Same procedure as for the first authors, now with the last-author names.
# NOTE: this rebinds `df_subset`, replacing the first-author dataframe.
df_subset = pd.DataFrame({'Years' : date_year, 'Names' : filtered_name_last_author})
df_subset

Unnamed: 0,Years,Names
0,1975.0,
1,1975.0,
2,1975.0,
3,1975.0,
4,1975.0,
...,...,...
20687145,2021.0,Jan
20687146,2021.0,Sven
20687147,2021.0,Hans
20687148,2021.0,Federica


In [368]:
%%time

gender_prediction_df = pd.DataFrame()
unique_years = np.unique(df_subset.Years)

for year in unique_years:
    print(year)
    df_grouped_year = df_subset.groupby("Years").get_group(year)
    df_names = df_grouped_year.Names
    
    #cases for years outside interval
    if year <= 1930:
        %R library(gender)
        %R -i df_names -o result result = gender(df_names, years = 1930, method = "ssa")
        
    if year >= 2012:
        %R library(gender)
        %R -i df_names -o result result = gender(df_names, years = 2012, method = "ssa")
        
    if (year < 2012) & (year > 1930):
        %R library(gender)
        %R -i df_names -i year -o result result = gender(df_names, years = year, method = "ssa")


    gender_prediction_df = pd.concat([gender_prediction_df, result], ignore_index=True)
    
gender_prediction_df

1808.0
1881.0
1891.0
1896.0
1897.0
1898.0
1899.0
1900.0
1901.0
1902.0
1903.0
1905.0
1906.0
1907.0
1908.0
1909.0
1910.0
1911.0
1912.0
1913.0
1914.0
1915.0
1916.0
1917.0
1918.0
1919.0
1920.0
1921.0
1922.0
1923.0
1924.0
1925.0
1926.0
1927.0
1928.0
1929.0
1930.0
1931.0
1932.0
1933.0
1934.0
1935.0
1936.0
1937.0
1938.0
1939.0
1940.0
1941.0
1942.0
1943.0
1944.0
1945.0
1946.0
1947.0
1948.0
1949.0
1950.0
1951.0
1952.0
1953.0
1954.0
1955.0
1956.0
1957.0
1958.0
1959.0
1960.0
1961.0
1962.0
1963.0
1964.0
1965.0
1966.0
1967.0
1968.0
1969.0
1970.0
1971.0
1972.0
1973.0
1974.0
1975.0
1976.0
1977.0
1978.0
1979.0
1980.0
1981.0
1982.0
1983.0
1984.0
1985.0
1986.0
1987.0
1988.0
1989.0
1990.0
1991.0
1992.0
1993.0
1994.0
1995.0
1996.0
1997.0
1998.0
1999.0
2000.0
2001.0
2002.0
2003.0
2004.0
2005.0
2006.0
2007.0
2008.0
2009.0
2010.0
2011.0
2012.0
2013.0
2014.0
2015.0
2016.0
2017.0
2018.0
2019.0
2020.0
2021.0
2022.0
CPU times: user 5min 58s, sys: 3.56 s, total: 6min 2s
Wall time: 6min 3s


Unnamed: 0,name,proportion_male,proportion_female,gender,year_min,year_max
0,Chalmers,1.0000,0.0000,male,1930.0,1930.0
1,Edmund,0.9940,0.0060,male,1930.0,1930.0
2,Leigh,0.7342,0.2658,male,1930.0,1930.0
3,Mary,0.0056,0.9944,female,1932.0,1932.0
4,Rose,0.0067,0.9933,female,1932.0,1932.0
...,...,...,...,...,...,...
8468160,Zo,0.0000,1.0000,female,2012.0,2012.0
8468161,Zoe,0.0023,0.9977,female,2012.0,2012.0
8468162,Zora,0.0000,1.0000,female,2012.0,2012.0
8468163,Zoran,1.0000,0.0000,male,2012.0,2012.0


In [369]:
# Save the per-name gender predictions for the last authors.
gender_prediction_df.to_pickle(variables_path / "gender_prediction_last_author_df")

In [370]:
%%time

unique_predicted_years = np.unique(gender_prediction_df["year_min"])
gender_maps_years = {}
for year in unique_years: 
    
    if year >= 2012:
        eff_year = 2012
    if year <= 1930:
        eff_year = 1930
    if (year < 2012) & (year > 1930):
        eff_year = year
        
    if eff_year not in unique_predicted_years:
        closest_year = unique_predicted_years[(unique_predicted_years - eff_year).argmin()]
        
        gender_prediction_grouped_year = gender_prediction_df.groupby("year_min").get_group(closest_year)
        
    else:
        gender_prediction_grouped_year = gender_prediction_df.groupby("year_min").get_group(eff_year)
    
    gender_map = dict(zip(gender_prediction_grouped_year.name, gender_prediction_grouped_year.gender))
    
    gender_maps_years[year]=gender_map
    
gender_maps_years.keys()

CPU times: user 50 s, sys: 258 ms, total: 50.2 s
Wall time: 50.3 s


dict_keys([1808.0, 1881.0, 1891.0, 1896.0, 1897.0, 1898.0, 1899.0, 1900.0, 1901.0, 1902.0, 1903.0, 1905.0, 1906.0, 1907.0, 1908.0, 1909.0, 1910.0, 1911.0, 1912.0, 1913.0, 1914.0, 1915.0, 1916.0, 1917.0, 1918.0, 1919.0, 1920.0, 1921.0, 1922.0, 1923.0, 1924.0, 1925.0, 1926.0, 1927.0, 1928.0, 1929.0, 1930.0, 1931.0, 1932.0, 1933.0, 1934.0, 1935.0, 1936.0, 1937.0, 1938.0, 1939.0, 1940.0, 1941.0, 1942.0, 1943.0, 1944.0, 1945.0, 1946.0, 1947.0, 1948.0, 1949.0, 1950.0, 1951.0, 1952.0, 1953.0, 1954.0, 1955.0, 1956.0, 1957.0, 1958.0, 1959.0, 1960.0, 1961.0, 1962.0, 1963.0, 1964.0, 1965.0, 1966.0, 1967.0, 1968.0, 1969.0, 1970.0, 1971.0, 1972.0, 1973.0, 1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0, 1982.0, 1983.0, 1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0, 1990.0, 1991.0, 1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0, 2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0, 2006.0, 2007.0, 2008.0, 2009.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.

In [371]:
# Save the per-year name -> gender maps for the last authors.
# The context manager closes the file even if pickling fails.
with open(variables_path / "gender_maps_years_last_author.pkl", "wb") as f:
    pickle.dump(gender_maps_years, f)

In [372]:
# Display the last-author dataframe again.
df_subset

Unnamed: 0,Years,Names
0,1975.0,
1,1975.0,
2,1975.0,
3,1975.0,
4,1975.0,
...,...,...
20687145,2021.0,Jan
20687146,2021.0,Sven
20687147,2021.0,Hans
20687148,2021.0,Federica


In [373]:
%%time

df_subset["Gender"] = ['unknown']*df_subset.shape[0]

for year in unique_years:    
    print(year)
    
    df_subset_year = df_subset.groupby("Years").get_group(year).Names.apply(lambda x: np.vectorize(gender_maps_years[year].get)(x))
    df_subset_year.rename("Gender", inplace=True)
    df_subset.update(df_subset_year)
    
df_subset

1808.0
1881.0
1891.0
1896.0
1897.0
1898.0
1899.0
1900.0
1901.0
1902.0
1903.0
1905.0
1906.0
1907.0
1908.0
1909.0
1910.0
1911.0
1912.0
1913.0
1914.0
1915.0
1916.0
1917.0
1918.0
1919.0
1920.0
1921.0
1922.0
1923.0
1924.0
1925.0
1926.0
1927.0
1928.0
1929.0
1930.0
1931.0
1932.0
1933.0
1934.0
1935.0
1936.0
1937.0
1938.0
1939.0
1940.0
1941.0
1942.0
1943.0
1944.0
1945.0
1946.0
1947.0
1948.0
1949.0
1950.0
1951.0
1952.0
1953.0
1954.0
1955.0
1956.0
1957.0
1958.0
1959.0
1960.0
1961.0
1962.0
1963.0
1964.0
1965.0
1966.0
1967.0
1968.0
1969.0
1970.0
1971.0
1972.0
1973.0
1974.0
1975.0
1976.0
1977.0
1978.0
1979.0
1980.0
1981.0
1982.0
1983.0
1984.0
1985.0
1986.0
1987.0
1988.0
1989.0
1990.0
1991.0
1992.0
1993.0
1994.0
1995.0
1996.0
1997.0
1998.0
1999.0
2000.0
2001.0
2002.0
2003.0
2004.0
2005.0
2006.0
2007.0
2008.0
2009.0
2010.0
2011.0
2012.0
2013.0
2014.0
2015.0
2016.0
2017.0
2018.0
2019.0
2020.0
2021.0
2022.0
CPU times: user 11min 29s, sys: 2.52 s, total: 11min 31s
Wall time: 11min 32s


Unnamed: 0,Years,Names,Gender
0,1975.0,,unknown
1,1975.0,,unknown
2,1975.0,,unknown
3,1975.0,,unknown
4,1975.0,,unknown
...,...,...,...
20687145,2021.0,Jan,male
20687146,2021.0,Sven,male
20687147,2021.0,Hans,male
20687148,2021.0,Federica,female


In [376]:
# Count the rows that received a prediction; this should equal the number of
# rows in the prediction dataframe built above.
int((df_subset.Gender != 'unknown').sum())

8468165

In [377]:
# Save the year / name / gender dataframe for the last authors.
df_subset.to_pickle(variables_path / "year_name_gender_last_author_df")