import os
import xml.etree.ElementTree as et

import numpy as np
import pandas as pd
import torch


def xml_import(xml_file):
    """Parse selected metadata elements from a PubMed XML file.

    Parses the following elements of each paper stored in the input
    `xml_file` and stores them in a pandas DataFrame: PMID, title, abstract,
    language, journal, date, first author's first name, last author's first
    name, and ISSN.

    Parameters
    ----------
    xml_file : str
        Path to the XML file.

    Returns
    -------
    out : pandas.DataFrame of shape (n_papers, n_elements)
        DataFrame containing all parsed papers, with one column per parsed
        metadata element.
    dicc : dict
        Dictionary from which the DataFrame was created.

    See Also
    --------
    import_all_files

    Notes
    -----
    Parsed elements of each paper:

    - PMID (stored in <PMID>).
    - Title (stored in <ArticleTitle>).
    - Abstract (stored in <AbstractText>, child of <Abstract>).
    - Language (stored in <Language>).
    - Journal (stored in <Title>, child of <Journal>).
    - Date (stored in <PubDate>).
    - First author's first name (stored in <ForeName>, child of <Author>,
      child of <AuthorList>).
    - Last author's first name (stored in <ForeName>, child of <Author>,
      child of <AuthorList>).
    - ISSN (stored in <ISSN>).

    Details about the information extraction:

    - PMID: If there is no <PMID> tag, 'no tag' is added. If there is more
      than one <PMID>, only the first one is imported. If <PMID> contains no
      text, '' (empty string) is added.
    - Title: If there is no <ArticleTitle> tag, 'no tag' is added. If there
      is more than one <ArticleTitle>, only the first one is imported. If
      <ArticleTitle> contains no text, '' is added.
    - Abstract: If there is no <Abstract> tag (parent of <AbstractText>),
      '' is added. If there is more than one <AbstractText> inside
      <Abstract>, their texts are combined into one string. If
      <AbstractText> contains no text, '' is added. If there is more than
      one <Abstract>, or other tags containing <AbstractText> such as
      <OtherAbstract>, no text is taken from them. Text nested in child
      tags of each <AbstractText> is collected via ``itertext``.
    - Language: If there is no <Language> tag, 'no tag' is added. If there
      is more than one <Language>, only the first one is imported. If
      <Language> contains no text, '' is added.
    - Journal: If there is no <Title> tag, 'no tag' is added. If there is
      more than one <Title>, only the first one is imported. If <Title>
      contains no text, '' is added.
    - Date: If there is no <PubDate> tag, 'no tag' is added. The texts of
      all children of <PubDate> are combined into one string (the date
      storage is asymmetric: sometimes <Day>, <Month> and <Year> are used,
      other times <MedlineDate>). If <PubDate> has no children, '' is added.
    - First author's first name: The <ForeName> of the first <Author>
      listed in <AuthorList> is parsed. Note that the metadata is not
      always clean: sometimes the tag contains the complete name, including
      surnames. If there is no <ForeName> tag, 'no tag' is appended. Note
      for the future: this may miss names stored directly in the <Author>
      tag; an approach similar to the one used for abstracts, parsing
      everything under <Author>, might be better. That would also pick up
      surnames, but they could be cleaned afterwards.
    - Last author's first name: The <ForeName> of the last <Author> listed
      in <AuthorList> is parsed. Same caveats as for the first author.
    - ISSN: If there is no <ISSN> tag, 'no tag' is added. If <ISSN>
      contains no text, '' is added.
    """
    xtree = et.parse(xml_file, parser=et.XMLParser(encoding="UTF-8"))
    xroot = xtree.getroot()
    dicc = {}

    # PMID
    ros = []
    for child1 in xroot:
        for child2 in child1:
            for element in child2.iter('MedlineCitation'):
                tag = element.find('PMID')
                if tag is None:
                    ros.append(['no tag'])
                else:
                    res = []
                    if not tag.text:
                        res.append('')
                    else:
                        res.append(tag.text)
                    ros.append(res)
    ros = [' '.join(ele) for ele in ros]
    dicc['PMID'] = ros

    # Title
    ros = []
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for article in child3.iter('Article'):
                    tag = article.find('ArticleTitle')
                    if tag is None:
                        ros.append(['no tag'])
                    else:
                        res = []
                        res.append("".join(tag.find(".").itertext()))
                        ros.append(res)
    ros = [' '.join(ele) for ele in ros]
    dicc['Title'] = ros

    # Abstract
    ros = []
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for article in child3.iter('Article'):
                    tag = article.find('Abstract')
                    if tag is None:
                        ros.append([''])
                    else:
                        for child4 in child3:
                            for elem in child4.iter('Abstract'):
                                res = []
                                for AbstractText in elem.iter('AbstractText'):
                                    res.append("".join(AbstractText.find(".").itertext()).strip())
                                res = [' '.join(res)]
                                res = [text.strip() for text in res]
                                ros.append(res)
    ros = [' '.join(ele) for ele in ros]
    dicc['AbstractText'] = ros

    # Language
    ros = []
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for article in child3.iter('Article'):
                    tag = article.find('Language')
                    if tag is None:
                        ros.append(['no tag'])
                    else:
                        res = []
                        if not tag.text:
                            res.append('')
                        else:
                            res.append(tag.text)
                        ros.append(res)
    ros = [' '.join(ele) for ele in ros]
    dicc['Language'] = ros

    # Journal
    ros = []
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for child4 in child3:
                    for journal in child4.iter('Journal'):
                        tag = journal.find('Title')
                        if tag is None:
                            ros.append(['no tag'])
                        else:
                            res = []
                            if not tag.text:
                                res.append('')
                            else:
                                res.append(tag.text)
                            ros.append(res)
    ros = [' '.join(ele) for ele in ros]
    dicc['Journal'] = ros

    # Date
    ros = []
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for child4 in child3:
                    for child5 in child4:
                        for JI in child5.iter('JournalIssue'):
                            tag = JI.find('PubDate')
                            if tag is None:
                                ros.append(['no tag'])
                            else:
                                res = []
                                for elem in tag:
                                    # guard against empty date elements
                                    res.append(elem.text or '')
                                ros.append(res)
    ros = [' '.join(ele) for ele in ros]
    dicc['Date'] = ros

    # First name of the first author
    ros = []
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for child4 in child3.iter('Article'):
                    authorlist = child4.find('AuthorList')
                    if authorlist is None:
                        ros.append([''])
                    else:
                        for elem in child4.iter('AuthorList'):
                            author = elem.find('Author')
                            tag = author.find('ForeName')
                            if tag is None:
                                ros.append(['no tag'])
                            else:
                                res = []
                                if not tag.text:
                                    res.append('')
                                else:
                                    res.append(tag.text)
                                ros.append(res)
    ros = [' '.join(ele) for ele in ros]
    dicc['NameFirstAuthor'] = ros

    # First name of the last author
    ros = []
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for child4 in child3.iter('Article'):
                    authorlist = child4.find('AuthorList')
                    if authorlist is None:
                        ros.append('')
                    else:
                        for author in child4.iter('AuthorList'):
                            res = []
                            for elem in author.iter('Author'):
                                tag = elem.find('ForeName')
                                if tag is None:
                                    # append a string, not a list, so that
                                    # res[-1] below is always a string
                                    res.append('no tag')
                                else:
                                    if not tag.text:
                                        res.append('')
                                    else:
                                        res.append(tag.text)
                            ros.append(res[-1])
    dicc['NameLastAuthor'] = ros

    # ISSN
    ros = []
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for child4 in child3:
                    for journal in child4.iter('Journal'):
                        tag = journal.find('ISSN')
                        if tag is None:
                            ros.append(['no tag'])
                        else:
                            res = []
                            if not tag.text:
                                res.append('')
                            else:
                                res.append(tag.text)
                            ros.append(res)
    ros = [' '.join(ele) for ele in ros]
    dicc['ISSN'] = ros

    out = pd.DataFrame.from_dict(dicc)
    return out, dicc
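# Usage sketch for `xml_import` (illustration only; "pubmed_sample.xml" is a
# hypothetical filename, not a file shipped with this module):
#
#     papers, papers_dict = xml_import("pubmed_sample.xml")
#     papers[["PMID", "Title", "Journal", "Date"]].head()
#
# Every column holds strings; missing tags appear as 'no tag' and empty tags
# as '' (see the Notes section of the docstring).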
def import_all_files(path, order_files=False):
    """Import all XML files from a directory into one combined DataFrame
    using `xml_import`.

    WARNING: the `xml_import` variant that also extracts the first names of
    the first and last authors and the ISSN was renamed, so this function
    still calls the old function. A new `import_all_files` needs to be
    created that calls the new `xml_import_with_authors_ISSN`.

    Parameters
    ----------
    path : str
        Path of the directory with the files to import.
    order_files : bool, default=False
        If True, print the order in which the files are imported.

    Returns
    -------
    final_df : pandas.DataFrame
        DataFrame with all the XML files from the directory imported and
        concatenated in the order in which they appear in the directory
        listing (top to bottom).
    """
    # name_files holds the names of both the .xml files and the
    # accompanying .gz.md5 files
    name_files = os.listdir(path)
    # select only the .xml files (this relies on their filenames being
    # exactly 17 characters long)
    len_filenames_list = map(len, name_files)
    len_filenames = np.fromiter(len_filenames_list, dtype=np.int64,
                                count=len(name_files))
    name_files_array = np.array(name_files)
    name_xml_files = name_files_array[len_filenames == 17]
    # import
    frame_all_df = []
    for i in range(len(name_xml_files)):
        path_file = path + name_xml_files[i]
        if order_files:
            print(name_xml_files[i])
        df, dic = xml_import(str(path_file))
        dic['filename'] = [name_xml_files[i]] * len(dic['Title'])
        df = pd.DataFrame.from_dict(dic)
        frame_all_df.append(df)
    final_df = pd.concat(frame_all_df, ignore_index=True)
    return final_df
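# Usage sketch for `import_all_files` (illustration only; the directory path
# is hypothetical). Note that `path` must end with a separator, because the
# filenames are appended to it directly:
#
#     all_papers = import_all_files("data/pubmed/", order_files=True)
#     all_papers["filename"].nunique()  # number of files imported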
@torch.no_grad()
def generate_embeddings(abstracts, tokenizer, model, device):
    """Generate embeddings using a BERT-based model.

    Code from Luca Schmidt.

    Parameters
    ----------
    abstracts : list
        Abstract texts.
    tokenizer : transformers.models.bert.tokenization_bert_fast.BertTokenizerFast
        Tokenizer.
    model : transformers.models.bert.modeling_bert.BertModel
        BERT-based model.
    device : str, {"cuda", "cpu"}
        "cuda" if torch.cuda.is_available() else "cpu".

    Returns
    -------
    embedding_cls : ndarray
        [CLS] token embeddings of the abstracts.
    embedding_sep : ndarray
        [SEP] token embeddings of the abstracts.
    embedding_av : ndarray
        Average over the token embeddings of the abstracts.
    """
    # preprocess the input
    inputs = tokenizer(
        abstracts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    ).to(device)
    # inference: last hidden state, shape (batch, sequence, hidden)
    outputs = model(**inputs)[0].cpu().detach()
    # average over the batch and token dimensions (yields a single vector)
    embedding_av = torch.mean(outputs, [0, 1]).numpy()
    # last position of the padded sequence; this is the [SEP] token only for
    # the longest sequence in the batch, padding otherwise
    embedding_sep = outputs[:, -1, :].numpy()
    # first position, i.e. the [CLS] token
    embedding_cls = outputs[:, 0, :].numpy()
    return embedding_cls, embedding_sep, embedding_av
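# Minimal demo of `generate_embeddings`, assuming the `transformers` package
# is installed. The checkpoint name is an example, not a requirement of this
# module; any BERT-compatible model/tokenizer pair should work.
if __name__ == "__main__":
    from transformers import BertModel, BertTokenizerFast

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased").to(device)
    model.eval()

    cls_emb, sep_emb, av_emb = generate_embeddings(
        ["An example abstract about machine learning."],
        tokenizer, model, device,
    )
    print(cls_emb.shape)  # (1, 768) for bert-base checkpoints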