# Advanced Repeat Annotation of the Long-Lived Red Sea Urchin Genome *Analysis completed by Kate Castellano - November 8, 2021* Purpose: A majority of the repeats identified with RepeatModeler are unknown. To better identify diverged and species-specific repeats, a combined approach with RepeatModeler, TransposonPSI and LTRharvest will be used. Summary: Repeat analysis was conducted on the newly assembled M. franciscanus genome, the S. purpuratus genome (version 5.0, https://www.echinobase.org), the L. variegatus genome (version 3.0, https://www.echinobase.org) and the L. pictus genome (version 2.0, NCBI). The details are shown for M. franciscanus below, but the same method was used for the other sea urchin species. Repeats were annotated with RepeatMasker (v 4.1.0) using a de novo repeat library. The de novo library was made using a combination of RepeatModeler (v 2.0.2), TransposonPSI (v 1.0.0) (https://transposonpsi.sourceforge.net/), and LTRharvest (v 1.6.2) to better identify diverged and species-specific repeats. Each program was run individually before combining and clustering (Usearch v 11.0.667), with a minimum sequence match of 80%, to remove redundant sequences that may have been identified by multiple programs. # Run Repeat Classification Programs
# Remove the "(-)" and "(+)" markers from the TransposonPSI best-per-locus FASTA
# (first occurrence of each per line) and write the result to a new
# *.editHeader.fasta file.
#
# Both substitutions run in one pass over the same stream, so the editHeader
# copy ends up free of both markers and the original TransposonPSI output is
# left unmodified. Running the "(+)" removal in place (-i) on the original file
# would leave "(+)" in the editHeader copy.
#
# sed uses basic regular expressions here, so "(", ")" and "+" match literally.
sed -e 's/(-)//' -e 's/(+)//' \
  Mfran_genome_final.fa.TPSI.allHits.chains.bestPerLocus.fasta \
  > Mfran_genome_final.fa.TPSI.allHits.chains.bestPerLocus.editHeader.fasta
# Download the Pfam profile HMMs listed below and convert each one to HMMER2
# format (hmmconvert -2), writing <accession>.hmm into the current directory.
#
# NOTE(review): pfam.xfam.org may since have been retired in favour of
# InterPro — confirm the URL still resolves before re-running.
hmmconvert_bin=/data/app/hmmer-3.3/bin/hmmconvert

# One entry per profile; every accession is fetched exactly once.
pfam_accessions=(
  PF03732 # Retrotrans_gag: Retrotransposon gag protein
  PF14529 # Exo_endo_phos_2: Endonuclease-reverse transcriptase
  PF00077 # RVP: Retroviral aspartyl protease
  PF00078 # RVT_1: Reverse transcriptase (RNA-dependent DNA polymerase)
  PF03078 # ATHILA: ATHILA ORF-1 family
  PF05380 # Peptidase_A17: Pao retrotransposon peptidase
  PF07727 # RVT_2: Reverse transcriptase (RNA-dependent DNA polymerase)
  PF07999 # RHSP: Retrotransposon hot spot protein
  PF14244 # Retrotran_gag_3: gag-polypeptide of LTR copia-type
  PF00026 # Asp: Eukaryotic aspartyl protease
  PF08284 # RVP_2: Retroviral aspartyl protease
  PF13456 # RVT_3: Reverse transcriptase-like
  PF14223 # Retrotran_gag_2: gag-polypeptide of LTR copia-type
  PF18701 # DUF5641: Family of unknown function (DUF5641)
  PF03716 # WCCH: WCCH motif
  PF08333 # DUF1725: Protein of unknown function (DUF1725)
  PF12382 # Peptidase_A2E: Retrotransposon peptidase
  PF18162 # Arc_C: Arc C-lobe
  PF18769 # APOBEC1: APOBEC1
  PF19687 # MARF1: LOTUS domain
  PF17241 # Retrotran_gag_4: Ty5 Gag N-terminal region
)

for acc in "${pfam_accessions[@]}"; do
  raw_hmm="${acc}.download.tmp"

  # -O pins the download name. Without it wget saves to "hmm", or to "hmm.1"
  # when a stale "hmm" already exists, in which case the stale file would be
  # the one converted.
  if ! wget -O "$raw_hmm" "http://pfam.xfam.org/family/${acc}/hmm"; then
    printf 'warning: download failed for %s; no profile written\n' "$acc" >&2
    rm -f -- "$raw_hmm"
    continue
  fi

  # Drop the output on a failed conversion so an empty or partial profile is
  # not left behind among the usable *.hmm files.
  if ! "$hmmconvert_bin" -2 "$raw_hmm" > "${acc}.hmm"; then
    printf 'warning: hmmconvert failed for %s; no profile written\n' "$acc" >&2
    rm -f -- "${acc}.hmm"
  fi

  rm -f -- "$raw_hmm"
done
# Unpack the GyDB collection archive, then relocate its profile HMMs into the
# current directory so they sit next to the Pfam .hmm files.
gydb_name=GyDB_collection
unzip "${gydb_name}.zip"
mv "${gydb_name}"/profiles/*.hmm ./
# The extracted folder is deliberately kept: it still holds the protein
# sequences and alignments.
# Sort the LTRharvest GFF3 annotations with GenomeTools (gt gff3 -sort) and
# save the sorted copy under a new name; the unsorted file is not modified.
gt_bin=/data/app/genometools-1.6.2/bin/gt
ltr_gff_prefix=Mfran_genome_LTRharvest
"$gt_bin" gff3 -sort "${ltr_gff_prefix}.gff3" > "${ltr_gff_prefix}_sort.gff3"
# Build the combined repeat library by concatenating, in this order:
#   1. LTR harvest output
#   2. TransposonPSI output
#   3. RepeatModeler output (run by Jen Polinski)
# The combined file is overwritten if it already exists.
repeat_lib_parts=(
  Mfran_genome_LTRharvestDigest_complete.fas
  Mfran_genome_final.fa.TPSI.allHits.chains.bestPerLocus.editHeader_filter.fasta
  mfran-families.fa
)
cat "${repeat_lib_parts[@]}" > Mfran_RepeatLibCombined.fa
#Output statistics:
Seqs 41913 (41.9k)
Clusters 24242 (24.2k)
Max size 171
Avg size 1.8
Min size 1
Singletons 20915 (20.9k), 49.9% of seqs, 86.3% of clusters
Max mem 449Mb
Time 05:20
Throughput 131.0 seqs/sec.