https://github.com/shorvath/MammalianMethylationConsortium
Tip revision: 44f3329fd593b472c2b6d021d764c9d7e5681f88 authored by Amin Haghani on 24 July 2023, 22:43:25 UTC
Add files via upload
Add files via upload
Tip revision: 44f3329
annotateMm10Probes.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download the following utilities\n",
"- http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/bigWigToBedGraph\n",
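"- http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/hgWiggle\n",
"- http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/bigBedToBed (used in the ChromHMM step)\n",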
"- bedtools\n",
"- pandas (Python package)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download the following annotations\n",
"- https://public.hoffman2.idre.ucla.edu/ernst/R0RG6/LECIF/mm10.LECIFv1.1.bw\n",
"- http://hgdownload.cse.ucsc.edu/goldenpath/mm10/phastCons60way/mm10.60way.phastCons.bw\n",
"- http://hgdownload.cse.ucsc.edu/goldenpath/mm10/phastCons60way/mm10.60way.phastCons60wayPlacental.bw\n",
"- http://hgdownload.cse.ucsc.edu/goldenpath/mm10/phyloP60way/mm10.60way.phyloP60way.bw\n",
"- http://hgdownload.cse.ucsc.edu/goldenpath/mm10/phyloP60way/mm10.60way.phyloP60wayPlacental.bw\n",
"- https://drive.google.com/file/d/1TbqDipE2zcw2-FEs5aVWKkQ8zYwjOsQq/view?usp=sharing\n",
"- https://github.com/shorvath/MammalianMethylationConsortium/blob/main/Annotations%2C%20Amin%20Haghani/Mammals/Mus_musculus.grcm38.100.HorvathMammalMethylChip40.v1.csv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Write a bed file with all mm10 coordinates"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Probe annotation table; the name matches the .csv listed in the download section above.\n",
"input_filename = 'Mus_musculus.grcm38.100.HorvathMammalMethylChip40.v1.csv'\n",
"\n",
"# Read identifiers and chromosome names as strings: numeric chromosome names (1, 2, ...)\n",
"# may otherwise be parsed as numbers, which breaks the 'chr' prefixing and the sort below.\n",
"# Rows with a missing value in any of the four columns are dropped.\n",
"probes = pd.read_csv(\n",
"    input_filename,\n",
"    usecols=['CGid','seqnames','CGstart','CGend'],\n",
"    dtype={'CGid': str, 'seqnames': str},\n",
").dropna()\n",
"\n",
"# Build the BED columns and sort by chromosome name, then start position\n",
"# (bedtools map, used in the later cells, expects position-sorted input).\n",
"df = (\n",
"    probes\n",
"    .assign(chrom='chr' + probes['seqnames'],\n",
"            CGstart=probes['CGstart'].astype(int),\n",
"            CGend=probes['CGend'].astype(int))\n",
"    .sort_values(by=['seqnames','CGstart'])\n",
")\n",
"\n",
"# Headerless, tab-separated 4-column BED: chrom, start, end, probe id.\n",
"df[['chrom','CGstart','CGend','CGid']].to_csv('mm10_coord.bed',sep='\\t',header=False,index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Convert bigWig files to gzipped bed files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"# Convert every downloaded bigWig track to a gzipped bedGraph-style .bed file.\n",
"# Fail fast: stop at the first command that errors instead of carrying on\n",
"# with the remaining tracks.\n",
"set -euo pipefail\n",
"\n",
"# convert <input.bw> <output.bed>: writes <output.bed>.gz\n",
"convert() {\n",
"    ./bigWigToBedGraph \"$1\" \"$2\"\n",
"    # -f overwrites an existing .gz so the cell can be re-run\n",
"    gzip -f \"$2\"\n",
"}\n",
"\n",
"convert mm10.LECIFv1.1.bw                      mm10.LECIF.bed\n",
"convert mm10.60way.phastCons.bw                mm10.phastCons60way.bed\n",
"convert mm10.60way.phastCons60wayPlacental.bw  mm10.phastCons60wayPlacental.bed\n",
"convert mm10.60way.phyloP60way.bw              mm10.phyloP60way.bed\n",
"convert mm10.60way.phyloP60wayPlacental.bw     mm10.phyloP60wayPlacental.bed"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Map LECIF score"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"# For each probe interval, take the mean of column 4 (the score) over the\n",
"# overlapping LECIF intervals. Run as a %%bash cell with fail-fast options so a\n",
"# bedtools error is raised in the notebook instead of passing silently.\n",
"set -euo pipefail\n",
"bedtools map -a mm10_coord.bed -b mm10.LECIF.bed.gz -c 4 -o mean > mm10_coord.LECIF.bed"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Map sequence constraint scores"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"# For each conservation track, take the mean of column 4 (the score) over the\n",
"# intervals overlapping each probe and write one output file per track.\n",
"# Fail fast so an error on one track is not hidden by the later iterations.\n",
"set -euo pipefail\n",
"\n",
"for f in phastCons60way phyloP60way phastCons60wayPlacental phyloP60wayPlacental; do\n",
"    bedtools map -a mm10_coord.bed -b \"mm10.$f.bed.gz\" -c 4 -o mean > \"mm10_coord.$f.bed\"\n",
"done"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Map ConsHMM states"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"# For each probe interval, report column 4 of the first overlapping segment in\n",
"# the ConsHMM segmentation file. Run as a %%bash cell with fail-fast options so a\n",
"# bedtools error is raised in the notebook instead of passing silently.\n",
"set -euo pipefail\n",
"bedtools map -a mm10_coord.bed -b 60_states_segmentation.bed.gz -c 4 -o first > mm10_coord.ConsHMM.bed"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Map ENCODE ChromHMM states"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"# Append one ChromHMM state column per bigBed file named in\n",
"# mm10_ChromHMM_filenames.txt (expected: one .bb file name per line).\n",
"# pipefail matters here: without it a failed bigBedToBed fetch would feed empty\n",
"# input to bedtools and silently produce a column with no states.\n",
"set -euo pipefail\n",
"\n",
"# No trailing slash: the file name is joined with '/' below.\n",
"path_prefix=http://hgdownload.soe.ucsc.edu/gbdb/mm10/encode3/chromHmm\n",
"output_filename=mm10_coord.ChromHMM.bed\n",
"\n",
"# Start from the bare coordinates; each iteration adds one column on the right.\n",
"cp mm10_coord.bed \"$output_filename\"\n",
"for i in $(cat mm10_ChromHMM_filenames.txt); do\n",
"    ./bigBedToBed \"$path_prefix/$i\" stdout | bedtools map -a \"$output_filename\" -b - -c 4 -o first > tmp.bed\n",
"    mv tmp.bed \"$output_filename\"\n",
"done"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Combine them into a formatted csv file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"coord_filename = 'mm10_coord.bed'\n",
"position_col_names = ['chrom','start','end','probeID']\n",
"\n",
"# One ChromHMM column per entry in the filename list, in file order. Blank lines are\n",
"# skipped so the column names stay aligned with the columns the bash loop produced.\n",
"chromhmm_col_name_prefix = 'ChromHMMState_ENCODE_GorkinEtAl2017bioRxiv_'\n",
"with open('mm10_ChromHMM_filenames.txt') as filenames_file:\n",
"    chromhmm_col_names = [chromhmm_col_name_prefix+line.strip().replace('.bb','').replace('encode3RenChromHmm','')\n",
"                          for line in filenames_file if line.strip()]\n",
"\n",
"# Each mapped file is paired explicitly with the column name(s) of its value column(s).\n",
"# Keeping file and label side by side prevents the two from drifting out of order\n",
"# (previously the phastCons-placental and phyloP files received each other's labels).\n",
"# NOTE(review): the LECIF column name carries the year 2012 - confirm the publication\n",
"# year; the name is kept unchanged so existing consumers of the csv are unaffected.\n",
"annotation_columns = [\n",
"    ('mm10_coord.LECIF.bed',                   ['LECIFScore_KwonErnst2012NatureCommunications']),\n",
"    ('mm10_coord.phastCons60way.bed',          ['PhastConsScore_SiepelEtAl2005GenomeResearch']),\n",
"    ('mm10_coord.phyloP60way.bed',             ['PhyloPScore_PollardEtAl2009GenomeResearch']),\n",
"    ('mm10_coord.phastCons60wayPlacental.bed', ['PhastConsPlacentalScore_SiepelEtAl2005GenomeResearch']),\n",
"    ('mm10_coord.phyloP60wayPlacental.bed',    ['PhyloPPlacentalScore_PollardEtAl2009GenomeResearch']),\n",
"    ('mm10_coord.ConsHMM.bed',                 ['ConsHMMState_Multiz60way_ArnesonEtAl2020NARGenomBioinform']),\n",
"    ('mm10_coord.ChromHMM.bed',                chromhmm_col_names),\n",
"]\n",
"bed_filenames = [filename for filename, _ in annotation_columns]\n",
"names = [position_col_names + value_cols for _, value_cols in annotation_columns]\n",
"\n",
"# Left-join every annotation onto the probe coordinates so each probe keeps one row\n",
"# even when an annotation file has no matching entry.\n",
"df = pd.read_table(coord_filename,header=None,names=position_col_names)\n",
"for bed_filename, col_names in zip(bed_filenames, names):\n",
"    input_df = pd.read_table(bed_filename,header=None,names=col_names)\n",
"    df = df.merge(input_df,on=position_col_names,how='left')\n",
"\n",
"df.to_csv('mm10_annotErnstLab.csv',index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
