https://github.com/ChengLiLab/myeloma
Raw File
Tip revision: 634d6aabda1b3c0bc7ddfe145dfc34b5018b6f63 authored by ChengLiLab on 17 November 2017, 12:16:24 UTC
Add files via upload
Tip revision: 634d6aa
CNV_database_Multiple_Myeloma_boudary_TAD_boundary.R
# analysis cnv boudary and TAD boundary by using public data
# 20160802
# wupz

# 1.1 CNVD database: disease CNV
CNV_CNVD <- read.table( file = '/lustre/user/liclab/wupz/dosageEffect/rawData/CNVD_HS_MM.txt',sep = '\t', 
                        stringsAsFactors = F)
colnames(CNV_CNVD ) <- c( 'id', 'species', 'chr', 'start', 'end', 'band', 'type', 'genes', 'disease', 'methods', 'sample_size', 'rate', 'pubmed_id')
dim(CNV_CNVD) # [1] 302  13
head(CNV_CNVD)
# 1.2 filtering some data 
filter_id_3 <- (CNV_CNVD[, 'end'] - CNV_CNVD[, 'start'] ) >= 500000
sum(filter_id_3) # [1] 277
filter_id_4 <- CNV_CNVD$rate >= 0.1
sum(filter_id_4) # [1] 183
#table(CNV_CNVD[filter_id_3 & filter_id_4, 'disease' ] )[table(CNV_CNVD[filter_id_3 & filter_id_4, 'disease'] ) >= 100]
#filter_id_5 <- CNV_CNVD[, 'disease'] == 'Multiple myeloma'
# CNV_CNVD[, 'disease'] == 'Basal cell lymphoma' | CNV_CNVD[, 'disease'] == 'Acute myeloid leukemia' | CNV_CNVD[, 'disease'] == 'Burkitts lymphoma' | 
#sum(filter_id_5) # [1] 302
sum(filter_id_3 & filter_id_4)
CNV_CNVD_MM_filter <- CNV_CNVD[filter_id_3 & filter_id_4, ]
CNV_CNVD_MM_filter_breakpoints <- 

# 2.1 for each cnv from cnvd, find the is around the boundary, and the same with tandom sites 
CNV_CNVD_InsulationScore_GM12878  <- numeric()
random_CNVD_insulationScore_GM12878 <- numeric()
for ( i in names(table(CNV_CNVD[filter_id_3 & filter_id_4 & filter_id_5 , 'chr'])) ) {
  tmp_sites <-  CNV_CNVD[filter_id_3 & filter_id_4 & filter_id_5 , 3:5]
  tmp_sites <- tmp_sites[tmp_sites[,1] == i, ]
  tmp_cnv_block_ins <- Sites_IS( Sites = tmp_sites, insulation_score_list = GM12878_is_10kb[[i]], flank = 100)
  CNV_CNVD_InsulationScore_GM12878 <- rbind(CNV_CNVD_InsulationScore_GM12878, tmp_cnv_block_ins)
  # tandom sites 
  tmp_random_number <- dim(tmp_sites)[1] # for each chromosome random select some sites with the same number of TADs
  tmp_random_site <- GM12878_is_10kb[[i]][ sample( x = 1:dim(GM12878_is_10kb[[i]])[1], size = tmp_random_number), ]
  tmp_random_ins <- Sites_IS( Sites = tmp_random_site, insulation_score_list = GM12878_is_10kb[[i]], flank = 100)
  random_CNVD_insulationScore_GM12878   <- rbind(random_CNVD_insulationScore_GM12878  ,  tmp_random_ins)
}
median_ins_CNV_CNVD <- apply(CNV_CNVD_InsulationScore_GM12878, 2, function(x) median( as.numeric(x), na.rm = T) )
median_random_insulationScore_GM12878_CNVD <- apply(random_CNVD_insulationScore_GM12878, 2, function(x) median( as.numeric(x), na.rm = T) )

# 2.4 draw the insulation score around the cnv sites
png(filename = '../CNV_TAD_boundary/Human disease (Multiple myeloma) cnv boundary and TAD boundary (GM12878).png', width = 1024, height = 1024)
par(cex = 3, mgp = c(1.5, 0.5, 0), lwd = 3, tcl =-0.1, mar = c(3,3,3,2))
plot( median_ins_CNV_CNVD, type = 'l', 
      xlab = 'Genomic Regions', 
      ylab = 'Median Insulation Score',
      main = 'CNV Breakpoints in Cancers \nand TAD boundaris in GM12878',
      ylim = c(min(median_ins_CNV_CNVD, GM12878_median_TAD_ins,median_random_insulationScore_GM12878_CNVD, na.rm = T), 
               max(median_ins_CNV_CNVD, GM12878_median_TAD_ins,median_random_insulationScore_GM12878_CNVD, na.rm = T) ),
      col = "red",
      lwd = 5, 
      xlim = c(0, 250), 
      axes = FALSE)
lines( GM12878_median_TAD_ins, lwd = 5, col = "black" )
lines(median_random_insulationScore_GM12878_CNVD, lwd = 5, col = "blue")
axis(side = 1, at = c(1, 50, 101, 151, 201), labels = c('-1000kb', '-500kb','0kb','500kb', '1000kb' ))
axis(side = 2)
legend('bottomright', bty = 'n', cex = 0.75, 
       legend = c('CNV Breakpoints\n(Multiple Myeloma)', 'TAD Boudaries', 'Random Sites'),
       col = c( 'red', 'black', 'blue'), pch = 19)
dev.off()


# select a disease : 
filter_id_6 <- CNV_CNVD[, 'disease'] == 'High-grade prostatic intraepithelial neoplasia'
# CNV_CNVD[, 'disease'] == 'Basal cell lymphoma' | CNV_CNVD[, 'disease'] == 'Acute myeloid leukemia' | CNV_CNVD[, 'disease'] == 'Burkitts lymphoma' | 
sum(filter_id_6) # [1] 208
sum(filter_id_3 & filter_id_4 & filter_id_6)
head( CNV_CNVD[filter_id_3 & filter_id_4, ])

# 2.1 for each cnv from cnvd, find the is around the boundary, and the same with tandom sites 
CNV_CNVD_InsulationScore_GM12878_HGPIN  <- numeric()
random_CNVD_insulationScore_GM12878_HGPIN <- numeric()
for ( i in names(table(CNV_CNVD[filter_id_3 & filter_id_4 & filter_id_6 , 'chr'])) ) {
  i <- as.numeric(i)
  tmp_sites <-  CNV_CNVD[filter_id_3 & filter_id_4 & filter_id_6 , 3:5]
  tmp_sites <- tmp_sites[tmp_sites[,1] == i, ]
  tmp_cnv_block_ins <- Sites_IS( Sites = tmp_sites, insulation_score_list = GM12878_is_10kb[[i]], flank = 100)
  CNV_CNVD_InsulationScore_GM12878_HGPIN <- rbind(CNV_CNVD_InsulationScore_GM12878_HGPIN, tmp_cnv_block_ins)
  # tandom sites 
  tmp_random_number <- dim(tmp_sites)[1] # for each chromosome random select some sites with the same number of TADs
  tmp_random_site <- GM12878_is_10kb[[i]][ sample( x = 1:dim(GM12878_is_10kb[[i]])[1], size = tmp_random_number), ]
  tmp_random_ins <- Sites_IS( Sites = tmp_random_site, insulation_score_list = GM12878_is_10kb[[i]], flank = 100)
  random_CNVD_insulationScore_GM12878_HGPIN <- rbind(random_CNVD_insulationScore_GM12878_HGPIN,  tmp_random_ins)
}
median_ins_CNV_CNVD_HGPIN <- apply(CNV_CNVD_InsulationScore_GM12878_HGPIN, 2, function(x) median( as.numeric(x), na.rm = T) )
median_random_insulationScore_GM12878_CNVD_HGPINC <- apply(random_CNVD_insulationScore_GM12878_HGPIN, 2, function(x) median( as.numeric(x), na.rm = T) )

# 2.4 draw the insulation score around the cnv sites
png(filename = '../CNV_TAD_boundary/Human disease (HGPINC) cnv boundary and TAD boundary (GM12878).png', width = 1024, height = 1024)
par(cex = 3, mgp = c(1.5, 0.5, 0), lwd = 3, tcl =-0.1, mar = c(3,3,3,2))
plot( median_ins_CNV_CNVD_HGPIN, type = 'l', 
      xlab = 'Genomic Regions', 
      ylab = 'Median Insulation Score',
      main = 'CNV Boundaries in Cancers \nand TAD boundaris in GM12878',
      ylim = c(min(median_ins_CNV_CNVD_HGPIN, GM12878_median_TAD_ins,median_random_insulationScore_GM12878_CNVD_HGPINC, na.rm = T), 
               max(median_ins_CNV_CNVD_HGPIN, GM12878_median_TAD_ins,median_random_insulationScore_GM12878_CNVD_HGPINC, na.rm = T) ),
      col = "red",
      lwd = 5, 
      #xlim = , 
      axes = FALSE)
lines( GM12878_median_TAD_ins, lwd = 5, col = "black" )
lines(median_random_insulationScore_GM12878_CNVD_HGPINC, lwd = 5, col = "blue")
axis(side = 1, at = c(1, 50, 101, 151, 201), labels = c('-1000kb', '-500kb','0kb','500kb', '1000kb' ))
axis(side = 2)
legend('bottomright', bty = 'n', cex = 0.6, 
       legend = c('CNV boundaries\n(HGPINC)', 'TAD Boudaries', 'Random Sites'),
       col = c( 'red', 'black', 'blue'), pch = 19)
dev.off()


# select a disease : Non-small cell lung cancer
filter_id_7 <- CNV_CNVD[, 'disease'] == 'Non-small cell lung cancer'
# CNV_CNVD[, 'disease'] == 'Basal cell lymphoma' | CNV_CNVD[, 'disease'] == 'Acute myeloid leukemia' | CNV_CNVD[, 'disease'] == 'Burkitts lymphoma' | 
sum(filter_id_7) # [1] 208
sum(filter_id_3 & filter_id_4 & filter_id_7)
head( CNV_CNVD[filter_id_3 & filter_id_4, ])

# 2.1 for each cnv from cnvd, find the is around the boundary, and the same with tandom sites 
CNV_CNVD_InsulationScore_GM12878_NSCLC  <- numeric()
random_CNVD_insulationScore_GM12878_NSCLC <- numeric()
for ( i in names(table(CNV_CNVD[filter_id_3 & filter_id_4 & filter_id_7 , 'chr'])) ) {
  i <- as.numeric(i)
  tmp_sites <-  CNV_CNVD[filter_id_3 & filter_id_4 & filter_id_7 , 3:5]
  tmp_sites <- tmp_sites[tmp_sites[,1] == i, ]
  tmp_cnv_block_ins <- Sites_IS( Sites = tmp_sites, insulation_score_list = GM12878_is_10kb[[i]], flank = 100)
  CNV_CNVD_InsulationScore_GM12878_NSCLC <- rbind(CNV_CNVD_InsulationScore_GM12878_NSCLC, tmp_cnv_block_ins)
  # tandom sites 
  tmp_random_number <- dim(tmp_sites)[1] # for each chromosome random select some sites with the same number of TADs
  tmp_random_site <- GM12878_is_10kb[[i]][ sample( x = 1:dim(GM12878_is_10kb[[i]])[1], size = tmp_random_number), ]
  tmp_random_ins <- Sites_IS( Sites = tmp_random_site, insulation_score_list = GM12878_is_10kb[[i]], flank = 100)
  random_CNVD_insulationScore_GM12878_NSCLC <- rbind(random_CNVD_insulationScore_GM12878_NSCLC,  tmp_random_ins)
}
median_ins_CNV_CNVD_NSCLC <- apply(CNV_CNVD_InsulationScore_GM12878_NSCLC, 2, function(x) median( as.numeric(x), na.rm = T) )
median_random_insulationScore_GM12878_CNVD_NSCLC <- apply(random_CNVD_insulationScore_GM12878_NSCLC, 2, function(x) median( as.numeric(x), na.rm = T) )

# 2.4 draw the insulation score around the cnv sites
png(filename = '../CNV_TAD_boundary/Human disease (NSCLC) cnv boundary and TAD boundary (GM12878).png', width = 1024, height = 1024)
par(cex = 3, mgp = c(1.5, 0.5, 0), lwd = 3, tcl =-0.1, mar = c(3,3,3,2))
plot( median_ins_CNV_CNVD_NSCLC, type = 'l', 
      xlab = 'Genomic Regions', 
      ylab = 'Median Insulation Score',
      main = 'CNV Breakpoints in Cancers \nand TAD boundaris in GM12878',
      ylim = c(min(median_ins_CNV_CNVD_NSCLC, GM12878_median_TAD_ins,median_random_insulationScore_GM12878_CNVD_NSCLC, na.rm = T), 
               max(median_ins_CNV_CNVD_NSCLC, GM12878_median_TAD_ins,median_random_insulationScore_GM12878_CNVD_NSCLC, na.rm = T) ),
      col = "red",
      lwd = 5, 
      #xlim = , 
      axes = FALSE)
lines( GM12878_median_TAD_ins, lwd = 5, col = "black" )
lines(median_random_insulationScore_GM12878_CNVD_NSCLC, lwd = 5, col = "blue")
axis(side = 1, at = c(1, 50, 101, 151, 201), labels = c('-1000kb', '-500kb','0kb','500kb', '1000kb' ))
axis(side = 2)
legend('bottomright', bty = 'n', cex = 0.6, 
       legend = c('CNV Breakpoints\n(NSCLC)', 'TAD Boudaries', 'Random Sites'),
       col = c( 'red', 'black', 'blue'), pch = 19)
dev.off()
back to top