1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
### This script organizes CPTAC tumor purity data.
### Two kinds of purity data are used:
### first: purity estimated from RNA
### second: percentage of tumor nuclei from H&E staining

### Session setup.
### The working directory is set to the analysis folder; relative paths used
### later in the script (e.g. the save() of "CPTAC_purity.RData") resolve
### against it.
setwd("/Users/pc2644/Documents/DM_Aneuploidy/Compensation/PanAnalysis")

### ggplot2 provides the plotting calls at the end of the script.
### NOTE(review): no dplyr function appears to be called in the visible part
### of this script -- confirm before dropping the dependency.
library(ggplot2)
library(dplyr)

### Remove all (non-hidden) objects from the current environment so the script
### starts from a clean state.
### NOTE(review): when the script is source()'d from an interactive session
### this also wipes the caller's workspace.
rm(list=ls())

### first organize estimate purity data
### colon2
### The colon clinical table is treated as attributes-by-samples: it is
### transposed so that each row is a sample, the former first row (attribute
### names) becomes the header, and the TumorPurity column is extracted.
path_colon <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/CO/AWG_data_freeze/Human__CPTAC_COAD__MS__Clinical__Clinical__03_01_2017__CPTAC__Clinical__BCM.tsi"
### stringsAsFactors is pinned so the result does not depend on the R version
### default (factors would turn names/values into integer codes below).
sample <- read.delim(path_colon, stringsAsFactors = FALSE)
sample <- data.frame(t(sample), stringsAsFactors = FALSE)
### Use an explicit character vector for the header instead of assigning a
### one-row data.frame and relying on implicit list-to-character coercion.
colnames(sample) <- as.character(unlist(sample[1, ], use.names = FALSE))
### drop = FALSE keeps a data.frame even if only one attribute column exists.
sample <- sample[-1, , drop = FALSE]
### samples: row names after the transpose (the original column headers).
### estiPurity: left as read (character); it is converted to numeric later in
### the script.
purity_colon <- data.frame(samples = rownames(sample),
                           estiPurity = sample$TumorPurity,
                           stringsAsFactors = FALSE)
rm(sample, path_colon)

### breast2
### Breast cohort: sample IDs come from the Sample.ID column and purity from
### the ESTIMATE.TumorPurity column of the sample annotation CSV.
breast_csv <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/BR/AWG_data_freeze/prosp-brca-v5.3-sample-annotation.csv"
breast_annot <- read.csv(file = breast_csv)
breast_ids <- breast_annot$Sample.ID
breast_purity <- breast_annot$ESTIMATE.TumorPurity
purity_breast <- data.frame(samples = breast_ids, estiPurity = breast_purity)
### temporaries are removed so only purity_breast is left behind
rm(list = c("breast_csv", "breast_annot", "breast_ids", "breast_purity"))

### ovarian2
# path_ovarian <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/OV/AWG_data_freeze/Ovary One Year Clinical Data_20160927.txt"
# sample <- read.delim(path_ovarian)
# rm(sample, path_ovarian)

### ccrcc
# path_ccrcc <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/ccRCC/AWG_data_freeze/CCRCC_September2018_case.tsv"
# sample <- read.delim(path_ccrcc)
# rm(sample, path_ccrcc)

### endometrial
### Endometrial cohort: purity is read from the Purity_Cancer column; "-" in
### Proteomics_Participant_ID is replaced by "." to build the sample key.
endometrial_file <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/EC/AWG_data_freeze/HS_CPTAC_UCEC_CLI.txt"
endometrial_clin <- read.delim(file = endometrial_file)
endometrial_ids <- gsub("-", ".", endometrial_clin$Proteomics_Participant_ID,
                        fixed = TRUE)
purity_endometrial <- data.frame(
  samples = endometrial_ids,
  estiPurity = endometrial_clin$Purity_Cancer
)
### temporaries are removed so only purity_endometrial is left behind
rm(list = c("endometrial_file", "endometrial_clin", "endometrial_ids"))

### hnscc
### Head-and-neck cohort: purity is read from the tumor_proportion column;
### "-" in case_id is replaced by "." to build the sample key.
hnscc_file <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/HNSCC/AWG_data_freeze/Meta_table.tsv"
hnscc_meta <- read.delim(file = hnscc_file)
hnscc_ids <- gsub("-", ".", hnscc_meta$case_id, fixed = TRUE)
purity_hnscc <- data.frame(
  samples = hnscc_ids,
  estiPurity = hnscc_meta$tumor_proportion
)
### temporaries are removed so only purity_hnscc is left behind
rm(list = c("hnscc_file", "hnscc_meta", "hnscc_ids"))

### luad
### Lung adenocarcinoma cohort: sample IDs come from Sample.ID and purity from
### the Tumor.Purity.byESTIMATE.RNAseq column of the sample annotation CSV.
luad_csv <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/LUAD/AWG_data_freeze/luad-v3.2-sample-annotation.csv"
luad_annot <- read.csv(file = luad_csv)
purity_luad <- data.frame(
  samples = luad_annot$Sample.ID,
  estiPurity = luad_annot$Tumor.Purity.byESTIMATE.RNAseq
)
### temporaries are removed so only purity_luad is left behind
rm(list = c("luad_csv", "luad_annot"))

### Stack the per-cohort purity tables (same two columns each) into one table,
### then drop every row that contains a missing value.
purity_estimate <- na.omit(
  do.call(
    rbind,
    list(purity_colon, purity_breast, purity_endometrial, purity_hnscc,
         purity_luad)
  )
)

### read aneuploidy score
### load() restores the objects stored in the .RData file into the current
### environment; `indep_pan` used below is presumably one of them (it is not
### created anywhere else in this script).
load("/Users/pc2644/Documents/DM_Aneuploidy/Compensation/PanAnalysis/pan_aneuploidy.RData")

### Left join: keep every row of indep_pan and attach the estimated purity
### where indep_pan$patients matches purity_estimate$samples; row order is not
### re-sorted. TRUE/FALSE are spelled out because T/F are ordinary variables
### that a loaded object could shadow.
indep_pan2 <- merge(indep_pan, purity_estimate,
                    by.x = "patients", by.y = "samples",
                    all.x = TRUE, sort = FALSE)

### read purity data from image
### Builds purity_nuclei: one row per patient with the mean of
### Percent_Tumor_Nuclei over that patient's tumor specimens.
path_cohort <- "/Volumes/davolt01lab/davolt01labspace/Data/Proteogenomic_Fenyo/CPTAC3/CPTAC3_cohort.csv"
sample <- read.csv(path_cohort, stringsAsFactors = FALSE)
### Keep tumor specimens with a known patient ID. which() is used so that rows
### whose Specimen_Type is NA are dropped instead of becoming all-NA rows
### (which would break the row-name assignment below).
sample <- sample[which(sample$Specimen_Type == "tumor_tissue" &
                         !is.na(sample$Patient_ID)), ]
### IDs are handled as character so they are always matched by name, never by
### position.
sample$Patient_ID <- as.character(sample$Patient_ID)
### there are some tissues from the same patients, calculate the average of them
patients <- unique(sample$Patient_ID)
### Group mean per patient; no na.rm, so a patient with any missing
### Percent_Tumor_Nuclei value gets NA.
nuclei_mean <- vapply(split(sample$Percent_Tumor_Nuclei, sample$Patient_ID),
                      mean, numeric(1))
### match() restores the first-appearance order of `patients` (split() orders
### groups by sorted ID).
purity_nuclei <- data.frame(
  samples = patients,
  nucleiPurity = unname(nuclei_mean[match(patients, names(nuclei_mean))]),
  stringsAsFactors = FALSE
)
### Row names keep the original IDs; the samples column gets "-" replaced by
### "." to build the key used for merging.
rownames(purity_nuclei) <- purity_nuclei$samples
purity_nuclei$samples <- gsub("-", "." , purity_nuclei$samples)
rm(sample, path_cohort, nuclei_mean)

### Left join: keep every row of indep_pan2 and attach the nuclei-based purity
### where indep_pan2$patients matches purity_nuclei$samples; row order is not
### re-sorted. TRUE/FALSE are spelled out because T/F are ordinary variables
### that can be reassigned.
indep_pan2 <- merge(indep_pan2, purity_nuclei,
                    by.x = "patients", by.y = "samples",
                    all.x = TRUE, sort = FALSE)
### Force both purity columns to numeric.
indep_pan2$estiPurity <- as.numeric(indep_pan2$estiPurity)
indep_pan2$nucleiPurity <- as.numeric(indep_pan2$nucleiPurity)
### Relative path: the file is written to the current working directory.
save(indep_pan2, file = "CPTAC_purity.RData")

### Scatter plot of the two purity measures against each other, coloured by
### cancer type; rows with a missing value in any column are dropped first.
indep_pan2_plot <- na.omit(indep_pan2)
ggplot(data = indep_pan2_plot,
       mapping = aes(x = estiPurity, y = nucleiPurity, colour = cancer)) +
  geom_point() +
  xlab("purity from genomic data") +
  ylab("purity from nuclei")

### Jittered per-cancer distribution of the estimated purity (all rows).
ggplot(data = indep_pan2,
       mapping = aes(x = cancer, y = estiPurity, colour = cancer)) +
  geom_jitter()

### Jittered per-cancer distribution of the nuclei-based purity (all rows).
ggplot(data = indep_pan2,
       mapping = aes(x = cancer, y = nucleiPurity, colour = cancer)) +
  geom_jitter()