https://github.com/davolilab/Proteogenomic-Analysis-of-Aneuploidy
Tip revision: 9aa99245ac462b4134976293e52f56650ecb5c00 authored by breezyzhao on 23 August 2022, 23:15:57 UTC
Delete license
Delete license
Tip revision: 9aa9924
purity.R
### this script is used to organize CPTAC purity data
### two kinds of purity data
### first: estimated from RNA
### second: tumor nuclei from HE stain
setwd("/Users/pc2644/Documents/DM_Aneuploidy/Compensation/PanAnalysis")
library(ggplot2)
library(dplyr)
rm(list=ls())
### first organize estimate purity data
### colon2
path_colon <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/CO/AWG_data_freeze/Human__CPTAC_COAD__MS__Clinical__Clinical__03_01_2017__CPTAC__Clinical__BCM.tsi"
sample <- read.delim(path_colon)
sample <- data.frame(t(sample))
colnames(sample) <- sample[1,]
sample <- sample[-1,]
purity_colon <- data.frame(samples=rownames(sample),
estiPurity=sample$TumorPurity)
rm(sample, path_colon)
### breast2
path_breast <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/BR/AWG_data_freeze/prosp-brca-v5.3-sample-annotation.csv"
sample <- read.csv(path_breast)
purity_breast <- data.frame(samples=sample$Sample.ID,
estiPurity=sample$ESTIMATE.TumorPurity)
rm(sample, path_breast)
### ovarian2
# path_ovarian <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/OV/AWG_data_freeze/Ovary One Year Clinical Data_20160927.txt"
# sample <- read.delim(path_ovarian)
# rm(sample, path_ovarian)
### ccrcc
# path_ccrcc <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/ccRCC/AWG_data_freeze/CCRCC_September2018_case.tsv"
# sample <- read.delim(path_ccrcc)
# rm(sample, path_ccrcc)
### endometrial
path_endometrial <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/EC/AWG_data_freeze/HS_CPTAC_UCEC_CLI.txt"
sample <- read.delim(path_endometrial)
purity_endometrial <- data.frame(samples=gsub("-",".",sample$Proteomics_Participant_ID),
estiPurity=sample$Purity_Cancer)
rm(sample, path_endometrial)
### hnscc
path_hnscc <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/HNSCC/AWG_data_freeze/Meta_table.tsv"
sample <- read.delim(path_hnscc)
purity_hnscc <- data.frame(samples=gsub("-",".",sample$case_id),
estiPurity=sample$tumor_proportion)
rm(sample, path_hnscc)
### luad
path_luad <- "/Volumes/davolt01lab/davolt01labspace/Data/CPTAC_PanCan/CPTAC_PanCan_Clinical_Data/LUAD/AWG_data_freeze/luad-v3.2-sample-annotation.csv"
sample <- read.csv(path_luad)
purity_luad <- data.frame(samples=sample$Sample.ID,
estiPurity=sample$Tumor.Purity.byESTIMATE.RNAseq)
rm(sample, path_luad)
purity_estimate <- rbind(purity_colon, purity_breast, purity_endometrial, purity_hnscc, purity_luad)
purity_estimate <- na.omit(purity_estimate)
### read aneuploidy score
load("/Users/pc2644/Documents/DM_Aneuploidy/Compensation/PanAnalysis/pan_aneuploidy.RData")
indep_pan2 <- merge(indep_pan, purity_estimate, by.x="patients", by.y="samples", all.x=T, sort=F)
### read purity data from image
path_cohort <- "/Volumes/davolt01lab/davolt01labspace/Data/Proteogenomic_Fenyo/CPTAC3/CPTAC3_cohort.csv"
sample <- read.csv(path_cohort)
sample <- sample[sample$Specimen_Type=="tumor_tissue",]
### there are some tissues from the same patients, calculate the average of them
patients <- unique(sample$Patient_ID)
purity_nuclei <- data.frame(samples=patients,
nucleiPurity=rep(NA,length(patients)))
rownames(purity_nuclei) <- purity_nuclei$samples
# i=patients[1]
for (i in patients) {
index <- sample$Patient_ID==i
purity_nuclei[i,"nucleiPurity"] <- mean(sample$Percent_Tumor_Nuclei[index])
}
purity_nuclei$samples <- gsub("-", "." , purity_nuclei$samples)
rm(sample, path_cohort)
indep_pan2 <- merge(indep_pan2, purity_nuclei, by.x="patients", by.y="samples", all.x=T, sort=F)
indep_pan2$estiPurity <- as.numeric(indep_pan2$estiPurity)
indep_pan2$nucleiPurity <- as.numeric(indep_pan2$nucleiPurity)
save(indep_pan2, file = "CPTAC_purity.RData")
indep_pan2_plot <- na.omit(indep_pan2)
ggplot(indep_pan2_plot, aes(x=estiPurity, y=nucleiPurity, color=cancer)) +
geom_point() +
labs(x="purity from genomic data", y="purity from nuclei")
ggplot(indep_pan2, aes(x=cancer, y=estiPurity, color=cancer)) + geom_jitter()
ggplot(indep_pan2, aes(x=cancer, y=nucleiPurity, color=cancer)) + geom_jitter()