https://github.com/cran/AntAngioCOOL
Tip revision: 45df2234862a7196e883c1ff1bbf90ab27d6066b authored by Javad Zahiri on 01 August 2016, 14:01:06 UTC
version 1.2
version 1.2
Tip revision: 45df223
AntAngiCOOL.R
#' @title AntAngioCOOL
#' @description Machine learning based package to predict anti-angiogenic peptides using heterogeneous sequence descriptors.
#' @details AntAngioCOOL is a machine learning based package to predict anti-angiogenic peptides using heterogeneous sequence descriptors.
#' @details This package consists of three different predictors according to the obtained performances on an independent test set: sensitive model, specific model and accurate model. These models have been built using the gold standard dataset published by Ramaprasad et al. (Ettayapuram Ramaprasad et al., 2015).
#' @details Four different features have been used to encode peptides:
#' @details 1- Pseudo Amino Acid Composition (PseAAC) that has been used effectively in predicting cell penetrating peptides (Chen, Chu, Huang, Kong, & Cai, 2015). Unlike the simple amino acid composition, PseAAC considers the sequence-order information of the peptide.
#' @details 2- K-mer composition that shows the fraction of all possible subsequences with length k in the given peptide. To compute k-mer composition features, the reduced amino acid alphabet proposed by Zahiri et al. (Zahiri et al., 2014) has been exploited: the 20-letter amino acid alphabet has been reduced to a new alphabet of size 8 according to 544 physicochemical and biochemical indices extracted from the AAIndex database (Kawashima et al., 2008) (C1={A, E}, C2={I, L, F, M, V}, C3={N, D, T, S}, C4={G}, C5={P}, C6={R, K, Q, H}, C7={Y, W}, C8={C}). We have computed k-mer composition for k=2,3,4 for each peptide.
#' @details 3- Physico-chemical profile : To compute these features, 544 different physico-chemical indices have been extracted from AAINDEX (Kawashima et al., 2008). To remove redundant indices, a subset of indices with correlation coefficient less than 0.8 and greater than -0.8 has been selected. This feature type is computed for the 5 amino acids at the N-terminus (5-NT) and the 5 amino acids at the C-terminus (5-CT).
#' @details 4- Atomic profile : A 50-dimensional feature vector has been used to encode each peptide according to its atomic properties (frequency of five types of atoms: C, H, N, O, S). For details of the atomic composition of each of the 20 natural amino acids see Kumar et al., 2015.
#' @details References
#' @details 1- Chen, L., Chu, C., Huang, T., Kong, X., & Cai, Y.-D. (2015). Prediction and analysis of cell-penetrating peptides using pseudo-amino acid composition and random forest models. Amino Acids, 47(7), 1485-93. http://doi.org/10.1007/s00726-015-1974-5
#' @details 2- Ettayapuram Ramaprasad, A. S., Singh, S., Gajendra P. S, R., Venkatesan, S., Brem, S., Cotran, R., . Stephens, R. (2015). AntiAngioPred: A Server for Prediction of Anti-Angiogenic Peptides. PLOS ONE, 10(9), e0136990. http://doi.org/10.1371/journal.pone.0136990
#' @details 3- Kawashima, S., Pokarowski, P., Pokarowska, M., Kolinski, A., Katayama, T., & Kanehisa, M. (2008). AAindex: amino acid index database, progress report 2008. Nucleic Acids Research, 36(Database issue), D202-5. http://doi.org/10.1093/nar/gkm998
#' @details 4- Kumar, R., Chaudhary, K., Singh Chauhan, J., Nagpal, G., Kumar, R., Sharma, M., & Raghava, G. P. S. (2015). An in silico platform for predicting, screening and designing of antihypertensive peptides. Scientific Reports, 5, 12512. http://doi.org/10.1038/srep12512
#' @details 5- Zahiri, J., Mohammad-Noori, M., Ebrahimpour, R., Saadat, S., Bozorgmehr, J. H., Goldberg, T., & Masoudi-Nejad, A. (2014). LocFuse: Human protein-protein interaction prediction via classifier fusion using protein localization information. Genomics, 104(6), 496-503.
#' @author Babak Khorsand
#' @import caret
#' @import rJava
#' @import RWeka
#' @import rpart
#' @importFrom stats predict
#' @export AntAngioCOOL
#' @param Input_Sequence A peptide sequence, given as a single character string of at least 10 residues.
#' @param Classifier 1 if you want to get the prediction from the model with maximum Accuracy (according to the independent test results), 2 if maximum Sensitivity is desired and 3 if maximum Specificity is desired.
#' @param SF if True then all 2343 selected features (out of 175062 features) that had been used for prediction will be returned.
#' @param AF if True then all 175062 extracted features will be returned.
#' @return Predicted class (Anti-angiogenic/ Not anti-angiogenic) of the input peptide and a subset of descriptors upon request.
#' @examples
#' AntAngioCOOL("AAPFLECQGN",2,SF=TRUE)
AntAngioCOOL <- function(Input_Sequence, Classifier = 1, SF = FALSE, AF = FALSE) {
  # Predicts whether a peptide is anti-angiogenic from sequence descriptors.
  #
  # Input_Sequence: a single peptide string with at least 10 residues.
  # Classifier:     2 selects modelFit2, 3 selects modelFit3, any other value
  #                 selects modelFit1.
  # SF:             when TRUE, the selected features (after removal of the
  #                 `nzv` positions and transformation by `standardObj`) are
  #                 returned alongside the prediction.
  # AF:             when TRUE, the complete feature vector is returned too.
  #
  # Returns the prediction message alone when neither SF nor AF is set,
  # otherwise a list holding Prediction_Result plus Selected_Features and/or
  # All_Features (in that order).
  #
  # Features, Atomic_Profile, AAIndex193_Table, nzv, standardObj and
  # modelFit1/2/3 are not defined in this file; they are presumably shipped
  # as internal package data -- TODO confirm.

  # Fail early, with a clear message, on anything that is not one string.
  if (!is.character(Input_Sequence) || length(Input_Sequence) != 1L ||
      is.na(Input_Sequence)) {
    stop("Input Sequence must be a single character string")
  }
  Seq_Length <- nchar(Input_Sequence)
  # A peptide of exactly 10 residues is accepted, so the message says
  # "at least 10".
  if (Seq_Length < 10) {
    stop("Input Sequence must have length of at least 10")
  }

  # Composition of every k-mer listed in `Features` between `first` and
  # `last` (inclusive), returned as a numeric vector named by k-mer.
  # The count is divided by the number of listed k-mers, not by the number of
  # k-mers in the peptide; this scaling is kept because the downstream
  # objects consume these values as they are.
  kmer_composition <- function(sequence, first, last) {
    alphabet <- as.character(
      Features[which(Features == first):which(Features == last)]
    )
    k <- nchar(first)
    starts <- seq_len(Seq_Length - k + 1L)
    observed <- substring(sequence, starts, starts + k - 1L)
    # match() yields NA for k-mers that are not listed; tabulate() ignores NA.
    counts <- tabulate(match(observed, alphabet), nbins = length(alphabet))
    composition <- counts / length(alphabet)
    names(composition) <- alphabet
    composition
  }

  # Atomic descriptors (columns 2 to 9 of Atomic_Profile) of one residue,
  # with `suffix` appended to every column name.
  atomic_profile_of <- function(residue, suffix) {
    profile <- Atomic_Profile[Atomic_Profile$AminoAcids == residue, 2:9]
    if (nrow(profile) != 1L) {
      stop(
        "Unsupported residue '", residue,
        "' in the first or last five positions of Input Sequence"
      )
    }
    names(profile) <- paste0(names(profile), suffix)
    vapply(profile, function(x) as.numeric(as.character(x)), numeric(1))
  }

  # Column of AAIndex193_Table belonging to one residue, named after the
  # entries of the table's first column with `suffix` appended.
  aaindex_of <- function(residue, suffix) {
    values <- AAIndex193_Table[[residue]]
    names(values) <- paste0(AAIndex193_Table[[1]], suffix)
    values
  }

  # A, AA, AAA, AAAA ----
  Extracted_Features <- c(
    kmer_composition(Input_Sequence, "A", "V"),
    kmer_composition(Input_Sequence, "AA", "VV"),
    kmer_composition(Input_Sequence, "AAA", "VVV"),
    kmer_composition(Input_Sequence, "AAAA", "VVVV")
  )

  # Amino8 ----
  # Recode the 20 residues into 8 classes:
  # a={A,E} b={I,L,F,M,V} c={N,D,T,S} d={G} e={P} f={R,K,Q,H} g={Y,W} h={C}
  Amino8 <- chartr(
    "AEILFMVNDTSGPRKQHYWC",
    "aabbbbbccccdeffffggh",
    Input_Sequence
  )

  # a, aa, aaa, aaaa ----
  # The recoded string itself must be scanned here: the lowercase class
  # labels cannot match upper-case residue codes, so scanning Input_Sequence
  # would leave every one of these features at 0.
  # NOTE(review): assumes the fitted objects were built from genuinely
  # recoded sequences (or that `nzv` drops these columns) -- confirm.
  Extracted_Features <- c(
    Extracted_Features,
    kmer_composition(Amino8, "a", "h"),
    kmer_composition(Amino8, "aa", "hh"),
    kmer_composition(Amino8, "aaa", "hhh"),
    kmer_composition(Amino8, "aaaa", "hhhh")
  )

  # Terminal residues ----
  Sequence <- strsplit(Input_Sequence, "")[[1]]
  first_five <- Sequence[1:5]
  last_five <- Sequence[(Seq_Length - 4):Seq_Length]

  # Atomic Profile ----
  # The "_C" suffix labels the first five residues and "_N" the last five.
  # The labels are feature names, so they are left untouched. The order of
  # the pieces is kept as well because `nzv` addresses features by position.
  for (i in 1:5) {
    Extracted_Features <- c(
      Extracted_Features,
      atomic_profile_of(first_five[i], paste0("_C", i)),
      atomic_profile_of(last_five[i], paste0("_N", i))
    )
  }

  # PhysicoChemical AAC ----
  for (i in 1:5) {
    Extracted_Features <- c(
      Extracted_Features,
      aaindex_of(first_five[i], paste0("_C", i))
    )
    Extracted_Features <- c(
      Extracted_Features,
      aaindex_of(last_five[i], paste0("_N", i))
    )
  }

  # Feature selection and transformation ----
  AllFeatures <- Extracted_Features
  Extracted_Features <- Extracted_Features[-nzv]
  # The variable name becomes the column name of the data frame and thereby
  # the row name of the transposed one-row matrix.
  Extracted_Features <- data.frame(Extracted_Features, stringsAsFactors = FALSE)
  Extracted_Features[, 1] <- as.numeric(Extracted_Features[, 1])
  Extracted_Features <- t(Extracted_Features)
  Extracted_Features <- predict(standardObj, Extracted_Features)

  # Prediction ----
  if (Classifier == 2) {
    # This model is queried for a class label; label 0 is reported as
    # "NOT Anti-Angiogenic".
    predicted_class <- predict(modelFit2, Extracted_Features)
    Result <- if (predicted_class == 0) {
      "Query peptide is NOT Anti-Angiogenic"
    } else {
      "Query peptide is Anti-Angiogenic"
    }
  } else {
    model <- if (Classifier == 3) modelFit3 else modelFit1
    probabilities <- predict(model, Extracted_Features, type = "prob")
    # Column 1 is reported as the "NOT" class, column 2 as the positive one.
    negative <- probabilities[1, 1]
    positive <- probabilities[1, 2]
    Result <- if (negative > positive) {
      paste0(
        "Query peptide is NOT Anti-Angiogenic (Prediction Probability: ",
        round(negative, 2), ")"
      )
    } else {
      paste0(
        "Query peptide is Anti-Angiogenic (Prediction Probability: ",
        round(positive, 2), ")"
      )
    }
  }

  # Assemble the return value ----
  Result_List <- list(Prediction_Result = Result)
  if (SF) {
    Result_List$Selected_Features <- Extracted_Features
  }
  if (AF) {
    Result_List$All_Features <- AllFeatures
  }
  if (length(Result_List) == 1L) {
    return(Result)
  }
  Result_List
}