# calmod.r
# source: https://github.com/jsollari/popABC
# requires the VGAM package, which provides vglm() and the multinomial family
library(VGAM)

calmod <- function(target,x,sumstat,tol,gwt,rejmethod=TRUE)
{
# This function uses categorical (multinomial logistic) regression to estimate the posterior
#      probability of a particular model under the ABC framework, P(Y = y | S = s).
#
# target is the set of target summary stats - i.e. s, what you measured from the data.
# x is the model indicator, Y (a long vector of values from the simulations), and is the dependent
#          variable for the regression. It is categorical, taking values 1, 2, ..., n where there
#          are n categories (models).
# sumstat is an array of simulated summary stats, S (i.e. the independent variables).
# tol is the required proportion of points nearest the target values.
# gwt is a vector of T/F weights, weighting out any 'bad' values (as flagged by the simulation
#          program - e.g. NaNs).
# if rejmethod=T the regression step is skipped and plain rejection is done instead.
#
# If rejmethod=F it returns a list with the following components:
#
# $x1 - expected value on the logit scale, with standard errors of the estimate.
# $x2 - expected value on the natural scale, i.e. P(Y = y | S = s). This is what we would
#          normally report.
# $vals - the Ys in the rejection region. The proportion of the different Ys (different models)
#          is a Pritchard et al.-style, rejection-based estimate of the posterior probability of
#          the different models. You might get some improvement by weighting the frequencies
#          with $wt (see the helper after the function definitions below).
# $wt - the Epanechnikov weights.
# $ss - the sumstats corresponding to the points in $vals.
#
# A usage sketch is given at the end of this file.



if(missing(gwt)) gwt <- rep(TRUE,nrow(sumstat))

nss <- ncol(sumstat)


# scale everything 

    scaled.sumstat <- sumstat
    
    for(j in 1:nss){
    
    	scaled.sumstat[,j] <- normalise(sumstat[,j],sumstat[,j][gwt])
    }
    target.s <- target
    
    for(j in 1:nss){
    
    	target.s[j] <- normalise(target[j],sumstat[,j][gwt])
    }
    
# calc euclidean distance

    sum1 <- 0
    for(j in 1:nss){
    	sum1 <- sum1 + (scaled.sumstat[,j]-target.s[j])^2
    }
    dst <- sqrt(sum1)

# the 'bad' points (gwt == F) are pushed beyond the largest valid distance, so they still
# count towards the tolerance quantile but can never be accepted
    dst[!gwt] <- floor(max(dst[gwt])+10)
    

# wt1 defines the region we're interested in 
    abstol <- quantile(dst,tol)
    wt1 <- dst < abstol
    nit <- sum(wt1)
    
    if(rejmethod){
        l1 <- list(x=x[wt1],wt=0)
    }
    else{
        # Epanechnikov kernel weights on the accepted points
        regwt <- 1-dst[wt1]^2/abstol^2
        
        # build a 0/1 indicator matrix with one column per model category
        catx <- as.numeric(names(table(x[wt1])))
        ncat <- length(catx)
        yy <- matrix(nrow=nit,ncol=ncat)
        for(j in 1:ncat){
        	yy[,j] <- as.numeric(x[wt1] == catx[j])
        }
        
        tr <- list()

        for(j in 1:nss){
        	tr[[j]] <- scaled.sumstat[wt1,j]
        }
        
        xvar.names <- paste("v",as.character(c(1:nss)),sep="")
        
        names(tr) <- xvar.names
        
        fmla <- as.formula(paste("yy ~ ", paste(xvar.names, collapse= "+")))
        
#        fit1 <- vglm(fmla,data=tr,multinomial)   # this is the version described in the
#               manuscript, which did not use the Epanechnikov weights.
        

        fit1 <- vglm(fmla,data=tr,multinomial,weights=regwt)
        
        target.df <- list()
        for(j in 1:nss){
        	target.df[[j]] <- target.s[j]
        }
        names(target.df) <- xvar.names
        
        prediction1 <- predict(fit1,target.df,se.fit=T)
        prediction2 <- predict(fit1,target.df,type="response")
        
        l1 <- list(x1=prediction1,x2=prediction2,vals=x[wt1],wt=regwt,ss=sumstat[wt1,])
        
    }
    l1
}
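
# Illustrative helper (not part of the original popABC code; the name calmod.rejpost is made
# up for this sketch). Given the output of calmod(), it computes the Pritchard et al.-style,
# rejection-based estimate of the model posterior mentioned in the comments above, optionally
# weighting each accepted point by its Epanechnikov weight $wt.
calmod.rejpost <- function(res,weighted=TRUE){

    vals <- if(!is.null(res$vals)) res$vals else res$x           # $vals (regression) or $x (rejection)
    w <- if(weighted && length(res$wt) == length(vals)) res$wt else rep(1,length(vals))
    tab <- tapply(w,vals,sum)                                    # (weighted) count of points per model
    tab/sum(tab)                                                 # normalise to model probabilities
}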


# normalise x using the mean and standard deviation of y (the 'good' simulated values);
# if y has zero variance, just centre x on the mean of y
normalise <- function(x,y){

    if(var(y) == 0) return(x - mean(y))
    (x - mean(y))/sqrt(var(y))
}
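
# Illustrative usage sketch (not part of the original popABC code). The toy objects below
# (sim.models, sim.stats, obs.stats) are made up for the example; in practice they would
# come from your own simulations and from the observed data.
sim.models <- sample(1:2,5000,replace=TRUE)                      # model index per simulation
sim.stats  <- matrix(rnorm(5000*3,mean=sim.models),ncol=3)       # 3 summary stats per simulation
obs.stats  <- c(1.5,1.5,1.5)                                     # 'observed' summary stats

res <- calmod(obs.stats,sim.models,sim.stats,tol=0.01,rejmethod=FALSE)
res$x2                               # regression-adjusted posterior model probabilities
calmod.rejpost(res)                  # weighted rejection-based estimate, for comparison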

