https://github.com/cran/CBPS
Tip revision: 1d21c5b620c52bc45c85460ba6a02be196257000 authored by Christian Fong on 29 December 2016, 23:13:52 UTC
version 0.13
version 0.13
Tip revision: 1d21c5b
CBMSM.R
library(numDeriv)
library(MASS)
CBMSM<-function(formula, id, time, data, type="MSM", twostep = TRUE, msm.variance = "approx", time.vary = FALSE, ...){
if (missing(data))
data <- environment(formula)
call <- match.call()
family <- binomial()
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- as.name("model.frame")
mf <- eval(mf, parent.frame())
mt <- attr(mf, "terms")
Y <- model.response(mf, "any")
if (length(dim(Y)) == 1L) {
nm <- rownames(Y)
dim(Y) <- NULL
if (!is.null(nm))
names(Y) <- nm
}
X <- if (!is.empty.model(mt)) model.matrix(mt, mf)#[,-2]
else matrix(, NROW(Y), 0L)
##Format treatment matrix
id<-as.numeric(as.factor(id))
unique.id<-sort(unique(id))
treat.hist<-matrix(NA,nrow=length(unique.id),ncol=length(unique(time)))
colnames(treat.hist)<-sort(unique(time))
rownames(treat.hist)<-unique.id
for(i in 1:length(unique(unique.id))) for(j in sort(unique(time)))
{{
treat.hist[i,j]<-Y[id==unique.id[i] & time==j]
}}
#treat.hist.fac<-apply(treat.hist,1,function(x) paste(x, collapse="+"))
cm.treat<-rowSums(treat.hist)
if(type=="MSM") {
MultiBin.fit<-FALSE
}
if(type=="MultiBin"){
MultiBin.fit<-TRUE
X<-cbind(1,X[,apply(X,2,sd)>0])
names.X<-c("Intercept",colnames(X)[-1])
}
fit <- eval(call("CBMSM.fit", treat = Y, X = X, id = id, time=time,
MultiBin.fit = MultiBin.fit, twostep = twostep, msm.variance = msm.variance,
time.vary = time.vary))
fit$call<-call
fit$formula<-formula
fit$y<-Y
fit$x<-X
fit$id<-id
fit$time<-time
fit$model<-mf
fit$data<-data
fit$treat.hist<-treat.hist
fit$treat.cum<-rowSums(treat.hist)
fit$weights<-fit$weights[time==min(time)]
fit
}
########################
###Calls loss function
########################
CBMSM.fit<-function(treat, X, id, time, MultiBin.fit, twostep, msm.variance, time.vary, ...){
id0<-id
id<-as.numeric(as.factor(id0))
if(msm.variance=="approx") full.var<-FALSE
if(msm.variance=="full") full.var<-TRUE
X.mat<-X
X.mat<-X.mat[,apply(X.mat,2,sd)>0, drop = FALSE]
##Format design matrix, run glm
glm1<-glm(treat~X.mat,family="binomial")
##################
##Make SVD matrix of covariates
##and matrix of treatment history
##################
#if(time.vary==FALSE){
X.svd<-X.mat
#X.svd<-apply(X.svd,2,FUN=function(x) (x-mean(x))/sd(x), drop=FALSE)
X.svd<-scale(X.svd) # Edit by Christian; this was causing an error
#X.svd[,c(1,2,7)]<-X.svd[,c(1,2,7)]*10
X.svd<-svd(X.svd)$u%*%diag(svd(X.svd)$d>0.0001)
X.svd<-X.svd[,apply(X.svd,2,sd)>0,drop=FALSE]
glm1<-glm(treat~X.svd,family="binomial")
if(time.vary==TRUE){
#} else{
X.svd<-NULL
for(i in sort(unique(time))){
X.sub<-X.mat[time==i,,drop=FALSE]
#X.sub<-apply(X.sub,2,FUN=function(x) (x-mean(x))/sd(x))
X.sub <- scale(X.sub) # Edit by Christian; this was causing an error
X.sub[is.na(X.sub)]<-0
X.sub<-svd(X.sub)$u%*%diag(svd(X.sub)$d>0.0001)
X.sub<-X.sub[,apply(X.sub,2,sd)>0,drop=FALSE]
X.svd<-rbind(X.svd,X.sub)
}
##Make matrix of time-varying glm starting vals
glm.coefs<-NULL
n.time<-length(unique(time))
for(i in 1:n.time){
glm.coefs<-cbind(glm.coefs, summary(glm(treat~X.svd, subset=(time==i)))$coef[,1])
}
glm.coefs[is.na(glm.coefs)]<-0
glm1$coefficients<-as.vector(glm.coefs)
}
##################
## Start optimization
##################
#Twostep is true
msm.loss1<-function(x,...) msm.loss.func(betas=x, X=cbind(1,X.svd), treat=treat, time=time,...)$loss
glm.fit<-msm.loss.func(glm1$coef,X=cbind(1,X.svd),time=time,treat=treat,full.var=full.var,twostep=FALSE)
#print(head(glm.fit$V))[,1:10]
##Twostep is true; full variance option is passed
if(twostep==TRUE){
Vcov.inv<-glm.fit
msm.opt<-optim(glm1$coef,msm.loss1,full.var=full.var,Vcov.inv=Vcov.inv$V,bal.only=TRUE,twostep=TRUE,method="BFGS")
msm.fit<-msm.loss.func(msm.opt$par,X=cbind(1,X.svd), treat=treat, time=time, full.var=full.var,Vcov.inv=Vcov.inv$V,bal.only=TRUE,twostep=TRUE)
}
##Twostep is false; full variance option is passed
if(twostep==FALSE) {
msm.opt<-optim(glm1$coef,msm.loss1,full.var=full.var,bal.only=TRUE,twostep=FALSE,method="BFGS")
msm.fit<-msm.loss.func(msm.opt$par,X=cbind(1,X.svd), treat=treat, time=time, full.var=full.var,Vcov.inv=Vcov.inv$V,bal.only=TRUE,twostep=FALSE)
}
##################
## Calculate unconditional probs and treatment matrix
##################
n.obs<-length(unique(id))
n.time<-length(unique(time))
treat.hist<-matrix(NA,nrow=n.obs,ncol=n.time)
name.cands<-sort(unique(id))
for(i in 1:n.obs) for(j in 1:n.time) treat.hist[i,j]<-treat[id==name.cands[i] & time==j ]
treat.hist.unique<-unique(treat.hist,MAR=1)
treat.unique<-rep(NA,n.obs)
for(i in 1:n.obs) treat.unique[i]<- which(apply(treat.hist.unique,1,FUN=function(x) sum((x-treat.hist[i,])^2) )==0)
treat.unique<-as.factor(treat.unique)
uncond.probs.cand<-rep(0,n.obs)
for(i in 1:n.obs) {for(j in 1:n.obs) {
check<-mean(treat.hist[j,]==treat.hist[i,])==1
if(check) uncond.probs.cand[i]<-uncond.probs.cand[i]+1
}
}
uncond.probs.cand<-uncond.probs.cand/n.obs
###########
##Produce Weights
###########
wts.out<-rep(uncond.probs.cand/msm.fit$pr,n.time)[time==1]
probs.out<-msm.fit$pr
uncond.probs<-uncond.probs.cand
loss.glm<-glm.fit$loss
loss.msm<-msm.fit$loss
if(loss.glm<loss.msm){
warning("CBMSM fails to improve covariate balance relative to MLE. \n GLM loss: ", glm.fit$loss, "\n CBMSM loss: ", msm.fit$loss, "\n")
}
# I know I'm putting probs.out in the weights and wts.out in the fitted values, but that
# is what Marc said to do
out<-list("weights"=probs.out,"fitted.values"=wts.out,"id"=id0[1:n.obs],"glm.g"=glm.fit$g.all,"msm.g"=msm.fit$g.all,"glm.weights"=(uncond.probs/glm.fit$pr)[time==1])
class(out)<-c("CBMSM","list")
return(out)
}
########################
###Loss function for MSM
########################
msm.loss.func<-function(betas,X=X,treat=treat,time=time,bal.only=F,time.sub=0,twostep=FALSE, Vcov.inv=NULL,full.var=FALSE){
if((length(betas)==dim(X)[2]) ) betas<-rep(betas, dim(X)[2]/length(betas))
time<-time-min(time)+1
unique.time<-sort(unique(time))
n.t<-length(unique.time)
n<-dim(X)[1]/n.t
treat.use<-betas.use<-NULL
X.t<-NULL
for(i in 1:n.t){
betas.use<-cbind(betas.use,betas[1:dim(X)[2]+(i-1)*dim(X)[2] ])
treat.use<-cbind(treat.use,treat[time==unique.time[i]])
X.t<-cbind(X.t,X[time==unique.time[i],])
}
betas<-betas.use
betas[is.na(betas)]<-0
treat<-treat.use
thetas<-NULL
for(i in 1:n.t)
thetas<-cbind(thetas,X[time==i,]%*%betas[,i] )
probs.trim<-.0001
probs<-(1+exp(-thetas))^(-1)
probs<-pmax(probs,probs.trim)
probs<-pmin(probs,1-probs.trim)
probs.obs<-treat*probs+(1-treat)*(1-probs)
w.each<-treat/probs+(1-treat)/(1-probs)#+(treat-probs)^2/(probs*(1-probs))
w.all<-apply(w.each,1,prod)#*probs.uncond
bin.mat<-matrix(0,nrow=(2^n.t-1),ncol=n.t)
for(i in 1:(2^n.t-1)) bin.mat[i,(n.t-length(integer.base.b(i))+1):n.t]<-
integer.base.b(i)
num.valid.outer<-constr.mat.outer<-NULL
for(i.time in 1:n.t){
num.valid<-rep(0,dim(treat)[1])
constr.mat.prop<-constr.mat<-matrix(0,nrow=dim(treat)[1],ncol=dim(bin.mat)[1])
for(i in 1:dim(bin.mat)[1]){
is.valid<-sum(bin.mat[i,(i.time):dim(bin.mat)[2]])>0
if(is.valid){
#for(i.wt in i.time:n.t) w.all.now<-w.all.now*1/(1+3*probs[,i.wt]*(1-probs[,i.wt]))
constr.mat[,i]<-(w.all*(-1)^(treat%*%bin.mat[i,]))
num.valid<-num.valid+1
}else{
constr.mat[,i]<-0
}
}
num.valid.outer<-c(num.valid.outer,num.valid)
constr.mat.outer<-rbind(constr.mat.outer,constr.mat)
}
if(twostep==FALSE){
if(full.var==TRUE){
var.big<-0
X.t.big<-matrix(NA,nrow=n.t*dim(X.t)[1],ncol=dim(X.t)[2])
for(i in 1:n.t){X.t.big[1:dim(X.t)[1]+(i-1)*dim(X.t)[1],]<-X.t}
for(i in 1:dim(X.t.big)[1]){
mat1<-(X.t.big[i,])%*%t(X.t.big[i,])
mat2<-constr.mat.outer[i,]%*%t(constr.mat.outer[i,])
var.big<-var.big+mat2 %x%mat1
}
}
}
X.wt<-X.prop<-g.wt<-g.prop<-NULL
for(i in 1:n.t){
g.prop<-c(g.prop, 1/n*t(X[time==i,])%*%(treat[,i]-probs[,i]))
g.wt<-rbind(g.wt,1/n*t(X[time==i,])%*%cbind(constr.mat.outer[time==i,])*(i>time.sub))
X.prop.curr<-matrix(0,ncol=n,nrow=dim(X)[2])
X.wt.curr<-matrix(0,ncol=n,nrow=dim(X)[2])
X.prop<-rbind(X.prop,1/n^.5*t((X[time==i,]*(probs.obs[,i]*(1-probs.obs[,i]))^.5)))
if(bal.only){
X.wt<-rbind(X.wt,1/n^.5*t(X[time==i,]*unique(num.valid.outer)[i]^.5)) } else{
X.wt<-rbind(X.wt,1/n^.5*t(X[time==i,]*w.all^.5*unique(num.valid.outer)[i]^.5))
}
}
mat.prop<-matrix(0,nrow=n, ncol=dim(X.wt)[2])
mat.prop[,1]<-1
g.prop.all<-0*g.wt
g.prop.all[,1]<-g.prop
#g.prop.all<-g.prop
if(bal.only==T) g.prop.all<-0*g.prop.all
g.all<-rbind(g.prop.all,g.wt)
X.all<-rbind(X.prop*(1-bal.only),X.wt)
if(twostep==TRUE){
var.X.inv<-Vcov.inv
} else{
if(full.var==FALSE){
var.X.inv<-ginv((X.all)%*%t(X.all))}else{
var.X.inv<-ginv(var.big/n)
}
}
length.zero<-dim(g.prop.all)[2]#length(g.prop)
#var.X[(length.zero+1):(2*length.zero),1:length.zero]<-0
#var.X[1:length.zero,(length.zero+1):(2*length.zero)]<-0
#print(dim(g.all))
#print(dim(var.X.inv))
if(full.var==TRUE) g.all<-as.vector(g.wt)
loss<-t(g.all)%*%var.X.inv%*%g.all
out=list("loss"=(sum(diag(loss)))*n,"Var.inv"=var.X.inv,"probs"=w.all,"g.all"=g.all)
#t(g.prop)%*%ginv(X.prop%*%t(X.prop))%*%g.prop +sum(diag(t(g.wt)%*%ginv(X.wt%*%t(X.wt))%*%g.wt ))
}#closes msm.loss.func
########################
###Makes binary representation
########################
integer.base.b <-
function(x, b=2){
xi <- as.integer(x)
if(any(is.na(xi) | ((x-xi)!=0)))
print(list(ERROR="x not integer", x=x))
N <- length(x)
xMax <- max(x)
ndigits <- (floor(logb(xMax, base=2))+1)
Base.b <- array(NA, dim=c(N, ndigits))
for(i in 1:ndigits){#i <- 1
Base.b[, ndigits-i+1] <- (x %% b)
x <- (x %/% b)
}
if(N ==1) Base.b[1, ] else Base.b
}
balance.CBMSM<-function(object, ...)
{
treat.hist<-matrix(NA,nrow=length(unique(object$id)),ncol=length(unique(object$time)))
ids<-sort(unique(object$id))
times<-sort(unique(object$time))
for(i in 1:length(ids)) {
for(j in 1:length(times)){
treat.hist[i,j]<-object$y[object$id== ids[i] & object$time==j]
}
}
treat.hist.fac<-apply(treat.hist,1,function(x) paste(x, collapse="+"))
bal<-matrix(NA,nrow=(ncol(object$x)-1),ncol=length(unique(treat.hist.fac))*2)
baseline<-matrix(NA,nrow=(ncol(object$x)-1),ncol=length(unique(treat.hist.fac))*2)
cnames<-array()
for (i in 1:length(unique(treat.hist.fac)))
{
for (j in 2:ncol(object$x))
{
bal[j-1,i]<-sum((treat.hist.fac==unique(treat.hist.fac)[i])*object$x[which(object$time == times[1]),j]*object$w)/sum(object$w*(treat.hist.fac == unique(treat.hist.fac)[i]))
#bal[j-1,i]<-sum((treat.hist.fac==unique(treat.hist.fac)[i])*object$x[,j]*object$w)/sum(object$w*(treat.hist.fac == unique(treat.hist.fac)[i]))
# print(c(j,i,bal[j-1,i]))
bal[j-1,i+length(unique(treat.hist.fac))]<-bal[j-1,i]/sd(object$w*object$x[which(object$time == times[1]),j])
#bal[j-1,i+length(unique(treat.hist.fac))]<-bal[j-1,i]/sd(object$w*object$x[,j])
baseline[j-1,i]<-sum((treat.hist.fac==unique(treat.hist.fac)[i])*object$x[which(object$time == times[1]),j]*object$glm.w)/sum(object$glm.w*(treat.hist.fac == unique(treat.hist.fac)[i]))
baseline[j-1,i+length(unique(treat.hist.fac))]<-bal[j-1,i]/sd(object$glm.w*object$x[which(object$time == times[1]),j])
#baseline[j-1,i]<-sum((treat.hist.fac==unique(treat.hist.fac)[i])*object$x[,j]*object$glm.w)/sum(object$glm.w*(treat.hist.fac == unique(treat.hist.fac)[i]))
#baseline[j-1,i+length(unique(treat.hist.fac))]<-bal[j-1,i]/sd(object$glm.w*object$x[,j])
}
bal[is.na(bal)]<-0
baseline[is.na(baseline)]<-0
cnames[i]<-paste0(unique(treat.hist.fac)[i],".mean")
cnames[i+length(unique(treat.hist.fac))]<-paste0(unique(treat.hist.fac)[i],".std.mean")
}
colnames(bal)<-cnames
rnames<-colnames(object$x)[-1]
rownames(bal)<-rnames
colnames(baseline)<-cnames
rownames(baseline)<-rnames
statbal<-sum((bal-bal[,1])*(bal!=0)^2)
statloh<-sum((baseline-baseline[,1])*(baseline!=0)^2)
list("Balanced"=bal, "Unweighted"=baseline, "StatBal")
}
plot.CBMSM<-function(x, covars = NULL, silent = TRUE, boxplot = FALSE, ...)
{
bal.out<-balance.CBMSM(x)
bal<-bal.out$Balanced
baseline<-bal.out$Unweighted
no.treats<-ncol(bal)/2
if (is.null(covars))
{
covars<-1:nrow(bal)
}
covarlist<-c()
contrast<-c()
bal.std.diff<-c()
baseline.std.diff<-c()
treat.hist.names<-sapply(colnames(bal)[1:no.treats],function(s) substr(s, 1, nchar(s)-5))
for (i in covars)
{
for (j in 1:(no.treats-1))
{
for (k in (j+1):no.treats)
{
covarlist<-c(covarlist, rownames(bal)[i])
contrast<-c(contrast, paste(treat.hist.names[j],treat.hist.names[k],sep=":",collapse=""))
bal.std.diff<-c(bal.std.diff,abs(bal[i,no.treats+j] - bal[i,no.treats+k]))
baseline.std.diff<-c(baseline.std.diff,abs(baseline[i,no.treats+j] - baseline[i,no.treats+k]))
}
}
}
range.x<-range.y<-range(c(bal.std.diff,baseline.std.diff))
if (!boxplot){
plot(x=baseline.std.diff,y=bal.std.diff,asp="1",xlab="Unweighted Regression Imbalance",ylab="CBMSM Imbalance",
xlim=range.x, ylim = range.y, main = "Difference in Standardized Means", ...)
abline(0,1)
}
else{
boxplot(baseline.std.diff, bal.std.diff, horizontal = TRUE, yaxt = 'n', xlab = "Difference in Standardized Means", ...)
axis(side=2, at=c(1,2),c("CBMSM Weighted", "Unweighted"))
}
if(!silent) return(data.frame("Covariate" = covarlist, "Contrast"=contrast, "Unweighted"=baseline.std.diff, "Balanced"=bal.std.diff))
}