#An example of highly correlated SNP-exposure association estimates and how the SCA MR approach can be used set.seed(131) #Data Generation --------- k_j = sample(5:10,1) p=100;K=10*k_j; n_ind=20000; maf=0.3 G = matrix(rbinom(n_ind * p,size = 2,prob = maf),ncol = p) Ux = matrix(rnorm(n = n_ind*K, 0, 10), ncol = K) epsilon_X=matrix(rnorm(n = n_ind*K,0,14),ncol = K) epsilon_y=matrix(rnorm(n = n_ind*1,0,20),ncol = 1) #2. Generate exposure|G,Ux ## To get a block correlation we use only specific SNPs for each block n_blocks=k_j #Breaking into causal and non-causal blocks bre1 = seq(1, K, K %/% n_blocks) list_causal=list() for (jl in 1:length(bre1)){ list_causal[[jl]]=(seq(bre1[jl], jl*(K %/% n_blocks))) } gamma_SNP_X=matrix(nrow = p,ncol = K) gamma_SNP_X_mu=seq(4,2,length.out = length(list_causal)) for (jm in 1:length(list_causal)){ gamma_SNP_X[,list_causal[[jm]]] = matrix(rnorm(n = p*length(list_causal[[jm]]), mean = gamma_SNP_X_mu[[jm]], sd = 1), ncol = length(list_causal[[jm]]))} X=X_noise=X_cor=matrix(nrow = n_ind,ncol = K) G_per_block=seq(1, p, p %/% n_blocks) #Breaking the genes in discreet blocks that contribute to blocks of X list_causal_G=list() for (jl in 1:length(G_per_block)){ list_causal_G[[jl]]=(seq(G_per_block[jl], jl*(p %/% n_blocks))) } SNP_X_beta_noise=matrix(2,nrow = p,ncol = K) for (lk in 1:length(list_causal)){ #X_noise=X[,-list_causal[[lk]]] = G[,-list_causal_G[[lk]]] %*% gamma_SNP_X[-list_causal_G[[lk]],list_causal[[lk]]] + Ux[,list_causal[[lk]]] + epsilon_X[,list_causal[[lk]]] X[,list_causal[[lk]]] = G[,list_causal_G[[lk]]] %*% gamma_SNP_X[list_causal_G[[lk]],list_causal[[lk]]] + Ux[,list_causal[[lk]]] + epsilon_X[,list_causal[[lk]]] #X=X_noise+X_cor } #3. Generate outcome|X,G,Uy with different beta's for different blocks---- #We use non-zero coefficients for the first two blocks beta_X_Y = matrix(nrow = K*1,ncol = 1) beta_X_Y[-list_causal[[2]],]=0 beta_X_Y[list_causal[[1]],]= runif(n = length(list_causal[[1]]),min = 0.3,max = 4.5) beta_X_Y[list_causal[[2]],]= runif(n = length(list_causal[[1]]),min = 0.1,max = 2.5) beta_X_Y[list_causal[[3]],]= runif(n = length(list_causal[[1]]),min = -3.6,max = -0.2) #Within the causal block, get some to zero. Maybe they will spuriously associate with Y just because of #multicoll8inearity zero_coef = sample(length(list_causal[[1]]),size = sample(1:8,1)) beta_X_Y[list_causal[[2]],][zero_coef]=0 beta_X_Y[list_causal[[3]],][zero_coef]=0 Uy_coef = matrix(runif(n = K*1,min = 10,max = 15),ncol = 1) Y = X %*% beta_X_Y + Ux %*% Uy_coef + epsilon_y #Summary Stats #3b. Extracting summary statistics---- beta_matrix = SE_matrix = pval_matrix = data.frame(matrix(nrow = ncol(G), ncol = ncol(X))) pv_df=data.frame(matrix(nrow = ncol(G),ncol = ncol(X))) pvg=paste('p_val_GX',1:ncol(X),sep='') betag=paste('beta_GX',1:ncol(X),sep='') SEg=paste('SE_GX',1:ncol(X),sep='') rsq_t=NULL for (k in 1:ncol(X)){ d1=summary(lm(X[,k] ~ .,data = data.frame(G))) rsq_t[k]=d1$r.squared lm_obj1=summary(lm(X[,k] ~ .,data = data.frame(G)))$coefficients p_val_temp = lm_obj1[-1,4] pv_df[,k] =p_val_temp; colnames(pv_df)[k]=pvg[k] beta_matrix[,k]=lm_obj1[-1,1] colnames(beta_matrix)[k]=betag[k] SE_matrix[,k]=lm_obj1[-1,2] colnames(SE_matrix)[k]=SEg[k] } rownames(beta_matrix)=rownames(SE_matrix)=rownames(pv_df)=paste('SNP',1:dim(G)[2],sep = '') b1=beta_matrix;se1=SE_matrix by1=summary(lm(Y ~ .,data = data.frame(G)))$coefficients[-1,1] byse1=summary(lm(Y ~ .,data = data.frame(G)))$coefficients[-1,2] #Apply Function ----- sc_obj=sca_MR(beta_X_mat = matrix(unlist(beta_matrix),nrow = nrow(beta_matrix)), beta_Y_vec = by1,SE_Y_vec = byse1,nfold = 5,spars_length = 5) library(RColorBrewer); coul <- colorRampPalette(brewer.pal(8, "RdYlBu"))(25) heatmap(sc_obj$Loadings,Colv = NA,Rowv = NA,col=coul) sc_obj$MR_estimate # The first three causal blocks of exposures are projected in the first three sparse # Components (as can be seen in the heatmap). # This is in accordance with the known data generating process of six blocks with ten exposures in each block. #Additionally, the MR estimates suggest that these blocks of exposures are significantly #associated with the outcome.