https://github.com/cran/BDgraph
Raw File
Tip revision: b3c5ed6c665e2ff22eef7818c79389f8b96cfa6b authored by Reza Mohammadi on 29 March 2018, 14:39:30 UTC
version 2.45
Tip revision: b3c5ed6
bdgraph.sim.R
## ------------------------------------------------------------------------------------------------|
# Data generator according to the graph structure
## ------------------------------------------------------------------------------------------------|
bdgraph.sim = function( p = 10, graph = "random", n = 0, type = "Gaussian", 
						prob = 0.2, size = NULL, mean = 0, class = NULL, 
						cut = 4, b = 3, D = diag(p), K = NULL, sigma = NULL, 
						vis = FALSE )
{
    if( is.matrix( K ) )  graph <- "fixed"
    
    if( type == "normal" )     type = "Gaussian"
    if( type == "non-normal" ) type = "non-Gaussian"
    
    if( is.matrix( graph ) )
	{
		G     <- graph
	    graph <- "fixed"
    } 
	
#--- build the graph structure --------------------------------------------------------------------|
	if( graph == "random" )
	{
		G <- matrix( 0, p, p )

		if( is.null( size ) )
		{
#			if( prob < 0 | prob > 1 ) stop( "'prob' should be between zero and one" )
			
			G[ upper.tri( G ) ] <- rbinom( p * ( p - 1 ) / 2, 1, prob )
		} 
		else 
		{
			if( size < 0 | size > p * ( p - 1 ) / 2 )  stop( "Graph size should be between zero and p*(p-1)/2" )
			
			smp <- sample( 1 : ( p * ( p - 1 ) / 2 ), size, replace = FALSE )
			G[ upper.tri( G ) ][smp] <- 1
		}
		
		G <- G + t( G )
	}
	
	if( graph == "cluster" )
	{
		# partition variables
		if( is.null( class ) )
		{ 
			class = NULL
			if( !is.null( size ) )   class = length( size )
			if( length( prob ) > 1 ) class = length( prob )
			if( is.null( class ) )   class = max( 2, ceiling( p / 20 ) )

			if( !is.null( size ) )
			{
				class <- length( size )
			}else{
				class <- max( 2, ceiling( p / 20 ) )
			}
		}

		g.large <- p %% class
		g.small <- class - g.large
		n.small <- floor( p / class )
		n.large <- n.small + 1
		vp      <- c( rep( n.small, g.small ), rep( n.large, g.large ) )
		 
		G       <- matrix( 0, p, p )
		 
		if( is.null( size ) )
		{
#			if( prob < 0 | prob > 1 ) stop( "'prob' should be between zero and one" )

			for( i in 1 : class )
			{
				tmp <- if( i == 1 ) ( 1 : vp[1] ) else ( ( sum( vp[1 : (i-1)] ) + 1 ) : sum( vp[1:i] ) )
				gg                <- matrix( 0, vp[i], vp[i] )
				gg[upper.tri(gg)] <- rbinom( vp[i] * ( vp[i] - 1 ) / 2, 1, prob )
				G[tmp, tmp]       <- gg
			}
		} 
		else 
		{
			if( class != length(size) )  stop( "Number of graph sizes is not match with number of clusters" )
			if( sum(size) < 0 | sum(size) > p * (p - 1) / 2 )   stop( "Total graph sizes should be between zero and p*(p-1)/2" )

			for( i in 1 : class )
			{
				tmp <- if( i == 1 ) ( 1 : vp[1] ) else ( ( sum( vp[1 : (i-1)] ) + 1 ) : sum( vp[1:i] ) )
				gg  <- matrix( 0, vp[i], vp[i] )
				smp <- sample( 1 : ( vp[i] * (vp[i] - 1) / 2 ), size[i], replace = FALSE )
				gg[upper.tri(gg)][smp] <- 1
				G[tmp, tmp]            <- gg
			}
		}
		
		G <- G + t(G)	   
	}

	if( graph == "hub" )
	{
		# partition variables
		if( is.null( class ) ) class <- ceiling( p / 20 ) 

		# partition variables into groups
		g.large <- p %% class
		g.small <- class - g.large
		n.small <- floor( p / class )
		n.large <- n.small + 1
		g.list  <- c( rep( n.small, g.small ), rep( n.large, g.large ) )
		g.ind   <- rep( c( 1:class ), g.list )
		
		G <- matrix( 0, p, p )
		
		for( i in 1:class )
		{
			tmp           <- which( g.ind == i )
			G[tmp[1],tmp] <- 1
			G[tmp,tmp[1]] <- 1
		}
	}
	
	if( graph == "circle" )
	{
	    G       <- toeplitz( c( 0, 1, rep( 0, p - 2 ) ) )
        G[1, p] <- 1
		G[p, 1] <- 1
	}

	if( graph == "scale-free" )
	{
		G = matrix( 0, p, p )
		resultGraph = .C( "scale_free", G = as.integer(G), as.integer(p), PACKAGE = "BDgraph" )
		G = matrix( resultGraph $ G, p, p ) 
		
		j = sample( 1:p, 1 )
		
		for( i in ( c( 1:p )[ -j ] ) )
		{
			 G[ i, j ] = 1
			 G[ j, i ] = 1
		}
	}
	
	if( graph == "AR1" )
	{
		sigma = matrix( 0, p, p )
		for (i in 1 : (p - 1))
			for (j in (i + 1) : p)
				sigma[i, j] = ( 0.7 ) ^ abs( i - j )
	
		sigma = sigma + t( sigma ) + diag( p )
		K     = solve( sigma )
		G     = 1 * ( abs(K) > 0.2 ) 
	}

	if( graph == "AR2" )
	{
		K = toeplitz( c( 1, 0.5, 0.25, rep( 0, p - 3 ) ) )
		G = 1 * ( abs(K) > 0.2 ) 
	}

	if( graph == "star" )
	{
		K                 <- diag(p)
		K[ 1, ( 2 : p ) ] <- 0.1
		K[ ( 2 : p ), 1 ] <- 0.1
	}

	#--- generate multivariate data according to the graph structure ------------------------------|
	if( n != 0 )
	{
		if( !is.null( sigma ) ) K <- solve( sigma )   

		if( is.matrix( K ) )
		{ 
			G         <- 1 * ( abs( K ) > 0.02 )
			diag( G ) <- 0
			if( is.null( sigma ) ) sigma <- solve( K )	
		}else{ 
			#--- Generate precision matrix according to the graph structure -----------------------|
			Ti        = chol( solve( D ) )
			diag( G ) = 0
			K         = matrix( 0, p, p )
			
			result = .C( "rgwish_c", as.integer(G), as.double(Ti), K = as.double(K), 
						 as.integer(b), as.integer(p), PACKAGE = "BDgraph" )
			
			K     = matrix ( result $ K, p, p ) 		
			sigma = solve( K )
		}
		
		#--- generate multivariate normal data ----------------------------------------------------|
		if( typeof( mean ) == "double" ) mean <- rep( mean, p )
		R <- chol( sigma )
		z <- matrix( rnorm( p * n ), p, n )
		d <- t( R ) %*% z + mean
		d <- t( d )

		#--- generate multivariate mixed data -----------------------------------------------------|
		if( type == "mixed" )
		{
			# generating mixed data which are 'count', 'ordinal', 'non-Gaussian', 
			# 'binary', and 'Gaussian', respectively.
			ps = floor( p / 5 )
			
			# generating count data
			col_number     <- c( 1:ps )
			prob           <- pnorm( d[, col_number] )
			d[,col_number] <- qpois( p = prob, lambda = 10 )

			# generating ordinal data
			col_number     <- c( ( ps + 1 ):( 2 * ps ) )
			prob           <- pnorm( d[ , col_number ] )
			d[,col_number] <- qpois( p = prob, lambda = 2 )

			# generating non-Guassian data
			col_number     <- c( ( 2 * ps + 1 ):( 3 * ps ) )
			prob           <- pnorm( d[ , col_number ] )
			d[,col_number] <- qexp( p = prob, rate = 10 )

			# for binary data
			col_number     <- c( ( 3 * ps + 1 ):( 4 * ps ) )
			prob           <- pnorm( d[ , col_number ] )
			d[,col_number] <- qbinom( p = prob, size = 1, prob = 0.5 )
		}

		#--- generate multivariate continuous non-Gaussian data -----------------------------------|
		if( type == "non-Gaussian" )
		{
			# generating multivariate continuous non-Gaussian data  
			prob <- pnorm( d )
			d    <- qexp( p = prob, rate = 10 )
		}

		#--- generate multivariate discrete data --------------------------------------------------|
		if( type == "discrete" )
		{
			runif_m   <- matrix( runif( cut * p ), nrow = p, ncol = cut )   
			marginals <- apply( runif_m, 1, function( x ) { qnorm( cumsum( x / sum( x ) )[-length( x )] ) } )
			if( cut == 2 ) marginals = matrix( marginals, nrow = 1, ncol = p )
				 
			for( j in 1:p )
			{
				breaks <- c( min( d[, j] ) - 1, marginals[, j], max( d[, j] ) + 1 )  
				d[, j] <- as.integer( cut( d[, j], breaks = breaks, right = FALSE ) )
			}	
			
			d = d - 1
		}

		if( type == "binary" )
		{
			if( p > 16 ) stop( "For type 'binary', number of nodes (p) must be less than 16" )
			
			## Generate clique factors
			clique_factors = generate_clique_factors( ug = G )

			d = sample_ug( n = n, ug = G, clique_factors = clique_factors )
			
			d = d - 1
		}
	}
	
	#--- graph visualization ----------------------------------------------------------------------|
	if( vis )
	{
		graphG <- graph.adjacency( G, mode = "undirected", diag = FALSE )
		
		if( p < 20 ) size = 10 else size = 2
		plot.igraph( graphG, layout = layout.circle, main = "Graph structure", 
		             vertex.color = "white", vertex.size = size, vertex.label.color = 'black' )
	}
	
	#--- Saving the result ------------------------------------------------------------------------|
	if( n != 0 )
	{
		simulation <- list( G = G, data = d, sigma = sigma, K = K, graph = graph, type = type )
	}else{
		simulation <- list( G = G, graph = graph )		
	}
	
	class( simulation ) <- "sim"
	return( simulation )
}
    
## ------------------------------------------------------------------------------------------------|
# Print function for simulation data
print.sim = function( x, ... )
{
	p <- ncol( x $ G )
	
	if( is.null( x $ type ) )
	{
		cat( paste( "  graph generated by bdgraph.sim"                                 ), fill = TRUE )
		cat( paste( "  Graph type      =", x $ graph                                   ), fill = TRUE )
		cat( paste( "  Number of nodes =", p                                           ), fill = TRUE )
		cat( paste( "  Graph size      =", sum( x $ G ) / 2                            ), fill = TRUE )
		cat( paste( "  Sparsity        =", round( sum( x $ G ) / ( p * ( p - 1 ) ), 4) ), fill = TRUE )		
	}else{
		cat( paste( "  Data generated by bdgraph.sim"                                  ), fill = TRUE )
		cat( paste( "  Data type       =", x $ type                                    ), fill = TRUE )
		cat( paste( "  Sample size     =", nrow( x $ data )                            ), fill = TRUE )
		cat( paste( "  Graph type      =", x $ graph                                   ), fill = TRUE )
		cat( paste( "  Number of nodes =", p                                           ), fill = TRUE )
		cat( paste( "  Graph size      =", sum( x $ G ) / 2                            ), fill = TRUE )
		cat( paste( "  Sparsity        =", round( sum( x $ G ) / ( p * ( p - 1 ) ), 4) ), fill = TRUE )
	}
}
    
## ------------------------------------------------------------------------------------------------|
# plot for class "sim" from bdgraph.sim function
plot.sim = function( x, main = NULL, layout = layout.circle, ... )
{
    true_graph = as.matrix( x $ G )
    if( is.null( main ) ) main = "Graph structure"
  	g_igraph <- graph.adjacency( true_graph, mode = "undirected", diag = FALSE )
	
    plot.igraph( g_igraph, main = main, layout = layout, ... )
}		
       
## ------------------------------------------------------------------------------------------------|
# Function for exact sampling from binary data
## ------------------------------------------------------------------------------------------------|
sample_ug = function( n = 1, ug = diag( 3 ), clique_factors = NULL )
{
	p = ncol( ug ) # p smaller than 17 check
	if( p > 16 ) stop( "number of nodes must be smaller than 16" )
	ug[ lower.tri( ug, diag = TRUE ) ] = 0
	if( is.null( clique_factors ) ) clique_factors = generate_clique_factors( ug )
	
	prob = calc_joint_dist( ug, clique_factors )
	noc  = length( prob )
	ind  = sample( 1:noc, n, replace = TRUE, prob = prob )
	oc   = sapply( 1:noc, function( x ){ as.integer( intToBits( x ) ) } )
	oc   = 2 - t( oc[ 1:p, ] )
	data = oc[ ind, ]
	
	return( data )
}
    
## ------------------------------------------------------------------------------------------------|
generate_clique_factors = function( ug )
{
	ug[ lower.tri( ug, diag = TRUE ) ] = 0   
	p              = ncol( ug )
	edges          = which( ug == 1, arr.ind = T )
	a              = igraph::make_undirected_graph( c( t( edges ) ), p )
	cliques        = igraph::max_cliques( a )
	clique_factors = vector( 'list', length( cliques ) )
	
	for ( i in 1:length( cliques ) )
	{
		clq                 = cliques[[i]]
		clique_factors[[i]] = runif( 2 ^ length( clq ) )
	}
	
	return( clique_factors )
}
    
## ------------------------------------------------------------------------------------------------|
calc_joint_dist = function( ug, clique_factors )
{
	p          = ncol( ug )
	oc         = sapply( 1:( 2 ^ p ), function( x ){ as.integer( intToBits( x ) ) } )
	oc         = 2 - t( oc[ 1:p, ] )

	edges      = which( ug == 1, arr.ind = T )
	a          = igraph::make_undirected_graph( c( t( edges ) ), p )

	joint_dist = rep( 1, 2 ^ p )
	cliques    = igraph::max_cliques( a )
	
	for ( i in 1:length( cliques ) )
	{
		clq        = cliques[[i]]
		k          = length( clq )
		temp       = sapply( 1:( 2 ^ k ), function( x ){ as.integer( intToBits( x ) ) } )
		clq_oc     = 2 - t( temp[1:k, ] )
		clq_factor = clique_factors[[i]]
		
		for ( j in 1:nrow( clq_oc ) )
		{
			oc_col_clq = oc[ , clq ]
			if( is.null( dim( oc_col_clq ) ) ) 
				oc_col_clq = matrix( oc_col_clq, nrow = length( oc_col_clq ), ncol = 1 )
		
			ind             = apply( oc_col_clq, 1, identical, clq_oc[ j, ] )
			joint_dist[ind] = joint_dist[ ind ] * clq_factor[ j ]
		}				
	}	
	
	joint_dist = joint_dist / sum( joint_dist )
	
	return( joint_dist )	
}
    


back to top