Content - 3b00d156c0b6ec355f5aa1e90961b8c73cb2e2ad - 2d92a7d/scattering.py

scattering.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
    *** Scattering ***
    Copyright (C) February 2023 Francesco Camaglia, CNRS LPENS 

    Reference :
    Francesco Camaglia, Arie Ryvkin, Erez Greenstein, Shlomit Reich-Zeliger, Benny Chain, Thierry Mora, Aleksandra M. Walczak, Nir Friedman (2023).
    Quantifying changes in the T cell receptor repertoire during thymic development. [https://doi.org/10.7554/eLife.81622]
'''

import pandas as pd 
import numpy as np
from scipy.stats import pearsonr

# LATEX GRAPH DEFAULT: use latex format and fonts
import matplotlib.pyplot as plt
from matplotlib.colors import is_color_like
SILVER = '#c0c0c0'
AZURE = '#0064FF'
PIDGEON = '#606e8c'

from matplotlib.colors import LinearSegmentedColormap
colors = ['#08457e','#0088ff',"#ef5675",'#ffff66']
cmap1 = LinearSegmentedColormap.from_list("mycmap", colors)

#########################
#  COMPUTELOCALDENSITY  #
#########################

def computeLocalDensity( x, y, xlim=[0,0], ylim=[0,0], n_bins_x=10, n_bins_y=10):
    '''
    Square Local Density Matrices.
    
    Parameters
    ----------
    
    x: list
    y: list
    xlim: list
    ylim: list
    bins: int, optional
    '''
    
    # >>>>>>>>>>>>>>>>>>
    #  Load parameters
    # >>>>>>>>>>>>>>>>>>
    
    #  x, y
    if len(x) != len(y) :
        raise IOError("The vectors x and y must have the same length.")
    
    #  limits
    for this_lim, this_par_name in zip([xlim,ylim],['xlim','ylim']) :
        if np.all( this_lim == [0,0] ) :
            this_lim = np.array([np.min(x),np.max(x)])
        else :
            try :
                this_lim = list( this_lim )
            except :
                print(f"The provided {this_par_name} type is unrecognized.")
            if len( this_lim ) != 2 :
                raise IOError(f"Parameter {this_par_name} required a list with 2 elements.")
                
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #  computing local counts grid
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    
    # get bin size
    Res_x = ( xlim[-1] - xlim[0] ) / n_bins_x
    Res_y = ( ylim[-1] - ylim[0] ) / n_bins_y
    
    # digitize coordinates
    ind_x = np.floor( ( x - xlim[0] ) / Res_x ).astype(int)
    ind_y = np.floor( ( y - ylim[0] ) / Res_y ).astype(int)
    
    # counting the number of cells in each square of the grid
    temp = pd.Series( zip( ind_y, ind_x ) )
    coordinates = temp.groupby( temp ).size().to_dict()
    
    # loop on all the grid cells
    LocalCounts = np.zeros( ( n_bins_y, n_bins_x ) )
    for c in coordinates:
        i , j = c
        # WARNING!:
        if i < n_bins_x and j < n_bins_y and i > 0 and j > 0 :
            LocalCounts[ i,j ] = coordinates[ c ]
            
    # normalizing
    LocalDensity = LocalCounts / ( len(x) * Res_x * Res_y )

    return LocalDensity
###



#########################
#  DENSITY_SCATTERPLOT  #
#########################

def density_scatterplot( data, bins=None, names=None,  density=True, orientation=None,
                        hist=True, my_cmap=cmap1, bisector=True, density_logscale=True,
                        figsize=None, show_matrix='both', fontsize=10, Pearson=False, set_ticks=None ):
    """
    It produces a scatterplot matrix from a pandas data frame <data>.
    
    Parameters
    ----------
    
    data: pandas.DataFrame
    names: list 
            list of columns of data to be plotted
    """
    
    # >>>>>>>>>>>>>>>>>>
    #  Load parameters
    # >>>>>>>>>>>>>>>>>>
    
    #  names 
    if names == None :
        names = list( data.columns )
    else :
        try :
            names = list( names )
        except :
            print("The provided names type is unrecognized : list is required.")
            
        # check if the names are in the column names
        check = [ n in list( data.columns ) for n in names ]
        if False in check :
            raise KeyError("The provided names do not match with data frame column names: ", names[check])
   
    ncols = len( names ) 
    
    #  figsize
    if figsize is not None :
        if type(figsize) != tuple or len(figsize) != 2 :
            raise IOError( "Wrong choice for `figsize` format." )
    else : # default   
        figsize = ( 2 * ncols, 2 * ncols )
                
    #  bins
    if bins is None :
        # purely empirical choice of the bins
        bins = np.power( 2.8, np.log10( len( data ) ) ).astype(int)
    else : 
        try :
            bins = int( bins )
        except :
            print("The provided bins type is unrecognized : int is required.")
            
    if orientation is not None :
        if not is_color_like(orientation) :
            if orientation != True :
                print('Warning: selected color for `orientation` is not recognized.')
            orientation = PIDGEON
            
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #  Define subplot dimensions  #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>  
    
    plt.rcParams.update({'font.size': fontsize})
    fig, axes = plt.subplots( nrows=ncols, ncols=ncols, 
                             figsize=figsize, sharex=False, sharey=False )
    fig.subplots_adjust( hspace=0.1, wspace=0.1 )

    # adjust xlim for binning
    xlim = np.array([min(np.min(data)), max(np.max(data))])
    xlim[0] *= 1. - 0.00001 * np.sign( xlim[0] ) 
    xlim[1] *= 1. + 0.00001 * np.sign( xlim[1] )     
    ylim = xlim                
    Dxlim = xlim[1] - xlim[0]
    Dylim = ylim[1] - ylim[0]
    
    # smart definition of indices off diagonal for plotting
    if show_matrix == 'lower' :
        smart_indices = list(zip( *np.tril_indices_from(axes, k=-1) ))        
        nega_indices = list(zip( *np.triu_indices_from(axes, k=1) ))
    elif show_matrix == 'upper' :
        smart_indices = list(zip( *np.triu_indices_from(axes, k=1) ))       
        nega_indices = list(zip( *np.tril_indices_from(axes, k=-1) ))
    elif show_matrix == 'both' :
        smart_indices = list(zip( *np.tril_indices_from(axes, k=-1) )) 
        smart_indices += list(zip( *np.triu_indices_from(axes, k=1) ))
        nega_indices = []
    else :
        raise IOError('Unrecognized option for show_matrix.')
    
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #  pre-process for density scatterplot  #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    if density is True :
        v_clim = np.zeros(2)
        store_loc_density = {}
        
        for i, j in smart_indices:
            x = data[ names[j] ].values
            y = data[ names[i] ].values
            Loc_density = computeLocalDensity( x, y, xlim=xlim, ylim=ylim, n_bins_x=bins, n_bins_y=bins )  
            if density_logscale is True : Loc_density = safe_log10( Loc_density )
            # update the limits for the values in the colorbar
            v_clim = np.array([min(v_clim[0],np.nanmin(Loc_density)), max(v_clim[1],np.nanmax(Loc_density))])
            store_loc_density[(i,j)] = Loc_density
            
    # >>>>>>>>>>>>>>>>>
    #  Plot the data  #
    # >>>>>>>>>>>>>>>>>
    
    for i, j in smart_indices:
        
        x = data[ names[j] ].values
        y = data[ names[i] ].values
                
        #  SCATTER PLOT  # 
        if density is False :    
            
            axes[i,j].scatter( x, y, color=AZURE, ec=None, marker='.', alpha=0.2, zorder=0 )                    
            axes[i,j].set_xlim(xlim), axes[i,j].set_ylim(ylim)           
            if bisector is True :
                axes[i,j].plot( xlim, ylim, lw=1, ls="--", color=SILVER )
            if orientation is not None :
                lreg = compute_gyration_ax(x, y)
                axes[i,j].plot( xlim, lreg.intercept + lreg.slope * xlim,
                               lw=1, ls="-", color=orientation )
                
        #  DENSITY MAP  #
        if density is True : 
            Loc_density = store_loc_density[(i,j)]
            this_im = axes[i,j].imshow( Loc_density, cmap=my_cmap, vmin=v_clim[0], vmax=v_clim[1],
                                       interpolation='nearest', origin='lower' )
                       
            xImshowLim = np.array([0, Loc_density.shape[0]])
            yImshowLim = np.array([0, Loc_density.shape[1]])
            axes[i,j].set_xlim(xImshowLim), axes[i,j].set_ylim(yImshowLim)
            
            if bisector == True :
                axes[i,j].plot( xImshowLim, yImshowLim, lw=1, ls="--", color=SILVER )
                
            if orientation is not None :
                lreg = compute_gyration_ax(x, y)
                lreg_Imshow_slope = lreg.slope * ( yImshowLim[1] * Dxlim ) / ( xImshowLim[1] * Dylim )
                lreg_Imshow_intercept = ( lreg.intercept + lreg.slope * xlim[0] - ylim[0]) * yImshowLim[1] / Dylim
                axes[i,j].plot( xImshowLim, lreg_Imshow_intercept + lreg_Imshow_slope * xImshowLim,
                               lw=1, ls = "-", color=orientation )    
                
            if Pearson is True :
                r, p = pearsonr(x, y)
                label = "r=%.2f" % np.round(r,2)
                axes[i,j].annotate( label, (0.95, 0.1), xycoords='axes fraction',
                               ha='right', va='center' )
               
    # >>>>>>>>>>>>>
    #  x/y ticks  #
    # >>>>>>>>>>>>>
    
    if set_ticks is None :
        ticks_pos = np.linspace( 0.1, 0.9, 3 )
        ticks_x = ticks_pos * Dxlim + xlim[0]
        ticks_y = ticks_pos * Dylim + ylim[0]
    else :
        ticks_x = set_ticks
        ticks_y = set_ticks   
        ticks_pos = ( set_ticks - xlim[0] ) / Dxlim
    
    for ax in axes.flat:            
        
        # modify frame appearence
        [ ax.spines[p].set_linewidth(1) for p in ['right','left', 'top','bottom'] ]

        # y ticks and labels position
        if ax.is_first_col() or ax.is_last_col() :  

            if ax.is_first_col() : ax.yaxis.set_ticks_position('left')
            else : ax.yaxis.set_ticks_position('right')

            if density == True : ax.set_yticks( ticks_pos * bins )
            else : ax.set_yticks( ticks_y )

            ax.set_yticklabels( np.round(ticks_y,1) )

        else :      
            plt.setp(ax.get_yticklabels(), visible=False)  
            ax.tick_params(axis='y', which='both', length=0)

        # x ticks and labels position
        if ax.is_first_row() or ax.is_last_row() :

            if ax.is_first_row() : ax.xaxis.set_ticks_position('top') 
            else : ax.xaxis.set_ticks_position('bottom') 

            if density == True : ax.set_xticks( ticks_pos * bins )
            else : ax.set_xticks( ticks_x )

            ax.set_xticklabels( np.round(ticks_x,1) )

        else :
            plt.setp(ax.get_xticklabels(), visible=False)  
            ax.tick_params(axis='x', which='both', length=0)
            
    # >>>>>>>>>>>>>>>
    #  hide matrix  #
    # >>>>>>>>>>>>>>>

    for i,j in nega_indices :
        axes[i,j].axis('off')
        axes[i,j].set_visible(False)
        axes[i,j].set_xlabel('')
        axes[i,j].set_ylabel('')                     
                
    # >>>>>>>>>>>>>>>>>>>
    #  matrix diagonal  #
    # >>>>>>>>>>>>>>>>>>>
    
    for i, label in enumerate(names):  
        #[ axes[i,i].spines[p].set_color('white') for p in ['right','left', 'top','bottom'] ]       
        plt.setp(axes[i,i].spines.values(), visible=False) # set spines not visible
        
        # minimal labels
        axes[0,i].xaxis.set_label_position('top')
        axes[0,i].set_xlabel(label)            
        axes[i,-1].yaxis.set_label_position('right')
        axes[i,-1].set_ylabel(label)            

        # complete labels
        axes[-1,i].xaxis.set_label_position('bottom')
        axes[-1,i].set_xlabel(label)            
        axes[i,0].yaxis.set_label_position('left')
        axes[i,0].set_ylabel(label)    

        plt.setp(axes[i,i].get_xticklabels(), visible=False)  
        axes[i,i].tick_params(axis='x', which='both', length=0)   
        plt.setp(axes[i,i].get_yticklabels(), visible=False)  
        axes[i,i].tick_params(axis='y', which='both', length=0)
        
        if hist == False : #  Labels in the diagonal subplots  #          
            axes[i,i].annotate( label, (0.5, 0.5), xycoords='axes fraction',
                               ha='center', va='center' )
     
        else : #  Plot histograms and set labels  #
            h, b = np.histogram( data[ label ], bins=np.linspace( xlim[0], xlim[-1], bins ) )
            axes[i,i].set_yscale("log")
            axes[i,i].plot( b[:-1] + 0.5 * ( b[1] - b[0] ), h, color=AZURE ) 
            axes[i,i].set_xlim(xlim)
            
    if hist == False :
        for i in [0,-1] :
            axes[i,i].set_xlabel('')
            axes[i,i].set_ylabel('')
        
    if density is True :
        return axes, this_im
    else :
        return axes
###

#########################
#  COMPUTE_GYRATION_AX  #
#########################

from collections import namedtuple

def compute_gyration_ax( x, y ) :
    
    assert len(x) == len(y)
    lreg = namedtuple('intercept', 'slope')
     
    S = len(x)
    # baricenter
    x_A = np.sum(x) / S
    y_A = np.sum(y) / S
    # gyration tensor 
    G_xx = np.sum(np.power(x,2)) / S - x_A**2
    G_yy = np.sum(np.power(y,2)) / S - y_A**2
    G_xy = np.dot(x,y) / S - x_A * y_A
    # main eigenvalue
    mu_p = 0.5 * ( G_xx + G_yy + np.sqrt( ( G_xx - G_yy )**2 + 4 * G_xy**2 ) ) 
    # main eigenvector angle tangent
    tan = ( mu_p - G_xx ) / G_xy
    # assign results
    lreg.intercept = y_A - tan * x_A
    lreg.slope = tan
    return lreg
### 

################
#  SAFE_LOG10  #
################

def safe_log10( myarray ) :
    
    out = np.empty( np.shape( myarray ) )
    out[ : ] = np.nan
    loc = np.where( myarray > 0 )
    out[ loc ] =np.log10( myarray[ loc ] )
    
    return out
###