#!/usr/bin/env python # -*- coding: utf-8 -*- ''' *** Scattering *** Copyright (C) February 2023 Francesco Camaglia, CNRS LPENS Reference : Francesco Camaglia, Arie Ryvkin, Erez Greenstein, Shlomit Reich-Zeliger, Benny Chain, Thierry Mora, Aleksandra M. Walczak, Nir Friedman (2023). Quantifying changes in the T cell receptor repertoire during thymic development. [https://doi.org/10.7554/eLife.81622] ''' import pandas as pd import numpy as np from scipy.stats import pearsonr # LATEX GRAPH DEFAULT: use latex format and fonts import matplotlib.pyplot as plt from matplotlib.colors import is_color_like SILVER = '#c0c0c0' AZURE = '#0064FF' PIDGEON = '#606e8c' from matplotlib.colors import LinearSegmentedColormap colors = ['#08457e','#0088ff',"#ef5675",'#ffff66'] cmap1 = LinearSegmentedColormap.from_list("mycmap", colors) ######################### # COMPUTELOCALDENSITY # ######################### def computeLocalDensity( x, y, xlim=[0,0], ylim=[0,0], n_bins_x=10, n_bins_y=10): ''' Square Local Density Matrices. Parameters ---------- x: list y: list xlim: list ylim: list bins: int, optional ''' # >>>>>>>>>>>>>>>>>> # Load parameters # >>>>>>>>>>>>>>>>>> # x, y if len(x) != len(y) : raise IOError("The vectors x and y must have the same length.") # limits for this_lim, this_par_name in zip([xlim,ylim],['xlim','ylim']) : if np.all( this_lim == [0,0] ) : this_lim = np.array([np.min(x),np.max(x)]) else : try : this_lim = list( this_lim ) except : print(f"The provided {this_par_name} type is unrecognized.") if len( this_lim ) != 2 : raise IOError(f"Parameter {this_par_name} required a list with 2 elements.") # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # computing local counts grid # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # get bin size Res_x = ( xlim[-1] - xlim[0] ) / n_bins_x Res_y = ( ylim[-1] - ylim[0] ) / n_bins_y # digitize coordinates ind_x = np.floor( ( x - xlim[0] ) / Res_x ).astype(int) ind_y = np.floor( ( y - ylim[0] ) / Res_y ).astype(int) # counting the number of cells in each square of the grid temp = pd.Series( zip( ind_y, ind_x ) ) coordinates = temp.groupby( temp ).size().to_dict() # loop on all the grid cells LocalCounts = np.zeros( ( n_bins_y, n_bins_x ) ) for c in coordinates: i , j = c # WARNING!: if i < n_bins_x and j < n_bins_y and i > 0 and j > 0 : LocalCounts[ i,j ] = coordinates[ c ] # normalizing LocalDensity = LocalCounts / ( len(x) * Res_x * Res_y ) return LocalDensity ### ######################### # DENSITY_SCATTERPLOT # ######################### def density_scatterplot( data, bins=None, names=None, density=True, orientation=None, hist=True, my_cmap=cmap1, bisector=True, density_logscale=True, figsize=None, show_matrix='both', fontsize=10, Pearson=False, set_ticks=None ): """ It produces a scatterplot matrix from a pandas data frame . Parameters ---------- data: pandas.DataFrame names: list list of columns of data to be plotted """ # >>>>>>>>>>>>>>>>>> # Load parameters # >>>>>>>>>>>>>>>>>> # names if names == None : names = list( data.columns ) else : try : names = list( names ) except : print("The provided names type is unrecognized : list is required.") # check if the names are in the column names check = [ n in list( data.columns ) for n in names ] if False in check : raise KeyError("The provided names do not match with data frame column names: ", names[check]) ncols = len( names ) # figsize if figsize is not None : if type(figsize) != tuple or len(figsize) != 2 : raise IOError( "Wrong choice for `figsize` format." ) else : # default figsize = ( 2 * ncols, 2 * ncols ) # bins if bins is None : # purely empirical choice of the bins bins = np.power( 2.8, np.log10( len( data ) ) ).astype(int) else : try : bins = int( bins ) except : print("The provided bins type is unrecognized : int is required.") if orientation is not None : if not is_color_like(orientation) : if orientation != True : print('Warning: selected color for `orientation` is not recognized.') orientation = PIDGEON # >>>>>>>>>>>>>>>>>>>>>>>>>>>>> # Define subplot dimensions # # >>>>>>>>>>>>>>>>>>>>>>>>>>>>> plt.rcParams.update({'font.size': fontsize}) fig, axes = plt.subplots( nrows=ncols, ncols=ncols, figsize=figsize, sharex=False, sharey=False ) fig.subplots_adjust( hspace=0.1, wspace=0.1 ) # adjust xlim for binning xlim = np.array([min(np.min(data)), max(np.max(data))]) xlim[0] *= 1. - 0.00001 * np.sign( xlim[0] ) xlim[1] *= 1. + 0.00001 * np.sign( xlim[1] ) ylim = xlim Dxlim = xlim[1] - xlim[0] Dylim = ylim[1] - ylim[0] # smart definition of indices off diagonal for plotting if show_matrix == 'lower' : smart_indices = list(zip( *np.tril_indices_from(axes, k=-1) )) nega_indices = list(zip( *np.triu_indices_from(axes, k=1) )) elif show_matrix == 'upper' : smart_indices = list(zip( *np.triu_indices_from(axes, k=1) )) nega_indices = list(zip( *np.tril_indices_from(axes, k=-1) )) elif show_matrix == 'both' : smart_indices = list(zip( *np.tril_indices_from(axes, k=-1) )) smart_indices += list(zip( *np.triu_indices_from(axes, k=1) )) nega_indices = [] else : raise IOError('Unrecognized option for show_matrix.') # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # pre-process for density scatterplot # # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> if density is True : v_clim = np.zeros(2) store_loc_density = {} for i, j in smart_indices: x = data[ names[j] ].values y = data[ names[i] ].values Loc_density = computeLocalDensity( x, y, xlim=xlim, ylim=ylim, n_bins_x=bins, n_bins_y=bins ) if density_logscale is True : Loc_density = safe_log10( Loc_density ) # update the limits for the values in the colorbar v_clim = np.array([min(v_clim[0],np.nanmin(Loc_density)), max(v_clim[1],np.nanmax(Loc_density))]) store_loc_density[(i,j)] = Loc_density # >>>>>>>>>>>>>>>>> # Plot the data # # >>>>>>>>>>>>>>>>> for i, j in smart_indices: x = data[ names[j] ].values y = data[ names[i] ].values # SCATTER PLOT # if density is False : axes[i,j].scatter( x, y, color=AZURE, ec=None, marker='.', alpha=0.2, zorder=0 ) axes[i,j].set_xlim(xlim), axes[i,j].set_ylim(ylim) if bisector is True : axes[i,j].plot( xlim, ylim, lw=1, ls="--", color=SILVER ) if orientation is not None : lreg = compute_gyration_ax(x, y) axes[i,j].plot( xlim, lreg.intercept + lreg.slope * xlim, lw=1, ls="-", color=orientation ) # DENSITY MAP # if density is True : Loc_density = store_loc_density[(i,j)] this_im = axes[i,j].imshow( Loc_density, cmap=my_cmap, vmin=v_clim[0], vmax=v_clim[1], interpolation='nearest', origin='lower' ) xImshowLim = np.array([0, Loc_density.shape[0]]) yImshowLim = np.array([0, Loc_density.shape[1]]) axes[i,j].set_xlim(xImshowLim), axes[i,j].set_ylim(yImshowLim) if bisector == True : axes[i,j].plot( xImshowLim, yImshowLim, lw=1, ls="--", color=SILVER ) if orientation is not None : lreg = compute_gyration_ax(x, y) lreg_Imshow_slope = lreg.slope * ( yImshowLim[1] * Dxlim ) / ( xImshowLim[1] * Dylim ) lreg_Imshow_intercept = ( lreg.intercept + lreg.slope * xlim[0] - ylim[0]) * yImshowLim[1] / Dylim axes[i,j].plot( xImshowLim, lreg_Imshow_intercept + lreg_Imshow_slope * xImshowLim, lw=1, ls = "-", color=orientation ) if Pearson is True : r, p = pearsonr(x, y) label = "r=%.2f" % np.round(r,2) axes[i,j].annotate( label, (0.95, 0.1), xycoords='axes fraction', ha='right', va='center' ) # >>>>>>>>>>>>> # x/y ticks # # >>>>>>>>>>>>> if set_ticks is None : ticks_pos = np.linspace( 0.1, 0.9, 3 ) ticks_x = ticks_pos * Dxlim + xlim[0] ticks_y = ticks_pos * Dylim + ylim[0] else : ticks_x = set_ticks ticks_y = set_ticks ticks_pos = ( set_ticks - xlim[0] ) / Dxlim for ax in axes.flat: # modify frame appearence [ ax.spines[p].set_linewidth(1) for p in ['right','left', 'top','bottom'] ] # y ticks and labels position if ax.is_first_col() or ax.is_last_col() : if ax.is_first_col() : ax.yaxis.set_ticks_position('left') else : ax.yaxis.set_ticks_position('right') if density == True : ax.set_yticks( ticks_pos * bins ) else : ax.set_yticks( ticks_y ) ax.set_yticklabels( np.round(ticks_y,1) ) else : plt.setp(ax.get_yticklabels(), visible=False) ax.tick_params(axis='y', which='both', length=0) # x ticks and labels position if ax.is_first_row() or ax.is_last_row() : if ax.is_first_row() : ax.xaxis.set_ticks_position('top') else : ax.xaxis.set_ticks_position('bottom') if density == True : ax.set_xticks( ticks_pos * bins ) else : ax.set_xticks( ticks_x ) ax.set_xticklabels( np.round(ticks_x,1) ) else : plt.setp(ax.get_xticklabels(), visible=False) ax.tick_params(axis='x', which='both', length=0) # >>>>>>>>>>>>>>> # hide matrix # # >>>>>>>>>>>>>>> for i,j in nega_indices : axes[i,j].axis('off') axes[i,j].set_visible(False) axes[i,j].set_xlabel('') axes[i,j].set_ylabel('') # >>>>>>>>>>>>>>>>>>> # matrix diagonal # # >>>>>>>>>>>>>>>>>>> for i, label in enumerate(names): #[ axes[i,i].spines[p].set_color('white') for p in ['right','left', 'top','bottom'] ] plt.setp(axes[i,i].spines.values(), visible=False) # set spines not visible # minimal labels axes[0,i].xaxis.set_label_position('top') axes[0,i].set_xlabel(label) axes[i,-1].yaxis.set_label_position('right') axes[i,-1].set_ylabel(label) # complete labels axes[-1,i].xaxis.set_label_position('bottom') axes[-1,i].set_xlabel(label) axes[i,0].yaxis.set_label_position('left') axes[i,0].set_ylabel(label) plt.setp(axes[i,i].get_xticklabels(), visible=False) axes[i,i].tick_params(axis='x', which='both', length=0) plt.setp(axes[i,i].get_yticklabels(), visible=False) axes[i,i].tick_params(axis='y', which='both', length=0) if hist == False : # Labels in the diagonal subplots # axes[i,i].annotate( label, (0.5, 0.5), xycoords='axes fraction', ha='center', va='center' ) else : # Plot histograms and set labels # h, b = np.histogram( data[ label ], bins=np.linspace( xlim[0], xlim[-1], bins ) ) axes[i,i].set_yscale("log") axes[i,i].plot( b[:-1] + 0.5 * ( b[1] - b[0] ), h, color=AZURE ) axes[i,i].set_xlim(xlim) if hist == False : for i in [0,-1] : axes[i,i].set_xlabel('') axes[i,i].set_ylabel('') if density is True : return axes, this_im else : return axes ### ######################### # COMPUTE_GYRATION_AX # ######################### from collections import namedtuple def compute_gyration_ax( x, y ) : assert len(x) == len(y) lreg = namedtuple('intercept', 'slope') S = len(x) # baricenter x_A = np.sum(x) / S y_A = np.sum(y) / S # gyration tensor G_xx = np.sum(np.power(x,2)) / S - x_A**2 G_yy = np.sum(np.power(y,2)) / S - y_A**2 G_xy = np.dot(x,y) / S - x_A * y_A # main eigenvalue mu_p = 0.5 * ( G_xx + G_yy + np.sqrt( ( G_xx - G_yy )**2 + 4 * G_xy**2 ) ) # main eigenvector angle tangent tan = ( mu_p - G_xx ) / G_xy # assign results lreg.intercept = y_A - tan * x_A lreg.slope = tan return lreg ### ################ # SAFE_LOG10 # ################ def safe_log10( myarray ) : out = np.empty( np.shape( myarray ) ) out[ : ] = np.nan loc = np.where( myarray > 0 ) out[ loc ] =np.log10( myarray[ loc ] ) return out ###