Revision 9e336a9ac1c7af0344d8ebac90c1b631688ed4eb authored by colleenmclaughlin on 27 January 2021, 20:15:47 UTC, committed by GitHub on 27 January 2021, 20:15:47 UTC
1 parent 1138cf3
sct_py3.py
``````from __future__ import division
import numpy as np
import scipy
import scipy.stats
import scipy.spatial.distance
import pandas as pd
import copy

import sklearn
import sklearn.decomposition
import sklearn.manifold

from scipy.cluster import hierarchy

import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

import gc

def calc_DE_mannwhitneyu(X, names1, names2):
    pvalues = []
    medianA = []
    medianB = []
    meanA = []
    meanB = []
    for gene in X.index:
        A = X[names1].loc[gene]
        B = X[names2].loc[gene]
        if np.count_nonzero(A) == 0 and np.count_nonzero(B) == 0:
            pvalues.append(np.nan)
            medianA.append(0)
            medianB.append(0)
            meanA.append(0)
            meanB.append(0)
            continue
        _, pvalue = scipy.stats.mannwhitneyu(A, B)
        pvalues.append(pvalue)
        medianA.append(np.median(A))
        medianB.append(np.median(B))
        meanA.append(np.mean(A))
        meanB.append(np.mean(B))
    df_DE = pd.DataFrame({"pvalue": pvalues, "medianA": medianA, "medianB": medianB,
                          "meanA": meanA, "meanB": meanB}, index=X.index)
    df_DE.sort_values("pvalue", inplace=True)
    return df_DE
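
# Usage sketch (hypothetical data, not from this file): X is a genes x cells
# expression DataFrame; names1/names2 are lists of cell column names.
#   df_DE = calc_DE_mannwhitneyu(X, names1=groupA_cells, names2=groupB_cells)
#   print(df_DE.head(10))  # most significant genes first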

def call_hits_dropout(df, dropout_cutoff=1, window_size=500, N=2000, pvalue_cutoff=0.05):

    df = copy.deepcopy(df)

    # Calculate mean
    df["mean"] = np.mean(df, axis=1)
    df.sort_values(by="mean", ascending=False, inplace=True)

    # Calculate dropouts per gene (exclude the appended "mean" column from both count and denominator)
    df["dropouts"] = np.sum(df.iloc[:, :-1] < dropout_cutoff, axis=1) / (df.shape[1] - 1)

    def f(df, i, window_size):
        """ Get dropout distribution for window around focal gene.
        Calculate empirical p value. """

        i_upper = int(max(0, i - window_size/2))
        i_lower = int(min(df.shape[0], i + window_size/2))

        # Make ensemble of dropout values
        ensemble = []
        for k, (gene_name, row) in enumerate(df.iloc[i_upper:i_lower].iterrows()):
            if k + i_upper != i:
                ensemble.append(row["dropouts"])

        # Get dropouts of focal gene
        myDropouts = df.iloc[i]["dropouts"]

        # What fraction of ensemble is more extreme than focal gene?
        pvalue_empirical = np.sum(np.array(ensemble) > myDropouts) / len(ensemble)

        return ensemble, myDropouts, pvalue_empirical

    hits = []

    for i in range(0, N):

        ensemble, myDropouts, pvalue_empirical = f(df, i, window_size)
        geneName = df.iloc[i].name

        if pvalue_empirical < pvalue_cutoff:
            hits.append(geneName)

    return hits
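
# Usage sketch (hypothetical data): call genes whose dropout rate is unusual
# relative to the genes of similar mean expression around them.
#   hits = call_hits_dropout(X, dropout_cutoff=1, window_size=500, N=2000)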

def call_hits_corr(df, dropout_cutoff=1, N=2000, corr_cutoff=0.8, max_canonical_vector=10, canonical_vector_spacing=3):

    df = copy.deepcopy(df)

    def f(df, C):
        """ Calculate correlations with canonical vector C """

        geneNames = []
        corrs = []
        pvalues = []

        for k, (geneName, row) in enumerate(df.iterrows()):

            myRankedExpr = row.sort_values()
            myRankedExpr = np.log10(myRankedExpr + 1)
            if max(myRankedExpr) == 0:
                continue  # skip genes with no expression above cutoff (avoids division by zero)
            myY = myRankedExpr / max(myRankedExpr)
            corr, pvalue = scipy.stats.pearsonr(myY, C)

            geneNames.append(geneName)
            corrs.append(corr)
            pvalues.append(pvalue)

        df_corrs = pd.DataFrame({"geneName": geneNames, "corr": corrs, "pvalue": pvalues})

        return df_corrs

    df["mean_expr"] = np.mean(df, axis=1)
    df = df.sort_values("mean_expr", ascending=False)
    df = df.drop("mean_expr", axis=1)

    df[df < dropout_cutoff] = 0

    L = df.shape[1] # length of expr vector (num cells)

    df_corrs = pd.DataFrame()

    for k in range(1, max_canonical_vector, canonical_vector_spacing):

        C = np.zeros(L) # canonical expression profile
        C[-k:] = 1.

        df_myCorr = f(df, C)
        df_corrs = pd.concat([df_corrs, df_myCorr])

    # Filter for highly correlated hits
    df_hits = df_corrs.loc[df_corrs["corr"] > corr_cutoff].sort_values(by="corr", ascending=False)
    df_hits = df_hits.drop_duplicates(subset="geneName")
    hits = list(df_hits["geneName"])

    return hits, df_hits
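
# Usage sketch (hypothetical data): find genes whose sorted expression profile
# resembles a step function, i.e. expressed in only the top k cells.
#   hits, df_hits = call_hits_corr(X, dropout_cutoff=1, corr_cutoff=0.8)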

def get_zscores(df, num_bin=20):

    myMean = np.mean(df, axis=1)
    myVar = np.var(df, axis=1)
    bins = np.linspace(min(myMean), max(myMean), num_bin)

    df["mean"] = myMean
    df["var"] = myVar
    df["mean_bin"] = pd.cut(myMean, bins, right=True, labels=range(1, len(bins)), include_lowest=True)

    for _, group in df.groupby("mean_bin"):

        myDispersion = np.log10(group["var"] / group["mean"])
        myDispersionStd = np.std(myDispersion)

        if myDispersionStd == 0:
            z_scores = np.zeros(len(group))
        else:
            z_scores = (myDispersion - np.mean(myDispersion)) / myDispersionStd

        df.loc[group.index.dropna(), "z_score"] = z_scores

    mean = df["mean"]
    z_score = df["z_score"]
    df.drop(["mean", "var", "mean_bin", "z_score"], axis=1, inplace=True) # clean up

    return mean, z_score, df
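
# Usage sketch (hypothetical data): binned dispersion z-scores for variable-gene
# selection. Genes are binned by mean expression and log10(variance/mean) is
# z-scored within each bin.
#   mean, z_score, _ = get_zscores(X.copy(), num_bin=20)
#   variable_genes = z_score.loc[z_score > 2].index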

def corr(df, exclude_max=0):
    # Calculate pairwise correlations between genes (columns of df)
    # Very slow because pandas corr() function is slow (compared to np.corrcoef()).
    df = df.copy()  # avoid mutating the caller's DataFrame when masking maxima
    if exclude_max > 0:
        for i in range(exclude_max):
            for col, row in enumerate(np.nanargmax(np.array(df), axis=0)):
                df.iloc[row, col] = np.nan
    return df.corr()

def get_correlated_genes(df, seeds, correlation_cutoff, min_hits=1, exclude_max=0):

    correlated_genes = []
    uncorrelated_seeds = []
    allCorrelations = pd.DataFrame(np.corrcoef(df), index=df.index, columns=df.index)

    for seed in seeds:

        myCorrelations = allCorrelations.loc[seed].drop(seed)
        myHits = myCorrelations[np.abs(myCorrelations) > correlation_cutoff]

        if exclude_max > 0:
            # keep only hits that are still hits after excluding maximum expressing sample
            myValidatedHits = []
            for hit in myHits.index:
                corr_excludeMax = np.abs(corr(df.loc[[seed, hit]].T, exclude_max=exclude_max).loc[seed, hit])
                if corr_excludeMax > correlation_cutoff:
                    myValidatedHits.append(hit)
        else:
            myValidatedHits = myHits.index

        if len(myValidatedHits) < min_hits:
            if seed == "SpikeIn1":
                print("SpikeIn1 was uncorrelated seed with len(validatedHits)=", len(myValidatedHits))
            uncorrelated_seeds.append(seed)
        else:
            correlated_genes.extend(myValidatedHits)

    return correlated_genes, uncorrelated_seeds
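
# Usage sketch (hypothetical data): expand a set of seed genes by correlation
# and flag seeds with no correlated partners.
#   correlated, uncorrelated = get_correlated_genes(X, seeds=my_seed_genes,
#                                                   correlation_cutoff=0.5)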

def prune_singleton_correlations(df, product_cutoff=0.9):
    """ Identify genes that are driven by a single cell that highly expresses both """
    notSingletons = []
    singletons = []
    for gene1 in df.index:
        for gene2 in df.index:
            if gene1 == gene2: continue
            A = df.loc[gene1]
            B = df.loc[gene2]
            AB = A * B
            myCutoff = product_cutoff * max(A) * max(B)
            x = sum(AB > myCutoff)
            if x > 1:
                notSingletons.append(gene1)
                break
        else:
            # no partner gene found: gene1's correlations rest on a single cell
            singletons.append(gene1)
    return notSingletons, singletons
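
# Usage sketch (hypothetical data): separate genes whose pairwise correlations
# are driven by one shared high-expressing cell from genuinely co-expressed genes.
#   keep, drop = prune_singleton_correlations(X.loc[candidate_genes])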

def filter_genes_overdispersed_correlates_dropouts(X, TFs, CSMs=None, exclude=None, N=50, correlation_cutoff=0.5, min_hits=1, exclude_max=0, dropout_rate_low=0.0, dropout_rate_high=1.0):
    """ Filter for informative genes for cell type identification.
    (1) Drop genes that are not expressed.
    (2) Find overdispersed genes.
    (3) Expand gene set by finding correlated genes.
    (4) Drop genes with no correlates.
    (5) Drop genes with high or low dropout rate. """

    # Drop genes with low max expression
    X = X.loc[np.max(X, axis=1) > 2]

    # Find overdispersed genes
    myDispersion = dispersion(X)
    myDispersion.calc_dispersion(num_bin=20) # calculate overdispersion

    hits_genome = myDispersion.get_hits(N=N)
    hits_TF = myDispersion.get_hits(N=N, candidates=TFs) # get hits among TFs

    if CSMs is not None:
        hits_CSM = myDispersion.get_hits(N=N, candidates=CSMs) # get hits among CSMs
        hits = pd.concat([hits_genome, hits_TF, hits_CSM]).drop_duplicates().sort_values(ascending=False)
    else:
        hits = pd.concat([hits_genome, hits_TF]).drop_duplicates().sort_values(ascending=False)

    if exclude is not None:
        hits = list(set(hits.index) - set(exclude))
    else:
        hits = list(hits.index)

    # Expand gene set by finding correlated genes
    # Remove genes that have no correlates (presumably noise -- they don't belong to a "module")
    if len(hits) > 1000: print("Warning: calculating correlations between all genes and >1000 hits")
    correlated_genes, uncorrelated_seeds = get_correlated_genes(X, hits, correlation_cutoff=correlation_cutoff, min_hits=min_hits, exclude_max=exclude_max)

    hits_pruned = list(set(hits) - set(uncorrelated_seeds))
    hits_pruned_expanded = list(set(hits_pruned + list(correlated_genes)))

    Y = X.loc[hits_pruned_expanded]

    # Filter genes by dropout rate
    dropout_rate = np.sum(Y < 2, axis=1) / Y.shape[1]
    Y = Y.loc[(dropout_rate > dropout_rate_low) & (dropout_rate < dropout_rate_high)]

    return Y
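
# Usage sketch (hypothetical data): select informative genes for clustering.
# TFs/CSMs are lists of transcription factor / cell surface marker gene names.
#   Y = filter_genes_overdispersed_correlates_dropouts(X, TFs=my_TFs, CSMs=my_CSMs,
#                                                      N=50, correlation_cutoff=0.5)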

class dispersion():

    def __init__(self, X):
        self.X = X
        self.X["max"] = np.max(self.X, axis=1)
        self.X.sort_values(by="max", ascending=False, inplace=True)
        self.X.drop("max", axis=1, inplace=True)
        self.mean = np.mean(self.X, axis=1)

    def calc_dispersion(self, num_bin=20):
        _, dispersion, _ = get_zscores(self.X, num_bin)
        self.dispersion = dispersion

    def plot(self, ax):
        ax.scatter(self.mean, self.dispersion)
        ax.set_xlabel("Mean expression (log2(CPM+1))")
        ax.set_ylabel("Dispersion (Z-score of log2(variance/mean))")
        ax.set_xlim(left=-0.5)

    def get_hits(self, N=None, dispersion_cutoff=None, mean_cutoff=None, candidates=None):
        if candidates is not None:
            # filter genes by candidates (restrict to those present in the index)
            candidates = self.dispersion.index.intersection(candidates)
            dispersion_sorted = self.dispersion.loc[candidates].sort_values(ascending=False)
        else:
            dispersion_sorted = self.dispersion.sort_values(ascending=False)
        if N is not None:
            hits = dispersion_sorted[:N]
        elif dispersion_cutoff is not None and mean_cutoff is not None:
            # return hits with dispersion and mean greater than cutoffs
            hits = dispersion_sorted.loc[self.X.loc[self.dispersion > dispersion_cutoff].loc[self.mean > mean_cutoff].index].sort_values(ascending=False)
        else:
            print("Error: N, or dispersion_cutoff and mean_cutoff must be specified")
            hits = None
        return hits
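
# Usage sketch (hypothetical data): rank genes by overdispersion.
#   d = dispersion(X.copy())
#   d.calc_dispersion(num_bin=20)
#   top50 = d.get_hits(N=50)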

class PCA():

    def __init__(self, X, df, n_components):
        self.X = X
        self.df = df
        self.n_components = n_components

    def pca(self):
        # store the fitted estimator under its own name so it does not shadow this method
        self.pca_model = sklearn.decomposition.PCA(n_components=self.n_components, random_state=1)
        self.X_pca = self.pca_model.fit_transform(self.X.T)

    def explained_variance_ratio_(self):
        return self.pca_model.explained_variance_ratio_

    def plot(self, ax, component_x=0, component_y=1, color_by=None):
        if color_by is not None:
            c = self.df.loc[color_by]
        else:
            c = None
        ax.scatter(self.X_pca[:, component_x], self.X_pca[:, component_y], c=c)
        ax.set_xlabel("PCA " + str(component_x + 1) + " ({0:.2%} variance)".format(self.pca_model.explained_variance_ratio_[component_x]))
        ax.set_ylabel("PCA " + str(component_y + 1) + " ({0:.2%} variance)".format(self.pca_model.explained_variance_ratio_[component_y]))
        plt.tight_layout()

    def plot_loadings(self, ax, plot_data):
        # NOTE: the original method header was lost in this copy; this
        # reconstruction assumes plot_data is a labeled Series (e.g. the top
        # loadings of one component).
        ax.barh(range(len(plot_data)), plot_data, align="center")
        yticklabels = plot_data.index
        ax.set_yticks(range(len(plot_data)))
        ax.set_yticklabels(yticklabels)
        plt.tight_layout()

    def get_hits(self, z_score_cutoff=2, max_component=10, plot=False):
        # NOTE: the original method header and the computation of myZScores
        # were lost in this copy; this reconstruction assumes loadings are
        # z-scored within each component (signature and defaults are assumed).
        max_component = min(self.n_components, max_component)
        hits = []
        for component in range(0, max_component):
            loadings = pd.Series(self.pca_model.components_[component], index=self.X.index)
            myZScores = (loadings - np.mean(loadings)) / np.std(loadings)
            myHits = list(myZScores.loc[abs(myZScores) > z_score_cutoff].index)
            hits.extend(myHits)
            if plot:
                plt.hist(myZScores, bins=np.linspace(-10, 10, 100))
                plt.ylabel("Genes")
        hits = list(set(hits))
        return hits
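
# Usage sketch (hypothetical data): project cells into PCA space and pick
# genes with extreme loadings.
#   p = PCA(Y, X, n_components=10)
#   p.pca()
#   pca_hit_genes = p.get_hits(z_score_cutoff=2)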

class TSNE():

    def __init__(self, X, df, df_libs, n_components=2):
        self.X = X
        self.df = df
        self.n_components = n_components
        self.df_libs = df_libs

    def calc_TSNE(self, perplexity=30, random_state=0, learning_rate=500.0, early_exaggeration=4.0, metric="correlation", n_iter=1000, method="barnes_hut"):
        if metric == "correlation":
            self.dist = 1 - self.X.corr()
            self.dist = np.clip(self.dist, 0.0, None) # clip negative values to 0 (small negative values can occur due to floating point imprecision)
            # self.D = self.dist / sum(np.sum(self.dist))
            self.D = self.dist
        elif metric == "cosine":
            self.dist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(self.X.T, metric="cosine"))
            self.D = self.dist
        self.TSNE = sklearn.manifold.TSNE(n_components=self.n_components, metric="precomputed",
                                          perplexity=perplexity, early_exaggeration=early_exaggeration,
                                          learning_rate=learning_rate, n_iter=n_iter,
                                          method=method, verbose=2, random_state=random_state)
        self.X_tsne = self.TSNE.fit_transform(self.D)

    def plot(self, fig, ax, colorBy=None, colorMode="gene", **kwargs):

        if colorMode == "gene":

            if colorBy is None:
                print("Error: colorBy must be a gene name or a list of 1 or 2 gene names")
                return None

            elif isinstance(colorBy, (bytes, str)):
                singleColor = True
                Z = self.df.loc[colorBy]
                # Z = Z / max(Z) # normalize to max
                C = Z

            else:
                if len(colorBy) > 2:
                    print("Warning: colorBy uses a maximum of 2 colors")
                singleColor = False
                # c = [[0., 0., 0.] for _ in range(self.X_tsne.shape[0])] # initialize to black
                c = [[1., 1., 1.] for _ in range(self.X_tsne.shape[0])] # initialize to white
                color_tuple_indexes = [0, 2] # choose color channels
                for i, feature in zip(color_tuple_indexes[:len(colorBy[:2])], colorBy[:2]):
                    Z = self.df.loc[feature]
                    Z = Z / max(Z) # normalize to max
                    # Z = Z / 2 + 0.5 # compress range to 0.5 to 1 to make brighter
                    for myColor, myIntensity in zip(c, Z):
                        # myColor[i] = myIntensity
                        myColor[i] = 1. - myIntensity
                C = [tuple(color) for color in c] # materialize (map() is lazy in Python 3)

            """
            # set uncolored points to white
            # not relevant when initialized to white
            c_whiteDefault = []
            color_intensity_cutoff = 0.1
            for color in c:
                if ((color[0] < color_intensity_cutoff) and
                    (color[1] < color_intensity_cutoff) and
                    (color[2] < color_intensity_cutoff)):
                    c_whiteDefault.append([1., 1., 1.])
                else:
                    c_whiteDefault.append(color)

            # C = map(tuple, c_whiteDefault)
            """

        elif colorMode == "genotype":

            C = self.df_libs.loc[self.df.columns]["color"]

        elif colorMode == "Tissue.type":

            C = self.df_libs.loc[self.df.columns]["Tissue.type.color"]

        elif colorMode == "custom":

            C = colorBy

        if colorMode == "gene":
            norm = mpl.colors.Normalize(vmin=0, vmax=16)
            sc = ax.scatter(self.X_tsne[:, 0], self.X_tsne[:, 1], s=30, edgecolor="k", linewidths=0.05, c=C, norm=norm, **kwargs)
        else:
            sc = ax.scatter(self.X_tsne[:, 0], self.X_tsne[:, 1], s=30, edgecolor="k", linewidths=0.05, c=C, **kwargs)
        ax.set_xlabel("tSNE 1")
        ax.set_ylabel("tSNE 2")

        # make legend
        if colorMode == "gene" and not singleColor:
            (left, right) = ax.get_xlim() # get current xlim
            for i, feature in zip(color_tuple_indexes, colorBy):
                c = [1., 1., 1.]
                c[i] = 0.
                ax.scatter(max(self.X_tsne[:, 0]) + 1e6, 0, s=30, c=[tuple(c)], linewidths=0.05, label=feature)
            ax.set_xlim(left, right)
            ax.legend(loc="center left", bbox_to_anchor=(1.05, 0.5)) # legend outside plot

        # make colorbar
        if colorMode == "gene" and singleColor:
            # cbar = fig.colorbar(sc, label="Log2(CPM+1)")
            plt.tight_layout()
            ax.set_aspect("equal")
            # return sc, cbar
            return sc

        plt.tight_layout()
        ax.set_aspect("equal")

        return sc
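
# Usage sketch (hypothetical data): embed cells with tSNE on a precomputed
# correlation distance and color by one or two marker genes.
#   ts = TSNE(Y, X, df_libs)
#   ts.calc_TSNE(perplexity=30)
#   fig, ax = plt.subplots()
#   ts.plot(fig, ax, colorBy=["Gene1", "Gene2"])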

class hclust:

    def __init__(self, X, df):
        self.X = X
        self.df = df

    def cluster(self, method="average", metric="correlation"):
        # Pairwise distances between genes (rows) and between cells (columns)
        row_pdist = scipy.spatial.distance.pdist(self.X, metric=metric)
        col_pdist = scipy.spatial.distance.pdist(self.X.T, metric=metric)
        # undefined distances (e.g. zero-variance vectors under "correlation") -> max distance
        row_pdist[np.isnan(row_pdist)] = 1.0
        col_pdist[np.isnan(col_pdist)] = 1.0
        self.row_linkage = hierarchy.linkage(row_pdist, method=method)
        self.col_linkage = hierarchy.linkage(col_pdist, method=method)

    def plot(self, figsize=(9, 9), cmap="YlGnBu_r", **kwargs):
        # NOTE: the clustermap call was truncated in this copy; this
        # reconstruction draws a seaborn clustermap from the precomputed linkages.
        cm = sns.clustermap(self.X, row_linkage=self.row_linkage, col_linkage=self.col_linkage,
                            figsize=figsize, cmap=cmap, **kwargs)
        plt.setp(cm.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
        return cm

    def get_labels(self, what=None, n_clusters=2):
        # NOTE: the branch bodies were lost in this copy; this reconstruction
        # cuts the linkage trees into n_clusters flat clusters.
        if what == "row":
            labels = hierarchy.fcluster(self.row_linkage, n_clusters, criterion="maxclust")
        elif what == "col":
            labels = hierarchy.fcluster(self.col_linkage, n_clusters, criterion="maxclust")
        else:
            print('Error: what must be "row" or "col"')
            labels = None
        return labels

    def cut(self, n_clusters_genes=2, n_clusters_cells=2):
        # Currently only works for clusters = 2
        # fcluster labels are 1-based; shift to {0, 1} for masking
        gene_labels = self.get_labels(what="row", n_clusters=n_clusters_genes) - 1
        cell_labels = self.get_labels(what="col", n_clusters=n_clusters_cells) - 1

        X1 = self.X[self.X.columns[cell_labels == 0]]
        X2 = self.X[self.X.columns[cell_labels == 1]]

        genes1 = list(self.X.index[gene_labels == 0])
        genes2 = list(self.X.index[gene_labels == 1])

        df1 = self.df[X1.columns]
        df2 = self.df[X2.columns]

        return X1, df1, genes1, X2, df2, genes2
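
# Usage sketch (hypothetical data): bicluster a filtered expression matrix and
# split it into two cell populations with their marker genes.
#   hc = hclust(Y, X)
#   hc.cluster(method="average", metric="correlation")
#   X1, df1, genes1, X2, df2, genes2 = hc.cut()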

def get_terminal_branch_lengths(Z, max_label):
    """ Get terminal branch lengths of a linkage matrix """
    L = ([d for d, label in zip(Z[:, 2], Z[:, 0]) if label < max_label]
         + [d for d, label in zip(Z[:, 2], Z[:, 1]) if label < max_label])
    return L

class ICIM:

    def __init__(self, X, df, TFs, CSMs, exclude, N, correlation_cutoff,
                 min_hits, exclude_max, dropout_rate_low, dropout_rate_high,
                 metric, N_stop, stop_condition):
        # NOTE: the tail of this signature was truncated in this copy; the
        # last three parameters are reconstructed from the attributes set below.

        self.df = df

        self.population = {}
        self.population["0"] = X
        self.markers = {}
        self.markers["0"] = []
        self.hclust = {}

        # parameters
        self.TFs = TFs
        self.CSMs = CSMs
        self.exclude = exclude
        self.N = N
        self.correlation_cutoff = correlation_cutoff
        self.min_hits = min_hits
        self.exclude_max = exclude_max
        self.dropout_rate_low = dropout_rate_low
        self.dropout_rate_high = dropout_rate_high
        self.metric = metric
        self.N_stop = N_stop
        self.stop_condition = stop_condition

    def step(self, parent, TFs=None, CSMs=None, exclude=None,
             N=None, correlation_cutoff=None, min_hits=None,
             exclude_max=None, dropout_rate_low=None,
             dropout_rate_high=None, metric=None, stop_condition=None,
             N_stop=None, verbose=False):
        # perform one iteration by splitting parent population
        # NOTE: the tail of this signature was truncated in this copy; N_stop
        # and verbose are reconstructed from their uses below.

        if TFs is None: TFs = self.TFs
        if CSMs is None: CSMs = self.CSMs
        if exclude is None: exclude = self.exclude
        if N is None: N = self.N
        if correlation_cutoff is None: correlation_cutoff = self.correlation_cutoff
        if min_hits is None: min_hits = self.min_hits
        if exclude_max is None: exclude_max = self.exclude_max
        if dropout_rate_low is None: dropout_rate_low = self.dropout_rate_low
        if dropout_rate_high is None: dropout_rate_high = self.dropout_rate_high
        if metric is None: metric = self.metric
        if stop_condition is None: stop_condition = self.stop_condition
        if N_stop is None: N_stop = self.N_stop

        myPop = self.population[parent]
        Y = filter_genes_overdispersed_correlates_dropouts(myPop, TFs, CSMs, exclude,
                                                           N, correlation_cutoff, min_hits, exclude_max,
                                                           dropout_rate_low, dropout_rate_high)
        if verbose:
            print("Found", Y.shape[0], "genes")

        if Y.shape[0] <= 5: return []

        myClust = hclust(Y, self.df)
        myClust.cluster(method="average", metric=metric)
        _, X1, markerGenes1, _, X2, markerGenes2 = myClust.cut()

        self.hclust[parent] = myClust

        L_terminal = get_terminal_branch_lengths(myClust.col_linkage, myClust.X.shape[1]) # terminal branch lengths of the cell tree
        min_linkage_dist = min(L_terminal) # smallest terminal branch length (most similar to neighbor)

        # NOTE: the linkage-distance test was truncated in this copy; this
        # reconstruction stops when even the most similar cells are distant
        # (the 0.5 cutoff is assumed, not from the original).
        if stop_condition == "linkage_dist" and min_linkage_dist > 0.5:
            # do not keep child populations and markers
            # return empty queue
            if verbose:
                print("Failed linkage distance condition. Stopping.")
            return []

        child1 = parent + "0"
        self.population[child1] = X1
        self.markers[child1] = markerGenes1

        child2 = parent + "1"
        self.population[child2] = X2
        self.markers[child2] = markerGenes2

        if verbose:
            print("Child populations", X1.shape[1], X2.shape[1])

        queue = []

        if stop_condition == "num_cells":

            if X1.shape[1] >= N_stop:
                queue.append(child1)

            if X2.shape[1] >= N_stop:
                queue.append(child2)

        elif stop_condition == "linkage_dist":
            # NOTE: this branch header was lost in this copy (reconstructed);
            # children are queued if they contain at least 20 cells.
            if X1.shape[1] >= 20:
                queue.append(child1)
            if X2.shape[1] >= 20:
                queue.append(child2)

        else:

            print('Error: stop_condition must be "num_cells" or "linkage_dist"')
            return []

        gc.collect()

        return queue

    def calc(self, skip=[], verbose=False):

        if verbose:
            print("Initial step")
        queue = []
        queue.extend(self.step("0", verbose=verbose))

        if verbose:
            print()

        while len(queue) != 0:

            parent = queue.pop()
            if verbose:
                print(parent)
            myQueue = self.step(parent, verbose=verbose)
            myQueue = list(set(myQueue) - set(skip))
            queue.extend(myQueue)
            if verbose:
                print()

        return None

    def get_all_markers(self):
        allMarkers = list(set([item for sublist in self.markers.values() for item in sublist]))
        return allMarkers
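
# Usage sketch (hypothetical data): iteratively split a cell population and
# collect the marker genes found at each split. All inputs are assumed:
# X (genes x cells), TFs/CSMs (gene lists), exclude (genes to ignore).
#   icim = ICIM(X, X, TFs=my_TFs, CSMs=my_CSMs, exclude=[], N=50,
#               correlation_cutoff=0.5, min_hits=2, exclude_max=1,
#               dropout_rate_low=0.1, dropout_rate_high=0.9,
#               metric="correlation", N_stop=20, stop_condition="num_cells")
#   icim.calc(verbose=True)
#   markers = icim.get_all_markers()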
``````
