from __future__ import division
import numpy as np
import scipy
import pandas as pd
import copy

import sklearn

from scipy.cluster import hierarchy

import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

import gc

def calc_DE_mannwhitneyu(X, names1, names2):
    pvalues = []
    medianA = []
    medianB = []
    meanA = []
    meanB = []
    for gene in X.index:
        A = X[names1].loc[gene]
        B = X[names2].loc[gene]
        if np.count_nonzero(A) == 0 and np.count_nonzero(B) == 0:
            pvalues.append(np.nan)
            medianA.append(0)
            medianB.append(0)
            meanA.append(0)
            meanB.append(0)
            continue
        _, pvalue = scipy.stats.mannwhitneyu(A, B)
        pvalues.append(pvalue)
        medianA.append(np.median(A))
        medianB.append(np.median(B))
        meanA.append(np.mean(A))
        meanB.append(np.mean(B))
    df_DE = pd.DataFrame({"pvalue": pvalues, "medianA": medianA, "medianB": medianB,
                          "meanA": meanA, "meanB": meanB}, index=X.index)
    df_DE.sort_values("pvalue", inplace=True)
    df_DE["pvalue_adj"] = df_DE["pvalue"] * df_DE["pvalue"].shape[0]
    return df_DE

def call_hits_dropout(df, dropout_cutoff=1, window_size=500, N=2000, pvalue_cutoff=0.05):

    df = copy.deepcopy(df)

    # Calculate mean
    df["mean"] = np.mean(df, axis=1)
    df.sort_values(by="mean", ascending=False, inplace=True)

    # Calculate dropouts per gene
    df["dropouts"] = np.sum(df.ix[:,:-1] < dropout_cutoff, axis=1) / df.shape[1]

    def f(df, i, window_size):
        """ Get dropout distribution for window around focal gene.
            Calculate empirical p value. """

        i_upper = int(max(0, i - window_size/2))
        i_lower = int(min(df.shape[0], i + window_size/2))

        # Make ensemble of dropout values
        ensemble = []
        for k, (gene_name, row) in enumerate(df.iloc[i_upper:i_lower].iterrows()):
            if k + i_upper != i:
                ensemble.append(row["dropouts"])

        # Get dropouts of focal gene
        myDropouts = df.iloc[i]["dropouts"]

        # What fraction of ensemble is more extreme than focal gene?
        pvalue_empirical = sum(ensemble > myDropouts) / len(ensemble)

        return ensemble, myDropouts, pvalue_empirical

    hits = []

    for i in range(0, N):

        ensemble, myDropouts, pvalue_empirical = f(df, i, window_size)
        geneName = df.iloc[i].name

        if pvalue_empirical < pvalue_cutoff:
            hits.append(geneName)

    return hits

def call_hits_corr(df, dropout_cutoff=1, N=2000, corr_cutoff=0.8, max_canonical_vector=10, canonical_vector_spacing=3):

    df = copy.deepcopy(df)

    def f(df, C):
        """ Calculate correlations with canonical vector C """

        geneNames = []
        corrs = []
        pvalues = []

        for k, (geneName, row) in enumerate(df.iterrows()):

            myRankedExpr = row.sort_values()
            myRankedExpr = np.log10(myRankedExpr + 1)
            myY = myRankedExpr / max(myRankedExpr)
            corr, pvalue = scipy.stats.pearsonr(myY, C)

            geneNames.append(geneName)
            corrs.append(corr)
            pvalues.append(pvalue)

        df_corrs = pd.DataFrame({"geneName":geneNames, "corr":corrs, "pvalue":pvalues})

        return df_corrs

    df["mean_expr"] = np.mean(df, axis=1)
    df = df.sort_values("mean_expr", ascending=False)
    df = df.T.drop("mean_expr").T

    df = df.head(N)
    df[df < dropout_cutoff] = 0

    L = df.shape[1] # length of expr vector (num cells)

    df_corrs = pd.DataFrame()

    for k in range(1, max_canonical_vector, canonical_vector_spacing):

        C = np.zeros(L) # canonical expression profile
        C[-k:] = 1.

        df_myCorr = f(df, C)
        df_corrs = pd.concat([df_corrs, df_myCorr])

    # Filter for highly correlated hits
    df_hits = df_corrs.loc[df_corrs["corr"] > corr_cutoff].sort_values(by="corr", ascending=False)
    df_hits = df_hits.drop_duplicates(subset="geneName")
    hits = list(df_hits["geneName"])

    return hits, df_hits

def get_zscores(df, num_bin=20):

    myMean = np.mean(df, axis=1)
    myVar = np.var(df, axis=1)
    bins = np.linspace(min(myMean), max(myMean), num_bin)

    df["mean"] = myMean
    df["var"] = myVar
    df["mean_bin"] = pd.cut(myMean, bins, right=True, labels=range(1,len(bins)), include_lowest=True)

    for _, group in df.groupby("mean_bin"):

        myDispersion = np.log10(group["var"] / group["mean"])
        myDispersionStd = np.std(myDispersion)

        if myDispersionStd == 0: z_scores = np.zeros(len(group))

        z_scores = (myDispersion - np.mean(myDispersion)) / myDispersionStd
        #group.index.dropna()
        df.loc[group.index.dropna(), "z_score"] = z_scores

    mean = df["mean"]
    z_score = df["z_score"]
    df.drop(["mean", "var", "mean_bin", "z_score"], axis=1, inplace=True) # clean up

    return mean, z_score, df

def corr(df, exclude_max=0):
    # Calculate pairwise correlations between genes (columns of df)
    # Very slow because pandas corr() function is slow (compared to np.corrcoef()).
    if exclude_max > 0:
        for i in range(exclude_max):
            for col, row in enumerate(np.nanargmax(np.array(df), axis=0)):
                df.iloc[row,col] = np.nan
    return df.corr()

def get_correlated_genes(df, seeds, correlation_cutoff, min_hits=1, exclude_max=0):

    correlated_genes = []
    uncorrelated_seeds = []
    allCorrelations = pd.DataFrame(np.corrcoef(df), index=df.index, columns=df.index)

    for seed in seeds:

        myCorrelations = allCorrelations.loc[seed].drop(seed)
        myHits = myCorrelations[np.abs(myCorrelations) > correlation_cutoff]

        if exclude_max > 0:
            # keep only hits that are still hits after excluding maximum expressing sample
            myValidatedHits = []
            for hit in myHits.index:
                corr_excludeMax = np.abs(corr(df.loc[[seed, hit]].T, exclude_max=exclude_max).loc[seed, hit])
                if corr_excludeMax > correlation_cutoff:
                    myValidatedHits.append(hit)
        else:
            myValidatedHits = myHits.index

        if len(myValidatedHits) < min_hits:
            if seed == "SpikeIn1": print ("SpikeIn1 was uncorrelated seed with len(validatedHits)="), len(myValidatedHits)
            uncorrelated_seeds.append(seed)
        else:
            correlated_genes.extend(myValidatedHits)

    return correlated_genes, uncorrelated_seeds

def prune_singleton_correlations(df, product_cutoff=0.9):
    """ Identify genes that are driven by a single cell that highly expresses both """
    notSingletons = []
    singletons = []
    for gene1 in df.index:
        for gene2 in df.index:
            if gene1 == gene2: continue
            A = df.loc[gene1]
            B = df.loc[gene2]
            AB = A*B
            myCutoff = product_cutoff * max(A) * max(B)
            x = sum(AB > myCutoff)
            if x > 1:
                notSingletons.append(gene1)
                break
            singletons.append(gene1)
            return notSingletons, singletons

def filter_genes_overdispersed_correlates_dropouts(X, TFs, CSMs=None, exclude=None, N=50, correlation_cutoff=0.5, min_hits=1, exclude_max=0, dropout_rate_low=0.0, dropout_rate_high=1.0):
    """ Filter for informative genes for cell type identification.
        (1) Drop genes that are not expressed.
        (2) Find overdispersed genes.
        (3) Expand gene set by finding correlated genes.
        (4) Drop genes with no correlates.
        (5) Drop genes with high or low dropout rate. """

    # Drop genes with low max expression
    X = X.loc[np.max(X, axis=1) > 2]

    # Find overdispersed genes
    myDispersion = dispersion(X)
    myDispersion.calc_dispersion(num_bin=20) # calculate overdispersion

    hits_genome = myDispersion.get_hits(N=N)
    hits_TF = myDispersion.get_hits(N=N, candidates=TFs) # get hits among TFs

    if CSMs != None:
        hits_CSM = myDispersion.get_hits(N=N, candidates=CSMs) # get hits among CSMs
        hits = hits_genome.append([hits_TF, hits_CSM]).drop_duplicates().sort_values(ascending=False)
    else:
        hits = hits_genome.append([hits_TF]).drop_duplicates().sort_values(ascending=False)

    if exclude is not None:
        hits = list(set(hits.index) - set(exclude))
    else:
        hits = list(hits.index)

    # Expand gene set by finding correlated genes
    # Remove genes that have no correlates (presumably noise -- they don't belong to a "module")
    if len(hits) > 1000: print ("Warning: calculating correlations between all genes and >1000 hits")
    correlated_genes, uncorrelated_seeds = get_correlated_genes(X, hits, correlation_cutoff=correlation_cutoff, min_hits=min_hits, exclude_max=exclude_max)

    hits_pruned = list(set(list(hits)) - set(list(uncorrelated_seeds)))
    hits_pruned_expanded = list(set(hits_pruned + list(set(correlated_genes))))

    Y = X.loc[hits_pruned_expanded]

    # Filter genes by dropout rate
    dropout_rate = np.sum(Y < 2, axis=1) / Y.shape[1]
    Y = Y.loc[(dropout_rate > dropout_rate_low) & (dropout_rate < dropout_rate_high)]

    return Y

class dispersion():

    def __init__(self, X):
        self.X = X
        self.X["max"] = np.max(self.X, axis=1)
        self.X.sort_values(by="max", ascending=False, inplace=True)
        self.X.drop("max", axis=1, inplace=True)
        self.mean = np.mean(self.X, axis=1)

    def calc_dispersion(self, num_bin=20):
        _, dispersion, _ = get_zscores(self.X, num_bin)
        self.dispersion = dispersion

    def plot(self, ax):
        ax.scatter(self.mean, self.dispersion)
        ax.set_xlabel("Mean expression (log2(CPM+1))")
        ax.set_ylabel("Dispersion (Z-score of log2(variance/mean))")
        ax.set_xlim(left=-0.5)

    def get_hits(self, N = None, dispersion_cutoff = None, mean_cutoff = None, candidates=None):
        if candidates != None:
            # filter genes by candidates
            dispersion_sorted = self.dispersion.loc[candidates].sort_values(ascending=False)
        else:
            dispersion_sorted = self.dispersion.sort_values(ascending=False)
        if N != None:
            # return top N hits
            hits = dispersion_sorted[:N]
        elif dispersion_cutoff != None and mean_cutoff != None:
            # return hits with dispersion and mean greater than cutoffs
            hits = dispersion_sorted.loc[self.X.loc[self.dispersion > dispersion_cutoff].loc[self.mean > mean_cutoff].index].sort_values(ascending=False)
        else:
            print ("Error: N, or dispersion_cutoff and mean_cutoff must be specified")
            hits = None
        return hits

class PCA():

    def __init__(self, X, df, n_components):
        self.X = X
        self.df = df
        self.n_components = n_components

    def pca(self):
        self.pca = sklearn.decomposition.PCA(n_components=self.n_components, random_state=1)
        self.X_pca = self.pca.fit_transform(self.X.T)
        self.loadings = pd.DataFrame(self.pca.components_.T)
        self.loadings.set_index(self.X.index, inplace=True)

    def explained_variance_ratio_(self):
        return self.pca.explained_variance_ratio_

    def plot(self, ax, component_x=0, component_y=1, color_by=None):
        if color_by is not None:
            c = self.df.loc[color_by]
        else:
            c = None
        ax.scatter(self.X_pca[:,component_x], self.X_pca[:,component_y], c=c)
        ax.set_xlabel("PCA " + str(component_x + 1) + " ({0:.2%} variance)".format(self.pca.explained_variance_ratio_[component_x]))
        ax.set_ylabel("PCA " + str(component_y + 1) + " ({0:.2%} variance)".format(self.pca.explained_variance_ratio_[component_y]))
        plt.tight_layout()

    def plot_loadings(self, ax, component=0, num_genes=20):
        myLoadings = self.loadings[component].sort_values(inplace=False, ascending=False)
        plot_data = pd.concat([myLoadings.iloc[0:int(num_genes/2)], myLoadings.iloc[-int(num_genes/2):]]).sort_values(ascending=True)
        ax.barh(range(len(plot_data)), plot_data, align="center")
        ax.set_xlabel("PC " + str(component + 1) + " Loading")
        yticklabels = plot_data.index
        ax.set_yticks(range(len(plot_data)))
        ax.set_yticklabels(yticklabels)
        plt.tight_layout()

    def top_loaded_genes(self, z_score_cutoff=2, max_component=30, plot=False):
        max_component = min(self.n_components, max_component)
        hits = []
        for component in range(0, max_component):
            myLoadings = self.loadings[component].sort_values(inplace=False, ascending=False)
            myMean = np.mean(myLoadings)
            myStd = np.std(myLoadings)
            myZScores = (myLoadings - myMean) / myStd
            myHits = list(myZScores.loc[abs(myZScores) > z_score_cutoff].index)
            hits.extend(myHits)
            if plot:
                plt.hist(myZScores, bins=np.linspace(-10, 10, 100))
                plt.xlabel("Z-score of PCA loading")
                plt.ylabel("Genes")
        hits = list(set(hits))
        return hits

class TSNE():

    def __init__(self, X, df, df_libs, n_components=2):
        self.X = X
        self.df = df
        self.n_components = n_components
        self.df_libs = df_libs

    def calc_TSNE(self, perplexity=30, random_state=0, learning_rate=500.0, early_exaggeration=4.0, metric="correlation", n_iter=1000, method="barnes_hut"):
        if metric=="correlation":
            self.dist = 1-self.X.corr()
            self.dist = np.clip(self.dist, 0.0, max(np.max(self.dist))) # clip negative values to 0 (small negative values can occur due to floating point imprecision)
            # self.D = self.dist / sum(np.sum(self.dist))
            self.D = self.dist
        elif metric=="cosine":
            self.dist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(self.X.T, metric="cosine"))
            self.D = self.dist
        self.TSNE = sklearn.manifold.TSNE(n_components=self.n_components, metric="precomputed",
                                          perplexity=perplexity, early_exaggeration=early_exaggeration,
                                          learning_rate=learning_rate, n_iter=n_iter,
                                          method=method, verbose=2, random_state=random_state)
        self.X_tsne = self.TSNE.fit_transform(self.D)


    def plot(self, fig, ax, colorBy=None, colorMode="gene", **kwargs):

        if colorMode == "gene":

            if colorBy == None:
                print ("Error: colorBy must be a list with 1 or 2 gene names")
                return None

            elif isinstance(colorBy, (bytes, str)):
                singleColor = True
                Z = self.df.loc[colorBy]
                # Z = Z / max(Z) # normalize to max
                C = Z

            elif len(colorBy) == 2:
                singleColor = False
                # c = [[0., 0., 0.] for _ in range(self.X_tsne.shape[0])] # initialize to black
                c = [[1., 1., 1.] for _ in range(self.X_tsne.shape[0])] # initialize to white
                color_tuple_indexes = [0, 2] # choose color channels
                for i, feature in zip(color_tuple_indexes[:len(colorBy[:2])], colorBy[:2]):
                    Z = self.df.loc[feature]
                    Z = Z / max(Z) # normalize to max
                    # Z = Z / 2 + 0.5 # compress range to 0.5 to 1 to make brighter
                    for myColor, myIntensity in zip(c, Z):
                        # myColor[i] = myIntensity
                        myColor[i] = 1. - myIntensity
                C = map(tuple, c)

            elif len(colorBy) > 2:
                print ("Warning: colorBy uses a maximum of 2 colors")

            """
            # set uncolored points to white
            # not relevant when initialized to white
            c_whiteDefault = []
            color_intensity_cutoff = 0.1
            for color in c:
                if ((color[0] < color_intensity_cutoff) and
                    (color[1] < color_intensity_cutoff) and
                    (color[2] < color_intensity_cutoff)):
                    c_whiteDefault.append([1., 1., 1.])
                else:
                    c_whiteDefault.append(color)

            # C = map(tuple, c_whiteDefault)
            """

        elif colorMode == "genotype":

            C = self.df_libs.loc[self.df.columns]["color"]

        elif colorMode == "Tissue.type":

            C = self.df_libs.loc[self.df.columns]["Tissue.type.color"]

        elif colorMode == "custom":

            C = colorBy

        if colorMode == "gene":
            norm = mpl.colors.Normalize(vmin=0, vmax=16)
            sc = ax.scatter(self.X_tsne[:,0], self.X_tsne[:,1], s=30, edgecolor="k", linewidths=0.05, c=C, norm=norm, **kwargs)
        else:
            sc = ax.scatter(self.X_tsne[:,0], self.X_tsne[:,1], s=30, edgecolor="k", linewidths=0.05, c=C, **kwargs)
        ax.set_xlabel("tSNE 1")
        ax.set_ylabel("tSNE 2")

        # make legend
        if colorMode == "gene" and singleColor == False:
            (left, right) = ax.get_xlim() # get current xlim
            for i, feature in zip(color_tuple_indexes, colorBy):
                c = [1., 1., 1.]
                c[i] = 0.
                ax.scatter(max(self.X_tsne[:,0]) + 1e6, 0, s=30, c=c, linewidths=0.05,label=feature)
                ax.set_xlim(left, right)
                ax.legend(loc="center left", bbox_to_anchor=(1.05, 0.5)) # legend outside plot

        # make colorbar
        if colorMode == "gene" and singleColor == True:
            # cbar = fig.colorbar(sc, label="Log2(CPM+1)")
            plt.tight_layout()
            ax.set_aspect("equal")
            # return sc, cbar
            return sc

        plt.tight_layout()
        ax.set_aspect("equal")

        return sc

class hclust:

    def __init__(self, X, df):
        self.X = X
        self.df = df

    def cluster(self, method="average", metric="correlation"):
        pdist = scipy.spatial.distance.pdist(self.X, metric=metric)
        self.row_linkage = hierarchy.linkage(pdist, metric=metric, method=method) # cluster on gene vectors
        pdist = scipy.spatial.distance.pdist(self.X.T, metric=metric)
        pdist[np.isnan(pdist)] = 1.0
        self.col_linkage = hierarchy.linkage(pdist, metric=metric, method=method)

    def plot(self, figsize=(9,9), cmap="YlGnBu_r", **kwargs):
        cm = sns.clustermap(self.X, row_linkage=self.row_linkage, col_linkage=self.col_linkage,
                            figsize=figsize, cmap=cmap, **kwargs)
        plt.setp(cm.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
        return cm

    def get_labels(self, what=None, n_clusters=2):
        if what == "row":
            labels = hierarchy.cut_tree(self.row_linkage, n_clusters)
        elif what == "col":
            labels = hierarchy.cut_tree(self.col_linkage, n_clusters)
        else:
            print ('Error: what must be "row" or "col"')
        return labels

    def cut(self, n_clusters_genes=2, n_clusters_cells=2):
        # Currently only works for clusters = 2

        gene_labels = hierarchy.cut_tree(self.row_linkage, n_clusters_genes)
        cell_labels = hierarchy.cut_tree(self.col_linkage, n_clusters_cells)

        X1 = self.X[self.X.columns[map(bool, 1-cell_labels)]]
        X2 = self.X[self.X.columns[map(bool, cell_labels)]]

        genes1 = list(self.X.T[self.X.T.columns[map(bool, 1-gene_labels)]].T.index)
        genes2 = list(self.X.T[self.X.T.columns[map(bool, gene_labels)]].T.index)

        df1 = self.df[X1.columns]
        df2 = self.df[X2.columns]

        return X1, df1, genes1, X2, df2, genes2

def get_terminal_branch_lengths(Z, max_label):
    """ Get terminal branch lengths of a linkage matrix """
    L = [d for d, label in zip(Z[:,2], Z[:,0]) if label < max_label] + [d for d, label in zip(Z[:,2], Z[:,1]) if label < max_label]
    return L

class ICIM:

    def __init__(self, X, df, TFs, CSMs, exclude, N, correlation_cutoff,
                 min_hits, exclude_max, dropout_rate_low, dropout_rate_high,
                 metric, stop_condition, N_stop, linkage_dist_stop):

        self.df = df

        self.population = {}
        self.population["0"] = X
        self.markers = {}
        self.markers["0"] = []
        self.hclust = {}

        # parameters
        self.TFs = TFs
        self.CSMs = CSMs
        self.exclude = exclude
        self.N = N
        self.correlation_cutoff = correlation_cutoff
        self.min_hits = min_hits
        self.exclude_max = exclude_max
        self.dropout_rate_low = dropout_rate_low
        self.dropout_rate_high = dropout_rate_high
        self.metric = metric
        self.N_stop = N_stop
        self.stop_condition = stop_condition
        self.linkage_dist_stop = linkage_dist_stop

    def step(self, parent, TFs=None, CSMs=None, exclude=None,
             N=None, correlation_cutoff=None, min_hits=None,
             exclude_max=None, dropout_rate_low=None,
             dropout_rate_high=None, metric=None, stop_condition=None,
             N_stop=None, linkage_dist_stop=None, verbose=False):
        # perform one iteration by splitting parent population

        if TFs is None: TFs = self.TFs
        if CSMs is None: CSMs = self.CSMs
        if exclude is None: exclude = self.exclude
        if N is None: N = self.N
        if correlation_cutoff is None: correlation_cutoff = self.correlation_cutoff
        if min_hits is None: min_hits = self.min_hits
        if exclude_max is None: exclude_max = self.exclude_max
        if dropout_rate_low is None: dropout_rate_low = self.dropout_rate_low
        if dropout_rate_high is None: dropout_rate_high = self.dropout_rate_high
        if metric is None: metric = self.metric
        if stop_condition is None: stop_condition = self.stop_condition
        if N_stop is None: N_stop = self.N_stop
        if linkage_dist_stop is None: linkage_dist_stop = self.linkage_dist_stop

        myPop = self.population[parent]
        Y = filter_genes_overdispersed_correlates_dropouts(myPop, TFs, CSMs, exclude,
                                                           N, correlation_cutoff, min_hits, exclude_max,
                                                           dropout_rate_low, dropout_rate_high)
        if verbose:
            print( "Found", Y.shape[0], "genes")

        if Y.shape[0] <= 5: return []

        myClust = hclust(Y, self.df)
        myClust.cluster(method="average", metric=metric)
        _, X1, markerGenes1, _, X2, markerGenes2 = myClust.cut()

        self.hclust[parent] = myClust

        if stop_condition == "linkage_dist":

            Z = myClust.col_linkage
            L_terminal = get_terminal_branch_lengths(Z, myClust.X.shape[1]) # terminal branch lengths
            min_linkage_dist = min(L_terminal) # smallest terminal branch length (most similar to neighbor)

            if min_linkage_dist > linkage_dist_stop:
                # do not keep child populations and markers
                # return empty queue
                if verbose:
                    print ("Failed linkage distance condition. Stopping.")
                return []

        child1 = parent + "0"
        self.population[child1] = X1
        self.markers[child1] = markerGenes1

        child2 = parent + "1"
        self.population[child2] = X2
        self.markers[child2] = markerGenes2

        if verbose:
            print ("Child populations", X1.shape[1], X2.shape[1])

        queue = []

        if stop_condition == "num_cells":

            if X1.shape[1] >= N_stop:
                queue.append(child1)

            if X2.shape[1] >= N_stop:
                queue.append(child2)

        elif stop_condition == "linkage_dist":

            if X1.shape[1] >= 20:
                queue.append(child1)
            if X2.shape[1] >= 20:
                queue.append(child2)

        else:

            print ('Error: stop_condition must be "num_cells" or "linkage_dist"')
            return []

        gc.collect()

        return queue

    def calc(self, skip=[], verbose=False):

        if verbose:
            print ("Initial step")
        queue = []
        queue.extend(self.step("0", verbose=verbose))

        if verbose:
            print

        while len(queue) != 0:

            parent = queue.pop()
            if verbose:
                print (parent)
            myQueue = self.step(parent, verbose=verbose)
            myQueue = list(set(myQueue) - set(skip))
            queue.extend(myQueue)
            if verbose:
                print

        return None

    def get_all_markers(self):
        allMarkers = list(set([item for sublist in self.markers.values() for item in sublist]))
        return allMarkers