https://github.com/stephenfloor/tripseq-analysis
Tip revision: 3e823abcca5b8c1e5e89dd9bd4c49e8673b3e957 authored by Stephen Floor on 24 June 2017, 00:52:49 UTC
email update
email update
Tip revision: 3e823ab
compare_tripseq_clusters.py
#!/usr/bin/env python
# compare_tripseq_clusters.py
# Compare input lists of clusters against each other searching for genes that have transcripts in the different clusters.
# Intended to be applied to fractionated polysome sequencing data (a la tripseq) but theoretically can work for any data involving transcript-specific distributions
# Input format can be either averaged or replicates listed side-by-side; units are arbitrary (counts, fpkm, vsd, rlog, etc)
# SNF Winter 2014/2015
import sys, os, argparse, csv, random
import GTF
from Transcript import *
from SNFUtils import * # prompt, safe_open_file, stdout_from_command
from collections import defaultdict, namedtuple
from scipy.stats.stats import pearsonr, spearmanr, kurtosis
from scipy.optimize import curve_fit
from scipy.stats import mannwhitneyu, ks_2samp
import numpy as np
import matplotlib.pyplot as plt
from itertools import izip_longest
def poly3(x, p1, p2, p3, p4):
return p1 + p2 * x + p3 * x**2 + p4 * x**3
parser = argparse.ArgumentParser(description="Compare transcript-specific distributions at the gene level")
#parser.add_argument("-i", "--input", help="File containing transcript expression values", required=True)
parser.add_argument("--set1", help="Files and IDs containing transcript distributions; compare between set1 and set2", metavar="FNAME ID ...", nargs="+", required=True)
parser.add_argument("--set2", help="Files and IDs containing transcript distributions; compare between set1 and set2", metavar="FNAME ID ...", nargs="+", required=True)
parser.add_argument("--tx-to-gene", help="Mapping between transcript ID in input file and gene ID", required=True)
parser.add_argument("-o", "--output", help="Output filename (default is stdout)", default="stdout")
parser.add_argument("-n", "--nrep", help="Number of replicates of each point", required=True, type=int)
parser.add_argument("--txome-props", help="List of files with transcriptome properties to correlate among (wildcards ok)", nargs="+", default=None)
parser.add_argument("--control", help="Perform randomized comparisons of input transcripts as a control.", action="store_true")
parser.add_argument("--txome-gtf", help="Path to transcriptome GTF", required=True)
parser.add_argument("--strip-id", help="Strip anything following the final _ off the transcript ID in input", action="store_true")
parser.add_argument("--log", help="Plot eCDF on log scale (x-axis)", action="store_true")
# below not implemented yet
#parser.add_argument("--seq", help="Output sequence of the region(s) different between transcripts", action="store_true")
#parser.add_argument("-g", "--genome", help="Path to the genome file (required for --seq")
args = parser.parse_args()
print "------------------------------------"
print "| compare_tripseq_clusters.py |"
print "| extract differential transcripts |"
print "| run with -h for help |"
print "| snf 1/2015 |"
print "------------------------------------\n\n"
print "Arguments:"
print args
# sanity check inputs...
if (len(args.set1) % 2 or len(args.set2) % 2):
sys.exit("FATAL: --set1 and --set2 must be lists of file id file id ...")
if (args.control):
print "\nPerforming a comparsion between two randomized subsets of the data as a control"
if (args.output == "stdout"):
outfile = sys.stdout
else:
outfile = safe_open_file(args.output)
# First, create a dictionary mapping from transcript ID to gene ID by reading in args.tx_to_gene
# - optionally, also save the transcript's gene name and biotype if they are present in columns 3 and 4 in the input
tx_to_gene = defaultdict(str)
tx_to_name = defaultdict(str)
tx_to_biotype = defaultdict(str)
processed = 0
print "\nCreating tx_to_gene dictionary from %s..." % args.tx_to_gene
with open(args.tx_to_gene, "r") as txfile:
for line in txfile:
line = line.split()
tx_to_gene[line[0]] = line[1]
if (len(line) > 2):
tx_to_name[line[0]] = line[2]
if (len(line) > 3):
tx_to_biotype[line[0]] = line[3]
processed += 1
print "Dictionary created (%d entries)." % processed
print "\nCreating transcript dictionary from %s (currently commented out)" % args.txome_gtf
txDict = defaultdict(list)
processed = 0
# COMMENTED CODE builds a transcript dictionary to extract transcripts with specific differences (i.e. 5' utr changes, etc). this takes a bit, so commented for now.
# the issue here is that lines for various transcripts may be interleaved, so can either create lots of objects, or a giant dict. opted for giant dict.
# for line in GTF.lines(args.txome_gtf):
# txDict[line["transcript_id"]].append(line)
# processed += 1
# if (not processed % 25000):
# print "\tProcessed %d lines..." % processed
# print "Dictionary built (%d entries)." % processed
# print "Creating transcript objects."
# processed = 0
# # now create a Transcript object for each transcript and output it
# for key in txDict:
# tx = createGTFTranscript(txDict[key])
# txDict[key] = tx
# #---------------- save to a new dict and/or overwrite old one here
# #print tx
# #writeOutput(tx)
# processed += 1
# if (not processed % 2500):
# print "\tProcessed %d entries..." % processed
print "Dictionary built (%d entries)." % processed
# sample input format:
# -- first column must be transcript ID; quotes are optional
# -- header line below is optional and detected by a non-numeric second column
# "ensid" "X80S" "poly2" "poly3" "poly4" "poly5" "poly6" "poly7" "poly8"
#"ENST00000000233" -0.562625014929129 -2.23696624248826 -0.765273156804401 0.702668471793972 1.36603995862049 1.02457718119074 0.751424784189265 -0.279845981572665
#"ENST00000000412" -1.10133854138933 -1.77019107516088 -1.7623840822903 0.404249332913166 1.42491481379874 1.42016301836686 1.35747792898968 0.0271086047720868
#"ENST00000000442" 1.1367571728999 -2.58192112517174 0.0240875688746112 0.482589702447365 0.969885993274519 0.643070451106668 -0.177572605307724 -0.496897158123603
np.seterr(all='raise')
print "\nProcessing input transcripts..."
def process_file(fname, id, gene_to_txs, tx_to_fit):
processed = 0
rejected_pe = 0
rejected_zero = 0
with open(fname, "r") as infile:
#inspect the first line
if fname.endswith("csv"):
line = [i.strip("\"") for i in infile.readline().split(",")]
else:
line = [i.strip("\"") for i in infile.readline().split()]
if (is_number(line[2])): # no header
found_header = False
infile.seek(0) #rewind
print "\tNo header found."
else:
found_header = True
header = line
print "\tHeader found on line one."
if ( (len(line) - 1) % args.nrep):
print "\tWARNING: %d observations found; not divisible by %d replicates." % (len(line) - 1, args.nrep)
print "\tProcessing %d points (%d replicates)" % ( (len(line) - 1)/args.nrep, args.nrep)
for line in infile:
if (not line.strip()):
continue
if fname.endswith("csv"):
line = [i.strip("\"") for i in line.split(",")]
else:
line = [i.strip("\"") for i in line.split()]
# append all transcripts for a particular gene to a dictionary of lists
if (tx_to_gene[line[0]]):
# convert all to floats before continuing...
line = [line[0]] + [float(i) for i in line[1:]]
txid = line[0]
geneid = tx_to_gene[txid]
obs = line[1:] # the observations
nobs = len(obs) / args.nrep
# 1 read on a 1kb transcript out of 10 million -> FPKM = 1e-10. rounding to this precision.
obs = np.around(obs, 10)
if (len(obs) % args.nrep):
print "\tWarning: transcript %s has %d observations which is not divisible by %d replicates" % (txid, len(obs), args.nrep)
# collapse replicates into means, preserve stdev
means = [np.mean(i) for i in chunkwise(obs, args.nrep)]
stdevs = [np.std(i) for i in chunkwise(obs, args.nrep)]
# average percent error
mean_pe = np.mean([stdevs[i]/means[i] for i in range(nobs) if means[i] > 1e-6])
# if more than half are zero; reject
if ( sum( [np.abs(i) < 0.01 for i in means] ) > nobs/2):
#print "Disqualifying %s: more than half the values are zero (%s)" % (txid, means)
#third_order_poly_fit_plot(range(1,nobs+1), means, txid + "_stdevreject.png", stdevs)
rejected_zero += 1
# if this tx wasn't disqualified, add it to the mix
else:
gene_to_txs[geneid].append( ([txid] + list(obs), [txid] + means, [txid] + stdevs, id) )
# save the fit for later.
#tx_to_fit[txid] = (popt, pcov, infodict, mesg, ier, xvals, means, xnew, ynew)
processed += 1
if (not processed % 5000):
print "\tProcessed %d entries..." % processed
# plot the transcript..
# nasty list comprehensions to make a list of nrep repetitions of each obs to generate the x coordinate list
#xvals = [ k for j in [[i] * args.nrep for i in range(1,nobs+1)] for k in j]
#third_order_poly_fit_plot(xvals, obs, txid + "_poly.png", None)
#third_order_poly_fit_plot(range(1,nobs+1), means, txid + "_poly_avg.png", stdevs)
else:
print "\tWarning: gene ID not found for transcript ID %s; skipping." % line[0]
#print line
#total = float(processed + rejected_pe + rejected_zero )
print "\tRead %d transcript distributions from %s; rejected %d (%.1f %%) for percent error; %d (%.1f %%) for half zero (total %.1f %%)" \
% (processed, fname, rejected_pe, rejected_pe/float(processed)*100, rejected_zero, rejected_zero/float(processed)*100, (rejected_pe + rejected_zero)/float(processed)*100)
return processed - rejected_pe - rejected_zero
#return (gene_to_txs, tx_to_fit)
# workhorse data structures for the program - holds all transcript distributions as values for the gene (key)
gene_to_txs_set1 = defaultdict(list)
tx_to_fit_set1 = defaultdict(tuple)
gene_to_txs_set2 = defaultdict(list)
tx_to_fit_set2 = defaultdict(tuple)
#gene_to_txs_mean = defaultdict(list)
#gene_to_txs_stdev = defaultdict(list)
# global indices into the dict of lists
IDXDIST = 0
IDXMEAN = 1
IDXSTDEV = 2
IDXCLUSTID = 3
n_in_set1 = 0
for fname,id in pairwise(args.set1):
print "\nReading from set1 file %s [ID %s]" % (fname, id)
n_in_set1 += process_file(fname, id, gene_to_txs_set1, tx_to_fit_set1)
print "Read %d total transcripts for set1." % n_in_set1
n_in_set2 = 0
for fname,id in pairwise(args.set2):
print "\nReading from set2 file %s [ID %s]" % (fname, id)
n_in_set2 += process_file(fname, id, gene_to_txs_set2, tx_to_fit_set2)
print "Read %d total transcripts for set2." % n_in_set2
print "%d genes in set1, %d genes in set2" % (len(gene_to_txs_set1), len(gene_to_txs_set2))
#print "---- gene_to_txs_set1 ----"
#print gene_to_txs_set1
#print tx_to_fit_set1
# read in transcriptome parameters:
txome_props = {}
#with open("/Volumes/11tb/seq/Homo_sapiens/Ensembl/GRCh37/Annotation/snf_transcriptome_annotations/grch37_cds_deltag_cds.csv", "r") as infile:
# reader = csv.reader(infile)
# mydict = {rows[0]:rows[1] for rows in reader}
def row_to_float(row):
return [float(i.strip("\";")) if is_number(i.strip("\";")) else i.strip("\";") for i in row[1:]]
#if is_number(row[1]):
# return np.array(row[1:], dtype=np.float32)
#else:
# return row[1:]
if (args.txome_props):
if "*" in args.txome_props:
files = glob.glob(args.txome_props)
else:
files = args.txome_props
for file in files:
print "Reading transcriptome properties from %s" % file
with open(file, "r") as infile:
reader = csv.reader(infile)
header = reader.next()
prop_name = os.path.splitext(os.path.basename(file))[0]
if (args.strip_id):
txome_props[prop_name] = {"_".join(rows[0].split("_")[:-1]).strip("\";"):row_to_float(rows) for rows in reader}
else:
txome_props[prop_name] = {rows[0].strip("\";"):row_to_float(rows) for rows in reader}
#txome_props[prop_name] = {rows[0].split('_')[0]:row_to_float(rows) for rows in reader}
txome_props[prop_name]["header"] = header[1:]
print "\tRead %d entries." % len(txome_props[prop_name].keys())
# generate two randomized, size-matched sets as a control, if requested
gene_to_txs_rand1 = defaultdict(list)
gene_to_txs_rand2 = defaultdict(list)
n_in_rand1, n_in_rand2 = 0,0
if (args.control):
set1_txs = gene_to_txs_set1.viewvalues()
set2_txs = gene_to_txs_set2.viewvalues()
# values are list of tuples of (dist, mean, stdev, clusterID) where each list item is a transcript.
# indices are IDXDIST - distribution, IDXMEAN - means, IDXSTDEV - stdevs, IDXCLUSTID - cluster identifier
# collapse lists and combine...
all_txs = [k for j in list(set1_txs) for k in j] + [k for j in list(set2_txs) for k in j]
print "\nGenerating random transcripts from list of %d (set1 has %d, set2 has %d)" % (len(all_txs), n_in_set1, n_in_set2)
random.seed(1)
all_txs = random.sample(all_txs, len(all_txs))
for tx in all_txs[:n_in_set1]:
txid = tx[IDXDIST][0]
gene_to_txs_rand1[tx_to_gene[txid]].append(tx)
n_in_rand1 += 1
for tx in all_txs[n_in_set1:]:
txid = tx[IDXDIST][0]
gene_to_txs_rand2[tx_to_gene[txid]].append(tx)
n_in_rand2 += 1
#print "set1: %d rand1: %d set2: %d rand2: %d" % (n_in_set1, n_in_rand1, n_in_set2, n_in_rand2)
# now, for all genes that have transcripts in set1 and set2, report things.
print "\nComparing transcript parameters."
prop_list_set1 = defaultdict(list)
prop_list_set2 = defaultdict(list)
tx_list_set1 = defaultdict(list)
tx_list_set2 = defaultdict(list)
comparisons, length_different = 0,0
length_ratios = []
genes_in_both_sets, txs_compared_in_set1, txs_compared_in_set2 = 0,0,0
for gene in gene_to_txs_set1.iterkeys():
# this data structure gene_to_txs is now for each gene (gene), a list (txs) of tuples of (dist, mean, stdev, clusterID) where each entry in the list is one transcript.
# possible that genes will have multiple transcripts in both set1 and set2 - need to do all-by-all comparisons when reporting things
# indices into the tuples are IDXDIST - distribution, IDXMEAN - means, IDXSTDEV - stdevs, IDXCLUSTID - cluster identifier
if gene in gene_to_txs_set2:
genes_in_both_sets += 1
# catalog properties for the two sets
for val1 in gene_to_txs_set1[gene]:
txs_compared_in_set1 += 1
tx_i = val1[IDXDIST][0]
for propfile, propdict in txome_props.iteritems():
for k in range(len(propdict["header"])):
propname = propfile + "_" + propdict["header"][k]
if tx_i in propdict:
prop_i = propdict[tx_i][k]
prop_list_set1[propname].append(prop_i)
# current construction of this does not handle missing values
tx_list_set1[tx_i].append(str(prop_i))
else:
tx_list_set1[tx_i].append(str(-1))
#else:
# print "Warning: transcript %s not found for property %s" % (tx_i, propfile)
for val2 in gene_to_txs_set2[gene]:
txs_compared_in_set2 += 1
tx_j = val2[IDXDIST][0]
for propfile, propdict in txome_props.iteritems():
for k in range(len(propdict["header"])):
propname = propfile + "_" + propdict["header"][k]
if tx_j in propdict:
prop_j = propdict[tx_j][k]
prop_list_set2[propname].append(prop_j)
tx_list_set2[tx_j].append(str(prop_j))
else:
tx_list_set2[tx_j].append(str(-1))
#else:
# print "Warning: transcript %s not found for property %s" % (tx_j, propfile)
# all by all length comparison...
# this is hackish and assumes certain input property filenames AND that set1 is high polys and set2 is low polys
for val1 in gene_to_txs_set1[gene]:
for val2 in gene_to_txs_set2[gene]:
tx1 = val1[IDXDIST][0]
tx2 = val2[IDXDIST][0]
comparisons += 1
# propdict = txome_props["grch37_cds_length"]
# if tx1 in propdict and tx2 in propdict:
# len1 = propdict[tx1][0]
# len2 = propdict[tx2][0]
# length_ratios.append(len2/float(len1))
# if (len2 > len1):
# #print "For gene %s tx1 %s length (%d) is < tx2 %s length (%d)!" % (gene, tx1, len1, tx2, len2)
# length_different += 1
#print "Compared %d transcripts, %d have longer set2 length than set1 length (%3.2f %%)" % (comparisons, length_different, length_different/float(comparisons)*100)
print "Compared %d genes" % genes_in_both_sets
print "Compared %d transcripts overall, %d from set1 and %d from set2" % (comparisons, txs_compared_in_set1, txs_compared_in_set2)
# plot histogram of the length ratios
# fig, ax = plt.subplots()
# n, bins, patches = plt.hist(length_ratios, bins=np.logspace(-1,1,100), range=(0.1,10), normed=1, facecolor='green', alpha=0.75)
# ax.set_xscale("log")
# plt.xlabel("len_set2/len_set1")
# plt.savefig("length_ratio_hist.png")
# plt.close(fig)
# Generate random controls -------
prop_list_rand1 = defaultdict(list)
prop_list_rand2 = defaultdict(list)
comparisons, length_different = 0,0
if (args.control):
tested = 0
for gene in gene_to_txs_rand1.iterkeys():
# this data structure gene_to_txs is now for each gene (gene), a list (txs) of tuples of (dist, mean, stdev, clusterID) where each entry in the list is one transcript.
# possible that genes will have multiple transcripts in both rand1 and rand2 - need to do all-by-all comparisons when reporting things
# indices into the tuples are IDXDIST - distribution, IDXMEAN - means, IDXSTDEV - stdevs, IDXCLUSTID - cluster identifier
if gene in gene_to_txs_rand2:
# catalog properties for the two rands
for val1 in gene_to_txs_rand1[gene]:
tx_i = val1[IDXDIST][0]
for propfile, propdict in txome_props.iteritems():
for k in range(len(propdict["header"])):
propname = propfile + "_" + propdict["header"][k]
if tx_i in propdict:
prop_i = propdict[tx_i][k]
prop_list_rand1[propname].append(prop_i)
#else:
# print "Warning: transcript %s not found for property %s" % (tx_i, propfile)
for val2 in gene_to_txs_rand2[gene]:
tx_j = val2[IDXDIST][0]
for propfile, propdict in txome_props.iteritems():
for k in range(len(propdict["header"])):
propname = propfile + "_" + propdict["header"][k]
if tx_j in propdict:
prop_j = propdict[tx_j][k]
prop_list_rand2[propname].append(prop_j)
#else:
# print "Warning: transcript %s not found for property %s" % (tx_j, propfile)
for val1 in gene_to_txs_set1[gene]:
for val2 in gene_to_txs_set2[gene]:
tx1 = val1[IDXDIST][0]
tx2 = val2[IDXDIST][0]
comparisons += 1
# propdict = txome_props["grch37_cds_length"]
# if tx1 in propdict and tx2 in propdict:
# len1 = propdict[tx1][0]
# len2 = propdict[tx2][0]
# length_ratios.append(len2/float(len1))
# if (len2 > len1):
# #print "For gene %s tx1 %s length (%d) is < tx2 %s length (%d)!" % (gene, tx1, len1, tx2, len2)
# length_different += 1
#print "Compared %d randomly assigned transcripts, %d have longer set2 length than set1 length (%3.2f %%)" % (comparisons, length_different, length_different/float(comparisons)*100)
print "Compared %d randomly assigned transcripts" % (comparisons)
# now make eCDF plots...
def mkline(tuple):
if False in [is_number(i) for i in tuple]:
return ",".join(["%s" % i if i is not None else "" for i in tuple])
elif str in [type(i) for i in tuple]:
print "STR DETECT %s " % str(tuple)
else:
return ",".join(["%.6f" % i if i is not None else "" for i in tuple])
for propname in prop_list_set1.iterkeys():
print "Plotting property %s" % propname
ivals = prop_list_set1[propname]
jvals = prop_list_set2[propname]
propfile = safe_open_file("%s_ecdf.csv" % propname)
if args.control:
rivals = prop_list_rand1[propname]
rjvals = prop_list_rand2[propname]
#if (len(ivals) != len(jvals)):
# print "Warning: property %s i and j lists are unequal length." % propname
if False in [is_number(i) for i in ivals] or False in [is_number(j) for j in jvals]:
print "Skipping property %s due to non-numeric values" % propname
if args.control:
propfile.write("set1,set2,rand1,rand2\n")
propfile.write("\n".join([mkline(i) for i in izip_longest(ivals, jvals, rivals, rjvals)]))
propfile.close()
else:
propfile.write("set1,set2\n")
propfile.write("\n".join([mkline(i) for i in izip_longest(ivals, jvals)]))
propfile.close()
continue
# calculate Mann Whitney U p-value
mw_test = mannwhitneyu(ivals, jvals)
# calculate the KS statistics
ks_test = ks_2samp(ivals, jvals)
# plot eCDFs
fig, ax = plt.subplots()
i_sorted=np.sort( ivals )
i_yvals=np.arange(len(i_sorted))/float(len(i_sorted))
plt.plot( i_sorted, i_yvals )
if args.log:
ax.set_xscale("log")
j_sorted=np.sort( jvals )
j_yvals=np.arange(len(j_sorted))/float(len(j_sorted))
plt.plot( j_sorted, j_yvals )
plt.legend([propname + "_set1", propname + "_set2"], fontsize=8)
plt.text(.7, 0.15, "K-S, p = %4.2f, %.3g" % (ks_test[0], ks_test[1]), fontsize=8, transform=ax.transAxes)
plt.text(.7, 0.1, "M-W U, p = %4.2f, %.3g" % (mw_test[0], mw_test[1]), fontsize=8, transform=ax.transAxes)
plt.text(.7, 0.05, "set1: %s; set2: %s" % (", ".join(args.set1[1::2]), ", ".join(args.set2[1::2])), fontsize=8, transform=ax.transAxes)
plt.text(.7, 0.02, "n_1: %d; n_2: %d" % (len(ivals), len(jvals)), fontsize=8, transform=ax.transAxes)
plt.savefig("%s_ecdf.png" % propname)
plt.close(fig)
if (args.control):
fig, ax = plt.subplots()
mw_test_r = mannwhitneyu(rivals, rjvals)
ri_sorted=np.sort( rivals )
ri_yvals=np.arange(len(ri_sorted))/float(len(ri_sorted))
plt.plot( ri_sorted, ri_yvals )
rj_sorted=np.sort( rjvals )
rj_yvals=np.arange(len(rj_sorted))/float(len(rj_sorted))
plt.plot( rj_sorted, rj_yvals )
plt.legend([propname + "_rand1", propname + "_rand2"], fontsize=8)
plt.text(.7, 0.1, "M-W U p_ctrl = %.3g" % (mw_test_r[1]), fontsize=8, transform=ax.transAxes)
propfile.write("set1,set2,rand1,rand2,ecdf_1_x,ecdf_1_y,ecdf_2_x,ecdf_2_y,ecdf_r1_x,ecdf_r1_y,ecdf_r2_x,ecdf_r2_y\n")
propfile.write("\n".join([mkline(i) for i in izip_longest(ivals, jvals, rivals, rjvals, i_sorted, i_yvals, j_sorted, j_yvals, ri_sorted, ri_yvals, rj_sorted, rj_yvals)]))
plt.savefig("%s_ctrl_ecdf.png" % propname)
plt.close(fig)
else:
propfile.write("set1,set2,ecdf_1_x,ecdf_1_y,ecdf_2_x,ecdf_2_y\n")
propfile.write("\n".join([mkline(i) for i in izip_longest(ivals, jvals, i_sorted, i_yvals, j_sorted, j_yvals)]))
propfile.close()
# --- output tx-specific values for each set ---
set1_outfile = safe_open_file("_".join(args.set1[1::2]) + ".csv")
set2_outfile = safe_open_file("_".join(args.set2[1::2]) + ".csv")
header = ["txid"]
for propfile, propdict in txome_props.iteritems():
for k in range(len(propdict["header"])):
propname = propfile + "_" + propdict["header"][k]
header.append(propname)
set1_outfile.write(",".join(header) + "\n")
set1_outfile.write("\n".join([",".join([txid] + [tx_to_gene[txid]] + tx_list_set1[txid]) for txid in tx_list_set1.iterkeys()]))
set2_outfile.write(",".join(header) + "\n")
set2_outfile.write("\n".join([",".join([txid] + [tx_to_gene[txid]] + tx_list_set2[txid]) for txid in tx_list_set2.iterkeys()]))