import gzip if __name__ == "__main__": array_file = open("../mammalianChip_36K_probes.tsv", "r") scored_probes_file = gzip.open("../allProbes_scored_GW.tsv.gz", "rt") output_file = gzip.open("../36K_probes_SNV_species.tsv.gz", "wt") # Construct dictionary of probes submitted to Bret inf_type_dict = {} forward_type_dict = {} line_dict = {} chrom_dict = {} pos_dict = {} for line in array_file: if line[:2] == "cg": split_line = line.strip().split("\t") probe_ID = split_line[0].split("_")[0].split("-")[0] inf_type = split_line[31] forward_type = split_line[0].split("_")[-1] chrom_dict[probe_ID] = split_line[4] pos_dict[probe_ID] = split_line[5] if probe_ID in line_dict: print("WTF") print(line_dict[probe_ID]) print(line) inf_type_dict[probe_ID] = inf_type forward_type_dict[probe_ID] = forward_type line_dict[probe_ID] = split_line else: header_line = line.strip().split("\t") output_file.write("\t".join(header_line[:31]) + "\ttarget_SNV1\ttarget_SNV2\ttarget_SNV3\t" + "\t".join(header_line[31:]) + "\n") print("Total number of probes on array: ", len(line_dict.keys())) # Go through list of all probes and add SNP info probes_done_dict = {} probes_done = 0 for line in scored_probes_file: split_line = line.strip().split("\t") probe_ID = split_line[0] forward_type = split_line[15] is_converted = (split_line[17] == "C") chrom = split_line[3] pos = split_line[4] if (probe_ID in line_dict) and (forward_type_dict[probe_ID] == forward_type) and (is_converted) and \ chrom_dict[probe_ID] == chrom and pos_dict[probe_ID] == pos: probes_done += 1 if probe_ID in probes_done_dict: print("WTF") print(probe_ID) print(line_dict[probe_ID]) print(line) exit(1) probes_done_dict[probe_ID] = 1 if probes_done % 100 == 0: print("Probes done so far: ", probes_done) inf_type = inf_type_dict[probe_ID] output_file.write("\t".join(line_dict[probe_ID][:31]) + "\t") if inf_type == "Inf1": output_file.write(split_line[30] + "\t" + split_line[31] + "\t" + split_line[32] + "\t") else: output_file.write(split_line[40] + "\t" + split_line[41] + "\t" + split_line[42] + "\t") output_file.write("\t".join(line_dict[probe_ID][31:]) + "\n") probes_not_done = open("../probes_not_done.tsv", "w") for probe_ID in line_dict: if probe_ID not in probes_done_dict: probes_not_done.write("\t".join(line_dict[probe_ID]) + "\n") output_file.close()