https://github.com/alyssafrazee/polyester
Raw File
Tip revision: 1f8e6da9b897d0722dbff4e80d9fb4a8f0f61ea9 authored by Alyssa Frazee on 14 September 2021, 17:30:49 UTC
fix warnings
Tip revision: 1f8e6da
build_error_models.py
#!/usr/bin/python

### code to extract specific error calculations from a GemErr error model
### http://www.biomedcentral.com/1471-2164/13/74

import gzip
import cPickle
import argparse
import numpy as np

# parse file paths:
parser = argparse.ArgumentParser(description='get file paths from user')
parser.add_argument('model_path', type=str, help='directory containing GemSIM sequencing models')
parser.add_argument('outfolder', type=str, help='directory for output')
parser.add_argument('--prefix', type=str, help='prefix (custom error models only)')
parser.add_argument('--paired', action='store_true', help='is the error model paired-end? (custom error models only)')
args = parser.parse_args()
model_path = args.model_path
outfolder = args.outfolder
prefix = args.prefix
paired = args.paired

def get_matrices(gzipFile, paired):
    f = gzip.open(gzipFile)
    if paired:
        readLen = cPickle.load(f)
        mx1 = cPickle.load(f)
        mx2 = cPickle.load(f)
        f.close()
        return mx1, mx2
    else:
        readLen = cPickle.load(f)
        mx = cPickle.load(f)
        f.close()
        return mx

def get_model(m):
    """m is a 7-D matrix generated by GemErr.py"""
    for dim in [2,3,4,5]:
        m = np.add.reduce(m, 2) 
        # we will not address preceding / following bases
    
    inds={'A':0, 'T':1, 'G':2, 'C':3, 'N':4}
    ret = np.zeros((101,5,5))
    for pos in xrange(len(ret)):
        for refBase in ['A', 'T', 'G', 'C', 'N']:
            for readBase in ['A', 'T', 'G', 'C', 'N']:
                refi = inds[refBase]
                readi = inds[readBase]
                if m[pos][refi][5] == 0:
                    # there were no reference bases with this nucleotide at this position
                    # so we will just sequence this base as is and not worry about errors.
                    ret[pos][refi][readi] = 1 if refi==readi else 0
                elif refi == readi:
                    num_errors = sum(m[pos][refi][j] for j in range(5) if j != refi)
                    num_correct = m[pos][refi][5] - num_errors
                    correct_prob = 1 - (float(num_errors) / num_correct)
                    ret[pos][refi][readi] = correct_prob
                else:
                    # nucleotide-specific error probability
                    transition_prob = float(m[pos][refi][readi]) / m[pos][refi][5]
                    ret[pos][refi][readi] = transition_prob

    return ret

def save_model(array, outf):
    """write text file to disk containing this error model specification
    to be read into R later"""
    inds={'A':0, 'T':1, 'G':2, 'C':3, 'N':4}
    with open(outf, 'w') as f:
        f.write('errmodel\treadA\treadT\treadG\treadC\treadN\tpos\n')
        for pos in xrange(len(array)):
            for refBase in ['A', 'T', 'G', 'C', 'N']:
                f.write('ref'+refBase+'\t')
                for readBase in ['A', 'T', 'G', 'C', 'N']:
                    f.write(str(array[pos][inds[refBase]][inds[readBase]])+'\t')
                    if readBase == 'N':
                        f.write(str(int(pos))+'\n')

def release_models(modelpath, modelname, outfolder, paired):
    if paired:
        mx1, mx2 = get_matrices(modelpath+'/'+modelname+'_p.gzip', True)
        mate1 = get_model(mx1)
        mate2 = get_model(mx2)
        save_model(mate1, outfolder+'/'+modelname+'_mate1')
        save_model(mate2, outfolder+'/'+modelname+'_mate2')
    else:
        mx = get_matrices(modelpath+'/'+modelname+'_s.gzip', False)
        model = get_model(mx)
        save_model(model, outfolder+'/'+modelname+'_single')

### create error model files for release
if prefix is not None:
    # custom error model:
    release_models(model_path, prefix, outfolder, paired)
else:
    for model in ['ill100v4', 'ill100v5', 'r454ti']:
        release_models(model_path, model, outfolder, False)
        if model != 'r454ti':
            release_models(model_path, model, outfolder, True)

back to top