Revision d7509c6f3e0a0f34db71b485a9e332223084e7be authored by naomipetela on 18 June 2019, 14:32:57 UTC, committed by GitHub on 18 June 2019, 14:32:57 UTC
1 parent fe38fec
chr_position.py
import re
from os import listdir
from os.path import isfile, join
# Replace with the path to your pileups
path = './'

# Length, in bases, of every chromosome this script knows how to pad.
# NOTE(review): the values look like a yeast reference assembly -- confirm
# against the genome build the pileups were generated from.
# Built once here instead of once per input file.
chromosome_lengths = {
    'chrI': 230218,
    'chrII': 813184,
    'chrIII': 316620,
    'chrIV': 1531933,
    'chrV': 576874,
    'chrVI': 270161,
    'chrVII': 1090940,
    'chrVIII': 562643,
    'chrIX': 439888,
    'chrX': 745751,
    'chrXI': 666816,
    'chrXII': 1078177,
    'chrXIII': 924431,
    'chrXIV': 784333,
    'chrXV': 1091291,
    'chrXVI': 948066
}


def _read_pileup(pileup_path):
    """Parse one pileup file into ``(chromosome_name, chromosome_library)``.

    Blank lines and lines starting with ``#`` are ignored.  Every remaining
    line is split on tabs; column 2 becomes the dictionary key and column 4
    the value (both kept as strings).  The chromosome name is column 1 of the
    first data line.
    NOTE(review): this assumes each file holds a single chromosome -- later
    lines are not checked against that name.

    Raises:
        ValueError: the file has no data lines, or a data line has fewer
            than four tab-separated fields.
        Exception: the same column-2 key appears on more than one line.
    """
    # ``with`` guarantees the input handle is closed even if parsing fails.
    with open(pileup_path) as handle:
        lines = re.split("\r\n|\r|\n", handle.read())

    # Remove empty lines and lines that start with a hash.
    rows = [line for line in lines if line and line[0] != '#']
    if not rows:
        raise ValueError("No data lines in " + pileup_path)

    chromosome_library = {}
    for row in rows:
        data = row.split('\t')
        # Validate the width before indexing so a short line is reported
        # with its file and content instead of a bare IndexError.
        if len(data) < 4:
            raise ValueError(
                "Malformed pileup line in %s: %r" % (pileup_path, data))
        # Membership test, not truthiness: an empty stored value must still
        # count as "already seen".
        if data[1] in chromosome_library:
            raise Exception("Duplicate key: " + data[1])
        chromosome_library[data[1]] = data[3]

    return rows[0].split('\t')[0], chromosome_library


def _pad_chromosome(chromosome_name, chromosome_library, chromosome_length):
    """Return one output line for every base from 1 to ``chromosome_length``.

    Each line is ``name <TAB> base <TAB> <empty> <TAB> value`` where ``value``
    is the library entry for that base, or ``0`` when the base is absent.
    """
    return [
        '\t'.join([
            chromosome_name,
            str(base),
            '',
            str(chromosome_library.get(str(base), 0))
        ])
        for base in range(1, chromosome_length + 1)
    ]


def _convert_pileup(directory, filename):
    """Write ``<chromosome>.tabular`` next to the given pileup; return its path.

    A chromosome name missing from ``chromosome_lengths`` is treated as
    length 0, which produces an empty output file (behaviour kept from the
    original script).

    Raises:
        Exception: an output file for this chromosome already exists.
    """
    chromosome_name, chromosome_library = _read_pileup(join(directory, filename))

    output = join(directory, chromosome_name + '.tabular')
    # Checked before building the padded lines so a clash fails fast.
    if isfile(output):
        raise Exception("Duplicate chromosome " + chromosome_name)

    chromosome_length = chromosome_lengths.get(chromosome_name, 0)
    padded_chromosome = _pad_chromosome(
        chromosome_name, chromosome_library, chromosome_length)

    # ``with`` actually closes (and flushes) the file; the previous bare
    # ``results.close`` only referenced the method without calling it.
    with open(output, 'w') as results:
        results.write('\n'.join(padded_chromosome))
    return output


files = [f for f in listdir(path) if isfile(join(path, f))]
# Plain suffix test; avoids the invalid "\." escape of a non-raw regex string.
files = [f for f in files if f.endswith('.pileup')]
for f in files:
    _convert_pileup(path, f)
Computing file changes ...