https://github.com/QBioLab/sequence-data-analysis-for-noise-control
Raw File
Tip revision: c20430824c5a603eec695ee167e32e4cd1b6c024 authored by Aria on 20 December 2020, 07:48:44 UTC
Update README.md
Tip revision: c204308
hg19_insert.py
# -*- coding: utf-8 -*-
"""
Created on Mon Dec  2 22:47:47 2019

@author: chena
"""

from pyfaidx import Fasta
import pandas as pd

insert = Fasta('../rawdata/reference/inserts1.fa')
hg19 = Fasta('../../../Reference/hg19.fa')
insert_sites = pd.read_csv('../rawdata/reference/insert_sites.csv')
insert_sites = insert_sites.sort_values(by=['chromosome','start'])
output = '../metadata/reference/hg19_insert.fa'
tss_output= '../metadata/reference/tss_sites.txt'
new_site_output = '../metadata/reference/new_sites.txt'
tss_site = []
new_site = []
with open(output,'w') as fout:
    for i in hg19:
        name = i.name
        seq = str(i).upper()
        potential_insert = insert_sites[insert_sites['chromosome'] == name]
        potential_insert_len = len(potential_insert)
        if potential_insert_len > 0:
            bias = 0
            for j in range(potential_insert_len):
                d = potential_insert.iloc[j,1]
                insert_name = potential_insert.iloc[j,0]
                if d == 'forward':
                    insert_seq = insert[potential_insert.iloc[j,0]][:].seq
                    if insert_name == 'CEG':
                        tss = 2472
                    elif insert_name == 'mRN':
                        tss = 767
                elif d == 'reverse':
                    insert_seq = insert[potential_insert.iloc[j,0]][:].complement.reverse.seq
                    if insert_name == 'CEG':
                        tss = 4928
                    elif insert_name == 'mRN':
                        tss = 2597
                start = potential_insert.iloc[j,3] + bias - 1
                end = potential_insert.iloc[j,4] + bias - 1
                tss_site.append('{}-{}-{}-{}'.format(insert_name, d, name, 
                                start+tss))
                new_site.append('{},{},{},{},{}'.format(insert_name, d, name, 
                                start+1,end+1+len(insert_seq)))
                seq = '{}{}{}'.format(seq[:start+1],insert_seq,seq[end:])
                bias += len(insert_seq) - 4
        fout.write('>{}\n'.format(name))
        fout.write('{}\n'.format(seq))
with open(tss_output,'w') as fout:
    for s in tss_site:
        fout.write('{}\n'.format(s))
with open(new_site_output,'w') as fout:
    for s in new_site:
        fout.write('{}\n'.format(s))
        
        
back to top