https://github.com/DyogenIBENS/Regulus
Raw File
Tip revision: 620b22689654c6a3d40b0548884892f16fb2c3e1 authored by Alexandra LOUIS on 30 March 2021, 11:29:49 UTC
-update to fix a typo in genes_env.py and several fatal bugs in score_genes_targets.py
Tip revision: 620b226
score_genes_targets.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# 25/02/15
# score_genes_targets.py version 1.0, part of the Regulus suite
# python 2.7
# Copyright © 2015 IBENS/Dyogen : Magali Naville and Hugues ROEST CROLLIUS
# mail : hrc@ens.fr or magali.naville@ens-lyon.fr
# Related Publication: Naville et al. (2015) Nature Communications
# This is free software, you may copy, modify and/or distribute this work under the terms of the GNU General Public License, version 3 (GPL v3) or later and the CeCiLL v2 license in France


# Calculate the association score of each gene found in the environment of a family of orthologous CNEs.

### Usage ###
# score_genes_targets.py genes_env_chr*.list.gz

import gzip
import sys
import bz2

###############
### Functions ###
###############

## Function that sorts vectors according to the values of column 1 and then column 3 (distance CNE-gene) ##
def cmpval(x,y):
    if float(x[1])>float(y[1]):
        return -1
    elif float(x[1])==float(y[1]):
        if float(x[3])<float(y[3]):
	    return -1
	elif float(x[3])==float(y[3]):
	    return 0
	else:
	    return 1
    else:
        return 1

###########
### Data ###
###########

## Weighting factors depending of the degree of rearrangement of each species genome compared to the human one. ##
file = open('species_synteny.list','r')
synt = file.readlines()
file.close()

SY = {} # taux de syntenie
COUV = {} # couverture 
for s in range(1,len(synt)) :
	ls = synt[s].split('\t')
	SY[ls[11].strip()] = ls[5].replace(',','.')
	COUV[ls[11].strip()] = ls[7].replace(',','.')

## Names of genes in Human corresponding to ancestral genes ##
NEW = {}
file = bz2.BZ2File('ancGenes.Boreoeutheria.list.bz2','r')
new = file.readlines()
file.close()

for i in new :
	li = str(i).split()
	NEW[li[0]] = []
	for k in range(1,len(li)):
		if li[k][0:5] == 'ENSG0' :
			NEW[li[0]].append(li[k])

## Current gene names ##
COUR = {} 
file = bz2.BZ2File('names.Homo.sapiens.list.bz2','r')
I = file.readlines()
file.close()

for i in I :
	li = i.strip().split()
	COUR[li[0]] = li[1]

## Genes found on the considered chromosome ##
X = []
file = bz2.BZ2File('genesST.Homo.sapiens.list.bz2','r')
genes = file.readlines()
file.close()

for i in genes :
	li = i.split()
	if li[0] == sys.argv[1].split('chr')[1].split('.')[0] :
		X.append(li[4])
print sys.argv[1].split('chr')[1].split('.')[0]

###########
### Main ###
###########

file = gzip.open(sys.argv[1],'r') 
cne = file.readlines()
file.close()

CNE = {} # CNE dictionary 
KEYS = [] # CNE identifiers
SIZE = {} # dictionary of the number of species in which each CNE is present
SCORE = {} # Dictionary CNE/genes/[scores0 scores1 scores2 dist_moy_scores1 Nbre_1_outgroups]
test_tar = 0 # Test to check that at least one CNE has a putative target
LIMIT = 1000000

for c in cne :
  lc = c.split('\t')
  #print lc
  if len(lc)>1 : # pour eviter les '\n'
    if lc[1] != 'choHof1' and lc[1] != 'eriEur1' and lc[1] != 'macEug1' and lc[1] != 'sorAra1' and lc[1] != 'tarSyr1' and lc[1] != 'hg19' and lc[1] != 'petMar1' : # Species of too low genome coverage, not taken into account + human (common denominator)
	if lc[0] not in CNE : # CNE not referenced in the dictionary yet
		CNE[lc[0]] = {}
		SIZE[lc[0]] = lc[2]
		KEYS.append(int(lc[0].split('U')[1]))
	CNE[lc[0]][lc[1]] = lc[3:len(lc)-1] # CNE[SCNE#][esp]

KEYS.sort()

for n in  KEYS : # for each CNE
	c = 'U'+str(n)
	SCORE[c] = {} # dictonary of each gene score for a given CNE
	for i in CNE[c] : # pour chaque espece i
		for j in CNE[c][i] :
			lj = j.split('-')
			if lj[1] == "0" :
				score = 0
				dist = 0
			else :
				test_tar = 1
				score = 0
				dist = LIMIT
				for k in range(1,len(lj)) :
					lk = lj[k].split('_')
					if lk[1] == '1' :
						score = 1
						if int(lk[2]) < dist :
							dist = lk[2]
					if lk[1] == '2' and (score == 0 or score == 3) :
						score = 2
					if lk[1] == '3' and score == 0 :
						score = 3
			if SCORE[c].__contains__(lj[0]) == 0 :
				SCORE[c][lj[0]] = [0,0,0,0,0,0] 
			if score == 1 :
				SCORE[c][lj[0]][1] += float(SY[i])
				SCORE[c][lj[0]][3] += int(dist)
				SCORE[c][lj[0]][4] += 1
			elif score == 2 :
				SCORE[c][lj[0]][2] += 1/float(SY[i])
			else : # score de 0 ou de 3
				SCORE[c][lj[0]][0] += float(COUV[i])/float(SY[i])

LIGNES = []

if test_tar == 1 :

	file = gzip.open('score_genes_'+sys.argv[1].split('env_')[1].split('.')[0]+'.list.gz','w')

	for n in  KEYS :
		c = 'U'+str(n)
		file.write(c+'\t'+str(SIZE[c])+'\t')
		ligne = c+'\t'+str(SIZE[c])+'\t'
		UN = [] # genes presenting a number of "1" scores different from 0
		for i in SCORE[c] :
			if SCORE[c][i][1] != 0 :
				UN.append([i,float(SCORE[c][i][1])-float(SCORE[c][i][2])-float(SCORE[c][i][0]),SCORE[c][i][1],SCORE[c][i][2],SCORE[c][i][0],float(SCORE[c][i][3])/float(SCORE[c][i][4]),SIZE[c]])

		UN.sort(cmpval)
		for u in UN :
			file.write(u[0]+' %.2f %.0f ' %(float(u[1]), float(u[5]))) # absolute score (1-2-3-0) mean distance
			ligne = ligne + u[0]+' %.2f %.0f ' %(float(u[1]), float(u[5]))
			names = ''
			for j in NEW[u[0]] :
				if X.__contains__(j) :
					if j in COUR:
						names = names + COUR[j] + '/'
					else:
						names = names + j + '/'
			
			file.write(names[0:len(names)-1]+' ')
			ligne = ligne + names[0:len(names)-1]+' '
		file.write('\n')
		ligne = ligne + '\n'
		
		LIGNES.append(ligne)
	file.close()

	file = gzip.open('targets_'+sys.argv[1].split('env_')[1].split('.')[0]+'.list.gz','w')
	
	for ligne in LIGNES :
		ls = ligne.split()
		#print ls
		gen = []
		GEN = {}
		if len(ls) > 2 :
			i = 2
			while i < len(ls) :
				gen.append(ls[i])
				GEN[ls[i]] = [ls[i+1],ls[i+2],ls[i+3]] # absolute score, mean distance, human gene name
				i += 4
			tar = []
			n = 0

			while GEN[gen[n]][0] == GEN[gen[0]][0] :
				tar.append(gen[n])
				if n < len(gen)-1 :
					n += 1
				else :
					break
			
			file.write(ls[0]+'\t'+ls[1]+'\t')
			if len(tar) == len(gen) :
				file.write('-\t')
			else :
				file.write(str(float(GEN[gen[n-1]][0])-float(GEN[gen[n]][0]))+'\t')
			file.write(GEN[gen[0]][0]+'\t')
			for t in tar :
				file.write(GEN[t][2]+' ')
			file.write('\n')
		# to also indicate elements with no target
		else :
			file.write(ls[0]+'\t'+ls[1]+'\t-\t-\t-\n')
		
	file.close()
back to top