https://github.com/kpalin/EEL
Raw File
Tip revision: c0cd936bedca30564088c8aeb8f43ebe4e05421f authored by Kimmo Palin on 07 November 2012, 13:23:26 UTC
Fixed a variable format clash which sometimes caused core dumps.
Tip revision: c0cd936
multiTest.py


# $Log$
# Revision 1.1  2004/01/23 12:51:31  kpalin
# Added accessory and developement testing tools. SyntenyToEnsembl
# makes a link to www.ensembl.org from our sequence names and
# multiTest.py is a brute force concept implementation of multiple alignment.
#

files=[#"ENSDARG00000006837.ENSG00000134323.align.gff",
       #"ENSDARG00000006837.ENSMUSG00000037169.align.gff",
       #"ENSDARG00000006837.ENSRNOG00000006308.align.gff",
       #"ENSDARG00000006837.SINFRUG00000128278.align.gff",
       "ENSG00000134323.ENSMUSG00000037169.align.gff",
       "ENSG00000134323.ENSRNOG00000006308.align.gff",
       #"ENSG00000134323.SINFRUG00000128278.align.gff",
       "ENSMUSG00000037169.ENSRNOG00000006308.align.gff",
       #"ENSMUSG00000037169.SINFRUG00000128278.align.gff",
       #"ENSRNOG00000006308.SINFRUG00000128278.align.gff"
       ]



files=["hum_hum_1031_937.abs9.chrIbg.align.gff",
       "hum_mus_1031.abs9.chrIbg.align.gff",
       "mus_hum_1031_937.abs9.chrIbg.align.gff"]

files=["hmmyf5.align.opt.gff","hrmyf5.align.opt.gff","mrmyf5.align.opt.gff"]

species={"HUMAN":0,"RAT":1,"MOUSE":2}
#species={"HUMAN|4.119820698-119936581":0,
#         "HUMAN|8.97218432-97331876":1,
#         "MOUSE|13.63720377-63822119":2}
specCount=0





class AlignedSite:
    def __init__(self,sType,sScore,pos):
        self.type=sType
        self.score=sScore
        self.pos=pos
        self.pairs=[]

    def __getitem__(self,key):
        return self.pos[key]


    def copy(self):
        c=AlignedSite(self.type,self.score,self.pos)
        c.pairs=self.pairs[:]
        return c


    def addPair(self,spec1,spec2):
        pairs=[spec1,spec2]
        pairs.sort()
        self.pairs.append(tuple(pairs))

    def __repr__(self):
        s="AlignedSite(%s,%s,%s)"%(repr(self.type),repr(self.score),repr(self.pos))
        return s

    def __str__(self):
        return self.__repr__()

    def unite(self,other):
        assert(self.type==other.type)
        self.pos.update(other.pos)
        self.pairs.extend(other.pairs)



class CisMod:
    def __init__(self,species=[],code=[]):
        self.species=species
        #self.species.sort()
        if type(code) not in (type([]),type(())):
            code=[code]
        self.cisCode=code
        self.sites=[]
        self.modScore=0.0

    def __str__(self):
        s=StringIO()
        s.write("%s (%s)"%(" <=> ".join(self.species),"<=>".join(map(lambda x:"CM %d"%(x),self.cisCode))))
        for site in self.sites:
            s.write("\n%s %d %s"%(" ".join(map(lambda x:str(site[x]),self.species)),len(site.pairs),site.type))

        return s.getvalue()

    def __len__(self):
        return len(self.sites)


    def addSite(self,pos,sType,sScore):
        self.sites.append(AlignedSite(sType,sScore,pos))
        if sScore>self.modScore: self.modScore=sScore

    def compatible(self,other):
        """Return the name of the common species if the alignments are between
        three different species"""
        compValue=0
        sSpec,cSpec,oSpec="","",""
        for tSpec in self.species:
            try:
                cSpec=other.species.index(tSpec)
                oSpec=other.species[1-cSpec]
                cSpec=other.species[cSpec]
                compValue=compValue+1
            except ValueError:
                sSpec=tSpec
        if compValue==1:
            return (sSpec,cSpec,oSpec)
        else:
            return None

    def intersect(self,other):
        specs=self.compatible(other)
        if not specs:
            return None

        sSpec,cSpec,oSpec=specs



        i,j=0,0
        m=0

        newCis=CisMultiMod([sSpec,cSpec,oSpec],self.cisCode+other.cisCode)
        #s="%s <=> %s <=> %s (CM%d<=>CM%d)\n"%(sSpec,cSpec,oSpec,self.cisCode,other.cisCode)
        s=str(newCis)
        while i<len(self.sites) and j<len(other.sites):
            if self.sites[i][cSpec]<other.sites[j][cSpec]:
                i+=1
            elif self.sites[i][cSpec]>other.sites[j][cSpec]:
                j+=1
            elif self.sites[i].type==other.sites[j].type:
                newSite=self.sites[i].copy()
                newSite.unite(other.sites[j])
                newCis.sites.append(newSite)

                s=s+"%d %d %d %s %s\n"%(self.sites[i][sSpec],self.sites[i][cSpec],other.sites[j][oSpec],str(self.sites[i].type),str(newSite.pairs))
                
                m=m+1
                i+=1
                j+=1                
            else:
                i+=1
                j+=1
        if m>1:
            #print s
            print newCis
            #print "Matches/mismatches: %d/%d"%(2*m,len(self.sites)+len(other.sites)-2*m)
            return newCis
        else:
            return None

from cStringIO import StringIO


class CisMultiMod(CisMod):
    def __init__(self,species=[],CMcodes=[]):
        CisMod.__init__(self,species,CMcodes)


    def addPairAlign(self,other):
        assert(type(other)==type(CisMod()))
        
        spec1,spec2=other.species
        newCis=CisMultiMod(self.species[:],self.cisCode+other.cisCode)

        spair=other.species[:]
        spair.sort()
        try:
            i=self.sites[0].pairs.index(tuple(spair))
            return None
        except ValueError:
            #print self.sites[0].pairs,spair
            pass

        i,j=0,0
        m=0

        while i<len(self.sites) and j<len(other.sites):
            if self.sites[i][spec1]<other.sites[j][spec1] or self.sites[i][spec2]<other.sites[j][spec2]:
                i+=1
            elif self.sites[i][spec1]>other.sites[j][spec1] or self.sites[i][spec2]>other.sites[j][spec2]:
                j+=1
            elif self.sites[i].type==other.sites[j].type:
                newSite=self.sites[i].copy()
                newSite.unite(other.sites[j])
                newCis.sites.append(newSite)

                m=m+1
                i+=1
                j+=1                
            else:
                i+=1
                j+=1
        if m>1:
            #print newCis
            #print "Matches/mismatches: %d/%d"%(2*m,len(self.sites)+len(other.sites)-2*m)
            return newCis
        else:
            return None













import re

CMpat=re.compile(r"CM (\d+)")
aligns={}
CisMods={}
nodesInCis={}

cisModules=[]
for file in files:
    handle=open(file)
    print file
    parts=file.split(".")
    spec1str=parts[0].split("|")[0]
    if not species.has_key(spec1str):
        species[spec1str]=specCount
        specCount+=1
##    if not species.has_key(parts[1]):
##        species[parts[1]]=specCount
##        specCount+=1
    handle.readline()
    line=handle.readline()  # First CisModule
    parts=line.split("\t")
    while len(line)>0:
        spec1str=parts[0].split("|")[0]
        spec1=species[spec1str]
        
        CMcode=CMpat.search(parts[-1])
        CMcode=int(CMcode.group(1))
        #print CMcode
        line=handle.readline() #Second CisModule
        parts=line.split("\t") 
        spec2str=parts[0].split("|")[0]
        spec2=species[spec2str]

        spec1,spec2=min(spec1,spec2),max(spec1,spec2)
        CMid=(spec1,spec2,CMcode)


        newCis=CisMod([spec1str,spec2str],CMcode)
        if CMcode<20:
            cisModules.append(newCis)
        #print line
        
        line=handle.readline() #first malign row
        parts=line.split("\t") 

        while len(line)>0 and parts[2]!="CisModule":
            
            node1Id=(species[parts[0].split("|")[0]],parts[2],int(parts[3]))
            node1Line=line

            sPos1=int(parts[3])
            
            line=handle.readline()  #Second malign row
            parts=line.split("\t")

            sPos2=int(parts[3])

            newCis.addSite({spec1str:sPos1,spec2str:sPos2},parts[2],float(parts[5]))
            newCis.sites[-1].addPair(spec1str,spec2str)
                           
            assert(parts[1]=="malign")
            
            line=handle.readline()   #next malign or CisModule
            parts=line.split("\t")
            #print parts[1],



cis3spec=[]
for i in range(len(cisModules)):
    for j in range(i+1,len(cisModules)):
        c3=cisModules[i].intersect(cisModules[j])
        if c3:
            cis3spec.append(c3)

multia=[]
for a in cis3spec:
    for b in cisModules:
        c=a.addPairAlign(b)
        if c:
            multia.append(c)
            print c

#a=cis3spec[0]
#b=cis3spec[16]
#c=a.addPairAlign(cisModules[39])
back to top