Skip to main content
  • Home
  • Development
  • Documentation
  • Donate
  • Operational login
  • Browse the archive

swh logo
SoftwareHeritage
Software
Heritage
Archive
Features
  • Search

  • Downloads

  • Save code now

  • Add forge now

  • Help

Revision f6c855ef4a7ce63f72dba6b34e9d0e9edd9200ce authored by ctboughter on 01 December 2020, 17:23:16 UTC, committed by ctboughter on 01 December 2020, 17:23:16 UTC
Add Amino Acid frequency module to GUI
1 parent 73e84d3
  • Files
  • Changes
  • 490c73a
  • /
  • seq_loader.py
Raw File Download
Permalinks

To reference or cite the objects present in the Software Heritage archive, permalinks based on SoftWare Hash IDentifiers (SWHIDs) must be used.
Select below a type of object currently browsed in order to display its associated SWHID and permalink.

  • revision
  • directory
  • content
revision badge
swh:1:rev:f6c855ef4a7ce63f72dba6b34e9d0e9edd9200ce
directory badge Iframe embedding
swh:1:dir:490c73a3788dcb5fa0f5270b80bac9e12898cccc
content badge Iframe embedding
swh:1:cnt:5c49ef6bdfdbffe00f028227df1a802e2f0a878b
Citations

This interface generates software citations, provided that the root directory of the browsed objects contains a citation.cff or codemeta.json file.
Select below a type of object currently browsed in order to generate citations for them.

  • revision
  • directory
  • content
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
seq_loader.py
# Let's start off by loading in Jeff's CDR3's
import numpy as np
import pandas

def getBunker():
    """Load the mouse IgA CDR table and split it by ``react`` value.

    Reads ``app_data/mouse_IgA.dat`` (whitespace-separated, no header row):
    six CDR sequence columns followed by a ``react`` column.  Rows in which
    any CDR sequence contains an ``X`` are discarded, as are rows with an
    empty CDR entry.

    Returns:
        A 4-tuple ``(mono_all, poly_all, mono, poly)`` of arrays, each of
        shape ``(6, n_rows)`` with one row per CDR column in the order
        L1, L2, L3, H1, H2, H3.  Groups are selected by ``react`` value:
        ``mono_all`` 0-1, ``poly_all`` 2-7, ``mono`` 0, ``poly`` 5-7.
    """
    cdr_columns = ['cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa', 'cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa']
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    table = pandas.read_csv('app_data/mouse_IgA.dat', sep=r'\s+', header=None,
                            names=cdr_columns + ['react'])
    # Missing values become '' so the string operations below never see NaN.
    table = table.where(pandas.notnull(table), '')
    # Remove X's in sequences... Should actually get a count of these at some point...
    for column in cdr_columns:
        table = table[~table[column].str.contains("X")]

    def select(react_values):
        # Rows with the requested react values, minus any row that has an
        # empty CDR entry, returned as a (6, n_rows) array of the CDR columns.
        rows = table[table['react'].isin(react_values)].values
        complete = np.array(
            [not any(cell == '' for cell in row[:6]) for row in rows], dtype=bool)
        return np.transpose(rows[complete][:, 0:6])

    return (select([0.0, 1.0]),
            select([2.0, 3.0, 4.0, 5.0, 6.0, 7.0]),
            select([0.0]),
            select([5.0, 6.0, 7.0]))
#####################################################################################

def getJenna():
    """Load the flu IgG CDR table and split it by ``react`` value.

    Reads ``app_data/flu_IgG.dat`` (whitespace-separated, no header row):
    six CDR sequence columns followed by a ``react`` column.  Rows in which
    any CDR sequence contains an ``X`` are discarded, as are rows with an
    empty CDR entry.

    Returns:
        A 4-tuple ``(mono_all, poly_all, mono, poly)`` of arrays, each of
        shape ``(6, n_rows)`` with one row per CDR column in the order
        L1, L2, L3, H1, H2, H3.  Groups are selected by ``react`` value:
        ``mono_all`` 0-1, ``poly_all`` 2-7, ``mono`` 0, ``poly`` 5-7.
    """
    cdr_columns = ['cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa', 'cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa']
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    table = pandas.read_csv('app_data/flu_IgG.dat', sep=r'\s+', header=None,
                            names=cdr_columns + ['react'])
    # Missing values become '' so the string operations below never see NaN.
    table = table.where(pandas.notnull(table), '')
    # Remove X's in sequences... Should actually get a count of these at some point...
    for column in cdr_columns:
        table = table[~table[column].str.contains("X")]

    # Having this and the above lines as "if" options could make this loader more generalizable...
    def select(react_values):
        # Rows with the requested react values, minus any row that has an
        # empty CDR entry, returned as a (6, n_rows) array of the CDR columns.
        rows = table[table['react'].isin(react_values)].values
        complete = np.array(
            [not any(cell == '' for cell in row[:6]) for row in rows], dtype=bool)
        return np.transpose(rows[complete][:, 0:6])

    return (select([0, 1]),
            select([2, 3, 4, 5, 6, 7]),
            select([0]),
            select([5, 6, 7]))

def getHugo():
    """Load the HIV IgG "gut" tables and split them by ``react`` value.

    Reads three whitespace-separated files from ``app_data/hiv_igg_data``:
    ``gut_light_aa.dat`` and ``gut_heavy_aa.dat`` (first line is the header)
    and ``gut_num_react.dat`` (no header, one ``react`` column).  They are
    joined column-wise in the order light, heavy, react.  Rows with an empty
    entry in any of the first six columns are discarded.  Unlike the other
    loaders in this module, sequences containing ``X`` are not filtered.

    Returns:
        A 4-tuple ``(mono_all, poly_all, mono, poly)`` of arrays, each of
        shape ``(6, n_rows)`` holding the first six columns of the joined
        table.  Groups are selected by ``react`` value: ``mono_all`` 0-1,
        ``poly_all`` 2-4, ``mono`` 0, ``poly`` 3-4.
    """
    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    my_heavy = pandas.read_csv('app_data/hiv_igg_data/gut_heavy_aa.dat', sep=r'\s+')
    my_light = pandas.read_csv('app_data/hiv_igg_data/gut_light_aa.dat', sep=r'\s+')
    poly_YN = pandas.read_csv('app_data/hiv_igg_data/gut_num_react.dat', sep=r'\s+',
                              header=None, names=['react'])
    table = pandas.concat([my_light, my_heavy, poly_YN], axis=1)
    # Missing values (e.g. from files of unequal length) become ''.
    table = table.where(pandas.notnull(table), '')

    def select(react_values):
        # Rows with the requested react values, minus any row that has an
        # empty entry in the first six columns, returned as (6, n_rows).
        rows = table[table['react'].isin(react_values)].values
        complete = np.array(
            [not any(cell == '' for cell in row[:6]) for row in rows], dtype=bool)
        return np.transpose(rows[complete][:, 0:6])

    return (select([0, 1]),
            select([2, 3, 4]),
            select([0]),
            select([3, 4]))

def getHugo_Nature():
    """Load the HIV IgG "nat" tables and split them by ``react`` value.

    Reads three whitespace-separated files from ``app_data/hiv_igg_data``:
    ``nat_light_aa.dat`` and ``nat_heavy_aa.dat`` (first line is the header;
    together they must supply the six ``cdr*_aa`` columns filtered below) and
    ``nat_num_react.dat`` (no header, one ``react`` column).  They are joined
    column-wise in the order light, heavy, react.  Rows in which any CDR
    sequence contains an ``X`` are discarded, as are rows with an empty entry
    in any of the first six columns.

    Returns:
        A 4-tuple ``(mono_all, poly_all, mono, poly)`` of arrays, each of
        shape ``(6, n_rows)`` holding the first six columns of the joined
        table.  Groups are selected by ``react`` value: ``mono_all`` 0-1,
        ``poly_all`` 2-6, ``mono`` 0, ``poly`` 5-6.
    """
    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    my_heavy = pandas.read_csv('app_data/hiv_igg_data/nat_heavy_aa.dat', sep=r'\s+')
    my_light = pandas.read_csv('app_data/hiv_igg_data/nat_light_aa.dat', sep=r'\s+')
    poly_YN = pandas.read_csv('app_data/hiv_igg_data/nat_num_react.dat', sep=r'\s+',
                              header=None, names=['react'])
    table = pandas.concat([my_light, my_heavy, poly_YN], axis=1)

    # Missing values (e.g. from files of unequal length) become '' so the
    # string operations below never see NaN.
    table = table.where(pandas.notnull(table), '')
    # Remove X's in sequences... Should actually get a count of these at some point...
    for column in ['cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa', 'cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa']:
        table = table[~table[column].str.contains("X")]

    def select(react_values):
        # Rows with the requested react values, minus any row that has an
        # empty entry in the first six columns, returned as (6, n_rows).
        rows = table[table['react'].isin(react_values)].values
        complete = np.array(
            [not any(cell == '' for cell in row[:6]) for row in rows], dtype=bool)
        return np.transpose(rows[complete][:, 0:6])

    return (select([0.0, 1.0]),
            select([2.0, 3.0, 4.0, 5.0, 6.0]),
            select([0.0]),
            select([5.0, 6.0]))

def getHugo_NatCNTRL():
    """Load the HIV IgG "nat_cntrl" tables and split them by ``react`` value.

    Reads three whitespace-separated files from ``app_data/hiv_igg_data``:
    ``nat_cntrl_light_aa.dat`` and ``nat_cntrl_heavy_aa.dat`` (first line is
    the header; together they must supply the six ``cdr*_aa`` columns
    filtered below) and ``nat_cntrl_num_react.dat`` (no header, one ``react``
    column).  They are joined column-wise in the order light, heavy, react.
    Rows in which any CDR sequence contains an ``X`` are discarded, as are
    rows with an empty entry in any of the first six columns.

    Returns:
        A 4-tuple ``(mono_all, poly_all, mono, poly)`` of arrays, each of
        shape ``(6, n_rows)`` holding the first six columns of the joined
        table.  Groups are selected by ``react`` value: ``mono_all`` 0-1,
        ``poly_all`` 2-6, ``mono`` 0, ``poly`` 5-6.
    """
    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    my_heavy = pandas.read_csv('app_data/hiv_igg_data/nat_cntrl_heavy_aa.dat', sep=r'\s+')
    my_light = pandas.read_csv('app_data/hiv_igg_data/nat_cntrl_light_aa.dat', sep=r'\s+')
    poly_YN = pandas.read_csv('app_data/hiv_igg_data/nat_cntrl_num_react.dat', sep=r'\s+',
                              header=None, names=['react'])
    table = pandas.concat([my_light, my_heavy, poly_YN], axis=1)

    # Missing values (e.g. from files of unequal length) become '' so the
    # string operations below never see NaN.
    table = table.where(pandas.notnull(table), '')
    # Remove X's in sequences... Should actually get a count of these at some point...
    for column in ['cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa', 'cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa']:
        table = table[~table[column].str.contains("X")]

    def select(react_values):
        # Rows with the requested react values, minus any row that has an
        # empty entry in the first six columns, returned as (6, n_rows).
        rows = table[table['react'].isin(react_values)].values
        complete = np.array(
            [not any(cell == '' for cell in row[:6]) for row in rows], dtype=bool)
        return np.transpose(rows[complete][:, 0:6])

    return (select([0.0, 1.0]),
            select([2.0, 3.0, 4.0, 5.0, 6.0]),
            select([0.0]),
            select([5.0, 6.0]))


def getHugo_PLOS():
    """Load the HIV IgG "plos" tables and split them by their ``YN`` label.

    Reads three whitespace-separated files from ``app_data/hiv_igg_data``:
    ``plos_light_aa.dat`` and ``plos_heavy_aa.dat`` (first line is the
    header; together they must supply the six ``cdr*_aa`` columns filtered
    below) and ``plos_yn.dat`` (no header, one ``YN`` column).  They are
    joined column-wise in the order light, heavy, YN.  Rows in which any CDR
    sequence contains an ``X`` are discarded, as are rows with an empty entry
    in any of the first six columns.

    Returns:
        A 2-tuple ``(mono_all, poly_all)`` of arrays, each of shape
        ``(6, n_rows)`` holding the first six columns of the joined table:
        ``mono_all`` for rows labelled ``'N'``, ``poly_all`` for ``'Y'``.
    """
    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    my_heavy = pandas.read_csv('app_data/hiv_igg_data/plos_heavy_aa.dat', sep=r'\s+')
    my_light = pandas.read_csv('app_data/hiv_igg_data/plos_light_aa.dat', sep=r'\s+')
    poly_YN = pandas.read_csv('app_data/hiv_igg_data/plos_yn.dat', sep=r'\s+',
                              header=None, names=['YN'])
    table = pandas.concat([my_light, my_heavy, poly_YN], axis=1)

    # Missing values (e.g. from files of unequal length) become '' so the
    # string operations below never see NaN.
    table = table.where(pandas.notnull(table), '')
    # Remove X's in sequences... Should actually get a count of these at some point...
    for column in ['cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa', 'cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa']:
        table = table[~table[column].str.contains("X")]

    def select(label):
        # Rows carrying the requested YN label, minus any row that has an
        # empty entry in the first six columns, returned as (6, n_rows).
        rows = table[table['YN'] == label].values
        complete = np.array(
            [not any(cell == '' for cell in row[:6]) for row in rows], dtype=bool)
        return np.transpose(rows[complete][:, 0:6])

    return (select('N'), select('Y'))

def getAdimab():
    """Assemble the Adimab data files into one table.

    Reads from ``app_data/adimab_data``: ``cdrs_H_final.txt`` and
    ``cdrs_L_final.txt`` (whitespace-separated, no header, three CDR columns
    each), plus ``drug_outcomes.csv`` and ``drug_properties.csv``
    (comma-separated, first line is the header).

    Returns:
        pandas.DataFrame whose columns are, in order: ``Name``, the three
        heavy-chain CDR columns, ``Clinical Status``, the three light-chain
        CDR columns, ``Phagec``, and every column of ``drug_properties.csv``
        except ``Unnamed: 13``.  Rows containing any missing value are
        dropped; no further processing is applied.

    Raises:
        KeyError: if ``drug_outcomes.csv`` lacks the ``Name``,
            ``Clinical Status`` or ``Phagec`` column.
    """
    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    heavy_Abs = pandas.read_csv('app_data/adimab_data/cdrs_H_final.txt', sep=r'\s+',
                                header=None, names=['cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa'])
    light_Abs = pandas.read_csv('app_data/adimab_data/cdrs_L_final.txt', sep=r'\s+',
                                header=None, names=['cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa'])
    outcomes = pandas.read_csv('app_data/adimab_data/drug_outcomes.csv', sep=',', header=0)
    assays = pandas.read_csv('app_data/adimab_data/drug_properties.csv', sep=',', header=0)

    # 'Unnamed: 13' is left out of the assay columns (presumably a stray
    # empty column in the CSV -- confirm against the data file).
    assay_columns = assays.loc[:, assays.columns != 'Unnamed: 13']

    # Let's not process this data, just return the matrix
    total_Abs = pandas.concat(
        [outcomes['Name'], heavy_Abs, outcomes['Clinical Status'],
         light_Abs, outcomes['Phagec'], assay_columns],
        axis=1).dropna()
    return total_Abs
#####################################################################################

def getSabDab():
    """Assemble the SAbDab data files into two tables.

    Reads from ``app_data/SabDab_data``: ``nonAdimab_igblast/cdrs_H_final.txt``
    and ``nonAdimab_igblast/cdrs_L_final.txt`` (whitespace-separated, no
    header, three CDR columns each), plus ``non_adimab_dataHu.csv`` and
    ``adimab_SabDabdata.csv`` (comma-separated, first line is the header).

    Returns:
        A 2-tuple ``(adimab_info, notadimab_all)`` of pandas.DataFrame:

        * ``adimab_info`` -- ``Therapeutic``,
          ``Highest_Clin_Trial (Jan '20)`` and ``Est. Status`` taken from
          ``adimab_SabDabdata.csv``.
        * ``notadimab_all`` -- ``Therapeutic``, the heavy-chain CDR columns,
          the light-chain CDR columns, ``Highest_Clin_Trial (Jan '20)`` and
          ``Est. Status``, the named columns taken from
          ``non_adimab_dataHu.csv``.

        In both tables rows containing any missing value are dropped; no
        further processing is applied.

    Raises:
        KeyError: if either CSV lacks one of the three named columns.
    """
    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    heavy_Abs = pandas.read_csv('app_data/SabDab_data/nonAdimab_igblast/cdrs_H_final.txt',
                                sep=r'\s+', header=None,
                                names=['cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa'])
    light_Abs = pandas.read_csv('app_data/SabDab_data/nonAdimab_igblast/cdrs_L_final.txt',
                                sep=r'\s+', header=None,
                                names=['cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa'])
    notAdi = pandas.read_csv('app_data/SabDab_data/non_adimab_dataHu.csv', sep=',', header=0)
    Adi = pandas.read_csv('app_data/SabDab_data/adimab_SabDabdata.csv', sep=',', header=0)

    name_col = 'Therapeutic'
    outcome_col = "Highest_Clin_Trial (Jan '20)"
    status_col = 'Est. Status'

    adimab_info = pandas.concat(
        [Adi[name_col], Adi[outcome_col], Adi[status_col]], axis=1).dropna()
    notadimab_all = pandas.concat(
        [notAdi[name_col], heavy_Abs, light_Abs, notAdi[outcome_col], notAdi[status_col]],
        axis=1).dropna()
    # Let's not process this data, just return the matrix
    return (adimab_info, notadimab_all)
#####################################################################################
The diff you're trying to view is too large. Only the first 1000 changed files have been loaded.
Showing with 0 additions and 0 deletions (0 / 0 diffs computed)
swh spinner

Computing file changes ...

Software Heritage — Copyright (C) 2015–2025, The Software Heritage developers. License: GNU AGPLv3+.
The source code of Software Heritage itself is available on our development forge.
The source code files archived by Software Heritage are available under their own copyright and licenses.
Terms of use: Archive access, API— Contact— JavaScript license information— Web API

back to top