Skip to main content
  • Home
  • Development
  • Documentation
  • Donate
  • Operational login
  • Browse the archive

swh logo
SoftwareHeritage
Software
Heritage
Archive
Features
  • Search

  • Downloads

  • Save code now

  • Add forge now

  • Help

Revision 212ab6cb8d79a68e1c5b5b04a17506f195b3f634 authored by Christopher Boughter on 14 September 2020, 16:44:29 UTC, committed by GitHub on 14 September 2020, 16:44:29 UTC
Update install_packages.sh
1 parent 2c5516b
  • Files
  • Changes
  • 0dcbe37
  • /
  • seq_loader.py
Raw File Download
Permalinks

To reference or cite the objects present in the Software Heritage archive, permalinks based on SoftWare Hash IDentifiers (SWHIDs) must be used.
Select below a type of object currently browsed in order to display its associated SWHID and permalink.

  • revision
  • directory
  • content
revision badge
swh:1:rev:212ab6cb8d79a68e1c5b5b04a17506f195b3f634
directory badge Iframe embedding
swh:1:dir:0dcbe372bc82884a8bc3774a8390e76471e8fbe0
content badge Iframe embedding
swh:1:cnt:5c49ef6bdfdbffe00f028227df1a802e2f0a878b
Citations

This interface lets you generate software citations, provided that the root directory of the browsed objects contains a citation.cff or codemeta.json file.
Select below a type of object currently browsed in order to generate citations for them.

  • revision
  • directory
  • content
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
seq_loader.py
# Let's start off by loading in Jeff's CDR3's
import numpy as np
import pandas

def getBunker():
    """Load 'app_data/mouse_IgA.dat' and split its CDR sequences by 'react' value.

    The file is read as whitespace-separated text without a header row; its
    seven columns are named cdrL1_aa, cdrL2_aa, cdrL3_aa, cdrH1_aa, cdrH2_aa,
    cdrH3_aa and react.  Missing values are replaced by '' and every row in
    which any CDR column contains an "X" is discarded.

    Returns:
        A 4-tuple ``(mono_all, poly_all, mono, poly)`` of arrays.  Each array
        holds the six CDR columns, transposed to shape (6, n_rows):

        * mono_all -- rows whose react value is 0 or 1
        * poly_all -- rows whose react value is 2 through 7
        * mono     -- rows whose react value is 0
        * poly     -- rows whose react value is 5, 6 or 7

        Rows with an empty string in any of the six CDR columns are left out.
        (Presumably "mono"/"poly" stand for mono-/polyreactive antibodies --
        confirm against the data description.)
    """
    cdr_columns = ['cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa',
                   'cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa']
    table = pandas.read_csv(
        'app_data/mouse_IgA.dat',
        sep=r'\s+',  # raw string: '\s' is an invalid escape in a plain literal
        header=None,
        names=cdr_columns + ['react'],
    )
    # Replace missing entries with '' so the .str accessor below never sees NaN.
    table = table.where(pandas.notnull(table), '')
    # Remove X's in sequences... Should actually get a count of these at some point...
    for column in cdr_columns:
        table = table[~table[column].str.contains("X")]

    def cdr_matrix(react_values):
        # Rows whose 'react' is in react_values, minus any row with an empty
        # CDR entry, returned as a (6, n_rows) array.
        rows = table[table['react'].isin(react_values)].values
        has_blank = (rows[:, 0:6] == '').any(axis=1)
        return np.transpose(rows[~has_blank][:, 0:6])

    return (
        cdr_matrix([0.0, 1.0]),
        cdr_matrix([2.0, 3.0, 4.0, 5.0, 6.0, 7.0]),
        cdr_matrix([0.0]),
        cdr_matrix([5.0, 6.0, 7.0]),
    )
#####################################################################################

def getJenna():
    """Load 'app_data/flu_IgG.dat' and split its CDR sequences by 'react' value.

    The file is read as whitespace-separated text without a header row; its
    seven columns are named cdrL1_aa, cdrL2_aa, cdrL3_aa, cdrH1_aa, cdrH2_aa,
    cdrH3_aa and react.  Missing values are replaced by '' and every row in
    which any CDR column contains an "X" is discarded.

    Returns:
        A 4-tuple ``(mono_all, poly_all, mono, poly)`` of arrays.  Each array
        holds the six CDR columns, transposed to shape (6, n_rows):

        * mono_all -- rows whose react value is 0 or 1
        * poly_all -- rows whose react value is 2 through 7
        * mono     -- rows whose react value is 0
        * poly     -- rows whose react value is 5, 6 or 7

        Rows with an empty string in any of the six CDR columns are left out.
    """
    cdr_columns = ['cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa',
                   'cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa']
    table = pandas.read_csv(
        'app_data/flu_IgG.dat',
        sep=r'\s+',  # raw string: '\s' is an invalid escape in a plain literal
        header=None,
        names=cdr_columns + ['react'],
    )
    # Replace missing entries with '' so the .str accessor below never sees NaN.
    table = table.where(pandas.notnull(table), '')
    # Remove X's in sequences... Should actually get a count of these at some point...
    for column in cdr_columns:
        table = table[~table[column].str.contains("X")]

    # Having this and the above lines as "if" options could make this loader more generalizable...
    def cdr_matrix(react_values):
        # Rows whose 'react' is in react_values, minus any row with an empty
        # CDR entry, returned as a (6, n_rows) array.
        rows = table[table['react'].isin(react_values)].values
        has_blank = (rows[:, 0:6] == '').any(axis=1)
        return np.transpose(rows[~has_blank][:, 0:6])

    return (
        cdr_matrix([0, 1]),
        cdr_matrix([2, 3, 4, 5, 6, 7]),
        cdr_matrix([0]),
        cdr_matrix([5, 6, 7]),
    )

def getHugo():
    """Load the 'gut' tables under 'app_data/hiv_igg_data' and split by 'react'.

    Three whitespace-separated files are read: 'gut_light_aa.dat' and
    'gut_heavy_aa.dat' (each with a header row) and 'gut_num_react.dat' (no
    header, read as a single column named 'react').  They are concatenated
    column-wise in the order light, heavy, react, and missing values are
    replaced by ''.

    NOTE(review): unlike the other loaders in this file, rows containing an
    "X" are NOT removed here -- confirm whether that is intentional.

    Returns:
        A 4-tuple ``(mono_all, poly_all, mono, poly)`` of arrays.  Each array
        holds the first six columns of the concatenated table (presumably
        three light-chain then three heavy-chain CDR columns -- depends on
        the file headers), transposed to shape (6, n_rows):

        * mono_all -- rows whose react value is 0 or 1
        * poly_all -- rows whose react value is 2, 3 or 4
        * mono     -- rows whose react value is 0
        * poly     -- rows whose react value is 3 or 4

        Rows with an empty string in any of those six columns are left out.
    """
    # raw strings: '\s' is an invalid escape sequence in a plain literal
    heavy = pandas.read_csv('app_data/hiv_igg_data/gut_heavy_aa.dat', sep=r'\s+')
    light = pandas.read_csv('app_data/hiv_igg_data/gut_light_aa.dat', sep=r'\s+')
    react = pandas.read_csv('app_data/hiv_igg_data/gut_num_react.dat', sep=r'\s+',
                            header=None, names=['react'])
    table = pandas.concat([light, heavy, react], axis=1)
    # Replace missing entries with '' so blank sequences can be detected below.
    table = table.where(pandas.notnull(table), '')

    def cdr_matrix(react_values):
        # Rows whose 'react' is in react_values, minus any row with an empty
        # entry in the first six columns, returned as a (6, n_rows) array.
        rows = table[table['react'].isin(react_values)].values
        has_blank = (rows[:, 0:6] == '').any(axis=1)
        return np.transpose(rows[~has_blank][:, 0:6])

    return (
        cdr_matrix([0, 1]),
        cdr_matrix([2, 3, 4]),
        cdr_matrix([0]),
        cdr_matrix([3, 4]),
    )

def getHugo_Nature():
    """Load the 'nat' tables under 'app_data/hiv_igg_data' and split by 'react'.

    Three whitespace-separated files are read: 'nat_light_aa.dat' and
    'nat_heavy_aa.dat' (each with a header row, which must provide the
    columns cdrL1_aa..cdrL3_aa and cdrH1_aa..cdrH3_aa used below) and
    'nat_num_react.dat' (no header, read as a single column named 'react').
    They are concatenated column-wise in the order light, heavy, react;
    missing values are replaced by '' and every row in which any CDR column
    contains an "X" is discarded.

    Returns:
        A 4-tuple ``(mono_all, poly_all, mono, poly)`` of arrays.  Each array
        holds the first six columns of the concatenated table, transposed to
        shape (6, n_rows):

        * mono_all -- rows whose react value is 0 or 1
        * poly_all -- rows whose react value is 2 through 6
        * mono     -- rows whose react value is 0
        * poly     -- rows whose react value is 5 or 6

        Rows with an empty string in any of those six columns are left out.
    """
    # raw strings: '\s' is an invalid escape sequence in a plain literal
    heavy = pandas.read_csv('app_data/hiv_igg_data/nat_heavy_aa.dat', sep=r'\s+')
    light = pandas.read_csv('app_data/hiv_igg_data/nat_light_aa.dat', sep=r'\s+')
    react = pandas.read_csv('app_data/hiv_igg_data/nat_num_react.dat', sep=r'\s+',
                            header=None, names=['react'])
    table = pandas.concat([light, heavy, react], axis=1)

    # Replace missing entries with '' so the .str accessor below never sees NaN.
    table = table.where(pandas.notnull(table), '')
    # Remove X's in sequences... Should actually get a count of these at some point...
    for column in ('cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa',
                   'cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa'):
        table = table[~table[column].str.contains("X")]

    def cdr_matrix(react_values):
        # Rows whose 'react' is in react_values, minus any row with an empty
        # entry in the first six columns, returned as a (6, n_rows) array.
        rows = table[table['react'].isin(react_values)].values
        has_blank = (rows[:, 0:6] == '').any(axis=1)
        return np.transpose(rows[~has_blank][:, 0:6])

    # And finish it up...
    return (
        cdr_matrix([0.0, 1.0]),
        cdr_matrix([2.0, 3.0, 4.0, 5.0, 6.0]),
        cdr_matrix([0.0]),
        cdr_matrix([5.0, 6.0]),
    )

def getHugo_NatCNTRL():
    """Load the 'nat_cntrl' tables under 'app_data/hiv_igg_data' and split by 'react'.

    Three whitespace-separated files are read: 'nat_cntrl_light_aa.dat' and
    'nat_cntrl_heavy_aa.dat' (each with a header row, which must provide the
    columns cdrL1_aa..cdrL3_aa and cdrH1_aa..cdrH3_aa used below) and
    'nat_cntrl_num_react.dat' (no header, read as a single column named
    'react').  They are concatenated column-wise in the order light, heavy,
    react; missing values are replaced by '' and every row in which any CDR
    column contains an "X" is discarded.

    Returns:
        A 4-tuple ``(mono_all, poly_all, mono, poly)`` of arrays.  Each array
        holds the first six columns of the concatenated table, transposed to
        shape (6, n_rows):

        * mono_all -- rows whose react value is 0 or 1
        * poly_all -- rows whose react value is 2 through 6
        * mono     -- rows whose react value is 0
        * poly     -- rows whose react value is 5 or 6

        Rows with an empty string in any of those six columns are left out.
    """
    # raw strings: '\s' is an invalid escape sequence in a plain literal
    heavy = pandas.read_csv('app_data/hiv_igg_data/nat_cntrl_heavy_aa.dat', sep=r'\s+')
    light = pandas.read_csv('app_data/hiv_igg_data/nat_cntrl_light_aa.dat', sep=r'\s+')
    react = pandas.read_csv('app_data/hiv_igg_data/nat_cntrl_num_react.dat', sep=r'\s+',
                            header=None, names=['react'])
    table = pandas.concat([light, heavy, react], axis=1)

    # Replace missing entries with '' so the .str accessor below never sees NaN.
    table = table.where(pandas.notnull(table), '')
    # Remove X's in sequences... Should actually get a count of these at some point...
    for column in ('cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa',
                   'cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa'):
        table = table[~table[column].str.contains("X")]

    def cdr_matrix(react_values):
        # Rows whose 'react' is in react_values, minus any row with an empty
        # entry in the first six columns, returned as a (6, n_rows) array.
        rows = table[table['react'].isin(react_values)].values
        has_blank = (rows[:, 0:6] == '').any(axis=1)
        return np.transpose(rows[~has_blank][:, 0:6])

    # And finish it up...
    return (
        cdr_matrix([0.0, 1.0]),
        cdr_matrix([2.0, 3.0, 4.0, 5.0, 6.0]),
        cdr_matrix([0.0]),
        cdr_matrix([5.0, 6.0]),
    )


def getHugo_PLOS():
    """Load the 'plos' tables under 'app_data/hiv_igg_data' and split by the 'YN' label.

    Three whitespace-separated files are read: 'plos_light_aa.dat' and
    'plos_heavy_aa.dat' (each with a header row, which must provide the
    columns cdrL1_aa..cdrL3_aa and cdrH1_aa..cdrH3_aa used below) and
    'plos_yn.dat' (no header, read as a single column named 'YN').  They are
    concatenated column-wise in the order light, heavy, YN; missing values
    are replaced by '' and every row in which any CDR column contains an "X"
    is discarded.

    Returns:
        A 2-tuple ``(mono_all, poly_all)`` of arrays.  Each array holds the
        first six columns of the concatenated table, transposed to shape
        (6, n_rows): ``mono_all`` from rows whose YN value is 'N' and
        ``poly_all`` from rows whose YN value is 'Y'.  Rows with an empty
        string in any of those six columns are left out.
    """
    # raw strings: '\s' is an invalid escape sequence in a plain literal
    heavy = pandas.read_csv('app_data/hiv_igg_data/plos_heavy_aa.dat', sep=r'\s+')
    light = pandas.read_csv('app_data/hiv_igg_data/plos_light_aa.dat', sep=r'\s+')
    labels = pandas.read_csv('app_data/hiv_igg_data/plos_yn.dat', sep=r'\s+',
                             header=None, names=['YN'])
    table = pandas.concat([light, heavy, labels], axis=1)

    # Replace missing entries with '' so the .str accessor below never sees NaN.
    table = table.where(pandas.notnull(table), '')
    # Remove X's in sequences... Should actually get a count of these at some point...
    for column in ('cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa',
                   'cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa'):
        table = table[~table[column].str.contains("X")]

    def cdr_matrix(label):
        # Rows whose 'YN' equals label, minus any row with an empty entry in
        # the first six columns, returned as a (6, n_rows) array.
        rows = table[table['YN'] == label].values
        has_blank = (rows[:, 0:6] == '').any(axis=1)
        return np.transpose(rows[~has_blank][:, 0:6])

    # And finish it up...
    return (cdr_matrix('N'), cdr_matrix('Y'))

def getAdimab():
    """Assemble one table from the four files under 'app_data/adimab_data'.

    Reads:
      * 'cdrs_H_final.txt' -- whitespace-separated, no header; columns are
        named cdrH1_aa, cdrH2_aa, cdrH3_aa
      * 'cdrs_L_final.txt' -- whitespace-separated, no header; columns are
        named cdrL1_aa, cdrL2_aa, cdrL3_aa
      * 'drug_outcomes.csv' -- comma-separated with a header row; the
        columns 'Name', 'Clinical Status' and 'Phagec' are used
      * 'drug_properties.csv' -- comma-separated with a header row; every
        column except one named 'Unnamed: 13' is used

    Returns:
        A pandas DataFrame whose columns are, in order: Name, the three
        heavy-chain CDR columns, Clinical Status, the three light-chain CDR
        columns, Phagec, then the retained drug_properties columns.  The
        pieces are joined column-wise on the row index, and any row with a
        missing value in any column is dropped.
    """
    # raw strings: '\s' is an invalid escape sequence in a plain literal
    heavy_cdrs = pandas.read_csv('app_data/adimab_data/cdrs_H_final.txt', sep=r'\s+',
                                 header=None, names=['cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa'])
    light_cdrs = pandas.read_csv('app_data/adimab_data/cdrs_L_final.txt', sep=r'\s+',
                                 header=None, names=['cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa'])
    outcomes = pandas.read_csv('app_data/adimab_data/drug_outcomes.csv', sep=',', header=0)
    assays = pandas.read_csv('app_data/adimab_data/drug_properties.csv', sep=',', header=0)

    # Keep every assay column except 'Unnamed: 13' (presumably an artifact of
    # a trailing comma in the CSV -- confirm against the data file).
    kept_assays = assays.loc[:, assays.columns != 'Unnamed: 13']

    pieces = [
        outcomes['Name'],
        heavy_cdrs,
        outcomes['Clinical Status'],
        light_cdrs,
        outcomes['Phagec'],
        kept_assays,
    ]
    # Let's not process this data, just return the matrix
    return pandas.concat(pieces, axis=1).dropna()
#####################################################################################

def getSabDab():
    """Assemble two tables from the files under 'app_data/SabDab_data'.

    Reads:
      * 'nonAdimab_igblast/cdrs_H_final.txt' -- whitespace-separated, no
        header; columns are named cdrH1_aa, cdrH2_aa, cdrH3_aa
      * 'nonAdimab_igblast/cdrs_L_final.txt' -- whitespace-separated, no
        header; columns are named cdrL1_aa, cdrL2_aa, cdrL3_aa
      * 'non_adimab_dataHu.csv' and 'adimab_SabDabdata.csv' --
        comma-separated with a header row; the columns 'Therapeutic',
        "Highest_Clin_Trial (Jan '20)" and 'Est. Status' are used from each

    Returns:
        A 2-tuple of pandas DataFrames ``(adimab_info, notadimab_all)``:

        * adimab_info   -- the three named columns of 'adimab_SabDabdata.csv'
        * notadimab_all -- 'Therapeutic', the heavy-chain CDR columns, the
          light-chain CDR columns, then the clinical-trial and status
          columns of 'non_adimab_dataHu.csv', joined column-wise on the row
          index

        In both frames any row with a missing value is dropped.
    """
    # raw strings: '\s' is an invalid escape sequence in a plain literal
    heavy_cdrs = pandas.read_csv('app_data/SabDab_data/nonAdimab_igblast/cdrs_H_final.txt',
                                 sep=r'\s+', header=None,
                                 names=['cdrH1_aa', 'cdrH2_aa', 'cdrH3_aa'])
    light_cdrs = pandas.read_csv('app_data/SabDab_data/nonAdimab_igblast/cdrs_L_final.txt',
                                 sep=r'\s+', header=None,
                                 names=['cdrL1_aa', 'cdrL2_aa', 'cdrL3_aa'])
    non_adimab = pandas.read_csv('app_data/SabDab_data/non_adimab_dataHu.csv', sep=',', header=0)
    adimab = pandas.read_csv('app_data/SabDab_data/adimab_SabDabdata.csv', sep=',', header=0)

    trial_column = "Highest_Clin_Trial (Jan '20)"

    adimab_info = pandas.concat(
        [adimab['Therapeutic'], adimab[trial_column], adimab['Est. Status']],
        axis=1,
    ).dropna()
    notadimab_all = pandas.concat(
        [
            non_adimab['Therapeutic'],
            heavy_cdrs,
            light_cdrs,
            non_adimab[trial_column],
            non_adimab['Est. Status'],
        ],
        axis=1,
    ).dropna()
    # Let's not process this data, just return the matrix
    return (adimab_info, notadimab_all)
#####################################################################################
The diff you're trying to view is too large. Only the first 1000 changed files have been loaded.
Showing with 0 additions and 0 deletions (0 / 0 diffs computed)
swh spinner

Computing file changes ...

back to top

Software Heritage — Copyright (C) 2015–2025, The Software Heritage developers. License: GNU AGPLv3+.
The source code of Software Heritage itself is available on our development forge.
The source code files archived by Software Heritage are available under their own copyright and licenses.
Terms of use: Archive access, API— Contact— JavaScript license information— Web API