https://github.com/nbraffman/CylK-homologs
Revision 732a38ef8ab73988203ea1933158238ea90361b0 authored by nbraffman on 29 November 2021, 18:59:23 UTC, committed by GitHub on 29 November 2021, 18:59:23 UTC
1 parent 88e9848
Raw File
Tip revision: 732a38ef8ab73988203ea1933158238ea90361b0 authored by nbraffman on 29 November 2021, 18:59:23 UTC
Update README.md
Tip revision: 732a38e
script_Nterm.sh
#!/bin/sh
filename='input_all_homologs.fasta'
Counter=0
rm out_homologs.fasta
while read line; do
    if [ $Counter -eq 0 ]; then #copy first line metadata of fasta file to temp.txt
        metadata=$line          #copy name to variable for log output
        echo $line > temp.txt
        echo $line > temp.fasta
        Counter=$((Counter+1)) #increase counter index to proceed to second line
    else
        echo $line >> temp.txt #copy second line sequence data to temp.txt
        echo $line >> temp.fasta #secondary copy for binning
        Counter=$((Counter-1)) #decrease counter index back to 0 to reset for next sequence
        
        filename='licheniforme.fasta'   #in the meantime... while we are on each sequence data line:
        while read line; do
            echo $line >> temp.txt      #copy reference metadata and sequence to temp.txt 
        done < 'licheniforme.fasta'

        muscle -in temp.txt -out temp.afa   #run muscle alignment of sequence against ref, output temp.afa

        filename='temp.afa'     #open temp.afa
        Counter2=0
        while read line; do
            Counter2=$((Counter2+1))    #determine number of lines because it will vary between alignments
        done < 'temp.afa'

        filename='temp.afa'     #re-open temp.afa
        Counter3=0
        homolog=""              #define variables homolog and refseq to exract data as strings from alignment
        refseq=""
        while read -r line; do          #very important to have -r here to recognize carriage returns
            Counter3=$((Counter3+1))

            if [ $Counter3 -eq 1 ]; then #at first line of alignment file, skip metadata
                echo
            else                                                #at all other lines
                if [ $Counter3 -le $((Counter2/2)) ]; then      #if we are in the first half of text file (homolog of interest)
                homolog="$homolog$line"                         #conatenate string to remove returns
                else
                    if [ $Counter3 -gt $((Counter2/2+1)) ]; then    #at all other lines second half of text file (reference seq)
                        refseq="$refseq$line"                       #concatenate string to remove returns
                    fi
                fi
            fi
            #echo $homolog > temp2.txt #write extracted strings to temp2.txt and temp3.txt for analysis
            #echo $refseq > temp3.txt
        done < 'temp.afa'

        hash_counter=0                                      #define variable to count breaks in alignment from left to right
        for (( i=0; i<${#refseq}; i++ )); do                #iterate through reference sequence (string)
            if [ $i -le $((hash_counter+240)) ]; then       #continue until the index is <= the number of breaks + 240
                if [ "${refseq:$i:1}" = "-" ]; then         #this is how we are defining the N-terminus, containing something between R105 and K240
                    hash_counter=$(($hash_counter+1))
                fi
            fi
        done
        echo "Query Sequence: "$metadata
        echo "N-Terminal breaks in ref sequence: "$hash_counter
        
        hash_counter2=0
        index=0
        for (( i=0; i<${#homolog}; i++ )); do              #repeat with homolog sequence (string) for the index length as above
            if [ $i -le $((hash_counter+240)) ]; then      #NOTE this should be hash_counter not hash_counter2
                index=$(($index+1))
                if [ "${homolog:$i:1}" = "-" ]; then        
                    hash_counter2=$(($hash_counter2+1))    #but do keep track of hash count in this sequence for analysis
                fi
            fi
        done
        echo "N-terminal breaks in query sequence: "$hash_counter2
        echo "# of residues before K240 equivalent: "$((index-hash_counter2))

        if [ $((index-hash_counter2+hash_counter)) -ge 135  ]; then
            filename='temp.fasta'
            while read line; do
                echo $line >> out_homologs.fasta
            done <'temp.fasta'
        fi
    fi
done < 'input_all_homologs.fasta'

rm temp.fasta
rm temp.afa
rm temp.txt

back to top