Revision 469689991e7dbce2be4c4f618584304d91841c49 authored by Chase W. Nelson on 01 October 2020, 17:39:04 UTC, committed by Chase W. Nelson on 01 October 2020, 17:39:04 UTC
1 parent 0b9f8cb
Fig8.m
clear all
close all
clc
A = importdata('NC_045512_site_database_altA.tsv'); %col 18:end are intrahost samples
A = cellfun(@(s) strsplit(s), A, 'UniformOutput', false);
T = importdata('NC_045512_site_database_altT.tsv');
T = cellfun(@(s) strsplit(s), T, 'UniformOutput', false);
G = importdata('NC_045512_site_database_altG.tsv');
G = cellfun(@(s) strsplit(s), G, 'UniformOutput', false);
C = importdata('NC_045512_site_database_altC.tsv');
C = cellfun(@(s) strsplit(s), C, 'UniformOutput', false);
row = size(A,1);
Result = zeros(row-1,8); %in the order of ATGC.
All = zeros(row-1,2);
ListNuc = [{'A'},{'T'},{'G'},{'C'}];
Gene1 = cellfun(@(s) s{3}, A(2:end), 'UniformOutput',false);
Gene2 = cellfun(@(s) s{4}, A(2:end), 'UniformOutput',false);
uniqueGene1 = setdiff(unique(Gene1),'.');
uniqueGene2 = setdiff(unique(Gene2),'.');
Coor1 = getGeneCoor(Gene1,uniqueGene1);
Coor2 = getGeneCoor(Gene2,uniqueGene2);
for i = 2:row
recA= returnRecurrent(A{i}(18:end));
recT = returnRecurrent(T{i}(18:end));
recG = returnRecurrent(G{i}(18:end));
recC = returnRecurrent(C{i}(18:end));
Result(i-1,:) = [recA,recT,recG,recC];
Data = [A{i}(18:end);T{i}(18:end);G{i}(18:end);C{i}(18:end)];
indNuc = find(strcmp(ListNuc,A{i}(2)) == 1);
Data(indNuc,:) = [];
All(i-1,:) = returnRecurrent(reshape(Data,1,401*3));
end
indA = find(Result(:,1) >= 10);
indU = find(Result(:,3) >= 10);
nonsynAll = [{getNonSyn(A)},{getNonSyn(T)},{getNonSyn(G)},{getNonSyn(C)}];
Colors = [ 0.5 0.5 0;
0 0.5 0.5;
0.5 0 0.5;
0.1 0.1 0.1;
0.1 0.5 0.7;
0.3 0.7 0.5;
1 0 0;
0 0 1;
0 1 0;
1 1 0;
0 1 1;
1 0 1];
%-----plot the distribution----
subplot('Position',[0.08 0.2 0.86 0.75])
hold on
X = All([1:row-1]',1)./All([1:row-1]',2);
indX = find(X>0.025);
scatter(indX,X(indX),130,'MarkerFaceAlpha',0.5,'MarkerEdgeColor','k');
freqExceed = [];
indExceed = [];
for i = 1:4;
X = Result([1:row-1]',i*2-1)./All([1:row-1]',2);
indX = find(X>0.025);
scatter(indX,X(indX),130,'filled' ,'MarkerFaceAlpha',0.5);
indNS = nonsynAll{i};
indExceed = [indExceed;intersect(indX, indNS)];
freqExceed = [freqExceed;X(intersect(indX, indNS))];
end
scatter(indExceed, freqExceed,20,'filled','MarkerFaceColor','r');
ylim([0 .9])
indAll = find(All(:,1)./All(:,2) >= 0.025);
length(indAll);
[freqSort indSort] = sort(freqExceed);
indSortTop = indExceed(indSort(end-4:end))
freqSortTop = freqSort(end-4:end);
coorChange = [{'C10083U'},{'C5768A'},{'C8655U'},{'A14553C'},{'A404U'}]
aaChange = [{'S>F'},{'H>N'},{'S>F'},{'L>F'},{'K>Stop'}]
for i = 2:length(indSortTop)
text(indSortTop(i)-200, freqSortTop(i) + 0.05,strcat(coorChange{i},', ',aaChange{i}),'FontSize',18);
end
text(10087-200,0.192,'T10087G','FontSize',18);
for i = 1:length(uniqueGene1)
for j = 0:10:(Coor1(i,2) - Coor1(i,1))
plot([Coor1(i,1)+j,Coor1(i,1)+j],[0.0,0.02],'Color',Colors(i,:),'HandleVisibility','off')
end
end
for i = 1:length(uniqueGene2)
for j = 0:5:(Coor2(i,2) - Coor2(i,1))
plot([Coor2(i,1)+j,Coor2(i,1)+j],[0.005,0.02],'k','HandleVisibility','off')
end
end
legend('All','A','U','G','C','Nonsyn');
xlabel('Genome position')
ylabel('Proportion of samples');
set(gca,'FontSize',22);
box on
set(gcf,'PaperPosition',[0 0 16 6]);
saveas(1,'Fig8.jpg');
function nonsynIndex = getNonSyn(List)
aa1 = cellfun(@(s) s{11}, List(2:end), 'UniformOutput',false);
aa2 = cellfun(@(s) s{16}, List(2:end), 'UniformOutput',false);
aa3 = cellfun(@(s) s{12}, List(2:end), 'UniformOutput',false);
aa4 = cellfun(@(s) s{17}, List(2:end), 'UniformOutput',false);
diffAA1 = find(strcmp(aa1,aa2)~=1);
diffAA2 = find(strcmp(aa3,aa4)~=1);
nonsynIndex = union(diffAA1,diffAA2);
end
function Coor = getGeneCoor(GeneList,GeneName);
Coor = zeros(length(GeneName),2);
for i = 1:length(GeneName);
ind = find(strcmp(GeneList,GeneName{i})==1);
Coor(i,:) = [min(ind),max(ind)];
end
end
function recurrFreq = returnFreq(A)
recurrFreq = zeros(length(A),1);
indKeep = find(contains(A,',') == 1);
if length(indKeep) > 0
data = cellfun(@(s) strsplit(s, ','), A(indKeep),'UniformOutput',false);
refAllele = cell2mat(cellfun(@(s) str2num(s{1}), data,'UniformOutput',false));
altAllele = cell2mat(cellfun(@(s) str2num(s{2}), data,'UniformOutput',false));
recurrFreq(indKeep) = altAllele./(refAllele + altAllele);
end
end
function recurrVec = returnRecurrent(A)
indKeep = find(contains(A,',') == 1);
if length(indKeep) > 0
data = cellfun(@(s) strsplit(s, ','), A(indKeep),'UniformOutput',false);
refAllele = cell2mat(cellfun(@(s) str2num(s{1}), data,'UniformOutput',false));
altAllele = cell2mat(cellfun(@(s) str2num(s{2}), data,'UniformOutput',false));
alleleFreq = altAllele./(refAllele + altAllele);
ind1 = intersect(find(alleleFreq < 0.5),find(alleleFreq > 0 ));
ind2 = intersect(find(alleleFreq > 0.5),find(alleleFreq < 1 ));
recurrVec = [length(ind1), 401 - length(indKeep)];
else
recurrVec = [0, 0];
end
end
Computing file changes ...