clear all tic % 1) load ChIP-atlas gene lists for Cpp/P300 and NfkB/p65 [chip]=load_chip_atlas_human(); [0 toc/1e3] % 2) load and convert three scRNA data [sc]=list_scRNA2(1); for i1=1:length(sc) % 3) load scRNMA-seq UMI count Matrix in csv [a,gene]=convert_scRNA_to_mat(sc(i1)); [1 i1 toc/1e3] % 4) separate human and mouse cells [hg,geneh,ah,mg,genem,am,jh,jm]=isolate_mixed_sample(a,gene); HeLa(i1).ah=ah; HeLa(i1).gene=geneh; [2 i1 toc/1e3] end % 5) normalized UMI counts to library size and calculate mean and CV [HeLa]=Calcu_Hela_A2_A3_C2(HeLa); [3 toc/1e3] % 6) find genes with CVs significantly above the basal level [HeLa,Fig6C,index_outlier,cv_threashold,CPM_t]=Hela_find_CV_outlier(HeLa); [4 toc/1e3] % 7) statistic analysis of effects of A485+LMK on CV with outlier genes with large CVs [HeLa,bin, Fig6D,Fig6E,Fig6F]=Hela_bin_CVs_chip(HeLa,index_outlier,chip); % 8) calculate histograms for UMI counts and for "SAVER-denoised' UMI counts list(1).name='HMGCS1'; list(2).name='HMOX1';list(3).name='mRuby-2A-bla'; [Fig6G,list]=histograms_hela_UMI(HeLa,list); save('HeLa_A2_A3_C2_final.mat','HeLa','chip','geneh','index_outlier','cv_threashold','CPM_t','Fig6C','Fig6D','Fig6E','Fig6F','Fig6G','-v7.3'); % end of main program function [sc]=list_scRNA2(j1) % sample infomations sc(j1).folder='.\'; sc(j1).source='halpos+huada'; % sequencing platforms, data conbined sc(j1).name='A1A2_withIndex';sc(j1).species='human_mouse'; j1=j1+1;sc(j1).name='B1A3_withIndex';sc(j1).folder=sc(1).folder;sc(j1).source=sc(1).source;sc(j1).species='human_mouse'; j1=j1+1;sc(j1).name='C1C2_withIndex';sc(j1).folder=sc(1).folder;sc(j1).source=sc(1).source;sc(j1).species='human_mouse'; end function [chip]=load_chip_atlas_human() % downloaded from ChIP-atlas fileP300='EP300-human.tsv'; fileCbp='CREBBP-human.tsv'; fileNfkb='RELA-human.tsv'; temp=importdata(fileP300); ng=length(temp.data(:,1)); for i1=1:ng P300.gene{i1}=temp.textdata{i1+1,1}; end temp=importdata(fileCbp); ng=length(temp.data(:,1)); for i1=1:ng Cbp.gene{i1}=temp.textdata{i1+1,1}; end temp=importdata(fileNfkb); ng=length(temp.data(:,1)); for i1=1:ng Nfkb.gene{i1}=temp.textdata{i1+1,1}; end % merge ChIP list and scores for P300 and Cbp g1=string(P300.gene); g2=string(Cbp.gene); g12=unique([g1 g2]); CbpP300.gene=cellstr(g12); chip(1).name='Cbp/P300'; chip(1).gene=CbpP300.gene; chip(2).name='Rela (NfkB P65)'; chip(2).gene=Nfkb.gene; end function [a,gene]=convert_scRNA_to_mat(sc) filecsv=sprintf('%s%s.csv',sc.folder,sc.name) % load count matrix csv files temp=importdata(filecsv); clear gene % generate gene lists for i1=1:length(temp.textdata)-1 temp0=char(temp.textdata{1,i1+1}); if temp0(1)=='"' gene{i1}=temp0(2:end-1); else gene{i1}=temp0; end end a=temp.data; end function [hg,geneh,ah,mg,genem,am,jh,jm]=isolate_mixed_sample(a,gene) jh=0; jm=0; for i1=1:length(gene) temp=char(gene{i1}); if temp(1:2)=='hg' % human genes jh=jh+1; hg(jh)=i1; % indices for human genes geneh{jh}=temp(6:end); elseif temp(1:2)=='mm' % mouse genes jm=jm+1; mg(jm)=i1; % indices for mouse genes genem{jm}=temp(6:end); elseif temp(1:5)=='mRuby' % mRuby-blas transcripts in HeLa cell jh=jh+1; hg(jh)=i1; % indices for mouse genes geneh{jh}=temp; end end for i1=1:length(mg) temp=char(gene{mg(i1)}); genem{i1}=temp(6:end); end clear ah am ah=a(:,hg); am=a(:,mg); sh=sum(ah'); sm=sum(am'); clear ah am jh=find(sm<0.01*sh); jm=find(sh<0.018*sm); ah=a(jh,hg); am=a(jm,mg); end function [HeLa]=Calcu_Hela_A2_A3_C2(HeLa) b1h=HeLa(1).ah; b2h=HeLa(2).ah; b3h=HeLa(3).ah; geneh=HeLa(1).gene; ngh=length(geneh); TR1=sum(b1h')'; % total UMI counts for each cells TR2=sum(b2h')'; % total UMI counts for each cells TR3=sum(b3h')'; % total UMI counts for each cells % counts per million (CPM) normalized by the library size b1hn=1e6*b1h./repmat(TR1,1,ngh); b2hn=1e6*b2h./repmat(TR2,1,ngh); b3hn=1e6*b3h./repmat(TR3,1,ngh); % means of normalized UMI m10=mean(b1hn); m20=mean(b2hn); m30=mean(b3hn); % CVs for normalized UMI cv10=std(b1hn)./m10; cv20=std(b2hn)./m20; cv30=std(b3hn)./m30; HeLa(1).name='A2'; HeLa(1).ah=b1h; HeLa(1).ahn=b1hn; HeLa(1).m=m10; HeLa(1).cv=cv10; HeLa(2).name='A3'; HeLa(2).ah=b2h; HeLa(2).ahn=b2hn; HeLa(2).m=m20; HeLa(2).cv=cv20; HeLa(3).name='C2'; HeLa(3).ah=b3h; HeLa(3).ahn=b3hn; HeLa(3).m=m30; HeLa(3).cv=cv30; end function [HeLa,Fig6C,index_outlier,cv_threashold,CPM_t]=Hela_find_CV_outlier(HeLa) geneh=HeLa(1).gene; ngh=length(geneh); b1hn=HeLa(1).ahn; m10=HeLa(1).m; cv10=HeLa(1).cv; b2hn=HeLa(2).ahn; m20=HeLa(2).m; cv20=HeLa(2).cv; b3hn=HeLa(3).ahn; m30=HeLa(3).m; cv30=HeLa(3).cv; % % % merge two control samples A2, A3 b12hn=[b1hn;b2hn]; nc12=length(b12hn(:,1)); m12=mean(b12hn); cv12=std(b12hn)./m12; HeLa(4).name='A2 & A3'; HeLa(4).ahn=b12hn; HeLa(4).m=m12; HeLa(4).cv=cv12; % % % filter out the one cell dramatically influnces CV for each gene [cv11,c_1]=filter_onecell(b1hn,cv10,ngh); [cv21,c_2]=filter_onecell(b2hn,cv20,ngh); [cv31,c_3]=filter_onecell(b3hn,cv30,ngh); [cv121,c_12]=filter_onecell(b12hn,cv12,ngh); HeLa(1).cv1=cv11;HeLa(2).cv1=cv21;HeLa(3).cv1=cv31;HeLa(4).cv1=cv121; % filter out low expression genes % find the power function for baseline for CV vs mean % find the power function for boundery for abnormal CV vs mean j1=find(m10>1 & m20>1 & m30>1); % genes with mean > 1CPM in all of the three samples % merge three data sets mm=[m10(j1) m20(j1) m30(j1)]; ccv=[cv11(j1) cv21(j1) cv31(j1)]; % rank the means of the genes, and group the genes into 200 per groups % and calculate average mean and average cv n_group_average=200; [temp,j2]=sort(mm); ngtotal=length(j2); j1x=1:n_group_average:ngtotal; j1y=j1x+n_group_average-1; j1y(end)=length(j2); npart=length(j1x); for i1=1:npart mean_mm(i1)=mean(mm(j2(j1x(i1):j1y(i1)))); % means of the means mean_ccv(i1)=mean(ccv(j2(j1x(i1):j1y(i1)))); % means of the CVs sd_ccv(i1)=std(ccv(j2(j1x(i1):j1y(i1)))); % SDs of the CVs end % calculate and plot the cutoff for genes with large CVs % power function and starting parameters cv_mean_model=@(a,b,x) (a*x.^b); % fitting function startPoints=[10 -0.5]; % fit mean CCs to means ft1=fit(mean_mm',mean_ccv',cv_mean_model,'Start', startPoints); % fit mean CV + 3*sd CVs to means ft2=fit(mean_mm',mean_ccv'+3*sd_ccv',cv_mean_model,'Start', startPoints); % x0=(1:1:1e5)'; cvbase=cv_mean_model(ft1.a,ft1.b,x0); cvbound=cv_mean_model(ft2.a,ft2.b,x0); figure(1); hold off plot(log10(mm),ccv,'b.','MarkerSize',8);hold on plot(log10(x0),cvbase,'m-','lineWidth',2); plot(log10(x0),cvbound,'r--','lineWidth',2); hold off legend('C.V.s','baseline','cutoff') xlabel({'single cell expressions','log10(CPM)'}) ylabel('C.V.'); Fig6C.xdata=log10(mm); Fig6C.ydata=ccv; Fig6C.xfit=log10(x0); Fig6C.yfit=cvbase; Fig6C.xcutoff=log10(x0); Fig6C.ycutoff=cvbound; Fig6C.fit1=ft1;Fig6C.fit2=ft2; % finding outlier genes (abover the mean_CV+3sd vs mean curve) % finding abover the curve, other criteria % 1) minimal mean expression: 5CPM % 2) minimal CV: 1.0 cv_threashold=1; CPM_t=5; % calculated CV cutoff for each genes cv121_t=cv_mean_model(ft2.a,ft2.b,m12(j1)); cv31_t=cv_mean_model(ft2.a,ft2.b,m30(j1)); % find genes with CVs above the cutoff in any of the samples j12t=find(cv121(j1)>cv_threashold & cv121(j1)>=cv121_t & m12(j1)>CPM_t); j3t=find(cv31(j1)>cv_threashold & cv31(j1)>=cv31_t & m30(j1)>CPM_t); jj=sort([j12t j3t]); % removing redundant gene list temp=find(diff(jj)==0); jj(temp)=[]; index_outlier=j1(jj); end function [HeLa,bin, Fig6D,Fig6E,Fig6F]=Hela_bin_CVs_chip(HeLa,index_outlier,chip) cv11=HeLa(1).cv1; cv21=HeLa(2).cv1; cv31=HeLa(3).cv1; cv121=HeLa(4).cv1; jj0=index_outlier; geneh=HeLa(1).gene; % CV threshold for outlier genes with low and high CV cvmn=[0 2 ]; cvmx=[2 100]; % find outliear genes with low and high CVS for i1=1:2 k1=find(cv121(jj0)>cvmn(i1) & cv121(jj0)2*abs(cv_1-cv_2) cv11(i1)=cv_1; c_1(i1)=k1(end); end end end function [jj1]=find_genes_in_chip_0(geneh,jj0,chip,i0) gene_chip=chip(i0).gene; nj0=length(jj0); jj1=[]; j1=0; for i1=1:nj0 temp1=char(geneh{jj0(i1)}); for i2=1:length(gene_chip) temp2=char(gene_chip{i2}); if length(temp1)==length(temp2) if temp1==temp2 j1=j1+1; jj1(j1)=jj0(i1); end end end end end