#!/usr/bin/perl -w # example usage: # perl permute_ASVs_synthetic.pl 84 10 1000 > permute.ASVs.Phyla84.ratio10.seq1000.txt use strict; my $N_genera = $ARGV[0]; # the number of genera present in the sample. Here we will use genera, but this could be any taxonomic level (the higher of the two) my $ASV_per_g = $ARGV[1]; # the number of ASVs per genus. Here we will use ASVs, but this could be any taxonomic level nested within a higher level my $N_reads = $ARGV[2]; # the number of reads sampled my %taxonomy; #keys=ASV_ID, values=taxonomy my @ASVs; # list of all ASVs # Assign ASVs to genera my $ASV_ID = 0; for (my $g=1; $g<=$N_genera; $g++) { for (my $n=1; $n<=$ASV_per_g; $n++) { $taxonomy{$ASV_ID} = $g; $ASV_ID++; } } # Print headers print join "\t", "true_ASVs", "true_G", "obs_ASVs", "obs_G"; print "\n"; # Print the true number of ASVs and genera in each sample, in a range of samples containing from 1 to $N_genera for (my $g=1; $g<=$N_genera; $g++) { my @ASVs; for (my $genus=1; $genus<=$g; $genus++) { for (my $n=0; $n<$ASV_ID; $n++) { next unless ($taxonomy{$n} == $genus); push @ASVs, $n; } } print join "\t", (@ASVs+0), $g; print "\t"; # in each sample, count the total number of observd unique ASVs and observed unique genera at a given sampling effort ($N_reads) my $N_ASV = 0; my $N_genus = 0; my %seen_ASV; my %seen_genus; for (my $j=0; $j<$N_reads; $j++) { # sample a set number of reads at random from the ASVs in the sample my $j = int rand ( @ASVs+0 ); my $ASV = $ASVs[$j]; my $genus = $taxonomy{$ASV}; unless (exists $seen_ASV{$ASV}) { $seen_ASV{$ASV} = 1; $N_ASV++; } unless (exists $seen_genus{$genus}) { $seen_genus{$genus} = 1; $N_genus++ } } print join "\t", $N_ASV, $N_genus; print "\n"; }