https://github.com/bnavarrodominguez/SD-Population-genomics
Raw File
Tip revision: e012c1df579871600334847e254a1ecc6c053592 authored by Amanda Larracuente on 10 May 2022, 19:52:54 UTC
Updated README
Tip revision: e012c1d
shared_privates_distance_distr_v2.R
#############Load libraries
require(dplyr)
require(stringr)
require(ggplot2)
require(ggforce)
require(ggpubr)
require(viridis)
require(cowplot)


priv_sites <- read.delim("shared_private_snps/zisd_private_snps.txt", header=FALSE)
shar_sites <- read.delim("shared_private_snps/zisd_shared_snps.txt", header=FALSE)


priv_sites %>% group_by(V1) %>% dplyr::summarise(tot= n())
shar_sites %>% group_by(V1) %>% dplyr::summarise(tot=n())

shar_sites$id = interaction(shar_sites$V1,shar_sites$V2,sep = '_',drop = TRUE)
priv_sites$id = interaction(priv_sites$V1,priv_sites$V2,sep = '_',drop = TRUE)


regions <- read.csv("../Figure_2_and_3/chromosome_regions.csv")

priv_sites$class = 'priv'
shar_sites$class = 'shar'

datos = rbind(priv_sites,shar_sites)
head(datos)
colnames(datos) = c("chr","snp","ref","alt","idk",'class')
head(datos)

table(datos$class)


#### Add marks
regions

datos$chromstate = ifelse(datos$chr==1 & datos$snp > regions$start[6] & datos$snp | datos$chr==2 & datos$snp < regions$end[5],"het","eu" )
datos$in2rmal = ifelse(datos$chr==2 & datos$snp > regions$start[1] &datos$snp< regions$end[1],"in2rmal",'st' )
datos$sweep = ifelse(datos$chr==1 & datos$snp > regions$start[7] & datos$snp & datos$snp < regions$end[7] 
                     | datos$chr==2 & datos$snp > regions$start[8] & datos$snp & datos$snp < regions$end[7] ,"sweep","st" )

head(datos)


#############Runs
head(datos)

datos = datos[order(datos$chr, datos$snp),]
head(datos)

runs = datos %>% 
  mutate(rleid = cumsum(class != lag(class, default = ""))) %>%
  group_by(chr,rleid) %>%
  dplyr::summarise(class = first(class), x = min(snp), y = max(snp), n.snps = n(), dist.xy = y-x) %>%
  select(-rleid)
runs

list.runs = split(runs,f = runs$chr)
names(list.runs) = c("a","b")

list.runs$a$dist.xx = ave(list.runs$a$x,list.runs$a$chr, FUN=function(x) c(diff(x),0))
list.runs$a$dist.next = list.runs$a$dist.xx - list.runs$a$dist.xy

list.runs$b$dist.xx = ave(list.runs$b$x,list.runs$b$chr, FUN=function(x) c(diff(x),0))
list.runs$b$dist.next = list.runs$b$dist.xx - list.runs$b$dist.xy

runs = do.call(rbind, list.runs)
runs


runs$dist.xx = NULL

#######Being:
## x = start of the run
## y = end of the run
## n.snps = number of SNPs in the run
## dist.xy = distance between first and last SNP of the cluster
## dist.next = distance to next cluster (of the opposite kind -> if P: distance to next S, if S: distance to next P)



in2rmal = regions[1,c(2,3)]

dat = subset(runs,runs$chr == '2' & runs$x>min(in2rmal) & runs$y<max(in2rmal) & runs$dist.xy>0)

distr_inv = ggplot(dat,aes(x = x,y = dist.xy,color=class))+
  geom_point(alpha=.5)+
  scale_color_manual(values = c("darkorange",'darkblue'))+
  theme_minimal()

distr_inv

hist_runs = ggplot(dat,aes(x = dist.xy,fill=class,color=class))+
  geom_histogram(alpha=.5,binwidth = 100,position = "dodge")+
  scale_fill_manual(values = c("darkorange",'darkblue'))+
  scale_color_manual(values = c("darkorange",'darkblue'))+
  coord_cartesian(xlim = c(0,10000))+
  theme()

hist_runs

back to top