https://doi.org/10.5281/zenodo.14318846
ssf_datamaking_noHMM.R
####set up####
#ctrl alt t to run sections
rm(list=ls())
#THIS IS THE DATAMAKING FILE FOR THE SSF
# theres a lot of loops as need to apply the functions within entry events
# and nesting the data wasnt working
#### packages ####
library(raster)
library(ggplot2)
library(dplyr)
library(data.table)
library(rgdal)#shapefile
library(sp)
library(rgeos)
library(amt) #for step selection
library(lubridate) #for step selection
#library(purrr) #for nesting? #or not
library(tidyr)
library(crawl)
library(sf)
library(mapview)
library(maptools)
####read file list####
files <- list.files("data/find ladder/data near ladder CRAWLED")
####other files####
#get raster file names for use later
raster_files <- list.files("rasters") #identify file names
#get substrate data
substrate_poly <-readOGR('substrate/Materials_hydraulicModel.shp')
#flow
flow <-read.csv('data/hourly_flow.csv')
flow$time <- as.POSIXct(flow$date, format='%d/%m/%Y %H:%M')
#temperatre
temp <- read.csv('data/median_temperatures.csv')
####START HERE - pick file to do####
data_on_fish <-read.csv('data/Tagged_fish_data.csv')
temp <- read.csv('data/median_temperatures.csv')
#pick a dataframe
####LOOP####
for (q in 1:length(files)){
starttime <- Sys.time()
data_to_read <- files[q]
data_frame <- read.csv(paste0('data/find ladder/data near ladder CRAWLED/',data_to_read))
#get time as posixct
data_frame$time2 <- as.POSIXct(data_frame$time2)
#rename data_Frame
data <- data_frame
fish_id_for_print <- data$fish_id[1]
print(paste0('started fish id ',fish_id_for_print,' at time: ',starttime))
####loop all
#set up blank df to append to
final_df <- data.frame(matrix(ncol = 29, nrow = 0))
colnames(final_df) <- c("x1_","x2_","y1_","y2_","sl_" ,"direction_p","ta_","t1_","t2_",
"dt_","step_id_","case_","cos_ta_","log_sl_","substrate_start",
"substrate_end","time_start","time_end","discharge_start",
"discharge_end","depth_start","water_velocity_start",
"water_vel_grad_start","depth_end","water_velocity_end",
"water_vel_grad_end", "species","fish_id","approach" )
#break this loop into smaller loops
approaches <- unique(data$approach)
num_entries <- length(approaches)
resampled_data_total <- data.frame(matrix(ncol=4,nrow=0))
colnames(resampled_data_total) <-c("x_","y_","t_","burst_")
data_with_random_steps <- data.frame(matrix(ncol=17,nrow=0))
colnames(data_with_random_steps) <- c("x1_","x2_","y1_","y2_","sl_",
"direction_p","ta_","t1_","t2_","dt_",
"step_id_","case_","cos_ta_","log_sl_",
"species","approach","fish_id")
#blank list to append entry events which have insufficient turning angles to
not_enough_ta_ <-data.frame(matrix(ncol=2,nrow=0))
colnames(not_enough_ta_) <-c('fish_id','approach')
not_enough_ta_blank <- not_enough_ta_
#blank list to append entry events with too few points
not_enough_points <- data.frame(matrix(ncol=2,nrow=0))
colnames(not_enough_points) <-c('fish_id','approach')
not_enough_points_blank <- not_enough_points
#get fish ids as names
fish_id_list <-unique(data$fish_id)
#LOOP
for (i in approaches) {
#select approach and keep only p locations from crawl (e.g. every 20 s)
data2 <- filter(data, approach==i&locType=="p")
#convert df to tibble so the code works
data2 <- as_tibble(data2)
#make a track
data_track <- make_track(data2,mu.x,mu.y,time2,crs=32632)
data_track <-arrange(data_track,t_)
#resample rate 20s
resampled_data <- track_resample(data_track,rate = seconds(20),
tolerance = seconds(0.5))
#keeping tolerance very low since should have positions every 5s
#filter min burst so at least 3 points (2 steps) per burst
resampled_data <- filter_min_n_burst(resampled_data,min_n = 3)
#LOOP TO RESAMPLE APART FROM THOSE WITHOUT ENOUGH TA (AKA PART 1)
if(length(resampled_data$x_)>0){
#generate the steps from the data
resampled_steps <- steps_by_burst(resampled_data)
#if loop to check if >2 ta_
#dont think i will need this if only >3 points per burst
no.ta <- sum(!is.na(resampled_steps$ta_))
if(no.ta>=2){
#generate random steps and add to the data
random_steps_data <-random_steps(resampled_steps, n_control=100) #100 random steps so can cut this down later
#add time of day
random_steps_data <- time_of_day(random_steps_data,where='both',include.crepuscule = FALSE)
#add cosine of turning angle and log of step length
random_steps_data$cos_ta_ <- cos(random_steps_data$ta_)
random_steps_data$log_sl_ <- log(random_steps_data$sl_)
#add other variables of importance from the og dataset!
#add species, done this way so its easier
random_steps_data$species <- unique(data2$species)
#add entry event
random_steps_data$approach <- data2$approach[1]
#add fish id
random_steps_data$fish_id <- unique(data2$id)
##ok this loop can end here and append to larger df for example
data_with_random_steps <- rbind(data_with_random_steps,random_steps_data)
}
else{
not_enough_ta_ <-append(not_enough_ta_, unique(data2$approach))
not_enough_ta_2 <-not_enough_ta_blank
not_enough_ta_2[1,] <- c(data2$fish_id[1],i)
not_enough_ta_ <- rbind(not_enough_ta_,not_enough_ta_2)
}
}
else {
not_enough_points <- append(not_enough_points, unique(data2$approach))
not_enough_points2 <-not_enough_points_blank
not_enough_points2[1,] <- c(data2$fish_id[1],i)
not_enough_points <- rbind(not_enough_points,not_enough_points2)
}
}
##substrate
data3 <- data_with_random_steps
#get points for extraction
points_start <- data.frame(data3$x1_,data3$y1_)
names(points_start) <-c('x','y')
points_end <- data.frame(data3$x2_,data3$y2_)
names(points_end) <-c('x','y')
substrate_extract_start <- raster::extract(substrate_poly,points_start)
substrate_start <- substrate_extract_start$MATNAME
substrate_extract_end <- raster::extract(substrate_poly,points_end)
substrate_end <- substrate_extract_end$MATNAME
#duplicated data to be safe and check its ok
data_with_substrate<- cbind(data3, substrate_start,substrate_end)
####add env data by discharge to datas####
#rename data4 just because
data4 <- data_with_random_steps
#data4 <- read.csv('with_random_steps_all_steps_define_distr.csv')
data4_backup <- data4
data4 <- as_tibble(data4)
#get start and end points of the data
data.startpoints <-data4[,c(2,4)]
data.endpoints <- data4[,c(3,5)]
#and diischarges
#need hourly time first
data4$time_start <- round_date(as.POSIXct(data4$t1_,format='%Y-%m-%d %H:%M:%S') ,unit='hour')
data4$time_end <- round_date(as.POSIXct(data4$t2_,format='%Y-%m-%d %H:%M:%S'),unit='hour')
#then can add discharges
data4$discharge_start <- flow[match(data4$time_start,flow$time),2]
data4$discharge_end <- flow[match(data4$time_end,flow$time),2]
#and then df to save the extracted points to
data.startpoints.fin <- data.startpoints
data.endpoints.fin <-data.endpoints
#NEED LOOP FOR IF DISCHARGE >80 TO TREAT AS 80
for(i in 1:length(raster_files)) {
j <-gsub('.tif','', raster_files[i])
data.startpoints.fin[[paste0(j)]] <- raster::extract(raster(paste0("rasters/",
raster_files[i])),data.startpoints)
data.endpoints.fin[[paste0(j)]] <- raster::extract(raster(paste0("rasters/",
raster_files[i])),data.endpoints)
}
#ok so the next step will use the 2 separate df
#below is confusing bc we make another variable data4 but its just how the code was made initially
#just to save a copy
data5 <- data_with_substrate
#will add correct depth, velcity and gradient values plus angles to above df
#in loop below data4 is the data with random steps and substrate and discharge
#and data_all_env is the MASSIVE file of all random steps plus every single
#value from the rasters like 62 columns
#ok below loop
#backup data5
data5_2 <- data5
data5$time_start <- round_date(as.POSIXct(data5$t1_,format='%Y-%m-%d %H:%M:%S') ,unit='hour')
data5$time_end <- round_date(as.POSIXct(data5$t2_,format='%Y-%m-%d %H:%M:%S'),unit='hour')
#then can add discharges
data5$discharge_start <- flow[match(data5$time_start,flow$time),2]
data5$discharge_end <- flow[match(data5$time_end,flow$time),2]
#drop new time columns
data5 <-subset(data5,select=-c(time_start,time_end))
approaches <- unique(data5_2$approach)
final_df <- data.frame(matrix(ncol = 29, nrow = 0))
colnames(final_df) <- c("x1_","x2_","y1_","y2_","sl_" ,"direction_p","ta_","t1_","t2_",
"dt_","step_id_","case_","cos_ta_","log_sl_","substrate_start",
"substrate_end","time_start","time_end","discharge_start",
"discharge_end","depth_start","water_velocity_start",
"water_vel_grad_start","depth_end","water_velocity_end",
"water_vel_grad_end", "species","fish_id","approach" )
#use this loop if got covariates via raster::extract
#use the line below if need to run this loop over several sessions
for(i in 1:length(data5$step_id_)) {
#FOR START POINTS
#get discharge at i
discharge_start_10s <- round(data5$discharge_start[i],digits=-1)
#need if ... for discharge >80
#below extracts headings for a discharge
#aka picks the headings containing the discharge value from the all env df
if(discharge_start_10s<80){
headings_start <- names(data.startpoints.fin)[grepl(discharge_start_10s,names(data.startpoints.fin))]
}
if(discharge_start_10s>=80){
headings_start <- names(data.startpoints.fin)[grepl(80,names(data.startpoints.fin))]
}
#dont need line below anymore since these headings are only start
#headings_start <- headings_start[grepl('start',headings_start)]
#individual headings for each variable
#this means can direct r where to TAKE VALUES FROM
depth_head_start <- headings_start[grepl('dep',headings_start)]
vel_head_start <- headings_start[grepl('vel_',headings_start)]
velgrad_head_start <- headings_start[grepl('svg',headings_start)]
vel_angle_head_start <- headings_start[grepl('Vel_Angle',headings_start)]
svg_angle_head_start <- headings_start[grepl('SVG_',headings_start)]
#then add to df
data5[i, 'depth_start'] <- data.startpoints.fin[i, depth_head_start]
data5[i, 'water_velocity_start'] <- data.startpoints.fin[i, vel_head_start]
data5[i, 'svg_start'] <- data.startpoints.fin[i, velgrad_head_start]
data5[i, 'velocity_angle_start'] <- data.startpoints.fin[i, vel_angle_head_start]
data5[i, 'svg_angle_start'] <- data.startpoints.fin[i, svg_angle_head_start]
#FOR END POINTS
#get discharge at i
discharge_end_10s <- round(data5$discharge_end[i],digits=-1)
if(discharge_end_10s<80){
headings_end <- names(data.endpoints.fin)[grepl(discharge_start_10s,names(data.startpoints.fin))]
}
if(discharge_end_10s>=80){
headings_end <- names(data.endpoints.fin)[grepl(80,names(data.startpoints.fin))]
}
#individual headings for each variable
#this means can direct r where to TAKE VALUES FROM
depth_head_end <- headings_end[grepl('dep',headings_end)]
vel_head_end <- headings_end[grepl('vel_',headings_end)]
velgrad_head_end <- headings_end[grepl('svg',headings_end)]
vel_angle_head_end <- headings_end[grepl('Vel_Angle',headings_end)]
svg_angle_head_end <- headings_end[grepl('SVG_',headings_end)]
#then add to df
#IT WORKS!!
data5[i, 'depth_end'] <-data.endpoints.fin[i, depth_head_end]
data5[i, 'water_velocity_end'] <- data.endpoints.fin[i, vel_head_end]
data5[i, 'svg_end'] <- data.endpoints.fin[i, velgrad_head_end]
data5[i, 'velocity_angle_end'] <- data.endpoints.fin[i, vel_angle_head_end]
data5[i, 'svg_angle_end'] <- data.endpoints.fin[i, svg_angle_head_end]
}
####make step id sequential for all approaches####
#step id per fish eg not per approach
data.sub <- summarise(group_by(data5,fish_id,approach,step_id_),count=n())
data.sub$step_id_new <- ave(data.sub$step_id_,data.sub$fish_id,
FUN = seq_along)
#ok above works
#merge to get new step_id and then drop additional columns
data5.5 <-merge(data5,data.sub,by=c('approach',"step_id_",'fish_id'))
data5.5 <-subset(data5.5,select=c(-count))
#FILTER TO REMOVE RANDOM STEPS OUT OF RIVER
#check number of true steps before and after removing nas!!
data5.5.2 <- na.omit(data5.5) #since if out of river, env data will be NA
#THEN REUDCE RANDOM STEPS PER STEP to 10
#test 1
data_true_only <- filter(data5.5.2,case_==TRUE)
data_false_only <- filter(data5.5.2,case_==FALSE)
#get 10 null steps per step ID
data6 <- data_false_only %>% group_by(approach,step_id_,case_) %>% slice_sample(n=10) %>% ungroup
#ok well. THEN. match step id of remaining trues to new falses
truestepid <- unique(data_true_only$step_id_new)
data6.2 <- filter(data6, step_id_new %in% truestepid) #ok good
#then rbind
data7 <- rbind(data_true_only,data6.2)
data8 <- arrange(data7,t1_)
#save
#also, calculate ABSOLUTE angle of fish so can do stuff relative to flow angle
#tan of difference in x and diff y
diffx <- data8$x2_ - data8$x1_
diffy <- data8$y2_ - data8$y1_
angle <- atan(diffy/diffx)*180/pi
angle_stuff<-as.data.frame(cbind(diffx,diffy,angle))
angle_stuff<-mutate(angle_stuff,absolute=case_when(diffx>0 ~ 90-angle,
diffx<0 ~270-angle))
data8$absolute_angle <- angle_stuff$absolute
#cos of the angles
data8$cos_vel_ang_start <- cos(data8$velocity_angle_start*pi/180)
data8$cos_svg_ang_start<- cos(data8$svg_angle_start*pi/180)
data8$cos_vel_ang_end <- cos(data8$velocity_angle_end*pi/180)
data8$cos_svg_ang_end <- cos(data8$svg_angle_end*pi/180)
data8$cos_abs_ang_end <- cos(data8$absolute_angle *pi/180)
#relative angles
#diff vel and ab angle
#do maths in brackets for regular angles and then cos it
#need to be between 0 and 1, or -1 and 1
data8$diff_vel_abs_ang_start <- cos((data8$velocity_angle_start-data8$absolute_angle)*pi/180)
data8$diff_svg_abs_ang_start <- cos((data8$svg_angle_start-data8$absolute_angle)*pi/180)
data8$diff_vel_abs_ang_end <- cos((data8$velocity_angle_end-data8$absolute_angle)*pi/180)
data8$diff_svg_abs_ang_end <- cos((data8$svg_angle_end-data8$absolute_angle)*pi/180)
data_final <- data8
####add fish lengths etc####
#ok so want to match tag id on this sheet to tag id on the main data sheet
data_on_fish_i <- filter(data_on_fish,Tag_ID == data_final$fish_id[1])
data_final$fork_length <- data_on_fish_i$Fork_length_mm
data_final$total_length <- data_on_fish_i$Total_length_mm
data_final$weight <- data_on_fish_i$Weight_g
####add temp####
temp$time3 <- as.POSIXct(temp$time2,format='%Y-%m-%d %H:%M:%S')
#sort temperature and remove receiver
data_final$time_start <- round_date(as.POSIXct(data_final$t1_,format='%Y-%m-%d %H:%M:%S') ,'10mins')
data_final$time_end <- round_date(as.POSIXct(data_final$t2_,format='%Y-%m-%d %H:%M:%S'),unit='10mins')
#sort temperature and remove receiver
data_final$temp_start <- temp[match(data_final$time_start,temp$time3),3]
data_final$temp_end <- temp[match(data_final$time_end,temp$time3),3]
data_final <- subset(data_final,select=-c(time_start,time_end))
#### save final df <3####
#hashed so i dont ever accidentally run it
write.csv(data_final,paste0('data/find ladder/final ssf dataframes/',data_final$fish_id[1],' ssf_data_20sec_time_gap_60min_track.csv'),row.names=F)
####end of potential loop
endtime <- Sys.time()
print(paste0('Finished fish id ',fish_id_for_print,' at time: ',endtime))
print(paste0('time taken: ',as.numeric(endtime-starttime,units='mins'), ' minutes'))
}
####end loop####