https://doi.org/10.5281/zenodo.14318846
functions.py
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 27 08:44:18 2020
Code containing various functions used by the filtering scripts. Functions are imported and
applied in other scripts.
Functions and purposes:
convert_coords - convert coordinates from WGS84 to the UTM 32N projection
calc_distance - calculate distance between 2 points
calc_vel - calculate velocity
add_discharge - adds discharge rounded to the nearest 10 m^3/s. Requires the
                discharge data to be hourly. This code is a prerequisite to
                filtering whether a point is in the river or not (as that code
                relies on river shapefiles for every 10 m^3/s of discharge)
add_discharge_cont - same as above but with discharge as a continuous variable
filter_if_in_river_all_discharges - filters whether points fall within the river
                                    shapefiles that exist for every 10 m^3/s of
                                    discharge; output is a df with points
                                    outside the river dropped
if_in_shapefile - function to identify whether a point is inside a shapefile or not
add_temp - add hourly temperature to the data (can easily be done in R now and
           likely will be, since it is needed for the SSF data)
           temperature also needs to be extracted from the receivers instead, so
           this code is unused now as far as later data processing goes
filter_time_gap - filter data based on the time between successive points and
                  remove points under a defined threshold
convert_decimal_hours - code to convert time to decimal hours. Again defunct,
                        as this can be done more easily in R at a later stage.
@author: Rachel
"""
#------ getting set up ---------------#
import os
import pandas as pd
import pyproj
import numpy as np
import shapefile
#import matplotlib.pyplot as plt
import shapely.geometry
#from shapely.geometry import shape, Point
from shapely.geometry import Point
#%%
#function to convert coordinates
def convert_coords(data_frame,longitude_heading,latitude_heading):
'''
    function to convert coordinates from WGS84 (EPSG:4326) to UTM zone 32N (EPSG:32632)
Parameters
----------
data_frame : fish data frame
longitude_heading : column name of longitude to be converted
latitude_heading : column name of latitude to be converted
Returns
-------
adds new columns to df as x and y
'''
#define input projection
in_proj = pyproj.Proj("epsg:4326")
#define output projection
out_proj = pyproj.Proj("epsg:32632")
    #transform original coords x1,y1 to x2,y2
    #always_xy=True keeps the (longitude, latitude) argument order, otherwise it gets switched
    transformer = pyproj.Transformer.from_proj(in_proj, out_proj, always_xy=True)
    x2, y2 = transformer.transform(data_frame[longitude_heading].values,
                                   data_frame[latitude_heading].values)
#assign converted coordinates to new columns in the data frame
data_frame['x'], data_frame['y'] = x2, y2
return
#so to summarise this function
#takes input and output projections
#transforms the coordinates
#saves new coordinates to the specified data frame as a new column
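#example usage (a minimal sketch; 'fish_positions.csv' and the lat/long column
#names are hypothetical and would need to match the real data)
#fish_data = pd.read_csv('fish_positions.csv', sep=',', index_col='Time', parse_dates=True)
#convert_coords(fish_data, 'longitude', 'latitude')
#fish_data now contains projected 'x' and 'y' columns in metres (UTM 32N)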
#%%
#function for distance between 2 points
def calc_distance(data_frame,x_coord,y_coord):
'''
calculates distance between successive points and saves output as new column
Parameters
----------
data_frame : data frame
x_coord : column name
y_coord : column name
'''
    #calculate differences between each value and the preceding entry for x and y
delta_x= data_frame[x_coord].diff()
delta_y = data_frame[y_coord].diff()
#formula to calculate distance
distance = (delta_x**2+delta_y**2)**0.5
#assign distance to data frame
data_frame['distance_travelled'] = distance
return
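#example usage (assumes convert_coords above has already added the 'x' and 'y' columns)
#calc_distance(fish_data, 'x', 'y')
#adds a 'distance_travelled' column in metres; the first row is NaN as it has no previous point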
#%%
#function for velocity
def calc_vel(data_frame,date_time_source,distance_heading):
'''
function to calculate velocity + add as column to the dataframe
Parameters
----------
data_frame : data frame
    date_time_source : datetime index holding the timestamps (e.g. data_frame.index)
distance_heading : column name of distance column
Returns
-------
None.
'''
#convert index to a series that can be used in calculations
time = date_time_source.to_series()
#work out difference in times
delta_time = time.diff()
#convert to seconds
time_s = delta_time/np.timedelta64(1,'s')
#calculate velocity
velocity = data_frame[distance_heading]/time_s
#assign velocity to column
data_frame['fish_velocity'] = velocity
#create time diff column for fun
data_frame['time_diff'] = time_s
return
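#example usage (assumes a datetime index and the distance column created by calc_distance)
#calc_vel(fish_data, fish_data.index, 'distance_travelled')
#adds 'fish_velocity' (m/s) and 'time_diff' (s) columns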
#%%
#function to split based on discharge; hourly
#flow_data_hourly = pd.read_csv('hourly_flow.csv', sep=',', index_col='date')
#fish_df = manip_data.copy()
#discharge_df = flow_data_hourly
def add_discharge(discharge_df,fish_df_og):
'''
Function to add corresponding hourly discharge values (to nearest 10)
to every fish location. Assign output to dataframe.
Parameters
----------
    discharge_df : data frame containing hourly discharge values (column 'Q')
    fish_df_og : data frame of fish detections with a datetime index
Returns
-------
merged : merged data frame
'''
#create separate date column so can use as reference merger
discharge_df['Date'] = discharge_df.index
#round discharges to nearest 10
discharge_df['discharge_nearest_10'] = discharge_df['Q'].round(-1)
    #work on a copy so the original fish data frame is not altered
    fish_df = fish_df_og.copy()
#first need to convert from time stamps into date +hour
#this makes it compatible for mean daily discharge
    #assign the index to a variable first; it is added to the dataframe as a column further down
hourly = fish_df.index
hourly = pd.to_datetime(hourly)
#code below to round to nearest hour
hourly = hourly.round('H')
#now convert to string and get rid of extras; adds to dataframe as a column
fish_df['Date'] = hourly.strftime('%d/%m/%Y %H:%M')
#save index as new column as merging it with flow will get rid of it
fish_df['Time'] =fish_df.index
#merge the two based on date
merged = pd.merge(fish_df,discharge_df,on='Date')
#recreate the index
merged.index = merged['Time']
#drop unnecessary columns
merged = merged.drop(['Date','Time','Q'],axis=1)
return merged
#different version, single output
#multiple outputs generated in next step
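#example usage (a sketch; 'hourly_flow.csv' follows the commented-out lines
#above and is assumed to have a 'date' index and a discharge column 'Q' in m^3/s)
#flow_data_hourly = pd.read_csv('hourly_flow.csv', sep=',', index_col='date')
#fish_with_q = add_discharge(flow_data_hourly, fish_data)
#fish_with_q gains a 'discharge_nearest_10' column matched on the rounded hour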
#%% add discharge as a continuous variable
def add_discharge_cont(discharge_df,fish_df):
'''
Function to add corresponding hourly discharge values
to every fish location. Assign output to dataframe.
Parameters
----------
    discharge_df : data frame containing hourly discharge values (column 'Q')
    fish_df : data frame of fish detections with a datetime index
Returns
-------
merged : merged data frame
'''
#create separate date column so can use as reference merger
discharge_df['Date'] = discharge_df.index
    #keep discharge as a continuous variable (no rounding)
    discharge_df['discharge'] = discharge_df['Q']
#first need to convert from time stamps into date +hour
#this makes it compatible for mean daily discharge
hourly = fish_df.index
hourly = pd.to_datetime(hourly)
#code below to round to nearest hour
hourly = hourly.round('H')
    #now convert to string and get rid of extras; adds to dataframe as a column
fish_df['Date'] = hourly.strftime('%d/%m/%Y %H:%M')
#save index as new column
fish_df['Time'] =fish_df.index
#merge the two based on date
merged = pd.merge(fish_df,discharge_df,on='Date')
#recreate the index
merged.index = merged['Time']
#drop unnecessary columns
merged = merged.drop(['Date','Time','Q'],axis=1)
return merged
#different version, single output
#multiple outputs generated in next step
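#example usage (same inputs as add_discharge above; here the new 'discharge'
#column keeps the raw hourly Q value instead of rounding to the nearest 10)
#fish_with_q_cont = add_discharge_cont(flow_data_hourly, fish_data)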
#%%
#function to filter if in river or not
#based on discharge - currently set for daily
#this function will go inside the "split by discharge" one
#this has been edited to use the 80 m^3/s discharge shapefile to check if in river
#when discharge > 84.9
#I haven't tested it yet though
def filter_if_in_river_all_discharges(data_frame, shp_file_folder,lon_location,lat_location):
'''
A function to filter if a point is in the river or not. Rows containing
points outside the shapefile are removed. This function takes a dataframe
with multiple discharge values and uses a separate shapefile for each
discharge value present
Parameters
----------
data_frame : data frame of fish points with multiple discharge values
    shp_file_folder : folder where all the river shapefiles are located
    lon_location : column name for the longitude: currently using UTM format so 'x'
    lat_location : column name for the latitude: currently using UTM format so 'y'
Returns
-------
filtered_df : the filtered data frame with points dropped.
'''
    #create copy of dataframe to avoid manipulating original
    data_frame_copy = data_frame.copy()
    #cache polygons so that each shapefile is only read from disk once
    polygon_cache = {}
    #list collecting whether each point is in the river or not
    in_river = []
    #loop through each row individually
    for i in range(len(data_frame_copy)):
        #get the discharge for that single row
        discharge_value = int(data_frame_copy['discharge_nearest_10'].iloc[i])
        #discharges over 80 use the 80 m^3/s shapefile
        if discharge_value > 80:
            discharge_value = 80
        if discharge_value not in polygon_cache:
            #create shapefile name using discharge value
            shp_file_location = shp_file_folder+'wet_'+str(discharge_value)+'m3s.shp'
            #read shapefile and turn its first shape into a polygon
            shp_file = shapefile.Reader(shp_file_location)
            polygon_cache[discharge_value] = shapely.geometry.shape(shp_file.shapes()[0])
        polygon = polygon_cache[discharge_value]
        #build the point from the UTM coordinates of this row
        point = Point(data_frame_copy[lon_location].iloc[i],
                      data_frame_copy[lat_location].iloc[i])
        in_river.append(polygon.contains(point))
    #add the results as a new column
    data_frame_copy['in_river'] = in_river
    #this filters the data and keeps only points in the river
    filtered_df = data_frame_copy[data_frame_copy['in_river'] == True]
    #drop true/false column
    filtered_df = filtered_df.drop(['in_river'], axis=1)
    return filtered_df
#function creates a filtered df
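#example usage (a sketch; the folder path is hypothetical, but the shapefiles
#inside it are expected to be named wet_<discharge>m3s.shp as assumed above)
#river_only = filter_if_in_river_all_discharges(fish_with_q, 'shapefiles/', 'x', 'y')
#river_only keeps only the rows whose point falls inside the matching river polygon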
#%%
#function to determine if in a shapefile e.g. area near ladder, within ladder
def if_in_shapefile(data_frame, shp_file_name,lon_location,lat_location,
output_col_name, number):
'''
function to determine if a point is within a specified shapefile. It
assigns a True or False value for points within and outwith the specified
shapefile.
Parameters
----------
    data_frame : data frame containing the points.
    shp_file_name : shapefile.Reader object for the shapefile of interest
    lon_location : column name containing longitude (in this case x as utm shapefile)
    lat_location : column name containing latitude (in this case y as utm shapefile)
    output_col_name : name of column where the true/false result is saved
    number : index of the shape to use within the shapefile
             (1 for the 'rectangle' file, 0 for the 'ladder' file)
Returns
-------
alters the supplied dataframe, no returns
'''
    #get the shapes from the already-read shapefile object
    shp_shapes = shp_file_name.shapes()
    #'number' selects which shape within the shapefile to use
    shp_polygon = shapely.geometry.shape(shp_shapes[number])
    #loop that marks whether each point is inside the polygon or not
    in_shape = []
    for i in range(len(data_frame)):
        point = Point(data_frame[lon_location].iloc[i], data_frame[lat_location].iloc[i])
        in_shape.append(shp_polygon.contains(point))
    #save the result to the requested column
    data_frame[output_col_name] = in_shape
    return
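#example usage (a sketch; 'rectangle.shp' and the 'in_rectangle' column name are
#hypothetical; number=1 for the rectangle shapefile as noted in the docstring)
#rect_shp = shapefile.Reader('rectangle.shp')
#if_in_shapefile(river_only, rect_shp, 'x', 'y', 'in_rectangle', 1)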
#%%
#function to assign temperature
def add_temp(temp_data,fish_df):
'''
function to add hourly temperature as a column to the dataframe
Parameters
----------
    temp_data : data frame with the hourly temperature data
    fish_df : fish data frame
    Returns
    -------
    data_with_temp : data frame with a new column containing the hourly temperature
'''
#copy data frame for manipulation
fish_df = fish_df.copy()
    #code to merge temperature to time, based on the discharge code above
#create separate date column so can use as reference merger
temp_data = temp_data.copy()
temp_data['Date'] = temp_data.index
#first need to convert from time stamps into date +hour
#this makes it compatible
    #assign the index to a variable first; it is added to the dataframe as a column further down
    hourly = fish_df.index
    hourly = pd.to_datetime(hourly)
    #code below to round to nearest hour
    hourly = hourly.round('H')
    #now convert to string and get rid of extras; adds to dataframe as a column
fish_df['Date'] = hourly.strftime('%d/%m/%Y %H:%M')
#save index as new column
fish_df['Time'] = fish_df.index
#merge the two based on date
data_with_temp = pd.merge(fish_df,temp_data,on='Date')
#recreate the index
data_with_temp.index = data_with_temp['Time']
#drop unnecessary columns
data_with_temp = data_with_temp.drop(['Date','Time'],axis=1)
return data_with_temp
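#example usage (kept as a sketch even though this step is now done in R;
#'hourly_temp.csv' is a hypothetical file with a datetime 'date' index)
#temp_data = pd.read_csv('hourly_temp.csv', sep=',', index_col='date')
#fish_with_temp = add_temp(temp_data, fish_with_q)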
#%%
#function to filter time points based on difference between them
def filter_time_gap(data_frame,upper_limit):
'''
    function to filter a dataframe and remove points that follow the previous
    point too closely in time.
    Parameters
    ----------
    data_frame : data frame of fish locations to be filtered
    upper_limit : minimum time gap (in seconds); points arriving sooner than
                  this after the previous point are discarded
    Returns
    -------
    data_frame : the filtered data frame, for assigning to a variable
'''
#copy df
data_frame = data_frame.copy()
#time difference between points in rectangle
#convert index to a series that can be used in calculations
time = data_frame.index
time = time.to_series()
#work out difference in times
delta_time = time.diff()
#convert to seconds
delta_time = delta_time/np.timedelta64(1,'s')
#assign to sheet
data_frame['time_between_points_in_rect'] = delta_time
#filter locations within specified limit
data_frame = data_frame.loc[data_frame['time_between_points_in_rect'].isnull() | (data_frame['time_between_points_in_rect']>=upper_limit)]
return data_frame
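#example usage (the 60 s threshold is just an illustration; rows arriving less
#than 60 s after the previous detection are dropped)
#thinned = filter_time_gap(fish_with_q, 60)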
#%%
#function to convert time of day to decimal time
#makes it easier to use as a predictor in models when in decimal hours
def convert_decimal_hours(data_frame, input_column):
'''
function to convert a time stamp (in hh:mm:ss format) to decimal hours. The
result is saved to the input data frame, as a new column 'decimal hour'
Parameters
----------
data_frame : data_frame needed
    input_column : time column or index, e.g. data.index or data['column']
'''
    #convert the supplied column or index to a DatetimeIndex
    date_time = pd.DatetimeIndex(pd.to_datetime(input_column))
    #create integer vectors for each component
    hour = date_time.hour
    mins = date_time.minute
    secs = date_time.second
    #do the maths
    dec_hour = hour + (mins/60) + (secs/3600)
data_frame['decimal_hour'] = dec_hour
return
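#example usage (assumes the dataframe index holds the detection timestamps)
#convert_decimal_hours(fish_data, fish_data.index)
#adds a 'decimal_hour' column, e.g. 14:30:00 becomes 14.5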