# -*- coding: utf-8 -*-
"""
Created on Tue Oct 27 08:44:18 2020

Helper functions used by the fish-track filtering scripts. The functions are
imported and applied in other codes.

Functions and purposes:

convert_coords
    convert coordinates from WGS84 to the UTM 32N projection
calc_distance
    calculate the distance between successive points
calc_vel
    calculate velocity between successive points
add_discharge
    add discharge rounded to the nearest 10 m^3/s. Requires the discharge
    data to be hourly. This is a prerequisite for filtering on whether a
    point is in the river (that code relies on one river shapefile for
    every 10 m^3/s of discharge)
add_discharge_cont
    same as above, but discharge is kept as a continuous variable
filter_if_in_river_all_discharges
    drop points that lie outside the river shapefile matching their
    discharge; the output is a data frame without those points
if_in_shapefile
    flag whether each point lies inside a given shapefile polygon
add_temp
    add hourly temperature to the data (can easily be done in R now, and
    temperature needs to come from the receivers instead, so this is unused
    in later data processing)
filter_time_gap
    remove points that follow the previous point by less than a threshold
convert_decimal_hours
    convert time of day to decimal hours (again defunct, easier in R at a
    later stage)

@author: Rachel
"""

#------ getting set up ---------------#
import os

import numpy as np
import pandas as pd

# The geospatial packages are only needed by convert_coords,
# filter_if_in_river_all_discharges and if_in_shapefile. Guarding the imports
# keeps the pandas-only helpers usable when those packages are not installed;
# the functions that need them raise a clear ImportError when called.
try:
    import pyproj
except ImportError:
    pyproj = None

try:
    import shapefile
except ImportError:
    shapefile = None

try:
    import shapely.geometry
    from shapely.geometry import Point
except ImportError:
    shapely = None
    Point = None

# format of the hourly merge key shared by the discharge/temperature tables
_HOUR_KEY_FORMAT = '%d/%m/%Y %H:%M'


def _require(module, package_name):
    '''Raise a clear ImportError if an optional geospatial package is missing.'''
    if module is None:
        raise ImportError(
            "the '{}' package is required for this function but is not "
            "installed".format(package_name))


def _merge_on_hour(fish_df, hourly_df, drop_columns):
    '''Merge an hourly table onto fish rows, keyed on the nearest whole hour.'''
    # work on copies so neither of the caller's frames is altered
    fish = fish_df.copy()
    hourly = hourly_df.assign(Date=hourly_df.index)

    # round every fish time stamp to the nearest hour and format it the same
    # way as the hourly table's index so the two can be matched as strings
    stamps = pd.to_datetime(fish.index)
    fish['Date'] = stamps.round('h').strftime(_HOUR_KEY_FORMAT)

    # the merge discards the index, so park it in a column and restore it after
    fish['Time'] = fish.index
    merged = pd.merge(fish, hourly, on='Date')
    merged.index = merged['Time']

    return merged.drop(drop_columns, axis=1)


def _load_wetted_polygon(shp_file_folder, discharge_value):
    '''Read the first shape of the wetted-area shapefile for one discharge.'''
    # plain concatenation on purpose: shp_file_folder is used as a prefix and
    # is expected to end with a path separator
    shp_file_location = shp_file_folder + 'wet_' + str(discharge_value) + 'm3s.shp'
    with shapefile.Reader(shp_file_location) as shp_file:
        return shapely.geometry.shape(shp_file.shapes()[0])


#%%
#function to convert coordinates
def convert_coords(data_frame, longitude_heading, latitude_heading):
    '''
    Convert coordinates from WGS84 (EPSG:4326) to UTM 32N (EPSG:32632).

    Parameters
    ----------
    data_frame : fish data frame; it is modified in place
    longitude_heading : column name of longitude to be converted
    latitude_heading : column name of latitude to be converted

    Returns
    -------
    None. The converted coordinates are added to data_frame as the columns
    'x' and 'y'.

    Raises
    ------
    ImportError : if pyproj is not installed
    '''
    _require(pyproj, 'pyproj')

    in_proj = pyproj.Proj("epsg:4326")
    out_proj = pyproj.Proj("epsg:32632")

    # always_xy=True keeps the (longitude, latitude) argument order;
    # otherwise long and lat get switched around
    transformer = pyproj.Transformer.from_proj(in_proj, out_proj,
                                               always_xy=True)
    x2, y2 = transformer.transform(data_frame[longitude_heading].values,
                                   data_frame[latitude_heading].values)

    data_frame['x'], data_frame['y'] = x2, y2


#%%
#function for distance between 2 points
def calc_distance(data_frame, x_coord, y_coord):
    '''
    Calculate the straight-line distance between successive points.

    Parameters
    ----------
    data_frame : data frame; it is modified in place
    x_coord : column name of the x coordinate
    y_coord : column name of the y coordinate

    Returns
    -------
    None. The result is saved as the column 'distance_travelled'; the first
    row has no preceding point and is therefore NaN.
    '''
    #differences between each value and the preceding entry
    delta_x = data_frame[x_coord].diff()
    delta_y = data_frame[y_coord].diff()

    data_frame['distance_travelled'] = (delta_x**2 + delta_y**2)**0.5


#%%
#function for velocity
def calc_vel(data_frame, date_time_source, distance_heading):
    '''
    Calculate velocity between successive points.

    Parameters
    ----------
    data_frame : data frame; it is modified in place
    date_time_source : the time stamps, given as an index (e.g.
        data_frame.index), a Series, or the name of a column of data_frame
    distance_heading : column name of the distance column

    Returns
    -------
    None. Adds 'fish_velocity' (distance units per second) and 'time_diff'
    (seconds since the previous row) to data_frame.
    '''
    if isinstance(date_time_source, str):
        time = data_frame[date_time_source]
    elif isinstance(date_time_source, pd.Series):
        time = date_time_source
    else:
        #an index has to become a series before it can be differenced
        time = date_time_source.to_series()

    #time since the previous row, in seconds
    time_s = time.diff() / np.timedelta64(1, 's')

    data_frame['fish_velocity'] = data_frame[distance_heading] / time_s
    data_frame['time_diff'] = time_s


#%%
#function to add discharge; hourly
def add_discharge(discharge_df, fish_df_og):
    '''
    Add the hourly discharge, rounded to the nearest 10, to every fish
    location.

    Each fish time stamp is rounded to the nearest hour and matched against
    the index of discharge_df, which therefore has to hold 'dd/mm/YYYY HH:MM'
    strings. Fish rows without a matching hour are dropped (inner merge).
    Neither input is modified.

    Parameters
    ----------
    discharge_df : hourly discharge values with the discharge in column 'Q'
    fish_df_og : fish data, indexed by time stamp

    Returns
    -------
    merged : copy of the fish data with the extra column
        'discharge_nearest_10', indexed by the original time stamps
    '''
    rounded = discharge_df.assign(
        discharge_nearest_10=discharge_df['Q'].round(-1))
    return _merge_on_hour(fish_df_og, rounded, ['Date', 'Time', 'Q'])


#%% add discharge as a continuous variable
def add_discharge_cont(discharge_df, fish_df):
    '''
    Add the hourly discharge, unrounded, to every fish location.

    Each fish time stamp is rounded to the nearest hour and matched against
    the index of discharge_df, which therefore has to hold 'dd/mm/YYYY HH:MM'
    strings. Fish rows without a matching hour are dropped (inner merge).
    Neither input is modified.

    Parameters
    ----------
    discharge_df : hourly discharge values with the discharge in column 'Q'
    fish_df : fish data, indexed by time stamp

    Returns
    -------
    merged : copy of the fish data with the extra column 'discharge',
        indexed by the original time stamps
    '''
    continuous = discharge_df.assign(discharge=discharge_df['Q'])
    return _merge_on_hour(fish_df, continuous, ['Date', 'Time', 'Q'])


#%%
#function to filter if in river or not
def filter_if_in_river_all_discharges(data_frame, shp_file_folder,
                                      lon_location, lat_location,
                                      max_discharge=80):
    '''
    Keep only the points that lie inside the river.

    Every row is tested against the shapefile
    shp_file_folder + 'wet_<discharge>m3s.shp' that matches its value in the
    column 'discharge_nearest_10'. Rows with a discharge above max_discharge
    are tested against the max_discharge shapefile. Each shapefile is read
    once, however many rows use it.

    Parameters
    ----------
    data_frame : data frame of fish points with a 'discharge_nearest_10'
        column; it is not modified
    shp_file_folder : prefix of the shapefile paths, normally the folder
        including its trailing separator
    lon_location : column name for the longitude: currently using UTM format
        so 'x'
    lat_location : column name for the latitude: currently using UTM format
        so 'y'
    max_discharge : largest discharge that has its own shapefile

    Returns
    -------
    filtered_df : copy of data_frame without the points outside the river

    Raises
    ------
    ImportError : if pyshp or shapely is not installed
    '''
    _require(shapefile, 'pyshp')
    _require(shapely, 'shapely')

    #one polygon per discharge value, loaded the first time it is needed
    polygons = {}
    in_river = []

    rows = zip(data_frame['discharge_nearest_10'],
               data_frame[lon_location],
               data_frame[lat_location])
    for discharge, longitude, latitude in rows:
        shp_discharge = min(int(discharge), max_discharge)
        if shp_discharge not in polygons:
            polygons[shp_discharge] = _load_wetted_polygon(shp_file_folder,
                                                           shp_discharge)
        in_river.append(
            polygons[shp_discharge].contains(Point(longitude, latitude)))

    #a positional mask works whatever the index of the data frame is
    return data_frame.loc[np.array(in_river, dtype=bool)].copy()


#%%
#function to determine if in a shapefile e.g. area near ladder, within ladder
def if_in_shapefile(data_frame, shp_file_name, lon_location, lat_location,
                    output_col_name, number):
    '''
    Flag whether each point lies within one shape of a shapefile.

    Parameters
    ----------
    data_frame : data frame containing the points; it is modified in place
    shp_file_name : an opened shapefile reader (despite the name, it is the
        object whose shapes() method is called, not a path)
    lon_location : column name containing longitude (in this case x as utm
        shapefile)
    lat_location : column name containing latitude (in this case y as utm
        shapefile)
    output_col_name : name of column where want true/false saved
    number : position of the wanted shape within shp_file_name.shapes();
        for 'rectangle' it is 1, for 'ladder' it is 0

    Returns
    -------
    None. The True/False result is saved in the column output_col_name.

    Raises
    ------
    ImportError : if shapely is not installed
    '''
    _require(shapely, 'shapely')

    shp_polygon = shapely.geometry.shape(shp_file_name.shapes()[number])

    flags = [shp_polygon.contains(Point(longitude, latitude))
             for longitude, latitude in zip(data_frame[lon_location],
                                            data_frame[lat_location])]

    #assign the whole column at once; element-wise chained assignment
    #(df[col][i] = ...) is not guaranteed to reach the data frame
    data_frame[output_col_name] = np.array(flags, dtype=bool)


#%%
#function to assign temperature
def add_temp(temp_data, fish_df):
    '''
    Add the hourly temperature to every fish location.

    Each fish time stamp is rounded to the nearest hour and matched against
    the index of temp_data, which therefore has to hold 'dd/mm/YYYY HH:MM'
    strings. Fish rows without a matching hour are dropped (inner merge).
    Neither input is modified.

    Parameters
    ----------
    temp_data : source of hourly temperature data
    fish_df : fish data frame, indexed by time stamp

    Returns
    -------
    data_with_temp : copy of the fish data with the columns of temp_data
        added, indexed by the original time stamps
    '''
    return _merge_on_hour(fish_df, temp_data, ['Date', 'Time'])


#%%
#function to filter time points based on difference between them
def filter_time_gap(data_frame, upper_limit):
    '''
    Remove points that follow the previous point too closely in time.

    The gap is always measured to the preceding row of the unfiltered data,
    not to the last row that was kept.

    Parameters
    ----------
    data_frame : data frame of fish locations, indexed by time stamp; it is
        not modified
    upper_limit : time gap (in seconds) below which a point is discarded

    Returns
    -------
    data_frame : filtered copy with the extra column
        'time_between_points_in_rect' (seconds since the previous row)
    '''
    filtered = data_frame.copy()

    #seconds between each row and the one before it
    gap_s = filtered.index.to_series().diff() / np.timedelta64(1, 's')
    filtered['time_between_points_in_rect'] = gap_s

    #the first row has no gap (NaN) and is always kept
    gap_col = filtered['time_between_points_in_rect']
    return filtered.loc[gap_col.isnull() | (gap_col >= upper_limit)]


#%%
#function to convert time of day to decimal time
#makes it easier to use as a predictor in models when decimal hours
def convert_decimal_hours(data_frame, input_column):
    '''
    Convert time stamps to decimal hours of the day.

    Parameters
    ----------
    data_frame : data frame; it is modified in place
    input_column : the time stamps, either an index (e.g. data.index) or a
        column (e.g. data[column]); must have one entry per row of
        data_frame, in row order

    Returns
    -------
    None. The result is saved to data_frame as the column 'decimal_hour'.
    '''
    #a DatetimeIndex gives the same accessors for index and column input
    date_time = pd.DatetimeIndex(pd.to_datetime(input_column))

    dec_hour = (date_time.hour
                + date_time.minute / 60
                + date_time.second / 3600)

    #plain array so the values are assigned by position, not by label
    data_frame['decimal_hour'] = np.asarray(dec_hour, dtype=float)