# 01 - filtering_data_code_new_positions.py
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 27 08:53:04 2020
code to run to filter data sheets
@author: Rachel
"""
#------ getting set up ---------------#
import os
import pandas as pd
#some of these are 'unused' but i dont trust python to let me delete them without issues
import pyproj
import numpy as np
import math
import datetime
#shapefile from pyshp eg need pyshp installed to import shapefile
import shapefile
from shapely.geometry import shape, Point
# capture the project root BEFORE changing directory - all data paths below
# are built from this absolute path, so the chdir does not affect them
proj_directory = os.getcwd() #this may need altered to work
#need this command to import the Functions
# NOTE(review): chdir is used only so 'functions.py' becomes importable;
# appending the folder to sys.path would be the more conventional approach,
# but the chdir is harmless here because every later path is absolute.
os.chdir(proj_directory+'\\code to filter raw data') #this needs to be the "code to filter raw data" folder
#import functions needed for this code
# (if_in_shapefile is imported but not used in this script - presumably kept
# for interactive use; verify before removing)
from functions import (convert_coords, add_discharge, if_in_shapefile,
add_discharge_cont)
#%% folder/variable defining :)
# Build all paths with os.path.join: the old string concatenation used
# '\data', where '\d' is an invalid escape sequence that only worked by
# accident (and raises a SyntaxWarning on modern Python).
#define input folder containing the csvs for each fish
input_folder = os.path.join(proj_directory, 'data')
# explicit raise instead of assert: asserts are stripped under `python -O`
if not os.path.isdir(input_folder):
    raise FileNotFoundError('input folder not found: ' + input_folder)
#get file names as list to loop
raw_file_names = os.listdir(input_folder)
#define diff output folders
#folder to save filtered data
output_folder_with_vel = os.path.join(proj_directory, 'data',
                                      'data filtered NEW per fish')
#read in flow data - hourly!! indexed by date so discharge can be matched on time
flow_data = pd.read_csv(os.path.join(proj_directory, 'data', 'hourly_flow.csv'),
                        sep=',', index_col='date')
#%%
#if you need to stop and resume, the commented-out code below can be used to
#select a subset of the file-name list starting from a certain point
#raw_file_names = raw_file_names[1:40]
#%% loop loop
# Main loop: for each raw csv, filter out high-error positions, convert
# coordinates, attach hourly discharge, and export one csv per fish.
# Prints status updates during the loop and how long each sheet takes.
for file_name in raw_file_names:
    start_time_total = datetime.datetime.now()
    # read in csv. os.path.join supplies the separator that the old
    # string concatenation (input_folder+file_name) was missing.
    data = pd.read_csv(os.path.join(input_folder, file_name),
                       index_col='dt', parse_dates=['dt'])
    # save id as variable (.iloc[0] = first row by position; plain [0] on a
    # datetime-indexed frame is ambiguous/deprecated label lookup)
    fish_name = data['id'].iloc[0]
    print('STARTED fish id '+str(fish_name)+' at time:', start_time_total)
    # subset data to keep only the important columns - drops only the
    # columns that say which hydrophones detected the fish.
    # .copy() prevents SettingWithCopyWarning (and a possible lost write)
    # when the species column is added below.
    data2 = data.iloc[:, 0:11].copy()
    data2['species'] = data['Species']
    # to save changing variable names
    data3 = data2.copy()
    # keep only positions where every error component is < 2
    # (i.e. REMOVE rows where any of x/y/xy error is >= 2)
    data3 = data3[(data3['error_x'] < 2) &
                  (data3['error_y'] < 2) &
                  (data3['error_xy'] < 2)]
    print('fish id '+str(fish_name)+' - errors filtered')
    # guard in case all data is lost when filtering on error
    if len(data3) > 0:
        # convert coords (modifies data3 in place - TODO confirm against functions.py)
        convert_coords(data3, 'lon', 'lat')
        print('fish id '+str(fish_name)+' - coordinates converted')
        data5 = add_discharge(flow_data, data3)
        # sort by the datetime index before exporting (sort_index already
        # returns a new frame, so no extra copy is needed)
        data5_filtered = data5.sort_index()
        data5_filtered.to_csv(os.path.join(output_folder_with_vel,
                                           str(fish_name)+'_new_positions.csv'))
        end_time_total = datetime.datetime.now()
        total_time = end_time_total - start_time_total
        print('EXPORTED - fish id '+str(fish_name)+' has been processed. total time taken for this sheet:',
              total_time)
    else:
        # if there is no data after filtering, print to console which
        # dataframes are affected
        print('ERROR - fish id '+str(fish_name)+' all positions too high error'+
              ' so all data lost when filtered for error')