Revision 2868f59b32d05a61091e70962e6e6a16463c6a64 authored by Susi Zajitschek on 29 October 2020, 00:41:32 UTC, committed by GitHub on 29 October 2020, 00:41:32 UTC
1 parent b9802e9
data_load_clean.R
# Read the raw data from the CSV file `filename`, using an explicit column
# specification. Any column not named below is read as character (`.default`);
# the named columns are read as character, integer, double or datetime as
# listed. Returns whatever read_csv() returns for that file.
load_raw <- function(filename) {
  # Explicit type for each known column; everything else falls back to text.
  column_types <- cols(
    .default              = col_character(),
    project_id            = col_character(),
    id                    = col_character(),
    parameter_id          = col_character(),
    age_in_days           = col_integer(),
    date_of_experiment    = col_datetime(format = ""),
    weight                = col_double(),
    phenotyping_center_id = col_character(),
    production_center_id  = col_character(),
    weight_date           = col_datetime(format = ""),
    date_of_birth         = col_datetime(format = ""),
    procedure_id          = col_character(),
    pipeline_id           = col_character(),
    biological_sample_id  = col_character(),
    biological_model_id   = col_character(),
    weight_days_old       = col_integer(),
    datasource_id         = col_character(),
    experiment_id         = col_character(),
    data_point            = col_double(),
    age_in_weeks          = col_integer(),
    `_version_`           = col_character()
  )

  read_csv(filename, col_types = column_types)
}
# Apply the standard cleaning steps to the raw data `mydata`:
#   1. keep only rows whose datasource_name is 'IMPC'
#   2. lower-case parameter_name
#   3. keep rows with age_in_days strictly between 0 and 500
#   4. drop rows with a missing data_point
#   5. keep a reduced set of columns
#   6. sort by production_center, biological_sample_id, age_in_days
clean_raw_data <- function(mydata) {
  # Filter to IMPC source (recommended by Jeremy in email to Susi on 20 Aug 2018)
  impc_rows <- filter(mydata, datasource_name == 'IMPC')

  # standardise trait names
  lowered <- mutate(impc_rows, parameter_name = tolower(parameter_name))

  # remove extreme ages, then remove NAs in the measured value
  in_age_range <- filter(lowered, age_in_days > 0, age_in_days < 500)
  measured <- filter(in_age_range, !is.na(data_point))

  # subset to reasonable set of variables
  # date_of_experiment: Jeremy suggested using as an indicator of batch-level effects
  reduced <- select(
    measured,
    production_center,
    strain_name,
    strain_accession_id,
    biological_sample_id,
    pipeline_stable_id,
    procedure_group,
    procedure_name,
    sex,
    date_of_experiment,
    age_in_days,
    weight,
    parameter_name,
    data_point
  )

  arrange(reduced, production_center, biological_sample_id, age_in_days)
}
# Subset the data to a single parameter and keep one record per individual,
# choosing the record whose age is as close as possible to `age_center`.
#
# Arguments:
#   mydata     - data frame with (at least) the columns age_in_days,
#                parameter_name and biological_sample_id
#   parameter  - value compared against parameter_name with `==`
#   age_min    - minimum age_in_days (inclusive) for a record to be considered
#   age_center - target age; for each individual the record minimising
#                abs(age_center - age_in_days) is kept
#
# Returns an UNGROUPED data frame with the columns of `mydata` and at most one
# row per biological_sample_id, rows in their original relative order.
data_subset_parameter_individual_by_age <- function(mydata, parameter, age_min, age_center) {
  mydata %>%
    # `!!` forces the function arguments to be used even if `mydata` happens to
    # contain a column called `parameter`, `age_min` or `age_center`
    filter(age_in_days >= (!!age_min),
           parameter_name == (!!parameter)) %>%
    # take results for single individual closest to age_center
    mutate(age_diff = abs((!!age_center) - age_in_days)) %>%
    group_by(biological_sample_id) %>%
    filter(age_diff == min(age_diff)) %>%
    # drop the grouping so it does not leak into the caller's later verbs
    ungroup() %>%
    select(-age_diff) %>%
    # still some individuals with multiple records (because same individual
    # appears under different procedures), so keep only the first record of each
    filter(!duplicated(biological_sample_id))
}
Computing file changes ...