Explore the water quality data collected by EMP in order to process it correctly for inclusion in the drought environmental predictors data set.
# Load packages
library(tidyverse)
library(scales)
library(viridisLite)
library(sf)
library(mapview)
# Make sure we are using `deltamapr` version 1.0.0, commit 5b11044a14aada45562d92899e2d8db42c0e8daf
# install.packages("devtools")
# devtools::install_github("InteragencyEcologicalProgram/deltamapr", ref = "5b11044a14aada45562d92899e2d8db42c0e8daf")
library(deltamapr)
library(contentid)
library(here)
library(conflicted)
# Declare package conflict preferences
conflicts_prefer(dplyr::filter())
# Check if we are in the correct working directory
i_am("drought_variables/explore_emp_wq_data.Rmd")
# Set default options for mapview - turn off basemap color shuffle
mapviewOptions(basemaps.color.shuffle = FALSE)
# Function to create custom individual mapview objects for a set of coordinates
mapview_from_coord <- function(sf, layer_name, pt_color) {
mapview(
sf,
layer.name = layer_name,
col.regions = pt_color,
alpha.regions = 0.8,
cex = 5,
label = "Station"
)
}
# Function to create heatmaps of sampling effort
heatmap_sampeff <- function(df) {
df %>%
mutate(Station = fct_rev(factor(Station))) %>%
ggplot(aes(x = YearAdj, y = Station, fill = num_samples)) +
geom_tile() +
scale_x_continuous(
limits = c(1974, 2022),
breaks = breaks_pretty(20),
expand = expansion()
) +
scale_fill_viridis_c(name = "Number of Samples") +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "top"
)
}
# Function to create boxplots for each individual parameter
boxplot_param <- function(df) {
df %>%
ggplot(aes(x = Station, y = Result)) +
geom_boxplot(outliers = FALSE) +
geom_jitter(aes(color = YearAdj), width = 0.2) +
scale_color_viridis_c(name = "Adjusted\nYear", option = "plasma") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
}
Import EMP WQ data and station coordinates from the EDI data package 458.10.
# Register a contentid for the EMP WQ data and station coordinates from the EDI data package 458.10
# This only needs to be done once
# EMP station coordinates
# register(
# "https://portal.edirepository.org/nis/dataviewer?packageid=edi.458.10&entityid=86dd696bc3f8407ff52954094e1e9dcf"
# )
# EMP WQ data:
# register(
# "https://portal.edirepository.org/nis/dataviewer?packageid=edi.458.10&entityid=cf231071093ac2861893793517db26f3"
# )
# Define contentid for EMP station coordinates
id_emp_coord <- "hash://sha256/120d7790b505f22a103ee392d13ca1c02b37fb9842d549942580b04c94eb2c67"
# Define contentid for EMP WQ data
id_emp_wq <- "hash://sha256/4c4b56d7d2a888b35efe2a6f2818f2b8b2fab26d0adb8f1c73ef522996b26ea8"
# Resolve the contentid's storing local copies for faster import
file_emp_coord <- resolve(id_emp_coord, store = TRUE)
file_emp_wq <- resolve(id_emp_wq, store = TRUE)
# Import EMP WQ data and station coordinates from the local copies using their contentid's
df_emp_coord <- read_csv(file_emp_coord)
df_emp_wq <- read_csv(
file = file_emp_wq,
col_types = cols_only(
Station = "c",
Date = "c",
Secchi = "d",
SpCndSurface = "d",
SpCndBottom = "d",
WTSurface = "d",
WTBottom = "d",
DissAmmonia_Sign = "c",
DissAmmonia = "d",
DissNitrateNitrite_Sign = "c",
DissNitrateNitrite = "d",
DissOrthophos_Sign = "c",
DissOrthophos = "d"
)
)
Import spatial data including the station coordinates for biological data and the Delta subregion polygons from the Drought Synthesis.
# Import station coordinates for biological data
df_bio_coord <- read_csv(here("Station_coordinates.csv"))
# Load Delta subregion polygons from the Drought Synthesis
sf_delta <- R_EDSM_Subregions_Mahardja_FLOAT %>%
select(SubRegion) %>%
# Remove a few subregions outside our area of interest
filter(!SubRegion %in% c("San Francisco Bay", "South Bay", "Upper Napa River"))
Spatial data for maps:
# Prepare station coordinates of biological data for map - separate by taxon so
# they can be added as independent layers
ndf_bio_coord <- df_bio_coord %>%
rename_with(str_to_title) %>%
drop_na(Lat_wgs84, Long_wgs84) %>%
# Remove phytoplankton stations because we're using a different set of
# environmental predictors for them
filter(Taxon != "phytoplankton") %>%
st_as_sf(coords = c("Long_wgs84", "Lat_wgs84"), crs = 4326) %>%
# Convert CRS so that its the same as the region and subregion polygons
st_transform(crs = st_crs(sf_delta)) %>%
nest(.by = Taxon, .key = "sf_coord") %>%
mutate(Taxon = str_to_title(Taxon)) %>%
arrange(Taxon)
# Prepare EMP station coordinates for map
sf_emp_coord <- df_emp_coord %>%
drop_na(Latitude, Longitude) %>%
st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>%
# Convert CRS so that its the same as the subregion polygons
st_transform(crs = st_crs(sf_delta)) %>%
replace_na(list(EndDate = "ongoing"))
EMP WQ data:
# Prepare EMP WQ data for heatmap of sampling effort
df_emp_wq_c <- df_emp_wq %>%
mutate(
Date = ymd(Date),
# Add placeholder values for <RL records without RL values
DissAmmonia = if_else(DissAmmonia_Sign == "<" & is.na(DissAmmonia), 0.01, DissAmmonia),
DissNitrateNitrite = if_else(DissNitrateNitrite_Sign == "<" & is.na(DissNitrateNitrite), 0.01, DissNitrateNitrite),
DissOrthophos = if_else(DissOrthophos_Sign == "<" & is.na(DissOrthophos), 0.01, DissOrthophos)
) %>%
# Remove records where values for all parameters are NA
filter(!if_all(where(is.numeric), is.na)) %>%
# Only include 2021 and earlier - use adjusted calendar year:
# December-November, with December of the previous calendar year included with
# the following year
mutate(
Month = month(Date),
YearAdj = if_else(Month == 12, year(Date) + 1, year(Date)),
.after = Date
) %>%
filter(YearAdj <= 2021)
In order to spatially filter the EMP WQ sampling locations, we need to find the Delta subregions where biological data has been collected and categorize them based on taxon. We’ll then use the resulting subregion polygons to spatially filter the EMP WQ stations.
# Filter and categorize polygons from sf_delta
sf_delta_bio <- ndf_bio_coord %>%
mutate(
sf_subr = list(sf_delta),
sf_subr_filt = map2(
sf_subr,
sf_coord,
~ st_join(.x, .y, join = st_intersects) %>%
drop_na(Station) %>%
distinct(SubRegion, geometry)
)
) %>%
select(Taxon, sf_subr_filt) %>%
unnest(sf_subr_filt) %>%
arrange(Taxon) %>%
group_by(SubRegion) %>%
mutate(
Taxon = str_flatten_comma(Taxon),
Taxon = if_else(Taxon == "Benthic, Fishes, Zooplankton", "All Taxa", Taxon)
) %>%
ungroup() %>%
distinct() %>%
st_as_sf(crs = st_crs(sf_delta))
# Create list of two mapview layers of the subregion polygons (all and filtered)
# to be used across maps
ls_subr_polygons <- list(
mapview(
sf_delta,
col.regions = "grey",
layer.name = "All Subregions",
alpha.regions = 0.5,
legend = FALSE
),
mapview(
sf_delta_bio,
zcol = "Taxon",
layer.name = "Filtered Subregions",
label = "SubRegion"
)
)
Let’s visualize these subregions with the biological and EMP WQ stations overlayed.
ndf_bio_coord %>%
mutate(
point_color = plasma(n = 3),
map_obj = pmap(list(sf_coord, Taxon, point_color), mapview_from_coord)
) %>%
pull(map_obj) %>%
append(ls_subr_polygons, after = 0) %>%
reduce(`+`)
sf_emp_coord %>%
mapview_from_coord("EMP WQ Stations", "yellow") %>%
append(ls_subr_polygons, after = 0) %>%
reduce(`+`)
After looking at the maps above, we’ll add the “Franks Tract” subregion to the polygons used for spatially filtering the EMP WQ stations. Otherwise, the polygons used for filtering look good. We’ll come back to how they are categorized later.
# Add Franks Tract subregion to the polygons used for filtering
sf_delta_bio_c <- sf_delta_bio %>%
bind_rows(sf_delta %>% filter(SubRegion == "Franks Tract")) %>%
replace_na(list(Taxon = "Extra"))
# Spatially join subregions to the EMP WQ coordinates and remove stations
# outside of subregions
sf_emp_coord_sfilt <- sf_emp_coord %>%
st_join(sf_delta_bio_c, join = st_intersects) %>%
drop_na(SubRegion)
# Remove data for stations outside of subregions from EMP WQ dataset
df_emp_wq_sfilt <- df_emp_wq_c %>% filter(Station %in% unique(sf_emp_coord_sfilt$Station))
In order to combine overlapping EMP WQ stations and to filter them temporally, let’s take a look at a map of where the filtered EMP WQ stations are located and heatmaps of their sampling effort.
sf_emp_coord_sfilt %>%
mapview_from_coord("EMP WQ Stations", "yellow") +
mapview(
sf_delta_bio_c,
zcol = "Taxon",
layer.name = "Filtered Subregions",
label = "SubRegion"
)
# Create a tibble of parameters to be used for the heatmaps
df_emp_wq_param <-
tibble(
Parameter = c(
"WTSurface",
"WTBottom",
"SpCndSurface",
"SpCndBottom",
"Secchi",
"DissAmmonia",
"DissNitrateNitrite",
"DissOrthophos"
),
Pretty_Name = c(
"Water Temperature (surface)",
"Water Temperature (bottom)",
"Specific Conductance (surface)",
"Specific Conductance (bottom)",
"Secchi depth",
"Dissolved Ammonia",
"Dissolved Nitrate + Nitrite",
"Dissolved Ortho-phosphate"
)
)
# Create nested dataframe and generate heatmaps for each parameter
ndf_emp_wq_sampeff <- df_emp_wq_param %>%
mutate(
df_data = list(df_emp_wq_sfilt),
df_data_filt = map2(df_data, Parameter, ~ drop_na(.x, all_of(.y)))
) %>%
add_row(
Pretty_Name = "All Parameters",
df_data_filt = list(df_emp_wq_sfilt),
.before = 1
) %>%
mutate(
plt_samp_effort = map(
df_data_filt,
~ count(.x, Station, YearAdj, name = "num_samples") %>%
heatmap_sampeff()
)
)
Looking at the map of filtered EMP WQ stations and heatmaps of their sampling effort, we’ll combine the following stations:
# Combine overlapping stations in both WQ and station coordinates dataframes
ls_emp_sfilt <-
lst(df_emp_wq_sfilt, sf_emp_coord_sfilt) %>%
map(
~ mutate(
.x,
Station = case_when(
Station %in% c("D41", "D42") ~ "D41-D42",
Station %in% c("MD10", "MD10A") ~ "MD10-MD10A",
Station %in% c("S42", "NZS42") ~ "S42-NZS42",
.default = Station
)
)
)
Next, we’ll filter stations temporally based on their time series completeness. We’ll keep stations that have less than 10% missing data assuming a monthly sampling frequency. This will be for each station’s overall sampling record - not for each individual parameter.
# Calculate total number of monthly sampling events expected between 1975-2021
yrs_range <- range(ls_emp_sfilt$df_emp_wq_sfilt$YearAdj)
num_yrs_mo <- length(yrs_range[1]:yrs_range[2]) * 12
# Calculate threshold for number of sampling events (10% missing)
num_yrs_thresh <- round(num_yrs_mo - num_yrs_mo * 0.1)
# Determine which stations meet the time series completeness criteria
lt_sta_keep <- ls_emp_sfilt$df_emp_wq_sfilt %>%
distinct(Station, Month, YearAdj) %>%
count(Station) %>%
filter(n >= num_yrs_thresh) %>%
pull(Station)
# Filter stations temporally in both WQ and station coordinates dataframes
ls_emp_sfilt_tf <- ls_emp_sfilt %>% map(~ filter(.x, Station %in% lt_sta_keep))
Let’s take a look at a map of the final EMP WQ stations after spatial and temporal filtering. These will be used in the drought environmental predictors data set.
ls_emp_sfilt_tf$sf_emp_coord_sfilt %>%
mapview_from_coord("EMP WQ Stations", "yellow") +
mapview(
sf_delta_bio_c,
zcol = "Taxon",
layer.name = "Filtered Subregions",
label = "SubRegion"
)
The fish sampling locations extend further west than the stations for benthic and zooplankton. We’ll create two versions of the EMP water quality data to be included in the environmental predictors data set - one for fish and another for benthic and zooplankton. For fish, we’ll include all EMP WQ stations after spatial and temporal filtering. For benthic and zooplankton, we’ll exclude stations D41-D42 and D6 which are outside of the spatial range for sampling for these two taxonomic groups.
Let’s also take a look at heatmaps of monthly sampling effort for each parameter of the final EMP WQ stations after spatial and temporal filtering.
# Create nested dataframe and generate heatmaps for each parameter
ndf_emp_wq_sampeff_f <- df_emp_wq_param %>%
mutate(
df_data = list(ls_emp_sfilt_tf$df_emp_wq_sfilt),
df_data_filt = map2(df_data, Parameter, ~ drop_na(.x, all_of(.y)))
) %>%
add_row(
Pretty_Name = "All Parameters",
df_data_filt = list(ls_emp_sfilt_tf$df_emp_wq_sfilt),
.before = 1
) %>%
mutate(
plt_samp_effort = map(
df_data_filt,
~ distinct(.x, Station, Month, YearAdj) %>%
count(Station, YearAdj, name = "num_samples") %>%
heatmap_sampeff()
)
)
A couple of observations from the sampling effort plots:
Now that we finished with the spatial and temporal filtering, we’ll take a look to see if there are any obvious outliers that should be omitted from the data set before averaging.
# Restructure data to long format to allow for easier plotting and
# implementation of functions for outlier inspection
# Separate nutrient parameters from WQ measurements because only the nutrients
# have "Sign" variables
df_emp_wq_f_nutr <- ls_emp_sfilt_tf$df_emp_wq_sfilt %>%
select(Station, Date, Month, YearAdj, starts_with("Diss")) %>%
rename_with(~ paste0(.x, "_Result"), starts_with("Diss") & !ends_with("_Sign")) %>%
pivot_longer(
cols = starts_with("Diss"),
names_to = c("Parameter", ".value"),
names_sep = "_"
) %>%
drop_na(Result)
# Exclude bottom measurements because they won't be used in the drought
# environmental predictors data set - they have too few samples
wq_params <- c("Secchi", "SpCndSurface", "WTSurface")
df_emp_wq_f_wqmeas <- ls_emp_sfilt_tf$df_emp_wq_sfilt %>%
select(Station, Date, Month, YearAdj, all_of(wq_params)) %>%
pivot_longer(
cols = all_of(wq_params),
names_to = "Parameter",
values_to = "Result"
) %>%
drop_na(Result)
There are a few nutrient values that are less than the reporting limit with reporting limits that are very high compared to the range of the values for the parameter (> 75th percentile). This includes the highest dissolved ortho-phosphate value in the data set. We will flag and take a closer look at these values for possible removal from the data set.
df_nutr_high_rl_flag <- df_emp_wq_f_nutr %>%
group_by(Parameter) %>%
mutate(Quant_75 = quantile(Result, probs = 0.75)) %>%
ungroup() %>%
mutate(HighRL_flag = if_else(Sign == "<" & Result > Quant_75, TRUE, FALSE))
# View flagged data points
df_nutr_high_rl_flag %>%
filter(HighRL_flag) %>%
arrange(Parameter, desc(Result)) %>%
print(n = 40)
## # A tibble: 37 × 9
## Station Date Month YearAdj Parameter Sign Result Quant_75 HighRL_flag
## <chr> <date> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <lgl>
## 1 D28A 2020-01-15 1 2020 DissAmmo… < 0.25 0.1 TRUE
## 2 D16 2020-01-16 1 2020 DissAmmo… < 0.25 0.1 TRUE
## 3 MD10-MD… 2020-01-16 1 2020 DissAmmo… < 0.25 0.1 TRUE
## 4 D16 2020-02-14 2 2020 DissAmmo… < 0.25 0.1 TRUE
## 5 MD10-MD… 2020-02-14 2 2020 DissAmmo… < 0.25 0.1 TRUE
## 6 P8 2020-02-14 2 2020 DissAmmo… < 0.25 0.1 TRUE
## 7 D28A 2020-03-02 3 2020 DissAmmo… < 0.25 0.1 TRUE
## 8 MD10-MD… 2020-03-03 3 2020 DissAmmo… < 0.25 0.1 TRUE
## 9 D41-D42 2020-03-09 3 2020 DissAmmo… < 0.25 0.1 TRUE
## 10 D12 2019-10-03 10 2019 DissAmmo… < 0.2 0.1 TRUE
## 11 D28A 2019-10-03 10 2019 DissAmmo… < 0.2 0.1 TRUE
## 12 D16 2019-10-04 10 2019 DissAmmo… < 0.2 0.1 TRUE
## 13 D26 2019-10-04 10 2019 DissAmmo… < 0.2 0.1 TRUE
## 14 MD10-MD… 2019-10-04 10 2019 DissAmmo… < 0.2 0.1 TRUE
## 15 P8 2019-10-04 10 2019 DissAmmo… < 0.2 0.1 TRUE
## 16 D10 2019-10-07 10 2019 DissAmmo… < 0.2 0.1 TRUE
## 17 D6 2019-10-07 10 2019 DissAmmo… < 0.2 0.1 TRUE
## 18 D8 2019-10-07 10 2019 DissAmmo… < 0.2 0.1 TRUE
## 19 D41-D42 2019-10-09 10 2019 DissAmmo… < 0.2 0.1 TRUE
## 20 D12 2019-11-04 11 2019 DissAmmo… < 0.2 0.1 TRUE
## 21 D28A 2019-11-04 11 2019 DissAmmo… < 0.2 0.1 TRUE
## 22 D16 2019-11-05 11 2019 DissAmmo… < 0.2 0.1 TRUE
## 23 D26 2019-11-05 11 2019 DissAmmo… < 0.2 0.1 TRUE
## 24 MD10-MD… 2019-11-05 11 2019 DissAmmo… < 0.2 0.1 TRUE
## 25 P8 2019-11-05 11 2019 DissAmmo… < 0.2 0.1 TRUE
## 26 D8 2019-11-06 11 2019 DissAmmo… < 0.2 0.1 TRUE
## 27 D7 2019-11-07 11 2019 DissAmmo… < 0.2 0.1 TRUE
## 28 D16 2019-12-04 12 2020 DissAmmo… < 0.2 0.1 TRUE
## 29 D41-D42 2019-10-09 10 2019 DissNitr… < 5.5 0.47 TRUE
## 30 D41-D42 2019-11-12 11 2019 DissNitr… < 5.5 0.47 TRUE
## 31 D6 2019-10-07 10 2019 DissNitr… < 2.8 0.47 TRUE
## 32 D7 2019-10-08 10 2019 DissNitr… < 2.8 0.47 TRUE
## 33 D7 2019-11-07 11 2019 DissNitr… < 2.8 0.47 TRUE
## 34 D8 2019-10-07 10 2019 DissNitr… < 1.1 0.47 TRUE
## 35 D10 2019-10-07 10 2019 DissNitr… < 0.55 0.47 TRUE
## 36 D7 2019-11-07 11 2019 DissOrth… < 2 0.095 TRUE
## 37 D4 2019-11-06 11 2019 DissOrth… < 0.4 0.095 TRUE
# View range of values for each parameter
df_emp_wq_f_nutr %>%
summarize(
min_val = min(Result),
first_quantile = quantile(Result, probs = 0.25),
median = median(Result),
third_quantile = quantile(Result, probs = 0.75),
max_val = max(Result),
.by = Parameter
) %>%
arrange(Parameter)
## # A tibble: 3 × 6
## Parameter min_val first_quantile median third_quantile max_val
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 DissAmmonia 0.01 0.03 0.06 0.1 2.94
## 2 DissNitrateNitrite 0.01 0.22 0.33 0.47 15.2
## 3 DissOrthophos 0.01 0.053 0.07 0.095 2
Upon closer inspection, all 37 values that are less than the reporting limit with reporting limits that are greater than the 75th percentile of the values for the parameter should be removed from the data set.
df_emp_wq_f_nutr_c1 <- df_nutr_high_rl_flag %>%
filter(!HighRL_flag) %>%
select(-c(Quant_75, HighRL_flag))
Next, let’s make some basic boxplots to look for outliers.
ndf_emp_wq_bplt <- df_emp_wq_f_nutr_c1 %>%
# Remove nutrient data collected at D10, D12, D16, and D22 because of
# inadequate sampling
filter(!Station %in% c("D10", "D12", "D16", "D22")) %>%
bind_rows(df_emp_wq_f_wqmeas) %>%
nest(.by = Parameter, .key = "df_data") %>%
left_join(df_emp_wq_param, by = join_by(Parameter)) %>%
mutate(plt_boxplot = map(df_data, boxplot_param)) %>%
arrange(Parameter)
There are a few outliers that stand out in the boxplots that need to be investigated further:
Now we’ll look for outliers more objectively by using Z-score tests to flag data points to investigate further. For the water quality measurement parameters (Secchi depth, Specific Conductance, and Water Temperature), we’ll use a standard Z-score test to flag data more than 10 SDs away from the mean of each station. For the nutrient parameters, we’ll use the modified Z-score test with the same threshold. We used the modified test for nutrients because its more resistant to censored data.
# Start with the Z-score test on WQ measurement parameters
df_wqmeas_flag <- df_emp_wq_f_wqmeas %>%
group_by(Parameter, Station) %>%
mutate(
tmp_mean = mean(Result),
tmp_sd = sd(Result),
Zscore = if_else(tmp_sd == 0, NA_real_, abs((Result - tmp_mean) / tmp_sd)),
Zscore_flag = case_when(Zscore > 10 ~ TRUE, .default = FALSE)
) %>%
ungroup() %>%
select(!starts_with("tmp_"))
# View flagged data points
df_wqmeas_flag %>% filter(Zscore_flag) %>% arrange(Parameter, desc(Zscore))
## # A tibble: 3 × 8
## Station Date Month YearAdj Parameter Result Zscore Zscore_flag
## <chr> <date> <dbl> <dbl> <chr> <dbl> <dbl> <lgl>
## 1 D22 2000-12-06 12 2001 SpCndSurface 35580 15.3 TRUE
## 2 D12 1999-07-12 7 1999 SpCndSurface 39934 11.9 TRUE
## 3 D16 1994-09-27 9 1994 SpCndSurface 6982 10.2 TRUE
# Plot specific conductance at all stations from adjusted year 1999-2001 to look
# at flagged values from D12 and D22
df_emp_wq_f_wqmeas %>%
filter(
Parameter == "SpCndSurface",
YearAdj %in% 1999:2001
) %>%
ggplot(aes(x = Date, y = Result, color = Station)) +
geom_line() +
geom_point() +
scale_color_viridis_d(option = "plasma") +
theme_bw()
# Plot specific conductance at all stations for adjusted year 1994 to look at
# flagged value from D16
df_emp_wq_f_wqmeas %>%
filter(
Parameter == "SpCndSurface",
YearAdj == 1994
) %>%
ggplot(aes(x = Date, y = Result, color = Station)) +
geom_line() +
geom_point() +
scale_color_viridis_d(option = "plasma") +
theme_bw()
The flagged specific conductance values at D12 and D22 look suspicious enough to omit from the data set; however, while the flagged value at D16 looks questionable, we’ll leave it in the data set.
# Now, let's run the modified Z-score test on the nutrient parameters
df_nutr_modzscore_flag <- df_emp_wq_f_nutr_c1 %>%
group_by(Parameter, Station) %>%
mutate(
tmp_median = median(Result),
tmp_mad = mad(Result),
ModZscore = if_else(tmp_mad == 0, NA_real_, abs(0.6745 * (Result - tmp_median) / tmp_mad)),
ModZscore_flag = case_when(ModZscore > 15 ~ TRUE, .default = FALSE)
) %>%
ungroup() %>%
select(!starts_with("tmp_"))
# View flagged data points
df_nutr_modzscore_flag %>% filter(ModZscore_flag) %>% arrange(Parameter, desc(ModZscore))
## # A tibble: 5 × 9
## Station Date Month YearAdj Parameter Sign Result ModZscore
## <chr> <date> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 D28A 1996-01-25 1 1996 DissAmmonia = 1 22.1
## 2 P8 2004-02-26 2 2004 DissAmmonia = 2.94 18.4
## 3 D28A 2017-07-14 7 2017 DissNitrateNitrite = 15.2 45.2
## 4 D4 2019-01-10 1 2019 DissNitrateNitrite = 6.07 26.2
## 5 D4 2017-06-14 6 2017 DissNitrateNitrite = 3.68 15.3
## # ℹ 1 more variable: ModZscore_flag <lgl>
# Plot dissolved ammonia at all stations from adjusted years 1995-1996 to look
# at flagged value from D28A
df_emp_wq_f_nutr_c1 %>%
filter(
Parameter == "DissAmmonia",
YearAdj %in% 1995:1996
) %>%
ggplot(aes(x = Date, y = Result, color = Station)) +
geom_line() +
geom_point() +
scale_color_viridis_d(option = "plasma") +
theme_bw()
# Plot dissolved ammonia at all stations from adjusted years 2003-2004 to look
# at flagged value from P8
df_emp_wq_f_nutr_c1 %>%
filter(
Parameter == "DissAmmonia",
YearAdj %in% 2003:2004
) %>%
ggplot(aes(x = Date, y = Result, color = Station)) +
geom_line() +
geom_point() +
scale_color_viridis_d(option = "plasma") +
theme_bw()
# Plot dissolved nitrate + nitrite at all stations from adjusted years 2017-2019
# to look at flagged values from D28A and D4
df_emp_wq_f_nutr_c1 %>%
filter(
Parameter == "DissNitrateNitrite",
YearAdj %in% 2017:2019
) %>%
ggplot(aes(x = Date, y = Result, color = Station)) +
geom_line() +
geom_point() +
scale_color_viridis_d(option = "plasma") +
theme_bw()
After inspecting the data flagged by the modified Z-score test, the dissolved ammonia values appear to be valid based on best professional judgment, so we will only exclude the dissolved nitrate + nitrite values flagged by the modified Z-score test.
Now we’ll remove the outliers we determined as bad data.
# Un-flag the specific conductance value at D16 and remove the remaining flagged
# data points from the data set
df_emp_wq_f_wqmeas_c <- df_wqmeas_flag %>%
mutate(
Zscore_flag = if_else(
Parameter == "SpCndSurface" & Station == "D16" & Date == "1994-09-27",
FALSE,
Zscore_flag
)
) %>%
filter(!Zscore_flag)
# Un-flag the DissAmmonia values and remove the DissNitrateNitrite flagged data
# points from the data set
df_emp_wq_f_nutr_c2 <- df_nutr_modzscore_flag %>%
mutate(ModZscore_flag = if_else(Parameter == "DissAmmonia", FALSE, ModZscore_flag)) %>%
filter(!ModZscore_flag)
# Recombine WQ measurement and nutrient data now that all outliers have been
# removed
df_emp_wq_f_long <- df_emp_wq_f_nutr_c2 %>%
# Remove nutrient data collected at D10, D12, D16, and D22 because of
# inadequate sampling
filter(!Station %in% c("D10", "D12", "D16", "D22")) %>%
bind_rows(df_emp_wq_f_wqmeas_c) %>%
select(!contains("Zscore"))
Let’s take one more look at heatmaps of monthly sampling effort for each parameter after removing outliers to assess the final data before averaging.
# Create nested dataframe and generate heatmaps for each parameter
ndf_emp_wq_sampeff_f2 <- df_emp_wq_f_long %>%
nest(.by = Parameter, .key = "df_data") %>%
left_join(df_emp_wq_param, by = join_by(Parameter)) %>%
mutate(
plt_samp_effort = map(
df_data,
~ distinct(.x, Station, Month, YearAdj) %>%
count(Station, YearAdj, name = "num_samples") %>%
heatmap_sampeff()
)
) %>%
arrange(Parameter)