Purpose

Explore the water quality data collected by EMP in order to process it correctly for inclusion in the drought environmental predictors data set.

Global code and functions

# Load packages
library(tidyverse)
library(scales)
library(viridisLite)
library(sf)
library(mapview)
# Make sure we are using `deltamapr` version 1.0.0, commit 5b11044a14aada45562d92899e2d8db42c0e8daf
# install.packages("devtools") 
# devtools::install_github("InteragencyEcologicalProgram/deltamapr", ref = "5b11044a14aada45562d92899e2d8db42c0e8daf")
library(deltamapr)
library(contentid)
library(here)
library(conflicted)

# Declare package conflict preferences
conflicts_prefer(dplyr::filter())

# Check if we are in the correct working directory
i_am("drought_variables/explore_emp_wq_data.Rmd")

# Set default options for mapview - turn off basemap color shuffle
mapviewOptions(basemaps.color.shuffle = FALSE)
# Function to create custom individual mapview objects for a set of coordinates
mapview_from_coord <- function(sf, layer_name, pt_color) {
  mapview(
    sf,
    layer.name = layer_name,
    col.regions = pt_color,
    alpha.regions = 0.8,
    cex = 5,
    label = "Station"
  )
}

# Function to create heatmaps of sampling effort
heatmap_sampeff <- function(df) {
  df %>%
    mutate(Station = fct_rev(factor(Station))) %>%
    ggplot(aes(x = YearAdj, y = Station, fill = num_samples)) +
    geom_tile() +
    scale_x_continuous(
      limits = c(1974, 2022),
      breaks = breaks_pretty(20), 
      expand = expansion()
    ) +
    scale_fill_viridis_c(name = "Number of Samples") +
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
      legend.position = "top"
    )
}

# Function to create boxplots for each individual parameter
boxplot_param <- function(df) {
  df %>% 
    ggplot(aes(x = Station, y = Result)) +
    geom_boxplot(outliers = FALSE) +
    geom_jitter(aes(color = YearAdj), width = 0.2) +
    scale_color_viridis_c(name = "Adjusted\nYear", option = "plasma") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
}

Import data

Import EMP WQ data and station coordinates from the EDI data package 458.10.

# Register a contentid for the EMP WQ data and station coordinates from the EDI data package 458.10
# This only needs to be done once
# EMP station coordinates
# register(
#   "https://portal.edirepository.org/nis/dataviewer?packageid=edi.458.10&entityid=86dd696bc3f8407ff52954094e1e9dcf"
# )

# EMP WQ data:
# register(
#   "https://portal.edirepository.org/nis/dataviewer?packageid=edi.458.10&entityid=cf231071093ac2861893793517db26f3"
# )

# Define contentid for EMP station coordinates
id_emp_coord <- "hash://sha256/120d7790b505f22a103ee392d13ca1c02b37fb9842d549942580b04c94eb2c67"

# Define contentid for EMP WQ data
id_emp_wq <- "hash://sha256/4c4b56d7d2a888b35efe2a6f2818f2b8b2fab26d0adb8f1c73ef522996b26ea8"

# Resolve the contentid's storing local copies for faster import
file_emp_coord <- resolve(id_emp_coord, store = TRUE)
file_emp_wq <- resolve(id_emp_wq, store = TRUE)

# Import EMP WQ data and station coordinates from the local copies using their contentid's
df_emp_coord <- read_csv(file_emp_coord)

df_emp_wq <- read_csv(
  file = file_emp_wq,
  col_types = cols_only(
    Station = "c",
    Date = "c",
    Secchi = "d",
    SpCndSurface = "d",
    SpCndBottom = "d",
    WTSurface = "d",
    WTBottom = "d",
    DissAmmonia_Sign = "c",
    DissAmmonia = "d",
    DissNitrateNitrite_Sign = "c",
    DissNitrateNitrite = "d",
    DissOrthophos_Sign = "c",
    DissOrthophos = "d"
  )
)

Import spatial data including the station coordinates for biological data and the Delta subregion polygons from the Drought Synthesis.

# Import station coordinates for biological data
df_bio_coord <- read_csv(here("Station_coordinates.csv"))

# Load Delta subregion polygons from the Drought Synthesis
sf_delta <- R_EDSM_Subregions_Mahardja_FLOAT %>% 
  select(SubRegion) %>% 
  # Remove a few subregions outside our area of interest
  filter(!SubRegion %in% c("San Francisco Bay", "South Bay", "Upper Napa River")) 

Prepare Data

Spatial data for maps:

# Prepare station coordinates of biological data for map - separate by taxon so
  # they can be added as independent layers
ndf_bio_coord <- df_bio_coord %>%
  rename_with(str_to_title) %>% 
  drop_na(Lat_wgs84, Long_wgs84) %>%
  # Remove phytoplankton stations because we're using a different set of
    # environmental predictors for them
  filter(Taxon != "phytoplankton") %>% 
  st_as_sf(coords = c("Long_wgs84", "Lat_wgs84"), crs = 4326) %>% 
  # Convert CRS so that its the same as the region and subregion polygons
  st_transform(crs = st_crs(sf_delta)) %>% 
  nest(.by = Taxon, .key = "sf_coord") %>% 
  mutate(Taxon = str_to_title(Taxon)) %>% 
  arrange(Taxon)

# Prepare EMP station coordinates for map
sf_emp_coord <- df_emp_coord %>% 
  drop_na(Latitude, Longitude) %>% 
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>% 
  # Convert CRS so that its the same as the subregion polygons
  st_transform(crs = st_crs(sf_delta)) %>%
  replace_na(list(EndDate = "ongoing"))

EMP WQ data:

# Prepare EMP WQ data for heatmap of sampling effort
df_emp_wq_c <- df_emp_wq %>% 
  mutate(
    Date = ymd(Date),
    # Add placeholder values for <RL records without RL values
    DissAmmonia = if_else(DissAmmonia_Sign == "<" & is.na(DissAmmonia), 0.01, DissAmmonia),
    DissNitrateNitrite = if_else(DissNitrateNitrite_Sign == "<" & is.na(DissNitrateNitrite), 0.01, DissNitrateNitrite),
    DissOrthophos = if_else(DissOrthophos_Sign == "<" & is.na(DissOrthophos), 0.01, DissOrthophos)
  ) %>% 
  # Remove records where values for all parameters are NA
  filter(!if_all(where(is.numeric), is.na)) %>% 
  # Only include 2021 and earlier - use adjusted calendar year:
    # December-November, with December of the previous calendar year included with
    # the following year
  mutate(
    Month = month(Date),
    YearAdj = if_else(Month == 12, year(Date) + 1, year(Date)),
    .after = Date
  ) %>% 
  filter(YearAdj <= 2021)

Filter data spatially

In order to spatially filter the EMP WQ sampling locations, we need to find the Delta subregions where biological data has been collected and categorize them based on taxon. We’ll then use the resulting subregion polygons to spatially filter the EMP WQ stations.

# Filter and categorize polygons from sf_delta
sf_delta_bio <- ndf_bio_coord %>% 
  mutate(
    sf_subr = list(sf_delta),
    sf_subr_filt = map2(
      sf_subr, 
      sf_coord, 
      ~ st_join(.x, .y, join = st_intersects) %>% 
        drop_na(Station) %>% 
        distinct(SubRegion, geometry)
    )
  ) %>% 
  select(Taxon, sf_subr_filt) %>% 
  unnest(sf_subr_filt) %>% 
  arrange(Taxon) %>% 
  group_by(SubRegion) %>% 
  mutate(
    Taxon = str_flatten_comma(Taxon),
    Taxon = if_else(Taxon == "Benthic, Fishes, Zooplankton", "All Taxa", Taxon)
  ) %>% 
  ungroup() %>% 
  distinct() %>% 
  st_as_sf(crs = st_crs(sf_delta))

# Create list of two mapview layers of the subregion polygons (all and filtered)
  # to be used across maps
ls_subr_polygons <- list(
  mapview(
    sf_delta,
    col.regions = "grey",
    layer.name = "All Subregions",
    alpha.regions = 0.5,
    legend = FALSE
  ),
  mapview(
    sf_delta_bio, 
    zcol = "Taxon", 
    layer.name = "Filtered Subregions",
    label = "SubRegion"
  )
)

Let’s visualize these subregions with the biological and EMP WQ stations overlayed.

Subregions map with biological sampling locations

ndf_bio_coord %>% 
  mutate(
    point_color = plasma(n = 3),
    map_obj = pmap(list(sf_coord, Taxon, point_color), mapview_from_coord)
  ) %>% 
  pull(map_obj) %>% 
  append(ls_subr_polygons, after = 0) %>% 
  reduce(`+`)

Subregions map with EMP WQ locations

sf_emp_coord %>% 
  mapview_from_coord("EMP WQ Stations", "yellow") %>% 
  append(ls_subr_polygons, after = 0) %>%
  reduce(`+`)

Filter EMP WQ stations

After looking at the maps above, we’ll add the “Franks Tract” subregion to the polygons used for spatially filtering the EMP WQ stations. Otherwise, the polygons used for filtering look good. We’ll come back to how they are categorized later.

# Add Franks Tract subregion to the polygons used for filtering
sf_delta_bio_c <- sf_delta_bio %>% 
  bind_rows(sf_delta %>% filter(SubRegion == "Franks Tract")) %>% 
  replace_na(list(Taxon = "Extra"))

# Spatially join subregions to the EMP WQ coordinates and remove stations
  # outside of subregions
sf_emp_coord_sfilt <- sf_emp_coord %>% 
  st_join(sf_delta_bio_c, join = st_intersects) %>% 
  drop_na(SubRegion)

# Remove data for stations outside of subregions from EMP WQ dataset
df_emp_wq_sfilt <- df_emp_wq_c %>% filter(Station %in% unique(sf_emp_coord_sfilt$Station))

Combine overlapping stations

In order to combine overlapping EMP WQ stations and to filter them temporally, let’s take a look at a map of where the filtered EMP WQ stations are located and heatmaps of their sampling effort.

Map of EMP WQ stations

sf_emp_coord_sfilt %>% 
  mapview_from_coord("EMP WQ Stations", "yellow") +
  mapview(
    sf_delta_bio_c, 
    zcol = "Taxon", 
    layer.name = "Filtered Subregions",
    label = "SubRegion"
  )

Sampling Effort

# Create a tibble of parameters to be used for the heatmaps
df_emp_wq_param <- 
  tibble(
    Parameter = c(
      "WTSurface",
      "WTBottom",
      "SpCndSurface",
      "SpCndBottom",
      "Secchi",
      "DissAmmonia",
      "DissNitrateNitrite",
      "DissOrthophos"
    ),
    Pretty_Name = c(
      "Water Temperature (surface)",
      "Water Temperature (bottom)",
      "Specific Conductance (surface)",
      "Specific Conductance (bottom)",
      "Secchi depth",
      "Dissolved Ammonia",
      "Dissolved Nitrate + Nitrite",
      "Dissolved Ortho-phosphate"
    )
  )

# Create nested dataframe and generate heatmaps for each parameter
ndf_emp_wq_sampeff <- df_emp_wq_param %>% 
  mutate(
    df_data = list(df_emp_wq_sfilt),
    df_data_filt = map2(df_data, Parameter, ~ drop_na(.x, all_of(.y)))
  ) %>% 
  add_row(
    Pretty_Name = "All Parameters", 
    df_data_filt = list(df_emp_wq_sfilt), 
    .before = 1
  ) %>% 
  mutate(
    plt_samp_effort = map(
      df_data_filt,
      ~ count(.x, Station, YearAdj, name = "num_samples") %>%
        heatmap_sampeff()
    )
  )

All Parameters

Water Temperature (surface)

Water Temperature (bottom)

Specific Conductance (surface)

Specific Conductance (bottom)

Secchi depth

Dissolved Ammonia

Dissolved Nitrate + Nitrite

Dissolved Ortho-phosphate

Combine Stations

Looking at the map of filtered EMP WQ stations and heatmaps of their sampling effort, we’ll combine the following stations:

  • D41 and D42
  • MD10 and MD10A
  • S42 and NZS42
# Combine overlapping stations in both WQ and station coordinates dataframes
ls_emp_sfilt <- 
  lst(df_emp_wq_sfilt, sf_emp_coord_sfilt) %>% 
  map(
    ~ mutate(
      .x,
      Station = case_when(
        Station %in% c("D41", "D42") ~ "D41-D42",
        Station %in% c("MD10", "MD10A") ~ "MD10-MD10A",
        Station %in% c("S42", "NZS42") ~ "S42-NZS42",
        .default = Station
      )
    )
  )

Filter data temporally

Next, we’ll filter stations temporally based on their time series completeness. We’ll keep stations that have less than 10% missing data assuming a monthly sampling frequency. This will be for each station’s overall sampling record - not for each individual parameter.

# Calculate total number of monthly sampling events expected between 1975-2021
yrs_range <- range(ls_emp_sfilt$df_emp_wq_sfilt$YearAdj)
num_yrs_mo <- length(yrs_range[1]:yrs_range[2]) * 12

# Calculate threshold for number of sampling events (10% missing)
num_yrs_thresh <- round(num_yrs_mo - num_yrs_mo * 0.1)

# Determine which stations meet the time series completeness criteria
lt_sta_keep <- ls_emp_sfilt$df_emp_wq_sfilt %>% 
  distinct(Station, Month, YearAdj) %>% 
  count(Station) %>% 
  filter(n >= num_yrs_thresh) %>% 
  pull(Station)

# Filter stations temporally in both WQ and station coordinates dataframes
ls_emp_sfilt_tf <- ls_emp_sfilt %>% map(~ filter(.x, Station %in% lt_sta_keep))

Map of final EMP WQ stations

Let’s take a look at a map of the final EMP WQ stations after spatial and temporal filtering. These will be used in the drought environmental predictors data set.

ls_emp_sfilt_tf$sf_emp_coord_sfilt %>% 
  mapview_from_coord("EMP WQ Stations", "yellow") +
  mapview(
    sf_delta_bio_c, 
    zcol = "Taxon", 
    layer.name = "Filtered Subregions",
    label = "SubRegion"
  )

The fish sampling locations extend further west than the stations for benthic and zooplankton. We’ll create two versions of the EMP water quality data to be included in the environmental predictors data set - one for fish and another for benthic and zooplankton. For fish, we’ll include all EMP WQ stations after spatial and temporal filtering. For benthic and zooplankton, we’ll exclude stations D41-D42 and D6 which are outside of the spatial range for sampling for these two taxonomic groups.

Sampling Effort

Let’s also take a look at heatmaps of monthly sampling effort for each parameter of the final EMP WQ stations after spatial and temporal filtering.

# Create nested dataframe and generate heatmaps for each parameter
ndf_emp_wq_sampeff_f <- df_emp_wq_param %>% 
  mutate(
    df_data = list(ls_emp_sfilt_tf$df_emp_wq_sfilt),
    df_data_filt = map2(df_data, Parameter, ~ drop_na(.x, all_of(.y)))
  ) %>% 
  add_row(
    Pretty_Name = "All Parameters", 
    df_data_filt = list(ls_emp_sfilt_tf$df_emp_wq_sfilt), 
    .before = 1
  ) %>% 
  mutate(
    plt_samp_effort = map(
      df_data_filt,
      ~ distinct(.x, Station, Month, YearAdj) %>% 
        count(Station, YearAdj, name = "num_samples") %>%
        heatmap_sampeff()
    )
  )

All Parameters

Water Temperature (surface)

Water Temperature (bottom)

Specific Conductance (surface)

Specific Conductance (bottom)

Secchi depth

Dissolved Ammonia

Dissolved Nitrate + Nitrite

Dissolved Ortho-phosphate

A couple of observations from the sampling effort plots:

  • We’ll include the following parameters in the environmental predictors data set: water temperature (surface), specific conductance (surface), Secchi depth, dissolved ammonia, dissolved nitrate + nitrite, and dissolved ortho-phosphate.
  • Unfortunately, nutrient data wasn’t collected at D10, D12, D16, and D22 from 1996-2016, so we won’t include these stations in the averages for the nutrient parameters. Also, EMP didn’t start collecting dissolved ammonia until 1979.

Inspect Data for Outliers

Now that we finished with the spatial and temporal filtering, we’ll take a look to see if there are any obvious outliers that should be omitted from the data set before averaging.

# Restructure data to long format to allow for easier plotting and
  # implementation of functions for outlier inspection
# Separate nutrient parameters from WQ measurements because only the nutrients
  # have "Sign" variables
df_emp_wq_f_nutr <- ls_emp_sfilt_tf$df_emp_wq_sfilt %>% 
  select(Station, Date, Month, YearAdj, starts_with("Diss")) %>% 
  rename_with(~ paste0(.x, "_Result"), starts_with("Diss") & !ends_with("_Sign")) %>% 
  pivot_longer(
    cols = starts_with("Diss"), 
    names_to = c("Parameter", ".value"), 
    names_sep = "_"
  ) %>% 
  drop_na(Result)

# Exclude bottom measurements because they won't be used in the drought
  # environmental predictors data set - they have too few samples
wq_params <- c("Secchi", "SpCndSurface", "WTSurface")

df_emp_wq_f_wqmeas <- ls_emp_sfilt_tf$df_emp_wq_sfilt %>% 
  select(Station, Date, Month, YearAdj, all_of(wq_params)) %>% 
  pivot_longer(
    cols = all_of(wq_params), 
    names_to = "Parameter", 
    values_to = "Result"
  ) %>% 
  drop_na(Result)

High RL Values

There are a few nutrient values that are less than the reporting limit with reporting limits that are very high compared to the range of the values for the parameter (> 75th percentile). This includes the highest dissolved ortho-phosphate value in the data set. We will flag and take a closer look at these values for possible removal from the data set.

df_nutr_high_rl_flag <- df_emp_wq_f_nutr %>% 
  group_by(Parameter) %>% 
  mutate(Quant_75 = quantile(Result, probs = 0.75)) %>% 
  ungroup() %>% 
  mutate(HighRL_flag = if_else(Sign == "<" & Result > Quant_75, TRUE, FALSE))

# View flagged data points
df_nutr_high_rl_flag %>% 
  filter(HighRL_flag) %>%
  arrange(Parameter, desc(Result)) %>% 
  print(n = 40)
## # A tibble: 37 × 9
##    Station  Date       Month YearAdj Parameter Sign  Result Quant_75 HighRL_flag
##    <chr>    <date>     <dbl>   <dbl> <chr>     <chr>  <dbl>    <dbl> <lgl>      
##  1 D28A     2020-01-15     1    2020 DissAmmo… <       0.25    0.1   TRUE       
##  2 D16      2020-01-16     1    2020 DissAmmo… <       0.25    0.1   TRUE       
##  3 MD10-MD… 2020-01-16     1    2020 DissAmmo… <       0.25    0.1   TRUE       
##  4 D16      2020-02-14     2    2020 DissAmmo… <       0.25    0.1   TRUE       
##  5 MD10-MD… 2020-02-14     2    2020 DissAmmo… <       0.25    0.1   TRUE       
##  6 P8       2020-02-14     2    2020 DissAmmo… <       0.25    0.1   TRUE       
##  7 D28A     2020-03-02     3    2020 DissAmmo… <       0.25    0.1   TRUE       
##  8 MD10-MD… 2020-03-03     3    2020 DissAmmo… <       0.25    0.1   TRUE       
##  9 D41-D42  2020-03-09     3    2020 DissAmmo… <       0.25    0.1   TRUE       
## 10 D12      2019-10-03    10    2019 DissAmmo… <       0.2     0.1   TRUE       
## 11 D28A     2019-10-03    10    2019 DissAmmo… <       0.2     0.1   TRUE       
## 12 D16      2019-10-04    10    2019 DissAmmo… <       0.2     0.1   TRUE       
## 13 D26      2019-10-04    10    2019 DissAmmo… <       0.2     0.1   TRUE       
## 14 MD10-MD… 2019-10-04    10    2019 DissAmmo… <       0.2     0.1   TRUE       
## 15 P8       2019-10-04    10    2019 DissAmmo… <       0.2     0.1   TRUE       
## 16 D10      2019-10-07    10    2019 DissAmmo… <       0.2     0.1   TRUE       
## 17 D6       2019-10-07    10    2019 DissAmmo… <       0.2     0.1   TRUE       
## 18 D8       2019-10-07    10    2019 DissAmmo… <       0.2     0.1   TRUE       
## 19 D41-D42  2019-10-09    10    2019 DissAmmo… <       0.2     0.1   TRUE       
## 20 D12      2019-11-04    11    2019 DissAmmo… <       0.2     0.1   TRUE       
## 21 D28A     2019-11-04    11    2019 DissAmmo… <       0.2     0.1   TRUE       
## 22 D16      2019-11-05    11    2019 DissAmmo… <       0.2     0.1   TRUE       
## 23 D26      2019-11-05    11    2019 DissAmmo… <       0.2     0.1   TRUE       
## 24 MD10-MD… 2019-11-05    11    2019 DissAmmo… <       0.2     0.1   TRUE       
## 25 P8       2019-11-05    11    2019 DissAmmo… <       0.2     0.1   TRUE       
## 26 D8       2019-11-06    11    2019 DissAmmo… <       0.2     0.1   TRUE       
## 27 D7       2019-11-07    11    2019 DissAmmo… <       0.2     0.1   TRUE       
## 28 D16      2019-12-04    12    2020 DissAmmo… <       0.2     0.1   TRUE       
## 29 D41-D42  2019-10-09    10    2019 DissNitr… <       5.5     0.47  TRUE       
## 30 D41-D42  2019-11-12    11    2019 DissNitr… <       5.5     0.47  TRUE       
## 31 D6       2019-10-07    10    2019 DissNitr… <       2.8     0.47  TRUE       
## 32 D7       2019-10-08    10    2019 DissNitr… <       2.8     0.47  TRUE       
## 33 D7       2019-11-07    11    2019 DissNitr… <       2.8     0.47  TRUE       
## 34 D8       2019-10-07    10    2019 DissNitr… <       1.1     0.47  TRUE       
## 35 D10      2019-10-07    10    2019 DissNitr… <       0.55    0.47  TRUE       
## 36 D7       2019-11-07    11    2019 DissOrth… <       2       0.095 TRUE       
## 37 D4       2019-11-06    11    2019 DissOrth… <       0.4     0.095 TRUE
# View range of values for each parameter
df_emp_wq_f_nutr %>% 
  summarize(
    min_val = min(Result),
    first_quantile = quantile(Result, probs = 0.25),
    median = median(Result),
    third_quantile = quantile(Result, probs = 0.75),
    max_val = max(Result),
    .by = Parameter
  ) %>% 
  arrange(Parameter)
## # A tibble: 3 × 6
##   Parameter          min_val first_quantile median third_quantile max_val
##   <chr>                <dbl>          <dbl>  <dbl>          <dbl>   <dbl>
## 1 DissAmmonia           0.01          0.03    0.06          0.1      2.94
## 2 DissNitrateNitrite    0.01          0.22    0.33          0.47    15.2 
## 3 DissOrthophos         0.01          0.053   0.07          0.095    2

Upon closer inspection, all 37 values that are less than the reporting limit with reporting limits that are greater than the 75th percentile of the values for the parameter should be removed from the data set.

df_emp_wq_f_nutr_c1 <- df_nutr_high_rl_flag %>% 
  filter(!HighRL_flag) %>% 
  select(-c(Quant_75, HighRL_flag))

Boxplots

Next, let’s make some basic boxplots to look for outliers.

ndf_emp_wq_bplt <- df_emp_wq_f_nutr_c1 %>% 
  # Remove nutrient data collected at D10, D12, D16, and D22 because of 
    # inadequate sampling
  filter(!Station %in% c("D10", "D12", "D16", "D22")) %>% 
  bind_rows(df_emp_wq_f_wqmeas) %>% 
  nest(.by = Parameter, .key = "df_data") %>% 
  left_join(df_emp_wq_param, by = join_by(Parameter)) %>% 
  mutate(plt_boxplot = map(df_data, boxplot_param)) %>% 
  arrange(Parameter)

Dissolved Ammonia

Dissolved Nitrate + Nitrite

Dissolved Ortho-phosphate

Secchi depth

Specific Conductance (surface)

Water Temperature (surface)

Flag Outliers

There are a few outliers that stand out in the boxplots that need to be investigated further:

  • Specific Conductance at D12 and D22
  • Dissolved Ammonia at D28A
  • Dissolved Nitrate + Nitrite at D28A and D4

Now we’ll look for outliers more objectively by using Z-score tests to flag data points to investigate further. For the water quality measurement parameters (Secchi depth, Specific Conductance, and Water Temperature), we’ll use a standard Z-score test to flag data more than 10 SDs away from the mean of each station. For the nutrient parameters, we’ll use the modified Z-score test with the same threshold. We used the modified test for nutrients because its more resistant to censored data.

# Start with the Z-score test on WQ measurement parameters
df_wqmeas_flag <- df_emp_wq_f_wqmeas %>% 
  group_by(Parameter, Station) %>% 
  mutate(
    tmp_mean = mean(Result),
    tmp_sd = sd(Result),
    Zscore = if_else(tmp_sd == 0, NA_real_, abs((Result - tmp_mean) / tmp_sd)),
    Zscore_flag = case_when(Zscore > 10 ~ TRUE, .default = FALSE)
  ) %>% 
  ungroup() %>% 
  select(!starts_with("tmp_"))

# View flagged data points
df_wqmeas_flag %>% filter(Zscore_flag) %>% arrange(Parameter, desc(Zscore))
## # A tibble: 3 × 8
##   Station Date       Month YearAdj Parameter    Result Zscore Zscore_flag
##   <chr>   <date>     <dbl>   <dbl> <chr>         <dbl>  <dbl> <lgl>      
## 1 D22     2000-12-06    12    2001 SpCndSurface  35580   15.3 TRUE       
## 2 D12     1999-07-12     7    1999 SpCndSurface  39934   11.9 TRUE       
## 3 D16     1994-09-27     9    1994 SpCndSurface   6982   10.2 TRUE
# Plot specific conductance at all stations from adjusted year 1999-2001 to look
  # at flagged values from D12 and D22
df_emp_wq_f_wqmeas %>% 
  filter(
    Parameter == "SpCndSurface",
    YearAdj %in% 1999:2001
  ) %>% 
  ggplot(aes(x = Date, y = Result, color = Station)) +
  geom_line() +
  geom_point() +
  scale_color_viridis_d(option = "plasma") +
  theme_bw()

# Plot specific conductance at all stations for adjusted year 1994 to look at
  # flagged value from D16
df_emp_wq_f_wqmeas %>% 
  filter(
    Parameter == "SpCndSurface",
    YearAdj == 1994
  ) %>% 
  ggplot(aes(x = Date, y = Result, color = Station)) +
  geom_line() +
  geom_point() +
  scale_color_viridis_d(option = "plasma") +
  theme_bw()

The flagged specific conductance values at D12 and D22 look suspicious enough to omit from the data set; however, while the flagged value at D16 looks questionable, we’ll leave it in the data set.

# Now, let's run the modified Z-score test on the nutrient parameters
df_nutr_modzscore_flag <- df_emp_wq_f_nutr_c1 %>% 
  group_by(Parameter, Station) %>% 
  mutate(
    tmp_median = median(Result),
    tmp_mad = mad(Result),
    ModZscore = if_else(tmp_mad == 0, NA_real_, abs(0.6745 * (Result - tmp_median) / tmp_mad)),
    ModZscore_flag = case_when(ModZscore > 15 ~ TRUE, .default = FALSE)
  ) %>% 
  ungroup() %>% 
  select(!starts_with("tmp_"))

# View flagged data points
df_nutr_modzscore_flag %>% filter(ModZscore_flag) %>% arrange(Parameter, desc(ModZscore))
## # A tibble: 5 × 9
##   Station Date       Month YearAdj Parameter          Sign  Result ModZscore
##   <chr>   <date>     <dbl>   <dbl> <chr>              <chr>  <dbl>     <dbl>
## 1 D28A    1996-01-25     1    1996 DissAmmonia        =       1         22.1
## 2 P8      2004-02-26     2    2004 DissAmmonia        =       2.94      18.4
## 3 D28A    2017-07-14     7    2017 DissNitrateNitrite =      15.2       45.2
## 4 D4      2019-01-10     1    2019 DissNitrateNitrite =       6.07      26.2
## 5 D4      2017-06-14     6    2017 DissNitrateNitrite =       3.68      15.3
## # ℹ 1 more variable: ModZscore_flag <lgl>
# Plot dissolved ammonia at all stations from adjusted years 1995-1996 to look
  # at flagged value from D28A
df_emp_wq_f_nutr_c1 %>% 
  filter(
    Parameter == "DissAmmonia",
    YearAdj %in% 1995:1996
  ) %>% 
  ggplot(aes(x = Date, y = Result, color = Station)) +
  geom_line() +
  geom_point() +
  scale_color_viridis_d(option = "plasma") +
  theme_bw()

# Plot dissolved ammonia at all stations from adjusted years 2003-2004 to look
  # at flagged value from P8
df_emp_wq_f_nutr_c1 %>% 
  filter(
    Parameter == "DissAmmonia",
    YearAdj %in% 2003:2004
  ) %>% 
  ggplot(aes(x = Date, y = Result, color = Station)) +
  geom_line() +
  geom_point() +
  scale_color_viridis_d(option = "plasma") +
  theme_bw()

# Plot dissolved nitrate + nitrite at all stations from adjusted years 2017-2019
  # to look at flagged values from D28A and D4
df_emp_wq_f_nutr_c1 %>% 
  filter(
    Parameter == "DissNitrateNitrite",
    YearAdj %in% 2017:2019
  ) %>% 
  ggplot(aes(x = Date, y = Result, color = Station)) +
  geom_line() +
  geom_point() +
  scale_color_viridis_d(option = "plasma") +
  theme_bw()

After inspecting the data flagged by the modified Z-score test, the dissolved ammonia values appear to be valid based on best professional judgment, so we will only exclude the dissolved nitrate + nitrite values flagged by the modified Z-score test.

Remove Outliers

Now we’ll remove the outliers we determined as bad data.

# Un-flag the specific conductance value at D16 and remove the remaining flagged
  # data points from the data set
df_emp_wq_f_wqmeas_c <- df_wqmeas_flag %>% 
  mutate(
    Zscore_flag = if_else(
      Parameter == "SpCndSurface" & Station == "D16" & Date == "1994-09-27",
      FALSE,
      Zscore_flag
    )
  ) %>% 
  filter(!Zscore_flag)

# Un-flag the DissAmmonia values and remove the DissNitrateNitrite flagged data
  # points from the data set
df_emp_wq_f_nutr_c2 <- df_nutr_modzscore_flag %>%
  mutate(ModZscore_flag = if_else(Parameter == "DissAmmonia", FALSE, ModZscore_flag)) %>% 
  filter(!ModZscore_flag)

# Recombine WQ measurement and nutrient data now that all outliers have been
  # removed
df_emp_wq_f_long <- df_emp_wq_f_nutr_c2 %>% 
  # Remove nutrient data collected at D10, D12, D16, and D22 because of 
    # inadequate sampling
  filter(!Station %in% c("D10", "D12", "D16", "D22")) %>% 
  bind_rows(df_emp_wq_f_wqmeas_c) %>% 
  select(!contains("Zscore"))

Sampling Effort after removing outliers

Let’s take one more look at heatmaps of monthly sampling effort for each parameter after removing outliers to assess the final data before averaging.

# Create nested dataframe and generate heatmaps for each parameter
ndf_emp_wq_sampeff_f2 <- df_emp_wq_f_long %>% 
  nest(.by = Parameter, .key = "df_data") %>% 
  left_join(df_emp_wq_param, by = join_by(Parameter)) %>% 
  mutate(
    plt_samp_effort = map(
      df_data,
      ~ distinct(.x, Station, Month, YearAdj) %>% 
        count(Station, YearAdj, name = "num_samples") %>%
        heatmap_sampeff()
    )
  ) %>% 
  arrange(Parameter)

Dissolved Ammonia

Dissolved Nitrate + Nitrite

Dissolved Ortho-phosphate

Secchi depth

Specific Conductance (surface)

Water Temperature (surface)

Conclusions and Next Steps

  • The fish sampling locations extend further west than the stations for benthic and zooplankton. We’ll create two versions of the EMP water quality data to be included in the environmental predictors data set - one for fish and another for benthic and zooplankton. For fish, we’ll include all EMP WQ stations after spatial and temporal filtering. For benthic and zooplankton, we’ll exclude stations D41-D42 and D6 which are outside of the spatial range for sampling for these two taxonomic groups.
  • We’ll include the following parameters in the environmental predictors data set: water temperature (surface), specific conductance (surface), Secchi depth, dissolved ammonia, dissolved nitrate + nitrite, and dissolved ortho-phosphate. Unfortunately, nutrient data wasn’t collected at D10, D12, D16, and D22 from 1996-2016, so we won’t include these stations in the averages for the nutrient parameters. Also, EMP didn’t start collecting dissolved ammonia until 1979.