Process Input Data for Models

Author

Dave Bosworth

Published

December 31, 2025

Purpose

Prepare input data for models used in the DWR nutrient synthesis project.

Global Code

Code
# Load packages
library(tidyverse)
library(readxl)
library(rlang)
library(glue)
library(hms)
library(scales)
library(sf)
library(mapview)
library(EDIutils)
library(fs)
library(here)
library(conflicted)

# Declare package conflict preferences
conflicts_prefer(dplyr::filter(), hms::hms())
Code
# Run session info to display package versions
devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.5.2 (2025-10-31 ucrt)
 os       Windows 11 x64 (build 26100)
 system   x86_64, mingw32
 ui       RTerm
 language (EN)
 collate  English_United States.utf8
 ctype    English_United States.utf8
 tz       America/Los_Angeles
 date     2025-12-31
 pandoc   3.6.3 @ c:\\Users\\dboswort\\AppData\\Local\\Programs\\Positron\\resources\\app\\quarto\\bin\\tools/ (via rmarkdown)
 quarto   NA @ C:\\PROGRA~1\\Quarto\\bin\\quarto.exe

─ Packages ───────────────────────────────────────────────────────────────────
 package      * version date (UTC) lib source
 base64enc      0.1-3   2015-07-28 [1] CRAN (R 4.5.0)
 cachem         1.1.0   2024-05-16 [1] CRAN (R 4.5.0)
 cellranger     1.1.0   2016-07-27 [1] CRAN (R 4.5.0)
 class          7.3-23  2025-01-01 [2] CRAN (R 4.5.2)
 classInt       0.4-11  2025-01-08 [1] CRAN (R 4.5.0)
 cli            3.6.5   2025-04-23 [1] CRAN (R 4.5.0)
 codetools      0.2-20  2024-03-31 [2] CRAN (R 4.5.2)
 conflicted   * 1.2.0   2023-02-01 [1] CRAN (R 4.5.0)
 crosstalk      1.2.1   2023-11-23 [1] CRAN (R 4.5.0)
 DBI            1.2.3   2024-06-02 [1] CRAN (R 4.5.0)
 devtools       2.4.5   2022-10-11 [1] CRAN (R 4.5.0)
 digest         0.6.37  2024-08-19 [1] CRAN (R 4.5.0)
 dplyr        * 1.1.4   2023-11-17 [1] CRAN (R 4.5.0)
 e1071          1.7-16  2024-09-16 [1] CRAN (R 4.5.0)
 EDIutils     * 1.0.3   2023-10-10 [1] CRAN (R 4.5.0)
 ellipsis       0.3.2   2021-04-29 [1] CRAN (R 4.5.0)
 evaluate       1.0.5   2025-08-27 [1] CRAN (R 4.5.1)
 farver         2.1.2   2024-05-13 [1] CRAN (R 4.5.0)
 fastmap        1.2.0   2024-05-15 [1] CRAN (R 4.5.0)
 forcats      * 1.0.0   2023-01-29 [1] CRAN (R 4.5.0)
 fs           * 1.6.6   2025-04-12 [1] CRAN (R 4.5.0)
 generics       0.1.4   2025-05-09 [1] CRAN (R 4.5.0)
 ggplot2      * 4.0.0   2025-09-11 [1] CRAN (R 4.5.1)
 glue         * 1.8.0   2024-09-30 [1] CRAN (R 4.5.0)
 gtable         0.3.6   2024-10-25 [1] CRAN (R 4.5.0)
 here         * 1.0.1   2020-12-13 [1] CRAN (R 4.5.0)
 hms          * 1.1.3   2023-03-21 [1] CRAN (R 4.5.0)
 htmltools      0.5.8.1 2024-04-04 [1] CRAN (R 4.5.0)
 htmlwidgets    1.6.4   2023-12-06 [1] CRAN (R 4.5.0)
 httpuv         1.6.16  2025-04-16 [1] CRAN (R 4.5.0)
 jsonlite       2.0.0   2025-03-27 [1] CRAN (R 4.5.0)
 KernSmooth     2.23-26 2025-01-01 [2] CRAN (R 4.5.2)
 knitr          1.50    2025-03-16 [1] CRAN (R 4.5.0)
 later          1.4.2   2025-04-08 [1] CRAN (R 4.5.0)
 lattice        0.22-7  2025-04-02 [1] CRAN (R 4.5.0)
 leafem         0.2.4   2025-05-01 [1] CRAN (R 4.5.0)
 leaflet        2.2.2   2024-03-26 [1] CRAN (R 4.5.0)
 lifecycle      1.0.4   2023-11-07 [1] CRAN (R 4.5.0)
 lubridate    * 1.9.4   2024-12-08 [1] CRAN (R 4.5.0)
 magrittr       2.0.3   2022-03-30 [1] CRAN (R 4.5.0)
 mapview      * 2.11.2  2023-10-13 [1] CRAN (R 4.5.0)
 memoise        2.0.1   2021-11-26 [1] CRAN (R 4.5.0)
 mime           0.13    2025-03-17 [1] CRAN (R 4.5.0)
 miniUI         0.1.2   2025-04-17 [1] CRAN (R 4.5.0)
 pillar         1.11.1  2025-09-17 [1] CRAN (R 4.5.1)
 pkgbuild       1.4.8   2025-05-26 [1] CRAN (R 4.5.0)
 pkgconfig      2.0.3   2019-09-22 [1] CRAN (R 4.5.0)
 pkgload        1.4.0   2024-06-28 [1] CRAN (R 4.5.0)
 png            0.1-8   2022-11-29 [1] CRAN (R 4.5.0)
 profvis        0.4.0   2024-09-20 [1] CRAN (R 4.5.0)
 promises       1.3.2   2024-11-28 [1] CRAN (R 4.5.0)
 proxy          0.4-27  2022-06-09 [1] CRAN (R 4.5.0)
 purrr        * 1.1.0   2025-07-10 [1] CRAN (R 4.5.1)
 R6             2.6.1   2025-02-15 [1] CRAN (R 4.5.0)
 raster         3.6-32  2025-03-28 [1] CRAN (R 4.5.0)
 RColorBrewer   1.1-3   2022-04-03 [1] CRAN (R 4.5.0)
 Rcpp           1.0.14  2025-01-12 [1] CRAN (R 4.5.0)
 readr        * 2.1.5   2024-01-10 [1] CRAN (R 4.5.0)
 readxl       * 1.4.5   2025-03-07 [1] CRAN (R 4.5.0)
 remotes        2.5.0   2024-03-17 [1] CRAN (R 4.5.0)
 rlang        * 1.1.6   2025-04-11 [1] CRAN (R 4.5.0)
 rmarkdown      2.29    2024-11-04 [1] CRAN (R 4.5.0)
 rprojroot      2.1.1   2025-08-26 [1] CRAN (R 4.5.1)
 S7             0.2.0   2024-11-07 [1] CRAN (R 4.5.1)
 satellite      1.0.5   2024-02-10 [1] CRAN (R 4.5.0)
 scales       * 1.4.0   2025-04-24 [1] CRAN (R 4.5.0)
 sessioninfo    1.2.3   2025-02-05 [1] CRAN (R 4.5.0)
 sf           * 1.0-21  2025-05-15 [1] CRAN (R 4.5.0)
 shiny          1.10.0  2024-12-14 [1] CRAN (R 4.5.0)
 sp             2.2-0   2025-02-01 [1] CRAN (R 4.5.0)
 stringi        1.8.7   2025-03-27 [1] CRAN (R 4.5.0)
 stringr      * 1.5.2   2025-09-08 [1] CRAN (R 4.5.1)
 terra          1.8-42  2025-04-02 [1] CRAN (R 4.5.0)
 tibble       * 3.3.0   2025-06-08 [1] CRAN (R 4.5.0)
 tidyr        * 1.3.1   2024-01-24 [1] CRAN (R 4.5.0)
 tidyselect     1.2.1   2024-03-11 [1] CRAN (R 4.5.0)
 tidyverse    * 2.0.0   2023-02-22 [1] CRAN (R 4.5.1)
 timechange     0.3.0   2024-01-18 [1] CRAN (R 4.5.0)
 tzdb           0.5.0   2025-03-15 [1] CRAN (R 4.5.0)
 units          0.8-7   2025-03-11 [1] CRAN (R 4.5.0)
 urlchecker     1.0.1   2021-11-30 [1] CRAN (R 4.5.0)
 usethis        3.2.1   2025-09-06 [1] CRAN (R 4.5.1)
 vctrs          0.6.5   2023-12-01 [1] CRAN (R 4.5.0)
 withr          3.0.2   2024-10-28 [1] CRAN (R 4.5.0)
 xfun           0.53    2025-08-19 [1] CRAN (R 4.5.1)
 xtable         1.8-4   2019-04-21 [1] CRAN (R 4.5.0)
 yaml           2.3.10  2024-07-26 [1] CRAN (R 4.5.0)

 [1] C:/R/win-library/4.5
 [2] C:/Program Files/R/R-4.5.2/library
 * ── Packages attached to the search path.

──────────────────────────────────────────────────────────────────────────────

Create functions used in this document:

Code
# Get data entity names for specified EDI ID
get_edi_data_entities <- function(edi_id) {
  df_data_ent <- read_data_entity_names(edi_id)
  inform(c(
    "i" = paste0(
      "Data entities for ",
      edi_id,
      " include:\n",
      paste(df_data_ent$entityName, collapse = "\n"),
      "\n"
    )
  ))
  return(df_data_ent$entityName)
}

# Download specified data entities from an EDI package and save them as raw binary
# files in a temporary directory
get_edi_data <- function(edi_id, entity_names) {
  df_data_ent <- read_data_entity_names(edi_id)
  df_data_ent_filt <- df_data_ent %>% filter(entityName %in% entity_names)

  ls_data_raw <-
    map(df_data_ent_filt$entityId, \(x) {
      read_data_entity(edi_id, entityId = x)
    }) %>%
    set_names(df_data_ent_filt$entityName)

  temp_dir <- tempdir()
  for (i in seq_along(ls_data_raw)) {
    file_raw <- file.path(temp_dir, glue("{names(ls_data_raw)[i]}.bin"))
    con <- file(file_raw, "wb")
    writeBin(ls_data_raw[[i]], con)
    close(con)
  }
}

# Replace values below the reporting limit with simulated values between `min_val`
# and the RL
replace_blw_rl <- function(df, min_val = 0, seed = 1) {
  # Pull out values that are below the RL
  df_blw_rl <- df %>% filter(Sign == "<")

  # Replace below RL values with simulated ones
  df_blw_rl_sim <- withr::with_seed(
    # Set seed for reproducibility
    seed = seed,
    df_blw_rl %>%
      mutate(
        Result = round(runif(nrow(df_blw_rl), min = min_val, max = Result), 6)
      )
  )

  # Add simulated values back to main data frame
  df %>% filter(Sign != "<") %>% bind_rows(df_blw_rl_sim)
}

# Add variable for season as a factor
add_season <- function(df) {
  df %>%
    mutate(
      Season = case_match(
        Month,
        3:5 ~ "Spring",
        6:8 ~ "Summer",
        9:11 ~ "Fall",
        c(12, 1, 2) ~ "Winter"
      ),
      Season = factor(Season, levels = c("Winter", "Spring", "Summer", "Fall"))
    )
}
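
These helpers can be sanity-checked with a few made-up rows. Below is a minimal sketch (the toy values are hypothetical, not project data) showing `replace_blw_rl()` substituting simulated values for two below-RL results and `add_season()` assigning seasons to numeric months:

Code
# Two values below a reporting limit of 0.05 are replaced with random values
# between 0 and the RL; the "=" row passes through unchanged
tibble(
  Sign = c("=", "<", "<"),
  Result = c(0.12, 0.05, 0.05)
) %>%
  replace_blw_rl()

# Months 12, 1, and 2 map to Winter; 3-5 Spring; 6-8 Summer; 9-11 Fall
tibble(Month = c(1, 4, 7, 10, 12)) %>%
  add_season()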

Load globally-used data:

Code
# Load regions shapefile
sf_delta <- read_sf(here("data/spatial/delta_subregions.shp"))

Regions Map

Create a map showing the SubRegions and Regions used to process the data.

Code
mapview(sf_delta, zcol = "Region", alpha.regions = 0.5)

DWR Integrated Discrete Nutrient Data

Import Data

Code
# Import nutrient data set
df_nutr <- readRDS(here("data/processed/nutrient_data_discrete.rds"))

# Import phytoplankton data from FRP's EDI publication to assign lat-long coordinates
# to FRP's nutrient data in the DWR integrated discrete nutrient data set
edi_id_frp <- "edi.269.5"
edi_data_ent_frp <- get_edi_data_entities(edi_id_frp)
edi_data_ent_frp_phyto <- str_subset(edi_data_ent_frp, "^phyto")
get_edi_data(edi_id_frp, edi_data_ent_frp_phyto)
temp_files <- list.files(tempdir(), full.names = TRUE)

df_frp_phyto_coord <-
  read_csv(str_subset(temp_files, edi_data_ent_frp_phyto)) %>%
  drop_na(LatitudeStart, LongitudeStart) %>%
  summarize(
    Latitude = mean(LatitudeStart),
    Longitude = mean(LongitudeStart),
    .by = Location
  )

Prepare Data

Code
# Prepare data for aggregation
df_nutr_c1 <- df_nutr %>%
  # Add lat-long coordinates for FRP stations from the phytoplankton data set
  filter(Project == "Fish Restoration Program") %>%
  select(-c(Latitude, Longitude)) %>%
  left_join(df_frp_phyto_coord, by = join_by(Station_Name == Location)) %>%
  bind_rows(df_nutr %>% filter(Project != "Fish Restoration Program")) %>%
  # Remove records without lat-long coordinates
  drop_na(Latitude, Longitude) %>%
  # Assign SubRegions to the stations
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>%
  st_transform(crs = st_crs(sf_delta)) %>%
  st_join(sf_delta, join = st_intersects) %>%
  # Remove any data outside our subregions of interest
  filter(!is.na(SubRegion)) %>%
  st_drop_geometry()

Remove various duplicated records so that there is only one sample per station-day.

Code
# Clean up duplicated records that share the same Date_Time - it might be more
# appropriate to address this in the nutrient data integration code. TODO: talk to
# Morgan about this
df_nutr_dt_dups <- df_nutr_c1 %>%
  add_count(Project, Station_Name, Date_Time, Analyte) %>%
  filter(n > 1) %>%
  select(-n) %>%
  drop_na(Date_Time)

# Keep the records with the "earliest" Sample_Code
df_nutr_dt_dups_fixed <- df_nutr_dt_dups %>%
  arrange(Project, Station_Name, Date_Time, Analyte, Sample_Code) %>%
  distinct(Project, Station_Name, Date_Time, Analyte, .keep_all = TRUE)

df_nutr_c2 <- df_nutr_c1 %>%
  anti_join(df_nutr_dt_dups) %>%
  bind_rows(df_nutr_dt_dups_fixed)
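
This remove-and-splice pattern - pull out the duplicated rows, resolve them, `anti_join()` them out of the main table, and `bind_rows()` the resolved rows back in - is reused for each de-duplication step below. Note that `anti_join()` without a `by` argument matches on all shared columns, which is the intent here. A toy sketch with hypothetical data:

Code
# Station "A" has two rows for the same day; keep only the first
df_toy <- tibble(
  Station = c("A", "A", "B"),
  Date = c(1, 1, 1),
  Result = c(10, 12, 20)
)

df_toy_dups <- df_toy %>%
  add_count(Station, Date) %>%
  filter(n > 1) %>%
  select(-n)

df_toy_dups_fixed <- df_toy_dups %>%
  distinct(Station, Date, .keep_all = TRUE)

# anti_join() drops both duplicated rows, then the single resolved row is
# spliced back in
df_toy %>%
  anti_join(df_toy_dups) %>%
  bind_rows(df_toy_dups_fixed)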
Code
# Clean up duplicated records that share the same Date and don't have a collection time
df_nutr_dups_nodt <- df_nutr_c2 %>%
  add_count(Project, Station_Name, Date, Analyte) %>%
  filter(n > 1, is.na(Date_Time)) %>%
  select(-n)

# Keep the records with the "earliest" Sample_Code
df_nutr_dups_nodt_fixed <- df_nutr_dups_nodt %>%
  arrange(Project, Station_Name, Date, Analyte, Sample_Code) %>%
  distinct(Project, Station_Name, Date, Analyte, .keep_all = TRUE)

df_nutr_c3 <- df_nutr_c2 %>%
  anti_join(df_nutr_dups_nodt) %>%
  bind_rows(df_nutr_dups_nodt_fixed)
Code
# Filter data so that there is only one sample per station-day by choosing the data
# point closest to noon
df_nutr_daily_dups <- df_nutr_c3 %>%
  add_count(Project, Station_Name, Date, Analyte) %>%
  filter(n > 1) %>%
  select(-n)

df_nutr_daily_dups_fixed <- df_nutr_daily_dups %>%
  # Create variable for time and calculate difference from noon for each data point
  mutate(
    Time = as_hms(Date_Time),
    Noon_diff = abs(hms(hours = 12) - Time)
  ) %>%
  group_by(Project, Station_Name, Date, Analyte) %>%
  # Select only 1 data point per Station, Date, and Analyte, choose data closest to
  # noon
  filter(Noon_diff == min(Noon_diff)) %>%
  # When points are equidistant from noon, select earlier point
  filter(Time == min(Time)) %>%
  ungroup() %>%
  select(-c(Time, Noon_diff))

df_nutr_c4 <- df_nutr_c3 %>%
  anti_join(df_nutr_daily_dups) %>%
  bind_rows(df_nutr_daily_dups_fixed)
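
The closest-to-noon selection can be checked on its own with a few hypothetical timestamps:

Code
# Of three samples on the same day, the 11:30 sample is closest to noon
tibble(
  Date_Time = ymd_hm("2020-06-01 08:00", "2020-06-01 11:30", "2020-06-01 16:00")
) %>%
  mutate(
    Time = as_hms(Date_Time),
    Noon_diff = abs(hms(hours = 12) - Time)
  ) %>%
  filter(Noon_diff == min(Noon_diff))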

Replace values below the reporting limit with simulated values, after removing censored values whose reporting limit exceeds the 95th percentile of the results for the parameter.

Code
# Replace values below the reporting limit with simulated values
df_nutr_c5 <- df_nutr_c4 %>%
  mutate(
    Sign = if_else(Detection_Condition == "Not Detected", "<", "="),
    Result = if_else(
      Detection_Condition == "Not Detected",
      Reporting_Limit,
      Result
    ),
    .keep = "unused"
  ) %>%
  # Omit <RL values with reporting limits that are greater than the 95th percentile
  # of the values for the parameter
  mutate(q95 = quantile(Result, probs = 0.95), .by = Analyte) %>%
  filter(!(Sign == "<" & Result > q95)) %>%
  select(-q95) %>%
  replace_blw_rl()

Aggregate Data

Code
# Calculate Monthly-Regional averages
df_nutr_avg_mo <- df_nutr_c5 %>%
  mutate(
    Year = year(Date),
    Month = month(Date)
  ) %>%
  summarize(
    Result = mean(Result),
    Num_samples = n(),
    .by = c(Year, Month, Region, Analyte)
  )
Code
# Further calculate Seasonal-Regional averages using Adjusted calendar year:
# December-November, with December of the previous calendar year included with the
# following year
df_nutr_avg_seas <- df_nutr_avg_mo %>%
  mutate(YearAdj = if_else(Month == 12, Year + 1, Year)) %>%
  add_season() %>%
  summarize(
    Result = mean(Result),
    Num_samples = sum(Num_samples),
    .by = c(YearAdj, Season, Region, Analyte)
  )
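
To make the adjusted calendar year concrete: December is rolled forward into the following year so that each Winter (December-February) falls within a single `YearAdj`. A quick sketch:

Code
# December 2020 is grouped with January and February 2021 as Winter 2021
tibble(Year = c(2020, 2021, 2021), Month = c(12, 1, 2)) %>%
  mutate(YearAdj = if_else(Month == 12, Year + 1, Year)) %>%
  add_season()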
Code
# Convert both sets of averages to wide format
ls_nutr_avg_wide <- lst(df_nutr_avg_mo, df_nutr_avg_seas) %>%
  map(
    \(x) {
      mutate(
        x,
        Analyte = case_match(
          Analyte,
          "Dissolved Ammonia" ~ "DissAmmonia",
          "Dissolved Nitrate" ~ "DissNitrate",
          "Dissolved Nitrate + Nitrite" ~ "DissNitrateNitrite",
          "Dissolved Organic Nitrogen" ~ "DON",
          "Dissolved ortho-Phosphate" ~ "DissOrthophos",
          "Total Kjeldahl Nitrogen" ~ "TKN",
          "Total Phosphorus" ~ "TotPhos"
        )
      ) %>%
        arrange(Analyte, .locale = "en") %>%
        select(-Num_samples) %>%
        pivot_wider(names_from = Analyte, values_from = Result)
    }
  )

Sampling Effort Plots

Monthly Values

Code
ls_nutr_avg_mo_plt <- df_nutr_avg_mo %>%
  mutate(Month = month(Month, label = TRUE)) %>%
  complete(nesting(Year, Month), Region, Analyte) %>%
  nest(.by = Analyte) %>%
  deframe() %>%
  imap(
    \(x, idx) {
      ggplot(data = x, aes(x = Year, y = Region, fill = Num_samples)) +
        geom_tile() +
        facet_wrap(vars(Month)) +
        ggtitle(idx) +
        scale_y_discrete(drop = FALSE) +
        scale_x_continuous(
          breaks = breaks_pretty(10),
          expand = expansion(mult = 0.02)
        ) +
        scale_fill_viridis_c(
          name = "Number of\nSamples",
          na.value = "transparent"
        ) +
        theme_bw() +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
    }
  )
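
In the code above, `complete()` with `nesting()` pads the data with NA rows for every Region missing from an observed Year-Month combination, so gaps in sampling effort render as blank tiles instead of being dropped. A small sketch with hypothetical values:

Code
# Only observed Year-Month pairs are kept (nesting), but every Region is
# filled in, with Num_samples set to NA for the added rows
tibble(
  Year = c(2020, 2020),
  Month = c(1, 2),
  Region = c("North", "South"),
  Num_samples = c(5, 3)
) %>%
  complete(nesting(Year, Month), Region)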

Seasonal Values

Code
df_nutr_avg_seas %>%
  ggplot(aes(x = YearAdj, y = Region, fill = Num_samples)) +
  geom_tile() +
  facet_grid(
    rows = vars(Analyte),
    cols = vars(Season),
    labeller = label_wrap_gen(width = 20)
  ) +
  scale_x_continuous(
    breaks = breaks_pretty(10),
    expand = expansion(mult = 0.02)
  ) +
  scale_fill_viridis_c(name = "Number of\nSamples") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Integrated Discrete WQ Data from discretewq

Import Data

Code
# Import integrated water quality data set from discretewq
df_dwq <- readRDS(here("data/external/discretewq_2009_2024.rds"))

Prepare Data

Code
# Define water quality parameters to keep from discretewq
dwq_param <- c(
  "Microcystis",
  "Temperature",
  "Salinity",
  "DissolvedOxygen",
  "pH",
  "TurbidityNTU",
  "TurbidityFNU",
  "Chlorophyll"
)

# Prepare data for aggregation
df_dwq_c1 <- df_dwq %>%
  select(
    Source,
    Station,
    Latitude,
    Longitude,
    Date,
    Datetime,
    Year,
    Month,
    all_of(dwq_param),
    Chlorophyll_Sign
  ) %>%
  # Convert Datetime to PST
  mutate(Datetime = with_tz(Datetime, tzone = "Etc/GMT+8")) %>%
  # Remove rows where all parameters are NA
  filter(!if_all(all_of(dwq_param), is.na)) %>%
  # Remove records without lat-long coordinates
  drop_na(Latitude, Longitude) %>%
  # Assign SubRegions to the stations
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>%
  st_transform(crs = st_crs(sf_delta)) %>%
  st_join(sf_delta, join = st_intersects) %>%
  # Remove any data outside our subregions of interest
  filter(!is.na(SubRegion)) %>%
  st_drop_geometry() %>%
  # Only keep the stations located in Suisun, Montezuma, and Nurse Sloughs from the
  # Suisun Marsh survey
  filter(!(Source == "Suisun" & !str_detect(Station, "^SU|^MZ|^NS")))

# Pivot the parameter columns to long data structure
df_dwq_c2 <- df_dwq_c1 %>%
  # Add a value suffix to the parameter columns
  rename_with(\(x) paste0(x, "_Result"), all_of(dwq_param)) %>%
  # Pivot parameters longer, creating a sign and result column for each parameter
  pivot_longer(
    cols = ends_with(c("Sign", "Result")),
    names_to = c("Parameter", ".value"),
    names_pattern = "(.*)_(.*)"
  ) %>%
  drop_na(Result) %>%
  replace_na(list(Sign = "="))
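
The `names_to`/`names_pattern` combination splits each `<Parameter>_<Sign|Result>` column name at the underscore, sending the first piece to a new `Parameter` column and spreading the second piece (`Sign` or `Result`) into its own column via the `".value"` sentinel. A minimal sketch with hypothetical values:

Code
# Chlorophyll has both a Sign and a Result column; Temperature has only a
# Result, so its Sign is filled with NA and then replaced with "="
tibble(
  Station = "A",
  Chlorophyll_Sign = "<",
  Chlorophyll_Result = 0.5,
  Temperature_Result = 18.2
) %>%
  pivot_longer(
    cols = ends_with(c("Sign", "Result")),
    names_to = c("Parameter", ".value"),
    names_pattern = "(.*)_(.*)"
  ) %>%
  replace_na(list(Sign = "="))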

Consolidate Turbidity parameters and prefer TurbidityNTU when both NTU and FNU were collected.

Code
df_dwq_c3 <- df_dwq_c2 %>%
  mutate(
    Parameter2 = if_else(
      str_detect(Parameter, "^Turbidity"),
      "Turbidity",
      Parameter
    )
  )

df_dwq_turb_dups <- df_dwq_c3 %>%
  filter(Parameter2 == "Turbidity") %>%
  add_count(Source, Station, Datetime) %>%
  filter(n > 1) %>%
  select(-n)

df_dwq_turb_dups_fixed <- df_dwq_turb_dups %>%
  filter(Parameter == "TurbidityNTU")

df_dwq_c4 <- df_dwq_c3 %>%
  anti_join(df_dwq_turb_dups) %>%
  bind_rows(df_dwq_turb_dups_fixed) %>%
  select(-Parameter) %>%
  rename(Parameter = Parameter2)

Remove various duplicated records so that there is only one sample per station-day.

Code
# Clean up the duplicated records from EDSM that share the same Datetime
df_dwq_dt_dups <- df_dwq_c4 %>%
  add_count(Source, Station, Datetime, Parameter) %>%
  filter(n > 1) %>%
  select(-n)

df_dwq_dt_dups_fixed <- df_dwq_dt_dups %>%
  distinct(Source, Station, Datetime, Parameter, .keep_all = TRUE)

df_dwq_c5 <- df_dwq_c4 %>%
  anti_join(df_dwq_dt_dups) %>%
  bind_rows(df_dwq_dt_dups_fixed)
Code
# Filter data so that there is only one sample per station-day by choosing the data
# point closest to noon
df_dwq_daily_dups <- df_dwq_c5 %>%
  add_count(Source, Station, Date, Parameter) %>%
  filter(n > 1) %>%
  select(-n)

df_dwq_daily_dups_fixed <- df_dwq_daily_dups %>%
  # Create variable for time and calculate difference from noon for each data point
  mutate(
    Time = as_hms(Datetime),
    Noon_diff = abs(hms(hours = 12) - Time)
  ) %>%
  group_by(Source, Station, Date, Parameter) %>%
  # Select only 1 data point per Station, Date, and Parameter, choose data closest to
  # noon
  filter(Noon_diff == min(Noon_diff)) %>%
  # When points are equidistant from noon, select earlier point
  filter(Time == min(Time)) %>%
  ungroup() %>%
  select(-c(Time, Noon_diff))

df_dwq_c6 <- df_dwq_c5 %>%
  anti_join(df_dwq_daily_dups) %>%
  bind_rows(df_dwq_daily_dups_fixed)

Remove values that fall outside reasonable limits for each parameter.

Code
df_dwq_c7 <- df_dwq_c6 %>%
  # Temperature value of 2.9 collected on 8/12/2014 in Suisun Marsh
  filter(!(Parameter == "Temperature" & Result < 3)) %>%
  # Temperature value of 116 collected on 3/23/2017 at USGS-11447650
  filter(!(Parameter == "Temperature" & Result > 35)) %>%
  # DO values greater than 20 (these are obviously % Saturation)
  filter(!(Parameter == "DissolvedOxygen" & Result > 20)) %>%
  # pH value of 2.56 collected on 7/2/2014 in the Yolo Bypass
  filter(!(Parameter == "pH" & Result < 3)) %>%
  # Two Turbidity values greater than 1000 collected in early Sept 2017 in the South
  # Delta
  filter(!(Parameter == "Turbidity" & Result > 1000)) %>%
  # A few Chlorophyll values less than or equal to zero
  filter(!(Parameter == "Chlorophyll" & Result <= 0))

Aggregate Data

Code
# Calculate Monthly-Regional averages
df_dwq_avg_mo <- df_dwq_c7 %>%
  # Convert pH values to H+ concentration before averaging
  mutate(Result = if_else(Parameter == "pH", 10^-Result, Result)) %>%
  # Replace Chlorophyll values below the reporting limit with simulated values
  replace_blw_rl() %>%
  summarize(
    Result = mean(Result),
    Num_samples = n(),
    .by = c(Year, Month, Region, Parameter)
  )
Code
# Further calculate Seasonal-Regional averages using Adjusted calendar year:
# December-November, with December of the previous calendar year included with the
# following year
df_dwq_avg_seas <- df_dwq_avg_mo %>%
  mutate(YearAdj = if_else(Month == 12, Year + 1, Year)) %>%
  add_season() %>%
  summarize(
    Result = mean(Result),
    Num_samples = sum(Num_samples),
    .by = c(YearAdj, Season, Region, Parameter)
  )
Code
# Convert both sets of averages to wide format
ls_dwq_avg_wide <- lst(df_dwq_avg_mo, df_dwq_avg_seas) %>%
  map(
    \(x) {
      select(x, -Num_samples) %>%
        pivot_wider(names_from = Parameter, values_from = Result) %>%
        mutate(
          # Round Microcystis to nearest whole number
          Microcystis_round = round(Microcystis),
          # Convert average H+ concentration to average pH
          pH = -log10(pH)
        )
    }
  )
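
Averaging pH values directly would average logarithms rather than the underlying hydrogen ion concentrations, which is why the code converts to H+ before taking means and back-transforms afterwards. For example:

Code
ph_vals <- c(7, 9)

# Naive average of the pH values themselves
mean(ph_vals)
# 8

# Average on the H+ concentration scale, then convert back to pH
-log10(mean(10^-ph_vals))
# about 7.3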

Sampling Effort Plots

Monthly Values

Code
ls_dwq_avg_mo_plt <- df_dwq_avg_mo %>%
  filter(Year %in% 2010:2024) %>%
  mutate(Month = month(Month, label = TRUE)) %>%
  complete(nesting(Year, Month), Region, Parameter) %>%
  nest(.by = Parameter) %>%
  deframe() %>%
  imap(
    \(x, idx) {
      ggplot(data = x, aes(x = Year, y = Region, fill = Num_samples)) +
        geom_tile() +
        facet_wrap(vars(Month)) +
        ggtitle(idx) +
        scale_y_discrete(drop = FALSE) +
        scale_x_continuous(
          breaks = breaks_pretty(10),
          expand = expansion(mult = 0.02)
        ) +
        scale_fill_viridis_c(
          name = "Number of\nSamples",
          na.value = "transparent"
        ) +
        theme_bw() +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
    }
  )

Seasonal Values

Code
df_dwq_avg_seas %>%
  filter(YearAdj %in% 2010:2024) %>%
  ggplot(aes(x = YearAdj, y = Region, fill = Num_samples)) +
  geom_tile() +
  facet_grid(rows = vars(Parameter), cols = vars(Season)) +
  scale_x_continuous(
    breaks = breaks_pretty(10),
    expand = expansion(mult = 0.02)
  ) +
  scale_fill_viridis_c(name = "Number of\nSamples") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

DAYFLOW

Import Data

Code
# Define file paths of DAYFLOW data
fp_dayflow <- dir_ls(here("data/external"), regexp = "dayflow.+\\.rds$")

# Import data
ls_dayflow <- fp_dayflow %>%
  map(readRDS) %>%
  set_names(\(x) str_extract(x, "(?<=external/)dayflow.+(?=\\.rds)"))

Prepare Data

Code
# Clean up and combine data
df_dayflow <- ls_dayflow %>%
  map_at("dayflow_1997_2023", \(x) mutate(x, Date = mdy(Date))) %>%
  map(\(x) select(x, Date, SAC, SJR, TOT)) %>%
  list_rbind() %>%
  mutate(Year = year(Date), Month = month(Date)) %>%
  # Remove data prior to 2009 and after 2024
  filter(Year %in% 2009:2024)

Aggregate Data

Code
# Calculate monthly totals
df_dayflow_tot_mo <- df_dayflow %>%
  summarize(
    Sac_Inflow = sum(SAC),
    SJR_Inflow = sum(SJR),
    Total_Inflow = sum(TOT),
    .by = c(Year, Month)
  )
Code
# Further calculate seasonal totals using Adjusted calendar year: December-November,
# with December of the previous calendar year included with the following year
df_dayflow_tot_seas <- df_dayflow_tot_mo %>%
  mutate(YearAdj = if_else(Month == 12, Year + 1, Year)) %>%
  add_season() %>%
  summarize(across(ends_with("Inflow"), sum), .by = c(YearAdj, Season))

Water Year Hydrologic Classification Indices

Import Data

Code
df_wyt_2010_2024 <- readRDS(here("data/external/wy_type_2010_2024.rds"))

Prepare Data

Code
# Prepare monthly values
df_wyt_2010_2024_mo <- df_wyt_2010_2024 %>%
  # Expand with months
  expand_grid(Month = 1:12) %>%
  # Adjust WY to calendar year
  mutate(Year = if_else(Month %in% 10:12, Year - 1, Year))

# Prepare seasonal values
df_wyt_2010_2024_seas <- df_wyt_2010_2024_mo %>%
  add_season() %>%
  # Use the adjusted calendar year (December-November, with December of the
  # previous calendar year included with the following year) for the seasonal
  # assignments
  filter(!Month %in% 10:12) %>%
  select(-Month) %>%
  rename(YearAdj = Year) %>%
  distinct()
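
Since water years run October-September and are labeled by the calendar year in which they end, months 10-12 of water year N occurred in calendar year N-1, which is what the calendar-year adjustment in the code above accounts for. A quick check:

Code
# Month 11 of water year 2015 occurred in calendar year 2014
tibble(Year = 2015, Month = c(6, 11)) %>%
  mutate(Year = if_else(Month %in% 10:12, Year - 1, Year))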

Clam Data

Import Data

Code
# Import clam grazing rate data from the drought synthesis EDI package
edi_id_drt <- "edi.1653.1"
edi_data_ent_drt <- get_edi_data_entities(edi_id_drt)
regex_clams <- "_clams|\\sClam\\s"
edi_data_ent_drt_clams <- str_subset(edi_data_ent_drt, regex_clams)
get_edi_data(edi_id_drt, edi_data_ent_drt_clams)
fp_drt_clams <- dir_ls(tempdir(), regexp = regex_clams)

ls_drt_clams <- fp_drt_clams %>%
  map(read_csv) %>%
  set_names(\(x) {
    str_replace_all(str_extract(basename(x), ".+(?=\\.bin)"), "\\s", "_")
  })

# Import additional Suisun Marsh clam data from the SMSCG data package on EDI
edi_id_smscg <- "edi.876.8"
edi_data_ent_smscg <- get_edi_data_entities(edi_id_smscg)
edi_data_ent_smscg_clams <- str_subset(edi_data_ent_smscg, "smscg_clams")
get_edi_data(edi_id_smscg, edi_data_ent_smscg_clams)
temp_files <- list.files(tempdir(), full.names = TRUE)
df_smscg_clams <- read_csv(str_subset(temp_files, edi_data_ent_smscg_clams))

# Import additional North Delta clam data collected by USGS
ndf_usgs_clams <-
  tibble(
    fp = dir(here("data/external"), pattern = "BioRecGR", full.names = TRUE),
    Year = map_int(
      fp,
      \(x) as.numeric(str_extract(basename(x), "(?<=Delta)\\d{4}"))
    ),
    skip_row_num = if_else(Year == 2015, 0, 1),
    df_data = map2(fp, skip_row_num, \(x, y) read_excel(path = x, skip = y))
  )

Prepare Data

Code
# Clam grazing rate data from the drought synthesis EDI package
df_drt_clams <- ls_drt_clams %>%
  map_at(
    "GRTS_clams",
    \(x) {
      rename(x, Station = SiteID) %>%
        mutate(
          Date = ymd(paste(Year, Month, "15", sep = "-")),
          Filtration_Rate = Turnover_Rate * Depth
        )
    }
  ) %>%
  bind_rows() %>%
  group_by(Station, Date, Latitude, Longitude, Depth) %>%
  summarize(
    Clam_Filtration = sum(Filtration_Rate, na.rm = TRUE),
    Clam_Turnover = sum(Turnover_Rate, na.rm = TRUE),
    CorbiculaBiomass = sum(Biomass[which(Clam == "CF")]),
    PotamocorbulaBiomass = sum(Biomass[which(Clam == "PA")]),
    .groups = "drop_last"
  ) %>%
  # Average values collected at different depths but at same station and date
  summarize(
    across(
      c(starts_with("Clam_"), ends_with("Biomass")),
      \(x) na_if(mean(x, na.rm = TRUE), NaN)
    ),
    .groups = "drop"
  ) %>%
  mutate(
    Year = year(Date),
    Month = month(Date),
    Longitude = if_else(Longitude > 0, Longitude * -1, Longitude)
  )

# Suisun Marsh clam data from the SMSCG data package on EDI
df_smscg_clams_c <- df_smscg_clams %>%
  mutate(
    Year,
    Month = month(Date),
    Station,
    Date,
    Latitude = North_decimal_degrees,
    Longitude = West_decimal_degrees,
    CorbiculaBiomass = Corbicula_AFDM_g_per_m2,
    PotamocorbulaBiomass = Potamocorbula_AFDM_g_per_m2,
    Clam_Filtration = Total_filtration_rate_m3_per_m2_per_day,
    Clam_Turnover = Total_grazing_turnover_per_day,
    .keep = "none"
  )

# North Delta clam data collected by USGS
df_usgs_clams <- ndf_usgs_clams %>%
  select(Year, df_data) %>%
  deframe() %>%
  map(\(x) rename_with(x, str_to_title)) %>%
  map_at(
    "2015",
    \(x) {
      mutate(x, Station = as.character(Station)) %>%
        rename(Year_data = Year)
    }
  ) %>%
  list_rbind(names_to = "Year_fp") %>%
  mutate(
    Year = if_else(!is.na(Year_data), Year_data, as.numeric(Year_fp)),
    Date = ymd(paste(Year, Month, "15", sep = "-")),
    Filtration_Rate = Gr,
    Turnover_Rate = Grto,
    Latitude = Lat,
    Longitude = Long
  ) %>%
  group_by(Station, Date, Latitude, Longitude)

# Summarize Filtration and Turnover rates separately from the CF and PA biomass
df_usgs_clams_rates <- df_usgs_clams %>%
  summarize(
    Clam_Filtration = sum(Filtration_Rate, na.rm = TRUE),
    Clam_Turnover = sum(Turnover_Rate, na.rm = TRUE),
    .groups = "drop"
  )

df_usgs_clams_biomass <- df_usgs_clams %>%
  filter(Clam %in% c("CF", "PA")) %>%
  group_by(Clam, .add = TRUE) %>%
  summarize(Biomass = sum(Biomass), .groups = "drop") %>%
  pivot_wider(names_from = Clam, values_from = Biomass) %>%
  rename(CorbiculaBiomass = CF, PotamocorbulaBiomass = PA)

df_usgs_clams_c <-
  full_join(
    df_usgs_clams_rates,
    df_usgs_clams_biomass,
    by = join_by(Station, Date, Latitude, Longitude)
  ) %>%
  mutate(Year = year(Date), Month = month(Date))

# Combine all clams datasets
df_clams_all <- bind_rows(df_drt_clams, df_smscg_clams_c, df_usgs_clams_c)

# Remove data prior to 2009 and after 2024, add Regions
df_clams_all_c <- df_clams_all %>%
  filter(Year %in% 2009:2024) %>%
  # Assign SubRegions to the stations
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>%
  st_transform(crs = st_crs(sf_delta)) %>%
  st_join(sf_delta, join = st_intersects) %>%
  # Remove any data outside our subregions of interest
  filter(!is.na(SubRegion)) %>%
  st_drop_geometry()
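
The `na_if(mean(x, na.rm = TRUE), NaN)` idiom used above (and in the aggregation steps below) handles groups where every value is missing: `mean()` with `na.rm = TRUE` returns `NaN` for an all-NA group, and `na_if()` converts that back to a plain NA. For example:

Code
# All-NA group: mean(na.rm = TRUE) yields NaN, which na_if() turns into NA
na_if(mean(c(NA_real_, NA_real_), na.rm = TRUE), NaN)

# Group with at least one value: unaffected
na_if(mean(c(1, NA, 3), na.rm = TRUE), NaN)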

Aggregate Data

Code
# Calculate Monthly-Regional averages
df_clams_avg_mo <- df_clams_all_c %>%
  summarize(
    across(
      c(starts_with("Clam_"), ends_with("Biomass")),
      \(x) na_if(mean(x, na.rm = TRUE), NaN)
    ),
    .by = c(Year, Month, Region)
  )
Code
# Further calculate Seasonal-Regional averages using Adjusted calendar year:
# December-November, with December of the previous calendar year included with the
# following year
df_clams_avg_seas <- df_clams_avg_mo %>%
  mutate(YearAdj = if_else(Month == 12, Year + 1, Year)) %>%
  add_season() %>%
  summarize(
    across(
      c(starts_with("Clam_"), ends_with("Biomass")),
      \(x) na_if(mean(x, na.rm = TRUE), NaN)
    ),
    .by = c(YearAdj, Season, Region)
  )

Vegetation Index Data

Import Data

Code
# Vegetation Index data
df_veg_index <- read_csv(here("data/raw/wqes_veg.csv"))

# Station Info
df_veg_index_stations <- read_csv(here("data/raw/wqes_veg_stations.csv"))

Prepare Data

Code
df_veg_index_c <- df_veg_index %>%
  mutate(
    StationCode,
    Date = date(FldDate),
    Month = month(Date),
    Year = year(Date),
    VegIndex_Subm = FldObsVegSub,
    VegIndex_Surf = FldObsVegSurf,
    .keep = "none"
  ) %>%
  # Remove records where all values are missing and data collected after 2024
  filter(
    !if_all(starts_with("VegIndex"), is.na),
    Year <= 2024
  ) %>%
  # Convert index levels to numeric values
  mutate(
    across(
      starts_with("VegIndex"),
      \(x) {
        case_match(
          x,
          "Not Visible" ~ 0,
          "Low" ~ 1,
          c("medium", "Medium") ~ 2,
          "High" ~ 3,
          "Extreme" ~ 4
        )
      }
    )
  ) %>%
  # Assign SubRegions to the stations
  left_join(
    df_veg_index_stations %>%
      distinct(
        StationCode,
        Latitude = `Latitude (WGS84)`,
        Longitude = `Longitude (WGS84)`
      ),
    by = join_by(StationCode)
  ) %>%
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>%
  st_transform(crs = st_crs(sf_delta)) %>%
  st_join(sf_delta, join = st_intersects) %>%
  # Remove any data outside our subregions of interest
  filter(!is.na(SubRegion)) %>%
  st_drop_geometry() %>%
  # Remove a few duplicated records
  distinct()

Aggregate Data

Code
# Calculate Monthly-Regional averages
df_veg_index_avg_mo <- df_veg_index_c %>%
  summarize(
    across(
      starts_with("VegIndex"),
      \(x) na_if(mean(x, na.rm = TRUE), NaN)
    ),
    .by = c(Year, Month, Region)
  )
Code
# Further calculate Seasonal-Regional averages using Adjusted calendar year:
# December-November, with December of the previous calendar year included with the
# following year
df_veg_index_avg_seas <- df_veg_index_avg_mo %>%
  mutate(YearAdj = if_else(Month == 12, Year + 1, Year)) %>%
  add_season() %>%
  summarize(
    across(
      starts_with("VegIndex"),
      \(x) na_if(mean(x, na.rm = TRUE), NaN)
    ),
    .by = c(YearAdj, Season, Region)
  )

Wind Data

Import Data

Code
# Monthly values for each region are in a separate csv file
# Define file paths
fp_wind_mo <- dir(
  here("data/raw"),
  pattern = "Wind_[[:alpha:]]{4}_monthly.+\\.csv$",
  full.names = TRUE
)

# Import each csv file into a nested dataframe
ndf_wind_mo <- tibble(
  fp = fp_wind_mo,
  Region = str_extract(fp, "(?<=Wind_)[:alpha:]{4}(?=_monthly)"),
  df_data = map(fp, read_csv)
)

Prepare Data

Code
df_wind_mo <- ndf_wind_mo %>%
  mutate(
    Region = case_match(
      Region,
      "Conf" ~ "Confluence",
      "Nort" ~ "North",
      "SBay" ~ "Suisun Bay",
      "SCen" ~ "SouthCentral",
      "SMar" ~ "Suisun Marsh"
    ),
    df_data = map(
      df_data,
      \(x) {
        rename_with(
          x,
          \(y) str_extract(y, "(?<=[:alpha:]{4}-)[:alpha:]+$"),
          ends_with(c("mean", "max"))
        )
      }
    ),
    .keep = "none"
  ) %>%
  unnest(df_data) %>%
  select(
    Region,
    date,
    max = Wmax,
    ends_with(c("AvLt3.0", "AvLt4.5"))
  ) %>%
  rename_with(\(x) paste0("Wind_", x), where(is.numeric)) %>%
  mutate(
    Year = year(date),
    Month = month(date),
    .keep = "unused"
  )
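
The lookbehind regex above strips the four-letter region prefix from the raw column headers. Assuming a raw header of the form `SBay-Wmax` (a hypothetical example of the pattern), the rename works like this:

Code
# Extract everything after the four-letter "XXXX-" region prefix
str_extract("SBay-Wmax", "(?<=[:alpha:]{4}-)[:alpha:]+$")
# "Wmax"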

We’ll only include monthly values for the wind variables for now.

Combine All Data

Code
# Import region assignments
df_regions <- read_csv(here("data/raw/region_assignments.csv"))

# Create data frame that contains all possible combinations of year, month, and
# region
df_yr_mo_reg <-
  expand_grid(
    Year = 2010:2024,
    Month = 1:12,
    Region = unique(df_regions$Region)
  ) %>%
  # Remove rows after June 2024
  filter(!(Year == 2024 & Month > 6)) %>%
  # Create Month-Year column
  mutate(
    MonthYear = paste(month(Month, label = TRUE), Year, sep = "-"),
    .before = Year
  )

# Combine all Monthly-Regional averages, monthly totals, and annual values into one
# data frame
monthly_values <-
  list(
    df_yr_mo_reg,
    ls_nutr_avg_wide$df_nutr_avg_mo,
    ls_dwq_avg_wide$df_dwq_avg_mo,
    df_dayflow_tot_mo,
    df_wyt_2010_2024_mo,
    df_clams_avg_mo,
    df_veg_index_avg_mo,
    df_wind_mo
  ) %>%
  reduce(left_join)
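
`reduce()` folds the list of tables together with successive `left_join()` calls, starting from the complete year-month-region scaffold so that every expected combination is retained even when a source has no matching data. A toy sketch of the folding with hypothetical tables:

Code
# Equivalent to scaffold %>% left_join(source_a) %>% left_join(source_b)
scaffold <- tibble(Region = c("North", "South"))
source_a <- tibble(Region = "North", A = 1)
source_b <- tibble(Region = "South", B = 2)

list(scaffold, source_a, source_b) %>% reduce(left_join)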
Code
# Create data frame that contains all possible combinations of year, season, and
# region
df_yr_seas_reg <- df_yr_mo_reg %>%
  add_season() %>%
  distinct(YearAdj = Year, Season, Region) %>%
  # Remove Summer 2024 because of low sampling
  filter(!(YearAdj == 2024 & Season == "Summer"))

# Combine all Seasonal-Regional averages, seasonal totals, and annual values into
# one data frame
seasonal_values <-
  list(
    df_yr_seas_reg,
    ls_nutr_avg_wide$df_nutr_avg_seas,
    ls_dwq_avg_wide$df_dwq_avg_seas,
    df_dayflow_tot_seas,
    df_wyt_2010_2024_seas,
    df_clams_avg_seas,
    df_veg_index_avg_seas
  ) %>%
  reduce(left_join)

Export Processed Data

Export the monthly and seasonal data as both .csv and .rds files for the models.

Code
ls_final_data <- lst(monthly_values, seasonal_values)
fp_processed <- here("data/processed")

# Export data as csv files
ls_final_data %>%
  iwalk(\(x, idx) write_csv(x, file = paste0(fp_processed, "/", idx, ".csv")))

# Export data as rds files
ls_final_data %>%
  iwalk(\(x, idx) saveRDS(x, file = paste0(fp_processed, "/", idx, ".rds")))