Process Input Data for Models

Author

Dave Bosworth

Published

October 3, 2025

Purpose

Prepare input data for models used in the DWR nutrient synthesis project.

Global Code

Code
# Load packages
library(tidyverse)
library(rlang)
library(glue)
library(hms)
library(scales)
# Make sure we are using `deltamapr` version 1.0.1, commit fe34697b3d1aaa2945bbfc647582a19e251abf67
# install.packages("devtools")
# devtools::install_github("InteragencyEcologicalProgram/deltamapr", ref = "fe34697b3d1aaa2945bbfc647582a19e251abf67")
library(deltamapr)
# Make sure we are using `discretewq` version 2.4.0.9000, commit b68425c2eca88015b5e01ae6029f84b2a3af2ea8
# devtools::install_github("InteragencyEcologicalProgram/discretewq", ref = "b68425c2eca88015b5e01ae6029f84b2a3af2ea8")
library(discretewq)
library(sf)
library(mapview)
library(EDIutils)
library(fs)
library(here)
library(conflicted)

# Declare package conflict preferences
conflicts_prefer(dplyr::filter(), hms::hms())
Code
# Run session info to display package versions
devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.5.1 (2025-06-13 ucrt)
 os       Windows 11 x64 (build 26100)
 system   x86_64, mingw32
 ui       RTerm
 language (EN)
 collate  English_United States.utf8
 ctype    English_United States.utf8
 tz       America/Los_Angeles
 date     2025-10-03
 pandoc   3.6.3 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
 quarto   NA @ C:\\PROGRA~1\\Quarto\\bin\\quarto.exe

─ Packages ───────────────────────────────────────────────────────────────────
 package      * version    date (UTC) lib source
 base64enc      0.1-3      2015-07-28 [1] CRAN (R 4.5.0)
 cachem         1.1.0      2024-05-16 [1] CRAN (R 4.5.0)
 class          7.3-23     2025-01-01 [2] CRAN (R 4.5.1)
 classInt       0.4-11     2025-01-08 [1] CRAN (R 4.5.0)
 cli            3.6.5      2025-04-23 [1] CRAN (R 4.5.0)
 codetools      0.2-20     2024-03-31 [2] CRAN (R 4.5.1)
 conflicted   * 1.2.0      2023-02-01 [1] CRAN (R 4.5.0)
 crosstalk      1.2.1      2023-11-23 [1] CRAN (R 4.5.0)
 DBI            1.2.3      2024-06-02 [1] CRAN (R 4.5.0)
 deltamapr    * 1.0.1      2025-09-17 [1] Github (InteragencyEcologicalProgram/deltamapr@fe34697)
 devtools       2.4.5      2022-10-11 [1] CRAN (R 4.5.0)
 digest         0.6.37     2024-08-19 [1] CRAN (R 4.5.0)
 discretewq   * 2.4.0.9000 2025-09-17 [1] Github (InteragencyEcologicalProgram/discretewq@b68425c)
 dplyr        * 1.1.4      2023-11-17 [1] CRAN (R 4.5.0)
 e1071          1.7-16     2024-09-16 [1] CRAN (R 4.5.0)
 EDIutils     * 1.0.3      2023-10-10 [1] CRAN (R 4.5.0)
 ellipsis       0.3.2      2021-04-29 [1] CRAN (R 4.5.0)
 evaluate       1.0.5      2025-08-27 [1] CRAN (R 4.5.1)
 farver         2.1.2      2024-05-13 [1] CRAN (R 4.5.0)
 fastmap        1.2.0      2024-05-15 [1] CRAN (R 4.5.0)
 forcats      * 1.0.0      2023-01-29 [1] CRAN (R 4.5.0)
 fs           * 1.6.6      2025-04-12 [1] CRAN (R 4.5.0)
 generics       0.1.4      2025-05-09 [1] CRAN (R 4.5.0)
 ggplot2      * 4.0.0      2025-09-11 [1] CRAN (R 4.5.1)
 glue         * 1.8.0      2024-09-30 [1] CRAN (R 4.5.0)
 gtable         0.3.6      2024-10-25 [1] CRAN (R 4.5.0)
 here         * 1.0.1      2020-12-13 [1] CRAN (R 4.5.0)
 hms          * 1.1.3      2023-03-21 [1] CRAN (R 4.5.0)
 htmltools      0.5.8.1    2024-04-04 [1] CRAN (R 4.5.0)
 htmlwidgets    1.6.4      2023-12-06 [1] CRAN (R 4.5.0)
 httpuv         1.6.16     2025-04-16 [1] CRAN (R 4.5.0)
 jsonlite       2.0.0      2025-03-27 [1] CRAN (R 4.5.0)
 KernSmooth     2.23-26    2025-01-01 [2] CRAN (R 4.5.1)
 knitr          1.50       2025-03-16 [1] CRAN (R 4.5.0)
 later          1.4.2      2025-04-08 [1] CRAN (R 4.5.0)
 lattice        0.22-7     2025-04-02 [1] CRAN (R 4.5.0)
 leafem         0.2.4      2025-05-01 [1] CRAN (R 4.5.0)
 leaflet        2.2.2      2024-03-26 [1] CRAN (R 4.5.0)
 lifecycle      1.0.4      2023-11-07 [1] CRAN (R 4.5.0)
 lubridate    * 1.9.4      2024-12-08 [1] CRAN (R 4.5.0)
 magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.5.0)
 mapview      * 2.11.2     2023-10-13 [1] CRAN (R 4.5.0)
 memoise        2.0.1      2021-11-26 [1] CRAN (R 4.5.0)
 mime           0.13       2025-03-17 [1] CRAN (R 4.5.0)
 miniUI         0.1.2      2025-04-17 [1] CRAN (R 4.5.0)
 pillar         1.11.1     2025-09-17 [1] CRAN (R 4.5.1)
 pkgbuild       1.4.8      2025-05-26 [1] CRAN (R 4.5.0)
 pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.5.0)
 pkgload        1.4.0      2024-06-28 [1] CRAN (R 4.5.0)
 png            0.1-8      2022-11-29 [1] CRAN (R 4.5.0)
 profvis        0.4.0      2024-09-20 [1] CRAN (R 4.5.0)
 promises       1.3.2      2024-11-28 [1] CRAN (R 4.5.0)
 proxy          0.4-27     2022-06-09 [1] CRAN (R 4.5.0)
 purrr        * 1.1.0      2025-07-10 [1] CRAN (R 4.5.1)
 R6             2.6.1      2025-02-15 [1] CRAN (R 4.5.0)
 raster         3.6-32     2025-03-28 [1] CRAN (R 4.5.0)
 RColorBrewer   1.1-3      2022-04-03 [1] CRAN (R 4.5.0)
 Rcpp           1.0.14     2025-01-12 [1] CRAN (R 4.5.0)
 readr        * 2.1.5      2024-01-10 [1] CRAN (R 4.5.0)
 remotes        2.5.0      2024-03-17 [1] CRAN (R 4.5.0)
 rlang        * 1.1.6      2025-04-11 [1] CRAN (R 4.5.0)
 rmarkdown      2.29       2024-11-04 [1] CRAN (R 4.5.0)
 rprojroot      2.1.1      2025-08-26 [1] CRAN (R 4.5.1)
 rstudioapi     0.17.1     2024-10-22 [1] CRAN (R 4.5.0)
 S7             0.2.0      2024-11-07 [1] CRAN (R 4.5.1)
 satellite      1.0.5      2024-02-10 [1] CRAN (R 4.5.0)
 scales       * 1.4.0      2025-04-24 [1] CRAN (R 4.5.0)
 sessioninfo    1.2.3      2025-02-05 [1] CRAN (R 4.5.0)
 sf           * 1.0-21     2025-05-15 [1] CRAN (R 4.5.0)
 shiny          1.10.0     2024-12-14 [1] CRAN (R 4.5.0)
 sp             2.2-0      2025-02-01 [1] CRAN (R 4.5.0)
 stringi        1.8.7      2025-03-27 [1] CRAN (R 4.5.0)
 stringr      * 1.5.2      2025-09-08 [1] CRAN (R 4.5.1)
 terra          1.8-42     2025-04-02 [1] CRAN (R 4.5.0)
 tibble       * 3.3.0      2025-06-08 [1] CRAN (R 4.5.0)
 tidyr        * 1.3.1      2024-01-24 [1] CRAN (R 4.5.0)
 tidyselect     1.2.1      2024-03-11 [1] CRAN (R 4.5.0)
 tidyverse    * 2.0.0      2023-02-22 [1] CRAN (R 4.5.1)
 timechange     0.3.0      2024-01-18 [1] CRAN (R 4.5.0)
 tzdb           0.5.0      2025-03-15 [1] CRAN (R 4.5.0)
 units          0.8-7      2025-03-11 [1] CRAN (R 4.5.0)
 urlchecker     1.0.1      2021-11-30 [1] CRAN (R 4.5.0)
 usethis        3.2.1      2025-09-06 [1] CRAN (R 4.5.1)
 vctrs          0.6.5      2023-12-01 [1] CRAN (R 4.5.0)
 withr          3.0.2      2024-10-28 [1] CRAN (R 4.5.0)
 xfun           0.53       2025-08-19 [1] CRAN (R 4.5.1)
 xtable         1.8-4      2019-04-21 [1] CRAN (R 4.5.0)
 yaml           2.3.10     2024-07-26 [1] CRAN (R 4.5.0)

 [1] C:/R/win-library/4.5
 [2] C:/Program Files/R/R-4.5.1/library
 * ── Packages attached to the search path.

──────────────────────────────────────────────────────────────────────────────

Create functions used in this document:

Code
# Get data entity names for specified EDI ID
get_edi_data_entities <- function(edi_id) {
  df_data_ent <- read_data_entity_names(edi_id)
  inform(c(
    "i" = paste0(
      "Data entities for ", edi_id, " include:\n",
      paste(df_data_ent$entityName, collapse = "\n"), "\n"
    ))
  )
  return(df_data_ent$entityName)
}

# Download specified data entities from an EDI package and save the raw bytes as files in a
# temporary directory
get_edi_data <- function(edi_id, entity_names) {
  df_data_ent <- read_data_entity_names(edi_id)
  df_data_ent_filt <- df_data_ent %>% filter(entityName %in% entity_names)
  
  ls_data_raw <-
    map(df_data_ent_filt$entityId, \(x) read_data_entity(edi_id, entityId = x)) %>%
    set_names(df_data_ent_filt$entityName)
  
  temp_dir <- tempdir()
  for (i in seq_along(ls_data_raw)) {
    file_raw <- file.path(temp_dir, glue("{names(ls_data_raw)[i]}.bin"))
    con <- file(file_raw, "wb")
    writeBin(ls_data_raw[[i]], con)
    close(con)
  }
}

# Replace values below the reporting limit with simulated values between `min_val` and the RL
replace_blw_rl <- function(df, min_val = 0, seed = 1) {
  # Pull out values that are below the RL
  df_blw_rl <- df %>% filter(Sign == "<")
  
  # Replace below RL values with simulated ones
  withr::with_seed(
    # Set seed for reproducibility
    seed = seed,
    df_blw_rl_sim <- df_blw_rl %>% 
      mutate(Result = round(runif(nrow(df_blw_rl), min = min_val, max = Result), 6))
  )
  
  # Add simulated values back to main data frame
  df %>% filter(Sign != "<") %>% bind_rows(df_blw_rl_sim)
}

# Add variable for season as a factor
add_season <- function(df) {
  df %>% mutate(
    Season = case_match(
      Month,
      3:5 ~ "Spring",
      6:8 ~ "Summer",
      9:11 ~ "Fall",
      c(12, 1, 2) ~ "Winter"
    ),
    Season = factor(Season, levels = c("Winter", "Spring", "Summer", "Fall"))
  )
}
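
As a quick illustration of the helpers (not part of the processing pipeline), replace_blw_rl() and add_season() can be run on a small toy data frame; the station names and values below are hypothetical.

Code
# Toy data frame with hypothetical values; Sign == "<" flags results below the reporting limit
df_toy <- tibble(
  Station = c("A", "A", "B", "B"),
  Month = c(1, 4, 7, 12),
  Sign = c("<", "=", "=", "<"),
  Result = c(0.05, 0.12, 0.3, 0.05)
)

# Replace the two below-RL values with simulated values between 0 and the RL (seed for reproducibility)
df_toy %>% replace_blw_rl(seed = 1)

# Assign each Month to a Season factor (12, 1, 2 = Winter; 3-5 = Spring; 6-8 = Summer; 9-11 = Fall)
df_toy %>% add_season()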

Load globally-used data:

Code
# Import region assignments
df_regions <- read_csv(here("data/raw/region_assignments.csv"))

# Load Delta EDSM shapefile and only keep SubRegions east of Carquinez Strait
sf_delta <- R_EDSM_Subregions_Mahardja_FLOAT %>% 
  filter(
    !SubRegion %in% c(
      "Carquinez Strait", 
      "Lower Napa River", 
      "San Francisco Bay",
      "San Pablo Bay",
      "South Bay",
      "Upper Napa River" 
    )
  ) %>% 
  select(SubRegion) %>% 
  # Add region assignments to the SubRegion shapefile
  left_join(df_regions, by = join_by(SubRegion)) %>% 
  relocate(Region, .after = SubRegion)

Regions Map

Create a map showing the SubRegions and Regions used to process the data.

Code
mapview(sf_delta, zcol = "Region", alpha.regions = 0.5)
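
For a non-interactive summary of the same assignments, the SubRegions in each Region can be tabulated directly (a quick check, not part of the processing pipeline).

Code
# Count the number of SubRegions assigned to each Region
sf_delta %>% 
  st_drop_geometry() %>% 
  count(Region, name = "Num_SubRegions")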

DWR Integrated Discrete Nutrient Data

Import Data

Code
# Import nutrient data set
df_nutr <- readRDS(here("data/processed/nutrient_data_discrete.rds"))

# Import phytoplankton data from FRP's EDI publication to assign lat-long coordinates to FRP's
  # nutrient data in the DWR integrated discrete nutrient data set
edi_data_ent <- get_edi_data_entities("edi.269.5")
edi_data_ent_sub <- str_subset(edi_data_ent, "^phyto")
get_edi_data("edi.269.5", edi_data_ent_sub)
temp_files <- list.files(tempdir(), full.names = TRUE)

df_frp_phyto_coord <- read_csv(str_subset(temp_files, edi_data_ent_sub)) %>% 
  drop_na(LatitudeStart, LongitudeStart) %>% 
  summarize(
    Latitude = mean(LatitudeStart),
    Longitude = mean(LongitudeStart),
    .by = Location
  )

Prepare Data

Code
# Prepare data for aggregation
df_nutr_c1 <- df_nutr %>% 
  # Add lat-long coordinates for FRP stations from the phytoplankton data set
  filter(Project == "Fish Restoration Program") %>% 
  select(-c(Latitude, Longitude)) %>% 
  left_join(df_frp_phyto_coord, by = join_by(Station_Name == Location)) %>% 
  bind_rows(df_nutr %>% filter(Project != "Fish Restoration Program")) %>% 
  # Remove records without lat-long coordinates
  drop_na(Latitude, Longitude) %>% 
  # Assign SubRegions to the stations
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>%
  st_transform(crs = st_crs(sf_delta)) %>%
  st_join(sf_delta, join = st_intersects) %>%
  # Remove any data outside our subregions of interest
  filter(!is.na(SubRegion)) %>%
  st_drop_geometry()
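
FRP stations without a matching Location in the phytoplankton data set receive NA coordinates from the join and are removed by drop_na() above. A quick check like the following (a sketch, not part of the pipeline) lists those stations, if any.

Code
# FRP stations in the nutrient data without matching coordinates in the phytoplankton data
df_nutr %>% 
  filter(Project == "Fish Restoration Program") %>% 
  distinct(Station_Name) %>% 
  anti_join(df_frp_phyto_coord, by = join_by(Station_Name == Location))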

Remove various duplicated records so that there is only one sample per station-day.

Code
# Clean up duplicated records that share the same Date_Time - it might be more appropriate to
  # address this in the nutrient data integration code. TODO: talk to Morgan about this
df_nutr_dt_dups <- df_nutr_c1 %>% 
  add_count(Project, Station_Name, Date_Time, Analyte) %>% 
  filter(n > 1) %>% 
  select(-n) %>% 
  drop_na(Date_Time)

# Keep the records with the "earliest" Sample_Code
df_nutr_dt_dups_fixed <- df_nutr_dt_dups %>% 
  arrange(Project, Station_Name, Date_Time, Analyte, Sample_Code) %>% 
  distinct(Project, Station_Name, Date_Time, Analyte, .keep_all = TRUE)

df_nutr_c2 <- df_nutr_c1 %>% 
  anti_join(df_nutr_dt_dups) %>% 
  bind_rows(df_nutr_dt_dups_fixed)
Code
# Clean up duplicated records that share the same Date and don't have a collection time
df_nutr_dups_nodt <- df_nutr_c2 %>% 
  add_count(Project, Station_Name, Date, Analyte) %>% 
  filter(n > 1, is.na(Date_Time)) %>% 
  select(-n)

# Keep the records with the "earliest" Sample_Code
df_nutr_dups_nodt_fixed <- df_nutr_dups_nodt %>% 
  arrange(Project, Station_Name, Date, Analyte, Sample_Code) %>% 
  distinct(Project, Station_Name, Date, Analyte, .keep_all = TRUE)

df_nutr_c3 <- df_nutr_c2 %>% 
  anti_join(df_nutr_dups_nodt) %>% 
  bind_rows(df_nutr_dups_nodt_fixed)
Code
# Filter data so that there is only one sample per station-day by choosing the data point closest 
  # to noon
df_nutr_daily_dups <- df_nutr_c3 %>% 
  add_count(Project, Station_Name, Date, Analyte) %>% 
  filter(n > 1) %>% 
  select(-n)

df_nutr_daily_dups_fixed <- df_nutr_daily_dups %>% 
  # Create variable for time and calculate difference from noon for each data point
  mutate(
    Time = as_hms(Date_Time),
    Noon_diff = abs(hms(hours = 12) - Time)
  ) %>%
  group_by(Project, Station_Name, Date, Analyte) %>%
  # Select only 1 data point per Station, Date, and Analyte, choose data closest to noon
  filter(Noon_diff == min(Noon_diff)) %>%
  # When points are equidistant from noon, select earlier point
  filter(Time == min(Time)) %>%
  ungroup() %>% 
  select(-c(Time, Noon_diff))

df_nutr_c4 <- df_nutr_c3 %>% 
  anti_join(df_nutr_daily_dups) %>% 
  bind_rows(df_nutr_daily_dups_fixed)

Replace values below the reporting limit with simulated values.

Code
# Replace values below the reporting limit with simulated values
df_nutr_c5 <- df_nutr_c4 %>% 
  mutate(
    Sign = if_else(Detection_Condition == "Not Detected", "<", "="),
    Result = if_else(Detection_Condition == "Not Detected", Reporting_Limit, Result),
    .keep = "unused"
  ) %>% 
  # Omit <RL values with reporting limits that are greater than the 95th percentile of the values
    # for the parameter
  mutate(q95 = quantile(Result, probs = 0.95), .by = Analyte) %>% 
  filter(!(Sign == "<" & Result > q95)) %>% 
  select(-q95) %>% 
  replace_blw_rl()

Aggregate Data

Code
# Calculate Monthly-Regional averages
df_nutr_avg_mo <- df_nutr_c5 %>% 
  mutate(
    Year = year(Date),
    Month = month(Date)
  ) %>% 
  summarize(
    Result = mean(Result),
    Num_samples = n(),
    .by = c(Year, Month, Region, Analyte)
  )
Code
# Further calculate Seasonal-Regional averages using an adjusted calendar year (December-November),
  # with December of the previous calendar year included with the following year
df_nutr_avg_seas <- df_nutr_avg_mo %>% 
  mutate(YearAdj = if_else(Month == 12, Year + 1, Year)) %>% 
  add_season() %>% 
  summarize(
    Result = mean(Result),
    Num_samples = sum(Num_samples),
    .by = c(YearAdj, Season, Region, Analyte)
  )
Code
# Convert both sets of averages to wide format
ls_nutr_avg_wide <- lst(df_nutr_avg_mo, df_nutr_avg_seas) %>% 
  map(
    \(x) mutate(x,
      Analyte = case_match(
        Analyte,
        "Dissolved Ammonia" ~ "DissAmmonia",
        "Dissolved Nitrate" ~ "DissNitrate",
        "Dissolved Nitrate + Nitrite" ~ "DissNitrateNitrite",
        "Dissolved Organic Nitrogen" ~ "DON",
        "Dissolved ortho-Phosphate" ~ "DissOrthophos",
        "Total Kjeldahl Nitrogen" ~ "TKN",
        "Total Phosphorus" ~ "TotPhos"
      )
    ) %>% 
    arrange(Analyte, .locale = "en") %>% 
    select(-Num_samples) %>% 
    pivot_wider(names_from = Analyte, values_from = Result)
  )

Sampling Effort Plots

Monthly Values

Code
ls_nutr_avg_mo_plt <- df_nutr_avg_mo %>% 
  mutate(Month = month(Month, label = TRUE)) %>% 
  complete(nesting(Year, Month), Region, Analyte) %>%
  nest(.by = Analyte) %>% 
  deframe() %>% 
  imap(
    \(x, idx) ggplot(data = x, aes(x = Year, y = Region, fill = Num_samples)) +
      geom_tile() +
      facet_wrap(vars(Month)) +
      ggtitle(idx) +
      scale_y_discrete(drop = FALSE) +
      scale_x_continuous(
        breaks = breaks_pretty(10), 
        expand = expansion(mult = 0.02)
      ) +
      scale_fill_viridis_c(name = "Number of\nSamples", na.value = "transparent") +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
  )

Seasonal Values

Code
df_nutr_avg_seas %>% ggplot(aes(x = YearAdj, y = Region, fill = Num_samples)) +
  geom_tile() +
  facet_grid(
    rows = vars(Analyte), 
    cols = vars(Season), 
    labeller = label_wrap_gen(width = 20)
  ) +
  scale_x_continuous(
    breaks = breaks_pretty(10), 
    expand = expansion(mult = 0.02)
  ) +
  scale_fill_viridis_c(name = "Number of\nSamples") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Integrated Discrete WQ Data from discretewq

Import Data

Code
# Import full integrated data set from discretewq
df_dwq <- wq(
  Sources = c(
    "20mm", "Baystudy", "DJFMP", "DOP", "EDSM", "EMP", "FMWT", "NCRO", "SDO", "SKT", "SLS", "STN",
    "Suisun", "USBR", "USGS_CAWSC", "USGS_SFBS", "YBFMP"
  )
)

Prepare Data

Code
# Define water quality parameters to keep from discretewq
dwq_param <- c(
  "Microcystis", "Temperature", "Salinity", "DissolvedOxygen", "pH", "TurbidityNTU",
  "TurbidityFNU", "Chlorophyll"
)

# Prepare data for aggregation
df_dwq_c1 <- df_dwq %>% 
  select(
    Source, Station, Latitude, Longitude, Date, Datetime, Year, Month, 
    all_of(dwq_param), Chlorophyll_Sign
  ) %>% 
  # Convert Datetime to PST
  mutate(Datetime = with_tz(Datetime, tzone = "Etc/GMT+8")) %>%
  # Remove rows where all parameters are NA
  filter(!if_all(all_of(dwq_param), is.na)) %>% 
  # Remove records without lat-long coordinates
  drop_na(Latitude, Longitude) %>% 
  # Assign SubRegions to the stations
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>%
  st_transform(crs = st_crs(sf_delta)) %>%
  st_join(sf_delta, join = st_intersects) %>%
  # Remove any data outside our subregions of interest
  filter(!is.na(SubRegion)) %>%
  st_drop_geometry() %>% 
  # Remove data prior to 2009 and after 2024
  filter(Year %in% 2009:2024) %>% 
  # Only keep the stations located in Suisun, Montezuma, and Nurse Sloughs from the Suisun Marsh
    # survey
  filter(!(Source == "Suisun" & !str_detect(Station, "^SU|^MZ|^NS")))

# Pivot the parameter columns to long data structure
df_dwq_c2 <- df_dwq_c1 %>% 
  # Add a value suffix to the parameter columns
  rename_with(\(x) paste0(x, "_Result"), all_of(dwq_param)) %>% 
  # Pivot parameters longer, creating a sign and result column for each parameter
  pivot_longer(
    cols = ends_with(c("Sign", "Result")),
    names_to = c("Parameter", ".value"),
    names_pattern = "(.*)_(.*)"
  ) %>% 
  drop_na(Result) %>% 
  replace_na(list(Sign = "="))
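
The names_pattern pivot splits each parameter column name into a Parameter name and a value type, producing one row per parameter with separate Sign and Result columns. A minimal illustration with hypothetical values:

Code
# Minimal illustration of the pivot used above (hypothetical values)
tibble(
  Station = "X",
  Chlorophyll_Sign = "<",
  Chlorophyll_Result = 0.5,
  Temperature_Result = 18.2
) %>% 
  pivot_longer(
    cols = ends_with(c("Sign", "Result")),
    names_to = c("Parameter", ".value"),
    names_pattern = "(.*)_(.*)"
  )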

Consolidate Turbidity parameters and prefer TurbidityNTU when both NTU and FNU were collected.

Code
df_dwq_c3 <- df_dwq_c2 %>% 
  mutate(Parameter2 = if_else(str_detect(Parameter, "^Turbidity"), "Turbidity", Parameter))

df_dwq_turb_dups <- df_dwq_c3 %>% 
  filter(Parameter2 == "Turbidity") %>% 
  add_count(Source, Station, Datetime) %>% 
  filter(n > 1) %>% 
  select(-n)

df_dwq_turb_dups_fixed <- df_dwq_turb_dups %>% filter(Parameter == "TurbidityNTU")

df_dwq_c4 <- df_dwq_c3 %>% 
  anti_join(df_dwq_turb_dups) %>% 
  bind_rows(df_dwq_turb_dups_fixed) %>% 
  select(-Parameter) %>% 
  rename(Parameter = Parameter2)

Remove various duplicated records so that there is only one sample per station-day.

Code
# Clean up the duplicated records from EDSM that share the same Datetime
df_dwq_dt_dups <- df_dwq_c4 %>% 
  add_count(Source, Station, Datetime, Parameter) %>% 
  filter(n > 1) %>% 
  select(-n)

df_dwq_dt_dups_fixed <- df_dwq_dt_dups %>% 
  distinct(Source, Station, Datetime, Parameter, .keep_all = TRUE)

df_dwq_c5 <- df_dwq_c4 %>% 
  anti_join(df_dwq_dt_dups) %>% 
  bind_rows(df_dwq_dt_dups_fixed)
Code
# Filter data so that there is only one sample per station-day by choosing the data point closest 
  # to noon
df_dwq_daily_dups <- df_dwq_c5 %>% 
  add_count(Source, Station, Date, Parameter) %>% 
  filter(n > 1) %>% 
  select(-n)

df_dwq_daily_dups_fixed <- df_dwq_daily_dups %>% 
  # Create variable for time and calculate difference from noon for each data point
  mutate(
    Time = as_hms(Datetime),
    Noon_diff = abs(hms(hours = 12) - Time)
  ) %>%
  group_by(Source, Station, Date, Parameter) %>%
  # Select only 1 data point per Station, Date, and Parameter, choose data closest to noon
  filter(Noon_diff == min(Noon_diff)) %>%
  # When points are equidistant from noon, select earlier point
  filter(Time == min(Time)) %>%
  ungroup() %>% 
  select(-c(Time, Noon_diff))

df_dwq_c6 <- df_dwq_c5 %>% 
  anti_join(df_dwq_daily_dups) %>% 
  bind_rows(df_dwq_daily_dups_fixed)

Remove values that are outside reasonable limits for each parameter.

Code
df_dwq_c7 <- df_dwq_c6 %>% 
  # Temperature value of 2.9 collected on 8/12/2014 in Suisun Marsh
  filter(!(Parameter == "Temperature" & Result < 3)) %>% 
  # Temperature value of 116 collected on 3/23/2017 at USGS-11447650
  filter(!(Parameter == "Temperature" & Result > 35)) %>% 
  # DO values greater than 20 (these are obviously % Saturation)
  filter(!(Parameter == "DissolvedOxygen" & Result > 20)) %>% 
  # pH value of 2.56 collected on 7/2/2014 in the Yolo Bypass
  filter(!(Parameter == "pH" & Result < 3)) %>% 
  # Two Turbidity values greater than 1000 collected in early Sept 2017 in the South Delta
  filter(!(Parameter == "Turbidity" & Result > 1000)) %>% 
  # A few Chlorophyll values less than or equal to zero
  filter(!(Parameter == "Chlorophyll" & Result <= 0))
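
As a quick verification (not part of the pipeline), the remaining value range for each parameter can be summarized to confirm it falls within the limits above.

Code
# Summarize the range of values for each parameter after filtering
df_dwq_c7 %>% 
  summarize(
    Min = min(Result),
    Max = max(Result),
    .by = Parameter
  )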

Aggregate Data

Code
# Calculate Monthly-Regional averages
df_dwq_avg_mo <- df_dwq_c7 %>% 
  # Convert pH values to H+ concentration before averaging
  mutate(Result = if_else(Parameter == "pH", 10^-Result, Result)) %>% 
  # Replace Chlorophyll values below the reporting limit with simulated values
  replace_blw_rl() %>% 
  summarize(
    Result = mean(Result),
    Num_samples = n(),
    .by = c(Year, Month, Region, Parameter)
  )
Code
# Further calculate Seasonal-Regional averages using an adjusted calendar year (December-November),
  # with December of the previous calendar year included with the following year
df_dwq_avg_seas <- df_dwq_avg_mo %>% 
  mutate(YearAdj = if_else(Month == 12, Year + 1, Year)) %>% 
  add_season() %>% 
  summarize(
    Result = mean(Result),
    Num_samples = sum(Num_samples),
    .by = c(YearAdj, Season, Region, Parameter)
  )
Code
# Convert both sets of averages to wide format
ls_dwq_avg_wide <- lst(df_dwq_avg_mo, df_dwq_avg_seas) %>% 
  map(
    \(x) select(x, -Num_samples) %>% 
      pivot_wider(names_from = Parameter, values_from = Result) %>% 
      mutate(
        # Round Microcystis to nearest whole number
        Microcystis = round(Microcystis),
        # Convert average H+ concentration to average pH
        pH = -log10(pH)
      )
  )
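
Averaging pH values directly would bias the result because pH is a log-scale quantity, which is why the values are converted to H+ concentration before averaging and back-transformed afterwards. A small numeric illustration with hypothetical pH values:

Code
# Two hypothetical pH values
ph <- c(7, 9)

# Arithmetic mean of the pH values themselves
mean(ph)  # 8

# Mean calculated on the H+ concentration scale and converted back to pH
-log10(mean(10^-ph))  # approximately 7.3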

Sampling Effort Plots

Monthly Values

Code
ls_dwq_avg_mo_plt <- df_dwq_avg_mo %>% 
  filter(Year %in% 2010:2024) %>% 
  mutate(Month = month(Month, label = TRUE)) %>% 
  complete(nesting(Year, Month), Region, Parameter) %>%
  nest(.by = Parameter) %>% 
  deframe() %>% 
  imap(
    \(x, idx) ggplot(data = x, aes(x = Year, y = Region, fill = Num_samples)) +
      geom_tile() +
      facet_wrap(vars(Month)) +
      ggtitle(idx) +
      scale_y_discrete(drop = FALSE) +
      scale_x_continuous(
        breaks = breaks_pretty(10), 
        expand = expansion(mult = 0.02)
      ) +
      scale_fill_viridis_c(name = "Number of\nSamples", na.value = "transparent") +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
  )

Seasonal Values

Code
df_dwq_avg_seas %>% 
  filter(YearAdj %in% 2010:2024) %>% 
  ggplot(aes(x = YearAdj, y = Region, fill = Num_samples)) +
    geom_tile() +
    facet_grid(rows = vars(Parameter), cols = vars(Season)) +
    scale_x_continuous(
      breaks = breaks_pretty(10), 
      expand = expansion(mult = 0.02)
    ) +
    scale_fill_viridis_c(name = "Number of\nSamples") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

DAYFLOW

Download Data

Download data from the CNRA data portal: https://data.cnra.ca.gov/dataset/dayflow

This code chunk is set to not evaluate upon rendering and is only included for transparency's sake. Set eval to TRUE to run the download code if desired.

Code
# Download data from URL
dayflow_1997_2023 <- read_csv(
  "https://data.cnra.ca.gov/dataset/06ee2016-b138-47d7-9e85-f46fae674536/resource/21c377fe-53b8-4bd6-9e1f-2025221be095/download/dayflow-results-1997-2023.csv"
)
dayflow_2024 <- read_csv(
  "https://data.cnra.ca.gov/dataset/06ee2016-b138-47d7-9e85-f46fae674536/resource/6a7cb172-fb16-480d-9f4f-0322548fee83/download/dayflowcalculations2024.csv"
)

# Save local copies of data
lst(dayflow_1997_2023, dayflow_2024) %>%
  iwalk(\(x, idx) write_csv(x, file = here("data/raw", paste0(idx, ".csv"))))

rm(dayflow_1997_2023, dayflow_2024)

Import Data

Code
# Define file paths of DAYFLOW data
fp_dayflow <- dir_ls(here("data/raw"), regexp = "dayflow.+\\.csv$")

# Import data
ls_dayflow <- fp_dayflow %>% 
  map(read_csv) %>% 
  set_names(\(x) str_extract(x, "(?<=raw/)dayflow.+(?=\\.csv)"))

Prepare Data

Code
# Clean up and combine data
df_dayflow <- ls_dayflow %>% 
  map_at("dayflow_1997_2023", \(x) mutate(x, Date = mdy(Date))) %>% 
  map(\(x) select(x, Date, SAC, SJR, TOT)) %>% 
  list_rbind() %>% 
  mutate(Year = year(Date), Month = month(Date)) %>% 
  # Remove data prior to 2009 and after 2024
  filter(Year %in% 2009:2024)

Aggregate Data

Code
# Calculate monthly totals
df_dayflow_tot_mo <- df_dayflow %>% 
  summarize(
    Sac_Inflow = sum(SAC),
    SJR_Inflow = sum(SJR),
    Total_Inflow = sum(TOT),
    .by = c(Year, Month)
  )
Code
# Further calculate seasonal totals using an adjusted calendar year (December-November), with
  # December of the previous calendar year included with the following year
df_dayflow_tot_seas <- df_dayflow_tot_mo %>% 
  mutate(YearAdj = if_else(Month == 12, Year + 1, Year)) %>% 
  add_season() %>% 
  summarize(across(ends_with("Inflow"), sum), .by = c(YearAdj, Season))

Combine All Data

Code
# Create data frame that contains all possible combinations of year, month, and region
df_yr_mo_reg <- 
  expand_grid(
    Year = 2010:2024,
    Month = 1:12,
    Region = unique(df_regions$Region)
  ) %>% 
  # Remove rows after June 2024
  filter(!(Year == 2024 & Month > 6)) %>% 
  # Create Month-Year column
  mutate(MonthYear = paste(month(Month, label = TRUE), Year, sep = "-"), .before = Year)

# Combine all Monthly-Regional averages (or monthly totals) into one data frame
monthly_values <- 
  list(
    df_yr_mo_reg, 
    ls_nutr_avg_wide$df_nutr_avg_mo, 
    ls_dwq_avg_wide$df_dwq_avg_mo,
    df_dayflow_tot_mo
  ) %>% 
  reduce(left_join)
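
Because reduce(left_join) joins on all columns the data frames share, the monthly combination above is equivalent to joining explicitly on Year, Month, and Region (Year and Month only for the DAYFLOW totals). A sketch of the explicit version:

Code
# Equivalent explicit joins; reduce(left_join) infers these keys automatically
df_yr_mo_reg %>% 
  left_join(ls_nutr_avg_wide$df_nutr_avg_mo, by = join_by(Year, Month, Region)) %>% 
  left_join(ls_dwq_avg_wide$df_dwq_avg_mo, by = join_by(Year, Month, Region)) %>% 
  left_join(df_dayflow_tot_mo, by = join_by(Year, Month))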
Code
# Create data frame that contains all possible combinations of year, season, and region
df_yr_seas_reg <- df_yr_mo_reg %>% 
  add_season() %>% 
  distinct(YearAdj = Year, Season, Region) %>% 
  # Remove Summer 2024 because of low sampling
  filter(!(YearAdj == 2024 & Season == "Summer"))

# Combine all Seasonal-Regional averages (or seasonal totals) into one data frame
seasonal_values <- 
  list(
    df_yr_seas_reg, 
    ls_nutr_avg_wide$df_nutr_avg_seas, 
    ls_dwq_avg_wide$df_dwq_avg_seas,
    df_dayflow_tot_seas
  ) %>% 
  reduce(left_join)

Export Processed Data

Export the monthly and seasonal data both as .csv and .rds files for the models. Also export the Delta SubRegions and Regions shapefile.

Code
ls_final_data <- lst(monthly_values, seasonal_values)
fp_processed <- here("data/processed")

# Export data as csv files
ls_final_data %>% 
  iwalk(\(x, idx) write_csv(x, file = paste0(fp_processed, "/", idx, ".csv")))

# Export data as rds files
ls_final_data %>% 
  iwalk(\(x, idx) saveRDS(x, file = paste0(fp_processed, "/", idx, ".rds")))

# Export Delta SubRegions/Regions shapefile
sf_delta %>% write_sf(here("data/spatial/delta_subregions.shp"))