Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

0.4.1 #360

Merged
merged 21 commits into from
Sep 25, 2024
Merged

0.4.1 #360

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ jobs:
with:
extra-packages: |
any::pkgdown,
any::remotes,
local::.
needs: website

Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,6 @@ slurm_error.log
**/*.sif
/Meta/
/doc/

# interactive targets
targets_start.Rout
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: beethoven
Title: Building an Extensible, rEproducible, Test-driven, Harmonized, Open-source, Versioned, ENsemble model for air quality
Version: 0.4.0
Version: 0.4.1
Authors@R: c(
person("Kyle", "Messier", , "kyle.messier@nih.gov", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9508-9623")),
person("Insang", "Song", role = c("aut", "ctb"), comment = c(ORCID = "0000-0001-8732-3256")),
Expand Down
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@
export(add_time_col)
export(append_predecessors)
export(assign_learner_cv)
export(attach_pred)
export(attach_xy)
export(calc_geos_strict)
export(calc_gmted_direct)
export(calc_narr2)
export(calculate)
export(convert_cv_index_rset)
export(feature_raw_download)
export(fit_base_learner)
export(fit_base_tune)
export(fit_meta_learner)
export(fl_dates)
export(generate_cv_index_sp)
export(generate_cv_index_spt)
export(generate_cv_index_ts)
Expand All @@ -30,16 +34,19 @@ export(post_calc_merge_all)
export(post_calc_merge_features)
export(post_calc_unify_timecols)
export(post_calc_year_expand)
export(pred_colname)
export(predict_meta_learner)
export(process_geos_bulk)
export(process_narr2)
export(read_locs)
export(read_paths)
export(reduce_list)
export(reduce_merge)
export(set_args_calc)
export(set_args_download)
export(set_slurm_resource)
export(set_target_years)
export(split_dates)
export(switch_generate_cv_rset)
export(switch_model)
export(unmarshal_function)
Expand Down Expand Up @@ -102,6 +109,7 @@ importFrom(recipes,recipe)
importFrom(recipes,update_role)
importFrom(rlang,as_name)
importFrom(rlang,inject)
importFrom(rlang,quo_get_expr)
importFrom(rlang,sym)
importFrom(rsample,make_splits)
importFrom(rsample,manual_rset)
Expand Down
61 changes: 50 additions & 11 deletions R/base_learner.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
# Base learners and auxiliary functions for base learners


# nocov start

#' Make sampled subdataframes for base learners
#'
#' Per beethoven resampling strategy, this function selects
Expand Down Expand Up @@ -235,7 +232,23 @@ fit_base_learner <-
args_generate_cv
)
# generate row index
cv_index <- inject_match(switch_generate_cv_rset, args_generate_cv)
# cv_index <- inject_match(switch_generate_cv_rset, args_generate_cv)

# manually replicate switch_generate_cv_rset function
# the formals() argument used in inject_match does not properly
# identify the expected arguments in the switched functions
# identify cv_mode
cv_mode_arg <- match.arg(cv_mode)
target_fun <-
switch(
cv_mode_arg,
spatial = generate_cv_index_sp,
temporal = generate_cv_index_ts,
spatiotemporal = generate_cv_index_spt
)

# generate row index
cv_index <- inject_match(target_fun, args_generate_cv)

# using cv_index, restore rset
base_vfold <-
Expand All @@ -257,6 +270,11 @@ fit_base_learner <-
grid_params <- tune_grid_in[grid_row_idx, ]
} else {
grid_params <- NULL
      # drop mtry from model arguments if using bayesian tuning
      # for xgboost or lightgbm
if (model$engine %in% c("xgboost", "lightgbm")) {
model <- model %>% parsnip::set_args(mtry = NULL)
}
}


Expand Down Expand Up @@ -306,6 +324,8 @@ fit_base_learner <-
#' @importFrom yardstick metric_set rmse mape rsq mae
#' @importFrom parsnip fit
#' @importFrom stats predict
#' @importFrom rlang quo_get_expr
#' @export
fit_base_tune <-
function(
recipe,
Expand Down Expand Up @@ -368,19 +388,33 @@ fit_base_tune <-
control = wf_config
)
}
if (trim_resamples) {
base_wftune$splits <- NA
}
# DEVELOPMENT CHANGE
# mm-0904 Drop base_wftune from return when trim_resamples = TRUE
# due to large data size. 1 iter > 25Gb
# if (trim_resamples) {
# base_wftune$splits <- NA
# }
if (return_best) {
# Select the best hyperparameters
base_wfparam <-
tune::select_best(
base_wftune,
metric = c("rmse", "rsq", "mae")
)

# finalize workflow with the best tuned hyperparameters
base_wfresult <- tune::finalize_workflow(base_wf, base_wfparam)

# DEVELOPMENT CHANGE
# mm-0904 unlist multi-layered hidden units if mlp model
if (model$engine == "brulee" && is.list(grid$hidden_units)) {
base_wfresult$fit$actions$model$spec$args$hidden_units <-
unlist(
rlang::quo_get_expr(
base_wfresult$fit$actions$model$spec$args$hidden_units
)
)
}

# Best-fit model
base_wf_fit_best <- parsnip::fit(base_wfresult, data = data_full)
# Prediction with the best model
Expand All @@ -394,6 +428,11 @@ fit_base_tune <-
best_performance = base_wftune
)
}
# DEVELOPMENT CHANGE
# mm-0904 see above
if (trim_resamples) {
base_wflist <- base_wflist[-3]
}
return(base_wflist)
}

Expand Down Expand Up @@ -467,6 +506,7 @@ assign_learner_cv <-
#' training-test data.frames and a column of labels.
#' @author Insang Song
#' @importFrom rsample make_splits manual_rset
#' @export
convert_cv_index_rset <-
function(
cvindex,
Expand Down Expand Up @@ -743,6 +783,8 @@ generate_cv_index_spt <-
# ref_list contains the index of the group pairs
ref_list <-
Map(c, data_exd_rowid[search_idx], data_exd_colid[search_idx])
} else {
ref_list <- NULL
}
attr(index_cv, "ref_list") <- ref_list
# generate row index for restoring rset
Expand Down Expand Up @@ -949,6 +991,3 @@ switch_generate_cv_rset <-
cvindex <- inject_match(target_fun, list(...))
return(cvindex)
}


# nocov end
35 changes: 16 additions & 19 deletions R/calc_postprocessing.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# nocov start


#' Add Time Column
#'
Expand Down Expand Up @@ -104,13 +102,12 @@ post_calc_convert_time <-
function(
df
) {
df <- data.table::copy(data.table::as.data.table(df))
df <- df[, `:=`(time, as.character(time))]
return(df)
stopifnot("time" %in% names(df))
df1 <- data.table::copy(data.table::as.data.table(df))
df1[, time := as.character(time)]
return(df1)
}

# nocov end


#' Join a data.frame with a year-only date column to
#' that with a full date column
Expand Down Expand Up @@ -278,9 +275,6 @@ post_calc_df_year_expand <- function(
}



# nocov start

#' Merge spatial and spatiotemporal covariate data
#' @keywords Post-calculation
#' @param locs Location. e.g., AQS sites.
Expand Down Expand Up @@ -317,12 +311,16 @@ post_calc_merge_all <-
locs_merged, df_spt,
by = c(locs_id, time_id)
)
# need POSIXt class for amadeus function
locs_merged[[time_id]] <- as.POSIXct(locs_merged[[time_id]])
locs_merged <-
amadeus::calc_temporal_dummies(
locs = locs_merged,
locs_id = locs_id,
year = target_years
)
# reset time as character
locs_merged[[time_id]] <- as.character(locs_merged[[time_id]])
return(locs_merged)
}

Expand Down Expand Up @@ -366,8 +364,6 @@ post_calc_drop_cols <-
return(df)
}

# nocov end


#' Automatic joining by the time and spatial identifiers
#' @description The key assumption is that all data frames will have
Expand Down Expand Up @@ -469,8 +465,6 @@ post_calc_autojoin <-
}


# nocov start

#' Impute missing values and attach lagged features
#' @keywords Post-calculation
#' @note
Expand Down Expand Up @@ -512,8 +506,14 @@ impute_all <-
) {
data.table::setDTthreads(nthreads_dt)
if (is.character(dt)) {
dt <- file.path("output/qs", dt)
dt <- qs::qread(dt)
if (!endsWith(dt, ".qs")) {
stop(
paste0(
"If `dt` points to a file, provide full path to .qs file.\n"
)
)
}
dt <- qs::qread(file.path(dt))
}
dt$time <- as.POSIXct(dt$time)
# remove unnecessary columns
Expand Down Expand Up @@ -757,6 +757,3 @@ append_predecessors <-
return(bound_large)
}
}


# nocov end
Loading
Loading