Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

0.4.1 #360

Merged
merged 21 commits into from
Sep 25, 2024
Merged

0.4.1 #360

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ jobs:
with:
extra-packages: |
any::pkgdown,
any::remotes,
local::.
needs: website

Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,6 @@ slurm_error.log
**/*.sif
/Meta/
/doc/

# interactive targets
targets_start.Rout
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: beethoven
Title: Building an Extensible, rEproducible, Test-driven, Harmonized, Open-source, Versioned, ENsemble model for air quality
Version: 0.4.0
Version: 0.4.1
Authors@R: c(
person("Kyle", "Messier", , "kyle.messier@nih.gov", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9508-9623")),
person("Insang", "Song", role = c("aut", "ctb"), comment = c(ORCID = "0000-0001-8732-3256")),
Expand Down
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@
export(add_time_col)
export(append_predecessors)
export(assign_learner_cv)
export(attach_pred)
export(attach_xy)
export(calc_geos_strict)
export(calc_gmted_direct)
export(calc_narr2)
export(calculate)
export(convert_cv_index_rset)
export(feature_raw_download)
export(fit_base_learner)
export(fit_base_tune)
export(fit_meta_learner)
export(fl_dates)
export(generate_cv_index_sp)
export(generate_cv_index_spt)
export(generate_cv_index_ts)
Expand All @@ -30,16 +34,19 @@ export(post_calc_merge_all)
export(post_calc_merge_features)
export(post_calc_unify_timecols)
export(post_calc_year_expand)
export(pred_colname)
export(predict_meta_learner)
export(process_geos_bulk)
export(process_narr2)
export(read_locs)
export(read_paths)
export(reduce_list)
export(reduce_merge)
export(set_args_calc)
export(set_args_download)
export(set_slurm_resource)
export(set_target_years)
export(split_dates)
export(switch_generate_cv_rset)
export(switch_model)
export(unmarshal_function)
Expand Down Expand Up @@ -102,6 +109,7 @@ importFrom(recipes,recipe)
importFrom(recipes,update_role)
importFrom(rlang,as_name)
importFrom(rlang,inject)
importFrom(rlang,quo_get_expr)
importFrom(rlang,sym)
importFrom(rsample,make_splits)
importFrom(rsample,manual_rset)
Expand Down
61 changes: 50 additions & 11 deletions R/base_learner.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
# Base learners and auxiliary functions for base learners


# nocov start

#' Make sampled subdataframes for base learners
#'
#' Per beethoven resampling strategy, this function selects
Expand Down Expand Up @@ -235,7 +232,23 @@ fit_base_learner <-
args_generate_cv
)
# generate row index
cv_index <- inject_match(switch_generate_cv_rset, args_generate_cv)
# cv_index <- inject_match(switch_generate_cv_rset, args_generate_cv)

# manually replicate switch_generate_cv_rset function
# the formals() argument used in inject_match does not properly
# identify the expected arguments in the switched functions
# identify cv_mode
cv_mode_arg <- match.arg(cv_mode)
target_fun <-
switch(
cv_mode_arg,
spatial = generate_cv_index_sp,
temporal = generate_cv_index_ts,
spatiotemporal = generate_cv_index_spt
)

# generate row index
cv_index <- inject_match(target_fun, args_generate_cv)

# using cv_index, restore rset
base_vfold <-
Expand All @@ -257,6 +270,11 @@ fit_base_learner <-
grid_params <- tune_grid_in[grid_row_idx, ]
} else {
grid_params <- NULL
      # drop mtry from model arguments if using bayesian tuning
      # for xgboost or lightgbm
if (model$engine %in% c("xgboost", "lightgbm")) {
model <- model %>% parsnip::set_args(mtry = NULL)
}
}


Expand Down Expand Up @@ -306,6 +324,8 @@ fit_base_learner <-
#' @importFrom yardstick metric_set rmse mape rsq mae
#' @importFrom parsnip fit
#' @importFrom stats predict
#' @importFrom rlang quo_get_expr
#' @export
fit_base_tune <-
function(
recipe,
Expand Down Expand Up @@ -368,19 +388,33 @@ fit_base_tune <-
control = wf_config
)
}
if (trim_resamples) {
base_wftune$splits <- NA
}
# DEVELOPMENT CHANGE
# mm-0904 Drop base_wftune from return when trim_resamples = TRUE
# due to large data size. 1 iter > 25Gb
# if (trim_resamples) {
# base_wftune$splits <- NA
# }
if (return_best) {
# Select the best hyperparameters
base_wfparam <-
tune::select_best(
base_wftune,
metric = c("rmse", "rsq", "mae")
)

# finalize workflow with the best tuned hyperparameters
base_wfresult <- tune::finalize_workflow(base_wf, base_wfparam)

# DEVELOPMENT CHANGE
# mm-0904 unlist multi-layered hidden units if mlp model
if (model$engine == "brulee" && is.list(grid$hidden_units)) {
base_wfresult$fit$actions$model$spec$args$hidden_units <-
unlist(
rlang::quo_get_expr(
base_wfresult$fit$actions$model$spec$args$hidden_units
)
)
}

# Best-fit model
base_wf_fit_best <- parsnip::fit(base_wfresult, data = data_full)
# Prediction with the best model
Expand All @@ -394,6 +428,11 @@ fit_base_tune <-
best_performance = base_wftune
)
}
# DEVELOPMENT CHANGE
# mm-0904 see above
if (trim_resamples) {
base_wflist <- base_wflist[-3]
}
return(base_wflist)
}

Expand Down Expand Up @@ -467,6 +506,7 @@ assign_learner_cv <-
#' training-test data.frames and a column of labels.
#' @author Insang Song
#' @importFrom rsample make_splits manual_rset
#' @export
convert_cv_index_rset <-
function(
cvindex,
Expand Down Expand Up @@ -743,6 +783,8 @@ generate_cv_index_spt <-
# ref_list contains the index of the group pairs
ref_list <-
Map(c, data_exd_rowid[search_idx], data_exd_colid[search_idx])
} else {
ref_list <- NULL
}
attr(index_cv, "ref_list") <- ref_list
# generate row index for restoring rset
Expand Down Expand Up @@ -949,6 +991,3 @@ switch_generate_cv_rset <-
cvindex <- inject_match(target_fun, list(...))
return(cvindex)
}


# nocov end
35 changes: 16 additions & 19 deletions R/calc_postprocessing.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# nocov start


#' Add Time Column
#'
Expand Down Expand Up @@ -104,13 +102,12 @@ post_calc_convert_time <-
function(
df
) {
df <- data.table::copy(data.table::as.data.table(df))
df <- df[, `:=`(time, as.character(time))]
return(df)
stopifnot("time" %in% names(df))
df1 <- data.table::copy(data.table::as.data.table(df))
df1[, time := as.character(time)]
return(df1)
}

# nocov end


#' Join a data.frame with a year-only date column to
#' that with a full date column
Expand Down Expand Up @@ -278,9 +275,6 @@ post_calc_df_year_expand <- function(
}



# nocov start

#' Merge spatial and spatiotemporal covariate data
#' @keywords Post-calculation
#' @param locs Location. e.g., AQS sites.
Expand Down Expand Up @@ -317,12 +311,16 @@ post_calc_merge_all <-
locs_merged, df_spt,
by = c(locs_id, time_id)
)
# need POSIXt class for amadeus function
locs_merged[[time_id]] <- as.POSIXct(locs_merged[[time_id]])
locs_merged <-
amadeus::calc_temporal_dummies(
locs = locs_merged,
locs_id = locs_id,
year = target_years
)
# reset time as character
locs_merged[[time_id]] <- as.character(locs_merged[[time_id]])
return(locs_merged)
}

Expand Down Expand Up @@ -366,8 +364,6 @@ post_calc_drop_cols <-
return(df)
}

# nocov end


#' Automatic joining by the time and spatial identifiers
#' @description The key assumption is that all data frames will have
Expand Down Expand Up @@ -469,8 +465,6 @@ post_calc_autojoin <-
}


# nocov start

#' Impute missing values and attach lagged features
#' @keywords Post-calculation
#' @note
Expand Down Expand Up @@ -512,8 +506,14 @@ impute_all <-
) {
data.table::setDTthreads(nthreads_dt)
if (is.character(dt)) {
dt <- file.path("output/qs", dt)
dt <- qs::qread(dt)
if (!endsWith(dt, ".qs")) {
stop(
paste0(
"If `dt` points to a file, provide full path to .qs file.\n"
)
)
}
dt <- qs::qread(file.path(dt))
}
dt$time <- as.POSIXct(dt$time)
# remove unnecessary columns
Expand Down Expand Up @@ -757,6 +757,3 @@ append_predecessors <-
return(bound_large)
}
}


# nocov end
Loading
Loading