diff --git a/.Rbuildignore b/.Rbuildignore index 83f372bd..000efbf6 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,4 +1,3 @@ -^vignettes/ ^tests/ ^input/ ^tools/ @@ -18,6 +17,9 @@ input/* inst/targets-wide ^.future/ ^_targets/ +_targets.R +_targets.yaml +\*.Rproj \.out$ \.err$ \.sif$ @@ -25,3 +27,6 @@ inst/targets-wide \.vscode \.qmd$ \.sh$ +R/reserved.R +^doc$ +^Meta$ diff --git a/.gitattributes b/.gitattributes index 79a2ae81..04075830 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,4 @@ *.csv filter=lfs diff=lfs merge=lfs -text *.nc filter=lfs diff=lfs merge=lfs -text - input/** linguist-generated +inst/extdata/aqs_88101_2022.qs filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/check-standard.yaml b/.github/workflows/check-standard.yaml index 9975ad06..dec2daf8 100644 --- a/.github/workflows/check-standard.yaml +++ b/.github/workflows/check-standard.yaml @@ -23,9 +23,19 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::rcmdcheck, any::remotes, any::units, any::rmarkdown + extra-packages: | + any::rcmdcheck + any::remotes + any::units + any::rmarkdown needs: check + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + shell: Rscript {0} + - name: install remote packages run: | Rscript -e 'remotes::install_github(sprintf("NIEHS/%s", c("amadeus", "chopin")), upgrade = FALSE)' diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 57aba397..64bcea54 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -32,9 +32,16 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::pkgdown, local::. + extra-packages: | + any::pkgdown, + local::. 
needs: website + - name: install remote packages + run: | + Rscript -e 'remotes::install_github(sprintf("NIEHS/%s", c("amadeus", "chopin")), upgrade = FALSE)' + shell: bash + - name: Build site run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) shell: Rscript {0} diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 8c9d390d..220709d7 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -24,16 +24,11 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::covr, any::remotes + extra-packages: any::covr, any::DT, any::htmltools needs: coverage - - name: install remote packages - run: | - Rscript -e 'remotes::install_github(sprintf("NIEHS/%s", c("amadeus", "chopin")), upgrade = FALSE)' - shell: bash - - name: Cache C++ and R dependencies - uses: actions/cache@v4 + uses: actions/cache@v2 with: path: | ~/.cache/R @@ -43,66 +38,51 @@ jobs: dependencies-${{ runner.os }}- - name: Test coverage - run: > - Rscript -e - "covd<-covr::coverage_to_list()$totalcoverage; + run: | + Rscript -e "covpack<-covr::package_coverage(install_path='${{ github.workspace }}/cov', clean=FALSE); \ + covr::report(covpack, file = file.path('${{ github.workspace }}', 'chopin-coverage-report.html')); \ + covd<-covr::coverage_to_list(covpack)$totalcoverage; \ write.table(covd[length(covd)], file = '${{ github.workspace }}/local_cov.Rout', row.names = F, col.names = F)" shell: bash + - name: Upload covr report as artifact + uses: actions/upload-artifact@v4 + with: + name: covr-report + path: ${{ github.workspace }}/beethoven-coverage-report.html + + - name: Upload workspace dump as artifact if the test fails + if: ${{ failure() }} + uses: actions/upload-artifact@v4 + with: + name: test-outputs + path: ${{ github.workspace }}/** + - name: Get Values id: get-values shell: bash run: | COV=$(cat ${{ github.workspace }}/local_cov.Rout) - echo "Patch coverage read from local_cov.Rout: $COV" echo "coverage=$COV" >> $GITHUB_OUTPUT - name: Checkout gh-pages uses: actions/checkout@v4 with: ref: gh-pages - - - name: Patch comparison - id: patch-comparison - shell: bash - run: | - cov_patch="${{ steps.get-values.outputs.coverage }}" - if [[ ! -f cov_current.Rout ]]; then - cov_current=0 - else - cov_current=$(cat cov_current.Rout) - fi - echo "Current coverage: $cov_current" - - if (( $(echo "$cov_patch >= $cov_current" | bc -l) )); then - echo "Patch coverage ($cov_patch) is greater than or equal to current coverage ($cov_current)." - echo "cov_update=$cov_patch" >> $GITHUB_OUTPUT - else - echo "Patch coverage ($cov_patch) is less than current coverage ($cov_current)." - exit 1 - fi - - - name: Overwrite cov_current.Rout - if: ${{ github.event_name == 'push' }} - shell: bash - run: | - cov_update="${{ steps.patch-comparison.outputs.cov_update }}" - touch cov_current.Rout - echo "$cov_update" > cov_current.Rout - + - name: Create Badges shell: bash run: | npm i -g badgen-cli export COV=${{ steps.get-values.outputs.coverage }} - COLOR=$(node -p '+process.env.COV >= 95 ? `green` : `yellow`') + COLOR=$(node -p '+process.env.COV >= 95 ? 
`green` : `orange`') mkdir -p badges badgen -j coverage -s $COV% -c $COLOR > badges/coverage.svg - name: Deploy Badges uses: stefanzweifel/git-auto-commit-action@v4 with: - commit_message: "Update badges [skip ci] & cov_current.Rout" + commit_message: "Update badges [skip ci]" branch: gh-pages skip_fetch: true skip_checkout: true @@ -113,4 +93,4 @@ jobs: - name: Checkout Back uses: actions/checkout@v4 with: - ref: ${{ github.ref }} + ref: ${{ github.ref }} diff --git a/.gitignore b/.gitignore index b0084717..86d56b1f 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,5 @@ slurm_error.log # Apptainer images **/*.sif +/Meta/ +/doc/ diff --git a/.lintr b/.lintr index abe816d0..2f6e319f 100644 --- a/.lintr +++ b/.lintr @@ -2,7 +2,7 @@ linters: linters_with_defaults( commented_code_linter = NULL ) exclusions: list( - "tests/testthat/test-download_functions.R", + "tests", "inst", "_targets.R" ) diff --git a/DESCRIPTION b/DESCRIPTION index de0f1442..a1e3affb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: beethoven Title: Building an Extensible, rEproducible, Test-driven, Harmonized, Open-source, Versioned, ENsemble model for air quality -Version: 0.3.5 +Version: 0.4.0 Authors@R: c( person("Kyle", "Messier", , "kyle.messier@nih.gov", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9508-9623")), person("Insang", "Song", role = c("aut", "ctb"), comment = c(ORCID = "0000-0001-8732-3256")), @@ -16,24 +16,23 @@ Authors@R: c( Description: Near Real Time air pollution model results and code produced by the SET group. It is fully tested, versioned, and open source and open access. Depends: R (>= 4.1.0) Imports: + amadeus, dplyr, sf, stats, terra, methods, - BART, data.table, future, targets, tune (>= 1.2.1), rlang, rsample, + spatialsample, tidyr, tidyselect, yardstick, - bonsai, - lightgbm, - vetiver, + torch, workflows, recipes, missRanger, @@ -43,19 +42,25 @@ Imports: anticlust, qs, stringi, + tibble, collapse, tigris, - graphics + graphics, + scatterplot3d Suggests: testthat (>= 3.0.0), covr, withr, - tarchetypes, - tidymodels, - xgboost, - torch, + bonsai, brulee, glmnet, + lightgbm, + scoringRules, + future.mirai, + xgboost (<= 2.0.3.1), + vetiver, + tarchetypes, + tidymodels, knitr, rmarkdown, stars, @@ -64,10 +69,10 @@ Suggests: FNN, ggplot2, doRNG, - scatterplot3d, furrr, quarto Remotes: NIEHS/amadeus, NIEHS/chopin +SystemRequirements: NetCDF4, CUDA (>= 10.0) Encoding: UTF-8 VignetteBuilder: knitr, rmarkdown Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index dbd4115c..a8dc5e4c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,26 +2,27 @@ export(add_time_col) export(append_predecessors) +export(assign_learner_cv) export(attach_xy) export(calc_geos_strict) export(calc_gmted_direct) export(calc_narr2) export(calculate) -export(convert_cv_index_rset) -export(fit_base_brulee) -export(fit_base_elnet) -export(fit_base_lightgbm) -export(fit_base_xgb) -export(generate_cv_index) +export(feature_raw_download) +export(fit_base_learner) +export(fit_meta_learner) +export(generate_cv_index_sp) +export(generate_cv_index_spt) +export(generate_cv_index_ts) export(impute_all) export(inject_calculate) export(inject_geos) export(inject_gmted) +export(inject_match) export(inject_modis_par) +export(inject_nlcd) export(load_modis_files) export(loadargs) -export(meta_learner_fit) -export(meta_learner_predict) export(par_narr) export(post_calc_autojoin) export(post_calc_df_year_expand) @@ -29,18 +30,31 @@ export(post_calc_merge_all) export(post_calc_merge_features) 
export(post_calc_unify_timecols) export(post_calc_year_expand) -export(prepare_cvindex) +export(predict_meta_learner) export(process_geos_bulk) export(process_narr2) export(read_locs) export(read_paths) export(reduce_merge) -export(restore_fit_best) export(set_args_calc) export(set_args_download) export(set_slurm_resource) -export(vis_rset) -import(BART) +export(set_target_years) +export(switch_generate_cv_rset) +export(switch_model) +export(unmarshal_function) +export(vis_spt_rset) +importFrom(amadeus,calc_covariates) +importFrom(amadeus,calc_prepare_locs) +importFrom(amadeus,calc_temporal_dummies) +importFrom(amadeus,calc_worker) +importFrom(amadeus,check_for_null_parameters) +importFrom(amadeus,download_data) +importFrom(amadeus,download_sanitize_path) +importFrom(amadeus,generate_date_sequence) +importFrom(amadeus,process_aqs) +importFrom(amadeus,process_covariates) +importFrom(amadeus,process_gmted_codes) importFrom(anticlust,balanced_clustering) importFrom(collapse,fnth) importFrom(collapse,fvar) @@ -48,7 +62,6 @@ importFrom(collapse,join) importFrom(collapse,replace_inf) importFrom(collapse,replace_na) importFrom(collapse,set_collapse) -importFrom(data.table,.SD) importFrom(data.table,`:=`) importFrom(data.table,as.data.table) importFrom(data.table,copy) @@ -56,24 +69,29 @@ importFrom(data.table,merge.data.table) importFrom(data.table,rbindlist) importFrom(data.table,setDTthreads) importFrom(data.table,setnafill) +importFrom(dplyr,"%>%") importFrom(dplyr,`%>%`) importFrom(dplyr,across) importFrom(dplyr,all_of) -importFrom(dplyr,as_tibble) -importFrom(dplyr,bind_rows) +importFrom(dplyr,full_join) importFrom(dplyr,group_by) +importFrom(dplyr,mutate) +importFrom(dplyr,rowwise) +importFrom(dplyr,slice_sample) importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(future,multicore) -importFrom(future,multisession) importFrom(future,plan) importFrom(future,sequential) importFrom(future,tweak) importFrom(future.apply,future_lapply) importFrom(future.batchtools,batchtools_slurm) +importFrom(graphics,par) +importFrom(methods,getPackageName) importFrom(methods,is) importFrom(missRanger,missRanger) importFrom(parsnip,boost_tree) +importFrom(parsnip,fit) importFrom(parsnip,linear_reg) importFrom(parsnip,mlp) importFrom(parsnip,set_engine) @@ -88,26 +106,48 @@ importFrom(rlang,sym) importFrom(rsample,make_splits) importFrom(rsample,manual_rset) importFrom(rsample,vfold_cv) +importFrom(scatterplot3d,scatterplot3d) importFrom(sf,st_as_sf) importFrom(sf,st_coordinates) +importFrom(spatialsample,spatial_block_cv) +importFrom(stats,dist) importFrom(stats,predict) +importFrom(stats,quantile) importFrom(stats,sd) importFrom(stats,setNames) importFrom(stringi,stri_replace_all_regex) importFrom(targets,tar_resources) importFrom(targets,tar_resources_future) importFrom(terra,crs) +importFrom(terra,describe) importFrom(terra,extract) +importFrom(terra,nlyr) importFrom(terra,rast) +importFrom(terra,sds) +importFrom(terra,set.crs) importFrom(terra,subset) importFrom(terra,time) importFrom(terra,varnames) +importFrom(terra,vect) +importFrom(tibble,tribble) +importFrom(tidyr,pivot_longer) +importFrom(tidyr,pivot_wider) importFrom(tidyselect,all_of) importFrom(tigris,counties) +importFrom(tune,control_bayes) +importFrom(tune,control_grid) importFrom(tune,fit_best) +importFrom(tune,select_best) +importFrom(tune,tune) +importFrom(tune,tune_bayes) importFrom(tune,tune_grid) +importFrom(utils,combn) importFrom(workflows,add_model) importFrom(workflows,add_recipe) +importFrom(workflows,add_variables) 
importFrom(workflows,workflow) +importFrom(yardstick,mae) +importFrom(yardstick,mape) importFrom(yardstick,metric_set) importFrom(yardstick,rmse) +importFrom(yardstick,rsq) diff --git a/R/base_learner.R b/R/base_learner.R new file mode 100644 index 00000000..3c20c0d0 --- /dev/null +++ b/R/base_learner.R @@ -0,0 +1,954 @@ +# Base learners and auxiliary functions for base learners + + +# nocov start + +#' Make sampled subdataframes for base learners +#' +#' Per beethoven resampling strategy, this function selects +#' the predefined number of rows from the input data table and +#' saves the row index in .rowindex field. +#' +#' @keywords Baselearner +#' @param data An object that inherits data.frame. +#' @param n The number of rows to be sampled. +#' @param p The proportion of rows to be used. Default is 0.3. +#' @return The row index of the original data. The name of the original +#' data object is stored in attribute "object_origin". +make_subdata <- + function( + data, + n = NULL, + p = 0.3 + ) { + if (is.null(n) && is.null(p)) { + stop("Please provide either n or p.") + } + nr <- seq_len(nrow(data)) + if (!is.null(n)) { + nsample <- sample(nr, n) + } else { + nsample <- sample(nr, ceiling(nrow(data) * p)) + } + # data <- data[nsample, ] + rowindex <- nsample + data_name <- as.character(substitute(data)) + attr(rowindex, "object_origin") <- data_name[length(data_name)] + return(rowindex) + } + + +#' Define a base learner model based on parsnip and tune +#' @keywords Baselearner +#' @param model_type character(1). Model type to be used. +#' Default is "mlp". Available options are "mlp", "xgb", "lgb", "elnet". +#' @param learn_rate numeric(1). The learning rate for the model. +#' Default is 0.1. +#' @param device character(1). The device to be used for training. +#' Default is "cuda:0". Make sure that your system is equipped +#' with CUDA-enabled graphical processing units. +#' @return A parsnip model object. +#' @importFrom parsnip mlp set_engine set_mode boost_tree linear_reg +#' @importFrom dplyr %>% +#' @export +switch_model <- + function( + model_type = c("mlp", "xgb", "lgb", "elnet"), + learn_rate = 0.1, + device = "cuda:0" + ) { + + switch( + model_type, + mlp = + parsnip::mlp( + hidden_units = parsnip::tune(), + dropout = parsnip::tune(), + epochs = 500, + activation = parsnip::tune(), + learn_rate = parsnip::tune() + ) %>% + parsnip::set_engine("brulee", device = device) %>% + parsnip::set_mode("regression"), + lgb = + parsnip::boost_tree( + mtry = parsnip::tune(), + trees = parsnip::tune(), + learn_rate = parsnip::tune() + ) %>% + parsnip::set_engine("lightgbm", device_type = device) %>% + parsnip::set_mode("regression"), + xgb = + parsnip::boost_tree( + mtry = parsnip::tune(), + trees = parsnip::tune(), + learn_rate = parsnip::tune() + ) %>% + parsnip::set_engine("xgboost", device = device) %>% + parsnip::set_mode("regression"), + elnet = + parsnip::linear_reg( + mixture = parsnip::tune(), + penalty = parsnip::tune() + ) %>% + parsnip::set_engine("glmnet") %>% + parsnip::set_mode("regression") + ) + + } + + + +#' Base learner: tune hyperparameters and retrieve the best model +#' +#' Multilayer perceptron model with different configurations of +#' hidden units, dropout, activation, and learning rate using brulee +#' and tidymodels. With proper settings, users can utilize graphics +#' processing units (GPU) to speed up the training process. +#' +#' LightGBM model is fitted at the defined rate (`r_subsample`) of +#' the input dataset by grid or Bayesian optimization search. 
+#' With proper settings, users can utilize graphics +#' processing units (GPU) to speed up the training process. +#' +#' XGBoost model is fitted at the defined rate (`r_subsample`) of +#' the input dataset by grid or Bayesian optimization search. +#' With proper settings, users can utilize graphics +#' processing units (GPU) to speed up the training process. +#' +#' Elastic net model is fitted at the defined rate (`r_subsample`) of +#' the input dataset by grid search or Bayesian optimization. +#' +#' @keywords Baselearner +#' @note tune package should be 1.2.0 or higher. +#' brulee, xgboost, and lightgbm should be installed with GPU support. +#' Grid search is not activated in this function, regardless of other parts' +#' description. +#' @details +#' * MLP: Hyperparameters `hidden_units`, `dropout`, `activation`, +#' and `learn_rate` are tuned. `With tune_mode = "grid"`, +#' users can modify `learn_rate` explicitly, and other hyperparameters +#' will be predefined (56 combinations per `learn_rate` for mlp). +#' * XGBoost: Hyperparameters `mtry`, `ntrees`, and `learn_rate` are +#' tuned. With `tune_mode = "grid"`, +#' users can modify `learn_rate` explicitly, and other hyperparameters +#' will be predefined (30 combinations per `learn_rate`). +#' * LightGBM: Hyperparameters `mtry`, `ntrees`, and `learn_rate` are +#' tuned. With `tune_mode = "grid"`, +#' users can modify `learn_rate` explicitly, and other hyperparameters +#' will be predefined (30 combinations per `learn_rate`). +#' * Elastic net: Hyperparameters `mixture` and `penalty` are tuned. +#' +#' Tuning is performed based on random grid search (size = 10). +#' @param learner character(1). The base learner to be used. +#' Default is "mlp". Available options are "mlp", "xgb", "lgb", "elnet". +#' @param dt_full The full data table to be used for prediction. +#' @param r_subsample numeric(1). The proportion of rows to be used. +#' @param model The parsnip model object. Preferably generated from +#' `switch_model`. +#' @param folds integer(1). Number of cross-validation folds. +#' If NULL, `cv_mode` should be defined to be used in [rsample::vfold_cv]. +#' @param cv_mode character(1). +#' Cross-validation mode. Default is "spatiotemporal". +#' Available options are "spatiotemporal", "spatial", "temporal". +#' @param args_generate_cv List of arguments to be passed to +#' `switch_generate_cv_rset` function. +#' @param tune_mode character(1). Hyperparameter tuning mode. +#' Default is "grid", "bayes" is acceptable. +#' @param tune_bayes_iter integer(1). The number of iterations for +#' Bayesian optimization. Default is 10. Only used when `tune_mode = "bayes"`. +#' @param tune_grid_in data.frame object that includes the grid for +#' hyperparameter tuning. `tune_grid_size` rows will be randomly picked +#' from this data.frame for grid search. +#' @param tune_grid_size integer(1). The number of grid size for hyperparameter +#' tuning. Default is 10. Only used when `tune_mode = "grid"`. +#' @param learn_rate The learning rate for the model. For branching purpose. +#' Default is 0.1. +#' @param yvar The target variable. +#' @param xvar The predictor variables. +#' @param nthreads integer(1). The number of threads to be used for +#' tuning. Default is 8L. `learner = "elnet"` will utilize the multiple +#' threads in [future::multicore()] plan. +#' @param trim_resamples logical(1). Default is TRUE, which replaces the actual +#' data.frames in splits column of `tune_results` object with NA. +#' @param return_best logical(1). 
If TRUE, the best tuned model is returned. +#' @param ... Additional arguments to be passed. +#' +#' @return The fitted workflow. +#' @importFrom recipes recipe update_role +#' @importFrom dplyr `%>%` +#' @importFrom parsnip mlp set_engine set_mode +#' @importFrom workflows workflow add_recipe add_model +#' @importFrom tune tune_grid fit_best +#' @importFrom tidyselect all_of +#' @importFrom yardstick metric_set rmse +#' @importFrom rsample vfold_cv +#' @export +fit_base_learner <- + function( + learner = c("mlp", "xgb", "lgb", "elnet"), + dt_full, + r_subsample = 0.3, + model = NULL, + folds = 5L, + cv_mode = c("spatiotemporal", "spatial", "temporal"), + args_generate_cv = NULL, + tune_mode = "grid", + tune_bayes_iter = 10L, + tune_grid_in = NULL, + tune_grid_size = 10L, + learn_rate = 0.1, + yvar = "Arithmetic.Mean", + xvar = seq(5, ncol(dt_sample)), + nthreads = 8L, + trim_resamples = FALSE, + return_best = TRUE, + ... + ) { + learner <- match.arg(learner) + tune_mode <- match.arg(tune_mode, c("grid", "bayes")) + cv_mode <- match.arg(cv_mode) + stopifnot("parsnip model must be defined." = !is.null(model)) + + dt_sample_rowidx <- make_subdata(dt_full, p = r_subsample) + dt_sample <- dt_full[dt_sample_rowidx, ] + + # detect model name + model_name <- model$engine + + base_recipe <- + recipes::recipe( + dt_sample[1, ] + ) %>% + # do we want to normalize the predictors? + # if so, an additional definition of truly continuous variables is needed + # recipes::step_normalize(recipes::all_numeric_predictors()) %>% + recipes::update_role(!!xvar) %>% + recipes::update_role(!!yvar, new_role = "outcome") + + if (!is.null(folds)) { + base_vfold <- rsample::vfold_cv(dt_sample, v = folds) + } else { + args_generate_cv <- + c( + list(data = dt_sample, cv_mode = cv_mode), + args_generate_cv + ) + # generate row index + cv_index <- inject_match(switch_generate_cv_rset, args_generate_cv) + + # using cv_index, restore rset + base_vfold <- + convert_cv_index_rset( + cv_index, dt_sample, cv_mode = cv_mode + ) + } + + # generate random grid from hyperparameters + # dials approach is too complicated to implement since + # we already declared tuning hyperparameters with tune(), + # which is not compatible with dials approach to limit + # possible value ranges per hyperparameter. + # model_params <- tune::extract_parameter_set_dials(model) + # grid_params <- dials::grid_random(model_params, size = tune_grid_size) + if (tune_mode == "grid") { + grid_row_idx <- + sample(nrow(tune_grid_in), tune_grid_size, replace = FALSE) + grid_params <- tune_grid_in[grid_row_idx, ] + } else { + grid_params <- NULL + } + + + if (model_name == "glmnet") { + future::plan(future::multicore, workers = nthreads) + } + base_wftune <- + fit_base_tune( + recipe = base_recipe, + model = model, + resample = base_vfold, + tune_mode = tune_mode, + grid = grid_params, + iter_bayes = tune_bayes_iter, + trim_resamples = trim_resamples, + return_best = return_best, + data_full = dt_full + ) + if (model_name == "glmnet") { + future::plan(future::sequential) + } + + return(base_wftune) + } + + +#' Tune base learner +#' @keywords Baselearner internal +#' @param recipe The recipe object. +#' @param model The model object. +#' @param resample The resample object. It is expected to be generated from the +#' subsamples. +#' @param tune_mode character(1). Hyperparameter tuning mode. +#' Default is "bayes", "grid" is acceptable. +#' @param grid The grid object for hyperparameter tuning. +#' @param trim_resamples logical(1). 
Default is TRUE, which replaces the actual +#' data.frames in splits column of `tune_results` object with NA. +#' @param return_best logical(1). If TRUE, the best tuned model is returned. +#' @param data_full The full data frame to be used for prediction. +#' @return List of 3: +#' * `base_prediction`: `data.frame` of the best model prediction. +#' * `base_parameter`: `tune_results` object of the best model. +#' * `best_performance`: `data.frame` of the performance metrics. It +#' includes RMSE, MAPE, R-squared, and MAE for **all** tuned models. +#' @importFrom workflows workflow add_recipe add_model +#' @importFrom tune tune_grid tune_bayes control_grid control_bayes +#' @importFrom yardstick metric_set rmse mape rsq mae +#' @importFrom parsnip fit +#' @importFrom stats predict +fit_base_tune <- + function( + recipe, + model, + resample, + tune_mode = c("bayes", "grid"), + grid = NULL, + iter_bayes = 10L, + trim_resamples = TRUE, + return_best = TRUE, + data_full = NULL + ) { + stopifnot("data_full must be entered." = !is.null(data_full)) + tune_mode <- match.arg(tune_mode) + base_wf <- + workflows::workflow() %>% + workflows::add_recipe(recipe) %>% + workflows::add_model(model) + + if (tune_mode == "grid") { + wf_config <- + tune::control_grid( + verbose = TRUE, + save_pred = FALSE, + save_workflow = TRUE + ) + base_wftune <- + base_wf %>% + tune::tune_grid( + resamples = resample, + grid = grid, + metrics = + yardstick::metric_set( + yardstick::rmse, + yardstick::mape, + yardstick::rsq, + yardstick::mae + ), + control = wf_config + ) + } else { + wf_config <- + tune::control_bayes( + verbose = TRUE, + save_pred = FALSE, + save_workflow = TRUE + ) + base_wftune <- + base_wf %>% + tune::tune_bayes( + resamples = resample, + iter = iter_bayes, + metrics = + yardstick::metric_set( + yardstick::rmse, + yardstick::mae, + yardstick::mape, + yardstick::rsq + ), + control = wf_config + ) + } + if (trim_resamples) { + base_wftune$splits <- NA + } + if (return_best) { + # Select the best hyperparameters + base_wfparam <- + tune::select_best( + base_wftune, + metric = c("rmse", "rsq", "mae") + ) + + # finalize workflow with the best tuned hyperparameters + base_wfresult <- tune::finalize_workflow(base_wf, base_wfparam) + # Best-fit model + base_wf_fit_best <- parsnip::fit(base_wfresult, data = data_full) + # Prediction with the best model + base_wf_pred_best <- + stats::predict(base_wf_fit_best, new_data = data_full) + + base_wflist <- + list( + base_prediction = base_wf_pred_best, + base_parameter = base_wfparam, + best_performance = base_wftune + ) + } + return(base_wflist) + } + + +#' Shuffle cross-validation mode for each learner type +#' @keywords Baselearner +#' @param learner character(1). The base learner to be used. +#' Default is "mlp". Available options are "mlp", "lgb", "elnet". +#' @param cv_mode character(1). The cross-validation mode to be used. +#' Default is "spatiotemporal". Available options are "spatiotemporal", +#' "spatial", "temporal". +#' @param cv_rep integer(1). The number of repetitions for each `cv_mode`. +#' @param num_device integer(1). The number of CUDA devices to be used. +#' Each device will be assigned to each eligible learner (i.e., lgb, mlp). +#' @return A data frame with three columns: learner, cv_mode, and device. 
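A minimal usage sketch of the scheduling output described above (illustrative only, not part of the patch; `cv_rep` and `num_device` are shrunk to keep it small, and the beethoven package is assumed to be loaded): each row of the returned data frame can be paired with a parsnip specification from `switch_model()`.

plan <- assign_learner_cv(
  learner = c("lgb", "mlp", "elnet"),
  cv_mode = c("spatiotemporal", "spatial", "temporal"),
  cv_rep = 2L,
  num_device = 1L
)
# one model specification per scheduled run, on the device assigned to it
specs <- lapply(
  seq_len(nrow(plan)),
  function(i) {
    switch_model(model_type = plan$learner[i], device = plan$device[i])
  }
)
# each spec could then be supplied to fit_base_learner(model = specs[[i]], ...)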
+#' @export +assign_learner_cv <- + function( + learner = c("lgb", "mlp", "elnet"), + cv_mode = c("spatiotemporal", "spatial", "temporal"), + cv_rep = 100L, + num_device = ifelse(torch::cuda_device_count() > 1, 2, 1) + ) { + learner_eligible <- c("lgb", "mlp") + learner <- sort(learner) + learner_eligible_flag <- learner %in% learner_eligible + cuda_devices <- seq_len(sum(learner_eligible_flag)) - 1 + cuda_devices <- sprintf("cuda:%d", cuda_devices) + cuda_devices <- + rep(cuda_devices, ceiling(length(learner) / length(cuda_devices))) + cuda_devices <- cuda_devices[seq_len(sum(learner_eligible_flag))] + cuda_get <- learner_eligible_flag + cuda_get[learner_eligible_flag] <- cuda_devices + cuda_get[!learner_eligible_flag] <- "null" + + learner_l <- split(learner, learner) + learner_l <- mapply( + function(x, y) { + cv_mode_rep <- rep(cv_mode, each = cv_rep) + df <- + data.frame( + learner = rep(x, cv_rep * length(cv_mode)), + cv_mode = + cv_mode_rep[sample(length(cv_mode_rep), length(cv_mode_rep))], + device = y + ) + return(df) + }, + learner_l, cuda_get, SIMPLIFY = FALSE + ) + learner_v <- do.call(rbind, learner_l) + return(learner_v) + } + + +#' Generate manual rset object from spatiotemporal cross-validation indices +#' @keywords Baselearner +#' @param cvindex One of: +#' * integer row indices for `id_out` in a `rset` object. +#' * List of integer row indices stored in elements named `analysis` and +#' `assessment`. +#' @param data data.frame object from which the `cvindex` is used +#' to create `rset` object +#' @param ref_list List of custom reference group indices. +#' Default is `attr(cvindex, "ref_list")`, where it is assumed that `cvindex` +#' contains an `list` attribute named "ref_list". +#' if not NULL, it will be used as a reference instead of max(cvindex). +#' @param cv_mode character(1). Spatiotemporal cross-validation indexing +#' method label. +#' @return rset object of `rsample` package. A tibble with a list column of +#' training-test data.frames and a column of labels. +#' @author Insang Song +#' @importFrom rsample make_splits manual_rset +convert_cv_index_rset <- + function( + cvindex, + data, + ref_list = attr(cvindex, "ref_list"), + cv_mode = c("spatiotemporal", "spatial", "temporal") + ) { + cv_mode <- match.arg(cv_mode) + # if (length(cvindex) != nrow(data)) { + # stop("cvindex length should be equal to nrow(data).") + # } + + len_cvi <- seq_along(cvindex) + + if (is.list(cvindex)) { + list_split_dfs <- Map(function(x) { + rsample::make_splits(x = x, data = data) + }, cvindex) + } else { + if (!is.null(ref_list)) { + list_cvi <- ref_list + len_cvi <- seq_along(list_cvi) + } else { + maxcvi <- max(cvindex) + len_cvi <- seq_len(maxcvi) + list_cvi <- split(len_cvi, len_cvi) + } + list_cvi_rows <- + lapply( + list_cvi, + function(x) { + list(analysis = which(!cvindex %in% x), + assessment = which(cvindex %in% x)) + } + ) + list_split_dfs <- + lapply( + list_cvi_rows, + function(x) { + rsample::make_splits(x = x, data = data) + } + ) + } + + modename <- sprintf("cvfold_%s_%03d", cv_mode, len_cvi) + rset_stcv <- rsample::manual_rset(list_split_dfs, modename) + return(rset_stcv) + } + + +#' Attach XY coordinates to a data frame +#' +#' This function attaches XY coordinates to a data frame based on a spatial +#' object containing the coordinates. It performs a left join operation to +#' match the coordinates with the corresponding locations in the data frame. +#' @keywords Utility +#' @param data_full The full data frame to which XY coordinates will +#' be attached. 
+#' @param data_sf The spatial object containing the XY coordinates. +#' @param locs_id The column name in the spatial object that represents the +#' location identifier. +#' @param time_id The column name in the data frame that represents the time +#' identifier. +#' +#' @return A data frame with the XY coordinates attached. +#' +#' @importFrom sf st_coordinates +#' @importFrom stats setNames +#' @importFrom collapse join +#' @export +attach_xy <- + function( + data_full, + data_sf, + locs_id = "site_id", + time_id = "time" + ) { + data_sfd <- sf::st_coordinates(data_sf) + data_sf <- data_sf[[locs_id]] + data_sfd <- data.frame(site_id = data_sf, data.frame(data_sfd)) + data_sfd <- stats::setNames(data_sfd, c(locs_id, "lon", "lat")) + + # data_full_lean <- data_full[, c(locs_id, time_id), with = FALSE] + data_full_attach <- + collapse::join( + data_full, data_sfd, on = locs_id, how = "left" + ) + return(data_full_attach) + } + + + +#' Generate spatio-temporal cross-validation index with anticlust +#' +#' This function generates a spatio-temporal cross-validation index +#' based on the anticlust package. The function first calculates the +#' spatial clustering index using the [anticlust::balanced_clustering()] +#' function as default, and if `cv_pairs` is provided, it generates rank-based +#' pairs based on the proximity between cluster centroids. +#' `cv_pairs` can be NULL, in which case only the spatial clustering index +#' is generated. `ngroup_init` should be lower than `cv_pairs`, while +#' it imposes a condition that `nrow(data) %% ngroup_init` should be 0 +#' and `cv_pairs` should be less than the number of 2-combinations of +#' `ngroup_init`. Each training set will get 50% overlap +#' with adjacent training sets. "Pairs (combinations)" are selected +#' based on the rank of centroids of `ngroup_init` number of initial +#' clusters, where users have two options. +#' +#' * Mode "1" assigns at least one pair for each +#' initial cluster, meaning that `ngroup_init` pairs are assigned for each +#' initial cluster, then the remaining pairs will be ranked to finalize +#' the `cv_pairs` sets. +#' * Mode "2" will rank the pairwise distances +#' directly, which may ignore some overly large initial clusters for pairing. +#' +#' Of course, mode "2" is faster than mode "1", thus users are advised +#' to use mode "2" when they are sure that the initial clusters are +#' spatially uniformly distributed. +#' @keywords Baselearner +#' @param data data.table with X, Y, and time information. +#' @param target_cols character(3). Names of columns for X, Y, and time. +#' Default is c("lon", "lat", "time"). Order insensitive. +#' @param preprocessing character(1). Preprocessing method for the fields +#' defined in `target_cols`. This serves to homogenize the scale of +#' the data. Default is "none". +#' * "none": no preprocessing. +#' * "normalize": normalize the data. +#' * "standardize": standardize the data. +#' @param ngroup_init integer(1). Initial number of splits for +#' pairing groups. Default is 5L. +#' @param cv_pairs integer(1). Number of pairs for cross-validation. +#' This value will be used to generate a rank-based pairs +#' based on `target_cols` values. +#' @param pairing character(1) Pair selection method. +#' * "1": search the nearest for each cluster then others +#' are selected based on the rank. +#' * "2": rank the pairwise distances directly +#' @param ... Additional arguments to be passed. +#' @note `nrow(data) %% ngroup_init` should be 0. 
This is a required +#' condition for the anticlust::balanced_clustering(). +#' @return List of numeric vectors with balanced cluster numbers and +#' reference lists of assessment set pair numbers in attributes. +#' @author Insang Song +#' @importFrom rsample manual_rset +#' @importFrom anticlust balanced_clustering +#' @importFrom dplyr group_by summarize across ungroup all_of +#' @importFrom stats dist +#' @importFrom utils combn +#' @export +#' @examples +#' library(data.table) +#' data <- data.table( +#' lon = runif(100), +#' lat = runif(100), +#' time = +#' rep( +#' seq.Date(from = as.Date("2021-01-01"), to = as.Date("2021-01-05"), +#' by = "day"), +#' 20 +#' ) +#' ) +#' rset_spt <- +#' generate_cv_index_spt( +#' data, preprocessing = "normalize", +#' ngroup_init = 5L, cv_pairs = 6L +#' ) +#' rset_spt +generate_cv_index_spt <- + function( + data, + target_cols = c("lon", "lat", "time"), + preprocessing = c("none", "normalize", "standardize"), + ngroup_init = 5L, + cv_pairs = NULL, + pairing = c("1", "2"), + ... + ) { + if (length(target_cols) != 3) { + stop("Please provide three target columns.") + } + if (!is.null(cv_pairs)) { + # ngroup_init check + if (ngroup_init >= cv_pairs) { + stop("ngroup_init should be less than cv_pairs.") + } + # 2-combinations of ngroup_init check + if (ncol(utils::combn(seq_len(ngroup_init), 2)) < cv_pairs) { + stop( + paste0( + "cv_pairs cannot be larger than ", + "the number of 2-combinations of ngroup_init." + ) + ) + } + } + # data_orig <- data + data <- data[, target_cols, with = FALSE] + data$time <- as.numeric(data$time) + + # select preprocessing plan + # Yes, normalize/standardize spatiotemporal coordinates + # may make little sense, but it would homogenize the scale of drastically + # different value ranges of the coordinates (i.e., seconds in POSIXct) + data_proc <- + switch( + preprocessing, + none = data, + normalize = (data + abs(apply(data, 2, min))) / + (apply(data, 2, max) + abs(apply(data, 2, min))), + standardize = collapse::fscale(data) + ) + + # !!! ngroup_init should be a divisor of nrow(data_proc) !!! 
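    # For illustration: 100 rows with ngroup_init = 5 yield five balanced
    # clusters of 20 rows each; a row count that is not a multiple of
    # ngroup_init is expected to make anticlust::balanced_clustering() fail,
    # per the condition noted above.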
+ index_cv <- anticlust::balanced_clustering(data_proc, ngroup_init) + cv_index <- NULL + # ref_list <- NULL + if (!is.null(cv_pairs)) { + pairing <- match.arg(pairing) + data_ex <- data_proc + data_ex$cv_index <- index_cv + + data_exs <- data_ex |> + dplyr::group_by(cv_index) |> + dplyr::summarize( + dplyr::across(dplyr::all_of(target_cols), ~mean(as.numeric(.x))) + ) |> + dplyr::ungroup() + + data_exs$cv_index <- NULL + data_exm <- stats::dist(data_exs) + data_exd <- as.vector(data_exm) + data_exmfull <- as.matrix(data_exm) + # index searching in dist matrix out of dist + data_exd_colid <- + unlist(Map(seq_len, seq_len(max(index_cv) - 1))) + # rep(seq_len(max(index_cv) - 1), seq(max(index_cv) - 1, 1, -1)) + data_exd_rowid <- rep(seq(2, max(index_cv)), seq_len(max(index_cv) - 1)) + + if (pairing == "2") { + search_idx <- which(rank(-data_exd) <= cv_pairs) + ref_list <- NULL + } else { + # min rank element index per each cluster centroid + search_each1 <- + apply(data_exmfull, 1, \(x) which.min(replace(x, which.min(x), Inf))) + # sort the index + search_each1sort <- + Map(c, seq_along(search_each1), search_each1) + # keep the distinct pairs + search_each1sort <- + unique(Map(sort, search_each1sort)) + # return(list(data_exd_colid, data_exd_rowid, search_each1sort)) + search_idx_each1 <- + which( + Reduce( + `|`, + Map( + \(x) data_exd_colid %in% x[1] & data_exd_rowid %in% x[2], + search_each1sort + ) + ) + ) + + # replace the nearest pairs' distance to Inf + search_idx_others <- + which(rank(-replace(data_exd, search_idx_each1, Inf)) <= cv_pairs) + # remove the nearest pairs + # sort the distance of the remaining pairs + search_idx_others <- + search_idx_others[1:(cv_pairs - length(search_idx_each1))] + search_idx <- c(search_idx_each1, search_idx_others) + } + + # ref_list contains the index of the group pairs + ref_list <- + Map(c, data_exd_rowid[search_idx], data_exd_colid[search_idx]) + } + attr(index_cv, "ref_list") <- ref_list + # generate row index for restoring rset + # 0.3.9: ref_list is added to an attribute of index_cv + + return(index_cv) + } + + + +# non site-wise; just using temporal information +#' Generate temporal cross-validation index +#' @keywords Baselearner +#' @param data data.table with X, Y, and time information. +#' @param time_col character(1). Field name with time information. +#' @param cv_fold integer(1). Number of cross-validation folds. +#' @param window integer(1). Window size for each fold. +#' Simply meaning overlaps between folds. Unit is +#' the base unit of temporal values stored in `time_col`. +#' Window size is put into `as.difftime` function, then the half of it +#' (if odd, rounded number + 1 is applied) is used for overlaps +#' in the middle folds. +#' @return List of numeric vector with out-of-sample indices. 
+#' @examples +#' data <- data.frame( +#' time = seq.Date(from = as.Date("2021-01-01"), by = "day", length.out = 100), +#' value = rnorm(100) +#' ) +#' rset_ts <- +#' generate_cv_index_ts(data, time_col = "time", cv_fold = 10, window = 14) +#' @importFrom stats quantile +#' @export +generate_cv_index_ts <- + function( + data, + time_col = "time", + cv_fold = 10L, + window = 14L + ) { + tcol <- unlist(data[[time_col]]) + time_vec <- as.POSIXct(sort(unique(tcol))) + # time_range interpretation + time_vec_quantile <- quantile(time_vec, probs = seq(0, 1, 0.1)) + time_vec_quantile <- as.Date(time_vec_quantile) + # define overlaps (half of window size) + tdiff <- as.difftime(window / 2, units = "days") + tdiff_h <- round(tdiff / 2) + 1 + + cv_index <- + lapply( + seq_len(cv_fold), + function(x) { + if (x == 1) { + # don't be confused! in_id is the training set + in_id <- which(tcol > time_vec_quantile[x + 1] - tdiff) + out_id <- which(tcol <= time_vec_quantile[x + 1] + tdiff) + } else if (x == cv_fold) { + # last fold + in_id <- which(tcol <= time_vec_quantile[x] + tdiff) + out_id <- which(tcol > time_vec_quantile[x] - tdiff) + } else { + in_id <- + which( + tcol < time_vec_quantile[x] + tdiff_h | + tcol >= time_vec_quantile[x + 1] - tdiff_h + ) + out_id <- + which( + tcol >= time_vec_quantile[x] - tdiff_h & + tcol <= time_vec_quantile[x + 1] + tdiff_h + ) + } + return(list(analysis = in_id, assessment = out_id)) + } + ) + + return(cv_index) + } + + +#' Prepare spatial and spatiotemporal cross validation sets +#' @keywords Baselearner +#' @param data data.table with X, Y, and time information. +#' @param target_cols character(3). Names of columns for X, Y. +#' Default is `c("lon", "lat")`. It is passed to sf::st_as_sf to +#' subsequently generate spatial cross-validation indices using +#' `spatialsample::spatial_block_cv` and +#' `spatialsample::spatial_clustering_cv`. +#' @param cv_make_fun function(1). Function to generate spatial +#' cross-validation indices. +#' Default is `spatialsample::spatial_block_cv`. +#' @param ... Additional arguments to be passed to `cv_make_fun`. +#' @seealso [`spatialsample::spatial_block_cv`], +#' [`spatialsample::spatial_clustering_cv`], +#' [`spatialsample::spatial_buffer_vfold_cv`] +#' @return A list of numeric vectors with in- and out-of-sample row indices or +#' a numeric vector with out-of-sample indices. +#' @importFrom rlang inject +#' @importFrom sf st_as_sf +#' @importFrom spatialsample spatial_block_cv +#' @importFrom rsample manual_rset +#' @importFrom dplyr %>% slice_sample +#' @importFrom methods getPackageName +#' @export +generate_cv_index_sp <- + function( + data, + target_cols = c("lon", "lat"), + cv_make_fun = spatialsample::spatial_block_cv, + ... + ) { + + data_sf <- sf::st_as_sf(data, coords = target_cols, remove = FALSE) + cv_index <- + rlang::inject( + cv_make_fun( + data_sf, + !!!list(...) + ) + ) + + # retrieve in_id + data_rowid <- seq_len(nrow(data)) + newcv <- data_rowid + if ( + !all( + !is.na(Reduce(c, Map(function(x) is.na(x$out_id), cv_index$splits))) + ) + ) { + newcv <- + lapply( + cv_index$splits, + function(x) list(analysis = x$in_id, assessment = x$out_id) + ) + } else { + cv_index <- lapply(cv_index$splits, function(x) x$in_id) + for (i in seq_along(cv_index)) { + newcv[setdiff(data_rowid, cv_index[[i]])] <- i + } + } + + return(newcv) + } + + + +#' Visualize the spatio-temporal cross-validation index +#' @keywords Baselearner +#' @param rsplit rsample::manual_rset() object. +#' @param cex numeric(1). 
Size of the points in the plot. +#' @param angle numeric(1). Viewing angle of 3D plot. +#' @return None. A plot will be generated. +#' @importFrom scatterplot3d scatterplot3d +#' @importFrom graphics par +#' @export +vis_spt_rset <- + function(rsplit, cex = 0.02, angle = 60) { + nsplit <- nrow(rsplit) + graphics::par(mfrow = c(ceiling(nsplit / 3), 3)) + for (i in seq_len(nsplit)) { + cleared <- rsplit[i, 1][[1]][[1]]$data + cleared$indx <- 0 + cleared$indx[rsplit[i, 1][[1]][[1]]$in_id] <- "In" + cleared$indx[rsplit[i, 1][[1]][[1]]$out_id] <- "Out" + cleared$indx <- factor(cleared$indx) + cleared$time <- as.POSIXct(cleared$time) + scatterplot3d::scatterplot3d( + cleared$lon, cleared$lat, cleared$time, + color = rev(as.integer(cleared$indx) + 1), + cex.symbols = cex, pch = 19, + angle = angle + ) + } + } + + + +#' Choose cross-validation strategy for the base learner +#' @keywords Baselearner +#' @param learner character(1). Learner type. Should be one of: +#' * "spatial": spatial cross-validation. +#' * "temporal": temporal cross-validation. +#' * "spatiotemporal": spatiotemporal cross-validation. +#' @param ... Additional arguments to be passed. +#' @note This function's returned value is used as an input for +#' `fit_base_brulee`, `fit_base_lightgbm`, and `fit_base_elnet`. +#' Learner values can be used as a branching point for the cross-validation +#' strategy. +#' @return [rsample::manual_rset()] output object. +#' @export +switch_generate_cv_rset <- + function( + learner = c("spatial", "temporal", "spatiotemporal"), + ... + ) { + learner <- match.arg(learner) + target_fun <- + switch( + learner, + spatial = generate_cv_index_sp, + temporal = generate_cv_index_ts, + spatiotemporal = generate_cv_index_spt + ) + cvindex <- inject_match(target_fun, list(...)) + return(cvindex) + } + + +# nocov end diff --git a/R/calc_postprocessing.R b/R/calc_postprocessing.R new file mode 100644 index 00000000..1a0b4c49 --- /dev/null +++ b/R/calc_postprocessing.R @@ -0,0 +1,762 @@ +# nocov start + + +#' Add Time Column +#' +#' This function adds a time column to a data frame. +#' +#' @keywords Post-calculation +#' @param df The data frame to which the time column will be added. +#' @param time_value The value to be assigned to the time column. +#' @param time_id The name of the time column (default is "time"). +#' +#' @return The data frame with the added time column. +#' +#' @examples +#' \dontrun{ +#' df <- data.frame(x = 1:5, y = letters[1:5]) +#' add_time_col(df, "2022-01-01") +#' } +#' @export +add_time_col <- function(df, time_value, time_id = "time") { + if (!time_id %in% names(df)) { + df[[time_id]] <- time_value + } + return(df) +} + + + +#' Merge input data.frame objects +#' @param by character. Joining keys. See [`merge`] for details. +#' @param time logical(1). Whether or not include time identifier. +#' Set this `TRUE` will supersede `by` value by appending time identifier. +#' @param ... data.frame objects to merge +#' @return data.table +#' @keywords Post-calculation +#' @importFrom data.table as.data.table +#' @importFrom data.table merge.data.table +#' @export +post_calc_merge_features <- + function( + by = c("site_id"), + time = FALSE, + ... + ) { + ellipsis <- list(...) 
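    # `ellipsis` holds the data.frame objects passed through `...`.
    # In the time-aware branch below, lon/lat columns are dropped and `time`
    # is coerced to character in each table so the merge keys
    # c("site_id", "time") align across inputs.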
+ if (time) { + by <- c("site_id", "time") + ellipsis_clean <- + lapply( + ellipsis, + function(x) { + x <- data.table::as.data.table(x) + col_coords <- grep("(lon|lat)", names(x)) + if (length(col_coords) > 0 && !is.null(col_coords)) { + x <- x[, -col_coords, with = FALSE] + } + x$time <- as.character(x$time) + return(x) + } + ) + } else { + ellipsis_clean <- ellipsis + } + joined <- + Reduce(function(x, y) { + data.table::merge.data.table( + x, y, + by = by, all.x = TRUE, suffixes = c("_Ma", "_Mb") + ) + }, ellipsis_clean) + return(joined) + } + + +#' Change time column name +#' @param df data.frame +#' @param candidates character. Candidate column names. +#' @param replace character. New column name. +#' @return data.frame +#' @keywords Post-calculation +#' @export +post_calc_unify_timecols <- + function( + df, + candidates = c("year"), + replace = "time" + ) { + if (sum(names(df) %in% candidates) > 1) { + stop("More than a candidate is detected in the input.") + } + names(df)[names(df) %in% candidates] <- replace + return(df) + } + + +#' Convert time column to character +#' @keywords Post-calculation +#' @param df data.table +#' @note This function takes preprocessed data.table with +#' a column named `"time"`. +#' @importFrom data.table as.data.table copy +post_calc_convert_time <- + function( + df + ) { + df <- data.table::copy(data.table::as.data.table(df)) + df <- df[, `:=`(time, as.character(time))] + return(df) + } + +# nocov end + + +#' Join a data.frame with a year-only date column to +#' that with a full date column +#' @description The full date column will be converted to a year column +#' as a new column, then the data.frame with the year-only column will +#' be joined. +#' @keywords Post-calculation +#' @param df_year data.frame with a year-only date column +#' @param df_date data.frame with a full date column +#' @param field_year character(1). Year column in `df_year` +#' @param field_date character(1). Date column in `df_date` +#' @param spid character(1). Name of the unique location identifier field. +#' @importFrom methods is +#' @importFrom data.table merge.data.table +#' @importFrom data.table `:=` +#' @return data.frame +post_calc_join_yeardate <- + function( + df_year, + df_date, + field_year = "time", + field_date = "time", + spid = "site_id" + ) { + if (!inherits(df_year, "data.frame") && !inherits(df_date, "data.frame")) { + stop("Both inputs should be data.frame.") + } + + names(df_year)[which(names(df_year) %in% field_year)] <- "year" + df_year$year <- as.character(unlist(df_year$year)) + df_date$year <- as.character(substr(df_date[[field_date]], 1, 4)) + + df_joined <- + data.table::merge.data.table( + df_date, df_year, + by = c(spid, "year"), + all.x = TRUE + ) + + df_joined <- df_joined[, c("year") := NULL] + return(df_joined) + } + + +# 2018~2022, 2017, 2020 +# 2017 ... 2020 ... +# 2017 +#' Map the available raw data years over the given period +#' @description +#' Many raw datasets are periodically updated and the period could +#' be longer than a year. This function maps the available years +#' over the given period. +#' @keywords Post-calculation +#' @param time_start integer(1). Starting year. +#' @param time_end integer(1). Ending year. +#' @param time_unit character(1). Time unit. Default is `"year"`. +#' @param time_available vector. Available years. +#' @return integer vector of length (time_end - time_start + 1). +#' Each element will get the nearest preceeding available year. 
+#' @note +#' The minimum of `time_available` will be filled in front of the first +#' available year when the minimum of `time_available` is greater +#' than `time_start`. +#' @examples +#' \dontrun{ +#' process_calc_year_expand(2018, 2022, "year", c(2017, 2020, 2021)) +#' process_calc_year_expand(2018, 2022, "year", c(2020, 2021)) +#' } +#' @export +post_calc_year_expand <- + function( + time_start = NULL, + time_end = NULL, + time_unit = "year", + time_available = NULL + ) { + time_seq <- seq(time_start, time_end) + time_target_seq <- findInterval(time_seq, time_available) + time_target_seq <- time_available[time_target_seq] + if (min(time_available) > time_start) { + time_target_seq <- + c( + rep(min(time_available), + min(time_available) - time_start), + time_target_seq + ) + } + return(time_target_seq) + } + + + +#' Expand a data frame by year +#' +#' This function expands a data frame by year, creating multiple rows +#' for each year based on the time period specified. +#' @keywords Post-calculation +#' @param df The input data frame. The data frame should have the same +#' number of rows per year, meaning that it assumes this argument is +#' a spatial-only feature data.frame. +#' @param locs_id The column name of the location identifier in the data frame. +#' @param time_field The column name of the time field in the data frame. +#' @param time_start The start of the time period. +#' @param time_end The end of the time period. +#' @param time_unit The unit of time to expand the data frame. Only for record. +#' @param time_available A vector of available time periods. +#' @param ... Placeholders. +#' @note Year expansion rule is to assign the nearest past year +#' in the available years; if there is no past year in the available years, +#' the first available year is rolled back to the start of the time period. +#' @return The expanded data frame with multiple rows for each year. +#' @seealso [`post_calc_year_expand()`] +#' @examples +#' \dontrun{ +#' df <- data.frame(year = c(2010, 2010, 2011, 2012), +#' value = c(1, 2, 3, 4)) +#' df_expanded <- +#' post_calc_df_year_expand(df, locs_id = "site_id", time_field = "year", +#' time_start = 2011, time_end = 2012, +#' time_unit = "year") +#' print(df_expanded) +#' } +#' @importFrom stats sd +#' @export +post_calc_df_year_expand <- function( + df, + locs_id = "site_id", + time_field = "time", + time_start = NULL, + time_end = NULL, + time_unit = "year", + time_available = NULL, + ... 
+) { + time_summary <- table(unlist(df[[time_field]])) + if (length(time_summary) != 1) { + if (stats::sd(time_summary) != 0) { + stop("df should be a data frame with the same number of rows per year") + } + } + # assume that df is the row-bound data frame + if (is.character(df[[time_field]])) { + df[[time_field]] <- as.integer(df[[time_field]]) + } + df_years <- unique(df[[time_field]]) + nlocs <- length(unique(df[[locs_id]])) + year_period <- seq(time_start, time_end) + # assign the time period to the available years + year_assigned <- + post_calc_year_expand(time_start, time_end, time_unit, df_years) + df_years_repeats <- table(year_assigned) + + # repeat data frames + df_expanded <- Map( + function(y) { + df_sub <- df[df[[time_field]] == df_years[y], ] + df_sub <- df_sub[rep(seq_len(nrow(df_sub)), df_years_repeats[y]), ] + return(df_sub) + }, + seq_along(year_assigned) + ) + df_expanded <- do.call(rbind, df_expanded) + df_expanded[[time_field]] <- rep(year_period, each = nlocs) + return(df_expanded) +} + + + +# nocov start + +#' Merge spatial and spatiotemporal covariate data +#' @keywords Post-calculation +#' @param locs Location. e.g., AQS sites. +#' @param locs_id character(1). Location identifier. +#' @param time_id character(1). Location identifier. +#' @param target_years integer. Used to dummify nominal year. +#' @param df_sp data.frame. Spatial-only covariates. +#' @param df_spt data.frame. Spatiotemporal covariates. +#' @note This version assumes the time_id contains Date-like strings. +#' @return data.frame +#' @importFrom data.table merge.data.table +#' @importFrom amadeus calc_temporal_dummies +#' @export +post_calc_merge_all <- + function( + locs, + locs_id, + time_id, + target_years = seq(2018, 2022), + df_sp, + df_spt + ) { + if (methods::is(locs, "sf")) { + locs <- sf::st_drop_geometry(locs) + } + locs$time <- as.character(locs$time) + locs <- data.table::as.data.table(locs) + locs_merged <- + data.table::merge.data.table( + locs, df_sp, by = c(locs_id) + ) + locs_merged <- + data.table::merge.data.table( + locs_merged, df_spt, + by = c(locs_id, time_id) + ) + locs_merged <- + amadeus::calc_temporal_dummies( + locs = locs_merged, + locs_id = locs_id, + year = target_years + ) + return(locs_merged) + } + + +#' Remove columns from a data frame based on regular expression patterns. +#' @keywords Post-calculation +#' +#' This function removes columns from a data frame that match +#' any of the specified +#' regular expression patterns. By default, it removes columns with names that +#' match the patterns "^lon$|^lat$|geoid|year$|description". +#' +#' @param df The input data frame. +#' @param candidates A character vector of regular expression patterns +#' to match against column names. Columns that match any of the patterns +#' will be removed. The default value is +#' "^lon$|^lat$|geoid|year$|description". +#' @param strict logical(1). If `TRUE`, +#' only `c("site_id", "time")` will be kept. +#' @return The modified data frame with the specified columns removed. 
+#' +#' @examples +#' \dontrun{ +#' df <- data.frame(lon = 1:5, lat = 6:10, geoid = 11:15, year = 2010:2014, +#' description = letters[1:5], other = 16:20) +#' post_calc_drop_cols(df) +#' } +post_calc_drop_cols <- + function( + df, + candidates = "(^lon$|^lat$|geoid|year$|description|geometry)", + strict = FALSE + ) { + idx_remove <- + if (!strict) { + grep(candidates, names(df), value = TRUE) + } else { + grep("site_id|time", names(df), value = TRUE, invert = TRUE) + } + df <- df[, -idx_remove, with = FALSE] + return(df) + } + +# nocov end + + +#' Automatic joining by the time and spatial identifiers +#' @description The key assumption is that all data frames will have +#' time field and spatial field and the data should have one of date or year. +#' Whether the input time unit is year or date +#' is determined by the coercion of the **first row value** of the time field +#' into a character with `as.Date()`. This function will fail if it +#' gets year-like string with length 4. +#' +#' @param df_fine The fine-grained data frame. +#' @param df_coarse The coarse-grained data frame. +#' @param field_sp The name of the spatial field in the data frames. +#' @param field_t The name of the time field in the data frames. +#' @param year_start The starting year of the time period. +#' @param year_end The ending year of the time period. +#' @keywords Post-calculation +#' @return A merged data table. +#' @examples +# nolint start +#' \dontrun{ +#' df_fine0 <- data.frame(site_id = c("A", "B", "B", "C"), +#' time = as.Date(c("2022-01-01", "2022-01-02", "2021-12-31", "2021-01-03")), +#' value = c(1, 2, 3, 5)) +#' df_coarse0 <- data.frame(site_id = c("A", "B", "C"), +#' time = c("2022", "2022", "2021"), +#' other_value = c(10, 20, 30)) +#' jdf <- post_calc_autojoin(df_fine0, df_coarse0) +#' print(jdf) +#' } +# nolint end +#' @importFrom data.table merge.data.table +#' @importFrom rlang as_name sym +#' @export +post_calc_autojoin <- + function( + df_fine, + df_coarse, + field_sp = "site_id", + field_t = "time", + year_start = 2018L, + year_end = 2022L + ) { + if (any(grepl("population", names(df_coarse)))) { + df_coarse <- df_coarse[, -c("time"), with = FALSE] + } + common_field <- intersect(names(df_fine), names(df_coarse)) + df_fine <- data.table::as.data.table(df_fine) + df_coarse <- data.table::as.data.table(df_coarse) + df_fine <- post_calc_drop_cols(df_fine) + df_coarse <- post_calc_drop_cols(df_coarse) + # if (length(common_field) > 2) { + # message("The data frames have more than two common fields.") + # message("Trying to remove the redundant common fields...") + # common_field <- intersect(names(df_fine), names(df_coarse)) + # print(common_field) + # common_field <- + # common_field[-which(!common_field %in% c(field_sp, field_t))] + # } + if (length(common_field) == 1) { + print(common_field) + if (common_field == field_sp) { + joined <- data.table::merge.data.table( + df_fine, df_coarse, + by = field_sp, + all.x = TRUE + ) + } + } + if (length(common_field) == 2) { + if (all(common_field %in% c(field_sp, field_t))) { + # t_fine <- try(as.Date(df_fine[[field_t]][1])) + df_fine[[field_t]] <- as.character(df_fine[[field_t]]) + df_coarse[[field_t]] <- as.character(df_coarse[[field_t]]) + t_coarse <- try(as.Date(df_coarse[[field_t]][1])) + if (inherits(t_coarse, "try-error")) { + message( + "The time field includes years. Trying different join strategy." 
+ ) + # derive the available years from the coarsely resolved data + coarse_years <- sort(unique(unlist(as.integer(df_coarse[[field_t]])))) + df_coarse2 <- post_calc_df_year_expand( + df_coarse, + time_start = year_start, + time_end = year_end, + time_available = coarse_years + ) + joined <- + post_calc_join_yeardate(df_coarse2, df_fine, field_t, field_t) + } else { + joined <- data.table::merge.data.table( + df_fine, df_coarse, + by = c(field_sp, field_t), + all.x = TRUE + ) + } + } + } + return(joined) + } + + +# nocov start + +#' Impute missing values and attach lagged features +#' @keywords Post-calculation +#' @note +#' This function performs imputation on a given data table +#' by replacing missing values with imputed values. +#' It follows a series of steps including data cleaning, name cleaning, +#' geoscf column renaming, NDVI 16-day backward filling, +#' zero-variance exclusion, excessive "true zeros" exclusion, +#' and imputation using missRanger. +#' A few points should be discussed to sophisticate the imputation +#' process: exclusion threshold for rates of zero observations, +#' which might lead to significant improvement in the imputation +#' process especially in terms of speed and accuracy. +#' @param dt The input data table to be imputed. +#' @param period The period for lagged features. +#' @param nthreads_dt The number of threads to be used for +#' data.table operations. +#' @param nthreads_collapse The number of threads to be used for +#' collapse operations. +#' @param nthreads_imputation The number of threads to be used for +#' the imputation process. +#' +#' @return The imputed data table with lagged features. +#' +#' @importFrom collapse set_collapse replace_inf replace_na fvar fnth +#' @importFrom data.table setDTthreads setnafill +#' @importFrom qs qread +#' @importFrom stats setNames +#' @importFrom stringi stri_replace_all_regex +#' @importFrom missRanger missRanger +#' @export +impute_all <- + function( + dt, + period, + nthreads_dt = 32L, + nthreads_collapse = 32L, + nthreads_imputation = 32L + ) { + data.table::setDTthreads(nthreads_dt) + if (is.character(dt)) { + dt <- file.path("output/qs", dt) + dt <- qs::qread(dt) + } + dt$time <- as.POSIXct(dt$time) + # remove unnecessary columns + query <- "^(site_id|time)\\.[0-9]+" + dt <- dt[, !grepl(query, names(dt)), with = FALSE] + + # name cleaning + dt <- stats::setNames( + dt, + sub("light_00000", "OTH_HMSWL_0_00000", names(dt)) + ) + dt <- stats::setNames( + dt, + sub("medium_00000", "OTH_HMSWM_0_00000", names(dt)) + ) + dt <- stats::setNames( + dt, + sub("heavy_00000", "OTH_HMSWH_0_00000", names(dt)) + ) + dt <- stats::setNames( + dt, + sub("population_", "POP_SEDAC_0_", names(dt)) + ) + + geoscn <- + "ACET\tGEO_ACETO_0_00000 + ALD2\tGEO_ACETA_0_00000 + ALK4\tGEO_CALKA_0_00000 + BCPI\tGEO_HIBCA_0_00000 + BCPO\tGEO_HOBCA_0_00000 + BENZ\tGEO_BENZE_0_00000 + C2H6\tGEO_ETHTE_0_00000 + C3H8\tGEO_PROPA_0_00000 + CH4\tGEO_METHA_0_00000 + CO\tGEO_CMONO_0_00000 + DST1\tGEO_DUST1_0_00000 + DST2\tGEO_DUST2_0_00000 + DST3\tGEO_DUST3_0_00000 + DST4\tGEO_DUST4_0_00000 + EOH\tGEO_ETHOL_0_00000 + H2O2\tGEO_HYPER_0_00000 + HCHO\tGEO_FORMA_0_00000 + HNO3\tGEO_NITAC_0_00000 + HNO4\tGEO_PERAC_0_00000 + ISOP\tGEO_ISOPR_0_00000 + MACR\tGEO_METHC_0_00000 + MEK\tGEO_MEKET_0_00000 + MVK\tGEO_MVKET_0_00000 + N2O5\tGEO_DIPEN_0_00000 + NH3\tGEO_AMNIA_0_00000 + NH4\tGEO_AMNUM_0_00000 + NIT\tGEO_INNIT_0_00000 + NO\tGEO_NIOXI_0_00000 + NO2\tGEO_NIDIO_0_00000 + NOy\tGEO_NITRO_0_00000 + OCPI\tGEO_HIORG_0_00000 + OCPO\tGEO_HOORG_0_00000 + 
PAN\tGEO_PERNI_0_00000 + PM25_RH35_GCC\tGEO_PM25X_0_00000 + PM25_RH35_GOCART\tGEO_PM25R_0_00000 + PM25bc_RH35_GCC\tGEO_BLCPM_0_00000 + PM25du_RH35_GCC\tGEO_DUSPM_0_00000 + PM25ni_RH35_GCC\tGEO_NITPM_0_00000 + PM25oc_RH35_GCC\tGEO_ORCPM_0_00000 + PM25soa_RH35_GCC\tGEO_SORPM_0_00000 + PM25ss_RH35_GCC\tGEO_SEAPM_0_00000 + PM25su_RH35_GCC\tGEO_SULPM_0_00000 + PRPE\tGEO_CALKE_0_00000 + RCHO\tGEO_CALDH_0_00000 + SALA\tGEO_FSEAS_0_00000 + SALC\tGEO_CSEAS_0_00000 + SO2\tGEO_SULDI_0_00000 + SOAP\tGEO_SOAPR_0_00000 + SOAS\tGEO_SOASI_0_00000 + TOLU\tGEO_TOLUE_0_00000 + XYLE\tGEO_XYLEN_0_00000 + CO_y\tGEO_COVMR_0_00000 + NO2_y\tGEO_NOVMR_0_00000 + O3\tGEO_OZVMR_0_00000 + SO2_y\tGEO_SOVMR_0_00000" + + geoscn <- strsplit(geoscn, "\n") + geoscn <- unlist(geoscn) + geoscn <- strsplit(geoscn, "\t") + geoscn <- do.call(rbind, geoscn) + geoscndf <- as.data.frame(geoscn, stringsAsFactors = FALSE) + colnames(geoscndf) <- c("variable", "code") + geoscndf$variable <- trimws(geoscndf$variable) + + for (i in seq_len(nrow(geoscndf))) { + dt <- + setNames( + dt, + stringi::stri_replace_all_regex( + names(dt), sprintf("%s$", geoscndf$variable[i]), geoscndf$code[i] + ) + ) + } + site_id <- NULL + # NDVI 16-day + # For each site_id, backward filling for 16-day NDVI + # Last Observation Carried Forward is the method used; + # it assumes that the rows are ordered by date + dt <- dt[order(site_id, time), ] + col_ndviv <- grep("MOD_NDVIV_", names(dt)) + dtndviv <- + data.table::setnafill( + dt, type = "nocb", nan = NA, + cols = col_ndviv + ) + + collapse::set_collapse(mask = "manip", nthreads = nthreads_collapse) + + target_replace <- grep("^MOD_", names(dt), invert = TRUE) + dt <- collapse::replace_inf(dtndviv, value = NA, replace.nan = TRUE) + dt <- collapse::replace_na(dt, value = 0, cols = target_replace) + + # zero-variance exclusion + dt_colvars <- collapse::fvar(dt[, 5:ncol(dt), with = FALSE]) + zero_var_fields <- names(dt_colvars[dt_colvars == 0]) + + # Exclude fields with zero variance using data.table + dt <- dt[, (zero_var_fields) := NULL] + + # Store the name of zero variance fields as an attribute of the input object + attr(dt, "zero_var_fields") <- zero_var_fields + + # excluding columns with excessive "true zeros" + # we should have a threshold for the zero rate + # exc_zero <- collapse::fnth(dt[, 5:ncol(dt), with = FALSE], n = 0.9) + # exc_zero <- unname(which(exc_zero == 0)) + 5L + # dt <- dt[, (exc_zero) := NULL] + + # Q: Do we use all other features to impute? -- Yes. + # 32-thread, 10% for tree building, 200 trees, 4 rounds: 11 hours + imputed <- + missRanger::missRanger( + data = dt, + maxiter = 10L, + num.trees = 300L, + num.threads = nthreads_imputation, + mtry = 100L, + sample.fraction = 0.1 + ) + + imputed <- amadeus::calc_temporal_dummies(imputed, "time") + return(imputed) + # lagged features: changing period (period[1] + 1 day) + # period <- as.Date(period) + # period[1] <- period[1] + as.difftime(1, units = "days") + # period <- as.character(period) + # index_lag <- + # sprintf("MET_%s", c("ATSFC", "ACPRC", "PRSFC", "SPHUM", "WNDSP")) + # index_lag <- grep(paste(index_lag, collapse = "|"), names(dt)) + # target_lag <- imputed[, index_lag, with = FALSE] + + # output <- amadeus::calc_lagged(target_lag, period, 1, "site_id") + # return(output) + } + + +#' Append Predecessors +#' +#' This function appends predecessors to an existing object or +#' creates a new object if none exists. +#' +#' @keywords Post-calculation +#' @param path_qs The path where the predecessors will be stored. 
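+#' Based on the function body, new objects are written under this path as
+#' `dt_feat_pm25_<period start>_<period end>_<timestamp>.qs` files.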
+#' @param period_new The new period to be appended.
+#' @param input_new The new input object to be appended.
+#' @param nthreads The number of threads to be used.
+#'
+#' @return If no existing predecessors are found, the function saves
+#' the new input object and returns the name of the saved file.
+#' If existing predecessors are found, the function appends
+#' the new input object to the existing ones and returns the combined object.
+#' @export
+append_predecessors <-
+  function(
+    path_qs = "output/qs",
+    period_new = NULL,
+    input_new = NULL,
+    nthreads = 8L
+  ) {
+    if (is.null(input_new)) {
+      stop("Please provide a valid object.")
+    }
+    if (!dir.exists(path_qs)) {
+      dir.create(path_qs, recursive = TRUE)
+    }
+    input_old <- list.files(path_qs, "*.*.qs$", full.names = TRUE)
+
+    # validate input_old with period_new
+    # if (length(input_old) > 0) {
+    #   periods_old <- do.call(rbind, strsplit(input_old, "_"))
+    #   periods_old <- periods_old[, 4:5]
+    #   periods_old_check <- vapply(
+    #     seq(1, nrow(periods_old)),
+    #     function(i) {
+    #       period_old <- periods_old[i, ]
+    #       period_old <- as.Date(period_old, format = "%Y-%m-%d")
+    #       period_new <- as.Date(period_new, format = "%Y-%m-%d")
+    #       if (period_new[1] < period_old[1] | period_new[2] < period_old[2]) {
+    #         return(FALSE)
+    #       } else {
+    #         return(TRUE)
+    #       }
+    #     },
+    #     logical(1)
+    #   )
+    #   if (!all(periods_old_check)) {
+    #     stop("Results have an overlap period. Please provide a valid period.")
+    #   }
+    # }
+    period_new <- sapply(period_new, as.character)
+    time_create <- gsub("[[:punct:]]|[[:blank:]]", "", Sys.time())
+    name_qs <-
+      sprintf(
+        "dt_feat_pm25_%s_%s_%s.qs",
+        period_new[1], period_new[2], time_create
+      )
+    if (length(input_old) == 0) {
+      qs::qsave(input_new, file = file.path(path_qs, name_qs))
+      return(name_qs)
+    } else {
+      # vv <- list()
+      qs::qsave(input_new, file = file.path(path_qs, name_qs))
+      input_update <- list.files(path_qs, "*.*.qs$", full.names = TRUE)
+      bound_large <-
+        Reduce(
+          function(x, y) {
+            if (inherits(x, "data.frame")) {
+              bound <- rbind(x, qs::qread(y))
+            } else {
+              bound <- rbind(qs::qread(x), qs::qread(y))
+            }
+            return(bound)
+          },
+          input_update
+        )
+      return(bound_large)
+    }
+  }
+
+
+# nocov end
diff --git a/R/calculate.R b/R/calculate.R
new file mode 100644
index 00000000..52d4093b
--- /dev/null
+++ b/R/calculate.R
@@ -0,0 +1,527 @@
+# nocov start
+
+
+#' Process atmospheric composition data by chunks
+#' @keywords Calculation
+#' @description
+#' Returns a `data.table` of daily mean values extracted at the given
+#' locations. `terra::tapp` was removed for performance; a strict
+#' assumption is imposed that there are no missing values.
+#' @param path character. Directory with downloaded netCDF (.nc4) files, or
+#' a vector of netCDF file paths.
+#' @param date character(2). Start and end dates in "YYYY-MM-DD" format.
+#' @param locs Locations to extract.
+#' @param locs_id character(1). Location identifier.
+#' @param ... Arguments passed to [`terra::rast`].
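+#' @examples
+#' # A minimal sketch (not run). The directory below is an assumption;
+#' # point `path` to a folder of downloaded GEOS-CF .nc4 files.
+#' \dontrun{
+#' sites <- data.frame(site_id = "01-001-0001", lon = -86.5, lat = 32.5)
+#' sites <- sf::st_as_sf(sites, coords = c("lon", "lat"), crs = 4326)
+#' calc_geos_strict(
+#'   path = "input/geos/chm_tavg_1hr_g1440x721_v1",
+#'   date = c("2018-01-01", "2018-01-01"),
+#'   locs = sites,
+#'   locs_id = "site_id"
+#' )
+#' }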
+#' @note
+#' Layer names of the intermediate `SpatRaster` objects contain the variable,
+#' pressure level, and date.
+#' Reference duration: about 106 seconds for a one-day summary of all layers.
+#' Subdataset selection relies on hard-coded variable subsets.
+#' @author Mitchell Manware, Insang Song
+#' @return a `data.table` object of daily mean values at the given locations.
+#' @importFrom sf st_as_sf
+#' @importFrom future.apply future_lapply
+#' @importFrom data.table rbindlist
+#' @importFrom future plan multicore sequential
+#' @importFrom amadeus generate_date_sequence
+#' @importFrom terra describe rast time subset crs varnames vect sds extract
+#' @importFrom terra nlyr set.crs
+#' @importFrom dplyr full_join
+#' @export
+calc_geos_strict <-
+  function(path = NULL,
+           date = c("2018-01-01", "2018-01-01"),
+           locs = NULL,
+           locs_id = NULL,
+           ...) {
+    #### directory setup
+    if (length(path) == 1) {
+      if (dir.exists(path)) {
+        # path <- amadeus::download_sanitize_path(path)
+        paths <- list.files(
+          path,
+          pattern = "GEOS-CF.v01.rpl",
+          full.names = TRUE
+        )
+        paths <- paths[grep(
+          ".nc4",
+          paths
+        )]
+      }
+    } else {
+      paths <- path
+    }
+    #### check for variable
+    # amadeus::check_for_null_parameters(mget(ls()))
+    #### identify file paths
+    #### identify dates based on user input
+    dates_of_interest <- amadeus::generate_date_sequence(
+      date[1],
+      date[2],
+      sub_hyphen = TRUE
+    )
+    dates_of_interest_incl <- amadeus::generate_date_sequence(
+      date[1],
+      date[2],
+      sub_hyphen = FALSE
+    )
+    #### subset file paths to only dates of interest
+    data_paths <- unique(
+      grep(
+        paste(
+          dates_of_interest,
+          collapse = "|"
+        ),
+        paths,
+        value = TRUE
+      )
+    )
+
+    #### identify collection
+    collection <- regmatches(
+      data_paths[1],
+      # the pattern accommodates 3-4 characters for the variable name,
+      # 3-4 alphanumerics for the temporal resolution,
+      # 8-9 alphanumerics for the output dimensions
+      # nolint start
+      regexpr(
+        "GEOS-CF.v01.rpl.(aqc|chm)_[[:alpha:]]{3,4}_[[:alnum:]]{3,4}_[[:alnum:]]{8,9}_v[1-9]",
+        data_paths[1]
+      )
+    )
+    cat(
+      paste0(
+        "Identified collection ",
+        collection,
+        ".\n"
+      )
+    )
+    if (length(unique(collection)) > 1) {
+      warning(
+        "Multiple collections detected. Returning data for all collections.\n"
+      )
+    }
+
+    filename_date <- sort(regmatches(
+      data_paths,
+      regexpr(
+        "20[0-9]{2}(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])",
+        data_paths
+      )
+    ))
+    if (any(table(filename_date) < 24)) {
+      warning(
+        "Some dates include less than 24 hours. Check the downloaded files."
+      )
+    }
+    # nolint end
+    # to export locs (pointers are not exportable)
+    locs <- sf::st_as_sf(locs)
+
+    # split filename dates daily
+    filename_date <- as.Date(filename_date, format = "%Y%m%d")
+    filename_date <- filename_date[filename_date %in% dates_of_interest_incl]
+    filename_date_cl <- as.integer(as.factor(filename_date))
+
+    future_inserted <- split(data_paths, filename_date_cl)
+    other_args <- list(...)
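+    # `...` is captured here and injected into terra::rast() below;
+    # nthreads is dropped because terra::rast() has no such argument.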
+ other_args$nthreads <- NULL + data_variables <- terra::describe(data_paths[1], sds = TRUE)$var + + search_variables <- + if (grepl("chm", collection)) { + c("ACET", "ALD2", "ALK4", "BCPI", "BCPO", "BENZ", "C2H6", "C3H8", + "CH4", "CO", "DST1", "DST2", "DST3", "DST4", "EOH", "H2O2", + "HCHO", "HNO3", "HNO4", "ISOP", "MACR", "MEK", "MVK", "N2O5", + "NH3", "NH4", "NIT", "NO", "NO2", "NOy", "OCPI", "OCPO", "PAN", + "PM25_RH35_GCC", "PM25_RH35_GOCART", "PM25bc_RH35_GCC", + "PM25du_RH35_GCC", "PM25ni_RH35_GCC", "PM25oc_RH35_GCC", + "PM25soa_RH35_GCC", "PM25ss_RH35_GCC", "PM25su_RH35_GCC", + "PRPE", "RCHO", "SALA", "SALC", "SO2", "SOAP", "SOAS", "TOLU", "XYLE" + ) + } else { + c("CO", "NO2", "O3", "SO2") + } + + # fs is the hourly file paths per day (each element with N=24) + summary_byvar <- function(x = search_variables, fs) { + rast_in <- rlang::inject(terra::rast(fs, !!!other_args)) + # strongly assume that we take the single day. no need to filter dates + # per variable, + # all files (hourly) are cleaned and processed + sds_proc <- + lapply( + x, + function(v) { + rast_inidx <- grep(v, data_variables) + #rast_in <- mean(rast_in[[rast_inidx]]) + rast_summary <- terra::mean(rast_in[[rast_inidx]]) + rtin <- as.Date(terra::time(rast_in)) + rtin_u <- unique(rtin) + cat(sprintf("Processing %s, date: %s\n", v, rtin_u)) + # rast_summary <- vector("list", length = length(rtin_u)) + # for (d in seq_along(rtin_u)) { + # rast_d <- rast_in[[rtin == rtin_u[d]]] + # rast_summary[[d]] <- mean(rast_d) + # } + # rast_summary <- do.call(c, rast_summary) + + # the next line is deprecated + # rast_summary <- terra::tapp(rast_in, index = "days", fun = "mean") + terra::time(rast_summary) <- rtin_u + names(rast_summary) <- + paste0( + rep(gsub("_lev=.*", "", v), terra::nlyr(rast_summary)) + ) + terra::set.crs(rast_summary, "EPSG:4326") + return(rast_summary) + } + ) + sds_proc <- terra::sds(sds_proc) + + locstr <- terra::vect(locs) + rast_ext <- terra::extract(sds_proc, locstr, ID = TRUE) + # rast_ext <- lapply(rast_ext, + # function(df) { + # df$ID <- unlist(locs[[locs_id]]) + # return(df) + # } + # ) + rast_ext <- + Reduce(function(dfa, dfb) dplyr::full_join(dfa, dfb, by = "ID"), + rast_ext + ) + rast_ext$time <- unique(as.Date(terra::time(rast_in))) + rast_ext$ID <- unlist(locs[[locs_id]])[rast_ext$ID] + names(rast_ext)[names(rast_ext) == "ID"] <- locs_id + return(rast_ext) + + } + future::plan(future::multicore, workers = 10) + rast_summary <- + future.apply::future_lapply( + future_inserted, + function(fs) summary_byvar(fs = fs) + ) + future::plan(future::sequential) + rast_summary <- data.table::rbindlist(rast_summary) + + return(rast_summary) + + } + + +#' Reflown gmted processing +#' @keywords Calculation +#' @param variable character(2). Statistic and resolution. +#' @param path character(1). Directory with downloaded GMTED files. +#' @param locs data.frame/SpatVector/sf. Locations. +#' @param locs_id character(1). Location identifier. +#' @param win numeric(4). Window for the raster. +#' @param radius numeric(1). Radius for the extraction. +#' @param fun character(1). Function to apply. +#' @param ... Additional parameters to be passed to other functions. +#' @return A data.frame containing the extracted GMTED data. 
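+#' @examples
+#' # A hedged sketch (not run). `path` must contain the unzipped GMTED
+#' # "*_grd" folders; the directory shown here is an assumption.
+#' \dontrun{
+#' sites <- data.frame(site_id = "01-001-0001", lon = -86.5, lat = 32.5)
+#' calc_gmted_direct(
+#'   variable = c("Breakline Emphasis", "7.5 arc-seconds"),
+#'   path = "input/gmted/data_files",
+#'   locs = sites,
+#'   locs_id = "site_id",
+#'   radius = 0
+#' )
+#' }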
+#' @importFrom terra rast varnames extract +#' @importFrom amadeus process_gmted_codes calc_prepare_locs calc_worker +#' @importFrom amadeus download_sanitize_path +#' @export +calc_gmted_direct <- function( + variable = NULL, + path = NULL, + locs = NULL, + locs_id = NULL, + win = c(-126, -62, 22, 52), + radius = 0, + fun = "mean", + ...) { + #### directory setup + path <- amadeus::download_sanitize_path(path) + #### check for length of variable + if (!(length(variable) == 2)) { + stop( + paste0( + "Please provide a vector with the statistic and resolution.\n" + ) + ) + } + #### identify statistic and resolution + statistic <- variable[1] + statistic_code <- amadeus::process_gmted_codes( + statistic, + statistic = TRUE, + invert = FALSE + ) + resolution <- variable[2] + resolution_code <- amadeus::process_gmted_codes( + resolution, + resolution = TRUE, + invert = FALSE + ) + cat(paste0( + "Cleaning ", + statistic, + " data at ", + resolution, + " resolution.\n" + )) + statistic_from <- c( + "Breakline Emphasis", "Systematic Subsample", + "Median Statistic", "Minimum Statistic", + "Mean Statistic", "Maximum Statistic", + "Standard Deviation Statistic" + ) + statistic_to <- c( + "BRKL", "SSUB", "MEDN", "MINI", "MEAN", "MAXL", "STDV" + ) + statistic_to <- + sprintf("LDU_E%s", statistic_to[match(statistic, statistic_from)]) + + #### identify file path + paths <- list.dirs( + path, + full.names = TRUE + ) + data_path <- + grep( + sprintf( + "%s%s_grd", + statistic_code, + as.character(resolution_code) + ), + paths, value = TRUE + ) + + #### import data + data <- terra::rast(data_path, win = win) + #### layer name + names(data) <- paste0( + "elevation_", + gsub( + "_grd", + "", + names(data) + ) + ) + #### varnames + terra::varnames(data) <- paste0( + "Elevation: ", + statistic, + " (", + resolution, + ")" + ) + from <- data + #return(from) + #### prepare locations list + sites_list <- amadeus::calc_prepare_locs( + from = from, + locs = locs, + locs_id = locs_id, + radius = radius + ) + sites_e <- sites_list[[1]] + sites_id <- sites_list[[2]] + #### perform extraction + sites_extracted <- amadeus::calc_worker( + dataset = "gmted", + from = from, + locs_vector = sites_e, + locs_df = sites_id, + radius = radius, + fun = fun, + variable = 2, + time = NULL, + time_type = "timeless" + ) + #### convert integer to numeric + sites_extracted[, 2] <- as.numeric(sites_extracted[, 2]) + #### define column names + colnames(sites_extracted) <- c( + locs_id, + paste0( + statistic_to, "_", sprintf("%05d", radius) + ) + ) + #### return data.frame + return(data.frame(sites_extracted)) +} + + + +#' Calculate aggregated values for specified locations +#' +#' This function calculates aggregated values for specified locations from +#' a raster dataset. +#' +#' @keywords Calculation +#' @param from The raster dataset from which to extract values. +#' @param locs A data frame containing the locations for which +#' to calculate aggregated values. +#' It should have a column in `locs_id` value +#' that contains unique identifiers for each location. +#' @param locs_id An optional column name +#' in the \code{locs} data frame that contains additional location +#' identifiers. +#' @param radius The radius within which to include neighboring locations +#' for aggregation. Default is 0. +#' @param fun The aggregation function to use. +#' It can be a character string specifying a function name +#' (e.g., "mean", "sum"), +#' or it can be a custom function. Default is "mean". +#' @param ... 
Additional arguments to be passed to +#' the aggregation function. +#' +#' @return A data frame containing the aggregated values for each +#' location and time point. +#' @importFrom amadeus calc_prepare_locs calc_worker +#' @importFrom terra time +#' @importFrom tidyr pivot_longer pivot_wider +#' @importFrom dplyr rowwise mutate ungroup +#' @importFrom data.table as.data.table +#' @export +calc_narr2 <- function( + from, + locs, + locs_id = NULL, + radius = 0, + fun = "mean", + ... +) { + # + name <- geometry <- value <- NULL + ### prepare locations list + sites_list <- amadeus::calc_prepare_locs( + from = from, + locs = locs[, "site_id"], + locs_id = locs_id, + radius = radius + ) + sites_e <- sites_list[[1]] + # sites_id <- sites_list[[2]] + #### identify pressure level or monolevel data + time_from <- terra::time(from) + timetab <- table(time_from) + if (!all(timetab == 1)) { + time_split <- + split(time_from, + #ceiling(seq_along(time_from) / 29L)) + ceiling(as.integer(as.factor(time_from)) / 14L)) + sites_extracted <- Map( + function(day) { + cat(sprintf("Processing %s...\n", paste(day[1], "-", day[length(day)]))) + from_day <- from[[time_from %in% day]] + sites_extracted_day <- terra::extract( + from_day, + sites_e, + bind = TRUE + ) + sites_extracted_day <- data.frame(sites_extracted_day) + if ("geometry" %in% names(sites_extracted_day)) { + sites_extracted_day <- sites_extracted_day |> + dplyr::select(-geometry) + } + return(sites_extracted_day) + }, + time_split + ) + sites_extracted <- reduce_merge(sites_extracted, by = c("site_id")) + } else { + sites_extracted <- + terra::extract( + from, + sites_e, + bind = TRUE + ) + sites_extracted <- as.data.frame(sites_extracted) + if ("geometry" %in% names(sites_extracted)) { + sites_extracted <- sites_extracted |> + dplyr::select(-geometry) + } + } + sites_extracted <- + sites_extracted |> + tidyr::pivot_longer(cols = tidyselect::starts_with("MET_")) |> + dplyr::rowwise() |> + dplyr::mutate( + time = + regmatches( + name, + regexpr( + "20[0-9]{2,2}[0-1][0-9][0-3][0-9]", + name + ) + ) + ) |> + dplyr::mutate( + name = sub(paste0("_", time), "", name) + ) |> + dplyr::ungroup() |> + dplyr::mutate( + time = as.character(as.Date(time, format = "%Y%m%d")) + ) |> + tidyr::pivot_wider( + names_from = name, + values_from = value, + id_cols = c("site_id", "time") + ) + sites_extracted <- data.table::as.data.table(sites_extracted) + names(sites_extracted)[-1:-2] <- + sprintf("%s_%05d", names(sites_extracted)[-1:-2], radius) + + #### return data.frame + return(sites_extracted) +} + + + + +#' Parallelize NARR feature calculation +#' +#' This function parallelizes the processing and calculation of +#' NARR data for multiple domains. +#' @keywords Calculation +#' @param domain A character vector specifying the domains to process. +#' @param path A character vector specifying the path to the NARR data. +#' @param date A character vector specifying the date of the +#' NARR data to process. +#' @param locs A data frame specifying the locations to calculate NARR data for. +#' @param nthreads An integer specifying the number of threads +#' to use for parallel processing. Default is 24. +#' +#' @return A list of results from the parallel processing. 
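+#' @examples
+#' # A hedged sketch (not run). The NARR path and the "air.sfc" domain
+#' # mirror defaults used elsewhere in this package; both are assumptions.
+#' \dontrun{
+#' sites <- data.frame(site_id = "01-001-0001", lon = -86.5, lat = 32.5)
+#' par_narr(
+#'   domain = "air.sfc",
+#'   path = "input/narr",
+#'   date = c("2018-01-01", "2018-01-02"),
+#'   locs = sites,
+#'   nthreads = 2L
+#' )
+#' }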
+#' @importFrom future plan multicore sequential +#' @importFrom future.apply future_lapply +#' @export +par_narr <- function(domain, path, date, locs, nthreads = 24L) { + + if (!dir.exists(path)) { + stop("The specified path does not exist.") + } + future::plan(future::multicore, workers = nthreads) + + res <- + future.apply::future_lapply( + domain, + function(x) { + from <- process_narr2( + path = path, + variable = x, + date = date + ) + calc_narr2( + from = from, + locs = locs, + locs_id = "site_id" + ) + }, + future.seed = TRUE + ) + future::plan(future::sequential) + return(res) + +} + + +# nocov end diff --git a/R/init_arguments.R b/R/init_arguments.R new file mode 100644 index 00000000..b6022032 --- /dev/null +++ b/R/init_arguments.R @@ -0,0 +1,481 @@ + +#' Set arguments for the calculation process +#' +#' This function sets the arguments for the calculation process. +#' It takes several parameters including site ID, time ID, time period, +#' extent, user email, export path, and input path. +#' @keywords Utility +#' @param char_siteid Character string specifying the site ID. +#' Default is "site_id". +#' @param char_timeid Character string specifying the time ID. +#' Default is "time". +#' @param char_period Character vector specifying the time period. +#' Default is c("2018-01-01", "2022-10-31"). +#' @param num_extent Numeric vector specifying the extent. +#' Default is c(-126, -62, 22, 52). +#' @param char_user_email Character string specifying the user email. +#' Default is the current user's email with nih.gov domain. +#' @param export logical(1). If TRUE, the list for the calculation process +#' is exported to `path_export`. Default is FALSE. +#' @param path_export Character string specifying the export path. +#' Default is "inst/targets/calc_spec.qs". +#' @param char_input_dir Character string specifying the input path. +#' Default is "input". +#' @param nthreads_nasa integer(1). Number of threads for NASA data. +#' Default is 14L. +#' @param nthreads_tri integer(1). Number of threads for TRI data. +#' Default is 5L. +#' @param nthreads_geoscf integer(1). Number of threads for GEOSCF data. +#' Default is 10L. +#' @param nthreads_gmted integer(1). Number of threads for GMTED data. +#' Default is 4L. +#' @param nthreads_narr integer(1). Number of threads for NARR data. +#' Default is 24L. +#' @param nthreads_groads integer(1). Number of threads for GROADS data. +#' Default is 3L. +#' @param nthreads_population integer(1). Number of threads for population data. +#' Default is 3L. +#' @param nthreads_append integer(1). Number of threads for appending data. +#' Default is 8L. +#' @param nthreads_impute integer(1). Number of threads for imputing data. +#' Default is 64L. +#' +#' @note +#' The number of threads used is fixed as 1L +#' otherwise specified in `nthreads_*` arguments. +#' path_input should contain the following subdirectories: +#' - modis/raw/61/MOD11A1 +#' - modis/raw/61/MOD06_L2 +#' - modis/raw/61/MOD09GA +#' - modis/raw/61/MCD19A2 +#' - modis/raw/61/MOD13A2 +#' - modis/raw/5000/VNP46A2 +#' - aqs +#' - nlcd +#' - geos/aqc_tavg_1hr_g1440x721_v1 +#' - geos/chm_tavg_1hr_g1440x721_v1 +#' - HMS_Smoke/data +#' - gmted +#' - nei +#' - narr +#' - HMS_Smoke +#' - koppen_geiger +#' - ecoregions +#' - sedac_groads +#' - sedac_population +#' +#' @return A list of arguments for common use +#' in the calculation process. A *.qs or *.rds file defined in +#' `path_export` is saved if `export` is TRUE. +#' +#' * char_siteid: Character string specifying the site ID. 
+#' * char_timeid: Character string specifying the time ID. +#' * char_period: Character vector specifying the time period. +#' * num_extent: Numeric vector specifying the extent. +#' * char_user_email: Character string specifying the user email. +#' * char_input_dir: Character string specifying the input path. +#' * nthreads_nasa: Number of threads for NASA data. +#' * nthreads_tri: Number of threads for TRI data. +#' * nthreads_geoscf: Number of threads for GEOS-CF data. +#' * nthreads_gmted: Number of threads for GMTED data. +#' * nthreads_narr: Number of threads for NARR data. +#' * nthreads_groads: Number of threads for SEDAC Groads data. +#' * nthreads_population: Number of threads for population data. +#' * nthreads_append: Number of threads for appending data. +#' * nthreads_impute: Number of threads for imputing data. +#' @author Insang Song +#' @importFrom qs qsave +#' @importFrom tibble tribble +#' @export +# nolint start +set_args_calc <- + function( + char_siteid = "site_id", + char_timeid = "time", + char_period = c("2018-01-01", "2022-10-31"), + num_extent = c(-126, -62, 22, 52), + char_user_email = paste0(Sys.getenv("USER"), "@nih.gov"), + export = FALSE, + path_export = "inst/targets/calc_spec.qs", + char_input_dir = "input", + nthreads_nasa = 14L, + nthreads_tri = 5L, + nthreads_geoscf = 10L, + nthreads_gmted = 4L, + nthreads_narr = 24L, + nthreads_groads = 3L, + nthreads_population = 3L, + nthreads_append = 8L, + nthreads_impute = 64L + ) { + list_common <- + list( + char_siteid = char_siteid, + char_timeid = char_timeid, + char_period = char_period, + extent = num_extent, + char_user_email = char_user_email, + char_input_dir = char_input_dir, + nthreads_nasa = nthreads_nasa, + nthreads_tri = nthreads_tri, + nthreads_geoscf = nthreads_geoscf, + nthreads_gmted = nthreads_gmted, + nthreads_narr = nthreads_narr, + nthreads_groads = nthreads_groads, + nthreads_population = nthreads_population, + nthreads_append = nthreads_append, + nthreads_impute = nthreads_impute + ) + ain <- function(x, append = FALSE) { + if (append) { + file.path(char_input_dir, x, "data_files") + } else { + file.path(char_input_dir, x) + } + } + if (export) { + list_paths <- + list( + mod11 = load_modis_files(ain("modis/raw/61/MOD11A1"), date = list_common$char_period), + mod06 = load_modis_files(ain("modis/raw/61/MOD06_L2"), date = list_common$char_period), + mod09 = load_modis_files(ain("modis/raw/61/MOD09GA"), date = list_common$char_period), + mcd19 = load_modis_files(ain("modis/raw/61/MCD19A2"), date = list_common$char_period), + mod13 = load_modis_files(ain("modis/raw/61/MOD13A2"), date = list_common$char_period), + viirs = load_modis_files(ain("modis/raw/5000/VNP46A2"), "h5$", date = list_common$char_period) + ) + + list_calcspec <- + list( + aqs = list(path = ain("aqs", TRUE), + date = list_common$char_period), + mod11 = list(from = list_paths$mod11, + name_covariates = sprintf("MOD_SFCT%s_0_", c("D", "N")), + subdataset = "^LST_", + nthreads = nthreads_nasa, + radius = c(1e3, 1e4, 5e4)), + mod06 = list(from = list_paths$mod06, + name_covariates = sprintf("MOD_CLCV%s_0_", c("D", "N")), + subdataset = c("Cloud_Fraction_Day", "Cloud_Fraction_Night"), + nthreads = nthreads_nasa, + preprocess = "amadeus::process_modis_swath", + radius = c(1e3, 1e4, 5e4)), + mod09 = list(from = list_paths$mod09, + name_covariates = sprintf("MOD_SFCRF_%d_", seq(1, 7)), + subdataset = "^sur_refl_", + nthreads = nthreads_nasa, + radius = c(1e3, 1e4, 5e4)), + mcd19_1km = list(from = list_paths$mcd19, + name_covariates = 
sprintf("MOD_AD%dTA_0_", c(4, 5)), + subdataset = "^Optical_Depth", + nthreads = nthreads_nasa, + radius = c(1e3, 1e4, 5e4)), + mcd19_5km = list(from = list_paths$mcd19, + name_covariates = sprintf("MOD_%sAN_0_", c("CSZ", "CVZ", "RAZ", "SCT", "GLN")), + subdataset = "cos|RelAZ|Angle", + nthreads = nthreads_nasa, + radius = c(1e3, 1e4, 5e4)), + mod13 = list(from = list_paths$mod13, + name_covariates = "MOD_NDVIV_0_", + subdataset = "(NDVI)", + nthreads = nthreads_nasa, + radius = c(1e3, 1e4, 5e4)), + viirs = list(from = list_paths$viirs, + name_covariates = "MOD_LGHTN_0_", + subdataset = 3, + nthreads = nthreads_nasa, + preprocess = "amadeus::process_blackmarble", + radius = c(1e3, 1e4, 5e4)), + geoscf_aqc = list(date = list_common$char_period, + path = ain("geos/aqc_tavg_1hr_g1440x721_v1"), + nthreads = nthreads_geoscf), + geoscf_chm = list(date = list_common$char_period, + path = ain("geos/chm_tavg_1hr_g1440x721_v1"), + nthreads = nthreads_geoscf), + # base class covariates start here + hms = list(path = ain("HMS_Smoke", TRUE), + date = list_common$char_period, + covariate = "hms" + ), + gmted = list( + path = ain("gmted", TRUE), + covariate = "gmted" + ), + nei = list( + domain = c(2017, 2020), + domain_name = "year", + path = ain("nei", TRUE), + covariate = "nei" + ), + tri = list( + domain = seq(2018, 2022), + domain_name = "year", + path = ain("tri"), + radius = c(1e3, 1e4, 5e4), + covariate = "tri", + nthreads = nthreads_tri + ), + nlcd = list( + domain = c(2019, 2021), + domain_name = "year", + path = ain("nlcd", TRUE), + covariate = "nlcd", + mode = "exact", + extent = NULL, + radius = c(1e3, 1e4, 5e4), + max_cells = 1e8, + nthreads = 6L + ), + koppen = list(path = ain("koppen_geiger/data_files/Beck_KG_V1_present_0p0083.tif"), + covariate = "koppen", + nthreads = 1L), + ecoregions = list(path = ain("ecoregions/data_files/us_eco_l3_state_boundaries.shp"), + covariate = "ecoregions", + nthreads = 1L), + narr = list( + path = ain("narr"), + covariate = "narr", + domain_reduced = c("air.sfc", "albedo", "apcp", "dswrf", "evap", "hcdc", + "hpbl", "lcdc", "lhtfl", "mcdc", "omega", "pr_wtr", + "pres.sfc", "shtfl", "snowc", "soilm", + "tcdc", "ulwrf.sfc", "uwnd.10m", "vis", "vwnd.10m", "weasd"), + domain_appt = c("prate", "shum"), + domain = c("air.sfc", "albedo", "apcp", "dswrf", "evap", "hcdc", + "hpbl", "lcdc", "lhtfl", "mcdc", "omega", "pr_wtr", + "prate", "pres.sfc", "shtfl", "shum", "snowc", "soilm", + "tcdc", "ulwrf.sfc", "uwnd.10m", "vis", "vwnd.10m", "weasd"), + domain_name = "variable", + date = list_common$char_period, + process_function = "beethoven::process_narr2", + calc_function = "beethoven::calc_narr2", + nthreads = nthreads_narr + ), + groads = list( + path = ain("sedac_groads/data_files/gROADS-v1-americas.gdb"), + covariate = "groads", + radius = c(1e3, 1e4, 5e4), + nthreads = nthreads_groads), + population = list( + path = ain("sedac_population/data_files/gpw_v4_population_density_adjusted_to_2015_unwpp_country_totals_rev11_2020_30_sec.tif"), + covariate = "population", fun = "mean", + radius = c(1e3, 1e4, 5e4), + nthreads = nthreads_population + ) + ) + + attr(list_calcspec, "description") <- + tibble::tribble( + ~dataset, ~description, + "mod11", "MODIS Land Surface Temperature Day/Night", + "mod06", "MODIS Cloud Fraction Day/Night", + "mod09", "MODIS Surface Reflectance", + "mcd19_1km", "MCD19A2 1km", + "mcd19_5km", "MCD19A2 5km", + "mod13", "MODIS Normalized Difference Vegetation Index", + "viirs", "VIIRS Nighttime Lights", + "hms", "NOAA Hazard Mapping System 
Smoke", + "geoscf_aqc", "GEOS-CF AQC", + "geoscf_chm", "GEOS-CF CHM", + "gmted", "GMTED elevation", + "nei", "National Emission Inventory", + "tri", "Toxic Release Inventory", + "nlcd", "National Land Cover Database", + "koppen", "Koppen-Geiger Climate Classification", + "ecoregions", "EPA Ecoregions", + "narr", "NARR", + "groads", "SEDAC Global Roads", + "population", "SEDAC Population Density" + ) + if (is.null(path_export)) { + assign("arglist_calcspec", list_calcspec, envir = .GlobalEnv) + return(list_common) + } else { + if (endsWith(path_export, "qs")) { + qs::qsave(list_calcspec, path_export) + } else if (endsWith(path_export, "rds")) { + saveRDS(list_calcspec, path_export) + } else { + stop("Please provide a valid file extension: qs or rds.") + } + + return(list_common) + } + } + return(list_common) + } + + +#' Generate argument list for raw data download +#' @keywords Utility +#' @param char_period Character(2) vector specifying the time period. +#' Default is c("2018-01-01", "2022-10-31"). +#' @param char_input_dir Character string specifying the input path. +#' Default is "input". +#' @param nasa_earth_data_token Character string specifying the NASA Earth Data token. +#' @param mod06_filelist character(1). File path to a CSV file with MOD06 download +#' URLs. +#' @param year_nlcd numeric(2). Numeric vector specifying the NLCD years. +#' Default is c(2019, 2021). +#' @param export logical(1). If TRUE, the list is saved to `path_export`. +#' Default is `TRUE`. +#' @param path_export Character string specifying the export path. +#' Default is "inst/targets/download_spec.qs". +#' @export +set_args_download <- + function( + char_period = c("2018-01-01", "2022-10-31"), + char_input_dir = "input", + nasa_earth_data_token = NULL, + mod06_filelist = NULL, + year_nlcd = c(2019, 2021), + export = FALSE, + path_export = "inst/targets/download_spec.qs" + ) { + # NULL NASA Earth Data token will warn users + if (is.null(nasa_earth_data_token)) { + warning( + paste0( + "Argument nasa_earth_data_token is NULL. ", + "Please provide a NASA Earth Data token to make the downloading ", + "process work properly." 
+ ) + ) + } + + # append input path + ain <- function(x) { + file.path(char_input_dir, x) + } + + char_date_temp <- c("%s-01-01", "%s-12-31") + time_periods <- as.numeric(substr(char_period, 1, 4)) + time_sequence <- seq(time_periods[1], time_periods[2]) + year_nei <- seq(2017, time_periods[2], 3) + gmted_vars <- + c("Breakline Emphasis", "Systematic Subsample", "Median Statistic", + "Minimum Statistic", "Mean Statistic", "Maximum Statistic", + "Standard Deviation Statistic" + ) + narr_variables_mono <- + c("air.sfc", "albedo", "apcp", "dswrf", "evap", "hcdc", + "hpbl", "lcdc", "lhtfl", "mcdc", "pr_wtr", + "prate", "pres.sfc", "shtfl", "snowc", "soilm", + "tcdc", "ulwrf.sfc", "uwnd.10m", "vis", "vwnd.10m", "weasd") + narr_variables_plevels <- + c("omega", "shum") + + list_download_config <- + list( + aqs = list(dataset_name = "aqs", directory_to_save = ain("aqs"), + year = time_periods, + unzip = TRUE, remove_zip = TRUE), + mod11 = lapply(time_sequence, + function(t) { + list(dataset_name = "modis", directory_to_save = ain("modis/raw"), + product = "MOD11A1", + date = sprintf(char_date_temp, as.character(t)), + nasa_earth_data_token = nasa_earth_data_token) + }), + mod06 = lapply(time_sequence, + function(t) { + list(dataset_name = "modis", directory_to_save = ain("modis/raw"), + product = "MOD06_L2", + date = sprintf(char_date_temp, as.character(t)), + nasa_earth_data_token = nasa_earth_data_token, + mod06_links = mod06_filelist) + }), + mod09 = lapply(time_sequence, + function(t) { + list(dataset_name = "modis", directory_to_save = ain("modis/raw"), + product = "MOD09GA", + date = sprintf(char_date_temp, as.character(t)), + nasa_earth_data_token = nasa_earth_data_token) + }), + mcd19 = lapply(time_sequence, + function(t) { + list(dataset_name = "modis", directory_to_save = ain("modis/raw"), + product = "MCD19A2", date = sprintf(char_date_temp, as.character(t)), + nasa_earth_data_token = nasa_earth_data_token) + }), + mod13 = lapply(time_sequence, + function(t) { + list(dataset_name = "modis", directory_to_save = ain("modis/raw"), + product = "MOD13A2", date = sprintf(char_date_temp, as.character(t)), + nasa_earth_data_token = nasa_earth_data_token) + }), + viirs = lapply(time_sequence, + function(t) { + list(dataset_name = "modis", directory_to_save = ain("modis/raw"), + product = "VNP46A2", date = sprintf(char_date_temp, as.character(t)), + version = "5000", + nasa_earth_data_token = nasa_earth_data_token) + }), + geoscf_aqc = list(dataset_name = "geos", directory_to_save = ain("geos"), + collection = "aqc_tavg_1hr_g1440x721_v1", + date = char_period), + geoscf_chm = list(dataset_name = "geos", directory_to_save = ain("geos"), + collection = "chm_tavg_1hr_g1440x721_v1", + date = char_period), + hms = list(dataset_name = "smoke", directory_to_save = ain("HMS_Smoke"), + data_format = "Shapefile", + date = char_period, + unzip = TRUE, remove_zip = TRUE), + gmted = lapply(gmted_vars, + function(v) { + list(dataset_name = "gmted", directory_to_save = ain("gmted"), + statistic = v, resolution = "7.5 arc-seconds", + unzip = TRUE, remove_zip = TRUE) + }), + nei = lapply(year_nei, + function(y) { + list(dataset_name = "nei", directory_to_save = ain("nei"), + year_target = y, unzip = TRUE) + }), + tri = list(dataset_name = "tri", + directory_to_save = ain("tri"), + year = time_periods), + nlcd = lapply(year_nlcd, + function(y) { + list(dataset_name = "nlcd", directory_to_save = ain("nlcd"), + year = y, + unzip = TRUE, remove_zip = TRUE) + }), + koppen = list(dataset_name = "koppen", 
directory_to_save = ain("koppen_geiger"), + data_resolution = "0.0083", time_period = "Present", unzip = TRUE, remove_zip = TRUE), + ecoregions = list(dataset_name = "koppen", + directory_to_save = ain("ecoregions"), + unzip = TRUE, remove_zip = TRUE), + narr_monolevel = lapply(narr_variables_mono, + function(v) { + list(dataset_name = "narr", + directory_to_save = ain("narr"), + variables = v, year = time_periods) + }), + narr_p_levels = lapply(narr_variables_plevels, + function(v) { + list(dataset_name = "narr", + directory_to_save = ain("narr"), + variables = v, year = time_periods) + }) + , + groads = list(dataset_name = "sedac_groads", directory_to_save = ain("sedac_groads"), + data_region = "Americas", data_format = "Geodatabase", + unzip = TRUE, remove_zip = TRUE), + population = list(dataset_name = "sedac_population", directory_to_save = ain("sedac_population"), + data_resolution = "30 second", data_format = "GeoTIFF", year = "2020", unzip = TRUE, remove_zip = TRUE) + ) + + if (export) { + if (endsWith(path_export, "qs")) { + qs::qsave(list_download_config, path_export) + message("Download configuration is saved to ", path_export) + } else if (endsWith(path_export, "rds")) { + saveRDS(list_download_config, path_export) + message("Download configuration is saved to ", path_export) + } else { + stop("Please provide a valid file extension: qs or rds.") + } + } + return(list_download_config) + } + +# nolint end diff --git a/R/injection.R b/R/injection.R new file mode 100644 index 00000000..2a81b195 --- /dev/null +++ b/R/injection.R @@ -0,0 +1,458 @@ +# nocov start + +#' Check file status and download if necessary +#' @keywords Utility +#' @param path Path to qs file with all download specifications per +#' dataset. +#' @param dataset_name character(1). Dataset name. +#' @param ... Arguments passed to `amadeus::download_data` +#' @importFrom amadeus download_data +#' @importFrom rlang inject +#' @return logical(1). +#' @export +feature_raw_download <- + function( + path = NULL, + dataset_name = NULL, + ... + ) { + if (!file.exists(path)) { + stop("The path does not exist.") + } + if (!endsWith(path, ".qs")) { + stop("The file should be in QS format.") + } + args_check <- loadargs(path, dataset = dataset_name) + + # run amadeus::download_data + tryCatch( + { + if (is.list(args_check[[1]])) { + for (i in seq_along(args_check)) { + rlang::inject( + amadeus::download_data( + acknowledgement = TRUE, + download = TRUE, + !!!args_check[[i]] + ) + ) + } + } else { + rlang::inject( + amadeus::download_data( + acknowledgement = TRUE, + download = TRUE, + !!!args_check + ) + ) + } + return(TRUE) + }, + error = function(e) { + stop(e) + } + ) + } + + +#' Set which years to be processed +#' @keywords Utility +#' @note This function is designed to define the temporal domain +#' from the calculation period and the available years of raw data. +#' @param period character(2)/integer(2) of integer/character/Date. +#' @param available vector of integer or Date. Available years to be processed. +#' @return A vector of years to be processed. 
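+#' @examples
+#' \dontrun{
+#' # A small illustration; the available years below are hypothetical.
+#' set_target_years(
+#'   period = c("2018-01-01", "2022-10-31"),
+#'   available = c(2017, 2020, 2022)
+#' )
+#' }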
+#' @export +set_target_years <- + function( + period = NULL, + available = NULL + ) { + if (is.character(period)) { + if (all(nchar(period) == 4)) { + period <- as.integer(period) + } else { + period <- as.integer(substr(period, 1, 4)) + } + } + assigned <- + post_calc_year_expand(period[1], period[2], time_available = available) + return(assigned) + } + + + +# calculate over a list +#' Spatiotemporal covariate calculation +#' @keywords Calculation +#' @param domain vector of integer/character/Date. +#' Depending on temporal resolution of raw datasets. +#' Nullable; If `NULL`, it will be set to `c(1)`. +#' @param domain_name character(1). Name of the domain. Default is `"year"`. +#' @param nthreads integer(1). Number of threads to use. +#' @param process_function Raw data processor. Default is +#' [`amadeus::process_covariates`] +#' @param calc_function Function to calculate covariates. +#' [`amadeus::calc_covariates`] +#' @param ... Arguments passed to `process_function` and `calc_function` +#' @return A data.table object. +#' @importFrom data.table rbindlist +#' @importFrom rlang inject +#' @importFrom amadeus process_covariates calc_covariates +#' @importFrom future plan sequential multicore +#' @export +calculate <- + function( + domain = NULL, + domain_name = "year", + nthreads = 1L, + process_function = amadeus::process_covariates, + calc_function = amadeus::calc_covariates, + ... + ) { + if (is.null(domain)) { + domain <- c(1) + } + # split the domain, make years from the domain list + # assuming that domain length is the same as the number of years + domainlist <- split(domain, seq_along(domain)) + years_data <- seq_along(domain) + 2017 + + if (nthreads == 1L) { + future::plan(future::sequential) + } else { + future::plan(future::multicore, workers = nthreads) + } + # double twists: list_iteration is made to distinguish + # cases where a single radius is accepted or ones have no radius + # argument. + res_calc <- + #try( + future.apply::future_mapply( + function(domain_each, year_each) { + # we assume that ... 
have no "year" and "from" arguments + args_process <- c(arg = domain_each, list(...)) + names(args_process)[1] <- domain_name + if (!is.null(args_process$covariate) && + any(names(args_process) %in% c("covariate")) + ) { + if (args_process$covariate == "nei") { + args_process$county <- process_counties() + } + } + + # load balancing strategy + # if radius is detected, split the list + if (any(names(args_process) %in% c("radius"))) { + list_iteration <- + split(args_process$radius, seq_along(args_process$radius)) + } else { + list_iteration <- list(1) + } + + list_iteration_calc <- + Map( + function(r) { + args_process$radius <- r + from_in <- + rlang::inject( + process_function(!!!args_process) + ) + res <- rlang::inject( + calc_function( + from = from_in, + !!!args_process + ) + ) + # using domain_name, add both + # data year and covariate year + if (!is.null(domain) && domain_name == "year") { + res <- + add_time_col( + res, domain_each, + sprintf("%s_year", unname(args_process$covariate)) + ) + } + res <- data.table::as.data.table(res) + return(res) + }, + list_iteration + ) + df_iteration_calc <- + if (length(list_iteration_calc) == 1) { + list_iteration_calc[[1]] + } else { + by_detected <- + Reduce(intersect, lapply(list_iteration_calc, names)) + reduce_merge(list_iteration_calc, by = by_detected) + } + return(df_iteration_calc) + }, + domainlist, years_data, SIMPLIFY = FALSE, + future.seed = TRUE + ) + + future::plan(future::sequential) + if (inherits(res_calc, "try-error")) { + cat(paste0(attr(res_calc, "condition")$message, "\n")) + stop("Results do not match expectations.") + } + res_calc <- lapply(res_calc, + function(x) { + if ("time" %in% names(x)) { + if (nchar(x$time[1]) != 4) { + x$time <- data.table::as.IDate(x$time) + } + } + xconvt <- data.table::as.data.table(x) + return(xconvt) + } + ) + # res_calcdf <- if (length(res_calc) == 1) { + # data.table::as.data.table(res_calc[[1]]) + # } else if (domain_name %in% c("year", "date")) { + # data.table::rbindlist(res_calc, use.names = TRUE, fill = TRUE) + # } else { + # reduce_merge(res_calc, by = c("site_id", "time")) + # } + return(res_calc) + } + + + + +#' Injects the calculate function with specified arguments. +#' +#' This function injects the calculate function with the specified arguments, +#' allowing for dynamic customization of the function's behavior. +#' @keywords Calculation +#' @param covariate character(1). The name of the covariate to be calculated. +#' @param locs The locations to be used in the calculation. +#' @param injection Additional arguments to be injected into +#' the calculate function. +#' +#' @return The result of the calculate function with the injected arguments. +#' +#' @examples +#' \dontrun{ +#' inject_calculate( +#' locs = my_locs, buffer = 10, domain = my_domain, +#' injection = list(arg1 = "value1", arg2 = "value2") +#' ) +#' } +#' @export +inject_calculate <- function(covariate, locs, injection) { + rlang::inject( + calculate( + locs = locs, + !!!injection + ) + ) +} + +#' Injects arguments to parallelize MODIS/VIIRS data processing +#' +#' @keywords Calculation +#' @param locs A data frame containing the locations for which MODIS +#' features need to be calculated. +#' @param injection **List** of dditional parameters to be passed to the +#' `calc_modis_par` function. +#' @return MODIS/VIIRS feature data.frame. 
+#' @seealso [`amadeus::calc_modis_daily`], [`amadeus::calc_modis_par`] +#' @importFrom rlang inject +#' @examples +#' \dontrun{ +#' files <- +#' c( +#' "/downloads/modis/mod06/MOD06_L2.A2022001.0000.061.2022001160000.hdf", +#' "/downloads/modis/mod06/MOD06_L2.A2022001.0005.061.2022001160000.hdf" +#' ) +#' my_locs <- data.frame(site_id = 1:2, lon = c(-88, -87), lat = c(35, 35)) +#' my_locs <- sf::st_as_sf(my_locs, coords = c("lon", "lat")) +#' inject_modis_par( +#' locs = my_locs, +#' injection = list(path = files, subdataset = "Cloud_Fraction_Day", +#' name_covariates = "MOD_CLCVD_0_", nthreads = 2L, +#' preprocess = amadeus::process_modis_swath, radius = c(1000))) +#' } +#' @export +inject_modis_par <- function(locs, injection) { + rlang::inject( + amadeus::calc_modis_par( + locs = locs, + locs_id = "site_id", + !!!injection + ) + ) +} + +#' Injects geographic information into a data frame +#' +#' This function injects geographic information into a data frame using +#' the `calc_geos_strict` function. The injected information includes +#' latitude and longitude coordinates based on the specified locations, +#' a location ID column, a window range, and a snapping option. +#' +#' @keywords Calculation +#' @param locs A data frame containing the locations for which +#' geographic information needs to be injected. +#' @param injection A list of additional arguments to be passed to +#' the `calc_geos_strict` function. +#' @param ... Placeholders +#' @return A modified data frame with injected geographic information. +#' @export +inject_geos <- function(locs, injection, ...) { + rlang::inject( + calc_geos_strict( + locs = locs, + locs_id = "site_id", + win = c(-126, -62, 22, 52), + snap = "out", + !!!injection + ) + ) +} + + +#' Injects GMTED data into specified locations +#' +#' This function injects GMTED (Global Multi-resolution Terrain Elevation Data) +#' into specified locations. It calculates the GMTED values for each +#' location within different radii and returns the merged results. +#' +#' @keywords Calculation +#' @param locs A data frame/sf/SpatVector containing the locations +#' where GMTED variables needs to be calculated +#' @param variable The variable for which GMTED data needs to be calculated. +#' @param radii A vector of radii for which GMTED data needs +#' to be calculated. +#' @param injection A list of additional arguments to be passed to +#' the `calc_gmted_direct` function. +#' @param nthreads The number of threads to be used for parallel processing. +#' Default is 4. +#' +#' @return A data frame containing the merged results of GMTED data +#' for each location within different radii. +#' @importFrom future plan +#' @importFrom future.apply future_lapply +#' @importFrom rlang inject +#' @export +inject_gmted <- function(locs, variable, radii, injection, nthreads = 4L) { + future::plan(future::multicore, workers = nthreads) + + radii_list <- split(radii, seq_along(radii)) + radii_rep <- + future.apply::future_lapply( + radii_list, + function(r) { + rlang::inject( + calc_gmted_direct( + locs = locs, + locs_id = "site_id", + radius = r, + variable = c(variable, "7.5 arc-seconds"), + !!!injection + ) + ) + }, + future.seed = TRUE + ) + radii_rep <- lapply(radii_rep, function(x) as.data.frame(x)) + radii_join <- reduce_merge(radii_rep, "site_id") + future::plan(future::sequential) + return(radii_join) +} + +# nocov end + + +#' Reduce and merge a list of data tables +#' +#' This function takes a list of data tables and merges them together +#' using the specified columns. 
It uses the `merge.data.table` function +#' from the `data.table` package to perform the merge. +#' +#' @param list_in A list of data tables to be merged. +#' @param by The columns to merge the data tables on. +#' If `NULL`, the function will automatically detect the common column names. +#' @param all.x logical(1). Keeping all rows from the first input. +#' @param all.y logical(1). Keeping all rows from the second input. +#' @return A merged data table. +#' @keywords Post-calculation +#' @examples +#' \dontrun{ +#' # Create example data tables +#' dt1 <- data.table(a = 1:3, b = 4:6) +#' dt2 <- data.table(a = 2:4, c = 7:9) +#' dt3 <- data.table(a = 3:5, d = 10:12) +#' +#' # Merge the data tables +#' reduce_merge(list(dt1, dt2, dt3), by = "a") +#' } +#' @importFrom data.table as.data.table merge.data.table +#' @export +reduce_merge <- + function( + list_in, + by = c("site_id", "time"), + all.x = TRUE, all.y = FALSE + ) { + list_check <- sapply(list_in, nrow) + list_checkdiff <- diff(list_check) + if (any(list_checkdiff > 0)) all.y <- TRUE + for (i in seq_len(length(list_in))) { + list_in[[i]] <- data.table::as.data.table(list_in[[i]]) + } + + Reduce( + function(x, y) { + if (is.null(by)) by <- intersect(names(x), names(y)) + data.table::merge.data.table( + x, y, by = by, all.x = all.x, all.y = all.y + ) + }, + list_in + ) + } + + +#' Injects the calculate function with matched arguments. +#' @keywords Calculation +#' @param f function. +#' @param args List of arguments that are attempted to be injected into `f`. +#' @return Injected function evaluation. +#' @export +#' @importFrom rlang inject +inject_match <- function(f, args) { + f_args <- formals(f) + + # Find the matching arguments + matching_args <- intersect(names(args), names(f_args)) + + # Inject the matching arguments + rlang::inject(f(!!!args[matching_args])) + +} + +# nocov start + +#' Inject arguments into NLCD calculation function for branching +#' @keywords Calculation +#' @param year An integer specifying the year to calculate NLCD data for. +#' @param radius An integer specifying the radius for the NLCD calculation. +#' @param ... Additional arguments to be passed to the NLCD calculation +#' function. +#' @return data.frame object. +#' @export +inject_nlcd <- + function( + year = 2019, + radius = 1000, + ... + ) { + args_ext <- list(...) + args_ext <- c(args_ext, list(year = year, radius = radius)) + inject_match(amadeus::calc_nlcd, args_ext) + } + +# nocov end diff --git a/R/load.R b/R/load.R new file mode 100644 index 00000000..d21c06b1 --- /dev/null +++ b/R/load.R @@ -0,0 +1,204 @@ +# nocov start + +## file check: chunking +## if using tarchetypes::tar_files, +## the file *lists* should be stored as a single file +## Provided that the download is completed in a defined +## time period such that users can distiguish a **set** of files +## from each other, +## timestamp check: `fs::file_info(...)$modification_time` +## can be used in bulk file check (will be time consuming as +## the number of files grow, though). +## The file list of the previous successful run will be stored as a file +## and we just save the file list of the current run, which are +## older than a certain rerun interval (e.g., 6 months). +## If one really wants to keep the shorter rerun interval, +## the strategy should be changed. +## THINK: How can we know the downloaded files are complete and correct? +## quick diagram: +## file set 1 ... file set x +## (listing function runs) +## list1.rds ... listx.rds +## (hashed; not modified) ... 
(not run) +## (pass) ... (run) +## ... ... (downstream process + calculation) +## (as-is) ... (as-is) --- unless modified or manually invalidated + +#' Load arguments from the formatted argument list file +#' @description This function loads the list object of arguments +#' to be injected into the calculation functions defined at +#' each target. The arguments are numeric or character, and some +#' of these are function names. In this case, the internal function +#' `unmarshal_function` is called to convert the function name +#' to the actual function. +#' @keywords Utility +#' @param argfile character(1). Path to the argument file. RDS format. +#' @param dataset character(1). Dataset name. +#' @return A list of arguments stored in `dataset` slot of the +#' argument file. +#' @importFrom qs qread +#' @export +loadargs <- function(argfile, dataset) { + if (endsWith(argfile, ".rds")) { + arglist <- readRDS(argfile) + } else if (endsWith(argfile, ".qs")) { + arglist <- qs::qread(argfile) + } else { + stop("Invalid format.") + } + check_args <- arglist[[dataset]] + namecheck <- grep("preprocess|_function", names(check_args)) + if (length(namecheck) > 0) { + for (i in namecheck) { + check_args[[i]] <- unmarshal_function(check_args[[i]]) + } + } + return(check_args) +} + + +# nolint start +#' Load MODIS files from a specified path. +#' +#' This function takes a path and an optional pattern as input and +#' returns a list of MODIS files found in the specified path. +#' @keywords Utility +#' @param path The path where the MODIS files are located. +#' @param pattern An optional regular expression pattern to filter the files. +#' The default pattern is "hdf$". +#' @param date A vector of two dates to filter the files by. +#' The default is an empty character vector. +#' @return A list of full file names of the MODIS files found +#' in the specified path. +#' +#' @examples +#' \dontrun{ +#' # Load MODIS files from the current directory +#' modis_files <- +#' load_modis_files( +#' ".", +#' date = c("2018-01-01", "2018-01-31") +#' ) +#' +#' # Load MODIS files from a specific directory with a custom pattern +#' modis_files <- +#' load_modis_files( +#' "/path/to/files", +#' pattern = "MOD.*hdf$", +#' date = c("2018-01-01", "2018-01-31") +#' ) +#' } +#' @export +# nolint end +load_modis_files <- function(path, pattern = "hdf$", date = character(2)) { + modis_files <- + list.files( + path, pattern = pattern, + recursive = TRUE, + full.names = TRUE + ) + date_exp <- + amadeus::generate_date_sequence(date[1], date[2], sub_hyphen = FALSE) + date_exp <- strftime(date_exp, format = "%Y%j") + modis_files <- + grep( + sprintf("(%s)", paste(paste0("A", date_exp), collapse = "|")), + modis_files, value = TRUE + ) + return(modis_files) +} + + +#' Read AQS data +#' @keywords Utility +#' @param fun_aqs function to import AQS data. +#' Default is `amadeus::process_aqs` +#' @param export Export the file to qs. Default is FALSE. +#' @param ... Passed arguments to `fun_aqs` +#' @return Depending on `fun_aqs` specification. +#' @importFrom qs qsave +#' @importFrom amadeus process_aqs +#' @export +read_locs <- + function( + fun_aqs = amadeus::process_aqs, + export = FALSE, + ... + ) { + aqs_read <- fun_aqs(...) + if (export) qs::qsave(aqs_read, file = "input/sf_feat_proc_aqs_sites.qs") + return(aqs_read) + } + + + +#' Unmarshal functions +#' @keywords Utility +#' @param pkg_func_str Character string specifying the package and function. +#' @return Function object. 
+#' @note The function name string must include two colons `::`. +#' Also, the package preceding the two colons should be loaded in the +#' current environment. +#' @description this function is developed to avoid +#' random errors in compressing and decompressing R function objects +#' with `qs::qsave` and `qs::qread`. If you encounter such errors, please use +#' this function with function name strings to save and load the function +#' objects. +#' @export +#' @examples +#' unmarshal_function("amadeus::process_aqs") +unmarshal_function <- + function(pkg_func_str) { + stopifnot(grepl("::", pkg_func_str)) + pkg_func_split <- strsplit(pkg_func_str, "::")[[1]] + pkg_name <- pkg_func_split[1] + func_name <- pkg_func_split[2] + get(func_name, envir = asNamespace(pkg_name)) + } + + + +#' Read paths from a directory with a specific file extension +#' @keywords Utility +#' @param path The directory path from which to read the paths. +#' @param extension The file extension to match. Defaults to ".hdf". +#' @param target_dates A character vector of length 2 containing +#' the start and end dates. +#' @param julian logical(1). If `TRUE`, the dates are in Julian format. +#' @return A character vector containing the full paths of the matching files. +#' +#' @examples +#' \dontrun{ +#' # Read paths from a directory with default extension +#' read_paths("/path/to/directory") +#' +#' # Read paths from a directory with custom extension +#' read_paths("/path/to/directory", ".txt") +#' } +#' @export +read_paths <- + function( + path, + extension = ".hdf", + target_dates = c("2020-01-01", "2020-01-15"), + julian = FALSE + ) { + flist <- + list.files( + path = path, + pattern = sprintf("%s$", extension), + full.names = TRUE, + recursive = TRUE + ) + if (!missing(target_dates)) { + dateseq <- + seq(as.Date(target_dates[1]), as.Date(target_dates[2]), by = "day") + dateseq <- + if (julian) format(dateseq, "%Y%j") else format(dateseq, "%Y%m%d") + dateseq <- sprintf("A(%s)", paste(dateseq, collapse = "|")) + flist <- grep(dateseq, flist, value = TRUE) + } + return(flist) + } + +# nocov end diff --git a/R/meta_learner.R b/R/meta_learner.R index 4a7e754a..5d63d6bf 100644 --- a/R/meta_learner.R +++ b/R/meta_learner.R @@ -1,150 +1,114 @@ -# nolint start -#' Fit a BART (Bayesian Additive Regression Tree) meta learner. It takes predictions of other models such as kriging, GLM, machine learning models as input and fits a BART Model -#' @param base_predictor_list - P x 1 list where P = p is a base predictor -#' vector (numeric). Each predictor vector should be the same length and -#' named. -#' @param kfolds integer, index of k-folds for cross-validation. This should be -#' produced with regards to spatial and/or temporal considerations -#' @param y dependent variable -#' @param ... Passed arguments to \link[BART]{wbart} -#' @return meta_fit_obj object of meta learner -#' @export -#' @examples NULL -# nolint end -meta_learner_fit <- function(base_predictor_list, - kfolds, y, ...) 
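# --- Illustrative sketch (not part of the package diff): the argument-file
# --- layout that `loadargs()` (added in R/load.R above) expects, and how a
# --- "pkg::fun" string is re-hydrated with `unmarshal_function()`. The file
# --- name, the "aqs" dataset slot, and the argument values are hypothetical;
# --- the sketch assumes the `qs` and `amadeus` packages are installed.
arglist <- list(
  aqs = list(
    path = "input/aqs",
    date = c("2018-01-01", "2018-01-31"),
    # stored as a string; names matching "preprocess|_function" are converted
    # back to function objects when loadargs() reads the file
    preprocess = "amadeus::process_aqs"
  )
)
qs::qsave(arglist, "calc_spec.qs")
args_aqs <- loadargs("calc_spec.qs", dataset = "aqs")
is.function(args_aqs$preprocess)
# [1] TRUE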
{ - # Unnamed base_predictor_list is not accepted - if (is.null(names(base_predictor_list)) || - any(is.na(names(base_predictor_list)))) { - stop("base_predictor_list should be a named list.\n") - } - - # check lengths of each base predictor add a test for names - if (sapply(base_predictor_list, length, simplify = TRUE) |> - stats::var() != 0) { - stop("Error in meta_learner_fit: - Base predictors need to be the same length") - } - - # check that length of base predictors is the same than y - if (lengths(base_predictor_list)[1] != length(y)) { - stop("Error in meta_learner_fit: - Predictors and response are not the same length") - } +# nocov start - # check that length of kfolds is the same than y - if (length(kfolds) != length(y)) { - stop("Error in meta_learner_fit: - kfolds vector and response are not the same length") - } - - # check that base_predictor_list only contains only numeric - if (any(sapply(base_predictor_list, class) != "numeric")) { - stop("Error in meta_learner_fit: - Some of base predictors are not numeric") - } - - # convert list to data.frame - x_design <- as.data.frame(base_predictor_list) - - # Unique k-folds (typically 5 or 10) - nk <- length(unique(kfolds)) - # Pre-allocate list of meta objects - meta_fit_obj <- vector(mode = "list", length = nk) - for (i in 1:nk) { - # get the training and test sets - x_tr <- x_design[kfolds != i, ] - x_te <- x_design[kfolds == i, ] - y_tr <- y[kfolds != i] - # Fit the BART model - meta_fit_obj[[i]] <- BART::wbart( - x.train = x_tr, - y.train = y_tr, - x.test = x_te, - ... - ) +#' Fit meta learner +#' +#' This function subsets the full data by column subsamples (rate=50%) +#' The optimal hyperparameter search is performed based on spatial, +#' temporal, and spatiotemporal cross-validation schemes. +#' As of version 0.4.0, the function relies on RMSE to select the +#' best hyperparameter set. +#' @keywords meta_learner +#' @param data data.frame. Full data. +#' @param p_col_sel numeric(1). Rate of column resampling. Default is 0.5. +#' @param rset rset object. Specification of training/test sets. +#' @param yvar character(1). Outcome variable name +#' @param xvar character. Feature names. +#' @param tune_iter integer(1). Bayesian optimization iterations. +#' Default is 50. +#' @importFrom parsnip linear_reg +#' @importFrom workflows workflow add_variables add_model +#' @importFrom yardstick metric_set rmse mae +#' @importFrom tune tune tune_bayes select_best +#' @return List of 3, including the best-fit model, the best hyperparameters, +#' and the all performance records from `tune::tune_bayes()`. +#' Note that the meta learner function returns the best-fit model, +#' not predicted values. 
+#' @export +fit_meta_learner <- + function( + data, + p_col_sel = 0.5, + rset = NULL, + yvar = "Arithmetic.Mean", + xvar = character(0), + tune_iter = 50L + ) { + + # define model + meta_model <- + parsnip::linear_reg( + engine = "glmnet", + mode = "regression", + penalty = tune::tune(), + mixture = tune::tune() + ) + + # define recipe + meta_recipe <- + recipes::recipe( + data[1, ] + ) %>% + recipes::update_role(!!xvar) %>% + recipes::update_role(!!yvar, new_role = "outcome") + + # define workflow from recipe and model + meta_workflow <- + workflows::workflow() %>% + workflows::add_recipe( + meta_recipe + ) %>% + workflows::add_model(meta_model) + + # tune hyperparameters per Bayesian optimization + meta_tuned <- + tune::tune_bayes( + object = meta_workflow, + resamples = rset, + iter = tune_iter, + control = tune::control_bayes( + verbose = TRUE, + save_pred = FALSE, + save_workflow = TRUE + ), + metrics = yardstick::metric_set( + yardstick::rmse, yardstick::mae, yardstick::rsq + ) + ) + + meta_wfparam <- + tune::select_best( + meta_tuned, + metric = c("rmse", "rsq", "mae") + ) + + # finalize workflow with the best tuned hyperparameters + meta_wfresult <- tune::finalize_workflow(meta_workflow, meta_wfparam) + # Best-fit model + meta_wf_fit_best <- parsnip::fit(meta_wfresult, data = data) + + meta_wflist <- + list( + meta_fitted = meta_wf_fit_best, + meta_parameter = meta_wfparam, + best_performance = meta_tuned + ) + return(meta_wflist) } - return(meta_fit_obj) -} - -# nolint start -#' Create meta_learner predictions from the list of BART fit objects and predictions of base learners -#' @description -#' The meta learner used in this package, Bayesian Additive Regression Tree (BART), is not explicitly a spatiotemporal model, but the input covariates (outputs of each base learner) are S-T based. -# nolint end -#' @param meta_fit list of BART objects from meta_learner_fit -#' @param base_outputs_stdt stdt object. -#' list with datatable containing lat, lon, time and the covariates -#' (outputs of each base learner) at prediction locations and crs. -#' @param nthreads integer(1). Number of threads used in [BART::predict.wbart] -#' @note The predictions can be a rast or sf, which depends on the same -#' respective format of the covariance matrix input - cov_pred -#' @return meta_pred: the final meta learner predictions -#' @importFrom data.table .SD -#' @import BART +#' Predict meta learner +#' @keywords meta_learner +#' @param meta_fitted Fitted meta learner model. +#' @param new_data data.frame. New data. Must have the same +#' predictands and predictors as the training data. #' @importFrom stats predict +#' @return Predicted values. #' @export -#' -#' @examples NULL -#' @references https://rspatial.github.io/terra/reference/predict.html -meta_learner_predict <- function(meta_fit, base_outputs_stdt, nthreads = 2) { - if (!(identical(class(base_outputs_stdt), c("list", "stdt")))) { - stop("Error: param base_outputs_stdt is not in stdt format.") - } - - base_outputs <- base_outputs_stdt$stdt - - if (any(!(colnames(meta_fit[[1]]$varcount) %in% colnames(base_outputs)))) { - stop("Error: baselearners list incomplete or with wrong names") +predict_meta_learner <- + function( + meta_fitted, + new_data + ) { + stats::predict(meta_fitted, new_data) } - # extract baselearners predictions used in metalearner - base_name_index <- seq(1, ncol(base_outputs)) - # changed to integer indices - # as we impose the fixed column order in stdt objects. 
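# --- Illustrative sketch (not part of the package diff): a minimal call
# --- pattern for fit_meta_learner()/predict_meta_learner() added above.
# --- The toy data, column names, and plain v-fold resampling are hypothetical;
# --- in the pipeline the `rset` would come from spatial, temporal, or
# --- spatiotemporal cross-validation splits. Assumes the tidymodels stack
# --- (rsample, recipes, parsnip, tune, yardstick) and glmnet are installed.
set.seed(202)
toy <- data.frame(
  site_id = rep(sprintf("S%02d", 1:20), each = 5),
  Arithmetic.Mean = rnorm(100, mean = 8, sd = 2),
  pred_elnet = rnorm(100, 8, 2),  # stand-ins for base-learner predictions
  pred_xgb   = rnorm(100, 8, 2),
  pred_lgb   = rnorm(100, 8, 2)
)
meta_cv <- rsample::vfold_cv(toy, v = 3)
meta_fit <- fit_meta_learner(
  data      = toy,
  rset      = meta_cv,
  yvar      = "Arithmetic.Mean",
  xvar      = c("pred_elnet", "pred_xgb", "pred_lgb"),
  tune_iter = 5L  # kept small for the sketch
)
# The best-fit workflow sits in meta_fit$meta_fitted; predict on new rows:
predict_meta_learner(meta_fit$meta_fitted, new_data = toy[1:5, ])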
- spt_name_index <- grep("(lon|lat|time)", colnames(base_outputs)) - base_name_index <- base_name_index[-spt_name_index] - mat_pred <- as.matrix(base_outputs[, .SD, .SDcols = base_name_index]) - - # pre-allocate - meta_pred <- matrix(nrow = nrow(mat_pred), ncol = length(meta_fit)) - - iter_pred <- function( - meta_fit_in = meta_fit, - mat_pred_in, - meta_pred_in = meta_pred, - nthreads_in = nthreads) { - for (i in seq_along(meta_fit_in)) { - meta_pred_in[, i] <- - predict( - object = meta_fit_in[[i]], - newdata = mat_pred_in, - mc.cores = nthreads_in - ) |> - apply(2, mean) - } - meta_pred_out <- apply(meta_pred_in, 1, mean) - return(meta_pred_out) - } - - meta_pred_out <- iter_pred(mat_pred_in = mat_pred) - meta_pred_out <- meta_pred_out |> - matrix(ncol = 1) |> - as.data.frame() - names(meta_pred_out) <- "meta_pred" - spt_names <- grep("(lon|lat|time)", colnames(base_outputs), value = TRUE) - - meta_pred_out <- - cbind( - base_outputs[, .SD, .SDcols = spt_names], - meta_pred_out - ) - meta_pred_out <- list( - "stdt" = meta_pred_out, - "crs_dt" = base_outputs_stdt$crs_dt - ) - class(meta_pred_out) <- c("list", "stdt") - - return(meta_pred_out) -} +# nocov end diff --git a/R/pipeline_base_functions.R b/R/pipeline_base_functions.R deleted file mode 100644 index 6c176904..00000000 --- a/R/pipeline_base_functions.R +++ /dev/null @@ -1,3708 +0,0 @@ -## pipeline base functions -# nocov start - -## file check: chunking -## if using tarchetypes::tar_files, -## the file *lists* should be stored as a single file -## Provided that the download is completed in a defined -## time period such that users can distiguish a **set** of files -## from each other, -## timestamp check: `fs::file_info(...)$modification_time` -## can be used in bulk file check (will be time consuming as -## the number of files grow, though). -## The file list of the previous successful run will be stored as a file -## and we just save the file list of the current run, which are -## older than a certain rerun interval (e.g., 6 months). -## If one really wants to keep the shorter rerun interval, -## the strategy should be changed. -## THINK: How can we know the downloaded files are complete and correct? -## quick diagram: -## file set 1 ... file set x -## (listing function runs) -## list1.rds ... listx.rds -## (hashed; not modified) ... (not run) -## (pass) ... (run) -## ... ... (downstream process + calculation) -## (as-is) ... (as-is) --- unless modified or manually invalidated - -#' Load arguments from the formatted argument list file -#' @keywords Utility -#' @param argfile character(1). Path to the argument file. RDS format. -#' @param dataset character(1). Dataset name. -#' @returns A list of arguments. -#' @importFrom qs qread -#' @export -loadargs <- function(argfile, dataset) { - if (endsWith(argfile, ".rds")) { - arglist <- readRDS(argfile) - } else if (endsWith(argfile, ".qs")) { - arglist <- qs::qread(argfile) - } else { - stop("Invalid format.") - } - arglist[[dataset]] -} - - -#' Check if a query date falls within a time interval -#' -#' This function checks if a given query date falls within a time interval -#' defined by a vector of two dates. -#' @keywords Miscellaneous -#' @param query_date The query date to check. -#' @param tvec A vector of two dates defining the time interval. -#' -#' @returns TRUE if the query date falls within the time interval, -#' FALSE otherwise. 
-#' -#' @examples -#' \dontrun{ -#' query_date <- as.Date("2022-01-01") -#' tvec <- c(as.Date("2021-01-01"), as.Date("2023-01-01")) -#' `%tin%`(query_date, tvec) -#' } -`%tin%` <- function(query_date, tvec) { - tvec <- sort(tvec) - query_date <= tvec[1] & query_date >= tvec[2] -} - -#' Load MODIS files from a specified path. -#' -#' This function takes a path and an optional pattern as input and -#' returns a list of MODIS files found in the specified path. -#' @keywords Utility -#' @param path The path where the MODIS files are located. -#' @param pattern An optional regular expression pattern to filter the files. -#' The default pattern is "hdf$". -#' @param date A vector of two dates to filter the files by. -#' The default is an empty character vector. -#' @returns A list of full file names of the MODIS files found -#' in the specified path. -#' -#' @examples -#' \dontrun{ -#' # Load MODIS files from the current directory -#' modis_files <- load_modis_files(".") -#' -#' # Load MODIS files from a specific directory with a custom pattern -#' modis_files <- load_modis_files("/path/to/files", pattern = "MOD.*hdf$") -#' } -#' @export -load_modis_files <- function(path, pattern = "hdf$", date = character(2)) { - modis_files <- - list.files( - path, pattern = pattern, - recursive = TRUE, - full.names = TRUE - ) - date_exp <- - amadeus::generate_date_sequence(date[1], date[2], sub_hyphen = FALSE) - date_exp <- strftime(date_exp, format = "%Y%j") - modis_files <- - grep( - sprintf("(%s)", paste(paste0("A", date_exp), collapse = "|")), - modis_files, value = TRUE - ) - return(modis_files) -} - -#' Injects the calculate function with specified arguments. -#' -#' This function injects the calculate function with the specified arguments, -#' allowing for dynamic customization of the function's behavior. -#' @keywords Calculation -#' @param covariate character(1). The name of the covariate to be calculated. -#' @param locs The locations to be used in the calculation. -#' @param injection Additional arguments to be injected into -#' the calculate function. -#' -#' @returns The result of the calculate function with the injected arguments. -#' -#' @examples -#' \dontrun{ -#' inject_calculate( -#' locs = my_locs, buffer = 10, domain = my_domain, -#' injection = list(arg1 = "value1", arg2 = "value2") -#' ) -#' } -#' @export -inject_calculate <- function(covariate, locs, injection) { - rlang::inject( - calculate( - locs = locs, - !!!injection - ) - ) -} - -#' Injects arguments to parallelize MODIS/VIIRS data processing -#' -#' @keywords Calculation -#' @param locs A data frame containing the locations for which MODIS -#' features need to be calculated. -#' @param domain The domain in which the MODIS PAR data should be injected. -#' @param injection Additional parameters to be passed to the -#' `calc_modis_par` function. -#' @returns The modified domain with the injected MODIS PAR data. -#' @export -inject_modis_par <- function(locs, domain, injection) { - rlang::inject( - amadeus::calc_modis_par( - locs = locs, - locs_id = "site_id", - !!!injection - ) - ) -} - -#' Injects geographic information into a data frame -#' -#' This function injects geographic information into a data frame using -#' the `calc_geos_strict` function. The injected information includes -#' latitude and longitude coordinates based on the specified locations, -#' a location ID column, a window range, and a snapping option. 
-#' -#' @keywords Calculation -#' @param locs A data frame containing the locations for which -#' geographic information needs to be injected. -#' @param injection A list of additional arguments to be passed to -#' the `calc_geos_strict` function. -#' @param ... Placeholders -#' @returns A modified data frame with injected geographic information. -#' @export -inject_geos <- function(locs, injection, ...) { - rlang::inject( - calc_geos_strict( - locs = locs, - locs_id = "site_id", - win = c(-126, -62, 22, 52), - snap = "out", - !!!injection - ) - ) -} - - -#' Injects GMTED data into specified locations -#' -#' This function injects GMTED (Global Multi-resolution Terrain Elevation Data) -#' into specified locations. It calculates the GMTED values for each -#' location within different radii and returns the merged results. -#' -#' @keywords Calculation -#' @param locs A data frame/sf/SpatVector containing the locations -#' where GMTED variables needs to be calculated -#' @param variable The variable for which GMTED data needs to be calculated. -#' @param radii A vector of radii for which GMTED data needs -#' to be calculated. -#' @param injection A list of additional arguments to be passed to -#' the `calc_gmted_direct` function. -#' @param nthreads The number of threads to be used for parallel processing. -#' Default is 4. -#' -#' @returns A data frame containing the merged results of GMTED data -#' for each location within different radii. -#' @importFrom future plan -#' @importFrom future.apply future_lapply -#' @importFrom rlang inject -#' @export -inject_gmted <- function(locs, variable, radii, injection, nthreads = 4L) { - future::plan(future::multicore, workers = nthreads) - - radii_list <- split(radii, seq_along(radii)) - radii_rep <- - future.apply::future_lapply( - radii_list, - function(r) { - rlang::inject( - calc_gmted_direct( - locs = locs, - locs_id = "site_id", - radius = r, - variable = c(variable, "7.5 arc-seconds"), - !!!injection - ) - ) - }, - future.seed = TRUE - ) - radii_rep <- lapply(radii_rep, function(x) as.data.frame(x)) - radii_join <- reduce_merge(radii_rep, "site_id") - future::plan(future::sequential) - return(radii_join) -} - - -#' Reduce and merge a list of data tables -#' -#' This function takes a list of data tables and merges them together -#' using the specified columns. It uses the `merge.data.table` function -#' from the `data.table` package to perform the merge. -#' -#' @param list_in A list of data tables to be merged. -#' @param by The columns to merge the data tables on. -#' @param all.x logical(1). Keeping all rows from the first input. -#' @param all.y logical(1). Keeping all rows from the second input. -#' @returns A merged data table. 
-#' @keywords Post-calculation -#' @examples -#' \dontrun{ -#' # Create example data tables -#' dt1 <- data.table(a = 1:3, b = 4:6) -#' dt2 <- data.table(a = 2:4, c = 7:9) -#' dt3 <- data.table(a = 3:5, d = 10:12) -#' -#' # Merge the data tables -#' reduce_merge(list(dt1, dt2, dt3), by = "a") -#' } -#' @importFrom data.table as.data.table merge.data.table -#' @export -reduce_merge <- - function( - list_in, - by = c("site_id", "time"), - all.x = TRUE, all.y = FALSE - ) { - list_check <- sapply(list_in, nrow) - list_checkdiff <- diff(list_check) - if (any(list_checkdiff > 0)) all.y <- TRUE - for (i in seq_len(length(list_in))) { - list_in[[i]] <- data.table::as.data.table(list_in[[i]]) - } - - Reduce( - function(x, y) { - if (is.null(by)) by <- intersect(names(x), names(y)) - data.table::merge.data.table( - x, y, by = by, all.x = all.x, all.y = all.y - ) - }, - list_in - ) - } - - -#' Parallelize NARR feature calculation -#' -#' This function parallelizes the processing and calculation of -#' NARR data for multiple domains. -#' @keywords Calculation -#' @param domain A character vector specifying the domains to process. -#' @param date A character vector specifying the date of the -#' NARR data to process. -#' @param locs A data frame specifying the locations to calculate NARR data for. -#' @param nthreads An integer specifying the number of threads -#' to use for parallel processing. Default is 24. -#' -#' @returns A list of results from the parallel processing. -#' @importFrom future plan multicore sequential -#' @importFrom future.apply future_lapply -#' @export -par_narr <- function(domain, date, locs, nthreads = 24L) { - - future::plan(future::multicore, workers = nthreads) - - res <- - future.apply::future_lapply( - domain, - function(x) { - from <- process_narr2( - path = "input/narr", - variable = x, - date = date - ) - calc_narr2( - from = from, - locs = locs, - locs_id = "site_id" - ) - }, - future.seed = TRUE - ) - future::plan(future::sequential) - return(res) - -} - -#' Add Time Column -#' -#' This function adds a time column to a data frame. -#' -#' @keywords Post-calculation -#' @param df The data frame to which the time column will be added. -#' @param time_value The value to be assigned to the time column. -#' @param time_id The name of the time column (default is "time"). -#' -#' @returns The data frame with the added time column. -#' -#' @examples -#' \dontrun{ -#' df <- data.frame(x = 1:5, y = letters[1:5]) -#' add_time_col(df, "2022-01-01") -#' } -#' @export -add_time_col <- function(df, time_value, time_id = "time") { - if (!time_id %in% names(df)) { - df[[time_id]] <- time_value - } - return(df) -} - - -# 2018~2022, 2017, 2020 -# 2017 ... 2020 ... -# 2017 -#' Map the available raw data years over the given period -#' @description -#' Many raw datasets are periodically updated and the period could -#' be longer than a year. This function maps the available years -#' over the given period. -#' @keywords Post-calculation -#' @param time_start integer(1). Starting year. -#' @param time_end integer(1). Ending year. -#' @param time_unit character(1). Time unit. Default is `"year"`. -#' @param time_available vector. Available years. -#' @returns integer vector of length (time_end - time_start + 1). -#' Each element will get the nearest preceeding available year. -#' @note -#' The minimum of `time_available` will be filled in front of the first -#' available year when the minimum of `time_available` is greater -#' than `time_start`. 
-#' @examples -#' \dontrun{ -#' process_calc_year_expand(2018, 2022, "year", c(2017, 2020, 2021)) -#' process_calc_year_expand(2018, 2022, "year", c(2020, 2021)) -#' } -#' @export -post_calc_year_expand <- - function( - time_start = NULL, - time_end = NULL, - time_unit = "year", - time_available = NULL - ) { - time_seq <- seq(time_start, time_end) - time_target_seq <- findInterval(time_seq, time_available) - time_target_seq <- time_available[time_target_seq] - if (min(time_available) > time_start) { - time_target_seq <- - c( - rep(min(time_available), - min(time_available) - time_start), - time_target_seq - ) - } - return(time_target_seq) - } - - -#' Expand a data frame by year -#' -#' This function expands a data frame by year, creating multiple rows -#' for each year based on the time period specified. -#' @keywords Post-calculation -#' @param df The input data frame. -#' @param locs_id The column name of the location identifier in the data frame. -#' @param time_field The column name of the time field in the data frame. -#' @param time_start The start of the time period. -#' @param time_end The end of the time period. -#' @param time_unit The unit of time to expand the data frame. Only for record. -#' @param time_available A vector of available time periods. -#' @param ... Placeholders. -#' @note Year expansion rule is to assign the nearest past year -#' in the available years;#' if there is no past year in the available years, -#' the first available year is rolled back to the start of the time period. -#' @returns The expanded data frame with multiple rows for each year. -#' @seealso [`post_calc_year_expand()`] -#' @examples -#' \dontrun{ -#' df <- data.frame(year = c(2010, 2010, 2011, 2012), -#' value = c(1, 2, 3, 4)) -#' df_expanded <- -#' post_calc_df_year_expand(df, locs_id = "site_id", time_field = "year", -#' time_start = 2011, time_end = 2012, -#' time_unit = "year") -#' print(df_expanded) -#' } -#' @importFrom stats sd -#' @export -post_calc_df_year_expand <- function( - df, - locs_id = "site_id", - time_field = "time", - time_start = NULL, - time_end = NULL, - time_unit = "year", - time_available = NULL, - ... -) { - time_summary <- table(unlist(df[[time_field]])) - if (length(time_summary) != 1) { - if (stats::sd(time_summary) != 0) { - stop("df should be a data frame with the same number of rows per year") - } - } - # assume that df is the row-bound data frame - if (is.character(df[[time_field]])) { - df[[time_field]] <- as.integer(df[[time_field]]) - } - df_years <- unique(df[[time_field]]) - nlocs <- length(unique(df[[locs_id]])) - year_period <- seq(time_start, time_end) - # assign the time period to the available years - year_assigned <- - post_calc_year_expand(time_start, time_end, time_unit, df_years) - df_years_repeats <- table(year_assigned) - - # repeat data frames - df_expanded <- Map( - function(y) { - df_sub <- df[df[[time_field]] == df_years[y], ] - df_sub <- df_sub[rep(seq_len(nrow(df_sub)), df_years_repeats[y]), ] - return(df_sub) - }, - seq_along(year_assigned) - ) - df_expanded <- do.call(rbind, df_expanded) - df_expanded[[time_field]] <- rep(year_period, each = nlocs) - return(df_expanded) -} - - -# calculate over a list -#' Spatiotemporal covariate calculation -#' @keywords Calculation -#' @param domain vector of integer/character/Date. -#' Depending on temporal resolution of raw datasets. -#' Nullable; If `NULL`, it will be set to `c(1)`. -#' @param domain_name character(1). Name of the domain. Default is `"year"`. -#' @param nthreads integer(1). 
Number of threads to use. -#' @param process_function Raw data processor. Default is -#' [`amadeus::process_covariates`] -#' @param calc_function Function to calculate covariates. -#' [`amadeus::calc_covariates`] -#' @param ... Arguments passed to `process_function` and `calc_function` -#' @returns A data.table object. -#' @importFrom data.table rbindlist -#' @importFrom rlang inject -#' @export -# FIXME: this function works inefficiently in expense of -# returning uniform list of length(|years|) output. -# It could seriously affect the performance in scaled calculation -# as it calculates the same covariate for several years. -# Future updates should reduce the workload by calculating -# source data years only then assign proper preceding years -# to the output as another target. -calculate <- - function( - domain = NULL, - domain_name = "year", - nthreads = 1L, - process_function = amadeus::process_covariates, - calc_function = amadeus::calc_covariates, - ... - ) { - if (is.null(domain)) { - domain <- c(1) - } - # split the domain, make years from the domain list - # assuming that domain length is the same as the number of years - domainlist <- split(domain, seq_along(domain)) - years_data <- seq_along(domain) + 2017 - - if (nthreads == 1L) { - future::plan(future::sequential) - } else { - future::plan(future::multicore, workers = nthreads) - } - # double twists: list_iteration is made to distinguish - # cases where a single radius is accepted or ones have no radius - # argument. - res_calc <- - #try( - future.apply::future_mapply( - function(domain_each, year_each) { - # we assume that ... have no "year" and "from" arguments - args_process <- c(arg = domain_each, list(...)) - names(args_process)[1] <- domain_name - if (!is.null(args_process$covariate) && - any(names(args_process) %in% c("covariate")) - ) { - if (args_process$covariate == "nei") { - args_process$county <- process_counties() - } - } - - # load balancing strategy - # if radius is detected, split the list - if (any(names(args_process) %in% c("radius"))) { - list_iteration <- - split(args_process$radius, seq_along(args_process$radius)) - } else { - list_iteration <- list(1) - } - - list_iteration_calc <- - Map( - function(r) { - args_process$radius <- r - from_in <- - rlang::inject( - process_function(!!!args_process) - ) - res <- rlang::inject( - calc_function( - from = from_in, - !!!args_process - ) - ) - # using domain_name, add both - # data year and covariate year - if (!is.null(domain) && domain_name == "year") { - res <- - add_time_col( - res, domain_each, - sprintf("%s_year", unname(args_process$covariate)) - ) - } - res <- data.table::as.data.table(res) - return(res) - }, - list_iteration - ) - df_iteration_calc <- - if (length(list_iteration_calc) == 1) { - list_iteration_calc[[1]] - } else { - by_detected <- - Reduce(intersect, lapply(list_iteration_calc, names)) - reduce_merge(list_iteration_calc, by = by_detected) - } - return(df_iteration_calc) - }, - domainlist, years_data, SIMPLIFY = FALSE, - future.seed = TRUE - ) - - future::plan(future::sequential) - if (inherits(res_calc, "try-error")) { - cat(paste0(attr(res_calc, "condition")$message, "\n")) - stop("Results do not match expectations.") - } - res_calc <- lapply(res_calc, - function(x) { - if ("time" %in% names(x)) { - if (nchar(x$time[1]) != 4) { - x$time <- data.table::as.IDate(x$time) - } - } - xconvt <- data.table::as.data.table(x) - return(xconvt) - } - ) - # res_calcdf <- if (length(res_calc) == 1) { - # data.table::as.data.table(res_calc[[1]]) - # 
} else if (domain_name %in% c("year", "date")) { - # data.table::rbindlist(res_calc, use.names = TRUE, fill = TRUE) - # } else { - # reduce_merge(res_calc, by = c("site_id", "time")) - # } - return(res_calc) - } - - - -#' Set resource management for SLURM -#' @keywords Utility -#' @param template_file SLURM job submission shell template path. -#' @param partition character(1). Name of partition. Default is `"geo"` -#' @param ncpus integer(1). Number of CPU cores assigned to each task. -#' @param ntasks integer(1). Number of tasks to submit. -#' @param memory integer(1). Specifically odds to 2*x GB. -#' @param user_email character(1). User email address. -#' @param error_log character(1). Error log file name. -#' @note This function is designed to be used with `tar_resources`. -#' Suggested number of `ncpus` is more than 1 for typical multicore R tasks. -#' @returns A list of resources for `tar_resources` -#' @author Insang Song -#' @importFrom future tweak -#' @importFrom future.batchtools batchtools_slurm -#' @importFrom targets tar_resources -#' @importFrom targets tar_resources_future -#' @export -set_slurm_resource <- - function( - template_file = "inst/targets/template_slurm.tmpl", - partition = "geo", - ncpus = 2L, - ntasks = 2L, - memory = 8, - user_email = paste0(Sys.getenv("USER"), "@nih.gov"), - error_log = "slurm_error.log" - ) { - targets::tar_resources( - future = targets::tar_resources_future( - plan = future::tweak( - future.batchtools::batchtools_slurm, - template = template_file, - resources = - list( - partition = partition, - ntasks = ntasks, - ncpus = ncpus, - memory = memory, - email = user_email, - error.file = error_log - ) - ) - ) - ) - } - - -#' Read AQS data -#' @keywords Utility -#' @param fun_aqs function to import AQS data. -#' Default is `amadeus::process_aqs` -#' @param export Export the file to qs. Default is FALSE. -#' @param ... Passed arguments to `fun_aqs` -#' @returns Depending on `fun_aqs` specification. -#' @importFrom qs qsave -#' @export -read_locs <- - function( - fun_aqs = amadeus::process_aqs, - export = FALSE, - ... - ) { - aqs_read <- fun_aqs(...) - if (export) qs::qsave(aqs_read, file = "input/sf_feat_proc_aqs_sites.qs") - return(aqs_read) - } - - - -#' Check file status and download if necessary -#' @keywords Utility -#' @param path Path to qs file with all download specifications per -#' dataset. -#' @param dataset_name character(1). Dataset name. -#' @param ... Arguments passed to `amadeus::download_data` -#' @returns logical(1). -feature_raw_download <- - function( - path = NULL, - dataset_name = NULL, - ... - ) { - if (!file.exists(path)) { - stop("The path does not exist.") - } - if (!endsWith(path, ".qs")) { - stop("The file should be in QS format.") - } - args_check <- loadargs(path, dataset = dataset_name) - - # run amadeus::download_data - tryCatch( - { - if (is.list(args_check[[1]])) { - for (i in seq_along(args_check)) { - rlang::inject( - amadeus::download_data( - dataset_name = dataset_name, - acknowledgement = TRUE, - download = TRUE, - !!!args_check[[i]] - ) - ) - } - } else { - rlang::inject( - amadeus::download_data( - dataset_name = dataset_name, - acknowledgement = TRUE, - download = TRUE, - !!!args_check - ) - ) - } - return(TRUE) - }, - error = function(e) { - stop(e) - } - ) - } - -#' Load county sf object -#' @keywords Calculation -#' @param year integer(1). Year of the county shapefile. -#' @param exclude character. State FIPS codes to exclude. -#' Default is c("02", "15", "60", "66", "68", "69", "72", "78"). 
-#' @returns sf object -#' @importFrom tigris counties -process_counties <- - function( - year = 2020, - exclude = c("02", "15", "60", "66", "68", "69", "72", "78") - ) { - options(tigris_use_cache = TRUE) - cnty <- tigris::counties(year = year) - cnty <- - cnty[!cnty$STATEFP %in% - c("02", "15", "60", "66", "68", "69", "72", "78"), ] - return(cnty) - } - - - -#' Merge input data.frame objects -#' @param by character. Joining keys. See [`merge`] for details. -#' @param time logical(1). Whether or not include time identifier. -#' Set this `TRUE` will supersede `by` value by appending time identifier. -#' @param ... data.frame objects to merge -#' @returns data.table -#' @keywords Post-calculation -#' @importFrom data.table as.data.table -#' @importFrom data.table merge.data.table -#' @export -post_calc_merge_features <- - function( - by = c("site_id"), - time = FALSE, - ... - ) { - ellipsis <- list(...) - if (time) { - by <- c("site_id", "time") - ellipsis_clean <- - lapply( - ellipsis, - function(x) { - x <- data.table::as.data.table(x) - col_coords <- grep("(lon|lat)", names(x)) - if (length(col_coords) > 0 && !is.null(col_coords)) { - x <- x[, -col_coords, with = FALSE] - } - x$time <- as.character(x$time) - return(x) - } - ) - } else { - ellipsis_clean <- ellipsis - } - joined <- - Reduce(function(x, y) { - data.table::merge.data.table( - x, y, - by = by, all.x = TRUE, suffixes = c("_Ma", "_Mb") - ) - }, ellipsis_clean) - return(joined) - } - - -#' Change time column name -#' @param df data.frame -#' @param candidates character. Candidate column names. -#' @param replace character. New column name. -#' @returns data.frame -#' @keywords Post-calculation -#' @export -post_calc_unify_timecols <- - function( - df, - candidates = c("year"), - replace = "time" - ) { - if (sum(names(df) %in% candidates) > 1) { - stop("More than a candidate is detected in the input.") - } - names(df)[names(df) %in% candidates] <- replace - return(df) - } - - -#' Convert time column to character -#' @keywords Post-calculation -#' @param df data.table -#' @note This function takes preprocessed data.table with -#' a column named `"time"`. -#' @importFrom data.table as.data.table copy -post_calc_convert_time <- - function( - df - ) { - df <- data.table::copy(data.table::as.data.table(df)) - df <- df[, `:=`(time, as.character(time))] - return(df) - } - -#' Join a data.frame with a year-only date column to -#' that with a full date column -#' @description The full date column will be converted to a year column -#' as a new column, then the data.frame with the year-only column will -#' be joined. -#' @keywords Post-calculation -#' @param df_year data.frame with a year-only date column -#' @param df_date data.frame with a full date column -#' @param field_year character(1). Year column in `df_year` -#' @param field_date character(1). Date column in `df_date` -#' @param spid character(1). Name of the unique location identifier field. 
-#' @importFrom methods is -#' @importFrom data.table merge.data.table -#' @importFrom data.table `:=` -#' @returns data.frame -post_calc_join_yeardate <- - function( - df_year, - df_date, - field_year = "time", - field_date = "time", - spid = "site_id" - ) { - if (!inherits(df_year, "data.frame") && !inherits(df_date, "data.frame")) { - stop("Both inputs should be data.frame.") - } - - names(df_year)[which(names(df_year) %in% field_year)] <- "year" - df_year$year <- as.character(unlist(df_year$year)) - df_date$year <- as.character(substr(df_date[[field_date]], 1, 4)) - - df_joined <- - data.table::merge.data.table( - df_date, df_year, - by = c(spid, "year"), - all.x = TRUE - ) - - df_joined <- df_joined[, c("year") := NULL] - return(df_joined) - } - - -#' Merge spatial and spatiotemporal covariate data -#' @keywords Post-calculation -#' @param locs Location. e.g., AQS sites. -#' @param locs_id character(1). Location identifier. -#' @param time_id character(1). Location identifier. -#' @param target_years integer. Used to dummify nominal year. -#' @param df_sp data.frame. Spatial-only covariates. -#' @param df_spt data.frame. Spatiotemporal covariates. -#' @note This version assumes the time_id contains Date-like strings. -#' @returns data.frame -#' @importFrom data.table merge.data.table -#' @export -post_calc_merge_all <- - function( - locs, - locs_id, - time_id, - target_years = seq(2018, 2022), - df_sp, - df_spt - ) { - if (methods::is(locs, "sf")) { - locs <- sf::st_drop_geometry(locs) - } - locs$time <- as.character(locs$time) - locs <- data.table::as.data.table(locs) - locs_merged <- - data.table::merge.data.table( - locs, df_sp, by = c(locs_id) - ) - locs_merged <- - data.table::merge.data.table( - locs_merged, df_spt, - by = c(locs_id, time_id) - ) - locs_merged <- - amadeus::calc_temporal_dummies( - locs = locs_merged, - locs_id = locs_id, - year = target_years - ) - return(locs_merged) - } - - -#' Remove columns from a data frame based on regular expression patterns. -#' @keywords Post-calculation -#' -#' This function removes columns from a data frame that match -#' any of the specified -#' regular expression patterns. By default, it removes columns with names that -#' match the patterns "^lon$|^lat$|geoid|year$|description". -#' -#' @param df The input data frame. -#' @param candidates A character vector of regular expression patterns -#' to match against column names. Columns that match any of the patterns -#' will be removed. The default value is -#' "^lon$|^lat$|geoid|year$|description". -#' @param strict logical(1). If `TRUE`, -#' only `c("site_id", "time")` will be kept. -#' @returns The modified data frame with the specified columns removed. -#' -#' @examples -#' \dontrun{ -#' df <- data.frame(lon = 1:5, lat = 6:10, geoid = 11:15, year = 2010:2014, -#' description = letters[1:5], other = 16:20) -#' post_calc_drop_cols(df) -#' } -post_calc_drop_cols <- - function( - df, - candidates = "(^lon$|^lat$|geoid|year$|description|geometry)", - strict = FALSE - ) { - idx_remove <- - if (!strict) { - grep(candidates, names(df), value = TRUE) - } else { - grep("site_id|time", names(df), value = TRUE, invert = TRUE) - } - df <- df[, -idx_remove, with = FALSE] - return(df) - } - -#' Automatic joining by the time and spatial identifiers -#' @description The key assumption is that all data frames will have -#' time field and spatial field and the data should have one of date or year. 
-#' Whether the input time unit is year or date -#' is determined by the coercion of the **first row value** of the time field -#' into a character with `as.Date()`. This function will fail if it -#' gets year-like string with length 4. -#' -#' @param df_fine The fine-grained data frame. -#' @param df_coarse The coarse-grained data frame. -#' @param field_sp The name of the spatial field in the data frames. -#' @param field_t The name of the time field in the data frames. -#' @param year_start The starting year of the time period. -#' @param year_end The ending year of the time period. -#' @keywords Post-calculation -#' @returns A merged data table. -#' @examples -# nolint start -#' \dontrun{ -#' df_fine0 <- data.frame(site_id = c("A", "B", "B", "C"), -#' time = as.Date(c("2022-01-01", "2022-01-02", "2021-12-31", "2021-01-03")), -#' value = c(1, 2, 3, 5)) -#' df_coarse0 <- data.frame(site_id = c("A", "B", "C"), -#' time = c("2022", "2022", "2021"), -#' other_value = c(10, 20, 30)) -#' jdf <- post_calc_autojoin(df_fine0, df_coarse0) -#' print(jdf) -#' } -# nolint end -#' @importFrom data.table merge.data.table -#' @importFrom rlang as_name -#' @importFrom rlang sym -#' @export -post_calc_autojoin <- - function( - df_fine, - df_coarse, - field_sp = "site_id", - field_t = "time", - year_start = 2018L, - year_end = 2022L - ) { - if (any(grepl("population", names(df_coarse)))) { - df_coarse <- df_coarse[, -c("time"), with = FALSE] - } - common_field <- intersect(names(df_fine), names(df_coarse)) - df_fine <- data.table::as.data.table(df_fine) - df_coarse <- data.table::as.data.table(df_coarse) - df_fine <- post_calc_drop_cols(df_fine) - df_coarse <- post_calc_drop_cols(df_coarse) - # if (length(common_field) > 2) { - # message("The data frames have more than two common fields.") - # message("Trying to remove the redundant common fields...") - # common_field <- intersect(names(df_fine), names(df_coarse)) - # print(common_field) - # common_field <- - # common_field[-which(!common_field %in% c(field_sp, field_t))] - # } - if (length(common_field) == 1) { - print(common_field) - if (common_field == field_sp) { - joined <- data.table::merge.data.table( - df_fine, df_coarse, - by = field_sp, - all.x = TRUE - ) - } - } - if (length(common_field) == 2) { - if (all(common_field %in% c(field_sp, field_t))) { - # t_fine <- try(as.Date(df_fine[[field_t]][1])) - df_fine[[field_t]] <- as.character(df_fine[[field_t]]) - df_coarse[[field_t]] <- as.character(df_coarse[[field_t]]) - t_coarse <- try(as.Date(df_coarse[[field_t]][1])) - if (inherits(t_coarse, "try-error")) { - message( - "The time field includes years. Trying different join strategy." - ) - coarse_years <- sort(unique(unlist(as.integer(df_coarse[[field_t]])))) - df_coarse2 <- post_calc_df_year_expand( - df_coarse, - time_start = year_start, - time_end = year_end, - time_available = coarse_years - ) - joined <- - post_calc_join_yeardate(df_coarse2, df_fine, field_t, field_t) - } else { - joined <- data.table::merge.data.table( - df_fine, df_coarse, - by = c(field_sp, field_t), - all.x = TRUE - ) - } - } - } - return(joined) - } - - - -#' Read paths from a directory with a specific file extension -#' @keywords Utility -#' @param path The directory path from which to read the paths. -#' @param extension The file extension to match. Defaults to ".hdf". -#' @param target_dates A character vector of length 2 containing -#' the start and end dates. -#' @param julian logical(1). If `TRUE`, the dates are in Julian format. 
-#' @returns A character vector containing the full paths of the matching files. -#' -#' @examples -#' \dontrun{ -#' # Read paths from a directory with default extension -#' read_paths("/path/to/directory") -#' -#' # Read paths from a directory with custom extension -#' read_paths("/path/to/directory", ".txt") -#' } -#' @export -read_paths <- - function( - path, - extension = ".hdf", - target_dates = c("2020-01-01", "2020-01-15"), - julian = FALSE - ) { - flist <- - list.files( - path = path, - pattern = sprintf("%s$", extension), - full.names = TRUE, - recursive = TRUE - ) - if (!missing(target_dates)) { - dateseq <- - seq(as.Date(target_dates[1]), as.Date(target_dates[2]), by = "day") - dateseq <- - if (julian) format(dateseq, "%Y%j") else format(dateseq, "%Y%m%d") - dateseq <- sprintf("A(%s)", paste(dateseq, collapse = "|")) - flist <- grep(dateseq, flist, value = TRUE) - } - return(flist) - } - - - -#' Search package functions -#' @keywords Utility -#' @param package character(1). Package name. -#' @param search character(1). Search term. -#' @returns A character vector containing the matching function names. -#' @examples -#' # Search for functions in the `amadeus` package -#' \dontrun{ -#' search_function("amadeus", "process_") -#' } -search_function <- function(package, search) { - library(package, character.only = TRUE) - grep(search, ls(sprintf("package:%s", package)), value = TRUE) -} - -#' Get data.frame of function parameters -#' @keywords Utility -#' @param functions character. Vector of function names. -#' @returns A data.frame containing the parameters of the functions. -#' @importFrom dplyr as_tibble bind_rows -df_params <- function(functions) { - params <- lapply(functions, function(x) { - args <- - dplyr::as_tibble( - lapply(as.list(formals(get(x))), \(p) list(p)), - .name_repair = "minimal" - ) - return(args) - }) - paramsdf <- Reduce(dplyr::bind_rows, params) - return(paramsdf) -} - - -#' Process atmospheric composition data by chunks (v2) -#' @keywords Calculation -#' @description -#' Returning a single `SpatRasterDataset` object. -#' @param date character(2). length of 10. Format "YYYY-MM-DD". -#' @param path character(1). Directory with downloaded netCDF (.nc4) files. or -#' netCDF file paths. -#' @param ... Arguments passed to [`terra::rast`]. -#' @note -#' Layer names of the returned `SpatRaster` object contain the variable, -#' pressure level, date -#' Reference duration: 1 day summary, all layers: 115 seconds -#' Superseded by [`calc_geos_strict`]. -#' @author Mitchell Manware, Insang Song -#' @return a `SpatRaster` object; -#' @importFrom terra rast -#' @importFrom terra time -#' @importFrom terra varnames -#' @importFrom terra crs -#' @importFrom terra subset -#' @export -process_geos_bulk <- - function(path = NULL, - date = c("2018-01-01", "2018-01-01"), - ...) 
{ - #### directory setup - if (length(path) == 1) { - - if (dir.exists(path)) { - path <- amadeus::download_sanitize_path(path) - paths <- list.files( - path, - pattern = "GEOS-CF.v01.rpl", - full.names = TRUE - ) - paths <- paths[grep( - ".nc4", - paths - )] - } - } else { - paths <- path - } - #### check for variable - amadeus::check_for_null_parameters(mget(ls())) - #### identify file paths - #### identify dates based on user input - dates_of_interest <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = TRUE - ) - #### subset file paths to only dates of interest - data_paths <- unique( - grep( - paste( - dates_of_interest, - collapse = "|" - ), - paths, - value = TRUE - ) - ) - #### identify collection - collection <- amadeus::process_collection( - data_paths[1], - source = "geos", - collection = TRUE - ) - cat( - paste0( - "Identified collection ", - collection, - ".\n" - ) - ) - if (length(unique(collection)) > 1) { - warning( - "Multiple collections detected. Returning data for all collections.\n" - ) - } - - filename_date <- regmatches( - data_paths, - regexpr( - "20[0-9]{2}(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])", - data_paths - ) - ) - if (any(table(filename_date) < 24)) { - warning( - "Some dates include less than 24 hours. Check the downloaded files." - ) - } - if (length(unique(filename_date)) > 10) { - message( - "More than 10 unique dates detected. Try 10-day chunks..." - ) - } - - # split filename date every 10 days - filename_date <- as.Date(filename_date, format = "%Y%m%d") - filename_date_cl <- as.integer(cut(filename_date, "30 days")) - - future_inserted <- split(data_paths, filename_date_cl) - other_args <- list(...) - data_variables <- names(terra::rast(data_paths[1])) - # nolint start - summary_byvar <- function(x = data_variables, fs) { - rast_in <- rlang::inject(terra::rast(fs, !!!other_args)) - terra::sds(lapply( - x, - function(v) { - rast_inidx <- grep(v, names(rast_in)) - rast_in <- rast_in[[rast_inidx]] - rast_summary <- terra::tapp(rast_in, index = "days", fun = "mean") - names(rast_summary) <- - paste0( - rep(v, terra::nlyr(rast_summary)), "_", - terra::time(rast_summary) - ) - terra::set.crs(rast_summary, "EPSG:4326") - return(rast_summary) - } - )) - } - # nolint end - - # summary by 10 days - # TODO: dropping furrr? - rast_10d_summary <- - furrr::future_map( - .x = future_inserted, - .f = ~summary_byvar(fs = .x), - .options = - furrr::furrr_options( - globals = c("other_args", "data_variables") - ) - ) - rast_10d_summary <- Reduce(c, rast_10d_summary) - return(rast_10d_summary) - - } - -#' Process atmospheric composition data by chunks (v3) -#' @keywords Calculation -#' @description -#' Returning a single `SpatRasterDataset` object. -#' Removed `tapp` for performance; impose a strict assumption that -#' there are no missing values -#' @param path character(1). Directory with downloaded netCDF (.nc4) files. or -#' netCDF file paths. -#' @param date character(2). length of 10. Format "YYYY-MM-DD". -#' @param locs Locations to extract. -#' @param locs_id character(1). Location identifier. -#' @param ... Arguments passed to [`terra::rast`]. 
-#' @note -#' Layer names of the returned `SpatRaster` object contain the variable, -#' pressure level, date -#' Reference duration: 1 day summary, all layers: 106 seconds -#' hard-coded subsets for subdataset selection -#' @author Mitchell Manware, Insang Song -#' @return a `SpatRaster` object; -#' @importFrom terra rast -#' @importFrom terra time -#' @importFrom terra varnames -#' @importFrom terra crs -#' @importFrom terra subset -#' @importFrom sf st_as_sf -#' @importFrom future.apply future_lapply -#' @importFrom data.table rbindlist -#' @export -calc_geos_strict <- - function(path = NULL, - date = c("2018-01-01", "2018-01-01"), - locs = NULL, - locs_id = NULL, - ...) { - #### directory setup - if (length(path) == 1) { - if (dir.exists(path)) { - # path <- amadeus::download_sanitize_path(path) - paths <- list.files( - path, - pattern = "GEOS-CF.v01.rpl", - full.names = TRUE - ) - paths <- paths[grep( - ".nc4", - paths - )] - } - } else { - paths <- path - } - #### check for variable - # amadeus::check_for_null_parameters(mget(ls())) - #### identify file paths - #### identify dates based on user input - dates_of_interest <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = TRUE - ) - dates_of_interest_incl <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = FALSE - ) - #### subset file paths to only dates of interest - data_paths <- unique( - grep( - paste( - dates_of_interest, - collapse = "|" - ), - paths, - value = TRUE - ) - ) - - #### identify collection - collection <- regmatches( - data_paths[1], - # the pattern accommodates 3-4 characters for the variable name, - # 3-4 alphanumerics for the temporal resolution, - # 8-9 alphanumerics for the output dimensions - # nolint start - regexpr( - "GEOS-CF.v01.rpl.(aqc|chm)_[[:alpha:]]{3,4}_[[:alnum:]]{3,4}_[[:alnum:]]{8,9}_v[1-9]", - data_paths[1] - ) - ) - cat( - paste0( - "Identified collection ", - collection, - ".\n" - ) - ) - if (length(unique(collection)) > 1) { - warning( - "Multiple collections detected. Returning data for all collections.\n" - ) - } - - filename_date <- sort(regmatches( - data_paths, - regexpr( - "20[0-9]{2}(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])", - data_paths - ) - )) - if (any(table(filename_date) < 24)) { - warning( - "Some dates include less than 24 hours. Check the downloaded files." - ) - } - # nolint end - # to export locs (pointers are not exportable) - locs <- sf::st_as_sf(locs) - - # split filename dates daily - filename_date <- as.Date(filename_date, format = "%Y%m%d") - filename_date <- filename_date[filename_date %in% dates_of_interest_incl] - filename_date_cl <- as.integer(as.factor(filename_date)) - - future_inserted <- split(data_paths, filename_date_cl) - other_args <- list(...) 
- other_args$nthreads <- NULL - data_variables <- terra::describe(data_paths[1], sds = TRUE)$var - - search_variables <- - if (grepl("chm", collection)) { - c("ACET", "ALD2", "ALK4", "BCPI", "BCPO", "BENZ", "C2H6", "C3H8", - "CH4", "CO", "DST1", "DST2", "DST3", "DST4", "EOH", "H2O2", - "HCHO", "HNO3", "HNO4", "ISOP", "MACR", "MEK", "MVK", "N2O5", - "NH3", "NH4", "NIT", "NO", "NO2", "NOy", "OCPI", "OCPO", "PAN", - "PM25_RH35_GCC", "PM25_RH35_GOCART", "PM25bc_RH35_GCC", - "PM25du_RH35_GCC", "PM25ni_RH35_GCC", "PM25oc_RH35_GCC", - "PM25soa_RH35_GCC", "PM25ss_RH35_GCC", "PM25su_RH35_GCC", - "PRPE", "RCHO", "SALA", "SALC", "SO2", "SOAP", "SOAS", "TOLU", "XYLE" - ) - } else { - c("CO", "NO2", "O3", "SO2") - } - - # fs is the hourly file paths per day (each element with N=24) - summary_byvar <- function(x = search_variables, fs) { - rast_in <- rlang::inject(terra::rast(fs, !!!other_args)) - # strongly assume that we take the single day. no need to filter dates - # per variable, - # all files (hourly) are cleaned and processed - sds_proc <- - lapply( - x, - function(v) { - rast_inidx <- grep(v, data_variables) - #rast_in <- mean(rast_in[[rast_inidx]]) - rast_summary <- terra::mean(rast_in[[rast_inidx]]) - rtin <- as.Date(terra::time(rast_in)) - rtin_u <- unique(rtin) - cat(sprintf("Processing %s, date: %s\n", v, rtin_u)) - # rast_summary <- vector("list", length = length(rtin_u)) - # for (d in seq_along(rtin_u)) { - # rast_d <- rast_in[[rtin == rtin_u[d]]] - # rast_summary[[d]] <- mean(rast_d) - # } - # rast_summary <- do.call(c, rast_summary) - - # the next line is deprecated - # rast_summary <- terra::tapp(rast_in, index = "days", fun = "mean") - terra::time(rast_summary) <- rtin_u - names(rast_summary) <- - paste0( - rep(gsub("_lev=.*", "", v), terra::nlyr(rast_summary)) - ) - terra::set.crs(rast_summary, "EPSG:4326") - return(rast_summary) - } - ) - sds_proc <- terra::sds(sds_proc) - - locstr <- terra::vect(locs) - rast_ext <- terra::extract(sds_proc, locstr, ID = TRUE) - # rast_ext <- lapply(rast_ext, - # function(df) { - # df$ID <- unlist(locs[[locs_id]]) - # return(df) - # } - # ) - rast_ext <- - Reduce(function(dfa, dfb) dplyr::full_join(dfa, dfb, by = "ID"), - rast_ext - ) - rast_ext$time <- unique(as.Date(terra::time(rast_in))) - rast_ext$ID <- unlist(locs[[locs_id]])[rast_ext$ID] - names(rast_ext)[names(rast_ext) == "ID"] <- locs_id - return(rast_ext) - - } - future::plan(future::multicore, workers = 10) - rast_summary <- - future.apply::future_lapply( - future_inserted, - function(fs) summary_byvar(fs = fs) - ) - future::plan(future::sequential) - rast_summary <- data.table::rbindlist(rast_summary) - - return(rast_summary) - - } - - -#' Reflown gmted processing -#' @keywords Calculation -#' @param variable character(2). Statistic and resolution. -#' @param path character(1). Directory with downloaded GMTED files. -#' @param locs data.frame/SpatVector/sf. Locations. -#' @param locs_id character(1). Location identifier. -#' @param win numeric(4). Window for the raster. -#' @param radius numeric(1). Radius for the extraction. -#' @param fun character(1). Function to apply. -#' @param ... Additional parameters to be passed to other functions. -#' @returns A data.frame containing the extracted GMTED data. -#' @importFrom terra rast -#' @importFrom terra varnames -#' @importFrom terra extract -#' @export -calc_gmted_direct <- function( - variable = NULL, - path = NULL, - locs = NULL, - locs_id = NULL, - win = c(-126, -62, 22, 52), - radius = 0, - fun = "mean", - ...) 
{ - #### directory setup - path <- amadeus::download_sanitize_path(path) - #### check for length of variable - if (!(length(variable) == 2)) { - stop( - paste0( - "Please provide a vector with the statistic and resolution.\n" - ) - ) - } - #### identify statistic and resolution - statistic <- variable[1] - statistic_code <- amadeus::process_gmted_codes( - statistic, - statistic = TRUE, - invert = FALSE - ) - resolution <- variable[2] - resolution_code <- amadeus::process_gmted_codes( - resolution, - resolution = TRUE, - invert = FALSE - ) - cat(paste0( - "Cleaning ", - statistic, - " data at ", - resolution, - " resolution.\n" - )) - statistic_from <- c( - "Breakline Emphasis", "Systematic Subsample", - "Median Statistic", "Minimum Statistic", - "Mean Statistic", "Maximum Statistic", - "Standard Deviation Statistic" - ) - statistic_to <- c( - "BRKL", "SSUB", "MEDN", "MINI", "MEAN", "MAXL", "STDV" - ) - statistic_to <- - sprintf("LDU_E%s", statistic_to[match(statistic, statistic_from)]) - - #### identify file path - paths <- list.dirs( - path, - full.names = TRUE - ) - data_path <- - grep( - sprintf( - "%s%s_grd", - statistic_code, - as.character(resolution_code) - ), - paths, value = TRUE - ) - - #### import data - data <- terra::rast(data_path, win = win) - #### layer name - names(data) <- paste0( - "elevation_", - gsub( - "_grd", - "", - names(data) - ) - ) - #### varnames - terra::varnames(data) <- paste0( - "Elevation: ", - statistic, - " (", - resolution, - ")" - ) - from <- data - #return(from) - #### prepare locations list - sites_list <- amadeus::calc_prepare_locs( - from = from, - locs = locs, - locs_id = locs_id, - radius = radius - ) - sites_e <- sites_list[[1]] - sites_id <- sites_list[[2]] - #### perform extraction - sites_extracted <- amadeus::calc_worker( - dataset = "gmted", - from = from, - locs_vector = sites_e, - locs_df = sites_id, - radius = radius, - fun = fun, - variable = 2, - time = NULL, - time_type = "timeless" - ) - #### convert integer to numeric - sites_extracted[, 2] <- as.numeric(sites_extracted[, 2]) - #### define column names - colnames(sites_extracted) <- c( - locs_id, - paste0( - statistic_to, "_", sprintf("%05d", radius) - ) - ) - #### return data.frame - return(data.frame(sites_extracted)) -} - - - -#' Process NARR2 Data -#' -#' This function processes NARR2 data based on the specified parameters. -#' -#' @keywords Calculation -#' @param date A character vector specifying the start and end dates -#' in the format "YYYY-MM-DD". -#' @param variable A character vector specifying the variable of interest. -#' @param path A character vector specifying the path to the data files. -#' @param ... Additional parameters to be passed to other functions. -#' -#' @returns A SpatRaster object containing the processed NARR2 data. -#' -#' @details This function performs the following steps: -#' 1. Sets up the directory path. -#' 2. Checks for null parameters. -#' 3. Identifies file paths based on the specified variable. -#' 4. Generates a date sequence based on the specified start and end dates. -#' 5. Filters the file paths to include only dates of interest. -#' 6. Sets up the search abbreviation and target variable. -#' 7. Imports and processes the data for each file path. -#' 8. Subsets the data to include only dates of interest. -#' 9. Returns the processed data as a SpatRaster object. 
-#' -#' @examples -#' # Process NARR2 data for the variable "PRATE" from -#' # September 1, 2023 to September 1, 2023 -#' \dontrun{ -#' data <- -#' process_narr2( -#' date = c("2023-09-01", "2023-09-01"), -#' variable = "PRATE", -#' path = "/path/to/data" -#' ) -#' } -#' -#' @export -process_narr2 <- function( - date = c("2023-09-01", "2023-09-01"), - variable = NULL, - path = NULL, - ...) { - #### directory setup - path <- amadeus::download_sanitize_path(path) - #### check for variable - amadeus::check_for_null_parameters(mget(ls())) - #### identify file paths - data_paths <- list.files( - path, - pattern = variable, - recursive = TRUE, - full.names = TRUE - ) - # data_paths <- grep( - # sprintf("%s*.*.nc", variable), - # data_paths, - # value = TRUE - # ) - #### define date sequence - date_sequence <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = TRUE - ) - #### path ncar - ym_from <- regmatches( - data_paths, - regexpr( - "2[0-9]{3,5}", - data_paths - ) - ) - ym_of_interest <- - substr(date_sequence, - 1, ifelse(all(nchar(ym_from) == 6), 6, 4)) - ym_of_interest <- unique(ym_of_interest) - #### subset file paths to only dates of interest - data_paths_ym <- unique( - grep( - paste( - ym_of_interest, - collapse = "|" - ), - data_paths, - value = TRUE - ) - ) - - search_abbr <- list.dirs(path)[-1] - search_abbr <- sub(paste0(path, "/"), "", search_abbr) - search_to <- c( - "ATSFC", "ALBDO", "ACPRC", "DSWRF", "ACEVP", - "HCLAF", "PLBLH", "LCLAF", "LATHF", "MCLAF", - "OMEGA", "PRWTR", "PRATE", "PRSFC", "SENHF", - "SPHUM", "SNWCV", "SLMSC", "CLDCV", "ULWRF", - "UWIND", "VISIB", "VWIND", "ACSNW" - ) - search_to <- - sprintf("MET_%s", search_to[match(variable, search_abbr)]) - - #### initiate for loop - data_full <- terra::rast() - for (p in seq_along(data_paths_ym)) { - #### import data - data_year <- terra::rast(data_paths_ym[p]) - data_year_tinfo <- terra::time(data_year) - time_processed <- as.POSIXlt(data_year_tinfo) - time_this <- time_processed[1] - cat(paste0( - "Cleaning ", variable, " data for ", - sprintf( - "%s, %d %s", - strftime(time_this, "%B"), - time_this$year + 1900, - "...\n" - ) - )) - #### check for mono or pressure levels - lvinfo <- regmatches( - names(data_year), - regexpr("level=[0-9]{3,4}", names(data_year)) - ) - if (length(lvinfo) == 0) { - cat("Detected monolevel data...\n") - names(data_year) <- paste0( - search_to, "_", - gsub("-", "", data_year_tinfo) - ) - } else { - cat("Detected pressure levels data...\n") - lvinfo <- sub("level=", "", lvinfo) - lvinfo <- sprintf("%04d", as.integer(lvinfo)) - lvinfo <- paste0("L", lvinfo) - terra::time(data_year) <- as.Date(data_year_tinfo) - names(data_year) <- sprintf( - "%s_%s_%s", - search_to, - lvinfo, - gsub("-", "", data_year_tinfo) - ) - } - data_full <- c( - data_full, - data_year, - warn = FALSE - ) - } - - #### subset years to dates of interest - data_full_cn <- names(data_full) - data_return <- terra::subset( - data_full, - which( - substr( - data_full_cn, - nchar(data_full_cn) - 7, - nchar(data_full_cn) - ) %in% date_sequence - ) - ) - cat(paste0( - "Returning daily ", - variable, - " data from ", - as.Date(date_sequence[1], format = "%Y%m%d"), - " to ", - as.Date( - date_sequence[length(date_sequence)], - format = "%Y%m%d" - ), - ".\n" - )) - #### return SpatRaster - return(data_return) -} - - -#' Calculate aggregated values for specified locations -#' -#' This function calculates aggregated values for specified locations from -#' a raster dataset. 
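The removed process_narr2() above names layers as <code>_<YYYYMMDD> for monolevel variables and <code>_L<level>_<YYYYMMDD> for pressure-level variables, then keeps layers whose trailing eight characters fall in the requested date sequence. A standalone sketch of that subsetting step, using hypothetical layer names:

lyr <- c("MET_PRATE_20230901", "MET_SPHUM_L1000_20230901", "MET_PRATE_20231001")
date_sequence <- "20230901"
keep <- substr(lyr, nchar(lyr) - 7, nchar(lyr)) %in% date_sequence
lyr[keep]
#> [1] "MET_PRATE_20230901"       "MET_SPHUM_L1000_20230901"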
-#' -#' @keywords Calculation -#' @param from The raster dataset from which to extract values. -#' @param locs A data frame containing the locations for which -#' to calculate aggregated values. -#' It should have a column in `locs_id` value -#' that contains unique identifiers for each location. -#' @param locs_id An optional column name -#' in the \code{locs} data frame that contains additional location -#' identifiers. -#' @param radius The radius within which to include neighboring locations -#' for aggregation. Default is 0. -#' @param fun The aggregation function to use. -#' It can be a character string specifying a function name -#' (e.g., "mean", "sum"), -#' or it can be a custom function. Default is "mean". -#' @param ... Additional arguments to be passed to -#' the aggregation function. -#' -#' @returns A data frame containing the aggregated values for each -#' location and time point. -#' @export -calc_narr2 <- function( - from, - locs, - locs_id = NULL, - radius = 0, - fun = "mean", - ... -) { - # - name <- geometry <- value <- NULL - ### prepare locations list - sites_list <- amadeus::calc_prepare_locs( - from = from, - locs = locs[, "site_id"], - locs_id = locs_id, - radius = radius - ) - sites_e <- sites_list[[1]] - # sites_id <- sites_list[[2]] - #### identify pressure level or monolevel data - time_from <- terra::time(from) - timetab <- table(time_from) - if (!all(timetab == 1)) { - time_split <- - split(time_from, - #ceiling(seq_along(time_from) / 29L)) - ceiling(as.integer(as.factor(time_from)) / 14L)) - sites_extracted <- Map( - function(day) { - cat(sprintf("Processing %s...\n", paste(day[1], "-", day[length(day)]))) - from_day <- from[[time_from %in% day]] - sites_extracted_day <- terra::extract( - from_day, - sites_e, - bind = TRUE - ) - sites_extracted_day <- data.frame(sites_extracted_day) - if ("geometry" %in% names(sites_extracted_day)) { - sites_extracted_day <- sites_extracted_day |> - dplyr::select(-geometry) - } - return(sites_extracted_day) - }, - time_split - ) - sites_extracted <- reduce_merge(sites_extracted, by = c("site_id")) - } else { - sites_extracted <- - terra::extract( - from, - sites_e, - bind = TRUE - ) - sites_extracted <- as.data.frame(sites_extracted) - if ("geometry" %in% names(sites_extracted)) { - sites_extracted <- sites_extracted |> - dplyr::select(-geometry) - } - } - sites_extracted <- - sites_extracted |> - tidyr::pivot_longer(cols = tidyselect::starts_with("MET_")) |> - dplyr::rowwise() |> - dplyr::mutate( - time = - regmatches( - name, - regexpr( - "20[0-9]{2,2}[0-1][0-9][0-3][0-9]", - name - ) - ) - ) |> - dplyr::mutate( - name = sub(paste0("_", time), "", name) - ) |> - dplyr::ungroup() |> - dplyr::mutate( - time = as.character(as.Date(time, format = "%Y%m%d")) - ) |> - tidyr::pivot_wider( - names_from = name, - values_from = value, - id_cols = c("site_id", "time") - ) - sites_extracted <- data.table::as.data.table(sites_extracted) - names(sites_extracted)[-1:-2] <- - sprintf("%s_%05d", names(sites_extracted)[-1:-2], radius) - - #### return data.frame - return(sites_extracted) -} - -#' Impute missing values and attach lagged features -#' @keywords Post-calculation -#' @note under construction. -#' This function performs imputation on a given data table -#' by replacing missing values with imputed values. -#' It follows a series of steps including data cleaning, name cleaning, -#' geoscf column renaming, NDVI 16-day backward filling, -#' zero-variance exclusion, excessive "true zeros" exclusion, -#' and imputation using missRanger. 
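An illustrative chain of the two removed NARR helpers above; the directory and the site_locs object are hypothetical, and site_locs must carry a "site_id" column because calc_narr2() subsets locs[, "site_id"].

narr_prate <- process_narr2(
  date = c("2023-09-01", "2023-09-30"),
  variable = "prate",
  path = "input/narr"
)
prate_at_sites <- calc_narr2(
  from = narr_prate,
  locs = site_locs,
  locs_id = "site_id",
  radius = 0
)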
-#' -#' @param dt The input data table to be imputed. -#' @param period The period for lagged features. -#' @param nthreads_dt The number of threads to be used for -#' data.table operations. -#' @param nthreads_collapse The number of threads to be used for -#' collapse operations. -#' @param nthreads_imputation The number of threads to be used for -#' the imputation process. -#' -#' @returns The imputed data table with lagged features. -#' -#' @importFrom collapse set_collapse replace_inf replace_na fvar fnth -#' @importFrom data.table setDTthreads setnafill -#' @importFrom qs qread -#' @importFrom stats setNames -#' @importFrom stringi stri_replace_all_regex -#' @importFrom missRanger missRanger -#' @export -impute_all <- - function( - dt, - period, - nthreads_dt = 32L, - nthreads_collapse = 32L, - nthreads_imputation = 32L - ) { - data.table::setDTthreads(nthreads_dt) - if (is.character(dt)) { - dt <- file.path("output/qs", dt) - dt <- qs::qread(dt) - } - dt$time <- as.POSIXct(dt$time) - # remove unnecessary columns - query <- "^(site_id|time)\\.[0-9]+" - dt <- dt[, !grepl(query, names(dt)), with = FALSE] - - # name cleaning - dt <- stats::setNames(dt, sub("light_1", "OTH_HMSWL_0_00000", names(dt))) - dt <- stats::setNames(dt, sub("medium_1", "OTH_HMSWM_0_00000", names(dt))) - dt <- stats::setNames(dt, sub("heavy_1", "OTH_HMSWH_0_00000", names(dt))) - dt <- stats::setNames(dt, sub("population_", "POP_SEDAC_0_", names(dt))) - - geoscn <- - "ACET\tGEO_ACETO_0_00000 - ALD2\tGEO_ACETA_0_00000 - ALK4\tGEO_CALKA_0_00000 - BCPI\tGEO_HIBCA_0_00000 - BCPO\tGEO_HOBCA_0_00000 - BENZ\tGEO_BENZE_0_00000 - C2H6\tGEO_ETHTE_0_00000 - C3H8\tGEO_PROPA_0_00000 - CH4\tGEO_METHA_0_00000 - CO\tGEO_CMONO_0_00000 - DST1\tGEO_DUST1_0_00000 - DST2\tGEO_DUST2_0_00000 - DST3\tGEO_DUST3_0_00000 - DST4\tGEO_DUST4_0_00000 - EOH\tGEO_ETHOL_0_00000 - H2O2\tGEO_HYPER_0_00000 - HCHO\tGEO_FORMA_0_00000 - HNO3\tGEO_NITAC_0_00000 - HNO4\tGEO_PERAC_0_00000 - ISOP\tGEO_ISOPR_0_00000 - MACR\tGEO_METHC_0_00000 - MEK\tGEO_MEKET_0_00000 - MVK\tGEO_MVKET_0_00000 - N2O5\tGEO_DIPEN_0_00000 - NH3\tGEO_AMNIA_0_00000 - NH4\tGEO_AMNUM_0_00000 - NIT\tGEO_INNIT_0_00000 - NO\tGEO_NIOXI_0_00000 - NO2\tGEO_NIDIO_0_00000 - NOy\tGEO_NITRO_0_00000 - OCPI\tGEO_HIORG_0_00000 - OCPO\tGEO_HOORG_0_00000 - PAN\tGEO_PERNI_0_00000 - PM25_RH35_GCC\tGEO_PM25X_0_00000 - PM25_RH35_GOCART\tGEO_PM25R_0_00000 - PM25bc_RH35_GCC\tGEO_BLCPM_0_00000 - PM25du_RH35_GCC\tGEO_DUSPM_0_00000 - PM25ni_RH35_GCC\tGEO_NITPM_0_00000 - PM25oc_RH35_GCC\tGEO_ORCPM_0_00000 - PM25soa_RH35_GCC\tGEO_SORPM_0_00000 - PM25ss_RH35_GCC\tGEO_SEAPM_0_00000 - PM25su_RH35_GCC\tGEO_SULPM_0_00000 - PRPE\tGEO_CALKE_0_00000 - RCHO\tGEO_CALDH_0_00000 - SALA\tGEO_FSEAS_0_00000 - SALC\tGEO_CSEAS_0_00000 - SO2\tGEO_SULDI_0_00000 - SOAP\tGEO_SOAPR_0_00000 - SOAS\tGEO_SOASI_0_00000 - TOLU\tGEO_TOLUE_0_00000 - XYLE\tGEO_XYLEN_0_00000 - CO_y\tGEO_COVMR_0_00000 - NO2_y\tGEO_NOVMR_0_00000 - O3\tGEO_OZVMR_0_00000 - SO2_y\tGEO_SOVMR_0_00000" - - geoscn <- strsplit(geoscn, "\n") - geoscn <- unlist(geoscn) - geoscn <- strsplit(geoscn, "\t") - geoscn <- do.call(rbind, geoscn) - geoscndf <- as.data.frame(geoscn, stringsAsFactors = FALSE) - colnames(geoscndf) <- c("variable", "code") - geoscndf$variable <- trimws(geoscndf$variable) - - for (i in seq_len(nrow(geoscndf))) { - dt <- - setNames( - dt, - stringi::stri_replace_all_regex( - names(dt), sprintf("%s$", geoscndf$variable[i]), geoscndf$code[i] - ) - ) - } - site_id <- NULL - # NDVI 16-day - # For each site_id, backward filling for 16-day NDVI - # Last 
Observation Carried Forward is the method used; - # it assumes that the rows are ordered by date - dt <- dt[order(site_id, time), ] - col_ndviv <- grep("MOD_NDVIV_", names(dt)) - dtndviv <- - data.table::setnafill( - dt, type = "nocb", nan = NA, - cols = col_ndviv - ) - - collapse::set_collapse(mask = "manip", nthreads = nthreads_collapse) - - target_replace <- grep("^MOD_", names(dt), invert = TRUE) - dt <- collapse::replace_inf(dtndviv, value = NA, replace.nan = TRUE) - dt <- collapse::replace_na(dt, value = 0, cols = target_replace) - - # zero-variance exclusion - dt_colvars <- collapse::fvar(dt[, 5:ncol(dt), with = FALSE]) - zero_var_fields <- names(dt_colvars[dt_colvars == 0]) - - # Exclude fields with zero variance using data.table - dt <- dt[, (zero_var_fields) := NULL] - - # Store the name of zero variance fields as an attribute of the input object - attr(dt, "zero_var_fields") <- zero_var_fields - - # excluding columns with excessive "true zeros" - # we should have a threshold for the zero rate - # exc_zero <- collapse::fnth(dt[, 5:ncol(dt), with = FALSE], n = 0.9) - # exc_zero <- unname(which(exc_zero == 0)) + 5L - # dt <- dt[, (exc_zero) := NULL] - - # Q: Do we use all other features to impute? -- Yes. - # 32-thread, 10% for tree building, 200 trees, 4 rounds: 11 hours - imputed <- - missRanger::missRanger( - data = dt, - maxiter = 30L, - num.trees = 300L, - num.threads = nthreads_imputation, - mtry = 50L, - sample.fraction = 0.1 - ) - - imputed <- amadeus::calc_temporal_dummies(imputed, "time") - return(imputed) - # lagged features: changing period (period[1] + 1 day) - # period <- as.Date(period) - # period[1] <- period[1] + as.difftime(1, units = "days") - # period <- as.character(period) - # index_lag <- - # sprintf("MET_%s", c("ATSFC", "ACPRC", "PRSFC", "SPHUM", "WNDSP")) - # index_lag <- grep(paste(index_lag, collapse = "|"), names(dt)) - # target_lag <- imputed[, index_lag, with = FALSE] - - # output <- amadeus::calc_lagged(target_lag, period, 1, "site_id") - # return(output) - } - - -#' Append Predecessors -#' -#' This function appends predecessors to an existing object or -#' creates a new object if none exists. -#' -#' @keywords Post-calculation -#' @param path_qs The path where the predecessors will be stored. -#' @param period_new The new period to be appended. -#' @param input_new The new input object to be appended. -#' @param nthreads The number of threads to be used. -#' -#' @returns If no existing predecessors are found, the function saves -#' the new input object and returns the name of the saved file. -#' If existing predecessors are found, the function appends -#' the new input object to the existing ones and returns the combined object. 
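Inside the removed impute_all() above, the 16-day NDVI columns are back-filled with data.table::setnafill(type = "nocb") after ordering rows by site and time, so gaps take the next available observation. A toy illustration with a hypothetical column name:

library(data.table)
toy <- data.table(
  site_id = "A",
  time = as.Date("2023-01-01") + 0:3,
  MOD_NDVIV_0_00000 = c(NA, NA, 0.42, NA)
)
setnafill(toy, type = "nocb", cols = "MOD_NDVIV_0_00000")
toy$MOD_NDVIV_0_00000
#> [1] 0.42 0.42 0.42   NA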
-#' @export -append_predecessors <- - function( - path_qs = "output/qs", - period_new = NULL, - input_new = NULL, - nthreads = 8L - ) { - if (is.null(input_new)) { - stop("Please provide a valid object.") - } - if (!dir.exists(path_qs)) { - dir.create(path_qs, recursive = TRUE) - } - input_old <- list.files(path_qs, "*.*.qs$", full.names = TRUE) - - # validate input_old with period_new - # if (length(input_old) > 0) { - # periods_old <- do.call(rbind, strsplit(input_old, "_")) - # periods_old <- periods_old[, 4:5] - # periods_old_check <- vapply( - # seq(1, nrow(periods_old)), - # function(i) { - # period_old <- periods_old[i, ] - # period_old <- as.Date(period_old, format = "%Y-%m-%d") - # period_new <- as.Date(period_new, format = "%Y-%m-%d") - # if (period_new[1] < period_old[1] | period_new[2] < period_old[2]) { - # return(FALSE) - # } else { - # return(TRUE) - # } - # }, - # logical(1) - # ) - # if (!all(periods_old_check)) { - # stop("Results have an overlap period. Please provide a valid period.") - # } - # } - period_new <- sapply(period_new, as.character) - time_create <- gsub("[[:punct:]]|[[:blank:]]", "", Sys.time()) - name_qs <- - sprintf( - "dt_feat_pm25_%s_%s_%s.qs", - period_new[1], period_new[2], time_create - ) - if (length(input_old) == 0) { - qs::qsave(input_new, file = file.path(path_qs, name_qs)) - return(name_qs) - } else { - # vv <- list() - qs::qsave(input_new, file = file.path(path_qs, name_qs)) - input_update <- list.files(path_qs, "*.*.qs$", full.names = TRUE) - bound_large <- - Reduce( - function(x, y) { - if (inherits(x, "data.frame")) { - bound <- rbind(x, qs::qread(y)) - } else { - bound <- rbind(qs::qread(x), qs::qread(y)) - } - return(bound) - }, - input_update - ) - return(bound_large) - } - } - - -# nested parallelization -# IN PROGRESS -# TODO: identify bottleneck -#' @noRd -par_nest <- - function( - path, - ... - ) { - chopin::par_grid( - path, - fun_dist = calculate, - ... - ) - } - - - -#' Make sampled subdataframes for base learners -#' -#' Per beethoven resampling strategy, this function selects -#' the predefined number of rows from the input data table and -#' saves the row index in .rowindex field. -#' -#' @keywords Baselearner -#' @param data An object that inherits data.frame. -#' @param n The number of rows to be sampled. -#' @param p The proportion of rows to be used. Default is 0.3. -#' @returns The sampled data table with row index saved in .rowindex field. -make_subdata <- - function( - data, - n = NULL, - p = 0.3 - ) { - if (is.null(n) && is.null(p)) { - stop("Please provide either n or p.") - } - nr <- seq_len(nrow(data)) - if (!is.null(n)) { - nsample <- sample(nr, n) - } else { - nsample <- sample(nr, ceiling(nrow(data) * p)) - } - data <- data[nsample, ] - data$.rowindex <- nsample - data_name <- as.character(substitute(data)) - attr(data, "object_origin") <- data_name[length(data_name)] - return(data) - } - - -#' Base learner: Multilayer perceptron with brulee -#' -#' Multilayer perceptron model with different configurations of -#' hidden units, dropout, activation, and learning rate using brulee -#' and tidymodels. With proper settings, users can utilize graphics -#' processing units (GPU) to speed up the training process. -#' @keywords Baselearner -#' @note tune package should be 1.2.0 or higher. -#' brulee should be installed with GPU support. -#' @details Hyperparameters `hidden_units`, `dropout`, `activation`, -#' and `learn_rate` are tuned. 
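A small, hypothetical use of the removed make_subdata() above, which underlies the base-learner resampling: it draws a fraction of rows and records the sampled row positions in .rowindex.

dt_small <- make_subdata(mtcars, p = 0.3)
nrow(dt_small)            # ceiling(32 * 0.3) = 10 rows
head(dt_small$.rowindex)  # original row positions of the sample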
`With tune_mode = "grid"`, -#' users can modify `learn_rate` explicitly, and other hyperparameters -#' will be predefined (56 combinations per `learn_rate`). -#' @param dt_imputed The input data table to be used for fitting. -#' @param folds pre-generated rset object with minimal number of columns. -#' If NULL, `vfold` should be numeric to be used in [rsample::vfold_cv]. -#' @param tune_mode character(1). Hyperparameter tuning mode. -#' Default is "grid", "bayes" is acceptable. -#' @param tune_bayes_iter integer(1). The number of iterations for -#' Bayesian optimization. Default is 50. Only used when `tune_mode = "bayes"`. -#' @param learn_rate The learning rate for the model. For branching purpose. -#' Default is 0.1. -#' @param yvar The target variable. -#' @param xvar The predictor variables. -#' @param vfold The number of folds for cross-validation. -#' @param device The device to be used for training. -#' Default is "cuda:0". Make sure that your system is equipped -#' with CUDA-enabled graphical processing units. -#' @param trim_resamples logical(1). Default is TRUE, which replaces the actual -#' data.frames in splits column of `tune_results` object with NA. -#' @param return_best logical(1). If TRUE, the best tuned model is returned. -#' @param ... Additional arguments to be passed. -#' -#' @returns The fitted workflow. -#' @importFrom recipes recipe update_role -#' @importFrom dplyr `%>%` -#' @importFrom parsnip mlp set_engine set_mode -#' @importFrom workflows workflow add_recipe add_model -#' @importFrom tune tune_grid fit_best -#' @importFrom tidyselect all_of -#' @importFrom yardstick metric_set rmse -#' @importFrom rsample vfold_cv -#' @export -fit_base_brulee <- - function( - dt_imputed, - folds = NULL, - tune_mode = "grid", - tune_bayes_iter = 50L, - learn_rate = 0.1, - yvar = "Arithmetic.Mean", - xvar = seq(5, ncol(dt_imputed)), - vfold = 5L, - device = "cuda:0", - trim_resamples = TRUE, - return_best = FALSE, - ... - ) { - tune_mode <- match.arg(tune_mode, c("grid", "bayes")) - - # 2^9=512, 2^15=32768 (#param is around 10% of selected rows) - grid_hyper_tune <- - expand.grid( - hidden_units = list(c(1024), c(64, 64), c(32, 32, 32), c(16, 16, 16)), - dropout = 1 / seq(4, 2, -1), - activation = c("relu", "leaky_relu"), - learn_rate = learn_rate - ) - # dt_imputed <- - # dt_imputed %>% - # dplyr::slice_sample(prop = r_subsample) - - base_recipe <- - recipes::recipe( - dt_imputed[1, ] - ) %>% - # do we want to normalize the predictors? 
- # if so, an additional definition of truly continuous variables is needed - # recipes::step_normalize(recipes::all_numeric_predictors()) %>% - recipes::update_role(!!xvar) %>% - recipes::update_role(!!yvar, new_role = "outcome") #%>% - # recipes::step_normalize(!!yvar) - - if (is.null(folds)) { - base_vfold <- rsample::vfold_cv(dt_imputed, v = vfold) - } else { - base_vfold <- folds - } - base_vfold <- - restore_rset_full(rset = base_vfold, data_full = dt_imputed) - - base_model <- - parsnip::mlp( - hidden_units = parsnip::tune(), - dropout = parsnip::tune(), - epochs = 1000L, - activation = parsnip::tune(), - learn_rate = parsnip::tune() - ) %>% - parsnip::set_engine("brulee", device = device) %>% - parsnip::set_mode("regression") - - - base_wf <- - workflows::workflow() %>% - workflows::add_recipe(base_recipe) %>% - workflows::add_model(base_model) - - if (tune_mode == "grid") { - wf_config <- - tune::control_grid( - verbose = TRUE, - save_pred = TRUE, - save_workflow = TRUE - ) - base_wf <- - base_wf %>% - tune::tune_grid( - resamples = base_vfold, - grid = grid_hyper_tune, - metrics = - yardstick::metric_set( - yardstick::rmse, - yardstick::mape, - yardstick::rsq, - yardstick::mae - ), - control = wf_config - ) - } else { - wf_config <- - tune::control_bayes( - verbose = TRUE, - save_pred = TRUE, - save_workflow = TRUE - ) - base_wf <- - base_wf %>% - tune::tune_bayes( - resamples = base_vfold, - iter = tune_bayes_iter, - metrics = yardstick::metric_set(yardstick::rmse, yardstick::mape), - control = wf_config - ) - } - if (trim_resamples) { - base_wf$splits <- NA - } - if (return_best) { - base_wf <- tune::show_best(base_wf, n = 1) - } - return(base_wf) - } - - -# dt <- qs::qread("output/dt_feat_design_imputed_061024.qs") -# dtd <- dplyr::as_tibble(dt) -# dtfit <- fit_base_brulee(dtd, r_subsample = 0.3) - - -#' Base learner: Extreme gradient boosting (XGBoost) -#' -#' XGBoost model is fitted at the defined rate (`r_subsample`) of -#' the input dataset by grid search. -#' With proper settings, users can utilize graphics -#' processing units (GPU) to speed up the training process. -#' @keywords Baselearner -#' @note tune package should be 1.2.0 or higher. -#' xgboost should be installed with GPU support. -#' @details Hyperparameters `mtry`, `ntrees`, and `learn_rate` are -#' tuned. With `tune_mode = "grid"`, -#' users can modify `learn_rate` explicitly, and other hyperparameters -#' will be predefined (30 combinations per `learn_rate`). -#' @param dt_imputed The input data table to be used for fitting. -#' @param folds pre-generated rset object with minimal number of columns. -#' If NULL, `vfold` should be numeric to be used in [rsample::vfold_cv]. -#' @param tune_mode character(1). Hyperparameter tuning mode. -#' Default is "grid", "bayes" is acceptable. -#' @param tune_bayes_iter integer(1). The number of iterations for -#' Bayesian optimization. Default is 50. Only used when `tune_mode = "bayes"`. -#' @param learn_rate The learning rate for the model. For branching purpose. -#' Default is 0.1. -#' @param yvar The target variable. -#' @param xvar The predictor variables. -#' @param vfold The number of folds for cross-validation. -#' @param device The device to be used for training. -#' Default is "cuda:0". Make sure that your system is equipped -#' with CUDA-enabled graphical processing units. -#' @param trim_resamples logical(1). Default is TRUE, which replaces the actual -#' data.frames in splits column of `tune_results` object with NA. -#' @param return_best logical(1). 
If TRUE, the best tuned model is returned. -#' @param ... Additional arguments to be passed. -#' -#' @returns The fitted workflow. -#' @importFrom recipes recipe update_role -#' @importFrom dplyr `%>%` -#' @importFrom parsnip boost_tree set_engine set_mode -#' @importFrom workflows workflow add_recipe add_model -#' @importFrom tune tune_grid fit_best -#' @importFrom tidyselect all_of -#' @importFrom yardstick metric_set rmse -#' @importFrom rsample vfold_cv -#' @export -fit_base_xgb <- - function( - dt_imputed, - folds = NULL, - tune_mode = "grid", - tune_bayes_iter = 50L, - learn_rate = 0.1, - yvar = "Arithmetic.Mean", - xvar = seq(5, ncol(dt_imputed)), - vfold = 5L, - device = "cuda:0", - trim_resamples = TRUE, - return_best = FALSE, - ... - ) { - tune_mode <- match.arg(tune_mode, c("grid", "bayes")) - # P --> ++ / fix as many hyperparams as possible - grid_hyper_tune <- - expand.grid( - mtry = floor(c(0.02, 0.1, 0.02) * ncol(dt_imputed)), - trees = seq(1000, 3000, 500), - learn_rate = learn_rate - ) - # dt_imputed <- - # dt_imputed %>% - # dplyr::slice_sample(prop = r_subsample) - - base_recipe <- - recipes::recipe( - dt_imputed[1, ] - ) %>% - # recipes::step_normalize(recipes::all_numeric_predictors()) %>% - recipes::update_role(tidyselect::all_of(xvar)) %>% - recipes::update_role(tidyselect::all_of(yvar), new_role = "outcome") - if (is.null(folds)) { - base_vfold <- rsample::vfold_cv(dt_imputed, v = vfold) - } else { - base_vfold <- folds - } - base_vfold <- - restore_rset_full(rset = base_vfold, data_full = dt_imputed) - - base_model <- - parsnip::boost_tree( - mtry = parsnip::tune(), - trees = parsnip::tune(), - learn_rate = parsnip::tune() - ) %>% - parsnip::set_engine("xgboost", device = device) %>% - parsnip::set_mode("regression") - - base_wf <- - workflows::workflow() %>% - workflows::add_recipe(base_recipe) %>% - workflows::add_model(base_model) - - if (tune_mode == "grid") { - wf_config <- - tune::control_grid( - verbose = TRUE, - save_pred = TRUE, - save_workflow = TRUE - ) - - base_wf <- - base_wf %>% - tune::tune_grid( - resamples = base_vfold, - grid = grid_hyper_tune, - metrics = - yardstick::metric_set( - yardstick::rmse, - yardstick::mape, - yardstick::rsq, - yardstick::mae - ), - control = wf_config - ) - } else { - wf_config <- - tune::control_bayes( - verbose = TRUE, - save_pred = TRUE, - save_workflow = TRUE - ) - base_wf <- - base_wf %>% - tune::tune_bayes( - resamples = base_vfold, - iter = tune_bayes_iter, - metrics = yardstick::metric_set(yardstick::rmse, yardstick::mape), - control = wf_config - ) - } - if (trim_resamples) { - base_wf$splits <- NA - } - if (return_best) { - base_wf <- tune::show_best(base_wf, n = 1) - } - - return(base_wf) - - } - -# dt <- qs::qread("output/dt_feat_design_imputed_061024.qs") -# dtd <- dplyr::as_tibble(dt) -# dtfitx <- fit_base_xgb(dtd, xvar = names(dtd)[6:105], r_subsample = 0.3) - - -#' Base learner: Light Gradient Boosting Machine (LightGBM) -#' -#' LightGBM model is fitted at the defined rate (`r_subsample`) of -#' the input dataset by grid or Bayesian optimization search. -#' With proper settings, users can utilize graphics -#' processing units (GPU) to speed up the training process. -#' @keywords Baselearner -#' @note tune package should be 1.2.0 or higher. -#' xgboost should be installed with GPU support. -#' @details Hyperparameters `mtry`, `ntrees`, and `learn_rate` are -#' tuned. 
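A hedged usage sketch for the removed fit_base_xgb() above, mirroring the commented-out calls just before it; the file path, column range, and CUDA device are hypothetical.

dt <- qs::qread("output/dt_feat_design_imputed.qs")
dtd <- dplyr::as_tibble(dt)
fit_xgb <- fit_base_xgb(
  dt_imputed = dtd,
  yvar = "Arithmetic.Mean",
  xvar = names(dtd)[6:105],
  vfold = 5L,
  learn_rate = 0.05,
  device = "cuda:0",
  return_best = TRUE
)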
With `tune_mode = "grid"`, -#' users can modify `learn_rate` explicitly, and other hyperparameters -#' will be predefined (30 combinations per `learn_rate`). -#' @param dt_imputed The input data table to be used for fitting. -#' @param folds pre-generated rset object with minimal number of columns. -#' If NULL, `vfold` should be numeric to be used in [rsample::vfold_cv]. -#' @param tune_mode character(1). Hyperparameter tuning mode. -#' Default is "grid", "bayes" is acceptable. -#' @param tune_bayes_iter integer(1). The number of iterations for -#' Bayesian optimization. Default is 50. Only used when `tune_mode = "bayes"`. -#' @param learn_rate The learning rate for the model. For branching purpose. -#' Default is 0.1. -#' @param yvar The target variable. -#' @param xvar The predictor variables. -#' @param vfold The number of folds for cross-validation. -#' @param device The device to be used for training. -#' Default is `"gpu"`. Make sure that your system is equipped -#' with OpenCL-capable graphical processing units. -#' A GPU-enabled version of LightGBM should be installed. -#' @param trim_resamples logical(1). Default is TRUE, which replaces the actual -#' data.frames in splits column of `tune_results` object with NA. -#' @param return_best logical(1). If TRUE, the best tuned model is returned. -#' @param ... Additional arguments to be passed. -#' -#' @returns The fitted workflow. -#' @importFrom recipes recipe update_role -#' @importFrom dplyr `%>%` -#' @importFrom parsnip boost_tree set_engine set_mode -#' @importFrom workflows workflow add_recipe add_model -#' @importFrom tune tune_grid fit_best -#' @importFrom tidyselect all_of -#' @importFrom yardstick metric_set rmse -#' @importFrom rsample vfold_cv -#' @export -fit_base_lightgbm <- - function( - dt_imputed, - folds = NULL, - tune_mode = "grid", - tune_bayes_iter = 50L, - learn_rate = 0.1, - yvar = "Arithmetic.Mean", - xvar = seq(5, ncol(dt_imputed)), - vfold = 5L, - device = "gpu", - trim_resamples = TRUE, - return_best = FALSE, - ... 
- ) { - tune_mode <- match.arg(tune_mode, c("grid", "bayes")) - # P --> ++ / fix as many hyperparams as possible - grid_hyper_tune <- - expand.grid( - mtry = floor(c(0.02, 0.1, 0.02) * ncol(dt_imputed)), - trees = seq(1000, 3000, 500), - learn_rate = learn_rate - ) - # dt_imputed <- - # dt_imputed %>% - # dplyr::slice_sample(prop = r_subsample) - - base_recipe <- - recipes::recipe( - dt_imputed[1, ] - ) %>% - # recipes::step_normalize(recipes::all_numeric_predictors()) %>% - recipes::update_role(tidyselect::all_of(xvar)) %>% - recipes::update_role(tidyselect::all_of(yvar), new_role = "outcome") - if (is.null(folds)) { - base_vfold <- rsample::vfold_cv(dt_imputed, v = vfold) - } else { - base_vfold <- folds - } - base_vfold <- - restore_rset_full(rset = base_vfold, data_full = dt_imputed) - - base_model <- - parsnip::boost_tree( - mtry = parsnip::tune(), - trees = parsnip::tune(), - learn_rate = parsnip::tune() - ) %>% - parsnip::set_engine("xgboost", device_type = device) %>% - parsnip::set_mode("regression") - - base_wf <- - workflows::workflow() %>% - workflows::add_recipe(base_recipe) %>% - workflows::add_model(base_model) - - if (tune_mode == "grid") { - wf_config <- - tune::control_grid( - verbose = TRUE, - save_pred = TRUE, - save_workflow = TRUE - ) - base_wf <- - base_wf %>% - tune::tune_grid( - resamples = base_vfold, - grid = grid_hyper_tune, - metrics = - yardstick::metric_set( - yardstick::rmse, - yardstick::mape, - yardstick::rsq, - yardstick::mae - ), - control = wf_config - ) - } else { - wf_config <- - tune::control_bayes( - verbose = TRUE, - save_pred = TRUE, - save_workflow = TRUE - ) - base_wf <- - base_wf %>% - tune::tune_bayes( - resamples = base_vfold, - iter = tune_bayes_iter, - metrics = yardstick::metric_set(yardstick::rmse, yardstick::mape), - control = wf_config - ) - } - if (trim_resamples) { - base_wf$splits <- NA - } - if (return_best) { - base_wf <- tune::show_best(base_wf, n = 1) - } - - return(base_wf) - - } - - -#' Base learner: Elastic net -#' -#' Elastic net model is fitted at the defined rate (`r_subsample`) of -#' the input dataset by grid search. -#' @keywords Baselearner -#' @note tune package should be 1.2.0 or higher. -#' @param dt_imputed The input data table to be used for fitting. -#' @param folds pre-generated rset object with minimal number of columns. -#' If NULL, `vfold` should be numeric to be used in [rsample::vfold_cv]. -#' @param yvar The target variable. -#' @param xvar The predictor variables. -#' @param vfold The number of folds for cross-validation. -#' @param tune_mode character(1). Hyperparameter tuning mode. -#' Default is "grid", "bayes" is acceptable. -#' @param tune_bayes_iter integer(1). The number of iterations for -#' Bayesian optimization. Default is 50. Only used when `tune_mode = "bayes"`. -#' @param nthreads The number of threads to be used. Default is 16L. -#' @param return_best logical(1). If TRUE, the best tuned model is returned. -#' @param ... Additional arguments to be passed. -#' -#' @returns The fitted workflow. 
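The removed fit_base_lightgbm() above tunes parsnip::boost_tree() through the "xgboost" engine with a device_type engine argument. If an actual LightGBM engine is wanted, parsnip exposes one through the bonsai package; a rough sketch, assuming a GPU-enabled LightGBM build and that device_type is forwarded to lightgbm:

library(bonsai)  # registers the "lightgbm" engine for boost_tree()
lgbm_spec <- parsnip::boost_tree(
  mtry = parsnip::tune(),
  trees = parsnip::tune(),
  learn_rate = parsnip::tune()
) |>
  parsnip::set_engine("lightgbm", device_type = "gpu") |>
  parsnip::set_mode("regression")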
-#' @importFrom future plan multicore multisession -#' @importFrom dplyr `%>%` -#' @importFrom recipes recipe update_role -#' @importFrom parsnip linear_reg set_engine set_mode -#' @importFrom workflows workflow add_recipe add_model -#' @importFrom tune tune_grid fit_best -#' @importFrom tidyselect all_of -#' @importFrom yardstick metric_set rmse -#' @importFrom rsample vfold_cv -#' @export -fit_base_elnet <- - function( - dt_imputed, - folds = NULL, - # r_subsample = 0.3, - yvar = "Arithmetic.Mean", - xvar = seq(5, ncol(dt_imputed)), - tune_mode = "grid", - tune_bayes_iter = 50L, - vfold = 5L, - nthreads = 16L, - trim_resamples = TRUE, - return_best = FALSE, - ... - ) { - grid_hyper_tune <- - expand.grid( - mixture = seq(0, 1, length.out = 21), - penalty = 10 ^ seq(-3, 5) - ) - # dt_imputed <- - # dt_imputed %>% - # dplyr::slice_sample(prop = r_subsample) - - base_recipe <- - recipes::recipe( - dt_imputed[1, ] - ) %>% - # recipes::step_normalize(recipes::all_numeric_predictors()) %>% - recipes::update_role(tidyselect::all_of(xvar)) %>% - recipes::update_role(tidyselect::all_of(yvar), new_role = "outcome") - if (is.null(folds)) { - base_vfold <- rsample::vfold_cv(dt_imputed, v = vfold) - } else { - base_vfold <- folds - } - base_vfold <- - restore_rset_full(rset = base_vfold, data_full = dt_imputed) - - base_model <- - parsnip::linear_reg( - mixture = parsnip::tune(), - penalty = parsnip::tune() - ) %>% - parsnip::set_engine("glmnet") %>% - parsnip::set_mode("regression") - - future::plan(future::multicore, workers = nthreads) - base_wf <- - workflows::workflow() %>% - workflows::add_recipe(base_recipe) %>% - workflows::add_model(base_model) - - if (tune_mode == "grid") { - wf_config <- - tune::control_grid( - verbose = TRUE, - save_pred = TRUE, - save_workflow = TRUE - ) - - base_wf <- - base_wf %>% - tune::tune_grid( - resamples = base_vfold, - grid = grid_hyper_tune, - metrics = - yardstick::metric_set( - yardstick::rmse, - yardstick::mape, - yardstick::rsq, - yardstick::mae - ), - control = wf_config, - parallel_over = "resamples" - ) - } else { - wf_config <- - tune::control_bayes( - verbose = TRUE, - save_pred = TRUE, - save_workflow = TRUE - ) - base_wf <- - base_wf %>% - tune::tune_bayes( - resamples = base_vfold, - iter = tune_bayes_iter, - metrics = yardstick::metric_set(yardstick::rmse, yardstick::mape), - control = wf_config, - parallel_over = "resamples" - ) - } - if (trim_resamples) { - base_wf$splits <- NA - } - if (return_best) { - base_wf <- tune::show_best(base_wf, n = 1) - } - future::plan(future::sequential) - return(base_wf) - - } - - - -#' Generate manual rset object from spatiotemporal cross-validation indices -#' @keywords Baselearner -#' @param cvindex integer length of nrow(data). -#' @param data data.frame. -#' @param ref_list List of custom reference indices. Default is NULL. -#' if not NULL, it will be used as a reference instead of max(cvindex). -#' @param cv_mode character(1). Spatiotemporal cross-validation indexing -#' method label. -#' @returns rset object of `rsample` package. A tibble with a list column of -#' training-test data.frames and a column of labels. 
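A self-contained toy for convert_cv_index_rset(), documented just above and defined below: a fold index over five rows becomes a two-split manual rset.

toy <- data.frame(x = 1:5)
cvi <- c(1L, 1L, 2L, 2L, 2L)
rset <- convert_cv_index_rset(cvi, toy, cv_mode = "spt")
rset$id
#> [1] "cvfold_spt_001" "cvfold_spt_002"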
-#' @author Insang Song -#' @importFrom rsample make_splits -#' @importFrom rsample manual_rset -#' @export -convert_cv_index_rset <- - function( - cvindex, - data, - ref_list = NULL, - cv_mode = "spt" - ) { - if (length(cvindex) != nrow(data)) { - stop("cvindex length should be equal to nrow(data).") - } - - if (!is.null(ref_list)) { - list_cvi <- ref_list - len_cvi <- seq_along(list_cvi) - } else { - maxcvi <- max(cvindex) - len_cvi <- seq_len(maxcvi) - list_cvi <- split(len_cvi, len_cvi) - } - list_cvi_rows <- - lapply( - list_cvi, - function(x) { - list(analysis = which(!cvindex %in% x), - assessment = which(cvindex %in% x)) - } - ) - list_split_dfs <- - lapply( - list_cvi_rows, - function(x) { - rsample::make_splits(x = x, data = data) - } - ) - modename <- sprintf("cvfold_%s_%03d", cv_mode, len_cvi) - rset_stcv <- rsample::manual_rset(list_split_dfs, modename) - return(rset_stcv) - } - - -#' Attach XY coordinates to a data frame -#' -#' This function attaches XY coordinates to a data frame based on a spatial -#' object containing the coordinates. It performs a left join operation to -#' match the coordinates with the corresponding locations in the data frame. -#' @keywords Utility -#' @param data_full The full data frame to which XY coordinates will -#' be attached. -#' @param data_sf The spatial object containing the XY coordinates. -#' @param locs_id The column name in the spatial object that represents the -#' location identifier. -#' @param time_id The column name in the data frame that represents the time -#' identifier. -#' -#' @returns A data frame with the XY coordinates attached. -#' -#' @importFrom sf st_coordinates -#' @importFrom stats setNames -#' @importFrom collapse join -#' @export -attach_xy <- - function( - data_full, - data_sf, - locs_id = "site_id", - time_id = "time" - ) { - data_sfd <- sf::st_coordinates(data_sf) - data_sf <- data_sf[[locs_id]] - data_sfd <- data.frame(site_id = data_sf, data.frame(data_sfd)) - data_sfd <- stats::setNames(data_sfd, c(locs_id, "lon", "lat")) - - data_full_lean <- data_full[, c(locs_id, time_id), with = FALSE] - data_full_attach <- - collapse::join( - data_full_lean, data_sfd, on = locs_id, how = "left" - ) - return(data_full_attach) - } - - - -#' Generate spatio-temporal cross-validation index with anticlust -#' -#' This function generates a spatio-temporal cross-validation index -#' based on the anticlust package. The function first calculates the -#' spatial clustering index using the balanced_clustering function as -#' default, and if `cv_pairs` is provided, -#' it generates rank-based pairs based on the proximity between -#' cluster centroids. -#' @keywords Baselearner -#' @param data data.table with X, Y, and time information. -#' @param target_cols character(3). Names of columns for X, Y, and time. -#' Default is c("lon", "lat", "time"). -#' Order insensitive. -#' @param preprocessing character(1). Preprocessing method. -#' * "none": no preprocessing. -#' * "normalize": normalize the data. -#' * "standardize": standardize the data. -#' @param cv_fold integer(1). Number of folds for cross-validation. -#' default is 5L. -#' @param cv_pairs integer(1). Number of pairs for cross-validation. -#' This value will be used to generate a rank-based pairs -#' based on `target_cols` values. -#' @param pairing character(1) Pair selection method. -#' * "1": search the nearest for each cluster then others -#' are selected based on the rank. -#' * "2": rank the pairwise distances directly -#' @param cv_mode character(1). 
Spatiotemporal cross-validation indexing -#' @param ... Additional arguments to be passed. -#' @note nrow(data) %% cv_fold should be 0. -#' @returns rsample::manual_rset() object. -#' @author Insang Song -#' @importFrom rsample manual_rset -#' @importFrom anticlust balanced_clustering -#' @importFrom dplyr group_by summarize across ungroup all_of -#' @export -generate_cv_index <- - function( - data, - target_cols = c("lon", "lat", "time"), - preprocessing = c("none", "normalize", "standardize"), - cv_fold = 5L, - cv_pairs = NULL, - pairing = c("1", "2"), - cv_mode = "spt", - ... - ) { - if (length(target_cols) != 3) { - stop("Please provide three target columns.") - } - data_orig <- data - data <- data[, target_cols, with = FALSE] - data$time <- as.numeric(data$time) - data_proc <- - switch( - preprocessing, - none = data, - normalize = (data + abs(apply(data, 2, min))) / - (apply(data, 2, max) + abs(apply(data, 2, min))), - standardize = collapse::fscale(data) - ) - index_cv <- anticlust::balanced_clustering(data_proc, cv_fold) - cv_index <- NULL - ref_list <- NULL - if (!is.null(cv_pairs)) { - pairing <- match.arg(pairing) - data_ex <- data_proc - data_ex$cv_index <- index_cv - data_exs <- data_ex |> - dplyr::group_by(cv_index) |> - dplyr::summarize( - dplyr::across(dplyr::all_of(target_cols), ~mean(as.numeric(.x))) - ) |> - dplyr::ungroup() - - data_exs$cv_index <- NULL - data_exm <- stats::dist(data_exs) - data_exd <- as.vector(data_exm) - data_exmfull <- as.matrix(data_exm) - # index searching in dist matrix out of dist - data_exd_colid <- - unlist(Map(seq_len, seq_len(max(index_cv) - 1))) - # rep(seq_len(max(index_cv) - 1), seq(max(index_cv) - 1, 1, -1)) - data_exd_rowid <- rep(seq(2, max(index_cv)), seq_len(max(index_cv) - 1)) - if (pairing == "2") { - search_idx <- which(rank(-data_exd) <= cv_pairs) - } else { - # min rank element index per each cluster centroid - search_each1 <- - apply(data_exmfull, 1, \(x) which.min(replace(x, which.min(x), Inf))) - # sort the index - search_each1sort <- - Map(c, seq_along(search_each1), search_each1) - # keep the distinct pairs - search_each1sort <- - unique(Map(sort, search_each1sort)) - # return(list(data_exd_colid, data_exd_rowid, search_each1sort)) - search_idx_each1 <- - which( - Reduce( - `|`, - Map( - \(x) data_exd_colid %in% x[1] & data_exd_rowid %in% x[2], - search_each1sort - ) - ) - ) - - # replace the nearest pairs' distance to Inf - search_idx_others <- - which(rank(-replace(data_exd, search_idx_each1, Inf)) <= cv_pairs) - # remove the nearest pairs - # sort the distance of the remaining pairs - search_idx_others <- - search_idx_others[1:(cv_pairs - length(search_idx_each1))] - search_idx <- c(search_idx_each1, search_idx_others) - } - ref_list <- - Map(c, data_exd_rowid[search_idx], data_exd_colid[search_idx]) - } - - rset_cv <- - convert_cv_index_rset( - index_cv, data_orig, ref_list = ref_list, cv_mode = cv_mode - ) - return(rset_cv) - } - -#' Visualize the spatio-temporal cross-validation index -#' @keywords Baselearner -#' @param rsplit rsample::manual_rset() object. -#' @param angle numeric(1). Viewing angle of 3D plot. -#' @returns None. A plot will be generated. 
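An illustrative call to the removed generate_cv_index() above; dt_xyt stands in for a data.table with lon, lat, and time columns whose row count is divisible by cv_fold (see the @note).

rset_spt <- generate_cv_index(
  data = dt_xyt,
  target_cols = c("lon", "lat", "time"),
  preprocessing = "standardize",
  cv_fold = 10L
)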
-#' @seealso [`generate_cv_index`] -#' @export -vis_rset <- - function(rsplit, angle = 60) { - nsplit <- nrow(rsplit) - graphics::par(mfrow = c(ceiling(nsplit / 3), 3)) - for (i in seq_len(nsplit)) { - cleared <- rsplit[i, 1][[1]][[1]]$data - cleared$indx <- 0 - cleared$indx[rsplit[i, 1][[1]][[1]]$in_id] <- "In" - cleared$indx[rsplit[i, 1][[1]][[1]]$out_id] <- "Out" - cleared$indx <- factor(cleared$indx) - cleared$time <- as.POSIXct(cleared$time) - scatterplot3d::scatterplot3d( - cleared$lon, cleared$lat, cleared$time, - color = rev(as.integer(cleared$indx) + 1), - cex.symbols = 0.02, pch = 19, - angle = angle - ) - } - } -# nocov end - -#' Get Divisors -#' @keywords Miscellaneous -#' @param x integer(1). A positive integer. -#' @returns A vector of divisors of x. -divisor <- - function(x) { - xv <- seq_len(x) - xv[which(x %% xv == 0)] - } - - - -# nocov start - -#' Set arguments for the calculation process -#' -#' This function sets the arguments for the calculation process. -#' It takes several parameters including site ID, time ID, time period, -#' extent, user email, export path, and input path. -#' @keywords Utility -#' @param char_siteid Character string specifying the site ID. -#' Default is "site_id". -#' @param char_timeid Character string specifying the time ID. -#' Default is "time". -#' @param char_period Character vector specifying the time period. -#' Default is c("2018-01-01", "2022-10-31"). -#' @param num_extent Numeric vector specifying the extent. -#' Default is c(-126, -62, 22, 52). -#' @param char_user_email Character string specifying the user email. -#' Default is the current user's email with nih.gov domain. -#' @param export logical(1). If TRUE, the list for the calculation process -#' is exported to `path_export`. Default is FALSE. -#' @param path_export Character string specifying the export path. -#' Default is "inst/targets/punchcard_calc.qs". -#' @param char_input_dir Character string specifying the input path. -#' Default is "input". -#' @param nthreads_nasa integer(1). Number of threads for NASA data. -#' Default is 14L. -#' @param nthreads_hms integer(1). Number of threads for HMS data. -#' Default is 3L. -#' @param nthreads_tri integer(1). Number of threads for TRI data. -#' Default is 5L. -#' @param nthreads_geoscf integer(1). Number of threads for GEOSCF data. -#' Default is 10L. -#' @param nthreads_gmted integer(1). Number of threads for GMTED data. -#' Default is 4L. -#' @param nthreads_narr integer(1). Number of threads for NARR data. -#' Default is 24L. -#' @param nthreads_groads integer(1). Number of threads for GROADS data. -#' Default is 3L. -#' @param nthreads_population integer(1). Number of threads for population data. -#' Default is 3L. -#' @param nthreads_append integer(1). Number of threads for appending data. -#' Default is 8L. -#' @param nthreads_impute integer(1). Number of threads for imputing data. -#' Default is 64L. -#' -#' @note -#' The number of threads used is fixed as 1L -#' otherwise specified in `nthreads_*` arguments. 
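The removed divisor() above enumerates the fold counts that satisfy the nrow(data) %% cv_fold == 0 requirement of generate_cv_index(); for example:

divisor(12L)
#> [1]  1  2  3  4  6 12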
-#' path_input should contain the following subdirectories: -#' - modis/raw/61/MOD11A1 -#' - modis/raw/61/MOD06_L2 -#' - modis/raw/61/MOD09GA -#' - modis/raw/61/MCD19A2 -#' - modis/raw/61/MOD13A2 -#' - modis/raw/5000/VNP46A2 -#' - aqs -#' - nlcd -#' - geos/aqc_tavg_1hr_g1440x721_v1 -#' - geos/chm_tavg_1hr_g1440x721_v1 -#' - HMS_Smoke/data -#' - gmted -#' - nei -#' - narr -#' - HMS_Smoke -#' - koppen_geiger -#' - ecoregions -#' - sedac_groads -#' - sedac_population -#' -#' @returns A list of arguments for common use -#' in the calculation process. -#' * char_siteid: Character string specifying the site ID. -#' * char_timeid: Character string specifying the time ID. -#' * char_period: Character vector specifying the time period. -#' * num_extent: Numeric vector specifying the extent. -#' * char_user_email: Character string specifying the user email. -#' * char_input_dir: Character string specifying the input path. -#' * nthreads_nasa: Number of threads for NASA data. -#' * nthreads_hms: Number of threads for HMS data. -#' * nthreads_tri: Number of threads for TRI data. -#' * nthreads_geoscf: Number of threads for GEOS-CF data. -#' * nthreads_gmted: Number of threads for GMTED data. -#' * nthreads_narr: Number of threads for NARR data. -#' * nthreads_groads: Number of threads for SEDAC Groads data. -#' * nthreads_population: Number of threads for population data. -#' * nthreads_append: Number of threads for appending data. -#' * nthreads_impute: Number of threads for imputing data. -#' @author Insang Song -#' @importFrom qs qsave -#' @export -# nolint start -set_args_calc <- - function( - char_siteid = "site_id", - char_timeid = "time", - char_period = c("2018-01-01", "2022-10-31"), - num_extent = c(-126, -62, 22, 52), - char_user_email = paste0(Sys.getenv("USER"), "@nih.gov"), - export = FALSE, - path_export = "inst/targets/punchcard_calc.qs", - char_input_dir = "input", - nthreads_nasa = 14L, - nthreads_hms = 3L, - nthreads_tri = 5L, - nthreads_geoscf = 10L, - nthreads_gmted = 4L, - nthreads_narr = 24L, - nthreads_groads = 3L, - nthreads_population = 3L, - nthreads_append = 8L, - nthreads_impute = 64L - ) { - list_common <- - list( - char_siteid = char_siteid, - char_timeid = char_timeid, - char_period = char_period, - extent = num_extent, - char_user_email = char_user_email, - char_input_dir = char_input_dir, - nthreads_nasa = nthreads_nasa, - nthreads_hms = nthreads_hms, - nthreads_tri = nthreads_tri, - nthreads_geoscf = nthreads_geoscf, - nthreads_gmted = nthreads_gmted, - nthreads_narr = nthreads_narr, - nthreads_groads = nthreads_groads, - nthreads_population = nthreads_population, - nthreads_append = nthreads_append, - nthreads_impute = nthreads_impute - ) - ain <- function(x, append = FALSE) { - if (append) { - file.path(char_input_dir, x, "data_files") - } else { - file.path(char_input_dir, x) - } - } - if (export) { - list_paths <- - list( - mod11 = load_modis_files(ain("modis/raw/61/MOD11A1"), date = list_common$char_period), - mod06 = load_modis_files(ain("modis/raw/61/MOD06_L2"), date = list_common$char_period), - mod09 = load_modis_files(ain("modis/raw/61/MOD09GA"), date = list_common$char_period), - mcd19 = load_modis_files(ain("modis/raw/61/MCD19A2"), date = list_common$char_period), - mod13 = load_modis_files(ain("modis/raw/61/MOD13A2"), date = list_common$char_period), - viirs = load_modis_files(ain("modis/raw/5000/VNP46A2"), "h5$", date = list_common$char_period) - ) - - list_proccalc <- - list( - aqs = list(path = ain("aqs", TRUE)), - mod11 = list(from = list_paths$mod11, 
- name_covariates = sprintf("MOD_SFCT%s_0_", c("D", "N")), - subdataset = "^LST_", - nthreads = nthreads_nasa, - radius = c(1e3, 1e4, 5e4)), - mod06 = list(from = list_paths$mod06, - name_covariates = sprintf("MOD_CLCV%s_0_", c("D", "N")), - subdataset = c("Cloud_Fraction_Day", "Cloud_Fraction_Night"), - nthreads = nthreads_nasa, - preprocess = amadeus::process_modis_swath, - radius = c(1e3, 1e4, 5e4)), - mod09 = list(from = list_paths$mod09, - name_covariates = sprintf("MOD_SFCRF_%d_", seq(1, 7)), - subdataset = "^sur_refl_", - nthreads = nthreads_nasa, - radius = c(1e3, 1e4, 5e4)), - mcd19_1km = list(from = list_paths$mcd19, - name_covariates = sprintf("MOD_AD%dTA_0_", c(4, 5)), - subdataset = "^Optical_Depth", - nthreads = nthreads_nasa, - radius = c(1e3, 1e4, 5e4)), - mcd19_5km = list(from = list_paths$mcd19, - name_covariates = sprintf("MOD_%sAN_0_", c("CSZ", "CVZ", "RAZ", "SCT", "GLN")), - subdataset = "cos|RelAZ|Angle", - nthreads = nthreads_nasa, - radius = c(1e3, 1e4, 5e4)), - mod13 = list(from = list_paths$mod13, - name_covariates = "MOD_NDVIV_0_", - subdataset = "(NDVI)", - nthreads = nthreads_nasa, - radius = c(1e3, 1e4, 5e4)), - viirs = list(from = list_paths$viirs, - name_covariates = "MOD_LGHTN_0_", - subdataset = 3, - nthreads = nthreads_nasa, - preprocess = amadeus::process_blackmarble, - radius = c(1e3, 1e4, 5e4)), - geoscf_aqc = list(date = list_common$char_period, - path = ain("geos/aqc_tavg_1hr_g1440x721_v1"), - nthreads = nthreads_geoscf), - geoscf_chm = list(date = list_common$char_period, - path = ain("geos/chm_tavg_1hr_g1440x721_v1"), - nthreads = nthreads_geoscf), - # base class covariates start here - hms = list(path = ain("HMS_Smoke", TRUE), - date = list_common$char_period, - covariate = "hms", - domain = c("Light", "Medium", "Heavy"), - nthreads = nthreads_hms, - domain_name = "variable"), - gmted = list( - path = ain("gmted", TRUE), - covariate = "gmted" - ), - nei = list( - domain = c(2017, 2020), - domain_name = "year", - path = ain("nei", TRUE), - covariate = "nei" - ), - tri = list( - domain = seq(2018, 2022), - domain_name = "year", - path = ain("tri"), - radius = c(1e3, 1e4, 5e4), - covariate = "tri", - nthreads = nthreads_tri - ), - nlcd = list( - domain = c(2019, 2021), - domain_name = "year", - path = ain("nlcd", TRUE), - covariate = "nlcd", - mode = "exact", - extent = NULL, - radius = c(1e3, 1e4, 5e4), - max_cells = 1e8 - ), - koppen = list(path = ain("koppen_geiger/data_files/Beck_KG_V1_present_0p0083.tif"), - covariate = "koppen", - nthreads = 1L), - ecoregions = list(path = ain("ecoregions/data_files/us_eco_l3_state_boundaries.shp"), - covariate = "ecoregions", - nthreads = 1L), - narr = list( - path = ain("narr"), - covariate = "narr", - domain_reduced = c("air.sfc", "albedo", "apcp", "dswrf", "evap", "hcdc", - "hpbl", "lcdc", "lhtfl", "mcdc", "omega", "pr_wtr", - "pres.sfc", "shtfl", "snowc", "soilm", - "tcdc", "ulwrf.sfc", "uwnd.10m", "vis", "vwnd.10m", "weasd"), - domain_appt = c("prate", "shum"), - domain = c("air.sfc", "albedo", "apcp", "dswrf", "evap", "hcdc", - "hpbl", "lcdc", "lhtfl", "mcdc", "omega", "pr_wtr", - "prate", "pres.sfc", "shtfl", "shum", "snowc", "soilm", - "tcdc", "ulwrf.sfc", "uwnd.10m", "vis", "vwnd.10m", "weasd"), - domain_name = "variable", - date = list_common$char_period, - process_function = process_narr2, - calc_function = calc_narr2, - nthreads = nthreads_narr - ), - groads = list( - path = ain("sedac_groads/data_files/gROADS-v1-americas.gdb"), - covariate = "groads", - radius = c(1e3, 1e4, 5e4), - nthreads = 
nthreads_groads), - population = list( - path = ain("sedac_population/data_files/gpw_v4_population_density_adjusted_to_2015_unwpp_country_totals_rev11_2020_30_sec.tif"), - covariate = "population", fun = "mean", - radius = c(1e3, 1e4, 5e4), - nthreads = nthreads_population - ) - ) - - attr(list_proccalc, "description") <- - tibble::tribble( - ~dataset, ~description, - "mod11", "MODIS Land Surface Temperature Day/Night", - "mod06", "MODIS Cloud Fraction Day/Night", - "mod09", "MODIS Surface Reflectance", - "mcd19_1km", "MCD19A2 1km", - "mcd19_5km", "MCD19A2 5km", - "mod13", "MODIS Normalized Difference Vegetation Indexß", - "viirs", "VIIRS Nighttime Lights", - "hms", "NOAA Hazard Mapping System Smoke", - "geoscf_aqc", "GEOS-CF AQC", - "geoscf_chm", "GEOS-CF CHM", - "gmted", "GMTED elevation", - "nei", "National Emission Inventory", - "tri", "Toxic Release Inventory", - "nlcd", "National Land Cover Database", - "koppen", "Koppen-Geiger Climate Classification", - "ecoregions", "EPA Ecoregions", - "narr", "NARR", - "groads", "SEDAC Global Roads", - "population", "SEDAC Population Density" - ) - if (is.null(path_export)) { - assign("arglist_proccalc", list_proccalc, envir = .GlobalEnv) - return(list_common) - } else { - qs::qsave( - list_proccalc, - path_export - ) - return(list_common) - } - } - return(list_common) - } - - -#' Generate argument list for raw data download -#' @keywords Utility -#' @param char_period Character(2) vector specifying the time period. -#' Default is c("2018-01-01", "2022-10-31"). -#' @param char_input_dir Character string specifying the input path. -#' Default is "input". -#' @param nasa_earth_data_token Character string specifying the NASA Earth Data token. -#' @param year_nlcd numeric(2). Numeric vector specifying the NLCD years. -#' Default is c(2019, 2021). -#' @param export logical(1). If TRUE, the list is saved to `path_export`. -#' Default is `TRUE`. -#' @param path_export Character string specifying the export path. -#' Default is "inst/targets/download_spec.qs". 
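An illustrative call to the removed set_args_calc() above; the input directory is assumed to follow the layout listed in its roxygen note, and export = FALSE returns only the common argument list without writing the punchcard file.

calc_args <- set_args_calc(
  char_period = c("2018-01-01", "2022-10-31"),
  char_input_dir = "input",
  export = FALSE
)
calc_args$nthreads_narr
#> [1] 24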
-#' @export -set_args_download <- - function( - char_period = c("2018-01-01", "2022-10-31"), - char_input_dir = "input", - nasa_earth_data_token = NULL, - year_nlcd = c(2019, 2021), - export = FALSE, - path_export = "inst/targets/download_spec.qs" - ) { - ain <- function(x, append = FALSE) { - if (append) { - file.path(char_input_dir, x, "data_files") - } else { - file.path(char_input_dir, x) - } - } - - time_periods <- as.numeric(substr(char_period, 1, 4)) - year_nei <- seq(2017, time_periods[2], 3) - gmted_vars <- - c("Breakline Emphasis", "Systematic Subsample", "Median Statistic", - "Minimum Statistic", "Mean Statistic", "Maximum Statistic", - "Standard Deviation Statistic" - ) - narr_variables_mono <- - c("air.sfc", "albedo", "apcp", "dswrf", "evap", "hcdc", - "hpbl", "lcdc", "lhtfl", "mcdc", "pr_wtr", - "prate", "pres.sfc", "shtfl", "snowc", "soilm", - "tcdc", "ulwrf.sfc", "uwnd.10m", "vis", "vwnd.10m", "weasd") - narr_variables_plevels <- - c("omega", "shum") - - list_download_config <- - list( - aqs = list(dataset_name = "aqs", directory_to_save = ain("aqs", TRUE), - year_start = time_periods[1], year_end = time_periods[2], - unzip = TRUE, remove_zip = TRUE), - mod11 = list(dataset_name = "modis", directory_to_save = ain("modis/raw"), - product = "MOD11A1", date_start = char_period[1], date_end = char_period[2], - nasa_earth_data_token = nasa_earth_data_token), - mod06 = list(dataset_name = "modis", directory_to_save = ain("modis/raw"), - product = "MOD06_L2", date_start = char_period[1], date_end = char_period[2], - nasa_earth_data_token = nasa_earth_data_token), - mod09 = list(dataset_name = "modis", directory_to_save = ain("modis/raw"), - product = "MOD09GA", date_start = char_period[1], date_end = char_period[2], - nasa_earth_data_token = nasa_earth_data_token), - mcd19 = list(dataset_name = "modis", directory_to_save = ain("modis/raw"), - product = "MCD19A2", date_start = char_period[1], date_end = char_period[2], - nasa_earth_data_token = nasa_earth_data_token), - mod13 = list(dataset_name = "modis", directory_to_save = ain("modis/raw"), - product = "MOD13A2", date_start = char_period[1], date_end = char_period[2], - nasa_earth_data_token = nasa_earth_data_token), - viirs = list(dataset_name = "modis", directory_to_save = ain("modis/raw"), - product = "VNP46A2", date_start = char_period[1], date_end = char_period[2], - version = "5000", - nasa_earth_data_token = nasa_earth_data_token), - geoscf_aqc = list(dataset_name = "geos", directory_to_save = ain("geos"), - collection = "aqc_tavg_1hr_g1440x721_v1", - date_start = char_period[1], date_end = char_period[2]), - geoscf_chm = list(dataset_name = "geos", directory_to_save = ain("geos"), - collection = "chm_tavg_1hr_g1440x721_v1", - date_start = char_period[1], date_end = char_period[2]), - hms = list(dataset_name = "smoke", directory_to_save = ain("HMS_Smoke"), - data_format = "Shapefile", - date_start = char_period[1], date_end = char_period[2], - unzip = TRUE, remove_zip = TRUE), - gmted = lapply(gmted_vars, - function(v) { - list(dataset_name = "gmted", directory_to_save = ain("gmted", TRUE), - static = v, resolution = "7.5 arc-seconds", - unzip = TRUE, remove_zip = TRUE) - }), - nei = lapply(year_nei, - function(y) { - list(dataset_name = "nei", directory_to_save = ain("nei", TRUE), - year_target = y, unzip = TRUE) - }), - tri = list(dataset_name = "tri", directory_to_save = ain("tri"), - year_start = time_periods[1], year_end = time_periods[2]), - nlcd = lapply(year_nlcd, - function(y) { - list(dataset_name = "nlcd", 
directory_to_save = ain("nlcd", TRUE), - year = y, - unzip = TRUE, remove_zip = TRUE) - }), - koppen = list(dataset_name = "koppen", directory_to_save = ain("koppen_geiger", TRUE), - data_resolution = "0.0083", time_period = "Present", unzip = TRUE, remove_zip = TRUE), - ecoregions = list(dataset_name = "koppen", directory_to_save = ain("ecoregions", TRUE), - unzip = TRUE, remove_zip = TRUE), - narr_monolevel = lapply(narr_variables_mono, - function(v) { - list(dataset_name = "narr_monolevel", directory_to_save = ain("narr"), - variables = v, year_start = char_period[1], year_end = char_period[2]) - }), - narr_p_levels = lapply(narr_variables_plevels, - function(v) { - list(dataset_name = "narr_p_levels", directory_to_save = ain("narr"), - variables = v, year_start = char_period[1], year_end = char_period[2]) - }) - , - groads = list(dataset_name = "sedac_groads", directory_to_save = ain("sedac_groads", TRUE), - data_region = "Americas", data_format = "Geodatabase", - unzip = TRUE, remove_zip = TRUE), - population = list(dataset_name = "sedac_population", directory_to_save = ain("sedac_population", TRUE), - data_resolution = "30 second", data_format = "GeoTIFF", year = "2020", unzip = TRUE, remove_zip = TRUE) - ) - - if (export) { - qs::qsave(list_download_config, path_export) - message("Download configuration is saved to ", path_export) - } - return(list_download_config) - } - -# nolint end - -#' Prepare spatial and spatiotemporal cross validation sets -#' @keywords Baselearner -#' @param data data.table with X, Y, and time information. -#' @param r_subsample The proportion of rows to be sampled. -#' @param target_cols character(3). Names of columns for X, Y. -#' Default is `c("lon", "lat")`. It is passed to sf::st_as_sf to -#' subsequently generate spatial cross-validation indices using -#' `spatialsample::spatial_block_cv` and -#' `spatialsample::spatial_clustering_cv`. -#' @param cv_make_fun function(1). Function to generate spatial -#' cross-validation indices. Default is `spatialsample::spatial_block_cv`. -#' @seealso [`generate_cv_index`] [`spatialsample::spatial_block_cv`] -#' [`spatialsample::spatial_clustering_cv`] -#' @returns rsample::manual_rset() object. -#' @importFrom rlang inject -#' @importFrom sf st_as_sf -#' @export -prepare_cvindex <- - function( - data, - r_subsample = 0.3, - target_cols = c("lon", "lat"), - cv_make_fun = spatialsample::spatial_block_cv, - ... - ) { - data <- data %>% - dplyr::slice_sample(prop = r_subsample) - - if (methods::getPackageName(environment(cv_make_fun)) == "beethoven") { - cv_index <- - rlang::inject( - cv_make_fun( - data = data, - target_cols = target_cols, - !!!list(...) - ) - ) - } else { - data_sf <- sf::st_as_sf(data, coords = target_cols, remove = FALSE) - cv_index <- - rlang::inject( - cv_make_fun( - data_sf, - !!!list(...) - ) - ) - # assign id with function name - fun_name <- as.character(substitute(cv_make_fun)) - fun_name <- fun_name[length(fun_name)] - cv_index$id <- sprintf("%s_%02d", fun_name, seq_len(nrow(cv_index))) - } - return(cv_index) - } - - -#' Restore the full data set from the rset object -#' @keywords Baselearner -#' @param rset rsample::manual_rset() object's `splits` column -#' @param data_full data.table with all features -#' @returns A list of data.table objects. -#' @note $splits should be present in rset. 
-restore_rset_full <- - function(rset, data_full) { - rset$splits <- - lapply( - rset$splits, - function(x) { - x$data <- - collapse::join( - x$data, - data_full, - on = c("site_id", "time"), - how = "left" - ) - return(x) - } - ) - return(rset) - } - - -#' Restore the full data set from two rset objects then fit the best model -#' @keywords Baselearner -#' @param rset_trimmed rset object without data in splits column. -#' @param rset_full rset object with full data. -#' @param df_full data.table with full data. -#' @param nested logical(1). If TRUE, the rset object is nested. -#' @param nest_length integer(1). Length of the nested list. -#' i.e., Number of resamples. -#' @returns rset object with full data in splits column. -#' @export -restore_fit_best <- - function( - rset_trimmed, - rset_full, - df_full, - by = c("site_id", "time"), - nested = TRUE, - nest_length = 30L - ) { - parsnip_spec <- - workflows::extract_spec_parsnip(rset_trimmed[[1]]) - # Do I need to restore full data in rset_trimmed? - - # reassemble the branched rsets - if (nested) { - rset_trimmed <- - as.list(seq_len(nest_length)) %>% - lapply( - function(x) { - # here we have list length of 4, each has .metric column, - # which we want to bind_rows at - # [[1]] $.metric - # [[2]] $.metric ... - template <- x[[1]] - combined_lr <- x[seq(x, length(rset_trimmed), nest_length)] - # length of 4; - # combine rows of each element in four lists - combined_lr <- - mapply( - function(df1, df2, df3, df4) { - dplyr::bind_rows(df1, df2, df3, df4) - }, - combined_lr[[1]], combined_lr[[2]], - combined_lr[[3]], combined_lr[[4]], - SIMPLIFY = FALSE - ) - template$.metric <- combined_lr - return(template) - } - ) - } - - tuned_best <- tune::show_best(rset_trimmed, n = 1) - model_best <- - rlang::inject( - parsnip::update(parsnip_spec, parameters = !!!as.list(tuned_best)) - ) - - # fit the entire data - model_fit <- parsnip::fit(model_best, data = df_full) - pred <- predict(model_fit, data = df_full) - return(pred) - - } - - - -# nocov end diff --git a/R/processing.R b/R/processing.R new file mode 100644 index 00000000..f72f8b71 --- /dev/null +++ b/R/processing.R @@ -0,0 +1,333 @@ +# nocov start +# please note that functions here are modified version of +# the original functions in the package amadeus (<0.2.0) + + +#' Process atmospheric composition data by chunks (v2) +#' @keywords Calculation +#' @description +#' Returning a single `SpatRasterDataset` object. +#' @param date character(2). length of 10. Format "YYYY-MM-DD". +#' @param path character(1). Directory with downloaded netCDF (.nc4) files. or +#' netCDF file paths. +#' @param ... Arguments passed to [`terra::rast`]. +#' @note +#' Layer names of the returned `SpatRaster` object contain the variable, +#' pressure level, date +#' Reference duration: 1 day summary, all layers: 115 seconds +#' Superseded by [`calc_geos_strict`]. +#' @author Mitchell Manware, Insang Song +#' @return a `SpatRaster` object; +#' @importFrom terra rast +#' @importFrom terra time +#' @importFrom terra varnames +#' @importFrom terra crs +#' @importFrom terra subset +#' @export +process_geos_bulk <- + function(path = NULL, + date = c("2018-01-01", "2018-01-01"), + ...) 
{ + #### directory setup + if (length(path) == 1) { + + if (dir.exists(path)) { + path <- amadeus::download_sanitize_path(path) + paths <- list.files( + path, + pattern = "GEOS-CF.v01.rpl", + full.names = TRUE + ) + paths <- paths[grep( + ".nc4", + paths + )] + } + } else { + paths <- path + } + #### check for variable + amadeus::check_for_null_parameters(mget(ls())) + #### identify file paths + #### identify dates based on user input + dates_of_interest <- amadeus::generate_date_sequence( + date[1], + date[2], + sub_hyphen = TRUE + ) + #### subset file paths to only dates of interest + data_paths <- unique( + grep( + paste( + dates_of_interest, + collapse = "|" + ), + paths, + value = TRUE + ) + ) + #### identify collection + collection <- amadeus::process_collection( + data_paths[1], + source = "geos", + collection = TRUE + ) + cat( + paste0( + "Identified collection ", + collection, + ".\n" + ) + ) + if (length(unique(collection)) > 1) { + warning( + "Multiple collections detected. Returning data for all collections.\n" + ) + } + + filename_date <- regmatches( + data_paths, + regexpr( + "20[0-9]{2}(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])", + data_paths + ) + ) + if (any(table(filename_date) < 24)) { + warning( + "Some dates include less than 24 hours. Check the downloaded files." + ) + } + if (length(unique(filename_date)) > 10) { + message( + "More than 10 unique dates detected. Try 10-day chunks..." + ) + } + + # split filename date every 10 days + filename_date <- as.Date(filename_date, format = "%Y%m%d") + filename_date_cl <- as.integer(cut(filename_date, "30 days")) + + future_inserted <- split(data_paths, filename_date_cl) + other_args <- list(...) + data_variables <- names(terra::rast(data_paths[1])) + # nolint start + summary_byvar <- function(x = data_variables, fs) { + rast_in <- rlang::inject(terra::rast(fs, !!!other_args)) + terra::sds(lapply( + x, + function(v) { + rast_inidx <- grep(v, names(rast_in)) + rast_in <- rast_in[[rast_inidx]] + rast_summary <- terra::tapp(rast_in, index = "days", fun = "mean") + names(rast_summary) <- + paste0( + rep(v, terra::nlyr(rast_summary)), "_", + terra::time(rast_summary) + ) + terra::set.crs(rast_summary, "EPSG:4326") + return(rast_summary) + } + )) + } + # nolint end + + # summary by 10 days + # TODO: dropping furrr? + rast_10d_summary <- + furrr::future_map( + .x = future_inserted, + .f = ~summary_byvar(fs = .x), + .options = + furrr::furrr_options( + globals = c("other_args", "data_variables") + ) + ) + rast_10d_summary <- Reduce(c, rast_10d_summary) + return(rast_10d_summary) + + } + + + + +#' Process NARR Data (v2) +#' +#' This function processes NARR2 data based on the specified parameters. +#' +#' @keywords Calculation +#' @param date A character vector specifying the start and end dates +#' in the format "YYYY-MM-DD". +#' @param variable A character vector specifying the variable of interest. +#' @param path A character vector specifying the path to the data files. +#' @param ... Additional parameters to be passed to other functions. +#' +#' @return A SpatRaster object containing the processed NARR2 data. +#' +#' @details This function performs the following steps: +#' 1. Sets up the directory path. +#' 2. Checks for null parameters. +#' 3. Identifies file paths based on the specified variable. +#' 4. Generates a date sequence based on the specified start and end dates. +#' 5. Filters the file paths to include only dates of interest. +#' 6. Sets up the search abbreviation and target variable. +#' 7. 
Imports and processes the data for each file path. +#' 8. Subsets the data to include only dates of interest. +#' 9. Returns the processed data as a SpatRaster object. +#' +#' @examples +#' # Process NARR2 data for the variable "PRATE" from +#' # September 1, 2023 to September 1, 2023 +#' \dontrun{ +#' data <- +#' process_narr2( +#' date = c("2023-09-01", "2023-09-01"), +#' variable = "PRATE", +#' path = "/path/to/data" +#' ) +#' } +#' @importFrom amadeus download_sanitize_path check_for_null_parameters +#' @importFrom amadeus generate_date_sequence +#' @importFrom terra rast time subset +#' @export +process_narr2 <- function( + date = c("2023-09-01", "2023-09-01"), + variable = NULL, + path = NULL, + ...) { + #### directory setup + path <- amadeus::download_sanitize_path(path) + #### check for variable + amadeus::check_for_null_parameters(mget(ls())) + #### identify file paths + data_paths <- list.files( + path, + pattern = variable, + recursive = TRUE, + full.names = TRUE + ) + # data_paths <- grep( + # sprintf("%s*.*.nc", variable), + # data_paths, + # value = TRUE + # ) + #### define date sequence + date_sequence <- amadeus::generate_date_sequence( + date[1], + date[2], + sub_hyphen = TRUE + ) + #### path ncar + ym_from <- regmatches( + data_paths, + regexpr( + "2[0-9]{3,5}", + data_paths + ) + ) + ym_of_interest <- + substr(date_sequence, + 1, ifelse(all(nchar(ym_from) == 6), 6, 4)) + ym_of_interest <- unique(ym_of_interest) + #### subset file paths to only dates of interest + data_paths_ym <- unique( + grep( + paste( + ym_of_interest, + collapse = "|" + ), + data_paths, + value = TRUE + ) + ) + + search_abbr <- list.dirs(path)[-1] + search_abbr <- sub(paste0(path, "/"), "", search_abbr) + search_to <- c( + "ATSFC", "ALBDO", "ACPRC", "DSWRF", "ACEVP", + "HCLAF", "PLBLH", "LCLAF", "LATHF", "MCLAF", + "OMEGA", "PRWTR", "PRATE", "PRSFC", "SENHF", + "SPHUM", "SNWCV", "SLMSC", "CLDCV", "ULWRF", + "UWIND", "VISIB", "VWIND", "ACSNW" + ) + search_to <- + sprintf("MET_%s", search_to[match(variable, search_abbr)]) + + #### initiate for loop + data_full <- terra::rast() + for (p in seq_along(data_paths_ym)) { + #### import data + data_year <- terra::rast(data_paths_ym[p]) + data_year_tinfo <- terra::time(data_year) + time_processed <- as.POSIXlt(data_year_tinfo) + time_this <- time_processed[1] + cat(paste0( + "Cleaning ", variable, " data for ", + sprintf( + "%s, %d %s", + strftime(time_this, "%B"), + time_this$year + 1900, + "...\n" + ) + )) + #### check for mono or pressure levels + lvinfo <- regmatches( + names(data_year), + regexpr("level=[0-9]{3,4}", names(data_year)) + ) + if (length(lvinfo) == 0) { + cat("Detected monolevel data...\n") + names(data_year) <- paste0( + search_to, "_", + gsub("-", "", data_year_tinfo) + ) + } else { + cat("Detected pressure levels data...\n") + lvinfo <- sub("level=", "", lvinfo) + lvinfo <- sprintf("%04d", as.integer(lvinfo)) + lvinfo <- paste0("L", lvinfo) + terra::time(data_year) <- as.Date(data_year_tinfo) + names(data_year) <- sprintf( + "%s_%s_%s", + search_to, + lvinfo, + gsub("-", "", data_year_tinfo) + ) + } + data_full <- c( + data_full, + data_year, + warn = FALSE + ) + } + + #### subset years to dates of interest + data_full_cn <- names(data_full) + data_return <- terra::subset( + data_full, + which( + substr( + data_full_cn, + nchar(data_full_cn) - 7, + nchar(data_full_cn) + ) %in% date_sequence + ) + ) + cat(paste0( + "Returning daily ", + variable, + " data from ", + as.Date(date_sequence[1], format = "%Y%m%d"), + " to ", + as.Date( + 
date_sequence[length(date_sequence)], + format = "%Y%m%d" + ), + ".\n" + )) + #### return SpatRaster + return(data_return) +} + + + +# nocov end diff --git a/R/processing_misc.R b/R/processing_misc.R new file mode 100644 index 00000000..a3144efb --- /dev/null +++ b/R/processing_misc.R @@ -0,0 +1,34 @@ +# nocov start + +#' Load county sf object +#' @keywords Calculation +#' @param year integer(1). Year of the county shapefile. +#' @param exclude character. State FIPS codes to exclude. +#' Default is c("02", "15", "60", "66", "68", "69", "72", "78"). +#' @return sf object +#' @importFrom tigris counties +process_counties <- + function( + year = 2020, + exclude = c("02", "15", "60", "66", "68", "69", "72", "78") + ) { + options(tigris_use_cache = TRUE) + cnty <- tigris::counties(year = year) + cnty <- + cnty[!cnty$STATEFP %in% + c("02", "15", "60", "66", "68", "69", "72", "78"), ] + return(cnty) + } + + +#' Get Divisors +#' @keywords Miscellaneous +#' @param x integer(1). A positive integer. +#' @return A vector of divisors of x. +divisor <- + function(x) { + xv <- seq_len(x) + xv[which(x %% xv == 0)] + } + +# nocov end diff --git a/R/reserved.R b/R/reserved.R new file mode 100644 index 00000000..6bd76211 --- /dev/null +++ b/R/reserved.R @@ -0,0 +1,775 @@ +# nocov start +# not used in the pipeline, but good for educational/development purposes + + +#' Process atmospheric composition data by chunks (v2) +#' @keywords Calculation +#' @description +#' Returning a single `SpatRasterDataset` object. +#' @param date character(2). length of 10. Format "YYYY-MM-DD". +#' @param path character(1). Directory with downloaded netCDF (.nc4) files. or +#' netCDF file paths. +#' @param ... Arguments passed to [`terra::rast`]. +#' @note +#' Layer names of the returned `SpatRaster` object contain the variable, +#' pressure level, date +#' Reference duration: 1 day summary, all layers: 115 seconds +#' Superseded by [`calc_geos_strict`]. +#' @author Mitchell Manware, Insang Song +#' @return a `SpatRaster` object; +#' @importFrom terra rast +#' @importFrom terra time +#' @importFrom terra varnames +#' @importFrom terra crs +#' @importFrom terra subset +#' @export +process_geos_bulk <- + function(path = NULL, + date = c("2018-01-01", "2018-01-01"), + ...) { + #### directory setup + if (length(path) == 1) { + + if (dir.exists(path)) { + path <- amadeus::download_sanitize_path(path) + paths <- list.files( + path, + pattern = "GEOS-CF.v01.rpl", + full.names = TRUE + ) + paths <- paths[grep( + ".nc4", + paths + )] + } + } else { + paths <- path + } + #### check for variable + amadeus::check_for_null_parameters(mget(ls())) + #### identify file paths + #### identify dates based on user input + dates_of_interest <- amadeus::generate_date_sequence( + date[1], + date[2], + sub_hyphen = TRUE + ) + #### subset file paths to only dates of interest + data_paths <- unique( + grep( + paste( + dates_of_interest, + collapse = "|" + ), + paths, + value = TRUE + ) + ) + #### identify collection + collection <- amadeus::process_collection( + data_paths[1], + source = "geos", + collection = TRUE + ) + cat( + paste0( + "Identified collection ", + collection, + ".\n" + ) + ) + if (length(unique(collection)) > 1) { + warning( + "Multiple collections detected. Returning data for all collections.\n" + ) + } + + filename_date <- regmatches( + data_paths, + regexpr( + "20[0-9]{2}(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])", + data_paths + ) + ) + if (any(table(filename_date) < 24)) { + warning( + "Some dates include less than 24 hours. 
Check the downloaded files." + ) + } + if (length(unique(filename_date)) > 10) { + message( + "More than 10 unique dates detected. Try 10-day chunks..." + ) + } + + # split filename date every 10 days + filename_date <- as.Date(filename_date, format = "%Y%m%d") + filename_date_cl <- as.integer(cut(filename_date, "30 days")) + + future_inserted <- split(data_paths, filename_date_cl) + other_args <- list(...) + data_variables <- names(terra::rast(data_paths[1])) + # nolint start + summary_byvar <- function(x = data_variables, fs) { + rast_in <- rlang::inject(terra::rast(fs, !!!other_args)) + terra::sds(lapply( + x, + function(v) { + rast_inidx <- grep(v, names(rast_in)) + rast_in <- rast_in[[rast_inidx]] + rast_summary <- terra::tapp(rast_in, index = "days", fun = "mean") + names(rast_summary) <- + paste0( + rep(v, terra::nlyr(rast_summary)), "_", + terra::time(rast_summary) + ) + terra::set.crs(rast_summary, "EPSG:4326") + return(rast_summary) + } + )) + } + # nolint end + + # summary by 10 days + # TODO: dropping furrr? + rast_10d_summary <- + furrr::future_map( + .x = future_inserted, + .f = ~summary_byvar(fs = .x), + .options = + furrr::furrr_options( + globals = c("other_args", "data_variables") + ) + ) + rast_10d_summary <- Reduce(c, rast_10d_summary) + return(rast_10d_summary) + + } + + + + +#' Search package functions +#' @keywords Utility +#' @param package character(1). Package name. +#' @param search character(1). Search term. +#' @return A character vector containing the matching function names. +#' @examples +#' # Search for functions in the `amadeus` package +#' \dontrun{ +#' search_function("amadeus", "process_") +#' } +search_function <- function(package, search) { + library(package, character.only = TRUE) + grep(search, ls(sprintf("package:%s", package)), value = TRUE) +} + +#' Get data.frame of function parameters +#' @keywords Utility +#' @param functions character. Vector of function names. +#' @return A data.frame containing the parameters of the functions. +#' @importFrom dplyr as_tibble bind_rows +df_params <- function(functions) { + params <- lapply(functions, function(x) { + args <- + dplyr::as_tibble( + lapply(as.list(formals(get(x))), \(p) list(p)), + .name_repair = "minimal" + ) + return(args) + }) + paramsdf <- Reduce(dplyr::bind_rows, params) + return(paramsdf) +} + + +# nolint start +#' Base learner: Multilayer perceptron with brulee +#' +#' Multilayer perceptron model with different configurations of +#' hidden units, dropout, activation, and learning rate using brulee +#' and tidymodels. With proper settings, users can utilize graphics +#' processing units (GPU) to speed up the training process. +#' @keywords Baselearner soft-deprecated +#' @note tune package should be 1.2.0 or higher. +#' brulee should be installed with GPU support. +#' @details Hyperparameters `hidden_units`, `dropout`, `activation`, +#' and `learn_rate` are tuned. `With tune_mode = "grid"`, +#' users can modify `learn_rate` explicitly, and other hyperparameters +#' will be predefined (56 combinations per `learn_rate`). +#' @param dt_imputed The input data table to be used for fitting. +#' @param folds pre-generated rset object with minimal number of columns. +#' If NULL, `vfold` should be numeric to be used in [rsample::vfold_cv]. +#' @param tune_mode character(1). Hyperparameter tuning mode. +#' Default is "bayes", "grid" is acceptable. +#' @param tune_bayes_iter integer(1). The number of iterations for +#' Bayesian optimization. Default is 10. Only used when `tune_mode = "bayes"`. 
+#' @param learn_rate The learning rate for the model. For branching purpose. +#' Default is 0.1. +#' @param yvar The target variable. +#' @param xvar The predictor variables. +#' @param vfold The number of folds for cross-validation. +#' @param device The device to be used for training. +#' Default is "cuda:0". Make sure that your system is equipped +#' with CUDA-enabled graphical processing units. +#' @param trim_resamples logical(1). Default is TRUE, which replaces the actual +#' data.frames in splits column of `tune_results` object with NA. +#' @param return_best logical(1). If TRUE, the best tuned model is returned. +#' @param ... Additional arguments to be passed. +#' +#' @return The fitted workflow. +#' @importFrom recipes recipe update_role +#' @importFrom dplyr `%>%` +#' @importFrom parsnip mlp set_engine set_mode +#' @importFrom workflows workflow add_recipe add_model +#' @importFrom tune tune_grid fit_best +#' @importFrom tidyselect all_of +#' @importFrom yardstick metric_set rmse +#' @importFrom rsample vfold_cv +fit_base_brulee <- + function( + dt_sample, + dt_full, + folds = NULL, + cv_mode = c("spatiotemporal", "spatial", "temporal"), + tune_mode = "bayes", + tune_bayes_iter = 10L, + learn_rate = 0.1, + yvar = "Arithmetic.Mean", + xvar = seq(5, ncol(dt_sample)), + vfold = 5L, + device = "cuda:0", + trim_resamples = FALSE, + return_best = TRUE, + args_generate_cv = NULL, + ... + ) { + tune_mode <- match.arg(tune_mode, c("grid", "bayes")) + cv_mode <- match.arg(cv_mode) + + # 2^9=512, 2^15=32768 (#param is around 10% of selected rows) + grid_hyper_tune <- + expand.grid( + hidden_units = list(c(1024), c(64, 64), c(32, 32, 32), c(16, 16, 16)), + dropout = 1 / seq(4, 2, -1), + activation = c("relu", "leaky_relu"), + learn_rate = learn_rate + ) + + base_recipe <- + recipes::recipe( + dt_sample[1, ] + ) %>% + # do we want to normalize the predictors? 
+ # if so, an additional definition of truly continuous variables is needed + # recipes::step_normalize(recipes::all_numeric_predictors()) %>% + recipes::update_role(!!xvar) %>% + recipes::update_role(!!yvar, new_role = "outcome") #%>% + # recipes::step_normalize(!!yvar) + + if (is.null(folds)) { + base_vfold <- rsample::vfold_cv(dt_sample, v = vfold) + } else { + args_generate_cv <- + c( + list(data = dt_sample, cv_mode = cv_mode), + args_generate_cv + ) + # generate row index + cv_index <- inject_match(switch_generate_cv_rset, args_generate_cv) + + # using cv_index, restore rset + # NOTE 08122024: not modified -- should be recoded + base_vfold <- + convert_cv_index_rset( + cv_index, dt_sample, cv_mode = cv_mode + ) + } + # base_vfold <- + # restore_rset_full(rset = base_vfold, data_full = dt_sample) + + base_model <- + parsnip::mlp( + hidden_units = parsnip::tune(), + dropout = parsnip::tune(), + epochs = 1000L, + activation = parsnip::tune(), + learn_rate = parsnip::tune() + ) %>% + parsnip::set_engine("brulee", device = device) %>% + parsnip::set_mode("regression") + + base_wftune <- + fit_base_tune( + recipe = base_recipe, + model = base_model, + resample = base_vfold, + tune_mode = tune_mode, + grid = grid_hyper_tune, + iter_bayes = tune_bayes_iter, + trim_resamples = trim_resamples, + return_best = return_best + ) + + return(base_wftune) + } + +# dt <- qs::qread("output/dt_feat_design_imputed_061024.qs") +# dtd <- dplyr::as_tibble(dt) +# dtfit <- fit_base_brulee(dtd, r_subsample = 0.3) + + +#' Base learner: Extreme gradient boosting (XGBoost) +#' +#' XGBoost model is fitted at the defined rate (`r_subsample`) of +#' the input dataset by grid search. +#' With proper settings, users can utilize graphics +#' processing units (GPU) to speed up the training process. +#' @keywords Baselearner soft-deprecated +#' @note tune package should be 1.2.0 or higher. +#' xgboost should be installed with GPU support. +#' @details Hyperparameters `mtry`, `ntrees`, and `learn_rate` are +#' tuned. With `tune_mode = "grid"`, +#' users can modify `learn_rate` explicitly, and other hyperparameters +#' will be predefined (30 combinations per `learn_rate`). +#' @param dt_imputed The input data table to be used for fitting. +#' @param folds pre-generated rset object with minimal number of columns. +#' If NULL, `vfold` should be numeric to be used in [rsample::vfold_cv]. +#' @param tune_mode character(1). Hyperparameter tuning mode. +#' Default is "bayes", "grid" is acceptable. +#' @param tune_bayes_iter integer(1). The number of iterations for +#' Bayesian optimization. Default is 10. Only used when `tune_mode = "bayes"`. +#' @param learn_rate The learning rate for the model. For branching purpose. +#' Default is 0.1. +#' @param yvar The target variable. +#' @param xvar The predictor variables. +#' @param vfold The number of folds for cross-validation. +#' @param device The device to be used for training. +#' Default is "cuda:0". Make sure that your system is equipped +#' with CUDA-enabled graphical processing units. +#' @param trim_resamples logical(1). Default is TRUE, which replaces the actual +#' data.frames in splits column of `tune_results` object with NA. +#' @param return_best logical(1). If TRUE, the best tuned model is returned. +#' @param ... Additional arguments to be passed. +#' +#' @return The fitted workflow. 
+#' @importFrom recipes recipe update_role +#' @importFrom dplyr `%>%` +#' @importFrom parsnip boost_tree set_engine set_mode +#' @importFrom workflows workflow add_recipe add_model +#' @importFrom tune tune_grid fit_best +#' @importFrom tidyselect all_of +#' @importFrom yardstick metric_set rmse +#' @importFrom rsample vfold_cv +fit_base_xgb <- + function( + dt_imputed, + folds = NULL, + tune_mode = "bayes", + tune_bayes_iter = 10L, + learn_rate = 0.1, + yvar = "Arithmetic.Mean", + xvar = seq(5, ncol(dt_imputed)), + vfold = 5L, + device = "cuda:0", + trim_resamples = TRUE, + return_best = FALSE, + ... + ) { + tune_mode <- match.arg(tune_mode, c("grid", "bayes")) + # P --> ++ / fix as many hyperparams as possible + grid_hyper_tune <- + expand.grid( + mtry = floor(c(0.02, 0.1, 0.02) * ncol(dt_imputed)), + trees = seq(1000, 3000, 500), + learn_rate = learn_rate + ) + # dt_imputed <- + # dt_imputed %>% + # dplyr::slice_sample(prop = r_subsample) + + # generate row index for restoring rset + cv_index <- switch_generate_cv_rset( + data = dt_imputed, + cv_mode = cv_mode + ) + # using cv_index, restore rset + # NOTE 08122024: not modified -- should be recoded + rset_cv <- + convert_cv_index_rset( + index_cv, data_orig, ref_list = ref_list, cv_mode = cv_mode + ) + + + + base_recipe <- + recipes::recipe( + dt_imputed[1, ] + ) %>% + # recipes::step_normalize(recipes::all_numeric_predictors()) %>% + recipes::update_role(tidyselect::all_of(xvar)) %>% + recipes::update_role(tidyselect::all_of(yvar), new_role = "outcome") + if (is.null(folds)) { + base_vfold <- rsample::vfold_cv(dt_imputed, v = vfold) + } else { + base_vfold <- folds + } + base_vfold <- + restore_rset_full(rset = base_vfold, data_full = dt_imputed) + + base_model <- + parsnip::boost_tree( + mtry = parsnip::tune(), + trees = parsnip::tune(), + learn_rate = parsnip::tune() + ) %>% + parsnip::set_engine("xgboost", device = device) %>% + parsnip::set_mode("regression") + + base_wftune <- + fit_base_tune( + recipe = base_recipe, + model = base_model, + resample = base_vfold, + tune_mode = tune_mode, + grid = grid_hyper_tune, + iter_bayes = tune_bayes_iter, + trim_resamples = trim_resamples, + return_best = return_best + ) + + return(base_wftune) + + } + +# dt <- qs::qread("output/dt_feat_design_imputed_061024.qs") +# dtd <- dplyr::as_tibble(dt) +# dtfitx <- fit_base_xgb(dtd, xvar = names(dtd)[6:105], r_subsample = 0.3) + + +#' Base learner: Light Gradient Boosting Machine (LightGBM) +#' +#' LightGBM model is fitted at the defined rate (`r_subsample`) of +#' the input dataset by grid or Bayesian optimization search. +#' With proper settings, users can utilize graphics +#' processing units (GPU) to speed up the training process. +#' @keywords Baselearner soft-deprecated +#' @note tune package should be 1.2.0 or higher. +#' xgboost should be installed with GPU support. +#' @details Hyperparameters `mtry`, `ntrees`, and `learn_rate` are +#' tuned. With `tune_mode = "grid"`, +#' users can modify `learn_rate` explicitly, and other hyperparameters +#' will be predefined (30 combinations per `learn_rate`). +#' @param dt_imputed The input data table to be used for fitting. +#' @param folds pre-generated rset object with minimal number of columns. +#' If NULL, `vfold` should be numeric to be used in [rsample::vfold_cv]. +#' @param tune_mode character(1). Hyperparameter tuning mode. +#' Default is "bayes", "grid" is acceptable. +#' @param tune_bayes_iter integer(1). The number of iterations for +#' Bayesian optimization. Default is 10. 
Only used when `tune_mode = "bayes"`. +#' @param learn_rate The learning rate for the model. For branching purpose. +#' Default is 0.1. +#' @param yvar The target variable. +#' @param xvar The predictor variables. +#' @param vfold The number of folds for cross-validation. +#' @param device The device to be used for training. +#' Default is `"gpu"`. Make sure that your system is equipped +#' with OpenCL-capable graphical processing units. +#' A GPU-enabled version of LightGBM should be installed. +#' @param trim_resamples logical(1). Default is TRUE, which replaces the actual +#' data.frames in splits column of `tune_results` object with NA. +#' @param return_best logical(1). If TRUE, the best tuned model is returned. +#' @param ... Additional arguments to be passed. +#' +#' @return The fitted workflow. +#' @importFrom recipes recipe update_role +#' @importFrom dplyr `%>%` +#' @importFrom parsnip boost_tree set_engine set_mode +#' @importFrom workflows workflow add_recipe add_model +#' @importFrom tune tune_grid fit_best +#' @importFrom tidyselect all_of +#' @importFrom yardstick metric_set rmse +#' @importFrom rsample vfold_cv +fit_base_lightgbm <- + function( + dt_imputed, + folds = NULL, + tune_mode = "bayes", + tune_bayes_iter = 10L, + learn_rate = 0.1, + yvar = "Arithmetic.Mean", + xvar = seq(5, ncol(dt_imputed)), + vfold = 5L, + device = "gpu", + trim_resamples = TRUE, + return_best = FALSE, + ... + ) { + tune_mode <- match.arg(tune_mode, c("grid", "bayes")) + # P --> ++ / fix as many hyperparams as possible + grid_hyper_tune <- + expand.grid( + mtry = floor(c(0.02, 0.1, 0.02) * ncol(dt_imputed)), + trees = seq(1000, 3000, 500), + learn_rate = learn_rate + ) + + # generate row index for restoring rset + cv_index <- switch_generate_cv_rset( + data = dt_imputed, + cv_mode = cv_mode + ) + # using cv_index, restore rset + # NOTE 08122024: not modified -- should be recoded + rset_cv <- + convert_cv_index_rset( + index_cv, data_orig, ref_list = ref_list, cv_mode = cv_mode + ) + + + base_recipe <- + recipes::recipe( + dt_imputed[1, ] + ) %>% + # recipes::step_normalize(recipes::all_numeric_predictors()) %>% + recipes::update_role(tidyselect::all_of(xvar)) %>% + recipes::update_role(tidyselect::all_of(yvar), new_role = "outcome") + if (is.null(folds)) { + base_vfold <- rsample::vfold_cv(dt_imputed, v = vfold) + } else { + base_vfold <- folds + } + base_vfold <- + restore_rset_full(rset = base_vfold, data_full = dt_imputed) + + base_model <- + parsnip::boost_tree( + mtry = parsnip::tune(), + trees = parsnip::tune(), + learn_rate = parsnip::tune() + ) %>% + parsnip::set_engine("lightgbm", device_type = device) %>% + parsnip::set_mode("regression") + + base_wftune <- + fit_base_tune( + recipe = base_recipe, + model = base_model, + resample = base_vfold, + tune_mode = tune_mode, + grid = grid_hyper_tune, + iter_bayes = tune_bayes_iter, + trim_resamples = trim_resamples, + return_best = return_best + ) + + return(base_wftune) + + } + + +#' Base learner: Elastic net +#' +#' Elastic net model is fitted at the defined rate (`r_subsample`) of +#' the input dataset by grid search. +#' @keywords Baselearner soft-deprecated +#' @note tune package should be 1.2.0 or higher. +#' @param dt_imputed The input data table to be used for fitting. +#' @param folds pre-generated rset object with minimal number of columns. +#' If NULL, `vfold` should be numeric to be used in [rsample::vfold_cv]. +#' @param yvar The target variable. +#' @param xvar The predictor variables. 
+#' @param vfold The number of folds for cross-validation. +#' @param tune_mode character(1). Hyperparameter tuning mode. +#' Default is "grid", "bayes" is acceptable. +#' @param tune_bayes_iter integer(1). The number of iterations for +#' Bayesian optimization. Default is 50. Only used when `tune_mode = "bayes"`. +#' @param nthreads The number of threads to be used. Default is 16L. +#' @param return_best logical(1). If TRUE, the best tuned model is returned. +#' @param ... Additional arguments to be passed. +#' +#' @return The fitted workflow. +#' @importFrom future plan multicore multisession +#' @importFrom dplyr `%>%` +#' @importFrom recipes recipe update_role +#' @importFrom parsnip linear_reg set_engine set_mode +#' @importFrom workflows workflow add_recipe add_model +#' @importFrom tune tune_grid fit_best +#' @importFrom tidyselect all_of +#' @importFrom yardstick metric_set rmse +#' @importFrom rsample vfold_cv +fit_base_elnet <- + function( + dt_imputed, + folds = NULL, + # r_subsample = 0.3, + yvar = "Arithmetic.Mean", + xvar = seq(5, ncol(dt_imputed)), + tune_mode = "grid", + tune_bayes_iter = 50L, + vfold = 5L, + nthreads = 16L, + trim_resamples = TRUE, + return_best = FALSE, + ... + ) { + grid_hyper_tune <- + expand.grid( + mixture = seq(0, 1, length.out = 21), + penalty = 10 ^ seq(-3, 5) + ) + + # generate row index for restoring rset + cv_index <- switch_generate_cv_rset( + data = dt_imputed, + cv_mode = cv_mode + ) + # using cv_index, restore rset + # NOTE 08122024: not modified -- should be recoded + rset_cv <- + convert_cv_index_rset( + index_cv, data_orig, ref_list = ref_list, cv_mode = cv_mode + ) + + + + base_recipe <- + recipes::recipe( + dt_imputed[1, ] + ) %>% + # recipes::step_normalize(recipes::all_numeric_predictors()) %>% + recipes::update_role(tidyselect::all_of(xvar)) %>% + recipes::update_role(tidyselect::all_of(yvar), new_role = "outcome") + + if (is.null(folds)) { + base_vfold <- rsample::vfold_cv(dt_imputed, v = vfold) + } else { + base_vfold <- folds + } + base_vfold <- + restore_rset_full(rset = base_vfold, data_full = dt_imputed) + + base_model <- + parsnip::linear_reg( + mixture = parsnip::tune(), + penalty = parsnip::tune() + ) %>% + parsnip::set_engine("glmnet") %>% + parsnip::set_mode("regression") + + future::plan(future::multicore, workers = nthreads) + base_wftune <- + fit_base_tune( + recipe = base_recipe, + model = base_model, + resample = base_vfold, + tune_mode = tune_mode, + grid = grid_hyper_tune, + iter_bayes = tune_bayes_iter, + trim_resamples = trim_resamples, + return_best = return_best + ) + future::plan(future::sequential) + return(base_wftune) + + } +# nolint end + +#' Restore the full data set from the rset object +#' @keywords Baselearner soft-deprecated +#' @param rset [rsample::manual_rset()] object's `splits` column +#' @param data_full data.table with all features +#' @return A list of data.table objects. +#' @importFrom collapse join +#' @note $splits should be present in rset. +restore_rset_full <- + function(rset, data_full) { + rset$splits <- + lapply( + rset$splits, + function(x) { + x$data <- + collapse::join( + x$data, + data_full, + on = c("site_id", "time"), + how = "left" + ) + return(x) + } + ) + return(rset) + } + + +#' Restore the full data set from two rset objects then fit the best model +#' @keywords Baselearner soft-deprecated +#' @param rset_trimmed rset object without data in splits column. +#' @param rset_full rset object with full data. +#' @param df_full data.table with full data. 
+#' @param nested logical(1). If TRUE, the rset object is nested. +#' @param nest_length integer(1). Length of the nested list. +#' i.e., Number of resamples. +#' @note Per introduction of fit_base_tune, +#' the utility of this function might be limited. +#' @return rset object with full data in splits column. +#' @importFrom dplyr %>% +#' @export +restore_fit_best <- + function( + rset_trimmed, + rset_full, + df_full, + by = c("site_id", "time"), + nested = TRUE, + nest_length = 30L + ) { + parsnip_spec <- + workflows::extract_spec_parsnip(rset_trimmed[[1]]) + # Do I need to restore full data in rset_trimmed? + + # reassemble the branched rsets + if (nested) { + rset_trimmed <- + as.list(seq_len(nest_length)) %>% + lapply( + function(x) { + # here we have list length of 4, each has .metric column, + # which we want to bind_rows at + # [[1]] $.metric + # [[2]] $.metric ... + template <- x[[1]] + combined_lr <- x[seq(x, length(rset_trimmed), nest_length)] + # length of 4; + # combine rows of each element in four lists + combined_lr <- + mapply( + function(df1, df2, df3, df4) { + dplyr::bind_rows(df1, df2, df3, df4) + }, + combined_lr[[1]], combined_lr[[2]], + combined_lr[[3]], combined_lr[[4]], + SIMPLIFY = FALSE + ) + template$.metric <- combined_lr + return(template) + } + ) + } + + tuned_best <- tune::show_best(rset_trimmed, n = 1) + model_best <- + rlang::inject( + update(parsnip_spec, parameters = !!!as.list(tuned_best)) + ) + + # fit the entire data + model_fit <- parsnip::fit(model_best, data = df_full) + pred <- predict(model_fit, data = df_full) + return(pred) + + } + + + +# nocov end diff --git a/R/targets_control.R b/R/targets_control.R new file mode 100644 index 00000000..613b840c --- /dev/null +++ b/R/targets_control.R @@ -0,0 +1,55 @@ +# nocov start + + +#' Set resource management for SLURM +#' +#' This function sets up resources for SLURM job submission. +#' Note that this function is designed to work with `tar_make_future()` +#' @keywords Utility +#' @param template_file SLURM job submission shell template path. +#' @param partition character(1). Name of partition. Default is `"geo"` +#' @param ncpus integer(1). Number of CPU cores assigned to each task. +#' @param ntasks integer(1). Number of tasks to submit. +#' @param memory integer(1). Specifically odds to 2*x GB. +#' @param user_email character(1). User email address. +#' @param error_log character(1). Error log file name. +#' @note This function is designed to be used with `tar_resources`. +#' Suggested number of `ncpus` is more than 1 for typical multicore R tasks. 
+#' @return A list of resources for `tar_resources` +#' @author Insang Song +#' @importFrom future tweak +#' @importFrom future.batchtools batchtools_slurm +#' @importFrom targets tar_resources +#' @importFrom targets tar_resources_future +#' @export +set_slurm_resource <- + function( + template_file = "inst/targets/template_slurm.tmpl", + partition = "geo", + ncpus = 2L, + ntasks = 2L, + memory = 8, + user_email = paste0(Sys.getenv("USER"), "@nih.gov"), + error_log = "slurm_error.log" + ) { + targets::tar_resources( + future = targets::tar_resources_future( + plan = future::tweak( + future.batchtools::batchtools_slurm, + template = template_file, + resources = + list( + partition = partition, + ntasks = ntasks, + ncpus = ncpus, + memory = memory, + email = user_email, + error.file = error_log + ) + ) + ) + ) + } + + +# nocov end diff --git a/README.md b/README.md index ab2b689e..4dc94b48 100755 --- a/README.md +++ b/README.md @@ -1,12 +1,20 @@ +# Building an Extensible, rEproducible, Test-driven, Harmonized, Open-source, Versioned, ENsemble model for air quality two hexagons with distributed tan, orange, and teal with geometric symbols placed. Two hexagons are diagonally placed from the top left to the bottom right + + +

+ [![R-CMD-check](https://github.com/NIEHS/beethoven/actions/workflows/check-standard.yaml/badge.svg)](https://github.com/NIEHS/beethoven/actions/workflows/check-standard.yaml) -[![cov](https://NIEHS.github.io/beethoven/badges/coverage.svg)](https://github.com/NIEHS/beethoven/actions) +[![cov](https://NIEHS.github.io/beethoven/badges/coverage.svg)](https://github.com/NIEHS/beethoven/actions/workflows/test-coverage.yaml) [![lint](https://github.com/NIEHS/beethoven/actions/workflows/lint.yaml/badge.svg)](https://github.com/NIEHS/beethoven/actions/workflows/lint.yaml) [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) -# Building an Extensible, rEproducible, Test-driven, Harmonized, Open-source, Versioned, ENsemble model for air quality -Group Project for the Spatiotemporal Exposures and Toxicology group with help from friends :smiley: :cowboy_hat_face: :earth_americas: +Group Project for the Spatiotemporal Exposures and Toxicology group with help from friends :smiley: :cowboy_hat_face: :earth_americas: + + +

+ ## Installation ```r @@ -152,18 +160,10 @@ Here, we describe the structure of the project and the naming conventions used. - `vignettes/` Rmd (and potentially Qmd) narrative text and code files. These are rendered into the **Articles** for the package website created by [pkgdown](https://pkgdown.r-lib.org/) - `inst/` Is a sub-directory for arbitrary files outside of the main `R/` directory - `targets` which include the important pipeline file `_targets.R` - - `lookup` Is a subdirectory for text file lookup table used in the pipeline to synchronize paths, names, abbreviations, etc. - `.github/workflows/` This hidden directory is where the GitHub CI/CD yaml files reside ##### The following sub-directories are not including the package build and included only in the source code here -- `tools/` This sub-directory is dedicated to educational or demonstration material (e.g. Rshiny). -- `input/` ***warning soon to be deprecated*** This sub-directory contains data used during the analysis. It is going to be superceded by the use of `targets` -- `output/` ***warning: soon to be deprecated*** This sub-directory contains data used during the analysis. It is going to be superceded by the use of `targets` -Currently, as of 3/29/24, the output folder contains .rds files for each -of the covariates/features for model development. e.g.: - -- NRTAP_Covars_NLCD.rds -- NRTAP_Covars_TRI.rds +- `tools/` This sub-directory is dedicated to educational or demonstration material (e.g. Rshiny). #### Relevant files @@ -179,7 +179,7 @@ Here, we provide the `beethoven` naming conventions for objects as used in `targ For `tar_target` functions, we use the following naming conventions: -Naming conventions for `tar` objects. We are motivated by the [Compositional Forecast](https://cfconventions.org/Data/cf-standard-names/docs/guidelines.html) (CF) model naming conventions: +Naming conventions for `targets` objects. We are motivated by the [Compositional Forecast](https://cfconventions.org/Data/cf-standard-names/docs/guidelines.html) (CF) model naming conventions: e.g. [surface] [component] standard_name [at surface] [in medium] [due to process] [assuming condition] In CF, the entire process can be known from the required and optional naming pieces. @@ -196,14 +196,14 @@ Examples: 1) `sf_PM25_log10-fit_AQS_siteid` is an `sf` object for `PM25` data th #### Naming section definitions: -- **R object type**: sf, datatable, tibble, SpatRaster, SpatVector +- **R object type**: chr (character), list, sf, dt (datatable), tibble, SpatRaster, SpatVector - **role:** Detailed description of the role of the object in the pipeline. Allowable keywords: - PM25 - - feature (i.e. geographic covariate) + - feat (feature) (i.e. geographic covariate) - base_model - - base_model suffix types: linear, random_forest, xgboost, neural_net etc. + - base_model suffix types: linear, random_forest, lgb (lightGBM), xgb (xgboost), mlp (neural network, multilayer perceptron) etc. - meta_model - prediction - plot @@ -213,8 +213,7 @@ Examples: 1) `sf_PM25_log10-fit_AQS_siteid` is an `sf` object for `PM25` data th are also articulated here. Allowable keywords: - raw - - process - - calc + - calc: results from processing-calculation chains - fit: Ready for base/meta learner fitting - result: Final result - log @@ -222,7 +221,6 @@ are also articulated here. Allowable keywords: - **source:** the original data source - - AQS - MODIS - GMTED @@ -230,8 +228,8 @@ are also articulated here. 
Allowable keywords: - NARR - GEOSCF - TRI + - NEI - KOPPENGEIGER - - MERRA2 - HMS - gROADS - POPULATION @@ -301,14 +299,13 @@ Users could comment out the three lines to keep targets in `_targets` directory set_args_calc( char_siteid = "site_id", char_timeid = "time", - char_period = c("2018-01-01", "2022-10-31"), + char_period = c("2018-01-01", "2022-12-31"), num_extent = c(-126, -62, 22, 52), char_user_email = paste0(Sys.getenv("USER"), "@nih.gov"), export = FALSE, - path_export = "inst/targets/punchcard_calc.qs", + path_export = "inst/targets/calc_spec.qs", path_input = "input", nthreads_nasa = 14L, - nthreads_hms = 3L, nthreads_tri = 5L, nthreads_geoscf = 10L, nthreads_gmted = 4L, @@ -326,10 +323,455 @@ After switching to the project root directory (in terminal, `cd [project_root]`, > [!NOTE] > With `export = TRUE`, it will take some time to proceed to the next because it will recursively search hdf file paths. The time is affected by the number of files to search or the length of the period (`char_period`). +> [!WARNING] +> Please make sure that you are at the project root before proceeding to the following. The HPC example requires additional edits related to SBATCH directives and project root directory. + ```shell Rscript inst/targets/targets_start.R & ``` +Or in NIEHS HPC, modify several lines to match your user environment: + +```shell +# ... +#SBATCH --output=YOUR_PATH/pipeline_out.out +#SBATCH --error=YOUR_PATH/pipeline_err.err +# ... +# The --mail-user flag is optional +#SBATCH --mail-user=MYACCOUNT@nih.gov +# ... +USER_PROJDIR=/YOUR/PROJECT/ROOT +nohup nice -4 Rscript $USER_PROJDIR/inst/targets/targets_start.R +``` + +`YOUR_PATH`, `MYACCOUNT` and `/YOUR_PROJECT_ROOT` should be changed. In the end, you can run the following command: + +```shell +sbatch inst/targets/run.sh +``` + +The script will submit a job with effective commands with SLURM level directives defined by lines starting `#SBATCH`, which allocate CPU threads and memory from the specified partition. + +`inst/targets/run.sh` includes several lines exporting environment variables to bind GDAL/GEOS/PROJ versions newer than system default, geospatial packages built upon these libraries, and the user library location where required packages are installed. The environment variables need to be changed following NIEHS HPC system changes in the future. + > [!WARNING] > `set_args_*` family for downloading and summarizing prediction outcomes will be added in the future version. + + + +# Developer's guide + + +## Preamble +The objective of this document is to provide developers with the current implementation of `beethoven` pipeline for version 0.3.9. + +We assume the potential users have basic knowledge of `targets` and `tarchetypes` packages as well as functional and meta-programming. It is recommended to read Advanced R (by Hadley Wickham)'s chapters for these topics. + + +## Pipeline component and basic implementation +The pipeline is based on `targets` package. All targets are **stored** in a designated storage, which can be either a directory path or a URL when one uses cloud storage or web servers. Here we classify the components into three groups: + +1. Pipeline execution components: the highest level script to run the pipeline. +2. Pipeline configuration components: function arguments that are injected into the functions in each target. +3. Pipeline target components: definitions of each target, essentially lists of `targets::tar_target()` call classified by pipeline steps + + +Let's take a moment to be a user. 
You should consult a specific file when:
+
+- `_targets.R`: you need to modify, or you see errors about, library locations, targets storage locations, or required libraries
+  - Check the `set_args_*()` calls when you encounter a "file or directory not found" error
+- `run_slurm.sh`: "the pipeline status is not reported to my email address."
+- `inst/targets/targets_*.R` files: any errors related to running targets, except for lower-level issues in `beethoven` or `amadeus` functions
+
+> [!NOTE]
+> Please expand the toggle below to display function trees for `inst/targets/targets_*.R` files. Only functions that are directly called in each file are displayed due to screen real estate and readability concerns.
+
+
+
+`targets_*.R` file function tree + + +```mermaid +graph LR + + %% Define styles for the target files + style arglist fill:#ffcccc,stroke-width:2px,stroke:#000000,opacity:0.5 + style baselearner fill:#ccffcc,stroke-width:2px,stroke:#000000,opacity:0.5 + style calculateF fill:#ccccff,stroke-width:2px,stroke:#000000,opacity:0.5 + style download fill:#ffccff,stroke-width:2px,stroke:#000000,opacity:0.5 + style initialize fill:#ccffff,stroke-width:2px,stroke:#000000,opacity:0.5 + style metalearner fill:#ffffcc,stroke-width:2px,stroke:#000000,opacity:0.5 + style predict fill:#ffcc99,stroke-width:2px,stroke:#000000,opacity:0.5 + + %% Define the target files as nodes + arglist["**inst/targets/targets_arglist.R**"] + baselearner["**inst/targets/targets_baselearner.R**"] + calculateF["**inst/targets/targets_calculate.R**"] + download["**inst/targets/targets_download.R**"] + initialize["**inst/targets/targets_initialize.R**"] + metalearner["**inst/targets/targets_metalearner.R**"] + predict["**inst/targets/targets_predict.R**"] + + %% Define the branches with arrowhead connections + fargdown["`set_args_download`"] ---|`set_args_download`| arglist + fargcalc["`set_args_calc`"] ---|`set_args_calc`| arglist + fraw["`feature_raw_download`"] ---|`feature_raw_download`| download + readlocs["`read_locs`"] ---|`read_locs`| initialize + fitbase["`fit_base_learner`"] ---|`fit_base_learner`| baselearner + switchmodel["`switch_model`"] ---|`switch_model`| baselearner + makesub["`make_subdata`"] ---|`make_subdata`| baselearner + covindexrset["`convert_cv_index_rset`"] ---|`convert_cv_index_rset`| baselearner + attach["`attach_xy`"] ---|`attach_xy`| baselearner + gencvsp["`generate_cv_index_sp`"] ---|`generate_cv_index_sp`| baselearner + gencvts["`generate_cv_index_ts`"] ---|`generate_cv_index_ts`| baselearner + gencvspt["`generate_cv_index_spt`"] ---|`generate_cv_index_spt`| baselearner + switchrset["`switch_generate_cv_rset`"] ---|`switch_generate_cv_rset`| baselearner + fcalc["`calculate`"] ---|`calculate`| calculateF + fcalcinj["`inject_calculate`"] ---|`inject_calculate`| calculateF + fcalcinjmod["`inject_modis_par`"] ---|`inject_modis_par`| calculateF + fcalcinjgmted["`inject_gmted`"] ---|`inject_gmted`| calculateF + fcalcinjmatch["`inject_match`"] ---|`inject_match`| calculateF + fcalcgeos["`calc_geos_strict`"] ---|`calc_geos_strict`| calculateF + fcalcgmted["`calc_gmted_direct`"] ---|`calc_gmted_direct`| calculateF + fcalcnarr2["`calc_narr2`"] ---|`calc_narr2`| calculateF + fparnarr["`par_narr`"] ---|`par_narr`| calculateF + fmetalearn["`fit_meta_learner`"] ---|`fit_meta_learner`| metalearner + G["`pred`"] ---|`pred`| predict + + %% Apply thin solid dark grey lines to the branches + classDef branchStyle stroke-width:1px,stroke:#333333 + class fargdown,fargcalc,fraw,readlocs,fitbase,switchmodel,makesub,covindexrset,attach,gencvsp,gencvts,gencvspt,switchrset,fcalc,fcalcinj,fcalcinjmod,fcalcinjgmted,fcalcinjmatch,fcalcgeos,fcalcgmted,fcalcnarr2,fparnarr,fmetalearn,G branchStyle +``` + +
+ + +![](man/figures/pipeline-code-relations.svg) + +The details of argument injection is illustrated below. The specific arguments to inject are loaded from QS files that are required to be saved in `inst/targets` directory. Each QS file contains a nested list object where function arguments for downloading raw data and calculating features are defined and store. + + +#### `inst/targets/download_spec.qs` +The file is generated by a `beethoven` function `set_args_download`. In `_targets.R` file, one can skip to generate this file if raw data download is already done or unnecessary. + +```r +generate_list_download <- FALSE + +arglist_download <- + set_args_download( + char_period = c("2018-01-01", "2022-12-31"), + char_input_dir = "input", + nasa_earth_data_token = NULL,#Sys.getenv("NASA_EARTHDATA_TOKEN"), + export = generate_list_download, + path_export = "inst/targets/download_spec.qs" + ) +``` + + +#### `inst/targets/calc_spec.qs` +`set_args_calc()` function will generate this file. The file name can be changed (` path_export = "inst/targets/calc_spec.qs" `), but it must start with `calc_` as the file name prefix is used to search QS files to manage different periods. Like `download_spec.qs`, whether or not to run this function can be specified by a logical variable named `generate_list_calc` in `_targets.R` file. + +```r +generate_list_calc <- FALSE + +arglist_common <- + set_args_calc( + char_siteid = "site_id", + char_timeid = "time", + char_period = c("2018-01-01", "2022-12-31"), + num_extent = c(-126, -62, 22, 52), + char_user_email = paste0(Sys.getenv("USER"), "@nih.gov"), + export = generate_list_calc, + path_export = "inst/targets/calc_spec.qs", + char_input_dir = "/ddn/gs1/group/set/Projects/NRT-AP-Model/input" + ) +``` +QUESTION: Where (which function calls) and when is `inst/targets/init_target.sh` used? + +![](man/figures/pipeline-schema.svg) + +As a compromise between the layouts for standard R packages and `targets` pipelines, we mainly keep `tar_target()` definitions in `inst/targets/`, whereas the `targets` required components are stored in the project root. All targets are recorded in `_targets/` directory by default, and it can be changed to somewhere else by defining an external directory at `store` argument in `tar_config_set()` in `_targets.R`. If you change that part in `_targets.R`, you should run `init_targets_storage.sh` **in the project root** to create the specified directory. + +```shell +. init_targets_storage.sh +``` + +```r +# replacing yaml file. +tar_config_set( + store = "/__your__desired__location__" +) +``` + +## Before running the pipeline +For the future release and tests on various environments, one should check several lines across R and shell script files: + +- Shell script + - `/run_interactive.sh`: this file is for running the host `targets` process **in an interactive session**. All system variables including `PATH` and `LD_LIBRARY_PATH` to align with the current development system environment. The lines in the provided file are set for NIEHS HPC. Note that it may stall if there are too many other processes running on the interactive node. + - `/run_slurm.sh`: this file is for running the host `targets` process **on SLURM by SBATCH script**, meaning that one should run `sbatch run_slurm.sh`. The working directory is set in this bash script to the root of your project (i.e. `beethoven` clone root) : + +``` + # modify it into the proper directory path. 
and output/error paths in the
+ # #SBATCH directives
+ USER_PROJDIR=/ddn/gs1/home/$USER/projects
+
+ nohup nice -4 Rscript $USER_PROJDIR/beethoven/inst/targets/targets_start.R
+```
+- R script
+  - `/_targets.R`: lines 10-12, `tar_config_set(store = ...)`, should be reviewed to confirm the store location is set properly and does not overwrite targets that have already been built successfully.
+  - `/_targets.R`: the `set_args_download` and `set_args_calc` calls, i.e., the `char_input_dir` and `char_period` arguments.
+  - `/_targets.R`: the `library` argument value in `tar_option_set`, which should match the current system environment.
+
+
+
+## Basic structure of branches
+When any branching technique is applied to a target, we call that target, i.e., the set of its branches, the "grand target".
+
+When one target is branched out, the grand target should be a list, either nested or plain, depending on the context and on the command run inside each branch. Branch names consist of the grand target name followed by an automatic hash suffix; users may define their own suffixes for legibility. Branches have the advantage of keeping the network layout succinct (i.e., the interactive plot generated by `tar_visnetwork(targets_only = TRUE)`), but they complicate debugging. It is therefore strongly advised that the unit function applied to each branch be fully tested.
+
+## Branching in beethoven
+Branching is actively employed in most parts of `beethoven`. Here we walk through which targets are branched out and the rationale for branching each of them.
+
+### Downloading raw data from the source
+Download targets are separated from the calculation-model fitting sequence and operate a bit differently from other targets. Arguments stored in a QS file or files (`inst/targets/download_*.qs`) are injected into `amadeus::download_data()`, which initiates building the raw data download targets. The target is rigorously branched out and is therefore represented as one square node when one runs `targets::tar_visnetwork()`. Building the target named 'lgl_rawdir_download' will download the raw data from the internet, and it is performed **sequentially** under the current setting.
+
+Users may bypass the downloading targets through the temporary system variable `BTV_DOWNLOAD_PASS`, which is set and checked in `_targets.R`; as shown below, the download target is nullified when the variable is `"TRUE"`.
+
+```r
+# bypass option
+Sys.setenv("BTV_DOWNLOAD_PASS" = "FALSE")
+
+# abridged for display...
+
+# # nullify download target if bypass option is set
+if (Sys.getenv("BTV_DOWNLOAD_PASS") == "TRUE") {
+  target_download <- NULL
+}
+```
+
+
+### `list_feat_calc_base`
+Per the `beethoven` targets naming convention, this object is a list with seven elements at the first level, one per feature group. We say "first level" because the list is nested, which also matters for maintaining `list_feat_calc_base_flat` in the following target. The seven elements are defined in a preceding target, `chr_iter_calc_features`:
+
+```r
+  tar_target(
+    chr_iter_calc_features,
+    command = c("hms", "tri", "nei",
+                "ecoregions", "koppen", "population", "groads"),
+    iteration = "list",
+    description = "Feature calculation"
+  )
+
+```
+
+Using the `inject_calculate` function and the argument lists generated by `set_args_calc`, the elements of `chr_iter_calc_features` are passed to `amadeus` functions for calculation. Please note that the branching pattern of `list_feat_calc_base` is not simply `map(chr_iter_calc_features)` but `cross(file_prep_calc_args, chr_iter_calc_features)`, leaving room to keep multiple argument files in the future (see the short analogue below).
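+
+As a plain-R analogue (hypothetical object names, not the actual target definitions), `cross()` pairs every argument file with every feature keyword, whereas `map()` would only pair them element-wise:
+
+```r
+# Toy illustration of why cross() is used instead of map():
+# every (argument file, feature keyword) combination becomes its own branch.
+arg_files <- "inst/targets/calc_spec.qs"      # may grow to several files later
+features  <- c("hms", "tri", "nei", "ecoregions",
+               "koppen", "population", "groads")
+branches  <- expand.grid(arg_file = arg_files, feature = features,
+                         stringsAsFactors = FALSE)
+nrow(branches)   # one branch per combination: length(arg_files) * length(features)
+```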
+
+Each element in `chr_iter_calc_features` is iterated as a list, so `list_feat_calc_base` is a nested list. `list_feat_calc_base_flat` then merges each nested element into one `data.frame` (a `data.table`, actually), resulting in a non-nested `list` in which each element is a `data.frame`.
+
+### `list_feat_calc_nlcd`
+From version 0.3.10, the NLCD target is separated from `list_feat_calc_base` due to runtime concerns. Here we take a nested parallelization strategy, where each `amadeus::calc_nlcd()` run with a different year and buffer size is parallelized and each run uses 10 threads. In the initial study period, we have six combinations (two NLCD years, 2019 and 2021, and three radii of 1, 10, and 50 kilometers). Thus, the NLCD target will use 60 threads, though not necessarily concurrently. Each combination gets its own slot in the resulting list target, and the following `dt_feat_calc_nlcd` is created by pivoting the `data.frame`.
+
+
+
+### `list_feat_calc_nasa`
+MODIS-VIIRS product processing is a bit more complex than the others since many preprocessing steps are involved for this raw data. Please note that `chr_iter_calc_nasa` divides the MCD19A2 product by spatial resolution, since differences in the spatial resolution of raster layers make it difficult to stack them, which would otherwise be advantageous for processing speed. The branching itself is simple, using a character vector of length 7 to iterate the process, but there is a different avenue that might introduce complexity in terms of computational infrastructure and the implementation of parallel processing.
+
+We introduced nested parallelization to expedite the MODIS/VIIRS processing, where `tar_make_future()` submits a job per MODIS/VIIRS product code via SLURM `batchtools` and multiple threads are used within each job. If one wants to transition to a `crew`-based pipeline operation in the future, this part will require a tremendous amount of refactoring, not only in `beethoven` functions but also in `amadeus` functions, considering that the features of `crew`/`mirai` workers differ from those of `future`.
+
+
+### `list_feat_calc_geoscf`
+We use a character vector of length 2 to distinguish the chm from the aqc products. A modified version of `amadeus::calc_geos()`, `calc_geos_strict()`, is employed to calculate features. The key modification is to fix the radius argument at zero and to remove the top-level `radius` argument from the function.
+
+
+### `list_feat_calc_gmted`
+Here we use the custom function `calc_gmted_direct()`, which has different logic from what is used in `amadeus::calc_gmted()`. `inject_gmted()` uses that function to parallelize the calculation by radius length.
+
+### `list_feat_calc_narr`
+Again, the modified functions `process_narr2()` and `calc_narr2()` are applied, and the parallelization for NARR data is done by `par_narr()`. Here we did not branch out by NARR variable name since the variable list is fairly long (46 variables), so each dispatched branch would add the overhead of submitting a SLURM job per variable.
+
+
+## Merge branches
+
+Functions with the prefix `post_calc_` merge branches, which have various internal structures. Most of the branches are lists of depth 1, which means `data.frame` or `data.table` objects are in each list element. Others are lists of depth 2.
+
+### Tackling space-time discrepancy
+
+Each data source has a different temporal resolution and update frequency. This leads to different dimensions across targets due to measures taken to save computation time.
+For example, NLCD targets will get N (number of sites) times 2 (2019 and 2021 for the initial study period as of August 2024), whereas NARR targets will get N times $|D|$ (where $D$ is the set of dates), which equals the full set of site-date combinations during the study period. To tackle the discrepancy across calculated targets, an automatic expansion strategy is implemented by inferring the temporal resolution of each target. Automatic expansion starts by resolving the native temporal resolution of each target, then adds a provisional year field derived from the date, which is removed after all required join operations are completed. Most of the time, date-to-year conversion is performed internally by the `expand` functions in `beethoven`, and the full space-time `data.frame` is prioritized as the left side when joining the multiple targets.
+
+### Value filling strategies
+
+Temporal resolution discrepancies introduce `NA` values in joined `data.frame`s. In MODIS/VIIRS targets, NDVI (a subdataset of the MOD13A1 product) is based on a 16-day cycle, unlike other products, which are on a daily cycle. We consider the reported date of the "16-day cycle" to be the **last day** of the cycle.
+
+* **MODIS/VIIRS**: Therefore, the `NA` values introduced by joining `data.frame`s by the date field are filled in `impute_all` using `data.table::setnafill` with the next-observation-carried-backward (`type = "nocb"`) option.
+* MODIS/VIIRS targets may have `NaN` values where nonexistent values are assigned as replacements. These values are replaced with `NA` at first, then with zeros.
+* Other non-ignorable `NA`s in the joined target are imputed by missForest (the name of the original method; the `missRanger` package is actually used for efficiency).
+
+### Autojoin functions
+
+The automatic join function `post_calc_autojoin` is one of the most complex functions in the `beethoven` codebase; it encapsulates the effort to resolve all sorts of space-time discrepancies across targets. Full and coarse site-date combinations and full and coarse site-year combinations are automatically resolved in the function. The coarse site-year combination is a challenge since some years fall outside the study period, and such *anchor* years should be repeated so that there are no gaps in the joined data. The `post_calc_df_year_expand` function and its upstream `post_calc_year_expand` function repeat coarse site-year `data.frame`s properly to ensure that no year is left with missing values.
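+
+As a toy illustration of what "fine" and "coarse" mean here (made-up data, not from the pipeline; the call relies only on the `df_fine`/`df_coarse` arguments of the function shown below):
+
+```r
+library(data.table)
+
+# fine table: full site-date combinations (daily resolution)
+df_fine <- data.table(
+  site_id = rep(c("A", "B"), each = 3),
+  time = rep(as.character(as.Date("2021-12-30") + 0:2), 2),
+  pm25 = rnorm(6)
+)
+
+# coarse table: site-year combinations (e.g., an NLCD-like target)
+df_coarse <- data.table(
+  site_id = rep(c("A", "B"), each = 2),
+  time = rep(c("2019", "2021"), 2),
+  frac_developed = runif(4)  # toy feature column
+)
+
+# left join onto the fine table; the coarse years are expanded to cover
+# the full study period before joining (see the function source below)
+joined <- post_calc_autojoin(df_fine, df_coarse)
+```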
+ +```r +post_calc_autojoin <- + function( + df_fine, + df_coarse, + field_sp = "site_id", + field_t = "time", + year_start = 2018L, + year_end = 2022L + ) { + # Dataset specific preprocessing + if (any(grepl("population", names(df_coarse)))) { + df_coarse <- df_coarse[, -c("time"), with = FALSE] + } + + # Detect common field names + common_field <- intersect(names(df_fine), names(df_coarse)) + + # Clean inputs to retain necessary fields + df_fine <- data.table::as.data.table(df_fine) + df_coarse <- data.table::as.data.table(df_coarse) + df_fine <- post_calc_drop_cols(df_fine) + df_coarse <- post_calc_drop_cols(df_coarse) + + # Take strategy depending on the length of common field names + # Length 1 means that `site_id` is the only intersecting field + if (length(common_field) == 1) { + print(common_field) + if (common_field == field_sp) { + joined <- data.table::merge.data.table( + df_fine, df_coarse, + by = field_sp, + all.x = TRUE + ) + } + } + # When space-time join is requested, + if (length(common_field) == 2) { + if (all(common_field %in% c(field_sp, field_t))) { + # Type check to characters + df_fine[[field_t]] <- as.character(df_fine[[field_t]]) + df_coarse[[field_t]] <- as.character(df_coarse[[field_t]]) + + # When `time` field contains years, `as.Date` call will return error(s) + t_coarse <- try(as.Date(df_coarse[[field_t]][1])) + # If an error is detected, print information + if (inherits(t_coarse, "try-error")) { + message( + "The time field includes years. Trying different join strategy." + ) + coarse_years <- sort(unique(unlist(as.integer(df_coarse[[field_t]])))) + + # coarse site-year combination is expanded + df_coarse2 <- post_calc_df_year_expand( + df_coarse, + time_start = year_start, + time_end = year_end, + time_available = coarse_years + ) + joined <- + post_calc_join_yeardate(df_coarse2, df_fine, field_t, field_t) + } else { + # site-date combination data.frames are joined as they are regardless of coarseness + # Left join is enforced + joined <- data.table::merge.data.table( + df_fine, df_coarse, + by = c(field_sp, field_t), + all.x = TRUE + ) + } + } + } + return(joined) + } +``` + +### Managing calculated features + +The calculation configuration files can be multiple, which means the calculated feature targets can also be multiple. The `dt_feat_calc_cumulative` target operates differently depending on the existence of a *.qs file in the `output/qs` directory. If there is any *.qs file in the `output/qs` directory, the `dt_feat_calc_design` target will be appended (i.e., `rbind()`-ed) to the contents of the `*.qs` files. The first run will assign a file name string to `dt_feat_calc_cumulative`. + +```r +append_predecessors( + path_qs = "output/qs", + period_new = arglist_common$char_period, + input_new = dt_feat_calc_design, + nthreads = arglist_common$nthreads_append +) +``` + +### Imputation + +The calculated features contain a fair amount of `NA` or `NaN`s depending on the raw dataset. We distinguish these into "true zeros" and "true missing" for the subsequent imputation process. For imputation, `missRanger` is used. The `missRanger` arguments can be adjusted in the `impute_all()` function. + +- True zeros: TRI features include many `NA`s as the raw data is a long `data.frame` with source location-chemicals pair keys. This structure requires long-to-wide pivoting, resulting in a sparse `data.frame` with `NA`s where no chemicals were reported in certain locations. Therefore, these `NA`s are considered true zeros. 
+
+- Missing: daily satellite-derived features, except for the 16-day NDVI, are considered to include missing values. Such missing values mainly come from intermittent data transmission disruptions or planned maintenance. `NA`s in the 16-day NDVI field are filled by the next-observation-carried-backward principle described above. `NaN` values in the others are replaced with `NA` and passed into the imputation function.
+
+
+
+## Base learners
+
+For efficiency, GPU-enabled versions are recommended for `xgboost`/`lightgbm` and `brulee`. These packages need to be installed manually with modifications to system environment variables. Developers should consult the official `lightgbm` documentation for building the package by hand, the `xgboost` GitHub repository release page for installing the CUDA version manually, and the `brulee` GitHub repository (i.e., the `gpu` branch) to install the proper version of each package with careful consideration of the computing infrastructure. "GPU" here refers to CUDA-enabled devices produced by NVIDIA Corporation. This does not mean that this package, as part of U.S. government work, endorses NVIDIA Corporation or its products in any way.
+
+> [!WARNING]
+> As of version 0.3.10, `xgboost` < v2.1.0 should be used due to breaking changes in v2.1.0 in handling additional arguments in `xgb.DMatrix` (cf. [xgboost pull record](https://github.com/dmlc/xgboost/pull/9862)), which breaks the `parsnip::boost_tree()` function call.
+
+
+### tidymodels infrastructure
+
+We want to actively adopt evolving packages in the `tidymodels` ecosystem while keeping the dependency tree as minimal as possible. In this package, the major `tidymodels` packages used in the base and meta learners include:
+
+* `parsnip`
+* `recipes`
+* `rsample`
+* `spatialsample`
+* `tune`
+* `workflows`
+
+### Branching
+With rigorous branching, we maintain the base learner fitting targets as one node with 900 branches, which comprise $3 \text{ (base learners)} \times 3 \text{ (CV strategies)} \times 100 \text{ (resamples)}$. LightGBM and multilayer perceptron models run on GPUs, while elastic net models are fit on CPUs.
+
+
+### Cross validation
+
+Due to the `rsample` design, each cross-validation fold includes an **actual** `data.frame` (`tibble`) object. This has its own merit for self-contained modeling practices that easily guarantee reproducibility; however, it also has limitations when used with large data and a `targets` pipeline, as `targets` **stores** such objects on disk. These characteristics inflate the disk space needed for base and meta learner training. Ten-fold cross-validation sets from a 900K $\times$ 3.2K `data.frame` take $9\text{M} \times 3.2\text{K} \times 8\ \text{bytes} \approx 230\ \text{GB}$. Randomization schemes for the model ensemble increase that size by a factor of 10 or more, which is equivalent to 2.3 TB or more when uncompressed. The current development version modifies the original `rsample` `rset` design to store *row indices* of the joined `data.frame` target to reduce the data size on disk.
+
+
+#### Use the `rset` object only as a last resort
+
+The `rset` object is a powerful tool to ensure that all cross-validation sets "flow" through the modeling process, but it has a limitation in large-scale modeling with `targets`: storage. When one stores `rset` objects in the pipeline, even with mild randomization (e.g., 30% row sampling in the base learner step of the `beethoven` pipeline), the total disk space required to keep the `rset` objects easily exceeds several times the size of the original `data.frame` object.
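+
+As a rough back-of-envelope check of the scale quoted above (assuming 8-byte double-precision values throughout):
+
+```r
+# ~10 folds of a 900K-row x 3.2K-column numeric table kept as actual tibbles
+rows <- 9e6     # about 10 x 900K rows
+cols <- 3.2e3
+rows * cols * 8 / 1e9   # roughly 230 GB uncompressed
+```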
Thus, we prefer to keep *row indices* to restore `rset` object *inside* each base learner fitting function. Row indices here are derived from the row subsamples for base learners. `targets` will only store row indices bound with each subsample, such that the total usage of storage will be reduced significantly. Besides the disk space concerns, it has its own good to reduce the overhead or I/O for compressing massive `data.frame` (actually, `tibble`) objects. + +- `restore_*` functions restore `rset` object from row indices and their upstream `data.frame` +- `generate_*` functions generate row indices from input `data.frame` by the user-defined cross-validation strategy. + +`fit_base_learner()` is a quite long and versatile function that accepts a dozen arguments, therefore developers should be aware of each component in the function. The current implementation separated `parsnip` and `tune` parts from `fit_base_learner()`. The flowchart of `fit_base_learner()` is displayed below. + +```mermaid +graph TD + %% Define the target files as nodes + frecipe["minimal data"] + fittune["tuning results"] + fmodel["parsnip model definition"] + ftune["tuning functions"] + bestmodel["best model from tuning"] + bestworkflow["workflow of the best model"] + fitmodel["fitted best model with full data"] + bestfit["predicted values from one base learner"] + + + %% Define the branches with arrowhead connections + frecipe ---|recipes::recipe()| fittune + fmodel ---|`switch_model()`| fittune + ftune ---|`tune_*()`| fittune + fittune ---|tune::select_best()| bestmodel + bestmodel ---|tune::finalize_workflow()| bestworkflow + bestworkflow ---|parsnip::fit()| fitmodel + fitmodel ---|predict()| bestfit +``` + + +## Containerization +- TODO: build GPU-enabled Apptainer image +- TODO: make a new branch to replace `container-engine` \ No newline at end of file diff --git a/_pkgdown.yml b/_pkgdown.yml index 71c8fb25..faf2754f 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -48,11 +48,6 @@ articles: contents: - list_features -- title: Downloading EPA Daily Data - desc: Download AQS data from EPA - contents: - - epa_download - - title: Generate prediction points desc: Generate 8 millions of prediction points contents: diff --git a/_targets.R b/_targets.R index edb705aa..50fce7a0 100755 --- a/_targets.R +++ b/_targets.R @@ -2,7 +2,10 @@ library(targets) library(tarchetypes) library(future) library(future.batchtools) +library(dplyr) library(beethoven) +library(tidymodels) +library(bonsai) Sys.setenv("LD_LIBRARY_PATH" = paste("/ddn/gs1/biotools/R/lib64/R/customlib", Sys.getenv("LD_LIBRARY_PATH"), sep = ":")) @@ -11,6 +14,13 @@ tar_config_set( store = "/ddn/gs1/group/set/pipeline/beethoven_targets" ) +# maximum future exportable object size is set 50GB +# TODO: the maximum size error did not appear until recently +# and suddenly appeared. Need to investigate the cause. +# Should be removed after the investigation. 
+options(future.globals.maxSize = 50 * 2^30) + + generate_list_download <- FALSE arglist_download <- @@ -18,6 +28,7 @@ arglist_download <- char_period = c("2018-01-01", "2022-12-31"), char_input_dir = "input", nasa_earth_data_token = NULL,#Sys.getenv("NASA_EARTHDATA_TOKEN"), + mod06_filelist = "inst/targets/mod06_links_2018_2022.csv", export = generate_list_download, path_export = "inst/targets/download_spec.qs" ) @@ -32,7 +43,7 @@ arglist_common <- num_extent = c(-126, -62, 22, 52), char_user_email = paste0(Sys.getenv("USER"), "@nih.gov"), export = generate_list_calc, - path_export = "inst/targets/punchcard_calc.qs", + path_export = "inst/targets/calc_spec.qs", char_input_dir = "/ddn/gs1/group/set/Projects/NRT-AP-Model/input" ) @@ -62,7 +73,7 @@ plan( list( memory = 8, log.file = "slurm_run.log", - ncpus = 1, partition = "geo,highmem", ntasks = 1, + ncpus = 1, partition = "geo", ntasks = 1, email = arglist_common$char_user_email, error.file = "slurm_error.log" ) @@ -90,7 +101,7 @@ tar_option_set( "data.table", "sf", "terra", "exactextractr", #"crew", "crew.cluster", "tigris", "dplyr", - "future.batchtools", "qs", "collapse", + "future.batchtools", "qs", "collapse", "bonsai", "tidymodels", "tune", "rsample", "torch", "brulee", "glmnet", "xgboost", "future", "future.apply", "future.callr", "callr", diff --git a/doc/list_features.R b/doc/list_features.R new file mode 100644 index 00000000..0aec432d --- /dev/null +++ b/doc/list_features.R @@ -0,0 +1,18 @@ +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set(message = FALSE) +library(knitr) + +tab_feature <- + read.csv( + system.file( + "extdata", + "beethoven_covariate_list.csv", + package = "beethoven" + ) + ) + +## ----------------------------------------------------------------------------- +knitr::kable( + tab_feature +) + diff --git a/doc/list_features.Rmd b/doc/list_features.Rmd new file mode 100644 index 00000000..69ecc77d --- /dev/null +++ b/doc/list_features.Rmd @@ -0,0 +1,31 @@ +--- +title: "List of features" +author: "SET group" +vignette: > + %\VignetteIndexEntry{Generate prediction points} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +pkgdown: + as_is: true +--- + + +```{r setup, include=FALSE} +knitr::opts_chunk$set(message = FALSE) +library(knitr) + +tab_feature <- + read.csv( + system.file( + "extdata", + "beethoven_covariate_list.csv", + package = "beethoven" + ) + ) +``` + +```{r} +knitr::kable( + tab_feature +) +``` diff --git a/doc/list_features.html b/doc/list_features.html new file mode 100644 index 00000000..8fd60cfc --- /dev/null +++ b/doc/list_features.html @@ -0,0 +1,1465 @@ + + + + + + + + + + + + + + +List of features + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
knitr::kable(
+  tab_feature
+)
CategoryCovariateShort.Name
MeteorologicalAccumulated snowMET_ACSNW_0_00000
MeteorologicalAccumulated total evaporationMET_ACEVP_0_00000
MeteorologicalAccumulated total precipitationMET_ACPRC_0_00000
MeteorologicalAir temperature (surface)MET_ATSFC_0_00000
MeteorologicalAlbedoMET_ALBDO_0_00000
MeteorologicalCloud coverageMET_CLDCV_0_00000
MeteorologicalDownward shortwave radiation flux (surface)MET_DSWRF_0_00000
MeteorologicalHigh cloud area fractionMET_HCLAF_0_00000
MeteorologicalLatent heat fluxMET_LATHF_0_00000
MeteorologicalLow cloud area fractionMET_LCLAF_0_00000
MeteorologicalMedium cloud area fractionMET_MCLAF_0_00000
MeteorologicalOmegaMET_OMEGA_0_00000
MeteorologicalPlanetary boundary layer heightMET_PLBLH_0_00000
MeteorologicalPrecipitable water for the entire atmosphereMET_PRWEA_0_00000
MeteorologicalPrecipitation rateMET_PRRTE_0_00000
MeteorologicalPressure (surface)MET_PRSFC_0_00000
MeteorologicalSensible heat fluxMET_SENHF_0_00000
MeteorologicalSnow coverMET_SNWCV_0_00000
MeteorologicalSoil moisture contentMET_SLMSC_0_00000
MeteorologicalSpecific humidity at 2 mMET_SPHUM_0_00000
Meteorologicalu-wind at 10 m (east-west component)MET_UWIND_0_00000
MeteorologicalUpward longwave radiation flux (surface)MET_ULWRF_0_00000
Meteorologicalv-wind at 10 m (north-south component)MET_VWIND_0_00000
MeteorologicalVisibilityMET_VISIB_0_00000
MeteorologicalWind Speed⏀MET_WNDSP_0_00000
MeteorologicalAir temperature (surface) (1-day lag)MET_ATSFC_1_00000
MeteorologicalAccumulated total precipitation (1-day lag)MET_ACPRC_1_00000
MeteorologicalPressure (surface) (1-day lag)MET_PRSFC_1_00000
MeteorologicalSpecific humidity at 2 m (1-day lag)MET_SPHUM_1_00000
MeteorologicalWind speed (1-day lag)⏀MET_WNDSP_1_00000
Land UseLand-use type: wetland (100m)LDU_TWTLD_0_00100
Land UseLand-use type: water (100m)LDU_TWATR_0_00100
Land UseLand-use type: planted (100m)LDU_TPLNT_0_00100
Land UseLand-use type: herbaceous (100m)LDU_THERB_0_00100
Land UseLand-use type: shrubland (100m)LDU_TSHRB_0_00100
Land UseLand-use type: barren (100m)LDU_TBARN_0_00100
Land UseLand-use type: developed (100m)LDU_TDEVL_0_00100
Land UseLand-use type: wetland (1km)LDU_TWTLD_0_01000
Land UseLand-use type: water (1km)LDU_TWATR_0_01000
Land UseLand-use type: planted (1km)LDU_TPLNT_0_01000
Land UseLand-use type: herbaceous (1km)LDU_THERB_0_01000
Land UseLand-use type: shrubland (1km)LDU_TSHRB_0_01000
Land UseLand-use type: barren (1km)LDU_TBARN_0_01000
Land UseLand-use type: developed (1km)LDU_TDEVL_0_01000
Land UseLand-use type: wetland (10km)LDU_TWTLD_0_10000
Land UseLand-use type: water (10km)LDU_TWATR_0_10000
Land UseLand-use type: planted (10km)LDU_TPLNT_0_10000
Land UseLand-use type: herbaceous (10km)LDU_THERB_0_10000
Land UseLand-use type: shrubland (10km)LDU_TSHRB_0_10000
Land UseLand-use type: barren (10km)LDU_TBARN_0_10000
Land UseLand-use type: developed (10km)LDU_TDEVL_0_10000
Land UseElevation: maximal (100m)LDU_EMAXL_0_00100
Land UseElevation: minimal (100m)LDU_EMINL_0_00100
Land UseElevation: median (100m)LDU_EMEDN_0_00100
Land UseElevation: mean (100m)LDU_EMEAN_0_00100
Land UseElevation: systematic subsample (100m)LDU_ESSUB_0_00100
Land UseElevation: breakline emphasis (100m)LDU_EBRKL_0_00100
Land UseElevation: standard deviation (100m)LDU_ESTDV_0_00100
Land UseElevation: maximal (1km)LDU_EMAXL_0_01000
Land UseElevation: minimal (1km)LDU_EMINL_0_01000
Land UseElevation: median (1km)LDU_EMEDN_0_01000
Land UseElevation: mean (1km)LDU_EMEAN_0_01000
Land UseElevation: systematic subsample (1km)LDU_ESSUB_0_01000
Land UseElevation: breakline emphasis (1km)LDU_EBRKL_0_01000
Land UseElevation: standard deviation (1km)LDU_ESTDV_0_01000
Land UseElevation: maximal (10km)LDU_EMAXL_0_10000
Land UseElevation: minimal (10km)LDU_EMINL_0_10000
Land UseElevation: median (10km)LDU_EMEDN_0_10000
Land UseElevation: mean (10km)LDU_EMEAN_0_10000
Land UseElevation: systematic subsample (10km)LDU_ESSUB_0_10000
Land UseElevation: breakline emphasis (10km)LDU_EBRKL_0_10000
Land UseElevation: standard deviation (10km)LDU_ESTDV_0_10000
Land UseRoad density: highway density (100m)LDU_HWYDN_0_00100
Land UseRoad density: highway density (1km)LDU_HWYDN_0_01000
Land UseRoad density: highway density (10km)LDU_HWYDN_0_10000
Land UseRoad density: highway and primary road density +(100m)LDU_HPRDN_0_00100
Land UseRoad density: highway and primary road density +(1km)LDU_HPRDN_0_01000
Land UseRoad density: highway and primary road density +(10km)LDU_HPRDN_0_10000
Land UseRoad density: all road (highway, primary, secondary) +density (100m)LDU_HPSDN_0_00100
Land UseRoad density: all road (highway, primary, secondary) +density (1km)LDU_HPSDN_0_01000
Land UseRoad density: all road (highway, primary, secondary) +density (10km)LDU_HPSDN_0_10000
Land UseAnnual average daily traffic volume (100m)LDU_TRAFC_0_00100
Land UseAnnual average daily traffic volume (1km)LDU_TRAFC_0_01000
Land UseAnnual average daily traffic volume (10km)LDU_TRAFC_0_10000
MODISSurface reflectanceMOD_SFCRF_0_00000
MODISSurface temperature during the dayMOD_SFCTD_0_00000
MODISSurface temperature at nightMOD_SFCTN_0_00000
MODISCloud coverage during the dayMOD_CLCVD_0_00000
MODISCloud coverage at nightMOD_CLCVN_0_00000
MODISAerosol optical depth at 470 nm (MCD19A2)MOD_AD4TA_0_00000
MODISAerosol optical depth at 550 nm (MCD19A2)MOD_AD5TA_0_00000
MODISCosine of Solar Zenith Angle (MCD19A2)MOD_CSZAN_0_00000
MODISCosine of View Zenith Angle (MCD19A2)MOD_CVZAN_0_00000
MODISRelative Azimuth Angle (MCD19A2)MOD_RAZAN_0_00000
MODISScattering Angle (MCD19A2)MOD_SCTAN_0_00000
MODISGlint Angle (MCD19A2)MOD_GLNAN_0_00000
MODISNormalized Difference Vegetation Index (NDVI) +valueMOD_NDVIV_0_00000
MODISLight at night (VNP46A2)MOD_LGHTN_0_00500
MERRA2Hydrophilic Black CarbonMER_PHOBC_0_00000
GEOS-CFAcetoneGEO_ACETO_0_00000
GEOS-CFAcetaldehydeGEO_ACETA_0_00000
GEOS-CFC4 alkanesGEO_CALKA_0_00000
GEOS-CFHydrophilic black carbon aerosolGEO_HIBCA_0_00000
GEOS-CFHydrophobic black carbon aerosolGEO_HOBCA_0_00000
GEOS-CFBenzeneGEO_BENZE_0_00000
GEOS-CFEthaneGEO_ETHTE_0_00000
GEOS-CFPropaneGEO_PROPA_0_00000
GEOS-CFMethaneGEO_METHA_0_00000
GEOS-CFCarbon monoxideGEO_CMONO_0_00000
GEOS-CFDust (0.7 microns)GEO_DUST1_0_00000
GEOS-CFDust (1.4 microns)GEO_DUST2_0_00000
GEOS-CFDust (2.4 microns)GEO_DUST3_0_00000
GEOS-CFDust (4.5 microns)GEO_DUST4_0_00000
GEOS-CFEthanolGEO_ETHOL_0_00000
GEOS-CFHydrogen peroxideGEO_HYPER_0_00000
GEOS-CFFormaldehydeGEO_FORMA_0_00000
GEOS-CFNitric acidGEO_NITAC_0_00000
GEOS-CFPeroxynitric acidGEO_PERAC_0_00000
GEOS-CFIsopreneGEO_ISOPR_0_00000
GEOS-CFMethacroleinGEO_METHC_0_00000
GEOS-CFMethyl ethyl ketoneGEO_MEKET_0_00000
GEOS-CFMethyl vinyl ketoneGEO_MVKET_0_00000
GEOS-CFDinitrogen pentoxideGEO_DIPEN_0_00000
GEOS-CFAmmoniaGEO_AMNIA_0_00000
GEOS-CFAmmoniumGEO_AMNUM_0_00000
GEOS-CFInorganic nitratesGEO_INNIT_0_00000
GEOS-CFNitrogen oxideGEO_NIOXI_0_00000
GEOS-CFNitrogen dioxideGEO_NIDIO_0_00000
GEOS-CFReactive nitrogenGEO_NITRO_0_00000
GEOS-CFHydrophilic organic carbonGEO_HIORG_0_00000
GEOS-CFHydrophobic organic carbonGEO_HOORG_0_00000
GEOS-CFPeroxyacetyl nitrateGEO_PERNI_0_00000
GEOS-CFPM2.5 at RH 35GEO_PM25X_0_00000
GEOS-CFPM2.5 at RH 35 (reconstructed)GEO_PM25R_0_00000
GEOS-CFBlack carbon PM2.5GEO_BLCPM_0_00000
GEOS-CFDust PM2.5GEO_DUSPM_0_00000
GEOS-CFNitrate PM2.5GEO_NITPM_0_00000
GEOS-CFOrganic carbon PM2.5GEO_ORCPM_0_00000
GEOS-CFSecondary organic aerosol PM2.5GEO_SORPM_0_00000
GEOS-CFSea salt PM2.5GEO_SEAPM_0_00000
GEOS-CFSulfate PM2.5GEO_SULPM_0_00000
GEOS-CFLumped C3 alkenesGEO_CALKE_0_00000
GEOS-CFLumped aldehydeGEO_CALDH_0_00000
GEOS-CFFine sea salt aerosol (0.01 - 0.05 microns)GEO_FSEAS_0_00000
GEOS-CFCoarse sea salt aerosol (0.5 - 8 microns)GEO_CSEAS_0_00000
GEOS-CFSulfur dioxideGEO_SULDI_0_00000
GEOS-CFSOA PrecursorGEO_SOAPR_0_00000
GEOS-CFSOA SimpleGEO_SOASI_0_00000
GEOS-CFTolueneGEO_TOLUE_0_00000
GEOS-CFXyleneGEO_XYLEN_0_00000
GEOS-CFCO volume mixing ratio dry airGEO_COVMR_0_00000
GEOS-CFNO2 volume mixing ratio dry airGEO_NOVMR_0_00000
GEOS-CFO3 volume mixing ratio dry airGEO_OZVMR_0_00000
GEOS-CFSO2 volume mixing ratio dry airGEO_SOVMR_0_00000
DummyYear 2018DUM_Y2018_0_00000
DummyYear 2019DUM_Y2019_0_00000
DummyYear 2020DUM_Y2020_0_00000
DummyYear 2021DUM_Y2021_0_00000
DummyYear 2022DUM_Y2022_0_00000
DummyJanuaryDUM_JANUA_0_00000
DummyFebruaryDUM_FEBRU_0_00000
DummyMarchDUM_MARCH_0_00000
DummyAprilDUM_APRIL_0_00000
DummyMayDUM_MAYMA_0_00000
DummyJuneDUM_JUNEJ_0_00000
DummyJulyDUM_JULYJ_0_00000
DummyAugustDUM_AUGUS_0_00000
DummySeptemberDUM_SEPTE_0_00000
DummyOctoberDUM_OCTOB_0_00000
DummyNovemberDUM_NOVEM_0_00000
DummyDecemberDUM_DECEM_0_00000
DummyWeekday 1DUM_WKDY1_0_00000
DummyWeekday 2DUM_WKDY2_0_00000
DummyWeekday 3DUM_WKDY3_0_00000
DummyWeekday 4DUM_WKDY4_0_00000
DummyWeekday 5DUM_WKDY5_0_00000
DummyWeekday 6DUM_WKDY6_0_00000
DummyWeekday 7DUM_WKDY7_0_00000
DummyClimate Region ADUM_CLRGA_0_00000
DummyClimate Region BDUM_CLRGB_0_00000
DummyClimate Region CDUM_CLRGC_0_00000
DummyClimate Region DDUM_CLRGD_0_00000
DummyClimate Region EDUM_CLRGE_0_00000
DummyEcoregion Level 2: Mixed wood shieldDUM_E2052_0_00000
DummyEcoregion Level 2: Atlantic highlandsDUM_E2053_0_00000
DummyEcoregion Level 2: Marine West coast forestDUM_E2071_0_00000
DummyEcoregion Level 2: Mixed wood plainsDUM_E2081_0_00000
DummyEcoregion Level 2: Central USA plainsDUM_E2082_0_00000
DummyEcoregion Level 2: Southeastern USA plainsDUM_E2083_0_00000
DummyEcoregion Level 2: Ozark, Ouachita-Appalachian +forestsDUM_E2084_0_00000
DummyEcoregion Level 2: Mississippi alluvial and Southeast +USA coastal plainsDUM_E2085_0_00000
DummyEcoregion Level 2: Temperate prairiesDUM_E2092_0_00000
DummyEcoregion Level 2: West-Central semi-arid prairiesDUM_E2093_0_00000
DummyEcoregion Level 2: South Central semi-arid +prairiesDUM_E2094_0_00000
DummyEcoregion Level 2: Texas-Louisiana coastal plainDUM_E2095_0_00000
DummyEcoregion Level 2: Tamaulipas-Texas semiarid plainDUM_E2096_0_00000
DummyEcoregion Level 2: Cold desertsDUM_E2101_0_00000
DummyEcoregion Level 2: Warm desertsDUM_E2102_0_00000
DummyEcoregion Level 2: Mediterranean CaliforniaDUM_E2111_0_00000
DummyEcoregion Level 2: Western Sierra Madre PiedmontDUM_E2121_0_00000
DummyEcoregion Level 2: EvergladesDUM_E2154_0_00000
OtherPopulation densityOTH_POPDN_0_00000
OtherWildfire smoke plume coverage (light)OTH_HMSWL_0_00000
OtherWildfire smoke plume coverage (medium)OTH_HMSWM_0_00000
OtherWildfire smoke plume coverage (heavy)OTH_HMSWH_0_00000
+ + + + + + + + + + + + + + + diff --git a/doc/prediction_points.R b/doc/prediction_points.R new file mode 100644 index 00000000..b137f86a --- /dev/null +++ b/doc/prediction_points.R @@ -0,0 +1,85 @@ +## ----init-setting, echo = FALSE----------------------------------------------- +knitr::opts_chunk$set(message = FALSE, warning = FALSE) + +## ----load-terra--------------------------------------------------------------- +library(terra) +library(tigris) + +## ----load-usmain-------------------------------------------------------------- +usmain <- tigris::states(progress_bar = FALSE) +exclude <- c("02", "15", "60", "66", "68", "69", "72", "78") +usmain <- usmain[!usmain$STATEFP %in% exclude, ] +usmain <- terra::vect(usmain) +usmain <- terra::aggregate(usmain) +usmain <- terra::project(usmain, "EPSG:5070") +plot(usmain) + + +## ----gen-grid-prep------------------------------------------------------------ +corner_ul <- c(-2.40, 3.26) * 1e6 +corner_lr <- c(2.40, 0.12) * 1e6 + +corners <- c(corner_ul, corner_lr) +# reorganize xmin, ymin, xmax, ymax, which are ll, ur form +corners_re <- corners[c(1, 3, 4, 2)] +names(corners_re) <- c("xmin", "xmax", "ymin", "ymax") +corners_ext <- terra::ext(corners_re) + +## ----gen-grid-1km, eval = FALSE----------------------------------------------- +# corners_ras <- +# terra::rast( +# corners_ext, +# resolution = c(1000L, 1000L), +# crs = "EPSG:5070" +# ) +# +# terra::values(corners_ras) <- 1L +# corners_ras_sub <- +# terra::crop( +# corners_ras, +# usmain, +# snap = "out", +# mask = TRUE +# ) +# +# corners_pnts <- terra::as.points(corners_ras_sub) +# corners_pnts_df <- as.data.frame(corners_pnts, geom = "XY") +# corners_pnts_df$site_id <- seq(1, nrow(corners_pnts_df)) +# names(corners_pnts_df)[2:3] <- c("lon", "lat") +# corners_pnts_df <- corners_pnts_df[, c("site_id", "lon", "lat")] +# + +## ----save-rds, eval = FALSE--------------------------------------------------- +# saveRDS( +# corners_pnts_df, +# file = "./input/prediction_grid.rds", +# compress = "xz" +# ) +# + +## ----gen-grid-10km, echo = FALSE, message = FALSE, error = FALSE, fig.width = 8, fig.height = 4.8---- +corners_ras10 <- + terra::rast( + corners_ext, + resolution = c(10000L, 10000L), + crs = "EPSG:5070" + ) + +terra::values(corners_ras10) <- 1L +corners_ras_sub10 <- + terra::crop( + corners_ras10, + usmain, + snap = "out", + mask = TRUE + ) + +corners_pnts10 <- terra::as.points(corners_ras_sub10) + +## ----plot-grid-10km----------------------------------------------------------- +plot( + corners_pnts10, + cex = 0.1, + main = "10-km grid points in the mainland US" +) + diff --git a/doc/prediction_points.Rmd b/doc/prediction_points.Rmd new file mode 100644 index 00000000..84147b12 --- /dev/null +++ b/doc/prediction_points.Rmd @@ -0,0 +1,137 @@ +--- +title: "Generate prediction points" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Generate prediction points} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +author: "Insang Song" +--- + +```{r init-setting, echo = FALSE} +knitr::opts_chunk$set(message = FALSE, warning = FALSE) +``` + +```{r load-terra} +library(terra) +library(tigris) +``` + +# Objective +This vignette will demonstrate how the prediction grid points at 1-km +resolution are generated from the polygon data of the mainland US with +`terra` package. + +# Strategy +- We set the upper left and lower right corners then make regular grid +points at 1,000 meter interval. 
+- `EPSG:5070`, Conus Albers equal area projection, is used throughout +this vignette. + +```{r load-usmain} +usmain <- tigris::states(progress_bar = FALSE) +exclude <- c("02", "15", "60", "66", "68", "69", "72", "78") +usmain <- usmain[!usmain$STATEFP %in% exclude, ] +usmain <- terra::vect(usmain) +usmain <- terra::aggregate(usmain) +usmain <- terra::project(usmain, "EPSG:5070") +plot(usmain) + +``` + +# Generate +Regular or random points can be generated from an extent or a polygon +object with `terra::spatSample()` or `sf::st_sample()`. +A faster way of generating regular points is to leverage a raster object, +where cells are organized in a regular grid. The code block below +generates 1-km resolution grid points following steps: + +1. Identify corners to generate a rectangular extent +(i.e., `SpatExtent` object from `terra::ext()`) +2. Create a `SpatRaster` object with a fixed resolution and +coordinate system (in this case, `EPSG:5070`) +3. Assign a value to the void raster +4. Crop the raster object with the mainland US polygon +5. Convert the cropped raster to points (we have a `SpatVector` object) +6. Convert the `SpatVector` object to a three-column `data.frame` object +7. Save the `data.frame` object from step 6 as an RDS file + +Steps 6 and 7 reduce the file size substantially as all data in the +`data.frame` from step 6 are in numeric type. +This means the data can be compressed efficiently. + +```{r gen-grid-prep} +corner_ul <- c(-2.40, 3.26) * 1e6 +corner_lr <- c(2.40, 0.12) * 1e6 + +corners <- c(corner_ul, corner_lr) +# reorganize xmin, ymin, xmax, ymax, which are ll, ur form +corners_re <- corners[c(1, 3, 4, 2)] +names(corners_re) <- c("xmin", "xmax", "ymin", "ymax") +corners_ext <- terra::ext(corners_re) +``` + + +```{r gen-grid-1km, eval = FALSE} +corners_ras <- + terra::rast( + corners_ext, + resolution = c(1000L, 1000L), + crs = "EPSG:5070" + ) + +terra::values(corners_ras) <- 1L +corners_ras_sub <- + terra::crop( + corners_ras, + usmain, + snap = "out", + mask = TRUE + ) + +corners_pnts <- terra::as.points(corners_ras_sub) +corners_pnts_df <- as.data.frame(corners_pnts, geom = "XY") +corners_pnts_df$site_id <- seq(1, nrow(corners_pnts_df)) +names(corners_pnts_df)[2:3] <- c("lon", "lat") +corners_pnts_df <- corners_pnts_df[, c("site_id", "lon", "lat")] + +``` + +```{r save-rds, eval = FALSE} +saveRDS( + corners_pnts_df, + file = "./input/prediction_grid.rds", + compress = "xz" +) + +``` + +Below is a map of 10-km grid points in the mainland US for faster rendering. +The actual 1-km result will look denser. +```{r gen-grid-10km, echo = FALSE, message = FALSE, error = FALSE, fig.width = 8, fig.height = 4.8} +corners_ras10 <- + terra::rast( + corners_ext, + resolution = c(10000L, 10000L), + crs = "EPSG:5070" + ) + +terra::values(corners_ras10) <- 1L +corners_ras_sub10 <- + terra::crop( + corners_ras10, + usmain, + snap = "out", + mask = TRUE + ) + +corners_pnts10 <- terra::as.points(corners_ras_sub10) +``` + +```{r plot-grid-10km} +plot( + corners_pnts10, + cex = 0.1, + main = "10-km grid points in the mainland US" +) +``` diff --git a/doc/prediction_points.html b/doc/prediction_points.html new file mode 100644 index 00000000..e80baaf8 --- /dev/null +++ b/doc/prediction_points.html @@ -0,0 +1,456 @@ + + + + + + + + + + + + + + + +Generate prediction points + + + + + + + + + + + + + + + + + + + + + + + + + + +

+[Rendered HTML output of the "Generate prediction points" vignette; its content is identical to doc/prediction_points.Rmd and doc/prediction_points.R above.]
+ + + + + + + + + + + diff --git a/init_targets_storage.sh b/init_targets_storage.sh new file mode 100644 index 00000000..f055f173 --- /dev/null +++ b/init_targets_storage.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Edited: 08/15/2024 +# Insang Song +# Objective: to configure targets storage directory from _targets.R + +# Modify $USER if you don't want to use system username +USER_SESSION_B=$USER + +# beethoven project directory +USER_HOME="/ddn/gs1/home/$USER_SESSION_B" + +if [ ! -f "_targets.R" ]; then + echo "Invalid attempt. _targets.R does not exist in the current directory." + exit 1 +fi + +# Define the external directory for storing targets +# Search for a "store = " in tar_config_set inside _targets.R +STR_PATH_SEARCH="store = *" +STR_PATH_FOUND=$(grep -o "$STR_PATH_SEARCH" _targets.R) + +# Check if the path was found +if [ -z "$STR_PATH_SEARCH" ]; then + echo "String '$STR_PATH_FOUND' not found in _targets.R. Halting execution." + exit 1 +else + # Remove specific strings from the found string + # Example: Remove "store =" from the found string + DIR_TARGETS_USER=$(echo "$STR_PATH_FOUND" | sed 's/store = //g') +fi + + +# Check if the directory exists, if not, create it +if [ ! -d "$DIR_TARGETS_USER" ]; then + mkdir -p "$DIR_TARGETS_USER" + echo "_targets storage folder $DIR_TARGETS_USER is created." +elif [ -d "/pipeline" ]; then + echo "It seems the pipeline is placed in a container. Make sure that you have a correctly configured Apptainer image file." + echo "Also make sure setup_hook.sh and sbatch is executable." + echo "_targets storage location: /opt/_targets (CONTAINER INTERNAL)" + echo "Please check whether the pipeline execution script binds the correct directory to /opt/_targets before running the pipeline." +else + echo "Directory $DIR_TARGETS_USER already exists." +fi diff --git a/inst/targets/README.md b/inst/targets/README.md new file mode 100644 index 00000000..8c2f5a56 --- /dev/null +++ b/inst/targets/README.md @@ -0,0 +1,197 @@ +# Developer's guide + +## Preamble +The objective of this document is to provide developers with the current implementation of `beethoven` pipeline as of July 20, 2024 (version 0.3.7) + +We assume the potential users have basic knowledge of `targets` and `tarchetypes` packages as well as functional programming and metaprogramming. It is recommended to read Advanced R (by Hadley Wickham)'s chapters for these topics. + +## Before running the pipeline +For the future release and tests on various environments, one should check several lines across R and shell script files: + +- Shell script + - `/tar_run.sh`: all system variables including `PATH` and `LD_LIBRARY_PATH` to align with the current system environment. The lines in the provided file are set for NIEHS HPC. + - `inst/targets/run.sh`: project directory path + - `inst/targets/run_impute.sh` (if necessary when the imputation target is dispatched separately): project directory path +- R script + - `/targets.R`: Lines 10-12, `tar_config_set(store = ...)` should be reviewed if it is set properly not to overwrite successfully run targets. + - `/targets.R`: `set_args_download` and `set_args_calc` functions, i.e., `char_input_dir` argument and `char_period`. + - `/targets.R`: `library` argument value in `tar_option_set` to match the current system environment + + + +## Basic structure of branches +We will call "grand target" as a set of branches if any branching technique is applied at a target. 
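+
+As a minimal, hypothetical sketch (using only the `targets` API; the command is a placeholder), a target defined with a `pattern` produces one branch per element, and the grand target collects the branch results:
+
+```r
+library(targets)
+
+# `chr_iter_calc_features` is the character vector defined later in this
+# guide; each element becomes one branch, and the grand target
+# `list_feat_example` is the list of all branch results.
+tar_target(
+  list_feat_example,
+  command = toupper(chr_iter_calc_features),
+  pattern = map(chr_iter_calc_features),
+  iteration = "list"
+)
+```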
+ +When one target is branched out, the grand target should be a list, either being a nested or a plain list, depending on the context or the command run inside each branch. Branch names include automatic hash after the grand target name as a suffix. Users may use their own suffixes for legibility. Branches have their own good to provide succinct network layout (i.e., an interactive plot generated by `tar_visnetwork()`), while they add complication to debug. It is strongly advised that the unit function that is applied to each branch should be fully tested. + +## Branching in beethoven +Branching is actively employed in most parts in `beethoven`. Here we will navigate which targets are branched out and rationales for branching in each target. + +### `list_feat_calc_base` +Per `beethoven` targets naming convention, this object will be a list and it has eight elements at the first level. We use "first level" here as the list is nested. It is also related to maintain `list_feat_calc_base_flat` at the following target. Eight elements are defined in a preceding target `chr_iter_calc_features`: + +```r + tar_target( + chr_iter_calc_features, + command = c("hms", "nlcd", "tri", "nei", + "ecoregions", "koppen", "population", "groads"), + iteration = "list", + description = "Feature calculation" + ) + +``` + +Using `inject_calculate` function and argument lists generated by `set_args_calc` function, `chr_iter_calc_features` are passed to `amadeus` functions for calculation. Please note that the pattern of `list_feat_calc_base` is not simply `map(chr_iter_calc_features)`, rather `cross(file_prep_calc_args, chr_iter_calc_features)`, for potential expansion to keep multiple argument files in the future. + +Each element in `chr_iter_calc_features` is iterated as a list then `list_feat_calc_base` will be a nested list. `list_feat_calc_base` will merge nested elements into one `data.frame` (`data.table` actually), resulting in a non-nested `list`, which means each element in this `list` object is a `data.frame`. + + +### `list_feat_calc_nasa` +MODIS-VIIRS product processing is a bit more complex than others since many preprocessing steps are involved in this raw data. Please note that `chr_iter_calc_nasa` divides MOD19A2 product by spatial resolution since difference in spatial resolution of raster layers makes it difficult to stack layers that can be advantageous to improve processing speed. The branching itself is simple to use a character vector of length 7 to iterate the process, but there is a different avenue that might introduce complexity in terms of computational infrastructure and implementation of parallel processing. + +We introduced nested parallelization to expedite the MODIS/VIIRS processing, where `tar_make_future` will submit jobs per MODIS/VIIRS product code via SLURM batchtools and multiple threads are used in each job. If one wants to make a transition to `crew` based pipeline operation in the future, this part indeed requires a tremendous amount of refactoring not only in beethoven functions but also amadeus functions considering features of `crew`/`mirai` workers which are different from `future`. + + +### `list_feat_calc_geoscf` +We use a character vector of length 2 to distinguish chm from aqc products. A modified version of `amadeus::calc_geos`, `calc_geos_strict` is employed to calculate features. The key modification is to fix the radius argument as zero then to remove the top-level argument radius from the function. 
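+
+A conceptual sketch of that "strict" wrapper idea follows. This is not the package source, and the argument names are placeholders; only the existence of `amadeus::calc_geos` and its `radius` argument is taken from the text above.
+
+```r
+calc_geos_strict_sketch <- function(from, locs, locs_id, ...) {
+  # the radius argument is no longer exposed at the top level;
+  # it is pinned to zero when delegating to the amadeus calculator
+  amadeus::calc_geos(
+    from = from,
+    locs = locs,
+    locs_id = locs_id,
+    radius = 0,
+    ...
+  )
+}
+```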
+ + +### `list_feat_calc_gmted` +Here we use custom function `calc_gmted_direct`, which has different logic from what was used in `amadeus::calc_gmted`. `inject_gmted` uses that function to parallelize the calculation by radius length. + +### `list_feat_calc_narr` +Again, modified functions `process_narr2` and `calc_narr2` are applied and the parallelization for NARR data is done by `par_narr`. Here we did not branch out by NARR variable names since they are a bit long (length of 46) such that each dispatched branch will add up overhead to submit SLURM job for each variable. + + +## Merge branches + +Functions with prefix `post_calc_` merge branches, which contain various internal structures. Most of the branches are list of depth 1, which means `data.frame` or `data.table` objects are in each list element. Others are list of depth 2. + +### Tackling space-time discrepancy + +Each source data have different temporal resolution and update frequency. This leads to the different dimensions across targets due to the measures to save time for computation. For example, NLCD targets will get N (number of sites) times 2 (2019 and 2021 per initial study period as of August 2024), whereas NARR targets will get N times $|D|$ (where $D$ is the set of dates), which equals to the full site-date combinations during the study period. To tackle the discrepancy across calculated targets, automatic expansion strategy is implemented by inferring temporal resolution from targets. Automatic expansion starts from resolving native temporal resolution from each target then proceeds to adding a provisional field year from date, which is removed after all required join operations will be completed. Most of the time, date-to-year conversion is performed internally in `expand` functions in `beethoven` and full space-time `data.frame` is prioritized to left join the multiple targets. + +### Value filling strategies + +Temporal resolution discrepancy makes `NA` values in joined `data.frame`s. In MODIS/VIIRS targets, NDVI (a subdataset of MOD13A1 product) is based on a 16-day cycle, differing from other products on a daily cycle. We consider the reported date of "16-day cycle" as the **last day** of the cycle. + +* **MODIS/VIIRS**: Therefore, the `NA` values introduced by joining `data.frame`s by date field are filled in `impute_all` using `data.table::setnafill` with next observation carried forward (`type = "nocb"`) option. +* MODIS/VIIRS targets may have `NaN` values where nonexisting values are assigned as replacements. These values are replaced with `NA` at first, then with zeros. +* Other nonignorable `NA`s in the joined target will be imputed by missForest (name of the original method used; actually using `missRanger` package for efficiency). + +### Autojoin functions + +Automatic join function `post_calc_autojoin` is one of the most complex function in `beethoven` codebase, which encapsulates the efforts to resolve all sorts of space-time discrepancies across targets. Full and coarse site-date combinations and full and coarse site-year combinations are automatically resolved in the function. The coarse site-year combination is a challenge since some years are out of the study period and such *anchor* years should be repeated to fill in for no gaps in the joined data. Another `post_calc_df_year_expand` and its upstream `post_calc_year_expand` function repeat coarse site-year `data.frame`s properly to ensure that there will be no years with missing values. 
+ +```r +post_calc_autojoin <- + function( + df_fine, + df_coarse, + field_sp = "site_id", + field_t = "time", + year_start = 2018L, + year_end = 2022L + ) { + # Dataset specific preprocessing + if (any(grepl("population", names(df_coarse)))) { + df_coarse <- df_coarse[, -c("time"), with = FALSE] + } + + # Detect common field names + common_field <- intersect(names(df_fine), names(df_coarse)) + + # Clean inputs to retain necessary fields + df_fine <- data.table::as.data.table(df_fine) + df_coarse <- data.table::as.data.table(df_coarse) + df_fine <- post_calc_drop_cols(df_fine) + df_coarse <- post_calc_drop_cols(df_coarse) + + # Take strategy depending on the length of common field names + # Length 1 means that `site_id` is the only intersecting field + if (length(common_field) == 1) { + print(common_field) + if (common_field == field_sp) { + joined <- data.table::merge.data.table( + df_fine, df_coarse, + by = field_sp, + all.x = TRUE + ) + } + } + # When space-time join is requested, + if (length(common_field) == 2) { + if (all(common_field %in% c(field_sp, field_t))) { + # Type check to characters + df_fine[[field_t]] <- as.character(df_fine[[field_t]]) + df_coarse[[field_t]] <- as.character(df_coarse[[field_t]]) + + # When `time` field contains years, `as.Date` call will return error(s) + t_coarse <- try(as.Date(df_coarse[[field_t]][1])) + # If an error is detected, print information + if (inherits(t_coarse, "try-error")) { + message( + "The time field includes years. Trying different join strategy." + ) + coarse_years <- sort(unique(unlist(as.integer(df_coarse[[field_t]])))) + + # coarse site-year combination is expanded + df_coarse2 <- post_calc_df_year_expand( + df_coarse, + time_start = year_start, + time_end = year_end, + time_available = coarse_years + ) + joined <- + post_calc_join_yeardate(df_coarse2, df_fine, field_t, field_t) + } else { + # site-date combination data.frames are joined as they are regardless of coarseness + # Left join is enforced + joined <- data.table::merge.data.table( + df_fine, df_coarse, + by = c(field_sp, field_t), + all.x = TRUE + ) + } + } + } + return(joined) + } +``` + + +## Base learners + +For efficiency, GPU-enabled version is recommended for `lightgbm` and `brulee`. These packages need to be installed manually with modifications of system environment variables. Developers should consult `lightgbm` official documentation and `brulee` GitHub repository (i.e., in `gpu` branch) to install the proper version of each package with careful consideration on the computing infrastructure. "GPU" here refers to CUDA-enabled devices produced by NVIDIA corporation. This does not necessarily mean that this package as a part of U.S. government work endorses NVIDIA corporation and its products in any sort. + +### tidymodels infrastructure + +We want to actively adopt evolving packages in the `tidymodels` ecosystem while keeping as minimal dependency tree as possible. In this package, major `tidymodels` packages that are used in base and meta learners include-- + +* `parsnip` +* `recipe` +* `rsample` +* `spatialsample` +* `tune` +* `workflow` + +### Cross validation + +Due to `rsample` design, each cross-validation fold will include an **actual** `data.frame` (`tibble`) object. It has own good for self-contained modeling practices that easily guarantee reproducibility, however, it also has limitations when used with large data and `targets` pipeline as `targets` **stores** such objects in disk space. 
Such characteristics lead to inflate the disk space for base and meta learner training. Ten-fold cross-validation sets from 900K*3.2K `data.frame` take $9\texttt{M} \times 3.2\texttt{K} \times 8\texttt{bytes}$=230GB. Randomization schemes for model ensemble will increase that size to 10 times and more, which is equivalent to 2.3TB and more when uncompressed. The current development version modifies the original `rsample`'s `rset` design to store *row indices* of the joined `data.frame` target to reduce data size in disk. + + +#### Use `rset` object in the last resort + +`rset` object is a powerful tool to ensure that all cross-validation sets "flow" through the modeling process, but has a limitation in large-scale modeling with `target`: storage issues. When one stores `rset` objects in the pipeline even with a mild randomization (e.g., 30% row sampling in the base learner step in `beethoven` pipeline), the total disk space required to keep `rset` object easily exceed several times of the original `data.frame` object. Thus, we prefer to keep *row indices* to restore `rset` object *inside* each base learner fitting function. Row indices here are derived from the row subsamples for base learners. `targets` will only store row indices bound with each subsample, such that the total usage of storage will be reduced significantly. Besides the disk space concerns, it has its own good to reduce the overhead or I/O for compressing massive `data.frame` (actually, `tibble`) objects. + +- `restore_*` functions restore `rset` object from row indices and their upstream `data.frame` +- `generate_*` functions generate row indices from input `data.frame` by the user-defined cross-validation strategy. + + + + + +- Object size issue +- Tweaking object for size reduction +- Restoration upon size + diff --git a/inst/targets/_targets.R b/inst/targets/_targets.R deleted file mode 100644 index 29e50ba0..00000000 --- a/inst/targets/_targets.R +++ /dev/null @@ -1,132 +0,0 @@ -library(targets) -library(tarchetypes) -library(future) -library(future.batchtools) -library(beethoven) - -# replacing yaml file. -tar_config_set( - store = "/ddn/gs1/group/set/pipeline/beethoven_targets" -) - - -generate_list <- FALSE - -arglist_common <- - set_args_calc( - char_siteid = "site_id", - char_timeid = "time", - char_period = c("2022-09-01", "2022-10-31"), - num_extent = c(-126, -62, 22, 52), - char_user_email = paste0(Sys.getenv("USER"), "@nih.gov"), - export = generate_list, - path_export = "inst/targets/punchcard_calc.qs", - char_input_dir = "/ddn/gs1/group/set/Projects/NRT-AP-Model/input" - ) - -tar_source("inst/targets/targets_initialize.R") -tar_source("inst/targets/targets_download.R") -tar_source("inst/targets/targets_calculate.R") -tar_source("inst/targets/targets_baselearner.R") -tar_source("inst/targets/targets_metalearner.R") -tar_source("inst/targets/targets_predict.R") - -# bypass option -Sys.setenv("BTV_DOWNLOAD_PASS" = "TRUE") - -# -# bind custom built GDAL -# Users should export the right path to the GDAL library -# by export LD_LIBRARY_PATH=.... command. -.libPaths( - c( - "/ddn/gs1/biotools/R/lib64/R/custompkg", - .libPaths() - ) -) - -# arglist_common is generated by targets_arglist.R. 
-plan( - list( - tweak( - future.batchtools::batchtools_slurm, - template = "inst/targets/template_slurm.tmpl", - resources = - list( - memory = 8, - log.file = "slurm_run.log", - ncpus = 1, partition = "geo", ntasks = 1, - email = arglist_common$char_user_email, - error.file = "slurm_error.log" - ) - ), - multicore - ) -) - -# # invalidate any nodes older than 180 days: force running the pipeline -# tar_invalidate(any_of(tar_older(Sys.time() - as.difftime(180, units = "days")))) - - -# # nullify download target if bypass option is set -if (Sys.getenv("BTV_DOWNLOAD_PASS") == "TRUE") { - target_download <- NULL -} - -# targets options -# For GPU support, users should be aware of setting environment -# variables and GPU versions of the packages. -# TODO: check if the controller and resources setting are required -tar_option_set( - packages = - c("beethoven", "amadeus", "chopin", "targets", "tarchetypes", - "data.table", "sf", "terra", "exactextractr", - #"crew", "crew.cluster", - "tigris", "dplyr", - "future.batchtools", "qs", "collapse", - "tidymodels", "tune", "rsample", "torch", "brulee", - "glmnet", "xgboost", - "future", "future.apply", "future.callr", "callr", - "stars", "rlang", "parallelly"), - library = c("/ddn/gs1/biotools/R/lib64/R/custompkg", "/ddn/gs1/home/songi2/r-libs"), - repository = "local", - error = "stop", - memory = "transient", - format = "qs", - storage = "worker", - deployment = "worker", - garbage_collection = TRUE, - seed = 202401L -) - - -# should run tar_make_future() -list( - target_init, - target_download, - target_calculate_fit, - target_baselearner - #, - # target_metalearner, - # target_calculate_predict, - # target_predict, - # # documents and summary statistics - # targets::tar_target( - # summary_urban_rural, - # summary_prediction( - # grid_filled, - # level = "point", - # contrast = "urbanrural")) - # , - # targets::tar_target( - # summary_state, - # summary_prediction( - # grid_filled, - # level = "point", - # contrast = "state" - # ) - # ) -) - -# targets::tar_visnetwork(targets_only = TRUE) -# END OF FILE diff --git a/inst/targets/base_function_dev_demo.r b/inst/targets/base_function_dev_demo.r index 94150f93..2e4e33d7 100644 --- a/inst/targets/base_function_dev_demo.r +++ b/inst/targets/base_function_dev_demo.r @@ -751,3 +751,50 @@ lapply(lfna, function(x) { xxd <- x[duplicated(xx) | duplicated(xx, fromLast = TRUE),] dim(xxd) }) + + + +library(sf) +options(sf_use_s2 = FALSE) +# Define the coordinates of the point in North Carolina +lon <- -79.0558 +lat <- 35.7596 + +# Create the point sf object +point <- st_point(c(lon, lat)) +point <- st_sfc(point, crs = 4326) +point <- st_as_sf(point) +point$site_id <- "1" +point$time <- "2024-01-01" + +inject_nlcd(year = 2021, + radius = 1000, + from = amadeus::process_nlcd( + path = "/ddn/gs1/group/set/Projects/NRT-AP-Model/input/nlcd/data_files", + year = 2021 + ), + locs = point, + locs_id = "site_id", + nthreads = 1L, + mode = "exact", + max_cells = 3e7 + ) + +df_feat_calc_nlcd_params <- data.frame( + year = 2021, + radius = 1000 +) +file_prep_calc_args <- "inst/targets/calc_spec.qs" + +inject_nlcd(year = df_feat_calc_nlcd_params$year, + radius = df_feat_calc_nlcd_params$radius, + from = amadeus::process_nlcd( + path = loadargs(file_prep_calc_args, "nlcd")$path, + year = df_feat_calc_nlcd_params$year + ), + locs = point, + locs_id = arglist_common$char_siteid, + nthreads = 10L, + mode = "exact", + max_cells = 3e7 + ) diff --git a/inst/targets/calc_spec.qs b/inst/targets/calc_spec.qs new file mode 100644 index 
00000000..5bc10afa Binary files /dev/null and b/inst/targets/calc_spec.qs differ diff --git a/inst/targets/mod06_links_2018_2022.csv b/inst/targets/mod06_links_2018_2022.csv new file mode 100644 index 00000000..27155709 --- /dev/null +++ b/inst/targets/mod06_links_2018_2022.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:359ecaea5d17e4b933216ed925f9dd947667057729df0d80a0f830d5dcf2228d +size 2385371 diff --git a/inst/targets/pipeline_base_functions.R b/inst/targets/pipeline_base_functions.R deleted file mode 100644 index 61f5dbef..00000000 --- a/inst/targets/pipeline_base_functions.R +++ /dev/null @@ -1,2731 +0,0 @@ -## pipeline base functions -# nocov start - -## file check: chunking -## if using tarchetypes::tar_files, -## the file *lists* should be stored as a single file -## Provided that the download is completed in a defined -## time period such that users can distiguish a **set** of files -## from each other, -## timestamp check: `fs::file_info(...)$modification_time` -## can be used in bulk file check (will be time consuming as -## the number of files grow, though). -## The file list of the previous successful run will be stored as a file -## and we just save the file list of the current run, which are -## older than a certain rerun interval (e.g., 6 months). -## If one really wants to keep the shorter rerun interval, -## the strategy should be changed. -## THINK: How can we know the downloaded files are complete and correct? -## quick diagram: -## file set 1 ... file set x -## (listing function runs) -## list1.rds ... listx.rds -## (hashed; not modified) ... (not run) -## (pass) ... (run) -## ... ... (downstream process + calculation) -## (as-is) ... (as-is) --- unless modified or manually invalidated - -#' Load arguments from the formatted argument list file -#' @keywords Utility -#' @param argfile character(1). Path to the argument file. RDS format. -#' @param dataset character(1). Dataset name. -#' @returns A list of arguments. -#' @importFrom qs qread -#' @export -loadargs <- function(argfile, dataset) { - if (endsWith(argfile, ".rds")) { - arglist <- readRDS(argfile) - } else if (endsWith(argfile, ".qs")) { - arglist <- qs::qread(argfile) - } else { - stop("Invalid format.") - } - arglist[[dataset]] -} - - -#' Check if a query date falls within a time interval -#' -#' This function checks if a given query date falls within a time interval -#' defined by a vector of two dates. -#' @keywords Miscellaneous -#' @param query_date The query date to check. -#' @param tvec A vector of two dates defining the time interval. -#' -#' @returns TRUE if the query date falls within the time interval, -#' FALSE otherwise. -#' -#' @examples -#' \dontrun{ -#' query_date <- as.Date("2022-01-01") -#' tvec <- c(as.Date("2021-01-01"), as.Date("2023-01-01")) -#' `%tin%`(query_date, tvec) -#' } -#' @export -`%tin%` <- function(query_date, tvec) { - tvec <- sort(tvec) - query_date <= tvec[1] & query_date >= tvec[2] -} - -#' Load MODIS files from a specified path. -#' -#' This function takes a path and an optional pattern as input and -#' returns a list of MODIS files found in the specified path. -#' @keywords Utility -#' @param path The path where the MODIS files are located. -#' @param pattern An optional regular expression pattern to filter the files. -#' The default pattern is "hdf$". -#' @param date A vector of two dates to filter the files by. -#' The default is an empty character vector. 
-#' @returns A list of full file names of the MODIS files found -#' in the specified path. -#' -#' @examples -#' \dontrun{ -#' # Load MODIS files from the current directory -#' modis_files <- load_modis_files(".") -#' -#' # Load MODIS files from a specific directory with a custom pattern -#' modis_files <- load_modis_files("/path/to/files", pattern = "MOD.*hdf$") -#' } -#' @export -load_modis_files <- function(path, pattern = "hdf$", date = character(2)) { - modis_files <- - list.files( - path, pattern = pattern, - recursive = TRUE, - full.names = TRUE - ) - date_exp <- - amadeus::generate_date_sequence(date[1], date[2], sub_hyphen = FALSE) - date_exp <- strftime(date_exp, format = "%Y%j") - modis_files <- - grep( - sprintf("(%s)", paste(paste0("A", date_exp), collapse = "|")), - modis_files, value = TRUE - ) - return(modis_files) -} - -#' Injects the calculate function with specified arguments. -#' -#' This function injects the calculate function with the specified arguments, -#' allowing for dynamic customization of the function's behavior. -#' @keywords Calculation -#' @param covariate character(1). The name of the covariate to be calculated. -#' @param locs The locations to be used in the calculation. -#' @param injection Additional arguments to be injected into -#' the calculate function. -#' -#' @returns The result of the calculate function with the injected arguments. -#' -#' @examples -#' \dontrun{ -#' inject_calculate( -#' locs = my_locs, buffer = 10, domain = my_domain, -#' injection = list(arg1 = "value1", arg2 = "value2") -#' ) -#' } -#' @export -inject_calculate <- function(covariate, locs, injection) { - rlang::inject( - calculate( - locs = locs, - !!!injection - ) - ) -} - -#' Injects arguments to parallelize MODIS/VIIRS data processing -#' -#' @keywords Calculation -#' @param locs A data frame containing the locations for which MODIS -#' features need to be calculated. -#' @param domain The domain in which the MODIS PAR data should be injected. -#' @param injection Additional parameters to be passed to the -#' `calc_modis_par` function. -#' @returns The modified domain with the injected MODIS PAR data. -#' @export -inject_modis_par <- function(locs, domain, injection) { - rlang::inject( - amadeus::calc_modis_par( - locs = locs, - locs_id = "site_id", - !!!injection - ) - ) -} - -#' Injects geographic information into a data frame -#' -#' This function injects geographic information into a data frame using -#' the `calc_geos_strict` function. The injected information includes -#' latitude and longitude coordinates based on the specified locations, -#' a location ID column, a window range, and a snapping option. -#' -#' @keywords Calculation -#' @param locs A data frame containing the locations for which -#' geographic information needs to be injected. -#' @param injection A list of additional arguments to be passed to -#' the `calc_geos_strict` function. -#' @returns A modified data frame with injected geographic information. -#' @export -inject_geos <- function(locs, injection) { - rlang::inject( - calc_geos_strict( - locs = locs, - locs_id = "site_id", - win = c(-126, -62, 22, 52), - snap = "out", - !!!injection - ) - ) -} - - -#' Injects GMTED data into specified locations -#' -#' This function injects GMTED (Global Multi-resolution Terrain Elevation Data) -#' into specified locations. It calculates the GMTED values for each -#' location within different radii and returns the merged results. 
-#' -#' @keywords Calculation -#' @param locs A data frame/sf/SpatVector containing the locations -#' where GMTED variables needs to be calculated -#' @param variable The variable for which GMTED data needs to be calculated. -#' @param radii A vector of radii for which GMTED data needs -#' to be calculated. -#' @param injection A list of additional arguments to be passed to -#' the `calc_gmted_direct` function. -#' @param nthreads The number of threads to be used for parallel processing. -#' Default is 4. -#' -#' @returns A data frame containing the merged results of GMTED data -#' for each location within different radii. -#' @importFrom future plan -#' @importFrom future.apply future_lapply -#' @importFrom rlang inject -#' @export -inject_gmted <- function(locs, variable, radii, injection, nthreads = 4L) { - future::plan(future::multicore, workers = nthreads) - - radii_list <- split(radii, seq_along(radii)) - radii_rep <- - future.apply::future_lapply( - radii_list, - function(r) { - rlang::inject( - calc_gmted_direct( - locs = locs, - locs_id = "site_id", - radius = r, - variable = c(variable, "7.5 arc-seconds"), - !!!injection - ) - ) - } - ) - radii_rep <- lapply(radii_rep, function(x) as.data.frame(x)) - radii_join <- reduce_merge(radii_rep, "site_id") - future::plan(future::sequential) - return(radii_join) -} - - -#' Reduce and merge a list of data tables -#' -#' This function takes a list of data tables and merges them together -#' using the specified columns. It uses the `merge.data.table` function -#' from the `data.table` package to perform the merge. -#' -#' @param list_in A list of data tables to be merged. -#' @param by The columns to merge the data tables on. -#' @param all.x logical(1). Keeping all rows from the first input. -#' @param all.y logical(1). Keeping all rows from the second input. -#' @returns A merged data table. -#' @keywords Post-calculation -#' @examples -#' \dontrun{ -#' # Create example data tables -#' dt1 <- data.table(a = 1:3, b = 4:6) -#' dt2 <- data.table(a = 2:4, c = 7:9) -#' dt3 <- data.table(a = 3:5, d = 10:12) -#' -#' # Merge the data tables -#' reduce_merge(list(dt1, dt2, dt3), by = "a") -#' } -#' @importFrom data.table as.data.table merge.data.table -#' @export -reduce_merge <- - function( - list_in, - by = c("site_id", "time"), - all.x = TRUE, all.y = FALSE - ) { - list_check <- sapply(list_in, nrow) - list_checkdiff <- diff(list_check) - if (any(list_checkdiff > 0)) all.y <- TRUE - for (i in seq_len(length(list_in))) { - list_in[[i]] <- data.table::as.data.table(list_in[[i]]) - } - - Reduce( - function(x, y) { - if (is.null(by)) by <- intersect(names(x), names(y)) - data.table::merge.data.table( - x, y, by = by, all.x = all.x, all.y = all.y - ) - }, - list_in - ) - } - - -#' Parallelize NARR feature calculation -#' -#' This function parallelizes the processing and calculation of -#' NARR data for multiple domains. -#' @keywords Calculation -#' @param domain A character vector specifying the domains to process. -#' @param date A character vector specifying the date of the -#' NARR data to process. -#' @param locs A data frame specifying the locations to calculate NARR data for. -#' @param nthreads An integer specifying the number of threads -#' to use for parallel processing. Default is 24. -#' -#' @returns A list of results from the parallel processing. 
-#' @importFrom future plan multicore sequential -#' @importFrom future.apply future_lapply -#' @export -par_narr <- function(domain, date, locs, nthreads = 24L) { - - future::plan(future::multicore, workers = nthreads) - - res <- - future.apply::future_lapply( - domain, - function(x) { - from <- process_narr2( - path = "input/narr", - variable = x, - date = date - ) - calc_narr2( - from = from, - locs = locs, - locs_id = "site_id" - ) - }, - future.seed = TRUE - ) - future::plan(future::sequential) - return(res) - -} - -#' Add Time Column -#' -#' This function adds a time column to a data frame. -#' -#' @keywords Post-calculation -#' @param df The data frame to which the time column will be added. -#' @param time_value The value to be assigned to the time column. -#' @param time_id The name of the time column (default is "time"). -#' -#' @returns The data frame with the added time column. -#' -#' @examples -#' \dontrun{ -#' df <- data.frame(x = 1:5, y = letters[1:5]) -#' add_time_col(df, "2022-01-01") -#' } -#' @export -add_time_col <- function(df, time_value, time_id = "time") { - if (!time_id %in% names(df)) { - df[[time_id]] <- time_value - } - return(df) -} - - -# 2018~2022, 2017, 2020 -# 2017 ... 2020 ... -# 2017 -#' Map the available raw data years over the given period -#' @description -#' Many raw datasets are periodically updated and the period could -#' be longer than a year. This function maps the available years -#' over the given period. -#' @keywords Post-calculation -#' @param time_start integer(1). Starting year. -#' @param time_end integer(1). Ending year. -#' @param time_unit character(1). Time unit. Default is `"year"`. -#' @param time_available vector. Available years. -#' @returns integer vector of length (time_end - time_start + 1). -#' Each element will get the nearest preceeding available year. -#' @note -#' The minimum of `time_available` will be filled in front of the first -#' available year when the minimum of `time_available` is greater -#' than `time_start`. -#' @examples -#' \dontrun{ -#' process_calc_year_expand(2018, 2022, "year", c(2017, 2020, 2021)) -#' process_calc_year_expand(2018, 2022, "year", c(2020, 2021)) -#' } -#' @export -post_calc_year_expand <- - function( - time_start = NULL, - time_end = NULL, - time_unit = "year", - time_available = NULL - ) { - time_seq <- seq(time_start, time_end) - time_target_seq <- findInterval(time_seq, time_available) - time_target_seq <- time_available[time_target_seq] - if (min(time_available) > time_start) { - time_target_seq <- - c( - rep(min(time_available), - min(time_available) - time_start), - time_target_seq - ) - } - return(time_target_seq) - } - - -#' Expand a data frame by year -#' -#' This function expands a data frame by year, creating multiple rows -#' for each year based on the time period specified. -#' @keywords Post-calculation -#' @param df The input data frame. -#' @param locs_id The column name of the location identifier in the data frame. -#' @param time_field The column name of the time field in the data frame. -#' @param time_start The start of the time period. -#' @param time_end The end of the time period. -#' @param time_unit The unit of time to expand the data frame. Only for record. -#' @param time_available A vector of available time periods. -#' @param ... Placeholders. 
-#' @note Year expansion rule is to assign the nearest past year -#' in the available years;#' if there is no past year in the available years, -#' the first available year is rolled back to the start of the time period. -#' @returns The expanded data frame with multiple rows for each year. -#' @seealso [`post_calc_year_expand()`] -#' @examples -#' \dontrun{ -#' df <- data.frame(year = c(2010, 2010, 2011, 2012), -#' value = c(1, 2, 3, 4)) -#' df_expanded <- -#' post_calc_df_year_expand(df, locs_id = "site_id", time_field = "year", -#' time_start = 2011, time_end = 2012, -#' time_unit = "year") -#' print(df_expanded) -#' } -#' @importFrom stats sd -#' @export -post_calc_df_year_expand <- function( - df, - locs_id = "site_id", - time_field = "time", - time_start = NULL, - time_end = NULL, - time_unit = "year", - time_available = NULL, - ... -) { - time_summary <- table(unlist(df[[time_field]])) - if (length(time_summary) != 1) { - if (stats::sd(time_summary) != 0) { - stop("df should be a data frame with the same number of rows per year") - } - } - # assume that df is the row-bound data frame - if (is.character(df[[time_field]])) { - df[[time_field]] <- as.integer(df[[time_field]]) - } - df_years <- unique(df[[time_field]]) - nlocs <- length(unique(df[[locs_id]])) - year_period <- seq(time_start, time_end) - # assign the time period to the available years - year_assigned <- - post_calc_year_expand(time_start, time_end, time_unit, df_years) - df_years_repeats <- table(year_assigned) - - # repeat data frames - df_expanded <- Map( - function(y) { - df_sub <- df[df[[time_field]] == df_years[y], ] - df_sub <- df_sub[rep(seq_len(nrow(df_sub)), df_years_repeats[y]), ] - return(df_sub) - }, - seq_along(year_assigned) - ) - df_expanded <- do.call(rbind, df_expanded) - df_expanded[[time_field]] <- rep(year_period, each = nlocs) - return(df_expanded) -} - - -# calculate over a list -#' Spatiotemporal covariate calculation -#' @keywords Calculation -#' @param domain vector of integer/character/Date. -#' Depending on temporal resolution of raw datasets. -#' Nullable; If `NULL`, it will be set to `c(1)`. -#' @param domain_name character(1). Name of the domain. Default is `"year"`. -#' @param nthreads integer(1). Number of threads to use. -#' @param process_function Raw data processor. Default is -#' [`amadeus::process_covariates`] -#' @param calc_function Function to calculate covariates. -#' [`amadeus::calc_covariates`] -#' @param ... Arguments passed to `process_function` and `calc_function` -#' @returns A data.table object. -#' @importFrom data.table rbindlist -#' @importFrom rlang inject -#' @export -# FIXME: this function works inefficiently in expense of -# returning uniform list of length(|years|) output. -# It could seriously affect the performance in scaled calculation -# as it calculates the same covariate for several years. -# Future updates should reduce the workload by calculating -# source data years only then assign proper preceding years -# to the output as another target. -calculate <- - function( - domain = NULL, - domain_name = "year", - nthreads = 1L, - process_function = amadeus::process_covariates, - calc_function = amadeus::calc_covariates, - ... 
- ) { - if (is.null(domain)) { - domain <- c(1) - } - # split the domain, make years from the domain list - # assuming that domain length is the same as the number of years - domainlist <- split(domain, seq_along(domain)) - years_data <- seq_along(domain) + 2017 - - if (nthreads == 1L) { - future::plan(future::sequential) - } else { - future::plan(future::multicore, workers = nthreads) - } - # double twists: list_iteration is made to distinguish - # cases where a single radius is accepted or ones have no radius - # argument. - res_calc <- - #try( - future.apply::future_mapply( - function(domain_each, year_each) { - # we assume that ... have no "year" and "from" arguments - args_process <- c(arg = domain_each, list(...)) - names(args_process)[1] <- domain_name - if (!is.null(args_process$covariate) && - any(names(args_process) %in% c("covariate")) - ) { - if (args_process$covariate == "nei") { - args_process$county <- process_counties() - } - } - - # load balancing strategy - # if radius is detected, split the list - if (any(names(args_process) %in% c("radius"))) { - list_iteration <- - split(args_process$radius, seq_along(args_process$radius)) - } else { - list_iteration <- list(1) - } - - list_iteration_calc <- - Map( - function(r) { - args_process$radius <- r - from_in <- - rlang::inject( - process_function(!!!args_process) - ) - res <- rlang::inject( - calc_function( - from = from_in, - !!!args_process - ) - ) - # using domain_name, add both - # data year and covariate year - if (!is.null(domain) && domain_name == "year") { - res <- - add_time_col( - res, domain_each, - sprintf("%s_year", unname(args_process$covariate)) - ) - } - res <- data.table::as.data.table(res) - return(res) - }, - list_iteration - ) - df_iteration_calc <- - if (length(list_iteration_calc) == 1) { - list_iteration_calc[[1]] - } else { - by_detected <- - Reduce(intersect, lapply(list_iteration_calc, names)) - reduce_merge(list_iteration_calc, by = by_detected) - } - return(df_iteration_calc) - }, - domainlist, years_data, SIMPLIFY = FALSE, - future.seed = TRUE - ) - - future::plan(future::sequential) - if (inherits(res_calc, "try-error")) { - cat(paste0(attr(res_calc, "condition")$message, "\n")) - stop("Results do not match expectations.") - } - res_calc <- lapply(res_calc, - function(x) { - if ("time" %in% names(x)) { - if (nchar(x$time[1]) != 4) { - x$time <- data.table::as.IDate(x$time) - } - } - xconvt <- data.table::as.data.table(x) - return(xconvt) - } - ) - # res_calcdf <- if (length(res_calc) == 1) { - # data.table::as.data.table(res_calc[[1]]) - # } else if (domain_name %in% c("year", "date")) { - # data.table::rbindlist(res_calc, use.names = TRUE, fill = TRUE) - # } else { - # reduce_merge(res_calc, by = c("site_id", "time")) - # } - return(res_calc) - } - - - -#' Set resource management for SLURM -#' @keywords Utility -#' @param template_file SLURM job submission shell template path. -#' @param partition character(1). Name of partition. Default is `"geo"` -#' @param ncpus integer(1). Number of CPU cores assigned to each task. -#' @param ntasks integer(1). Number of tasks to submit. -#' @param memory integer(1). Specifically odds to 2*x GB. -#' @param user_email character(1). User email address. -#' @param error_log character(1). Error log file name. -#' @note This function is designed to be used with `tar_resources`. -#' Suggested number of `ncpus` is more than 1 for typical multicore R tasks. 
-#' @returns A list of resources for `tar_resources` -#' @author Insang Song -#' @importFrom future tweak -#' @importFrom future.batchtools batchtools_slurm -#' @importFrom targets tar_resources -#' @importFrom targets tar_resources_future -#' @export -set_slurm_resource <- - function( - template_file = "inst/targets/template_slurm.tmpl", - partition = "geo", - ncpus = 2L, - ntasks = 2L, - memory = 8, - user_email, - error_log = "slurm_error.log" - ) { - targets::tar_resources( - future = targets::tar_resources_future( - plan = future::tweak( - future.batchtools::batchtools_slurm, - template = template_file, - resources = - list( - partition = partition, - ntasks = ntasks, - ncpus = ncpus, - memory = memory, - email = user_email, - error.file = error_log - ) - ) - ) - ) - } - - -#' Read AQS data -#' @keywords Utility -#' @param fun_aqs function to import AQS data. -#' Default is `amadeus::process_aqs` -#' @param export Export the file to qs. Default is FALSE. -#' @param ... Passed arguments to `fun_aqs` -#' @returns Depending on `fun_aqs` specification. -#' @importFrom qs qsave -#' @export -read_locs <- - function( - fun_aqs = amadeus::process_aqs, - export = FALSE, - ... - ) { - aqs_read <- fun_aqs(...) - if (export) qs::qsave(aqs_read, file = "input/sf_feat_proc_aqs_sites.qs") - return(aqs_read) - } - - - -#' Check file status and download if necessary -#' @keywords Utility -#' @param path download path. -#' @param dataset_name Dataset name. See [`amadeus::download_data`] for details. -#' @param ... Arguments passed to `amadeus::download_data` -#' @returns logical(1). -feature_raw_download <- - function( - path = NULL, - dataset_name = NULL, - ... - ) { - # run amadeus::download_data - tryCatch( - { - amadeus::download_data(dataset_name = dataset_name, ...) - }, - error = function(e) { - stop(e) - } - ) - } - -#' Load county sf object -#' @keywords Calculation -#' @param year integer(1). Year of the county shapefile. -#' @param exclude character. State FIPS codes to exclude. -#' Default is c("02", "15", "60", "66", "68", "69", "72", "78"). -#' @returns sf object -#' @importFrom tigris counties -#' @export -process_counties <- - function( - year = 2020, - exclude = c("02", "15", "60", "66", "68", "69", "72", "78") - ) { - options(tigris_use_cache = TRUE) - cnty <- tigris::counties(year = year) - cnty <- - cnty[!cnty$STATEFP %in% - c("02", "15", "60", "66", "68", "69", "72", "78"), ] - return(cnty) - } - - - -#' Merge input data.frame objects -#' @param by character. Joining keys. See [`merge`] for details. -#' @param time logical(1). Whether or not include time identifier. -#' Set this `TRUE` will supersede `by` value by appending time identifier. -#' @param ... data.frame objects to merge -#' @returns data.table -#' @keywords Post-calculation -#' @importFrom data.table as.data.table -#' @importFrom data.table merge.data.table -#' @export -post_calc_merge_features <- - function( - by = c("site_id"), - time = FALSE, - ... - ) { - ellipsis <- list(...) 
- if (time) { - by <- c("site_id", "time") - ellipsis_clean <- - lapply( - ellipsis, - function(x) { - x <- data.table::as.data.table(x) - col_coords <- grep("(lon|lat)", names(x)) - if (length(col_coords) > 0 && !is.null(col_coords)) { - x <- x[, -col_coords, with = FALSE] - } - x$time <- as.character(x$time) - return(x) - } - ) - } else { - ellipsis_clean <- ellipsis - } - joined <- - Reduce(function(x, y) { - data.table::merge.data.table( - x, y, - by = by, all.x = TRUE, suffixes = c("_Ma", "_Mb") - ) - }, ellipsis_clean) - return(joined) - } - - -#' Change time column name -#' @param df data.frame -#' @param candidates character. Candidate column names. -#' @param replace character. New column name. -#' @returns data.frame -#' @keywords Post-calculation -#' @export -post_calc_unify_timecols <- - function( - df, - candidates = c("year"), - replace = "time" - ) { - if (sum(names(df) %in% candidates) > 1) { - stop("More than a candidate is detected in the input.") - } - names(df)[names(df) %in% candidates] <- replace - return(df) - } - - -#' Convert time column to character -#' @keywords Post-calculation -#' @param df data.table -#' @note This function takes preprocessed data.table with -#' a column named `"time"`. -#' @importFrom data.table as.data.table copy -#' @export -post_calc_convert_time <- - function( - df - ) { - df <- data.table::copy(data.table::as.data.table(df)) - df <- df[, `:=`(time, as.character(time))] - return(df) - } - -#' Join a data.frame with a year-only date column to -#' that with a full date column -#' @description The full date column will be converted to a year column -#' as a new column, then the data.frame with the year-only column will -#' be joined. -#' @keywords Post-calculation -#' @param df_year data.frame with a year-only date column -#' @param df_date data.frame with a full date column -#' @param field_year character(1). Year column in `df_year` -#' @param field_date character(1). Date column in `df_date` -#' @param spid character(1). Name of the unique location identifier field. -#' @importFrom methods is -#' @importFrom data.table merge.data.table -#' @importFrom data.table `:=` -#' @returns data.frame -post_calc_join_yeardate <- - function( - df_year, - df_date, - field_year = "time", - field_date = "time", - spid = "site_id" - ) { - if (!inherits(df_year, "data.frame") && !inherits(df_date, "data.frame")) { - stop("Both inputs should be data.frame.") - } - - names(df_year)[which(names(df_year) %in% field_year)] <- "year" - df_year$year <- as.character(unlist(df_year$year)) - df_date$year <- as.character(substr(df_date[[field_date]], 1, 4)) - - df_joined <- - data.table::merge.data.table( - df_date, df_year, - by = c(spid, "year"), - all.x = TRUE - ) - - df_joined <- df_joined[, c("year") := NULL] - return(df_joined) - } - - -#' Merge spatial and spatiotemporal covariate data -#' @keywords Post-calculation -#' @param locs Location. e.g., AQS sites. -#' @param locs_id character(1). Location identifier. -#' @param time_id character(1). Location identifier. -#' @param target_years integer. Used to dummify nominal year. -#' @param df_sp data.frame. Spatial-only covariates. -#' @param df_spt data.frame. Spatiotemporal covariates. -#' @note This version assumes the time_id contains Date-like strings. 
-#' @returns data.frame -#' @importFrom data.table merge.data.table -#' @export -post_calc_merge_all <- - function( - locs, - locs_id, - time_id, - target_years = seq(2018, 2022), - df_sp, - df_spt - ) { - if (methods::is(locs, "sf")) { - locs <- sf::st_drop_geometry(locs) - } - locs$time <- as.character(locs$time) - locs <- data.table::as.data.table(locs) - locs_merged <- - data.table::merge.data.table( - locs, df_sp, by = c(locs_id) - ) - locs_merged <- - data.table::merge.data.table( - locs_merged, df_spt, - by = c(locs_id, time_id) - ) - locs_merged <- - amadeus::calc_temporal_dummies( - locs = locs_merged, - locs_id = locs_id, - year = target_years - ) - return(locs_merged) - } - - -#' Remove columns from a data frame based on regular expression patterns. -#' @keywords Post-calculation -#' -#' This function removes columns from a data frame that match -#' any of the specified -#' regular expression patterns. By default, it removes columns with names that -#' match the patterns "^lon$|^lat$|geoid|year$|description". -#' -#' @param df The input data frame. -#' @param candidates A character vector of regular expression patterns -#' to match against column names. Columns that match any of the patterns -#' will be removed. The default value is -#' "^lon$|^lat$|geoid|year$|description". -#' @param strict logical(1). If `TRUE`, -#' only `c("site_id", "time")` will be kept. -#' @returns The modified data frame with the specified columns removed. -#' -#' @examples -#' \dontrun{ -#' df <- data.frame(lon = 1:5, lat = 6:10, geoid = 11:15, year = 2010:2014, -#' description = letters[1:5], other = 16:20) -#' post_calc_drop_cols(df) -#' } -#' @export -post_calc_drop_cols <- - function( - df, - candidates = "(^lon$|^lat$|geoid|year$|description|geometry)", - strict = FALSE - ) { - idx_remove <- - if (!strict) { - grep(candidates, names(df), value = TRUE) - } else { - grep("site_id|time", names(df), value = TRUE, invert = TRUE) - } - df <- df[, -idx_remove, with = FALSE] - return(df) - } - -#' Automatic joining by the time and spatial identifiers -#' @description The key assumption is that all data frames will have -#' time field and spatial field and the data should have one of date or year. -#' Whether the input time unit is year or date -#' is determined by the coercion of the **first row value** of the time field -#' into a character with `as.Date()`. This function will fail if it -#' gets year-like string with length 4. -#' -#' @param df_fine The fine-grained data frame. -#' @param df_coarse The coarse-grained data frame. -#' @param field_sp The name of the spatial field in the data frames. -#' @param field_t The name of the time field in the data frames. -#' @param year_start The starting year of the time period. -#' @param year_end The ending year of the time period. -#' @keywords Post-calculation -#' @returns A merged data table. 
-#' @examples -# nolint start -#' \dontrun{ -#' df_fine0 <- data.frame(site_id = c("A", "B", "B", "C"), -#' time = as.Date(c("2022-01-01", "2022-01-02", "2021-12-31", "2021-01-03")), -#' value = c(1, 2, 3, 5)) -#' df_coarse0 <- data.frame(site_id = c("A", "B", "C"), -#' time = c("2022", "2022", "2021"), -#' other_value = c(10, 20, 30)) -#' jdf <- post_calc_autojoin(df_fine0, df_coarse0) -#' print(jdf) -#' } -# nolint end -#' @importFrom data.table merge.data.table -#' @importFrom rlang as_name -#' @importFrom rlang sym -#' @export -post_calc_autojoin <- - function( - df_fine, - df_coarse, - field_sp = "site_id", - field_t = "time", - year_start = 2018L, - year_end = 2022L - ) { - if (any(grepl("population", names(df_coarse)))) { - df_coarse <- df_coarse[, -c("time"), with = FALSE] - } - common_field <- intersect(names(df_fine), names(df_coarse)) - df_fine <- data.table::as.data.table(df_fine) - df_coarse <- data.table::as.data.table(df_coarse) - df_fine <- post_calc_drop_cols(df_fine) - df_coarse <- post_calc_drop_cols(df_coarse) - # if (length(common_field) > 2) { - # message("The data frames have more than two common fields.") - # message("Trying to remove the redundant common fields...") - # common_field <- intersect(names(df_fine), names(df_coarse)) - # print(common_field) - # common_field <- - # common_field[-which(!common_field %in% c(field_sp, field_t))] - # } - if (length(common_field) == 1) { - print(common_field) - if (common_field == field_sp) { - joined <- data.table::merge.data.table( - df_fine, df_coarse, - by = field_sp, - all.x = TRUE - ) - } - } - if (length(common_field) == 2) { - if (all(common_field %in% c(field_sp, field_t))) { - # t_fine <- try(as.Date(df_fine[[field_t]][1])) - df_fine[[field_t]] <- as.character(df_fine[[field_t]]) - df_coarse[[field_t]] <- as.character(df_coarse[[field_t]]) - t_coarse <- try(as.Date(df_coarse[[field_t]][1])) - if (inherits(t_coarse, "try-error")) { - message( - "The time field includes years. Trying different join strategy." - ) - coarse_years <- sort(unique(unlist(as.integer(df_coarse[[field_t]])))) - df_coarse2 <- post_calc_df_year_expand( - df_coarse, - time_start = year_start, - time_end = year_end, - time_available = coarse_years - ) - joined <- - post_calc_join_yeardate(df_coarse2, df_fine, field_t, field_t) - } else { - joined <- data.table::merge.data.table( - df_fine, df_coarse, - by = c(field_sp, field_t), - all.x = TRUE - ) - } - } - } - return(joined) - } - - - -#' Read paths from a directory with a specific file extension -#' @keywords Utility -#' @param path The directory path from which to read the paths. -#' @param extension The file extension to match. Defaults to ".hdf". -#' @param target_dates A character vector of length 2 containing -#' the start and end dates. -#' @param julian logical(1). If `TRUE`, the dates are in Julian format. -#' @returns A character vector containing the full paths of the matching files. 
-#' -#' @examples -#' \dontrun{ -#' # Read paths from a directory with default extension -#' read_paths("/path/to/directory") -#' -#' # Read paths from a directory with custom extension -#' read_paths("/path/to/directory", ".txt") -#' } -#' @export -read_paths <- - function( - path, - extension = ".hdf", - target_dates = c("2020-01-01", "2020-01-15"), - julian = FALSE - ) { - flist <- - list.files( - path = path, - pattern = sprintf("%s$", extension), - full.names = TRUE, - recursive = TRUE - ) - if (!missing(target_dates)) { - dateseq <- - seq(as.Date(target_dates[1]), as.Date(target_dates[2]), by = "day") - dateseq <- - if (julian) format(dateseq, "%Y%j") else format(dateseq, "%Y%m%d") - dateseq <- sprintf("A(%s)", paste(dateseq, collapse = "|")) - flist <- grep(dateseq, flist, value = TRUE) - } - return(flist) - } - - - -#' Search package functions -#' @keywords Utility -#' @param package character(1). Package name. -#' @param search character(1). Search term. -#' @returns A character vector containing the matching function names. -#' @examples -#' # Search for functions in the `amadeus` package -#' \dontrun{ -#' search_function("amadeus", "process_") -#' } -#' @export -search_function <- function(package, search) { - library(package, character.only = TRUE) - grep(search, ls(sprintf("package:%s", package)), value = TRUE) -} - -#' Get data.frame of function parameters -#' @keywords Utility -#' @param functions character. Vector of function names. -#' @returns A data.frame containing the parameters of the functions. -#' @importFrom dplyr as_tibble bind_rows -#' @export -df_params <- function(functions) { - params <- lapply(functions, function(x) { - args <- - dplyr::as_tibble( - lapply(as.list(formals(get(x))), \(p) list(p)), - .name_repair = "minimal" - ) - return(args) - }) - paramsdf <- Reduce(dplyr::bind_rows, params) - return(paramsdf) -} - - -#' Process atmospheric composition data by chunks (v2) -#' @keywords Calculation -#' @description -#' Returning a single `SpatRasterDataset` object. -#' @param date character(2). length of 10. Format "YYYY-MM-DD". -#' @param path character(1). Directory with downloaded netCDF (.nc4) files. or -#' netCDF file paths. -#' @param ... Arguments passed to [`terra::rast`]. -#' @note -#' Layer names of the returned `SpatRaster` object contain the variable, -#' pressure level, date -#' Reference duration: 1 day summary, all layers: 115 seconds -#' Superseded by [`calc_geos_strict`]. -#' @author Mitchell Manware, Insang Song -#' @return a `SpatRaster` object; -#' @importFrom terra rast -#' @importFrom terra time -#' @importFrom terra varnames -#' @importFrom terra crs -#' @importFrom terra subset -#' @export -process_geos_bulk <- - function(path = NULL, - date = c("2018-01-01", "2018-01-01"), - ...) 
{ - #### directory setup - if (length(path) == 1) { - - if (dir.exists(path)) { - path <- amadeus::download_sanitize_path(path) - paths <- list.files( - path, - pattern = "GEOS-CF.v01.rpl", - full.names = TRUE - ) - paths <- paths[grep( - ".nc4", - paths - )] - } - } else { - paths <- path - } - #### check for variable - amadeus::check_for_null_parameters(mget(ls())) - #### identify file paths - #### identify dates based on user input - dates_of_interest <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = TRUE - ) - #### subset file paths to only dates of interest - data_paths <- unique( - grep( - paste( - dates_of_interest, - collapse = "|" - ), - paths, - value = TRUE - ) - ) - #### identify collection - collection <- amadeus::process_collection( - data_paths[1], - source = "geos", - collection = TRUE - ) - cat( - paste0( - "Identified collection ", - collection, - ".\n" - ) - ) - if (length(unique(collection)) > 1) { - warning( - "Multiple collections detected. Returning data for all collections.\n" - ) - } - - filename_date <- regmatches( - data_paths, - regexpr( - "20[0-9]{2}(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])", - data_paths - ) - ) - if (any(table(filename_date) < 24)) { - warning( - "Some dates include less than 24 hours. Check the downloaded files." - ) - } - if (length(unique(filename_date)) > 10) { - message( - "More than 10 unique dates detected. Try 10-day chunks..." - ) - } - - # split filename date every 10 days - filename_date <- as.Date(filename_date, format = "%Y%m%d") - filename_date_cl <- as.integer(cut(filename_date, "30 days")) - - future_inserted <- split(data_paths, filename_date_cl) - other_args <- list(...) - data_variables <- names(terra::rast(data_paths[1])) - # nolint start - summary_byvar <- function(x = data_variables, fs) { - rast_in <- rlang::inject(terra::rast(fs, !!!other_args)) - terra::sds(lapply( - x, - function(v) { - rast_inidx <- grep(v, names(rast_in)) - rast_in <- rast_in[[rast_inidx]] - rast_summary <- terra::tapp(rast_in, index = "days", fun = "mean") - names(rast_summary) <- - paste0( - rep(v, terra::nlyr(rast_summary)), "_", - terra::time(rast_summary) - ) - terra::set.crs(rast_summary, "EPSG:4326") - return(rast_summary) - } - )) - } - # nolint end - - # summary by 10 days - # TODO: dropping furrr? - rast_10d_summary <- - furrr::future_map( - .x = future_inserted, - .f = ~summary_byvar(fs = .x), - .options = - furrr::furrr_options( - globals = c("other_args", "data_variables") - ) - ) - rast_10d_summary <- Reduce(c, rast_10d_summary) - return(rast_10d_summary) - - } - -#' Process atmospheric composition data by chunks (v3) -#' @keywords Calculation -#' @description -#' Returning a single `SpatRasterDataset` object. -#' Removed `tapp` for performance; impose a strict assumption that -#' there are no missing values -#' @param path character(1). Directory with downloaded netCDF (.nc4) files. or -#' netCDF file paths. -#' @param date character(2). length of 10. Format "YYYY-MM-DD". -#' @param locs Locations to extract. -#' @param locs_id character(1). Location identifier. -#' @param ... Arguments passed to [`terra::rast`]. 
-#' @note -#' Layer names of the returned `SpatRaster` object contain the variable, -#' pressure level, date -#' Reference duration: 1 day summary, all layers: 106 seconds -#' hard-coded subsets for subdataset selection -#' @author Mitchell Manware, Insang Song -#' @return a `SpatRaster` object; -#' @importFrom terra rast -#' @importFrom terra time -#' @importFrom terra varnames -#' @importFrom terra crs -#' @importFrom terra subset -#' @importFrom sf st_as_sf -#' @importFrom future.apply future_lapply -#' @importFrom data.table rbindlist -#' @export -calc_geos_strict <- - function(path = NULL, - date = c("2018-01-01", "2018-01-01"), - locs = NULL, - locs_id = NULL, - ...) { - #### directory setup - if (length(path) == 1) { - if (dir.exists(path)) { - # path <- amadeus::download_sanitize_path(path) - paths <- list.files( - path, - pattern = "GEOS-CF.v01.rpl", - full.names = TRUE - ) - paths <- paths[grep( - ".nc4", - paths - )] - } - } else { - paths <- path - } - #### check for variable - # amadeus::check_for_null_parameters(mget(ls())) - #### identify file paths - #### identify dates based on user input - dates_of_interest <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = TRUE - ) - dates_of_interest_incl <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = FALSE - ) - #### subset file paths to only dates of interest - data_paths <- unique( - grep( - paste( - dates_of_interest, - collapse = "|" - ), - paths, - value = TRUE - ) - ) - - #### identify collection - collection <- regmatches( - data_paths[1], - # the pattern accommodates 3-4 characters for the variable name, - # 3-4 alphanumerics for the temporal resolution, - # 8-9 alphanumerics for the output dimensions - # nolint start - regexpr( - "GEOS-CF.v01.rpl.(aqc|chm)_[[:alpha:]]{3,4}_[[:alnum:]]{3,4}_[[:alnum:]]{8,9}_v[1-9]", - data_paths[1] - ) - ) - cat( - paste0( - "Identified collection ", - collection, - ".\n" - ) - ) - if (length(unique(collection)) > 1) { - warning( - "Multiple collections detected. Returning data for all collections.\n" - ) - } - - filename_date <- sort(regmatches( - data_paths, - regexpr( - "20[0-9]{2}(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])", - data_paths - ) - )) - if (any(table(filename_date) < 24)) { - warning( - "Some dates include less than 24 hours. Check the downloaded files." - ) - } - # nolint end - # to export locs (pointers are not exportable) - locs <- sf::st_as_sf(locs) - - # split filename dates daily - filename_date <- as.Date(filename_date, format = "%Y%m%d") - filename_date <- filename_date[filename_date %in% dates_of_interest_incl] - filename_date_cl <- as.integer(as.factor(filename_date)) - - future_inserted <- split(data_paths, filename_date_cl) - other_args <- list(...) 
- data_variables <- terra::describe(data_paths[1], sds = TRUE)$var - - search_variables <- - if (grepl("chm", collection)) { - c("ACET", "ALD2", "ALK4", "BCPI", "BCPO", "BENZ", "C2H6", "C3H8", - "CH4", "CO", "DST1", "DST2", "DST3", "DST4", "EOH", "H2O2", - "HCHO", "HNO3", "HNO4", "ISOP", "MACR", "MEK", "MVK", "N2O5", - "NH3", "NH4", "NIT", "NO", "NO2", "NOy", "OCPI", "OCPO", "PAN", - "PM25_RH35_GCC", "PM25_RH35_GOCART", "PM25bc_RH35_GCC", - "PM25du_RH35_GCC", "PM25ni_RH35_GCC", "PM25oc_RH35_GCC", - "PM25soa_RH35_GCC", "PM25ss_RH35_GCC", "PM25su_RH35_GCC", - "PRPE", "RCHO", "SALA", "SALC", "SO2", "SOAP", "SOAS", "TOLU", "XYLE" - ) - } else { - c("CO", "NO2", "O3", "SO2") - } - - # fs is the hourly file paths per day (each element with N=24) - summary_byvar <- function(x = search_variables, fs) { - rast_in <- rlang::inject(terra::rast(fs, !!!other_args)) - # strongly assume that we take the single day. no need to filter dates - # per variable, - # all files (hourly) are cleaned and processed - sds_proc <- - lapply( - x, - function(v) { - rast_inidx <- grep(v, data_variables) - #rast_in <- mean(rast_in[[rast_inidx]]) - rast_summary <- terra::mean(rast_in[[rast_inidx]]) - rtin <- as.Date(terra::time(rast_in)) - rtin_u <- unique(rtin) - cat(sprintf("Processing %s, date: %s\n", v, rtin_u)) - # rast_summary <- vector("list", length = length(rtin_u)) - # for (d in seq_along(rtin_u)) { - # rast_d <- rast_in[[rtin == rtin_u[d]]] - # rast_summary[[d]] <- mean(rast_d) - # } - # rast_summary <- do.call(c, rast_summary) - - # the next line is deprecated - # rast_summary <- terra::tapp(rast_in, index = "days", fun = "mean") - terra::time(rast_summary) <- rtin_u - names(rast_summary) <- - paste0( - rep(gsub("_lev=.*", "", v), terra::nlyr(rast_summary)) - ) - terra::set.crs(rast_summary, "EPSG:4326") - return(rast_summary) - } - ) - sds_proc <- terra::sds(sds_proc) - - locstr <- terra::vect(locs) - rast_ext <- terra::extract(sds_proc, locstr, ID = TRUE) - # rast_ext <- lapply(rast_ext, - # function(df) { - # df$ID <- unlist(locs[[locs_id]]) - # return(df) - # } - # ) - rast_ext <- - Reduce(function(dfa, dfb) dplyr::full_join(dfa, dfb, by = "ID"), - rast_ext - ) - rast_ext$time <- unique(as.Date(terra::time(rast_in))) - rast_ext$ID <- unlist(locs[[locs_id]])[rast_ext$ID] - names(rast_ext)[names(rast_ext) == "ID"] <- locs_id - return(rast_ext) - - } - future::plan(future::multicore, workers = 10) - rast_summary <- - future.apply::future_lapply( - future_inserted, - function(fs) summary_byvar(fs = fs) - ) - future::plan(future::sequential) - rast_summary <- data.table::rbindlist(rast_summary) - - return(rast_summary) - - } - - -#' Reflown gmted processing -#' @keywords Calculation -#' @param variable character(2). Statistic and resolution. -#' @param path character(1). Directory with downloaded GMTED files. -#' @param locs data.frame/SpatVector/sf. Locations. -#' @param locs_id character(1). Location identifier. -#' @param win numeric(4). Window for the raster. -#' @param radius numeric(1). Radius for the extraction. -#' @param fun character(1). Function to apply. -#' @param ... Additional parameters to be passed to other functions. -#' @returns A data.frame containing the extracted GMTED data. -#' @importFrom terra rast -#' @importFrom terra varnames -#' @importFrom terra extract -#' @export -calc_gmted_direct <- function( - variable = NULL, - path = NULL, - locs = NULL, - locs_id = NULL, - win = c(-126, -62, 22, 52), - radius = 0, - fun = "mean", - ...) 
{ - #### directory setup - path <- amadeus::download_sanitize_path(path) - #### check for length of variable - if (!(length(variable) == 2)) { - stop( - paste0( - "Please provide a vector with the statistic and resolution.\n" - ) - ) - } - #### identify statistic and resolution - statistic <- variable[1] - statistic_code <- amadeus::process_gmted_codes( - statistic, - statistic = TRUE, - invert = FALSE - ) - resolution <- variable[2] - resolution_code <- amadeus::process_gmted_codes( - resolution, - resolution = TRUE, - invert = FALSE - ) - cat(paste0( - "Cleaning ", - statistic, - " data at ", - resolution, - " resolution.\n" - )) - statistic_from <- c( - "Breakline Emphasis", "Systematic Subsample", - "Median Statistic", "Minimum Statistic", - "Mean Statistic", "Maximum Statistic", - "Standard Deviation Statistic" - ) - statistic_to <- c( - "BRKL", "SSUB", "MEDN", "MINI", "MEAN", "MAXL", "STDV" - ) - statistic_to <- - sprintf("LDU_E%s", statistic_to[match(statistic, statistic_from)]) - - #### identify file path - paths <- list.dirs( - path, - full.names = TRUE - ) - data_path <- - grep( - sprintf( - "%s%s_grd", - statistic_code, - as.character(resolution_code) - ), - paths, value = TRUE - ) - - #### import data - data <- terra::rast(data_path, win = win) - #### layer name - names(data) <- paste0( - "elevation_", - gsub( - "_grd", - "", - names(data) - ) - ) - #### varnames - terra::varnames(data) <- paste0( - "Elevation: ", - statistic, - " (", - resolution, - ")" - ) - from <- data - #return(from) - #### prepare locations list - sites_list <- amadeus::calc_prepare_locs( - from = from, - locs = locs, - locs_id = locs_id, - radius = radius - ) - sites_e <- sites_list[[1]] - sites_id <- sites_list[[2]] - #### perform extraction - sites_extracted <- amadeus::calc_worker( - dataset = "gmted", - from = from, - locs_vector = sites_e, - locs_df = sites_id, - radius = radius, - fun = fun, - variable = 2, - time = NULL, - time_type = "timeless" - ) - #### convert integer to numeric - sites_extracted[, 2] <- as.numeric(sites_extracted[, 2]) - #### define column names - colnames(sites_extracted) <- c( - locs_id, - paste0( - statistic_to, "_", sprintf("%05d", radius) - ) - ) - #### return data.frame - return(data.frame(sites_extracted)) -} - - - -#' Process NARR2 Data -#' -#' This function processes NARR2 data based on the specified parameters. -#' -#' @keywords Calculation -#' @param date A character vector specifying the start and end dates -#' in the format "YYYY-MM-DD". -#' @param variable A character vector specifying the variable of interest. -#' @param path A character vector specifying the path to the data files. -#' @param ... Additional parameters to be passed to other functions. -#' -#' @returns A SpatRaster object containing the processed NARR2 data. -#' -#' @details This function performs the following steps: -#' 1. Sets up the directory path. -#' 2. Checks for null parameters. -#' 3. Identifies file paths based on the specified variable. -#' 4. Generates a date sequence based on the specified start and end dates. -#' 5. Filters the file paths to include only dates of interest. -#' 6. Sets up the search abbreviation and target variable. -#' 7. Imports and processes the data for each file path. -#' 8. Subsets the data to include only dates of interest. -#' 9. Returns the processed data as a SpatRaster object. 
-#' -#' @examples -#' # Process NARR2 data for the variable "PRATE" from -#' # September 1, 2023 to September 1, 2023 -#' \dontrun{ -#' data <- -#' process_narr2( -#' date = c("2023-09-01", "2023-09-01"), -#' variable = "PRATE", -#' path = "/path/to/data" -#' ) -#' } -#' -#' @export -process_narr2 <- function( - date = c("2023-09-01", "2023-09-01"), - variable = NULL, - path = NULL, - ...) { - #### directory setup - path <- amadeus::download_sanitize_path(path) - #### check for variable - amadeus::check_for_null_parameters(mget(ls())) - #### identify file paths - data_paths <- list.files( - path, - pattern = variable, - recursive = TRUE, - full.names = TRUE - ) - # data_paths <- grep( - # sprintf("%s*.*.nc", variable), - # data_paths, - # value = TRUE - # ) - #### define date sequence - date_sequence <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = TRUE - ) - #### path ncar - ym_from <- regmatches( - data_paths, - regexpr( - "2[0-9]{3,5}", - data_paths - ) - ) - ym_of_interest <- - substr(date_sequence, - 1, ifelse(all(nchar(ym_from) == 6), 6, 4)) - ym_of_interest <- unique(ym_of_interest) - #### subset file paths to only dates of interest - data_paths_ym <- unique( - grep( - paste( - ym_of_interest, - collapse = "|" - ), - data_paths, - value = TRUE - ) - ) - - search_abbr <- list.dirs(path)[-1] - search_abbr <- sub(paste0(path, "/"), "", search_abbr) - search_to <- c( - "ATSFC", "ALBDO", "ACPRC", "DSWRF", "ACEVP", - "HCLAF", "PLBLH", "LCLAF", "LATHF", "MCLAF", - "OMEGA", "PRWTR", "PRATE", "PRSFC", "SENHF", - "SPHUM", "SNWCV", "SLMSC", "CLDCV", "ULWRF", - "UWIND", "VISIB", "VWIND", "ACSNW" - ) - search_to <- - sprintf("MET_%s", search_to[match(variable, search_abbr)]) - - #### initiate for loop - data_full <- terra::rast() - for (p in seq_along(data_paths_ym)) { - #### import data - data_year <- terra::rast(data_paths_ym[p]) - data_year_tinfo <- terra::time(data_year) - time_processed <- as.POSIXlt(data_year_tinfo) - time_this <- time_processed[1] - cat(paste0( - "Cleaning ", variable, " data for ", - sprintf( - "%s, %d %s", - strftime(time_this, "%B"), - time_this$year + 1900, - "...\n" - ) - )) - #### check for mono or pressure levels - lvinfo <- regmatches( - names(data_year), - regexpr("level=[0-9]{3,4}", names(data_year)) - ) - if (length(lvinfo) == 0) { - cat("Detected monolevel data...\n") - names(data_year) <- paste0( - search_to, "_", - gsub("-", "", data_year_tinfo) - ) - } else { - cat("Detected pressure levels data...\n") - lvinfo <- sub("level=", "", lvinfo) - lvinfo <- sprintf("%04d", as.integer(lvinfo)) - lvinfo <- paste0("L", lvinfo) - terra::time(data_year) <- as.Date(data_year_tinfo) - names(data_year) <- sprintf( - "%s_%s_%s", - search_to, - lvinfo, - gsub("-", "", data_year_tinfo) - ) - } - data_full <- c( - data_full, - data_year, - warn = FALSE - ) - } - - #### subset years to dates of interest - data_full_cn <- names(data_full) - data_return <- terra::subset( - data_full, - which( - substr( - data_full_cn, - nchar(data_full_cn) - 7, - nchar(data_full_cn) - ) %in% date_sequence - ) - ) - cat(paste0( - "Returning daily ", - variable, - " data from ", - as.Date(date_sequence[1], format = "%Y%m%d"), - " to ", - as.Date( - date_sequence[length(date_sequence)], - format = "%Y%m%d" - ), - ".\n" - )) - #### return SpatRaster - return(data_return) -} - - -#' Calculate aggregated values for specified locations -#' -#' This function calculates aggregated values for specified locations from -#' a raster dataset. 
-#' -#' @keywords Calculation -#' @param from The raster dataset from which to extract values. -#' @param locs A data frame containing the locations for which -#' to calculate aggregated values. -#' It should have a column in `locs_id` value -#' that contains unique identifiers for each location. -#' @param locs_id An optional column name -#' in the \code{locs} data frame that contains additional location -#' identifiers. -#' @param radius The radius within which to include neighboring locations -#' for aggregation. Default is 0. -#' @param fun The aggregation function to use. -#' It can be a character string specifying a function name -#' (e.g., "mean", "sum"), -#' or it can be a custom function. Default is "mean". -#' @param ... Additional arguments to be passed to -#' the aggregation function. -#' -#' @returns A data frame containing the aggregated values for each -#' location and time point. -#' @export -calc_narr2 <- function( - from, - locs, - locs_id = NULL, - radius = 0, - fun = "mean", - ... -) { - # - name <- geometry <- value <- NULL - ### prepare locations list - sites_list <- amadeus::calc_prepare_locs( - from = from, - locs = locs[, "site_id"], - locs_id = locs_id, - radius = radius - ) - sites_e <- sites_list[[1]] - # sites_id <- sites_list[[2]] - #### identify pressure level or monolevel data - time_from <- terra::time(from) - timetab <- table(time_from) - if (!all(timetab == 1)) { - time_split <- - split(time_from, - #ceiling(seq_along(time_from) / 29L)) - ceiling(as.integer(as.factor(time_from)) / 14L)) - sites_extracted <- Map( - function(day) { - cat(sprintf("Processing %s...\n", paste(day[1], "-", day[length(day)]))) - from_day <- from[[time_from %in% day]] - sites_extracted_day <- terra::extract( - from_day, - sites_e, - bind = TRUE - ) - sites_extracted_day <- data.frame(sites_extracted_day) - if ("geometry" %in% names(sites_extracted_day)) { - sites_extracted_day <- sites_extracted_day |> - dplyr::select(-geometry) - } - return(sites_extracted_day) - }, - time_split - ) - sites_extracted <- reduce_merge(sites_extracted, by = c("site_id")) - } else { - sites_extracted <- - terra::extract( - from, - sites_e, - bind = TRUE - ) - sites_extracted <- as.data.frame(sites_extracted) - if ("geometry" %in% names(sites_extracted)) { - sites_extracted <- sites_extracted |> - dplyr::select(-geometry) - } - } - sites_extracted <- - sites_extracted |> - tidyr::pivot_longer(cols = tidyselect::starts_with("MET_")) |> - dplyr::rowwise() |> - dplyr::mutate( - time = - regmatches( - name, - regexpr( - "20[0-9]{2,2}[0-1][0-9][0-3][0-9]", - name - ) - ) - ) |> - dplyr::mutate( - name = sub(paste0("_", time), "", name) - ) |> - dplyr::ungroup() |> - dplyr::mutate( - time = as.character(as.Date(time, format = "%Y%m%d")) - ) |> - tidyr::pivot_wider( - names_from = name, - values_from = value, - id_cols = c("site_id", "time") - ) - sites_extracted <- data.table::as.data.table(sites_extracted) - names(sites_extracted)[-1:-2] <- - sprintf("%s_%05d", names(sites_extracted)[-1:-2], radius) - - #### return data.frame - return(sites_extracted) -} - -#' Impute missing values and attach lagged features -#' @keywords Post-calculation -#' @note under construction. -#' This function performs imputation on a given data table -#' by replacing missing values with imputed values. -#' It follows a series of steps including data cleaning, name cleaning, -#' geoscf column renaming, NDVI 16-day backward filling, -#' zero-variance exclusion, excessive "true zeros" exclusion, -#' and imputation using missRanger. 
-#' -#' @param dt The input data table to be imputed. -#' @param period The period for lagged features. -#' @param nthreads_dt The number of threads to be used for -#' data.table operations. -#' @param nthreads_collapse The number of threads to be used for -#' collapse operations. -#' @param nthreads_imputation The number of threads to be used for -#' the imputation process. -#' -#' @returns The imputed data table with lagged features. -#' -#' @importFrom collapse set_collapse replace_inf replace_na fvar fnth -#' @importFrom data.table setDTthreads setnafill -#' @importFrom qs qread -#' @importFrom stats setNames -#' @importFrom stringi stri_replace_all_regex -#' @importFrom missRanger missRanger -#' @export -impute_all <- - function( - dt, - period, - nthreads_dt = 32L, - nthreads_collapse = 32L, - nthreads_imputation = 32L - ) { - data.table::setDTthreads(nthreads_dt) - if (is.character(dt)) { - dt <- file.path("output/qs", dt) - dt <- qs::qread(dt) - } - # name cleaning - dt <- stats::setNames(dt, sub("light_1", "OTH_HMSWL_0_00000", names(dt))) - dt <- stats::setNames(dt, sub("medium_1", "OTH_HMSWM_0_00000", names(dt))) - dt <- stats::setNames(dt, sub("heavy_1", "OTH_HMSWH_0_00000", names(dt))) - dt <- stats::setNames(dt, sub("population_", "POP_SEDAC_0_", names(dt))) - - geoscn <- - "ACET\tGEO_ACETO_0_00000 - ALD2\tGEO_ACETA_0_00000 - ALK4\tGEO_CALKA_0_00000 - BCPI\tGEO_HIBCA_0_00000 - BCPO\tGEO_HOBCA_0_00000 - BENZ\tGEO_BENZE_0_00000 - C2H6\tGEO_ETHTE_0_00000 - C3H8\tGEO_PROPA_0_00000 - CH4\tGEO_METHA_0_00000 - CO\tGEO_CMONO_0_00000 - DST1\tGEO_DUST1_0_00000 - DST2\tGEO_DUST2_0_00000 - DST3\tGEO_DUST3_0_00000 - DST4\tGEO_DUST4_0_00000 - EOH\tGEO_ETHOL_0_00000 - H2O2\tGEO_HYPER_0_00000 - HCHO\tGEO_FORMA_0_00000 - HNO3\tGEO_NITAC_0_00000 - HNO4\tGEO_PERAC_0_00000 - ISOP\tGEO_ISOPR_0_00000 - MACR\tGEO_METHC_0_00000 - MEK\tGEO_MEKET_0_00000 - MVK\tGEO_MVKET_0_00000 - N2O5\tGEO_DIPEN_0_00000 - NH3\tGEO_AMNIA_0_00000 - NH4\tGEO_AMNUM_0_00000 - NIT\tGEO_INNIT_0_00000 - NO\tGEO_NIOXI_0_00000 - NO2\tGEO_NIDIO_0_00000 - NOy\tGEO_NITRO_0_00000 - OCPI\tGEO_HIORG_0_00000 - OCPO\tGEO_HOORG_0_00000 - PAN\tGEO_PERNI_0_00000 - PM25_RH35_GCC\tGEO_PM25X_0_00000 - PM25_RH35_GOCART\tGEO_PM25R_0_00000 - PM25bc_RH35_GCC\tGEO_BLCPM_0_00000 - PM25du_RH35_GCC\tGEO_DUSPM_0_00000 - PM25ni_RH35_GCC\tGEO_NITPM_0_00000 - PM25oc_RH35_GCC\tGEO_ORCPM_0_00000 - PM25soa_RH35_GCC\tGEO_SORPM_0_00000 - PM25ss_RH35_GCC\tGEO_SEAPM_0_00000 - PM25su_RH35_GCC\tGEO_SULPM_0_00000 - PRPE\tGEO_CALKE_0_00000 - RCHO\tGEO_CALDH_0_00000 - SALA\tGEO_FSEAS_0_00000 - SALC\tGEO_CSEAS_0_00000 - SO2\tGEO_SULDI_0_00000 - SOAP\tGEO_SOAPR_0_00000 - SOAS\tGEO_SOASI_0_00000 - TOLU\tGEO_TOLUE_0_00000 - XYLE\tGEO_XYLEN_0_00000 - CO_y\tGEO_COVMR_0_00000 - NO2_y\tGEO_NOVMR_0_00000 - O3\tGEO_OZVMR_0_00000 - SO2_y\tGEO_SOVMR_0_00000" - - geoscn <- strsplit(geoscn, "\n") - geoscn <- unlist(geoscn) - geoscn <- strsplit(geoscn, "\t") - geoscn <- do.call(rbind, geoscn) - geoscndf <- as.data.frame(geoscn, stringsAsFactors = FALSE) - colnames(geoscndf) <- c("variable", "code") - geoscndf$variable <- trimws(geoscndf$variable) - - for (i in seq_len(nrow(geoscndf))) { - dt <- - setNames( - dt, - stringi::stri_replace_all_regex( - names(dt), sprintf("%s$", geoscndf$variable[i]), geoscndf$code[i] - ) - ) - } - site_id <- NULL - # NDVI 16-day - # For each site_id, backward filling for 16-day NDVI - # Last Observation Carried Forward is the method used; - # it assumes that the rows are ordered by date - dt <- dt[order(site_id, time), ] - col_ndviv <- 
grep("MOD_NDVIV_", names(dt)) - dtndviv <- - data.table::setnafill( - dt, type = "nocb", nan = NA, - cols = col_ndviv - ) - - collapse::set_collapse(mask = "manip", nthreads = nthreads_collapse) - - target_replace <- grep("^MOD_", names(dt), invert = TRUE) - dt <- collapse::replace_inf(dtndviv, value = NA, replace.nan = TRUE) - dt <- collapse::replace_na(dt, value = 0, cols = target_replace) - - # zero-variance exclusion - dt_colvars <- collapse::fvar(dt[, 5:ncol(dt), with = FALSE]) - zero_var_fields <- names(dt_colvars[dt_colvars == 0]) - - # Exclude fields with zero variance using data.table - dt <- dt[, (zero_var_fields) := NULL] - - # Store the name of zero variance fields as an attribute of the input object - attr(dt, "zero_var_fields") <- zero_var_fields - - # excluding columns with excessive "true zeros" - # we should have a threshold for the zero rate - # exc_zero <- collapse::fnth(dt[, 5:ncol(dt), with = FALSE], n = 0.9) - # exc_zero <- unname(which(exc_zero == 0)) + 5L - # dt <- dt[, (exc_zero) := NULL] - - # Q: Do we use all other features to impute? -- Yes. - # 32-thread, 10% for tree building, 200 trees, 4 rounds: 11 hours - imputed <- - missRanger::missRanger( - data = dt, - maxiter = 30L, - num.trees = 300L, - num.threads = nthreads_imputation, - mtry = 50L, - sample.fraction = 0.1 - ) - - imputed <- amadeus::calc_temporal_dummies(imputed, "time") - return(imputed) - # lagged features: changing period (period[1] + 1 day) - # period <- as.Date(period) - # period[1] <- period[1] + as.difftime(1, units = "days") - # period <- as.character(period) - # index_lag <- - # sprintf("MET_%s", c("ATSFC", "ACPRC", "PRSFC", "SPHUM", "WNDSP")) - # index_lag <- grep(paste(index_lag, collapse = "|"), names(dt)) - # target_lag <- imputed[, index_lag, with = FALSE] - - # output <- amadeus::calc_lagged(target_lag, period, 1, "site_id") - # return(output) - } - - -#' Append Predecessors -#' -#' This function appends predecessors to an existing object or -#' creates a new object if none exists. -#' -#' @keywords Post-calculation -#' @param path_qs The path where the predecessors will be stored. -#' @param period_new The new period to be appended. -#' @param input_new The new input object to be appended. -#' @param nthreads The number of threads to be used. -#' -#' @returns If no existing predecessors are found, the function saves -#' the new input object and returns the name of the saved file. -#' If existing predecessors are found, the function appends -#' the new input object to the existing ones and returns the combined object. 
-#' @export -append_predecessors <- - function( - path_qs = "output/qs", - period_new = NULL, - input_new = NULL, - nthreads = 8L - ) { - if (is.null(input_new)) { - stop("Please provide a valid object.") - } - if (!dir.exists(path_qs)) { - dir.create(path_qs, recursive = TRUE) - } - input_old <- list.files(path_qs, "*.*.qs$", full.names = TRUE) - - # validate input_old with period_new - # if (length(input_old) > 0) { - # periods_old <- do.call(rbind, strsplit(input_old, "_")) - # periods_old <- periods_old[, 4:5] - # periods_old_check <- vapply( - # seq(1, nrow(periods_old)), - # function(i) { - # period_old <- periods_old[i, ] - # period_old <- as.Date(period_old, format = "%Y-%m-%d") - # period_new <- as.Date(period_new, format = "%Y-%m-%d") - # if (period_new[1] < period_old[1] | period_new[2] < period_old[2]) { - # return(FALSE) - # } else { - # return(TRUE) - # } - # }, - # logical(1) - # ) - # if (!all(periods_old_check)) { - # stop("Results have an overlap period. Please provide a valid period.") - # } - # } - period_new <- sapply(period_new, as.character) - time_create <- gsub("[[:punct:]]|[[:blank:]]", "", Sys.time()) - name_qs <- - sprintf( - "dt_feat_pm25_%s_%s_%s.qs", - period_new[1], period_new[2], time_create - ) - if (length(input_old) == 0) { - qs::qsave(input_new, file = file.path(path_qs, name_qs)) - return(name_qs) - } else { - # vv <- list() - qs::qsave(input_new, file = file.path(path_qs, name_qs)) - input_update <- list.files(path_qs, "*.*.qs$", full.names = TRUE) - bound_large <- - Reduce( - function(x, y) { - if (inherits(x, "data.frame")) { - bound <- rbind(x, qs::qread(y)) - } else { - bound <- rbind(qs::qread(x), qs::qread(y)) - } - return(bound) - }, - input_update - ) - return(bound_large) - } - } - - -# nested parallelization -# IN PROGRESS -# TODO: identify bottleneck -#' @noRd -par_nest <- - function( - path, - ... - ) { - chopin::par_grid( - path, - fun_dist = calculate, - ... - ) - } - - -## base & meta learner fitting -# strategy: -# random subsample (~30%) ; row based -# P times... - -#' Base learner: Multilayer perceptron with brulee -#' -#' Multilayer perceptron model with different configurations of -#' hidden units, dropout, activation, and learning rate using brulee -#' and tidymodels. With proper settings, users can utilize graphics -#' processing units (GPU) to speed up the training process. -#' @keywords Baselearner -#' @note tune package should be 1.2.0 or higher. -#' @param dt_imputed The input data table to be used for fitting. -#' @param folds pre-generated rset object. If NULL, it should be -#' numeric to be used in [rsample::vfold_cv]. -#' @param r_subsample The proportion of rows to be sampled. -#' @param yvar The target variable. -#' @param xvar The predictor variables. -#' @param vfold The number of folds for cross-validation. -#' @param ... Additional arguments to be passed. -#' -#' @returns The fitted workflow. -#' @importFrom recipes recipe update_role -#' @importFrom dplyr `%>%` -#' @importFrom parsnip mlp set_engine set_mode -#' @importFrom workflows workflow add_recipe add_model -#' @importFrom tune tune_grid -#' @importFrom tidyselect all_of -#' @importFrom yardstick metric_set rmse -#' @importFrom rsample vfold_cv -#' @export -fit_base_brulee <- - function( - dt_imputed, - folds = NULL, - r_subsample = 0.3, - yvar = "Arithmetic.Mean", - xvar = seq(6, ncol(dt_imputed)), - vfold = 5L, - ... 
- ) { - # 2^9=512, 2^15=32768 (#param is around 10% of selected rows) - grid_hyper_tune <- - expand.grid( - hidden_units = list(c(64, 64), c(32, 32), c(32, 32, 32), c(16, 16, 16)), - dropout = 1 / seq(4, 2, -1), - activation = c("relu", "leaky_relu"), - learn_rate = c(0.1, 0.05, 0.01) - ) - dt_imputed <- - dt_imputed %>% - dplyr::slice_sample(prop = r_subsample) - - base_recipe <- - recipes::recipe( - dt_imputed - ) %>% - # do we want to normalize the predictors? - # if so, an additional definition of truly continuous variables is needed - # recipes::step_normalize(recipes::all_numeric_predictors()) %>% - recipes::update_role(!!xvar) %>% - recipes::update_role(!!yvar, new_role = "outcome") #%>% - # recipes::step_normalize(!!yvar) - - if (is.null(folds)) { - base_vfold <- rsample::vfold_cv(dt_imputed, v = vfold) - } else { - base_vfold <- folds - } - base_model <- - parsnip::mlp( - hidden_units = parsnip::tune(), - dropout = parsnip::tune(), - epochs = 1000L, - activation = parsnip::tune(), - learn_rate = parsnip::tune() - ) %>% - parsnip::set_engine("brulee", device = "cuda") %>% - parsnip::set_mode("regression") - - wf_config <- tune::control_resamples(save_pred = TRUE, save_workflow = TRUE) - - base_wf <- - workflows::workflow() %>% - workflows::add_recipe(base_recipe) %>% - workflows::add_model(base_model) %>% - tune::tune_grid( - resamples = base_vfold, - grid = grid_hyper_tune, - metrics = yardstick::metric_set(yardstick::rmse, yardstick::mape), - control = wf_config - ) - return(base_wf) - } - - -# dt <- qs::qread("output/dt_feat_design_imputed_061024.qs") -# dtd <- dplyr::as_tibble(dt) -# dtfit <- fit_base_brulee(dtd, r_subsample = 0.3) - - -#' Base learner: Extreme gradient boosting (XGBoost) -#' -#' XGBoost model is fitted at the defined rate (`r_subsample`) of -#' the input dataset by grid search. -#' With proper settings, users can utilize graphics -#' processing units (GPU) to speed up the training process. -#' @keywords Baselearner -#' @note tune package should be 1.2.0 or higher. -#' @param dt_imputed The input data table to be used for fitting. -#' @param folds pre-generated rset object. If NULL, it should be -#' numeric to be used in [rsample::vfold_cv]. -#' @param r_subsample The proportion of rows to be sampled. -#' @param yvar The target variable. -#' @param xvar The predictor variables. -#' @param vfold The number of folds for cross-validation. -#' @param ... Additional arguments to be passed. -#' -#' @returns The fitted workflow. -#' @importFrom recipes recipe update_role -#' @importFrom dplyr `%>%` -#' @importFrom parsnip boost_tree set_engine set_mode -#' @importFrom workflows workflow add_recipe add_model -#' @importFrom tune tune_grid -#' @importFrom tidyselect all_of -#' @importFrom yardstick metric_set rmse -#' @importFrom rsample vfold_cv -#' @export -fit_base_xgb <- - function( - dt_imputed, - folds = NULL, - r_subsample = 0.3, - yvar = "Arithmetic.Mean", - xvar = seq(6, ncol(dt_imputed)), - vfold = 5L, - ... 
- ) { - grid_hyper_tune <- - expand.grid( - mtry = floor(c(0.02, 0.1, 0.02) * ncol(dt_imputed)), - trees = seq(500, 3000, 500), - learn_rate = c(0.05, 0.01, 0.005, 0.001) - ) - dt_imputed <- - dt_imputed %>% - dplyr::slice_sample(prop = r_subsample) - - base_recipe <- - recipes::recipe( - dt_imputed - ) %>% - # recipes::step_normalize(recipes::all_numeric_predictors()) %>% - recipes::update_role(tidyselect::all_of(xvar)) %>% - recipes::update_role(tidyselect::all_of(yvar), new_role = "outcome") - if (is.null(folds)) { - base_vfold <- rsample::vfold_cv(dt_imputed, v = vfold) - } else { - base_vfold <- folds - } - base_model <- - parsnip::boost_tree( - mtry = parsnip::tune(), - trees = parsnip::tune(), - learn_rate = parsnip::tune() - ) %>% - parsnip::set_engine("xgboost", device = "cuda") %>% - parsnip::set_mode("regression") - - wf_config <- tune::control_resamples(save_pred = TRUE, save_workflow = TRUE) - - base_wf <- - workflows::workflow() %>% - workflows::add_recipe(base_recipe) %>% - workflows::add_model(base_model) %>% - tune::tune_grid( - resamples = base_vfold, - grid = grid_hyper_tune, - metrics = yardstick::metric_set(yardstick::rmse, yardstick::mape), - control = wf_config - ) - return(base_wf) - - } - -# dt <- qs::qread("output/dt_feat_design_imputed_061024.qs") -# dtd <- dplyr::as_tibble(dt) -# dtfitx <- fit_base_xgb(dtd, xvar = names(dtd)[6:105], r_subsample = 0.3) - - -#' Base learner: Elastic net -#' -#' Elastic net model is fitted at the defined rate (`r_subsample`) of -#' the input dataset by grid search. -#' @keywords Baselearner -#' @note tune package should be 1.2.0 or higher. -#' @param dt_imputed The input data table to be used for fitting. -#' @param folds pre-generated rset object. If NULL, it should be -#' numeric to be used in [rsample::vfold_cv]. -#' @param r_subsample The proportion of rows to be sampled. -#' @param yvar The target variable. -#' @param xvar The predictor variables. -#' @param vfold The number of folds for cross-validation. -#' @param nthreads The number of threads to be used. Default is 16L. -#' @param ... Additional arguments to be passed. -#' -#' @returns The fitted workflow. -#' @importFrom future plan multicore multisession -#' @importFrom dplyr `%>%` -#' @importFrom recipes recipe update_role -#' @importFrom parsnip linear_reg set_engine set_mode -#' @importFrom workflows workflow add_recipe add_model -#' @importFrom tune tune_grid -#' @importFrom tidyselect all_of -#' @importFrom yardstick metric_set rmse -#' @importFrom rsample vfold_cv -#' @export -fit_base_elnet <- - function( - dt_imputed, - folds = NULL, - r_subsample = 0.3, - yvar = "Arithmetic.Mean", - xvar = seq(6, ncol(dt_imputed)), - vfold = 5L, - nthreads = 16L, - ... 
- ) { - grid_hyper_tune <- - expand.grid( - mixture = seq(0, 1, length.out = 21), - penalty = 10 ^ seq(-3, 5) - ) - dt_imputed <- - dt_imputed %>% - dplyr::slice_sample(prop = r_subsample) - - base_recipe <- - recipes::recipe( - dt_imputed - ) %>% - # recipes::step_normalize(recipes::all_numeric_predictors()) %>% - recipes::update_role(tidyselect::all_of(xvar)) %>% - recipes::update_role(tidyselect::all_of(yvar), new_role = "outcome") - if (is.null(folds)) { - base_vfold <- rsample::vfold_cv(dt_imputed, v = vfold) - } else { - base_vfold <- folds - } - base_model <- - parsnip::linear_reg( - mixture = parsnip::tune(), - penalty = parsnip::tune() - ) %>% - parsnip::set_engine("glmnet") %>% - parsnip::set_mode("regression") - - wf_config <- - tune::control_resamples(save_pred = TRUE, save_workflow = TRUE) - - future::plan(future::multicore, workers = nthreads) - base_wf <- - workflows::workflow() %>% - workflows::add_recipe(base_recipe) %>% - workflows::add_model(base_model) %>% - tune::tune_grid( - resamples = base_vfold, - grid = grid_hyper_tune, - metrics = yardstick::metric_set(yardstick::rmse, yardstick::mape), - control = wf_config, - parallel_over = "resamples" - ) - future::plan(future::sequential) - return(base_wf) - - } - -#' Generate manual rset object from spatiotemporal cross-validation indices -#' @keywords Baselearner -#' @param cvindex integer length of nrow(data). -#' @param data data.frame. -#' @param ref_list List of custom reference indices. Default is NULL. -#' if not NULL, it will be used as a reference instead of max(cvindex). -#' @param cv_mode character(1). Spatiotemporal cross-validation indexing -#' method label. -#' @returns rset object of `rsample` package. A tibble with a list column of -#' training-test data.frames and a column of labels. -#' @author Insang Song -#' @importFrom rsample make_splits -#' @importFrom rsample manual_rset -#' @export -convert_cv_index_rset <- - function( - cvindex, - data, - ref_list = NULL, - cv_mode = "spt" - ) { - if (length(cvindex) != nrow(data)) { - stop("cvindex length should be equal to nrow(data).") - } - - if (!is.null(ref_list)) { - list_cvi <- ref_list - len_cvi <- seq_along(list_cvi) - } else { - maxcvi <- max(cvindex) - len_cvi <- seq_len(maxcvi) - list_cvi <- split(len_cvi, len_cvi) - } - list_cvi_rows <- - lapply( - list_cvi, - function(x) { - list(analysis = which(!cvindex %in% x), - assessment = which(cvindex %in% x)) - } - ) - list_split_dfs <- - lapply( - list_cvi_rows, - function(x) { - rsample::make_splits(x = x, data = data) - } - ) - modename <- sprintf("cvfold_%s_%03d", cv_mode, len_cvi) - rset_stcv <- rsample::manual_rset(list_split_dfs, modename) - return(rset_stcv) - } - - -#' Attach XY coordinates to a data frame -#' -#' This function attaches XY coordinates to a data frame based on a spatial -#' object containing the coordinates. It performs a left join operation to -#' match the coordinates with the corresponding locations in the data frame. -#' @keywords Utility -#' @param data_full The full data frame to which XY coordinates will -#' be attached. -#' @param data_sf The spatial object containing the XY coordinates. -#' @param locs_id The column name in the spatial object that represents the -#' location identifier. -#' @param time_id The column name in the data frame that represents the time -#' identifier. -#' -#' @returns A data frame with the XY coordinates attached. 
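[Editor's note] A hedged sketch of the coordinate join described above, with two made-up sites; only the left join of lon/lat onto the site-time table is shown.

library(sf)
library(collapse)

sites_sf <- sf::st_as_sf(
  data.frame(site_id = c("A", "B"), lon = c(-78.9, -77.0), lat = c(35.9, 38.9)),
  coords = c("lon", "lat"), crs = 4326
)
xy <- data.frame(site_id = sites_sf$site_id, sf::st_coordinates(sites_sf))
names(xy) <- c("site_id", "lon", "lat")

full <- data.frame(site_id = c("A", "A", "B"),
                   time = c("2022-01-01", "2022-01-02", "2022-01-01"))
collapse::join(full, xy, on = "site_id", how = "left")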
-#' -#' @importFrom sf st_coordinates -#' @importFrom stats setNames -#' @importFrom collapse join -#' @export -attach_xy <- - function( - data_full, - data_sf, - locs_id = "site_id", - time_id = "time" - ) { - data_sfd <- sf::st_coordinates(data_sf) - data_sf <- data_sf[[locs_id]] - data_sfd <- data.frame(site_id = data_sf, data.frame(data_sfd)) - data_sfd <- stats::setNames(data_sfd, c(locs_id, "lon", "lat")) - - data_full_lean <- data_full[, c(locs_id, time_id), with = FALSE] - data_full_lean <- - collapse::join( - data_full_lean, data_sfd, on = locs_id, how = "left" - ) - return(data_full_lean) - } - - - -#' Generate spatio-temporal cross-validation index with anticlust -#' -#' This function generates a spatio-temporal cross-validation index -#' based on the anticlust package. The function first calculates the -#' spatial clustering index using the balanced_clustering function as -#' default, and if `cv_pairs` is provided, -#' it generates rank-based pairs based on the proximity between -#' cluster centroids. -#' @keywords Baselearner -#' @param data data.table with X, Y, and time information. -#' @param target_cols character(3). Names of columns for X, Y, and time. -#' Default is c("lon", "lat", "time"). -#' Order insensitive. -#' @param preprocessing character(1). Preprocessing method. -#' * "none": no preprocessing. -#' * "normalize": normalize the data. -#' * "standardize": standardize the data. -#' @param cv_fold integer(1). Number of folds for cross-validation. -#' default is 5L. -#' @param cv_pairs integer(1). Number of pairs for cross-validation. -#' This value will be used to generate a rank-based pairs -#' based on `target_cols` values. -#' @param pairing character(1) Pair selection method. -#' * "1": search the nearest for each cluster then others -#' are selected based on the rank. -#' * "2": rank the pairwise distances directly -#' @param cv_mode character(1). Spatiotemporal cross-validation indexing -#' @note nrow(data) %% cv_fold should be 0. -#' @returns rsample::manual_rset() object. 
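[Editor's note] A hedged sketch of the balanced fold assignment the description above relies on; the coordinates and dates are synthetic, and the row count is chosen so it is divisible by the number of folds, as the note requires.

library(anticlust)

set.seed(2024)
dat <- data.frame(
  lon = runif(100, -120, -70),
  lat = runif(100, 25, 49),
  time = as.numeric(seq(as.Date("2022-01-01"), by = "day", length.out = 100))
)
# equally sized balanced clusters act as spatiotemporal CV folds
folds <- anticlust::balanced_clustering(scale(dat), K = 5)
table(folds)  # 20 rows per fold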
-#' @author Insang Song -#' @importFrom rsample manual_rset -#' @importFrom anticlust balanced_clustering -#' @importFrom dplyr group_by summarize across ungroup all_of -#' @export -generate_cv_index <- - function( - data, - target_cols = c("lon", "lat", "time"), - preprocessing = c("none", "normalize", "standardize"), - cv_fold = 5L, - cv_pairs = NULL, - pairing = c("1", "2"), - cv_mode = "spt" - ) { - if (length(target_cols) != 3) { - stop("Please provide three target columns.") - } - data <- data[, target_cols, with = FALSE] - data$time <- as.numeric(data$time) - data_proc <- - switch( - preprocessing, - none = data, - normalize = (data + abs(apply(data, 2, min))) / - (apply(data, 2, max) + abs(apply(data, 2, min))), - standardize = collapse::fscale(data) - ) - index_cv <- anticlust::balanced_clustering(data_proc, cv_fold) - cv_index <- NULL - ref_list <- NULL - if (!is.null(cv_pairs)) { - pairing <- match.arg(pairing) - data_ex <- data_proc - data_ex$cv_index <- index_cv - data_exs <- data_ex |> - dplyr::group_by(cv_index) |> - dplyr::summarize( - dplyr::across(dplyr::all_of(target_cols), ~mean(as.numeric(.x))) - ) |> - dplyr::ungroup() - - data_exs$cv_index <- NULL - data_exm <- stats::dist(data_exs) - data_exd <- as.vector(data_exm) - data_exmfull <- as.matrix(data_exm) - # index searching in dist matrix out of dist - data_exd_colid <- - unlist(Map(seq_len, seq_len(max(index_cv) - 1))) - # rep(seq_len(max(index_cv) - 1), seq(max(index_cv) - 1, 1, -1)) - data_exd_rowid <- rep(seq(2, max(index_cv)), seq_len(max(index_cv) - 1)) - if (pairing == "2") { - search_idx <- which(rank(-data_exd) <= cv_pairs) - } else { - # min rank element index per each cluster centroid - search_each1 <- - apply(data_exmfull, 1, \(x) which.min(replace(x, which.min(x), Inf))) - # sort the index - search_each1sort <- - Map(c, seq_along(search_each1), search_each1) - # keep the distinct pairs - search_each1sort <- - unique(Map(sort, search_each1sort)) - # return(list(data_exd_colid, data_exd_rowid, search_each1sort)) - search_idx_each1 <- - which( - Reduce( - `|`, - Map( - \(x) data_exd_colid %in% x[1] & data_exd_rowid %in% x[2], - search_each1sort - ) - ) - ) - - # replace the nearest pairs' distance to Inf - search_idx_others <- - which(rank(-replace(data_exd, search_idx_each1, Inf)) <= cv_pairs) - # remove the nearest pairs - # sort the distance of the remaining pairs - search_idx_others <- - search_idx_others[1:(cv_pairs - length(search_idx_each1))] - search_idx <- c(search_idx_each1, search_idx_others) - } - ref_list <- - Map(c, data_exd_rowid[search_idx], data_exd_colid[search_idx]) - } - - rset_cv <- - convert_cv_index_rset( - index_cv, data, ref_list = ref_list, cv_mode = cv_mode - ) - return(rset_cv) - } - -#' Visualize the spatio-temporal cross-validation index -#' @keywords Baselearner -#' @param rsplit rsample::manual_rset() object. -#' @param angle numeric(1). Viewing angle of 3D plot. -#' @returns None. A plot will be generated. 
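[Editor's note] A hedged sketch of the kind of 3-D scatter this plotting helper produces; the points and the in/out split below are random stand-ins, not an actual resample object.

library(scatterplot3d)

set.seed(1)
pts <- data.frame(lon = runif(50), lat = runif(50), time = runif(50))
split_col <- sample(c(2, 3), 50, replace = TRUE)  # 2 = analysis, 3 = assessment
scatterplot3d::scatterplot3d(
  pts$lon, pts$lat, pts$time,
  color = split_col, pch = 19, cex.symbols = 0.5, angle = 60
)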
-#' @seealso [`generate_cv_index`] -#' @export -vis_rset <- - function(rsplit, angle = 60) { - nsplit <- nrow(rsplit) - graphics::par(mfrow = c(ceiling(nsplit / 3), 3)) - for (i in seq_len(nsplit)) { - cleared <- rsplit[i, 1][[1]][[1]]$data - cleared$indx <- 0 - cleared$indx[rsplit[i, 1][[1]][[1]]$in_id] <- "In" - cleared$indx[rsplit[i, 1][[1]][[1]]$out_id] <- "Out" - cleared$indx <- factor(cleared$indx) - cleared$time <- as.POSIXct(cleared$time) - scatterplot3d::scatterplot3d( - cleared$lon, cleared$lat, cleared$time, - color = rev(as.integer(cleared$indx) + 1), - cex.symbols = 0.02, pch = 19, - angle = angle - ) - } - } -# nocov end - -#' Get Divisors -#' @keywords Miscellaneous -#' @param x integer(1). A positive integer. -#' @returns A vector of divisors of x. -#' @export -divisor <- - function(x) { - xv <- seq_len(x) - xv[which(x %% xv == 0)] - } diff --git a/inst/targets/pipeline_base_functions_old.R b/inst/targets/pipeline_base_functions_old.R deleted file mode 100644 index 541da8e4..00000000 --- a/inst/targets/pipeline_base_functions_old.R +++ /dev/null @@ -1,2606 +0,0 @@ -## pipeline base functions -## Before being transferred to package functions -## as of 06/11/2024. -## For archiving only. - -## file check: chunking -## if using tarchetypes::tar_files, -## the file *lists* should be stored as a single file -## Provided that the download is completed in a defined -## time period such that users can distiguish a **set** of files -## from each other, -## timestamp check: `fs::file_info(...)$modification_time` -## can be used in bulk file check (will be time consuming as -## the number of files grow, though). -## The file list of the previous successful run will be stored as a file -## and we just save the file list of the current run, which are -## older than a certain rerun interval (e.g., 6 months). -## If one really wants to keep the shorter rerun interval, -## the strategy should be changed. -## THINK: How can we know the downloaded files are complete and correct? -## quick diagram: -## file set 1 ... file set x -## (listing function runs) -## list1.rds ... listx.rds -## (hashed; not modified) ... (not run) -## (pass) ... (run) -## ... ... (downstream process + calculation) -## (as-is) ... (as-is) --- unless modified or manually invalidated - -# 2018-2022 (target) + 2023 (manpower saving demonstration) -# compacting the pipeline with branching -# TODO: file path is the same, but binary is different; could targets -# handle? -library(amadeus) - -#' Load arguments from the formatted argument list file -#' @param argfile character(1). Path to the argument file. RDS format. -#' @param dataset character(1). Dataset name. -#' @returns A list of arguments. -#' @importFrom qs qread -#' @export -loadargs <- function(argfile, dataset) { - if (endsWith(argfile, ".rds")) { - arglist <- readRDS(argfile) - } else if (endsWith(argfile, ".qs")) { - arglist <- qs::qread(argfile) - } else { - stop("Invalid format.") - } - arglist[[dataset]] -} - -#' Check if a query date falls within a time interval -#' -#' This function checks if a given query date falls within a time interval defined by a vector of two dates. -#' -#' @param query_date The query date to check. -#' @param tvec A vector of two dates defining the time interval. -#' -#' @return TRUE if the query date falls within the time interval, FALSE otherwise. 
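[Editor's note] A hedged sketch matching the behaviour documented above (TRUE when the query date lies inside the sorted two-date window); the function name is changed here so the snippet stands alone.

within_interval <- function(query_date, tvec) {
  tvec <- sort(tvec)
  query_date >= tvec[1] & query_date <= tvec[2]
}
within_interval(as.Date("2022-01-01"),
                as.Date(c("2021-01-01", "2023-01-01")))  # TRUE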
-#' -#' @examples -#' query_date <- as.Date("2022-01-01") -#' tvec <- c(as.Date("2021-01-01"), as.Date("2023-01-01")) -#' `%tin%`(query_date, tvec) -#' -#' @export -`%tin%` <- function(query_date, tvec) { - tvec <- sort(tvec) - query_date <= tvec[1] & query_date >= tvec[2] -} - -#' Load MODIS files from a specified path. -#' -#' This function takes a path and an optional pattern as input and returns a list of MODIS files found in the specified path. -#' -#' @param path The path where the MODIS files are located. -#' @param pattern An optional regular expression pattern to filter the files. The default pattern is "hdf$". -#' @param date A vector of two dates to filter the files by. The default is an empty character vector. -#' @return A list of full file names of the MODIS files found in the specified path. -#' -#' @examples -#' # Load MODIS files from the current directory -#' modis_files <- load_modis_files(".") -#' -#' # Load MODIS files from a specific directory with a custom pattern -#' modis_files <- load_modis_files("/path/to/files", pattern = "MOD.*hdf$") -#' -#' @export -load_modis_files <- function(path, pattern = "hdf$", date = character(2)) { - modis_files <- - list.files( - path, pattern = pattern, - recursive = TRUE, - full.names = TRUE - ) - date_exp <- amadeus::generate_date_sequence(date[1], date[2], sub_hyphen = FALSE) - date_exp <- strftime(date_exp, format = "%Y%j") - modis_files <- grep(sprintf("(%s)", paste(paste0("A", date_exp), collapse = "|")), modis_files, value = TRUE) - return(modis_files) -} - -#' Injects the calculate function with specified arguments. -#' -#' This function injects the calculate function with the specified arguments, -#' allowing for dynamic customization of the function's behavior. -#' @param covariate character(1). The name of the covariate to be calculated. -#' @param locs The locations to be used in the calculation. -#' @param buffer The buffer size for the calculation. If not provided, the -#' default buffer size will be used. -#' @param injection Additional arguments to be injected into the calculate function. -#' -#' @return The result of the calculate function with the injected arguments. -#' -#' @examples -#' inject_calculate(locs = my_locs, buffer = 10, domain = my_domain, injection = list(arg1 = "value1", arg2 = "value2")) -#' -#' @export -inject_calculate <- function(covariate, locs, injection) { - rlang::inject( - calculate( - locs = locs, - !!!injection - ) - ) -} - - -#' Injects MODIS PAR data into the specified locations. -#' -#' This function calculates MODIS PAR (Photosynthetically Active Radiation) data for the given locations -#' and injects it into the specified domain. -#' -#' @param locs A data frame containing the locations for which MODIS PAR data needs to be calculated. -#' @param domain The domain in which the MODIS PAR data should be injected. -#' @param injection Additional parameters to be passed to the `calc_modis_par` function. -#' @return The modified domain with the injected MODIS PAR data. -#' @export -inject_modis_par <- function(locs, domain, injection) { - rlang::inject( - amadeus::calc_modis_par( - locs = locs, - locs_id = "site_id", - !!!injection - ) - ) -} - - -#' Injects geographic information into a data frame -#' -#' This function injects geographic information into a data frame using the `calc_geos_strict` function. -#' The injected information includes latitude and longitude coordinates based on the specified locations, -#' a location ID column, a window range, and a snapping option. 
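[Editor's note] A hedged sketch of the rlang::inject()/!!! splicing pattern that the inject_*() wrappers above rely on, shown with a plain function and a made-up argument list rather than the actual calculation calls.

library(rlang)

injection <- list(digits = 1)
f <- function(x, digits) round(mean(x), digits)
rlang::inject(f(c(1.234, 2.345), !!!injection))  # same as f(c(1.234, 2.345), digits = 1)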
-#' -#' @param locs A data frame containing the locations for which geographic information needs to be injected. -#' @param injection A list of additional arguments to be passed to the `calc_geos_strict` function. -#' @return A modified data frame with injected geographic information. -#' @export -inject_geos <- function(locs, injection) { - rlang::inject( - calc_geos_strict( - locs = locs, - locs_id = "site_id", - win = c(-126, -62, 22, 52), - snap = "out", - !!!injection - ) - ) -} - - - -inject_gmted <- function(locs, variable, radii, injection, nthreads = 4L) { - future::plan(future::multicore, workers = nthreads) - - radii_list <- split(radii, seq_along(radii)) - radii_rep <- - future.apply::future_lapply( - radii_list, - function(r) { - rlang::inject( - calc_gmted_direct( - locs = locs, - locs_id = "site_id", - radius = r, - variable = c(variable, "7.5 arc-seconds"), - !!!injection - ) - ) - } - ) - radii_rep <- lapply(radii_rep, function(x) as.data.frame(x)) - radii_join <- reduce_merge(radii_rep, "site_id") - future::plan(future::sequential) - return(radii_join) -} - - -#' Reduce and merge a list of data tables -#' -#' This function takes a list of data tables and merges them together using the specified columns. -#' It uses the `merge.data.table` function from the `data.table` package to perform the merge. -#' -#' @param list_in A list of data tables to be merged. -#' @param by The columns to merge the data tables on. -#' @return A merged data table. -#' -#' @examples -#' # Create example data tables -#' dt1 <- data.table(a = 1:3, b = 4:6) -#' dt2 <- data.table(a = 2:4, c = 7:9) -#' dt3 <- data.table(a = 3:5, d = 10:12) -#' -#' # Merge the data tables -#' reduce_merge(list(dt1, dt2, dt3), by = "a") -#' -#' @importFrom data.table merge.data.table -#' @export -reduce_merge <- function(list_in, by = c("site_id", "time"), all.x = TRUE, all.y = FALSE) { - list_check <- sapply(list_in, nrow) - list_checkdiff <- diff(list_check) - if (any(list_checkdiff > 0)) all.y <- TRUE - for (i in seq_len(length(list_in))) { - list_in[[i]] <- data.table::as.data.table(list_in[[i]]) - } - - Reduce( - function(x, y) { - if (is.null(by)) by <- intersect(names(x), names(y)) - #post_calc_autojoin(x, y) - data.table::merge.data.table(x, y, by = by, all.x = all.x, all.y = all.y) - }, - list_in - ) -} - - -reduce_merge <- function(list_in, by = c("site_id", "time"), all.x = TRUE, all.y = FALSE) { - if (all.x && !all.y) how <- "left" - if (!all.x && all.y) how <- "right" - if (!all.x && !all.y) how <- "inner" - if (all.x && all.y) how <- "full" - - # ignore initial argument settings - list_check <- sapply(list_in, nrow) - list_checkdiff <- diff(list_check) - if (any(list_checkdiff > 0)) how <- "full" - # for (i in seq_len(length(list_in))) { - # list_in[[i]] <- data.table::as.data.table(list_in[[i]]) - # } - - Reduce( - function(x, y) { - if (is.null(by)) by <- intersect(names(x), names(y)) - #post_calc_autojoin(x, y) - collapse::join(x, y, how = how, on = by) - }, - list_in - ) -} - - - -par_narr <- function(domain, date, locs, nthreads = 24L) { - - future::plan(future::multicore, workers = nthreads) - - res <- - future.apply::future_lapply( - domain, - function(x) { - from <- process_narr2( - path = "input/narr", - variable = x, - date = date - ) - calc_narr2( - from = from, - locs = locs, - locs_id = "site_id") - }, - future.seed = TRUE) - future::plan(future::sequential) - return(res) - -} - -#' Add Time Column -#' -#' This function adds a time column to a data frame. 
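[Editor's note] A hedged sketch of the future.apply fan-out used by par_narr() above, applied to a trivial function instead of NARR processing; the worker count and inputs are arbitrary.

library(future)
library(future.apply)

future::plan(future::multisession, workers = 2)
res <- future.apply::future_lapply(
  c("air.sfc", "shum"),
  function(v) toupper(v),  # stands in for per-variable process + extract
  future.seed = TRUE
)
future::plan(future::sequential)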
-#' -#' @param df The data frame to which the time column will be added. -#' @param time_value The value to be assigned to the time column. -#' @param time_id The name of the time column (default is "time"). -#' -#' @return The data frame with the added time column. -#' -#' @examples -#' df <- data.frame(x = 1:5, y = letters[1:5]) -#' add_time_col(df, "2022-01-01") -#' -#' @export -add_time_col <- function(df, time_value, time_id = "time") { - if (!time_id %in% names(df)) { - df[[time_id]] <- time_value - } - return(df) -} - - - -# 2018~2022, 2017, 2020 -# 2017 ... 2020 ... -# 2017 -#' Map the available raw data years over the given period -#' @description -#' Many raw datasets are periodically updated and the period could -#' be longer than a year. This function maps the available years -#' over the given period. -#' @param time_start integer(1). Starting year. -#' @param time_end integer(1). Ending year. -#' @param time_unit character(1). Time unit. Default is `"year"`. -#' @param time_available vector. Available years. -#' @returns integer vector of length (time_end - time_start + 1). -#' Each element will get the nearest preceeding available year. -#' @note -#' The minimum of `time_available` will be filled in front of the first available year -#' when the minimum of `time_available` is greater than `time_start`. -#' @examples -#' process_year_expand(2018, 2022, "year", c(2017, 2020, 2021)) -#' process_year_expand(2018, 2022, "year", c(2020, 2021)) -#' @export -post_calc_year_expand <- - function( - time_start = NULL, - time_end = NULL, - time_unit = "year", - time_available = NULL - ) { - time_seq <- seq(time_start, time_end) - time_target_seq <- findInterval(time_seq, time_available) - time_target_seq <- time_available[time_target_seq] - if (min(time_available) > time_start) { - time_target_seq <- - c( - rep(min(time_available), - min(time_available) - time_start), - time_target_seq - ) - } - return(time_target_seq) - } - - -#' Expand a data frame by year -#' -#' This function expands a data frame by year, creating multiple rows for each year based on the time period specified. -#' -#' @param df The input data frame. -#' @param locs_id The column name of the location identifier in the data frame. -#' @param time_field The column name of the time field in the data frame. -#' @param time_start The start of the time period. -#' @param time_end The end of the time period. -#' @param time_unit The unit of time to expand the data frame. Only for record. -#' @param time_available A vector of available time periods. -#' @param ... Placeholders. -#' @note Year expansion rule is to assign the nearest past year in the available years, -#' if there is no past year in the available years, the first available year is -#' rolled back to the start of the time period. -#' @returns The expanded data frame with multiple rows for each year. -#' @seealso [`process_year_expand()`] -#' @examples -#' df <- data.frame(year = c(2010, 2010, 2011, 2012), -#' value = c(1, 2, 3, 4)) -#' df_expanded <- df_year_expand(df, locs_id = "site_id", time_field = "year", -#' time_start = 2011, time_end = 2012, -#' time_unit = "year") -#' print(df_expanded) -#' -#' @export -post_calc_df_year_expand <- function( - df, - locs_id = "site_id", - time_field = "time", - time_start = NULL, - time_end = NULL, - time_unit = "year", - time_available = NULL, - ... 
-) { - time_summary <- table(unlist(df[[time_field]])) - if (length(time_summary) != 1) { - if (sd(time_summary) != 0) { - stop("df should be a data frame with the same number of rows per year") - } - } - # assume that df is the row-bound data frame - if (is.character(df[[time_field]])) { - df[[time_field]] <- as.integer(df[[time_field]]) - } - df_years <- unique(df[[time_field]]) - nlocs <- length(unique(df[[locs_id]])) - year_period <- seq(time_start, time_end) - # assign the time period to the available years - year_assigned <- post_calc_year_expand(time_start, time_end, time_unit, df_years) - df_years_repeats <- table(year_assigned) - - # repeat data frames - df_expanded <- Map( - function(y) { - df_sub <- df[df[[time_field]] == df_years[y], ] - df_sub <- df_sub[rep(seq_len(nrow(df_sub)), df_years_repeats[y]), ] - return(df_sub) - }, - seq_along(year_assigned) - ) - df_expanded <- do.call(rbind, df_expanded) - df_expanded[[time_field]] <- rep(year_period, each = nlocs) - return(df_expanded) -} - - -# calculate over a list -#' Spatiotemporal covariate calculation -#' @param domain vector of integer/character/Date. -#' Depending on temporal resolution of raw datasets. -#' Nullable; If `NULL`, it will be set to `c(1)`. -#' @param domain_name character(1). Name of the domain. Default is `"year"`. -#' @param process_function Raw data processor. Default is -#' [`amadeus::process_covariates`] -#' @param calc_function Function to calculate covariates. -#' [`amadeus::calc_covariates`] -#' @param ... Arguments passed to `process_function` and `calc_function` -#' @returns A data.table object. -#' @importFrom data.table rbindlist -#' @importFrom rlang inject -#' @export -# FIXME: this function works inefficiently in expense of -# returning uniform list of length(|years|) output. -# It could seriously affect the performance in scaled calculation -# as it calculates the same covariate for several years. -# Future updates should reduce the workload by calculating -# source data years only then assign proper preceding years -# to the output as another target. -calculate <- - function( - domain = NULL, - domain_name = "year", - nthreads = 1L, - process_function = amadeus::process_covariates, - calc_function = amadeus::calc_covariates, - ... - ) { - if (is.null(domain)) { - domain <- c(1) - } - # split the domain, make years from the domain list - # assuming that domain length is the same as the number of years - domainlist <- split(domain, seq_along(domain)) - years_data <- seq_along(domain) + 2017 - - if (nthreads == 1L) { - future::plan(future::sequential) - } else { - future::plan(future::multicore, workers = nthreads) - } - # double twists: list_iteration is made to distinguish - # cases where a single radius is accepted or ones have no radius - # argument. - res_calc <- - #try( - future.apply::future_mapply( - function(domain_each, year_each) { - # we assume that ... 
have no "year" and "from" arguments - args_process <- c(arg = domain_each, list(...)) - names(args_process)[1] <- domain_name - if (!is.null(args_process$covariate) && any(names(args_process) %in% c("covariate"))) { - if (args_process$covariate == "nei") { - args_process$county <- process_counties() - } - } - - # load balancing strategy - # if radius is detected, split the list - if (any(names(args_process) %in% c("radius"))) { - list_iteration <- split(args_process$radius, seq_along(args_process$radius)) - } else { - list_iteration <- list(1) - } - - list_iteration_calc <- - Map( - function(r) { - args_process$radius <- r - from_in <- - rlang::inject( - process_function(!!!args_process) - ) - res <- rlang::inject( - calc_function( - from = from_in, - !!!args_process - ) - ) - # using domain_name, add both - # data year and covariate year - if (!is.null(domain) && domain_name == "year") { - res <- add_time_col(res, domain_each, - sprintf( - "%s_year", - unname(args_process$covariate))) - # res <- add_time_col(res, year_each, "year") - } - res <- data.table::as.data.table(res) - return(res) - }, - list_iteration) - df_iteration_calc <- if (length(list_iteration_calc) == 1) { - list_iteration_calc[[1]] } else { - by_detected <- Reduce(intersect, lapply(list_iteration_calc, names)) - reduce_merge(list_iteration_calc, by = by_detected) - } - return(df_iteration_calc) - }, - domainlist, years_data, SIMPLIFY = FALSE, - future.seed = TRUE - ) - #) - future::plan(future::sequential) - if (inherits(res_calc, "try-error")) { - cat(paste0(attr(res_calc, "condition")$message, "\n")) - stop("Results do not match expectations.") - } - res_calc <- lapply(res_calc, - function(x) { - if ("time" %in% names(x)) { - if (nchar(x$time[1]) != 4) { - x$time <- data.table::as.IDate(x$time) - } - } - xconvt <- data.table::as.data.table(x) - return(xconvt) - } - ) - # res_calcdf <- if (length(res_calc) == 1) { - # data.table::as.data.table(res_calc[[1]]) - # } else if (domain_name %in% c("year", "date")) { - # data.table::rbindlist(res_calc, use.names = TRUE, fill = TRUE) - # } else { - # reduce_merge(res_calc, by = c("site_id", "time")) - # } - return(res_calc) - } - - - - -# xx <- Reduce(post_calc_autojoin, c(list(j2), j1)) -# sapply(j1, \(x) names(x)[1:8]) - -# # Example usage -# df_fine0 <- data.frame(site_id = c("A", "B", "B", "C"), -# lon = rep("barns", 4), -# time = as.Date(c("2022-01-01", "2022-01-02", "2021-12-31", "2021-01-03")), -# value = c(1, 2, 3, 5)) - -# df_coarse0 <- data.frame(site_id = c("A", "B", "C"), -# lon = rep("J", 3), -# time = c("2022", "2022", "2021"), -# other_value = c(10, 20, 30)) - -# jdf <- post_calc_autojoin(df_fine0, df_coarse0) -# print(jdf) - - - - -#' Running commands with a punchcard -#' @param var_short Short variable name to call from the CSV fiel -#' @param file Path to the configuration file -#' @param ... Arguments passed to the command -#' @returns Depending on the specification in the punchcard. -#' @examples -#' meta_run("root_absolute") -#' meta_run("root_relative") -#' meta_run("y2018") -#' meta_run("dir_input_modis_mod11") -#' @importFrom utils read.csv -#' @export -meta_run <- - function( - var_short = NULL, - file = file.path("./inst/targets/targets_configuration.csv"), - # after completion, file path should be replaced by system.file - # reference path is ./targets or depending on the location of the pipeline - ... 
- ) { - metaspec <- utils::read.csv(file) - if (var_short == "root_absolute") { - getwd() - } else { - spec <- metaspec[metaspec$name_targets_short == var_short, ] - foo_run <- get(spec$command) - foo_run(spec$value, ...) - } - } - - - -#' Set resource management for SLURM -#' @param template_file SLURM job submission shell template path. -#' @param partition character(1). Name of partition. Default is `"geo"` -#' @param ncpus integer(1). Number of CPU cores assigned to each task. -#' @param ntasks integer(1). Number of tasks to submit. -#' @param memory integer(1). Specifically odds to 2*x GB. -#' @param user_email character(1). User email address. -#' @param error_log character(1). Error log file name. -#' @notes This function is designed to be used with `tar_resources`. -#' Suggested number of `ncpus` is more than 1 for typical multicore R tasks. -#' @returns A list of resources for `tar_resources` -#' @author Insang Song -#' @importFrom future tweak -#' @importFrom future.batchtools batchtools_slurm -#' @importFrom targets tar_resources -#' @importFrom targets tar_resources_future -#' @export -set_slurm_resource <- - function( - template_file = "inst/targets/template_slurm.tmpl", - partition = "geo", - ncpus = 2L, - ntasks = 2L, - memory = 8, - user_email = meta_run("slurm_user_email"), - error_log = "slurm_error.log" - ) { - targets::tar_resources( - future = targets::tar_resources_future( - plan = future::tweak( - future.batchtools::batchtools_slurm, - template = template_file, - resources = - list( - partition = partition, - # template = template_file, - ntasks = ntasks, - ncpus = ncpus, - memory = memory, - email = user_email, - error.file = error_log - ) - ) - ) - ) - } - - -#' Read AQS data -#' @param fun_aqs function to import AQS data. -#' Default is `amadeus::process_aqs` -#' @param export Export the file to qs. Default is FALSE. -#' @param ... Passed arguments to `fun_aqs` -#' @returns Depending on `fun_aqs` specification. -#' @export -read_locs <- - function( - fun_aqs = amadeus::process_aqs, - export = FALSE, - ... - ) { - aqs_read <- fun_aqs(...) - if (export) qs::qsave(aqs_read, file = "input/sf_feat_proc_aqs_sites.qs") - return(aqs_read) - } - - -#' Filter monitors with the minimum POC value -#' @param path data.frame/tibble/data.table -#' @param site_spt Space-time site data. -#' @param locs_id character(1). Name of site id (not monitor id) -#' @param poc_name character(1). Name of column containing POC values. -#' @param sampling character(1). Name of column with sampling duration. -#' @param date_start character(1). -#' @param date_end character(1). 
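[Editor's note] A hedged sketch of the per-site minimum-POC filter described above, on a three-row toy table; the real helper also restricts to 24-hour sampling durations and joins site metadata.

library(dplyr)

aqs <- data.frame(
  site_id = c("A", "A", "B"),
  POC     = c(1, 2, 1),
  pm25    = c(8.1, 8.4, 10.2)
)
aqs |>
  dplyr::group_by(site_id) |>
  dplyr::filter(POC == min(POC)) |>
  dplyr::ungroup()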
-#' @author Insang Song -#' @returns a data.table object -#' @importFrom dplyr group_by -#' @importFrom dplyr filter -#' @importFrom dplyr ungroup -#' @importFrom data.table as.data.table -#' @importFrom data.table merge.data.table -#' @importFrom data.table rbindlist -#' @importFrom rlang sym -#' @export -get_aqs_data <- - function( - path = list.files( - path = meta_run("dir_input_aqs"), - pattern = "daily_88101_[0-9]{4}.csv", - full.names = TRUE - ), - site_spt = NULL, - locs_id = meta_run("char_siteid"), - time_id = meta_run("char_timeid"), - poc_name = "POC", - sampling = "Sample.Duration", - date_start = "2018-01-01", - date_end = "2022-12-31" - ) { - #nocov start - if (!is.character(locs_id)) { - stop("locs_id should be character.\n") - } - if (!is.character(poc_name)) { - stop("poc_name should be character.\n") - } - # aqs_prep <- - # amadeus::process_aqs( - # path = path, - # date = NULL, - # return_format = return_format - # ) - input_df <- lapply(path, data.table::fread) |> data.table::rbindlist() - input_df <- input_df[, - list( - pm25 = `Arithmetic Mean`, - site_id = - sprintf("%02d%03d%04d%05d", - `State Code`, `County Code`, `Site Num`, `Parameter Code`), - time = as.character(`Date Local`), - POC = POC - )] - - poc_filtered <- input_df |> - dplyr::group_by(!!rlang::sym(locs_id)) |> - dplyr::filter(startsWith(!!rlang::sym(sampling), "24")) |> - dplyr::filter(!!rlang::sym(poc_name) == min(!!rlang::sym(poc_name))) |> - dplyr::ungroup() |> - data.table::as.data.table() - return(poc_filtered) - poc_res <- - data.table::merge.data.table(poc_filtered, - data.table::as.data.table(site_spt), - by = c(locs_id, time_id) - ) - return(poc_res) - #nocov end - } - -#' Join dependent variable (y) and covariates (x) -#' @param df_pm PM2.5 data.frame -#' @param df_covar covariates data.frame -#' @param locs_id location identifier -#' @param time_id time identifier -#' @returns data.frame -#' @author Insang Song -#' @importFrom data.table merge.data.table -post_calc_join_pm25_features <- - function( - df_pm, - df_covar, - locs_id = meta_run("char_siteid"), - time_id = meta_run("char_timeid") - ) { - # full join - data.table::merge.data.table( - df_pm, df_covar, - by = c(locs_id, time_id), - all = TRUE - ) - } - - - -#' Check file status and download if necessary -#' @param path download path. -#' @param dname Dataset name. See [`amadeus::download_data`] for details. -#' @param ... Arguments passed to `amadeus::download_data` -#' @returns logical(1). -feature_raw_download <- - function( - path = NULL, - dataset_name = NULL, - ... - ) { - # run amadeus::download_data - tryCatch( - { - amadeus::download_data(dataset_name = dataset_name, ...) - }, - error = function(e) { - stop(e) - } - ) - } - -#' Load county sf object -#' @param year integer(1). Year of the county shapefile. -#' @param exclude character. State FIPS codes to exclude. -#' Default is c("02", "15", "60", "66", "68", "69", "72", "78"). -#' @returns sf object -#' @importFrom tigris counties -#' @export -process_counties <- - function( - year = 2020, - exclude = c("02", "15", "60", "66", "68", "69", "72", "78") - ) { - options(tigris_use_cache = TRUE) - cnty <- tigris::counties(year = year) - cnty <- - cnty[!cnty$STATEFP %in% - c("02", "15", "60", "66", "68", "69", "72", "78"), ] - return(cnty) - } - - -# calculate (no year is concerned) -#' Single-year or spatial-only calculation -#' @param process_function Raw data processor. Default is -#' [`amadeus::process_covariates`] -#' @param calc_function Covariate calculator. 
Default is -#' [`amadeus::calc_covariates`] -#' @param ... Arguments passed to `calc_function` -#' @returns Nothing. It will automatically save xz-compressed -#' RDS file to `outpath` -#' @importFrom rlang inject -#' @export -calculate_single <- - function( - process_function = amadeus::process_covariates, - calc_function = amadeus::calc_covariates, - ... - ) { - prep_calc <- - try( - rlang::inject( - process_function( - !!!list(...) - ) - ) - ) - arg_ext <- list(...) - arg_ext$from <- prep_calc - - res_calc <- - try( - rlang::inject( - calc_function( - !!!arg_ext - ) - ) - ) - if (inherits(res_calc, "try-error")) { - stop("Results do not match expectations.") - } - return(res_calc) - } - -# calculate over a list -#' Spatiotemporal covariate calculation -#' @param domain vector of integer/character/Date. -#' Depending on temporal resolution of raw datasets. -#' @param process_function Raw data processor. Default is -#' [`amadeus::process_covariates`] -#' @param calc_function Function to calculate covariates. -#' [`amadeus::calc_covariates`] -#' @param ... Arguments passed to `process_function` and `calc_function` -#' @returns A data.table object. -#' @importFrom data.table rbindlist -#' @importFrom rlang inject -#' @export -#' @examples -calculate_multi <- - function( - # status = NULL, - # outpath = NULL, - domain = NULL, - process_function = amadeus::process_covariates, - calc_function = amadeus::calc_covariates, - ... - ) { - domainlist <- split(domain, seq_along(domain)) - res_calc <- - try( - lapply( - domainlist, - function(el) { - from_in <- - rlang::inject( - process_function(year = el, !!!list(...)) - ) - rlang::inject( - calc_function( - from = from_in, - !!!list(...) - ) - ) - } - ) - ) - if (inherits(res_calc, "try-error")) { - cat(paste0(attr(res_calc, "condition")$message, "\n")) - stop("Results do not match expectations.") - } - res_calc <- lapply(res_calc, function(x) as.data.frame(x)) - res_calc <- data.table::rbindlist(res_calc, fill = TRUE) - return(res_calc) - - } - - -# sspat <- readRDS("~/sites_unique.rds") -# kk <- calculate_multi( -# # sequence: could be refered from dates -# domain = 2018,#c(2018, 2019, 2020, 2021, 2022), -# path = mr("dir_input_tri"), -# covariate = "tri", -# locs = sspat, -# locs_id = mr("pointid") -# ) - -# rr <- -# read_locs( -# path = list.files( -# path = mr("dir_input_aqs"), -# pattern = "daily_88101_[0-9]{4}.csv", -# full.names = TRUE), -# date = NULL, -# return_format = "sf" -# ) - -#' Merge input data.frame objects -#' @param by character. Joining keys. See [`merge`] for details. -#' @param time logical(1). Whether or not include time identifier. -#' Set this `TRUE` will supersede `by` value by appending time identifier. -#' @param ... data.frame objects to merge -#' @returns data.table -#' @importFrom data.table as.data.table -#' @export -post_calc_merge_features <- - function( - by = c(meta_run("char_siteid")), - time = FALSE, - ... - ) { - ellipsis <- list(...) 
- if (time) { - by <- c(meta_run("char_siteid"), meta_run("char_timeid")) - ellipsis_clean <- - lapply(ellipsis, - function(x) { - x <- data.table::as.data.table(x) - col_coords <- grep("(lon|lat)", names(x)) - if (length(col_coords) > 0 && !is.null(col_coords)) { - x <- x[, -col_coords, with = FALSE] - } - x$time <- as.character(x$time) - return(x) - }) - } else { - ellipsis_clean <- ellipsis - } - joined <- - Reduce(function(x, y) { - data.table::merge.data.table(x, y, by = by, all.x = TRUE, suffixes = c("_Ma", "_Mb")) - }, ellipsis_clean) - return(joined) - } - - -#' Change time column name -#' @param df data.frame -#' @param candidates character. Candidate column names. -#' @param replace character. New column name. -#' @returns data.frame -#' @export -post_calc_unify_timecols <- - function( - df, - candidates = c("year"), - replace = "time" - ) { - if (sum(names(df) %in% candidates) > 1) { - stop("More than a candidate is detected in the input.") - } - names(df)[names(df) %in% candidates] <- replace - return(df) - } - - -#' Convert time column to character -#' @param df data.table -#' @note This function takes preprocessed data.table with a column named `"time"`. -#' @importFrom data.table as.data.table -#' @export -post_calc_convert_time <- - function( - df - ) { - df <- data.table::copy(data.table::as.data.table(df)) - df <- df[, `:=`(time, as.character(time))] - return(df) - } - - -#' Join a data.frame with a year-only date column to that with a full date column -#' @description The full date column will be converted to a year column -#' as a new column, then the data.frame with the year-only column will -#' be joined. -#' @param df_year data.frame with a year-only date column -#' @param df_date data.frame with a full date column -#' @param field_year character(1). Year column in `df_year` -#' @param field_date character(1). Date column in `df_date` -#' @param spid character(1). Name of the unique location identifier field. -#' @importFrom methods is -#' @importFrom data.table merge.data.table -#' @returns data.frame -post_calc_join_yeardate <- - function( - df_year, - df_date, - field_year = "time", - field_date = "time", - spid = "site_id" - ) { - if (!inherits(df_year, "data.frame") && !inherits(df_date, "data.frame")) { - stop("Both inputs should be data.frame.") - } - # df_year[[field_year]] <- as.integer(df_year[[field_year]]) - # df_date[[field_date]] <- as.POSIXlt(df_date[[field_date]])$year + 1900 - - # df_date_joined <- - # df_date[df_year, - # on = .(site_id == site_id, time >= time) - # ] - - names(df_year)[which(names(df_year) %in% field_year)] <- "year" - df_year$year <- as.character(unlist(df_year$year)) - df_date$year <- as.character(substr(df_date[[field_date]], 1, 4)) - #as.integer(format(as.Date(df_date[[field_date]]), "%Y")) - df_joined <- - data.table::merge.data.table( - df_date, df_year, - by = c(spid, "year"), - all.x = TRUE - ) - - df_joined <- df_joined[, c("year") := NULL] - return(df_joined) - } - - -#' Merge spatial and spatiotemporal covariate data -#' @param locs Location. e.g., AQS sites. -#' @param locs_id character(1). Location identifier. -#' @param time_id character(1). Location identifier. -#' @param target_years integer. Used to dummify nominal year. -#' @param df_sp data.frame. Spatial-only covariates. -#' @param df_spt data.frame. Spatiotemporal covariates. -#' @note This version assumes the time_id contains Date-like strings. 
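[Editor's note] A hedged sketch of the year-key join that post_calc_join_yeardate() above describes: derive a year column from the full dates, left-join the year-level table, then drop the key. The values are made up.

library(data.table)

df_date <- data.table(site_id = "A", time = c("2021-06-01", "2022-06-01"), pm25 = c(7.2, 9.4))
df_year <- data.table(site_id = "A", year = c("2021", "2022"), population = c(100, 110))

df_date[, year := substr(time, 1, 4)]
joined <- merge(df_date, df_year, by = c("site_id", "year"), all.x = TRUE)
joined[, year := NULL][]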
-#' @returns data.frame -#' @export -post_calc_merge_all <- - function( - locs, - locs_id, - time_id, - target_years = seq(2018, 2022), - df_sp, - df_spt - ) { - if (methods::is(locs, "sf")) { - locs <- sf::st_drop_geometry(locs) - } - locs$time <- as.character(locs$time) - locs <- data.table::as.data.table(locs) - locs_merged <- - data.table::merge.data.table( - locs, df_sp, by = c(locs_id) - ) - locs_merged <- - data.table::merge.data.table( - locs_merged, df_spt, - by = c(locs_id, time_id) - ) - locs_merged <- - amadeus::calc_temporal_dummies( - locs = locs_merged, - locs_id = locs_id, - year = target_years - ) - return(locs_merged) - } - - -#' Remove columns from a data frame based on regular expression patterns. -#' -#' This function removes columns from a data frame that match any of the specified -#' regular expression patterns. By default, it removes columns with names that -#' match the patterns "^lon$|^lat$|geoid|year$|description". -#' -#' @param df The input data frame. -#' @param candidates A character vector of regular expression patterns to match -#' against column names. Columns that match any of the patterns will be removed. -#' The default value is "^lon$|^lat$|geoid|year$|description". -#' @param strict logical(1). If `TRUE`, only `c("site_id", "time")` will be kept. -#' @returns The modified data frame with the specified columns removed. -#' -#' @examples -#' df <- data.frame(lon = 1:5, lat = 6:10, geoid = 11:15, year = 2010:2014, -#' description = letters[1:5], other = 16:20) -#' post_calc_drop_cols(df) -#' -#' @export -post_calc_drop_cols <- - function( - df, - candidates = "(^lon$|^lat$|geoid|year$|description|geometry)", - strict = FALSE - ) { - idx_remove <- - if (!strict) { - grep(candidates, names(df), value = TRUE) - } else { - grep("site_id|time", names(df), value = TRUE, invert = TRUE) - } - df <- df[, -idx_remove, with = FALSE] - return(df) - } - -#' Automatic joining by the time and spatial identifiers -#' @description The key assumption is that all data frames will have -#' time field and spatial field and the data should have one of date or year. -#' Whether the input time unit is year or date -#' is determined by the coercion of the **first row value** of the time field -#' into a character with `as.Date()`. This function will fail if it -#' gets year-like string with length 4. -#' -#' @param df_fine The fine-grained data frame. -#' @param df_coarse The coarse-grained data frame. -#' @param field_sp The name of the spatial field in the data frames. -#' @param field_t The name of the time field in the data frames. -#' -#' @returns A merged data table. 
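[Editor's note] A hedged sketch of the year-versus-date detection described above: try coercing the first time value with as.Date() and branch on failure; the inputs are invented.

is_year_only <- function(x) {
  inherits(try(as.Date(x[1]), silent = TRUE), "try-error")
}
is_year_only(c("2022", "2021"))              # TRUE: expand years, then join
is_year_only(c("2022-01-01", "2022-01-02"))  # FALSE: join on site and date directly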
-#' @returns -#' df_fine0 <- data.frame(site_id = c("A", "B", "B", "C"), -#' time = as.Date(c("2022-01-01", "2022-01-02", "2021-12-31", "2021-01-03")), -#' value = c(1, 2, 3, 5)) -#' df_coarse0 <- data.frame(site_id = c("A", "B", "C"), -#' time = c("2022", "2022", "2021"), -#' other_value = c(10, 20, 30)) -#' jdf <- post_calc_autojoin(df_fine0, df_coarse0) -#' print(jdf) -#' @importFrom data.table merge.data.table -#' @importFrom rlang as_name -#' @importFrom rlang sym -#' @export -post_calc_autojoin <- - function( - df_fine, - df_coarse, - field_sp = "site_id", - field_t = "time", - year_start = 2018L, - year_end = 2022L - ) { - if (any(grepl("population", names(df_coarse)))) { - df_coarse <- df_coarse[, -c("time"), with = FALSE] - } - common_field <- intersect(names(df_fine), names(df_coarse)) - df_fine <- data.table::as.data.table(df_fine) - df_coarse <- data.table::as.data.table(df_coarse) - df_fine <- post_calc_drop_cols(df_fine) - df_coarse <- post_calc_drop_cols(df_coarse) - # if (length(common_field) > 2) { - # message("The data frames have more than two common fields.") - # message("Trying to remove the redundant common fields...") - # common_field <- intersect(names(df_fine), names(df_coarse)) - # print(common_field) - # common_field <- - # common_field[-which(!common_field %in% c(field_sp, field_t))] - # } - if (length(common_field) == 1) { - print(common_field) - if (common_field == field_sp) { - joined <- data.table::merge.data.table( - df_fine, df_coarse, - by = field_sp, - all.x = TRUE - ) - } - } - if (length(common_field) == 2) { - if (all(common_field %in% c(field_sp, field_t))) { - # t_fine <- try(as.Date(df_fine[[field_t]][1])) - df_fine[[field_t]] <- as.character(df_fine[[field_t]]) - df_coarse[[field_t]] <- as.character(df_coarse[[field_t]]) - t_coarse <- try(as.Date(df_coarse[[field_t]][1])) - if (inherits(t_coarse, "try-error")) { - message("The time field includes years. Trying different join strategy.") - coarse_years <- sort(unique(unlist(as.integer(df_coarse[[field_t]])))) - df_coarse2 <- post_calc_df_year_expand( - df_coarse, - time_start = year_start, - time_end = year_end, - time_available = coarse_years - ) - joined <- post_calc_join_yeardate(df_coarse2, df_fine, field_t, field_t) - } else { - joined <- data.table::merge.data.table( - df_fine, df_coarse, - by = c(field_sp, field_t), - all.x = TRUE - ) - } - } - } - return(joined) - } - - - -#' Read paths from a directory with a specific file extension -#' @param path The directory path from which to read the paths. -#' @param extension The file extension to match. Defaults to ".hdf". -#' @param target_dates A character vector of length 2 containing the start and end dates. -#' @returns A character vector containing the full paths of the matching files. 
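The branch taken by post_calc_autojoin() depends only on whether the first value of the coarse table's time field parses with as.Date(); year-only strings throw an error and trigger the year-expansion path via post_calc_df_year_expand() and post_calc_join_yeardate(). A quick standalone check of that behavior:

# Daily strings parse, so the tables are merged directly on (site, time)
inherits(try(as.Date("2022-01-01"), silent = TRUE), "try-error")  # FALSE
# Year-only strings fail to parse, so the coarse table is expanded to years
inherits(try(as.Date("2022"), silent = TRUE), "try-error")        # TRUE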
-#' -#' @examples -#' # Read paths from a directory with default extension -#' read_paths("/path/to/directory") -#' -#' # Read paths from a directory with custom extension -#' read_paths("/path/to/directory", ".txt") -#' -#' @export -read_paths <- function(path, extension = ".hdf", target_dates = c("2020-01-01", "2020-01-15"), julian = FALSE) { - flist <- - list.files( - path = path, - pattern = sprintf("%s$", extension), - full.names = TRUE, - recursive = TRUE - ) - if (!missing(target_dates)) { - dateseq <- seq(as.Date(target_dates[1]), as.Date(target_dates[2]), by = "day") - dateseq <- if (julian) format(dateseq, "%Y%j") else format(dateseq, "%Y%m%d") - dateseq <- sprintf("A(%s)", paste(dateseq, collapse = "|")) - flist <- grep(dateseq, flist, value = TRUE) - } - return(flist) -} - - - -#' Search package functions -#' @param package character(1). Package name. -#' @param search character(1). Search term. -#' @returns A character vector containing the matching function names. -#' @examples -#' # Search for functions in the `amadeus` package -#' search_function("amadeus", "process_") -search_function <- function(package, search){ - library(package, character.only = TRUE) - grep(search, ls(sprintf("package:%s", package)), value = TRUE) -} - -#' Get data.frame of function parameters -#' @param functions character. Vector of function names. -#' @returns A data.frame containing the parameters of the functions. -df_params <- function(functions) { - params <- lapply(functions, function(x) { - args <- dplyr::as_tibble(lapply(as.list(formals(get(x))), \(p) list(p)), .name_repair = "minimal") - return(args) - }) - paramsdf <- Reduce(dplyr::bind_rows, params) - return(paramsdf) -} - -# schedo <- search_function("amadeus", "download_") -# sched <- search_function("amadeus", "process_") -# schec <- search_function("amadeus", "calc_") -# df_params(sched[-c(1, 2, 3, 4, 5, 6, 8, 11, 14, 15, 17, 18, 19, 20, 21, 25)]) -# df_params(schec[-c(1, 16)]) |> colnames() -# df_params(schedo) |> colnames() - - - - -#' Process atmospheric composition data by chunks (v2) -#' @description -#' Returning a single `SpatRasterDataset` object. -#' @param date character(2). length of 10. Format "YYYY-MM-DD". -#' @param path character(1). Directory with downloaded netCDF (.nc4) files. or -#' netCDF file paths. -#' @param ... Arguments passed to [`terra::rast`]. -#' @note -#' Layer names of the returned `SpatRaster` object contain the variable, -#' pressure level, date -#' Reference duration: 1 day summary, all layers: 115 seconds -#' @author Mitchell Manware, Insang Song -#' @return a `SpatRaster` object; -#' @importFrom terra rast -#' @importFrom terra time -#' @importFrom terra varnames -#' @importFrom terra crs -#' @importFrom terra subset -#' @export -process_geos_bulk <- - function(path = NULL, - date = c("2018-01-01", "2018-01-01"), - ...) 
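A standalone look at the date filter assembled inside read_paths() above; the alternation pattern is matched against acquisition-date tokens such as A20200101 in the file names (the example dates are arbitrary).

dateseq <- seq(as.Date("2020-01-01"), as.Date("2020-01-03"), by = "day")
# julian = FALSE: calendar dates
sprintf("A(%s)", paste(format(dateseq, "%Y%m%d"), collapse = "|"))
#> "A(20200101|20200102|20200103)"
# julian = TRUE would use format(dateseq, "%Y%j"), i.e. "A(2020001|2020002|2020003)"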
{ - #### directory setup - if (length(path) == 1) { - - if (dir.exists(path)) { - path <- amadeus::download_sanitize_path(path) - paths <- list.files( - path, - pattern = "GEOS-CF.v01.rpl", - full.names = TRUE - ) - paths <- paths[grep( - ".nc4", - paths - )] - } - } else { - paths <- path - } - #### check for variable - amadeus::check_for_null_parameters(mget(ls())) - #### identify file paths - #### identify dates based on user input - dates_of_interest <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = TRUE - ) - #### subset file paths to only dates of interest - data_paths <- unique( - grep( - paste( - dates_of_interest, - collapse = "|" - ), - paths, - value = TRUE - ) - ) - #### identify collection - collection <- amadeus::process_collection( - data_paths[1], - source = "geos", - collection = TRUE - ) - cat( - paste0( - "Identified collection ", - collection, - ".\n" - ) - ) - if (length(unique(collection)) > 1) { - warning( - "Multiple collections detected. Returning data for all collections.\n" - ) - } - - filename_date <- regmatches( - data_paths, - regexpr( - "20[0-9]{2}(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])", - data_paths - ) - ) - if (any(table(filename_date) < 24)) { - warning( - "Some dates include less than 24 hours. Check the downloaded files." - ) - } - if (length(unique(filename_date)) > 10) { - message( - "More than 10 unique dates detected. Try 10-day chunks..." - ) - } - - # split filename date every 10 days - filename_date <- as.Date(filename_date, format = "%Y%m%d") - filename_date_cl <- as.integer(cut(filename_date, "30 days")) - - future_inserted <- split(data_paths, filename_date_cl) - other_args <- list(...) - data_variables <- names(terra::rast(data_paths[1])) - - summary_byvar <- function(x = data_variables, fs) { - #do.call(c, - rast_in <- rlang::inject(terra::rast(fs, !!!other_args)) - terra::sds(lapply( - x, - function(v) { - rast_inidx <- grep(v, names(rast_in)) - rast_in <- rast_in[[rast_inidx]] - rast_summary <- terra::tapp(rast_in, index = "days", fun = "mean") - names(rast_summary) <- - paste0( - rep(v, terra::nlyr(rast_summary)), "_", - terra::time(rast_summary) - ) - terra::set.crs(rast_summary, "EPSG:4326") - return(rast_summary) - } - )) - } - - # summary by 10 days - # TODO: dropping furrr? - rast_10d_summary <- - furrr::future_map( - .x = future_inserted, - .f = ~summary_byvar(fs = .x), - .options = - furrr::furrr_options( - globals = c("other_args", "data_variables") - ) - ) - rast_10d_summary <- Reduce(c, rast_10d_summary) - return(rast_10d_summary) - - } - -#' Process atmospheric composition data by chunks (v3) -#' @description -#' Returning a single `SpatRasterDataset` object. -#' Removed `tapp` for performance; impose a strict assumption that -#' there are no missing values -#' @param date character(2). length of 10. Format "YYYY-MM-DD". -#' @param path character(1). Directory with downloaded netCDF (.nc4) files. or -#' netCDF file paths. -#' @param ... Arguments passed to [`terra::rast`]. 
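The date stamps that drive the chunking in process_geos_bulk() are pulled straight from the file names; a self-contained check using hypothetical GEOS-CF file names:

paths <- c(
  "GEOS-CF.v01.rpl.chm_tavg_1hr_g1440x721_v1.20180101_0030z.nc4",
  "GEOS-CF.v01.rpl.chm_tavg_1hr_g1440x721_v1.20180102_0030z.nc4"
)
# Same pattern as above: year 20xx, month 01-12, day 01-31
regmatches(paths, regexpr("20[0-9]{2}(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])", paths))
#> "20180101" "20180102"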
-#' @note -#' Layer names of the returned `SpatRaster` object contain the variable, -#' pressure level, date -#' Reference duration: 1 day summary, all layers: 106 seconds -#' hard-coded subsets for subdataset selection -#' @author Mitchell Manware, Insang Song -#' @return a `SpatRaster` object; -#' @importFrom terra rast -#' @importFrom terra time -#' @importFrom terra varnames -#' @importFrom terra crs -#' @importFrom terra subset -#' @export -calc_geos_strict <- - function(path = NULL, - date = c("2018-01-01", "2018-01-01"), - locs = NULL, - locs_id = NULL, - ...) { - #### directory setup - if (length(path) == 1) { - if (dir.exists(path)) { - # path <- amadeus::download_sanitize_path(path) - paths <- list.files( - path, - pattern = "GEOS-CF.v01.rpl", - full.names = TRUE - ) - paths <- paths[grep( - ".nc4", - paths - )] - } - } else { - paths <- path - } - #### check for variable - # amadeus::check_for_null_parameters(mget(ls())) - #### identify file paths - #### identify dates based on user input - dates_of_interest <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = TRUE - ) - dates_of_interest_incl <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = FALSE - ) - #### subset file paths to only dates of interest - data_paths <- unique( - grep( - paste( - dates_of_interest, - collapse = "|" - ), - paths, - value = TRUE - ) - ) - - #### identify collection - collection <- regmatches( - data_paths[1], - # the pattern accommodates 3-4 characters for the variable name, - # 3-4 alphanumerics for the temporal resolution, - # 8-9 alphanumerics for the output dimensions - regexpr( - "GEOS-CF.v01.rpl.(aqc|chm)_[[:alpha:]]{3,4}_[[:alnum:]]{3,4}_[[:alnum:]]{8,9}_v[1-9]", - data_paths[1] - ) - ) - cat( - paste0( - "Identified collection ", - collection, - ".\n" - ) - ) - if (length(unique(collection)) > 1) { - warning( - "Multiple collections detected. Returning data for all collections.\n" - ) - } - - filename_date <- sort(regmatches( - data_paths, - regexpr( - "20[0-9]{2}(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])", - data_paths - ) - )) - if (any(table(filename_date) < 24)) { - warning( - "Some dates include less than 24 hours. Check the downloaded files." - ) - } - - # to export locs (pointers are not exportable) - locs <- sf::st_as_sf(locs) - - # split filename dates daily - filename_date <- as.Date(filename_date, format = "%Y%m%d") - filename_date <- filename_date[filename_date %in% dates_of_interest_incl] - filename_date_cl <- as.integer(as.factor(filename_date)) - - future_inserted <- split(data_paths, filename_date_cl) - other_args <- list(...) - data_variables <- terra::describe(data_paths[1], sds = TRUE)$var - - search_variables <- - if (grepl("chm", collection)) { - c("ACET", "ALD2", "ALK4", "BCPI", "BCPO", "BENZ", "C2H6", "C3H8", "CH4", "CO", "DST1", "DST2", "DST3", "DST4", "EOH", "H2O2", "HCHO", "HNO3", "HNO4", "ISOP", "MACR", "MEK", "MVK", "N2O5", "NH3", "NH4", "NIT", "NO", "NO2", "NOy", "OCPI", "OCPO", "PAN", "PM25_RH35_GCC", "PM25_RH35_GOCART", "PM25bc_RH35_GCC", "PM25du_RH35_GCC", "PM25ni_RH35_GCC", "PM25oc_RH35_GCC", "PM25soa_RH35_GCC", "PM25ss_RH35_GCC", "PM25su_RH35_GCC", "PRPE", "RCHO", "SALA", "SALC", "SO2", "SOAP", "SOAS", "TOLU", "XYLE") - } else { - c("CO", "NO2", "O3", "SO2") - } - - # fs is the hourly file paths per day (each element with N=24) - summary_byvar <- function(x = search_variables, fs) { - rast_in <- rlang::inject(terra::rast(fs, !!!other_args)) - # strongly assume that we take the single day. 
no need to filter dates - # per variable, - # all files (hourly) are cleaned and processed - sds_proc <- - lapply( - x, - function(v) { - rast_inidx <- grep(v, data_variables) - #rast_in <- mean(rast_in[[rast_inidx]]) - rast_summary <- terra::mean(rast_in[[rast_inidx]]) - rtin <- as.Date(terra::time(rast_in)) - rtin_u <- unique(rtin) - cat(sprintf("Processing %s, date: %s\n", v, rtin_u)) - # rast_summary <- vector("list", length = length(rtin_u)) - # for (d in seq_along(rtin_u)) { - # rast_d <- rast_in[[rtin == rtin_u[d]]] - # rast_summary[[d]] <- mean(rast_d) - # } - # rast_summary <- do.call(c, rast_summary) - - # the next line is deprecated - # rast_summary <- terra::tapp(rast_in, index = "days", fun = "mean") - terra::time(rast_summary) <- rtin_u - names(rast_summary) <- - paste0( - rep(gsub("_lev=.*", "", v), terra::nlyr(rast_summary)) - ) - terra::set.crs(rast_summary, "EPSG:4326") - return(rast_summary) - } - ) - sds_proc <- terra::sds(sds_proc) - - locstr <- terra::vect(locs) - rast_ext <- terra::extract(sds_proc, locstr, ID = TRUE) - # rast_ext <- lapply(rast_ext, - # function(df) { - # df$ID <- unlist(locs[[locs_id]]) - # return(df) - # } - # ) - rast_ext <- - Reduce(function(dfa, dfb) dplyr::full_join(dfa, dfb, by = "ID"), - rast_ext - ) - rast_ext$time <- unique(as.Date(terra::time(rast_in))) - rast_ext$ID <- unlist(locs[[locs_id]])[rast_ext$ID] - names(rast_ext)[names(rast_ext) == "ID"] <- locs_id - return(rast_ext) - - } - future::plan(future::multicore, workers = 10) - rast_summary <- - future.apply::future_lapply( - future_inserted, - function(fs) summary_byvar(fs = fs) - ) - future::plan(future::sequential) - rast_summary <- data.table::rbindlist(rast_summary) - # extract - - return(rast_summary) - - } - - - - -#' Reflown gmted processing -#' -calc_gmted_direct <- function( - variable = NULL, - path = NULL, - locs = NULL, - locs_id = NULL, - win = c(-126, -62, 22, 52), - radius = 0, - fun = "mean", - ...) 
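The core of the per-variable summary in calc_geos_strict() is collapsing the 24 hourly layers of one day into a single daily mean with terra, after selecting the variable's layers by name. A toy sketch with hypothetical layer names and random values:

library(terra)
# 24 hourly layers of a single variable on a 10 x 10 grid
r <- rast(nrows = 10, ncols = 10, nlyrs = 24, vals = runif(2400))
names(r) <- paste0("CO_lev=72_hour", seq_len(24))
# Select the variable's layers by name and average them into one daily layer
daily <- mean(r[[grep("^CO", names(r))]])
nlyr(daily)  # 1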
{ - #### directory setup - path <- amadeus::download_sanitize_path(path) - #### check for length of variable - if (!(length(variable) == 2)) { - stop( - paste0( - "Please provide a vector with the statistic and resolution.\n" - ) - ) - } - #### identify statistic and resolution - statistic <- variable[1] - statistic_code <- amadeus::process_gmted_codes( - statistic, - statistic = TRUE, - invert = FALSE - ) - resolution <- variable[2] - resolution_code <- amadeus::process_gmted_codes( - resolution, - resolution = TRUE, - invert = FALSE - ) - cat(paste0( - "Cleaning ", - statistic, - " data at ", - resolution, - " resolution.\n" - )) - statistic_from <- c( - "Breakline Emphasis", "Systematic Subsample", - "Median Statistic", "Minimum Statistic", - "Mean Statistic", "Maximum Statistic", - "Standard Deviation Statistic" - ) - statistic_to <- c( - "BRKL", "SSUB", "MEDN", "MINI", "MEAN", "MAXL", "STDV" - ) - statistic_to <- - sprintf("LDU_E%s", statistic_to[match(statistic, statistic_from)]) - - #### identify file path - paths <- list.dirs( - path, - full.names = TRUE - ) - data_path <- grep(sprintf("%s%s_grd", statistic_code, as.character(resolution_code)), paths, value = TRUE) - - #### import data - data <- terra::rast(data_path, win = win) - #### layer name - names(data) <- paste0( - "elevation_", - gsub( - "_grd", - "", - names(data) - ) - ) - #### varnames - terra::varnames(data) <- paste0( - "Elevation: ", - statistic, - " (", - resolution, - ")" - ) - from <- data - #return(from) - #### prepare locations list - sites_list <- amadeus::calc_prepare_locs( - from = from, - locs = locs, - locs_id = locs_id, - radius = radius - ) - sites_e <- sites_list[[1]] - sites_id <- sites_list[[2]] - #### perform extraction - sites_extracted <- amadeus::calc_worker( - dataset = "gmted", - from = from, - locs_vector = sites_e, - locs_df = sites_id, - radius = radius, - fun = fun, - variable = 2, - time = NULL, - time_type = "timeless" - ) - #### convert integer to numeric - sites_extracted[, 2] <- as.numeric(sites_extracted[, 2]) - #### define column names - colnames(sites_extracted) <- c( - locs_id, - paste0( - statistic_to, "_", sprintf("%05d", radius) - ) - ) - #### return data.frame - return(data.frame(sites_extracted)) -} - - - -process_narr2 <- function( - date = c("2023-09-01", "2023-09-01"), - variable = NULL, - path = NULL, - ...) 
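A hedged example call to calc_gmted_direct() above (not run). The path and site layer are hypothetical, and the "7.5 arc-seconds" resolution label is an assumption; check amadeus::process_gmted_codes() for the spellings accepted by your amadeus version.

# locs: hypothetical sf point layer of AQS sites with a site_id column
elev <- calc_gmted_direct(
  variable = c("Breakline Emphasis", "7.5 arc-seconds"),
  path     = "input/gmted/",
  locs     = sf_feat_proc_aqs_sites,
  locs_id  = "site_id",
  radius   = 0
)
# Returns a data.frame with site_id and a single LDU_EBRKL_00000 column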
{ - #### directory setup - path <- amadeus::download_sanitize_path(path) - #### check for variable - amadeus::check_for_null_parameters(mget(ls())) - #### identify file paths - data_paths <- list.files( - path, - pattern = variable, - recursive = TRUE, - full.names = TRUE - ) - # data_paths <- grep( - # sprintf("%s*.*.nc", variable), - # data_paths, - # value = TRUE - # ) - #### define date sequence - date_sequence <- amadeus::generate_date_sequence( - date[1], - date[2], - sub_hyphen = TRUE - ) - #### path ncar - ym_from <- regmatches( - data_paths, - regexpr( - "2[0-9]{3,5}", - data_paths - )) - ym_of_interest <- - substr(date_sequence, - 1, ifelse(all(nchar(ym_from) == 6), 6, 4)) - ym_of_interest <- unique(ym_of_interest) - #### subset file paths to only dates of interest - data_paths_ym <- unique( - grep( - paste( - ym_of_interest, - collapse = "|" - ), - data_paths, - value = TRUE - ) - ) - - search_abbr <- list.dirs(path)[-1] - search_abbr <- sub(paste0(path, "/"), "", search_abbr) - search_to <- c( - "ATSFC", "ALBDO", "ACPRC", "DSWRF", "ACEVP", - "HCLAF", "PLBLH", "LCLAF", "LATHF", "MCLAF", - "OMEGA", "PRWTR", "PRATE", "PRSFC", "SENHF", - "SPHUM", "SNWCV", "SLMSC", "CLDCV", "ULWRF", - "UWIND", "VISIB", "VWIND", "ACSNW" - ) - search_to <- - sprintf("MET_%s", search_to[match(variable, search_abbr)]) - - #### initiate for loop - data_full <- terra::rast() - for (p in seq_along(data_paths_ym)) { - #### import data - data_year <- terra::rast(data_paths_ym[p]) - data_year_tinfo <- terra::time(data_year) - time_processed <- as.POSIXlt(data_year_tinfo) - time_this <- time_processed[1] - cat(paste0( - "Cleaning ", variable, " data for ", - sprintf( - "%s, %d %s", - strftime(time_this, "%B"), - time_this$year + 1900, - "...\n" - ) - )) - #### check for mono or pressure levels - lvinfo <- regmatches( - names(data_year), - regexpr("level=[0-9]{3,4}", names(data_year)) - ) - if (length(lvinfo) == 0) { - cat("Detected monolevel data...\n") - names(data_year) <- paste0( - search_to, "_", - gsub("-", "", data_year_tinfo) - ) - } else { - cat("Detected pressure levels data...\n") - lvinfo <- sub("level=", "", lvinfo) - lvinfo <- sprintf("%04d", as.integer(lvinfo)) - lvinfo <- paste0("L", lvinfo) - terra::time(data_year) <- as.Date( - data_year_tinfo - ) - names(data_year) <- sprintf( - "%s_%s_%s", - search_to, - lvinfo, - gsub("-", "", data_year_tinfo) - ) - } - data_full <- c( - data_full, - data_year, - warn = FALSE - ) - } - - #### subset years to dates of interest - data_full_cn <- names(data_full) - data_return <- terra::subset( - data_full, - which( - substr( - data_full_cn, - nchar(data_full_cn) - 7, - nchar(data_full_cn) - ) %in% date_sequence - ) - ) - cat(paste0( - "Returning daily ", - variable, - " data from ", - as.Date(date_sequence[1], format = "%Y%m%d"), - " to ", - as.Date( - date_sequence[length(date_sequence)], - format = "%Y%m%d" - ), - ".\n" - )) - #### return SpatRaster - return(data_return) -} - -calc_narr2 <- function( - from, - locs, - locs_id = NULL, - radius = 0, - fun = "mean", - ...) 
{ - #### prepare locations list - sites_list <- amadeus::calc_prepare_locs( - from = from, - locs = locs[, "site_id"], - locs_id = locs_id, - radius = radius - ) - sites_e <- sites_list[[1]] - sites_id <- sites_list[[2]] - #### identify pressure level or monolevel data - time_from <- terra::time(from) - timetab <- table(time_from) - if (!all(timetab == 1)) { - time_split <- - split(time_from, - #ceiling(seq_along(time_from) / 29L)) - ceiling(as.integer(as.factor(time_from)) / 14L)) - sites_extracted <- Map( - function(day) { - cat(sprintf("Processing %s...\n", paste(day[1], "-", day[length(day)]))) - from_day <- from[[time_from %in% day]] - sites_extracted_day <- terra::extract( - from_day, - sites_e, - bind = TRUE - ) - sites_extracted_day <- data.frame(sites_extracted_day) - if ("geometry" %in% names(sites_extracted_day)) { - sites_extracted_day <- sites_extracted_day |> - dplyr::select(-geometry) - } - return(sites_extracted_day) - }, - time_split - ) - sites_extracted <- reduce_merge(sites_extracted, by = c("site_id")) - } else { - sites_extracted <- - terra::extract( - from, - sites_e, - bind = TRUE - ) - sites_extracted <- as.data.frame(sites_extracted) - if ("geometry" %in% names(sites_extracted)) { - sites_extracted <- sites_extracted |> - dplyr::select(-geometry) - } - } - sites_extracted <- - sites_extracted |> - tidyr::pivot_longer(cols = tidyselect::starts_with("MET_")) |> - dplyr::rowwise() |> - dplyr::mutate( - time = regmatches(name, - regexpr( - "20[0-9]{2,2}[0-1][0-9][0-3][0-9]", - name - )) - ) |> - dplyr::mutate( - name = sub(paste0("_", time), "", name) - ) |> - dplyr::ungroup() |> - dplyr::mutate( - time = as.character(as.Date(time, format = "%Y%m%d")) - ) |> - tidyr::pivot_wider( - names_from = name, - values_from = value, - id_cols = c("site_id", "time") - ) - sites_extracted <- data.table::as.data.table(sites_extracted) - names(sites_extracted)[-1:-2] <- sprintf("%s_%05d", names(sites_extracted)[-1:-2], radius) - - #### return data.frame - return(sites_extracted) -} - -#' Impute missing values and attach lagged features -#' @note under construction. -## impute -#' Impute All Function -#' -#' This function performs imputation on a given data table by replacing missing values with imputed values. -#' It follows a series of steps including data cleaning, name cleaning, geoscn processing, NDVI 16-day backward filling, -#' zero-variance exclusion, excessive "true zeros" exclusion, and imputation using missRanger. -#' -#' @param dt The input data table to be imputed. -#' @param period The period for lagged features in the imputation process. -#' @param nthreads_dt The number of threads to be used for data.table operations. -#' @param nthreads_collapse The number of threads to be used for collapse operations. -#' @param nthreads_imputation The number of threads to be used for the imputation process. -#' -#' @return The imputed data table with lagged features. 
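The wide output of calc_narr2() comes from the long-to-wide reshape above; running the same steps on a toy extraction result (values hypothetical) makes the column handling easier to follow:

library(dplyr)
library(tidyr)
# One row per site, one column per variable-date pair, as returned by terra::extract()
ext <- data.frame(
  site_id = c("A", "B"),
  MET_ATSFC_20180101 = c(275.2, 280.1),
  MET_ATSFC_20180102 = c(276.0, 281.3)
)
ext |>
  pivot_longer(cols = starts_with("MET_")) |>
  rowwise() |>
  mutate(
    time = regmatches(name, regexpr("20[0-9]{2}[0-1][0-9][0-3][0-9]", name)),
    name = sub(paste0("_", time), "", name)
  ) |>
  ungroup() |>
  mutate(time = as.character(as.Date(time, format = "%Y%m%d"))) |>
  pivot_wider(names_from = name, values_from = value, id_cols = c("site_id", "time"))
# Result: one row per site-date with a single MET_ATSFC column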
-#' -#' @importFrom collapse set_collapse replace_inf replace_na fvar fnth -#' @importFrom data.table setDTthreads setnafill -#' @importFrom qs qread -#' @importFrom stats setNames -#' @importFrom stringi stri_replace_all_regex -#' @importFrom missRanger missRanger -#' @examples -#' dt <- data.table(a = c(1, 2, NA, 4), b = c(NA, 2, 3, 4)) -#' impute_all(dt, period = 1) -#' -#' @export -impute_all <- - function( - dt, - period, - nthreads_dt = 32L, - nthreads_collapse = 32L, - nthreads_imputation = 32L) { - library(collapse) - library(data.table) - data.table::setDTthreads(nthreads_dt) - if (is.character(dt)) { - dt <- file.path("output/qs", dt) - dt <- qs::qread(dt) - } - # name cleaning - allcns <- names(dt) - allcns_smoke <- grep("(light|medium|heavy)_", allcns) - dt <- stats::setNames(dt, sub("light_1", "OTH_HMSWL_0_00000", names(dt))) - dt <- stats::setNames(dt, sub("medium_1", "OTH_HMSWM_0_00000", names(dt))) - dt <- stats::setNames(dt, sub("heavy_1", "OTH_HMSWH_0_00000", names(dt))) - dt <- stats::setNames(dt, sub("population_", "POP_SEDAC_0_", names(dt))) - - # Copilot-generated - geoscn <- "ACET\tGEO_ACETO_0_00000 - ALD2\tGEO_ACETA_0_00000 - ALK4\tGEO_CALKA_0_00000 - BCPI\tGEO_HIBCA_0_00000 - BCPO\tGEO_HOBCA_0_00000 - BENZ\tGEO_BENZE_0_00000 - C2H6\tGEO_ETHTE_0_00000 - C3H8\tGEO_PROPA_0_00000 - CH4\tGEO_METHA_0_00000 - CO\tGEO_CMONO_0_00000 - DST1\tGEO_DUST1_0_00000 - DST2\tGEO_DUST2_0_00000 - DST3\tGEO_DUST3_0_00000 - DST4\tGEO_DUST4_0_00000 - EOH\tGEO_ETHOL_0_00000 - H2O2\tGEO_HYPER_0_00000 - HCHO\tGEO_FORMA_0_00000 - HNO3\tGEO_NITAC_0_00000 - HNO4\tGEO_PERAC_0_00000 - ISOP\tGEO_ISOPR_0_00000 - MACR\tGEO_METHC_0_00000 - MEK\tGEO_MEKET_0_00000 - MVK\tGEO_MVKET_0_00000 - N2O5\tGEO_DIPEN_0_00000 - NH3\tGEO_AMNIA_0_00000 - NH4\tGEO_AMNUM_0_00000 - NIT\tGEO_INNIT_0_00000 - NO\tGEO_NIOXI_0_00000 - NO2\tGEO_NIDIO_0_00000 - NOy\tGEO_NITRO_0_00000 - OCPI\tGEO_HIORG_0_00000 - OCPO\tGEO_HOORG_0_00000 - PAN\tGEO_PERNI_0_00000 - PM25_RH35_GCC\tGEO_PM25X_0_00000 - PM25_RH35_GOCART\tGEO_PM25R_0_00000 - PM25bc_RH35_GCC\tGEO_BLCPM_0_00000 - PM25du_RH35_GCC\tGEO_DUSPM_0_00000 - PM25ni_RH35_GCC\tGEO_NITPM_0_00000 - PM25oc_RH35_GCC\tGEO_ORCPM_0_00000 - PM25soa_RH35_GCC\tGEO_SORPM_0_00000 - PM25ss_RH35_GCC\tGEO_SEAPM_0_00000 - PM25su_RH35_GCC\tGEO_SULPM_0_00000 - PRPE\tGEO_CALKE_0_00000 - RCHO\tGEO_CALDH_0_00000 - SALA\tGEO_FSEAS_0_00000 - SALC\tGEO_CSEAS_0_00000 - SO2\tGEO_SULDI_0_00000 - SOAP\tGEO_SOAPR_0_00000 - SOAS\tGEO_SOASI_0_00000 - TOLU\tGEO_TOLUE_0_00000 - XYLE\tGEO_XYLEN_0_00000 - CO_y\tGEO_COVMR_0_00000 - NO2_y\tGEO_NOVMR_0_00000 - O3\tGEO_OZVMR_0_00000 - SO2_y\tGEO_SOVMR_0_00000" - - geoscn <- strsplit(geoscn, "\n") - geoscn <- unlist(geoscn) - geoscn <- strsplit(geoscn, "\t") - geoscn <- do.call(rbind, geoscn) - geoscndf <- as.data.frame(geoscn, stringsAsFactors = FALSE) - colnames(geoscndf) <- c("variable", "code") - geoscndf$variable <- trimws(geoscndf$variable) - - for (i in seq_len(nrow(geoscndf))) { - dt <- - setNames( - dt, - stringi::stri_replace_all_regex( - names(dt), sprintf("%s$", geoscndf$variable[i]), geoscndf$code[i] - ) - ) - } - - # NDVI 16-day - # For each site_id, backward filling for 16-day NDVI - # Last Observation Carried Forward is the method used; - # it assumes that the rows are ordered by date - dt <- dt[order(site_id, time), ] - col_ndviv <- grep("MOD_NDVIV_", names(dt)) - dtndviv <- data.table::setnafill(dt, type = "nocb", nan = NA, cols = col_ndviv) - - collapse::set_collapse(mask = "manip", nthreads = nthreads_collapse) - - target_replace <- grep("^MOD_", 
names(dt), invert = TRUE) - dt <- collapse::replace_inf(dtndviv, value = NA, replace.nan = TRUE) - dt <- collapse::replace_na(dt, value = 0, cols = target_replace) - - # zero-variance exclusion - dt_colvars <- collapse::fvar(dt[, 5:ncol(dt), with = FALSE]) - zero_var_fields <- names(dt_colvars[dt_colvars == 0]) - - # Exclude fields with zero variance using data.table - dt <- dt[, (zero_var_fields) := NULL] - - # Store the name of zero variance fields as an attribute of the input object - attr(dt, "zero_var_fields") <- zero_var_fields - - # excluding columns with excessive "true zeros" - # we should have a threshold for the zero rate - # exc_zero <- collapse::fnth(dt[, 5:ncol(dt), with = FALSE], n = 0.9) - # exc_zero <- unname(which(exc_zero == 0)) + 5L - # dt <- dt[, (exc_zero) := NULL] - - # Q: Do we use all other features to impute? -- Yes. - # 32-thread, 10% for tree building, 200 trees, 4 rounds: 11 hours - imputed <- - missRanger::missRanger( - data = dt, - maxiter = 30L, - num.trees = 300L, - num.threads = nthreads_imputation, - mtry = 50L, - sample.fraction = 0.1 - ) - - imputed <- amadeus::calc_temporal_dummies(imputed, "time") - return(imputed) - # lagged features: changing period (period[1] + 1 day) - # period <- as.Date(period) - # period[1] <- period[1] + as.difftime(1, units = "days") - # period <- as.character(period) - # index_lag <- - # sprintf("MET_%s", c("ATSFC", "ACPRC", "PRSFC", "SPHUM", "WNDSP")) - # index_lag <- grep(paste(index_lag, collapse = "|"), names(dt)) - # target_lag <- imputed[, index_lag, with = FALSE] - # target_nolag <- - - # output <- amadeus::calc_lagged(target_lag, period, 1, "site_id") - # return(output) -} - -# test -# qssf<-impute_all(qss) -# aqi <- impute_all(aq) -# aqi <- aqi |> tidytable::select(1:2, tidytable::starts_with("MOD_NDVIV")) - -# system.time( -# cgeo <- calc_geos_strict(path = "input/geos/chm_tavg_1hr_g1440x721_v1", -# date = c("2018-05-17", "2018-05-17"), -# locs = terra::vect(data.frame(site_id = 1, lon = -90, lat = 40)), -# locs_id = "site_id", -# win = c(-126, -62, 22, 52), -# snap = "out") -# ) - - -#' Append Predecessors -#' -#' This function appends predecessors to an existing object or creates a new object if none exists. -#' -#' @param path_qs The path where the predecessors will be stored. -#' @param period_new The new period to be appended. -#' @param input_new The new input object to be appended. -#' @param nthreads The number of threads to be used. -#' -#' @return If no existing predecessors are found, the function saves the new input object and returns the name of the saved file. -#' If existing predecessors are found, the function appends the new input object to the existing ones and returns the combined object. 
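The 16-day NDVI backfill in impute_all() relies on data.table::setnafill(type = "nocb"), which copies the next non-missing value backward within each column; a toy check with hypothetical NDVI values, assuming rows are already ordered by site and date as in the function:

library(data.table)
dt <- data.table(
  site_id = "A",
  time = as.Date("2018-01-01") + 0:4,
  MOD_NDVIV_0_00000 = c(NA, NA, 0.42, NA, 0.48)
)
setnafill(dt, type = "nocb", cols = "MOD_NDVIV_0_00000")
dt$MOD_NDVIV_0_00000
#> 0.42 0.42 0.42 0.48 0.48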
-#' -#' @examples -#' # Append predecessors with a new input object -#' append_predecessors(path_qs = "output/qs", period_new = c("2022-01-01", "2022-01-31"), input_new = my_data) -#' -#' # Append predecessors with an existing input object -#' append_predecessors(path_qs = "output/qs", period_new = c("2022-02-01", "2022-02-28"), input_new = my_data) -#' -#' @export -append_predecessors <- - function( - path_qs = "output/qs", - period_new = NULL, - input_new = NULL, - nthreads = 8L - ) { - if (is.null(input_new)) { - stop("Please provide a valid object.") - } - if (!dir.exists(path_qs)) { - dir.create(path_qs, recursive = TRUE) - } - input_old <- list.files(path_qs, "*.*.qs$", full.names = TRUE) - - # validate input_old with period_new - # if (length(input_old) > 0) { - # periods_old <- do.call(rbind, strsplit(input_old, "_")) - # periods_old <- periods_old[, 4:5] - # periods_old_check <- vapply( - # seq(1, nrow(periods_old)), - # function(i) { - # period_old <- periods_old[i, ] - # period_old <- as.Date(period_old, format = "%Y-%m-%d") - # period_new <- as.Date(period_new, format = "%Y-%m-%d") - # if (period_new[1] < period_old[1] | period_new[2] < period_old[2]) { - # return(FALSE) - # } else { - # return(TRUE) - # } - # }, - # logical(1) - # ) - # if (!all(periods_old_check)) { - # stop("Results have an overlap period. Please provide a valid period.") - # } - # } - period_new <- sapply(period_new, as.character) - time_create <- gsub("[[:punct:]]|[[:blank:]]", "", Sys.time()) - name_qs <- - sprintf( - "dt_feat_pm25_%s_%s_%s.qs", - period_new[1], period_new[2], time_create - ) - if (length(input_old) == 0) { - qs::qsave(input_new, file = file.path(path_qs, name_qs)) - return(name_qs) - } else { - vv <- list() - qs::qsave(input_new, file = file.path(path_qs, name_qs)) - input_update <- list.files(path_qs, "*.*.qs$", full.names = TRUE) - bound_large <- - Reduce( - function(x, y) { - if (inherits(x, "data.frame")) { - bound <- rbind(x, qs::qread(y)) - } else { - bound <- rbind(qs::qread(x), qs::qread(y)) - } - return(bound) - }, - input_update - ) - return(bound_large) - } - } - - -# nested parallelization -# IN PROGRESS -# TODO: identify bottleneck -par_nest <- - function( - path, - ... - ) { - par_grid( - path, - fun_dist = calculate, - ... - ) - } - - -## base & meta learner fitting -# strategy: -# random subsample (~30%) ; row based -# P times... - -#' Base learner: Multilayer perceptron with brulee -#' -#' Multilayer perceptron model with different configurations of -#' hidden units, dropout, activation, and learning rate using brulee -#' and tidymodels. With proper settings, users can utilize graphics -#' processing units (GPU) to speed up the training process. -#' @note Spatiotemporal cross-validation strategy is not yet implemented. -#' tune package should be 1.2.0 or higher. -#' @param dt_imputed The input data table to be used for fitting. -#' @param r_subsample The proportion of rows to be sampled. -#' @param yvar The target variable. -#' @param xvar The predictor variables. -#' @param vfold The number of folds for cross-validation. -#' @param cv_config The cross-validation configuration. To be added. -#' @param ... Additional arguments to be passed. -#' -#' @returns The fitted workflow. 
-#' @importFrom recipes recipe update_role -#' @importFrom parsnip mlp set_engine set_mode -#' @importFrom workflows workflow add_recipe add_model -#' @importFrom tune tune_grid -#' @importFrom tidyselect all_of -#' @importFrom yardstick metric_set rmse -#' @importFrom rsample vfold_cv -#' @export -fit_base_brulee <- - function( - dt_imputed, - r_subsample = 0.3, - yvar = "Arithmetic.Mean", - xvar = seq(6, ncol(dt_imputed)), - vfold = 5L, - cv_config, - ... - ) { - # 2^9=512, 2^15=32768 (#param is around 10% of selected rows) - grid_hyper_tune <- - expand.grid( - hidden_units = list(c(64, 64), c(32, 32), c(32, 32, 32), c(16, 16, 16)), - dropout = 1 / seq(4, 2, -1), - activation = c("relu", "leaky_relu"), - learn_rate = c(0.1, 0.05, 0.01) - ) - dt_imputed <- - dt_imputed %>% - slice_sample(prop = r_subsample) - - base_recipe <- - recipes::recipe( - dt_imputed - ) %>% - # do we want to normalize the predictors? - # if so, an additional definition of truly continuous variables is needed - # recipes::step_normalize(recipes::all_numeric_predictors()) %>% - recipes::update_role(!!xvar) %>% - recipes::update_role(!!yvar, new_role = "outcome") #%>% - # recipes::step_normalize(!!yvar) - - # fix this part to implement SPT CV strategy - base_vfold <- rsample::vfold_cv(dt_imputed, v = vfold) - base_model <- - parsnip::mlp( - hidden_units = tune(), - dropout = tune(), - epochs = 1000L, - activation = tune(), - learn_rate = tune() - ) %>% - parsnip::set_engine("brulee", device = "cuda") %>% - parsnip::set_mode("regression") - - wf_config <- control_resamples(save_pred = TRUE, save_workflow = TRUE) - - base_wf <- - workflows::workflow() %>% - workflows::add_recipe(base_recipe) %>% - workflows::add_model(base_model) %>% - tune::tune_grid( - resamples = base_vfold, - grid = grid_hyper_tune, - metrics = yardstick::metric_set(yardstick::rmse, yardstick::mape), - control = wf_config - ) - return(base_wf) - - } - -# dt <- qs::qread("output/dt_feat_design_imputed_061024.qs") -# dtd <- dplyr::as_tibble(dt) -# dtfit <- fit_base_brulee(dtd, r_subsample = 0.3) - - -#' Base learner: Extreme gradient boosting (XGBoost) -#' -#' XGBoost model is fitted at the defined rate (`r_subsample`) of -#' the input dataset by grid search. -#' With proper settings, users can utilize graphics -#' processing units (GPU) to speed up the training process. -#' @note Spatiotemporal cross-validation strategy is not yet implemented. -#' @param dt_imputed The input data table to be used for fitting. -#' @param r_subsample The proportion of rows to be sampled. -#' @param yvar The target variable. -#' @param xvar The predictor variables. -#' @param vfold The number of folds for cross-validation. -#' @param cv_config The cross-validation configuration. To be added. -#' @param ... Additional arguments to be passed. -#' -#' @returns The fitted workflow. -#' @importFrom recipes recipe update_role -#' @importFrom parsnip boost_tree set_engine set_mode -#' @importFrom workflows workflow add_recipe add_model -#' @importFrom tune tune_grid -#' @importFrom tidyselect all_of -#' @importFrom yardstick metric_set rmse -#' @importFrom rsample vfold_cv -#' @export -fit_base_xgb <- - function( - dt_imputed, - r_subsample = 0.3, - yvar = "Arithmetic.Mean", - xvar = seq(6, ncol(dt_imputed)), - vfold = 5L, - cv_config, - ... 
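fit_base_brulee() above returns the result of tune::tune_grid() rather than a single fitted model, so the standard tune helpers apply to its output; the same holds for the xgb and elnet fitters that follow. A hedged sketch, where dt_imputed stands for a hypothetical imputed feature table:

tuned <- fit_base_brulee(dt_imputed, r_subsample = 0.3)
# Rank hyperparameter combinations by RMSE across the resamples
tune::show_best(tuned, metric = "rmse", n = 5)
best <- tune::select_best(tuned, metric = "rmse")
# Out-of-fold predictions are available because save_pred = TRUE was set
tune::collect_predictions(tuned)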
- ) { - grid_hyper_tune <- - expand.grid( - mtry = floor(c(0.02, 0.1, 0.02) * ncol(dt_imputed)), - trees = seq(500, 3000, 500), - learn_rate = c(0.05, 0.01, 0.005, 0.001) - ) - dt_imputed <- - dt_imputed %>% - slice_sample(prop = r_subsample) - - base_recipe <- - recipes::recipe( - dt_imputed - ) %>% - # recipes::step_normalize(recipes::all_numeric_predictors()) %>% - recipes::update_role(all_of(xvar)) %>% - recipes::update_role(all_of(yvar), new_role = "outcome") - base_vfold <- rsample::vfold_cv(dt_imputed, v = 5) - base_model <- - parsnip::boost_tree( - mtry = tune(), - trees = tune(), - learn_rate = tune() - ) %>% - parsnip::set_engine("xgboost", device = "cuda") %>% - parsnip::set_mode("regression") - - wf_config <- control_resamples(save_pred = TRUE, save_workflow = TRUE) - - base_wf <- - workflows::workflow() %>% - workflows::add_recipe(base_recipe) %>% - workflows::add_model(base_model) %>% - tune::tune_grid( - resamples = base_vfold, - grid = grid_hyper_tune, - metrics = yardstick::metric_set(yardstick::rmse, yardstick::mape), - control = wf_config - ) - return(base_wf) - - } - -# dt <- qs::qread("output/dt_feat_design_imputed_061024.qs") -# dtd <- dplyr::as_tibble(dt) -# dtfitx <- fit_base_xgb(dtd, xvar = names(dtd)[6:105], r_subsample = 0.3) - - -#' Base learner: Elastic net -#' -#' Elastic net model is fitted at the defined rate (`r_subsample`) of -#' the input dataset by grid search. -#' @note Spatiotemporal cross-validation strategy is not yet implemented. -#' @param dt_imputed The input data table to be used for fitting. -#' @param r_subsample The proportion of rows to be sampled. -#' @param yvar The target variable. -#' @param xvar The predictor variables. -#' @param vfold The number of folds for cross-validation. -#' @param nthreads The number of threads to be used. Default is 16L. -#' @param cv_config The cross-validation configuration. To be added. -#' @param ... Additional arguments to be passed. -#' -#' @returns The fitted workflow. -#' @importFrom future plan multicore multisession -#' @importFrom recipes recipe update_role -#' @importFrom parsnip linear_reg set_engine set_mode -#' @importFrom workflows workflow add_recipe add_model -#' @importFrom tune tune_grid -#' @importFrom tidyselect all_of -#' @importFrom yardstick metric_set rmse -#' @importFrom rsample vfold_cv -#' @export -fit_base_elnet <- - function( - dt_imputed, - r_subsample = 0.3, - yvar = "Arithmetic.Mean", - xvar = seq(6, ncol(dt_imputed)), - vfold = 5L, - nthreads = 16L, - cv_config, - ... 
- ) { - grid_hyper_tune <- - expand.grid( - mixture = seq(0, 1, length.out = 21), - penalty = 10 ^ seq(-3, 5) - ) - dt_imputed <- - dt_imputed %>% - slice_sample(prop = r_subsample) - - base_recipe <- - recipes::recipe( - dt_imputed - ) %>% - # recipes::step_normalize(recipes::all_numeric_predictors()) %>% - recipes::update_role(all_of(xvar)) %>% - recipes::update_role(all_of(yvar), new_role = "outcome") - base_vfold <- rsample::vfold_cv(dt_imputed, v = 5) - base_model <- - parsnip::linear_reg( - mixture = tune(), - penalty = tune() - ) %>% - parsnip::set_engine("glmnet") %>% - parsnip::set_mode("regression") - - wf_config <- control_resamples(save_pred = TRUE, save_workflow = TRUE) - - future::plan(future::multicore, workers = nthreads) - base_wf <- - workflows::workflow() %>% - workflows::add_recipe(base_recipe) %>% - workflows::add_model(base_model) %>% - tune::tune_grid( - resamples = base_vfold, - grid = grid_hyper_tune, - metrics = yardstick::metric_set(yardstick::rmse, yardstick::mape), - control = wf_config, - parallel_over = "resamples" - ) - future::plan(future::sequential) - return(base_wf) - - } - -# dtfite <- fit_base_elnet(dtd, r_subsample = 0.3) - -### TODO: retrieve predictions, quick prediction checking (spatial/temporal) - - - -predict_base <- - function( - fitted, - targetdf - ) { - } - - - -predict_meta <- - function( - metalearner = NULL, - targetdf = NULL, - threads = NULL - ) { - beethoven::meta_predict( - metalearner, - targetdf, - nthreads = threads - ) - } - -export_res <- - function( - - ) { - - } - - -run_apptainer <- - function( - image_path = "/ddn/gs1/home/songi2/apptainer_build/r-image-05202024.sif", - pass_path = "/ddn/gs1/home/songi2/projects/beethoven", - inner_path = "/data", - export_file = "apptainer_out.qs", - expr = "a<-data.frame(a = 1:8, b = 11:18) - saveRDS(a, file = \"output/apptainer_out.rds\")" - ) { - expr <- - stringi::stri_replace_all_fixed( - expr, - pass_path, - inner_path - ) - - # exec, save to QS and read it back - # should note that the file is saved in the inner path - # in the container. We will have the actual file in the - # outer path. 
- ## TODO: set dynamic file path to avoid duplicates & overwriting - system( - sprintf( - "apptainer exec --writable-tmpfs --env R_PROFILE_USER=/dev/null --bind %s:%s %s Rscript -e '%s'", - pass_path, - inner_path, - image_path, - expr - ) - ) - path_target <- file.path(pass_path, "output", export_file) - readin <- qs::qread(path_target) - file.remove(path_target) - return(readin) - } - diff --git a/inst/targets/punchcard_calc.qs b/inst/targets/punchcard_calc.qs deleted file mode 100644 index bea4d00c..00000000 Binary files a/inst/targets/punchcard_calc.qs and /dev/null differ diff --git a/inst/targets/run.sh b/inst/targets/run.sh deleted file mode 100644 index 7f847776..00000000 --- a/inst/targets/run.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=pipeline_bench -#SBATCH --output=/ddn/gs1/home/songi2/projects/beethoven/pipeline_out.out -#SBATCH --error=/ddn/gs1/home/songi2/projects/beethoven/pipeline_err.err -#SBATCH --mail-type=END,FAIL -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=2 -#SBATCH --mem-per-cpu=32g -#SBATCH --partition=geo -#SBATCH --mail-user=songi2@nih.gov - -source /ddn/gs1/home/songi2/.profile -export R_LIBS_USER=$R_LIBS_USER:/ddn/gs1/biotools/R/lib64/R/library - -nohup nice -4 Rscript /ddn/gs1/home/songi2/projects/beethoven/inst/targets/targets_start.R diff --git a/inst/targets/targets_arglist.R b/inst/targets/targets_arglist.R index a32bceff..d4c7a4b6 100644 --- a/inst/targets/targets_arglist.R +++ b/inst/targets/targets_arglist.R @@ -67,7 +67,7 @@ library(beethoven) #' @param path_input Character string specifying the input path. #' Default is "input". #' -#' @returns A list of arguments for the calculation process. +#' @return A list of arguments for the calculation process. #' @importFrom qs qsave #' @export set_args_calc <- diff --git a/inst/targets/targets_baselearner.R b/inst/targets/targets_baselearner.R index 3d3c30d8..7a2ba968 100644 --- a/inst/targets/targets_baselearner.R +++ b/inst/targets/targets_baselearner.R @@ -6,301 +6,97 @@ target_baselearner <- attach_xy(dt_feat_calc_imputed, sf_feat_proc_aqs_sites) ) , - # P targets::tar_target( - name = list_feat_calc_xyt, - command = - lapply( - rep(1, 30), - function(x) { - make_subdata(dt_feat_calc_xyt, p = 0.3) - } - ), - iteration = "list" - ) - , - # length of 30 rsets - targets::tar_target( - name = list_learner_base_cv_spt, - command = - prepare_cvindex( - data = list_feat_calc_xyt, - target_cols = c("lon", "lat", "time"), - cv_make_fun = generate_cv_index, - cv_fold = 8L, - cv_pairs = 10L, - preprocessing = "normalize", - pairing = "1" - ), - pattern = map(list_feat_calc_xyt), - iteration = "list" - ) - , - # length of 30 - targets::tar_target( - name = list_learner_base_cv_spblock, - command = - prepare_cvindex( - data = list_feat_calc_xyt, - target_cols = c("lon", "lat"), - cv_make_fun = spatialsample::spatial_block_cv, - v = 10L, - method = "snake" - ), - pattern = map(list_feat_calc_xyt), + name = df_learner_type, + command = assign_learner_cv( + learner = c("xgb", "mlp", "elnet"), + cv_mode = c("spatial", "temporal", "spatiotemporal"), + cv_rep = 100L, + num_device = 2L + ) %>% + split(seq_len(nrow(.))), iteration = "list" ) , - # length of 30 targets::tar_target( - name = list_learner_base_cv_spcluster, - command = - prepare_cvindex( - data = list_feat_calc_xyt, - target_cols = c("lon", "lat"), - cv_make_fun = spatialsample::spatial_clustering_cv, - v = 10L, - cluster_function = "kmeans" - ), - pattern = map(list_feat_calc_xyt), - iteration = "list", - resources = 
set_slurm_resource(ncpus = 1L, memory = 32L, partition = "geo") - ) - , - # learn_rate branching - targets::tar_target( - name = num_learner_base_learn_device, - command = - split( - data.frame( - device = sprintf("cuda:%d", c(0, 1, 2, 3)), - rate = c(0.1, 0.05, 0.01, 0.001) - ), seq(1, 4) + name = list_base_args_cv, + command = list( + spatial = list( + target_cols = c("lon", "lat"), + cv_make_fun = generate_cv_index_sp, + v = 10L, + method = "snake" ), - description = "device and learning rate", - iteration = "list" - ) - , - # wf: workflow - # xgb-spt-cv - # length of 120 (4 * 30) - targets::tar_target( - workflow_learner_base_lgb_spt, - fit_base_lightgbm( - dt_feat_calc_imputed, - folds = list_learner_base_cv_spt, - tune_mode = "grid", - learn_rate = num_learner_base_learn_device$rate, - device = num_learner_base_learn_device$device - ), - pattern = cross(num_learner_base_learn_device, list_learner_base_cv_spt), - resources = set_slurm_resource(ncpus = 6L, memory = 20L, partition = "geo") - ) - , - # length of 120 - targets::tar_target( - workflow_learner_base_lgb_spblock, - fit_base_lightgbm( - dt_feat_calc_imputed, - folds = list_learner_base_cv_spblock, - tune_mode = "grid", - learn_rate = num_learner_base_learn_device$rate, - device = num_learner_base_learn_device$device - ), - pattern = cross(num_learner_base_learn_device, list_learner_base_cv_spblock), - resources = set_slurm_resource(ncpus = 6L, memory = 20L, partition = "geo") - ) - , - # length of 120 - targets::tar_target( - workflow_learner_base_xgb_spcluster, - fit_base_lightgbm( - dt_feat_calc_imputed, - folds = list_learner_base_cv_spcluster, - tune_mode = "grid", - learn_rate = num_learner_base_learn_device$rate, - device = num_learner_base_learn_device$device - ), - pattern = cross(num_learner_base_learn_device, list_learner_base_cv_spcluster), - resources = set_slurm_resource(ncpus = 6L, memory = 20L, partition = "geo") - ) - , - # mlp-cv: iterate by combination of rate+device and cv strategy - # length of 120 - targets::tar_target( - workflow_learner_base_mlp_spt, - fit_base_brulee( - dt_feat_calc_imputed, - folds = list_learner_base_cv_spt, - tune_mode = "grid", - learn_rate = num_learner_base_learn_device$rate, - device = num_learner_base_learn_device$device - ), - pattern = cross(num_learner_base_learn_device, list_learner_base_cv_spt), - resources = set_slurm_resource(ncpus = 6L, memory = 20L, partition = "geo,gpu") - ) - , - # length of 120 - targets::tar_target( - workflow_learner_base_mlp_spblock, - fit_base_brulee( - dt_feat_calc_imputed, - folds = list_learner_base_cv_spblock, - tune_mode = "grid", - learn_rate = num_learner_base_learn_device$rate, - device = num_learner_base_learn_device$device - ), - pattern = cross(num_learner_base_learn_device, list_learner_base_cv_spblock), - resources = set_slurm_resource(ncpus = 6L, memory = 20L, partition = "geo,gpu") - ) - , - # length of 120 - targets::tar_target( - workflow_learner_base_mlp_spcluster, - fit_base_brulee( - dt_feat_calc_imputed, - folds = list_learner_base_cv_spcluster, - tune_mode = "grid", - learn_rate = num_learner_base_learn_device$rate, - device = num_learner_base_learn_device$device - ), - pattern = cross(num_learner_base_learn_device, list_learner_base_cv_spcluster), - resources = set_slurm_resource(ncpus = 6L, memory = 20L, partition = "geo,gpu") - ) - , - # elnet-cv is branched out only by subsamples. 
- # length of 30 - targets::tar_target( - workflow_learner_base_elnet_spt, - fit_base_elnet( - dt_feat_calc_imputed, - folds = list_learner_base_cv_spt, - nthreads = 32L - ), - pattern = map(list_learner_base_cv_spt), - iteration = "list", - resources = set_slurm_resource(ncpus = 32L, memory = 8L, partition = "geo") - ) - , - # length of 30 - targets::tar_target( - workflow_learner_base_elnet_spblock, - fit_base_elnet( - dt_feat_calc_imputed, - folds = list_learner_base_cv_spblock, - nthreads = 32L - ), - pattern = map(list_learner_base_cv_spblock), - iteration = "list", - resources = set_slurm_resource(ncpus = 32L, memory = 8L, partition = "geo") - ) - , - # length of 30 - targets::tar_target( - workflow_learner_base_elnet_spcluster, - fit_base_elnet( - dt_feat_calc_imputed, - folds = list_learner_base_cv_spcluster, - nthreads = 32L - ), - pattern = map(list_learner_base_cv_spcluster), - iteration = "list", - resources = set_slurm_resource(ncpus = 32L, memory = 8L, partition = "geo") - ) - , - # combine tuning results to find the best model - # xgb and mlp were branched out by learn_rate; - # learn_rate comes first, thus subsamples are organized - # in a manner of 1 2 3 4 5 ... 30 1 2 3 4 5 ... 30 ... - # combine the results with indices of 1 31 61 91 / 2 32 62 92 / ... / 30 60 90 120. Could be changed per #subsamples. - # elnet is length of 30 (or #subsamples) - targets::tar_target( - list_learner_base_lgb_best_spt, - restore_fit_best( - workflow_learner_base_lgb_spt, - rset_full = list_learner_base_cv_spt, - df_full = dt_feat_calc_imputed, - nested = TRUE - ) - ) - , - targets::tar_target( - list_learner_base_lgb_best_spblock, - restore_fit_best( - workflow_learner_base_lgb_spblock, - rset_full = list_learner_base_cv_spblock, - df_full = dt_feat_calc_imputed, - nested = TRUE - ) - ) - , - targets::tar_target( - list_learner_base_lgb_best_spcluster, - restore_fit_best( - workflow_learner_base_lgb_spcluster, - rset_full = list_learner_base_cv_spcluster, - df_full = dt_feat_calc_imputed, - nested = TRUE - ) - ) - , - targets::tar_target( - list_learner_base_mlp_best_spt, - restore_fit_best( - workflow_learner_base_mlp_spt, - rset_full = list_learner_base_cv_spt, - df_full = dt_feat_calc_imputed, - nested = TRUE - ) - ) - , - targets::tar_target( - list_learner_base_mlp_best_spblock, - restore_fit_best( - workflow_learner_base_mlp_spblock, - rset_full = list_learner_base_cv_spblock, - df_full = dt_feat_calc_imputed, - nested = TRUE - ) - ) - , - targets::tar_target( - list_learner_base_mlp_best_spcluster, - restore_fit_best( - workflow_learner_base_mlp_spcluster, - rset_full = list_learner_base_cv_spcluster, - df_full = dt_feat_calc_imputed, - nested = TRUE - ) - ) - , - targets::tar_target( - list_learner_base_elnet_best_spt, - restore_fit_best( - workflow_learner_base_elnet_spt, - rset_full = list_learner_base_cv_spt, - df_full = dt_feat_calc_imputed, - nested = TRUE + temporal = list( + cv_fold = 10L, + time_col = "time", + window = 14L + ), + spatiotemporal = list( + target_cols = c("lon", "lat", "time"), + cv_make_fun = generate_cv_index_spt, + ngroup_init = 8L, + cv_pairs = 10L, + preprocessing = "normalize", + pairing = "1" + ) ) ) , targets::tar_target( - list_learner_base_elnet_best_spblock, - restore_fit_best( - workflow_learner_base_elnet_spblock, - rset_full = list_learner_base_cv_spblock, - df_full = dt_feat_calc_imputed, - nested = TRUE + name = list_base_params_candidates, + command = list( + lgb = + expand.grid( + mtry = floor(c(0.025, seq(0.05, 0.2, 0.05)) * 2000L), + trees 
= seq(1000, 3000, 1000), + learn_rate = c(0.1, 0.05, 0.01, 0.005) + ) + , + xgb = + expand.grid( + mtry = floor(c(0.025, seq(0.05, 0.2, 0.05)) * 2000L), + trees = seq(1000, 3000, 1000), + learn_rate = c(0.1, 0.05, 0.01, 0.005) + ) + , + mlp = + expand.grid( + hidden_units = c(1024, 512, 256, 128, 64), + dropout = 1 / seq(5, 2, -1), + activation = c("relu", "leaky_relu"), + learn_rate = c(0.1, 0.05, 0.01, 0.005) + ) + , + elnet = + expand.grid( + # 0.05 step, 0 through 1 + mixture = seq(0, 1, length.out = 21), + penalty = 10 ^ seq(-3, 5, 1) + ) ) ) , targets::tar_target( - list_learner_base_elnet_best_spcluster, - restore_fit_best( - workflow_learner_base_elnet_spcluster, - rset_full = list_learner_base_cv_spcluster, - df_full = dt_feat_calc_imputed, - nested = TRUE - ) + name = workflow_learner_base_best, + command = + fit_base_learner( + learner = df_learner_type$learner, + dt_full = dt_feat_calc_xyt, + r_subsample = 0.3, + model = + switch_model(model_type = df_learner_type$learner, + device = df_learner_type$device), + cv_mode = df_learner_type$cv_mode, + args_generate_cv = list_base_args_cv[[df_learner_type$cv_mode]], + tune_grid_in = list_base_params_candidates[[df_learner_type$learner]], + # preferably match the number of threads to the random grid size. + tune_grid_size = 10L, + nthreads = 10L + ), + pattern = map(df_learner_type), + iteration = "list", + resources = set_slurm_resource(ncpus = 10L, memory = 8L, partition = "geo") ) + ) \ No newline at end of file diff --git a/inst/targets/targets_calculate.R b/inst/targets/targets_calculate.R index ee989d3d..b3f45b11 100644 --- a/inst/targets/targets_calculate.R +++ b/inst/targets/targets_calculate.R @@ -7,7 +7,7 @@ target_calculate_fit <- list( tarchetypes::tar_files_input( name = file_prep_calc_args, - files = list.files("inst/targets", pattern = "*.*.qs$", full.names = TRUE), + files = list.files("inst/targets", pattern = "^calc*.*.qs$", full.names = TRUE), # cue = tar_invalidate(tar_older(Sys.time() - as.difftime(4, units = "weeks"))), format = "file", iteration = "vector", @@ -16,7 +16,7 @@ target_calculate_fit <- , tar_target( chr_iter_calc_features, - command = c("hms", "nlcd", "tri", "nei", + command = c("hms", "tri", "nei", "ecoregions", "koppen", "population", "groads"), iteration = "list", description = "Feature calculation" @@ -92,6 +92,57 @@ target_calculate_fit <- description = "Base feature list (all dt)" ) , + tar_target( + name = df_feat_calc_nlcd_params, + command = expand.grid( + year = loadargs(file_prep_calc_args, "nlcd")$domain, + radius = loadargs(file_prep_calc_args, "nlcd")$radius + ) %>% + split(1:nrow(.)), + iteration = "list" + ) + , + tar_target( + name = list_feat_calc_nlcd, + command = inject_nlcd(year = df_feat_calc_nlcd_params$year, + radius = df_feat_calc_nlcd_params$radius, + from = amadeus::process_nlcd( + path = loadargs(file_prep_calc_args, "nlcd")$path, + year = df_feat_calc_nlcd_params$year + ), + locs = sf_feat_proc_aqs_sites, + locs_id = arglist_common$char_siteid, + nthreads = 10L, + mode = "exact", + max_cells = 3e7 + ), + pattern = cross(file_prep_calc_args, df_feat_calc_nlcd_params), + iteration = "list", + description = "NLCD feature list", + resources = set_slurm_resource( + ntasks = 1, ncpus = 10, memory = 8 + ) + ) + , + tar_target( + name = dt_feat_calc_nlcd, + command = + list_feat_calc_nlcd %>% + collapse::rowbind(fill = TRUE) %>% + collapse::funique() %>% + collapse::pivot( + ids = c(arglist_common$char_siteid, arglist_common$char_timeid), + values = names(.)[!names(.) 
%in% c(arglist_common$char_siteid, arglist_common$char_timeid)] + ) %>% + .[!is.na(.[["value"]]),] %>% + collapse::pivot( + ids = c("site_id", "time"), + values = c("value"), + how = "wider" + ), + description = "NLCD feature list (all dt)" + ) + , tar_target( list_feat_calc_nasa, command = @@ -141,6 +192,7 @@ target_calculate_fit <- command = #rlang::inject( par_narr( domain = loadargs(file_prep_calc_args, "narr")$domain, + path = loadargs(file_prep_calc_args, "narr")$path, date = arglist_common$char_period, locs = sf_feat_proc_aqs_sites, nthreads = arglist_common$nthreads_narr @@ -218,7 +270,8 @@ target_calculate_fit <- c( list(dt_feat_proc_aqs_sites_time), list_feat_calc_base_flat, - list(dt_feat_calc_gmted) + list(dt_feat_calc_gmted), + list(dt_feat_calc_nlcd) ) ), description = "Base features with PM2.5" @@ -229,7 +282,9 @@ target_calculate_fit <- command = post_calc_autojoin( dt_feat_calc_base, - dt_feat_calc_date + dt_feat_calc_date, + year_start = as.integer(substr(arglist_common$char_period[1], 1, 4)), + year_end = as.integer(substr(arglist_common$char_period[2], 1, 4)) ), description = "data.table of all features with PM2.5" ) diff --git a/inst/targets/targets_download.R b/inst/targets/targets_download.R index 2db780a5..eae4f782 100644 --- a/inst/targets/targets_download.R +++ b/inst/targets/targets_download.R @@ -32,7 +32,7 @@ target_download <- feature_raw_download( path = file_prep_download_args, dataset_name = char_rawdir_download), - pattern = map(file_prep_download_args, char_rawdir_download), + pattern = cross(file_prep_download_args, char_rawdir_download), iteration = "list" ) ) diff --git a/inst/targets/targets_initialize.R b/inst/targets/targets_initialize.R index 35da2fcd..a60f3fe0 100644 --- a/inst/targets/targets_initialize.R +++ b/inst/targets/targets_initialize.R @@ -25,7 +25,7 @@ target_init <- full.names = TRUE ), date = arglist_common$char_period, - mode = "sparse", + mode = "available-data", data_field = c("Arithmetic.Mean", "Event.Type"), return_format = "data.table" ), diff --git a/inst/targets/targets_start.R b/inst/targets/targets_start.R index ce9ad0d4..754701a0 100644 --- a/inst/targets/targets_start.R +++ b/inst/targets/targets_start.R @@ -8,16 +8,8 @@ library(targets) # to = "_targets.R" # ) -.libPaths( - c( - "/ddn/gs1/biotools/R/lib64/R/custompkg", - .libPaths() - ) -) - - tar_make_future( - workers = 24 + workers = 16 ) # TODO: should find a way of auto-invalidate feat_calc_(modis|viirs|geoscf) # when the date range changes in the configuration. 
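The new df_feat_calc_nlcd_params target builds its branches with the expand.grid() plus row-wise split() idiom; a standalone illustration with toy years and radii (the real values come from the nlcd entry of the calculation configuration):

params <- expand.grid(year = c(2019, 2021), radius = c(1000, 10000))
params_list <- split(params, seq_len(nrow(params)))
length(params_list)  # 4 branches, one per year-radius pair
params_list[[1]]     # single-row data.frame: year = 2019, radius = 1000

Each list element then becomes one dynamic branch of list_feat_calc_nlcd through pattern = cross(file_prep_calc_args, df_feat_calc_nlcd_params).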
diff --git a/man/add_time_col.Rd b/man/add_time_col.Rd index a8343c48..2ff6ebe2 100644 --- a/man/add_time_col.Rd +++ b/man/add_time_col.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{add_time_col} \alias{add_time_col} \title{Add Time Column} diff --git a/man/append_predecessors.Rd b/man/append_predecessors.Rd index 304815ea..912a20e8 100644 --- a/man/append_predecessors.Rd +++ b/man/append_predecessors.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{append_predecessors} \alias{append_predecessors} \title{Append Predecessors} diff --git a/man/assign_learner_cv.Rd b/man/assign_learner_cv.Rd new file mode 100644 index 00000000..ca2bc70a --- /dev/null +++ b/man/assign_learner_cv.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/base_learner.R +\name{assign_learner_cv} +\alias{assign_learner_cv} +\title{Shuffle cross-validation mode for each learner type} +\usage{ +assign_learner_cv( + learner = c("lgb", "mlp", "elnet"), + cv_mode = c("spatiotemporal", "spatial", "temporal"), + cv_rep = 100L, + num_device = ifelse(torch::cuda_device_count() > 1, 2, 1) +) +} +\arguments{ +\item{learner}{character(1). The base learner to be used. +Default is "mlp". Available options are "mlp", "lgb", "elnet".} + +\item{cv_mode}{character(1). The cross-validation mode to be used. +Default is "spatiotemporal". Available options are "spatiotemporal", +"spatial", "temporal".} + +\item{cv_rep}{integer(1). The number of repetitions for each \code{cv_mode}.} + +\item{num_device}{integer(1). The number of CUDA devices to be used. +Each device will be assigned to each eligible learner (i.e., lgb, mlp).} +} +\value{ +A data frame with three columns: learner, cv_mode, and device. 
+} +\description{ +Shuffle cross-validation mode for each learner type +} +\keyword{Baselearner} diff --git a/man/attach_xy.Rd b/man/attach_xy.Rd index 45ae01d2..8f9887e8 100644 --- a/man/attach_xy.Rd +++ b/man/attach_xy.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/base_learner.R \name{attach_xy} \alias{attach_xy} \title{Attach XY coordinates to a data frame} diff --git a/man/calc_geos_strict.Rd b/man/calc_geos_strict.Rd index 5344e427..f82ed774 100644 --- a/man/calc_geos_strict.Rd +++ b/man/calc_geos_strict.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calculate.R \name{calc_geos_strict} \alias{calc_geos_strict} -\title{Process atmospheric composition data by chunks (v3)} +\title{Process atmospheric composition data by chunks} \usage{ calc_geos_strict( path = NULL, diff --git a/man/calc_gmted_direct.Rd b/man/calc_gmted_direct.Rd index ff62dd4f..30f35730 100644 --- a/man/calc_gmted_direct.Rd +++ b/man/calc_gmted_direct.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calculate.R \name{calc_gmted_direct} \alias{calc_gmted_direct} \title{Reflown gmted processing} diff --git a/man/calc_narr2.Rd b/man/calc_narr2.Rd index 52f9099b..dc55689a 100644 --- a/man/calc_narr2.Rd +++ b/man/calc_narr2.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calculate.R \name{calc_narr2} \alias{calc_narr2} \title{Calculate aggregated values for specified locations} diff --git a/man/calculate.Rd b/man/calculate.Rd index 1b37c536..681ad209 100644 --- a/man/calculate.Rd +++ b/man/calculate.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/injection.R \name{calculate} \alias{calculate} \title{Spatiotemporal covariate calculation} diff --git a/man/convert_cv_index_rset.Rd b/man/convert_cv_index_rset.Rd index 708d3e87..46229070 100644 --- a/man/convert_cv_index_rset.Rd +++ b/man/convert_cv_index_rset.Rd @@ -1,17 +1,30 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/base_learner.R \name{convert_cv_index_rset} \alias{convert_cv_index_rset} \title{Generate manual rset object from spatiotemporal cross-validation indices} \usage{ -convert_cv_index_rset(cvindex, data, ref_list = NULL, cv_mode = "spt") +convert_cv_index_rset( + cvindex, + data, + ref_list = attr(cvindex, "ref_list"), + cv_mode = c("spatiotemporal", "spatial", "temporal") +) } \arguments{ -\item{cvindex}{integer length of nrow(data).} +\item{cvindex}{One of: +\itemize{ +\item integer row indices for \code{id_out} in a \code{rset} object. +\item List of integer row indices stored in elements named \code{analysis} and +\code{assessment}. +}} -\item{data}{data.frame.} +\item{data}{data.frame object from which the \code{cvindex} is used +to create an \code{rset} object} -\item{ref_list}{List of custom reference indices. Default is NULL. +\item{ref_list}{List of custom reference group indices. +Default is \code{attr(cvindex, "ref_list")}, where it is assumed that \code{cvindex} +contains a \code{list} attribute named "ref_list".
if not NULL, it will be used as a reference instead of max(cvindex).} \item{cv_mode}{character(1). Spatiotemporal cross-validation indexing diff --git a/man/df_params.Rd b/man/df_params.Rd deleted file mode 100644 index 53b8c80b..00000000 --- a/man/df_params.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R -\name{df_params} -\alias{df_params} -\title{Get data.frame of function parameters} -\usage{ -df_params(functions) -} -\arguments{ -\item{functions}{character. Vector of function names.} -} -\value{ -A data.frame containing the parameters of the functions. -} -\description{ -Get data.frame of function parameters -} -\keyword{Utility} diff --git a/man/divisor.Rd b/man/divisor.Rd index ae6dea3d..d8e8f38a 100644 --- a/man/divisor.Rd +++ b/man/divisor.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/processing_misc.R \name{divisor} \alias{divisor} \title{Get Divisors} diff --git a/man/feature_raw_download.Rd b/man/feature_raw_download.Rd index f74a882f..471c33e4 100644 --- a/man/feature_raw_download.Rd +++ b/man/feature_raw_download.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/injection.R \name{feature_raw_download} \alias{feature_raw_download} \title{Check file status and download if necessary} diff --git a/man/figures/beethoven-logo.png b/man/figures/beethoven-logo.png new file mode 100644 index 00000000..d1ab8663 Binary files /dev/null and b/man/figures/beethoven-logo.png differ diff --git a/man/figures/pipeline-code-relations.svg b/man/figures/pipeline-code-relations.svg new file mode 100644 index 00000000..027c604e --- /dev/null +++ b/man/figures/pipeline-code-relations.svg @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + Pipeline configuration +- inst/targets/download*.qs +- inst/targets/calc*.qs + Pipeline execution +- _targets.R +- run_interactive.sh +- run_slurm.sh +- inst/targets/targets_start.R + Pipeline targets +- inst/targets/targets_*.R (except for targets_start.R) + + + + diff --git a/man/figures/pipeline-schema.svg b/man/figures/pipeline-schema.svg new file mode 100644 index 00000000..fbe0feb8 --- /dev/null +++ b/man/figures/pipeline-schema.svg @@ -0,0 +1,317 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Pipeline configuration +- inst/targets/download*.qs +- inst/targets/calc*.qs + beethoven +package +functions + list object + Built targets + Pipeline targets +- inst/targets/targets_*.R (except for targets_start.R) + + + + + _targets.R + tar_config_set(store=...) + + stored + + + + injection + + diff --git a/man/fit_base_brulee.Rd b/man/fit_base_brulee.Rd deleted file mode 100644 index 08ca8d19..00000000 --- a/man/fit_base_brulee.Rd +++ /dev/null @@ -1,73 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R -\name{fit_base_brulee} -\alias{fit_base_brulee} -\title{Base learner: Multilayer perceptron with brulee} -\usage{ -fit_base_brulee( - dt_imputed, - folds = NULL, - tune_mode = "grid", - tune_bayes_iter = 50L, - learn_rate = 0.1, - yvar = "Arithmetic.Mean", - xvar = seq(5, ncol(dt_imputed)), - vfold = 5L, - device = "cuda:0", - trim_resamples = TRUE, - return_best = FALSE, - ... 
-) -} -\arguments{ -\item{dt_imputed}{The input data table to be used for fitting.} - -\item{folds}{pre-generated rset object with minimal number of columns. -If NULL, \code{vfold} should be numeric to be used in \link[rsample:vfold_cv]{rsample::vfold_cv}.} - -\item{tune_mode}{character(1). Hyperparameter tuning mode. -Default is "grid", "bayes" is acceptable.} - -\item{tune_bayes_iter}{integer(1). The number of iterations for -Bayesian optimization. Default is 50. Only used when \code{tune_mode = "bayes"}.} - -\item{learn_rate}{The learning rate for the model. For branching purpose. -Default is 0.1.} - -\item{yvar}{The target variable.} - -\item{xvar}{The predictor variables.} - -\item{vfold}{The number of folds for cross-validation.} - -\item{device}{The device to be used for training. -Default is "cuda:0". Make sure that your system is equipped -with CUDA-enabled graphical processing units.} - -\item{trim_resamples}{logical(1). Default is TRUE, which replaces the actual -data.frames in splits column of \code{tune_results} object with NA.} - -\item{return_best}{logical(1). If TRUE, the best tuned model is returned.} - -\item{...}{Additional arguments to be passed.} -} -\value{ -The fitted workflow. -} -\description{ -Multilayer perceptron model with different configurations of -hidden units, dropout, activation, and learning rate using brulee -and tidymodels. With proper settings, users can utilize graphics -processing units (GPU) to speed up the training process. -} -\details{ -Hyperparameters \code{hidden_units}, \code{dropout}, \code{activation}, -and \code{learn_rate} are tuned. \verb{With tune_mode = "grid"}, -users can modify \code{learn_rate} explicitly, and other hyperparameters -will be predefined (56 combinations per \code{learn_rate}). -} -\note{ -tune package should be 1.2.0 or higher. -brulee should be installed with GPU support. -} -\keyword{Baselearner} diff --git a/man/fit_base_elnet.Rd b/man/fit_base_elnet.Rd deleted file mode 100644 index a2aafc54..00000000 --- a/man/fit_base_elnet.Rd +++ /dev/null @@ -1,55 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R -\name{fit_base_elnet} -\alias{fit_base_elnet} -\title{Base learner: Elastic net} -\usage{ -fit_base_elnet( - dt_imputed, - folds = NULL, - yvar = "Arithmetic.Mean", - xvar = seq(5, ncol(dt_imputed)), - tune_mode = "grid", - tune_bayes_iter = 50L, - vfold = 5L, - nthreads = 16L, - trim_resamples = TRUE, - return_best = FALSE, - ... -) -} -\arguments{ -\item{dt_imputed}{The input data table to be used for fitting.} - -\item{folds}{pre-generated rset object with minimal number of columns. -If NULL, \code{vfold} should be numeric to be used in \link[rsample:vfold_cv]{rsample::vfold_cv}.} - -\item{yvar}{The target variable.} - -\item{xvar}{The predictor variables.} - -\item{tune_mode}{character(1). Hyperparameter tuning mode. -Default is "grid", "bayes" is acceptable.} - -\item{tune_bayes_iter}{integer(1). The number of iterations for -Bayesian optimization. Default is 50. Only used when \code{tune_mode = "bayes"}.} - -\item{vfold}{The number of folds for cross-validation.} - -\item{nthreads}{The number of threads to be used. Default is 16L.} - -\item{return_best}{logical(1). If TRUE, the best tuned model is returned.} - -\item{...}{Additional arguments to be passed.} -} -\value{ -The fitted workflow. -} -\description{ -Elastic net model is fitted at the defined rate (\code{r_subsample}) of -the input dataset by grid search. 
-} -\note{ -tune package should be 1.2.0 or higher. -} -\keyword{Baselearner} diff --git a/man/fit_base_learner.Rd b/man/fit_base_learner.Rd new file mode 100644 index 00000000..a5277388 --- /dev/null +++ b/man/fit_base_learner.Rd @@ -0,0 +1,126 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/base_learner.R +\name{fit_base_learner} +\alias{fit_base_learner} +\title{Base learner: tune hyperparameters and retrieve the best model} +\usage{ +fit_base_learner( + learner = c("mlp", "xgb", "lgb", "elnet"), + dt_full, + r_subsample = 0.3, + model = NULL, + folds = 5L, + cv_mode = c("spatiotemporal", "spatial", "temporal"), + args_generate_cv = NULL, + tune_mode = "grid", + tune_bayes_iter = 10L, + tune_grid_in = NULL, + tune_grid_size = 10L, + learn_rate = 0.1, + yvar = "Arithmetic.Mean", + xvar = seq(5, ncol(dt_sample)), + nthreads = 8L, + trim_resamples = FALSE, + return_best = TRUE, + ... +) +} +\arguments{ +\item{learner}{character(1). The base learner to be used. +Default is "mlp". Available options are "mlp", "xgb", "lgb", "elnet".} + +\item{dt_full}{The full data table to be used for prediction.} + +\item{r_subsample}{numeric(1). The proportion of rows to be used.} + +\item{model}{The parsnip model object. Preferably generated from +\code{switch_model}.} + +\item{folds}{integer(1). Number of cross-validation folds. +If NULL, \code{cv_mode} should be defined to be used in \link[rsample:vfold_cv]{rsample::vfold_cv}.} + +\item{cv_mode}{character(1). +Cross-validation mode. Default is "spatiotemporal". +Available options are "spatiotemporal", "spatial", "temporal".} + +\item{args_generate_cv}{List of arguments to be passed to +\code{switch_generate_cv_rset} function.} + +\item{tune_mode}{character(1). Hyperparameter tuning mode. +Default is "grid", "bayes" is acceptable.} + +\item{tune_bayes_iter}{integer(1). The number of iterations for +Bayesian optimization. Default is 10. Only used when \code{tune_mode = "bayes"}.} + +\item{tune_grid_in}{data.frame object that includes the grid for +hyperparameter tuning. \code{tune_grid_size} rows will be randomly picked +from this data.frame for grid search.} + +\item{tune_grid_size}{integer(1). The number of grid size for hyperparameter +tuning. Default is 10. Only used when \code{tune_mode = "grid"}.} + +\item{learn_rate}{The learning rate for the model. For branching purpose. +Default is 0.1.} + +\item{yvar}{The target variable.} + +\item{xvar}{The predictor variables.} + +\item{nthreads}{integer(1). The number of threads to be used for +tuning. Default is 8L. \code{learner = "elnet"} will utilize the multiple +threads in \code{\link[future:multicore]{future::multicore()}} plan.} + +\item{trim_resamples}{logical(1). Default is TRUE, which replaces the actual +data.frames in splits column of \code{tune_results} object with NA.} + +\item{return_best}{logical(1). If TRUE, the best tuned model is returned.} + +\item{...}{Additional arguments to be passed.} +} +\value{ +The fitted workflow. +} +\description{ +Multilayer perceptron model with different configurations of +hidden units, dropout, activation, and learning rate using brulee +and tidymodels. With proper settings, users can utilize graphics +processing units (GPU) to speed up the training process. +} +\details{ +LightGBM model is fitted at the defined rate (\code{r_subsample}) of +the input dataset by grid or Bayesian optimization search. +With proper settings, users can utilize graphics +processing units (GPU) to speed up the training process. 
+ +XGBoost model is fitted at the defined rate (\code{r_subsample}) of +the input dataset by grid or Bayesian optimization search. +With proper settings, users can utilize graphics +processing units (GPU) to speed up the training process. + +Elastic net model is fitted at the defined rate (\code{r_subsample}) of +the input dataset by grid search or Bayesian optimization. +\itemize{ +\item MLP: Hyperparameters \code{hidden_units}, \code{dropout}, \code{activation}, +and \code{learn_rate} are tuned. \verb{With tune_mode = "grid"}, +users can modify \code{learn_rate} explicitly, and other hyperparameters +will be predefined (56 combinations per \code{learn_rate} for mlp). +\item XGBoost: Hyperparameters \code{mtry}, \code{ntrees}, and \code{learn_rate} are +tuned. With \code{tune_mode = "grid"}, +users can modify \code{learn_rate} explicitly, and other hyperparameters +will be predefined (30 combinations per \code{learn_rate}). +\item LightGBM: Hyperparameters \code{mtry}, \code{ntrees}, and \code{learn_rate} are +tuned. With \code{tune_mode = "grid"}, +users can modify \code{learn_rate} explicitly, and other hyperparameters +will be predefined (30 combinations per \code{learn_rate}). +\item Elastic net: Hyperparameters \code{mixture} and \code{penalty} are tuned. +} + +Tuning is performed based on random grid search (size = 10). +} +\note{ +tune package should be 1.2.0 or higher. +brulee, xgboost, and lightgbm should be installed with GPU support. +Grid search is not activated in this function, regardless of other parts' +description. +} +\keyword{Baselearner} diff --git a/man/fit_base_lightgbm.Rd b/man/fit_base_lightgbm.Rd deleted file mode 100644 index c1bad054..00000000 --- a/man/fit_base_lightgbm.Rd +++ /dev/null @@ -1,74 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R -\name{fit_base_lightgbm} -\alias{fit_base_lightgbm} -\title{Base learner: Light Gradient Boosting Machine (LightGBM)} -\usage{ -fit_base_lightgbm( - dt_imputed, - folds = NULL, - tune_mode = "grid", - tune_bayes_iter = 50L, - learn_rate = 0.1, - yvar = "Arithmetic.Mean", - xvar = seq(5, ncol(dt_imputed)), - vfold = 5L, - device = "gpu", - trim_resamples = TRUE, - return_best = FALSE, - ... -) -} -\arguments{ -\item{dt_imputed}{The input data table to be used for fitting.} - -\item{folds}{pre-generated rset object with minimal number of columns. -If NULL, \code{vfold} should be numeric to be used in \link[rsample:vfold_cv]{rsample::vfold_cv}.} - -\item{tune_mode}{character(1). Hyperparameter tuning mode. -Default is "grid", "bayes" is acceptable.} - -\item{tune_bayes_iter}{integer(1). The number of iterations for -Bayesian optimization. Default is 50. Only used when \code{tune_mode = "bayes"}.} - -\item{learn_rate}{The learning rate for the model. For branching purpose. -Default is 0.1.} - -\item{yvar}{The target variable.} - -\item{xvar}{The predictor variables.} - -\item{vfold}{The number of folds for cross-validation.} - -\item{device}{The device to be used for training. -Default is \code{"gpu"}. Make sure that your system is equipped -with OpenCL-capable graphical processing units. -A GPU-enabled version of LightGBM should be installed.} - -\item{trim_resamples}{logical(1). Default is TRUE, which replaces the actual -data.frames in splits column of \code{tune_results} object with NA.} - -\item{return_best}{logical(1). If TRUE, the best tuned model is returned.} - -\item{...}{Additional arguments to be passed.} -} -\value{ -The fitted workflow. 
-} -\description{ -LightGBM model is fitted at the defined rate (\code{r_subsample}) of -the input dataset by grid or Bayesian optimization search. -With proper settings, users can utilize graphics -processing units (GPU) to speed up the training process. -} -\details{ -Hyperparameters \code{mtry}, \code{ntrees}, and \code{learn_rate} are -tuned. With \code{tune_mode = "grid"}, -users can modify \code{learn_rate} explicitly, and other hyperparameters -will be predefined (30 combinations per \code{learn_rate}). -} -\note{ -tune package should be 1.2.0 or higher. -xgboost should be installed with GPU support. -} -\keyword{Baselearner} diff --git a/man/fit_base_tune.Rd b/man/fit_base_tune.Rd new file mode 100644 index 00000000..38ad0b9a --- /dev/null +++ b/man/fit_base_tune.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/base_learner.R +\name{fit_base_tune} +\alias{fit_base_tune} +\title{Tune base learner} +\usage{ +fit_base_tune( + recipe, + model, + resample, + tune_mode = c("bayes", "grid"), + grid = NULL, + iter_bayes = 10L, + trim_resamples = TRUE, + return_best = TRUE, + data_full = NULL +) +} +\arguments{ +\item{recipe}{The recipe object.} + +\item{model}{The model object.} + +\item{resample}{The resample object. It is expected to be generated from the +subsamples.} + +\item{tune_mode}{character(1). Hyperparameter tuning mode. +Default is "bayes", "grid" is acceptable.} + +\item{grid}{The grid object for hyperparameter tuning.} + +\item{trim_resamples}{logical(1). Default is TRUE, which replaces the actual +data.frames in splits column of \code{tune_results} object with NA.} + +\item{return_best}{logical(1). If TRUE, the best tuned model is returned.} + +\item{data_full}{The full data frame to be used for prediction.} +} +\value{ +List of 3: +\itemize{ +\item \code{base_prediction}: \code{data.frame} of the best model prediction. +\item \code{base_parameter}: \code{tune_results} object of the best model. +\item \code{best_performance}: \code{data.frame} of the performance metrics. It +includes RMSE, MAPE, R-squared, and MAE for \strong{all} tuned models. +} +} +\description{ +Tune base learner +} +\keyword{Baselearner} +\keyword{internal} diff --git a/man/fit_base_xgb.Rd b/man/fit_base_xgb.Rd deleted file mode 100644 index 7ece22f8..00000000 --- a/man/fit_base_xgb.Rd +++ /dev/null @@ -1,73 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R -\name{fit_base_xgb} -\alias{fit_base_xgb} -\title{Base learner: Extreme gradient boosting (XGBoost)} -\usage{ -fit_base_xgb( - dt_imputed, - folds = NULL, - tune_mode = "grid", - tune_bayes_iter = 50L, - learn_rate = 0.1, - yvar = "Arithmetic.Mean", - xvar = seq(5, ncol(dt_imputed)), - vfold = 5L, - device = "cuda:0", - trim_resamples = TRUE, - return_best = FALSE, - ... -) -} -\arguments{ -\item{dt_imputed}{The input data table to be used for fitting.} - -\item{folds}{pre-generated rset object with minimal number of columns. -If NULL, \code{vfold} should be numeric to be used in \link[rsample:vfold_cv]{rsample::vfold_cv}.} - -\item{tune_mode}{character(1). Hyperparameter tuning mode. -Default is "grid", "bayes" is acceptable.} - -\item{tune_bayes_iter}{integer(1). The number of iterations for -Bayesian optimization. Default is 50. Only used when \code{tune_mode = "bayes"}.} - -\item{learn_rate}{The learning rate for the model. For branching purpose. 
-Default is 0.1.} - -\item{yvar}{The target variable.} - -\item{xvar}{The predictor variables.} - -\item{vfold}{The number of folds for cross-validation.} - -\item{device}{The device to be used for training. -Default is "cuda:0". Make sure that your system is equipped -with CUDA-enabled graphical processing units.} - -\item{trim_resamples}{logical(1). Default is TRUE, which replaces the actual -data.frames in splits column of \code{tune_results} object with NA.} - -\item{return_best}{logical(1). If TRUE, the best tuned model is returned.} - -\item{...}{Additional arguments to be passed.} -} -\value{ -The fitted workflow. -} -\description{ -XGBoost model is fitted at the defined rate (\code{r_subsample}) of -the input dataset by grid search. -With proper settings, users can utilize graphics -processing units (GPU) to speed up the training process. -} -\details{ -Hyperparameters \code{mtry}, \code{ntrees}, and \code{learn_rate} are -tuned. With \code{tune_mode = "grid"}, -users can modify \code{learn_rate} explicitly, and other hyperparameters -will be predefined (30 combinations per \code{learn_rate}). -} -\note{ -tune package should be 1.2.0 or higher. -xgboost should be installed with GPU support. -} -\keyword{Baselearner} diff --git a/man/fit_meta_learner.Rd b/man/fit_meta_learner.Rd new file mode 100644 index 00000000..67dac7d2 --- /dev/null +++ b/man/fit_meta_learner.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/meta_learner.R +\name{fit_meta_learner} +\alias{fit_meta_learner} +\title{Fit meta learner} +\usage{ +fit_meta_learner( + data, + p_col_sel = 0.5, + rset = NULL, + yvar = "Arithmetic.Mean", + xvar = character(0), + tune_iter = 50L +) +} +\arguments{ +\item{data}{data.frame. Full data.} + +\item{p_col_sel}{numeric(1). Rate of column resampling. Default is 0.5.} + +\item{rset}{rset object. Specification of training/test sets.} + +\item{yvar}{character(1). Outcome variable name} + +\item{xvar}{character. Feature names.} + +\item{tune_iter}{integer(1). Bayesian optimization iterations. +Default is 50.} +} +\value{ +List of 3, including the best-fit model, the best hyperparameters, +and the all performance records from \code{tune::tune_bayes()}. +Note that the meta learner function returns the best-fit model, +not predicted values. +} +\description{ +This function subsets the full data by column subsamples (rate=50\%) +The optimal hyperparameter search is performed based on spatial, +temporal, and spatiotemporal cross-validation schemes. +As of version 0.4.0, the function relies on RMSE to select the +best hyperparameter set. +} +\keyword{meta_learner} diff --git a/man/generate_cv_index.Rd b/man/generate_cv_index.Rd deleted file mode 100644 index ee3a9139..00000000 --- a/man/generate_cv_index.Rd +++ /dev/null @@ -1,67 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R -\name{generate_cv_index} -\alias{generate_cv_index} -\title{Generate spatio-temporal cross-validation index with anticlust} -\usage{ -generate_cv_index( - data, - target_cols = c("lon", "lat", "time"), - preprocessing = c("none", "normalize", "standardize"), - cv_fold = 5L, - cv_pairs = NULL, - pairing = c("1", "2"), - cv_mode = "spt", - ... -) -} -\arguments{ -\item{data}{data.table with X, Y, and time information.} - -\item{target_cols}{character(3). Names of columns for X, Y, and time. -Default is c("lon", "lat", "time"). -Order insensitive.} - -\item{preprocessing}{character(1). 
Preprocessing method. -\itemize{ -\item "none": no preprocessing. -\item "normalize": normalize the data. -\item "standardize": standardize the data. -}} - -\item{cv_fold}{integer(1). Number of folds for cross-validation. -default is 5L.} - -\item{cv_pairs}{integer(1). Number of pairs for cross-validation. -This value will be used to generate a rank-based pairs -based on \code{target_cols} values.} - -\item{pairing}{character(1) Pair selection method. -\itemize{ -\item "1": search the nearest for each cluster then others -are selected based on the rank. -\item "2": rank the pairwise distances directly -}} - -\item{cv_mode}{character(1). Spatiotemporal cross-validation indexing} - -\item{...}{Additional arguments to be passed.} -} -\value{ -rsample::manual_rset() object. -} -\description{ -This function generates a spatio-temporal cross-validation index -based on the anticlust package. The function first calculates the -spatial clustering index using the balanced_clustering function as -default, and if \code{cv_pairs} is provided, -it generates rank-based pairs based on the proximity between -cluster centroids. -} -\note{ -nrow(data) \%\% cv_fold should be 0. -} -\author{ -Insang Song -} -\keyword{Baselearner} diff --git a/man/prepare_cvindex.Rd b/man/generate_cv_index_sp.Rd similarity index 58% rename from man/prepare_cvindex.Rd rename to man/generate_cv_index_sp.Rd index f3c8bee6..4dd68ca6 100644 --- a/man/prepare_cvindex.Rd +++ b/man/generate_cv_index_sp.Rd @@ -1,12 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R -\name{prepare_cvindex} -\alias{prepare_cvindex} +% Please edit documentation in R/base_learner.R +\name{generate_cv_index_sp} +\alias{generate_cv_index_sp} \title{Prepare spatial and spatiotemporal cross validation sets} \usage{ -prepare_cvindex( +generate_cv_index_sp( data, - r_subsample = 0.3, target_cols = c("lon", "lat"), cv_make_fun = spatialsample::spatial_block_cv, ... @@ -15,8 +14,6 @@ prepare_cvindex( \arguments{ \item{data}{data.table with X, Y, and time information.} -\item{r_subsample}{The proportion of rows to be sampled.} - \item{target_cols}{character(3). Names of columns for X, Y. Default is \code{c("lon", "lat")}. It is passed to sf::st_as_sf to subsequently generate spatial cross-validation indices using @@ -24,16 +21,21 @@ subsequently generate spatial cross-validation indices using \code{spatialsample::spatial_clustering_cv}.} \item{cv_make_fun}{function(1). Function to generate spatial -cross-validation indices. Default is \code{spatialsample::spatial_block_cv}.} +cross-validation indices. +Default is \code{spatialsample::spatial_block_cv}.} + +\item{...}{Additional arguments to be passed to \code{cv_make_fun}.} } \value{ -rsample::manual_rset() object. +A list of numeric vectors with in- and out-of-sample row indices or +a numeric vector with out-of-sample indices. 
} \description{ Prepare spatial and spatiotemporal cross validation sets } \seealso{ -\code{\link{generate_cv_index}} \code{\link[spatialsample:spatial_block_cv]{spatialsample::spatial_block_cv}} -\code{\link[spatialsample:spatial_clustering_cv]{spatialsample::spatial_clustering_cv}} +\code{\link[spatialsample:spatial_block_cv]{spatialsample::spatial_block_cv}}, +\code{\link[spatialsample:spatial_clustering_cv]{spatialsample::spatial_clustering_cv}}, +\code{\link[spatialsample:spatial_vfold]{spatialsample::spatial_buffer_vfold_cv}} } \keyword{Baselearner} diff --git a/man/generate_cv_index_spt.Rd b/man/generate_cv_index_spt.Rd new file mode 100644 index 00000000..205f50a1 --- /dev/null +++ b/man/generate_cv_index_spt.Rd @@ -0,0 +1,107 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/base_learner.R +\name{generate_cv_index_spt} +\alias{generate_cv_index_spt} +\title{Generate spatio-temporal cross-validation index with anticlust} +\usage{ +generate_cv_index_spt( + data, + target_cols = c("lon", "lat", "time"), + preprocessing = c("none", "normalize", "standardize"), + ngroup_init = 5L, + cv_pairs = NULL, + pairing = c("1", "2"), + ... +) +} +\arguments{ +\item{data}{data.table with X, Y, and time information.} + +\item{target_cols}{character(3). Names of columns for X, Y, and time. +Default is c("lon", "lat", "time"). Order insensitive.} + +\item{preprocessing}{character(1). Preprocessing method for the fields +defined in \code{target_cols}. This serves to homogenize the scale of +the data. Default is "none". +\itemize{ +\item "none": no preprocessing. +\item "normalize": normalize the data. +\item "standardize": standardize the data. +}} + +\item{ngroup_init}{integer(1). Initial number of splits for +pairing groups. Default is 5L.} + +\item{cv_pairs}{integer(1). Number of pairs for cross-validation. +This value will be used to generate a rank-based pairs +based on \code{target_cols} values.} + +\item{pairing}{character(1) Pair selection method. +\itemize{ +\item "1": search the nearest for each cluster then others +are selected based on the rank. +\item "2": rank the pairwise distances directly +}} + +\item{...}{Additional arguments to be passed.} +} +\value{ +List of numeric vectors with balanced cluster numbers and +reference lists of assessment set pair numbers in attributes. +} +\description{ +This function generates a spatio-temporal cross-validation index +based on the anticlust package. The function first calculates the +spatial clustering index using the \code{\link[anticlust:balanced_clustering]{anticlust::balanced_clustering()}} +function as default, and if \code{cv_pairs} is provided, it generates rank-based +pairs based on the proximity between cluster centroids. +\code{cv_pairs} can be NULL, in which case only the spatial clustering index +is generated. \code{ngroup_init} should be lower than \code{cv_pairs}, while +it imposes a condition that \code{nrow(data) \%\% ngroup_init} should be 0 +and \code{cv_pairs} should be less than the number of 2-combinations of +\code{ngroup_init}. Each training set will get 50\% overlap +with adjacent training sets. "Pairs (combinations)" are selected +based on the rank of centroids of \code{ngroup_init} number of initial +clusters, where users have two options. +} +\details{ +\itemize{ +\item Mode "1" assigns at least one pair for each +initial cluster, meaning that \code{ngroup_init} pairs are assigned for each +initial cluster, then the remaining pairs will be ranked to finalize +the \code{cv_pairs} sets. 
+\item Mode "2" will rank the pairwise distances +directly, which may ignore some overly large initial clusters for pairing. +} + +Of course, mode "2" is faster than mode "1", thus users are advised +to use mode "2" when they are sure that the initial clusters are +spatially uniformly distributed. +} +\note{ +\code{nrow(data) \%\% ngroup_init} should be 0. This is a required +condition for the anticlust::balanced_clustering(). +} +\examples{ +library(data.table) +data <- data.table( + lon = runif(100), + lat = runif(100), + time = + rep( + seq.Date(from = as.Date("2021-01-01"), to = as.Date("2021-01-05"), + by = "day"), + 20 + ) +) +rset_spt <- + generate_cv_index_spt( + data, preprocessing = "normalize", + ngroup_init = 5L, cv_pairs = 6L + ) +rset_spt +} +\author{ +Insang Song +} +\keyword{Baselearner} diff --git a/man/generate_cv_index_ts.Rd b/man/generate_cv_index_ts.Rd new file mode 100644 index 00000000..ede78c24 --- /dev/null +++ b/man/generate_cv_index_ts.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/base_learner.R +\name{generate_cv_index_ts} +\alias{generate_cv_index_ts} +\title{Generate temporal cross-validation index} +\usage{ +generate_cv_index_ts(data, time_col = "time", cv_fold = 10L, window = 14L) +} +\arguments{ +\item{data}{data.table with X, Y, and time information.} + +\item{time_col}{character(1). Field name with time information.} + +\item{cv_fold}{integer(1). Number of cross-validation folds.} + +\item{window}{integer(1). Window size for each fold. +Simply meaning overlaps between folds. Unit is +the base unit of temporal values stored in \code{time_col}. +Window size is put into \code{as.difftime} function, then the half of it +(if odd, rounded number + 1 is applied) is used for overlaps +in the middle folds.} +} +\value{ +List of numeric vector with out-of-sample indices. +} +\description{ +Generate temporal cross-validation index +} +\examples{ +data <- data.frame( + time = seq.Date(from = as.Date("2021-01-01"), by = "day", length.out = 100), + value = rnorm(100) +) +rset_ts <- + generate_cv_index_ts(data, time_col = "time", cv_fold = 10, window = 14) +} +\keyword{Baselearner} diff --git a/man/grapes-tin-grapes.Rd b/man/grapes-tin-grapes.Rd deleted file mode 100644 index 18b8f55d..00000000 --- a/man/grapes-tin-grapes.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R -\name{\%tin\%} -\alias{\%tin\%} -\title{Check if a query date falls within a time interval} -\usage{ -query_date \%tin\% tvec -} -\arguments{ -\item{query_date}{The query date to check.} - -\item{tvec}{A vector of two dates defining the time interval.} -} -\value{ -TRUE if the query date falls within the time interval, -FALSE otherwise. -} -\description{ -This function checks if a given query date falls within a time interval -defined by a vector of two dates. 
-} -\examples{ -\dontrun{ -query_date <- as.Date("2022-01-01") -tvec <- c(as.Date("2021-01-01"), as.Date("2023-01-01")) -`\%tin\%`(query_date, tvec) -} -} -\keyword{Miscellaneous} diff --git a/man/impute_all.Rd b/man/impute_all.Rd index 3d5d957b..309f8f87 100644 --- a/man/impute_all.Rd +++ b/man/impute_all.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{impute_all} \alias{impute_all} \title{Impute missing values and attach lagged features} @@ -33,12 +33,15 @@ The imputed data table with lagged features. Impute missing values and attach lagged features } \note{ -under construction. This function performs imputation on a given data table by replacing missing values with imputed values. It follows a series of steps including data cleaning, name cleaning, geoscf column renaming, NDVI 16-day backward filling, zero-variance exclusion, excessive "true zeros" exclusion, and imputation using missRanger. +A few points should be discussed to sophisticate the imputation +process: exclusion threshold for rates of zero observations, +which might lead to significant improvement in the imputation +process especially in terms of speed and accuracy. } \keyword{Post-calculation} diff --git a/man/inject_calculate.Rd b/man/inject_calculate.Rd index 49e0ade6..eadc4fa7 100644 --- a/man/inject_calculate.Rd +++ b/man/inject_calculate.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/injection.R \name{inject_calculate} \alias{inject_calculate} \title{Injects the calculate function with specified arguments.} diff --git a/man/inject_geos.Rd b/man/inject_geos.Rd index dc3d841b..7a84cc49 100644 --- a/man/inject_geos.Rd +++ b/man/inject_geos.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/injection.R \name{inject_geos} \alias{inject_geos} \title{Injects geographic information into a data frame} diff --git a/man/inject_gmted.Rd b/man/inject_gmted.Rd index 04902448..2a0fdf6a 100644 --- a/man/inject_gmted.Rd +++ b/man/inject_gmted.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/injection.R \name{inject_gmted} \alias{inject_gmted} \title{Injects GMTED data into specified locations} diff --git a/man/inject_match.Rd b/man/inject_match.Rd new file mode 100644 index 00000000..263d22f7 --- /dev/null +++ b/man/inject_match.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/injection.R +\name{inject_match} +\alias{inject_match} +\title{Injects the calculate function with matched arguments.} +\usage{ +inject_match(f, args) +} +\arguments{ +\item{f}{function.} + +\item{args}{List of arguments that are attempted to be injected into \code{f}.} +} +\value{ +Injected function evaluation. +} +\description{ +Injects the calculate function with matched arguments. 
+} +\keyword{Calculation} diff --git a/man/inject_modis_par.Rd b/man/inject_modis_par.Rd index 49d84893..42fcf01a 100644 --- a/man/inject_modis_par.Rd +++ b/man/inject_modis_par.Rd @@ -1,24 +1,41 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/injection.R \name{inject_modis_par} \alias{inject_modis_par} \title{Injects arguments to parallelize MODIS/VIIRS data processing} \usage{ -inject_modis_par(locs, domain, injection) +inject_modis_par(locs, injection) } \arguments{ \item{locs}{A data frame containing the locations for which MODIS features need to be calculated.} -\item{domain}{The domain in which the MODIS PAR data should be injected.} - -\item{injection}{Additional parameters to be passed to the +\item{injection}{\strong{List} of additional parameters to be passed to the +\code{calc_modis_par} function.} } \value{ -The modified domain with the injected MODIS PAR data. +MODIS/VIIRS feature data.frame. } \description{ Injects arguments to parallelize MODIS/VIIRS data processing } +\examples{ +\dontrun{ +files <- + c( + "/downloads/modis/mod06/MOD06_L2.A2022001.0000.061.2022001160000.hdf", + "/downloads/modis/mod06/MOD06_L2.A2022001.0005.061.2022001160000.hdf" + ) +my_locs <- data.frame(site_id = 1:2, lon = c(-88, -87), lat = c(35, 35)) +my_locs <- sf::st_as_sf(my_locs, coords = c("lon", "lat")) +inject_modis_par( + locs = my_locs, + injection = list(path = files, subdataset = "Cloud_Fraction_Day", + name_covariates = "MOD_CLCVD_0_", nthreads = 2L, + preprocess = amadeus::process_modis_swath, radius = c(1000))) +} +} +\seealso{ +\code{\link[amadeus:calc_modis_daily]{amadeus::calc_modis_daily}}, \code{\link[amadeus:calc_modis_par]{amadeus::calc_modis_par}} +} \keyword{Calculation} diff --git a/man/inject_nlcd.Rd b/man/inject_nlcd.Rd new file mode 100644 index 00000000..1c01e6a8 --- /dev/null +++ b/man/inject_nlcd.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/injection.R +\name{inject_nlcd} +\alias{inject_nlcd} +\title{Inject arguments into NLCD calculation function for branching} +\usage{ +inject_nlcd(year = 2019, radius = 1000, ...) +} +\arguments{ +\item{year}{An integer specifying the year to calculate NLCD data for.} + +\item{radius}{An integer specifying the radius for the NLCD calculation.} + +\item{...}{Additional arguments to be passed to the NLCD calculation +function.} +} +\value{ +data.frame object. +} +\description{ +Inject arguments into NLCD calculation function for branching +} +\keyword{Calculation} diff --git a/man/load_modis_files.Rd b/man/load_modis_files.Rd index 33768da0..b7190ad9 100644 --- a/man/load_modis_files.Rd +++ b/man/load_modis_files.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/load.R \name{load_modis_files} \alias{load_modis_files} \title{Load MODIS files from a specified path.} @@ -26,10 +26,19 @@ returns a list of MODIS files found in the specified path. 
\examples{ \dontrun{ # Load MODIS files from the current directory -modis_files <- load_modis_files(".") +modis_files <- + load_modis_files( + ".", + date = c("2018-01-01", "2018-01-31") + ) # Load MODIS files from a specific directory with a custom pattern -modis_files <- load_modis_files("/path/to/files", pattern = "MOD.*hdf$") +modis_files <- + load_modis_files( + "/path/to/files", + pattern = "MOD.*hdf$", + date = c("2018-01-01", "2018-01-31") + ) } } \keyword{Utility} diff --git a/man/loadargs.Rd b/man/loadargs.Rd index 7881b164..9a8a3cf0 100644 --- a/man/loadargs.Rd +++ b/man/loadargs.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/load.R \name{loadargs} \alias{loadargs} \title{Load arguments from the formatted argument list file} @@ -12,9 +12,15 @@ loadargs(argfile, dataset) \item{dataset}{character(1). Dataset name.} } \value{ -A list of arguments. +A list of arguments stored in \code{dataset} slot of the +argument file. } \description{ -Load arguments from the formatted argument list file +This function loads the list object of arguments +to be injected into the calculation functions defined at +each target. The arguments are numeric or character, and some +of these are function names. In this case, the internal function +\code{unmarshal_function} is called to convert the function name +to the actual function. } \keyword{Utility} diff --git a/man/make_subdata.Rd b/man/make_subdata.Rd index 4fe8769e..5559a875 100644 --- a/man/make_subdata.Rd +++ b/man/make_subdata.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/base_learner.R \name{make_subdata} \alias{make_subdata} \title{Make sampled subdataframes for base learners} @@ -14,7 +14,8 @@ make_subdata(data, n = NULL, p = 0.3) \item{p}{The proportion of rows to be used. Default is 0.3.} } \value{ -The sampled data table with row index saved in .rowindex field. +The row index of the original data. The name of the original +data object is stored in attribute "object_origin". } \description{ Per beethoven resampling strategy, this function selects diff --git a/man/meta_learner_fit.Rd b/man/meta_learner_fit.Rd deleted file mode 100644 index 0e33a6ea..00000000 --- a/man/meta_learner_fit.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/meta_learner.R -\name{meta_learner_fit} -\alias{meta_learner_fit} -\title{Fit a BART (Bayesian Additive Regression Tree) meta learner. It takes predictions of other models such as kriging, GLM, machine learning models as input and fits a BART Model} -\usage{ -meta_learner_fit(base_predictor_list, kfolds, y, ...) -} -\arguments{ -\item{base_predictor_list}{\itemize{ -\item P x 1 list where P = p is a base predictor -vector (numeric). Each predictor vector should be the same length and -named. -}} - -\item{kfolds}{integer, index of k-folds for cross-validation. This should be -produced with regards to spatial and/or temporal considerations} - -\item{y}{dependent variable} - -\item{...}{Passed arguments to \link[BART]{wbart}} -} -\value{ -meta_fit_obj object of meta learner -} -\description{ -Fit a BART (Bayesian Additive Regression Tree) meta learner. 
It takes predictions of other models such as kriging, GLM, machine learning models as input and fits a BART Model -} -\examples{ -NULL -} diff --git a/man/meta_learner_predict.Rd b/man/meta_learner_predict.Rd deleted file mode 100644 index 21c37997..00000000 --- a/man/meta_learner_predict.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/meta_learner.R -\name{meta_learner_predict} -\alias{meta_learner_predict} -\title{Create meta_learner predictions from the list of BART fit objects and predictions of base learners} -\usage{ -meta_learner_predict(meta_fit, base_outputs_stdt, nthreads = 2) -} -\arguments{ -\item{meta_fit}{list of BART objects from meta_learner_fit} - -\item{base_outputs_stdt}{stdt object. -list with datatable containing lat, lon, time and the covariates -(outputs of each base learner) at prediction locations and crs.} - -\item{nthreads}{integer(1). Number of threads used in \link[BART:predict.wbart]{BART::predict.wbart}} -} -\value{ -meta_pred: the final meta learner predictions -} -\description{ -The meta learner used in this package, Bayesian Additive Regression Tree (BART), is not explicitly a spatiotemporal model, but the input covariates (outputs of each base learner) are S-T based. -} -\note{ -The predictions can be a rast or sf, which depends on the same -respective format of the covariance matrix input - cov_pred -} -\examples{ -NULL -} -\references{ -https://rspatial.github.io/terra/reference/predict.html -} diff --git a/man/par_narr.Rd b/man/par_narr.Rd index e99f1e60..295e00e2 100644 --- a/man/par_narr.Rd +++ b/man/par_narr.Rd @@ -1,14 +1,16 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calculate.R \name{par_narr} \alias{par_narr} \title{Parallelize NARR feature calculation} \usage{ -par_narr(domain, date, locs, nthreads = 24L) +par_narr(domain, path, date, locs, nthreads = 24L) } \arguments{ \item{domain}{A character vector specifying the domains to process.} +\item{path}{A character vector specifying the path to the NARR data.} + \item{date}{A character vector specifying the date of the NARR data to process.} diff --git a/man/post_calc_autojoin.Rd b/man/post_calc_autojoin.Rd index 02b4a0fe..d6d7f46b 100644 --- a/man/post_calc_autojoin.Rd +++ b/man/post_calc_autojoin.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{post_calc_autojoin} \alias{post_calc_autojoin} \title{Automatic joining by the time and spatial identifiers} diff --git a/man/post_calc_convert_time.Rd b/man/post_calc_convert_time.Rd index 8e3d2015..d0d7ea4c 100644 --- a/man/post_calc_convert_time.Rd +++ b/man/post_calc_convert_time.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{post_calc_convert_time} \alias{post_calc_convert_time} \title{Convert time column to character} diff --git a/man/post_calc_df_year_expand.Rd b/man/post_calc_df_year_expand.Rd index 9029398f..21d6fd6e 100644 --- a/man/post_calc_df_year_expand.Rd +++ b/man/post_calc_df_year_expand.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{post_calc_df_year_expand} 
\alias{post_calc_df_year_expand} \title{Expand a data frame by year} @@ -16,7 +16,9 @@ post_calc_df_year_expand( ) } \arguments{ -\item{df}{The input data frame.} +\item{df}{The input data frame. The data frame should have the same +number of rows per year, meaning that it assumes this argument is +a spatial-only feature data.frame.} \item{locs_id}{The column name of the location identifier in the data frame.} @@ -41,7 +43,7 @@ for each year based on the time period specified. } \note{ Year expansion rule is to assign the nearest past year -in the available years;#' if there is no past year in the available years, +in the available years; if there is no past year in the available years, the first available year is rolled back to the start of the time period. } \examples{ diff --git a/man/post_calc_drop_cols.Rd b/man/post_calc_drop_cols.Rd index a8d5a901..c7e682fa 100644 --- a/man/post_calc_drop_cols.Rd +++ b/man/post_calc_drop_cols.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{post_calc_drop_cols} \alias{post_calc_drop_cols} \title{Remove columns from a data frame based on regular expression patterns.} diff --git a/man/post_calc_join_yeardate.Rd b/man/post_calc_join_yeardate.Rd index 25892d70..48ed1023 100644 --- a/man/post_calc_join_yeardate.Rd +++ b/man/post_calc_join_yeardate.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{post_calc_join_yeardate} \alias{post_calc_join_yeardate} \title{Join a data.frame with a year-only date column to diff --git a/man/post_calc_merge_all.Rd b/man/post_calc_merge_all.Rd index 4ef4ca28..09cde912 100644 --- a/man/post_calc_merge_all.Rd +++ b/man/post_calc_merge_all.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{post_calc_merge_all} \alias{post_calc_merge_all} \title{Merge spatial and spatiotemporal covariate data} diff --git a/man/post_calc_merge_features.Rd b/man/post_calc_merge_features.Rd index 6c0ff20d..5ffe3a31 100644 --- a/man/post_calc_merge_features.Rd +++ b/man/post_calc_merge_features.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{post_calc_merge_features} \alias{post_calc_merge_features} \title{Merge input data.frame objects} diff --git a/man/post_calc_unify_timecols.Rd b/man/post_calc_unify_timecols.Rd index e4153406..abfe3991 100644 --- a/man/post_calc_unify_timecols.Rd +++ b/man/post_calc_unify_timecols.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{post_calc_unify_timecols} \alias{post_calc_unify_timecols} \title{Change time column name} diff --git a/man/post_calc_year_expand.Rd b/man/post_calc_year_expand.Rd index 996443e8..9bc632b9 100644 --- a/man/post_calc_year_expand.Rd +++ b/man/post_calc_year_expand.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/calc_postprocessing.R \name{post_calc_year_expand} \alias{post_calc_year_expand} \title{Map the 
available raw data years over the given period} diff --git a/man/predict_meta_learner.Rd b/man/predict_meta_learner.Rd new file mode 100644 index 00000000..2c0c4edc --- /dev/null +++ b/man/predict_meta_learner.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/meta_learner.R +\name{predict_meta_learner} +\alias{predict_meta_learner} +\title{Predict meta learner} +\usage{ +predict_meta_learner(meta_fitted, new_data) +} +\arguments{ +\item{meta_fitted}{Fitted meta learner model.} + +\item{new_data}{data.frame. New data. Must have the same +predictands and predictors as the training data.} +} +\value{ +Predicted values. +} +\description{ +Predict meta learner +} +\keyword{meta_learner} diff --git a/man/process_counties.Rd b/man/process_counties.Rd index 81faa7c4..7efacaf4 100644 --- a/man/process_counties.Rd +++ b/man/process_counties.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/processing_misc.R \name{process_counties} \alias{process_counties} \title{Load county sf object} diff --git a/man/process_geos_bulk.Rd b/man/process_geos_bulk.Rd index 3ed33f85..1b6de2c8 100644 --- a/man/process_geos_bulk.Rd +++ b/man/process_geos_bulk.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/processing.R \name{process_geos_bulk} \alias{process_geos_bulk} \title{Process atmospheric composition data by chunks (v2)} diff --git a/man/process_narr2.Rd b/man/process_narr2.Rd index d4f5bccf..4aa28165 100644 --- a/man/process_narr2.Rd +++ b/man/process_narr2.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/processing.R \name{process_narr2} \alias{process_narr2} -\title{Process NARR2 Data} +\title{Process NARR Data (v2)} \usage{ process_narr2( date = c("2023-09-01", "2023-09-01"), @@ -52,6 +52,5 @@ data <- path = "/path/to/data" ) } - } \keyword{Calculation} diff --git a/man/read_locs.Rd b/man/read_locs.Rd index ec3dc70d..55f2c922 100644 --- a/man/read_locs.Rd +++ b/man/read_locs.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/load.R \name{read_locs} \alias{read_locs} \title{Read AQS data} diff --git a/man/read_paths.Rd b/man/read_paths.Rd index 3ded85fb..fd2d4fff 100644 --- a/man/read_paths.Rd +++ b/man/read_paths.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/load.R \name{read_paths} \alias{read_paths} \title{Read paths from a directory with a specific file extension} diff --git a/man/reduce_merge.Rd b/man/reduce_merge.Rd index cebaa527..b5f97ad6 100644 --- a/man/reduce_merge.Rd +++ b/man/reduce_merge.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/injection.R \name{reduce_merge} \alias{reduce_merge} \title{Reduce and merge a list of data tables} @@ -9,7 +9,8 @@ reduce_merge(list_in, by = c("site_id", "time"), all.x = TRUE, all.y = FALSE) \arguments{ \item{list_in}{A list of data tables to be merged.} -\item{by}{The columns to merge the data tables on.} +\item{by}{The columns to merge the data tables on. 
+If \code{NULL}, the function will automatically detect the common column names.} \item{all.x}{logical(1). Keeping all rows from the first input.} diff --git a/man/restore_fit_best.Rd b/man/restore_fit_best.Rd deleted file mode 100644 index 7cc130bb..00000000 --- a/man/restore_fit_best.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R -\name{restore_fit_best} -\alias{restore_fit_best} -\title{Restore the full data set from two rset objects then fit the best model} -\usage{ -restore_fit_best( - rset_trimmed, - rset_full, - df_full, - by = c("site_id", "time"), - nested = TRUE, - nest_length = 30L -) -} -\arguments{ -\item{rset_trimmed}{rset object without data in splits column.} - -\item{rset_full}{rset object with full data.} - -\item{df_full}{data.table with full data.} - -\item{nested}{logical(1). If TRUE, the rset object is nested.} - -\item{nest_length}{integer(1). Length of the nested list. -i.e., Number of resamples.} -} -\value{ -rset object with full data in splits column. -} -\description{ -Restore the full data set from two rset objects then fit the best model -} -\keyword{Baselearner} diff --git a/man/restore_rset_full.Rd b/man/restore_rset_full.Rd deleted file mode 100644 index 24b9acd7..00000000 --- a/man/restore_rset_full.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R -\name{restore_rset_full} -\alias{restore_rset_full} -\title{Restore the full data set from the rset object} -\usage{ -restore_rset_full(rset, data_full) -} -\arguments{ -\item{rset}{rsample::manual_rset() object's \code{splits} column} - -\item{data_full}{data.table with all features} -} -\value{ -A list of data.table objects. -} -\description{ -Restore the full data set from the rset object -} -\note{ -$splits should be present in rset. -} -\keyword{Baselearner} diff --git a/man/search_function.Rd b/man/search_function.Rd deleted file mode 100644 index d998ffcd..00000000 --- a/man/search_function.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R -\name{search_function} -\alias{search_function} -\title{Search package functions} -\usage{ -search_function(package, search) -} -\arguments{ -\item{package}{character(1). Package name.} - -\item{search}{character(1). Search term.} -} -\value{ -A character vector containing the matching function names. 
-} -\description{ -Search package functions -} -\examples{ -# Search for functions in the `amadeus` package -\dontrun{ -search_function("amadeus", "process_") -} -} -\keyword{Utility} diff --git a/man/set_args_calc.Rd b/man/set_args_calc.Rd index 78f4fbf9..02cdd534 100644 --- a/man/set_args_calc.Rd +++ b/man/set_args_calc.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/init_arguments.R \name{set_args_calc} \alias{set_args_calc} \title{Set arguments for the calculation process} @@ -11,10 +11,9 @@ set_args_calc( num_extent = c(-126, -62, 22, 52), char_user_email = paste0(Sys.getenv("USER"), "@nih.gov"), export = FALSE, - path_export = "inst/targets/punchcard_calc.qs", + path_export = "inst/targets/calc_spec.qs", char_input_dir = "input", nthreads_nasa = 14L, - nthreads_hms = 3L, nthreads_tri = 5L, nthreads_geoscf = 10L, nthreads_gmted = 4L, @@ -45,7 +44,7 @@ Default is the current user's email with nih.gov domain.} is exported to \code{path_export}. Default is FALSE.} \item{path_export}{Character string specifying the export path. -Default is "inst/targets/punchcard_calc.qs".} +Default is "inst/targets/calc_spec.qs".} \item{char_input_dir}{Character string specifying the input path. Default is "input".} @@ -53,9 +52,6 @@ Default is "input".} \item{nthreads_nasa}{integer(1). Number of threads for NASA data. Default is 14L.} -\item{nthreads_hms}{integer(1). Number of threads for HMS data. -Default is 3L.} - \item{nthreads_tri}{integer(1). Number of threads for TRI data. Default is 5L.} @@ -82,7 +78,8 @@ Default is 64L.} } \value{ A list of arguments for common use -in the calculation process. +in the calculation process. A *.qs or *.rds file defined in +\code{path_export} is saved if \code{export} is TRUE. \itemize{ \item char_siteid: Character string specifying the site ID. \item char_timeid: Character string specifying the time ID. @@ -91,7 +88,6 @@ in the calculation process. \item char_user_email: Character string specifying the user email. \item char_input_dir: Character string specifying the input path. \item nthreads_nasa: Number of threads for NASA data. -\item nthreads_hms: Number of threads for HMS data. \item nthreads_tri: Number of threads for TRI data. \item nthreads_geoscf: Number of threads for GEOS-CF data. \item nthreads_gmted: Number of threads for GMTED data. diff --git a/man/set_args_download.Rd b/man/set_args_download.Rd index 3f28e7d8..50055aed 100644 --- a/man/set_args_download.Rd +++ b/man/set_args_download.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/init_arguments.R \name{set_args_download} \alias{set_args_download} \title{Generate argument list for raw data download} @@ -8,6 +8,7 @@ set_args_download( char_period = c("2018-01-01", "2022-10-31"), char_input_dir = "input", nasa_earth_data_token = NULL, + mod06_filelist = NULL, year_nlcd = c(2019, 2021), export = FALSE, path_export = "inst/targets/download_spec.qs" @@ -22,6 +23,9 @@ Default is "input".} \item{nasa_earth_data_token}{Character string specifying the NASA Earth Data token.} +\item{mod06_filelist}{character(1). File path to a CSV file with MOD06 download +URLs.} + \item{year_nlcd}{numeric(2). Numeric vector specifying the NLCD years. 
Default is c(2019, 2021).} diff --git a/man/set_slurm_resource.Rd b/man/set_slurm_resource.Rd index 64fdbd43..a85f4165 100644 --- a/man/set_slurm_resource.Rd +++ b/man/set_slurm_resource.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline_base_functions.R +% Please edit documentation in R/targets_control.R \name{set_slurm_resource} \alias{set_slurm_resource} \title{Set resource management for SLURM} @@ -33,7 +33,8 @@ set_slurm_resource( A list of resources for \code{tar_resources} } \description{ -Set resource management for SLURM +This function sets up resources for SLURM job submission. +Note that this function is designed to work with \code{tar_make_future()} } \note{ This function is designed to be used with \code{tar_resources}. diff --git a/man/set_target_years.Rd b/man/set_target_years.Rd new file mode 100644 index 00000000..d56deabd --- /dev/null +++ b/man/set_target_years.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/injection.R +\name{set_target_years} +\alias{set_target_years} +\title{Set which years to be processed} +\usage{ +set_target_years(period = NULL, available = NULL) +} +\arguments{ +\item{period}{character(2)/integer(2) of integer/character/Date.} + +\item{available}{vector of integer or Date. Available years to be processed.} +} +\value{ +A vector of years to be processed. +} +\description{ +Set which years to be processed +} +\note{ +This function is designed to define the temporal domain +from the calculation period and the available years of raw data. +} +\keyword{Utility} diff --git a/man/switch_generate_cv_rset.Rd b/man/switch_generate_cv_rset.Rd new file mode 100644 index 00000000..311bcf09 --- /dev/null +++ b/man/switch_generate_cv_rset.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/base_learner.R +\name{switch_generate_cv_rset} +\alias{switch_generate_cv_rset} +\title{Choose cross-validation strategy for the base learner} +\usage{ +switch_generate_cv_rset( + learner = c("spatial", "temporal", "spatiotemporal"), + ... +) +} +\arguments{ +\item{learner}{character(1). Learner type. Should be one of: +\itemize{ +\item "spatial": spatial cross-validation. +\item "temporal": temporal cross-validation. +\item "spatiotemporal": spatiotemporal cross-validation. +}} + +\item{...}{Additional arguments to be passed.} +} +\value{ +\code{\link[rsample:manual_rset]{rsample::manual_rset()}} output object. +} +\description{ +Choose cross-validation strategy for the base learner +} +\note{ +This function's returned value is used as an input for +\code{fit_base_brulee}, \code{fit_base_lightgbm}, and \code{fit_base_elnet}. +Learner values can be used as a branching point for the cross-validation +strategy. +} +\keyword{Baselearner} diff --git a/man/switch_model.Rd b/man/switch_model.Rd new file mode 100644 index 00000000..07174cdc --- /dev/null +++ b/man/switch_model.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/base_learner.R +\name{switch_model} +\alias{switch_model} +\title{Define a base learner model based on parsnip and tune} +\usage{ +switch_model( + model_type = c("mlp", "xgb", "lgb", "elnet"), + learn_rate = 0.1, + device = "cuda:0" +) +} +\arguments{ +\item{model_type}{character(1). Model type to be used. +Default is "mlp". Available options are "mlp", "xgb", "lgb", "elnet".} + +\item{learn_rate}{numeric(1). The learning rate for the model. 
+Default is 0.1.}
+
+\item{device}{character(1). The device to be used for training.
+Default is "cuda:0". Make sure that your system is equipped
+with CUDA-enabled graphics processing units.}
+}
+\value{
+A parsnip model object.
+}
+\description{
+Define a base learner model based on parsnip and tune
+}
+\keyword{Baselearner}
diff --git a/man/unmarshal_function.Rd b/man/unmarshal_function.Rd
new file mode 100644
index 00000000..00cdff96
--- /dev/null
+++ b/man/unmarshal_function.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/load.R
+\name{unmarshal_function}
+\alias{unmarshal_function}
+\title{Unmarshal functions}
+\usage{
+unmarshal_function(pkg_func_str)
+}
+\arguments{
+\item{pkg_func_str}{Character string specifying the package and function.}
+}
+\value{
+Function object.
+}
+\description{
+This function is developed to avoid
+sporadic errors in compressing and decompressing R function objects
+with \code{qs::qsave} and \code{qs::qread}. If you encounter such errors, please use
+this function with function name strings to save and load the function
+objects.
+}
+\note{
+The function name string must include two colons \code{::}.
+Also, the package preceding the two colons should be loaded in the
+current environment.
+}
+\examples{
+unmarshal_function("amadeus::process_aqs")
+}
+\keyword{Utility}
diff --git a/man/vis_rset.Rd b/man/vis_spt_rset.Rd
similarity index 65%
rename from man/vis_rset.Rd
rename to man/vis_spt_rset.Rd
index af2779c9..16e4fd18 100644
--- a/man/vis_rset.Rd
+++ b/man/vis_spt_rset.Rd
@@ -1,14 +1,16 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/pipeline_base_functions.R
-\name{vis_rset}
-\alias{vis_rset}
+% Please edit documentation in R/base_learner.R
+\name{vis_spt_rset}
+\alias{vis_spt_rset}
 \title{Visualize the spatio-temporal cross-validation index}
 \usage{
-vis_rset(rsplit, angle = 60)
+vis_spt_rset(rsplit, cex = 0.02, angle = 60)
 }
 \arguments{
 \item{rsplit}{rsample::manual_rset() object.}
 
+\item{cex}{numeric(1). Size of the points in the plot.}
+
 \item{angle}{numeric(1). Viewing angle of 3D plot.}
 }
 \value{
@@ -17,7 +19,4 @@ None. A plot will be generated.
 \description{
 Visualize the spatio-temporal cross-validation index
 }
-\seealso{
-\code{\link{generate_cv_index}}
-}
 \keyword{Baselearner}
diff --git a/run_interactive.sh b/run_interactive.sh
new file mode 100644
index 00000000..300ee341
--- /dev/null
+++ b/run_interactive.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+export PATH=$PATH:/ddn/gs1/tools/cuda11.8/bin
+export LD_LIBRARY_PATH=/ddn/gs1/biotools/R/lib64/R/customlib:/ddn/gs1/tools/cuda11.8/lib64:$LD_LIBRARY_PATH
+if [ "$USER" != "songi2" ]; then
+  export R_LIBS_USER=/ddn/gs1/biotools/R/lib64/R/custompkg:$R_LIBS_USER:/ddn/gs1/biotools/R/lib64/R/library
+else
+  export R_LIBS_USER=/ddn/gs1/home/songi2/r-libs:$R_LIBS_USER:/ddn/gs1/biotools/R/lib64/R/library
+fi
+
+
+# Submit the pipeline as a background process with ./run.sh
+# module load R # Uncomment if R is an environment module.
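+# The nohup line below launches the targets pipeline at reduced priority and
+# keeps it running after logout; R CMD BATCH writes the console log to a .Rout file.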
+nohup nice -4 R CMD BATCH inst/targets/targets_start.R &
\ No newline at end of file
diff --git a/run_slurm.sh b/run_slurm.sh
new file mode 100644
index 00000000..746a1633
--- /dev/null
+++ b/run_slurm.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+#SBATCH --job-name=pipeline_bench
+#SBATCH --output=/ddn/gs1/home/songi2/projects/beethoven/pipeline_out.out
+#SBATCH --error=/ddn/gs1/home/songi2/projects/beethoven/pipeline_err.err
+#SBATCH --mail-type=END,FAIL
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=2
+#SBATCH --mem-per-cpu=32g
+#SBATCH --partition=geo
+#SBATCH --mail-user=songi2@nih.gov
+
+export PATH=$PATH:/ddn/gs1/tools/cuda11.8/bin
+export LD_LIBRARY_PATH=/ddn/gs1/biotools/R/lib64/R/customlib:/ddn/gs1/tools/cuda11.8/lib64:$LD_LIBRARY_PATH
+if [ "$USER" != "songi2" ]; then
+  export R_LIBS_USER=/ddn/gs1/biotools/R/lib64/R/custompkg:$R_LIBS_USER:/ddn/gs1/biotools/R/lib64/R/library
+else
+  export R_LIBS_USER=/ddn/gs1/home/songi2/r-libs:$R_LIBS_USER:/ddn/gs1/biotools/R/lib64/R/library
+fi
+
+# Modify USER_PROJDIR below to the proper project directory, and update the
+# output/error paths in the #SBATCH directives above.
+USER_PROJDIR=/ddn/gs1/home/$USER/projects
+
+nohup nice -4 Rscript $USER_PROJDIR/beethoven/inst/targets/targets_start.R
diff --git a/tar_run.sh b/tar_run.sh
deleted file mode 100644
index 4a47776a..00000000
--- a/tar_run.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-# Submit the pipeline as a background process with ./run.sh
-# module load R # Uncomment if R is an environment module.
-nohup nice -4 R CMD BATCH inst/targets/targets_start.R &
\ No newline at end of file
diff --git a/tests/testthat/test-calc-postprocessing.R b/tests/testthat/test-calc-postprocessing.R
new file mode 100644
index 00000000..8796c0a7
--- /dev/null
+++ b/tests/testthat/test-calc-postprocessing.R
@@ -0,0 +1,57 @@
+testthat::test_that("post_calc_autojoin expands and joins data.frames with different temporal resolutions", {
+  withr::local_package("dplyr")
+  withr::local_package("data.table")
+
+  # data.frames that are resolved daily
+  df_fine1 <-
+    expand.grid(
+      site_id = rep(LETTERS[1:4], 5),
+      time =
+        rep(
+          seq.Date(as.Date("2021-12-30"), as.Date("2022-01-03"), by = 1),
+          each = 4
+        ),
+      value = rnorm(20)
+    )
+
+  df_fine2 <-
+    data.frame(
+      site_id = rep(c("A", "B", "C", "D"), 2),
+      time =
+        c(
+          as.Date(c("2022-01-01", "2022-01-02", "2021-12-31", "2021-01-03")),
+          as.Date(c("2022-01-01", "2022-01-02", "2021-12-31", "2021-01-03")) + 1
+        ),
+      value2 = c(c(1+2i, 2+3i, 3+4i, 5+6i),
+                 c(1+2i, 2+3i, 3+4i, 5+6i) + 1)
+    )
+  # attempt to join two data.frames with the same temporal resolution
+  # will run quietly
+  testthat::expect_no_error(
+    autojoin_iden <-
+      post_calc_autojoin(
+        df_fine = df_fine1, df_coarse = df_fine2,
+        field_sp = "site_id", field_t = "time"
+      )
+  )
+
+  df_coarse0 <- data.frame(site_id = c("A", "B", "C", "D"),
+                           time = rep(2020, 4),
+                           other_value = c(10, 20, 30, 40))
+  # df_coarse0 includes year values, which lead to an error when
+  # attempting to convert the year values to Date objects.
+  # We leverage that characteristic to detect whether the temporal
+  # values are years or dates. In the current beethoven implementation,
+  # there are only two temporal resolutions: daily and yearly.
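+  # For illustration: the year/date detection hinges on Date coercion failing
+  # for bare years, e.g., as.Date(2020) errors without an `origin`, while
+  # as.Date("2022-01-01") parses cleanly.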
+ testthat::expect_message( + autojoin_diff <- post_calc_autojoin(df_fine1, df_coarse0) + ) + + # remove the year column + df_coarse2 <- df_coarse0[, -2] + # then it will quietly join two data.frames by site_id + testthat::expect_no_error( + autojoin_diff <- post_calc_autojoin(df_fine1, df_coarse2) + ) + +}) diff --git a/tests/testthat/test-init-arguments.R b/tests/testthat/test-init-arguments.R new file mode 100644 index 00000000..d4c50d98 --- /dev/null +++ b/tests/testthat/test-init-arguments.R @@ -0,0 +1,160 @@ +testthat::test_that("set_args_calc exports qs or rds file", { + withr::local_package("qs") + + # create a temporary directory + tmpdir <- tempdir() + tempqsfile <- file.path(tmpdir, "spec_test.qs") + temprdsfile <- file.path(tmpdir, "spec_test.rds") + + # run set_args_calc with qs extension + testthat::expect_no_error( + calcspec <- set_args_calc( + char_period = c("2018-01-01", "2018-01-31"), + export = FALSE, + path_export = tempqsfile + ) + ) + testthat::expect_true(is.list(calcspec)) + + # run set_args_calc with rds extension + testthat::expect_no_error( + calcspec <- set_args_calc( + char_period = c("2018-01-01", "2018-01-31"), + export = FALSE, + path_export = temprdsfile + ) + ) + testthat::expect_true(is.list(calcspec)) + + # blank paths if try to search nonexisting paths + testthat::expect_no_error( + calcspec_neq <- set_args_calc( + char_input_dir = "/path/to/nonexisting", + char_period = c("2018-01-01", "2018-01-31"), + export = TRUE, + path_export = tempqsfile + ) + ) + testthat::expect_true(is.list(calcspec_neq)) + + # blank paths if try to search nonexisting paths + testthat::expect_no_error( + calcspec_ner <- set_args_calc( + char_input_dir = "/path/to/nonexisting", + char_period = c("2018-01-01", "2018-01-31"), + export = TRUE, + path_export = temprdsfile + ) + ) + testthat::expect_true(is.list(calcspec_ner)) + + # GlobalEnv objects if no path_export is provided + testthat::expect_no_error( + calcspec_nullpath <- set_args_calc( + char_input_dir = "/path/to/nonexisting", + char_period = c("2018-01-01", "2018-01-31"), + export = TRUE, + path_export = NULL + ) + ) + testthat::expect_true(is.list(arglist_calcspec)) + + +}) + +testthat::test_that("set_args_download exports qs or rds file", { + withr::local_package("qs") + + # create a temporary directory + tmpdir <- tempdir() + tempqsfile <- file.path(tmpdir, "specdl_test.qs") + temprdsfile <- file.path(tmpdir, "specdl_test.rds") + + # run set_args_download with qs extension + testthat::expect_warning( + calcspec <- set_args_download( + char_period = c("2018-01-01", "2018-01-31"), + export = FALSE, + path_export = tempqsfile + ) + ) + + # run set_args_download with rds extension + testthat::expect_warning( + calcspec <- set_args_download( + char_period = c("2018-01-01", "2018-01-31"), + export = FALSE, + path_export = temprdsfile + ) + ) + + # run set_args_download with qs extension + testthat::expect_no_error( + dlspec <- set_args_download( + char_period = c("2018-01-01", "2018-01-31"), + export = FALSE, + nasa_earth_data_token = "mytoken", + path_export = tempqsfile + ) + ) + + # run set_args_download with rds extension + testthat::expect_no_error( + dlspec <- set_args_download( + char_period = c("2018-01-01", "2018-01-31"), + export = FALSE, + nasa_earth_data_token = "mytoken", + path_export = temprdsfile + ) + ) + + + # export download spec to qs file will give a message + testthat::expect_message( + dlspecqs <- set_args_download( + char_input_dir = "/path/to/nonexisting", + char_period = c("2018-01-01", 
"2018-01-31"), + nasa_earth_data_token = "mytoken", + export = TRUE, + path_export = tempqsfile + ) + ) + testthat::expect_true(is.list(dlspecqs)) + testthat::expect_equal(length(dlspecqs), 20) + + # export download spec to rds file will give a message + testthat::expect_message( + dlspecrds <- set_args_download( + char_input_dir = "/path/to/nonexisting", + char_period = c("2018-01-01", "2018-01-31"), + nasa_earth_data_token = "mytoken", + export = TRUE, + path_export = temprdsfile + ) + ) + testthat::expect_true(is.list(dlspecrds)) + testthat::expect_equal(length(dlspecrds), 20) + + # export download spec to a file with other extensions will stop + testthat::expect_error( + dlspecrds <- set_args_download( + char_input_dir = "/path/to/nonexisting", + char_period = c("2018-01-01", "2018-01-31"), + nasa_earth_data_token = "mytoken", + export = TRUE, + path_export = file.path(tmpdir, "specdl_test.txt") + ) + ) + + # warning if no nasa_earth_data_token is provided + testthat::expect_warning( + dlspecrds <- set_args_download( + char_input_dir = "/path/to/nonexisting", + char_period = c("2018-01-01", "2018-01-31"), + nasa_earth_data_token = NULL, + export = TRUE, + path_export = file.path(tmpdir, "specdl_test.qs") + ) + ) + +}) diff --git a/tests/testthat/test-injection.R b/tests/testthat/test-injection.R new file mode 100644 index 00000000..802bc718 --- /dev/null +++ b/tests/testthat/test-injection.R @@ -0,0 +1,63 @@ +testthat::test_that("reduce_merge joins all relevant data.frames", { + withr::local_package("dplyr") + withr::local_package("rlang") + + # Create a list of data.frames + data_frames <- list( + data.frame(x = 1:3, y = 4:6), + data.frame(x = 4:6, y = 7:9) + ) + + # Create example data tables + dt1 <- data.frame(a = 1:3, b = 4:6) + dt2 <- data.frame(a = 2:4, c = 7:9) + dt3 <- data.frame(a = 3:5, d = 10:12) + + # Merge the data tables + # by = NULL will automatically detect the common column names + testthat::expect_no_error( + reduce_merge(list(dt1, dt2, dt3), by = NULL) + ) + rmerged <- reduce_merge(list(dt1, dt2, dt3), by = "a") + testthat::expect_s3_class(rmerged, "data.frame") + testthat::expect_equal(ncol(rmerged), 4) +}) + + +testthat::test_that("inject_match only passes the matching arguments", { + withr::local_package("terra") + withr::local_package("rlang") + + # Generate a point spatial vector object + point_data <- data.frame(x = c(1, 2, 3), y = c(4, 5, 6)) + point_spatial_vector <- terra::vect(point_data, geom = c("x", "y")) + + # Generate a polygon spatial vector object + polygon_data <- + data.frame( + id = c(1, 2, 3), + area = c(10, 20, 30), + geometry = + c("POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))", + "POLYGON ((1 1, 1 2, 2 2, 2 1, 1 1))", + "POLYGON ((2 2, 2 3, 3 3, 3 2, 2 2))" + ) + ) + polygon_spatial_vector <- terra::vect(polygon_data, geom = "geometry") + + # define a list with invalid arguments + push_args <- list( + x = point_spatial_vector, + y = polygon_spatial_vector, + z = "invalid" + ) + + testthat::expect_no_error( + imatched <- inject_match( + f = terra::intersect, + args = push_args + ) + ) + testthat::expect_s4_class(imatched, "SpatVector") + +}) diff --git a/tests/testthat/test-meta-learner.R b/tests/testthat/test-meta-learner.R deleted file mode 100644 index 7abe43a2..00000000 --- a/tests/testthat/test-meta-learner.R +++ /dev/null @@ -1,178 +0,0 @@ -#' @author SETgroup -#' @description -#' @title meta learner unit test - -test_that("the meta learner fitting abides", { - withr::local_package("BART") - withr::local_package("data.table") - # Test data - 
response <- 3 + rnorm(100) - kfolds <- sample(rep(1:5, length.out = length(response))) - predictor_list <- list( - runif(length(response), min = 1, max = 10), - rnorm(length(response)), - rnorm(length(response)) - ) - names(predictor_list) <- c("var1", "var2", "var3") - - # Fit learner - meta_model <- meta_learner_fit( - base_predictor_list = predictor_list, - kfolds = kfolds, y = response, - ntree = 100L, ndpost = 500L - ) - - # test the output of the meta-learner fit is a list - expect_type(meta_model, "list") - - # test that the meta-learner fit test-mean does not produce NA - expect_true(sum(is.na(meta_model[[1]]$yhat.test.mean)) == 0) - - # test that the meta-learner fit test does not produce NA - expect_true(sum(is.na(meta_model[[1]]$yhat.test)) == 0) - - # test that the meta-learner fit train-mean does not produce NA - expect_true(sum(is.na(meta_model[[1]]$yhat.train.mean)) == 0) - - # test that the meta-learner fit train set does not produce NA - expect_true(sum(is.na(meta_model[[1]]$yhat.train)) == 0) - - # test that it throws an error when base learners are different length - predictor_list <- list( - runif(length(response), min = 1, max = 10), - rnorm(length(response) - 1), - rnorm(length(response)) - ) - names(predictor_list) <- c("var1", "var2", "var3") - - expect_error(meta_learner_fit( - base_predictor_list = predictor_list, - kfolds = kfolds, y = response, - ntree = 50L, ndpost = 250L - ), "Error in meta_learner_fit: - Base predictors need to be the same length") - - expect_error(meta_learner_fit( - base_predictor_list = rep(NA, 3), - kfolds = kfolds, y = response, - ntree = 50L, ndpost = 250L - )) - - # test that it throws an error when base learners - # and response are different length - predictor_list <- list( - runif(length(response), min = 1, max = 10), - rnorm(length(response)), - rnorm(length(response)) - ) - names(predictor_list) <- c("var1", "var2", "var3") - response <- 3 + rnorm(99) - - expect_error(meta_learner_fit( - base_predictor_list = predictor_list, - kfolds = kfolds, y = response, - ntree = 50L, ndpost = 250L - ), "Error in meta_learner_fit: - Predictors and response are not the same length") - - # test that it throws an error when kfolds - # and response are different length - response <- 3 + rnorm(100) - kfolds <- sample(rep(1:5, length.out = length(response) - 1)) - - expect_error(meta_learner_fit( - base_predictor_list = predictor_list, - kfolds = kfolds, y = response, - ntree = 50L, ndpost = 250L - ), "Error in meta_learner_fit: - kfolds vector and response are not the same length") - - # test that it throws an error when some of predictors are not numeric - kfolds <- sample(rep(1:5, length.out = length(response))) - predictor_list[[3]] <- - c(1, rep("I love Roquefort cheese", length(response) - 1)) - - expect_error(meta_learner_fit( - base_predictor_list = predictor_list, - kfolds = kfolds, y = response, - ntree = 50L, ndpost = 250L - ), "Error in meta_learner_fit: - Some of base predictors are not numeric") -}) - - - -test_that("the meta learner prediction abides", { - withr::local_package("BART") - withr::local_package("data.table") - response <- 3 + rnorm(100) - kfolds <- sample(rep(1:5, length.out = length(response))) - predictor_list <- list( - "baselearner1" = runif(100, min = 1, max = 10), - "baselearner2" = rnorm(100), - "baselearner3" = pi + rnorm(100) - ) - meta_model <- meta_learner_fit( - base_predictor_list = predictor_list, - kfolds = kfolds, y = response, - ntree = 50L, ndpost = 250L - ) - - # new data to predict - lon <- seq(-112, 
-101, length.out = 5) # create lon sequence - lat <- seq(33.5, 40.9, length.out = 5) # create lat sequence - df <- expand.grid("lon" = lon, "lat" = lat) # expand to regular grid - df <- rbind(df, df) - df$time <- c(rep("2023-11-02", 25), rep("2023-11-03", 25)) - df$baselearner1 <- runif(50, min = 1, max = 10) - df$baselearner2 <- pi + rnorm(50) - df$baselearner3 <- rnorm(50) - base_outputs <- data.table::as.data.table(df) - base_outputs_stdt <- list("stdt" = base_outputs, - "crs_dt" = "EPSG:4326") - class(base_outputs_stdt) <- c("list", "stdt") - - expect_no_error( - meta_learner_predict( - meta_model, base_outputs_stdt - ) - ) - model_output <- - meta_learner_predict( - meta_model, base_outputs_stdt - ) - expect_identical(class(model_output), c("list", "stdt")) - expect_true(all(c("lon", "lat", "time") %in% colnames(model_output$stdt))) - - # check that datatable input is not changed by the function - expect_identical(base_outputs_stdt$stdt, data.table::as.data.table(df)) - - # check it does not work when one baselearner is missing - base_outputs[, baselearner3 := NULL] - base_outputs_stdt <- list("stdt" = base_outputs, - "crs_dt" = "EPSG:4326") - class(base_outputs_stdt) <- c("list", "stdt") - expect_error(meta_learner_predict(meta_model, base_outputs_stdt), - "Error: baselearners list incomplete or with wrong names") - - # check it does not work when baselearner names are not the same - names(predictor_list) <- c("var1", "var2", "var3") - meta_model <- meta_learner_fit( - base_predictor_list = predictor_list, - kfolds = kfolds, y = response, - ntree = 50L, ndpost = 250L - ) - base_outputs <- data.table::as.data.table(df) - base_outputs_stdt <- list("stdt" = base_outputs, - "crs_dt" = "EPSG:4326") - class(base_outputs_stdt) <- c("list", "stdt") - expect_error(meta_learner_predict(meta_model, base_outputs_stdt), - "Error: baselearners list incomplete or with wrong names") - expect_error( - meta_learner_predict( - meta_model, as.matrix(base_outputs_stdt$stdt) - ), - "Error: param base_outputs_stdt is not in stdt format." - ) - -}) diff --git a/vignettes/epa_download.Rmd b/tools/epa_download.Rmd similarity index 100% rename from vignettes/epa_download.Rmd rename to tools/epa_download.Rmd diff --git a/tools/example_script/check_torch.R b/tools/example_script/check_torch.R index 11f34bc8..82a05cb6 100644 --- a/tools/example_script/check_torch.R +++ b/tools/example_script/check_torch.R @@ -1,6 +1,6 @@ #' Check torch installation and load #' @param default_device character(1). "cpu", "cuda", or "mps" -#' @returns NULL +#' @return NULL #' @author Insang Song #' @importFrom torch torch_is_installed #' @importFrom torch torch_device diff --git a/tools/example_script/processing_functions/calc_ecoregion.R b/tools/example_script/processing_functions/calc_ecoregion.R index f04f4164..a64a3128 100644 --- a/tools/example_script/processing_functions/calc_ecoregion.R +++ b/tools/example_script/processing_functions/calc_ecoregion.R @@ -3,7 +3,7 @@ #' @param sites sf/SpatVector. Unique sites. Should include #' a unique identifier field named \code{id_col} #' @param id_col character(1). Name of unique identifier. 
-#' @returns a data.frame object with dummy variables and attributes of: +#' @return a data.frame object with dummy variables and attributes of: #' - \code{attr(., "ecoregion2_code")}: Ecoregion lv.2 code and key #' - \code{attr(., "ecoregion3_code")}: Ecoregion lv.3 code and key #' @author Insang Song diff --git a/tools/example_script/processing_functions/calc_koppen_geiger.R b/tools/example_script/processing_functions/calc_koppen_geiger.R index 7e433232..b20fdc4e 100644 --- a/tools/example_script/processing_functions/calc_koppen_geiger.R +++ b/tools/example_script/processing_functions/calc_koppen_geiger.R @@ -6,7 +6,7 @@ #' @param sites sf/SpatVector. Unique sites. Should include #' a unique identifier field named \code{id_col} #' @param id_col character(1). Name of unique identifier. -#' @returns a data.frame object +#' @return a data.frame object #' @author Insang Song #' @import terra #' @export diff --git a/tools/example_script/processing_functions/calc_modis.R b/tools/example_script/processing_functions/calc_modis.R index cce7fe6d..f2c36458 100644 --- a/tools/example_script/processing_functions/calc_modis.R +++ b/tools/example_script/processing_functions/calc_modis.R @@ -9,7 +9,7 @@ #' @param date_in Date(1). date to query. #' @param foo closure. A function compatible with \code{SpatRaster}. #' @author Insang Song -#' @returns A SpatRaster object. +#' @return A SpatRaster object. #' @export get_vrt <- function( paths, @@ -169,7 +169,7 @@ assign_ext_vnp46 <- function( #' @param foo function. A calculation function working with #' SpatRaster and sf. #' @author Insang Song -#' @returns A SpatRaster object. +#' @return A SpatRaster object. #' @importFrom terra extract #' @importFrom terra project #' @importFrom terra vect diff --git a/tools/example_script/processing_functions/calc_temporal_dummies.R b/tools/example_script/processing_functions/calc_temporal_dummies.R index e28e178e..24fe82a6 100644 --- a/tools/example_script/processing_functions/calc_temporal_dummies.R +++ b/tools/example_script/processing_functions/calc_temporal_dummies.R @@ -8,7 +8,7 @@ sites_st <- filter_unique_sites(include_time = TRUE) #' Calculate temporal dummy variables #' @input sites data.frame with a temporal field named "date" -#' @returns data.frame with year, month, and weekday indicators. +#' @return data.frame with year, month, and weekday indicators. #' @author Insang Song #' @importFrom methods is #' @importFrom data.table year diff --git a/tools/example_script/processing_functions/covariates.R b/tools/example_script/processing_functions/covariates.R index 69fa51b2..1551d6ab 100644 --- a/tools/example_script/processing_functions/covariates.R +++ b/tools/example_script/processing_functions/covariates.R @@ -5,7 +5,7 @@ #' @param sites sf/SpatVector. Unique sites. Should include #' a unique identifier field named \code{id_col} #' @param id_col character(1). Name of unique identifier. -#' @returns a data.frame object with dummy variables and attributes of: +#' @return a data.frame object with dummy variables and attributes of: #' - \code{attr(., "ecoregion2_code")}: Ecoregion lv.2 code and key #' - \code{attr(., "ecoregion3_code")}: Ecoregion lv.3 code and key #' @author Insang Song @@ -84,7 +84,7 @@ calc_ecoregion <- #' @param sites sf/SpatVector. Unique sites. Should include #' a unique identifier field named \code{id_col} #' @param id_col character(1). Name of unique identifier. 
-#' @returns a data.frame object +#' @return a data.frame object #' @author Insang Song #' @import terra #' @export @@ -165,7 +165,7 @@ calc_koeppen_geiger <- calc_koppen_geiger #' Calculate temporal dummy variables #' @input sites data.frame with a temporal field named "date" -#' @returns data.frame with year, month, and weekday indicators. +#' @return data.frame with year, month, and weekday indicators. #' @author Insang Song #' @importFrom methods is #' @importFrom data.table year @@ -245,7 +245,7 @@ calc_temporal_dummies <- #' @param date_in Date(1). date to query. #' @param foo closure. A function compatible with \code{SpatRaster}. #' @author Insang Song -#' @returns A SpatRaster object. +#' @return A SpatRaster object. #' @export get_vrt <- function( paths, @@ -323,7 +323,7 @@ get_vrt <- function( #' @param foo function. A calculation function working with #' SpatRaster and sf. #' @author Insang Song -#' @returns A SpatRaster object. +#' @return A SpatRaster object. #' @export modis_worker <- function( paths, diff --git a/tools/example_script/processing_functions/filter_minimum_poc.R b/tools/example_script/processing_functions/filter_minimum_poc.R index e67b884d..318dc469 100644 --- a/tools/example_script/processing_functions/filter_minimum_poc.R +++ b/tools/example_script/processing_functions/filter_minimum_poc.R @@ -15,7 +15,7 @@ #' @param site_id character(1). Name of site id (not monitor id) #' @param poc_name character(1). Name of column containing POC values. #' @author Insang Song -#' @returns a data.table object +#' @return a data.table object #' @importFrom dplyr group_by #' @importFrom dplyr filter #' @importFrom dplyr ungroup diff --git a/tools/example_script/processing_functions/filter_unique_sites.R b/tools/example_script/processing_functions/filter_unique_sites.R index e2bc990e..06dd0314 100644 --- a/tools/example_script/processing_functions/filter_unique_sites.R +++ b/tools/example_script/processing_functions/filter_unique_sites.R @@ -19,7 +19,7 @@ options(sf_use_s2 = FALSE) #' Should be in "YYYY-MM-DD" format. #' @param date_end character(1). End date. #' Should be in "YYYY-MM-DD" format. -#' @returns data.table object with three or four fields. +#' @return data.table object with three or four fields. #' - "site_id" #' - "lon": in WGS 1984 (EPSG:4326) #' - "lat": in WGS 1984 (EPSG:4326) diff --git a/tools/example_script/processing_functions/process_aqs_data.R b/tools/example_script/processing_functions/process_aqs_data.R index c0cbc698..53f2877b 100644 --- a/tools/example_script/processing_functions/process_aqs_data.R +++ b/tools/example_script/processing_functions/process_aqs_data.R @@ -22,7 +22,7 @@ #' @param remove_zip logical(1). Remove metadata zip file from #' directory_with_data Default = `FALSE`. #' @author Mariana Kassien, Insang Song, Mitchell Manware -#' @returns NULL; Separate comma-separated value (CSV) files of +#' @return NULL; Separate comma-separated value (CSV) files of #' monitors and the daily representative values #' will be stored in directory_to_save. #' @export diff --git a/tools/example_script/processing_functions/process_mod09ga.R b/tools/example_script/processing_functions/process_mod09ga.R index ab209451..36f11cea 100644 --- a/tools/example_script/processing_functions/process_mod09ga.R +++ b/tools/example_script/processing_functions/process_mod09ga.R @@ -99,7 +99,7 @@ get_vrt_old <- function( #' @param index_sds integer(1). The index of subdataset to draw. #' @param date_in Date(1). date to query. #' @param foo closure. 
A function compatible with \code{SpatRaster}. -#' @returns A SpatRaster object. +#' @return A SpatRaster object. get_vrt <- function( flist, index_sds = NULL, date_in, foo = mean) { today <- as.character(date_in) diff --git a/tools/targets-old/pipeline_base_functions.R b/tools/targets-old/pipeline_base_functions.R index 3d130818..316e8330 100644 --- a/tools/targets-old/pipeline_base_functions.R +++ b/tools/targets-old/pipeline_base_functions.R @@ -4,7 +4,7 @@ #' @param var_short Short variable name to call from the CSV fiel #' @param file Path to the configuration file #' @param ... Arguments passed to the command -#' @returns Depending on the specification in the punchcard. +#' @return Depending on the specification in the punchcard. #' @examples #' meta_run("root_absolute") #' meta_run("root_relative") @@ -42,7 +42,7 @@ meta_run <- #' @param error_log character(1). Error log file name. #' @notes This function is designed to be used with `tar_resources`. #' Suggested number of `ncpus` is more than 1 for typical multicore R tasks. -#' @returns A list of resources for `tar_resources` +#' @return A list of resources for `tar_resources` #' @author Insang Song #' @importFrom future tweak #' @importFrom future.batchtools batchtools_slurm @@ -84,7 +84,7 @@ set_slurm_resource <- #' @param fun_aqs function to import AQS data. #' Default is `amadeus::process_aqs` #' @param ... Passed arguments to `fun_aqs` -#' @returns Depending on `fun_aqs` specification. +#' @return Depending on `fun_aqs` specification. #' @import amadeus process_aqs #' @export read_locs <- @@ -105,7 +105,7 @@ read_locs <- #' @param date_end character(1). #' @param return_format character(1). One of `"sf"` or `"terra"` #' @author Insang Song -#' @returns a data.table object +#' @return a data.table object #' @importFrom dplyr group_by #' @importFrom dplyr filter #' @importFrom dplyr ungroup @@ -173,7 +173,7 @@ get_aqs_data <- #' @param df_covar covariates data.frame #' @param locs_id location identifier #' @param time_id time identifier -#' @returns data.frame +#' @return data.frame #' @author Insang Song #' @importFrom data.table merge.data.table post_calc_join_pm25_features <- @@ -198,7 +198,7 @@ post_calc_join_pm25_features <- #' @param path download path. #' @param dname Dataset name. See [`amadeus::download_data`] for details. #' @param ... Arguments passed to `amadeus::download_data` -#' @returns logical(1). +#' @return logical(1). feature_raw_download <- function( path = NULL, @@ -220,7 +220,7 @@ feature_raw_download <- #' @param year integer(1). Year of the county shapefile. #' @param exclude character. State FIPS codes to exclude. #' Default is c("02", "15", "60", "66", "68", "69", "72", "78"). -#' @returns sf object +#' @return sf object #' @importFrom tigris counties #' @export process_counties <- @@ -244,7 +244,7 @@ process_counties <- #' @param calc_function Covariate calculator. Default is #' [`amadeus::calc_covariates`] #' @param ... Arguments passed to `calc_function` -#' @returns Nothing. It will automatically save xz-compressed +#' @return Nothing. It will automatically save xz-compressed #' RDS file to `outpath` #' @importFrom rlang inject #' @export @@ -288,7 +288,7 @@ calculate_single <- #' @param calc_function Function to calculate covariates. #' [`amadeus::calc_covariates`] #' @param ... Arguments passed to `process_function` and `calc_function` -#' @returns A data.table object. +#' @return A data.table object. 
#' @importFrom data.table rbindlist #' @importFrom rlang inject #' @export @@ -357,7 +357,7 @@ calculate_multi <- #' @param time logical(1). Whether or not include time identifier. #' Set this `TRUE` will supersede `by` value by appending time identifier. #' @param ... data.frame objects to merge -#' @returns data.table +#' @return data.table #' @importFrom data.table as.data.table #' @export post_calc_merge_features <- @@ -395,7 +395,7 @@ post_calc_merge_features <- #' @param df data.frame #' @param candidates character. Candidate column names. #' @param replace character. New column name. -#' @returns data.frame +#' @return data.frame #' @export post_calc_unify_timecols <- function( @@ -437,7 +437,7 @@ post_calc_convert_time <- #' @param spid character(1). Name of the unique location identifier field. #' @importFrom methods is #' @importFrom data.table merge.data.table -#' @returns data.frame +#' @return data.frame post_calc_join_yeardate <- function( df_year, @@ -472,7 +472,7 @@ post_calc_join_yeardate <- #' @param df_sp data.frame. Spatial-only covariates. #' @param df_spt data.frame. Spatiotemporal covariates. #' @note This version assumes the time_id contains Date-like strings. -#' @returns data.frame +#' @return data.frame #' @importFrom amadeus calc_temporal_dummies #' @export post_calc_merge_all <- @@ -512,7 +512,7 @@ post_calc_merge_all <- #' @param path The directory path from which to read the paths. #' @param extension The file extension to match. Defaults to ".hdf". #' @param target_dates A character vector of length 2 containing the start and end dates. -#' @returns A character vector containing the full paths of the matching files. +#' @return A character vector containing the full paths of the matching files. #' #' @examples #' # Read paths from a directory with default extension @@ -544,7 +544,7 @@ read_paths <- function(path, extension = ".hdf", target_dates = c("2020-01-01", #' Search package functions #' @param package character(1). Package name. #' @param search character(1). Search term. -#' @returns A character vector containing the matching function names. +#' @return A character vector containing the matching function names. #' @examples #' # Search for functions in the `amadeus` package #' search_function("amadeus", "process_") @@ -555,7 +555,7 @@ search_function <- function(package, search){ #' Get data.frame of function parameters #' @param functions character. Vector of function names. -#' @returns A data.frame containing the parameters of the functions. +#' @return A data.frame containing the parameters of the functions. df_params <- function(functions) { params <- lapply(functions, function(x) { args <- dplyr::as_tibble(lapply(as.list(formals(get(x))), \(p) list(p)), .name_repair = "minimal") diff --git a/tools/torch_translation_causalstnet.qmd b/tools/torch_translation_causalstnet.qmd index c164fed5..50094922 100644 --- a/tools/torch_translation_causalstnet.qmd +++ b/tools/torch_translation_causalstnet.qmd @@ -364,7 +364,7 @@ center_station_id <- 1013 #' @param station_id_list A list of station IDs. #' @param r_thred The correlation threshold. #' @note Assume that the data is stored in the `./data/stations_data` directory. -#' @returns A list containing the input tensor (x) and the target tensor (y). +#' @return A list containing the input tensor (x) and the target tensor (y). 
make_tensors <- function( center_station = center_station_id, station_list = station_id_list, diff --git a/vignettes/list_features.Rmd b/vignettes/list_features.Rmd index 4e84522d..69ecc77d 100644 --- a/vignettes/list_features.Rmd +++ b/vignettes/list_features.Rmd @@ -1,7 +1,10 @@ --- title: "List of features" -date: "2024-04-02" -author: "The SET team" +author: "SET group" +vignette: > + %\VignetteIndexEntry{Generate prediction points} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} pkgdown: as_is: true --- diff --git a/vignettes/prediction_points.Rmd b/vignettes/prediction_points.Rmd index 455dd885..84147b12 100644 --- a/vignettes/prediction_points.Rmd +++ b/vignettes/prediction_points.Rmd @@ -5,7 +5,6 @@ vignette: > %\VignetteIndexEntry{Generate prediction points} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} -date: "2024-01-15" author: "Insang Song" --- @@ -15,6 +14,7 @@ knitr::opts_chunk$set(message = FALSE, warning = FALSE) ```{r load-terra} library(terra) +library(tigris) ``` # Objective @@ -29,7 +29,11 @@ points at 1,000 meter interval. this vignette. ```{r load-usmain} -usmain <- terra::vect("../tests/testdata/US-mainland-boundary.gpkg") +usmain <- tigris::states(progress_bar = FALSE) +exclude <- c("02", "15", "60", "66", "68", "69", "72", "78") +usmain <- usmain[!usmain$STATEFP %in% exclude, ] +usmain <- terra::vect(usmain) +usmain <- terra::aggregate(usmain) usmain <- terra::project(usmain, "EPSG:5070") plot(usmain) @@ -56,7 +60,7 @@ Steps 6 and 7 reduce the file size substantially as all data in the `data.frame` from step 6 are in numeric type. This means the data can be compressed efficiently. -```{r gen-grid-1km} +```{r gen-grid-prep} corner_ul <- c(-2.40, 3.26) * 1e6 corner_lr <- c(2.40, 0.12) * 1e6 @@ -65,6 +69,10 @@ corners <- c(corner_ul, corner_lr) corners_re <- corners[c(1, 3, 4, 2)] names(corners_re) <- c("xmin", "xmax", "ymin", "ymax") corners_ext <- terra::ext(corners_re) +``` + + +```{r gen-grid-1km, eval = FALSE} corners_ras <- terra::rast( corners_ext,