Skip to content

Commit

Permalink
Update FLOSS scoring and document parallel computations

Browse files Browse the repository at this point in the history

Adjusted the FLOSS score calculation to include dynamic data frequency
and window parameters for more flexible and accurate regime scoring.
Introduced parallel processing optimizations in the R Markdown setup
chunk for improved performance by creating a custom `mdplyr` function,
leveraging `multidplyr` functions on a cluster. Minor text edits
corrected punctuation in figure captions. Binary RDS files were updated,
likely as a result of these changes.

- Ensured FLOSS scoring is adaptable to different data frequencies and
window sizes, enabling tailored analytics across diverse datasets.
- Optimized R Markdown for large-scale data processing with a dedicated
parallelization function, `mdplyr`.
- Fixed typographic errors in figure captions within the academic paper
for better clarity.
- Updated RDS outputs in line with the revised scoring logic and
parallel code execution, ensuring data consistency and integrity.

Ref: #2483 for scoring adjustments, #2640 for performance enhancements
  • Loading branch information
franzbischoff committed Dec 10, 2023
1 parent 9112811 commit a8ac78a
Show file tree
Hide file tree
Showing 9 changed files with 37 additions and 17 deletions.
Binary file modified output/dbarts_fitted_mvds.rds
Binary file not shown.
Binary file modified output/dbarts_fitted_vtds.rds
Binary file not shown.
Binary file modified output/importances_mvds.rds
Binary file not shown.
Binary file modified output/importances_vtds.rds
Binary file not shown.
Binary file modified output/regime_outputs_lmk.rds
Binary file not shown.
Binary file modified output/scores_stats_model_3.rds
Binary file not shown.
Binary file modified output/scores_stats_model_4.rds
Binary file not shown.
41 changes: 30 additions & 11 deletions papers/phd/FirstPaper.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,29 @@ editor:
canonical: true
---

```{r setup, include=FALSE}
```{r setup, include=FALSE, cache=FALSE}
rlang::check_installed(c(
"here", "glue", "kableExtra", "ggplot2", "dplyr", "vip", "pdp", "patchwork", "fastshap"
))
source(here::here("scripts", "common", "read_ecg.R"))
source(here::here("scripts", "common", "score_floss.R"))
source(here::here("scripts", "helpers", "plot_fluss.R"))
mdplyr <- function(data, libs = NULL, vars = NULL) {
  # Partition `data` across a freshly created multidplyr cluster so that
  # subsequent dplyr verbs on the result run in parallel on all workers.
  #
  # Args:
  #   data: data frame (optionally grouped) to partition across workers.
  #   libs: optional character vector of package names to load on each worker.
  #   vars: optional character vector of variable names to copy to each
  #     worker (resolved from this call site, as in the original loop).
  #
  # Returns: a partitioned data frame (`party_df`) bound to the new cluster.
  #
  # NOTE(review): the cluster is not shut down explicitly; it lives as long
  # as the returned party_df references it — confirm this is acceptable for
  # long-running knit sessions.
  cluster <- multidplyr::new_cluster(parallelly::availableCores())
  # cluster_library() and cluster_copy() are vectorized over their second
  # argument, so no element-wise loop is needed; NULL simply skips the step
  # (equivalent to the original zero-iteration `for` loops).
  if (!is.null(libs)) {
    multidplyr::cluster_library(cluster, libs)
  }
  if (!is.null(vars)) {
    multidplyr::cluster_copy(cluster, vars)
  }
  multidplyr::partition(data, cluster)
}
```

# Focus {.unnumbered}
Expand Down Expand Up @@ -388,7 +406,7 @@ This is not a problem *per se* but a signal we must be aware of when exploring t
#| A) Shows strong interaction between `window_size` and `regime_landmark`, `regime_threshold` and `regime_landmark`,
#| `mp_threshold` and `regime_landmark`.
#| B) Refitting the model with these interactions taken into account, the strength is substantially reduced, except
#| for the second, showing that indeed there is a strong correlation between those variables."
#| for the second, showing that indeed there is a strong correlation between those variables."
library(patchwork)
Expand Down Expand Up @@ -454,7 +472,7 @@ inter + plot_annotation(
#| fig-cap: "Variables importances using three different methods. A) Feature Importance Ranking Measure
#| using ICE curves. B) Permutation method. C) SHAP (100 iterations). Line 1 refers to the original
#| fit, and line 2 to the re-fit, taking into account the interactions between variables
#| (@fig-interaction)."
#| (@fig-interaction)."
library(patchwork)
Expand Down Expand Up @@ -586,7 +604,7 @@ From this first dataset's results, we can infer that the most interesting parame
```{r fig-importanceshap, eval=TRUE, echo=FALSE, message=FALSE, warning=FALSE, fig.height = 6, fig.width= 10, out.width="100%"}
#| fig-cap: "This shows the effect each variable has on the FLOSS score. This plot doesn't take into account the
#| variable interactions."
#| variable interactions."
library(dplyr)
library(patchwork)
Expand Down Expand Up @@ -649,7 +667,7 @@ all1 + plot_layout(design = layout) + plot_annotation(title = "Shapley value vs.
> Dataset 1
```{r fig-importanceshap2, eval=TRUE, echo=FALSE, message=FALSE, warning=FALSE, fig.height = 6, fig.width= 10, out.width="100%"}
#| fig-cap: "This shows the effect each variable has on the FLOSS score, taking into account the interactions."
#| fig-cap: "This shows the effect each variable has on the FLOSS score, taking into account the interactions."
layout <- "
AABB
Expand Down Expand Up @@ -697,7 +715,6 @@ all2 + plot_layout(design = layout) + plot_annotation(title = "Shapley value vs.
#| (@fig-interaction)."
library(patchwork)
interactions <- readRDS(here::here("output", "importances_mvds.rds"))
importance_firm <- interactions$importance_firm
importance_perm <- interactions$importance_perm
Expand Down Expand Up @@ -892,7 +909,9 @@ all + plot_annotation(
#| fig.cap: "This shows the effect each variable has on the FLOSS score."
int <- readRDS(here("output", "importances_vtds.rds"))
ckeep <- class(int$shap_fastshap_all_test)
shap_fastshap_all_test <- int$shap_fastshap_all_test
class(shap_fastshap_all_test) <- ckeep
trained_model <- readRDS(here("output", "dbarts_fitted_vtds.rds"))
testing_data <- trained_model$testing_data
Expand All @@ -908,7 +927,7 @@ a2 <- shapviz::sv_dependence(d1, "regime_threshold", color_var = "auto") +
ggplot2::geom_smooth(method = loess, colour = "#0000ff44", alpha = 0.2) +
ggplot2::labs(y = ggplot2::element_blank()) +
ggplot2::theme_bw()
a3 <- shapviz::sv_dependence(d1, "regime_landmark", color_var = "auto") +
a3 <- shapviz::sv_dependence(d1, "regime_landmark", color_var = "regime_threshold") +
ggplot2::geom_smooth(method = loess, colour = "#0000ff44", alpha = 0.2) +
ggplot2::labs(y = ggplot2::element_blank()) +
ggplot2::theme_bw()
Expand Down Expand Up @@ -953,7 +972,7 @@ all_scores_mvds <- all_fitted_mvds |>
dplyr::rename(fold = id, size = .sizes, record = .id, model = .config, pred = .pred) |>
dplyr::distinct(rep, record, across(all_of(predictors_names2)), .keep_all = TRUE) |>
dplyr::mutate(truth = clean_truth(truth, size), pred = clean_pred(pred)) |>
dplyr::mutate(score = score_regimes_weighted(truth, pred, 0))
dplyr::mutate(score = score_regimes_weighted(truth, pred, 250, 10))
all_fitted_vtds <- readRDS(here::here("output", "regime_outputs_vtds.rds"))
all_scores_vtds <- all_fitted_vtds |>
Expand All @@ -967,7 +986,7 @@ all_scores_vtds <- all_fitted_vtds |>
dplyr::rename(fold = id, size = .sizes, record = .id, model = .config, pred = .pred) |>
dplyr::distinct(rep, record, across(all_of(predictors_names2)), .keep_all = TRUE) |>
dplyr::mutate(truth = clean_truth(truth, size), pred = clean_pred(pred)) |>
dplyr::mutate(score = score_regimes_weighted(truth, pred, 0))
dplyr::mutate(score = score_regimes_weighted(truth, pred, 250, 10))
```
Expand Down Expand Up @@ -1016,7 +1035,7 @@ scores_stats_mvds$id <- sprintf("%02d", (as.numeric(records_factors_mvds)))
scores2 <- scores_stats_mvds |>
dplyr::mutate(low_iqr = q75 - q25 < 0.1) |>
ggplot2::ggplot(aes(
ggplot2::ggplot(ggplot2::aes(
x = reorder(id, -median),
y = score, colour = low_iqr
)) +
Expand Down Expand Up @@ -1045,7 +1064,7 @@ scores_stats_vtds$id <- sprintf("%02d", (as.numeric(records_factors_vtds)))
scores3 <- scores_stats_vtds |>
dplyr::mutate(low_iqr = q75 - q25 < 0.1) |>
ggplot2::ggplot(aes(
ggplot2::ggplot(ggplot2::aes(
x = reorder(id, -median),
y = score, colour = low_iqr
)) +
Expand Down
13 changes: 7 additions & 6 deletions scripts/common/score_floss.R
Original file line number Diff line number Diff line change
Expand Up @@ -238,18 +238,18 @@ score_regimes_limit <- function(gtruth, reported, data_size, max_size = 2500) {
# for example: 10 seconds * 250hz = 2500


score_regimes_weighted <- function(gtruth, reported, data_size) {
score_regimes_weighted <- function(gtruth, reported, data_freq = 250, window = 10) {
# Probably we are receiving a tibble
if (is.list(gtruth) && length(gtruth) > 1) {
if (length(data_size) == 1) {
data_size <- rep(0, length(gtruth))
if (length(data_freq) == 1) {
data_freq <- rep(250, length(gtruth))
} else {
checkmate::assert(length(gtruth) == length(data_size))
}

# Proceed if same size
if (length(gtruth) == length(reported)) {
scores <- purrr::pmap_dbl(list(gtruth, reported, data_size), score_regimes_weighted)
scores <- purrr::pmap_dbl(list(gtruth, reported, data_freq, window), score_regimes_weighted)
}

return(scores)
Expand Down Expand Up @@ -308,7 +308,7 @@ score_regimes_weighted <- function(gtruth, reported, data_size) {
minv <- c(minv, minv_left)
}

minv <- minv / 250000 # 250hz * 1000
minv <- minv / (data_freq * 1000) # 250hz * 1000, to avoid exponential errors

weights <- exp(-minv)
sum_weights <- sum(weights)
Expand All @@ -319,7 +319,8 @@ score_regimes_weighted <- function(gtruth, reported, data_size) {
rlang::abort("Score is NA")
}

score
score <- (score * 1000) / window # x1000 to return to seconds
return(score)
}

# window parameter will be used to compute if the prediction is a true positive or not
Expand Down

0 comments on commit a8ac78a

Please sign in to comment.