isong update (09132023); reorganized #28

Merged: 3 commits (Sep 14, 2023)
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -1 +1,2 @@
^\.github$
tools/
Binary file added R/.DS_Store
Binary file not shown.
108 changes: 108 additions & 0 deletions R/data_download_functions/download_aqs_data.R
@@ -0,0 +1,108 @@
################################################################################
# Date modified: 2023-09-13
# Packages required: None
################################################################################

################################################################################
#' download_aqs_data: download daily data from AQS datamart
#'
#' @param parameter_code integer(1). length of 5. EPA pollutant parameter code. For details, please refer to https://aqs.epa.gov/aqsweb/documents/codetables/parameters.html
#' @param year_start integer(1). length of 4. Start year for downloading data.
#' @param year_end integer(1). length of 4. End year for downloading data.
#' @param resolution_temporal character(1). Temporal resolution of the data to download. Currently, only "daily" is supported.
#' @param directory_to_download character(1). Directory in which to save the zip files downloaded from the AQS data mart.
#' @param directory_to_save character(1). Directory in which to decompress the zip files.
#' @param url_aqs_download character(1). URL of the AQS pre-generated datasets.
#' @param remove_zips logical(1). Remove the zip files in directory_to_download after processing.
#' @author Mariana Kassien, Insang Song
#' @return NULL; Separate comma-separated value (CSV) files of monitors and the daily representative values will be stored in directory_to_save.
#' @export
download_aqs_data <- function(
parameter_code = 88101,
year_start = 2018,
year_end = 2022,
resolution_temporal = "daily",
directory_to_download = "./input/aqs/",
directory_to_save = "./input/aqs/",
url_aqs_download = "https://aqs.epa.gov/aqsweb/airdata/",
remove_zips = FALSE
) {
chars_dir_download = nchar(directory_to_download)
chars_dir_save = nchar(directory_to_save)

if (substr(directory_to_download, chars_dir_download, chars_dir_download) != "/") {
directory_to_download = paste(directory_to_download, "/", sep = "")
}
if (substr(directory_to_save, chars_dir_save, chars_dir_save) != "/") {
directory_to_save = paste(directory_to_save, "/", sep = "")
}

#### 1. define measurement data paths
year_sequence = seq(year_start, year_end, 1)
file_urls = sprintf(paste(url_aqs_download, resolution_temporal, "_", parameter_code, "_%.0f.zip", sep = ""), year_sequence)
download_names = sprintf(paste(directory_to_download, "download_output_%.0f.zip", sep = ""), year_sequence)

#### 2. Downloading data
# Download zip files from website
# note: !all (rather than !any) so that a partially downloaded set is completed
if (!all(file.exists(download_names))) {
download.file(file_urls, download_names, method = "libcurl")
}

# Construct string with unzipped file names
csv_names = sprintf(paste(directory_to_download, resolution_temporal, "_", parameter_code, "_%.0f.csv", sep = ""), year_sequence)
#### 3. Processing data
# Unzip and read in .csv files, process and join in one dataframe.
# The unique site identifier "ID.Monitor" is a string with the structure State-County-Site-Parameter-POC
for (n in seq_along(file_urls)) {
unzip(download_names[n], exdir = directory_to_save)

# Read in dataframe
cat(paste("reading and processing file: ", csv_names[n], "...\n") )
data = read.csv(csv_names[n], stringsAsFactors = F)

# Make unique site identifier: State-County-Site-Parameter-POC
# data$ID.Monitor=paste(data$State.Code,data$County.Code,data$Site.Num,data$Parameter.Code,data$POC, sep="-")
# ISong: Some POCs are two digits, so the POC slot is zero-padded to two digits here.
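# e.g., sprintf("%02d-%03d-%04d-%05d-%02d", 6, 37, 2, 88101, 12) returns "06-037-0002-88101-12" (illustrative values)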
data$ID.Monitor = sprintf("%02d-%03d-%04d-%05d-%02d",
data$State.Code, data$County.Code, data$Site.Num, data$Parameter.Code, data$POC)

# Concatenate with other years
if (n == 1) {
data_all = data
} else {
data_all = rbind(data_all, data)
}
}

cat(paste("Downloading monitor metadata...\n"))
#### 4. Downloading monitor metadata file and filter for relevant sites
# Download monitors file
dest_monitors = paste(directory_to_download, "aqs_monitors.zip", sep = "")
if (!file.exists(dest_monitors)) {
download.file(sprintf("%saqs_monitors.zip", url_aqs_download), dest_monitors)
}
# Unzip and read in
unzip(dest_monitors, exdir = directory_to_save)
monitors = read.csv(sprintf("%saqs_monitors.csv", directory_to_save), stringsAsFactors = FALSE)

# Create site identifier
monitors$State.Code = as.numeric(monitors$State.Code) # Convert from string to numeric to drop leading zeros; the NAs introduced come from Canadian monitors whose state code is "CC"
monitors$ID.Monitor = sprintf("%02d-%03d-%04d-%05d-%02d",
monitors$State.Code, monitors$County.Code, monitors$Site.Num, monitors$Parameter.Code, monitors$POC)
# Filter monitors file to include only monitors in our csv
monitors_filter = monitors[which(monitors$ID.Monitor %in% data_all$ID.Monitor),]
#### 5. Writing data to the desired folder
cat(paste("All requested files were downloaded. Writing the cleaned data to ", directory_to_save, "...\n", sep = ""))
write.csv(data_all, paste(directory_to_save, resolution_temporal, "_", parameter_code, "_", year_start, "-", year_end, ".csv", sep = ""))
write.csv(monitors_filter, paste(directory_to_save, "monitors_", parameter_code, "_", year_start, "-", year_end, ".csv", sep = ""))

if (remove_zips) {
cat(paste("Delete zip files ... \n"))
path_zips = list.files(pattern = ".(zip|ZIP)$",
path = directory_to_download,
full.names = TRUE)
for (zipfile in path_zips) {
file.remove(zipfile)
}
}
}
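
A minimal usage sketch, assuming this file has been sourced into an R session and the AQS data mart is reachable; 88101 is the EPA parameter code for PM2.5, matching the default arguments above:

# Download daily PM2.5 (parameter code 88101) data for 2018-2022
# into ./input/aqs/ and remove the zip archives afterwards.
source("R/data_download_functions/download_aqs_data.R")
dir.create("./input/aqs/", recursive = TRUE, showWarnings = FALSE)
download_aqs_data(
  parameter_code = 88101,
  year_start = 2018,
  year_end = 2022,
  resolution_temporal = "daily",
  directory_to_download = "./input/aqs/",
  directory_to_save = "./input/aqs/",
  remove_zips = TRUE
)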
501 changes: 0 additions & 501 deletions R/epa_download.html

This file was deleted.

46 changes: 46 additions & 0 deletions R/preprocessing/filter_minimum_poc.R
@@ -0,0 +1,46 @@

################################################################################
# Date modified: 2023-09-13
# Script description: this function filters the input data.table or tibble
# object by POC (parameter occurrence code), returning, for each site,
# the rows with the minimum POC value. It assumes the per-year daily
# datasets were concatenated in advance.
# Packages required: pacman, data.table, dplyr, rlang, tidytable
################################################################################

################################################################################
#' filter_minimum_poc: filter monitors with the minimum POC value
#'
#' @param input_df data.frame/tbl_df/data.table. Input data containing the site id and POC columns.
#' @param site_id character(1). Name of the site id column (not the monitor id).
#' @param poc_name character(1). Name of the column containing POC values.
#' @author Insang Song
#' @return a data.table object
#' @export
filter_minimum_poc <- function(input_df, site_id, poc_name) {

if (!require(pacman)) {
install.packages('pacman')
library(pacman)
}
p_load(data.table, dplyr, rlang, tidytable)

if (!is(input_df, "data.frame")) {
stop("input_df should be data.frame/tbl_df/data.table.\n")
}
if (!is.character(site_id)) {
stop("site_id should be character.\n")
}
if (!is.character(poc_name)) {
stop("poc_name should be character.\n")
}

poc_filtered = input_df |>
tidytable::group_by(!!sym(site_id)) |>
tidytable::filter(!!sym(poc_name) == min(!!sym(poc_name))) |>
tidytable::ungroup() |>
data.table()
return(poc_filtered)

}
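
A minimal usage sketch, assuming the concatenated daily file written by download_aqs_data() above; the site-level id column name (ID.Site) is hypothetical and is built here because Site.Num alone is not unique across states and counties:

source("R/preprocessing/filter_minimum_poc.R")
# Read the concatenated daily data written by download_aqs_data()
daily <- read.csv("./input/aqs/daily_88101_2018-2022.csv", stringsAsFactors = FALSE)
# Build a site-level identifier (State-County-Site), zero-padded as in download_aqs_data()
daily$ID.Site <- sprintf("%02d-%03d-%04d", daily$State.Code, daily$County.Code, daily$Site.Num)
# Keep, at each site, only the rows from the monitor with the lowest POC
daily_min_poc <- filter_minimum_poc(daily, site_id = "ID.Site", poc_name = "POC")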

Binary file added tools/.DS_Store
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/.DS_Store
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/STFDF_pm25.rds
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/STFDF_pm25_site.rds
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/STVariogram_pm25.rds
Binary file not shown.
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/SharedData_sf.gpkg
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/States_NMonitors.gpkg
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/aqs_monitors.rds
Binary file not shown.