Commit
isong update (09132023)
- dashboard source and compressed data files
- aqs download and poc sort are written in functions
Insang Song committed Sep 13, 2023
1 parent 92a72cf commit ffae6a2
Showing 20 changed files with 22,259 additions and 0 deletions.
Binary file added R/.DS_Store
Binary file not shown.
108 changes: 108 additions & 0 deletions R/data_download_functions/download_aqs_data.R
@@ -0,0 +1,108 @@
################################################################################
# Date modified: 2023-09-13
# Packages required: None
################################################################################

################################################################################
#' download_aqs_data: download daily data from the AQS Data Mart
#'
#' @param parameter_code integer(1). length of 5. EPA pollutant parameter code. For details, please refer to https://aqs.epa.gov/aqsweb/documents/codetables/parameters.html
#' @param year_start integer(1). length of 4. Start year for downloading data.
#' @param year_end integer(1). length of 4. End year for downloading data.
#' @param resolution_temporal character(1). Temporal resolution of the data to download. Currently, only "daily" is supported.
#' @param directory_to_download character(1). Directory to download zip files from AQS data mart.
#' @param directory_to_save character(1). Directory to decompress zip files.
#' @param url_aqs_download character(1). URL to the AQS pre-generated datasets.
#' @param remove_zips logical(1). Remove zip files in directory_to_download after the cleaned CSV files are written.
#' @author Mariana Kassien, Insang Song
#' @return NULL; separate comma-separated value (CSV) files of monitors and the daily representative values are stored in directory_to_save.
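#' @examples
#' # A minimal usage sketch; the argument values below are illustrative,
#' # and running it requires internet access to the AQS data mart.
#' \dontrun{
#' download_aqs_data(
#'   parameter_code = 88101,
#'   year_start = 2020,
#'   year_end = 2021,
#'   directory_to_download = "./input/aqs/",
#'   directory_to_save = "./input/aqs/"
#' )
#' }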
#' @export
download_aqs_data <- function(
parameter_code = 88101,
year_start = 2018,
year_end = 2022,
resolution_temporal = "daily",
directory_to_download = "./input/aqs/",
directory_to_save = "./input/aqs/",
url_aqs_download = "https://aqs.epa.gov/aqsweb/airdata/",
remove_zips = FALSE
) {
chars_dir_download = nchar(directory_to_download)
chars_dir_save = nchar(directory_to_save)

if (substr(directory_to_download, chars_dir_download, chars_dir_download) != "/") {
directory_to_download = paste(directory_to_download, "/", sep = "")
}
if (substr(directory_to_save, chars_dir_save, chars_dir_save) != "/") {
directory_to_save = paste(directory_to_save, "/", sep = "")
}

#### 1. define measurement data paths
year_sequence = seq(year_start, year_end, 1)
file_urls = sprintf(paste(url_aqs_download, resolution_temporal, "_", parameter_code, "_%.0f.zip", sep = ""), year_sequence)
download_names = sprintf(paste(directory_to_download, "download_output_%.0f.zip", sep = ""), year_sequence)

#### 2. Downloading data
# Download zip files from the website, skipping any that already exist
download_missing = which(!file.exists(download_names))
if (length(download_missing) > 0) {
download.file(file_urls[download_missing], download_names[download_missing], method = "libcurl")
}

# Construct string with unzipped file names
csv_names = sprintf(paste(directory_to_download, resolution_temporal, "_", parameter_code, "_%.0f.csv", sep = ""), year_sequence)
#### 3. Processing data
# Unzip and read in .csv files, process and join in one dataframe.
# The unique site identifier "ID.Monitor" is a string with the structure State-County-Site-Parameter-POC
for (n in seq(1, length(file_urls))) {
unzip(download_names[n], exdir = directory_to_save)

# Read in dataframe
cat(paste("reading and processing file: ", csv_names[n], "...\n") )
data = read.csv(csv_names[n], stringsAsFactors = F)

# Make unique site identifier: State-County-Site-Parameter-POC
# data$ID.Monitor=paste(data$State.Code,data$County.Code,data$Site.Num,data$Parameter.Code,data$POC, sep="-")
# ISong: some POCs have two digits, so the POC slot is zero-padded to two digits.
data$ID.Monitor = sprintf("%02d-%03d-%04d-%05d-%02d",
data$State.Code, data$County.Code, data$Site.Num, data$Parameter.Code, data$POC)

# Concatenate with other years
if (n == 1) {
data_all = data
} else {
data_all = rbind(data_all, data)
}
}

cat(paste("Downloading monitor metadata...\n"))
#### 4. Downloading monitor metadata file and filter for relevant sites
# Download monitors file
dest_monitors = paste(directory_to_download, "aqs_monitors.zip", sep = "")
if (!file.exists(dest_monitors)) {
download.file(sprintf("%saqs_monitors.zip", url_aqs_download), dest_monitors)
}
# Unzip and read in
unzip(dest_monitors, exdir = directory_to_save)
monitors = read.csv(sprintf("%saqs_monitors.csv", directory_to_save), stringsAsFactors = FALSE)

# Create site identifier
# Convert State.Code from string to numeric to drop leading zeros;
# the NAs introduced come from Canadian monitors whose state code is "CC".
monitors$State.Code = as.numeric(monitors$State.Code)
monitors$ID.Monitor = sprintf("%02d-%03d-%04d-%05d-%02d",
monitors$State.Code, monitors$County.Code, monitors$Site.Num, monitors$Parameter.Code, monitors$POC)
# Filter monitors file to include only monitors in our csv
monitors_filter = monitors[which(monitors$ID.Monitor %in% data_all$ID.Monitor),]
#### 5. Uploading data to desired folder
cat(paste("All requested files were downloaded. Write the cleaned data to ", directory_to_save, "...\n", sep = ""))
write.csv(data_all,
paste(directory_to_save, resolution_temporal, "_", parameter_code, "_", year_start, "-", year_end, ".csv", sep = ""))
write.csv(monitors_filter,
paste(directory_to_save, "monitors_", parameter_code, "_", year_start, "-", year_end, ".csv", sep = ""))

if (remove_zips) {
cat("Deleting zip files...\n")
path_zips = list.files(pattern = "\\.(zip|ZIP)$",
path = directory_to_download,
full.names = TRUE)
for (zipfile in path_zips) {
file.remove(zipfile)
}
}
}
46 changes: 46 additions & 0 deletions R/preprocessing/filter_minimum_poc.R
@@ -0,0 +1,46 @@

################################################################################
# Date modified: 2023-09-13
# Script description: this function filters the input data.table or tibble
# object using POC (parameter occurrence code) to return data with
# the minimum POC value in each site. It assumes the daily datasets
# per year were concatenated in advance.
# Packages required: pacman, data.table, dplyr, rlang, tidytable
################################################################################

################################################################################
#' filter_minimum_poc: filter monitors with the minimum POC value
#'
#' @param input_df data.frame/tbl_df/data.table. Concatenated daily data.
#' @param site_id character(1). Name of the site id column (not the monitor id).
#' @param poc_name character(1). Name of the column containing POC values.
#' @author Insang Song
#' @return a data.table object
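#' @examples
#' # A minimal sketch with a toy table; the column names here are illustrative,
#' # not the AQS schema. Running it loads packages via pacman.
#' \dontrun{
#' toy <- data.frame(
#'   Site.ID = c("A", "A", "B"),
#'   POC = c(1, 2, 1),
#'   Arithmetic.Mean = c(10.1, 10.4, 12.3)
#' )
#' # Keeps one row per site: the record(s) with the smallest POC
#' filter_minimum_poc(toy, site_id = "Site.ID", poc_name = "POC")
#' }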
#' @export
filter_minimum_poc <- function(input_df, site_id, poc_name) {

if (!requireNamespace("pacman", quietly = TRUE)) {
install.packages("pacman")
}
pacman::p_load(data.table, dplyr, rlang, tidytable)

if (!inherits(input_df, "data.frame")) {
stop("input_df should be data.frame/tbl_df/data.table.\n")
}
if (!is.character(site_id)) {
stop("site_id should be character.\n")
}
if (!is.character(poc_name)) {
stop("poc_name should be character.\n")
}

poc_filtered = input_df |>
tidytable::group_by(!!sym(site_id)) |>
tidytable::filter(!!sym(poc_name) == min(!!sym(poc_name))) |>
tidytable::ungroup() |>
data.table::as.data.table()
return(poc_filtered)
}

Binary file added R/shiny_explore_pm/data/.DS_Store
Binary file not shown.
Binary file added R/shiny_explore_pm/data/STFDF_pm25.rds
Binary file not shown.
Binary file added R/shiny_explore_pm/data/STFDF_pm25_site.rds
Binary file not shown.
Binary file added R/shiny_explore_pm/data/STVariogram_pm25.rds
Binary file not shown.
Binary file not shown.
Binary file added R/shiny_explore_pm/data/SharedData_sf.gpkg
Binary file not shown.
Binary file added R/shiny_explore_pm/data/SharedLongData.parquet
Binary file not shown.
Binary file added R/shiny_explore_pm/data/Site_missingrate.gpkg
Binary file not shown.
Binary file added R/shiny_explore_pm/data/States_NMonitors.gpkg
Binary file not shown.
Binary file added R/shiny_explore_pm/data/aqs_monitors.rds
Binary file not shown.
