isong update (09132023); reorganized #28

Merged: 3 commits (Sep 14, 2023)
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -1 +1,2 @@
^\.github$
tools/
Binary file added R/.DS_Store
Binary file not shown.
108 changes: 108 additions & 0 deletions R/data_download_functions/download_aqs_data.R
@@ -0,0 +1,108 @@
################################################################################
# Date modified: 2023-09-13
# Packages required: None
################################################################################

################################################################################
#' download_aqs_data: download daily data from AQS datamart
#'
#' @param parameter_code integer(1). length of 5. EPA pollutant parameter code. For details, please refer to https://aqs.epa.gov/aqsweb/documents/codetables/parameters.html
#' @param year_start integer(1). length of 4. Start year for downloading data.
#' @param year_end integer(1). length of 4. End year for downloading data.
#' @param resolution_temporal character(1). Temporal resolution of the data to download. Currently, only "daily" is supported.
#' @param directory_to_download character(1). Directory in which to save the zip files downloaded from the AQS data mart.
#' @param directory_to_save character(1). Directory in which to decompress the zip files.
#' @param url_aqs_download character(1). URL of the AQS pre-generated datasets.
#' @param remove_zips logical(1). Remove the zip files in directory_to_download after processing.
#' @author Mariana Kassien, Insang Song
#' @return NULL; Separate comma-separated value (CSV) files of monitors and the daily representative values will be stored in directory_to_save.
#' @export
download_aqs_data <- function(
parameter_code = 88101,
year_start = 2018,
year_end = 2022,
resolution_temporal = "daily",
directory_to_download = "./input/aqs/",
directory_to_save = "./input/aqs/",
url_aqs_download = "https://aqs.epa.gov/aqsweb/airdata/",
remove_zips = FALSE
) {
chars_dir_download = nchar(directory_to_download)
chars_dir_save = nchar(directory_to_save)

if (substr(directory_to_download, chars_dir_download, chars_dir_download) != "/") {
directory_to_download = paste(directory_to_download, "/", sep = "")
}
if (substr(directory_to_save, chars_dir_save, chars_dir_save) != "/") {
directory_to_save = paste(directory_to_save, "/", sep = "")
}

#### 1. define measurement data paths
year_sequence = seq(year_start, year_end, 1)
file_urls = sprintf(paste(url_aqs_download, resolution_temporal, "_", parameter_code, "_%.0f.zip", sep = ""), year_sequence)
download_names = sprintf(paste(directory_to_download, "download_output_%.0f.zip", sep = ""), year_sequence)

#### 2. Downloading data
# Download zip files from website
# note: !all (rather than !any) so that a partially downloaded set is completed
if (!all(file.exists(download_names))) {
download.file(file_urls, download_names, method = "libcurl")
}

# Construct string with unzipped file names
csv_names = sprintf(paste(directory_to_download, resolution_temporal, "_", parameter_code, "_%.0f.csv", sep = ""), year_sequence)
#### 3. Processing data
# Unzip and read in .csv files, process and join in one dataframe.
# The unique site identifier "ID.Monitor" is a string with the structure State-County-Site-Parameter-POC
for (n in seq_along(file_urls)) {
unzip(download_names[n], exdir = directory_to_save)

# Read in dataframe
cat(paste("reading and processing file: ", csv_names[n], "...\n") )
data = read.csv(csv_names[n], stringsAsFactors = F)

# Make unique site identifier: State-County-Site-Parameter-POC
# data$ID.Monitor=paste(data$State.Code,data$County.Code,data$Site.Num,data$Parameter.Code,data$POC, sep="-")
# ISong: Some POCs are two digits, so the POC slot is zero-padded to two digits here.
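# e.g., sprintf("%02d-%03d-%04d-%05d-%02d", 6, 37, 2, 88101, 12) returns "06-037-0002-88101-12" (illustrative values)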
data$ID.Monitor = sprintf("%02d-%03d-%04d-%05d-%02d",
data$State.Code, data$County.Code, data$Site.Num, data$Parameter.Code, data$POC)

# Concatenate with other years
if (n == 1) {
data_all = data
} else {
data_all = rbind(data_all, data)
}
}

cat(paste("Downloading monitor metadata...\n"))
#### 4. Downloading monitor metadata file and filter for relevant sites
# Download monitors file
dest_monitors = paste(directory_to_download, "aqs_monitors.zip", sep = "")
if (!file.exists(dest_monitors)) {
download.file(sprintf("%saqs_monitors.zip", url_aqs_download), dest_monitors)
}
# Unzip and read in
unzip(dest_monitors, exdir = directory_to_save)
monitors = read.csv(sprintf("%saqs_monitors.csv", directory_to_save), stringsAsFactors = FALSE)

# Create site identifier
monitors$State.Code = as.numeric(monitors$State.Code) # Convert from string to numeric to drop leading zeros; the NAs introduced come from Canadian monitors whose state code is "CC"
monitors$ID.Monitor = sprintf("%02d-%03d-%04d-%05d-%02d",
monitors$State.Code, monitors$County.Code, monitors$Site.Num, monitors$Parameter.Code, monitors$POC)
# Filter monitors file to include only monitors in our csv
monitors_filter = monitors[which(monitors$ID.Monitor %in% data_all$ID.Monitor),]
#### 5. Writing data to the desired folder
cat(paste("All requested files were downloaded. Writing the cleaned data to ", directory_to_save, "...\n", sep = ""))
write.csv(data_all, paste(directory_to_save, resolution_temporal, "_", parameter_code, "_", year_start, "-", year_end, ".csv", sep = ""))
write.csv(monitors_filter, paste(directory_to_save, "monitors_", parameter_code, "_", year_start, "-", year_end, ".csv", sep = ""))

if (remove_zips) {
cat(paste("Delete zip files ... \n"))
path_zips = list.files(pattern = ".(zip|ZIP)$",
path = directory_to_download,
full.names = TRUE)
for (zipfile in path_zips) {
file.remove(zipfile)
}
}
}
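
A minimal usage sketch, assuming this file has been sourced into an R session and the AQS data mart is reachable; 88101 is the EPA parameter code for PM2.5, matching the default arguments above:

# Download daily PM2.5 (parameter code 88101) data for 2018-2022
# into ./input/aqs/ and remove the zip archives afterwards.
source("R/data_download_functions/download_aqs_data.R")
dir.create("./input/aqs/", recursive = TRUE, showWarnings = FALSE)
download_aqs_data(
  parameter_code = 88101,
  year_start = 2018,
  year_end = 2022,
  resolution_temporal = "daily",
  directory_to_download = "./input/aqs/",
  directory_to_save = "./input/aqs/",
  remove_zips = TRUE
)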
501 changes: 0 additions & 501 deletions R/epa_download.html

This file was deleted.

46 changes: 46 additions & 0 deletions R/preprocessing/filter_minimum_poc.R
@@ -0,0 +1,46 @@

################################################################################
# Date modified: 2023-09-13
# Script description: this function filters the input data.table or tibble
# object by POC (parameter occurrence code), returning, for each site,
# the rows with the minimum POC value. It assumes the per-year daily
# datasets were concatenated in advance.
# Packages required: pacman, data.table, dplyr, rlang, tidytable
################################################################################

################################################################################
#' filter_minimum_poc: filter monitors with the minimum POC value
#'
#' @param input_df data.frame/tbl_df/data.table. Input data containing the site id and POC columns.
#' @param site_id character(1). Name of the site id column (not the monitor id).
#' @param poc_name character(1). Name of the column containing POC values.
#' @author Insang Song
#' @return a data.table object
#' @export
filter_minimum_poc <- function(input_df, site_id, poc_name) {

if (!require(pacman)) {
install.packages('pacman')
library(pacman)
}
p_load(data.table, dplyr, rlang, tidytable)

if (!is(input_df, "data.frame")) {
stop("input_df should be data.frame/tbl_df/data.table.\n")
}
if (!is.character(site_id)) {
stop("site_id should be character.\n")
}
if (!is.character(poc_name)) {
stop("poc_name should be character.\n")
}

poc_filtered = input_df |>
tidytable::group_by(!!sym(site_id)) |>
tidytable::filter(!!sym(poc_name) == min(!!sym(poc_name))) |>
tidytable::ungroup() |>
data.table()
return(poc_filtered)

}
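
A minimal usage sketch, assuming the concatenated daily file written by download_aqs_data() above; the site-level id column name (ID.Site) is hypothetical and is built here because Site.Num alone is not unique across states and counties:

source("R/preprocessing/filter_minimum_poc.R")
# Read the concatenated daily data written by download_aqs_data()
daily <- read.csv("./input/aqs/daily_88101_2018-2022.csv", stringsAsFactors = FALSE)
# Build a site-level identifier (State-County-Site), zero-padded as in download_aqs_data()
daily$ID.Site <- sprintf("%02d-%03d-%04d", daily$State.Code, daily$County.Code, daily$Site.Num)
# Keep, at each site, only the rows from the monitor with the lowest POC
daily_min_poc <- filter_minimum_poc(daily, site_id = "ID.Site", poc_name = "POC")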

Binary file added tools/.DS_Store
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/.DS_Store
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/STFDF_pm25.rds
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/STFDF_pm25_site.rds
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/STVariogram_pm25.rds
Binary file not shown.
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/SharedData_sf.gpkg
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/States_NMonitors.gpkg
Binary file not shown.
Binary file added tools/shiny_explore_pm/data/aqs_monitors.rds
Binary file not shown.