Commit: init

rer145 committed Nov 7, 2023
0 parents commit 93c2f8b
Showing 106 changed files with 21,391 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
_old
77 changes: 77 additions & 0 deletions README.md
@@ -0,0 +1,77 @@
# MaMD Analytical (online version)

An online version of the desktop MaMD Analytical program, developed by Dr. Joe Hefner.

## Projects

### mamd-api-base

The mamd-api-base project builds a Docker container with the packages required for the mamd-api container to run. This saves time when building the mamd-api container, since the required packages do not change often.

This container uses the [rstudio/plumber](https://github.com/rstudio/plumber) package to serve an API for use in the mamd-web project. The API can be called from anywhere, provided the proper parameters are passed in.
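For example, once the container is running (see **Running** under mamd-api below), the endpoint can also be queried from R. This is a minimal sketch, not part of the repository, assuming the `httr` and `jsonlite` packages and the default `localhost:8000` port:

```
library(httr)
library(jsonlite)

# Query the /mamd endpoint with a group list and trait scores
resp <- GET(
  "http://localhost:8000/mamd",
  query = list(
    group_list = "American,African,Asian",
    ANS = 1, INA = 3, IOB = 1, MT = 1, NAW = 1,
    NBC = 1, NO = 1, PBD = 1, PZT = 1, ZS = 1
  )
)

# Parse the JSON body and read the predicted group
result <- fromJSON(content(resp, as = "text", encoding = "UTF-8"))
result$prediction
```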

**Building/Deploying**
```
docker build -t mamd-api-base .
docker tag mamd-api-base rer145/mamd-api-base
docker push rer145/mamd-api-base
```

### mamd-api

The mamd-api project builds a Docker container with the analytical data and the script that performs the analysis. It relies on the mamd-api-base image to set up rstudio/plumber and runs continuously, waiting for requests from the mamd-web project.

**Building/Deploying**
```
docker build -t mamd-api .
docker tag mamd-api rer145/mamd-api
docker push rer145/mamd-api
```

**Running**
```
docker run -it --rm -p 8000:8000 mamd-api /api.R
http://localhost:8000/mamd?group_list=American,African,Asian&ANS=1&INA=3&IOB=1&MT=1&NAW=1&NBC=1&NO=1&PBD=1&PZT=1&ZS=1
```


### mamd-web

The mamd-web project is a web interface for collecting inputs and submitting them to the mamd-api analysis. It presents the same interface as the original desktop application, but updated to call the API and display the returned analysis.

The parameters accepted by the ```/mamd``` endpoint are as follows:

|Name|Type|Description|Example|
|-|-|-|-|
|group_list|```string```|A comma-separated list of groups to include in the analysis|American,African,Asian|
|ANS|```int```|Anterior nasal spine score|1|
|INA|```int```|Inferior nasal aperture score|1|
|IOB|```int```|Interorbital breadth score|1|
|MT|```int```|Malar tubercle score|1|
|NAW|```int```|Nasal aperture width score|1|
|NBC|```int```|Nasal bone contour score|1|
|NO|```int```|Nasal overgrowth score|1|
|PBD|```int```|Postbregmatic depression score|1|
|PZT|```int```|Posterior zygomatic tubercle score|1|
|ZS|```int```|Zygomaticomaxillary suture shape score|1|


The API returns a JSON object with the following values:

|Property Name|Type|Description|Example|
|-|-|-|-|
|prediction|```string```|The predicted group for the submitted trait scores|Asian|
|sensitivity|```decimal```|Sensitivity of the model for the predicted group|0.90|
|specificity|```decimal```|Specificity of the model for the predicted group|0.95|
|probabilities|```decimal```|Posterior probabilities for the groups in the analysis|0.80|
|statistics.accuracy|```decimal```|Overall cross-validated classification accuracy|0.88|
|statistics.accuracyLower|```decimal```|Lower bound of the accuracy confidence interval|0.84|
|statistics.accuracyUpper|```decimal```|Upper bound of the accuracy confidence interval|0.91|
|matrix|```matrix```|Confusion matrix of predicted versus actual groups for the reference data|3x3 matrix|
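
An illustrative response for a request like the one shown above (values are examples only; actual values depend on the selected groups and trait scores):

```
{
  "prediction": "Asian",
  "sensitivity": 0.90,
  "specificity": 0.95,
  "probabilities": 0.80,
  "statistics": {
    "accuracy": 0.88,
    "accuracyLower": 0.84,
    "accuracyUpper": 0.91
  },
  "matrix": [[40, 3, 2], [4, 38, 5], [1, 6, 42]]
}
```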



**Building/Deploying**

This project is built and deployed to AWS using a manually run GitHub Action.
17 changes: 17 additions & 0 deletions mamd-api-base/Dockerfile
@@ -0,0 +1,17 @@
FROM rstudio/plumber
LABEL org.opencontainers.image.authors="Ron Richardson <ron.richardson@gmail.com>"
MAINTAINER "Ron Richardson" ron.richardson@gmail.com

RUN R -e "install.packages('nnet')"
RUN R -e "install.packages('dplyr')"
RUN R -e "install.packages('caret')"
RUN R -e "install.packages('e1071')"
RUN R -e "install.packages('MLmetrics')"
RUN R -e "install.packages('textutils')"


# what about versioning the packages?
# also need to pin rstudio/plumber to a specific version of R

# RUN R -e "install.packages('remotes'); \
# remotes::install_version('tidystringdist', '0.1.2')"
5 changes: 5 additions & 0 deletions mamd-api-base/install_packages.r
@@ -0,0 +1,5 @@
install.packages("nnet")
install.packages("dplyr")
install.packages("caret")
install.packages("e1071")
install.packages("MLmetrics")
install.packages("textutils")  # used by api.R (HTMLencode)
8 changes: 8 additions & 0 deletions mamd-api/Dockerfile
@@ -0,0 +1,8 @@
FROM mamd-api-base
LABEL org.opencontainers.image.authors="Ron Richardson <ron.richardson@gmail.com>"
MAINTAINER "Ron Richardson" ron.richardson@gmail.com

COPY MaMD_1Jan2022_AnalData.csv /mamd.csv
COPY api.R /api.R

# plumber API file copied above (the base image runs the file given here)
CMD ["/api.R"]
7,551 changes: 7,551 additions & 0 deletions mamd-api/MaMD_1Jan2022_AnalData.csv

Large diffs are not rendered by default.

230 changes: 230 additions & 0 deletions mamd-api/api.R
@@ -0,0 +1,230 @@
#* Simple echo
#* @param msg The message to echo
#* @get /echo
function (msg = "") {
list(msg = paste0("The message is: '", msg, "'"))
}

#* @filter cors
cors <- function(req, res) {
res$setHeader("Access-Control-Allow-Origin", "*")

if (req$REQUEST_METHOD == "OPTIONS") {
res$setHeader("Access-Control-Allow-Methods","*")
res$setHeader("Access-Control-Allow-Headers", req$HTTP_ACCESS_CONTROL_REQUEST_HEADERS)
res$status <- 200
return(list())
} else {
plumber::forward()
}
}




# create other endpoints to get plots, ctab, other analysis (load up by cookie of runtime?)
# how to create variable with multiple properties to display as json?
# pull back individual items from ctab
# update openapi docs with full info


#* MaMD Analytical
#* @serializer json
#* @param group_list The list of groups to include
#* @param ANS ANS
#* @param INA INA
#* @param IOB IOB
#* @param MT MT
#* @param NAW NAW
#* @param NBC NBC
#* @param NO NO
#* @param PBD PBD
#* @param PZT PZT
#* @param ZS ZS
#* @get /mamd
function (group_list = "Unknown", ANS = NA, INA = NA, IOB = NA, MT = NA, NAW = NA, NBC = NA, NO = NA, PBD = NA, PZT = NA, ZS = NA) {

suppressPackageStartupMessages(library("nnet"))
suppressPackageStartupMessages(library("dplyr"))
suppressPackageStartupMessages(library("caret"))
suppressPackageStartupMessages(library("e1071"))
suppressPackageStartupMessages(library("MLmetrics"))
suppressPackageStartupMessages(library("textutils"))

inputs_header <- c('Group', 'ANS', 'INA', 'IOB', 'MT', 'NAW', 'NBC', 'NO', 'PBD', 'PZT', 'ZS')
inputs <- data.frame(
group_list,
as.numeric(ANS),
as.numeric(INA),
as.numeric(IOB),
as.numeric(MT),
as.numeric(NAW),
as.numeric(NBC),
as.numeric(NO),
as.numeric(PBD),
as.numeric(PZT),
as.numeric(ZS),
stringsAsFactors = TRUE)
names(inputs) <- inputs_header

# convert groups input to vector
groups <- strsplit(group_list, split=",", fixed=TRUE)[[1]]




aNN_data <- read.csv("/mamd.csv")
GroupCol <- 'Ancestry'

names(aNN_data)[names(aNN_data) == GroupCol] <- 'Group';

# get data from selected groups
aNN_data<-aNN_data[aNN_data$Group %in% groups,] %>% droplevels()
aNN_data = aNN_data[,!sapply(inputs, function(x) mean(is.na(x)))>0.5]

# apply same sapply to inputs to remove NA columns (or not pass in via original inputs file)
aNN_data = na.omit(aNN_data)

aNN_data$Group<-as.factor(aNN_data$Group)
aNN_formula<-as.formula(Group ~ .)



ctrl <- trainControl(
method = "cv",
number = 10,
summaryFunction = multiClassSummary, # Multiple metrics
classProbs = T, # Required for the ROC curves
savePredictions = T, # Required for the ROC curves
## new option here:
sampling = "down");

# For replication
# (I've added the same in all "trains" so you can just run that part independently)
set.seed(150)



fit.NN <- train(
Group ~ ANS+ INA+ IOB+ MT+ NAW+ NBC+ NO+ PBD+ PZT+ ZS,
data = aNN_data,
method = "nnet",
trace = F,
trControl = ctrl,
preProcess = c("center","scale"),
maxit = 250, # Maximum number of iterations
tuneGrid = data.frame('size' = c(3,2,3,4,5,6,7,7,8,9), 'decay' = c(0.1,0,0,0,0.1,0.1,0.1,0,0,0)),
# tuneGrid = data.frame(size = 0, decay = 0),skip=TRUE, # Technically, this is log-reg
metric = "Accuracy");



# f gives posterior probs for SOME of the reference data
f <- fitted(fit.NN); # fitted.values

# this gives posterior probs for ALL of the reference data
ppbs <- predict(fit.NN, type = 'prob');

# get predictions for training / reference data
# for caret-NN
mod <- predict(fit.NN, type="raw");
mod <- as.factor(mod)

# NOTE: switched in original! correct is confusionmatrix(PREDICTED, TRUE)
#ctab<-caret::confusionMatrix(aNN_data$Group, mod)

ctab<-caret::confusionMatrix(mod, aNN_data$Group)
#ctab


fit.NN$bestTune[1,]
RefGrpClassTbl <- cbind(aNN_data['MaMDID'],aNN_data['Group'], mod, ppbs);
names(RefGrpClassTbl)[names(RefGrpClassTbl) == 'mod'] <- 'Into';


############ Predict current case
# type must be "prob" with caret-NN

pred<-predict(fit.NN, newdata=inputs, type=c("prob"))
# not needed with caret-NN
# pred.post<-cbind(fit$xlevels, pred)
pred.post<-as.data.frame(pred, row.names="Posterior Prob") ##Double check here, as this was changed
pred.post$V1<-NULL
pred.post<-format(round(pred,3), nsmall=3)
#pred.post


# Get label of predicted group membership
aNNpred<-colnames(pred)[apply(pred, 1, which.max)]
#aNNpred



# NOTE: sensitivity, specificity, probabilities, statistics, and matrix below are
# hard-coded placeholder values; the commented-out blocks further down sketch how
# the real values (e.g. from ctab$byClass) could be pulled out.
list(
ctab = HTMLencode(ctab),
prediction = aNNpred,
sensitivity = 0,
specificity = 0,
probabilities = c(6,7,8), # { group, probability }
statistics = data.frame(
accuracy = 1,
accuracyLower = 2,
accuracyUpper = 3
),
matrix = matrix(c(1,2,3,4,5,6), nrow=2, ncol=3, byrow=TRUE)
)










# list(
# ctab,
# pred.post,
# aNNpred
# )

# list(
# prediction = trimws(aNNpred),
# sensitivity = trimws(gsub(paste("Class: ", trimws(aNNpred), sep=""), "", ctab$byClass[,"Sensitivity"][paste("Class: ", trimws(aNNpred), sep="")])),
# specificity = trimws(gsub(paste("Class: ", trimws(aNNpred), sep=""), "", ctab$byClass[,"Specificity"][paste("Class: ", trimws(aNNpred), sep="")])),
# )

# results <- cat(ctab, pred.post, aNNpred, sep="\n")
# results




# pred.post
# aNNpred

# results.prediction1 <- pred.post
# results.prediction2 <- aNNpred
# results

# write ctab, pred.post, aNNpred to a file with a cookie timestamp (saved in docker container)
# return back cookie and future calls (plots, data, etc.) use the cookie to lookup the data?



# list(
# arguments = inputs,
# groups = groups)

# results_header <- c('Inputs', 'Groups', 'Value 1', 'Value 2')
# results <- data.frame(
# inputs,
# groups,
# 123,
# "Hello poop",
# stringsAsFactors = TRUE)
# names(results) <- results_header

# results
}