QuickFix_Visualization.Rmd

---
title: "Reproducibility Dashboard"
date: "Last modified: `r Sys.Date()`"
output: 
  flexdashboard::flex_dashboard:
    orientation: rows
    vertical_layout: scroll
editor_options: 
  
  
  chunk_output_type: console
  
knit: (function(input, ...) {
    rmarkdown::render(
      input,
      output_file = 'index.html',
      envir = globalenv()
    )
  })

---

```{r setup_02, include=FALSE}
# Start from an empty workspace. The `knit` hook in the YAML header renders
# this document in the global environment, so objects left over from an
# interactive session would otherwise be visible to the chunks below.
rm(list=ls())

library(flexdashboard) # https://pkgs.rstudio.com/flexdashboard/
library(shiny)
library(tidyverse)
library(stringdist)       
library(readr)
library(knitr)
library(viridisLite)
library(highcharter)
library(plotly)
library(ggpubr)

#knit("./01-data-filtering.Rmd")
# Run the cleaning script before any chunk uses its objects (presumably it
# creates the data frames referenced below, e.g. `dis_df` -- confirm against
# the script). `here` is never attached with library() in this file, so the
# function is called through its namespace to avoid a "could not find
# function" error in a fresh session.
source(here::here("./QuickFix_Cleaning.R"))
```

## Column

### Reproductions

```{r number of reproductions}
# Headline count of reproduction attempts: the number of distinct
# `reproduction_id` values in `tidy_all_paper_df`.
# (Original note: counts are not based on display items because some have all
# NA repro scores.)
reproductions <- tidy_all_paper_df$reproduction_id %>%
  unique() %>%
  length()

valueBox(value = reproductions, icon = "fa-pencil")
```

### Papers

```{r number of papers}
# Headline count of papers: the number of distinct DOIs in
# `tidy_all_paper_df`.
papers <- tidy_all_paper_df$doi %>%
  unique() %>%
  length()

valueBox(value = papers, icon = "fa-file")
```

### Claims

```{r number of claims}
# Headline count of claims. A claim is identified by the pair
# (reproduction_id, claim_N). Distinct pairs are counted directly rather than
# pasting the two values into a single key: without a separator, different
# pairs can collapse into the same string (e.g. "1" + "12" and "11" + "2" both
# give "112") and would be under-counted.
claims <- tidy_claim_df %>%
  distinct(reproduction_id, claim_N) %>%
  nrow()

valueBox(claims, icon = "fa-comment")
```


## Row {.tabset data-height="600"}

```{r histogram function}
# Build a paper-level histogram of reproducibility scores.
#
# The data frame is grouped by `vars`, `repro_score` is summarised within the
# finest grouping, and the result is then re-summarised one grouping level at
# a time until only the first (coarsest) variable in `vars` is left.
#
# Arguments:
#   df    Data frame containing a `repro_score` column and every column named
#         in `vars`.
#   vars  Character vector of grouping variables, ordered from coarsest (i.e.
#         smallest number of unique values) to finest (i.e. largest number of
#         unique values).
#   fns   List of aggregation functions (e.g. mean) in the order they should be
#         applied: fns[[1]] within the finest grouping, fns[[2]] one level up,
#         and so on. One function is needed per element of `vars`.
#   w     Weights. The value is forwarded unchanged as the argument `w` of
#         every function in `fns` (default NULL).
#         NOTE(review): it is passed as-is, not looked up as a column of `df`;
#         confirm the intended behaviour before using a non-NULL value.
#
# Returns a ggplot histogram (bin width 1) of the paper-level score.

paper_hist <- function(df, vars, fns, w = NULL) {
  # Summarise within the finest grouping. "drop_last" removes the last
  # grouping variable, so the result stays grouped by the coarser ones.
  stat1 <- df %>%
    group_by(across(all_of(vars))) %>%
    summarise(paper_level = fns[[1]](repro_score, w = w),
              .groups = "drop_last")

  # Every further pass aggregates over the current finest grouping variable
  # and drops it, until the data are summarised by vars[1] only.
  if (length(vars) > 1) {
    for (i in seq_len(length(vars) - 1)) {
      stat1 <- stat1 %>%
        summarise(paper_level = fns[[i + 1]](paper_level, w = w),
                  .groups = "drop_last")
    }
  }

  # Largest number of papers sharing one (floored) score level; used to leave
  # headroom above the tallest bar. Only the count column is inspected:
  # taking max() of the whole summary table would also compare against the
  # score levels themselves. Missing scores are not drawn, so they are not
  # counted either.
  # NOTE(review): levels are formed with floor(), which need not coincide with
  # the histogram's own bin edges for non-integer scores.
  level_max <- stat1 %>%
    ungroup() %>%
    filter(!is.na(paper_level)) %>%
    count(level = floor(paper_level)) %>%
    pull(n) %>%
    max()

  out <- stat1 %>%
    ggplot(aes(x = paper_level)) +
    geom_histogram(binwidth = 1,
                   color = "black",
                   fill = "gray") +
    coord_cartesian(ylim = c(0, 1.1 * level_max)) +
    scale_x_continuous(breaks = 1:10) +
    theme(panel.background = element_rect(fill = "white"),
          panel.grid.major.y = element_line(color = "grey80"),
          panel.grid.minor.y = element_line(color = "grey80"),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          plot.title = element_text(size = 13),
          plot.caption = element_text(hjust = 0))
  out
}
```


```{r}
# Re-flow a label so that it fits the current graphics device.
#
# Arguments:
#   label       Text to wrap.
#   dev_width   Device width in inches (defaults to the width of the active
#               device).
#   dev_scaler  Multiplier applied to `dev_width` to obtain the wrapping width
#               handed to strwrap().
#
# Returns a single string whose wrapped lines are joined by "\n".
wrapper <- function(label, dev_width = dev.size("in")[1], dev_scaler = 18) {
  wrap_width <- dev_width * dev_scaler
  wrapped_lines <- strwrap(label, wrap_width)
  paste(wrapped_lines, collapse = "\n")
}
```

```{r}
# Main histogram: scores are averaged within each reproduction attempt and
# then across the attempts of the same paper.
main_hist <- paper_hist(
  df = dis_df,
  vars = c("similar_title", "reproduction_id"),
  fns = list(mean, mean)
) +
  labs(
    title = "Distribution of Reproduction Scores: Paper Level",
    x = "Reproducibility Score",
    y = "Count"
  )

# Explanatory paragraph displayed next to the histogram; the top margin pushes
# the text down inside its panel.
main_hist_text <- ggparagraph(
  text = "In this figure, the reproducibility score is calculated as a simple mean of reproducibility scores across display items reproduced, first within each reproduction attempt, and then, when applicable, across reproductions. This graph excludes cases whereby the reproduction attempts were abandoned, due to lack of materials required for computational reproduction (i.e. no data or code)",
  face = "plain",
  size = 8
) +
  theme(plot.margin = unit(c(t = 10, r = 0, b = 0, l = 0), "lines"))
```

```{r}
# Abandoned reproduction attempts carry no display-item scores, so each one is
# given a score of 1 here.
abandoned_scores <- abandoned_df %>%
  transmute(similar_title, reproduction_id, repro_score = 1)

# Stack the scored display items on top of the abandoned attempts.
dis_aband <- rbind.data.frame(
  dis_df %>% select(similar_title, reproduction_id, repro_score),
  abandoned_scores
)

# Same two-stage mean as the main histogram, now including abandoned attempts.
abandoned_hist <- paper_hist(
  df = dis_aband,
  vars = c("similar_title", "reproduction_id"),
  fns = list(mean, mean)
) +
  labs(
    title = "Distribution of Reproduction Scores: Paper Level",
    x = "Reproducibility Score",
    y = "Count"
  )

# Explanatory paragraph displayed next to the histogram.
abandoned_hist_text <- ggparagraph(
  text = "In this figure, the reproducibility score is calculated as a simple mean of reproducibility scores across display items reproduced, first within each reproduction attempt, and then, when applicable, across reproductions. In this graph, abandoned reproduction attempts (papers identified as not computationally reproducible) are assigned the lowest score of 1.",
  face = "plain",
  size = 8
) +
  theme(plot.margin = unit(c(t = 11, r = 0, b = 0, l = 0), "lines"))

```

```{r}
# Histogram of the highest score recorded for each paper.
max_hist <- paper_hist(
  df = dis_df,
  vars = c("similar_title"),
  fns = list(max)
) +
  labs(
    title = "Distribution of Maximum Reproduction Scores: Paper Level",
    x = "Reproducibility Score",
    y = "Count"
  )

# Explanatory paragraph displayed next to the histogram.
max_hist_text <- ggparagraph(
  text = "In this figure, the reproducibility score is the maximum reproducibility scores assigned across the display items examined within the reproduction attempts.",
  face = "plain",
  size = 8
) +
  theme(plot.margin = unit(c(t = 15.5, r = 0, b = 0, l = 0), "lines"))
```

```{r}
# Histogram of the lowest score recorded for each paper.
min_hist <- paper_hist(
  df = dis_df,
  vars = c("similar_title"),
  fns = list(min)
) +
  labs(
    title = "Distribution of Minimum Reproduction Scores: Paper Level",
    x = "Reproducibility Score",
    y = "Count"
  )

# Explanatory paragraph displayed next to the histogram.
min_hist_text <- ggparagraph(
  text = "In this figure, the reproducibility score is the minimum reproducibility scores assigned across the display items examined within the reproduction attempts.",
  face = "plain",
  size = 8
) +
  theme(plot.margin = unit(c(t = 15.5, r = 0, b = 0, l = 0), "lines"))
```

```{r}
# Put the four histograms on a common y-axis so the tabs are directly
# comparable: read the upper end of each plot's y range, take the largest, and
# apply it to every plot.
hists <- list(main_hist, abandoned_hist, max_hist, min_hist)
# vapply() guarantees one numeric value per plot; sapply() would silently
# return a list if any plot had no y range.
y_tops <- vapply(
  hists,
  function(x) layer_scales(x)$y$range$range[2],
  numeric(1)
)
y_top <- max(y_tops)
# Adding a new coord_cartesian() replaces the one set inside paper_hist().
hists <- lapply(hists, function(x) x + coord_cartesian(ylim = c(0, y_top)))
```

### Levels of Reproducibility
```{r}
# Three columns: histogram, a narrow placeholder column, explanatory text.
ggarrange(
  hists[[1]], '', main_hist_text,
  ncol = 3,
  widths = c(7, 0.25, 2.75)
)
```


### Abandoned Included
```{r}
# Three columns: histogram, a narrow placeholder column, explanatory text.
ggarrange(
  hists[[2]], '', abandoned_hist_text,
  ncol = 3,
  widths = c(7, 0.25, 2.75)
)

```

### Max within paper
```{r}
# Three columns: histogram, a narrow placeholder column, explanatory text.
ggarrange(
  hists[[3]], '', max_hist_text,
  ncol = 3,
  widths = c(7, 0.25, 2.75)
)

```

### Min within paper
```{r}
# Three columns: histogram, a narrow placeholder column, explanatory text.
ggarrange(
  hists[[4]], '', min_hist_text,
  ncol = 3,
  widths = c(7, 0.25, 2.75)
)

```

### Summary of Levels
<br>
<br>

| Level | Short summary                      |
|:-----:|:----------------------------------:|
| 1     | No Data or Code                    |
| 2     | Only Code                          |
| 3     | Partial analysis data & code       |
| 4     | All analysis data & code           |
| 5     | Reproducible from analysis data    |
| 6     | All cleaning code + L4             |
| 7     | Some raw data     + L6             |
| 8     | All cleaning code and raw data     |
| 9     | Reproducible from analysis  + L8   |
| 10    | Reproducible from raw data         |

For a more comprehensive description of the levels of computational reproducibility, see the [ACRe Guide](https://bitss.github.io/ACRE/assessment.html#levels-of-computational-reproducibility-for-a-specific-display-item).

## Row {data-height="500"}

### Reproducers Around the World

```{r map, fig.width = 15, fig.height = 7.6, fig.retina = 2}
# Interactive world map of the number of reproductions per country.
# The chunk options use knitr's dotted names (fig.width, fig.height,
# fig.retina); underscore spellings are not knitr options and are ignored.
library(rnaturalearth)
# NOTE(review): no rgeos function is called directly in this chunk; confirm
# the package is still required before keeping this dependency.
library(rgeos)
library(RColorBrewer)

#get number of reproductions by country
# One row per reproduction (the first display item of each), then a count of
# reproductions per country code. The code column is renamed to `postal` so it
# can be joined to the map data below.
country_counts <- dis_df %>%
  group_by(reproduction_id) %>%
  slice(1) %>%
  group_by(country_code) %>%
  count() %>%
  rename(postal = country_code)

world <- ne_countries(returnclass = "sf", scale = "small") %>%
  filter(admin != "Antarctica")
#merge to the world data
# Countries without any reproduction get a count of 0.
# NOTE(review): assumes `country_code` uses the same coding as the map's
# `postal` column -- confirm against the cleaning script.
world <- left_join(world, country_counts, by = c("postal")) %>%
  mutate(n = ifelse(is.na(n), 0, n))

# Make bins
# Quartiles of the distinct counts are used as bin edges.
breaks <- quantile(unique(world$n),
                   probs = seq(0, 1, 0.25))

world$breaks <- cut(world$n,
                    breaks = breaks,
                    include.lowest = TRUE,
                    labels = FALSE)
#world$breaks <- as.character(world$breaks)

# Hover text is built before the zero counts are blanked out, so countries
# without reproductions still show "Reproductions: 0".
world$text <- paste0(world$admin, "\nReproductions: ", world$n)
world$n[world$n == 0] <- NA # To make them stand out on map

# light grey boundaries
# NOTE(review): `l` and `g` are not used further down in this chunk.
l <- list(color = toRGB("grey"), width = 0.5)
g <- list(
  showframe = FALSE,
  showcoastlines = FALSE,
  projection = list(type = 'Mercator')
)

# `text` is not a ggplot2 aesthetic; it is picked up by ggplotly() for the
# tooltip.
fig <- ggplot(data = world) +
  geom_sf(aes(fill = n, text = text, color = admin),
          size = 0, color = "white") +
  scale_fill_distiller(palette = "YlGnBu", direction = 1,
                       name = "Reproductions") +
  theme(panel.background = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()
        )

# Workaround for getting hover over polygons:
# https://github.com/plotly/plotly.R/issues/1641#issuecomment-550477069

fig <- ggplotly(fig, tooltip = "text")

# Restyle every trace except the first so that hovering reacts to the filled
# polygons rather than to their outlines.
fig <- fig %>%
  style(hoverlabel = list(bgcolor = "white"),
        hoveron = "fills",
        line.color = "white", line.width = 0.25,
        traces = seq.int(2, length(fig$x$data))
        ) %>%
  hide_legend() %>%
  layout(xaxis = list(autorange = TRUE),
         yaxis = list(autorange = TRUE))

fig
```


<!--
### Universities  

| University | Number of Reproductions |
|:-------------------:|:------:|
| UC Berkeley         | `r reproductions`      |
|                     |        |
|                     |        |

-->