Skip to content

Latest commit

 

History

History
460 lines (354 loc) · 10.8 KB

README.md

File metadata and controls

460 lines (354 loc) · 10.8 KB

R Snippets

General

Some basic hints in the beginning:

  • Use data.table instead of data.frame due to performance reasons. See here for more details.
  • Use explicit function names from packages in the form package::function(param) instead of importing the whole package into the namespace. If you import many packages and their functions, unintended side effects can occur, such as two functions with the same name silently overwriting each other.

Select a subset of a table

## This keeps only the rows with age < 25 and, of those, selects columns 1 to 10:
subset(data, age < 25, select = 1:10)

Or (using data.table):

cols <- c("colname1", "colname2")
## The `..` prefix tells data.table to look up `cols` in the calling scope
## instead of searching for a column that is literally named "cols".
testdata[colname3 < 40, ..cols]

Regression, Machine-Learning

Linear

linear_model <- lm(weight ~ height)

Random Forests

rf_model <- randomForest(weight ~ height)
rf_model2 <- CoreModel(weight ~ height, model = "rf")

k-nearest Neighbors

knn_model <- CoreModel(weight ~ height, model = "knn")

Bayes

bayes_model <- CoreModel(weight ~ height, model = "bayes")
bayes_model2 <- NaiveBayes(weight ~ .)

Simple neural networks

nnet.model <- nnet(weight ~ ., size=10)

Measure time

# Remember the process time at the start
start_time <- proc.time()

# Do stuff here

# Print the time that has passed since the start
proc.time() - start_time

Install TeX

tinytex::install_tinytex(force = TRUE)

Clean the R environment, console and history

# Clean up the backend in RStudio:
cat("\014") # Clears the console (imitates Ctrl + L)
rm(list = ls()) # Clears the Global Environment/variables/data
invisible(gc()) # Garbage collector/Clear unused RAM
# Start coding now:
print("Hello world! =)")

Or:

# install.packages("cleaR")
cleaR::clear()

Determine elements existing in two or more vectors

v1 <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3)
v2 <- c(2, 3, 4, 2, 3)
v3 <- c(1, 1, 1, 2, 2, 2, 9, 4)

## Fold intersect() over the list, so any number of vectors can be compared
vectors_to_compare <- list(v1, v2, v3)
Reduce(f = intersect, x = vectors_to_compare)
## Result:
# [1] 2 4

data.table specific stuff

Change the type of multiple columns

## Example table: integer column `a` (1 to 1e5) and column `b` holding
## upper-case letters sampled with replacement.
data <- data.table::data.table(a = 1:1e5,
                               b = sample(x = LETTERS, size = 1e5, replace = TRUE))

## Convert the columns named in `colnames` to character, one column at a
## time, by writing the converted values back with data.table::set().
## Returns `data`.
test_set <- function(data, colnames) {
  for (column_name in colnames) {
    converted <- as.character(data[[column_name]])
    data.table::set(x = data, j = column_name, value = converted)
  }
  data
}

## Apply as.character() to the columns named in `colnames` via `.SD` and
## return the result of that `j` expression.
test_lapply <- function(data, colnames) {
  data[, lapply(.SD, as.character), .SDcols = colnames]
}

## test_set() overwrites the columns of the table it is given, so each run
## gets a fresh copy of `data`; otherwise only the very first run would
## actually convert the integer column. The copy is created in `setup`,
## which microbenchmark evaluates (untimed) before every run.
microbenchmark::microbenchmark(
  test_set(data = data_copy, colnames = c("a", "b")),
  test_lapply(data = data_copy, colnames = c("a", "b")),
  setup = {
    data_copy <- data.table::copy(data)
  }
)

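In short: I often use dt[, (cols) := lapply(.SD, as.character), .SDcols = cols] with cols <- c("colname1", "colname2"). Note that dt <- dt[, lapply(.SD, as.character), .SDcols = c("colname1", "colname2")] also works, but the result then contains only the converted columns.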

Copy a data.table's structure without its content

data <- data.table::data.table(
  a = 1:10,
  b = sample(x = LETTERS, size = 10, replace = TRUE)
)

## Selecting row 0 returns no rows but keeps the columns
data_structure <- data[0L, ]
# > data_structure
# Empty data.table (0 rows and 2 cols): a,b

Search and replace in all cells in a data.table

dt <- data.table::data.table(
  c1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9),
  c2 = c(3, 4, 5, 6, 7, 8, 9, 1, 2),
  c3 = c(5, 6, 7, 8, 9, 1, 2, 3, 4)
)
dt
#>    c1 c2 c3
#> 1:  1  3  5
#> 2:  2  4  6
#> 3:  3  5  7
#> 4:  4  6  8
#> 5:  5  7  9
#> 6:  6  8  1
#> 7:  7  9  2
#> 8:  8  1  3
#> 9:  9  2  4

## Option 1:
## Change all cells containing '2' as value to 'NA'.
## Every column is visited once; which() yields the row numbers whose value
## equals 2, and data.table::set() overwrites exactly those cells with NA.
for (col in names(dt)) {
  data.table::set(
    x = dt,
    i = which(dt[[col]] == 2),
    j = col,
    value = NA
  )
}
dt
#>    c1 c2 c3
#> 1:  1  3  5
#> 2: NA  4  6
#> 3:  3  5  7
#> 4:  4  6  8
#> 5:  5  7  9
#> 6:  6  8  1
#> 7:  7  9 NA
#> 8:  8  1  3
#> 9:  9 NA  4

## Option 2 (thanks to @kapsner):
## Change all cells containing '3' as value to 'NA'
dt[dt == 3] <- NA
dt
#>    c1 c2 c3
#> 1:  1 NA  5
#> 2: NA  4  6
#> 3: NA  5  7
#> 4:  4  6  8
#> 5:  5  7  9
#> 6:  6  8  1
#> 7:  7  9 NA
#> 8:  8  1 NA
#> 9:  9 NA  4

Created on 2022-01-19 by the reprex package (v2.0.1)

Thanks to @kapsner for the inspiration of option 2! 👍

Add multiple columns to a data.table using the := operator

dt <- data.table::data.table(c1 = c(1, 2, 3),
                             c2 = c(3, 4, 5),
                             c3 = c(5, 6, 7))
dt
#>    c1 c2 c3
#> 1:  1  3  5
#> 2:  2  4  6
#> 3:  3  5  7


## Option 1:
## Character vector of new column names on the left of `:=`,
## list with one value per new column on the right.
dt[, c("newcol_1", "newcol_2") := list("value_for_col_1", "value_for_col_2")]
dt
#>    c1 c2 c3        newcol_1        newcol_2
#> 1:  1  3  5 value_for_col_1 value_for_col_2
#> 2:  2  4  6 value_for_col_1 value_for_col_2
#> 3:  3  5  7 value_for_col_1 value_for_col_2


## Option 2:
## Functional form of `:=`: each argument is `new_column_name = value`.
dt[, `:=`(avg = mean(c1),
          med = median(c1),
          min = min(c1))]
dt
#>    c1 c2 c3        newcol_1        newcol_2 avg med min
#> 1:  1  3  5 value_for_col_1 value_for_col_2   2   2   1
#> 2:  2  4  6 value_for_col_1 value_for_col_2   2   2   1
#> 3:  3  5  7 value_for_col_1 value_for_col_2   2   2   1

Created on 2021-09-15 by the reprex package (v2.0.1)

Remove all columns with no content

💡 Corresponding to a comment on Stack Overflow:

dt <- data.table::data.table(
  col_1 = c(1,2,3),
  col_2 = NA,
  col_3 = NA,
  col_4 = c(9,8,7)
)
dt
#>    col_1 col_2 col_3 col_4
#> 1:     1    NA    NA     9
#> 2:     2    NA    NA     8
#> 3:     3    NA    NA     7

## A column counts as empty when every value in it is NA.
## vapply() always returns a logical vector; sapply() would return an empty
## list for a table without columns, which which() cannot handle.
is_empty_col <- vapply(dt, function(x) all(is.na(x)), logical(1L))
names_of_empty_cols <- names(which(is_empty_col))
# or
# names_of_empty_cols <- dt[, names(which(!colSums(!is.na(.SD))))]
names_of_empty_cols
#> [1] "col_2" "col_3"

dt[, (names_of_empty_cols) := NULL]
dt
#>    col_1 col_4
#> 1:     1     9
#> 2:     2     8
#> 3:     3     7

Created on 2021-11-04 by the reprex package (v2.0.1)

Assign by value/reference

dt <- data.table::data.table(
  a = c(1:4),
  b = c(1:4),
  c = c(1:4)
)

## Plain assignment of names(dt) vs. an explicit copy made with
## rlang::duplicate(); the outputs below show how they differ once a column
## is added to dt.
(colnames_dynamic <- names(dt))
#> [1] "a" "b" "c"
(colnames_static <- rlang::duplicate(names(dt)))
#> [1] "a" "b" "c"

## Add new column to dt:
dt[, "x" := c(11:14)]

## `colnames_dynamic` now also changed (assigned by reference):
colnames_dynamic
#> [1] "a" "b" "c" "x"

## `colnames_static` is still unchanged (assigned by value):
colnames_static
#> [1] "a" "b" "c"

Created on 2022-01-12 by the reprex package (v2.0.1)

Keep first/n rows by group

(dx <-
  data.frame(
    ID = factor(c(1, 1, 2, 2, 3, 3)),
    AGE = c(30, 30, 40, 40, 35, 35),
    FEM = factor(c(1, 1, 0, 0, 1, 1))
  ))
#>   ID AGE FEM
#> 1  1  30   1
#> 2  1  30   1
#> 3  2  40   0
#> 4  2  40   0
#> 5  3  35   1
#> 6  3  35   1

## Key the table by ID, then take the first row of each ID group
dxt <- data.table::data.table(dx, key = "ID")
dxt[, .SD[1L, ], by = ID]
#>    ID AGE FEM
#> 1:  1  30   1
#> 2:  2  40   0
#> 3:  3  35   1

Created on 2022-01-18 by the reprex package (v2.0.1)

Source: https://stats.stackexchange.com/a/7886

Apply CRAN checks locally to current package

rcmdcheck::rcmdcheck(args = "--as-cran")

Xaringan (Presentations with R)

Some public templates

Title Slides Code
Xaringan example - RU template Slides Code

Formatting slides

Action Command
C

classes

The classes available for vertically aligning text are:

  • top (default)
  • middle
  • bottom

The classes available for horizontally aligning text are:

  • left (default)
  • center
  • right

Other classes:

  • inverse (inverse colors - black background and white font)

(Source)

background-image

background-image: url(image.jpg)
background-position: center;
background-repeat: no-repeat;
background-size: contain/cover;

CRAN releases

## Apply CRAN checks to local package before submitting to CRAN:
rcmdcheck::rcmdcheck(args = "--as-cran")

## Submit to CRAN:
devtools::submit_cran()

Quarto

:info: See here: [../quarto.md](../quarto.md)

Inspect a data.table or data.frame object

dt |> skimr::skim()

Special characters

## Protected space:
&#160;

## Registered trademark (r):
&#174;

Split vector x in chunks

(Source)

... with maximum chunksize max_size

## Consecutive runs of `max_size` positions share one chunk number
chunk_id <- ceiling(seq_along(x) / max_size)
split(x, chunk_id)

... into n chunks of (nearly) equal size

## cut() bins the positions of `x` into `n` intervals; the integer bin
## codes (labels = FALSE) serve as the grouping for split()
chunk_id <- cut(seq_along(x), breaks = n, labels = FALSE)
split(x, chunk_id)