Introduction to R

Comment

help.start()
help(lm)
?lm
example(lm)
?help

# this is a comment
A = 5; a = 10 # R is case sensitive
print(paste("A is", A))
print(paste("a is", a))
cat("A and a are equal? = ", A == a)
cat("My name is", Sys.info()["user"])

[1] "A is 5"

[1] "a is 10"

A and a are equal? =  FALSE

My name is isezen

?print; ?paste; ?cat

ls() # list objects in the current session

[1] "a" "A"

rm(a) # remove object named a
ls() # list again to see what we have

[1] "A"

print(A) # print A to the console

[1] 5

?ls; ?rm

# Assignment
x <- c(10.4, 5.6, 3.1, 6.4, 21.7)
assign("x", c(10.4, 5.6, 3.1, 6.4, 21.7))
c(10.4, 5.6, 3.1, 6.4, 21.7) -> x
y <- c(x, 0, x) # c is abbreviation for combine
print(y)

 [1] 10.4  5.6  3.1  6.4 21.7  0.0 10.4  5.6  3.1  6.4 21.7

1/x

[1] 0.09615385 0.17857143 0.32258065 0.15625000 0.04608295

?c; ?assign

x ^ 2 # take the square
sqrt(y) # square root
x/y
v <- 2*x + y + 1
length(v) # what is the length of v? why?

??"arithmetic operations"; ?log; ?exp; ?sin; ?cos; ?tan; ?sqrt

sum(x) # sum of values in x vector
sum(x)/length(x) # calculate mean
mean(x) # easier mean calculation
min(x); max(x)

?var; ?sd; ?range; ?sort; ?order; ?mean; ?sum; ?summary; ?abs

5:17

 [1]  5  6  7  8  9 10 11 12 13 14 15 16 17

seq(-5, 5, by = 0.2)
seq(length = 51, from = -5, by = 0.2)

 [1] -5.0 -4.8 -4.6 -4.4 -4.2 -4.0 -3.8 -3.6 -3.4 -3.2 -3.0 -2.8 -2.6 -2.4 -2.2
[16] -2.0 -1.8 -1.6 -1.4 -1.2 -1.0 -0.8 -0.6 -0.4 -0.2  0.0  0.2  0.4  0.6  0.8
[31]  1.0  1.2  1.4  1.6  1.8  2.0  2.2  2.4  2.6  2.8  3.0  3.2  3.4  3.6  3.8
[46]  4.0  4.2  4.4  4.6  4.8  5.0

?seq

rep(x, times = 5)
rep(x, each = 5)

 [1] 10.4  5.6  3.1  6.4 21.7 10.4  5.6  3.1  6.4 21.7 10.4  5.6  3.1  6.4 21.7
[16] 10.4  5.6  3.1  6.4 21.7 10.4  5.6  3.1  6.4 21.7

 [1] 10.4 10.4 10.4 10.4 10.4  5.6  5.6  5.6  5.6  5.6  3.1  3.1  3.1  3.1  3.1
[16]  6.4  6.4  6.4  6.4  6.4 21.7 21.7 21.7 21.7 21.7

?rep

5 > 10
x > 13
as.numeric(x > 13)

[1] FALSE

[1] FALSE FALSE FALSE FALSE  TRUE

[1] 0 0 0 0 1

?"Comparison"

z <- c(1:3, NA) # a vector contains an NA
is.na(z) # which element(s) of z is NA?

[1] FALSE FALSE FALSE  TRUE

z == NA # wrong way!

[1] NA NA NA NA

0/0 # meaningless

[1] NaN

?is.na; ?is.finite

labs <- paste(c("X","Y"), 1:10, sep="")
print(labs)

 [1] "X1"  "Y2"  "X3"  "Y4"  "X5"  "Y6"  "X7"  "Y8"  "X9"  "Y10"

?paste; ?paste0

x[3] <- NA # set 3rd element of x to NA
print(x)

[1] 10.4  5.6   NA  6.4 21.7

!is.na(x) # The ones that are not NA

[1]  TRUE  TRUE FALSE  TRUE  TRUE

non_na_x <- x[!is.na(x)] #non-NA values of x
print(non_na_x)

[1] 10.4  5.6  6.4 21.7

?`[[`

# create a random integer array to
# represent month
set.seed(2)
month <- round(runif(30, 1, 12))
(char_month <- month.abb[month]) # get month names

 [1] "Mar" "Sep" "Jul" "Mar" "Nov" "Nov" "Feb" "Oct" "Jun" "Jul" "Jul" "Apr"
[13] "Sep" "Mar" "May" "Oct" "Dec" "Mar" "Jun" "Feb" "Aug" "May" "Oct" "Mar"
[25] "May" "Jun" "Mar" "May" "Dec" "Feb"

which(char_month == "Jun") # Which are June?

[1]  9 19 26

which(month == 6) # same as above

[1]  9 19 26

?set.seed; ?runif; ?round; ?which; ?which.max; ?which.min

(char_month[1:12]) # select first 12 months

 [1] "Mar" "Sep" "Jul" "Mar" "Nov" "Nov" "Feb" "Oct" "Jun" "Jul" "Jul" "Apr"

(char_month[-(1:20)]) # exclude first 20

 [1] "Aug" "May" "Oct" "Mar" "May" "Jun" "Mar" "May" "Dec" "Feb"

# exclude June from vector
# A logical mask is used rather than -which(month == 6): if no element
# matched, which() would return integer(0) and negative indexing with it
# would select nothing instead of keeping every element.
(char_month[month != 6])

 [1] "Mar" "Sep" "Jul" "Mar" "Nov" "Nov" "Feb" "Oct" "Jul" "Jul" "Apr" "Sep"
[13] "Mar" "May" "Oct" "Dec" "Mar" "Feb" "Aug" "May" "Oct" "Mar" "May" "Mar"
[25] "May" "Dec" "Feb"

# use months and month names together
# set names of month
names(month) <- char_month
print(month)

Mar Sep Jul Mar Nov Nov Feb Oct Jun Jul Jul Apr Sep Mar May Oct Dec Mar Jun Feb 
  3   9   7   3  11  11   2  10   6   7   7   4   9   3   5  10  12   3   6   2 
Aug May Oct Mar May Jun Mar May Dec Feb 
  8   5  10   3   5   6   3   5  12   2 

(month[(month == 6)])

Jun Jun Jun 
  6   6   6 

?names; ?colnames; ?rownames

# also try byrow = TRUE.
(m <- matrix(1:20, nrow = 5, ncol = 4))

     [,1] [,2] [,3] [,4]
[1,]    1    6   11   16
[2,]    2    7   12   17
[3,]    3    8   13   18
[4,]    4    9   14   19
[5,]    5   10   15   20

?matrix; ?factor; ?list; ?data.frame

(z <- 0:9)

 [1] 0 1 2 3 4 5 6 7 8 9

(digits <- as.character(z))

 [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9"

?as.numeric; ?as.character; ?as.logical; ?as.matrix

digit <- 0:9
name <- c("zero", "one", "two", "three",
          "four", "five", "six", "seven",
          "eight", "nine")
df <- data.frame(digit, name)
head(df)

  digit  name
1     0  zero
2     1   one
3     2   two
4     3 three
5     4  four
6     5  five

str(df)

'data.frame':   10 obs. of  2 variables:
 $ digit: int  0 1 2 3 4 5 6 7 8 9
 $ name : chr  "zero" "one" "two" "three" ...

?str; ? head; ?tail

options(digits = 3) # print only 3 digits
set.seed(1) # fixed randomization for rnorm
# simulate pm10 distribution: 10 raised to a normally distributed exponent
pm10 <- 10 ^ rnorm(100, 1.6, 0.27)
pm10 <- pm10[pm10 > 40] # keep only values higher than 40 ug/m^3
set.seed(1) # fixed randomization for sample
regions <- c("mar", "ege", "kdz", "ica", "akd", "dga", "gda")
# Sample the region labels directly and fix the level set and order with
# `levels = regions`. Sampling integer codes and then overwriting levels()
# positionally fails (or mislabels) whenever a region is never drawn.
# sample() draws the same indices either way, so the result is unchanged.
reg <- factor(sample(regions, length(pm10), replace = TRUE), levels = regions)
head(data.frame(reg, pm10))

  reg  pm10
1 mar  44.6
2 ica 107.3
3 gda  48.9
4 mar  53.9
5 ege  63.0
6 akd  56.9

tapply(pm10, reg, mean) # means by region

 mar  ege  kdz  ica  akd  dga  gda 
71.7 65.9 82.1 59.8 59.4 76.2 71.5 

?options; ?set.seed; ?runif; ?rnorm; ?sample; ?round; ?levels; ?tapply

arr3d <- array(1:24, dim = c(4, 3, 2),
                dimnames = list(
                  c("one", "two", "three", "four"),
                  c("ein", "zwei", "drei"),
                  c("un", "deux")))
mat <- matrix(1:12, nrow = 4, byrow = TRUE,
              dimnames = list(
                c("one", "two", "three", "four"),
                c("ein", "zwei", "drei")))

class(arr3d); class(mat) # class of object
length(arr3d); length(mat) # length of object
dim(arr3d); dim(mat) # dimensions
nrow(arr3d); nrow(mat) # number of rows
ncol(arr3d); ncol(mat) # number of columns
rownames(arr3d); rownames(mat)
colnames(arr3d); colnames(mat)
dimnames(arr3d); dimnames(mat)

?matrix; ?array; ?class; ?dim; ?nrow; ?ncol; ?rownames; ?colnames

mat[1:2, 2:3]

    zwei drei
one    2    3
two    5    6

mat[,2]

  one   two three  four 
    2     5     8    11 

mat[,"zwei"]

  one   two three  four 
    2     5     8    11 

class(mat[,2])

[1] "integer"

class(mat[, 2, drop = FALSE]) # drop = FALSE keeps a one-column matrix

[1] "matrix" "array" 

?drop

mat2 <- matrix(seq.int(2, 24, 2), nrow = 4,
               dimnames = list(
                 c("five", "six", "seven", "eight"),
                 c("vier", "funf", "sechs")))
rbind(mat, mat2)

      ein zwei drei
one     1    2    3
two     4    5    6
three   7    8    9
four   10   11   12
five    2   10   18
six     4   12   20
seven   6   14   22
eight   8   16   24

cbind(mat, mat2)

      ein zwei drei vier funf sechs
one     1    2    3    2   10    18
two     4    5    6    4   12    20
three   7    8    9    6   14    22
four   10   11   12    8   16    24

?rbind; ?cbind; ?c

mat + mat2
mat %*% c(1,2,3) # matrix multiplication
mat / mat2
t(mat) # transpose of the matrix
diag(mat)
diag(mat) <- 0 # set diagonal to zero

      ein zwei drei
one     3   12   21
two     8   17   26
three  13   22   31
four   18   27   36

      [,1]
one     14
two     32
three   50
four    68

       ein  zwei  drei
one   0.50 0.200 0.167
two   1.00 0.417 0.300
three 1.17 0.571 0.409
four  1.25 0.688 0.500

?t; ?aperm; ?diag; ?`%*%`; ?`%o%`

Lst <- list(name = "John", wife = "Mary", no.children = 3,
            child.ages = c(4,7,9))
Lst$name # equal to Lst[[1]]
Lst[[4]] # equal to Lst$child.ages
Lst[["wife"]] # same as Lst$wife
names(Lst)
str(Lst)

[1] "John"

[1] 4 7 9

[1] "Mary"

[1] "name"        "wife"        "no.children" "child.ages" 

List of 4
 $ name       : chr "John"
 $ wife       : chr "Mary"
 $ no.children: num 3
 $ child.ages : num [1:3] 4 7 9

?list; ?names; ?str

Lst <- list(pm10 = pm10, region = reg)
str(Lst)
head(as.data.frame(Lst))

List of 2
 $ pm10  : num [1:53] 44.6 107.3 48.9 53.9 63 ...
 $ region: Factor w/ 7 levels "mar","ege","kdz",..: 1 4 7 1 2 5 7 3 6 2 ...

   pm10 region
1  44.6    mar
2 107.3    ica
3  48.9    gda
4  53.9    mar
5  63.0    ege
6  56.9    akd

?list; ?str; ?as.data.frame

# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
list.A <- list(name = "John", married = TRUE, child.count = 3)
list.B <- list(name = "Jenny", married = FALSE)
# c() concatenates the lists element by element; duplicate names are kept
Lst <- c(list.A, list.B)
str(Lst)
head(as.data.frame(Lst))
Lst$name # which one? John or Jenny

List of 5
 $ name       : chr "John"
 $ married    : logi TRUE
 $ child.count: num 3
 $ name       : chr "Jenny"
 $ married    : logi FALSE

  name married child.count name.1 married.1
1 John    TRUE           3  Jenny     FALSE

[1] "John"

df <- data.frame(pm10 = pm10, region = reg)
str(df)
head(df)

'data.frame':   53 obs. of  2 variables:
 $ pm10  : num  44.6 107.3 48.9 53.9 63 ...
 $ region: Factor w/ 7 levels "mar","ege","kdz",..: 1 4 7 1 2 5 7 3 6 2 ...

   pm10 region
1  44.6    mar
2 107.3    ica
3  48.9    gda
4  53.9    mar
5  63.0    ege
6  56.9    akd

?list; ?str; ?as.data.frame

dt <- read.table("data.txt")
class(dt);str(dt)

[1] "data.frame"

'data.frame':   9 obs. of  5 variables:
 $ V1: int  100 200 300 400 500 600 700 800 900
 $ V2: chr  "a1" "a2" "a3" "a4" ...
 $ V3: chr  "b1" "b2" "b3" "b4" ...
 $ V4: logi  TRUE TRUE FALSE FALSE FALSE TRUE ...
 $ V5: chr  "x" "x" "x" "y" ...

head(dt)

   V1 V2 V3    V4 V5
1 100 a1 b1  TRUE  x
2 200 a2 b2  TRUE  x
3 300 a3 b3 FALSE  x
4 400 a4 b4 FALSE  y
5 500 a5 b5 FALSE  y
6 600 a6 b6  TRUE  y

?read.table

dt <- read.table("data.txt", stringsAsFactors = TRUE)
class(dt);str(dt)

[1] "data.frame"

'data.frame':   9 obs. of  5 variables:
 $ V1: int  100 200 300 400 500 600 700 800 900
 $ V2: Factor w/ 9 levels "a1","a2","a3",..: 1 2 3 4 5 6 7 8 9
 $ V3: Factor w/ 9 levels "b1","b2","b3",..: 1 2 3 4 5 6 7 8 9
 $ V4: logi  TRUE TRUE FALSE FALSE FALSE TRUE ...
 $ V5: Factor w/ 3 levels "x","y","z": 1 1 1 2 2 2 1 3 3

head(dt)

   V1 V2 V3    V4 V5
1 100 a1 b1  TRUE  x
2 200 a2 b2  TRUE  x
3 300 a3 b3 FALSE  x
4 400 a4 b4 FALSE  y
5 500 a5 b5 FALSE  y
6 600 a6 b6  TRUE  y

?read.table; ?factor

dt.pm10 <- read.csv("pm10.csv", sep = ";") # or use read.csv2
class(dt.pm10)
str(dt.pm10)
head(dt.pm10)

[1] "data.frame"

'data.frame':   43848 obs. of  6 variables:
 $ Date: chr  "2008-01-01 00:00:00" "2008-01-01 01:00:00" "2008-01-01 02:00:00" "2008-01-01 03:00:00" ...
 $ sta1: num  NA NA NA NA NA 18.1 NA 13.1 NA 28.2 ...
 $ sta2: num  36.6 30.5 33.3 NA 35 29.5 17 39.8 43.5 66.5 ...
 $ sta3: num  56.9 45.8 25.3 20.4 35.1 23.7 44 47.2 NA 38.4 ...
 $ sta4: num  NA NA NA NA NA NA NA NA NA NA ...
 $ sta5: num  51.6 40.4 78.9 39.4 54.6 24.3 16.8 NA 49.7 20.3 ...

                 Date sta1 sta2 sta3 sta4 sta5
1 2008-01-01 00:00:00   NA 36.6 56.9   NA 51.6
2 2008-01-01 01:00:00   NA 30.5 45.8   NA 40.4
3 2008-01-01 02:00:00   NA 33.3 25.3   NA 78.9
4 2008-01-01 03:00:00   NA   NA 20.4   NA 39.4
5 2008-01-01 04:00:00   NA 35.0 35.1   NA 54.6
6 2008-01-01 05:00:00 18.1 29.5 23.7   NA 24.3

?read.csv; ?read.csv2

# Sys.setenv(TZ='GMT')
dt.pm10 <- read.csv("pm10.csv", sep = ";",
               colClasses = c("POSIXct", "numeric", "numeric",
                              "numeric", "numeric", "numeric"))
str(dt.pm10)
head(dt.pm10)

'data.frame':   43848 obs. of  6 variables:
 $ Date: POSIXct, format: "2008-01-01" "2008-01-01" ...
 $ sta1: num  NA NA NA NA NA 18.1 NA 13.1 NA 28.2 ...
 $ sta2: num  36.6 30.5 33.3 NA 35 29.5 17 39.8 43.5 66.5 ...
 $ sta3: num  56.9 45.8 25.3 20.4 35.1 23.7 44 47.2 NA 38.4 ...
 $ sta4: num  NA NA NA NA NA NA NA NA NA NA ...
 $ sta5: num  51.6 40.4 78.9 39.4 54.6 24.3 16.8 NA 49.7 20.3 ...

        Date sta1 sta2 sta3 sta4 sta5
1 2008-01-01   NA 36.6 56.9   NA 51.6
2 2008-01-01   NA 30.5 45.8   NA 40.4
3 2008-01-01   NA 33.3 25.3   NA 78.9
4 2008-01-01   NA   NA 20.4   NA 39.4
5 2008-01-01   NA 35.0 35.1   NA 54.6
6 2008-01-01 18.1 29.5 23.7   NA 24.3

Sys.setenv(TZ='GMT')
dt.pm10 <- read.csv("pm10.csv", sep = ";",
               colClasses = c("POSIXct", "numeric", "numeric",
                              "numeric", "numeric", "numeric"))
# Sys.setenv(TZ='EET')
head(dt.pm10)

                 Date sta1 sta2 sta3 sta4 sta5
1 2008-01-01 00:00:00   NA 36.6 56.9   NA 51.6
2 2008-01-01 01:00:00   NA 30.5 45.8   NA 40.4
3 2008-01-01 02:00:00   NA 33.3 25.3   NA 78.9
4 2008-01-01 03:00:00   NA   NA 20.4   NA 39.4
5 2008-01-01 04:00:00   NA 35.0 35.1   NA 54.6
6 2008-01-01 05:00:00 18.1 29.5 23.7   NA 24.3

dt.pm10$Date[2135:2140]

[1] "2008-03-29 22:00:00 GMT" "2008-03-29 23:00:00 GMT"
[3] "2008-03-30 00:00:00 GMT" "2008-03-30 01:00:00 GMT"
[5] "2008-03-30 02:00:00 GMT" "2008-03-30 03:00:00 GMT"

Sys.setenv(TZ='EET')
dt.pm10$Date[2135:2140]

[1] "2008-03-30 00:00:00 EET"  "2008-03-30 01:00:00 EET" 
[3] "2008-03-30 02:00:00 EET"  "2008-03-30 04:00:00 EEST"
[5] "2008-03-30 05:00:00 EEST" "2008-03-30 06:00:00 EEST"

dt.pm10 <- read.csv("pm10.csv", sep = ";")
# read date column as character
dt.pm10$Date <- strptime(dt.pm10$Date, "%Y-%m-%d %H:%M:%S")
head(dt.pm10)
dt.pm10$Date[2135:2141]

                 Date sta1 sta2 sta3 sta4 sta5
1 2008-01-01 00:00:00   NA 36.6 56.9   NA 51.6
2 2008-01-01 01:00:00   NA 30.5 45.8   NA 40.4
3 2008-01-01 02:00:00   NA 33.3 25.3   NA 78.9
4 2008-01-01 03:00:00   NA   NA 20.4   NA 39.4
5 2008-01-01 04:00:00   NA 35.0 35.1   NA 54.6
6 2008-01-01 05:00:00 18.1 29.5 23.7   NA 24.3

[1] "2008-03-29 22:00:00 EET"  "2008-03-29 23:00:00 EET" 
[3] "2008-03-30 00:00:00 EET"  "2008-03-30 01:00:00 EET" 
[5] "2008-03-30 02:00:00 EET"  "2008-03-30 03:00:00"     
[7] "2008-03-30 04:00:00 EEST"

# Package names must be given as character strings; several packages are
# passed as one character vector (the second positional argument of
# install.packages() is the library path, not another package).
install.packages("rpart")
install.packages(c("ggplot2", "partykit"))

data(package="rpart")
data(Puromycin, package="datasets")
?airquality
edit(airquality) # edit data if you need

?data; ?save; ?dput; ?saveRDS; ?edit

if (expr_1) expr_2 else expr_3

age <- 12
if (age < 13) {
  print("Watch this with your Mom")
} else {
  print("Enjoy the movie!")
}

[1] "Watch this with your Mom"

age <- 21
print(ifelse(age < 13, "Watch this with your Mom", "Enjoy the movie!"))

[1] "Enjoy the movie!"

?`if`; ?ifelse

for (name in expr_1) expr_2
repeat expr_2
while (condition) expr_2

?`for`; ?`repeat`; ?`while`

for (i in 2:3) {
  plot(dt.pm10[,1], dt.pm10[,i], type = "l", main = paste0("plot", i))
}

i <- 2
repeat {
  plot(dt.pm10[,1], dt.pm10[,i], type = "l", main = paste0("plot", i))
  i <- i + 1
  # leave the loop once column 3 has been plotted, so that this loop draws
  # the same two plots (columns 2 and 3) as the for loop over 2:3
  if (i > 3) break
}

i <- 2
while (i < 4) {
  plot(dt.pm10[,1], dt.pm10[,i], type = "l", main = paste0("plot", i))
  i <- i + 1
}

for (i in 2:4) {
  if (i == 3) next
  plot(dt.pm10[,1], dt.pm10[,i], type = "l", main = paste0("plot", i))
}

> name <- function(arg_1, arg_2, ...) expression
> return(value)

# Volume of a cylindrical cake: base area (pi * radius^2) times height.
make_cake <- function(height, radius) {
  pi * radius^2 * height
}
cake1 <- make_cake(0.3, 0.5)
cat("Volume of cake1 is", cake1, "m^3\n")

Volume of cake1 is 0.236 m^3

cake2 <- make_cake(1, 2)
cat("Volume of cake2 is", cake2, "m^3\n")

Volume of cake2 is 12.6 m^3

?`function`; ?methods

# Volume of a cylindrical cake. Both arguments have defaults, so the
# function can be called with none, one, or both of them.
#   height: cake height (default 0.1)
#   radius: cake radius (default 0.5)
make_cake <- function(height = 0.1, radius = 0.5) {
  base_area <- pi * radius^2
  base_area * height
}

cake1 <- make_cake()
cat("Volume of cake1 is", cake1, "m^3\n")

Volume of cake1 is 0.0785 m^3

cake2 <- make_cake(0.2)
cat("Volume of cake2 is", cake2, "m^3\n")

Volume of cake2 is 0.157 m^3

cake3 <- make_cake(radius = 2)
cat("Volume of cake3 is", cake3, "m^3\n")

Volume of cake3 is 1.26 m^3

# Scope demonstration: the x assigned inside the function is a local
# variable. It is printed by the function but does not change an x that
# exists outside of it.
myfunc <- function() {
  x <- 20 # local to this call of myfunc
  print(x)
}
x <- 10
print(x)

[1] 10

myfunc()

[1] 20

# Area of a rectangle with the given height and width.
area_of_rectangle <- function(height = 1, width = 1) {
  height * width
}

# A square is a rectangle whose width equals its height.
area_of_square <- function(height = 1) {
  area_of_rectangle(height = height, width = height)
}

# A triangle covers half of the rectangle with the same height and width.
area_of_triangle <- function(height = 1, width = 1) {
  rectangle_area <- area_of_rectangle(height = height, width = width)
  rectangle_area / 2
}

# Volume of a box: base area times depth.
vol_of_cube <- function(height = 1, width = 1, depth = 1) {
  base_area <- height * width
  base_area * depth
}

# A rectangle's area is the volume of a box of depth 1.
area_of_rectangle <- function(height = 1, width = 1) {
  vol_of_cube(height = height, width = width, depth = 1)
}

# A square is a rectangle whose width equals its height.
area_of_square <- function(height = 1) {
  area_of_rectangle(height = height, width = height)
}

# A triangle covers half of the rectangle with the same height and width.
area_of_triangle <- function(height = 1, width = 1) {
  rectangle_area <- area_of_rectangle(height = height, width = width)
  rectangle_area / 2
}

# Fibonacci numbers computed iteratively (sequence starts 1, 1, 2, 3, ...).
#   n:    how many terms to compute
#   last: if TRUE return only the n-th term, otherwise all n terms
# Returns a numeric vector of length 1 (last = TRUE) or n (last = FALSE).
fib <- function(n, last = TRUE) {
  x <- numeric(n)
  # seed at most the first two terms, so n = 1 does not grow the vector
  x[seq_len(min(n, 2))] <- 1
  # seq_len() yields an empty range for n < 3; 3:n would count downwards
  # (3, 2, 1) and fail when it reaches x[0]
  for (i in seq_len(max(n - 2, 0)) + 2) x[i] <- x[i - 1] + x[i - 2]
  if (last) x <- x[n]
  x
}

# Fibonacci numbers from a closed-form expression: a constant times a power
# of the golden ratio, rounded to the nearest whole number.
#   n:    index of the last term wanted
#   last: if TRUE return only the n-th term, otherwise terms 1..n
fib2 <- function(n, last = TRUE) {
  root5 <- sqrt(5)
  golden_ratio <- (1 + root5) / 2
  coefficient <- (5 + root5) / 10
  positions <- if (last) n else 1:n
  round(coefficient * golden_ratio ^ (positions - 1))
}

library(microbenchmark)
microbenchmark(fib(30, F), fib2(30, F))

Unit: microseconds
        expr  min   lq  mean median   uq  max neval
  fib(30, F) 1.97 2.03 23.52   2.05 2.17 2143   100
 fib2(30, F) 1.27 1.35  9.34   1.44 1.52  784   100

# Primality test by trial division, written with an explicit loop.
#   x: a number; only the first element is tested
# Returns TRUE if x is prime, FALSE otherwise (including every x below 2).
is.prime <- function(x) {
  x <- x[1] # make sure length of x is 1
  if (x < 2) return(FALSE)
  if (x < 4) return(TRUE) # 2 and 3 are prime
  # A composite number always has a divisor no larger than its square
  # root, so candidates beyond floor(sqrt(x)) never need to be tried.
  for (i in 2:floor(sqrt(x))) {
    if (x %% i == 0) return(FALSE)
  }
  TRUE
}

is.prime(13)
is.prime(21)
is.prime(19999999)

# Primality test by trial division, vectorized over the candidate divisors.
#   x: a number; only the first element is tested
# Returns TRUE if x is prime, FALSE otherwise (including every x below 2).
is.prime2 <- function(x) {
  x <- x[1] # make sure length of x is 1
  if (x < 2) return(FALSE)
  if (x < 4) return(TRUE) # 2 and 3 are prime
  # Only divisors up to floor(sqrt(x)) are needed; this keeps the candidate
  # vector short instead of allocating all of 2:(x - 1).
  divisors <- 2:floor(sqrt(x))
  !any(x %% divisors == 0)
}

# Vectorized primality test: applies is.prime2 to every element of x.
# vapply() guarantees a logical result, also for zero-length input, where
# sapply() would return an empty list.
is.prime3 <- function(x) {
  vapply(x, is.prime2, logical(1))
}

library(microbenchmark)
x <- 31
microbenchmark(is.prime(x), is.prime2(x))

Unit: nanoseconds
         expr  min   lq  mean median   uq     max neval
  is.prime(x) 2009 2050 26787   2091 2132 2467544   100
 is.prime2(x)  615  615 21065    656  697 2032411   100

# Number of primes less than or equal to each element of n.
# Returns an integer vector of the same length as n.
nprime <- function(n) {
  vapply(n, function(x) {
    # seq_len() gives an empty range for x < 1, where 1:x would count down
    candidates <- seq_len(max(x, 0))
    sum(vapply(candidates, is.prime2, logical(1)))
  }, integer(1))
}
nprime(10)

[1] 4

nprime(11:20)

 [1] 5 5 6 6 6 6 7 7 8 8

apply(X, MARGIN, FUN, ..., simplify = TRUE)

(m <- matrix(1:12, nrow = 3))

     [,1] [,2] [,3] [,4]
[1,]    1    4    7   10
[2,]    2    5    8   11
[3,]    3    6    9   12

apply(m, 1, mean) # 1 indicates rows, 2 would indicate columns

[1] 5.5 6.5 7.5

apply(m, 2, function(x) sum(x^3))

[1]   36  405 1584 4059

apply(X, MARGIN, FUN, ..., simplify = TRUE)

(m <- matrix(1:12, nrow = 3))

     [,1] [,2] [,3] [,4]
[1,]    1    4    7   10
[2,]    2    5    8   11
[3,]    3    6    9   12

apply(m, 2, is.prime3) # or let's employ is.prime3 function

      [,1]  [,2]  [,3]  [,4]
[1,] FALSE FALSE  TRUE FALSE
[2,]  TRUE  TRUE FALSE  TRUE
[3,]  TRUE FALSE FALSE FALSE

apply(m, 2, function(x) { # or let's normalize each column
  return ((x - min(x)) / (max(x) - min(x)))
})

     [,1] [,2] [,3] [,4]
[1,]  0.0  0.0  0.0  0.0
[2,]  0.5  0.5  0.5  0.5
[3,]  1.0  1.0  1.0  1.0

tapply(X, INDEX, FUN = NULL, ..., default = NA, simplify = TRUE)

numbers <- c(1, 2, 3, 4, 5, 6)
groups <- factor(c('A', 'B', 'A', 'B', 'A', 'B'))
tapply(numbers, groups, mean)

A B 
3 4 

tapply(numbers, groups, function(x) { # or let's normalize each group
  return ((x - min(x)) / (max(x) - min(x)))
})

$A
[1] 0.0 0.5 1.0

$B
[1] 0.0 0.5 1.0

tapply(X, INDEX, FUN = NULL, ..., default = NA, simplify = TRUE)

# Example data
pollutant <- runif(30)  # Random pollution data
city <- factor(rep(c("Istanbul", "Ankara", "Izmir"), 10))  # Repeating city names
year <- factor(rep(c("2020", "2021", "2022"), each = 10))  # Repeating years

air_data <- data.frame(pollutant, city, year)
head(air_data)

  pollutant     city year
1     0.623 Istanbul 2020
2     0.357   Ankara 2020
3     0.588    Izmir 2020
4     0.914 Istanbul 2020
5     0.199   Ankara 2020
6     0.369    Izmir 2020

tapply(air_data$pollutant, list(air_data$city, air_data$year), mean)

          2020  2021  2022
Ankara   0.441 0.327 0.342
Istanbul 0.759 0.499 0.368
Izmir    0.493 0.465 0.660

sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)

v <- c(1, 4, 9, 16)
sapply(v, sqrt)

[1] 1 2 3 4

# or let's calculate the cube of the square root of each value
sapply(v, function(x) sqrt(x)^3)

[1]  1  8 27 64

sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)

set.seed(123)  # For reproducibility
city_data <- list(
    Istanbul = data.frame(PM25 = rnorm(10), NO2 = rnorm(10)),
    Ankara = data.frame(PM25 = rnorm(10), NO2 = rnorm(10)),
    Izmir = data.frame(PM25 = rnorm(10), NO2 = rnorm(10))
)
# Let's calculate mean of PM25 for each city
sapply(city_data, function(x) mean(x$PM25, na.rm = TRUE))

Istanbul   Ankara    Izmir 
 0.07463 -0.42456 -0.00872 

# or let's calculate mean of each column
# sapply(city_data, colMeans)
sapply(city_data, function(x) colMeans(x))

     Istanbul Ankara    Izmir
PM25   0.0746 -0.425 -0.00872
NO2    0.2086  0.322  0.22169

vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)

v <- c(1, 4, 9, 16)
vapply(v, sqrt, numeric(1))

[1] 1 2 3 4

v <- 1:10
vapply(v, is.prime2, numeric(1)) # what are the results?

 [1] 0 1 1 0 1 0 1 0 0 0

vapply(v, is.prime2, logical(1)) # what are the results?

 [1] FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE

vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)

set.seed(123)  # For reproducibility
measurements <- list(
    morning = rnorm(50, mean = 100, sd = 10),
    afternoon = rnorm(60, mean = 120, sd = 15),
    evening = rnorm(40, mean = 90, sd = 20)
)
vapply(measurements,
       function(x) c(min(x), mean(x), median(x), max(x)),
       numeric(4))

     morning afternoon evening
[1,]    80.3      85.4    48.9
[2,]   100.3     120.8    85.6
[3,]    99.3     119.8    84.8
[4,]   121.7     152.8   132.0

vapply(measurements,
       function(x) c(min = min(x),
                     mean = mean(x),
                     median = median(x),
                     max = max(x)),
       numeric(4))

       morning afternoon evening
min       80.3      85.4    48.9
mean     100.3     120.8    85.6
median    99.3     119.8    84.8
max      121.7     152.8   132.0

mapply(FUN, ..., MoreArgs = NULL, SIMPLIFY = TRUE,
       USE.NAMES = TRUE)

set.seed(123)  # For reproducibility
pm25 <- rnorm(7, mean = 35, sd = 5)  # PM2.5 readings for a week
no2 <- rnorm(7, mean = 50, sd = 10)  # NO2 readings for the same week
so2 <- rnorm(7, mean = 20, sd = 3)   # SO2 readings for the same week
# let's calculate the daily mean across the three pollutants.
# mean() averages its first argument only; called as mean(pm25, no2, so2)
# the other two values would be taken as `trim` and `na.rm`, and the result
# would simply be pm25. The values are therefore combined with c() first.
mapply(function(...) mean(c(...)), pm25, no2, so2)

[1] 29.3 34.1 36.6 37.2 37.1 38.7 35.1

# let's assume a hypothetical formula for an air quality index
# Hypothetical air quality index: square root of a weighted sum of the
# three pollutant readings (weights 0.4, 0.3, 0.2), divided by 3.
pollution_index <- function(pm25, no2, so2) {
  weighted_sum <- pm25 * 0.4 + no2 * 0.3 + so2 * 0.2
  sqrt(weighted_sum) / 3
}
mapply(pollution_index, pm25, no2, so2)

[1] 1.76 1.87 1.97 1.99 1.97 2.04 1.93

set.seed(123)  # For reproducibility
actuals <- list(
  pm25 = rnorm(10), no2 = rnorm(10), so2 = rnorm(10))
predicted <- list(
  pm25 = rnorm(10), no2 = rnorm(10), so2 = rnorm(10))
# let's calculate RMSE for each pollutant
mapply(function(a, p) {
  sqrt(mean((a - p)^2))
}, actuals, predicted)

 pm25   no2   so2 
0.722 1.607 1.616 

sqrt(mean((actuals$pm25 - predicted$pm25)^2))

sapply(1:3, function(i) {
  sqrt(mean((actuals[[i]] - predicted[[i]])^2))
})

for (i in 1:3) {
  print(sqrt(mean((actuals[[i]] - predicted[[i]])^2)))
}

rapply(object, f, classes = "ANY", deflt = NULL,
       how = c("unlist", "replace", "list"), ...)

set.seed(123)  # For reproducibility
# Create a complex example data set
measurements  <- sapply(
  c("Ankara", "Istanbul", "Izmir"),
  function(city) {
    sapply(paste0("day", 1:30), function(day) {
      sapply(c("PM25", "NO2", "SO2"),
             function(pol) rnorm(24, mean = 50, sd = 10), simplify = FALSE)
    }, simplify = FALSE)
  }, simplify = FALSE)

# Min-max normalization: rescales x so that its smallest value maps to 0
# and its largest value maps to 1.
norm <- function(x) {
  lowest <- min(x)
  highest <- max(x)
  (x - lowest) / (highest - lowest)
}
normalized_measurements <- rapply(measurements, norm, how = "replace")

Map(f, ...)

set.seed(123)  # For reproducibility
# Create a complex example data set
PM25 <- list(Ankara = rnorm(10, mean = 35, sd = 5),
             Istanbul = rnorm(10, mean = 30, sd = 4),
             Izmir = rnorm(10, mean = 40, sd = 6))

NO2 <- list(Ankara = rnorm(10, mean = 50, sd = 10),
            Istanbul = rnorm(10, mean = 45, sd = 7),
            Izmir = rnorm(10, mean = 55, sd = 8))

pollution_stats <- Map(
  function(pm25, no2)
    list(mean_pm25 = mean(pm25), sd_pm25 = sd(pm25),
         mean_no2 = mean(no2), sd_no2 = sd(no2)), PM25, NO2)
class(pollution_stats)

[1] "list"

pollution_stats2 <- mapply(
  function(pm25, no2)
    list(mean_pm25 = mean(pm25), sd_pm25 = sd(pm25),
         mean_no2 = mean(no2), sd_no2 = sd(no2)), PM25, NO2)
class(pollution_stats2)

[1] "matrix" "array" 

print(pollution_stats2)

          Ankara Istanbul Izmir
mean_pm25 35.4   30.8     37.5 
sd_pm25   4.77   4.15     5.58 
mean_no2  53.2   44.9     56.8 
sd_no2    5.27   7.58     6.85 

Reduce(f, x, init, right = FALSE, accumulate = FALSE)

weekly_AQI <- list(
    week1 = c(120, 110, 115, 130, 125, 140, 135),
    week2 = c(128, 122, 118, 135, 140, 145, 130),
    week3 = c(130, 125, 120, 140, 135, 150, 145)
)
(cumulative_product <- Reduce(function(x, y) x * y, weekly_AQI))

[1] 1996800 1677500 1628400 2457000 2362500 3045000 2544750

Filter(f, x)

daily_PM25 <- c(35, 40, 25, 20, 50, 45, 55, 30, 25, 40, 60, 20, 30, 35, 40, 45, 25, 50, 55, 60, 30, 25, 20, 35, 40, 45, 30, 50, 55, 60)

(safe_days <- Filter(function(x) x < 40, daily_PM25))

 [1] 35 25 20 30 25 20 30 35 25 30 25 20 35 30

i <- which(daily_PM25 < 40) # indices of the values lower than 40
daily_PM25[i]

 [1] 35 25 20 30 25 20 30 35 25 30 25 20 35 30

daily_PM25[which(daily_PM25 < 40)]

 [1] 35 25 20 30 25 20 30 35 25 30 25 20 35 30

set.seed(123)  # For reproducibility
# 3 cities, 10 days, 2 pollutants
air_quality_data <- list(
    Ankara = data.frame(day = 1:10, PM25 = rnorm(10, mean = 35, sd = 5),
                        NO2 = rnorm(10, mean = 50, sd = 10)),
    Istanbul = data.frame(day = 1:10, PM25 = rnorm(10, mean = 40, sd = 6),
                          NO2 = rnorm(10, mean = 60, sd = 15)),
    Izmir = data.frame(day = 1:10, PM25 = rnorm(10, mean = 30, sd = 4),
                       NO2 = rnorm(10, mean = 45, sd = 7))
)
# PM2.5 mean values of all cities
sapply(air_quality_data, function(df) mean(df$PM25))

  Ankara Istanbul    Izmir 
    35.4     37.5     30.0 

# We want to extract the cities with mean PM2.5 lower than 37
safe_cities <- Filter(
  function(df) mean(df$PM25) < 37,
  air_quality_data)

names(safe_cities)

[1] "Ankara" "Izmir" 

Find(f, x, right = FALSE, nomatch = NULL)

daily_PM25 <- list(
    day1 = 30, day2 = 35, day3 = 40, day4 = 45,
    day5 = 25, day6 = 50, day7 = 55, day8 = 20
)
# Assume the threshold for concern is a PM2.5 level of 50.
(first_high_day <- Find(function(x) x > 50, daily_PM25))

[1] 55

set.seed(123)  # Ensuring reproducibility
air_quality_data <- list(
    Ankara = data.frame(day = 1:7,
                        PM25 = rnorm(7, mean = 35, sd = 5)),
    Istanbul = data.frame(day = 1:7,
                          PM25 = rnorm(7, mean = 40, sd = 6)),
    Izmir = data.frame(day = 1:7,
                       PM25 = rnorm(7, mean = 30, sd = 4))
)
(first_exceeding_city <- Find(
  function(df) any(df$PM25 > 45), air_quality_data))

  day PM25
1   1 32.4
2   2 35.9
3   3 37.3
4   4 47.3
5   5 42.2
6   6 42.4
7   7 40.7

Position(f, x, right = FALSE, nomatch = NA_integer_)

set.seed(123)  # For reproducibility
environmental_data <- list(
    Ankara = data.frame(day = 1:10,
                        PM25 = rnorm(10, mean = 35, sd = 5),
                        Temp = rnorm(10, 20),
                        Humidity = rnorm(10, 60)),
    Istanbul = data.frame(day = 1:10,
                          PM25 = rnorm(10, mean = 40, sd = 6),
                          Temp = rnorm(10, 22),
                          Humidity = rnorm(10, 65)),
    Izmir = data.frame(day = 1:10,
                       PM25 = rnorm(10, mean = 30, sd = 4),
                       Temp = rnorm(10, 25),
                       Humidity = rnorm(10, 70)))
(first_exceeding_city_index <- Position(
  function(df) mean(df$PM25) > 37, environmental_data))

[1] 2

sweep(x, MARGIN, STATS, FUN = "-", check.margin = TRUE, ...)

set.seed(123)  # For reproducibility
pollution_data <- matrix(rnorm(30), nrow = 10, ncol = 3)
colnames(pollution_data) <- c("PM2.5", "NO2", "SO2")
head(pollution_data)

       PM2.5    NO2    SO2
[1,] -0.5605  1.224 -1.068
[2,] -0.2302  0.360 -0.218
[3,]  1.5587  0.401 -1.026
[4,]  0.0705  0.111 -0.729
[5,]  0.1293 -0.556 -0.625
[6,]  1.7151  1.787 -1.687

means <- colMeans(pollution_data)
sds <- apply(pollution_data, 2, sd)

# Subtract the mean
(centered_data <- sweep(pollution_data, 2, means, FUN = "-"))

         PM2.5     NO2    SO2
 [1,] -0.63510  1.0155 -0.643
 [2,] -0.30480  0.1512  0.207
 [3,]  1.48408  0.1921 -0.601
 [4,] -0.00412 -0.0979 -0.304
 [5,]  0.05466 -0.7645 -0.200
 [6,]  1.64044  1.5783 -1.262
 [7,]  0.38629  0.2892  1.262
 [8,] -1.33969 -2.1752  0.578
 [9,] -0.76148  0.4927 -0.714
[10,] -0.52029 -0.6814  1.678

# Divide by the standard deviation
(standardized_data <- sweep(centered_data, 2, sds, FUN = "/"))

         PM2.5     NO2    SO2
 [1,] -0.66588  0.9782 -0.691
 [2,] -0.31957  0.1456  0.222
 [3,]  1.55599  0.1851 -0.646
 [4,] -0.00432 -0.0943 -0.327
 [5,]  0.05731 -0.7364 -0.215
 [6,]  1.71993  1.5204 -1.356
 [7,]  0.40501  0.2786  1.356
 [8,] -1.40460 -2.0955  0.621
 [9,] -0.79838  0.4747 -0.767
[10,] -0.54550 -0.6564  1.803

(standardized_data2 <- apply(
  pollution_data, 2, function(x) (x - mean(x)) / sd(x)))

         PM2.5     NO2    SO2
 [1,] -0.66588  0.9782 -0.691
 [2,] -0.31957  0.1456  0.222
 [3,]  1.55599  0.1851 -0.646
 [4,] -0.00432 -0.0943 -0.327
 [5,]  0.05731 -0.7364 -0.215
 [6,]  1.71993  1.5204 -1.356
 [7,]  0.40501  0.2786  1.356
 [8,] -1.40460 -2.0955  0.621
 [9,] -0.79838  0.4747 -0.767
[10,] -0.54550 -0.6564  1.803

# Are they equal?
all.equal(standardized_data, standardized_data2)

[1] TRUE

# Get centered data from standardized data
centered_data2 <- sweep(standardized_data, 2, sds, FUN = "*")
# Get original data from centered data
original_data <- sweep(centered_data2, 2, means, FUN = "+")
# Are they equal?
all.equal(original_data, pollution_data)

[1] TRUE

Negate(f)

# We have a function that identifies days with poor
# air quality based on certain criteria.
# Air quality is poor only when both readings exceed their thresholds
# (PM2.5 above 35 and NO2 above 50). Written in De Morgan form: "not
# (either reading is at or below its threshold)".
is_poor_air_quality <- function(pm25, no2) {
    !(pm25 <= 35 || no2 <= 50)
}
# Use Negate to create the OPPOSITE function
is_good_air_quality <- Negate(is_poor_air_quality)

# Example data for a week
air_quality_data <- data.frame(
    day = 1:7,
    PM25 = c(30, 40, 36, 38, 50, 33, 45),
    NO2 = c(45, 55, 60, 48, 53, 49, 52)
)
head(air_quality_data)

  day PM25 NO2
1   1   30  45
2   2   40  55
3   3   36  60
4   4   38  48
5   5   50  53
6   6   33  49

# Test the original function
(poor_quality_days <- apply(
  air_quality_data, 1,
  function(x) is_poor_air_quality(x["PM25"], x["NO2"])))

[1] FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE

# Test the negated function
(good_quality_days <- apply(
  air_quality_data, 1,
  function(x) is_good_air_quality(x["PM25"], x["NO2"])))

[1]  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE

print(is_poor_air_quality) # Original function

function(pm25, no2) {
    pm25 > 35 && no2 > 50
}
<bytecode: 0x10ba32278>

print(is_good_air_quality) # Negated function

function (...) 
!f(...)
<bytecode: 0x121d291a0>
<environment: 0x10a899c80>

Introduction to R

Need Help?

# Preliminary

SEE ALSO

Data permanency and removing objects

SEE ALSO

Simple Manipulations

Assignment

SEE ALSO

Simple Manipulations

Vector arithmetic

SEE ALSO

Simple Manipulations

Vector arithmetic

SEE ALSO

Simple Manipulations

Generate regular sequences

SEE ALSO

Simple Manipulations

Repeat an object

SEE ALSO

Simple Manipulations

Logical Vectors

SEE ALSO

Simple Manipulations

Missing Values

SEE ALSO

Simple Manipulations

Character vectors

SEE ALSO

Simple Manipulations

Indexing, selecting and modifying

SEE ALSO

Simple Manipulations

Indexing, selecting and modifying

SEE ALSO

Simple Manipulations

Indexing, selecting and modifying

Simple Manipulations

Indexing, selecting and modifying

SEE ALSO

Simple Manipulations

Other types of objects

SEE ALSO

Objects

Everything is an object in R.

SEE ALSO

Objects

Everything is an object in R.

SEE ALSO

Factors

SEE ALSO

Arrays

SEE ALSO

Arrays

Indexing

SEE ALSO

Arrays

Combining

SEE ALSO

Arrays

Arithmetic

SEE ALSO

Lists and data frames

SEE ALSO

Lists and data frames

Constructing and modifying lists

SEE ALSO

Lists and data frames

Concatenating lists

Lists and data frames

Making data frames

SEE ALSO

Reading data from files

read.table

SEE ALSO

Reading data from files

read.table

SEE ALSO

Reading data from files

Number of Primes below `n`

`apply` function

`apply` function

`tapply` function

`tapply` function

`sapply` function

`sapply` function

`vapply` function

`vapply` function

`mapply` function

`mapply` function

`rapply` function (Recursive version of lapply)