x <- 3
x

[1] 3


x <- 3
# f() takes no arguments; it demonstrates the super-assignment operator `<<-`,
# which assigns to `x` outside the function's own local scope.
f <- function() {
    x <<- 1 # Modifies the existing variable found in an enclosing (parent) environment (or creates a new global variable if none is found)
}
f()
x

[1] 1


"a" %in% "abc" # Note that R strings are not sequences

[1] FALSE


3 %in% c(1, 2, 3) # c(1, 2, 3) is a vector

[1] TRUE


!(3 %in% c(1, 2, 3))

[1] FALSE


v <- c(8, 10, 12)
v

[1]  8 10 12


v <- c(v, 14) # Vectors are always flattened (even when nested)
v

[1]  8 10 12 14


char_vec <- c("apple", "banana", "watermelon")


char_vec

[1] "apple"      "banana"     "watermelon"


# length() function gives the length of an R object (analogous to Python's len())
length(char_vec)

[1] 3


is.character(char_vec)

[1] TRUE


# Note the 'L' suffix to make sure you get an integer rather than double
int_vec <- c(300L, 200L, 4L)


int_vec

[1] 300 200   4


# typeof() function returns the type of an R object (analogous to Python's type())
typeof(int_vec)

[1] "integer"


# is.integer() tests whether R object (vector/array/matrix) contains elements of type 'integer'
is.integer(int_vec)

[1] TRUE


# Note that even without decimal part R treats these numbers as double
dbl_vec <- c(300, 200, 4)


dbl_vec

[1] 300 200   4


typeof(dbl_vec)

[1] "double"


is.double(dbl_vec)

[1] TRUE


# Note that is.numeric() is a generic way of testing whether a vector contains numbers
# (either integer or double)
is.numeric(int_vec)

[1] TRUE


log_vec <- c(FALSE, FALSE, TRUE)
log_vec

[1] FALSE FALSE  TRUE


# While more concise, using T/F instead of TRUE/FALSE can be confusing: T and F are ordinary variables that can be reassigned
log_vec2 <- c(F, F, T)
log_vec2

[1] FALSE FALSE  TRUE


typeof(log_vec)

[1] "logical"


# Note that logical vectors get coerced to 0/1 for FALSE/TRUE when combined with numbers
c(dbl_vec, log_vec)

[1] 300 200   4   0   0   1


c(char_vec, int_vec)

[1] "apple"      "banana"     "watermelon" "300"        "200"       
[6] "4"


# If no natural way of type conversion exists, NAs are introduced
as.numeric(char_vec)

Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”

[1] NA NA NA


na <- c(NA, NA, NA)
na

[1] NA NA NA


length(na)

[1] 3


null <- c(NULL, NULL, NULL)
null

NULL


length(null)

[1] 0


# Presence of NAs can lead to unexpected results
v_na <- c(1, 2, 3, NA, 5)
mean(v_na)

[1] NA


# NAs should be treated specially
mean(v_na, na.rm = TRUE)

[1] 2.75


# Remember NAs are missing values
# Thus result of comparing them is unknown
NA == NA

[1] NA


# is.na() is a special function that checks whether value is missing (NA)
is.na(v_na)

[1] FALSE FALSE FALSE  TRUE FALSE


# We can use such logical vectors for subsetting (more below)
v_na[!is.na(v_na)]

[1] 1 2 3 5


dbl_vec[1]

[1] 300


dbl_vec[c(1,3)]

[1] 300   4

2:4

[1] 2 3 4


# seq() is similar to Python's object[start:stop:step] syntax, except that the end point (`to`) is inclusive
seq(from = 1, to = 4, by = 2)

[1] 1 3

v

[1]  8 10 12 14


v[2:4]

[1] 10 12 14


# Argument names can be omitted for matching by position
v[seq(1,4,2)]

[1]  8 12


# All but the last element
v[-length(v)]

[1]  8 10 12


# Reverse order
v[seq(length(v),1,-1)]

[1] 14 12 10  8


c(0, 1) + c(1, 2, 3, 4)

[1] 1 3 3 5


5 * c(1, 2, 3, 4)

[1]  5 10 15 20


c(1, 2, 3, 4)[c(TRUE, FALSE)]

[1] 1 3


char_vec

[1] "apple"      "banana"     "watermelon"


char_vec == "watermelon"

[1] FALSE FALSE  TRUE


which(char_vec == "watermelon")

[1] 3


dbl_vec[char_vec == "watermelon"]

[1] 4


dbl_vec[which(char_vec == "watermelon")]

[1] 4


# We can combine different data types in a list and, optionally, name elements (e.g. B below)
l <- list(2:4, "a", B = c(TRUE, FALSE, FALSE), list("x", 1L))
l

[[1]]
[1] 2 3 4

[[2]]
[1] "a"

$B
[1]  TRUE FALSE FALSE

[[4]]
[[4]][[1]]
[1] "x"

[[4]][[2]]
[1] 1


str(l)

List of 4
 $  : int [1:3] 2 3 4
 $  : chr "a"
 $ B: logi [1:3] TRUE FALSE FALSE
 $  :List of 2
  ..$ : chr "x"
  ..$ : int 1


l[3]

$B
[1]  TRUE FALSE FALSE


str(l[3])

List of 1
 $ B: logi [1:3] TRUE FALSE FALSE


l[[3]]

[1]  TRUE FALSE FALSE


# Only works with named elements
l$B

[1]  TRUE FALSE FALSE

v

[1]  8 10 12 14


attr(v, "example_attribute") <- "This is a vector"


attr(v, "example_attribute")

[1] "This is a vector"


# To set names for vector elements we can use names() function
names(v) <- c("a", "b", "c", "d")
v

 a  b  c  d 
 8 10 12 14 
attr(,"example_attribute")
[1] "This is a vector"


# Names of vector elements can be used for subsetting
v["b"]

 b 
10


cities <- c("Dublin", "Cork", "Cork", "Limerick", "Galway")
cities

[1] "Dublin"   "Cork"     "Cork"     "Limerick" "Galway"


typeof(cities)

[1] "character"


# We use factor() function to convert character vector into factor
# Each unique element of the character vector becomes a level
cities <- factor(cities)
cities

[1] Dublin   Cork     Cork     Limerick Galway  
Levels: Cork Dublin Galway Limerick


class(cities)

[1] "factor"


# Note that the data type of this vector is integer (and not character)
typeof(cities)

[1] "integer"


# Note that R automatically sorted the categories alphabetically
levels(cities)

[1] "Cork"     "Dublin"   "Galway"   "Limerick"


# You can change the reference category using relevel() function
cities <- relevel(cities, ref = "Dublin")
levels(cities)

[1] "Dublin"   "Cork"     "Galway"   "Limerick"


# Or define an arbitrary ordering of levels using levels argument in factor() function
cities <- factor(cities, levels = c("Limerick", "Galway", "Dublin", "Cork"))
levels(cities)

[1] "Limerick" "Galway"   "Dublin"   "Cork"


# Under the hood factors continue to be integer vectors
as.integer(cities)

[1] 3 4 4 1 2


# Draw two random vectors of length 50 by sampling with replacement.
# NOTE: no seed is set in the visible code, so the counts in the tables below
# will differ between runs (call set.seed() beforehand for reproducible output).
var_1 <- sample(c("a", "b", "c"), size = 50, replace = TRUE)
var_2 <- sample(c(1, 2, 3), size = 50, replace = TRUE)


table(var_1, var_2)

     var_2
var_1 1 2 3
    a 7 5 5
    b 7 5 9
    c 4 6 2


var_2 <- factor(var_2, levels = c(3, 1, 2))


table(var_2)

var_2
 3  1  2 
16 18 16


var_2 <- factor(var_2, levels = c(3, 1, 2), labels = c("Three", "One", "Two"))


table(var_1, var_2)

     var_2
var_1 Three One Two
    a     5   7   5
    b     9   7   5
    c     2   4   6


# : operator can be used to generate vectors of sequential numbers
a <- 1:12
a

 [1]  1  2  3  4  5  6  7  8  9 10 11 12


class(a)

[1] "integer"


dim(a) <- c(3, 2, 2)
a

, , 1

     [,1] [,2]
[1,]    1    4
[2,]    2    5
[3,]    3    6

, , 2

     [,1] [,2]
[1,]    7   10
[2,]    8   11
[3,]    9   12


class(a)

[1] "array"


m <- 1:12


dim(m) <- c(3, 4)
m

     [,1] [,2] [,3] [,4]
[1,] 1    4    7    10  
[2,] 2    5    8    11  
[3,] 3    6    9    12


# Alternatively, we could use matrix() function
m <- matrix(1:12, nrow = 3, ncol = 4)
m

     [,1] [,2] [,3] [,4]
[1,] 1    4    7    10  
[2,] 2    5    8    11  
[3,] 3    6    9    12


# Note that length() function displays the length of underlying vector
length(m)

[1] 12

a

, , 1

     [,1] [,2]
[1,]    1    4
[2,]    2    5
[3,]    3    6

, , 2

     [,1] [,2]
[1,]    7   10
[2,]    8   11
[3,]    9   12


# Most common way: one index per dimension
a[1, 2, 2]

[1] 10


# Specifying drop = FALSE after indices retains the original dimensionality of matrix/array
a[1, 2, 2, drop = FALSE]

, , 1

     [,1]
[1,]   10


# Here elements are subset from underlying vector (with repetition)
a[c(1, 2, 2)]

[1] 1 2 2

m

     [,1] [,2] [,3] [,4]
[1,] 1    4    7    10  
[2,] 2    5    8    11  
[3,] 3    6    9    12


# As with arrays, drop = FALSE prevents this object from being collapsed into a 1-dimensional vector
m[, 1, drop = FALSE]

     [,1]
[1,] 1   
[2,] 2   
[3,] 3


# Subset all rows, first two columns
# seq_len(nrow(m)) is used rather than 1:nrow(m): for a zero-row matrix
# 1:0 would produce c(1, 0) and the subset would fail
m[seq_len(nrow(m)), 1:2]

     [,1] [,2]
[1,] 1    4   
[2,] 2    5   
[3,] 3    6


# Note that vector recycling also applies here
m[c(TRUE, FALSE), -3]

     [,1] [,2] [,3]
[1,] 1    4    10  
[2,] 3    6    12


?length


help(dim)

Structure	Description	Dimensionality	Data Type
`vector`	Atomic vector (scalar)	1d	homogeneous
`matrix`	Matrix	2d	homogeneous
`array`	One-, two- or n-dimensional array	1d/2d/nd	homogeneous
`list`	List	1d	heterogeneous
`data.frame`	Rectangular data	2d	heterogeneous

Value	Example	Description
Positive integers	`v[c(3, 1)]`	Returns elements at specified positions
Negative integers	`v[-c(3, 1)]`	Omits elements at specified positions
Logical vectors	`v[c(FALSE, TRUE)]`	Returns elements where corresponding logical value is `TRUE`
Character vector	`v[c("c", "a")]`	Returns elements with matching names (only for named vectors)
Nothing	`v[]`	Returns the original vector
0 (Zero)	`v[0]`	Returns a zero-length vector


`break`	`NA`
`else`	`NaN`
`FALSE`	`next`
`for`	`NULL`
`function`	`repeat`
`if`	`TRUE`
`Inf`	`while`

Week 2: R Fundamentals¶

Introduction to Computer Programming for Data Analysis I¶

Tom Paskhalis¶

4 May 2022¶

Overview¶

R objects¶

Assignment operations¶

Membership operations¶

Data structures¶

Summary of data structures in R¶

Vectors in R¶

Atomic vectors¶

Data types¶

Character vector¶

Integer vector¶

Double vector¶

Integer vs double¶

Logical vector¶

Type coercion in vectors¶

Implicit type coercion¶

NA and NULL values¶

NA and NULL example¶

Working with NAs¶

Vector indexing and subsetting¶

Summary of vector subsetting¶

Generating sequences for subsetting¶

Vector subsetting examples¶

Vector recycling¶

which() function¶

Lists¶

R object structure¶

List subsetting¶

List subsetting examples¶

Attributes¶

Attributes examples¶

Factors¶

Factors example¶

Factors example continued¶

Tabulation¶

Factors in crosstabs¶

Arrays and matrices¶

Array example¶

Matrix example¶

Array and matrix subsetting¶

Array subsetting example¶

Matrix subsetting example¶

Naming conventions¶

Code layout¶

Reserved words¶

R packages¶

Help!¶

Next¶

`which()` function¶