---
output:
html_document: default
pdf_document: default
---
```{r mirror,include=FALSE,purl=FALSE}
r = getOption("repos")
r["CRAN"] = "http://cran.us.r-project.org"
options(repos = r)
```
This is an R Markdown document for the course 'Computing in R'. Using this document, you can easily run selected pieces of R code shown during the lectures from within RStudio.
Basics of R
-------------------------
```{r "R as a pocket calculator"}
2+7
sqrt(2)
cos(pi)
log10(10^3)
```
```{r "A simple R session"}
x <- 2
print(x)
x
x^2
x
```
```{r "Interacting with the R Console"}
#y < - x
```
```{r "Help (I)"}
help(sqrt)
```
```{r "Packages"}
help.start()
```
```{r "Help (II)"}
help(mean)
# ? is a shorthand for help()
?mean
```
Syntax: data
-------------------------
### Data structures
```{r "Vectors (I)"}
x <- c(10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
x
x <- seq(from = 10, to = 1, by = -1)
x
x <- seq(10, 1)
x
x <- 10:1
x
```
```{r "Vectors (II)"}
x[5]
x[10]
x[5] + x[10]
indx <- c(5,10)
indx
x[indx]
x[c(5,10)]
c(-5, -10)
x[c(-5, -10)]
x[4] <- 12
x
```
```{r "Vectors (III)"}
mean(x)
x + 1
2*x
```
```{r "Matrices (I)"}
help(matrix)
```
```{r "Matrices (II)"}
A <- matrix(data = 1:10, nrow = 2, ncol = 5)
A <- matrix(1:10, 2, 5)
A
A[2, 3]
A[2,3] <- 12
```
```{r "Matrices (III)"}
A[1, ]
A[1,1:5]
A[, c(1, 5)]
A[1:2, c(1,5)]
dim(A[, c(1, 5)])
#x[1,3]
x[c(1,3)]
```
```{r "Objects (I)"}
ls()
summary(x)
summary(A)
```
```{r "Object (variable) names"}
a2 <- 1
#2a <- 1
c <- 1
c
c(1,2)
#else <- 1
```
```{r "Modes"}
c(1, 2, 3, 4)
-2 < -2
letters[1:3]
mode(x)
as.character(x)
```
```{r "Modes: logical (I)"}
x
x[c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE,FALSE, TRUE, FALSE)]
x>5
x[x>5]
length(x>5)
length(x)
x[x==5] # == : test for equality
x[x=5] # = : assignment operator
sum(x>5)
```
```{r "Modes: logical (II)"}
TRUE & FALSE
TRUE | FALSE
x
x>5
x<8
x>5 & x<8
sum(x>5 & x<8)
x[x>5 & x<8]
```
```{r "Naming (I)"}
m <- c(1,2,3,4)
names(m) <- c("gene 1","gene 2","gene 3","gene 4")
m
rownames(A)
rownames(A) <- c("gene 1", "gene 2")
colnames(A) <- c("array 1", "array 2", "array 3","array 4", "array 5")
```
```{r "Naming (II)"}
A
A["gene 1", ]
```
```{r "Lists (I)"}
c("gene 1", 5)
list(gene = "gene 1", number = 5)
```
```{r "Lists (II)"}
x <- list(gene = "gene 1", number = 5)
x[1]
x[[1]]
x$gene
```
```{r "Constructing a data frame"}
pclass <- c("1st","2nd","1st")
survived <- c(1,1,0)
name <- c("Elisabeth Walton","Hudson Trevor","Helen Loraine")
age <- c(29.0,0.9167,2.0)
titanic <- data.frame(pclass,survived,name,age)
titanic
```
```{r "Data frames (II)"}
titanic[c(2,3),]
titanic[,c("name","age")]
titanic[c(2,3),c("name","age")]
titanic$age
titanic[["age"]]
titanic$status <- c("yes","yes","no")
titanic
```
In the exercises, we use the Titanic data set. See [here](http://lib.stat.cmu.edu/S/Harrell/data/descriptions/titanic3info.txt) for a description of the data.
```{r "Data frames (III)"}
library(foreign)
titanic3 <- read.dta("Exercises/titanic3.dta", convert.underscore=TRUE) # convert.underscore: Convert "_" in Stata variable names to "." in R names?
dim(titanic3)
head(titanic3[,1:4])
```
```{r "Data frames (IV)"}
tail(titanic3[,1:4])
str(titanic3[,1:4])
```
### Data import
In R, we can also import directly from a web site:
```{r "Basic data import/export from other formats (I)"}
load(url("https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.sav"))
ls()
```
An example using the readxl package for MS Excel files. The data set can be obtained from [here](http://biostat.mc.vanderbilt.edu/twiki/bin/view/Main/DataSets). If the data set has been stored in a subfolder Exercises, it can be imported via
```{r}
install.packages("readxl")
library(readxl)
titanic3 <- read_excel("Exercises/titanic3.xls")
## Explain warning message and repair by converting to number
## tibble
dim(titanic3)
head(titanic3[,1:4])
tail(titanic3[,1:4])
## Last row: (not) empty?
```
We run the code to import the STATA file, which is what we will do in the exercises as well.
```{r "Basic data import/export from other formats (II)"}
library(foreign)
titanic3 <- read.dta("Exercises/titanic3.dta", convert.underscore=TRUE)
```
Functions; special data formats; selections
-------------------------
### Functions
Using the logarithm as an example, we show the different ways to obtain the 10-log of 100 (which should give 2 as outcome).
```{r "Functions: basic format"}
## Basic structure
log(100) # does not give 2
help(log)
log(100, 10)
log(10, 100) # does not give 2
log(base=10, x=100)
log(b=10,x=100)
## Some functions have an ... argument
c
c(3,6,8)
AMC <- c("Academic","Medical","Center")
help(paste)
paste(AMC, collapse=" ")
## Some functions have an alias that is closer to common use
## E.g. + is an alias for the "+" function
"+"(1,7)
## To cite R in publications:
citation()
```
We can see the contents of a function by leaving out the parentheses. And we can write our own functions.
```{r "Functions: the inside"}
help
```
```{r "Functions and packages"}
good.morning <- function(work){
if(work==TRUE) cat("wake up") else
cat("you can stay in bed")
}
good.morning
#good.morning()
good.morning(work=FALSE)
```
### Selections
```{r "Selection of rows and columns"}
install.packages("ISwR")
library(ISwR)
help(juul)
summary(juul)
juul$age[1:10]
juul[1:10,"age"]
```
```{r "Selection of rows via functions"}
## head and tail functions
tail(juul[, c("menarche","sex")])
## in baby steps
tmp <- juul[, c("menarche","sex")] # all rows, two columns
tail(tmp) # last six rows
## or
tail(juul)[, c("menarche","sex")]
## in baby steps
tmp <- tail(juul) # last six rows, all columns
tmp[, c("menarche","sex")] # two columns
## some selections using subset function
subset(juul, tanner==2 & age<10)
subset(juul, tanner>=4 & age>45)
subset(juul, tanner %in% c(1,5) & (age==0.25|age>50))
```
```{r "Selection of columns via functions"}
with(juul, table(sex, tanner))
xtabs(~sex+tanner, data=juul)
xtabs(~sex+tanner, data=juul, subset=(menarche==1))
## also select columns using subset function
subset(juul, tanner==2 & age<10, select=c(menarche,sex))
```
### Some special data types
#### Missing data
When checking for missingness, we need to use the is.na function.
```{r "Missing data"}
titanic3$age[1:20]
table(titanic3$age==NA)
3==NA
is.na(3)
table(is.na(titanic3$age))
## select those with missing igf1 value
subset(juul, tanner %in% c(1,5) & is.na(igf1) & (age<5|age>18))
#quantile(juul$age, prob=c(0.025,0.25,0.5,0.75,0.975)) # gives an error
quantile(juul$age, prob=c(0.025,0.25,0.5,0.75,0.975), na.rm=TRUE)
with(juul, table(sex, menarche))
with(juul, table(sex, menarche, useNA="always"))
```
#### Factors
```{r "Factors: what are they"}
DiseaseState <- factor(c("Cancer", "Cancer", "Normal"))
DiseaseState
str(DiseaseState)
levels(DiseaseState)
typeof(DiseaseState) # the internal coding is via integers
```
```{r "Factors: how to create"}
is.factor(DiseaseState)
help(factor)
## change labels
factor(c("Cancer", "Cancer", "Normal"), labels=c("Dis","Norm"))
## change ordering of levels
DiseaseState2 <- factor(c("Cancer", "Cancer", "Normal"),levels=c("Normal","Cancer"))
table(DiseaseState)
table(DiseaseState2)
## can also use relevel function
relevel(DiseaseState, ref="Normal")
## see what changes internally
as.numeric(DiseaseState)
as.numeric(relevel(DiseaseState, ref="Normal"))
```
#### Dates
```{r "Dates"}
as.Date("15 April 1912", "%d %b %Y")
julian(as.Date("15 April 1912", "%d %b %Y"))
titanic3$dob <- as.Date("15 April 1912", "%d %b %Y")+ (-titanic3$age*365.25)
as.Date("15041912", "%d%m%Y")
as.Date("150412", "%d%m%y")
format(as.Date("1912April15", "%Y%b%d"),"%A %B %d, %Y")
install.packages("lubridate")
library(lubridate)
wday(as.Date("1912April15", "%Y%b%d"),label=TRUE)
```
#### Swirl
```{r, warning=FALSE,message=FALSE}
#install.packages("swirl")
#library(swirl)
#install_from_swirl("R Programming")
#swirl()
```
Graphics
-------------------------
### Basic graphics
```{r "Basic plot I"}
x <- (0:100)/10
plot(x, x^3 - 13 * x^2 + 39 * x)
```
```{r "Basic plot II"}
plot(x, x^3 - 13 * x^2 + 39 * x,cex.axis=1.5,cex.lab=1.5)
```
```{r "Enhancing a plot I"}
plot(x,x^3-13*x^2+39*x,type="l",xlab="time (hours)",ylab="temperature",main="Enhanced plot",cex.axis=1.5,cex.lab=1.5)
```
```{r "Enhancing a plot II"}
colors()
plot(x,x^3-13*x^2+39*x,pch=18,xlab="time (hours)",ylab="temperature",col="red",main="Enhanced plot",cex.axis=1.5,cex.lab=1.5)
palette()
plot(x,x^3-13*x^2+39*x,pch=18,xlab="time (hours)",ylab="temperature",col=3,main="Enhanced plot",cex.axis=1.5,cex.lab=1.5)
```
```{r "Plot symbols"}
plot(1:25, pch=1:25,cex=2,bg="grey")
```
```{r "Enhancing a plot III"}
x<-(0:100)/10
plot(x,x^3-13*x^2+39*x,type="l",xlab="time (hours)",ylab="temperature",cex.axis=1.5,cex.lab=1.5)
points(2,34,col="red",pch=16,cex=2)
arrows(4,50,2.2,34.5)
text(4.15,50,"local maximum",adj=0,col="blue",cex=1.5)
lines(x,30-50*sin(x/2),col="blue")
legend(x=0,y=80,legend=c("Sahara","Gobi"),lty=1,col=c("black","blue"),cex=1.5)
```
```{r "Graphical parameters I"}
?par
par("bg")
par(bg="green")
par("bg")
# rerun the plot commands
x<-(0:100)/10
plot(x,x^3-13*x^2+39*x,type="l",xlab="time (hours)",ylab="temperature",cex.axis=1.5,cex.lab=1.5)
points(2,34,col="red",pch=16,cex=2)
arrows(4,50,2.2,34.5)
text(4.15,50,"local maximum",adj=0,col="blue",cex=1.5)
lines(x,30-50*sin(x/2),col="blue")
legend(x=0,y=80,legend=c("Sahara","Gobi"),lty=1,col=c("black","blue"),cex=1.5)
plot(1:10)
par(bg="white")
plot(1:10)
# save default graphical parameters to be able to restore them later on
par.defaults <- par(no.readonly=TRUE)
par(bg="green")
par("bg")
plot(1:10)
par(par.defaults)
par("bg")
plot(1:10)
```
```{r "Graphical parameters II"}
x<-(0:100)/10
plot(x,x^3-13*x^2+39*x,type="l",xlab= "time (hours)",ylab="temperature",lwd=3,las=1,cex.axis=1.5,cex.lab=1.5)
```
```{r "Histograms"}
hist(titanic3$age,breaks=15,freq=FALSE,cex.axis=1.5,cex.lab=1.5)
```
```{r "Boxplot"}
boxplot(titanic3$fare,ylim=c(0,300),ylab="fare",cex.axis=1.5,cex.lab=1.5)
boxplot(fare ~ pclass,data=titanic3,ylim=c(0,300),ylab="fare",cex.axis=1.5,cex.lab=1.5)
plot(fare ~ pclass,data=titanic3,ylim=c(0,300),ylab="fare",cex.axis=1.5,cex.lab=1.5)
```
### Advanced graphics
```{r "Advanced R graphics"}
# graphics
plot(fare~age,data=titanic3) # very skewed distribution
plot(log10(fare)~age,data=titanic3) # plot log10(y) vs x
plot(fare~age,data=titanic3,log="y") # original data, but log scale
min(titanic3$fare,na.rm=TRUE) # zero values!
sum(titanic3$fare==0,na.rm=TRUE) # zero values!
install.packages("ggplot2")
library(ggplot2)
qplot(age,fare,data=titanic3)
qplot(age,fare,data=titanic3) + scale_y_log10() # log scale for y values
qplot(age,fare,data=titanic3,facets=~pclass) # one plot per pclass
qplot(age,fare,data=titanic3,facets=~pclass,ylim=c(0,280)) # better for comparison
qplot(age,fare,data=titanic3,facets=sex~pclass) + scale_y_log10() # even better
qplot(age,fare,data=titanic3,facets=pclass~sex) + scale_y_log10() # better for comparing males and females per class
#install.packages("devtools")
#library(devtools)
#install_github("rCharts", "ramnathv", ref = "dev")
#library(rCharts)
#titanic3$survived = factor(titanic3$survived, labels=c("no","yes"))
#rPlot(fare ~ age| sex + pclass, data = titanic3, type = "point", color = "survived", size = list(const = 2))
#dt <- dTable(titanic3,sPaginationType= "full_numbers")
#dt
```
```{r "Export: two types of format"}
## Open the PDF device
pdf("titanic3_ggplot_v1.pdf")
qplot(age,fare,data=titanic3,facets=pclass~sex) + scale_y_log10()
## Close the device again to save the file
dev.off()
qplot(age,fare,data=titanic3,facets=pclass~sex) + scale_y_log10()
dev.print("titanic3_ggplot_v2.pdf",device=pdf)
# Change size, for pdf device width and height are specified in inches
pdf("titanic3_ggplot_v1_big.pdf", width=21, height=21)
qplot(age,fare,data=titanic3,facets=pclass~sex) + scale_y_log10()
## Close the device again to save the file
dev.off()
```
Structure of R
-------------------------
```{r "Structure of R"}
search()
ls() # objects in Workspace
ls("package:stats")[1:20] # some objects in stats package
ls("package:base")[1:30] # some objects in base package
## hierarchical structure and duplicate names
data(quakes) # from package:datasets
## new quakes object in Workspace hides the other
quakes <- 1:10
quakes
## removing it makes the other visible again
rm(quakes)
quakes$mag # original quakes not overwritten!
find("quakes")
```
```{r "Workspace management"}
save(titanic3, file="Titanic.RData")
rm(titanic3)
load("Titanic.RData")
```
```{r "Project management"}
getwd()
#setwd("c:/AMC/Teaching/RCursus") # just an example, note that a forward slash is used
```
Data management
-------------------------
```{r "Data manipulation"}
## sort
install.packages("dplyr")
library(dplyr)
titanic.SortedAge <- arrange(titanic3, age)
head(titanic.SortedAge)
## base R
x <- c(5,8,1)
sort(x)
order(x)
x[order(x)]
z <- data.frame(x=c(5,8,1), y=c("A","M","C"))
z
z[sort(z$x),]
z[order(z$x),]
## merge
emb <- data.frame(embarked=c("Cherbourg","Queenstown","Southampton"), country=c("France","Ireland","United Kingdom"))
emb
titanic3 <- left_join(titanic3, emb, by="embarked") # all rows from titanic 3, all columns from both
head(titanic3)
## base R
titanic3 <- merge(titanic3, emb)
## wide <-> long
install.packages("tidyr")
library(tidyr)
df1 <- data.frame(c("a", "a", "a", "a", "b", "b", "b", "b"), c(1,2,3,4,1,2,3,4), c(0.1, 0.2, 0.3, 0.4, 2.1, 2.2, 2.3, 2.4))
colnames(df1) <- c("Name", "Time", "Value")
df1
df1.wide <- spread(df1, Time, Value)
df1.wide
df1.long <- gather(df1.wide, "Time", "Value", 2:5)
df1.long
arrange(df1.long, Name, Time)
## base R
df1.wide <- reshape(df1, timevar="Time", idvar="Name", direction="wide", v.names="Value")
df1.wide
df1.long <- reshape(df1.wide, varying=2:5, direction="long")
df1.long
df1.long[order(df1.long$Name,df1.long$time),]
```
```{r "Transformed variables"}
## transformation using within
titanic3 <- within(titanic3, {
agecat <- cut(age, breaks=c(0,18,30,40,80))
family <- factor(ifelse(parch==0 & sibsp==0, "no", "yes"))
})
summary(titanic3[,c("age","agecat","family")])
## transformation using transform
titanic3 <- transform(titanic3,
agecat = cut(age, breaks=c(0,18,30,40,80)),
family = factor(ifelse(parch==0 & sibsp==0, "no", "yes"))
)
summary(titanic3[,c("age","agecat","family")])
```
```{r "Making basic summaries"}
# crosstable using descr package
install.packages("descr")
library(descr)
descr(titanic3$age)
with(titanic3, CrossTable(pclass,sex,format="SPSS"))
## summary by subgroup, all produce basically the same output
## basic R functions
aggregate(age~pclass, data=titanic3, FUN=summary)
by(titanic3,titanic3$pclass,FUN=function(x) summary(x$age))
with(titanic3, tapply(age,pclass,FUN=summary))
## using doBy package
install.packages("doBy")
library(doBy)
summaryBy(age~pclass, data=titanic3, FUN=summary)
```
Documentation and help
-------------------------
```{r "Searching"}
help(summary)
fit.age <- lm(age ~ pclass, data=titanic3)
summary(fit.age)
class(fit.age)
help(summary.lm)
help(descriptive)
#help.search("descriptive statistics") # same as ??descriptive
#RSiteSearch("{descriptive statistics}") # braces to indicate that words are to be searched as one entity
#install.packages("sos")
#library(sos)
#findFn("{descriptive statistics}")
```
Model fitting; formulas
-------------------------
```{r "Regression: Formulas"}
help.search("Student")
help(t.test)
with(titanic3,t.test(age[sex=="male"],age[sex=="female"]))
t.test(subset(titanic3,sex=="male")$age,subset(titanic3,sex=="female")$age)
test.age <- t.test(age ~ sex, data = titanic3)
test.age
```
```{r "Model output"}
fit.age <- lm(age ~ pclass, data=titanic3)
fit.age
summary(fit.age)
## output of model is a list
str(fit.age)
names(fit.age)
summary(fit.age$residuals)
## or
summary(resid(fit.age))
hist(resid(fit.age))
coef(fit.age)
predict(fit.age, newdata=data.frame(pclass=factor(levels(titanic3$pclass))))
update(fit.age,. ~ .+sex)
## the output has a specific class, in this case lm
class(fit.age)
## which functions have been written for this class?
help(methods)
methods(class="lm")
## we want to know what the summary function does with an object of class lm
help(summary) # does not give the desired help file
## we need to consult the help file for summary.lm
methods(summary)
help(summary.lm)
plot(fit.age)
```
```{r "Formula structure"}
plot(age ~ fare, data=titanic3)
## the type of plot is automaticaaly adapted to the type of variable
plot(age~pclass, data=titanic3)
```
Programming and ply functions
-------------------------
```{r "Statements: if-then-else"}
x <- 10
z <- if (x < 2) 4 else 3
z
```
```{r "Statements: repetition"}
A <- matrix(1:10, 2, 5)
rownames(A) <- c("gene 1", "gene 2")
colnames(A) <- c("array 1", "array 2", "array 3","array 4", "array 5")
results <- numeric(2)
results
for (i in 1:2) {
results[i] <- mean(A[i, ])
}
results
```
```{r "apply"}
apply(A, 1, mean)
```
```{r "tapply: Example"}
titanic3 <- read.dta("Exercises/titanic3.dta", convert.underscore=TRUE)
head(titanic3[,c("fare","pclass")])
with(titanic3,tapply(fare, pclass, mean,na.rm=T))
# Solution using the dplyr package
#install.packages("dplyr")
library(dplyr)
summarise(group_by(titanic3, pclass), mean(fare,na.rm=TRUE))
```