## ----"R as a pocket calculator"----------------------------------------------------------------------------------------------------------------- 2+7 sqrt(2) cos(pi) log10(10^3) ## ----"A simple R session"----------------------------------------------------------------------------------------------------------------------- x <- 2 print(x) x x^2 x ## ----"Interacting with the R Console"----------------------------------------------------------------------------------------------------------- #y < - x ## ----"Help (I)"--------------------------------------------------------------------------------------------------------------------------------- help(sqrt) ## ----"Packages"--------------------------------------------------------------------------------------------------------------------------------- help.start() ## ----"Help (II)"-------------------------------------------------------------------------------------------------------------------------------- help(mean) # ? is a shorthand for help() ?mean ## ----"Vectors (I)"------------------------------------------------------------------------------------------------------------------------------ x <- c(10, 9, 8, 7, 6, 5, 4, 3, 2, 1) x x <- seq(from = 10, to = 1, by = -1) x x <- seq(10, 1) x x <- 10:1 x ## ----"Vectors (II)"----------------------------------------------------------------------------------------------------------------------------- x[5] x[10] x[5] + x[10] indx <- c(5,10) indx x[indx] x[c(5,10)] c(-5, -10) x[c(-5, -10)] x[4] <- 12 x ## ----"Vectors (III)"---------------------------------------------------------------------------------------------------------------------------- mean(x) x + 1 2*x ## ----"Matrices (I)"----------------------------------------------------------------------------------------------------------------------------- help(matrix) ## ----"Matrices (II)"---------------------------------------------------------------------------------------------------------------------------- A <- matrix(data = 1:10, nrow = 2, ncol = 5) A <- matrix(1:10, 2, 5) A A[2, 3] A[2,3] <- 12 ## ----"Matrices (III)"--------------------------------------------------------------------------------------------------------------------------- A[1, ] A[1,1:5] A[, c(1, 5)] A[1:2, c(1,5)] dim(A[, c(1, 5)]) #x[1,3] x[c(1,3)] ## ----"Objects (I)"------------------------------------------------------------------------------------------------------------------------------ ls() summary(x) summary(A) ## ----"Object (variable) names"------------------------------------------------------------------------------------------------------------------ a2 <- 1 #2a <- 1 c <- 1 c c(1,2) #else <- 1 ## ----"Modes"------------------------------------------------------------------------------------------------------------------------------------ c(1, 2, 3, 4) -2 < -2 letters[1:3] mode(x) as.character(x) ## ----"Modes: logical (I)"----------------------------------------------------------------------------------------------------------------------- x x[c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE,FALSE, TRUE, FALSE)] x>5 x[x>5] length(x>5) length(x) x[x==5] # == : test for equality x[x=5] # = : assignment operator sum(x>5) ## ----"Modes: logical (II)"---------------------------------------------------------------------------------------------------------------------- TRUE & FALSE TRUE | FALSE x x>5 x<8 x>5 & x<8 sum(x>5 & x<8) x[x>5 & x<8] ## ----"Naming (I)"------------------------------------------------------------------------------------------------------------------------------- m <- c(1,2,3,4) names(m) <- c("gene 1","gene 2","gene 3","gene 4") m rownames(A) rownames(A) <- c("gene 1", "gene 2") colnames(A) <- c("array 1", "array 2", "array 3","array 4", "array 5") ## ----"Naming (II)"------------------------------------------------------------------------------------------------------------------------------ A A["gene 1", ] ## ----"Lists (I)"-------------------------------------------------------------------------------------------------------------------------------- c("gene 1", 5) list(gene = "gene 1", number = 5) ## ----"Lists (II)"------------------------------------------------------------------------------------------------------------------------------- x <- list(gene = "gene 1", number = 5) x[1] x[[1]] x$gene ## ----"Constructing a data frame"---------------------------------------------------------------------------------------------------------------- pclass <- c("1st","2nd","1st") survived <- c(1,1,0) name <- c("Elisabeth Walton","Hudson Trevor","Helen Loraine") age <- c(29.0,0.9167,2.0) titanic <- data.frame(pclass,survived,name,age) titanic ## ----"Data frames (II)"------------------------------------------------------------------------------------------------------------------------- titanic[c(2,3),] titanic[,c("name","age")] titanic[c(2,3),c("name","age")] titanic$age titanic[["age"]] titanic$status <- c("yes","yes","no") titanic ## ----"Data frames (III)"------------------------------------------------------------------------------------------------------------------------ library(foreign) titanic3 <- read.dta("Exercises/titanic3.dta", convert.underscore=TRUE) # convert.underscore: Convert "_" in Stata variable names to "." in R names? dim(titanic3) head(titanic3[,1:4]) ## ----"Data frames (IV)"------------------------------------------------------------------------------------------------------------------------- tail(titanic3[,1:4]) str(titanic3[,1:4]) ## ----"Basic data import/export from other formats (I)"------------------------------------------------------------------------------------------ load(url("https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.sav")) ls() ## ----------------------------------------------------------------------------------------------------------------------------------------------- install.packages("readxl") library(readxl) titanic3 <- read_excel("Exercises/titanic3.xls") ## Explain warning message and repair by converting to number ## tibble dim(titanic3) head(titanic3[,1:4]) tail(titanic3[,1:4]) ## Last row: (not) empty? ## ----"Basic data import/export from other formats (II)"----------------------------------------------------------------------------------------- library(foreign) titanic3 <- read.dta("Exercises/titanic3.dta", convert.underscore=TRUE) ## ----"Functions: basic format"------------------------------------------------------------------------------------------------------------------ ## Basic structure log(100) # does not give 2 help(log) log(100, 10) log(10, 100) # does not give 2 log(base=10, x=100) log(b=10,x=100) ## Some functions have an ... argument c c(3,6,8) AMC <- c("Academic","Medical","Center") help(paste) paste(AMC, collapse=" ") ## Some functions have an alias that is closer to common use ## E.g. + is an alias for the "+" function "+"(1,7) ## To cite R in publications: citation() ## ----"Functions: the inside"-------------------------------------------------------------------------------------------------------------------- help ## ----"Functions and packages"------------------------------------------------------------------------------------------------------------------- good.morning <- function(work){ if(work==TRUE) cat("wake up") else cat("you can stay in bed") } good.morning #good.morning() good.morning(work=FALSE) ## ----"Selection of rows and columns"------------------------------------------------------------------------------------------------------------ install.packages("ISwR") library(ISwR) help(juul) summary(juul) juul$age[1:10] juul[1:10,"age"] ## ----"Selection of rows via functions"---------------------------------------------------------------------------------------------------------- ## head and tail functions tail(juul[, c("menarche","sex")]) ## in baby steps tmp <- juul[, c("menarche","sex")] # all rows, two columns tail(tmp) # last six rows ## or tail(juul)[, c("menarche","sex")] ## in baby steps tmp <- tail(juul) # last six rows, all columns tmp[, c("menarche","sex")] # two columns ## some selections using subset function subset(juul, tanner==2 & age<10) subset(juul, tanner>=4 & age>45) subset(juul, tanner %in% c(1,5) & (age==0.25|age>50)) ## ----"Selection of columns via functions"------------------------------------------------------------------------------------------------------- with(juul, table(sex, tanner)) xtabs(~sex+tanner, data=juul) xtabs(~sex+tanner, data=juul, subset=(menarche==1)) ## also select columns using subset function subset(juul, tanner==2 & age<10, select=c(menarche,sex)) ## ----"Missing data"----------------------------------------------------------------------------------------------------------------------------- titanic3$age[1:20] table(titanic3$age==NA) 3==NA is.na(3) table(is.na(titanic3$age)) ## select those with missing igf1 value subset(juul, tanner %in% c(1,5) & is.na(igf1) & (age<5|age>18)) #quantile(juul$age, prob=c(0.025,0.25,0.5,0.75,0.975)) # gives an error quantile(juul$age, prob=c(0.025,0.25,0.5,0.75,0.975), na.rm=TRUE) with(juul, table(sex, menarche)) with(juul, table(sex, menarche, useNA="always")) ## ----"Factors: what are they"------------------------------------------------------------------------------------------------------------------- DiseaseState <- factor(c("Cancer", "Cancer", "Normal")) DiseaseState str(DiseaseState) levels(DiseaseState) typeof(DiseaseState) # the internal coding is via integers ## ----"Factors: how to create"------------------------------------------------------------------------------------------------------------------- is.factor(DiseaseState) help(factor) ## change labels factor(c("Cancer", "Cancer", "Normal"), labels=c("Dis","Norm")) ## change ordering of levels DiseaseState2 <- factor(c("Cancer", "Cancer", "Normal"),levels=c("Normal","Cancer")) table(DiseaseState) table(DiseaseState2) ## can also use relevel function relevel(DiseaseState, ref="Normal") ## see what changes internally as.numeric(DiseaseState) as.numeric(relevel(DiseaseState, ref="Normal")) ## ----"Dates"------------------------------------------------------------------------------------------------------------------------------------ as.Date("15 April 1912", "%d %b %Y") julian(as.Date("15 April 1912", "%d %b %Y")) titanic3$dob <- as.Date("15 April 1912", "%d %b %Y")+ (-titanic3$age*365.25) as.Date("15041912", "%d%m%Y") as.Date("150412", "%d%m%y") format(as.Date("1912April15", "%Y%b%d"),"%A %B %d, %Y") install.packages("lubridate") library(lubridate) wday(as.Date("1912April15", "%Y%b%d"),label=TRUE) ## ---- warning=FALSE,message=FALSE--------------------------------------------------------------------------------------------------------------- #install.packages("swirl") #library(swirl) #install_from_swirl("R Programming") #swirl() ## ----"Basic plot I"----------------------------------------------------------------------------------------------------------------------------- x <- (0:100)/10 plot(x, x^3 - 13 * x^2 + 39 * x) ## ----"Basic plot II"---------------------------------------------------------------------------------------------------------------------------- plot(x, x^3 - 13 * x^2 + 39 * x,cex.axis=1.5,cex.lab=1.5) ## ----"Enhancing a plot I"----------------------------------------------------------------------------------------------------------------------- plot(x,x^3-13*x^2+39*x,type="l",xlab="time (hours)",ylab="temperature",main="Enhanced plot",cex.axis=1.5,cex.lab=1.5) ## ----"Enhancing a plot II"---------------------------------------------------------------------------------------------------------------------- colors() plot(x,x^3-13*x^2+39*x,pch=18,xlab="time (hours)",ylab="temperature",col="red",main="Enhanced plot",cex.axis=1.5,cex.lab=1.5) palette() plot(x,x^3-13*x^2+39*x,pch=18,xlab="time (hours)",ylab="temperature",col=3,main="Enhanced plot",cex.axis=1.5,cex.lab=1.5) ## ----"Plot symbols"----------------------------------------------------------------------------------------------------------------------------- plot(1:25, pch=1:25,cex=2,bg="grey") ## ----"Enhancing a plot III"--------------------------------------------------------------------------------------------------------------------- x<-(0:100)/10 plot(x,x^3-13*x^2+39*x,type="l",xlab="time (hours)",ylab="temperature",cex.axis=1.5,cex.lab=1.5) points(2,34,col="red",pch=16,cex=2) arrows(4,50,2.2,34.5) text(4.15,50,"local maximum",adj=0,col="blue",cex=1.5) lines(x,30-50*sin(x/2),col="blue") legend(x=0,y=80,legend=c("Sahara","Gobi"),lty=1,col=c("black","blue"),cex=1.5) ## ----"Graphical parameters I"------------------------------------------------------------------------------------------------------------------- ?par par("bg") par(bg="green") par("bg") # rerun the plot commands x<-(0:100)/10 plot(x,x^3-13*x^2+39*x,type="l",xlab="time (hours)",ylab="temperature",cex.axis=1.5,cex.lab=1.5) points(2,34,col="red",pch=16,cex=2) arrows(4,50,2.2,34.5) text(4.15,50,"local maximum",adj=0,col="blue",cex=1.5) lines(x,30-50*sin(x/2),col="blue") legend(x=0,y=80,legend=c("Sahara","Gobi"),lty=1,col=c("black","blue"),cex=1.5) plot(1:10) par(bg="white") plot(1:10) # save default graphical parameters to be able to restore them later on par.defaults <- par(no.readonly=TRUE) par(bg="green") par("bg") plot(1:10) par(par.defaults) par("bg") plot(1:10) ## ----"Graphical parameters II"------------------------------------------------------------------------------------------------------------------ x<-(0:100)/10 plot(x,x^3-13*x^2+39*x,type="l",xlab= "time (hours)",ylab="temperature",lwd=3,las=1,cex.axis=1.5,cex.lab=1.5) ## ----"Histograms"------------------------------------------------------------------------------------------------------------------------------- hist(titanic3$age,breaks=15,freq=FALSE,cex.axis=1.5,cex.lab=1.5) ## ----"Boxplot"---------------------------------------------------------------------------------------------------------------------------------- boxplot(titanic3$fare,ylim=c(0,300),ylab="fare",cex.axis=1.5,cex.lab=1.5) boxplot(fare ~ pclass,data=titanic3,ylim=c(0,300),ylab="fare",cex.axis=1.5,cex.lab=1.5) plot(fare ~ pclass,data=titanic3,ylim=c(0,300),ylab="fare",cex.axis=1.5,cex.lab=1.5) ## ----"Advanced R graphics"---------------------------------------------------------------------------------------------------------------------- # graphics plot(fare~age,data=titanic3) # very skewed distribution plot(log10(fare)~age,data=titanic3) # plot log10(y) vs x plot(fare~age,data=titanic3,log="y") # original data, but log scale min(titanic3$fare,na.rm=TRUE) # zero values! sum(titanic3$fare==0,na.rm=TRUE) # zero values! install.packages("ggplot2") library(ggplot2) qplot(age,fare,data=titanic3) qplot(age,fare,data=titanic3) + scale_y_log10() # log scale for y values qplot(age,fare,data=titanic3,facets=~pclass) # one plot per pclass qplot(age,fare,data=titanic3,facets=~pclass,ylim=c(0,280)) # better for comparison qplot(age,fare,data=titanic3,facets=sex~pclass) + scale_y_log10() # even better qplot(age,fare,data=titanic3,facets=pclass~sex) + scale_y_log10() # better for comparing males and females per class #install.packages("devtools") #library(devtools) #install_github("rCharts", "ramnathv", ref = "dev") #library(rCharts) #titanic3$survived = factor(titanic3$survived, labels=c("no","yes")) #rPlot(fare ~ age| sex + pclass, data = titanic3, type = "point", color = "survived", size = list(const = 2)) #dt <- dTable(titanic3,sPaginationType= "full_numbers") #dt ## ----"Export: two types of format"-------------------------------------------------------------------------------------------------------------- ## Open the PDF device pdf("titanic3_ggplot_v1.pdf") qplot(age,fare,data=titanic3,facets=pclass~sex) + scale_y_log10() ## Close the device again to save the file dev.off() qplot(age,fare,data=titanic3,facets=pclass~sex) + scale_y_log10() dev.print("titanic3_ggplot_v2.pdf",device=pdf) # Change size, for pdf device width and height are specified in inches pdf("titanic3_ggplot_v1_big.pdf", width=21, height=21) qplot(age,fare,data=titanic3,facets=pclass~sex) + scale_y_log10() ## Close the device again to save the file dev.off() ## ----"Structure of R"--------------------------------------------------------------------------------------------------------------------------- search() ls() # objects in Workspace ls("package:stats")[1:20] # some objects in stats package ls("package:base")[1:30] # some objects in base package ## hierarchical structure and duplicate names data(quakes) # from package:datasets ## new quakes object in Workspace hides the other quakes <- 1:10 quakes ## removing it makes the other visible again rm(quakes) quakes$mag # original quakes not overwritten! find("quakes") ## ----"Workspace management"--------------------------------------------------------------------------------------------------------------------- save(titanic3, file="Titanic.RData") rm(titanic3) load("Titanic.RData") ## ----"Project management"----------------------------------------------------------------------------------------------------------------------- getwd() #setwd("c:/AMC/Teaching/RCursus") # just an example, note that a forward slash is used ## ----"Data manipulation"------------------------------------------------------------------------------------------------------------------------ ## sort install.packages("dplyr") library(dplyr) titanic.SortedAge <- arrange(titanic3, age) head(titanic.SortedAge) ## base R x <- c(5,8,1) sort(x) order(x) x[order(x)] z <- data.frame(x=c(5,8,1), y=c("A","M","C")) z z[sort(z$x),] z[order(z$x),] ## merge emb <- data.frame(embarked=c("Cherbourg","Queenstown","Southampton"), country=c("France","Ireland","United Kingdom")) emb titanic3 <- left_join(titanic3, emb, by="embarked") # all rows from titanic 3, all columns from both head(titanic3) ## base R titanic3 <- merge(titanic3, emb) ## wide <-> long install.packages("tidyr") library(tidyr) df1 <- data.frame(c("a", "a", "a", "a", "b", "b", "b", "b"), c(1,2,3,4,1,2,3,4), c(0.1, 0.2, 0.3, 0.4, 2.1, 2.2, 2.3, 2.4)) colnames(df1) <- c("Name", "Time", "Value") df1 df1.wide <- spread(df1, Time, Value) df1.wide df1.long <- gather(df1.wide, "Time", "Value", 2:5) df1.long arrange(df1.long, Name, Time) ## base R df1.wide <- reshape(df1, timevar="Time", idvar="Name", direction="wide", v.names="Value") df1.wide df1.long <- reshape(df1.wide, varying=2:5, direction="long") df1.long df1.long[order(df1.long$Name,df1.long$time),] ## ----"Transformed variables"-------------------------------------------------------------------------------------------------------------------- ## transformation using within titanic3 <- within(titanic3, { agecat <- cut(age, breaks=c(0,18,30,40,80)) family <- factor(ifelse(parch==0 & sibsp==0, "no", "yes")) }) summary(titanic3[,c("age","agecat","family")]) ## transformation using transform titanic3 <- transform(titanic3, agecat = cut(age, breaks=c(0,18,30,40,80)), family = factor(ifelse(parch==0 & sibsp==0, "no", "yes")) ) summary(titanic3[,c("age","agecat","family")]) ## ----"Making basic summaries"------------------------------------------------------------------------------------------------------------------- # crosstable using descr package install.packages("descr") library(descr) descr(titanic3$age) with(titanic3, CrossTable(pclass,sex,format="SPSS")) ## summary by subgroup, all produce basically the same output ## basic R functions aggregate(age~pclass, data=titanic3, FUN=summary) by(titanic3,titanic3$pclass,FUN=function(x) summary(x$age)) with(titanic3, tapply(age,pclass,FUN=summary)) ## using doBy package install.packages("doBy") library(doBy) summaryBy(age~pclass, data=titanic3, FUN=summary) ## ----"Searching"-------------------------------------------------------------------------------------------------------------------------------- help(summary) fit.age <- lm(age ~ pclass, data=titanic3) summary(fit.age) class(fit.age) help(summary.lm) help(descriptive) #help.search("descriptive statistics") # same as ??descriptive #RSiteSearch("{descriptive statistics}") # braces to indicate that words are to be searched as one entity #install.packages("sos") #library(sos) #findFn("{descriptive statistics}") ## ----"Regression: Formulas"--------------------------------------------------------------------------------------------------------------------- help.search("Student") help(t.test) with(titanic3,t.test(age[sex=="male"],age[sex=="female"])) t.test(subset(titanic3,sex=="male")$age,subset(titanic3,sex=="female")$age) test.age <- t.test(age ~ sex, data = titanic3) test.age ## ----"Model output"----------------------------------------------------------------------------------------------------------------------------- fit.age <- lm(age ~ pclass, data=titanic3) fit.age summary(fit.age) ## output of model is a list str(fit.age) names(fit.age) summary(fit.age$residuals) ## or summary(resid(fit.age)) hist(resid(fit.age)) coef(fit.age) predict(fit.age, newdata=data.frame(pclass=factor(levels(titanic3$pclass)))) update(fit.age,. ~ .+sex) ## the output has a specific class, in this case lm class(fit.age) ## which functions have been written for this class? help(methods) methods(class="lm") ## we want to know what the summary function does with an object of class lm help(summary) # does not give the desired help file ## we need to consult the help file for summary.lm methods(summary) help(summary.lm) plot(fit.age) ## ----"Formula structure"------------------------------------------------------------------------------------------------------------------------ plot(age ~ fare, data=titanic3) ## the type of plot is automaticaaly adapted to the type of variable plot(age~pclass, data=titanic3) ## ----"Statements: if-then-else"----------------------------------------------------------------------------------------------------------------- x <- 10 z <- if (x < 2) 4 else 3 z ## ----"Statements: repetition"------------------------------------------------------------------------------------------------------------------- A <- matrix(1:10, 2, 5) rownames(A) <- c("gene 1", "gene 2") colnames(A) <- c("array 1", "array 2", "array 3","array 4", "array 5") results <- numeric(2) results for (i in 1:2) { results[i] <- mean(A[i, ]) } results ## ----"apply"------------------------------------------------------------------------------------------------------------------------------------ apply(A, 1, mean) ## ----"tapply: Example"-------------------------------------------------------------------------------------------------------------------------- titanic3 <- read.dta("Exercises/titanic3.dta", convert.underscore=TRUE) head(titanic3[,c("fare","pclass")]) with(titanic3,tapply(fare, pclass, mean,na.rm=T)) # Solution using the dplyr package #install.packages("dplyr") library(dplyr) summarise(group_by(titanic3, pclass), mean(fare,na.rm=TRUE))