## ----twotwo------------------------------------------------------------------------------------------------------------------------------------- 2+2 ## ----conversion,solution=TRUE------------------------------------------------------------------------------------------------------------------- # This is the current exchange rate 15 * 1.1767 ## ----rounding,solution=TRUE--------------------------------------------------------------------------------------------------------------------- curr <- round(15 * 1.1767, 1) curr ## ----summary,solution=TRUE---------------------------------------------------------------------------------------------------------------------- mode(curr) str(curr) summary(curr) ## ----vec,solution=TRUE-------------------------------------------------------------------------------------------------------------------------- vec <- seq(11,30) vec ## ----vec7,solution=TRUE------------------------------------------------------------------------------------------------------------------------- vec[7] ## ----vec-15,solution=TRUE----------------------------------------------------------------------------------------------------------------------- # Use '-' to exclude elements vec[-15] ## ----vec2and5,solution=TRUE--------------------------------------------------------------------------------------------------------------------- # Use the concatenate function 'c' vec[c(2,5)] ## ----odd,solution=TRUE-------------------------------------------------------------------------------------------------------------------------- index <- seq(1,length(vec),by=2) vec[index] ## Script, Console, Environment, History, Files, Plots, Packages, Help, Viewer. ## ----sd,solution=TRUE--------------------------------------------------------------------------------------------------------------------------- # Don't call the variable 'mean' since the function 'mean' already exists xmean <- sum(vec)/length(vec) xmean mean(vec) # Calculate the standard deviation in three (baby) steps numerator <- sum((vec-xmean)^2) denominator <- (length(vec)-1) xstd <- sqrt(numerator/denominator) xstd sd(vec) ## ----islands------------------------------------------------------------------------------------------------------------------------------------ # This could in this case actually be skipped since the package datasets is # already loaded data(islands) ## ----length,solution=TRUE----------------------------------------------------------------------------------------------------------------------- # From ?islands: "Description: The areas in thousands of square miles of # the landmasses which exceed 10,000 square miles." So all of them: length(islands) ## ----sel-islands1,solution=TRUE----------------------------------------------------------------------------------------------------------------- # Remember that area was given in thousands of square miles islands.more20 <- (islands > 20) ## ----sel-islands2,solution=TRUE----------------------------------------------------------------------------------------------------------------- # Use the Boolean vector islands.more20 to select the ones with value TRUE islands[islands.more20] ## ----names-islands,solution=TRUE---------------------------------------------------------------------------------------------------------------- islands.names <- names(islands) ## ----moluccas,solution=TRUE--------------------------------------------------------------------------------------------------------------------- notMoluccas <- islands.names != "Moluccas" islands.withoutMoluccas <- islands[notMoluccas] # Another solution is: islands.withoutMoluccas <- subset(islands,islands.names!="Moluccas") ## ----stata-imp,echo=FALSE----------------------------------------------------------------------------------------------------------------------- library(foreign) titanic3 <- read.dta("titanic3.dta", convert.underscore=TRUE) ## ----stata-imp,eval=FALSE,solution=TRUE--------------------------------------------------------------------------------------------------------- ## library(foreign) ## titanic3 <- read.dta("titanic3.dta", convert.underscore=TRUE) ## This can be done using `View` on the command line. You can also use the menu: in R use **Edit - Data editor**, in RStudio click on the name of the data set in the **Environment** window. ## ----head,solution=TRUE------------------------------------------------------------------------------------------------------------------------- head(titanic3,4) tail(titanic3,4) ## ----dim,eval=FALSE----------------------------------------------------------------------------------------------------------------------------- ## dim(titanic3) ## ----str,eval=FALSE----------------------------------------------------------------------------------------------------------------------------- ## str(titanic3) ## ----dim,solution=TRUE-------------------------------------------------------------------------------------------------------------------------- dim(titanic3) ## ----summarydf,solution=TRUE,samepage=TRUE------------------------------------------------------------------------------------------------------ summary(titanic3[,c("survived","pclass","home.dest","dob")]) ## ----summaries,solution=TRUE-------------------------------------------------------------------------------------------------------------------- # Note that you have to convert percentages into probabilities quantile(titanic3$age, probs=c(0.05,0.25,0.5,0.75,0.95), na.rm=TRUE) IQR(titanic3$age, na.rm=TRUE) ## ----tables,solution=TRUE----------------------------------------------------------------------------------------------------------------------- table(titanic3$survived) table(titanic3$sex, titanic3$survived) ## Running the following line of code: ## ## `titanic3.select <- read.table("titanic3select.txt",sep="\t",header=TRUE)` ## ## throws an error message: ## ## `Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : line 16 did not have 18 elements ## Error during wrapup: cannot open the connection` ## ## This error message can in principle be corrected by adding the argument `fill=TRUE`, which adds blank fields if rows have unequal length. However, running ## ## `titanic3.select <- read.table("titanic3select.txt",sep="\t",header=TRUE,fill=TRUE)` ## ## throws a warning message: ## ## `Warning message: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : ## EOF within quoted string` ## ## It turns out that the data was not correctly imported, which is often easily diagnosed by inspecting the dimensions: ## ## `dim(titanic3.select)` ## ## Indeed the number of rows is 25 instead of the expected 30. This is caused by a trailing ' at row 25 for variable `home_dest`. This can be corrected by changing the default set of quoting characters: ## ## `titanic3.select <- read.table("titanic3select.txt", sep="\t", header=TRUE, fill=TRUE, quote="\"")` ## ## A closer inspection of the data shows that the last column only contains NAs caused by trailing spaces. On the course website there is a [corrected version](http://bioinformatics.amc.nl/wp-content/uploads/gs-computing-in-r/titanic3select_corrected.txt) of the file where the last column and the trailing ' at row 25 were deleted. ## ## `titanic3.select <- read.table("titanic3select_corrected.txt",sep="\t",header=TRUE)` ## ----diffchar-fact------------------------------------------------------------------------------------------------------------------------------ sapply(titanic3, mode) sapply(titanic3, is.factor) ## ----factor-survived,solution=TRUE-------------------------------------------------------------------------------------------------------------- # titanic3info.txt: Survival (0 = No; 1 = Yes) titanic3$status <- factor(titanic3$survived, labels=c("no","yes")) ## ----as-numeric,eval=FALSE---------------------------------------------------------------------------------------------------------------------- ## as.numeric(head(titanic3$pclass)) ## as.numeric(head(titanic3$home.dest)) ## ----as-numeric,solution=TRUE,warning=FALSE----------------------------------------------------------------------------------------------------- as.numeric(head(titanic3$pclass)) as.numeric(head(titanic3$home.dest)) ## ----select-old,solution=TRUE------------------------------------------------------------------------------------------------------------------- # Note that the function 'subset' also leaves out the passsengers for which the # variable 'age' is missing subset(titanic3,age>70)[,c("name","home.dest")] # Another solution is subset(titanic3,age>70,select=c(name,home.dest)) # Yet another solution is titanic3[titanic3$age>70 & !is.na(titanic3$age),c("name","home.dest")] ## ----Uruguay,solution=TRUE---------------------------------------------------------------------------------------------------------------------- subset(titanic3, name=="Artagaveytia, Mr. Ramon") # No he didn't travel with relatives, since the variable 'family' is 'no' ## ----table-class,solution=TRUE------------------------------------------------------------------------------------------------------------------ xtabs(~sex+status, data=titanic3, subset=(pclass=="1st")) ## ----plot--------------------------------------------------------------------------------------------------------------------------------------- plot(3,3,main="Main title",sub="subtitle",xlab="x-label",ylab="y-label") text(3.5,3.5,"text at (3.5,3.5)") abline(h=3.5,v=3.5) for (side in 1:4) mtext(-1:4,side=side,at=2.3,line=-1:4) mtext(paste("side",1:4),side=1:4,line=-1,font=2) ## ----------------------------------------------------------------------------------------------------------------------------------------------- par("mar") ## ----parmar5,eval=FALSE------------------------------------------------------------------------------------------------------------------------- ## par(mar=c(5,5,5,5)) ## ----png,eval=FALSE,solution=TRUE--------------------------------------------------------------------------------------------------------------- ## png(file="figure.png") ## # Add code here to plot the figure (including par(mar=c(5,5,5,5))) ## dev.off() ## pdf(file="figure.pdf") ## # Add code here to plot the figure (including par(mar=c(5,5,5,5))) ## dev.off() ## ----pdf,eval=FALSE,solution=TRUE--------------------------------------------------------------------------------------------------------------- ## pdf(file="figureLarge.pdf",width=21,height=21) ## # Add code here to plot the figure (including par(mar=c(5,5,5,5))) ## dev.off() ## ----quakes------------------------------------------------------------------------------------------------------------------------------------- data(quakes) ## ----plot-quakes1,echo=FALSE-------------------------------------------------------------------------------------------------------------------- plot(quakes$long, quakes$lat) ## ----plot-quakes1,eval=FALSE,solution=TRUE------------------------------------------------------------------------------------------------------ ## plot(quakes$long, quakes$lat) ## ----mag-quakes,echo=FALSE---------------------------------------------------------------------------------------------------------------------- mag.cat <- with(quakes,cut(mag, breaks = c(4, 4.5, 5, 7), include.lowest = TRUE )) # More generic using the functions 'min' and 'max' that return the # minimum and maximum, respectively, of a vector mag.cat <- with(quakes,cut(mag, breaks = c(min(mag), 4.5, 5, max(mag)), include.lowest = TRUE )) levels(mag.cat) <- c("low", "medium", "high") depth.cat <- factor(ifelse(quakes$depth>400, "deep", "shallow")) ## ----mag-quakes,eval=FALSE,solution=TRUE-------------------------------------------------------------------------------------------------------- ## mag.cat <- with(quakes,cut(mag, breaks = c(4, 4.5, 5, 7), ## include.lowest = TRUE )) ## # More generic using the functions 'min' and 'max' that return the ## # minimum and maximum, respectively, of a vector ## mag.cat <- with(quakes,cut(mag, breaks = c(min(mag), 4.5, 5, max(mag)), ## include.lowest = TRUE )) ## levels(mag.cat) <- c("low", "medium", "high") ## depth.cat <- factor(ifelse(quakes$depth>400, "deep", "shallow")) ## ----plot-quakes2,echo=FALSE-------------------------------------------------------------------------------------------------------------------- # Note that for 'col' the factor 'depth.cat' is interpreted as numeric (deep=1 and # shallow=2) and that the first two colours of palette() are used. Strangely enough # for 'pch' this is not the case and the factor ''mag.cat' has to be converted to a # numeric vector using the function 'as.numeric'. plot(quakes$long, quakes$lat, pch=as.numeric(mag.cat), col=depth.cat) ## ----plot-quakes2,eval=FALSE,solution=TRUE------------------------------------------------------------------------------------------------------ ## # Note that for 'col' the factor 'depth.cat' is interpreted as numeric (deep=1 and ## # shallow=2) and that the first two colours of palette() are used. Strangely enough ## # for 'pch' this is not the case and the factor ''mag.cat' has to be converted to a ## # numeric vector using the function 'as.numeric'. ## plot(quakes$long, quakes$lat, pch=as.numeric(mag.cat), col=depth.cat) ## ----energy,echo=FALSE-------------------------------------------------------------------------------------------------------------------------- energy <- (10^(3/2))^quakes$mag ## ----energy,eval=FALSE,solution=TRUE------------------------------------------------------------------------------------------------------------ ## energy <- (10^(3/2))^quakes$mag ## ----symbolsize,echo=FALSE---------------------------------------------------------------------------------------------------------------------- symbolsize <- sqrt(energy)/median(sqrt(energy)) ## ----symbolsize,eval=FALSE,solution=TRUE-------------------------------------------------------------------------------------------------------- ## symbolsize <- sqrt(energy)/median(sqrt(energy)) ## ----plot-quakes3,echo=FALSE-------------------------------------------------------------------------------------------------------------------- plot(quakes$long, quakes$lat, cex=symbolsize, col=depth.cat) ## ----plot-quakes3,eval=FALSE,solution=TRUE------------------------------------------------------------------------------------------------------ ## plot(quakes$long, quakes$lat, cex=symbolsize, col=depth.cat) ## ----plot-titanic,echo=FALSE-------------------------------------------------------------------------------------------------------------------- par(mfrow=c(3,1)) # Make the margins smaller than default par(mar=c(2,4,2,2)) hist(titanic3$age,breaks=15,freq=FALSE) boxplot(titanic3$fare,ylim=c(0,300),ylab="fare") boxplot(fare~pclass,data=titanic3,ylim=c(0,300),ylab="fare") ## ----plot-titanic,eval=FALSE,solution=TRUE------------------------------------------------------------------------------------------------------ ## par(mfrow=c(3,1)) ## # Make the margins smaller than default ## par(mar=c(2,4,2,2)) ## hist(titanic3$age,breaks=15,freq=FALSE) ## boxplot(titanic3$fare,ylim=c(0,300),ylab="fare") ## boxplot(fare~pclass,data=titanic3,ylim=c(0,300),ylab="fare") ## ----search,solution=TRUE----------------------------------------------------------------------------------------------------------------------- # You will get different results depending on your configuration search() ls() getwd() ## ----ls,eval=FALSE------------------------------------------------------------------------------------------------------------------------------ ## ls("package:base", pattern="mean") ## ----save,eval=FALSE,solution=TRUE-------------------------------------------------------------------------------------------------------------- ## save(titanic3, file="Titanic.RData") ## ----arrange,solution=TRUE,warning=FALSE,message=FALSE------------------------------------------------------------------------------------------ # If not installed yet, first install the package dplyr that contains the function # arrange #install.packages("dplyr") library(dplyr) data.sorted <- arrange(titanic3, age) head(data.sorted[,1:5],10) # Use is.na to exclude the passengers for whom 'age' is missing tail(subset(data.sorted,!is.na(age))[,1:5],10) ## ----summary-fare1,solution=TRUE---------------------------------------------------------------------------------------------------------------- summary(subset(titanic3, pclass=="1st")$fare) summary(subset(titanic3, pclass=="2nd")$fare) summary(subset(titanic3, pclass=="3rd")$fare) ## Instead of writing three lines of code, applying the `summary` function to each of the three subsets, we could write a `for` loop. When using a `for` loop, we need to explicitly write the `print` command: ## ## `for(i in 1:3){print(summary(subset(titanic3, pclass==levels(pclass)[i])$fare))}` ## ## The function `aggregate` is yet another option: ## ## `aggregate(fare~pclass,data=titanic3,FUN=summary)` ## ## Later, we will see even more efficient ways. ## ----transform,solution=TRUE-------------------------------------------------------------------------------------------------------------------- titanic3 <- within(titanic3, { sibspcat <- cut(sibsp, breaks=c(0,1,2,9),include.lowest=TRUE, right=FALSE, labels=c("0","1","2-8")) paid <- factor(fare>0, labels=c("no","yes")) } ) titanic3 <- transform(titanic3, sibspcat = cut(sibsp, breaks=c(0,1,2,9),include.lowest=TRUE, right=FALSE, labels=c("0","1","2-8")), paid = factor(fare>0, labels=c("no","yes")) ) ## ----AMC-1,echo=FALSE,results="hide"------------------------------------------------------------------------------------------------------------ AMC <- c("Academic","Medical","Center") # a character vector named AMC mode(AMC) is.character(AMC) ## ----AMC-1,solution=TRUE------------------------------------------------------------------------------------------------------------------------ AMC <- c("Academic","Medical","Center") # a character vector named AMC mode(AMC) is.character(AMC) ## ----AMC-2,eval=FALSE--------------------------------------------------------------------------------------------------------------------------- ## help.search("abbreviate") ## help.search("combine") ## help.search("quote") ## ----AMC-3,solution=TRUE------------------------------------------------------------------------------------------------------------------------ #help(abbreviate) abbreviate(AMC,1) # selects first character #help(paste) paste(abbreviate(AMC,1),collapse="") # gives AMC ## ----AMC-4,results="hide"----------------------------------------------------------------------------------------------------------------------- noquote(paste(abbreviate(AMC,1),collapse="")) # removes quotes ## ----crosstab,eval=FALSE,solution=TRUE---------------------------------------------------------------------------------------------------------- ## help(crosstab) ## help.search("crosstab") ## RSiteSearch("crosstab") ## install.packages("sos") ## library(sos) ## findFn("crosstab") ## ----linmodel-fit,solution=TRUE----------------------------------------------------------------------------------------------------------------- fit.fare <- lm(log10(fare+1) ~ age, data=titanic3) summary(fit.fare) ## ----hist-resid,echo=FALSE---------------------------------------------------------------------------------------------------------------------- fit.fare <- lm(log10(fare+1) ~ age, data=titanic3) hist(resid(fit.fare)) ## ----hist-resid,eval=FALSE,solution=TRUE-------------------------------------------------------------------------------------------------------- ## fit.fare <- lm(log10(fare+1) ~ age, data=titanic3) ## hist(resid(fit.fare)) ## ----resid,eval=FALSE--------------------------------------------------------------------------------------------------------------------------- ## plot(resid(fit.fare)~fitted(fit.fare)) ## ----fareage,echo=FALSE------------------------------------------------------------------------------------------------------------------------- plot(log10(fare+1)~age,data=titanic3) abline(fit.fare) ## ----fareage,eval=FALSE,solution=TRUE----------------------------------------------------------------------------------------------------------- ## plot(log10(fare+1)~age,data=titanic3) ## abline(fit.fare) ## ----linmodel-class,solution=TRUE--------------------------------------------------------------------------------------------------------------- class(fit.fare) methods(class="lm") ## ----apply,solution=TRUE------------------------------------------------------------------------------------------------------------------------ apply(subset(titanic3,select=c("age","fare","body")),2,mean,na.rm=TRUE) ## ----chickwts----------------------------------------------------------------------------------------------------------------------------------- data(chickwts) ## ----mean-weight,solution=TRUE------------------------------------------------------------------------------------------------------------------ tapply(chickwts$weight, chickwts$feed, mean) ## ----number-chicks,solution=TRUE---------------------------------------------------------------------------------------------------------------- sum(chickwts$weight > 300) ## ----func,eval=FALSE---------------------------------------------------------------------------------------------------------------------------- ## name <- function(arg_1,arg_2, ...) expr ## ----gt-chicks,solution=TRUE-------------------------------------------------------------------------------------------------------------------- greater300 <- function(x) { sum(x > 300) } ## ----gt-chicks2,solution=TRUE------------------------------------------------------------------------------------------------------------------- tapply(chickwts$weight, chickwts$feed, greater300)