## ----install-packages,warning=FALSE,message=FALSE,eval=FALSE------------- ## # If you didn't do so yet, first install the required packages. The commented line ## # is needed for the L0 desktops, but can be skipped on other systems ## #.libPaths("C:/Scratch") ## # Installation of packages might take a few minutes ## # If in the console you are asked "Update all/some/none? [a/s/n]:". Just reply "n" ## install.packages("BiocManager") ## BiocManager::install(c("affy","arrayQualityMetrics","bioDist","genefilter","GenomeInfoDbData", ## "hgu133acdf","limma","tibble","mclust","ClassDiscovery")) ## ----load-packages,warning=FALSE,message=FALSE--------------------------- library(affy) library(arrayQualityMetrics) library(bioDist) library(ClassDiscovery) library(genefilter) library(limma) library(mclust) ## ----hcexample-data------------------------------------------------------ E <- read.table("hcexample.txt") matplot(E,type="l",col=1:4,lty=1:4,lwd=3,xlab="Gene",ylab="log2-ratio",xaxt="n") axis(1,1:4) legend(3.3,2.8,c("A","B","C","D"),lty=1:4,col=1:4,lwd=3,y.intersp=1,cex=1.3) ## ----distance-matrix----------------------------------------------------- # Note that you have to transpose (t) the data matrix E since pairwise distances # are calculated for rows of a matrix d.euc <- euc(t(E)) d.euc d.cor <- cor.dist(t(E),abs=FALSE) d.cor ## ----hclust,solution=TRUE,box.title="R code"----------------------------- # First specify that the two plots should be made on the same row par(mfrow=c(1,2)) hc.euc <- hclust(d.euc, method="average"); plot(hc.euc,main="Dendrogram (Euclidean)",sub="Average linkage",hang=-1); hc.cor <- hclust(d.cor, method="average"); plot(hc.cor,main="Dendrogram (Correlation)",sub="Average linkage",hang=-1); par(mfrow=c(1,1)) ## ----rnorm-data---------------------------------------------------------- # 1000 genes n.genes <- 1000 # 50 samples n.samples <- 50 # Generate labels for the samples descr <- paste("S", rep(c("0",""),times=c(9,41)), 1:50, sep="") # Fix the random seed to make the exercise reproducible set.seed(13) # Matrix of expression data for 1000 genes and 50 samples dataMatrix <- matrix(rnorm(n.genes*n.samples),nrow=n.genes) ## ----rnorm-hclust,solution=TRUE,fig.width=10, fig.height=5--------------- # First specify that the three plots should be made on the same row par(mfrow=c(1,3)) # The following code generates the three dendrograms dEuc <- euc(t(dataMatrix)) hAvgEuc <-hclust(dEuc, method="average") plot(hAvgEuc, labels=descr,cex=0.6) hSinEuc <-hclust(dEuc, method="single") plot(hSinEuc, labels=descr,cex=0.6) hComEuc <-hclust(dEuc, method="complete") plot(hComEuc, labels=descr,cex=0.6) # Back to default par(mfrow=c(1,1)) ## ----jq-data,solution=TRUE,warning=FALSE,message=FALSE,box.title="R code"---- jq.data <- read.table("jq.txt") heatmap(as.matrix(jq.data),Colv=NA,scale="none",labRow=NA,col=blueyellow(64)) ## ----jq-data2,echo=FALSE------------------------------------------------- jq.data <- read.table("jq.txt") ## ----heatmap,solution=TRUE,box.title="R code"---------------------------- heatmap(as.matrix(jq.data),Colv=NA,scale="row",labRow=NA,col=blueyellow(64)) ## ----jq-height----------------------------------------------------------- # Plot the height of the 20 branches closest to the root of the tree plot(rev(hclust(euc(as.matrix(jq.data)))$height)[1:20], main="height of the branches of the dendrogram",type="l",ylab="height") ## ----rnorm-height,solution=TRUE,box.title="R code"----------------------- # Plot the height of all branches plot(rev(hclust(euc(t(dataMatrix)),method="complete")$height), main="height of the branches of the dendrogram",type="l",ylab="height") ## ----gse14722,warning=FALSE, message=FALSE------------------------------- # Download raw data from GEO # Normally this can be done via the lines of code that are commented out, but # this option is sometimes blocked. Therefore we download both files 'manually'. #library(GEOquery) #getGEOSuppFiles("GSE14722") # The CEL files are downloaded and stored in folder GSE14722 as a tar archive dir.create("GSE14722") setwd("GSE14722/") if (!file.exists("GSE14722_RAW.tar")){ download.file("ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE14nnn/GSE14722/suppl/GSE14722_RAW.tar", destfile="GSE14722_RAW.tar",mode="wb") download.file("ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE14nnn/GSE14722/suppl/filelist.txt", destfile="filelist.txt") } # If the next line gives an error (GSM367781.CEL.gz: Can't create ...) use the Windows program 7-Zip # with 'Uitpakken (hier)' untar("GSE14722_RAW.tar") # Read in the gzipped CEL files for the 23 HG-U133A chips tab <- read.delim("filelist.txt") affybatch <- ReadAffy(filenames=as.character(tab[2:24,"Name"])) # Add sample annotation, in this case 'by hand' so to speak pData(affybatch) <- data.frame(condition=c(rep("preterm",11),rep("preeclampsia",12)), pData(affybatch)) # You might want to verify the sample annotation with the information available # at GEO to be sure that we did not make any mistake pData(affybatch) setwd("..") ## ----affybatch,warning=FALSE--------------------------------------------- affybatch ## ----aqm-raw, warning=FALSE,message=FALSE-------------------------------- ## do.logtransform: data still has to be log-transformed ## intgroup: add 'condition' (preeclampsia/preterm) as a factor of interest if (!dir.exists("aQ_raw")){ arrayQualityMetrics(affybatch,outdir="aQ_raw",do.logtransform=TRUE,spatial=FALSE, intgroup="condition",force=TRUE) } ## ----aqm-norm,warning=FALSE, message=FALSE------------------------------- # Normalization using RMA eset <- rma(affybatch) if (!dir.exists("aQ_norm")){ arrayQualityMetrics(eset,outdir="aQ_norm",intgroup="condition", force=TRUE) } ## ----aqm-norm-batch,warning=FALSE, message=FALSE------------------------- # Create a vector of scandates pData(eset)$scandate <- sapply(protocolData(eset)$ScanDate,function(x) strsplit(x," ")[[1]][1]) pData(eset)$scandate if (!dir.exists("aQ_norm_batch")){ arrayQualityMetrics(eset,outdir="aQ_norm_batch",intgroup=c("scandate","condition"), force=TRUE) } ## ----rowttests,message=FALSE, warning=FALSE------------------------------ # exprs(eset): extracts the expression values tt <- rowttests(exprs(eset),as.factor(pData(eset)$condition)) # Inspect the first rows of the matrix tt head(tt) # Number of rows and columns of tt dim(tt) ## ----hist---------------------------------------------------------------- # If you don't see a plot appearing, execute dev.off() in the console one (or more) times till you get 'null device 1' hist(tt$p.value,breaks=50,col="orange",main="nominal p-values") # Number of probesets with p < 0.05 sum(tt$p.value<0.05) ## ----bonferroni---------------------------------------------------------- p.bonferroni <- p.adjust(tt$p.value,"bonferroni") sum(p.bonferroni<0.05) p.fdr <- p.adjust(tt$p.value,"fdr") hist(p.fdr,col="yellow",main="FDR corrected") sum(p.fdr<0.05)