## ----install-packages,warning=FALSE,message=FALSE,eval=FALSE-------------
## # Installing packages should now be a piece of cake ...
## # The commented line is needed for the L0 desktops, but can be skipped on other systems
## #.libPaths("C:/Scratch")
## install.packages("BiocManager")
## BiocManager::install(c("Biobase","annotate","arrayQualityMetrics","caret","e1071","genefilter",
##            "hgu133a.db","lattice","limma","NMF","pamr","RColorBrewer"))


## ----load-packages,warning=FALSE,message=FALSE---------------------------
library(annotate)
library(arrayQualityMetrics)
library(Biobase)
library(caret)
library(genefilter)
library(hgu133a.db)
library(lattice)
library(limma)
library(NMF)


## ----eset,warning=FALSE,message=FALSE------------------------------------
download.file("http://bioinformatics.amc.nl/wp-content/uploads/gs-bioinformatics/PathwaysNetworks/eset.zip", destfile="eset.zip")
unzip("eset.zip")
load("eset.RData")
# Sample annotation:
pData(eset)


## ----design,warning=FALSE,message=FALSE----------------------------------
design <- model.matrix(~0+pData(eset)$condition)
colnames(design) <- c("preeclampsia","preterm")
design


## ----lmfit,warning=FALSE, message=FALSE----------------------------------
fit <- lmFit(eset,design)


## ----fit,warning=FALSE,message=FALSE-------------------------------------
# Only show this for the first probeset. First the coefficient in the linear 
# model for the preeclampsia samples
fit$coef[1,1] 
# Then calculate the mean 'by hand'
mean(exprs(eset)[1,as.character(pData(eset)$condition)=="preeclampsia"])
        
# Idem for the preterm samples
fit$coef[1,2] 
mean(exprs(eset)[1,as.character(pData(eset)$condition)=="preterm"])                         


## ----contrast,warning=FALSE,message=FALSE--------------------------------
# Specify the comparison of interest
cont.matrix <- makeContrasts(PreeclampsiaVsPreterm = preeclampsia - preterm, levels = design)
cont.matrix
fit2 <- contrasts.fit(fit, cont.matrix)
# Indeed this gives the difference of the mean log-intensities (only show this for 
# the first probeset)
fit2$coef[1,1]
fit$coef[1,1] - fit$coef[1,2]


## ----toptable,warning=FALSE, message=FALSE-------------------------------
fit2.eb <- eBayes(fit2)
# Extract a table from a linear model fit ordered by nominal p-value
tt.limma <- topTable(fit2.eb,n=Inf)
# Number of differentially expressed probesets with an FDR < 0.05
sum(tt.limma$adj<0.05)
# Select the ten most differentially expressed genes
tt.limma[1:10,]




## ----exprs,solution=TRUE,box.title="R code",warning=FALSE,message=FALSE----
# Our expression values for 31637_s_at
exprs(eset)["31637_s_at",]


## ----rowttests,solution=TRUE,box.title="R code",warning=FALSE,message=FALSE----
# Results for a classical t-test
tt <- rowttests(exprs(eset),as.factor(pData(eset)$condition))
# Use 'match' to have the probesets in the same order 
plot(tt.limma$P.Value,tt$p.value[match(rownames(tt.limma),rownames(tt))],xlim=c(0,0.1),
     ylim=c(0,0.1),cex=0.4)
abline(0,1,col="red")


## ----scan-dates,warning=FALSE, message=FALSE-----------------------------
# Switch order from (preeclampsia,preterm) to (preterm,preeclampsia)
pData(eset)$condition <- relevel(pData(eset)$condition, "preterm")
# Include scan date as a blocking variable
design <- model.matrix(~pData(eset)$condition + pData(eset)$scandate)
# This is a more complicated design than the previous one. I will explain this some time
# this afternoon
head(design,n=4)
tail(design,n=4)
fit <- lmFit(eset,design)


## ----diff-exp,warning=FALSE, message=FALSE-------------------------------
fit.eb <- eBayes(fit)
# The second coefficient now corresponds to the comparison of interest
tt.pvsp <- topTable(fit.eb,coef=2,n=Inf)
# Number of differentially expressed probesets with an FDR < 0.05
sum(tt.pvsp$adj<0.05)
# Select the ten most differentially expressed genes
tt.pvsp[1:10,]




## ----stripplot,fig.width=12----------------------------------------------
# Create a panel plot
stripplot(exprs(eset)["203140_at",] ~ pData(eset)$condition |
            factor(pData(eset)$scandate), layout = c(5, 1)) 


## ----batch-effect,warning=FALSE,message=FALSE----------------------------
# Coefficients 3-6 correspond to the pairwise comparisons of batch '02/11/05' with the 
# other batches
tt.batch <- topTable(fit.eb,coef=3:6,n=Inf)
sum(tt.batch$adj<0.05)
# Select the ten most differentially expressed genes
tt.batch[1:10,]


## ----stripplot-2,fig.width=12--------------------------------------------
# Create a panel plot
stripplot(exprs(eset)["213350_at",] ~ pData(eset)$condition |
            factor(pData(eset)$scandate), layout = c(5, 1)) 


## ----remove-batch,warning=FALSE,message=FALSE----------------------------
eset.corrected <- eset
exprs(eset.corrected) <- removeBatchEffect(exprs(eset), batch=factor(pData(eset)$scandate), 
                                           design=model.matrix(~pData(eset)$condition))


## ----aqm,warning=FALSE, message=FALSE------------------------------------
if (!dir.exists("aQ_corrected")){
  arrayQualityMetrics(eset.corrected,outdir="aQ_corrected",
                    intgroup=c("condition","scandate"),force=TRUE)
}




## ----annotation,warning=FALSE,message=FALSE------------------------------
probesets <- as.character(rownames(tt.pvsp))
# Retrieve Entrez Gene IDs for the U133A chip
eg <- getEG(probesets, "hgu133a")
# Retrieve gene symbols for the U133A chip
sym <- getSYMBOL(probesets, "hgu133a")
tt.pvsp$ID <- as.character(rownames(tt.pvsp))
# Generate an HTML table with annotation
htmlpage(as.data.frame(eg), filename = "report.html", title = "HTML report",
         othernames = data.frame(sym, tt.pvsp), 
         table.head = c("GeneID","Symbol", colnames(tt.pvsp)), 
         table.center = TRUE)


## ----write-table,warning=FALSE,message=FALSE-----------------------------
write.table(data.frame(GeneID=eg,Symbol=sym,tt.pvsp),file="tt_GSE14722.txt",sep="\t",
            row.names=FALSE)





## ----aheatmap,warning=FALSE,message=FALSE--------------------------------
# Select the related genes (KEGG ID: 00250)
hsa00250 <- AnnotationDbi::select(hgu133a.db, keys = "00250", keytype = "PATH", 
                      columns = c("PROBEID", "ENTREZID", "SYMBOL", "GENENAME"))

# Extract and scale the expression data
E <- exprs(eset[hsa00250$PROBEID, ])
E <- t(scale(t(E), center = TRUE, scale = FALSE))

# Visualize the expression of genes related to alanine, aspartate and glutamate 
# metabolism in a heatmap
aheatmap(E, dist = "pearson", col = "-RdBu:256", breaks = 0, labCol = pData(eset)$condition)


## ----fisher-exact,warning=FALSE,message=FALSE----------------------------
# Define DE genes based on thresholds
diffExpr <- decideTests(fit.eb, p.value = 0.01, adjust.method = "none")
# Number of DE genes per coefficient
summary(diffExpr)
# Select the comparison of interest: preeclampsia vs.preterm
diffExpr <- diffExpr[,2]
diffExpr <- diffExpr != 0
# Indicate members of the alanine, aspartate and glutamate metabolism pathway
inPathway <- featureNames(eset) %in% hsa00250$PROBEID
# Construct contingency table
tab <- table(diffExpr, inPathway)
tab
# Perform an enrichment test using Fisher's exact test
fisher.test(tab, alternative = "greater")


## ----split,warning=FALSE,message=FALSE-----------------------------------
# Fix the random seed to make the exercise reproducible
set.seed(13)
# training set: 75%, test set: 25%
inTrain <- createDataPartition(y = pData(eset)$condition,p = .75,list = FALSE)
training <- eset[ ,inTrain]
testing <- eset[,-inTrain]
ncol(training)
ncol(testing)


## ----pamr,warning=FALSE,message=FALSE------------------------------------
pamrFit <- train(x=t(exprs(training)), y=pData(training)$condition, method="pam",
                 tuneLength=10,trControl = trainControl(method = "cv"))
pamrFit


## ----confusion-matrix,warning=FALSE,message=FALSE------------------------
pamrClasses <- predict(pamrFit, newdata = t(exprs(testing)))
confusionMatrix(data = pamrClasses, pData(testing)$condition)


## ----feature-selection,warning=FALSE,message=FALSE-----------------------
varimps <- varImp(pamrFit)
getSYMBOL(head(featureNames(training)[order(varimps$importance[,1],decreasing=TRUE)]),"hgu133a")