## ----install-packages,warning=FALSE,message=FALSE,eval=FALSE------------- ## # Installing packages should now be a piece of cake ... ## # The commented line is needed for the L0 desktops, but can be skipped on other systems ## #.libPaths("C:/Scratch") ## install.packages("BiocManager") ## BiocManager::install(c("Biobase","annotate","arrayQualityMetrics","caret","e1071","genefilter", ## "hgu133a.db","lattice","limma","NMF","pamr","RColorBrewer")) ## ----load-packages,warning=FALSE,message=FALSE--------------------------- library(annotate) library(arrayQualityMetrics) library(Biobase) library(caret) library(genefilter) library(hgu133a.db) library(lattice) library(limma) library(NMF) ## ----eset,warning=FALSE,message=FALSE------------------------------------ download.file("http://bioinformatics.amc.nl/wp-content/uploads/gs-bioinformatics/PathwaysNetworks/eset.zip", destfile="eset.zip") unzip("eset.zip") load("eset.RData") # Sample annotation: pData(eset) ## ----design,warning=FALSE,message=FALSE---------------------------------- design <- model.matrix(~0+pData(eset)$condition) colnames(design) <- c("preeclampsia","preterm") design ## ----lmfit,warning=FALSE, message=FALSE---------------------------------- fit <- lmFit(eset,design) ## ----fit,warning=FALSE,message=FALSE------------------------------------- # Only show this for the first probeset. First the coefficient in the linear # model for the preeclampsia samples fit$coef[1,1] # Then calculate the mean 'by hand' mean(exprs(eset)[1,as.character(pData(eset)$condition)=="preeclampsia"]) # Idem for the preterm samples fit$coef[1,2] mean(exprs(eset)[1,as.character(pData(eset)$condition)=="preterm"]) ## ----contrast,warning=FALSE,message=FALSE-------------------------------- # Specify the comparison of interest cont.matrix <- makeContrasts(PreeclampsiaVsPreterm = preeclampsia - preterm, levels = design) cont.matrix fit2 <- contrasts.fit(fit, cont.matrix) # Indeed this gives the difference of the mean log-intensities (only show this for # the first probeset) fit2$coef[1,1] fit$coef[1,1] - fit$coef[1,2] ## ----toptable,warning=FALSE, message=FALSE------------------------------- fit2.eb <- eBayes(fit2) # Extract a table from a linear model fit ordered by nominal p-value tt.limma <- topTable(fit2.eb,n=Inf) # Number of differentially expressed probesets with an FDR < 0.05 sum(tt.limma$adj<0.05) # Select the ten most differentially expressed genes tt.limma[1:10,] ## ----exprs,solution=TRUE,box.title="R code",warning=FALSE,message=FALSE---- # Our expression values for 31637_s_at exprs(eset)["31637_s_at",] ## ----rowttests,solution=TRUE,box.title="R code",warning=FALSE,message=FALSE---- # Results for a classical t-test tt <- rowttests(exprs(eset),as.factor(pData(eset)$condition)) # Use 'match' to have the probesets in the same order plot(tt.limma$P.Value,tt$p.value[match(rownames(tt.limma),rownames(tt))],xlim=c(0,0.1), ylim=c(0,0.1),cex=0.4) abline(0,1,col="red") ## ----scan-dates,warning=FALSE, message=FALSE----------------------------- # Switch order from (preeclampsia,preterm) to (preterm,preeclampsia) pData(eset)$condition <- relevel(pData(eset)$condition, "preterm") # Include scan date as a blocking variable design <- model.matrix(~pData(eset)$condition + pData(eset)$scandate) # This is a more complicated design than the previous one. I will explain this some time # this afternoon head(design,n=4) tail(design,n=4) fit <- lmFit(eset,design) ## ----diff-exp,warning=FALSE, message=FALSE------------------------------- fit.eb <- eBayes(fit) # The second coefficient now corresponds to the comparison of interest tt.pvsp <- topTable(fit.eb,coef=2,n=Inf) # Number of differentially expressed probesets with an FDR < 0.05 sum(tt.pvsp$adj<0.05) # Select the ten most differentially expressed genes tt.pvsp[1:10,] ## ----stripplot,fig.width=12---------------------------------------------- # Create a panel plot stripplot(exprs(eset)["203140_at",] ~ pData(eset)$condition | factor(pData(eset)$scandate), layout = c(5, 1)) ## ----batch-effect,warning=FALSE,message=FALSE---------------------------- # Coefficients 3-6 correspond to the pairwise comparisons of batch '02/11/05' with the # other batches tt.batch <- topTable(fit.eb,coef=3:6,n=Inf) sum(tt.batch$adj<0.05) # Select the ten most differentially expressed genes tt.batch[1:10,] ## ----stripplot-2,fig.width=12-------------------------------------------- # Create a panel plot stripplot(exprs(eset)["213350_at",] ~ pData(eset)$condition | factor(pData(eset)$scandate), layout = c(5, 1)) ## ----remove-batch,warning=FALSE,message=FALSE---------------------------- eset.corrected <- eset exprs(eset.corrected) <- removeBatchEffect(exprs(eset), batch=factor(pData(eset)$scandate), design=model.matrix(~pData(eset)$condition)) ## ----aqm,warning=FALSE, message=FALSE------------------------------------ if (!dir.exists("aQ_corrected")){ arrayQualityMetrics(eset.corrected,outdir="aQ_corrected", intgroup=c("condition","scandate"),force=TRUE) } ## ----annotation,warning=FALSE,message=FALSE------------------------------ probesets <- as.character(rownames(tt.pvsp)) # Retrieve Entrez Gene IDs for the U133A chip eg <- getEG(probesets, "hgu133a") # Retrieve gene symbols for the U133A chip sym <- getSYMBOL(probesets, "hgu133a") tt.pvsp$ID <- as.character(rownames(tt.pvsp)) # Generate an HTML table with annotation htmlpage(as.data.frame(eg), filename = "report.html", title = "HTML report", othernames = data.frame(sym, tt.pvsp), table.head = c("GeneID","Symbol", colnames(tt.pvsp)), table.center = TRUE) ## ----write-table,warning=FALSE,message=FALSE----------------------------- write.table(data.frame(GeneID=eg,Symbol=sym,tt.pvsp),file="tt_GSE14722.txt",sep="\t", row.names=FALSE) ## ----aheatmap,warning=FALSE,message=FALSE-------------------------------- # Select the related genes (KEGG ID: 00250) hsa00250 <- AnnotationDbi::select(hgu133a.db, keys = "00250", keytype = "PATH", columns = c("PROBEID", "ENTREZID", "SYMBOL", "GENENAME")) # Extract and scale the expression data E <- exprs(eset[hsa00250$PROBEID, ]) E <- t(scale(t(E), center = TRUE, scale = FALSE)) # Visualize the expression of genes related to alanine, aspartate and glutamate # metabolism in a heatmap aheatmap(E, dist = "pearson", col = "-RdBu:256", breaks = 0, labCol = pData(eset)$condition) ## ----fisher-exact,warning=FALSE,message=FALSE---------------------------- # Define DE genes based on thresholds diffExpr <- decideTests(fit.eb, p.value = 0.01, adjust.method = "none") # Number of DE genes per coefficient summary(diffExpr) # Select the comparison of interest: preeclampsia vs.preterm diffExpr <- diffExpr[,2] diffExpr <- diffExpr != 0 # Indicate members of the alanine, aspartate and glutamate metabolism pathway inPathway <- featureNames(eset) %in% hsa00250$PROBEID # Construct contingency table tab <- table(diffExpr, inPathway) tab # Perform an enrichment test using Fisher's exact test fisher.test(tab, alternative = "greater") ## ----split,warning=FALSE,message=FALSE----------------------------------- # Fix the random seed to make the exercise reproducible set.seed(13) # training set: 75%, test set: 25% inTrain <- createDataPartition(y = pData(eset)$condition,p = .75,list = FALSE) training <- eset[ ,inTrain] testing <- eset[,-inTrain] ncol(training) ncol(testing) ## ----pamr,warning=FALSE,message=FALSE------------------------------------ pamrFit <- train(x=t(exprs(training)), y=pData(training)$condition, method="pam", tuneLength=10,trControl = trainControl(method = "cv")) pamrFit ## ----confusion-matrix,warning=FALSE,message=FALSE------------------------ pamrClasses <- predict(pamrFit, newdata = t(exprs(testing))) confusionMatrix(data = pamrClasses, pData(testing)$condition) ## ----feature-selection,warning=FALSE,message=FALSE----------------------- varimps <- varImp(pamrFit) getSYMBOL(head(featureNames(training)[order(varimps$importance[,1],decreasing=TRUE)]),"hgu133a")