## ## This script contains the 'answers' ## # # Course: AMC Graduate School, Bioinformatics # Tutorial: Genome Browsers # Tutor: Antoine van Kampen # # Set your working directory # - RStudio -> Session -> Set Working Directory -> To Source File Location # - or via a command: setwd("C:\\Users\\PietjePuk\\TCGA") (replace the directory below with your location) getwd() ## ## Load the necessary TCGA breast cancer (BRCA) data (provided as a zip file at the course website) ## temp = tempfile() download.file("http://bioinformatics.amc.nl/wp-content/uploads/gs-bioinformatics/GenomeBrowserTutorial/TCGA-data/RawData.zip", temp, mode="wb") unzip(temp) # # As a first step load de-identified clinical information of the TCGA BRCA patients: # Clinical=read.delim(file="./RawData/nationwidechildrens.org_clinical_patient_brca.txt") # # Open the file 'nationwidechildren.org_clinical_patient_bcra.txt' in Excel. # Inspect the file # ERpos = TCGA-AR-A1AR # ERneg = TCGA-BH-A1EO # # In addition to the two patients you selected, we will also retrieve the data of 25 arbitrary samples # from ER positive and ER negative patients. This will allow us to get a first impression about the # difference in gene expression of the CLDN1 gene for these two groups of patients. # # Step 1: Get the barcode and ER status ER=Clinical[,c("bcr_patient_barcode","er_status_by_ihc")] ERpos=ER[which(ER[,2]=="Positive"),] ERneg=ER[which(ER[,2]=="Negative"),] BarcodePos=as.vector(ERpos[1:25,1]) # only use the first 25 samples BarcodeNeg=as.vector(ERneg[1:25,1]) # only use the first 25 samples # Step 2: Load the gene expression as normalized counts (RPKM) # A. First for the 2 pre-selected patients, TCGA-AR-A1AR and TCGA-BH-A1EO GeneExpData = read.delim("./RawData/BRCA__illuminahiseq_rnaseq__GeneExp.txt", sep="\t") # B. Next for the 25 ER pos en 25 ER neg patients GeneExpDataPos = read.delim("./RawData/AllPos/BRCA__illuminahiseq_rnaseq__GeneExp.txt", sep="\t") GeneExpDataNeg = read.delim("./RawData/AllNeg/BRCA__illuminahiseq_rnaseq__GeneExp.txt", sep="\t") # Since we are only interested in the expression levels of the CLDN1 gene, we extract this data and plot # the distributions index=which(GeneExpDataPos[,"GeneSymbol"]=="CLDN1") CLDN1pos=unlist(GeneExpDataPos[index,-c(1:2)]) index=which(GeneExpDataNeg[,"GeneSymbol"]=="CLDN1") CLDN1neg=unlist(GeneExpDataNeg[index,-c(1:2)]) # Have a look at the expression levels of CLDN1 in the 25 ER pos and 25 ER neg samples boxplot(list(CLDN1pos,CLDN1neg),col='blue',ylab="RPKM",main="ER pos / ER neg") # Alternatively you can make a density plot (sort of histogram) plot(density(CLDN1pos,from=0),xlim=c(0,250),col='blue',xlab='RPKM',ylab='Density',main='Distribution Gene Expression (ER pos, ER neg)') points(density(CLDN1neg,from=0),col='red',type='l',xlab='RPKM',ylab='Density',main='Distribution Gene Expression (ER neg)') legend("topright",c("red=ER neg","blue=ER pos")) #### do not close this plot!! ###### # Now add the points for the selected patients. # Note that the data for the selected ER pos and ER neg samples are in a single file. # Inspect this file and retrieve the RPKM for one ER pos and one ER neg sample neg = #ADD THE RPKM Expression level for the patient ER neg patient pos = #ADD THE RPKM Expression level for the patient ER pos patient # Add these expression values in the density plot abline(v=pos,col="blue",lty=2) abline(v=neg,col="red",lty=2)