##
## This script contains the 'answers'
##

#
# Course: AMC Graduate School, Bioinformatics 
# Tutorial: Genome Browsers 
# Tutor: Antoine van Kampen
# 

# Set your working directory 
# - RStudio -> Session -> Set Working Directory -> To Source File Location
# - or via a command: setwd("C:\\Users\\PietjePuk\\TCGA") (replace the directory below with your location)
getwd()

##
## Load the necessary TCGA breast cancer (BRCA) data (provided as a zip file at the course website)
##
temp = tempfile()
download.file("http://bioinformatics.amc.nl/wp-content/uploads/gs-bioinformatics/GenomeBrowserTutorial/TCGA-data/RawData.zip", temp, mode="wb")
unzip(temp)

#
# As a first step load de-identified clinical information of the TCGA BRCA patients:
#
Clinical=read.delim(file="./RawData/nationwidechildrens.org_clinical_patient_brca.txt")

# 
# Open the file 'nationwidechildren.org_clinical_patient_bcra.txt' in Excel. 
# Inspect the file
# ERpos = TCGA-AR-A1AR
# ERneg = TCGA-BH-A1EO

#
# In addition to the two patients you selected, we will also retrieve the data of 25 arbitrary samples
# from ER positive and ER negative patients. This will allow us to get a first impression about the 
# difference in gene expression of the CLDN1 gene for these two groups of patients.
#

# Step 1: Get the barcode and ER status
ER=Clinical[,c("bcr_patient_barcode","er_status_by_ihc")]
ERpos=ER[which(ER[,2]=="Positive"),]
ERneg=ER[which(ER[,2]=="Negative"),]
BarcodePos=as.vector(ERpos[1:25,1])  # only use the first 25 samples
BarcodeNeg=as.vector(ERneg[1:25,1])  # only use the first 25 samples

# Step 2: Load the gene expression as normalized counts (RPKM)
# A. First for the 2 pre-selected patients, TCGA-AR-A1AR and TCGA-BH-A1EO
GeneExpData = read.delim("./RawData/BRCA__illuminahiseq_rnaseq__GeneExp.txt", sep="\t")

# B. Next for the 25 ER pos en 25 ER neg patients
GeneExpDataPos = read.delim("./RawData/AllPos/BRCA__illuminahiseq_rnaseq__GeneExp.txt", sep="\t")

GeneExpDataNeg = read.delim("./RawData/AllNeg/BRCA__illuminahiseq_rnaseq__GeneExp.txt", sep="\t")

# Since we are only interested in the expression levels of the CLDN1 gene, we extract this data and plot
# the distributions
index=which(GeneExpDataPos[,"GeneSymbol"]=="CLDN1")
CLDN1pos=unlist(GeneExpDataPos[index,-c(1:2)])

index=which(GeneExpDataNeg[,"GeneSymbol"]=="CLDN1")
CLDN1neg=unlist(GeneExpDataNeg[index,-c(1:2)])

# Have a look at the expression levels of CLDN1 in the 25 ER pos and 25 ER neg samples
boxplot(list(CLDN1pos,CLDN1neg),col='blue',ylab="RPKM",main="ER pos / ER neg")

# Alternatively you can make a density plot (sort of histogram)
plot(density(CLDN1pos,from=0),xlim=c(0,250),col='blue',xlab='RPKM',ylab='Density',main='Distribution Gene Expression (ER pos, ER neg)')
points(density(CLDN1neg,from=0),col='red',type='l',xlab='RPKM',ylab='Density',main='Distribution Gene Expression (ER neg)')
legend("topright",c("red=ER neg","blue=ER pos"))
#### do not close this plot!! ######

# Now add the points for the selected patients.
# Note that the data for the selected ER pos and ER neg samples are in a single file.
# Inspect this file and retrieve the RPKM for one ER pos and one ER neg sample

neg =     #ADD THE RPKM Expression level for the patient ER neg patient
pos =     #ADD THE RPKM Expression level for the patient ER pos patient

# Add these expression values in the density plot
abline(v=pos,col="blue",lty=2)
abline(v=neg,col="red",lty=2)