"CSC Computational Biology Week"
http://mrccsc.github.io/r_course/Bioconductor/BioC_Tutorial_Rpress.html
Bioconductor (BioC) is an open source, open development software project to provide tools for the analysis and comprehension of high-throughput genomics data
Installing & Using a package:
source("http://bioconductor.org/biocLite.R")
biocLite("GenomicRanges")
library("GenomicRanges")
library("limma")
?lmFit
Prints version information about R and all loaded packages
sessionInfo()
# Orginial vector
# chr1, chr2, chr2, chr2, chr1, chr1, chr3, chr3
library(GenomicRanges)
chr <- Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 2))
chr
character-Rle of length 8 with 4 runs
Lengths: 1 3 2 2
Values : "chr1" "chr2" "chr1" "chr3"
The above Rle can be interpreted as a run of length 1 of chr1, followed by run length of 3 of chr2, followed by run length of 2 of chr1 and followed by run length of 2 of chr3
GRanges class represents a collection of genomic features with single start and end location on the genome. GRanges object can be cretated using GRanges function.
library("GenomicRanges")
gr1 <- GRanges(seqnames = Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)),
ranges = IRanges(start=11:20, end = 50:59, names = head(letters,10)),
strand = Rle(c("-", "+", "-", "+", "-"), c(1,2, 2, 3, 2)),
score = 1:10, GC = runif(10,0,1))
gr1
GRanges object with 10 ranges and 2 metadata columns:
seqnames ranges strand | score GC
<Rle> <IRanges> <Rle> | <integer> <numeric>
a chr1 [11, 50] - | 1 0.580721070291474
b chr2 [12, 51] + | 2 0.261760980123654
c chr2 [13, 52] + | 3 0.262375191319734
d chr2 [14, 53] - | 4 0.0223529902286828
e chr1 [15, 54] - | 5 0.190516075119376
f chr1 [16, 55] + | 6 0.84672028128989
g chr3 [17, 56] + | 7 0.536360397469252
h chr3 [18, 57] + | 8 0.907330405432731
i chr3 [19, 58] - | 9 0.125121991150081
j chr3 [20, 59] - | 10 0.528982581803575
-------
seqinfo: 3 sequences from an unspecified genome; no seqlengths
mcols(gr1)
DataFrame with 10 rows and 2 columns
score GC
<integer> <numeric>
1 1 0.58072107
2 2 0.26176098
3 3 0.26237519
4 4 0.02235299
5 5 0.19051608
6 6 0.84672028
7 7 0.53636040
8 8 0.90733041
9 9 0.12512199
10 10 0.52898258
mm9genes <- read.table("mm9Genes.txt",sep="\t",header=T)
head(mm9genes)
chr start end strand ens Symbol
1 chr9 105729415 105731415 - ENSMUSG00000043719 Col6a6
2 chr18 43636450 43638450 - ENSMUSG00000043424 Gm9781
3 chr7 70356455 70358455 - ENSMUSG00000030525 Chrna7
4 chrX 100818093 100820093 - ENSMUSG00000086370 B230206F22Rik
5 chr12 82881157 82883157 - ENSMUSG00000042724 Map3k9
6 chr7 152124774 152126774 - ENSMUSG00000070348 Ccnd1
mm9genes.GR <- GRanges(seqnames=mm9genes$chr,
ranges=IRanges(start=mm9genes$start,end=mm9genes$end),
strand=mm9genes$strand,
ENSID=mm9genes$ens,
Symbol=mm9genes$Symbol)
Another option: makeGRangesFromDataFrame()
Converting GRanges object to data frame: as.data.frame(GRanges)
head(mm9genes.GR)
GRanges object with 6 ranges and 2 metadata columns:
seqnames ranges strand | ENSID
<Rle> <IRanges> <Rle> | <factor>
[1] chr9 [105729415, 105731415] - | ENSMUSG00000043719
[2] chr18 [ 43636450, 43638450] - | ENSMUSG00000043424
[3] chr7 [ 70356455, 70358455] - | ENSMUSG00000030525
[4] chrX [100818093, 100820093] - | ENSMUSG00000086370
[5] chr12 [ 82881157, 82883157] - | ENSMUSG00000042724
[6] chr7 [152124774, 152126774] - | ENSMUSG00000070348
Symbol
<factor>
[1] Col6a6
[2] Gm9781
[3] Chrna7
[4] B230206F22Rik
[5] Map3k9
[6] Ccnd1
-------
seqinfo: 21 sequences from an unspecified genome; no seqlengths
gr2 <- GRanges(seqnames = Rle(c("chr1", "chr3","chr2", "chr1", "chr3"), c(1, 2,1, 2, 4)),
ranges = IRanges(start=55:64, end = 94:103, names = letters[11:20]),
strand = Rle(c("+", "-", "+", "-"), c(1, 4, 3, 2)),
score = 1:10, GC = runif(10,0,1))
GRL <- GRangesList("Peak1" = gr1, "Peak2" = gr2)
Accessors: seqnames, start, end, ranges, strand, width, names, mcols, length
Extraction: GR[i], GRL[[i]], head, tail
Set operations: reduce, disjoin
Overlaps: findOverlaps, subsetByOverlaps, countOverlaps, nearest, precede, follow
Arithmetic: shift, resize, distance, distanceToNearest
head(ranges(gr1))
IRanges of length 6
start end width names
[1] 11 50 40 a
[2] 12 51 40 b
[3] 13 52 40 c
[4] 14 53 40 d
[5] 15 54 40 e
[6] 16 55 40 f
start(gr1)
[1] 11 12 13 14 15 16 17 18 19 20
width(gr1)
[1] 40 40 40 40 40 40 40 40 40 40
Subsetting GRanges
gr1[seqnames(gr1)=="chr1"]
GRanges object with 3 ranges and 2 metadata columns:
seqnames ranges strand | score GC
<Rle> <IRanges> <Rle> | <integer> <numeric>
a chr1 [11, 50] - | 1 0.580721070291474
e chr1 [15, 54] - | 5 0.190516075119376
f chr1 [16, 55] + | 6 0.84672028128989
-------
seqinfo: 3 sequences from an unspecified genome; no seqlengths
Merge overlapping genomic ranges within the same GRanges object
reduce(gr1)
GRanges object with 6 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr1 [16, 55] +
[2] chr1 [11, 54] -
[3] chr2 [12, 52] +
[4] chr2 [14, 53] -
[5] chr3 [17, 57] +
[6] chr3 [19, 59] -
-------
seqinfo: 3 sequences from an unspecified genome; no seqlengths
gr1_overlaps <- findOverlaps(gr1,gr2,ignore.strand=F)
gr1_overlaps
Hits of length 5
queryLength: 10
subjectLength: 10
queryHits subjectHits
<integer> <integer>
1 6 1
2 9 2
3 9 3
4 10 2
5 10 3
Output of findOverlaps is a 'Hits' object indicating which of the query and subject intervals overlap.
Convert the 'Hits' object to 2 column matrix using as.matrix(). Values in the first column are indices of the query and values in second column are indices of the subject.
gr1_overlaps.m <- as.matrix(gr1_overlaps)
gr1[gr1_overlaps.m[,"queryHits"], ]
GRanges object with 5 ranges and 2 metadata columns:
seqnames ranges strand | score GC
<Rle> <IRanges> <Rle> | <integer> <numeric>
f chr1 [16, 55] + | 6 0.84672028128989
i chr3 [19, 58] - | 9 0.125121991150081
i chr3 [19, 58] - | 9 0.125121991150081
j chr3 [20, 59] - | 10 0.528982581803575
j chr3 [20, 59] - | 10 0.528982581803575
-------
seqinfo: 3 sequences from an unspecified genome; no seqlengths
subsetByOverlaps extracts the query intervals overlap with subject intervals
subsetByOverlaps(gr1,gr2,ignore.strand=F)
GRanges object with 3 ranges and 2 metadata columns:
seqnames ranges strand | score GC
<Rle> <IRanges> <Rle> | <integer> <numeric>
f chr1 [16, 55] + | 6 0.84672028128989
i chr3 [19, 58] - | 9 0.125121991150081
j chr3 [20, 59] - | 10 0.528982581803575
-------
seqinfo: 3 sequences from an unspecified genome; no seqlengths
Other interesting functions: nearest() and distanceToNearest()
distanceToNearest(gr1,gr2,ignore.strand=F)
Hits of length 10
queryLength: 10
subjectLength: 10
queryHits subjectHits distance
<integer> <integer> <integer>
1 1 5 8
2 2 NA NA
3 3 NA NA
4 4 4 4
5 5 5 4
6 6 1 0
7 7 7 4
8 8 7 3
9 9 3 0
10 10 3 0
countOverlaps() tabulates number of subject intervals overlap with each interval in query, ex: counting number of sequencing reads overlap with genes in RNA-Seq
countOverlaps(gr1,gr2,ignore.strand=T) # note the strand!
a b c d e f g h i j
0 0 0 0 0 1 1 2 2 2
coverage calculates how many ranges overlap with individual positions in the genome. coverage function returns the coverage as Rle instance.
coverage(gr1)
RleList of length 3
$chr1
integer-Rle of length 55 with 6 runs
Lengths: 10 4 1 35 4 1
Values : 0 1 2 3 2 1
$chr2
integer-Rle of length 53 with 6 runs
Lengths: 11 1 1 38 1 1
Values : 0 1 2 3 2 1
$chr3
integer-Rle of length 59 with 8 runs
Lengths: 16 1 1 1 37 1 1 1
Values : 0 1 2 3 4 3 2 1
FASTQ
@ERR590398.1 HWI-ST1146:148:C2FVTACXX:8:1101:1172:2059
NAAAATGCATATTCCTAGCATACTTCCCAAACATACTGAATTATAATCTC
+ERR590398.1 HWI-ST1146:148:C2FVTACXX:8:1101:1172:2059
A1BDFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJIJJ
BAM/SAM consist two sections: Header and Alignment
Header section:
Meta data (reference genome, aligner), starts with “@”
Alignment section:
QNAME: ID of the read (“query”)
FLAG: alignment flags
RNAME: ID of the reference (typically: chromosome name)
POS: Position in reference (1-based, left side)
MAPQ: Mapping quality (as Phred score)
CIGAR: Alignment description (mismatch, gaps etc.)
RNEXT: Mate/next read reference sequence name
MPOS: Mate/next read position
TLEN: observed Template Length
SEQ: sequence of the read
QUAL: quality string of the read
Methods for reading BAM/SAM
library("Rsamtools")
BamFile <- system.file("extdata", "ex1.bam", package="Rsamtools")
which <- RangesList(seq1=IRanges(1000, 2000),seq2=IRanges(c(100, 1000), c(1000, 2000)))
what <- c("rname", "strand", "pos", "qwidth", "seq")
param <- ScanBamParam(which=which, what=what)
bamReads <- scanBam(BamFile, param=param)
length(bamReads) # Each element of the list corresponds to a range specified by the which argument in ScanBamParam
[1] 3
names(bamReads[[1]]) # elements specified by the what and tag arguments to ScanBamParam
[1] "rname" "strand" "pos" "qwidth" "seq"
ScanBamParam:
# Constructor
ScanBamParam(flag = scanBamFlag(), what = character(0), which)
# Constructor helpers
scanBamFlag(isPaired = NA, isProperPair = NA, isUnmappedQuery = NA,
hasUnmappedMate = NA, isMinusStrand = NA, isMateMinusStrand = NA,
isFirstMateRead = NA, isSecondMateRead = NA, isNotPrimaryRead = NA,
isNotPassingQualityControls = NA, isDuplicate = NA)
library(GenomicAlignments)
SampleAlign <- readGAlignments(BamFile)
We can also customise which regions to read
region <- RangesList(seq1=IRanges(1000, 2000),seq2=IRanges(1000, 2000))
param1 <- ScanBamParam(what=c("rname", "pos", "cigar","qwidth"),which=region)
SampleAlign1 <- readGAlignments(BamFile,param=param1)
Annotation packages can be broadly classified in to gene-centric and genome-centric.
Gene-centric annotation packages (AnnotationDbi):
Genomic-centric GenomicFeatures packages:
Web-based annotation services:
Note: Package name = Annotation object
We will explore how to retrieve annotation from gene-centric organism level annotation package (org.Hs.eg.db)
Load the package and list the contents
library("org.Hs.eg.db")
columns(org.Hs.eg.db)
[1] "ENTREZID" "PFAM" "IPI" "PROSITE"
[5] "ACCNUM" "ALIAS" "CHR" "CHRLOC"
[9] "CHRLOCEND" "ENZYME" "MAP" "PATH"
[13] "PMID" "REFSEQ" "SYMBOL" "UNIGENE"
[17] "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "GENENAME"
[21] "UNIPROT" "GO" "EVIDENCE" "ONTOLOGY"
[25] "GOALL" "EVIDENCEALL" "ONTOLOGYALL" "OMIM"
[29] "UCSCKG"
To know more about the above identifier types
help(SYMBOL)
Which keytypes can be used to query this database? keytypes (What is the difference between columns and keytypes?)
keytypes(org.Hs.eg.db)
[1] "ENTREZID" "PFAM" "IPI" "PROSITE"
[5] "ACCNUM" "ALIAS" "CHR" "CHRLOC"
[9] "CHRLOCEND" "ENZYME" "MAP" "PATH"
[13] "PMID" "REFSEQ" "SYMBOL" "UNIGENE"
[17] "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "GENENAME"
[21] "UNIPROT" "GO" "EVIDENCE" "ONTOLOGY"
[25] "GOALL" "EVIDENCEALL" "ONTOLOGYALL" "OMIM"
[29] "UCSCKG"
If we want to extract few identifiers of a particular keytype, we can use keys function
head(keys(org.Hs.eg.db, keytype="SYMBOL"))
[1] "A1BG" "A2M" "A2MP1" "NAT1" "NAT2" "NATP"
We can extract other annotations for a particular identifier using select function
select(org.Hs.eg.db, keys = "A1BG", keytype = "SYMBOL", columns = c("SYMBOL", "GENENAME", "CHR") )
SYMBOL GENENAME CHR
1 A1BG alpha-1-B glycoprotein 19
How can we annotate our results (ex: RNA-Seq differential expression analysis results)?
First, we will load an example results:
load(system.file("extdata", "resultTable.Rda", package="AnnotationDbi"))
head(resultTable)
logConc logFC LR.statistic PValue FDR
100418920 -9.639471 -4.679498 378.0732 3.269307e-84 2.613484e-80
100419779 -10.638865 -4.264830 291.1028 2.859424e-65 1.142912e-61
100271867 -11.448981 -4.009603 222.3653 2.757135e-50 7.346846e-47
100287169 -11.026699 -3.486593 206.7771 6.934967e-47 1.385953e-43
100287735 -11.036862 3.064980 204.1235 2.630432e-46 4.205535e-43
100421986 -12.276297 -4.695736 190.5368 2.427556e-43 3.234314e-40
Rownames of the above dataframe are “Entrez gene identifiers” (human). We will extract gene symbol for these Entrez gene identifiers from org.Hs.eg.db package using select fucntion.
SYM <- select(org.Hs.eg.db, keys = rownames(resultTable), keytype = "ENTREZID", columns = "SYMBOL")
NewResult <- merge(resultTable,SYM,by.x=0,by.y=1)
head(NewResult)
Row.names logConc logFC LR.statistic PValue FDR
1 100127888 -10.57050 2.758937 182.8937 1.131473e-41 1.130624e-38
2 100131223 -12.37808 -4.654318 179.2331 7.126423e-41 6.329847e-38
3 100271381 -12.06340 3.511937 188.4824 6.817155e-43 7.785191e-40
4 100271867 -11.44898 -4.009603 222.3653 2.757135e-50 7.346846e-47
5 100287169 -11.02670 -3.486593 206.7771 6.934967e-47 1.385953e-43
6 100287735 -11.03686 3.064980 204.1235 2.630432e-46 4.205535e-43
SYMBOL
1 SLCO4A1-AS1
2 LOC100131223
3 RPS28P8
4 MPVQTL1
5 <NA>
6 TTTY13B
We will explore TxDb package for Mouse mm9 genome from UCSC. We will first install the package and load in to our current working space.
source("http://bioconductor.org/biocLite.R")
biocLite("TxDb.Mmusculus.UCSC.mm9.knownGene")
By default, the annotation object will have same name as package name. Create an alias for convenience.
library("TxDb.Mmusculus.UCSC.mm9.knownGene")
txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene
Since TxDb are inherited from AnnotationDb object, we can use columns, keys, select, and keytypes functions.
keys <- c("100009600", "100009609", "100009614")
select(txdb,keys=keys,columns=c("GENEID","TXNAME"),keytype="GENEID")
GENEID TXNAME
1 100009600 uc009veu.1
2 100009609 uc012fog.1
3 100009614 uc011xhj.1
Most common operations performed on TxDb objects are retrieving exons, transcripts and CDS genomic coordinates. The functions genes, transcripts, exons, and cds return the coordinates for the group as GRanges objects.
TranscriptRanges <- transcripts(txdb)
TranscriptRanges[1:3]
GRanges object with 3 ranges and 2 metadata columns:
seqnames ranges strand | tx_id tx_name
<Rle> <IRanges> <Rle> | <integer> <character>
[1] chr1 [4797974, 4832908] + | 1 uc007afg.1
[2] chr1 [4797974, 4836816] + | 2 uc007afh.1
[3] chr1 [4847775, 4887990] + | 3 uc007afi.2
-------
seqinfo: 35 sequences (1 circular) from mm9 genome
ExonRanges <- exons(txdb)
ExonRanges[1:2]
GRanges object with 2 ranges and 1 metadata column:
seqnames ranges strand | exon_id
<Rle> <IRanges> <Rle> | <integer>
[1] chr1 [4797974, 4798063] + | 1
[2] chr1 [4798536, 4798567] + | 2
-------
seqinfo: 35 sequences (1 circular) from mm9 genome
TxDb package also provides interface to discover how genomic features are related to each other. Ex: Access all transcripts or exons associated to a gene. Such grouping can be achieved by transcriptsBy, exonsBy, and cdsBy functions. The results are returned as GRangesList objects.
Transcripts <- transcriptsBy(txdb, by = "gene")
Transcripts[1:2]
GRangesList object of length 2:
$100009600
GRanges object with 1 range and 2 metadata columns:
seqnames ranges strand | tx_id tx_name
<Rle> <IRanges> <Rle> | <integer> <character>
[1] chr9 [20866837, 20872369] - | 28943 uc009veu.1
$100009609
GRanges object with 1 range and 2 metadata columns:
seqnames ranges strand | tx_id tx_name
[1] chr7 [92088679, 92112519] - | 23717 uc012fog.1
-------
seqinfo: 35 sequences (1 circular) from mm9 genome
Other interesting functions: intronsByTranscript, fiveUTRsByTranscript and threeUTRsByTranscript
GenomicFeatures also provides functions to create TxDb objects directly from UCSC and Biomart databases: makeTranscriptDbFromBiomart and makeTranscriptDbFromUCSC
USCmm9KnownGene <- makeTranscriptDbFromUCSC(genome = "mm9", tablename = "knownGene")
Save the annotation object and label them appropriately to facilitate reproducible research:
saveDb(txdb,file="Mouse_ucsc_mm9_20150204.sqlite")
txdb <- loadDb("Mouse_ucsc_mm9_20150204.sqlite")
biomaRt package offers access to biomart based online annotation resources (marts). Each mart has several datasets. getBM function can be used to retrieve annotation from the biomarts. Use the following functions to find values for the arguments in getBM
listMarts(): list the available biomart resources
useMart(): select the mart
listDatasets(): available dataset in the select biomart resource
useDataset(): select a dataset in the select mart
listAttributes(): available annotation attributes for the selected dataset
listFiltersList(): available filters for the selected dataset
library("biomaRt")
marts <- listMarts() # List available marts
marts[1,]
biomart version
1 ensembl ENSEMBL GENES 78 (SANGER UK)
ensembl <- useMart("ensembl") # select ensembl
ens_datasets <- listDatasets(ensembl) # list datasets
ens_human <- useDataset("hsapiens_gene_ensembl",mart=ensembl) # select human dataset
ens_human_Attr <- listAttributes(ens_human) # list available annotation
ens_human_filters <- listFilters(ens_human) # list availabel filters
Extract genomic coordinates, ensembl gene identifier and gene symbol for genes in chromsome X
chrXGenes <- getBM(attributes = c("chromosome_name","start_position","end_position","ensembl_gene_id","strand","external_gene_name"), filter="chromosome_name",values="X", mart=ens_human)