###First, load the libraries. ChIPpeakAnno has to be loaded first to avoid conflicts (otherwise bad things happen). library(ChIPpeakAnno) library(BSgenome.Hsapiens.UCSC.hg19) library(ShortRead) library(rGADEM) library(rtracklayer) ##Some methods in Bioconductor package IRanges depend on the concept of run-length enconding. It's an efficient way to represent sequences that contain long runs of same value: x=c(rep(0,10),rep(1,8),rep(0,10)) x xRle=Rle(x) xRle as.vector(xRle) ##conceptually, you can treat Rle object exactly the same as underlying vectors x[1] xRle[1] x[12] xRle[12] y=c(rep(2,8),rep(0,10),rep(1,10)) yRle=Rle(y) x y x+y xRle+yRle xRle>0 ##basic accessor methods get you valuable information about Rle objects: xRle slotNames(xRle) xRle@values xRle@lengths runValue(xRle) runLength(xRle) ##abstract selectors exist to obtain 'subsequences': window(xRle, 3,9) seqselect(xRle, start=c(1,6),end=c(10,12)) seqselect(xRle, start=c(3,13),width=2) yRle rev(yRle) ##Rle objects are usually stored in RleList containers myRleList=RleList(xRle, yRle) ##can obtain information about RleLists: length(myRleList) elementLengths(myRleList) ##some operations that are NOT vectorized on usual lists are vectorized on RleLists myRleList2=RleList(yRle, rev(yRle)) myRleList myRleList2 myRleList+myRleList2 myRleList>0 ##can do views on Rle object similar to views on sequences myViews=Views(yRle, start=c(1,9), end=c(3,25)) myViews myViews[1] myViews[[1]] subject(myViews) ##can get original object back ##this can come in handy when say Rle object is coverage along chromosome and you want to find areas with coverage > some number. yRle yRle[yRle>=1] ##this doesn't do what you want slice(yRle,1) ##but this does Views(yRle, yRle>=1) ##and so does this slice(myRleList2,1) ##vectorizes to RleLists ##objects of class IRanges store interval representations of subsequences, a.k.a. ranges. There's similarity to views from before (like Views on sequences or Views on Rle's), except that there's no 'subject' being viewed ir=IRanges(c(1,8,14,15,19,34,40), width=c(12,6,6,15,6,2,7)) ir class(ir) slotNames(ir) start(ir) end(ir) width(ir) ##can give ranges names: ir=IRanges(c(1,8,14,15,19,34,40), width=c(12,6,6,15,6,2,7), name=paste('region',1:7,sep='')) ##can do subsetting ir[1:4] ir[start(ir)<=15] ##can reverse order ir rev(ir) ##think of IRanges as a list, with each element a sequence of integeres start:end ir[1] ir[[1]] ##a very useful thing sometimes is to reduce set of ranges to a disjoint collection ir reduce(ir) ##IRangesList container can be used to hold several instances of IRanges (similar to a list) rl=RangesList(ir,reduce(ir)) rl rl[1] rl[[1]] rl[[1]][[1]] ##can get information on IRanges in the list: start(rl) width(rl) ##a very useful feature: can use IRanges to subset Rle objects: yRle ir2=IRanges(start=c(1,5),width=c(15,3)) ir2 yRle[ir2] seqselect(yRle,ir2) Views(yRle,ir2) ##an even more useful feature is ability to find overlaps between sets of ranges: ir reduce(ir) ol=findOverlaps(ir,reduce(ir)) class(ol) ol ol@matchMatrix as.matrix(ol) as.matrix(findOverlaps(reduce(ir),ir)) ##yet another very useful function allows you to compute the coverage of sequence 1:... by ranges ir coverage(ir) coverage(ir, width=100) ##lots of functions to modify ranges: see documentation. Below are a couple examples: ir shift(ir,10) resize(ir, width=5, fix='center') ##often want to have some information associated with ranges, like annotation. RangedData objects allow you to do exactly that. myexon=sample(c(0,1),7,replace=T) myspace=sample(c('chr1','chr2'),7,replace=T) myGC=runif(7) myexon myspace myGC rd=RangedData(ranges=ir, space=myspace, GC=myGC, exon=myexon) ##note that things may have been re-arranged to sort by space class(rd) rd space(rd) start(rd) end(rd) names(rd) row.names(rd) length(rd) ##gives you number of spaces RangedData object has nrow(rd) ##can access underlying objects, note that the results are split by space ranges(rd) values(rd) values(rd)$chr1 ranges(rd)$chr1 ##subsetting RangedData object is somewhat weird. Firstly, it can be thought of as a 'list' with element corresponding to spaces: rd['chr1'] ##at the same time, you can think of it as a matrix-like structure and subset by rows and columns, where columns refer to annotation. rd[1:4,] rd[1:4,1] rd[1:4,2] rd[,'GC'] rd$GC ##here's where the weirdness really strikes you rd[1] rd[[1]] ##lapply is defined on RangedData objects and loops over spaces rd lapply(rd, function(sp) {sum(sp$exon)}) ##function coverage() is defined on RangedData and computes coverage of each space by corresponding ranges rd coverage(rd) ##this returns RleList with space-specific coverages. Remember to supply endpoints when needed! ##if you really want, can convert RangedData objects to data frames as.data.frame(rd) ##also can do overlaps rd2=RangedData(reduce(ir), space='chr1') rd rd2 x=findOverlaps(rd,rd2) class(x) x x$chr1 x$chr2 ##Alternative representation for genomic data is GRanges. We're not going to talk about them, but you should check them out. ##visulaizing data in UCSC genome browser ##package rtracklayer provides interface to genome browsers and allows both visualization of data and download of annotation of interest. ##it builds on RangedData class, with the caveat that one should also specify the genome of interest. myRanges=IRanges(start=seq(1000,50000,by=1000), width=300) targetTrack=GenomicData(myRanges, chrom=paste('chr',sample(1:5,50,replace=T),sep=''), strand=sample(c('+','-'), 50, replace=T), genome='hg19') class(targetTrack) ##can save and import tracks in a variety of formats (see documentation) session=browserSession('UCSC') ##open up session track(session, 'test_track')=targetTrack ##sets up track with the name 'test_track' view=browserView(session, pack='test_track') ##this doesn't do much good ##can adjust the the viewer view=browserView(session, IRangesList(chr1=IRanges(start=1000,end=5000)), pack='test_track') range(view)=IRangesList(chr1=IRanges(start=10000,end=50000)) ##can change which tracks are shown with trackNames(view) ucscTrackModes(view) #to find out which tracks are available and what are their default representations: trackNames(session) ucscTrackModes(session) ##you can also use browser sessions to download data session=browserSession('UCSC') ucscGenomes() genome(session)='hg19' trackNames(session) query=ucscTableQuery(session,'refGene') x=getTable(query) query=ucscTableQuery(session, 'snp130', IRangesList(chr12=IRanges(start=57795963,end=57815592))) x=getTable(query) class(x) dim(x) head(x) ##plenty of other customizing options. ##couple things about human Genome package library(BSgenome.Hsapiens.UCSC.hg19) seqnames(Hsapiens) mseqnames(Hsapiens) x=Hsapiens$upstream1000 head(names(x)) chr1_upstream=x[grep('_chr1_',names(x))] class(x) head(x) length(x) chr1=Hsapiens$chr1 class(chr1) slotNames(chr1) chr1[10000:10010] class(chr1@unmasked) chr1@unmasked[10000:10010] getSeq(Hsapiens, 'chr1',10000,10010) chr1=Hsapiens$chr1@unmasked