Perform sequence alignment and render the results as a ggplot
library(ggplot2)
library(ggsequence)
We need to make sure the alignment that we want to plot comes in the right format, which
is as a MsaAAMultipleAlignment
, with the original sequences as an attribute on the
object
do_alignment = function(sequences) {
# Make sure sequences have a name
if (is.null(names(sequences))) {
names(sequences) = as.character(1:length(sequences))
}
# Convert each of the sequence strings to a Biostrings::Views
seqs = lapply( 1:length(sequences), function(seq_id) {
views = Biostrings::Views(sequences[seq_id],start=1,end=nchar(sequences[seq_id]))
names(views) = names(sequences)[seq_id]
views
})
names(seqs) = names(sequences)
# Perform the MSA on the sequences
aln = msa::msaClustalOmega( Reduce(c,Map(function(view) { as(view,'AAStringSet')}, seqs)) )
# Assign the original Views object to the sequences attribute
attributes(aln)$sequences = seqs
aln
}
Now, we can perform an alignment between two sequences, and plot them
p = ggplot(do_alignment(c(foo='MNTTTMMMNPPPPMNTTTMMMNPPPPMNTTTMMMNPPPPMNTTTMMMNPPPP',bar='NNSMMMPPNNSMMMPPNNSMMMPPNNSMMMPP')))
p
Which won't really give us anything useful apart from labels for the sequences and amino acid position. We can add in the sequence.
p + geom_text(aes(label=after_stat(aa)))
Or, we could add in the sequence, but colour the sequence based upon the conservation of the amino acid using the conservation
stat.
p + geom_text(aes(label=after_stat(aa),color=after_stat(conservation),),stat="conservation")
Or, if we want a quick ClustalW style rendering of sequence
p + geom_text(aes(label=after_stat(aa))) + coord_conservation()
Which we could actually put on top of the sequence
p + geom_barcode()
Lets say we are only interested in the sequence that is aligned with another sequence - we can use the gappedSequence
stat to draw segments
p + geom_text(aes(label=after_stat(aa))) + geom_segment(aes(x=after_stat(seqstart),xend=after_stat(seqend)),stat="gappedSequence",linewidth=2,colour="black",position = position_nudge(y=-0.1))
This is fine, but what if we want to map some data onto our aligned sequences? You can map on a data frame to the alignment using the alignedSite
stat.
You tell the stat where the data is (annotations
parameter), and which column the sequence ids are in (id.column
parameter), and which columns you want available as amino acid positions that have been rescaled
to match the alignment - in this case start
and end
(columns
parameter).
signalpeptide_data=data.frame(seq.ids=c('bar','foo'),start=c(1,1),end=c(3,4))
p + geom_text(aes(label=after_stat(aa))) + geom_segment(aes(x=after_stat(start),xend=after_stat(end)),stat="alignedSite",colour="red",size=4,alpha=0.5,annotations=signalpeptide_data,id.column='seq.ids',columns=c('start','end'))
Or, you can use some fancy brackets to indicate a region
p + geom_text(aes(label=after_stat(aa))) + geom_bracket(aes(x=after_stat(start),xend=after_stat(end)),offset=1,size=0.5,stat="alignedSite",annotations=signalpeptide_data,id.column='seq.ids',columns=c('start','end'))
If you have a data frame with specific annotations to add on, you can use the alignedSite
stat to map that on to the sequences too
site_data = data.frame(seq.ids=c('bar','foo'),site=c(4,5))
site_data$site_label = as.character(site_data$site)
p + geom_text(aes(label=after_stat(aa))) + geom_point(aes(x=after_stat(site)),stat="alignedSite",annotations=site_data,id.column='seq.ids',columns=c('site'),position=position_nudge(y=0.1),color='red')
Extra columns get automatically made available within the geom, so if you have a column with labels for each row, you can easily access them.
p + geom_text(aes(label=after_stat(aa))) + geom_label(aes(x=after_stat(site),label=..site_label..),stat="alignedSite",annotations=site_data,id.column='seq.ids',columns=c('site'),position=position_nudge(y=0.1),color='red')
Or, if you are being really fancy
p + geom_text(aes(label=after_stat(aa))) + ggsugar::geom_sugar(aes(x=after_stat(site)),stat="alignedSite",annotations=site_data,id.column='seq.ids',columns=c('site'),position=position_nudge(y=0.1),size=10,sugar='gal(b1-3)galnac')
Or, if you are being EVEN fancier
site_data = data.frame(seq.ids=c('bar','foo'),site=c(4,5),sugarseq=c('man','gal(b1-3)galnac'))
p + geom_text(aes(label=after_stat(aa))) + ggsugar::geom_sugar(aes(x=after_stat(site),sugar=after_stat(sugarseq)),stat="alignedSite",annotations=site_data,id.column='seq.ids',columns=c('site'),position=position_nudge(y=0.1),size=5)