knitr::opts_chunk$set(dpi=200, out.width='2000px')
options(width = 2000)

plotPE = function(bed, add=FALSE, percent=FALSE, col=c('#ff000055', '#0000ff55')) {
   d = data.frame(pos=seq(min(c(bed$start, bed$end)), max(c(bed$start, bed$end))))
   total = ifelse(percent, sum(bed$count)/100, 1)
   d$starts = sapply(d$pos, function(p) sum(bed$count[bed$start == p]))/total
   d$ends = sapply(d$pos, function(p) -sum(bed$count[bed$end == p]))/total
   plotBounds(d$pos, d$starts, d$ends, col, add)
}

plotBounds = function(pos, starts, ends, width=1, col=c('#ff000055', '#0000ff55'), density=NA, border=NA, add=FALSE, percent=F) {
   if(length(pos) != length(starts) | length(pos) != length(ends)) stop('pos, starts and ends need to be same length')
   if(percent) { starts = starts/(sum(starts)+sum(ends))*100; ends = ends/(sum(starts)+sum(ends))*100 }
   if(!add) {
      plot(0, xlim=range(pos), ylim=range(c(starts, -ends)), type='n', ylab='  <- Ends    |    Starts ->', xlab='Position')
      abline(h=0)
   }
   rect(pos-width/2, 0, pos+width/2, starts, col=col[2], border=border, density=density)
   rect(pos-width/2, -ends, pos+width/2, 0, col=col[1], border=border, density=density)
}

plotBoundsBatch = function(pos, starts, ends, col=c('#ff000055', '#0000ff55'), density=NA, border=NA, start, end, pad=20, seq='', struc='', separate.ends=F, title='', file='') {
   if(length(col) < ncol(starts)) { col = rep(col, ncol(starts)) }
   if(length(density) < ncol(starts)) { density = rep(density, ncol(starts)) }
   if(length(border) < ncol(starts)) { border = rep(border, ncol(starts)) }
   if(file!='') {pdf(file, 12, 6)}
   if(separate.ends) {
      # if(file!='') {pdf(file, 12, 6)}
      layout(matrix(c(1,3,2,3), nrow=2))
      sel = start+(-pad:pad)
      s = starts[pos %in% sel,]; e = matrix(0, length(sel), ncol(s)); total = colSums(s)+colSums(e)+1; s = t(t(s)/total)*100; e = t(t(e)/total)*100;
      plot(0, type='n', xlim=c(start-pad, start+pad), ylim=range(c(s, -e), na.rm=T)*1.1, xlab='Position', ylab='<- Ends    %     Starts ->', main='5`', xaxt='n')
      axis(1, c(round(pad/5):1*-5+1, 0, 1, 1:round(pad/5)*5), rep('', 2*round(pad/5)+2), cex.axis=0.7)
      axis(1, -(pad-1):pad, rep('', 2*pad), tck=-0.01)
      mtext(paste0(rep(c('-', '+'), each=round(pad/5)+1), c(round(pad/5):1*5, 1, 1, 1:round(pad/5)*5)), 1, line=0.5, outer=F, at=c(round(pad/5):1*-5+1, -0.2, 1.2, 1:round(pad/5)*5), cex=0.8)
      mtext(seq[start:(start+pad)], at=start:(start+pad), cex=0.4)
      if(ncol(starts)>3) { abline(v=start+(-pad:pad)-0.5, col='#55555555', lty=3) }
      abline(v=c(start-.5, end+.5), col='red', lty=2)
      for(i in 1:ncol(starts)) {
         plotBounds(pos[pos %in% sel]-0.5+i/(ncol(starts)+1), s[,i], e[,i], width=1/(ncol(starts)+1), col=rep(col[i], 2), density=density[i], border=border[i], add=T)
      }
      abline(h=0)
      
      sel = (end-pad):(end+pad)
      s = matrix(0, length(sel), ncol(starts)); e = ends[pos %in% sel,]; total = colSums(s)+colSums(e)+1; s = t(t(s)/total)*100; e = t(t(e)/total)*100;
      plot(0, type='n', xlim=c(end-pad, end+pad), ylim=range(c(s, -e), na.rm=T)*1.1, xlab='Position', ylab='<- Ends    %     Starts ->', main='3`', xaxt='n')
      axis(1, end+round(-pad/5):round(pad/5)*5, rep('', 2*round(pad/5)+1), cex.axis=0.7)
      axis(1, -(pad-1):pad+end, rep('', 2*pad), tck=-0.01)
      mtext(end+round(-pad/5):round(pad/5)*5, 1, line=0.5, outer=F, at=end+round(-pad/5):round(pad/5)*5, cex=0.8)
      mtext(seq[(end-pad):end], at=(end-pad):end, cex=0.4)
      if(ncol(starts)>3) { abline(v=(end-pad):(end+pad)-0.5, col='#55555555', lty=3) }
      abline(v=c(start-.5, end+.5), col='red', lty=2)
      for(i in 1:ncol(starts)) {
         plotBounds(pos[pos %in% sel]-0.5+i/(ncol(starts)+1), s[,i], e[,i], width=1/(ncol(starts)+1), col=rep(col[i], 2), density=density[i], border=border[i], add=T)
      }
      abline(h=0)
      
      sel = (start+pad):(end-pad)
      s = starts[pos %in% sel,]; e = ends[pos %in% sel,]; total = colSums(s)+colSums(e)+1; s = t(t(s)/total)*100; e = t(t(e)/total)*100;
      plot(0, type='n', xlim=c(start+pad, end-pad), ylim=range(c(s, -e), na.rm=T)*1.1, xlab='Position', ylab='<- Ends    %     Starts ->', main='middle')
      mtext(seq[sel], at=sel, cex=0.4)
      abline(v=c(start-.5, end+.5), col='red', lty=2)
      for(i in 1:ncol(starts)) {
         plotBounds(pos[pos %in% sel]-0.5+i/(ncol(starts)+1), s[,i], e[,i], width=1/(ncol(starts)+1), col=rep(col[i], 2), density=density[i], border=border[i], add=T)
      }
      abline(h=0)
      
      par(mfrow=c(1,1))

   } else {
      par(mfrow=c(1,1))
      plot(0, type='n', xlim=c(start-pad, end+pad), ylim=c(-80, 80), xlab='Position', ylab='<- Ends    %     Starts ->', main=title)
      mtext(seq, at=start:end, cex=0.4)
      mtext(struc, line=-0.5, at=start:end, cex=0.4)
      abline(v=c(start-.5, end+.5), col='red', lty=2)
      for(i in 1:ncol(starts)) {
         plotBounds(pos-0.5+i/(ncol(starts)+1), starts[, i]/sum(starts[, i])*100, ends[,i]/sum(ends[, i])*100, width=1/(ncol(starts)+1), col=rep(col[i], 2), density=density[i], border=border[i], add=T, percent=F)
      }
      abline(h=0)
   }
   if(file!='') {dev.off()}
}

plotStacked = function(bed, add=FALSE, hide.less=0, col='blue') {
   bed = bed[order(bed$start, bed$end), ]
   bed = bed[bed$count > hide.less, ]
   bed$h = cumsum(bed$count)-bed$count
   
   if(!add) { plot(0, xlim=range(c(bed$start, bed$end)), ylim=c(0, sum(bed$count)), type='n', ylab='Reads', xlab='Position') }
   rect(bed$start-0.5, bed$h, bed$end+0.5, bed$h+bed$count, col=col, border=NA)
}

plotCoverage = function(bed, add=FALSE, percent=FALSE, col='blue', border=NA, lwd=1) {
   c = data.frame(pos=seq(min(c(bed$start, bed$end)), max(c(bed$start, bed$end))))
   c$depth = sapply(c$pos, function(p) sum(bed$count[bed$start <= p & p <= bed$end]))
   if(percent) c$depth = c$depth/max(c$depth)*100
   if(!add) { plot(0, xlim=range(c$pos), ylim=c(0, max(c$depth)), type='n', ylab='Depth [%]', xlab='Position') }
   polygon(rep(c$pos, each=2)+c(-.5, .5), rep(c$depth, each=2), col=col, border=border, lwd=lwd)
}

plotNorthern = function(bed, background=NA, lim=c(1, 500), smooth=0.2, horizontal=T, file='') {
   bed$l = bed$end-bed$start
   frags = sapply(seq(lim[1], lim[2]), function(x) sum(bed$count[bed$l == x]))
   # frags = aggregate(count ~ l, bed, sum)
   frags.smooth = smooth.spline(x=seq(lim[1], lim[2]), (frags)^(1/3), spar=smooth)
   if(file!='') {pdf(file, 12, 6)}
   if(horizontal) {
      plot(0, ylim=c(0,1), xlim=lim, type='n', ylab='', xlab='Size [nt]', yaxt='n', log='x')
      rect(frags.smooth$x, 0, frags.smooth$x+1, 1, col=rgb(0, 0, 0, abs(frags.smooth$y)/max(frags.smooth$y)), border=NA)
   } else {
      plot(0, xlim=c(0,1), ylim=lim, type='n', xlab='', ylab='Size [nt]', xaxt='n', log='y', las=2)
      rect(0, frags.smooth$x, 1, frags.smooth$x+1, col=rgb(0, 0, 0, abs(frags.smooth$y)/max(frags.smooth$y)), border=NA)
   }
   if(file!='') {dev.off()}
}

Bioinformatic Methods

All libraries (old and new) were freshly processed including quality assessment, adapter trimming and trimming of bad quality bases at the ends, using fastp v0.23.2(Chen et al. 2018). Trimmed reads were then mapped against Bacillus subtilis 168 genome (NC_000964.3, GCF_000009045.1_ASM904v1) and alignments extracted for loci of 6S-1 and -2 +/-100nt. In the first sequencing run was analyzed using segemehl v0.2.0 only. For the new run, I re-mapped all old and new data with different mapping algorithms: segemehl(Hoffmann et al. 2014), bwa(Li and Durbin 2010), bowtie2(Langmead and Salzberg 2012), STAR(Dobin et al. 2013) to compare different approaches. The newest version of segemehl v0.3.4 had problems with trimmed reads (could not read all reads in library). Since all mappers yield mostly indifferent results, I proceeded with bwa v0.7.17-r1188 mapped libraries. Only perfectly mapping paired reads were used.

Results

There are four different visual representations of the paired-end reads:

  • Fragment starts (positive y-axis) and fragment ends (negative y-axis) give a good picture which major processing positions exist.
  • Stacked fragments sorted by start position give the most complete overview of the distribution.
  • Fragment depth allows the best overview of cumulative transcript lengths. Here Black and gray outlines represent the WT coverage in run 1 and 2 respective for comparison.
  • in silico Northern-blot of fragment sizes (frequency of fragment lengths is transformed to see more low abundance fragments and sizes are smoothed to reflect gaussian gel migration pattern)

Positions are relative to the respective annotated 6S gene starting with \(0\) and might not correspond exactly to canonical 6S transcript indices! Old libraries are colored red (gray outline for WT reference) and new libraries are colored blue (black outline for WT reference).

seq = seqinr::read.fasta('data/GCF_000009045.1_ASM904v1_genomic.fna', forceDNAtolower=F)[[1]]
rna6s = read.csv('6sRNA.bed', sep='\t', col.names=c('chrom', 'start', 'end', 'name', 'score', 'strand', 'fold'), header=F)
rna6s$seq = apply(rna6s, 1, function(r) {
  s = chartr('T', 'U', toupper(paste0(seq[r[['start']]:r[['end']]], collapse='')))
  if(r[['strand']]=='-') {s = chartr('ACGTUN', 'UGCAAN', toupper(paste0(seq[r[['end']]:(r[['start']])], collapse='')))}
  strsplit(s, '')[[1]]})
  # paste0(s, collapse='')})
rna6s$length = rna6s$end - rna6s$start +1
rna6s$fold = apply(rna6s, 1, function(x) strsplit(x[['fold']], '')[[1]])
# rna6s$seq = apply(rna6s, 1, function(r) {paste0(ifelse(r[['strand']]=='-', seq[r[['end']]:r[['start']]], seq[r[['start']]:r[['end']]]), collapse='')})
# rna6s$seq = chartr('ACGTNU', 'UGCANA', paste0(toupper(rna6s$seq), collapse=''))
ds = read.csv('6sRNAse.meta.tsv', header=T, sep='\t')
# ds = subset(ds, rnase!='Y')
# ds$hue = rep(seq(0, floor(nrow(ds)/2)-1)/(nrow(ds)/2+3), each=2)
# ShortRead::countFastq(ds$fastq1)
# ShortRead::countFastq(paste0('data/', ds$name, '.R1.fq.gz'))
stats = read.csv('6sRNAse.stats.tsv', header=T, sep='\t')
fc = read.csv('data/6sRNAse.counts.fc.tsv.summary', header=T, sep='\t')
names(fc) = gsub('.bwa.sort.bam', '', names(fc))
names(fc)[grep('4erKO', names(fc))] = substr(names(fc)[grep('4erKO', names(fc))], 2, 99)
counts = read.csv('data/6sRNAse.counts.fc.tsv', header=T, sep='\t', skip=1)
names(counts) = gsub('.bwa.sort.bam', '', names(counts))
names(counts)[grep('4erKO', names(counts))] = substr(names(counts)[grep('4erKO', names(counts))], 2, 99)
stats$genomic = colSums(fc[1:13, stats$name])
stats$transcript = colSums(fc[1, stats$name])
stats$`5S` = colSums(counts[counts$Geneid %in% c('gene-BSU_rRNA_5', 'gene-BSU_rRNA_26', 'gene-BSU_rRNA_8', 'gene-BSU_rRNA_11', 'gene-BSU_rRNA_15', 'gene-BSU_rRNA_24', 'gene-BSU_rRNA_28', 'gene-BSU_rRNA_13', 'gene-BSU_rRNA_18', 'gene-BSU_rRNA_21'), stats$name])
stats$`6S1` = colSums(counts[counts$Geneid == 'gene-BSU_misc_RNA_41', stats$name])
stats$`6S2` = colSums(counts[counts$Geneid == 'gene-BSU_misc_RNA_32', stats$name])
stats$`6S1.pp` = 0
stats$`6S2.pp` = 0
pad = 15
for(run in 1:2) {
   cat(paste('\n\n## Run', run, '\n'))
   for(i in 1:nrow(rna6s)) {
      rna = rna6s[i,]
      cat(paste('\n\n###', rna$name, '\n'))
      ref1 = read.csv(paste0('data/', ds$name[ds$rnase == 'WT' & ds$rep == run], '.bwa.', rna$name, '.bed'), sep='\t', col.names=c('chrom', 'start', 'end', 'count'))
      ref1$start = ref1$start+1 - rna$start
      ref1$end = ref1$end - rna$start
      # ref1 = ref1[ref1$start >= -pad & ref1$end <= rna$length+pad, ]
      if(rna$strand=='-') {
         seq = rev(chartr('ACGTNU', 'UGCANA', seq))
         tmp = rna$length - ref1$end
         ref1$end = rna$length - ref1$start
         ref1$start = tmp
       }
      # ref1$start = -(ref1$start+1 - rna$start) + (rna$end-rna$start)
      # ref2 = read.csv(paste0('data/', ds$name[ds$rnase == 'WT' & ds$rep == 2], '.bwa.', rna$name, '.bed'), sep='\t', col.names=c('chrom', 'start', 'end', 'count'))
      # ref2$start = ref2$start+1
      d = data.frame(pos=-pad:(rna$length+pad))
      pos = d$pos
      starts = matrix(0, nrow(d), length(unique(ds$rnase[ds$rep==run])))
      ends = starts
      total1 = sum(ref1$count)/100
      d$ref1_starts = sapply(d$pos, function(p) sum(ref1$count[ref1$start == p]))
      # starts[,1] = d$ref1_starts
      d$ref1_ends = sapply(d$pos, function(p) sum(ref1$count[ref1$end == p]))
      # ends[,1] = d$ref1_ends
      # total2 = sum(ref2$count)/100
      # d$ref2_starts = sapply(d$pos, function(p) sum(ref2$count[ref2$start == p]))/total2
      # d$ref2_ends = sapply(d$pos, function(p) -sum(ref2$count[ref2$end == p]))/total2
      rnases = unique(ds$rnase[ds$rep==run])
      for(j in 1:length(rnases)) {
         r = rnases[j]
         cat(paste('\n\n####', r, '\n'))
         hue = ds$hue[ds$rnase==r & ds$rep ==run]
         bed1 = read.csv(paste0('data/', ds$name[ds$rnase == r & ds$rep == run], '.bwa.', rna$name, '.bed'), sep='\t', col.names=c('chrom', 'start', 'end', 'count'))
         if(i == 1) stats[stats$name == ds$name[ds$rnase == r & ds$rep == run], '6S1.pp'] = sum(bed1$count)
         if(i == 2) stats[stats$name == ds$name[ds$rnase == r & ds$rep == run], '6S2.pp'] = sum(bed1$count)
         
         bed1$start = bed1$start+1 - rna$start
         bed1$end = bed1$end - rna$start
         # bed1 = bed1[bed1$start >= -pad & bed1$end <= rna$length+pad, ]
         if(rna$strand=='-') {
            tmp = rna$length - bed1$end
            bed1$end = rna$length - bed1$start
            bed1$start = tmp
         }
         total1 = sum(bed1$count)/100
         d$bed1_starts = sapply(d$pos, function(p) sum(bed1$count[bed1$start == p]))
         d$bed1_ends = sapply(d$pos, function(p) sum(bed1$count[bed1$end == p]))
         starts[,j] = d$bed1_starts
         ends[,j] = d$bed1_ends
         # bed2 = read.csv(paste0('data/', ds$name[ds$rnase == r & ds$rep == 2], '.bwa.', rna$name, '.bed'), sep='\t', col.names=c('chrom', 'start', 'end', 'count'))
         # bed2$start = bed2$start+1
         # total2 = sum(bed2$count)/100
         # d$bed2_starts = sapply(d$pos, function(p) sum(bed2$count[bed2$start == p]))/total2
         # d$bed2_ends = sapply(d$pos, function(p) -sum(bed2$count[bed2$end == p]))/total2
         
         plotBoundsBatch(d$pos, d[, c(4, 2)], d[, c(5, 3)], col=c(hsv(hue, 1, 1), '#55555555'), start=1, end=rna$length, pad=pad, seq=rna$seq[[1]], struc=rna$fold[[1]], title=r, file="")#paste0('processing_', rna$name, '.', r, '.', run, '.pdf'))
         plotBoundsBatch(d$pos, d[, c(4, 2)], d[, c(5, 3)], col=c(hsv(hue, 1, 1), '#00000055'), start=1, end=rna$length, pad=pad, seq=rna$seq[[1]], struc=rna$fold[[1]], title=r, separate.ends=T, file="")#paste0('processing_', rna$name, '.', r, '.', run, '_zoom.pdf'))
         
         # pdf(file=paste0('processing_', rna$name, '.', r, '.', run, '_stack.pdf'), 12, 6)
         plot(0, type='n', xlim=c(-pad, rna$length+pad), ylim=c(0, max(sum(bed1$count), 1)), xlab='Position', ylab='Reads', main=r)
         mtext(rna$seq[[1]], at=1:rna$length, cex=0.4)
         mtext(rna$fold[[1]], 3, -0.5, at=1:rna$length, cex=0.4)
         plotStacked(bed1, T, col=hsv(hue, 1, 1))
         abline(v=c(.5, rna$length+.5), col='red', lty=2)
         # dev.off()
         
         # pdf(file=paste0('processing_', rna$name, '.', r, '.', run, '_coverage.pdf'), 12, 6)
         plot(0, type='n', xlim=c(-pad, rna$length+pad), ylim=c(0, 100), xlab='Position', ylab='Coverage', main=r)
         mtext(rna$seq[[1]], at=1:rna$length, cex=0.4)
         # plotCoverage(bed2, T, T, col=hsv(hue, s=0.8, v=1, alpha=0.5))
         plotCoverage(bed1, T, T, col=hsv(hue, s=1, v=1, alpha=1))
         # plotCoverage(ref2, T, T, col=NA, border='black')
         plotCoverage(ref1, T, T, col=NA, border='black')
         abline(v=c(.5, rna$length+.5), col='red', lty=2)
         legend('topright', legend=c(r, 'WT'), fill=c(hsv(hue, s=1, v=1, alpha=1), 'black'), cex=0.5)
         # dev.off()
         # d$diff_starts = d$bed1_starts-d$ref1_starts
         # d$diff_starts_plus = d$diff_starts; d$diff_starts_plus[d$diff_starts<0] = 0
         # d$diff_starts_minus = -d$diff_starts; d$diff_starts_minus[d$diff_starts>0] = 0
         # d$diff_ends = d$bed1_ends-d$ref1_ends
         # d$diff_ends_plus = -d$diff_ends; d$diff_ends_plus[d$diff_ends>0] = 0
         # d$diff_ends_minus = d$diff_ends; d$diff_ends_minus[d$diff_ends<0] = 0
         # plotBoundsBatch(d$pos, d[, c('diff_starts_plus', 'diff_starts_minus')], d[, c('diff_ends_plus', 'diff_ends_minus')], col=c('#ff000055', '#0000ff55'), density=c(NA, NA), start=1, end=rna$length, pad=pad, seq=rna$seq[[1]], title=paste(r, 'diff'), border=c(NA, NA))
         # plotBoundsBatch(d$pos, d[, c('diff_starts_plus', 'diff_starts_minus')], d[, c('diff_ends_plus', 'diff_ends_minus')], col=c('#ff000055', '#0000ff55'), density=c(NA, NA), start=1, end=rna$length, pad=pad, seq=rna$seq[[1]], title=paste(r, 'diff'), separate.ends=T, border=c(NA, NA))
         # 
      }
      
      cat(paste('\n\n#### All at once\n'))
      plotBoundsBatch(pos, starts[, ds$show[ds$rep==run]], ends[, ds$show[ds$rep==run]], col=c(hsv(ds$hue[ds$rep==run & ds$show], 1, 1)), start=1, end=rna$length, pad=pad, seq=rna$seq[[1]], struc=rna$fold[[1]], separate.ends=T, file="")#paste0('processing_', rna$name, '.all.', run, '.pdf'))
      
      cat(paste('\n\n#### *in silico* Northern blot\n'))
      # pdf(file=paste0('processing_', rna$name, '.all.', run, '_northern.pdf'), 12, 6)
      par(mfrow=c(1, length(unique(ds$rnase[ds$rep==run]))), mar=c(3,6,2,1))
      for(r in unique(ds$rnase[ds$rep==run])) {
         bed1 = read.csv(paste0('data/', ds$name[ds$rnase == r & ds$rep == run], '.bwa.', rna$name, '.bed'), sep='\t', col.names=c('chrom', 'start', 'end', 'count'))
         # cat(paste0(r, '\t', sum(bed1$count), '\n'))
         plotNorthern(bed1, lim=c(30, 250), horizontal=F, smooth=0.2)
         title(r)
         # mtext(paste0('# Reads\n', format(sum(bed1$count), big.mark=',')), 1, 2, cex=0.8)
      }
      # dev.off()
   }
}

Run 1

6S1

WT