knitr::opts_chunk$set(warning=F, mesage=F, dpi=200, fig.path='results/', dev=c('png', 'svg'))

library(DESeq2)
library(ggplot2)
library(tidyverse, quietly = TRUE)
library(dplyr)
options(scipen = 999)
library(scales)
library(ggrepel)
library(RColorBrewer)
library(DT)

pcaplot <- function(dds, ntop=500, pch, col, main='') {
  pca <- prcomp(t(assay(dds)[order(rowVars(assay(dds)), decreasing = TRUE)[1:min(ntop, nrow(assay(dds)))], ]))
  percentVar <- pca$sdev^2/sum(pca$sdev^2)
  ggplot(pca$x, aes(x=PC1, y=PC2)) + geom_point(fill=Strain, shape=Replicate, color=Phase) + labs(xlab==paste0("PC1: ", round(percentVar[1] * 100, digits = 2), "% variance"), ylab=paste0("PC2: ", round(percentVar[2] * 100, digits = 2), "% variance"))
  # plot(pca$x[,c(1,2)], xlab=paste0("PC1: ", round(percentVar[1] * 100, digits = 2), "% variance"), ylab=paste0("PC2: ", round(percentVar[2] * 100, digits = 2), "% variance"))
}

plot.DE = function(deseq.res, min.lfc=lfc, max.pval=alpha, lim.pval=0, lim.lfc=c(-Inf, Inf), label.names=c(''), col.insign='#99999999', col.up='gold', col.down='dodgerblue4', main='') {
   if(length(min.lfc)==1) min.lfc=c(-min.lfc, min.lfc)
   if(lim.lfc[1]==-Inf) lim.lfc[1] = min(deseq.res$log2FoldChange)
   if(lim.lfc[2]==Inf) lim.lfc[2] = max(deseq.res$log2FoldChange)
   deseq.res$sym = ' '
   deseq.res = within(deseq.res, {
      reg = factor(rep('insign', nrow(deseq.res)), levels=c('down', 'insign', 'up'))
      reg[padj <= max.pval & log2FoldChange <= min.lfc[1]] = 'down'
      reg[padj <= max.pval & log2FoldChange >= min.lfc[2]] = 'up'
      if(is.logical(label.names)) {
         if(label.names) {
            Gene[reg=='insign'] = NA
         } else {
            Gene = NA
         }
      } else {
         Gene[!Gene %in% label.names] = NA
      }
      sym[log2FoldChange < lim.lfc[1]] = 'left'
      sym[log2FoldChange > lim.lfc[2]] = 'right'
      sym[padj < lim.pval] = 'top'
      sym[padj < lim.pval & log2FoldChange < lim.lfc[1]] = 'topleft'
      sym[padj < lim.pval & log2FoldChange > lim.lfc[2]] = 'topright'
      log2FoldChange[log2FoldChange < lim.lfc[1]] = lim.lfc[1]
      log2FoldChange[log2FoldChange > lim.lfc[2]] = lim.lfc[2]
      padj[padj < lim.pval] = lim.pval
   })
   lab = data.frame(x=lim.lfc, y=c(1, 1), label=as.character(c(sum(deseq.res$reg=='down'), sum(deseq.res$reg=='up'))), reg=c('down', 'up'))
   return(ggplot(data=deseq.res, aes(x=log2FoldChange, y=-log10(padj), col=reg, label=Gene, shape=factor(sym))) +
      geom_point(size=2) +
      theme_classic() +
      geom_text_repel(data=deseq.res, aes(x=log2FoldChange, y=-log10(padj), label=Gene), point.size=3, show.legend=F, na.rm=T, size=3, point.padding=0.2, box.padding=1, segment.curvature=-1e-20, max.overlaps=Inf,
                          force=1.5, force_pull=0.4, segment.size=0.1, min.segment.length=unit(0, 'lines'), nudge_x=ifelse(deseq.res$reg=='down', -1, 1), nudge_y=ifelse(deseq.res$reg=='insign', -0.1, .1)) +
      annotate(geom='text', x=lim.lfc, y=c(0,0), label=c(sum(deseq.res$reg=='down'), sum(deseq.res$reg=='up')), col=c(col.down, col.up)) +
      scale_color_manual(values=list(up=col.up, down=col.down, insign=col.insign)) +
      scale_y_continuous(expand=c(0,0.1)) +
      scale_shape_manual(values=c(" "="\u25CF", "top"="\u25B2", "left"="\u25C0", "right"="\u25B6", "topleft"="\u25E4", "topright"="\u25E5"), guide='none') +
      ggtitle(main) +
      geom_vline(xintercept=min.lfc, lty=2, color=col.insign) +
      geom_hline(yintercept=-log10(max.pval), lty=2, color=col.insign))
}

alpha = 0.05; lfc = 1

Data preparation

Samples are partially split between Lanes. I merged them, because the included QC report seemed fine.

grep -v ^Read meta.csv | while read r1 r2 n rep strain phase; do echo "zcat raw/READS/${n}_*_1.fq.gz | gzip > raw/READS/$n.1.fq.gz; zcat raw/READS/${n}_*_2.fq.gz | gzip > raw/READS/$n.2.fq.gz"; done
meta = read.table('meta.csv', head=T, stringsAsFactors=T)
meta$Sample = as.character(meta$Sample)
meta$Strain = relevel(meta$Strain, ref='WT')
meta$group = paste0(meta$Phase, '_', meta$Strain)
datatable(meta)

The most recent version of Bacillus subtilis subsp. subtilis NCIB 3610 = GCF_002055965.1_ASM205596v1 is used for genome sequence and annotated genes.

wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/055/965/GCF_002055965.1_ASM205596v1/GCF_002055965.1_ASM205596v1_genomic.fna.gz
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/055/965/GCF_002055965.1_ASM205596v1/GCF_002055965.1_ASM205596v1_genomic.gtf.gz
zcat GCF_002055965.1_ASM205596v1_genomic.fna.gz > genome/GCF_002055965.fna
zcat GCF_002055965.1_ASM205596v1_genomic.gtf.gz > genome/GCF_002055965.gtf
bwa index genome/GCF_002055965.fna

Mapping

grep -v ^Read meta.csv | while read r1 r2 n rep strain phase; do bwa mem -t 12 genome/GCF_002055965.fna raw/READS/$n.1.fq.gz raw/READS/$n.2.fq.gz | samtools sort > BAM/$n.bwa.3610.bam; done
featureCounts -T 12 -p -t gene -g gene_id -a genome/GCF_002055965.gtf -o yham.counts.tsv BAM/*.bwa.3610.bam

Analysis

Libraries

goi = c('yhaM')

counts = read.table('yham.counts.tsv', header=1)
names(counts) = gsub('BAM.(RH0..).bwa.3610.bam', '\\1', names(counts))
stopifnot(all(meta$Sample %in% names(counts)))
counts$gene[is.na(counts$gene)] = counts$Geneid
meta$size = colSums(counts[, meta$Sample])
meta$yhaM = t(counts[counts$gene=='yhaM', meta$Sample])

meta %>%
   group_by(Strain, Phase) %>%
   summarise(sd=sd(size), size=mean(size), yhaM=mean(yhaM), yhaM_sd=sd(yhaM)) %>%
   ggplot(aes(x=Strain, y=size, fill=Strain)) + facet_wrap(~Phase) +
   geom_col(position=position_dodge(.8), width=0.7, alpha=.5) +
   geom_errorbar(aes(ymin=size-sd/2, ymax=size+sd/2), width=.2, position=position_dodge(0.8)) +
   scale_y_continuous(labels=scales::label_number(scale_cut=cut_short_scale())) + theme_classic() + ylab('Library depth [#reads]')

meta %>%
   group_by(Strain, Phase) %>%
   summarise(sd=sd(size), size=mean(size), yhaM_mean=mean(yhaM), yhaM_sd=sd(yhaM)) %>%
   ggplot(aes(x=Strain, y=yhaM_mean, fill=Strain)) + facet_wrap(~Phase) +
   geom_col(position=position_dodge(.8), width=0.7, alpha=.5) +
   geom_errorbar(aes(ymin=yhaM_mean-yhaM_sd/2, ymax=yhaM_mean+yhaM_sd/2), width=.2, position=position_dodge(0.8)) +
   scale_y_continuous(labels=scales::label_number(scale_cut=cut_short_scale())) + theme_classic() + ylab('yhaM [#reads]')

meta %>%
   group_by(Strain, Phase) %>%
   summarise(yhaM_mean=mean(yhaM/size*100), yhaM_sd=sd(yhaM/size*100)) %>%
   ggplot(aes(x=Strain, y=yhaM_mean, fill=Strain)) + facet_wrap(~Phase) +
   geom_col(position=position_dodge(.8), width=0.7, alpha=.5) +
   geom_errorbar(aes(ymin=yhaM_mean-yhaM_sd/2, ymax=yhaM_mean+yhaM_sd/2), width=.2, position=position_dodge(0.8)) +
   scale_y_continuous(labels=scales::label_number(scale_cut=cut_short_scale())) + theme_classic() + ylab('yhaM [%]')

dds = DESeqDataSetFromMatrix(counts[,meta$Sample], colData=meta, design=~(Strain+Phase)+Strain:Replicate+Phase:Replicate)
dds = DESeq(dds, parallel=F)
dds2 = DESeqDataSetFromMatrix(counts[,meta$Sample], colData=meta, design=~group*Replicate)
dds2 = DESeq(dds2, parallel=F)
pca <- prcomp(t(assay(vst(dds))[order(rowVars(assay(dds)), decreasing=T)[1:min(500, nrow(assay(dds)))], ]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
pca = data.frame(PC1=pca$x[,1], PC2=pca$x[,2], Strain=dds$Strain, Replicate=dds$Replicate, Phase=dds$Phase)
plotly::ggplotly(ggplot(pca, aes(x=PC1, y=PC2, color=Strain, shape=Phase)) + geom_point(size=3))
corrplot::corrplot(cor(counts(dds, normalized=F), method='pearson'), type='upper')

plotDispEsts(dds)

Differential Expression

KO vs WT

stationary phase

res = data.frame(results(dds2, contrast=c('group', 'stat_deltaYhaM', 'stat_WT')))
res$Gene = counts[row.names(res), ]$gene
plotly::ggplotly(plot.DE(res, label.names=T))#, min.lfc=lfc, max.pval=alpha, lim.pval=10^-8, lim.lfc=c(-6, 6), label.names=T))
datatable(cbind(Gene=res$Gene, round(res[, c('baseMean', 'log2FoldChange', 'pvalue', 'padj')], 3)), filter='top', extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
allres = data.frame(Gene=res$Gene, baseMean=res$baseMean, stat_KOvWT_l2FC=res$log2FoldChange, stat_KOvWT_pval=res$pvalue, stat_KOvWT_padj=res$padj)

exponential phase

res = data.frame(results(dds2, contrast=c('group', 'exp_deltaYhaM', 'exp_WT')))
res$Gene = counts$gene
plotly::ggplotly(plot.DE(res, label.names=T))#, min.lfc=lfc, max.pval=alpha, lim.pval=10^-8, lim.lfc=c(-6, 6), label.names=T))
datatable(cbind(Gene=res$Gene, round(res[, c('baseMean', 'log2FoldChange', 'pvalue', 'padj')], 3)), filter='top', extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
allres$exp_KOvWT_l2FC=res$log2FoldChange; allres$exp_KOvWT_pval=res$pvalue; allres$exp_KOvWT_padj=res$padj

High vs WT

stationary phase

res = data.frame(results(dds2, contrast=c('group', 'stat_hiYhaM', 'stat_WT')))
res$Gene = counts[row.names(res), ]$gene
plotly::ggplotly(plot.DE(res, label.names=T))#, min.lfc=lfc, max.pval=alpha, lim.pval=10^-8, lim.lfc=c(-6, 6), label.names=T))
datatable(cbind(Gene=res$Gene, round(res[, c('baseMean', 'log2FoldChange', 'pvalue', 'padj')], 3)), filter='top', extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
allres$stat_HIvWT_l2FC=res$log2FoldChange; allres$stat_HIvWT_pval=res$pvalue; allres$stat_HIvWT_padj=res$padj

exponential phase

res = data.frame(results(dds2, contrast=c('group', 'exp_hiYhaM', 'exp_WT')))
res$Gene = counts[row.names(res), ]$gene
plotly::ggplotly(plot.DE(res, label.names=T))#, min.lfc=lfc, max.pval=alpha, lim.pval=10^-8, lim.lfc=c(-6, 6), label.names=T))
datatable(cbind(Gene=res$Gene, round(res[, c('baseMean', 'log2FoldChange', 'pvalue', 'padj')], 3)), filter='top', extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
allres$exp_HIvWT_l2FC=res$log2FoldChange; allres$exp_HIvWT_pval=res$pvalue; allres$exp_HIvWT_padj=res$padj

High vs KO

stationary phase

res = data.frame(results(dds2, contrast=c('group', 'stat_hiYhaM', 'stat_deltaYhaM')))
res$Gene = counts[row.names(res), ]$gene
plotly::ggplotly(plot.DE(res, label.names=T))#, min.lfc=lfc, max.pval=alpha, lim.pval=10^-8, lim.lfc=c(-6, 6), label.names=T))
datatable(cbind(Gene=res$Gene, round(res[, c('baseMean', 'log2FoldChange', 'pvalue', 'padj')], 3)), filter='top', extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
allres$stat_HIvKO_l2FC=res$log2FoldChange; allres$stat_HIvKO_pval=res$pvalue; allres$stat_HIvKO_padj=res$padj

exponential phase

res = data.frame(results(dds2, contrast=c('group', 'exp_hiYhaM', 'exp_deltaYhaM')))
res$Gene = counts[row.names(res), ]$gene
plotly::ggplotly(plot.DE(res, label.names=T))#, min.lfc=lfc, max.pval=alpha, lim.pval=10^-8, lim.lfc=c(-6, 6), label.names=T))
datatable(cbind(Gene=res$Gene, round(res[, c('baseMean', 'log2FoldChange', 'pvalue', 'padj')], 3)), filter='top', extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
allres$exp_HIvKO_l2FC=res$log2FoldChange; allres$exp_HIvKO_pval=res$pvalue; allres$exp_HIvKO_padj=res$padj
write.table(cbind(allres, counts(dds2, normalized=T), counts(dds2, normalized=F)), 'yham.deseq.tsv', sep='\t', row.names=F)