systemPipeR's
New CWL Command-line InterfaceSYSargs2
S4 class is a list-like containerSYSargs2
instancesSYSargs2
instances are generated by two constructor functions, loadWorkflow
and renderWF
targets
file. Subsequent targets
instances are created automaticallySYSargs2Pipe
allows loading as many SYSargs2
instances into a single compound object
if (!requireNamespace("BiocManager", quietly=TRUE))
install.packages("BiocManager")
BiocManager::install("systemPipeR")
BiocManager::install("systemPipeRdata", build_vignettes=TRUE, dependencies=TRUE)
genWorkenvir
function from systemPipeRdatatargets
file for single-end (SE) librarytargetspath <- system.file("extdata", "targets.txt", package="systemPipeR")
read.delim(targetspath, comment.char = "#")[1:3,1:4]
## FileName SampleName Factor SampleLong
## 1 ./data/SRR446027_1.fastq.gz M1A M1 Mock.1h.A
## 2 ./data/SRR446028_1.fastq.gz M1B M1 Mock.1h.B
## 3 ./data/SRR446029_1.fastq.gz A1A A1 Avr.1h.A
targets
file for paired-end (PE) librarytargetspath <- system.file("extdata", "targetsPE.txt", package="systemPipeR")
read.delim(targetspath, comment.char = "#")[1:3,1:5]
## FileName1 FileName2 SampleName
## 1 ./data/SRR446027_1.fastq.gz ./data/SRR446027_2.fastq.gz M1A
## 2 ./data/SRR446028_1.fastq.gz ./data/SRR446028_2.fastq.gz M1B
## 3 ./data/SRR446029_1.fastq.gz ./data/SRR446029_2.fastq.gz A1A
## Factor SampleLong
## 1 M1 Mock.1h.A
## 2 M1 Mock.1h.B
## 3 A1 Avr.1h.A
targets
file and two param
file
hisat2-mapping-se.cwl
file contains the settings for running command-line softwarehisat2-mapping-se.yml
file define all the variables to be input in the specific command-line step
targets <- system.file("extdata", "targets.txt", package="systemPipeR")
dir_path <- system.file("extdata/cwl/hisat2-se", package="systemPipeR")
align <- loadWorkflow(targets=targets, wf_file="hisat2-mapping-se.cwl",
input_file="hisat2-mapping-se.yml", dir_path=dir_path)
align <- renderWF(align, inputvars=c(FileName="_FASTQ_PATH_", SampleName="_SampleName_"))
## Instance of 'SYSargs2':
## Slot names/accessors:
## targets: 18 (M1A...V12B), targetsheader: 4 (lines)
## modules: 2
## wf: 0, clt: 1, yamlinput: 7 (components)
## input: 18, output: 18
## cmdlist: 18
## WF Steps:
## 1. hisat2-mapping-se.cwl (rendered: TRUE)
SYSargs2
instancenames(align)
# [1] "targets" "targetsheader" "modules" "wf" "clt"
# [6] "yamlinput" "cmdlist" "input" "output" "cwlfiles"
# [11] "inputvars"
cmdlist
return command-line arguments for the specific software, here HISAT2
for the first sample:
cmdlist(align)[1]
# $M1A
# $M1A$`hisat2-mapping-se.cwl`
# [1] "hisat2 -S results/M1A.sam -x ./data/tair10.fasta -k 1 --min-intronlen 30 --max-intronlen 3000 -U ./data/SRR446027_1.fastq.gz --threads 4"
SYSargs2
define all the expected output files for each step in the workflow; some of which are the input for the next workflow step
targets <- system.file("extdata", "targets.txt", package="systemPipeR")
dir_path <- system.file("extdata/cwl/hisat2-se", package="systemPipeR")
align <- loadWorkflow(targets=targets, wf_file="hisat2-mapping-se.cwl",
input_file="hisat2-mapping-se.yml", dir_path=dir_path)
align <- renderWF(align, inputvars=c(FileName="_FASTQ_PATH_", SampleName="_SampleName_"))
subsetWF(align, slot="input", subset='FileName')[1:2]
#> M1A M1B
#> "./data/SRR446027_1.fastq.gz" "./data/SRR446028_1.fastq.gz"
subsetWF(align, slot="output", subset=1)[1:2]
#> M1A M1B
#> "results/M1A.sam" "results/M1B.sam"
subsetWF(align, slot="step", subset=1)[1] ## subset all the HISAT2 commandline
#> M1A
#> "hisat2 -S results/M1A.sam -x ./data/tair10.fasta -k 1 --min-intronlen 30 --max-intronlen 3000 -U ./data/SRR446027_1.fastq.gz --threads 4"
subsetWF(align, slot="output", subset=1, delete=TRUE)[1] ##DELETE
#> The subset cannot be deleted: no such file
#> M1A
#> "results/M1A.sam"
Hisat2
, on single machine. Command-line tool needs to be installed for this.cmdlist(align)[1:2]
system("hisat2-build ./data/tair10.fasta ./data/tair10.fasta")
runCommandline(align, make_bam = FALSE)
output(align)
runCommandline
, by default, auto detects SAM file outputs and converts them to sorted and indexed BAM files, using internally Rsamtools package (Morgan et al. 2019)runCommandline
allows the user to create an exclusive results folder for each step in the workflow and a sub-folder for each sample defined in the targets fileclusterRun
function submits non-R command-line software to queuing systems of clusters using run specifications defined by runCommandline
library(batchtools)
resources <- list(walltime=120, ntasks=1, ncpus=4, memory=1024)
reg <- clusterRun(align, FUN = runCommandline, more.args = list(dir=TRUE),
conffile = ".batchtools.conf.R", template = "batchtools.slurm.tmpl",
Njobs=18, runid="01", resourceList=resources)
getStatus(reg=reg)
output(align)
SYSargs2Pipe
SYSargs2Pipe
WF_track <- run_track(WF_ls = c(align))
names(WF_track)
WF_steps(WF_track)
track(WF_track)
summaryWF(WF_track)
# $`hisat2-mapping-se`
# $`hisat2-mapping-se`$Done
# [1] "Existing expected outputs files: 18"
#
# $`hisat2-mapping-se`$NotRun
# $`hisat2-mapping-se`$NotRun$Summary
# [1] "Missing expected outputs files: 0"
#
# $`hisat2-mapping-se`$NotRun$ListFiles
# character(0)
systemPipeR
package## FileName Nreads2x Nalign Perc_Aligned Nalign_Primary
## 1 M1A 192918 177961 92.24697 177961
## 2 M1B 197484 159378 80.70426 159378
## 3 A1A 189870 176055 92.72397 176055
## 4 A1B 188854 147768 78.24457 147768
## Perc_Aligned_Primary
## 1 92.24697
## 2 80.70426
## 3 92.72397
## 4 78.24457
new targets
fileswriteTargetsout
function to a new targets file that serves as input to the next loadWorkflow
and renderWF
call## M1A M1B A1A A1B V1A V1B M6A M6B A6A A6B V6A V6B M12A M12B A12A
## AT1G01010 57 244 201 169 365 229 41 38 152 46 294 405 117 139 132
## AT1G01020 23 93 69 126 107 88 18 25 20 5 88 151 43 74 33
## AT1G01030 41 98 73 58 94 156 9 13 8 6 36 147 32 29 12
## AT1G01040 180 684 522 664 585 680 162 163 249 66 697 1060 338 604 360
## A12B V12A V12B
## AT1G01010 64 230 120
## AT1G01020 18 43 31
## AT1G01030 9 70 85
## AT1G01040 80 203 171
DESeq2
(Love, Huber, and Anders 2014) for any number of pairwise sample comparisons specified under the cmp
argumenttargetspath <- system.file("extdata", "targets.txt", package="systemPipeR")
targets <- read.delim(targetspath, comment="#")
cmp <- readComp(file=targetspath, format="matrix", delim="-")
countDFeBygpath <- system.file("extdata", "countDFeByg.xls", package="systemPipeR")
countDFeByg <- read.delim(countDFeBygpath, row.names=1)
degseqDF <- run_DESeq2(countDF=countDFeByg, targets=targets, cmp=cmp[[1]], independent=FALSE)
degseqDF[1:4,1:6]
## M1-A1_baseMean M1-A1_logFC M1-A1_lfcSE M1-A1_stat M1-A1_pvalue
## AT1G01010 145.25327 0.03975293 0.4624215 0.08596688 0.9314927
## AT1G01020 44.13612 -0.26585739 0.4340369 -0.61252257 0.5401921
## AT1G01030 44.26939 0.63339311 0.6246251 1.01403724 0.3105650
## AT1G01040 315.30420 -0.01585074 0.3126902 -0.05069151 0.9595713
## M1-A1_FDR
## AT1G01010 1
## AT1G01020 1
## AT1G01030 1
## AT1G01040 1
preprocessReads
allows to apply predefined or custom read preprocessing functions to all FASTQ files referenced in a SYSargs
container, such as quality filtering or adapter trimming routines. The following example performs adapter trimming with the trimLRPatterns
function from the Biostrings
package. After the trimming step a new targets file is generated (here targets_trim.txt
) containing the paths to the trimmed FASTQ files. The new targets file can be used for the next workflow step with an updated SYSargs
instance, e.g. running the NGS alignments using the trimmed FASTQ files.
seeFastq
and seeFastqPlot
functions generate and plot a series of useful quality statistics for a set of FASTQ files including per cycle quality box plots, base proportions, base-level quality trends, relative k-mer diversity, length and occurrence distribution of reads, number of reads above quality cutoffs and mean quality distribution. The results are written to a PDF file named fastqReport.pdf
.
summarizeOverlaps
in parallel mode using multiple coressummarizeOverlaps
function (Lawrence et al. 2013). The read counting is preformed for exonic gene regions in a non-strand-specific manner while ignoring overlaps among different genes. The raw read count table (countDFeByg.xls
) is written to a files in the directory of this project. Parallelization is achieved with the BiocParallel
package, here using 8 CPU cores. Return
library("GenomicFeatures"); library(BiocParallel)
args <- systemArgs(sysma="param/hisat2.param", mytargets="targets.txt")
txdb <- makeTxDbFromGFF(file="data/tair10.gff", format="gff",
dataSource="TAIR", organism="Arabidopsis thaliana")
saveDb(txdb, file="./data/tair10.sqlite")
txdb <- loadDb("./data/tair10.sqlite")
(align <- readGAlignments(outpaths(args)[1])) # Demonstrates how to read bam file into R
eByg <- exonsBy(txdb, by=c("gene"))
bfl <- BamFileList(outpaths(args), yieldSize=50000, index=character())
multicoreParam <- MulticoreParam(workers=2); register(multicoreParam)
registered()
counteByg <- bplapply(bfl, function(x) summarizeOverlaps(eByg, x, mode="Union",
ignore.strand=TRUE,
inter.feature=FALSE,
singleEnd=TRUE))
countDFeByg <- sapply(seq(along=counteByg),
function(x) assays(counteByg[[x]])$counts)
rownames(countDFeByg) <- names(rowRanges(counteByg[[1]]))
colnames(countDFeByg) <- names(bfl)
write.table(countDFeByg, "results/countDFeByg.xls", col.names=NA,
quote=FALSE, sep="\t")
Kim, Daehwan, Ben Langmead, and Steven L Salzberg. 2015. “HISAT: A Fast Spliced Aligner with Low Memory Requirements.” Nat. Methods 12 (4): 357–60.
Lawrence, Michael, Wolfgang Huber, Hervé Pagès, Patrick Aboyoun, Marc Carlson, Robert Gentleman, Martin T Morgan, and Vincent J Carey. 2013. “Software for Computing and Annotating Genomic Ranges.” PLoS Comput. Biol. 9 (8): e1003118. https://doi.org/10.1371/journal.pcbi.1003118.
Love, Michael, Wolfgang Huber, and Simon Anders. 2014. “Moderated Estimation of Fold Change and Dispersion for RNA-seq Data with DESeq2.” Genome Biol. 15 (12): 550. https://doi.org/10.1186/s13059-014-0550-8.
Morgan, Martin, Hervé Pagès, Valerie Obenchain, and Nathaniel Hayden. 2019. Rsamtools: Binary Alignment (Bam), Fasta, Variant Call (Bcf), and Tabix File Import. http://bioconductor.org/packages/Rsamtools.