systemPipeR's New CWL Command-line InterfaceSYSargs2 S4 class is a list-like containerSYSargs2 instancesSYSargs2 instances are generated by two constructor functions, loadWorkflow and renderWFtargets file. Subsequent targets instances are created automaticallySYSargs2Pipe allows loading as many SYSargs2 instances into a single compound object
if (!requireNamespace("BiocManager", quietly=TRUE))
install.packages("BiocManager")
BiocManager::install("systemPipeR")
BiocManager::install("systemPipeRdata", build_vignettes=TRUE, dependencies=TRUE) genWorkenvir function from systemPipeRdatatargets file for single-end (SE) librarytargetspath <- system.file("extdata", "targets.txt", package="systemPipeR")
read.delim(targetspath, comment.char = "#")[1:3,1:4]## FileName SampleName Factor SampleLong
## 1 ./data/SRR446027_1.fastq.gz M1A M1 Mock.1h.A
## 2 ./data/SRR446028_1.fastq.gz M1B M1 Mock.1h.B
## 3 ./data/SRR446029_1.fastq.gz A1A A1 Avr.1h.A
targets file for paired-end (PE) librarytargetspath <- system.file("extdata", "targetsPE.txt", package="systemPipeR")
read.delim(targetspath, comment.char = "#")[1:3,1:5]## FileName1 FileName2 SampleName
## 1 ./data/SRR446027_1.fastq.gz ./data/SRR446027_2.fastq.gz M1A
## 2 ./data/SRR446028_1.fastq.gz ./data/SRR446028_2.fastq.gz M1B
## 3 ./data/SRR446029_1.fastq.gz ./data/SRR446029_2.fastq.gz A1A
## Factor SampleLong
## 1 M1 Mock.1h.A
## 2 M1 Mock.1h.B
## 3 A1 Avr.1h.A
targets file and two param file
hisat2-mapping-se.cwl file contains the settings for running command-line softwarehisat2-mapping-se.yml file define all the variables to be input in the specific command-line step
targets <- system.file("extdata", "targets.txt", package="systemPipeR")
dir_path <- system.file("extdata/cwl/hisat2-se", package="systemPipeR")
align <- loadWorkflow(targets=targets, wf_file="hisat2-mapping-se.cwl",
input_file="hisat2-mapping-se.yml", dir_path=dir_path)
align <- renderWF(align, inputvars=c(FileName="_FASTQ_PATH_", SampleName="_SampleName_"))
## Instance of 'SYSargs2':
## Slot names/accessors:
## targets: 18 (M1A...V12B), targetsheader: 4 (lines)
## modules: 2
## wf: 0, clt: 1, yamlinput: 7 (components)
## input: 18, output: 18
## cmdlist: 18
## WF Steps:
## 1. hisat2-mapping-se.cwl (rendered: TRUE)SYSargs2 instancenames(align)
# [1] "targets" "targetsheader" "modules" "wf" "clt"
# [6] "yamlinput" "cmdlist" "input" "output" "cwlfiles"
# [11] "inputvars" cmdlist return command-line arguments for the specific software, here HISAT2 for the first sample:
cmdlist(align)[1]
# $M1A
# $M1A$`hisat2-mapping-se.cwl`
# [1] "hisat2 -S results/M1A.sam -x ./data/tair10.fasta -k 1 --min-intronlen 30 --max-intronlen 3000 -U ./data/SRR446027_1.fastq.gz --threads 4"SYSargs2 define all the expected output files for each step in the workflow; some of which are the input for the next workflow step
targets <- system.file("extdata", "targets.txt", package="systemPipeR")
dir_path <- system.file("extdata/cwl/hisat2-se", package="systemPipeR")
align <- loadWorkflow(targets=targets, wf_file="hisat2-mapping-se.cwl",
input_file="hisat2-mapping-se.yml", dir_path=dir_path)
align <- renderWF(align, inputvars=c(FileName="_FASTQ_PATH_", SampleName="_SampleName_"))subsetWF(align, slot="input", subset='FileName')[1:2]
#> M1A M1B
#> "./data/SRR446027_1.fastq.gz" "./data/SRR446028_1.fastq.gz"
subsetWF(align, slot="output", subset=1)[1:2]
#> M1A M1B
#> "results/M1A.sam" "results/M1B.sam"
subsetWF(align, slot="step", subset=1)[1] ## subset all the HISAT2 commandline
#> M1A
#> "hisat2 -S results/M1A.sam -x ./data/tair10.fasta -k 1 --min-intronlen 30 --max-intronlen 3000 -U ./data/SRR446027_1.fastq.gz --threads 4"
subsetWF(align, slot="output", subset=1, delete=TRUE)[1] ##DELETE
#> The subset cannot be deleted: no such file
#> M1A
#> "results/M1A.sam"Hisat2, on single machine. Command-line tool needs to be installed for this.cmdlist(align)[1:2]
system("hisat2-build ./data/tair10.fasta ./data/tair10.fasta")
runCommandline(align, make_bam = FALSE)
output(align)runCommandline, by default, auto detects SAM file outputs and converts them to sorted and indexed BAM files, using internally Rsamtools package (Morgan et al. 2019)runCommandline allows the user to create an exclusive results folder for each step in the workflow and a sub-folder for each sample defined in the targets fileclusterRun function submits non-R command-line software to queuing systems of clusters using run specifications defined by runCommandlinelibrary(batchtools)
resources <- list(walltime=120, ntasks=1, ncpus=4, memory=1024)
reg <- clusterRun(align, FUN = runCommandline, more.args = list(dir=TRUE),
conffile = ".batchtools.conf.R", template = "batchtools.slurm.tmpl",
Njobs=18, runid="01", resourceList=resources)
getStatus(reg=reg)
output(align)SYSargs2PipeSYSargs2PipeWF_track <- run_track(WF_ls = c(align))
names(WF_track)
WF_steps(WF_track)
track(WF_track)
summaryWF(WF_track)
# $`hisat2-mapping-se`
# $`hisat2-mapping-se`$Done
# [1] "Existing expected outputs files: 18"
#
# $`hisat2-mapping-se`$NotRun
# $`hisat2-mapping-se`$NotRun$Summary
# [1] "Missing expected outputs files: 0"
#
# $`hisat2-mapping-se`$NotRun$ListFiles
# character(0)systemPipeR package## FileName Nreads2x Nalign Perc_Aligned Nalign_Primary
## 1 M1A 192918 177961 92.24697 177961
## 2 M1B 197484 159378 80.70426 159378
## 3 A1A 189870 176055 92.72397 176055
## 4 A1B 188854 147768 78.24457 147768
## Perc_Aligned_Primary
## 1 92.24697
## 2 80.70426
## 3 92.72397
## 4 78.24457
new targets fileswriteTargetsout function to a new targets file that serves as input to the next loadWorkflow and renderWF call## M1A M1B A1A A1B V1A V1B M6A M6B A6A A6B V6A V6B M12A M12B A12A
## AT1G01010 57 244 201 169 365 229 41 38 152 46 294 405 117 139 132
## AT1G01020 23 93 69 126 107 88 18 25 20 5 88 151 43 74 33
## AT1G01030 41 98 73 58 94 156 9 13 8 6 36 147 32 29 12
## AT1G01040 180 684 522 664 585 680 162 163 249 66 697 1060 338 604 360
## A12B V12A V12B
## AT1G01010 64 230 120
## AT1G01020 18 43 31
## AT1G01030 9 70 85
## AT1G01040 80 203 171
DESeq2 (Love, Huber, and Anders 2014) for any number of pairwise sample comparisons specified under the cmp argumenttargetspath <- system.file("extdata", "targets.txt", package="systemPipeR")
targets <- read.delim(targetspath, comment="#")
cmp <- readComp(file=targetspath, format="matrix", delim="-")
countDFeBygpath <- system.file("extdata", "countDFeByg.xls", package="systemPipeR")
countDFeByg <- read.delim(countDFeBygpath, row.names=1)
degseqDF <- run_DESeq2(countDF=countDFeByg, targets=targets, cmp=cmp[[1]], independent=FALSE)
degseqDF[1:4,1:6]## M1-A1_baseMean M1-A1_logFC M1-A1_lfcSE M1-A1_stat M1-A1_pvalue
## AT1G01010 145.25327 0.03975293 0.4624215 0.08596688 0.9314927
## AT1G01020 44.13612 -0.26585739 0.4340369 -0.61252257 0.5401921
## AT1G01030 44.26939 0.63339311 0.6246251 1.01403724 0.3105650
## AT1G01040 315.30420 -0.01585074 0.3126902 -0.05069151 0.9595713
## M1-A1_FDR
## AT1G01010 1
## AT1G01020 1
## AT1G01030 1
## AT1G01040 1
preprocessReads allows to apply predefined or custom read preprocessing functions to all FASTQ files referenced in a SYSargs container, such as quality filtering or adapter trimming routines. The following example performs adapter trimming with the trimLRPatterns function from the Biostrings package. After the trimming step a new targets file is generated (here targets_trim.txt) containing the paths to the trimmed FASTQ files. The new targets file can be used for the next workflow step with an updated SYSargs instance, e.g. running the NGS alignments using the trimmed FASTQ files.
seeFastq and seeFastqPlot functions generate and plot a series of useful quality statistics for a set of FASTQ files including per cycle quality box plots, base proportions, base-level quality trends, relative k-mer diversity, length and occurrence distribution of reads, number of reads above quality cutoffs and mean quality distribution. The results are written to a PDF file named fastqReport.pdf.
summarizeOverlaps in parallel mode using multiple coressummarizeOverlaps function (Lawrence et al. 2013). The read counting is preformed for exonic gene regions in a non-strand-specific manner while ignoring overlaps among different genes. The raw read count table (countDFeByg.xls) is written to a files in the directory of this project. Parallelization is achieved with the BiocParallel package, here using 8 CPU cores. Return
library("GenomicFeatures"); library(BiocParallel)
args <- systemArgs(sysma="param/hisat2.param", mytargets="targets.txt")
txdb <- makeTxDbFromGFF(file="data/tair10.gff", format="gff",
dataSource="TAIR", organism="Arabidopsis thaliana")
saveDb(txdb, file="./data/tair10.sqlite")
txdb <- loadDb("./data/tair10.sqlite")
(align <- readGAlignments(outpaths(args)[1])) # Demonstrates how to read bam file into R
eByg <- exonsBy(txdb, by=c("gene"))
bfl <- BamFileList(outpaths(args), yieldSize=50000, index=character())
multicoreParam <- MulticoreParam(workers=2); register(multicoreParam)
registered()
counteByg <- bplapply(bfl, function(x) summarizeOverlaps(eByg, x, mode="Union",
ignore.strand=TRUE,
inter.feature=FALSE,
singleEnd=TRUE))
countDFeByg <- sapply(seq(along=counteByg),
function(x) assays(counteByg[[x]])$counts)
rownames(countDFeByg) <- names(rowRanges(counteByg[[1]]))
colnames(countDFeByg) <- names(bfl)
write.table(countDFeByg, "results/countDFeByg.xls", col.names=NA,
quote=FALSE, sep="\t")Kim, Daehwan, Ben Langmead, and Steven L Salzberg. 2015. “HISAT: A Fast Spliced Aligner with Low Memory Requirements.” Nat. Methods 12 (4): 357–60.
Lawrence, Michael, Wolfgang Huber, Hervé Pagès, Patrick Aboyoun, Marc Carlson, Robert Gentleman, Martin T Morgan, and Vincent J Carey. 2013. “Software for Computing and Annotating Genomic Ranges.” PLoS Comput. Biol. 9 (8): e1003118. https://doi.org/10.1371/journal.pcbi.1003118.
Love, Michael, Wolfgang Huber, and Simon Anders. 2014. “Moderated Estimation of Fold Change and Dispersion for RNA-seq Data with DESeq2.” Genome Biol. 15 (12): 550. https://doi.org/10.1186/s13059-014-0550-8.
Morgan, Martin, Hervé Pagès, Valerie Obenchain, and Nathaniel Hayden. 2019. Rsamtools: Binary Alignment (Bam), Fasta, Variant Call (Bcf), and Tabix File Import. http://bioconductor.org/packages/Rsamtools.