variantReport.Rd
Functions for generating tabular variant reports including genomic context
annotations and confidence statistics of variants. The annotations are
obtained with utilities provided by the VariantAnnotation
package and
the variant statistics are retrieved from the input VCF files.
## Variant report
variantReport(files, txdb, fa, organism, out_dir = "results")
## Combine variant reports
combineVarReports(files, filtercol, ncol = 15)
## Create summary statistics of variants
varSummary(files)
named character vector
with paths of the input VCF files.
Annotation data stored as TranscriptDb
object, which can be obtained
from GFF/GTF files, BioMart, Bioc Annotation packages, UCSC, etc.
For details see the vignette of the GenomicFeatures
package.
It is important to use here matching versions of the txdb
and
fa
objects. The latter is the genome sequence used for read mapping
and variant calling.
FaFile
object pointing to the sequence file of the corresponding r
eference genome stored in FASTA format or a BSgenome
instance.
Character vector specifying the organism name of the reference genome.
Named character vector containing in the name field the column titles to
filter on, and in the data field the corresponding values to include in
the report. For instance, the setting filtercol=c(Consequence="nonsynonymous")
will include only nonsysynonymous variances listed in the Consequence
column. To omit the filtering step, one can use the setting
filtercol="All"
.
Integer specifying the number of columns in the tabular input files. Default is set to 15.
Character vector of a results
directory name.
Default is results
.
Tabular output files.
filterVars
## Alignment with BWA (sequentially on single machine)
param <- system.file("extdata", "bwa.param", package="systemPipeR")
targets <- system.file("extdata", "targets.txt", package="systemPipeR")
args <- systemArgs(sysma=param, mytargets=targets)
#> Warning: path[1]="./data/SRR446027_1.fastq.gz": No such file or directory
#> Warning: path[2]="./data/SRR446028_1.fastq.gz": No such file or directory
#> Warning: path[3]="./data/SRR446029_1.fastq.gz": No such file or directory
#> Warning: path[4]="./data/SRR446030_1.fastq.gz": No such file or directory
#> Warning: path[5]="./data/SRR446031_1.fastq.gz": No such file or directory
#> Warning: path[6]="./data/SRR446032_1.fastq.gz": No such file or directory
#> Warning: path[7]="./data/SRR446033_1.fastq.gz": No such file or directory
#> Warning: path[8]="./data/SRR446034_1.fastq.gz": No such file or directory
#> Warning: path[9]="./data/SRR446035_1.fastq.gz": No such file or directory
#> Warning: path[10]="./data/SRR446036_1.fastq.gz": No such file or directory
#> Warning: path[11]="./data/SRR446037_1.fastq.gz": No such file or directory
#> Warning: path[12]="./data/SRR446038_1.fastq.gz": No such file or directory
#> Warning: path[13]="./data/SRR446039_1.fastq.gz": No such file or directory
#> Warning: path[14]="./data/SRR446040_1.fastq.gz": No such file or directory
#> Warning: path[15]="./data/SRR446041_1.fastq.gz": No such file or directory
#> Warning: path[16]="./data/SRR446042_1.fastq.gz": No such file or directory
#> Warning: path[17]="./data/SRR446043_1.fastq.gz": No such file or directory
#> Warning: path[18]="./data/SRR446044_1.fastq.gz": No such file or directory
sysargs(args)[1]
#> M1A
#> "bwa mem -t 4 -M -R '@RG\\tID:group1\\tSM:sample1\\tPL:illumina\\tLB:lib1\\tPU:unit1' /home/runner/work/systemPipeR/systemPipeR/docs/reference/data/tair10.fasta ./data/SRR446027_1.fastq.gz > /home/runner/work/systemPipeR/systemPipeR/docs/reference/results/SRR446027_1.fastq.gz.sam"
if (FALSE) {
library(VariantAnnotation)
system("bwa index -a bwtsw ./data/tair10.fasta")
bampaths <- runCommandline(args=args)
## Alignment with BWA (parallelized on compute cluster)
resources <- list(walltime=120, ntasks=1, ncpus=cores(args), memory=1024)
reg <- clusterRun(args, conffile=".batchtools.conf.R", template="batchtools.slurm.tmpl", Njobs=18, runid="01", resourceList=resources)
## Variant calling with GATK
## The following creates in the inital step a new targets file
## (targets_bam.txt). The first column of this file gives the paths to
## the BAM files created in the alignment step. The new targets file and the
## parameter file gatk.param are used to create a new SYSargs
## instance for running GATK. Since GATK involves many processing steps, it is
## executed by a bash script gatk_run.sh where the user can specify the
## detailed run parameters. All three files are expected to be located in the
## current working directory. Samples files for gatk.param and
## gatk_run.sh are available in the subdirectory ./inst/extdata/ of the
## source file of the systemPipeR package.
writeTargetsout(x=args, file="targets_bam.txt")
system("java -jar CreateSequenceDictionary.jar R=./data/tair10.fasta O=./data/tair10.dict")
# system("java -jar /opt/picard/1.81/CreateSequenceDictionary.jar R=./data/tair10.fasta O=./data/tair10.dict")
args <- systemArgs(sysma="gatk.param", mytargets="targets_bam.txt")
resources <- list(walltime=120, ntasks=1, ncpus=cores(args), memory=1024)
reg <- clusterRun(args, conffile=".batchtools.conf.R", template="batchtools.slurm.tmpl", Njobs=18, runid="01", resourceList=resources)
writeTargetsout(x=args, file="targets_gatk.txt")
## Variant calling with BCFtools
## The following runs the variant calling with BCFtools. This step requires in
## the current working directory the parameter file sambcf.param and the
## bash script sambcf_run.sh.
args <- systemArgs(sysma="sambcf.param", mytargets="targets_bam.txt")
resources <- list(walltime=120, ntasks=1, ncpus=cores(args), memory=1024)
reg <- clusterRun(args, conffile=".batchtools.conf.R", template="batchtools.slurm.tmpl", Njobs=18, runid="01", resourceList=resources)
writeTargetsout(x=args, file="targets_sambcf.txt")
## Filtering of VCF files generated by GATK
args <- systemArgs(sysma="filter_gatk.param", mytargets="targets_gatk.txt")
filter <- "totalDepth(vr) >= 2 & (altDepth(vr) / totalDepth(vr) >= 0.8) & rowSums(softFilterMatrix(vr))==4"
# filter <- "totalDepth(vr) >= 20 & (altDepth(vr) / totalDepth(vr) >= 0.8) & rowSums(softFilterMatrix(vr))==6"
filterVars(args, filter, varcaller="gatk", organism="A. thaliana")
writeTargetsout(x=args, file="targets_gatk_filtered.txt")
## Filtering of VCF files generated by BCFtools
args <- systemArgs(sysma="filter_sambcf.param", mytargets="targets_sambcf.txt")
filter <- "rowSums(vr) >= 2 & (rowSums(vr[,3:4])/rowSums(vr[,1:4]) >= 0.8)"
# filter <- "rowSums(vr) >= 20 & (rowSums(vr[,3:4])/rowSums(vr[,1:4]) >= 0.8)"
filterVars(args, filter, varcaller="bcftools", organism="A. thaliana")
writeTargetsout(x=args, file="targets_sambcf_filtered.txt")
## Annotate filtered variants from GATK
args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_gatk_filtered.txt")
txdb <- loadDb("./data/tair10.sqlite")
fa <- FaFile(systemPipeR::reference(args))
variantReport(args=args, txdb=txdb, fa=fa, organism="A. thaliana")
## Annotate filtered variants from BCFtools
args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_sambcf_filtered.txt")
txdb <- loadDb("./data/tair10.sqlite")
fa <- FaFile(systemPipeR::reference(args))
variantReport(args=args, txdb=txdb, fa=fa, organism="A. thaliana")
## Combine results from GATK
args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_gatk_filtered.txt")
combineDF <- combineVarReports(args, filtercol=c(Consequence="nonsynonymous"))
write.table(combineDF, "./results/combineDF_nonsyn_gatk.xls", quote=FALSE, row.names=FALSE, sep="\t")
## Combine results from BCFtools
args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_sambcf_filtered.txt")
combineDF <- combineVarReports(args, filtercol=c(Consequence="nonsynonymous"))
write.table(combineDF, "./results/combineDF_nonsyn_sambcf.xls", quote=FALSE, row.names=FALSE, sep="\t")
## Summary for GATK
args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_gatk_filtered.txt")
write.table(varSummary(args), "./results/variantStats_gatk.xls", quote=FALSE, col.names = NA, sep="\t")
## Summary for BCFtools
args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_sambcf_filtered.txt")
write.table(varSummary(args), "./results/variantStats_sambcf.xls", quote=FALSE, col.names = NA, sep="\t")
## Venn diagram of variants
args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_gatk_filtered.txt")
varlist <- sapply(names(outpaths(args))[1:4], function(x) as.character(read.delim(outpaths(args)[x])$VARID))
vennset_gatk <- overLapper(varlist, type="vennsets")
args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_sambcf_filtered.txt")
varlist <- sapply(names(outpaths(args))[1:4], function(x) as.character(read.delim(outpaths(args)[x])$VARID))
vennset_bcf <- overLapper(varlist, type="vennsets")
vennPlot(list(vennset_gatk, vennset_bcf), mymain="", mysub="GATK: red; BCFtools: blue", colmode=2, ccol=c("blue", "red"))
}