Here are the examples of the python api bcbio.provenance.do.run taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
199 Examples
3
Example 1
Project: bcbio-nextgen Source File: shared.py
def write_noanalysis_reads(in_file, region_file, out_file, config):
"""Write a BAM file of reads in the specified region file that are not analyzed.
We want to get only reads not in analysis regions but also make use of
the BAM index to perform well on large files. The tricky part is avoiding
command line limits. There is a nice discussion on SeqAnswers:
http://seqanswers.com/forums/showthread.php?t=29538
sambamba supports intersection via an input BED file so avoids command line
length issues.
"""
if not file_exists(out_file):
with file_transaction(config, out_file) as tx_out_file:
bedtools = config_utils.get_program("bedtools", config)
sambamba = config_utils.get_program("sambamba", config)
cl = ("{sambamba} view -f bam -L {region_file} {in_file} | "
"{bedtools} intersect -abam - -b {region_file} -f 1.0 -nonamecheck"
"> {tx_out_file}")
do.run(cl.format(**locals()), "Select unanalyzed reads")
return out_file
3
Example 2
Project: bcbio-nextgen Source File: cufflinks.py
def run(align_file, ref_file, data):
config = data["config"]
cmd = _get_general_options(align_file, config)
cmd.extend(_get_no_assembly_options(ref_file, data))
out_dir = _get_output_dir(align_file, data)
tracking_file = os.path.join(out_dir, "genes.fpkm_tracking")
fpkm_file = os.path.join(out_dir, data['rgnames']['sample']) + ".fpkm"
tracking_file_isoform = os.path.join(out_dir, "isoforms.fpkm_tracking")
fpkm_file_isoform = os.path.join(out_dir, data['rgnames']['sample']) + ".isoform.fpkm"
if not file_exists(fpkm_file):
with file_transaction(data, out_dir) as tmp_out_dir:
safe_makedir(tmp_out_dir)
cmd.extend(["--output-dir", tmp_out_dir])
cmd.extend([align_file])
cmd = map(str, cmd)
do.run(cmd, "Cufflinks on %s." % (align_file))
fpkm_file = gene_tracking_to_fpkm(tracking_file, fpkm_file)
fpkm_file_isoform = gene_tracking_to_fpkm(tracking_file_isoform, fpkm_file_isoform)
return out_dir, fpkm_file, fpkm_file_isoform
3
Example 3
Project: bcbio-nextgen Source File: alignprep.py
@utils.map_wrap
@zeromq_aware_logging
def _grabix_index(data):
in_file = data["bgzip_file"]
config = data["config"]
grabix = config_utils.get_program("grabix", config)
gbi_file = in_file + ".gbi"
if tz.get_in(["algorithm", "align_split_size"], config) is not False:
if not utils.file_exists(gbi_file) or _is_partial_index(gbi_file):
do.run([grabix, "index", in_file], "Index input with grabix: %s" % os.path.basename(in_file))
return [gbi_file]
3
Example 4
Project: bcbio-nextgen Source File: fastq.py
def _bzip_gzip(in_file):
"""
convert from bz2 to gz
"""
if not utils.is_bzipped(in_file):
return in_file
base, first_ext = os.path.splitext(in_file)
gzipped_file = base + ".gz"
if (fastq.is_fastq(base) and
not objectstore.is_remote(in_file)):
if file_exists(gzipped_file):
return gzipped_file
message = "gzipping {in_file}.".format(in_file=in_file)
with file_transaction(gzipped_file) as tx_gzipped_file:
do.run("bunzip2 -c {in_file} | gzip > {tx_gzipped_file}".format(**locals()), message)
return gzipped_file
return in_file
3
Example 5
Project: bcbio-nextgen Source File: hisat2.py
def create_splicesites_file(gtf_file, align_dir, data):
"""
if not pre-created, make a splicesites file to use with hisat2
"""
out_file = os.path.join(align_dir, "ref-transcripts-splicesites.txt")
if file_exists(out_file):
return out_file
safe_makedir(align_dir)
hisat2_ss = config_utils.get_program("hisat2_extract_splice_sites.py", data)
cmd = "{hisat2_ss} {gtf_file} > {tx_out_file}"
message = "Creating hisat2 splicesites file from %s." % gtf_file
with file_transaction(out_file) as tx_out_file:
do.run(cmd.format(**locals()), message)
return out_file
3
Example 6
def _prep_inputs(vrn_info, cnv_info, somatic_info, work_dir, config):
"""Prepare inputs for running PhyloWGS from variant and CNV calls.
"""
exe = os.path.join(os.path.dirname(sys.executable), "create_phylowgs_inputs.py")
assert os.path.exists(exe), "Could not find input prep script for PhyloWGS runs."
ssm_file = os.path.join(work_dir, "ssm_data.txt")
cnv_file = os.path.join(work_dir, "cnv_data.txt")
if not utils.file_exists(ssm_file) or not utils.file_exists(cnv_file):
with file_transaction(somatic_info.tumor_data, ssm_file, cnv_file) as (tx_ssm_file, tx_cnv_file):
variant_type, input_vcf_file = _prep_vrn_file(vrn_info["vrn_file"], vrn_info["variantcaller"],
work_dir, somatic_info, cnv_info["ignore"], config)
input_cnv_file = _prep_cnv_file(cnv_info["subclones"], work_dir, somatic_info)
cmd = [sys.executable, exe,
"--sample-size", str(config["sample_size"]), "--tumor-sample", somatic_info.tumor_name,
"--battenberg", input_cnv_file, "--cellularity", _read_contam(cnv_info["contamination"]),
"--output-cnvs", tx_cnv_file, "--output-variants", tx_ssm_file,
"--variant-type", variant_type, input_vcf_file]
do.run(cmd, "Prepare PhyloWGS inputs.")
return ssm_file, cnv_file
3
Example 7
Project: bcbio-nextgen Source File: samtools.py
def run(bam_file, data, out_dir):
"""Run samtools stats with reports on mapped reads, duplicates and insert sizes.
"""
stats_file = os.path.join(out_dir, "%s.txt" % dd.get_sample_name(data))
if not utils.file_exists(stats_file):
utils.safe_makedir(out_dir)
samtools = config_utils.get_program("samtools", data["config"])
with file_transaction(data, stats_file) as tx_out_file:
cmd = "{samtools} stats {bam_file}"
cmd += " > {tx_out_file}"
do.run(cmd.format(**locals()), "samtools stats", data)
out = _parse_samtools_stats(stats_file)
return out
3
Example 8
Project: bcbio-nextgen Source File: __init__.py
def filter_primary(bam_file, data):
stem, ext = os.path.splitext(bam_file)
out_file = stem + ".primary" + ext
if utils.file_exists(out_file):
return out_file
with file_transaction(out_file) as tx_out_file:
cmd = filter_primary_stream_cmd(bam_file, data)
cmd += "> {tx_out_file}"
do.run(cmd.format(**locals()), ("Filtering primary alignments in %s." %
os.path.basename(bam_file)))
return out_file
3
Example 9
Project: bcbio-nextgen Source File: kallisto.py
def kallisto_index(gtf_file, ref_file, data, out_dir):
out_dir = os.path.join(out_dir, "index")
out_stem = dd.get_genome_build(data)
if dd.get_disambiguate(data):
out_fname = "-".join([out_fname] + dd.get_disambguate(data))
index_dir = os.path.join(out_dir, out_stem)
kallisto = config_utils.get_program("kallisto", dd.get_config(data))
if dd.get_transcriptome_fasta(data):
gtf_fa = dd.get_transcriptome_fasta(data)
else:
gtf_fa = sailfish.create_combined_fasta(data, index_dir)
out_file = os.path.join(index_dir, out_stem + ".idx")
if file_exists(out_file):
return out_file
with file_transaction(out_file) as tx_out_file:
cmd = "{kallisto} index -k 31 -i {tx_out_file} {gtf_fa}"
message = "Creating Kallisto index for {gtf_fa}."
do.run(cmd.format(**locals()), message.format(**locals()), None)
return out_file
3
Example 10
Project: bcbio-nextgen Source File: __init__.py
def run_gatk(self, params, tmp_dir=None, log_error=True,
data=None, region=None, memscale=None):
with tx_tmpdir(self._config) as local_tmp_dir:
if tmp_dir is None:
tmp_dir = local_tmp_dir
cl = self.cl_gatk(params, tmp_dir, memscale=memscale)
atype_index = params.index("-T") if params.count("-T") > 0 \
else params.index("--analysis_type")
prog = params[atype_index + 1]
do.run(cl, "GATK: {0}".format(prog), data, region=region,
log_error=log_error)
3
Example 11
Project: bcbio-nextgen Source File: cleanbam.py
def fixrg(in_bam, names, ref_file, dirs, data):
"""Fix read group in a file, using samtools addreplacerg.
addreplacerg does not remove the old read group, causing confusion when
checking. We use reheader to work around this
"""
work_dir = utils.safe_makedir(os.path.join(dirs["work"], "bamclean", dd.get_sample_name(data)))
out_file = os.path.join(work_dir, "%s-fixrg.bam" % utils.splitext_plus(os.path.basename(in_bam))[0])
if not utils.file_uptodate(out_file, in_bam):
with file_transaction(data, out_file) as tx_out_file:
rg_info = novoalign.get_rg_info(names)
new_header = "%s-header.txt" % os.path.splitext(out_file)[0]
do.run("samtools view -H {in_bam} | grep -v ^@RG > {new_header}".format(**locals()),
"Create empty RG header: %s" % dd.get_sample_name(data))
cmd = ("samtools reheader {new_header} {in_bam} | "
"samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} -")
do.run(cmd.format(**locals()), "Fix read groups: %s" % dd.get_sample_name(data))
return out_file
3
Example 12
Project: bcbio-nextgen Source File: postalign.py
def dedup_bam(in_bam, data):
"""Perform non-stream based deduplication of BAM input files using biobambam.
"""
if _check_dedup(data):
out_file = "%s-dedup%s" % utils.splitext_plus(in_bam)
if not utils.file_exists(out_file):
with tx_tmpdir(data) as tmpdir:
with file_transaction(data, out_file) as tx_out_file:
bammarkduplicates = config_utils.get_program("bammarkduplicates", data["config"])
base_tmp = os.path.join(tmpdir, os.path.splitext(os.path.basename(tx_out_file))[0])
cores, mem = _get_cores_memory(data, downscale=2)
cmd = ("{bammarkduplicates} tmpfile={base_tmp}-markdup "
"markthreads={cores} I={in_bam} O={tx_out_file}")
do.run(cmd.format(**locals()), "De-duplication with biobambam")
bam.index(out_file, data["config"])
return out_file
else:
return in_bam
3
Example 13
Project: bcbio-nextgen Source File: bubbletree.py
def _create_subset_file(in_file, het_region_bed, work_dir, data):
"""Subset the VCF to a set of pre-calculated smaller regions.
"""
cnv_regions = shared.get_base_cnv_regions(data, work_dir)
region_bed = bedutils.intersect_two(het_region_bed, cnv_regions, work_dir, data)
out_file = os.path.join(work_dir, "%s-origsubset.bcf" % utils.splitext_plus(os.path.basename(in_file))[0])
if not utils.file_uptodate(out_file, in_file):
with file_transaction(data, out_file) as tx_out_file:
regions = ("-R %s" % region_bed) if utils.file_exists(region_bed) else ""
cmd = "bcftools view {regions} -o {tx_out_file} -O b {in_file}"
do.run(cmd.format(**locals()), "Extract regions for BubbleTree frequency determination")
return out_file
3
Example 14
Project: bcbio-nextgen Source File: merge.py
def _create_merge_filelist(bam_files, base_file, config):
"""Create list of input files for merge, ensuring all files are valid.
"""
bam_file_list = "%s.list" % os.path.splitext(base_file)[0]
samtools = config_utils.get_program("samtools", config)
with open(bam_file_list, "w") as out_handle:
for f in sorted(bam_files):
do.run('{} quickcheck -v {}'.format(samtools, f),
"Ensure integrity of input merge BAM files")
out_handle.write("%s\n" % f)
return bam_file_list
3
Example 15
Project: bcbio-nextgen Source File: __init__.py
def bam_to_sam(in_file, config):
if is_sam(in_file):
return in_file
assert is_bam(in_file), "%s is not a BAM file" % in_file
out_file = os.path.splitext(in_file)[0] + ".sam"
if utils.file_exists(out_file):
return out_file
samtools = config_utils.get_program("samtools", config)
num_cores = config["algorithm"].get("num_cores", 1)
with file_transaction(config, out_file) as tx_out_file:
cmd = "{samtools} view -@ {num_cores} -h {in_file} -o {tx_out_file}"
do.run(cmd.format(**locals()),
("Convert BAM to SAM (%s cores): %s to %s"
% (str(num_cores), in_file, out_file)))
return out_file
3
Example 16
Project: bcbio-nextgen Source File: sra.py
def _download_sra(sraid, outdir):
url = _create_link(sraid)
cmd = "wget -O {out_file} {url}"
out_file = os.path.join(outdir, "%s.sra" % sraid)
if not utils.file_exists(out_file):
do.run(cmd, "Download %s" % sraid)
return out_file
3
Example 17
Project: bcbio-nextgen Source File: bwakit.py
def run(data):
"""HLA typing with bwakit, parsing output from called genotype files.
"""
bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
hla_fqs = tz.get_in(["hla", "fastq"], data, [])
if len(hla_fqs) > 0:
hla_base = os.path.commonprefix(hla_fqs)
while hla_base.endswith("."):
hla_base = hla_base[:-1]
out_file = hla_base + ".top"
if not utils.file_exists(out_file):
cmd = "{bwakit_dir}/run-HLA {hla_base}"
do.run(cmd.format(**locals()), "HLA typing with bwakit")
out_file = _organize_calls(out_file, hla_base, data)
data["hla"].update({"call_file": out_file,
"hlacaller": "bwakit"})
return data
3
Example 18
Project: bcbio-nextgen Source File: cpat.py
def make_logit_model(coding_fasta, noncoding_fasta, hexamers, data, out_dir=None):
safe_makedir(out_dir)
out_prefix = os.path.join(out_dir, "logit")
out_file = out_prefix + ".logit.RData"
if file_exists(out_file):
return out_file
tx_prefix = tempfile.NamedTemporaryFile(delete=False).name
tx_out_file = tx_prefix + ".logit.RData"
logit_cmd = config_utils.get_program("make_logitModel.py", data)
r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
cmd = ("{r_setup}{logit_cmd} --cgene={coding_fasta} --ngene={noncoding_fasta} "
"--hex={hexamers} --outfile={tx_prefix}")
message = "Building coding/noncoding logistical model."
do.run(cmd.format(**locals()), message)
shutil.move(tx_out_file, out_file)
return out_file
3
Example 19
Project: bcbio-nextgen Source File: ref.py
def fasta_idx(in_file, config=None):
"""Retrieve samtools style fasta index.
"""
fasta_index = in_file + ".fai"
if not utils.file_exists(fasta_index):
samtools = config_utils.get_program("samtools", config) if config else "samtools"
cmd = "{samtools} faidx {in_file}"
do.run(cmd.format(**locals()), "samtools faidx")
return fasta_index
3
Example 20
Project: bcbio-nextgen Source File: kallisto.py
def kallisto_singlecell(fq1, kallisto_dir, gtf_file, fasta_file, data):
samplename = dd.get_sample_name(data)
quant_dir = os.path.join(kallisto_dir, "quant")
safe_makedir(kallisto_dir)
num_cores = dd.get_num_cores(data)
strandedness = dd.get_strandedness(data).lower()
kallisto = config_utils.get_program("kallisto", dd.get_config(data))
# unsure how to estimate from single end data, so go with a reasonable default
frag_length = 250
batch_file = umi.convert_to_kallisto(data)
index = kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
cmd = ("{kallisto} pseudo --umi "
"-t {num_cores} -o {tx_out_dir} -b {batch_file} -i {index}")
with chdir(os.path.dirname(batch_file)):
with file_transaction(data, quant_dir) as tx_out_dir:
message = ("Quantifying transcripts with Kallisto.")
do.run(cmd.format(**locals()), message, None)
kallisto_table(kallisto_dir, index)
return quant_dir
3
Example 21
Project: bcbio-nextgen Source File: bwa.py
def _run_bwa_align(fastq_file, ref_file, out_file, config):
aln_cl = [config_utils.get_program("bwa", config), "aln",
"-n 2", "-k 2"]
aln_cl += _bwa_args_from_config(config)
aln_cl += [ref_file, fastq_file]
cmd = "{cl} > {out_file}".format(cl=" ".join(aln_cl), out_file=out_file)
do.run(cmd, "bwa aln: {f}".format(f=os.path.basename(fastq_file)), None)
3
Example 22
def index_transcriptome(gtf_file, ref_file, data):
"""
use a GTF file and a reference FASTA file to index the transcriptome
"""
gtf_fasta = gtf.gtf_to_fasta(gtf_file, ref_file)
bwa = config_utils.get_program("bwa", data["config"])
cmd = "{bwa} index {gtf_fasta}".format(**locals())
message = "Creating transcriptome index of %s with bwa." % (gtf_fasta)
do.run(cmd, message)
return gtf_fasta
3
Example 23
Project: bcbio-nextgen Source File: sambamba.py
def index(data, bam_fpath):
cmdl = make_command(data, "index", bam_fpath)
indexed_bam = bam_fpath + ".bai"
if not utils.file_uptodate(indexed_bam, bam_fpath):
do.run(cmdl, "Indexing BAM file using sambamba")
if not utils.file_exists(indexed_bam):
logger.error("Cannot index BAM file " + bam_fpath + " using sambamba.")
return None
return indexed_bam
3
Example 24
Project: bcbio-nextgen Source File: postalign.py
def umi_consensus(data):
"""Convert UMI grouped reads into fastq pair for re-alignment.
"""
align_bam = dd.get_work_bam(data)
f1_out = "%s-cuemi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
f2_out = "%s-cuemi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
if not utils.file_uptodate(f1_out, align_bam):
with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
cmd = ("fgbio {jvm_opts} GroupReadsByUmi -m 1 -e 1 -s adjacency -i {align_bam} | "
"fgbio {jvm_opts} CallMolecularConsensusReads -S queryname -i /dev/stdin -o /dev/stdout | "
"bamtofastq F={tx_f1_out} F2={tx_f2_out} gz=1")
do.run(cmd.format(**locals()), "UMI consensus fastq generation")
return f1_out, f2_out
3
Example 25
Project: bcbio-nextgen Source File: __init__.py
def run_mutect(self, params, tmp_dir=None):
with tx_tmpdir(self._config) as local_tmp_dir:
if tmp_dir is None:
tmp_dir = local_tmp_dir
cl = self.cl_mutect(params, tmp_dir)
prog = "MuTect"
do.run(cl, "MuTect: {0}".format(prog), None)
3
Example 26
Project: bcbio-nextgen Source File: tophat.py
def _fix_mates(orig_file, out_file, ref_file, config):
"""Fix problematic unmapped mate pairs in TopHat output.
TopHat 2.0.9 appears to have issues with secondary reads:
https://groups.google.com/forum/#!topic/tuxedo-tools-users/puLfDNbN9bo
This cleans the input file to only keep properly mapped pairs,
providing a general fix that will handle correctly mapped secondary
reads as well.
"""
if not file_exists(out_file):
with file_transaction(config, out_file) as tx_out_file:
samtools = config_utils.get_program("samtools", config)
cmd = "{samtools} view -bS -h -t {ref_file}.fai -F 8 {orig_file} > {tx_out_file}"
do.run(cmd.format(**locals()), "Fix mate pairs in TopHat output", {})
return out_file
3
Example 27
Project: bcbio-nextgen Source File: rapmap.py
def rapmap_index(gtf_file, ref_file, data, out_dir):
out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
if dd.get_disambiguate(data):
out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
rapmap = config_utils.get_program("rapmap", data["config"])
gtf_fa = create_combined_fasta(data, out_dir)
if file_exists(out_dir + "rapidx.jfhash"):
return out_dir
with file_transaction(out_dir) as tx_out_dir:
cmd = "{rapmap} pseudoindex -i {tx_out_dir} -t {gtf_fa}"
message = "Creating RapMap pseudoindex for {gtf_fa}."
do.run(cmd.format(**locals()), message.format(**locals()), None)
return out_dir
3
Example 28
Project: bcbio-nextgen Source File: peaks.py
def _prepare_bam(bam_file, bed_file, config):
if not bam_file or not bed_file:
return bam_file
out_file = utils.append_stem(bam_file, '_filter')
samtools = config_utils.get_program("samtools", config)
if not utils.file_exists(out_file):
with file_transaction(out_file) as tx_out:
cmd = "{samtools} view -bh -L {bed_file} {bam_file} > {tx_out}"
do.run(cmd.format(**locals()), "Clean %s" % bam_file)
return out_file
3
Example 29
Project: bcbio-nextgen Source File: fastq.py
def _gzip_fastq(in_file):
"""
gzip a fastq file if it is not already gzipped, handling conversion
from bzip to gzipped files
"""
if fastq.is_fastq(in_file) and not objectstore.is_remote(in_file):
if utils.is_bzipped(in_file):
return _bzip_gzip(in_file)
elif not utils.is_gzipped(in_file):
gzipped_file = in_file + ".gz"
if file_exists(gzipped_file):
return gzipped_file
message = "gzipping {in_file}.".format(in_file=in_file)
with file_transaction(gzipped_file) as tx_gzipped_file:
do.run("gzip -c {in_file} > {tx_gzipped_file}".format(**locals()),
message)
return gzipped_file
return in_file
3
Example 30
def sam_to_bam(in_sam, config):
if is_bam(in_sam):
return in_sam
assert is_sam(in_sam), "%s is not a SAM file" % in_sam
out_file = os.path.splitext(in_sam)[0] + ".bam"
if utils.file_exists(out_file):
return out_file
samtools = config_utils.get_program("samtools", config)
num_cores = config["algorithm"].get("num_cores", 1)
with file_transaction(config, out_file) as tx_out_file:
cmd = "{samtools} view -@ {num_cores} -h -S -b {in_sam} -o {tx_out_file}"
do.run(cmd.format(**locals()),
("Convert SAM to BAM (%s cores): %s to %s"
% (str(num_cores), in_sam, out_file)))
return out_file
3
Example 31
Project: bcbio-nextgen Source File: fastq.py
def _merge_list_fastqs(files, out_file, config):
"""merge list of fastq files into one"""
if not all(map(fastq.is_fastq, files)):
raise ValueError("Not all of the files to merge are fastq files: %s " % (files))
assert all(map(utils.file_exists, files)), ("Not all of the files to merge "
"exist: %s" % (files))
if not file_exists(out_file):
files = [_gzip_fastq(fn) for fn in files]
if len(files) == 1:
os.symlink(files[0], out_file)
return out_file
with file_transaction(out_file) as file_txt_out:
files_str = " ".join(list(files))
cmd = "cat {files_str} > {file_txt_out}".format(**locals())
do.run(cmd, "merge fastq files %s" % files)
return out_file
3
Example 32
Project: bcbio-nextgen Source File: phylowgs.py
def _run_evolve(ssm_file, cnv_file, work_dir, data):
"""Run evolve.py to infer subclonal composition.
"""
exe = os.path.join(os.path.dirname(sys.executable), "evolve.py")
assert os.path.exists(exe), "Could not find evolve script for PhyloWGS runs."
out_dir = os.path.join(work_dir, "evolve")
out_file = os.path.join(out_dir, "top_k_trees")
if not utils.file_uptodate(out_file, cnv_file):
with file_transaction(data, out_dir) as tx_out_dir:
with utils.chdir(tx_out_dir):
cmd = [sys.executable, exe, "-r", "42", ssm_file, cnv_file]
do.run(cmd, "Run PhyloWGS evolution")
return out_file
3
Example 33
Project: bcbio-nextgen Source File: shared.py
def write_nochr_reads(in_file, out_file, config):
"""Write a BAM file of reads that are not mapped on a reference chromosome.
This is useful for maintaining non-mapped reads in parallel processes
that split processing by chromosome.
"""
if not file_exists(out_file):
with file_transaction(config, out_file) as tx_out_file:
samtools = config_utils.get_program("samtools", config)
cmd = "{samtools} view -b -f 4 {in_file} > {tx_out_file}"
do.run(cmd.format(**locals()), "Select unmapped reads")
return out_file
3
Example 34
Project: bcbio-nextgen Source File: fastq.py
@utils.memoize_outfile(stem=".groom")
def groom(in_file, data, in_qual="illumina", out_dir=None, out_file=None):
"""
Grooms a FASTQ file from Illumina 1.3/1.5 quality scores into
sanger format, if it is not already in that format.
"""
seqtk = config_utils.get_program("seqtk", data["config"])
if in_qual == "fastq-sanger":
logger.info("%s is already in Sanger format." % in_file)
return out_file
with file_transaction(out_file) as tmp_out_file:
cmd = "{seqtk} seq -Q64 {in_file} | gzip > {tmp_out_file}".format(**locals())
do.run(cmd, "Converting %s to Sanger format." % in_file)
return out_file
3
Example 35
Project: bcbio-nextgen Source File: sra.py
def _download_srx(srxid, url, out_dir):
cmd = "wget -N -r -nH -nd -np -nv {0}".format(url)
out_dir = os.path.abspath(utils.safe_makedir(out_dir))
with utils.chdir(out_dir):
do.run(cmd, "Download %s" % url )
# return [os.path.abspath(fn) for fn in glob.glob("*sra")]
return [os.path.join(out_dir, fn) for fn in os.listdir(out_dir)]
3
Example 36
def _select_model(n2_bounds, n2_result, n3_result, out_dir, data):
"""Run final model selection from n=2 and n=3 options.
"""
n2_out_file = n2_result.replace(".n2.results", ".BEST.results")
n3_out_file = n3_result.replace(".n3.results", ".BEST.results")
if not utils.file_exists(n2_out_file) and not utils.file_exists(n3_out_file):
cmd = _get_cmd("ModelSelection.py") + [n2_bounds, n2_result, n3_result]
do.run(cmd, "Select best THetA model")
if utils.file_exists(n2_out_file):
return n2_out_file
else:
assert utils.file_exists(n3_out_file)
return n3_out_file
3
Example 37
Project: bcbio-nextgen Source File: sra.py
def _convert_fastq(srafn, outdir, single=False):
"convert sra to fastq"
cmd = "fastq-dump --split-files --gzip {srafn}"
sraid = os.path.basename(utils.splitext_plus(srafn)[0])
if not single:
out_file = [os.path.join(outdir, "%s_1.fastq.gz" % sraid),
os.path.join(outdir, "%s_2.fastq.gz" % sraid)]
if not utils.file_exists(out_file[0]):
with utils.chdir(outdir):
do.run(cmd.format(**locals()), "Covert to fastq %s" % sraid)
if not utils.file_exists(out_file[0]):
raise IOError("SRA %s didn't convert, something happened." % srafn)
return [out for out in out_file if utils.file_exists(out)]
else:
raise ValueError("Not supported single-end sra samples for now.")
3
Example 38
Project: bcbio-nextgen Source File: __init__.py
def reheader(header, bam_file, config):
samtools = config_utils.get_program("samtools", config)
base, ext = os.path.splitext(bam_file)
out_file = base + ".reheadered" + ext
cmd = "{samtools} reheader {header} {bam_file} > {out_file}"
do.run(cmd.format(**locals()), "Reheadering %s." % bam_file)
return out_file
3
Example 39
Project: bcbio-nextgen Source File: cpat.py
def cpat(assembled_fasta, hexamer, logit, data, out_file=None):
if out_file and file_exists(out_file):
return out_file
if not out_file:
out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".cpat").name
cpat_cmd = config_utils.get_program("cpat.py", data)
r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
cmd = ("{r_setup}{cpat_cmd} --gene={assembled_fasta} --hex={hexamer} "
"--logitModel={logit} --outfile={tx_out_file}")
message = "Predicing coding potential of %s." % (assembled_fasta)
with file_transaction(out_file) as tx_out_file:
do.run(cmd.format(**locals()), message)
return out_file
3
Example 40
Project: bcbio-nextgen Source File: alignprep.py
def _bgzip_from_cram_sambamba(cram_file, dirs, data):
"""Use sambamba to extract from CRAM via regions.
"""
raise NotImplementedError("sambamba doesn't yet support retrieval from CRAM by BED file")
region_file = (tz.get_in(["config", "algorithm", "variant_regions"], data)
if tz.get_in(["config", "algorithm", "coverage_interval"], data) in ["regional", "exome"]
else None)
base_name = utils.splitext_plus(os.path.basename(cram_file))[0]
work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep",
"%s-parts" % base_name))
f1, f2, o1, o2, si = [os.path.join(work_dir, "%s.fq" % x) for x in ["match1", "match2", "unmatch1", "unmatch2",
"single"]]
ref_file = dd.get_ref_file(data)
region = "-L %s" % region_file if region_file else ""
cmd = ("sambamba view -f bam -l 0 -C {cram_file} -T {ref_file} {region} | "
"bamtofastq F={f1} F2={f2} S={si} O={o1} O2={o2}")
do.run(cmd.format(**locals()), "Convert CRAM to fastq in regions")
3
Example 41
Project: bcbio-nextgen Source File: cpat.py
def hexamer_table(cds_fasta, noncoding_fasta, data, out_file=None):
if out_file and file_exists(out_file):
return out_file
if not out_file:
out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".hexamers").name
hex_cmd = config_utils.get_program("make_hexamer_tab.py", data)
cmd = ("{hex_cmd} --cod={cds_fasta} --noncod={noncoding_fasta} "
"> {tx_out_file}")
with file_transaction(out_file) as tx_out_file:
message = ("Calculating hexamer content in %s and %s."
% (cds_fasta, noncoding_fasta))
do.run(cmd.format(**locals()), message)
return out_file
3
Example 42
Project: bipy Source File: htseq_count.py
def run(input_file, gtf_file, out_file=None):
if out_file is None:
out_file = _get_outfilename(input_file)
safe_makedir(os.path.dirname(out_file))
if file_exists(out_file):
return out_file
with file_transaction(out_file) as tmp_out_file:
htseq_cmd = ("htseq-count --mode=union --stranded=no --type=exon "
"--idattr=gene_id {input_file} {gtf_file} > {tmp_out_file}")
cmd = htseq_cmd.format(**locals())
do.run(cmd, "Running htseq-count on %s." % (input_file), None)
return out_file
3
Example 43
Project: bcbio-nextgen Source File: cufflinks.py
def assemble(bam_file, ref_file, num_cores, out_dir, data):
out_dir = os.path.join(out_dir, data["rgnames"]["sample"])
safe_makedir(out_dir)
out_file = os.path.join(out_dir, "cufflinks-assembly.gtf")
cufflinks_out_file = os.path.join(out_dir, "transcripts.gtf")
library_type = " ".join(_get_stranded_flag(data["config"]))
if file_exists(out_file):
return out_file
with file_transaction(data, out_dir) as tmp_out_dir:
cmd = ("cufflinks --output-dir {tmp_out_dir} --num-threads {num_cores} "
"--frag-bias-correct {ref_file} "
"{library_type} --multi-read-correct --upper-quartile-norm {bam_file}")
cmd = cmd.format(**locals())
do.run(cmd, "Assembling transcripts with Cufflinks using %s." % bam_file)
shutil.move(cufflinks_out_file, out_file)
return out_file
3
Example 44
Project: bcbio-nextgen Source File: bowtie2.py
def index_transcriptome(gtf_file, ref_file, data):
"""
use a GTF file and a reference FASTA file to index the transcriptome
"""
gtf_fasta = gtf.gtf_to_fasta(gtf_file, ref_file)
bowtie2_index = os.path.splitext(gtf_fasta)[0]
bowtie2_build = config_utils.get_program("bowtie2", data["config"]) + "-build"
cmd = "{bowtie2_build} --offrate 1 {gtf_fasta} {bowtie2_index}".format(**locals())
message = "Creating transcriptome index of %s with bowtie2." % (gtf_fasta)
do.run(cmd, message)
return bowtie2_index
3
Example 45
def run(self, command, options, pipe=False, get_stdout=False, memscale=None):
"""Run a Picard command with the provided option pairs.
"""
cl = self.cl_picard(command, options, memscale=memscale)
if pipe:
subprocess.Popen(cl)
elif get_stdout:
p = subprocess.Popen(cl, stdout=subprocess.PIPE)
stdout = p.stdout.read()
p.wait()
p.stdout.close()
return stdout
else:
do.run(cl, "Picard {0}".format(command), None)
3
Example 46
def run(input_file, fastqc_config, config):
outfile = _make_outfile(input_file, config)
# if it is already done skip it
if os.path.exists(outfile):
return outfile
cmd = _build_command(input_file, fastqc_config, config)
do.run(cmd, "Running FastQC on %s" % (input_file), None)
return outfile
3
Example 47
Project: bcbio-nextgen Source File: cram.py
def index(in_cram, config):
"""Ensure CRAM file has a .crai index file.
"""
out_file = in_cram + ".crai"
if not utils.file_uptodate(out_file, in_cram):
with file_transaction(config, in_cram + ".crai") as tx_out_file:
tx_in_file = os.path.splitext(tx_out_file)[0]
utils.symlink_plus(in_cram, tx_in_file)
cmd = "samtools index {tx_in_file}"
do.run(cmd.format(**locals()), "Index CRAM file")
return out_file
3
Example 48
def mapped(in_bam, config):
"""
return a bam file of only the mapped reads
"""
out_file = os.path.splitext(in_bam)[0] + ".mapped.bam"
if utils.file_exists(out_file):
return out_file
sambamba = _get_sambamba(config)
with file_transaction(config, out_file) as tx_out_file:
if sambamba:
cmd = ("{sambamba} view --format=bam -F 'not (unmapped or mate_is_unmapped)' "
"{in_bam} -o {tx_out_file}")
else:
samtools = config_utils.get_program("samtools", config)
cmd = "{samtools} view -b -F 4 {in_bam} -o {tx_out_file}"
do.run(cmd.format(**locals()),
"Filtering mapped reads to %s." % (tx_out_file))
return out_file
3
Example 49
def run(self, subcmd, opts, memscale=None):
jvm_opts = get_picard_opts(self._config, memscale=memscale)
Rpath = os.path.dirname(utils.Rscript_cmd())
cmd = ["unset", "JAVA_HOME", "&&", "export", "PATH=%s:$PATH" % Rpath, "&&"] + \
[self._cmd] + jvm_opts + [subcmd] + ["%s=%s" % (x, y) for x, y in opts] + \
["VALIDATION_STRINGENCY=SILENT"]
do.run(" ".join(cmd), "Picard: %s" % subcmd)
3
Example 50
def index(ref_file, out_dir, data):
"""Create a STAR index in the defined reference directory.
"""
(ref_dir, local_file) = os.path.split(ref_file)
gtf_file = dd.get_gtf_file(data)
if not utils.file_exists(gtf_file):
raise ValueError("%s not found, could not create a star index." % (gtf_file))
if not utils.file_exists(out_dir):
with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
num_cores = dd.get_cores(data)
cmd = ("STAR --genomeDir {tx_out_dir} --genomeFastaFiles {ref_file} "
"--runThreadN {num_cores} "
"--runMode genomeGenerate --sjdbOverhang 99 --sjdbGTFfile {gtf_file}")
do.run(cmd.format(**locals()), "Index STAR")
if os.path.exists(out_dir):
shutil.rmtree(out_dir)
shutil.move(tx_out_dir, out_dir)
return out_dir