bcbio.provenance.do.run

Here are the examples of the python api bcbio.provenance.do.run taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.

199 Examples 7

Example 1

Project: bcbio-nextgen Source File: shared.py
def write_noanalysis_reads(in_file, region_file, out_file, config):
    """Write a BAM file of reads in the specified region file that are not analyzed.

    We want to get only reads not in analysis regions but also make use of
    the BAM index to perform well on large files. The tricky part is avoiding
    command line limits. There is a nice discussion on SeqAnswers:
    http://seqanswers.com/forums/showthread.php?t=29538
    sambamba supports intersection via an input BED file so avoids command line
    length issues.
    """
    if not file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bedtools = config_utils.get_program("bedtools", config)
            sambamba = config_utils.get_program("sambamba", config)
            cl = ("{sambamba} view -f bam -L {region_file} {in_file} | "
                  "{bedtools} intersect -abam - -b {region_file} -f 1.0 -nonamecheck"
                  "> {tx_out_file}")
            do.run(cl.format(**locals()), "Select unanalyzed reads")
    return out_file

Example 2

Project: bcbio-nextgen Source File: cufflinks.py
def run(align_file, ref_file, data):
    config = data["config"]
    cmd = _get_general_options(align_file, config)
    cmd.extend(_get_no_assembly_options(ref_file, data))
    out_dir = _get_output_dir(align_file, data)
    tracking_file = os.path.join(out_dir, "genes.fpkm_tracking")
    fpkm_file = os.path.join(out_dir, data['rgnames']['sample']) + ".fpkm"
    tracking_file_isoform = os.path.join(out_dir, "isoforms.fpkm_tracking")
    fpkm_file_isoform = os.path.join(out_dir, data['rgnames']['sample']) + ".isoform.fpkm"
    if not file_exists(fpkm_file):
        with file_transaction(data, out_dir) as tmp_out_dir:
            safe_makedir(tmp_out_dir)
            cmd.extend(["--output-dir", tmp_out_dir])
            cmd.extend([align_file])
            cmd = map(str, cmd)
            do.run(cmd, "Cufflinks on %s." % (align_file))
        fpkm_file = gene_tracking_to_fpkm(tracking_file, fpkm_file)
        fpkm_file_isoform = gene_tracking_to_fpkm(tracking_file_isoform, fpkm_file_isoform)
    return out_dir, fpkm_file, fpkm_file_isoform

Example 3

Project: bcbio-nextgen Source File: alignprep.py
@utils.map_wrap
@zeromq_aware_logging
def _grabix_index(data):
    in_file = data["bgzip_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    gbi_file = in_file + ".gbi"
    if tz.get_in(["algorithm", "align_split_size"], config) is not False:
        if not utils.file_exists(gbi_file) or _is_partial_index(gbi_file):
            do.run([grabix, "index", in_file], "Index input with grabix: %s" % os.path.basename(in_file))
    return [gbi_file]

Example 4

Project: bcbio-nextgen Source File: fastq.py
def _bzip_gzip(in_file):
    """
    convert from bz2 to gz
    """
    if not utils.is_bzipped(in_file):
        return in_file
    base, first_ext = os.path.splitext(in_file)
    gzipped_file = base + ".gz"
    if (fastq.is_fastq(base) and
        not objectstore.is_remote(in_file)):

        if file_exists(gzipped_file):
            return gzipped_file
        message = "gzipping {in_file}.".format(in_file=in_file)
        with file_transaction(gzipped_file) as tx_gzipped_file:
            do.run("bunzip2 -c {in_file} | gzip > {tx_gzipped_file}".format(**locals()), message)
        return gzipped_file
    return in_file

Example 5

Project: bcbio-nextgen Source File: hisat2.py
def create_splicesites_file(gtf_file, align_dir, data):
    """
    if not pre-created, make a splicesites file to use with hisat2
    """
    out_file = os.path.join(align_dir, "ref-transcripts-splicesites.txt")
    if file_exists(out_file):
        return out_file
    safe_makedir(align_dir)
    hisat2_ss = config_utils.get_program("hisat2_extract_splice_sites.py", data)
    cmd = "{hisat2_ss} {gtf_file} > {tx_out_file}"
    message = "Creating hisat2 splicesites file from %s." % gtf_file
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file

Example 6

Project: bcbio-nextgen Source File: phylowgs.py
Function: prep_inputs
def _prep_inputs(vrn_info, cnv_info, somatic_info, work_dir, config):
    """Prepare inputs for running PhyloWGS from variant and CNV calls.
    """
    exe = os.path.join(os.path.dirname(sys.executable), "create_phylowgs_inputs.py")
    assert os.path.exists(exe), "Could not find input prep script for PhyloWGS runs."
    ssm_file = os.path.join(work_dir, "ssm_data.txt")
    cnv_file = os.path.join(work_dir, "cnv_data.txt")
    if not utils.file_exists(ssm_file) or not utils.file_exists(cnv_file):
        with file_transaction(somatic_info.tumor_data, ssm_file, cnv_file) as (tx_ssm_file, tx_cnv_file):
            variant_type, input_vcf_file = _prep_vrn_file(vrn_info["vrn_file"], vrn_info["variantcaller"],
                                                          work_dir, somatic_info, cnv_info["ignore"], config)
            input_cnv_file = _prep_cnv_file(cnv_info["subclones"], work_dir, somatic_info)
            cmd = [sys.executable, exe,
                   "--sample-size", str(config["sample_size"]), "--tumor-sample", somatic_info.tumor_name,
                   "--battenberg", input_cnv_file, "--cellularity", _read_contam(cnv_info["contamination"]),
                   "--output-cnvs", tx_cnv_file, "--output-variants", tx_ssm_file,
                   "--variant-type", variant_type, input_vcf_file]
            do.run(cmd, "Prepare PhyloWGS inputs.")
    return ssm_file, cnv_file

Example 7

Project: bcbio-nextgen Source File: samtools.py
def run(bam_file, data, out_dir):
    """Run samtools stats with reports on mapped reads, duplicates and insert sizes.
    """
    stats_file = os.path.join(out_dir, "%s.txt" % dd.get_sample_name(data))
    if not utils.file_exists(stats_file):
        utils.safe_makedir(out_dir)
        samtools = config_utils.get_program("samtools", data["config"])
        with file_transaction(data, stats_file) as tx_out_file:
            cmd = "{samtools} stats {bam_file}"
            cmd += " > {tx_out_file}"
            do.run(cmd.format(**locals()), "samtools stats", data)
    out = _parse_samtools_stats(stats_file)
    return out

Example 8

Project: bcbio-nextgen Source File: __init__.py
def filter_primary(bam_file, data):
    stem, ext = os.path.splitext(bam_file)
    out_file = stem + ".primary" + ext
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        cmd = filter_primary_stream_cmd(bam_file, data)
        cmd += "> {tx_out_file}"
        do.run(cmd.format(**locals()), ("Filtering primary alignments in %s." %
                                        os.path.basename(bam_file)))
    return out_file

Example 9

Project: bcbio-nextgen Source File: kallisto.py
def kallisto_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index")
    out_stem = dd.get_genome_build(data)
    if dd.get_disambiguate(data):
        out_fname = "-".join([out_fname] + dd.get_disambguate(data))
    index_dir = os.path.join(out_dir, out_stem)
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, index_dir)
    out_file = os.path.join(index_dir, out_stem + ".idx")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        cmd = "{kallisto} index -k 31 -i {tx_out_file} {gtf_fa}"
        message = "Creating Kallisto index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file

Example 10

Project: bcbio-nextgen Source File: __init__.py
    def run_gatk(self, params, tmp_dir=None, log_error=True,
                 data=None, region=None, memscale=None):
        with tx_tmpdir(self._config) as local_tmp_dir:
            if tmp_dir is None:
                tmp_dir = local_tmp_dir
            cl = self.cl_gatk(params, tmp_dir, memscale=memscale)
            atype_index = params.index("-T") if params.count("-T") > 0 \
                          else params.index("--analysis_type")
            prog = params[atype_index + 1]
            do.run(cl, "GATK: {0}".format(prog), data, region=region,
                   log_error=log_error)

Example 11

Project: bcbio-nextgen Source File: cleanbam.py
def fixrg(in_bam, names, ref_file, dirs, data):
    """Fix read group in a file, using samtools addreplacerg.

    addreplacerg does not remove the old read group, causing confusion when
    checking. We use reheader to work around this
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "bamclean", dd.get_sample_name(data)))
    out_file = os.path.join(work_dir, "%s-fixrg.bam" % utils.splitext_plus(os.path.basename(in_bam))[0])
    if not utils.file_uptodate(out_file, in_bam):
        with file_transaction(data, out_file) as tx_out_file:
            rg_info = novoalign.get_rg_info(names)
            new_header = "%s-header.txt" % os.path.splitext(out_file)[0]
            do.run("samtools view -H {in_bam} | grep -v ^@RG > {new_header}".format(**locals()),
                   "Create empty RG header: %s" % dd.get_sample_name(data))
            cmd = ("samtools reheader {new_header} {in_bam} | "
                   "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} -")
            do.run(cmd.format(**locals()), "Fix read groups: %s" % dd.get_sample_name(data))
    return out_file

Example 12

Project: bcbio-nextgen Source File: postalign.py
def dedup_bam(in_bam, data):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if _check_dedup(data):
        out_file = "%s-dedup%s" % utils.splitext_plus(in_bam)
        if not utils.file_exists(out_file):
            with tx_tmpdir(data) as tmpdir:
                with file_transaction(data, out_file) as tx_out_file:
                    bammarkduplicates = config_utils.get_program("bammarkduplicates", data["config"])
                    base_tmp = os.path.join(tmpdir, os.path.splitext(os.path.basename(tx_out_file))[0])
                    cores, mem = _get_cores_memory(data, downscale=2)
                    cmd = ("{bammarkduplicates} tmpfile={base_tmp}-markdup "
                           "markthreads={cores} I={in_bam} O={tx_out_file}")
                    do.run(cmd.format(**locals()), "De-duplication with biobambam")
        bam.index(out_file, data["config"])
        return out_file
    else:
        return in_bam

Example 13

Project: bcbio-nextgen Source File: bubbletree.py
def _create_subset_file(in_file, het_region_bed, work_dir, data):
    """Subset the VCF to a set of pre-calculated smaller regions.
    """
    cnv_regions = shared.get_base_cnv_regions(data, work_dir)
    region_bed = bedutils.intersect_two(het_region_bed, cnv_regions, work_dir, data)
    out_file = os.path.join(work_dir, "%s-origsubset.bcf" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            regions = ("-R %s" % region_bed) if utils.file_exists(region_bed) else ""
            cmd = "bcftools view {regions} -o {tx_out_file} -O b {in_file}"
            do.run(cmd.format(**locals()), "Extract regions for BubbleTree frequency determination")
    return out_file

Example 14

Project: bcbio-nextgen Source File: merge.py
def _create_merge_filelist(bam_files, base_file, config):
    """Create list of input files for merge, ensuring all files are valid.
    """
    bam_file_list = "%s.list" % os.path.splitext(base_file)[0]
    samtools = config_utils.get_program("samtools", config)
    with open(bam_file_list, "w") as out_handle:
        for f in sorted(bam_files):
            do.run('{} quickcheck -v {}'.format(samtools, f),
                   "Ensure integrity of input merge BAM files")
            out_handle.write("%s\n" % f)
    return bam_file_list

Example 15

Project: bcbio-nextgen Source File: __init__.py
def bam_to_sam(in_file, config):
    if is_sam(in_file):
        return in_file

    assert is_bam(in_file), "%s is not a BAM file" % in_file
    out_file = os.path.splitext(in_file)[0] + ".sam"
    if utils.file_exists(out_file):
        return out_file

    samtools = config_utils.get_program("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    with file_transaction(config, out_file) as tx_out_file:
        cmd = "{samtools} view -@ {num_cores} -h {in_file} -o {tx_out_file}"
        do.run(cmd.format(**locals()),
               ("Convert BAM to SAM (%s cores): %s to %s"
                % (str(num_cores), in_file, out_file)))
    return out_file

Example 16

Project: bcbio-nextgen Source File: sra.py
def _download_sra(sraid, outdir):
    url = _create_link(sraid)
    cmd = "wget -O {out_file} {url}"
    out_file = os.path.join(outdir, "%s.sra" % sraid)
    if not utils.file_exists(out_file):
        do.run(cmd, "Download %s" % sraid)
    return out_file

Example 17

Project: bcbio-nextgen Source File: bwakit.py
def run(data):
    """HLA typing with bwakit, parsing output from called genotype files.
    """
    bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
    hla_fqs = tz.get_in(["hla", "fastq"], data, [])
    if len(hla_fqs) > 0:
        hla_base = os.path.commonprefix(hla_fqs)
        while hla_base.endswith("."):
            hla_base = hla_base[:-1]
        out_file = hla_base + ".top"
        if not utils.file_exists(out_file):
            cmd = "{bwakit_dir}/run-HLA {hla_base}"
            do.run(cmd.format(**locals()), "HLA typing with bwakit")
            out_file = _organize_calls(out_file, hla_base, data)
        data["hla"].update({"call_file": out_file,
                            "hlacaller": "bwakit"})
    return data

Example 18

Project: bcbio-nextgen Source File: cpat.py
def make_logit_model(coding_fasta, noncoding_fasta, hexamers, data, out_dir=None):
    safe_makedir(out_dir)
    out_prefix = os.path.join(out_dir, "logit")
    out_file = out_prefix + ".logit.RData"
    if file_exists(out_file):
        return out_file
    tx_prefix = tempfile.NamedTemporaryFile(delete=False).name
    tx_out_file = tx_prefix + ".logit.RData"
    logit_cmd = config_utils.get_program("make_logitModel.py", data)
    r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
    cmd = ("{r_setup}{logit_cmd} --cgene={coding_fasta} --ngene={noncoding_fasta} "
           "--hex={hexamers} --outfile={tx_prefix}")
    message = "Building coding/noncoding logistical model."
    do.run(cmd.format(**locals()), message)
    shutil.move(tx_out_file, out_file)
    return out_file

Example 19

Project: bcbio-nextgen Source File: ref.py
def fasta_idx(in_file, config=None):
    """Retrieve samtools style fasta index.
    """
    fasta_index = in_file + ".fai"
    if not utils.file_exists(fasta_index):
        samtools = config_utils.get_program("samtools", config) if config else "samtools"
        cmd = "{samtools} faidx {in_file}"
        do.run(cmd.format(**locals()), "samtools faidx")
    return fasta_index

Example 20

Project: bcbio-nextgen Source File: kallisto.py
def kallisto_singlecell(fq1, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    # unsure how to estimate from single end data, so go with a reasonable default
    frag_length = 250
    batch_file = umi.convert_to_kallisto(data)
    index = kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
    cmd = ("{kallisto} pseudo --umi "
           "-t {num_cores} -o {tx_out_dir} -b {batch_file} -i {index}")
    with chdir(os.path.dirname(batch_file)):
        with file_transaction(data, quant_dir) as tx_out_dir:
            message = ("Quantifying transcripts with Kallisto.")
            do.run(cmd.format(**locals()), message, None)
    kallisto_table(kallisto_dir, index)
    return quant_dir

Example 21

Project: bcbio-nextgen Source File: bwa.py
def _run_bwa_align(fastq_file, ref_file, out_file, config):
    aln_cl = [config_utils.get_program("bwa", config), "aln",
              "-n 2", "-k 2"]
    aln_cl += _bwa_args_from_config(config)
    aln_cl += [ref_file, fastq_file]
    cmd = "{cl} > {out_file}".format(cl=" ".join(aln_cl), out_file=out_file)
    do.run(cmd, "bwa aln: {f}".format(f=os.path.basename(fastq_file)), None)

Example 22

Project: bcbio-nextgen Source File: bwa.py
Function: index_transcriptome
def index_transcriptome(gtf_file, ref_file, data):
    """
    use a GTF file and a reference FASTA file to index the transcriptome
    """
    gtf_fasta = gtf.gtf_to_fasta(gtf_file, ref_file)
    bwa = config_utils.get_program("bwa", data["config"])
    cmd = "{bwa} index {gtf_fasta}".format(**locals())
    message = "Creating transcriptome index of %s with bwa." % (gtf_fasta)
    do.run(cmd, message)
    return gtf_fasta

Example 23

Project: bcbio-nextgen Source File: sambamba.py
def index(data, bam_fpath):
    cmdl = make_command(data, "index", bam_fpath)
    indexed_bam = bam_fpath + ".bai"
    if not utils.file_uptodate(indexed_bam, bam_fpath):
        do.run(cmdl, "Indexing BAM file using sambamba")
        if not utils.file_exists(indexed_bam):
            logger.error("Cannot index BAM file " + bam_fpath + " using sambamba.")
            return None
    return indexed_bam

Example 24

Project: bcbio-nextgen Source File: postalign.py
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.
    """
    align_bam = dd.get_work_bam(data)
    f1_out = "%s-cuemi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cuemi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            cmd = ("fgbio {jvm_opts} GroupReadsByUmi -m 1 -e 1 -s adjacency -i {align_bam} | "
                   "fgbio {jvm_opts} CallMolecularConsensusReads -S queryname -i /dev/stdin -o /dev/stdout | "
                   "bamtofastq F={tx_f1_out} F2={tx_f2_out} gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out

Example 25

Project: bcbio-nextgen Source File: __init__.py
    def run_mutect(self, params, tmp_dir=None):
        with tx_tmpdir(self._config) as local_tmp_dir:
            if tmp_dir is None:
                tmp_dir = local_tmp_dir
            cl = self.cl_mutect(params, tmp_dir)
            prog = "MuTect"
            do.run(cl, "MuTect: {0}".format(prog), None)

Example 26

Project: bcbio-nextgen Source File: tophat.py
def _fix_mates(orig_file, out_file, ref_file, config):
    """Fix problematic unmapped mate pairs in TopHat output.

    TopHat 2.0.9 appears to have issues with secondary reads:
    https://groups.google.com/forum/#!topic/tuxedo-tools-users/puLfDNbN9bo
    This cleans the input file to only keep properly mapped pairs,
    providing a general fix that will handle correctly mapped secondary
    reads as well.
    """
    if not file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            samtools = config_utils.get_program("samtools", config)
            cmd = "{samtools} view -bS -h -t {ref_file}.fai -F 8 {orig_file} > {tx_out_file}"
            do.run(cmd.format(**locals()), "Fix mate pairs in TopHat output", {})
    return out_file

Example 27

Project: bcbio-nextgen Source File: rapmap.py
def rapmap_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", data["config"])
    gtf_fa = create_combined_fasta(data, out_dir)
    if file_exists(out_dir + "rapidx.jfhash"):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} pseudoindex -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating RapMap pseudoindex for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir

Example 28

Project: bcbio-nextgen Source File: peaks.py
def _prepare_bam(bam_file, bed_file, config):
    if not bam_file or not bed_file:
        return bam_file
    out_file = utils.append_stem(bam_file, '_filter')
    samtools = config_utils.get_program("samtools", config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out:
            cmd = "{samtools} view -bh -L {bed_file} {bam_file} > {tx_out}"
            do.run(cmd.format(**locals()), "Clean %s" % bam_file)
    return out_file

Example 29

Project: bcbio-nextgen Source File: fastq.py
def _gzip_fastq(in_file):
    """
    gzip a fastq file if it is not already gzipped, handling conversion
    from bzip to gzipped files
    """
    if fastq.is_fastq(in_file) and not objectstore.is_remote(in_file):
        if utils.is_bzipped(in_file):
            return _bzip_gzip(in_file)
        elif not utils.is_gzipped(in_file):
            gzipped_file = in_file + ".gz"
            if file_exists(gzipped_file):
                return gzipped_file
            message = "gzipping {in_file}.".format(in_file=in_file)
            with file_transaction(gzipped_file) as tx_gzipped_file:
                do.run("gzip -c {in_file} > {tx_gzipped_file}".format(**locals()),
                       message)
            return gzipped_file
    return in_file

Example 30

Project: bcbio-nextgen Source File: __init__.py
Function: sam_to_bam
def sam_to_bam(in_sam, config):
    if is_bam(in_sam):
        return in_sam

    assert is_sam(in_sam), "%s is not a SAM file" % in_sam
    out_file = os.path.splitext(in_sam)[0] + ".bam"
    if utils.file_exists(out_file):
        return out_file

    samtools = config_utils.get_program("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    with file_transaction(config, out_file) as tx_out_file:
        cmd = "{samtools} view -@ {num_cores} -h -S -b {in_sam} -o {tx_out_file}"
        do.run(cmd.format(**locals()),
               ("Convert SAM to BAM (%s cores): %s to %s"
                % (str(num_cores), in_sam, out_file)))
    return out_file

Example 31

Project: bcbio-nextgen Source File: fastq.py
def _merge_list_fastqs(files, out_file, config):
    """merge list of fastq files into one"""
    if not all(map(fastq.is_fastq, files)):
        raise ValueError("Not all of the files to merge are fastq files: %s " % (files))
    assert all(map(utils.file_exists, files)), ("Not all of the files to merge "
                                                "exist: %s" % (files))
    if not file_exists(out_file):
        files = [_gzip_fastq(fn) for fn in files]
        if len(files) == 1:
            os.symlink(files[0], out_file)
            return out_file
        with file_transaction(out_file) as file_txt_out:
            files_str = " ".join(list(files))
            cmd = "cat {files_str} > {file_txt_out}".format(**locals())
            do.run(cmd, "merge fastq files %s" % files)
    return out_file

Example 32

Project: bcbio-nextgen Source File: phylowgs.py
def _run_evolve(ssm_file, cnv_file, work_dir, data):
    """Run evolve.py to infer subclonal composition.
    """
    exe = os.path.join(os.path.dirname(sys.executable), "evolve.py")
    assert os.path.exists(exe), "Could not find evolve script for PhyloWGS runs."
    out_dir = os.path.join(work_dir, "evolve")
    out_file = os.path.join(out_dir, "top_k_trees")
    if not utils.file_uptodate(out_file, cnv_file):
        with file_transaction(data, out_dir) as tx_out_dir:
            with utils.chdir(tx_out_dir):
                cmd = [sys.executable, exe, "-r", "42", ssm_file, cnv_file]
                do.run(cmd, "Run PhyloWGS evolution")
    return out_file

Example 33

Project: bcbio-nextgen Source File: shared.py
def write_nochr_reads(in_file, out_file, config):
    """Write a BAM file of reads that are not mapped on a reference chromosome.

    This is useful for maintaining non-mapped reads in parallel processes
    that split processing by chromosome.
    """
    if not file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            samtools = config_utils.get_program("samtools", config)
            cmd = "{samtools} view -b -f 4 {in_file} > {tx_out_file}"
            do.run(cmd.format(**locals()), "Select unmapped reads")
    return out_file

Example 34

Project: bcbio-nextgen Source File: fastq.py
@utils.memoize_outfile(stem=".groom")
def groom(in_file, data, in_qual="illumina", out_dir=None, out_file=None):
    """
    Grooms a FASTQ file from Illumina 1.3/1.5 quality scores into
    sanger format, if it is not already in that format.
    """
    seqtk = config_utils.get_program("seqtk", data["config"])
    if in_qual == "fastq-sanger":
        logger.info("%s is already in Sanger format." % in_file)
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        cmd = "{seqtk} seq -Q64 {in_file} | gzip > {tmp_out_file}".format(**locals())
        do.run(cmd, "Converting %s to Sanger format." % in_file)
    return out_file

Example 35

Project: bcbio-nextgen Source File: sra.py
def _download_srx(srxid, url, out_dir):
    cmd = "wget -N -r -nH -nd -np -nv {0}".format(url)
    out_dir = os.path.abspath(utils.safe_makedir(out_dir))
    with utils.chdir(out_dir):
        do.run(cmd, "Download %s" % url )
        # return [os.path.abspath(fn) for fn in glob.glob("*sra")]
    return [os.path.join(out_dir, fn) for fn in os.listdir(out_dir)]

Example 36

Project: bcbio-nextgen Source File: theta.py
Function: select_model
def _select_model(n2_bounds, n2_result, n3_result, out_dir, data):
    """Run final model selection from n=2 and n=3 options.
    """
    n2_out_file = n2_result.replace(".n2.results", ".BEST.results")
    n3_out_file = n3_result.replace(".n3.results", ".BEST.results")
    if not utils.file_exists(n2_out_file) and not utils.file_exists(n3_out_file):
        cmd = _get_cmd("ModelSelection.py") + [n2_bounds, n2_result, n3_result]
        do.run(cmd, "Select best THetA model")
    if utils.file_exists(n2_out_file):
        return n2_out_file
    else:
        assert utils.file_exists(n3_out_file)
        return n3_out_file

Example 37

Project: bcbio-nextgen Source File: sra.py
def _convert_fastq(srafn, outdir, single=False):
    "convert sra to fastq"
    cmd = "fastq-dump --split-files --gzip {srafn}"
    sraid = os.path.basename(utils.splitext_plus(srafn)[0])
    if not single:
        out_file = [os.path.join(outdir, "%s_1.fastq.gz" % sraid),
                    os.path.join(outdir, "%s_2.fastq.gz" % sraid)]
        if not utils.file_exists(out_file[0]):
            with utils.chdir(outdir):
                do.run(cmd.format(**locals()), "Covert to fastq %s" % sraid)
        if not utils.file_exists(out_file[0]):
            raise IOError("SRA %s didn't convert, something happened." % srafn)
        return [out for out in out_file if utils.file_exists(out)]
    else:
        raise ValueError("Not supported single-end sra samples for now.")

Example 38

Project: bcbio-nextgen Source File: __init__.py
def reheader(header, bam_file, config):
    samtools = config_utils.get_program("samtools", config)
    base, ext = os.path.splitext(bam_file)
    out_file = base + ".reheadered" + ext
    cmd = "{samtools} reheader {header} {bam_file} > {out_file}"
    do.run(cmd.format(**locals()), "Reheadering %s." % bam_file)
    return out_file

Example 39

Project: bcbio-nextgen Source File: cpat.py
def cpat(assembled_fasta, hexamer, logit, data, out_file=None):
    if out_file and file_exists(out_file):
        return out_file
    if not out_file:
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".cpat").name
    cpat_cmd = config_utils.get_program("cpat.py", data)
    r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
    cmd = ("{r_setup}{cpat_cmd} --gene={assembled_fasta} --hex={hexamer} "
           "--logitModel={logit} --outfile={tx_out_file}")
    message = "Predicing coding potential of %s." % (assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file

Example 40

Project: bcbio-nextgen Source File: alignprep.py
def _bgzip_from_cram_sambamba(cram_file, dirs, data):
    """Use sambamba to extract from CRAM via regions.
    """
    raise NotImplementedError("sambamba doesn't yet support retrieval from CRAM by BED file")
    region_file = (tz.get_in(["config", "algorithm", "variant_regions"], data)
                   if tz.get_in(["config", "algorithm", "coverage_interval"], data) in ["regional", "exome"]
                   else None)
    base_name = utils.splitext_plus(os.path.basename(cram_file))[0]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep",
                                               "%s-parts" % base_name))
    f1, f2, o1, o2, si = [os.path.join(work_dir, "%s.fq" % x) for x in ["match1", "match2", "unmatch1", "unmatch2",
                                                                        "single"]]
    ref_file = dd.get_ref_file(data)
    region = "-L %s" % region_file if region_file else ""
    cmd = ("sambamba view -f bam -l 0 -C {cram_file} -T {ref_file} {region} | "
           "bamtofastq F={f1} F2={f2} S={si} O={o1} O2={o2}")
    do.run(cmd.format(**locals()), "Convert CRAM to fastq in regions")

Example 41

Project: bcbio-nextgen Source File: cpat.py
def hexamer_table(cds_fasta, noncoding_fasta, data, out_file=None):
    if out_file and file_exists(out_file):
        return out_file
    if not out_file:
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".hexamers").name
    hex_cmd = config_utils.get_program("make_hexamer_tab.py", data)
    cmd = ("{hex_cmd} --cod={cds_fasta} --noncod={noncoding_fasta} "
           "> {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = ("Calculating hexamer content in %s and %s."
                   % (cds_fasta, noncoding_fasta))
        do.run(cmd.format(**locals()), message)
    return out_file

Example 42

Project: bipy Source File: htseq_count.py
def run(input_file, gtf_file, out_file=None):
    if out_file is None:
        out_file = _get_outfilename(input_file)

    safe_makedir(os.path.dirname(out_file))

    if file_exists(out_file):
        return out_file

    with file_transaction(out_file) as tmp_out_file:
        htseq_cmd = ("htseq-count --mode=union --stranded=no --type=exon "
                     "--idattr=gene_id {input_file} {gtf_file} > {tmp_out_file}")

        cmd = htseq_cmd.format(**locals())
        do.run(cmd, "Running htseq-count on %s." % (input_file), None)

    return out_file

Example 43

Project: bcbio-nextgen Source File: cufflinks.py
def assemble(bam_file, ref_file, num_cores, out_dir, data):
    out_dir = os.path.join(out_dir, data["rgnames"]["sample"])
    safe_makedir(out_dir)
    out_file = os.path.join(out_dir, "cufflinks-assembly.gtf")
    cufflinks_out_file = os.path.join(out_dir, "transcripts.gtf")
    library_type = " ".join(_get_stranded_flag(data["config"]))
    if file_exists(out_file):
        return out_file
    with file_transaction(data, out_dir) as tmp_out_dir:
        cmd = ("cufflinks --output-dir {tmp_out_dir} --num-threads {num_cores} "
               "--frag-bias-correct {ref_file} "
               "{library_type} --multi-read-correct --upper-quartile-norm {bam_file}")
        cmd = cmd.format(**locals())
        do.run(cmd, "Assembling transcripts with Cufflinks using %s." % bam_file)
    shutil.move(cufflinks_out_file, out_file)
    return out_file

Example 44

Project: bcbio-nextgen Source File: bowtie2.py
def index_transcriptome(gtf_file, ref_file, data):
    """
    use a GTF file and a reference FASTA file to index the transcriptome
    """
    gtf_fasta = gtf.gtf_to_fasta(gtf_file, ref_file)
    bowtie2_index = os.path.splitext(gtf_fasta)[0]
    bowtie2_build = config_utils.get_program("bowtie2", data["config"]) + "-build"
    cmd = "{bowtie2_build} --offrate 1 {gtf_fasta} {bowtie2_index}".format(**locals())
    message = "Creating transcriptome index of %s with bowtie2." % (gtf_fasta)
    do.run(cmd, message)
    return bowtie2_index

Example 45

Project: bcbio-nextgen Source File: __init__.py
Function: run
    def run(self, command, options, pipe=False, get_stdout=False, memscale=None):
        """Run a Picard command with the provided option pairs.
        """
        cl = self.cl_picard(command, options, memscale=memscale)
        if pipe:
            subprocess.Popen(cl)
        elif get_stdout:
            p = subprocess.Popen(cl, stdout=subprocess.PIPE)
            stdout = p.stdout.read()
            p.wait()
            p.stdout.close()
            return stdout
        else:
            do.run(cl, "Picard {0}".format(command), None)

Example 46

Project: bipy Source File: fastqc.py
Function: run
def run(input_file, fastqc_config, config):
    outfile = _make_outfile(input_file, config)
    # if it is already done skip it
    if os.path.exists(outfile):
       return outfile

    cmd = _build_command(input_file, fastqc_config, config)
    do.run(cmd, "Running FastQC on %s" % (input_file), None)

    return outfile

Example 47

Project: bcbio-nextgen Source File: cram.py
def index(in_cram, config):
    """Ensure CRAM file has a .crai index file.
    """
    out_file = in_cram + ".crai"
    if not utils.file_uptodate(out_file, in_cram):
        with file_transaction(config, in_cram + ".crai") as tx_out_file:
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(in_cram, tx_in_file)
            cmd = "samtools index {tx_in_file}"
            do.run(cmd.format(**locals()), "Index CRAM file")
    return out_file

Example 48

Project: bcbio-nextgen Source File: __init__.py
Function: mapped
def mapped(in_bam, config):
    """
    return a bam file of only the mapped reads
    """
    out_file = os.path.splitext(in_bam)[0] + ".mapped.bam"
    if utils.file_exists(out_file):
        return out_file
    sambamba = _get_sambamba(config)
    with file_transaction(config, out_file) as tx_out_file:
        if sambamba:
            cmd = ("{sambamba} view --format=bam -F 'not (unmapped or mate_is_unmapped)' "
                   "{in_bam} -o {tx_out_file}")
        else:
            samtools = config_utils.get_program("samtools", config)
            cmd = "{samtools} view -b -F 4 {in_bam} -o {tx_out_file}"
        do.run(cmd.format(**locals()),
               "Filtering mapped reads to %s." % (tx_out_file))
    return out_file

Example 49

Project: bcbio-nextgen Source File: __init__.py
Function: run
    def run(self, subcmd, opts, memscale=None):
        jvm_opts = get_picard_opts(self._config, memscale=memscale)
        Rpath = os.path.dirname(utils.Rscript_cmd())
        cmd = ["unset", "JAVA_HOME", "&&", "export", "PATH=%s:$PATH" % Rpath, "&&"] + \
              [self._cmd] + jvm_opts + [subcmd] + ["%s=%s" % (x, y) for x, y in opts] + \
              ["VALIDATION_STRINGENCY=SILENT"]
        do.run(" ".join(cmd), "Picard: %s" % subcmd)

Example 50

Project: bcbio-nextgen Source File: star.py
Function: index
def index(ref_file, out_dir, data):
    """Create a STAR index in the defined reference directory.
    """
    (ref_dir, local_file) = os.path.split(ref_file)
    gtf_file = dd.get_gtf_file(data)
    if not utils.file_exists(gtf_file):
        raise ValueError("%s not found, could not create a star index." % (gtf_file))
    if not utils.file_exists(out_dir):
        with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
            num_cores = dd.get_cores(data)
            cmd = ("STAR --genomeDir {tx_out_dir} --genomeFastaFiles {ref_file} "
                   "--runThreadN {num_cores} "
                   "--runMode genomeGenerate --sjdbOverhang 99 --sjdbGTFfile {gtf_file}")
            do.run(cmd.format(**locals()), "Index STAR")
            if os.path.exists(out_dir):
                shutil.rmtree(out_dir)
            shutil.move(tx_out_dir, out_dir)
    return out_dir
See More Examples - Go to Next Page
Page 1 Selected Page 2 Page 3 Page 4