Here are the examples of the python api sys.write taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
1 Examples
0
Source : rs-bam2bed.py
with Apache License 2.0
from zavolanlab
with Apache License 2.0
from zavolanlab
def generate_bed_line(sam):
if sam[:1] == "@":
return
for i in range(len(sam)):
if sam[i].startswith("NM:"):
# NM stores the edit distance between read and ref
sam_index_space_pos = i
elif sam[i].startswith("MD:"):
# MD stores the string for mismatching positions
sam_mismatches_pos = i
elif sam[i].startswith("NH:"):
# store number of mappings for that read
read_weight = 1.0 / float(sam[i].replace("NH:i:", ""))
# CONSTANTS
BED_CHR=0
BED_START=1
BED_STOP=2
BED_READ=3
BED_SCORE=4
BED_STRAND=5
BED_MID=6
SAM_READ_ID=0
SAM_FLAG=1
SAM_CHR=2
SAM_START=3
SAM_SCORE = 4
SAM_MID = 5 # aka cigar string
SAM_STAR = 6 # next entry on same chromosome?
SAM__ = 7 # mate info
SAM__ = 8 # mate info
SAM_READ=9
SAM_STAR = 10
SAM__ = 42
SAM_INDEX_SPACE = sam_index_space_pos
SAM_MISMATCHES = sam_mismatches_pos
# 5th digit in SAM_FLAG in binary format tells the strand: 0 -> plus, 1 -> minus
#Now reading in the sam_tuples sequentially.
#Need to convert this in some kind of bed format
MID_length = get_match_length(sam[SAM_MID])
strand = "?"
SAM_FLAG_BINARY = '{0:012b}'.format(int(sam[SAM_FLAG]))
SAM_STRAND = int(SAM_FLAG_BINARY[::-1][4])
if SAM_STRAND == 0:
strand = "+"
elif SAM_STRAND == 1:
strand = "-"
else:
sys.write("[ERROR] Could not infer strand from " + SAM_FLAG + "\n")
sys.exit(2)
bed_line = ""
bed_line += sam[SAM_CHR]+"\t"
bed_line += str(int(sam[SAM_START])-1)+"\t"
bed_line += str(int(sam[SAM_START])-1+MID_length)+"\t"
bed_line += sam[SAM_READ_ID]+"\t"
# the read weight is later on overwritten by the edit distance
bed_line += str(read_weight) +"\t"
bed_line += strand
#At this point, the bed file is almost complete, just needs the MMID
#Make a tuple with it!
bed = bed_line.split("\t")
g_length = MID_length
# genome is set as only "?" because we don't know the seqeuence
g = "".ljust(g_length, "?")
g = list(g)
# get the read that was aligned
soft_clipped_start = 0
soft_clipped_end = 0
match_splitted = re.split(r'(\d+)', sam[SAM_MID])
match_splitted.pop(0) # re produces a weird '' as elem 0 ...
if match_splitted[1] == "S":
soft_clipped_start = match_splitted[0]
if match_splitted[-1] == "S":
soft_clipped_end = match_splitted[-2]
# take only the portion of the read that was aligned to the reference
# (i.e. omit soft clippings at the start and at the end)
last_pos = len(sam[SAM_READ]) - int(soft_clipped_end)
r = sam[SAM_READ][int(soft_clipped_start):last_pos]
r = list(r)
# save the final reference and read sequences
G = []
R = []
current_MID = sam[SAM_MID]
exploded_MID = re.split(r'(\d+|[A-Za-z])', current_MID)
for item in exploded_MID:
if item == '':
exploded_MID.remove(item)
### Do alignment read vs genome
for i in range(int(len(exploded_MID)/2)):
number = int(exploded_MID[i*2])
type = exploded_MID[i*2 +1]
# no need to process type "N" at this point
# because currently the genomic seq anyways
# only consists of "?"
if type == "M":
# read and reference were aligned
for _ in itertools.repeat(None, number):
G.append(g.pop(0))
R.append(r.pop(0))
elif type == "I":
# nucleotides in read that are not there in the reference
for _ in itertools.repeat(None, number):
G.append("-")
R.append(r.pop(0))
elif type == "D":
# nucleotides in reference that are not there in the read
for _ in itertools.repeat(None, number):
G.append(g.pop(0))
R.append("-")
mismatch = sam[SAM_MISMATCHES]
mismatch = mismatch.split(":")[-1]
mismatch = mismatch.replace("^", "")
mismatch_splitted = re.split(r'(\d+|[A-Za-z])', mismatch)
for item in mismatch_splitted:
if item == '':
mismatch_splitted.remove(item)
pointer = 0
for c in mismatch_splitted:
if c == '':
continue # Skip c and this iteration when this happens
# Skip the next G[pointer] that is not a dash
if c.isdigit():
# perfect match here
value = int(c)
while value > 0:
# skip gaps in the reference because insertions are
# not present in the mismatch string
if G[pointer] == '-':
pointer += 1
else:
# go over a region of perfect matches
pointer += 1
value -= 1
else:
# again
# skip gaps in the reference because insertions
# are not present in the mismatch string
while G[pointer] == '-':
pointer += 1
# exchange question mark in the reference
# two possible reasons:
# 1. mismatch; 2. deletion
G.pop(pointer)
G.insert(pointer, c)
pointer+=1
# for the MMID, track the positions of the mismatches and deletions
# final MMID looks like:
# 15MATIGDCDT12 (15 matches -> 1 Mismatch( A ref, T in read) -> 1 insertion (G in read) -> 1 deletion (C in reference) -> 1 deletion (T in reference) -> 12 matches
match_count = 0
MMID = ""
g_index = 0
r_index = 0
for index in range(len(G)):
cg = G[index]
cr = R[index]
if cg == "?":
match_count += 1
else:
if match_count > 0:
MMID += str(match_count)
match_count = 0
if cg == "-":
MMID += "I"+cr
elif cr == "-":
MMID += "D"+cg
else:
MMID += "M"+cg+cr
# note by RS 2016-06-01: this increment statement
# should not effect the loop as far as I understood python
# (was introduced not by me but by Manuel)
index += 1
if match_count > 0:
MMID += str(match_count)
match_count = 0
index_space = sam[SAM_INDEX_SPACE]
index_space = index_space.split(":")[-1]
new_bed = list(bed)
# exchange read weight by edit distance and
# define the read weight later on
new_bed.pop(4)
new_bed.insert(4, index_space)
new_bed.append(MMID)
new_file_line = "\t".join(new_bed)
return new_file_line
# Parses a string containing Matches (M) and InDels (I and D) and returns the number of positions represented by that string
def get_match_length( match_string):