Here are examples of the Python API six.moves.map taken from open source projects.
178 Examples
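Before the examples, a quick note on what six.moves.map itself does: on Python 2 it aliases itertools.imap and on Python 3 the built-in map, so in both cases it returns a lazy iterator rather than a list. A minimal sketch:

from six.moves import map

squares = map(lambda x: x * x, range(5))
print(squares)        # a lazy iterator on both Python 2 and Python 3
print(list(squares))  # [0, 1, 4, 9, 16]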
Example 101
Project: anitya Source File: app.py
def modify_rst(rst):
    ''' Downgrade some of our rst directives if docutils is too old. '''
    try:
        # The rst features we need were introduced in this version
        minimum = [0, 9]
        version = list(map(int, docutils.__version__.split('.')))
        # If we're at or later than that version, no need to downgrade
        if version >= minimum:
            return rst
    except Exception:
        # If there was some error parsing or comparing versions, run the
        # substitutions just to be safe.
        pass
    # Otherwise, make code-blocks into just literal blocks.
    substitutions = {
        '.. code-block:: javascript': '::',
    }
    for old, new in substitutions.items():
        rst = rst.replace(old, new)
    return rst
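The helper above relies on list(map(int, ...)) turning a dotted version string into a list of ints that compares element-wise, avoiding the string-comparison pitfall ('0.10' < '0.9'). A small standalone sketch (the version strings are made up for illustration):

from six.moves import map

def at_least(version_string, minimum):
    # '0.10.1' -> [0, 10, 1]; Python compares lists element by element.
    return list(map(int, version_string.split('.'))) >= minimum

print(at_least('0.10.1', [0, 9]))  # True
print(at_least('0.8.2', [0, 9]))   # False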
Example 102
Project: jug Source File: hash.py
def hash_update(M, elems):
    '''
    M = hash_update(M, elems)
    Update the hash object ``M`` with the sequence ``elems``.
    Parameters
    ----------
    M : hashlib object
        An object on which the update method will be called
    elems : sequence of 2-tuples
    Returns
    -------
    M : hashlib object
        This is the same object as the argument
    '''
    from six.moves import cPickle as pickle
    from six.moves import map
    import six
    try:
        import numpy as np
    except ImportError:
        np = None
    for n, e in elems:
        M.update(pickle.dumps(n))
        if hasattr(e, '__jug_hash__'):
            M.update(e.__jug_hash__())
        elif type(e) in (list, tuple):
            M.update(repr(type(e)).encode('utf-8'))
            hash_update(M, enumerate(e))
        elif type(e) == set:
            M.update(six.b('set'))
            # With randomized hashing, different runs of Python might result in
            # different orders, so sort. We cannot trust that all the elements
            # in the set will be comparable, so we convert them to their hashes
            # beforehand.
            items = list(map(hash_one, e))
            items.sort()
            hash_update(M, enumerate(items))
        elif type(e) == dict:
            M.update(six.b('dict'))
            items = [(hash_one(k), v) for k, v in e.items()]
            items.sort(key=(lambda k_v: k_v[0]))
            hash_update(M, items)
        elif np is not None and type(e) == np.ndarray:
            M.update(six.b('np.ndarray'))
            M.update(pickle.dumps(e.dtype))
            M.update(pickle.dumps(e.shape))
            try:
                buffer = e.data
                M.update(buffer)
            except:
                M.update(e.copy().data)
        else:
            M.update(pickle.dumps(e))
    return M
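A short usage sketch, assuming jug is installed and that hash_update (and the hash_one helper it calls) live in jug.hash as the source file name suggests:

import hashlib
from jug.hash import hash_update

M = hashlib.sha1()
# elems is a sequence of (name, value) 2-tuples; nested lists, dicts and
# sets are hashed recursively and deterministically.
hash_update(M, [('params', {'alpha': 0.5, 'beta': 2}), ('data', [1, 2, 3])])
print(M.hexdigest())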
Example 103
def discover_services(self, uri=None, cache=True):
    '''
    Discover services advertised by the TAXII server.
    This method will send a discovery request to a service defined
    by ``uri`` or by the constructor's connection parameters.
    :param str uri: URI path to a specific TAXII service
    :param bool cache: if discovered services should be cached
    :return: list of TAXII services
    :rtype: list of :py:class:`cabby.entities.DetailedServiceInstance`
            (or :py:class:`cabby.entities.InboxDetailedService`)
    :raises ValueError:
            if the URI provided is invalid or its schema is not supported
    :raises `cabby.exceptions.HTTPError`:
            if an HTTP error happened
    :raises `cabby.exceptions.UnsuccessfulStatusError`:
            if a Status Message was received and status_type is not `SUCCESS`
    :raises `cabby.exceptions.ServiceNotFoundError`:
            if no Discovery service was found
    :raises `cabby.exceptions.AmbiguousServicesError`:
            if more than one service with the specified type was found
    :raises `cabby.exceptions.NoURIProvidedError`:
            if no URI was provided and the client can't discover services
    '''
    uri = uri or self.discovery_path
    if not uri:
        raise NoURIProvidedError('Discovery service URI is not specified')
    response = self._discovery_request(uri)
    services = list(map(
        to_detailed_service_instance_entity,
        response.service_instances))
    self.log.info("%d services discovered", len(services))
    if cache:
        self.services = services
    return services
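A usage sketch, assuming cabby's documented client factory (create_client) and that discovered entities expose type and address attributes; the host and path below are placeholders:

from cabby import create_client

client = create_client('taxii.example.com',
                       discovery_path='/services/discovery')
for service in client.discover_services():
    print(service.type, service.address)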
Example 104
Project: ete Source File: scheduler.py
def schedule(workflow_task_processor, pending_tasks, schedule_time, execution, debug, norender):
# Adjust debug mode
if debug == "all":
log.setLevel(10)
pending_tasks = set(pending_tasks)
## ===================================
## INITIALIZE BASIC VARS
execution, run_detached = execution
thread2tasks = defaultdict(list)
for task in pending_tasks:
thread2tasks[task.configid].append(task)
expected_threads = set(thread2tasks.keys())
past_threads = {}
thread_errors = defaultdict(list)
## END OF VARS AND SHORTCUTS
## ===================================
cores_total = GLOBALS["_max_cores"]
if cores_total > 0:
job_queue = Queue()
back_launcher = Process(target=background_job_launcher,
args=(job_queue, run_detached,
GLOBALS["launch_time"], cores_total))
back_launcher.start()
else:
job_queue = None
back_launcher = None
GLOBALS["_background_scheduler"] = back_launcher
GLOBALS["_job_queue"] = job_queue
# Captures Ctrl-C for debugging
#signal.signal(signal.SIGINT, control_c)
last_report_time = None
BUG = set()
try:
# Enters into task scheduling
while pending_tasks:
wtime = schedule_time
# ask SGE for running jobs
if execution == "sge":
#sgeid2jobs = db.get_sge_tasks()
#qstat_jobs = sge.qstat()
pass
else:
qstat_jobs = None
# Show summary of pending tasks per thread
thread2tasks = defaultdict(list)
for task in pending_tasks:
thread2tasks[task.configid].append(task)
set_logindent(0)
log.log(28, "@@13: Updating tasks status:@@1: (%s)" % (ctime()))
info_lines = []
for tid, tlist in six.iteritems(thread2tasks):
threadname = GLOBALS[tid]["_name"]
sizelist = ["%s" %getattr(_ts, "size", "?") for _ts in tlist]
info = "Thread @@13:%s@@1:: pending tasks: @@8:%s@@1: of sizes: %s" %(
threadname, len(tlist), ', '.join(sizelist))
info_lines.append(info)
for line in info_lines:
log.log(28, line)
if GLOBALS["email"] and last_report_time is None:
last_report_time = time()
send_mail(GLOBALS["email"], "Your NPR process has started", '\n'.join(info_lines))
## ================================
## CHECK AND UPDATE CURRENT TASKS
checked_tasks = set()
check_start_time = time()
to_add_tasks = set()
GLOBALS["cached_status"] = {}
for task in sorted(pending_tasks, key=cmp_to_key(sort_tasks)):
# Avoids endless periods without new job submissions
elapsed_time = time() - check_start_time
#if not back_launcher and pending_tasks and \
# elapsed_time > schedule_time * 2:
# log.log(26, "@@8:Interrupting task checks to schedule new jobs@@1:")
# db.commit()
# wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
# execution, run_detached)
# check_start_time = time()
# Enter debugging mode if necessary
if debug and log.level > 10 and task.taskid.startswith(debug):
log.setLevel(10)
log.debug("ENTERING IN DEBUGGING MODE")
thread2tasks[task.configid].append(task)
# Update tasks and job statuses
if task.taskid not in checked_tasks:
try:
show_task_info(task)
task.status = task.get_status(qstat_jobs)
db.dataconn.commit()
if back_launcher and task.status not in set("DE"):
for j, cmd in task.iter_waiting_jobs():
j.status = "Q"
GLOBALS["cached_status"][j.jobid] = "Q"
if j.jobid not in BUG:
if not os.path.exists(j.jobdir):
os.makedirs(j.jobdir)
for ifile, outpath in six.iteritems(j.input_files):
try:
_tid, _did = ifile.split(".")
_did = int(_did)
except (IndexError, ValueError):
dataid = ifile
else:
dataid = db.get_dataid(_tid, _did)
if not outpath:
outfile = pjoin(GLOBALS["input_dir"], ifile)
else:
outfile = pjoin(outpath, ifile)
if not os.path.exists(outfile):
open(outfile, "w").write(db.get_data(dataid))
log.log(24, " @@8:Queueing @@1: %s from %s" %(j, task))
if execution:
with open(pjoin(GLOBALS[task.configid]["_outpath"], "commands.log"), "a") as CMD_LOGGER:
print('\t'.join([task.tname, task.taskid, j.jobname, j.jobid, j.get_launch_cmd()]), file=CMD_LOGGER)
job_queue.put([j.jobid, j.cores, cmd, j.status_file])
BUG.add(j.jobid)
update_task_states_recursively(task)
db.commit()
checked_tasks.add(task.taskid)
except TaskError as e:
log.error("Errors found in %s" %task)
import traceback
traceback.print_exc()
if GLOBALS["email"]:
threadname = GLOBALS[task.configid]["_name"]
send_mail(GLOBALS["email"], "Errors found in %s!" %threadname,
'\n'.join(map(str, [task, e.value, e.msg])))
pending_tasks.discard(task)
thread_errors[task.configid].append([task, e.value, e.msg])
continue
else:
# Set temporary Queued state to avoid launching
# jobs from clones
task.status = "Q"
if log.level < 24:
show_task_info(task)
if task.status == "D":
#db.commit()
show_task_info(task)
logindent(3)
# Log commands of every task
# if 'cmd_log_file' not in GLOBALS[task.configid]:
# GLOBALS[task.configid]['cmd_log_file'] = pjoin(GLOBALS[task.configid]["_outpath"], "cmd.log")
# O = open(GLOBALS[task.configid]['cmd_log_file'], "w")
# O.close()
# cmd_lines = get_cmd_log(task)
# CMD_LOG = open(GLOBALS[task.configid]['cmd_log_file'], "a")
# print(task, file=CMD_LOG)
# for c in cmd_lines:
# print(' '+'\t'.join(map(str, c)), file=CMD_LOG)
# CMD_LOG.close()
#
try:
#wkname = GLOBALS[task.configid]['_name']
create_tasks = workflow_task_processor(task, task.target_wkname)
except TaskError as e:
log.error("Errors found in %s" %task)
pending_tasks.discard(task)
thread_errors[task.configid].append([task, e.value, e.msg])
continue
else:
logindent(-3)
to_add_tasks.update(create_tasks)
pending_tasks.discard(task)
elif task.status == "E":
log.error("task contains errors: %s " %task)
log.error("Errors found in %s")
pending_tasks.discard(task)
thread_errors[task.configid].append([task, None, "Found (E) task status"])
#db.commit()
#if not back_launcher:
# wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
# execution, run_detached)
# Update global task list with recently added jobs to be check
# during next cycle
pending_tasks.update(to_add_tasks)
## END CHECK AND UPDATE CURRENT TASKS
## ================================
if wtime:
set_logindent(0)
log.log(28, "@@13:Waiting %s seconds@@1:" %wtime)
sleep(wtime)
else:
sleep(schedule_time)
# Dump / show ended threads
error_lines = []
for configid, etasks in six.iteritems(thread_errors):
error_lines.append("Thread @@10:%s@@1: contains errors:" %\
(GLOBALS[configid]["_name"]))
for error in etasks:
error_lines.append(" ** %s" %error[0])
e_obj = error[1] if error[1] else error[0]
error_path = e_obj.jobdir if isjob(e_obj) else e_obj.taskid
if e_obj is not error[0]:
error_lines.append(" -> %s" %e_obj)
error_lines.append(" -> %s" %error_path)
error_lines.append(" -> %s" %error[2])
for eline in error_lines:
log.error(eline)
pending_threads = set([ts.configid for ts in pending_tasks])
finished_threads = expected_threads - (pending_threads | set(thread_errors.keys()))
just_finished_lines = []
finished_lines = []
for configid in finished_threads:
# configid is the same as threadid in master tasks
final_tree_file = pjoin(GLOBALS[configid]["_outpath"],
GLOBALS["inputname"] + ".final_tree")
threadname = GLOBALS[configid]["_name"]
if configid in past_threads:
log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
threadname, past_threads[configid])
finished_lines.append("Finished %s in %d iteration(s)" %(
threadname, past_threads[configid]))
else:
log.log(28, "Assembling final tree...")
main_tree, treeiters = assembly_tree(configid)
past_threads[configid] = treeiters - 1
log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
threadname, past_threads[configid])
log.log(28, "Writing final tree for @@13:%s@@1:\n %s\n %s",
threadname, final_tree_file+".nw",
final_tree_file+".nwx (newick extended)")
main_tree.write(outfile=final_tree_file+".nw")
main_tree.write(outfile=final_tree_file+ ".nwx", features=[],
format_root_node=True)
if hasattr(main_tree, "tree_phylip_alg"):
log.log(28, "Writing final tree alignment @@13:%s@@1:\n %s",
threadname, final_tree_file+".used_alg.fa")
alg = SeqGroup(get_stored_data(main_tree.tree_phylip_alg), format="iphylip_relaxed")
OUT = open(final_tree_file+".used_alg.fa", "w")
for name, seq, comments in alg:
realname = db.get_seq_name(name)
print(">%s\n%s" %(realname, seq), file=OUT)
OUT.close()
if hasattr(main_tree, "alg_path"):
log.log(28, "Writing root node alignment @@13:%s@@1:\n %s",
threadname, final_tree_file+".fa")
alg = SeqGroup(get_stored_data(main_tree.alg_path))
OUT = open(final_tree_file+".fa", "w")
for name, seq, comments in alg:
realname = db.get_seq_name(name)
print(">%s\n%s" %(realname, seq), file=OUT)
OUT.close()
if hasattr(main_tree, "clean_alg_path"):
log.log(28, "Writing root node trimmed alignment @@13:%s@@1:\n %s",
threadname, final_tree_file+".trimmed.fa")
alg = SeqGroup(get_stored_data(main_tree.clean_alg_path))
OUT = open(final_tree_file+".trimmed.fa", "w")
for name, seq, comments in alg:
realname = db.get_seq_name(name)
print(">%s\n%s" %(realname, seq), file=OUT)
OUT.close()
if norender == False:
log.log(28, "Generating tree image for @@13:%s@@1:\n %s",
threadname, final_tree_file+".png")
for lf in main_tree:
lf.add_feature("sequence", alg.get_seq(lf.safename))
try:
from .visualize import draw_tree
draw_tree(main_tree, GLOBALS[configid], final_tree_file+".png")
except Exception as e:
log.warning('@@8:something went wrong when generating the tree image. Try manually :(@@1:')
if DEBUG:
import traceback, sys
traceback.print_exc(file=sys.stdout)
just_finished_lines.append("Finished %s in %d iteration(s)" %(
threadname, past_threads[configid]))
if GLOBALS["email"]:
if not pending_tasks:
all_lines = finished_lines + just_finished_lines + error_lines
send_mail(GLOBALS["email"], "Your NPR process has ended", '\n'.join(all_lines))
elif GLOBALS["email_report_time"] and time() - last_report_time >= \
GLOBALS["email_report_time"]:
all_lines = info_lines + error_lines + just_finished_lines
send_mail(GLOBALS["email"], "Your NPR report", '\n'.join(all_lines))
last_report_time = time()
elif just_finished_lines:
send_mail(GLOBALS["email"], "Finished threads!",
'\n'.join(just_finished_lines))
log.log(26, "")
except:
raise
if thread_errors:
log.error("Done with ERRORS")
else:
log.log(28, "Done")
return thread_errors
Example 105
def __iter__(self):
    return map(lambda i: _wrap(i, self._obj_wrapper), self._l_)
Example 106
Project: dit Source File: distribution.py
def prepare_string(dist, digits=None, exact=False, tol=1e-9,
show_mask=False, str_outcomes=False):
"""
Prepares a distribution for a string representation.
Parameters
----------
dist : distribution
The distribution to be stringified.
digits : int or None
The probabilities will be rounded to the specified number of
digits, using NumPy's around function. If `None`, then no rounding
is performed. Note, if the number of digits is greater than the
precision of the floats, then the resultant number of digits will
match that smaller precision.
exact : bool
If `True`, then linear probabilities will be displayed, even if
the underlying pmf contains log probabilities. The closest
rational fraction within a tolerance specified by `tol` is used
as the display value.
tol : float
If `exact` is `True`, then the probabilities will be displayed
as the closest rational fraction within `tol`.
show_mask : bool
If `True`, show the mask for marginal distributions.
str_outcomes : bool
If `True`, then attempt to convert outcomes which are tuples to just
strings. This is only a display technique.
Returns
-------
pmf : sequence
The formatted pmf. This could be a NumPy array (possibly rounded)
or a list of Fraction instances.
outcomes : sequence
The formatted outcomes.
base : str or float
The base of the formatted pmf.
colsep : str
The column separation for printing.
max_length : int
The length of the largest outcome, as a string.
pstr : str
An informative string representing the probability of an outcome.
This will be 'p(x)' xor 'log p(x)'.
"""
colsep = ' '
# Create outcomes with wildcards, if desired and possible.
if show_mask:
if not dist.is_joint():
msg = '`show_mask` can be `True` only for joint distributions'
raise ditException(msg)
if show_mask != True and show_mask != False:
# The user is specifying what the mask should look like.
wc = show_mask
else:
wc = '*'
ctor = dist._outcome_ctor
def outcome_wc(outcome):
"""
Builds the wildcarded outcome.
"""
i = 0
e = []
for is_masked in dist._mask:
if is_masked:
symbol = wc
else:
symbol = outcome[i]
i += 1
e.append(symbol)
e = ctor(e)
return e
outcomes = map(outcome_wc, dist.outcomes)
else:
outcomes = dist.outcomes
# Convert outcomes to strings, if desired and possible.
if str_outcomes:
if not dist.is_joint():
msg = '`str_outcomes` can be `True` only for joint distributions'
raise ditException(msg)
try:
# First, convert the elements of the outcome to strings.
outcomes_ = [map(str, outcome) for outcome in outcomes]
# Now convert the entire outcome to a string
outcomes_ = map(lambda o: ''.join(o), outcomes_)
# Force the iterators to expand in case there are exceptions.
outcomes = list(outcomes_)
except:
outcomes = map(str, outcomes)
else:
outcomes = map(str, outcomes)
outcomes = list(outcomes)
if len(outcomes):
max_length = max(map(len, outcomes))
else:
max_length = 0
# 1) Convert to linear probabilities, if necessary.
if exact:
# Copy to avoid precision loss
d = dist.copy(base='linear')
else:
d = dist
# 2) Round, if necessary, possibly after converting to linear probabilities.
if digits is not None and digits is not False:
pmf = d.pmf.round(digits)
else:
pmf = d.pmf
# 3) Construct fractions, if necessary.
if exact:
pmf = [approximate_fraction(x, tol) for x in pmf]
if d.is_log():
pstr = 'log p(x)'
else:
pstr = 'p(x)'
base = d.get_base()
return pmf, outcomes, base, colsep, max_length, pstr
Example 107
Project: pywb Source File: cdxops.py
def cdx_filter(cdx_iter, filter_strings):
"""
Filter CDX lines by regex. Each filter is of the form :samp:`{field}:{regex}`
and is applied to :samp:`cdx[{field}]`.
"""
# Support single strings as well
if isinstance(filter_strings, str):
filter_strings = [filter_strings]
filters = []
class Filter:
def __init__(self, string):
# invert filter
self.invert = string.startswith('!')
if self.invert:
string = string[1:]
# exact match
if string.startswith('='):
string = string[1:]
self.compare_func = self.exact
# contains match
elif string.startswith('~'):
string = string[1:]
self.compare_func = self.contains
else:
self.compare_func = self.regex
parts = string.split(':', 1)
# no field set, apply filter to entire cdx
if len(parts) == 1:
self.field = ''
# apply filter to cdx[field]
else:
self.field = parts[0]
self.field = CDXObject.CDX_ALT_FIELDS.get(self.field,
self.field)
string = parts[1]
# make regex if regex mode
if self.compare_func == self.regex:
self.regex = re.compile(string)
else:
self.filter_str = string
def __call__(self, cdx):
if not self.field:
val = str(cdx)
else:
val = cdx.get(self.field, '')
matched = self.compare_func(val)
return matched ^ self.invert
def exact(self, val):
return (self.filter_str == val)
def contains(self, val):
return (self.filter_str in val)
def regex(self, val):
return self.regex.match(val) is not None
filters = list(map(Filter, filter_strings))
for cdx in cdx_iter:
if all(x(cdx) for x in filters):
yield cdx
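For orientation, a minimal standalone sketch of the same filter-string semantics ('=' exact, '~' contains, '!' invert, otherwise regex, with an optional field prefix), written against plain dicts and independent of pywb's CDXObject:

import re

def make_filter(string):
    # Mirrors the syntax handled above: optional '!' to invert, then
    # '=' (exact), '~' (contains) or a regex, optionally prefixed 'field:'.
    invert = string.startswith('!')
    if invert:
        string = string[1:]
    if string.startswith('='):
        mode, string = 'exact', string[1:]
    elif string.startswith('~'):
        mode, string = 'contains', string[1:]
    else:
        mode = 'regex'
    parts = string.split(':', 1)
    field, value = ('', parts[0]) if len(parts) == 1 else parts
    def predicate(cdx):
        val = str(cdx) if not field else cdx.get(field, '')
        if mode == 'exact':
            matched = (value == val)
        elif mode == 'contains':
            matched = (value in val)
        else:
            matched = re.match(value, val) is not None
        return matched ^ invert
    return predicate

not_404 = make_filter('!status:404')
print(not_404({'status': '200'}))  # True
print(not_404({'status': '404'}))  # False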
Example 108
def flattenList(l):
    if isinstance(l, list):
        return sum(map(flattenList, l))
    else:
        return l
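Note that, with numeric leaves, the function above recursively sums the values rather than returning a flat list; a flattening variant (my sketch, not the original project's code) seeds sum with an empty list so the per-element results are concatenated:

def flatten(l):
    if isinstance(l, list):
        # sum(..., []) concatenates the sub-results instead of adding numbers
        return sum(map(flatten, l), [])
    return [l]

print(flatten([1, [2, [3, 4]], 5]))  # [1, 2, 3, 4, 5]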
Example 109
Project: custodian Source File: handlers.py
def check(self):
if self.wall_time:
run_time = datetime.datetime.now() - self.start_time
total_secs = run_time.total_seconds()
if not self.electronic_step_stop:
try:
# Intelligently determine time per ionic step.
o = Oszicar("OSZICAR")
nsteps = len(o.ionic_steps)
time_per_step = total_secs / nsteps
except Exception:
time_per_step = 0
else:
try:
# Intelligently determine approximate time per electronic
# step.
o = Oszicar("OSZICAR")
if len(o.ionic_steps) == 0:
nsteps = 0
else:
nsteps = sum(map(len, o.electronic_steps))
if nsteps > self.prev_check_nscf_steps:
steps_time = datetime.datetime.now() - \
self.prev_check_time
steps_secs = steps_time.total_seconds()
step_timing = self.buffer_time * ceil(
(steps_secs /
(nsteps - self.prev_check_nscf_steps)) /
self.buffer_time)
self.electronic_steps_timings.append(step_timing)
self.prev_check_nscf_steps = nsteps
self.prev_check_time = datetime.datetime.now()
time_per_step = max(self.electronic_steps_timings)
except Exception as ex:
time_per_step = 0
# If the remaining time is less than average time for 3 ionic
# steps or buffer_time.
time_left = self.wall_time - total_secs
if time_left < max(time_per_step * 3, self.buffer_time):
return True
return False
Example 110
Project: debsources Source File: updater.py
def parse_stages(stages):
    return set(map(parse_stage, stages.split()))
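A minimal sketch of the idiom: map a parser over whitespace-separated tokens and collect the unique results. The parse_stage stand-in and stage table below are hypothetical, not debsources' real implementation:

from six.moves import map

STAGES = {'extract': 1, 'suites': 2, 'ctags': 3}  # hypothetical stage table

def parse_stage(name):
    return STAGES[name]

def parse_stages(stages):
    return set(map(parse_stage, stages.split()))

print(parse_stages('extract ctags extract'))  # {1, 3}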
Example 111
def run(args):
# add lineage profiles/stats
import re
from .. import PhyloTree, NCBITaxa
# dump tree by default
if not args.tree and not args.info and not args.descendants:
args.tree = True
ncbi = NCBITaxa(args.dbfile, args.taxdumpfile)
if args.create:
sys.exit(0)
all_taxids = {}
all_names = set()
queries = []
if not args.search:
log.error('Search terms should be provided (i.e. --search) ')
sys.exit(-1)
for n in args.search:
queries.append(n)
try:
all_taxids[int(n)] = None
except ValueError:
all_names.add(n.strip())
# translate names
name2tax = ncbi.get_name_translator(all_names)
for tids in name2tax.values():
for tid in tids:
all_taxids[tid] = None
not_found_names = all_names - set(name2tax.keys())
if args.fuzzy and not_found_names:
log.warn("%s unknown names", len(not_found_names))
for name in not_found_names:
# enable extension loading
tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
if tax:
all_taxids[tax] = None
name2tax[name] = [tax]
name2realname[name] = realname
name2score[name] = "Fuzzy:%0.2f" %sim
if not_found_names:
log.warn("[%s] could not be translated into taxids!" %','.join(not_found_names))
if args.tree:
if len(all_taxids) == 1:
target_taxid = list(all_taxids.keys())[0]
log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
t = ncbi.get_descendant_taxa(target_taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit, return_tree=True)
else:
log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
t = ncbi.get_topology(list(all_taxids.keys()),
intermediate_nodes=args.full_lineage,
rank_limit=args.rank_limit,
collapse_subspecies=args.collapse_subspecies)
id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
for n in t.traverse():
n.add_features(taxid=n.name)
n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
lineage = ncbi.get_lineage(n.taxid)
n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
"collapse_subspecies", "named_lineage"])
elif args.descendants:
log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
translator = ncbi.get_taxid_translator(all_taxids)
ranks = ncbi.get_rank(all_taxids)
for taxid in all_taxids:
descendants = ncbi.get_descendant_taxa(taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit)
print('\t'.join([str(taxid), translator.get(taxid, taxid), ranks.get(taxid, ''),
'|'.join(map(str, descendants)),
'|'.join(map(str, ncbi.translate_to_names(descendants)))]))
elif args.info:
print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
translator = ncbi.get_taxid_translator(all_taxids)
ranks = ncbi.get_rank(all_taxids)
for taxid, name in six.iteritems(translator):
lineage = ncbi.get_lineage(taxid)
named_lineage = ','.join(ncbi.translate_to_names(lineage))
lineage_string = ','.join(map(str, lineage))
print('\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string]))
Example 112
Project: ete Source File: codemlparser.py
def parse_paml (pamout, model):
'''
parser function for codeml files,
with values of w, dN, dS etc., depending on the model
tested.
'''
# if there are multiple datasets in the same file, split the outfile into model.name+x files
if not '*' in str (model.properties['params']['ndata']):
divide_data (pamout, model)
return
all_lines = open (pamout).readlines()
# if we do not have tree, load it
if model._tree is None:
from ..evol import EvolTree
model._tree = EvolTree (re.findall ('\(.*\);', ''.join(all_lines))[2])
model._tree._label_as_paml()
# starts parsing
for i, line in enumerate (all_lines):
if line == '\n':
continue
# codon frequency
if line.startswith('Codon frequencies under model'):
model.stats ['codonFreq'] = []
for j in range (16):
line = list(map (float, re.findall ('\d\.\d+', all_lines [i+j+1])))
model.stats ['codonFreq'] += [line]
continue
if line.startswith('Nei & Gojobori 1986'):
model.stats ['codonFreq'] = []
if 'codonFreq' not in model.stats:
continue
######################
# start serious stuff
line = line.rstrip()
# lnL and number of parameters
if line.startswith ('lnL'):
try:
line = re.sub ('.* np: *(\d+)\): +(-\d+\.\d+).*',
'\\1 \\2', line)
model.stats ['np' ] = int (line.split()[0])
model.stats ['lnL'] = float (line.split()[1])
except ValueError:
line = re.sub ('.* np: *(\d+)\): +(nan).*',
'\\1 \\2', line)
model.stats ['np' ] = int (line.split()[0])
model.stats ['lnL'] = float ('-inf')
continue
# get labels of internal branches
if line.count('..') >= 2:
labels = re.findall ('\d+\.\.\d+', line + ' ')
_check_paml_labels (model._tree, labels, pamout, model)
continue
# retrieve kappa
if line.startswith ('kappa '):
try:
model.stats ['kappa'] = float (re.sub ('.*(\d+\.\d+).*',
'\\1', line))
except ValueError:
model.stats ['kappa'] = 'nan'
# retrieve dS dN t w N S and if present, errors. from summary table
if line.count('..') == 1 and line.startswith (' '):
if not re.match (' +\d+\.\.\d+ +\d+\.\d+ ', line):
if re.match (' +( +\d+\.\d+){8}', all_lines [i+1]):
_get_values (model, line.split ()[0]+' '+all_lines [i+1])
continue
_get_values (model, line)
continue
Example 113
Project: dit Source File: distconst.py
def insert_rvf(d, func, index=-1):
"""
Returns a new distribution with an added random variable at index `index`.
The new random variable must be a function of the other random variables.
By this, we mean that the entropy of the new random variable conditioned
on the original random variables should be zero.
Parameters
----------
dist : Distribution
The distribution used to construct the new distribution.
func : callable | list of callable
A function which takes a single argument---the value of the previous
random variables---and returns a new random variable. Note, the return
value will be added to the outcome using `__add__`, and so it should be
a hashable, orderable sequence (as every outcome must be). If a list of
callables is provided, then multiple random variables are added
simultaneously and will appear in the same order as the list.
index : int
The index at which to insert the random variable. A value of -1
will append the random variable to the end.
Returns
-------
d : Distribution
The new distribution.
Examples
--------
>>> d = dit.Distribution(['00', '01', '10', '11'], [1/4]*4)
>>> def xor(outcome):
... return str(int(outcome[0] != outcome[1]))
...
>>> d2 = dit.insert_rvf(d, xor)
>>> d2.outcomes
('000', '011', '101', '110')
"""
try:
func[0]
except TypeError:
funcs = [func]
else:
funcs = func
partial_outcomes = [map(func, d.outcomes) for func in funcs]
# Now "flatten" the new contributions.
partial_outcomes = [d._outcome_ctor([o for o_list in outcome for o in o_list])
for outcome in zip(*partial_outcomes)]
new_outcomes = zip(d.outcomes, partial_outcomes)
if index == -1:
outcomes = [old + new for old, new in new_outcomes]
else:
outcomes = [old[:index] + new + old[index:] for old, new in new_outcomes]
d2 = Distribution(outcomes, d.pmf.copy(), base=d.get_base())
return d2
Example 114
Project: ete Source File: cog_creator.py
def brh_cogs2(DB, species, missing_factor=0.0, seed_sp=None, min_score=0):
"""It scans all precalculate BRH relationships among the species
passed as an argument, and detects Clusters of Orthologs
according to several criteria:
min_score: the min coverage/overlap value required for a
blast to be a reliable hit.
missing_factor: the min percentage of species in which a
given seq must have orthologs.
"""
def _sort_cogs(cogs1, cogs2):
seed1, mx1, avg1, ncogs1 = cogs1
seed2, mx2, avg2, ncogs2 = cogs2
for i, j in ((mx1, mx2), (avg1, avg2), (ncogs1, ncogs2)):
v = -1 * cmp(i, j)
if v != 0:
break
return v
log.log(26, "Searching BRH orthologs")
species = set(map(str, species))
min_species = len(species) - round(missing_factor * len(species))
if seed_sp == "auto":
sp_to_test = list(species)
elif seed_sp == "largest":
cmd = """SELECT taxid, size FROM species"""
db.seqcursor.execute(cmd)
sp2size = {}
for tax, counter in db.seqcursor.fetchall():
if tax in species:
sp2size[tax] = counter
sorted_sp = sorted(list(sp2size.items()), lambda x,y: cmp(x[1],y[1]))
log.log(24, sorted_sp[:6])
largest_sp = sorted_sp[-1][0]
sp_to_test = [largest_sp]
log.log(28, "Using %s as search seed. Proteome size=%s genes" %\
(largest_sp, sp2size[largest_sp]))
else:
sp_to_test = [str(seed_sp)]
analysis_txt = StringIO()
if sp_to_test:
log.log(26, "Finding best COG selection...")
seed2size = get_sorted_seeds(seed_sp, species, sp_to_test, min_species, DB)
size_analysis = []
for seedname, content in six.iteritems(seed2size):
cog_sizes = [size for seq, size in content]
mx, avg = _max(cog_sizes), round(_mean(cog_sizes))
size_analysis.append([seedname, mx, avg, len(content)])
size_analysis.sort(_sort_cogs)
#print '\n'.join(map(str, size_analysis))
seed = size_analysis[0][0]
print_as_table(size_analysis[:25], stdout=analysis_txt,
header=["Seed","largest COG", "avg COG size", "total COGs"])
if size_analysis[0][1] < len(species)-1:
print(size_analysis[0][1])
raise ValueError("Current COG selection parameters do not permit to cover all species")
log.log(28, analysis_txt.getvalue())
# The following loop tests each possible seed if none is
# specified.
log.log(28, "Computing Clusters of Orthologs groups (COGs)")
log.log(28, "Min number of species per COG: %d" %min_species)
cogs_selection = []
log.log(26,"Using seed species:%s", seed)
species_side1 = ','.join(map(quote, [s for s in species if str(s)>str(seed)]))
species_side2 = ','.join(map(quote, [s for s in species if str(s)<str(seed)]))
pairs1 = []
pairs2 = []
# Select all ids with matches in the target species, and
# return the total number of species covered by each of
# such ids.
if species_side1 != "":
cmd = """SELECT seqid1, taxid1, seqid2, taxid2 from ortho_pair WHERE
taxid1="%s" AND taxid2 IN (%s) """ % (seed, species_side1)
DB.orthocursor.execute(cmd)
pairs1 = DB.orthocursor.fetchall()
if species_side2 != "":
cmd = """SELECT seqid2, taxid2, seqid1, taxid1 from ortho_pair WHERE
taxid1 IN (%s) AND taxid2 = "%s" """ % (species_side2, seed)
DB.orthocursor.execute(cmd)
pairs2 = DB.orthocursor.fetchall()
cog_candidates = defaultdict(set)
for seq1, sp1, seq2, sp2 in pairs1 + pairs2:
s1 = (sp1, seq1)
s2 = (sp2, seq2)
cog_candidates[(sp1, seq1)].update([s1, s2])
all_cogs = [cand for cand in list(cog_candidates.values()) if
len(cand) >= min_species]
# CHECK CONSISTENCY
seqs = set()
for cand in all_cogs:
seqs.update([b for a,b in cand if a == seed])
pre_selected_seqs = set([v[0] for v in seed2size[seed]])
if len(seqs & pre_selected_seqs) != len(set(seed2size[seed])) or\
len(seqs & pre_selected_seqs) != len(seqs):
print("old method seqs", len(seqs), "new seqs", len(set(seed2size[seed])), "Common", len(seqs & pre_selected_seqs))
raise ValueError("ooops")
cog_sizes = [len(cog) for cog in all_cogs]
cog_spsizes = [len(set([e[0] for e in cog])) for cog in all_cogs]
if [1 for i in range(len(cog_sizes)) if cog_sizes[i] != cog_spsizes[i]]:
raise ValueError("Inconsistent COG found")
if cog_sizes:
cogs_selection.append([seed, all_cogs])
log.log(26, "Found %d COGs" % len(all_cogs))
recoded_cogs = []
for cog in all_cogs:
named_cog = ["%s%s%s" %(x[0], GLOBALS["spname_delimiter"],x[1]) for x in cog]
recoded_cogs.append(named_cog)
return recoded_cogs, analysis_txt.getvalue()
Example 115
def __str__(self):
    return ''.join(map(_uniquote, self._items))
Example 116
def __str__(self):
    return '{%s}' % ', '.join(
        map(lambda p: '%r: %r' % (p[0], p[1]), self.iterallitems()))
Example 117
Project: dit Source File: helpers.py
def construct_alphabets(outcomes):
"""
Construct minimal alphabets for each random variable.
In the process, it verifies that each outcome is a sequence and that all
outcomes have the same length.
Parameters
----------
outcomes : sequence
A nonempty sequence of outcomes. Each outcome in `outcomes` should
be a sequence---these are the elements which determine the alphabet
for each random variable.
Returns
-------
alphabets : tuple
The constructed alphabet for each random variable.
Examples
--------
>>> construct_alphabets([(0,1), (1,1)])
((0,1), (1,))
Raises
------
ditException
When there are no outcomes.
When not every outcome is a sequence.
When not all outcomes have the same length.
"""
# During validation, each outcome is checked to be of the proper class,
# length, and also a sequence. However, this function is called before
# validation and will result in hard to decipher error messages if we
# don't at least verify that each outcome is a container of the same
# length.
# Make sure outcomes is a sequence
try:
L = len(outcomes)
except TypeError:
raise TypeError('`outcomes` must be a sequence.')
if L == 0:
raise ditException('`outcomes` must not be empty.')
# Make sure each outcome is sized. They really should be sequences,
# but this check is sufficient for now.
try:
lengths = list(map(len, outcomes))
except TypeError:
raise ditException('One or more outcomes is not sized. len() fails.')
else:
outcome_length = lengths[0]
# Make sure each outcome has the same length.
equal_lengths = np.alltrue(np.equal(lengths, outcome_length))
if not equal_lengths:
raise ditException('Not all outcomes have the same length.')
alphabets = _construct_alphabets(outcomes)
return alphabets
Example 118
def process_tokens(self, tokens):
"""process tokens and search for :
_ non strict indentation (i.e. not always using the <indent> parameter as
indent unit)
_ too long lines (i.e. longer than <max_chars>)
_ optionally bad construct (if given, bad_construct must be a compiled
regular expression).
"""
self._bracket_stack = [None]
indents = [0]
check_equal = False
line_num = 0
self._lines = {}
self._visited_lines = {}
token_handlers = self._prepare_token_dispatcher()
self._last_line_ending = None
self._current_line = ContinuedLineState(tokens, self.config)
for idx, (tok_type, token, start, _, line) in enumerate(tokens):
if start[0] != line_num:
line_num = start[0]
# A tokenizer oddity: if an indented line contains a multi-line
# docstring, the line member of the INDENT token does not contain
# the full line; therefore we check the next token on the line.
if tok_type == tokenize.INDENT:
self.new_line(TokenWrapper(tokens), idx-1, idx+1)
else:
self.new_line(TokenWrapper(tokens), idx-1, idx)
if tok_type == tokenize.NEWLINE:
# a program statement, or ENDMARKER, will eventually follow,
# after some (possibly empty) run of tokens of the form
# (NL | COMMENT)* (INDENT | DEDENT+)?
# If an INDENT appears, setting check_equal is wrong, and will
# be undone when we see the INDENT.
check_equal = True
self._process_retained_warnings(TokenWrapper(tokens), idx)
self._current_line.next_logical_line()
self._check_line_ending(token, line_num)
elif tok_type == tokenize.INDENT:
check_equal = False
self.check_indent_level(token, indents[-1]+1, line_num)
indents.append(indents[-1]+1)
elif tok_type == tokenize.DEDENT:
# there's nothing we need to check here! what's important is
# that when the run of DEDENTs ends, the indentation of the
# program statement (or ENDMARKER) that triggered the run is
# equal to what's left at the top of the indents stack
check_equal = True
if len(indents) > 1:
del indents[-1]
elif tok_type == tokenize.NL:
self._check_continued_indentation(TokenWrapper(tokens), idx+1)
self._current_line.next_physical_line()
elif tok_type != tokenize.COMMENT:
self._current_line.handle_line_start(idx)
# This is the first concrete token following a NEWLINE, so it
# must be the first token of the next program statement, or an
# ENDMARKER; the "line" argument exposes the leading whitespace
# for this statement; in the case of ENDMARKER, line is an empty
# string, so will properly match the empty string with which the
# "indents" stack was seeded
if check_equal:
check_equal = False
self.check_indent_level(line, indents[-1], line_num)
if tok_type == tokenize.NUMBER and token.endswith('l'):
self.add_message('lowercase-l-suffix', line=line_num)
try:
handler = token_handlers[token]
except KeyError:
pass
else:
handler(tokens, idx)
line_num -= 1 # to be ok with "wc -l"
if line_num > self.config.max_module_lines:
# Get the line where the too-many-lines (or its message id)
# was disabled or default to 1.
symbol = self.linter.msgs_store.check_message_id('too-many-lines')
names = (symbol.msgid, 'too-many-lines')
line = next(filter(None,
map(self.linter._pragma_lineno.get, names)), 1)
self.add_message('too-many-lines',
args=(line_num, self.config.max_module_lines),
line=line)
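The last few lines use a compact idiom: map a dict lookup over several candidate names and take the first non-None hit, falling back to a default. A standalone sketch with an invented lookup table:

from six.moves import filter, map

pragma_lineno = {'too-many-lines': 12}   # hypothetical message-id -> line map
names = ('C0302', 'too-many-lines')

line = next(filter(None, map(pragma_lineno.get, names)), 1)
print(line)  # 12 -- the first name with a recorded line, else the default 1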
Example 119
Project: box-python-sdk Source File: test_group.py
@pytest.fixture()
def mock_membership_responses(mock_membership_dict_stream):
"""
Returns a generator method that takes params: total, page_size.
The generator generates a sequence of 'group membership' mock_box_responses each containing page_size
items, until 'total' entries have been returned
"""
# pylint:disable=redefined-outer-name
def number_entries_per_response(total, page_size, hidden_in_batch):
if not hidden_in_batch:
hidden_in_batch = repeat(0)
quotient, remainder = divmod(total, page_size)
max_items_in_batch = chain(repeat(page_size, quotient), (remainder,))
return map(sub, max_items_in_batch, hidden_in_batch)
def take(iterable, number):
return list(islice(iterable, number))
def membership_responses(total, page_size, hidden_in_batch=None):
offset = 0
for number_entries in number_entries_per_response(total, page_size, hidden_in_batch):
entries = take(mock_membership_dict_stream, number_entries)
mock_box_response = Mock(BoxResponse)
mock_network_response = Mock(DefaultNetworkResponse)
mock_box_response.network_response = mock_network_response
mock_box_response.json.return_value = {
'entries': entries,
'total_count': total,
'offset': offset,
'limit': page_size,
}
offset += number_entries
mock_box_response.status_code = 200
mock_box_response.ok = True
yield mock_box_response
return membership_responses
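The number_entries_per_response helper leans on map accepting several iterables and applying the function to aligned elements (operator.sub here). A standalone sketch of just that part, with made-up totals:

from itertools import chain, repeat
from operator import sub
from six.moves import map

def number_entries_per_response(total, page_size, hidden_in_batch=None):
    # full pages plus a final remainder page, minus entries hidden per batch
    if not hidden_in_batch:
        hidden_in_batch = repeat(0)
    quotient, remainder = divmod(total, page_size)
    max_items_in_batch = chain(repeat(page_size, quotient), (remainder,))
    return map(sub, max_items_in_batch, hidden_in_batch)

print(list(number_entries_per_response(25, 10)))             # [10, 10, 5]
print(list(number_entries_per_response(25, 10, [1, 0, 2])))  # [9, 10, 3]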
Example 120
Project: PyEMMA Source File: patches.py
def _random_access_generator(self, f):
with f:
curr_size = 0
coords = []
leftovers = []
chunksize = self._chunksize
if chunksize == 0:
chunksize = np.iinfo(int).max
for k, g in groupby(enumerate(self._stride), lambda a: a[0] - a[1]):
grouped_stride = list(map(itemgetter(1), g))
seek_to = grouped_stride[0] - f.tell()
f.seek(seek_to, whence=1)
group_size = len(grouped_stride)
if curr_size + group_size > chunksize:
leftovers = grouped_stride
else:
local_traj_data = _read_traj_data(self._atom_indices, f, group_size, **self._kwargs)
coords.append(local_traj_data)
curr_size += len(grouped_stride)
if curr_size == chunksize:
yield _join_traj_data(coords, self._topology)
chunksize = self._chunksize
curr_size = 0
coords = []
while leftovers:
local_chunk = leftovers[:min(chunksize, len(leftovers))]
local_traj_data = _read_traj_data(self._atom_indices, f, len(local_chunk), **self._kwargs)
coords.append(local_traj_data)
leftovers = leftovers[min(chunksize, len(leftovers)):]
curr_size += len(local_chunk)
if curr_size == chunksize:
yield _join_traj_data(coords, self._topology)
curr_size = 0
coords = []
if coords:
yield _join_traj_data(coords, self._topology)
raise StopIteration("delivered all RA indices")
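The groupby key lambda a: a[0] - a[1] groups consecutive stride indices into runs (the difference between position and value is constant within a run), and map(itemgetter(1), g) then strips the enumerate counters. A standalone sketch:

from itertools import groupby
from operator import itemgetter
from six.moves import map

stride = [0, 1, 2, 7, 8, 20]
runs = [list(map(itemgetter(1), g))
        for _, g in groupby(enumerate(stride), lambda a: a[0] - a[1])]
print(runs)  # [[0, 1, 2], [7, 8], [20]]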
Example 121
Project: skll Source File: writers.py
def _write_header(self, feature_set, output_file, filter_features):
"""
Called before lines are written to file, so that headers can be written
for files that need them.
:param feature_set: The FeatureSet being written to a file.
:type feature_set: FeatureSet
:param output_file: The file being written to.
:type output_file: file
:param filter_features: If only writing a subset of the features in the
FeatureSet to ``output_file``, these are the
features to include in this file.
:type filter_features: set of str
"""
fieldnames = self._get_fieldnames(filter_features)
if self.label_col in fieldnames:
fieldnames.remove(self.label_col)
# Add relation to header
print("@relation '{}'\n".format(self.relation), file=output_file)
# Loop through fields writing the header info for the ARFF file
for field in fieldnames:
print("@attribute '{}' numeric".format(field.replace('\\', '\\\\')
.replace("'", "\\'")),
file=output_file)
# Print class label header if necessary
if self.regression:
print("@attribute {} numeric".format(self.label_col),
file=output_file)
else:
if self.feat_set.has_labels:
print("@attribute {} ".format(self.label_col) +
"{" + ','.join(map(str,
sorted(set(self.feat_set.labels)))) +
"}", file=output_file)
fieldnames.append(self.label_col)
# Create CSV writer to handle missing values for lines in data section
# and to ignore the instance values for non-numeric attributes
self._dict_writer = DictWriter(output_file, fieldnames, restval=0,
extrasaction='ignore', dialect='arff')
# Finish header and start data section
print("\n@data", file=output_file)
Example 122
def serialize(self, data):
    if isinstance(data, (list, AttrList)):
        return list(map(self._serialize, data))
    return self._serialize(data)
Example 123
def each(self, *funcs):
"""
Call `func` on each element in the collection.
If multiple functions are provided, each item
in the output will be a tuple of each
func(item) in self.
Returns a new Collection.
Example:
>>> col = Collection([Scalar(1), Scalar(2)])
>>> col.each(Q * 10)
Collection([Scalar(10), Scalar(20)])
>>> col.each(Q * 10, Q - 1)
Collection([Scalar((10, 0)), Scalar((20, 1))])
"""
funcs = list(map(_make_callable, funcs))
if len(funcs) == 1:
return Collection(map(funcs[0], self._items))
tupler = lambda item: Scalar(
tuple(_unwrap(func(item)) for func in funcs))
return Collection(map(tupler, self._items))
Example 124
def to_dict(self):
"""
Serialize the DSL object to plain dict
"""
d = {}
for pname, value in iteritems(self._params):
pinfo = self._param_defs.get(pname)
# typed param
if pinfo and 'type' in pinfo:
# don't serialize empty lists and dicts for typed fields
if value in ({}, []):
continue
# multi-values are serialized as list of dicts
if pinfo.get('multi'):
value = list(map(lambda x: x.to_dict(), value))
# squash all the hash values into one dict
elif pinfo.get('hash'):
value = dict((k, v.to_dict()) for k, v in iteritems(value))
# serialize single values
else:
value = value.to_dict()
# serialize anything with to_dict method
elif hasattr(value, 'to_dict'):
value = value.to_dict()
d[pname] = value
return {self.name: d}
Example 125
Project: dit Source File: lattice.py
def induced_sigalg(dist, rvs, rv_mode=None):
"""
Returns the induced sigma-algebra of the random variable defined by `rvs`.
Parameters
----------
dist : Distribution
The distribution which defines the base sigma-algebra.
rvs : list
The indexes of the random variable used to calculate the induced
sigma algebra.
rv_mode : str, None
Specifies how to interpret the elements of `rvs`. Valid options are:
{'indices', 'names'}. If equal to 'indices', then the elements of
`rvs` are interpreted as random variable indices. If equal to 'names',
the the elements are interpreted as random variable names. If `None`,
then the value of `dist._rv_mode` is consulted.
Returns
-------
F : frozenset of frozensets
The induced sigma-algebra.
"""
# This is brute force and ugly.
#
# Implementation:
# 1) Find induced atoms from atoms of new sigma-algebra:
# X^{-1}(A) = { w : X(w) \in A }
# where A = \{a\} and a is a nonzero outcome in the marginal.
# 2) Generate sigma algebra from induced atoms.
#
# Step 2 may not be necessary.
#
indexes = parse_rvs(dist, rvs, rv_mode=rv_mode, unique=True, sort=True)[1]
# This creates a mapping from new outcomes (defined by rvs) to the
# original outcomes which map to those new outcomes. This defines a
# partition of the original outcomes.
d = defaultdict(list)
ctor = dist._outcome_ctor
for outcome, _ in dist.zipped(mode='atoms'):
# Build a list of inner outcomes. "c" stands for "constructed".
# We need to iterate over all atoms, not just those in pmf since
# we are trying to partition the sample space.
c_outcome = ctor([outcome[i] for i in indexes])
d[c_outcome].append(outcome)
atoms = frozenset(map(frozenset, d.values()))
F = sigma_algebra(atoms)
return F
Example 126
Project: ete Source File: ete_compare.py
def run(args):
from .. import Tree
from ..utils import print_table
def iter_differences(set1, set2, unrooted=False):
for s1 in set1:
pairs = []
for r1 in set2:
if unrooted:
d = euc_dist_unrooted(s1, r1)
else:
d = euc_dist(s1, r1)
if d < 1:
pairs.append((d,r1))
yield s1, pairs
col_sizes = [15, 15] + [7] * 8
header = ['source', 'ref', 'E.size', 'nRF',
'RF', 'maxRF', "src-branches",
"ref-branches", "subtrees", "treekoD" ]
if args.taboutput:
print('# ' + '\t'.join(header))
elif args.show_mismatches or args.show_matches:
pass
else:
print_table([header,
["=========================="] * 10],
fix_col_width=col_sizes, wrap_style="cut")
if args.treeko:
from .. import PhyloTree
tree_class = PhyloTree
else:
tree_class = Tree
for stree_name in src_tree_iterator(args):
stree = tree_class(stree_name, format=args.src_newick_format)
# Parses attrs if necessary
src_tree_attr = args.src_tree_attr
if args.src_attr_parser:
for leaf in stree:
leaf.add_feature('tempattr', re.search(
args.src_attr_parser, getattr(leaf, args.src_tree_attr)).groups()[0])
src_tree_attr = 'tempattr'
for rtree_name in ref_tree_iterator(args):
rtree = tree_class(rtree_name, format=args.ref_newick_format)
# Parses attrs if necessary
ref_tree_attr = args.ref_tree_attr
if args.ref_attr_parser:
for leaf in rtree:
leaf.add_feature('tempattr', re.search(
args.ref_attr_parser, getattr(leaf, args.ref_tree_attr)).groups()[0])
ref_tree_attr = 'tempattr'
r = stree.compare(rtree,
ref_tree_attr=ref_tree_attr,
source_tree_attr=src_tree_attr,
min_support_ref=args.min_support_ref,
min_support_source = args.min_support_src,
unrooted=args.unrooted,
has_duplications=args.treeko)
if args.show_mismatches or args.show_matches or args.show_edges:
if args.show_mismatches:
src = r['source_edges'] - r['ref_edges']
ref = r['ref_edges'] - r['source_edges']
elif args.show_matches:
src = r['source_edges'] & r['ref_edges']
ref = r['ref_edges'] & r['source_edges']
elif args.show_edges:
src = r['source_edges']
ref = r['ref_edges']
if args.unrooted:
for tag, part in [("src: %s"%stree_name, src), ("ref: %s"%rtree_name, ref)]:
print("%s\t%s" %(tag, '\t'.join(
map(lambda x: '%s|%s' %(','.join(x[0]), ','.join(x[1])), part))))
else:
for tag, part in [("src: %s"%stree_name, src), ("ref: %s"%rtree_name, ref)]:
print("%s\t%s" %(tag, '\t'.join([','.join(p) for p in part])))
else:
data = [shorten_str(stree_name, 15, reverse=True),
shorten_str(rtree_name, 15, reverse=True),
r['effective_tree_size'],
r['norm_rf'],
r['rf'], r['max_rf'],
r["source_edges_in_ref"],
r["ref_edges_in_source"],
r['source_subtrees'],
r['treeko_dist']]
if r['effective_tree_size'] == 0:
for i in range(3, len(data)):
data[i] = -1
if args.taboutput:
print('\t'.join(map(str, data)))
else:
print_table([list(map(as_str, data))],
fix_col_width = col_sizes, wrap_style='cut')
Example 127
Project: ete Source File: ncbiquery.py
def get_topology(self, taxids, intermediate_nodes=False, rank_limit=None, collapse_subspecies=False, annotate=True):
"""Given a list of taxid numbers, return the minimal pruned NCBI taxonomy tree
containing all of them.
:param False intermediate_nodes: If True, single child nodes
representing the complete lineage of leaf nodes are kept. Otherwise, the
tree is pruned to contain the first common ancestor of each group.
:param None rank_limit: If valid NCBI rank name is provided, the tree is
pruned at that given level. For instance, use rank="species" to get rid
of sub-species or strain leaf nodes.
:param False collapse_subspecies: If True, any item under the species
rank will be collapsed into the species upper node.
"""
from .. import PhyloTree
taxids, merged_conversion = self._translate_merged(taxids)
if len(taxids) == 1:
root_taxid = int(list(taxids)[0])
with open(self.dbfile+".traverse.pkl", "rb") as CACHED_TRAVERSE:
prepostorder = pickle.load(CACHED_TRAVERSE)
descendants = {}
found = 0
nodes = {}
hit = 0
visited = set()
start = prepostorder.index(root_taxid)
try:
end = prepostorder.index(root_taxid, start+1)
subtree = prepostorder[start:end+1]
except ValueError:
# If root taxid is not found in postorder, must be a tip node
subtree = [root_taxid]
leaves = set([v for v, count in Counter(subtree).items() if count == 1])
nodes[root_taxid] = PhyloTree(name=str(root_taxid))
current_parent = nodes[root_taxid]
for tid in subtree:
if tid in visited:
current_parent = nodes[tid].up
else:
visited.add(tid)
nodes[tid] = PhyloTree(name=str(tid))
current_parent.add_child(nodes[tid])
if tid not in leaves:
current_parent = nodes[tid]
root = nodes[root_taxid]
else:
taxids = set(map(int, taxids))
sp2track = {}
elem2node = {}
id2lineage = self.get_lineage_translator(taxids)
all_taxids = set()
for lineage in id2lineage.values():
all_taxids.update(lineage)
id2rank = self.get_rank(all_taxids)
for sp in taxids:
track = []
lineage = id2lineage[sp]
for elem in lineage:
if elem not in elem2node:
node = elem2node.setdefault(elem, PhyloTree())
node.name = str(elem)
node.taxid = elem
node.add_feature("rank", str(id2rank.get(int(elem), "no rank")))
else:
node = elem2node[elem]
track.append(node)
sp2track[sp] = track
# generate parent child relationships
for sp, track in six.iteritems(sp2track):
parent = None
for elem in track:
if parent and elem not in parent.children:
parent.add_child(elem)
if rank_limit and elem.rank == rank_limit:
break
parent = elem
root = elem2node[1]
#remove onechild-nodes
if not intermediate_nodes:
for n in root.get_descendants():
if len(n.children) == 1 and int(n.name) not in taxids:
n.delete(prevent_nondicotomic=False)
if len(root.children) == 1:
tree = root.children[0].detach()
else:
tree = root
if collapse_subspecies:
to_detach = []
for node in tree.traverse():
if node.rank == "species":
to_detach.extend(node.children)
for n in to_detach:
n.detach()
if annotate:
self.annotate_tree(tree)
return tree
Example 128
Project: dit Source File: distconst.py
def uniform_distribution(outcome_length, alphabet_size, base=None):
"""
Returns a uniform distribution.
Parameters
----------
outcome_length : int
The length of the outcomes.
alphabet_size : int, list of lists
The alphabets used to construct the outcomes of the distribution. If an
integer, then the alphabet for each random variable will be the same,
consisting of integers from 0 to k-1 where k is the alphabet size.
If a list, then the elements are used as the alphabet for each random
variable. If the list has a single element, then it will be used
as the alphabet for each random variable.
base : float, 'linear', 'e'
The desired base for the distribution probabilities.
Returns
-------
d : Distribution.
A uniform distribution.
Examples
--------
Each random variable has the same standardized alphabet: [0,1]
>>> d = dit.uniform_distribution(2, 2)
Each random variable has its own alphabet.
>>> d = dit.uniform_distribution(2, [[0,1],[1,2]])
Both random variables have ['H','T'] as an alphabet.
>>> d = dit.uniform_distribution(2, [['H','T']])
"""
try:
int(alphabet_size)
except TypeError:
# Assume it is a list of lists.
alphabet = alphabet_size
# Autoextend if only one alphabet is provided.
if len(alphabet) == 1:
alphabet = [alphabet[0]] * outcome_length
elif len(alphabet) != outcome_length:
raise TypeError("outcome_length does not match number of rvs.")
else:
# Build the standard alphabet.
alphabet = [tuple(range(alphabet_size))] * outcome_length
try:
Z = np.prod(list(map(len, alphabet)))
try:
# for some reason numpypy.prod returns a list, and pypy can't handle
# multiplying a list by a numpy float.
Z = int(Z[0])
except:
pass
except TypeError:
raise TypeError("alphabet_size must be an int or list of lists.")
pmf = [1/Z] * Z
outcomes = tuple(product(*alphabet))
d = Distribution(outcomes, pmf, base='linear')
# Maybe we should use ditParams['base'] when base is None?
if base is not None:
d.set_base(base)
return d
Example 129
Project: ete Source File: phylobuild.py
def main(args):
""" Read and parse all configuration and command line options,
setup global variables and data, and initialize the master task of
all workflows. """
global log
log = logging.getLogger("main")
base_dir = GLOBALS["basedir"]
# -------------------------------------
# READ CONFIG FILE AND PARSE WORKFLOWS
# -------------------------------------
# Load and check config file
if args.custom_config:
concat_config = open(args.base_config).readlines()
concat_config += open(args.custom_config).readlines()
base_config = check_config(concat_config)
else:
base_config = check_config(args.base_config)
# Check for config file overwriting
clearname = os.path.basename(args.base_config)
local_conf_file = pjoin(base_dir, "ete_build.cfg")
if pexist(base_dir):
if hascontent(local_conf_file):
if not args.clearall and not args.resume:
raise ConfigError("Output directory seems to contain"
" data from a previous run."
" Use --clearall to restart the analysis or --resume to continue.")
# Creates a tree splitter config block on the fly. In the future this
# option should be more accessible to users.
base_config['default_tree_splitter'] = {
'_app' : 'treesplitter',
'_max_outgroup_size' : '10%', # dynamic or fixed selection of out seqs.
'_min_outgroup_support' : 0.9, # avoids fixing labile nodes as monophyletic
'_outgroup_topology_dist' : False}
# prepare workflow config dictionaries
workflow_types = defaultdict(list)
TARGET_CLADES = set()
VALID_WORKFLOW_TYPES = set(['genetree', 'supermatrix'])
# extract workflow filters
def parse_workflows(names, target_wtype, parse_filters=False):
parsed_workflows = []
if not names:
return parsed_workflows
for wkname in names:
if parse_filters:
wfilters = {}
fields = [_f.strip() for _f in wkname.split(",")]
if len(fields) == 1:
wkname = fields[0]
else:
wkname = fields[-1]
for f in fields[:-1]:
if f.startswith("size-range:"): # size filter
f = f.replace("size-range:",'')
try:
min_size, max_size = list(map(int, f.split('-')))
if min_size < 0 or min_size > max_size:
raise ValueError
except ValueError:
raise ConfigError('size filter should consist of two integer numbers (i.e. 50-100). Found [%s] instead' %f)
wfilters["max_size"] = max_size
wfilters["min_size"] = min_size
elif f.startswith("seq-sim-range:"):
f = f.replace("seq-sim-range:",'')
try:
min_seq_sim, max_seq_sim = map(float, f.split('-'))
if min_seq_sim > 1 or min_seq_sim < 0:
raise ValueError
if max_seq_sim > 1 or max_seq_sim < 0:
raise ValueError
if min_seq_sim > max_seq_sim:
raise ValueError
except ValueError:
raise ConfigError('sequence similarity filter should consist of two float numbers between 0 and 1 (i.e. 0-0.95). Found [%s] instead' %f)
wfilters["min_seq_sim"] = min_seq_sim
wfilters["max_seq_sim"] = max_seq_sim
else:
raise ConfigError('Unknown workflow filter [%s]' %f)
if target_wtype == "genetree" and wkname in base_config.get('genetree_meta_workflow', {}):
temp_workflows = [x.lstrip('@') for x in base_config['genetree_meta_workflow'][wkname]]
elif target_wtype == "supermatrix" and wkname in base_config.get('supermatrix_meta_workflow', {}):
temp_workflows = [x.lstrip('@') for x in base_config['supermatrix_meta_workflow'][wkname]]
else:
temp_workflows = [wkname]
# if wkname not in base_config and wkname in base_config.get('meta_workflow', {}):
# temp_workflows = [x.lstrip('@') for x in base_config['meta_workflow'][wkname]]
# else:
# temp_workflows = [wkname]
for _w in temp_workflows:
if target_wtype == "genetree":
base_config.update(build_genetree_workflow(_w))
elif target_wtype == "supermatrix":
base_config.update(build_supermatrix_workflow(_w))
parse_block(_w, base_config)
if _w not in base_config:
list_workflows(base_config)
raise ConfigError('[%s] workflow or meta-workflow name is not found in the config file.' %_w)
wtype = base_config[_w]['_app']
if wtype not in VALID_WORKFLOW_TYPES:
raise ConfigError('[%s] is not a valid workflow: %s?' %(_w, wtype))
if wtype != target_wtype:
raise ConfigError('[%s] is not a valid %s workflow' %(wkname, target_wtype))
if parse_filters:
if len(temp_workflows) == 1:
parsed_workflows.extend([(temp_workflows[0], wfilters)])
else:
raise ConfigError('Meta-workflows with multiple threads are not allowed as recursive workflows [%s]' %wkname)
else:
parsed_workflows.extend(temp_workflows)
return parsed_workflows
genetree_workflows = parse_workflows(args.workflow, "genetree")
supermatrix_workflows = parse_workflows(args.supermatrix_workflow, "supermatrix")
# Stop if mixing types of meta-workflows
if supermatrix_workflows and len(genetree_workflows) > 1:
raise ConfigError("A single genetree workflow must be specified when used in combination with super-matrix workflows.")
# Sets master workflow type
if supermatrix_workflows:
WORKFLOW_TYPE = "supermatrix"
master_workflows = supermatrix_workflows
else:
WORKFLOW_TYPE = "genetree"
master_workflows = genetree_workflows
# Parse npr workflows and filters
npr_workflows = []
use_npr = False
if args.npr_workflows is not None:
use_npr = True
npr_workflows = parse_workflows(args.npr_workflows, WORKFLOW_TYPE, parse_filters=True)
# setup workflows and create a separate config dictionary for each of them
run2config = {}
for wkname in master_workflows:
config = dict(base_config)
run2config[wkname] = config
appset = config[config[wkname]['_appset'][1:]]
# Initialize application command-line calls for this workflow
config['app'] = {}
config['threading'] = {}
apps_to_test = {}
for k, (appsrc, cores) in six.iteritems(appset):
cores = int(cores)
if appsrc == "built-in":
#cores = int(config["threading"].get(k, args.maxcores))
cores = min(args.maxcores, cores)
config["threading"][k] = cores
cmd = apps.get_call(k, APPSPATH, base_dir, str(cores))
config["app"][k] = cmd
apps_to_test[k] = cmd
# Copy config file
config["_outpath"] = pjoin(base_dir, wkname)
config["_nodeinfo"] = defaultdict(dict)
try:
os.makedirs(config["_outpath"])
except OSError:
pass
# setup genetree workflow as the processor of concat alignment jobs
if WORKFLOW_TYPE == "supermatrix":
concatenator = config[wkname]["_alg_concatenator"][1:]
config[concatenator]["_workflow"] = '@%s' % genetree_workflows[0]
# setup npr options for master workflows
if use_npr:
config['_npr'] = {
# register the root workflow as the main workflow unless stated otherwise
"wf_type": WORKFLOW_TYPE,
"workflows": npr_workflows if npr_workflows else [(wkname, {})],
'nt_switch_thr': args.nt_switch_thr,
'max_iters': args.max_iters,
}
#config[wkname]['_npr'] = '@'+npr_config
#target_levels = config[npr_config].get('_target_levels', [])
#target_dict = config['_optimized_levels'] = {}
#for tg in target_levels:
# If target level name starts with ~, we allow para and
# poly-phyletic grouping of the species in such level
#strict_monophyly = True
#if tg.startswith("~"):
#tg = target_level.lstrip("~")
#strict_monophyly = False
#tg = tg.lower()
# We add the level as non-optimized
#target_dict[target_level] = [False, strict_monophyly]
#TARGET_CLADES.update(target_levels)
else:
config['_npr'] = {
'nt_switch_thr': args.nt_switch_thr,
}
# dump log config file
with open(local_conf_file, "w") as OUTPUT:
with open(args.base_config) as INPUT:
OUTPUT.write(INPUT.read()) # replace by simple copy?
TARGET_CLADES.discard('')
if WORKFLOW_TYPE == 'genetree':
from .phylobuild_lib.workflow.genetree import pipeline
elif WORKFLOW_TYPE == 'supermatrix':
from .phylobuild_lib.workflow.supermatrix import pipeline
#if args.arch == "auto":
# arch = "64 " if sys.maxsize > 2**32 else "32"
#else:
# arch = args.arch
arch = "64 " if sys.maxsize > 2**32 else "32"
print(__DESCRIPTION__)
# check application binary files
if not args.nochecks:
log.log(28, "Testing x86-%s portable applications..." % arch)
apps.test_apps(apps_to_test)
log.log(28, "Starting ETE-build execution at %s" %(ctime()))
log.log(28, "Output directory %s" %(GLOBALS["output_dir"]))
# -------------------------------------
# PATH CONFIGs
# -------------------------------------
# Set up paths
gallery_dir = os.path.join(base_dir, "gallery")
sge_dir = pjoin(base_dir, "sge_jobs")
tmp_dir = pjoin(base_dir, "tmp")
tasks_dir = os.path.realpath(args.tasks_dir) if args.tasks_dir else pjoin(base_dir, "tasks")
input_dir = pjoin(base_dir, "input")
db_dir = os.path.realpath(args.db_dir) if args.db_dir else pjoin(base_dir, "db")
GLOBALS["db_dir"] = db_dir
GLOBALS["sge_dir"] = sge_dir
GLOBALS["tmp"] = tmp_dir
GLOBALS["gallery_dir"] = gallery_dir
GLOBALS["tasks_dir"] = tasks_dir
GLOBALS["input_dir"] = input_dir
GLOBALS["nprdb_file"] = pjoin(db_dir, "npr.db")
GLOBALS["datadb_file"] = pjoin(db_dir, "data.db")
GLOBALS["seqdb_file"] = pjoin(db_dir, "seq.db") if not args.seqdb else args.seqdb
# Clear databases if necessary
if args.clearall:
log.log(28, "Erasing all existing npr data...")
shutil.rmtree(GLOBALS["tasks_dir"]) if pexist(GLOBALS["tasks_dir"]) else None
shutil.rmtree(GLOBALS["tmp"]) if pexist(GLOBALS["tmp"]) else None
shutil.rmtree(GLOBALS["input_dir"]) if pexist(GLOBALS["input_dir"]) else None
if not args.seqdb:
silent_remove(GLOBALS["seqdb_file"])
silent_remove(GLOBALS["datadb_file"])
silent_remove(pjoin(base_dir, "nprdata.tar"))
silent_remove(pjoin(base_dir, "nprdata.tar.gz"))
#silent_remove(pjoin(base_dir, "npr.log"))
silent_remove(pjoin(base_dir, "npr.log.gz"))
else:
if args.softclear:
log.log(28, "Erasing precomputed data (reusing task directory)")
shutil.rmtree(GLOBALS["tmp"]) if pexist(GLOBALS["tmp"]) else None
shutil.rmtree(GLOBALS["input_dir"]) if pexist(GLOBALS["input_dir"]) else None
os.remove(GLOBALS["datadb_file"]) if pexist(GLOBALS["datadb_file"]) else None
if args.clearseqs and pexist(GLOBALS["seqdb_file"]) and not args.seqdb:
log.log(28, "Erasing existing sequence database...")
os.remove(GLOBALS["seqdb_file"])
if not args.clearall and base_dir != GLOBALS["output_dir"]:
log.log(24, "Copying previous output files to scratch directory: %s..." %base_dir)
try:
shutil.copytree(pjoin(GLOBALS["output_dir"], "db"), db_dir)
except IOError as e:
print(e)
pass
try:
shutil.copytree(pjoin(GLOBALS["output_dir"], "tasks/"), pjoin(base_dir, "tasks/"))
except IOError as e:
try:
shutil.copy(pjoin(GLOBALS["output_dir"], "nprdata.tar.gz"), base_dir)
except IOError as e:
pass
# try: os.system("cp -a %s/* %s/" %(GLOBALS["output_dir"], base_dir))
# except Exception: pass
# UnCompress packed execution data
if pexist(os.path.join(base_dir,"nprdata.tar.gz")):
log.warning("Compressed data found. Extracting content to start execution...")
cmd = "cd %s && gunzip -f nprdata.tar.gz && tar -xf nprdata.tar && rm nprdata.tar" % base_dir
os.system(cmd)
# Create dir structure
for dirname in [tmp_dir, tasks_dir, input_dir, db_dir]:
try:
os.makedirs(dirname)
except OSError:
log.warning("Using existing dir: %s", dirname)
# -------------------------------------
# DATA READING AND CHECKING
# -------------------------------------
# Set number of CPUs available
if WORKFLOW_TYPE == "supermatrix" and not args.cogs_file:
raise ConfigError("Species tree workflow requires a list of COGS"
" to be supplied through the --cogs"
" argument.")
elif WORKFLOW_TYPE == "supermatrix":
GLOBALS["cogs_file"] = os.path.abspath(args.cogs_file)
GLOBALS["seqtypes"] = set()
if args.nt_seed_file:
GLOBALS["seqtypes"].add("nt")
GLOBALS["inputname"] = os.path.split(args.nt_seed_file)[-1]
if args.aa_seed_file:
GLOBALS["seqtypes"].add("aa")
GLOBALS["inputname"] = os.path.split(args.aa_seed_file)[-1]
# Initialize db if necessary, otherwise extract basic info
db.init_nprdb(GLOBALS["nprdb_file"])
db.init_datadb(GLOBALS["datadb_file"])
# Species filter
if args.spfile:
target_species = set([line.strip() for line in open(args.spfile)])
target_species.discard("")
log.log(28, "Enabling %d species", len(target_species))
else:
target_species = None
# Load supermatrix data
if WORKFLOW_TYPE == "supermatrix":
observed_species= set()
target_seqs = set()
for cog_number, seq_cogs in iter_cog_seqs(args.cogs_file, args.spname_delimiter):
for seqname, spcode, seqcode in seq_cogs:
if target_species is None or spcode in target_species:
observed_species.add(spcode)
target_seqs.add(seqname)
if target_species is not None:
if target_species - observed_species:
raise DataError("The following target_species could not be found in COGs file: %s" %(','.join(target_species-observed_species)))
else:
target_species = observed_species
log.warning("COG file restriction: %d sequences from %s species " %(len(target_seqs), len(target_species)))
else:
target_seqs = None
GLOBALS["target_species"] = target_species
# Check and load data
ERROR = ""
if not pexist(GLOBALS["seqdb_file"]):
db.init_seqdb(GLOBALS["seqdb_file"])
seqname2seqid = None
if args.aa_seed_file:
seqname2seqid = seqio.load_sequences(args, "aa", target_seqs, target_species, seqname2seqid)
if not target_seqs:
target_seqs = list(seqname2seqid.keys())
if args.nt_seed_file:
seqname2seqid = seqio.load_sequences(args, "nt", target_seqs, target_species, seqname2seqid)
# Integrity checks?
pass
else:
db.init_seqdb(GLOBALS["seqdb_file"])
log.warning("Reusing sequences from existing database!")
if target_seqs is None:
seqname2seqid = db.get_seq_name_dict()
else:
seqname2seqid = db.get_seq_name_dict()
if target_seqs - set(seqname2seqid.keys()):
raise DataError("The following sequence names in COGs file"
" are not found in current database: %s" %(
','.join(target_seqs - set(seqname2seqid.keys()))))
log.warning("%d target sequences" %len(seqname2seqid))
GLOBALS["target_sequences"] = seqname2seqid.values()
if ERROR:
with open(pjoin(base_dir, "error.log"), "w") as OUTPUT:
OUTPUT.write(' '.join(arguments) + "\n\n" + ERROR)
raise DataError("Errors were found while loading data. Please"
" check error file for details")
# Prepare target taxa levels, if any
if WORKFLOW_TYPE == "supermatrix" and args.lineages_file and TARGET_CLADES:
sp2lin = {}
lin2sp = defaultdict(set)
all_sorted_levels = []
for line in open(args.lineages_file):
sp, lineage = line.split("\t")
sp = sp.strip()
if sp in target_species:
sp2lin[sp] = [x.strip().lower() for x in lineage.split(",")]
for lin in sp2lin[sp]:
if lin not in lin2sp:
all_sorted_levels.append(lin)
lin2sp[lin].add(sp)
# any target species without lineage information?
if target_species - set(sp2lin):
missing = target_species - set(sp2lin)
log.warning("%d species not found in lineages file" %len(missing))
# So, the following levels (with at least 2 species) could be optimized
avail_levels = [(lin, len(lin2sp[lin])) for lin in all_sorted_levels if len(lin2sp[lin])>=2]
log.log(26, "Available levels for NPR optimization:\n%s", '\n'.join(["% 30s (%d spcs)"%x for x in avail_levels]))
avail_levels = set([lv[0] for lv in avail_levels])
GLOBALS["lineages"] = (sp2lin, lin2sp)
# if no lineages file, raise an error
elif WORKFLOW_TYPE == "supermatrix" and TARGET_CLADES:
raise ConfigError("The use of target_levels requires a species lineage file provided through the --lineages option")
# -------------------------------------
# MISC
# -------------------------------------
GLOBALS["_max_cores"] = args.maxcores
log.debug("Enabling %d CPU cores" %args.maxcores)
# how task will be executed
if args.no_execute:
execution = (None, False)
# elif args.sge_execute:
# execution = ("sge", False)
else:
if args.monitor:
execution =("insitu", True) # True is for run-detached flag
else:
execution = ("insitu", False)
# Scheduling starts here
log.log(28, "ETE build starts now!")
# This initialises all pipelines
pending_tasks = []
start_time = ctime()
for wkname, config in six.iteritems(run2config):
# Feeds pending task with the first task of the workflow
config["_name"] = wkname
new_tasks = pipeline(None, wkname, config)
if not new_tasks:
continue # skips pipelines not fitting workflow filters
thread_id = new_tasks[0].threadid
config["_configid"] = thread_id
GLOBALS[thread_id] = config
pending_tasks.extend(new_tasks)
# Clear info from previous runs
open(os.path.join(config["_outpath"], "runid"), "a").write('\t'.join([thread_id, GLOBALS["nprdb_file"]+"\n"]))
# Write command line info
cmd_info = '\t'.join([start_time, thread_id, str(args.monitor), GLOBALS["cmdline"]])
open(pjoin(config["_outpath"], "command_lines"), "a").write(cmd_info+"\n")
thread_errors = schedule(pipeline, pending_tasks, args.schedule_time,
execution, args.debug, args.noimg)
db.close()
if not thread_errors:
if GLOBALS.get('_background_scheduler', None):
GLOBALS['_background_scheduler'].terminate()
if args.compress:
log.log(28, "Compressing intermediate data...")
cmd = "cd %s && tar --remove-files -cf nprdata.tar tasks/ && gzip -f nprdata.tar; if [ -e npr.log ]; then gzip -f npr.log; fi;" %\
GLOBALS["basedir"]
os.system(cmd)
log.log(28, "Deleting temporal data...")
cmd = "cd %s && rm -rf tmp/" %GLOBALS["basedir"]
os.system(cmd)
cmd = "cd %s && rm -rf input/" %GLOBALS["basedir"]
os.system(cmd)
GLOBALS["citator"].show()
else:
raise DataError("Errors found in some tasks")
0
Example 130
Project: soupy Source File: soupy.py
def _wrap_multi(self, func):
vals = func(self._value)
return Collection(map(Node, vals))
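Here map applies a wrapper class to every value before collecting the results. A self-contained sketch of the same idea, with Node and the collection replaced by simple stand-ins (the names below are illustrative only):
from six.moves import map

class Node(object):
    def __init__(self, value):
        self.value = value

def wrap_multi(func, value):
    # apply func, then wrap every result in a Node, mirroring _wrap_multi above
    return list(map(Node, func(value)))

nodes = wrap_multi(lambda v: v.split(','), 'a,b,c')
print([n.value for n in nodes])   # ['a', 'b', 'c']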
0
Example 131
Project: ete Source File: cog_creator.py
def brh_cogs(DB, species, missing_factor=0.0, seed_sp=None, min_score=0):
"""It scans all precalculate BRH relationships among the species
passed as an argument, and detects Clusters of Orthologs
according to several criteria:
min_score: the min coverage/overalp value required for a
blast to be a reliable hit.
missing_factor: the min percentage of species in which a
given seq must have orthologs.
"""
log.log(26, "Searching BRH orthologs")
species = set(map(str, species))
min_species = len(species) - round(missing_factor * len(species))
if seed_sp == "auto":
# seed2size = get_sorted_seeds(seed_sp, species, species, min_species, DB)
# sort_seeds = sorted([(len(size), sp) for sp, size in seed2size.iteritems()])
# sp_to_test = [sort_seeds[-1][1]]
sp_to_test = list(species)
elif seed_sp == "largest":
cmd = """SELECT taxid, size FROM species"""
db.seqcursor.execute(cmd)
sp2size = {}
for tax, counter in db.seqcursor.fetchall():
if tax in species:
sp2size[tax] = counter
sorted_sp = sorted(sp2size.items(), key=lambda x: x[1])
log.log(24, sorted_sp[:6])
largest_sp = sorted_sp[-1][0]
sp_to_test = [largest_sp]
log.log(28, "Using %s as search seed. Proteome size=%s genes" %\
(largest_sp, sp2size[largest_sp]))
else:
sp_to_test = [str(seed_sp)]
# The following loop tests each possible seed if none is
# specified.
log.log(28, "Detecting Clusters of Orthologs groups (COGs)")
log.log(28, "Min number of species per COG: %d" %min_species)
cogs_selection = []
for j, seed in enumerate(sp_to_test):
log.log(26,"Testing new seed species:%s (%d/%d)", seed, j+1, len(sp_to_test))
species_side1 = ','.join(map(quote, [s for s in species if str(s)>str(seed)]))
species_side2 = ','.join(map(quote, [s for s in species if str(s)<str(seed)]))
pairs1 = []
pairs2 = []
# Select all ids with matches in the target species, and
# return the total number of species covered by each of
# such ids.
if species_side1 != "":
cmd = """SELECT seqid1, taxid1, seqid2, taxid2 from ortho_pair WHERE
taxid1="%s" AND taxid2 IN (%s) """ %\
(seed, species_side1)
DB.orthocursor.execute(cmd)
pairs1 = DB.orthocursor.fetchall()
if species_side2 != "":
cmd = """SELECT seqid2, taxid2, seqid1, taxid1 from ortho_pair WHERE
taxid1 IN (%s) AND taxid2 = "%s" """ %\
(species_side2, seed)
#taxid2="%s" AND taxid1 IN (%s) AND score >= %s""" %\
#(seed, species_side2, min_score)
DB.orthocursor.execute(cmd)
pairs2 = DB.orthocursor.fetchall()
cog_candidates = defaultdict(set)
for seq1, sp1, seq2, sp2 in pairs1 + pairs2:
s1 = (sp1, seq1)
s2 = (sp2, seq2)
cog_candidates[(sp1, seq1)].update([s1, s2])
all_cogs = [cand for cand in list(cog_candidates.values()) if
len(cand) >= min_species]
cog_sizes = [len(cog) for cog in all_cogs]
cog_spsizes = [len(set([e[0] for e in cog])) for cog in all_cogs]
if [1 for i in range(len(cog_sizes)) if cog_sizes[i] != cog_spsizes[i]]:
# for i in xrange(len(cog_sizes)):
# if cog_sizes[i] != cog_spsizes[i]:
# print cog_sizes[i], cog_spsizes[i]
# raw_input()
raise ValueError("Inconsistent COG found")
if cog_sizes:
cogs_selection.append([seed, all_cogs])
log.log(26, "Found %d COGs" % len(all_cogs))
def _sort_cogs(cogs1, cogs2):
cogs1 = cogs1[1] # discard seed info
cogs2 = cogs2[1] # discard seed info
cog_sizes1 = [len(cog) for cog in cogs1]
cog_sizes2 = [len(cog) for cog in cogs2]
mx1, mn1, avg1 = _max(cog_sizes1), _min(cog_sizes1), round(_mean(cog_sizes1))
mx2, mn2, avg2 = _max(cog_sizes2), _min(cog_sizes2), round(_mean(cog_sizes2))
# we want to maximize all these values in the following order:
for i, j in ((mx1, mx2), (avg1, avg2), (len(cogs1), len(cogs2))):
v = -1 * cmp(i, j)
if v != 0:
break
return v
log.log(26, "Finding best COG selection...")
cogs_selection.sort(_sort_cogs)
lines = []
for seed, all_cogs in cogs_selection:
cog_sizes = [len(cog) for cog in all_cogs]
mx, mn, avg = max(cog_sizes), min(cog_sizes), round(_mean(cog_sizes))
lines.append([seed, mx, mn, avg, len(all_cogs)])
analysis_txt = StringIO()
print_as_table(lines[:25], stdout=analysis_txt,
header=["Seed","largest COG", "smallest COGs", "avg COG size", "total COGs"])
log.log(28, "Analysis details:\n"+analysis_txt.getvalue())
best_seed, best_cogs = cogs_selection[0]
cog_sizes = [len(cog) for cog in best_cogs]
# Not necessary since they will be sorted differently later on
#best_cogs.sort(lambda x,y: cmp(len(x), len(y)), reverse=True)
if max(cog_sizes) < len(species):
raise ValueError("Current COG selection parameters do not permit to cover all species")
recoded_cogs = []
for cog in best_cogs:
named_cog = ["%s%s%s" %(x[0], GLOBALS["spname_delimiter"],x[1]) for x in cog]
recoded_cogs.append(named_cog)
return recoded_cogs, analysis_txt.getvalue()
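Two map calls do the normalisation here: set(map(str, species)) coerces taxids to strings, and map(quote, ...) quotes them for the SQL IN clause. A small sketch of that quoting step, assuming a trivial quote helper (the real one lives elsewhere in the project):
from six.moves import map

def quote(value):
    # assumed helper: wrap a taxid in double quotes for the SQL text
    return '"%s"' % value

species = set(map(str, [9606, 10090, 7227]))
seed = '10090'
species_side1 = ','.join(map(quote, [s for s in species if s > seed]))
print(species_side1)   # e.g. "7227","9606" (set order may vary)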
0
Example 132
Project: dit Source File: distconst.py
def from_mapping(self, mapping, force=True):
"""
Returns a callable implementing a random variable via a mapping.
Parameters
----------
mapping : dict
A mapping from outcomes to values of the new random variable.
force : bool
Ideally, the values of `mapping` should satisfy the requirements
of all outcomes (hashable, ordered sequences), but if `force` is
`True`, we will attempt to use the distribution's outcome
constructor and make sure that they are. If they are not, then
each value will be placed into a 1-tuple. This is strictly
a convenience for users. As an example, suppose the outcomes are
strings; then the values of `mapping` can also be strings without issue.
However, if the outcomes are tuples of integers, then the values
*should* also be tuples. When `force` is `True`, then the values
can be integers and then they will be transformed into 1-tuples.
Returns
-------
func : function
A callable implementing the desired function. It receives a single
argument, the outcome, and returns an outcome for the calculation.
Examples
--------
>>> d = dit.Distribution(['00', '01', '10', '11'], [1/4]*4)
>>> bf = dit.RVFunctions(d)
>>> mapping = {'00': '0', '01': '1', '10': '1', '11': '0'}
>>> d = dit.insert_rvf(d, bf.from_mapping(mapping))
>>> d.outcomes
('000', '011', '101', '110')
Same example as above but now with tuples.
>>> d = dit.Distribution([(0,0), (0,1), (1,0), (1,1)], [1/4]*4)
>>> bf = dit.RVFunctions(d)
>>> mapping = {(0,0): 0, (0,1): 1, (1,0): 1, (1,1): 0}
>>> d = dit.insert_rvf(d, bf.from_mapping(mapping, force=True))
>>> d.outcomes
((0, 0, 0), (0, 1, 1), (1, 0, 1), (1, 1, 0))
See Also
--------
dit.modify_outcomes
"""
ctor = self.ctor
if force:
try:
list(map(ctor, mapping.values()))
except (TypeError, ditException):
values = [ctor([o]) for o in mapping.values()]
mapping = dict(zip(mapping.keys(), values))
def func(outcome):
return mapping[outcome]
return func
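The force branch uses list(map(ctor, mapping.values())) purely as a probe: if the outcome constructor rejects the values, they are re-wrapped as 1-tuples. A condensed sketch of that probe-and-fallback pattern, with tuple standing in for the distribution's outcome constructor and only TypeError handled:
from six.moves import map

mapping = {(0, 0): 0, (0, 1): 1, (1, 0): 1, (1, 1): 0}
ctor = tuple   # stand-in for the distribution's outcome constructor

try:
    list(map(ctor, mapping.values()))        # do the values already look like outcomes?
except TypeError:
    values = [ctor([v]) for v in mapping.values()]   # no: wrap each one in a 1-tuple
    mapping = dict(zip(mapping.keys(), values))

print(mapping[(0, 1)])   # (1,)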
0
Example 133
def __call__(self, environ, start_response):
""" This function is executed when the application is called
by the WSGI apache module. It is, therefore, in charge of
answering web requests."""
path = environ['PATH_INFO'].split("/")
start_response('200 OK', [('content-type', 'text/plain')])
if environ['REQUEST_METHOD'].upper() == 'GET' and environ['QUERY_STRING']:
self.queries = cgi.parse_qs(environ['QUERY_STRING'])
elif environ['REQUEST_METHOD'].upper() == 'POST' and environ['wsgi.input']:
self.queries = cgi.parse_qs(environ['wsgi.input'].read())
else:
self.queries = {}
method = path[1]
treeid = self.queries.get("treeid", [None])[0]
nodeid = self.queries.get("nid", [None])[0]
textface = self.queries.get("textface", [None])[0]
actions = self.queries.get("show_actions", [None])[0]
tree = self.queries.get("tree", [None])[0]
search_term = self.queries.get("search_term", [None])[0]
aindex = self.queries.get("aindex", [None])[0]
if method == "draw":
# if no treeid is given, generate one
if not treeid:
treeid = md5(str(time.time())).hexdigest()
if not self._load_tree(treeid, tree):
return "draw: Cannot load the tree: %s" %treeid
t = self._treeid2tree[treeid]
if self._custom_tree_renderer:
return self._custom_tree_renderer(t, treeid, self)
elif t and treeid:
return self._get_tree_img(treeid=treeid)
else:
return "No tree to draw"
elif method == "get_menu":
if not self._load_tree(treeid):
return "get_menu: Cannot load the tree: %s" %treeid
if nodeid:
tree_index = self._treeid2index[treeid]
node = tree_index[nodeid]
else:
node = None
if textface:
header = str(textface).strip()
else:
header = "Menu"
html = """<div id="ete_popup_header"><span id="ete_popup_header_text">%s</span><div id="ete_close_popup" onClick='hide_popup();'></div></div><ul>""" %\
(header)
for i in map(int, actions.split(",")):
aname, target, handler, checker, html_generator = self.actions[i]
if html_generator:
html += html_generator(i, treeid, nodeid, textface, node)
else:
html += """<li><a href='javascript:void(0);' onClick='hide_popup(); run_action("%s", "%s", "%s");'> %s </a></li> """ %\
(treeid, nodeid, i, aname)
html += '</ul>'
return html
elif method == "action":
if not self._load_tree(treeid):
return "action: Cannot load the tree: %s" %treeid
if aindex is None:
# just refresh tree
return self._get_tree_img(treeid=treeid)
else:
aname, target, handler, checker, html_generator = self.actions[int(aindex)]
if target in set(["node", "face", "layout"]):
return self._get_tree_img(treeid=treeid, pre_drawing_action=[target, handler, [nodeid]])
elif target in set(["search"]):
return self._get_tree_img(treeid=treeid, pre_drawing_action=[target, handler, [search_term]])
elif target in set(["refresh"]):
return self._get_tree_img(treeid=treeid)
return "Bad guy"
elif self._external_app_handler:
return self._external_app_handler(environ, start_response, self.queries)
else:
return '\n'.join(map(str, list(environ.items()))) + str(self.queries) + '\t\n'.join(environ['wsgi.input'])
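Inside get_menu, the show_actions query value is a comma-separated list of action indexes turned into ints with map. A tiny sketch of that lookup loop (the action names below are made up for illustration):
from six.moves import map

actions = '0,2,3'                  # e.g. the ?show_actions= query value
registry = ['collapse', 'swap children', 'set as root', 'prune']
for i in map(int, actions.split(',')):
    print(i, registry[i])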
0
Example 134
def __iter__(cls):
return chain(super(ExtendableEnumMeta, cls).__iter__(), chain.from_iterable(map(iter, cls.__subclasses__())))
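This metaclass __iter__ chains the members of every subclass by mapping iter over cls.__subclasses__(). The same shape with plain lists, as a quick sanity check:
from itertools import chain
from six.moves import map

groups = [[1, 2], [3], [4, 5]]
print(list(chain.from_iterable(map(iter, groups))))   # [1, 2, 3, 4, 5]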
0
Example 135
def _convert_from_python(self, value_dict, state):
chained = self.chained_validators[:]
chained.reverse()
finished = []
for validator in chained:
__traceback_info__ = (
'for_python chained_validator %s (finished %s)') % (
validator, ', '.join(map(repr, finished)) or 'none')
finished.append(validator)
value_dict = validator.from_python(value_dict, state)
self.assert_dict(value_dict, state)
new = {}
errors = {}
unused = list(self.fields.keys())
if state is not None:
previous_key = getattr(state, 'key', None)
previous_full_dict = getattr(state, 'full_dict', None)
state.full_dict = value_dict
try:
__traceback_info__ = None
for name, value in six.iteritems(value_dict):
__traceback_info__ = 'for_python in %s' % name
try:
unused.remove(name)
except ValueError:
if not self.allow_extra_fields:
raise Invalid(self.message('notExpected',
state, name=repr(name)), value_dict, state)
if not self.filter_extra_fields:
new[name] = value
else:
if state is not None:
state.key = name
try:
new[name] = self.fields[name].from_python(value, state)
except Invalid as e:
errors[name] = e
del __traceback_info__
for name in unused:
validator = self.fields[name]
if state is not None:
state.key = name
try:
new[name] = validator.from_python(None, state)
except Invalid as e:
errors[name] = e
if errors:
raise Invalid(
format_compound_error(errors),
value_dict, state, error_dict=errors)
pre = self.pre_validators[:]
pre.reverse()
if state is not None:
state.key = previous_key
for validator in pre:
__traceback_info__ = 'for_python pre_validator %s' % validator
new = validator.from_python(new, state)
return new
finally:
if state is not None:
state.key = previous_key
state.full_dict = previous_full_dict
0
Example 136
def to_string(self, digits=None, exact=None, tol=1e-9):
"""
Returns a string representation of the distribution.
Parameters
----------
digits : int or None
The probabilities will be rounded to the specified number of
digits, using NumPy's around function. If `None`, then no rounding
is performed. Note, if the number of digits is greater than the
precision of the floats, then the resultant number of digits will
match that smaller precision.
exact : bool
If `True`, then linear probabilities will be displayed, even if
the underlying pmf contains log probabilities. The closest
rational fraction within a tolerance specified by `tol` is used
as the display value.
tol : float
If `exact` is `True`, then the probabilities will be displayed
as the closest rational fraction within `tol`.
Returns
-------
s : str
A string representation of the distribution.
"""
from six import StringIO
s = StringIO()
if exact is None:
exact = ditParams['print.exact']
x = prepare_string(self, digits, exact, tol)
pmf, outcomes, base, colsep, max_length, pstr = x
headers = ["Class: ",
"Alphabet: ",
"Base: "]
vals = [self.__class__.__name__,
self.alphabet, # pylint: disable=no-member
base]
L = max(map(len, headers))
for head, val in zip(headers, vals):
s.write("{0}{1}\n".format(head.ljust(L), val))
s.write("\n")
s.write(''.join(['x'.ljust(max_length), colsep, pstr, "\n"]))
for o, p in zip(outcomes, pmf):
s.write(''.join([o.ljust(max_length), colsep, str(p), "\n"]))
s.seek(0)
s = s.read()
# Remove the last \n
s = s[:-1]
return s
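max(map(len, headers)) picks the widest header so the values line up in one column. Reduced to its essentials (header and value strings invented for the demo):
from six.moves import map

headers = ['Class: ', 'Alphabet: ', 'Base: ']
vals = ['Distribution', ('0', '1'), 'linear']
width = max(map(len, headers))            # widest header sets the column width
for head, val in zip(headers, vals):
    print('{0}{1}'.format(head.ljust(width), val))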
0
Example 137
def setUp(self):
self.inits = [
{}, {1: 1}, {1: 1, 2: 2, 3: 3}, {None: None}, {
None: None, 1: 1, 2: 2}, {False: False},
]
self.inits += list(map(itemlist, [
[], [(1, 1)], [(1, 1), (2, 2)], [(1, 1), (2, 2), (1, 1)],
[(1, 1), (1, 1), (1, 1)], [(None, None), (None, None)],
[(False, False)],
[(None, 1), (1, None), (None, None), (None, 1), (1, None)],
]))
# Updates to test update() and updateall().
self.updates = [
{}, {7: 7}, {7: 7, 8: 8, 9: 9}, {None: None}, {1: 1, 2: 2}]
self.updates += list(map(itemlist, [
[], [(7, 7)], [(7, 7), (8, 8), (9, 9)], [(None, 'none')],
[(9, 9), (1, 2)], [(7, 7), (7, 7), (8, 8), (7, 77)],
[(1, 11), (1, 111), (1, 1111), (2, 22),
(2, 222), ('a', 'a'), ('a', 'aa')],
]))
self.keyword_updates = [
{}, {'1': 1}, {'1': 1, '2': 2}, {
'sup': 'pumps', 'scewps': None}, {'aa': 'aa'},
]
# Items not initially in any of the multidict inputs self.inits.
self.nonitems = [
(44, 44), (None, 44), (55, None), ('a', 'b'), (11, 11), (22, 22)]
# Keys not initially in any of the multidict inputs self.inits or in
# self.nonitems.
self.nonkeys = [_unique, 'asdfasdosduf', 'oaisfiapsn', 'ioausopdaui']
self.valuelist = [1, 2, 3, None, 'a', 'b', object()]
0
Example 138
def format_records(self, records=None):
if records is None:
records = self.get_records()
return map(self.format_record, records)
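Because this is six.moves.map, the return value is a lazy iterator under Python 3 semantics rather than a list, so callers that need to iterate twice or index must wrap it in list() themselves. A hedged sketch with a made-up record format:
from six.moves import map

def format_record(record):
    return '%(id)s: %(name)s' % record

records = [{'id': 1, 'name': 'alpha'}, {'id': 2, 'name': 'beta'}]
formatted = map(format_record, records)   # lazy; nothing is formatted yet
print(list(formatted))                    # ['1: alpha', '2: beta']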
0
Example 139
Project: pdfquery Source File: pdfquery.py
def get_tree(self, *page_numbers):
"""
Return lxml.etree.ElementTree for the entire document, or for the
page numbers given, if any.
"""
cache_key = "_".join(map(str, _flatten(page_numbers)))
tree = self._parse_tree_cacher.get(cache_key)
if tree is None:
# set up root
root = parser.makeelement("pdfxml")
if self.doc.info:
for k, v in list(self.doc.info[0].items()):
k = obj_to_string(k)
v = obj_to_string(resolve1(v))
try:
root.set(k, v)
except ValueError as e:
# Sometimes keys have a character in them, like ':',
# that isn't allowed in XML attribute names.
# If that happens we just replace non-word characters
# with '_'.
if "Invalid attribute name" in e.args[0]:
k = re.sub(r'\W', '_', k)
root.set(k, v)
# Parse pages and append to root.
# If nothing was passed in for page_numbers, we do this for all
# pages, but if None was explicitly passed in, we skip it.
if not(len(page_numbers) == 1 and page_numbers[0] is None):
if page_numbers:
pages = [[n, self.get_layout(self.get_page(n))] for n in
_flatten(page_numbers)]
else:
pages = enumerate(self.get_layouts())
for n, page in pages:
page = self._xmlize(page)
page.set('page_index', obj_to_string(n))
page.set('page_label', self.doc.get_page_number(n))
root.append(page)
self._clean_text(root)
# wrap root in ElementTree
tree = etree.ElementTree(root)
self._parse_tree_cacher.set(cache_key, tree)
return tree
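The cache key is just the flattened page numbers joined as strings. A standalone sketch, with a simplified one-level _flatten stand-in (pdfquery's own helper may behave differently):
from six.moves import map

def _flatten(items):
    # simplified stand-in: flatten one level of nested lists/tuples
    for item in items:
        if isinstance(item, (list, tuple)):
            for sub in item:
                yield sub
        else:
            yield item

page_numbers = (1, [2, 3], 7)
cache_key = '_'.join(map(str, _flatten(page_numbers)))
print(cache_key)   # 1_2_3_7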
0
Example 140
def get_queryset(self):
qs = super(BasketList, self).get_queryset()
return map(
functools.partial(assign_basket_strategy, request=self.request),
qs)
0
Example 141
Project: chalk Source File: __init__.py
def format_txt(fg, txt, bg, opts):
fg, txt, bg, opts = map(convert_to_str, (fg, txt, bg, opts))
return make_code(fg, bg, opts) + txt + _clear_formatting
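Mapping one conversion function over a small fixed tuple and unpacking the result is a neat way to normalise several arguments at once. A sketch with an assumed convert_to_str (chalk's real helper may behave differently):
from six.moves import map

def convert_to_str(value):
    # assumed behaviour: None becomes '', everything else goes through str()
    return '' if value is None else str(value)

fg, txt, bg, opts = map(convert_to_str, ('red', 42, None, ['bold']))
print((fg, txt, bg, opts))   # ('red', '42', '', "['bold']")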
0
Example 142
Project: zulip Source File: users.py
def get_bots_backend(request, user_profile):
# type: (HttpRequest, UserProfile) -> HttpResponse
bot_profiles = UserProfile.objects.filter(is_bot=True, is_active=True,
bot_owner=user_profile)
bot_profiles = bot_profiles.select_related('default_sending_stream', 'default_events_register_stream')
bot_profiles = bot_profiles.order_by('date_joined')
def bot_info(bot_profile):
# type: (UserProfile) -> Dict[str, Any]
default_sending_stream = get_stream_name(bot_profile.default_sending_stream)
default_events_register_stream = get_stream_name(bot_profile.default_events_register_stream)
return dict(
username=bot_profile.email,
full_name=bot_profile.full_name,
api_key=bot_profile.api_key,
avatar_url=avatar_url(bot_profile),
default_sending_stream=default_sending_stream,
default_events_register_stream=default_events_register_stream,
default_all_public_streams=bot_profile.default_all_public_streams,
)
return json_success({'bots': list(map(bot_info, bot_profiles))})
0
Example 143
Project: mapproxy Source File: sqlite.py
def _tile_set_params_dict(self, tile_set):
level = tile_set.level
tile_width, tile_height = self.grid.tile_size
matrix_width, matrix_height = self.grid.grid_sizes[level]
params = {
'layer_id' : self.layer_id,
'bbox' : ', '.join(map(str, [v for v in self.grid.bbox])),
'srs' : self.grid.srs.srs_code,
'format' : self.file_ext,
'min_tile_col' : tile_set.grid[0],
'max_tile_col' : tile_set.grid[2],
'min_tile_row' : tile_set.grid[1],
'max_tile_row' : tile_set.grid[3],
'tile_width' : tile_width,
'tile_height' : tile_height,
'matrix_width' : matrix_width,
'matrix_height' : matrix_height,
'matrix_id' : level,
'matrix_set_name' : self.grid.name,
'table_name' : tile_set.table_name
}
return params
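Note that the list comprehension inside the join is redundant; mapping str over the bbox directly produces the same string. For example:
from six.moves import map

bbox = (-20037508.34, -20037508.34, 20037508.34, 20037508.34)
print(', '.join(map(str, bbox)))
# -20037508.34, -20037508.34, 20037508.34, 20037508.34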
0
Example 144
Project: cabby Source File: converters.py
def to_content_binding_entities(raw_bindings):
return list(map(to_content_binding_entity, raw_bindings))
0
Example 145
def add_arguments(cmdline):
"""populate `cmdline` --- an `argpase.ArgumentParser` --- with cmdline
options shared across several Debsources tools
"""
cmdline.add_argument('--backend', '-b',
metavar='BACKEND',
action='append',
help='only affect a specific backend (one of: db, fs, '
'hooks, hooks.db, hooks.fs). By default all backends '
'are enabled; the special value "none" disables all '
'backends. Can be specified multiple times. Warning: '
'using this you can mess up the update logic, use at '
'your own risk.',
dest='backends')
cmdline.add_argument('--config', '-c', dest='conffile',
help='alternate configuration file')
cmdline.add_argument('--dburi', '-u', dest='dburi',
help='database URI, e.g. postgresql:///mydbname. '
'Override configuration file setting "db_uri"')
cmdline.add_argument('--dry-run', '-d', dest='dry',
action='store_true',
help='enable dry run mode')
cmdline.add_argument('--single-transaction', dest='single_transaction',
choices=['yes', 'no'],
help='use a single big DB transaction, instead of '
'smaller per-package transactions (default: yes)')
cmdline.add_argument('--stage', '-s',
metavar='STAGE',
action='append',
help='only perform a specific update stage '
'(one of: %s). By default all update stages are '
'performed. Can be specified multiple times. Warning: '
'using this you can mess up the update logic, use at '
'your own risk.' %
list(map(updater.pp_stage, updater.UPDATE_STAGES)),
dest='stages')
cmdline.add_argument('--trigger', '-t',
metavar='EVENT/HOOK',
action='append',
help='force trigger of (Python) HOOK for EVENT. By '
'default all registered hooks are triggered for all '
'changed packages. Event is one of: %s. Hook is one '
'of the available hooks. Can be specified multiple '
'times. Warning: if not used with "--backend none" '
'it might lead to multiple execution of the same '
'hook. E.g.: -t add-package/checksums' %
', '.join(updater.KNOWN_EVENTS),
dest='force_triggers')
cmdline.add_argument('--verbose', '-v',
action='count',
help='increase console verbosity')
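The --stage help text interpolates list(map(updater.pp_stage, updater.UPDATE_STAGES)) straight into %s, so the list repr ends up in the help output; joining the mapped values reads a little cleaner. A sketch with invented stage names and a stand-in pretty-printer:
from six.moves import map

UPDATE_STAGES = ['extract', 'suites', 'gc', 'stats']   # invented for the demo

def pp_stage(stage):
    # stand-in for updater.pp_stage
    return stage.upper()

print('only perform a specific update stage (one of: %s).'
      % ', '.join(map(pp_stage, UPDATE_STAGES)))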
0
Example 146
Project: zulip Source File: messages.py
def exclude_muting_conditions(user_profile, narrow):
# type: (UserProfile, Iterable[Dict[str, Any]]) -> List[Selectable]
conditions = []
stream_name = get_stream_name_from_narrow(narrow)
if stream_name is None:
rows = Subscription.objects.filter(
user_profile=user_profile,
active=True,
in_home_view=False,
recipient__type=Recipient.STREAM
).values('recipient_id')
muted_recipient_ids = [row['recipient_id'] for row in rows]
condition = not_(column("recipient_id").in_(muted_recipient_ids))
conditions.append(condition)
muted_topics = ujson.loads(user_profile.muted_topics)
if muted_topics:
if stream_name is not None:
muted_topics = [m for m in muted_topics if m[0].lower() == stream_name]
if not muted_topics:
return conditions
muted_streams = bulk_get_streams(user_profile.realm,
[muted[0] for muted in muted_topics])
muted_recipients = bulk_get_recipients(Recipient.STREAM,
[stream.id for stream in six.itervalues(muted_streams)])
recipient_map = dict((s.name.lower(), muted_recipients[s.id].id)
for s in six.itervalues(muted_streams))
muted_topics = [m for m in muted_topics if m[0].lower() in recipient_map]
if muted_topics:
def mute_cond(muted):
# type: (Tuple[str, str]) -> Selectable
stream_cond = column("recipient_id") == recipient_map[muted[0].lower()]
topic_cond = func.upper(column("subject")) == func.upper(muted[1])
return and_(stream_cond, topic_cond)
condition = not_(or_(*list(map(mute_cond, muted_topics))))
return conditions + [condition]
return conditions
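or_(*list(map(mute_cond, muted_topics))) builds one OR of per-topic SQLAlchemy conditions and then negates it. A reduced sketch of that composition, assuming SQLAlchemy is installed and using made-up stream/topic data:
from six.moves import map
from sqlalchemy import and_, func, not_, or_
from sqlalchemy.sql import column

recipient_map = {'denmark': 1, 'verona': 2}        # invented ids
muted_topics = [('denmark', 'castle'), ('verona', 'feud')]

def mute_cond(muted):
    stream_cond = column('recipient_id') == recipient_map[muted[0].lower()]
    topic_cond = func.upper(column('subject')) == func.upper(muted[1])
    return and_(stream_cond, topic_cond)

condition = not_(or_(*list(map(mute_cond, muted_topics))))
print(condition)   # renders the combined boolean SQL expression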
0
Example 147
def __dir__(cls):
return list(set(super(ExtendableEnumMeta, cls).__dir__()).union(*map(dir, cls.__subclasses__())))
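The same subclass-aggregation trick as Example 134, but with set.union over map(dir, ...). A quick check with throwaway classes:
from six.moves import map

class Base(object):
    pass

class A(Base):
    extra_a = 1

class B(Base):
    extra_b = 2

names = set(dir(Base)).union(*map(dir, Base.__subclasses__()))
print('extra_a' in names, 'extra_b' in names)   # True True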
0
Example 148
def zip(self, *others):
"""
Zip the items of this collection with one or more
other sequences, and wrap the result.
Unlike Python's zip, all sequences must be the same length.
Parameters:
others: One or more iterables or Collections
Returns:
A new collection.
Examples:
>>> c1 = Collection([Scalar(1), Scalar(2)])
>>> c2 = Collection([Scalar(3), Scalar(4)])
>>> c1.zip(c2).val()
[(1, 3), (2, 4)]
"""
args = [_unwrap(item) for item in (self,) + others]
ct = self.count()
if not all(len(arg) == ct for arg in args):
raise ValueError("Arguments are not all the same length")
return Collection(map(Wrapper.wrap, zip(*args)))
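After the length check, every zipped tuple is passed through Wrapper.wrap via map. A sketch with a dictionary standing in for the wrapper (names are illustrative):
from six.moves import map

def wrap(item):
    # stand-in for Wrapper.wrap
    return {'value': item}

c1, c2 = [1, 2], [3, 4]
if len(c1) != len(c2):
    raise ValueError('Arguments are not all the same length')
print(list(map(wrap, zip(c1, c2))))   # [{'value': (1, 3)}, {'value': (2, 4)}]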
0
Example 149
Project: dit Source File: lattice.py
def insert_rv(dist, idx, sigalg):
"""
Returns a new distribution with a random variable inserted at index `idx`.
The random variable is constructed according to its induced sigma-algebra.
Parameters
----------
dist : Distribution
The distribution which defines the base sigma-algebra.
idx : int
The index at which to insert the random variable. To append, set `idx`
to be equal to -1 or dist.outcome_length().
sigalg : frozenset
The sigma-algebra induced by the random variable.
Returns
-------
d : Distribution
The new distribution.
"""
from itertools import chain
if idx == -1:
idx = dist.outcome_length()
if not 0 <= idx <= dist.outcome_length():
raise IndexError('Invalid insertion index.')
# Provide sane sorting of atoms
atoms = atom_set(sigalg)
atoms = [sorted(atom) for atom in atoms]
atoms.sort(key=quasilexico_key)
labels = range(len(atoms))
if dist._outcome_class == str:
# Then the labels for the new random variable must be strings.
labels = map(str, labels)
# Create an index from outcomes to atoms.
atom_of = {}
for label, atom in zip(labels, atoms):
for outcome in atom:
atom_of[outcome] = label
if idx == dist.outcome_length():
def new_outcome_ctor(outcome, ctor=dist._outcome_ctor):
"""The end of the outcome"""
new_outcome = [outcome, [atom_of[outcome]]]
return ctor(chain.from_iterable(new_outcome))
elif idx == 0:
def new_outcome_ctor(outcome, ctor=dist._outcome_ctor):
"""The beginning of the outcome"""
new_outcome = [[atom_of[outcome]], outcome]
return ctor(chain.from_iterable(new_outcome))
else:
def new_outcome_ctor(outcome, ctor=dist._outcome_ctor):
"""In the middle of the outcome"""
new_outcome = [outcome[:idx], [atom_of[outcome]], outcome[idx:]]
return ctor(chain.from_iterable(new_outcome))
d = dit.modify_outcomes(dist, new_outcome_ctor)
return d
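When the outcomes are strings, the integer atom labels are converted with map(str, labels); the lazy map is consumed exactly once by the zip that builds atom_of. A stripped-down sketch of that labelling step with invented atoms:
from six.moves import map

atoms = [['00', '01'], ['10', '11']]     # invented atoms of a sigma-algebra
labels = range(len(atoms))
labels = map(str, labels)               # string labels, produced lazily

atom_of = {}
for label, atom in zip(labels, atoms):
    for outcome in atom:
        atom_of[outcome] = label
print(atom_of)   # {'00': '0', '01': '0', '10': '1', '11': '1'}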
0
Example 150
Project: ete Source File: ncbiquery.py
def annotate_tree(self, t, taxid_attr="name", tax2name=None, tax2track=None, tax2rank=None):
"""Annotate a tree containing taxids as leaf names by adding the 'taxid',
'sci_name', 'lineage', 'named_lineage' and 'rank' additional attributes.
:param t: a Tree (or Tree derived) instance.
:param name taxid_attr: Allows setting a custom node attribute containing
the taxid number associated with each node (i.e. species in PhyloTree instances).
:param tax2name,tax2track,tax2rank: Use these arguments to provide
pre-calculated dictionaries translating taxid numbers into names, track
lineages and ranks.
"""
taxids = set()
for n in t.traverse():
try:
tid = int(getattr(n, taxid_attr))
except (ValueError,AttributeError):
pass
else:
taxids.add(tid)
merged_conversion = {}
taxids, merged_conversion = self._translate_merged(taxids)
if not tax2name or taxids - set(map(int, list(tax2name.keys()))):
tax2name = self.get_taxid_translator(taxids)
if not tax2track or taxids - set(map(int, list(tax2track.keys()))):
tax2track = self.get_lineage_translator(taxids)
all_taxid_codes = set([_tax for _lin in list(tax2track.values()) for _tax in _lin])
extra_tax2name = self.get_taxid_translator(list(all_taxid_codes - set(tax2name.keys())))
tax2name.update(extra_tax2name)
tax2common_name = self.get_common_names(tax2name.keys())
if not tax2rank:
tax2rank = self.get_rank(list(tax2name.keys()))
n2leaves = t.get_cached_content()
for n in t.traverse('postorder'):
try:
node_taxid = int(getattr(n, taxid_attr))
except (ValueError, AttributeError):
node_taxid = None
n.add_features(taxid = node_taxid)
if node_taxid:
if node_taxid in merged_conversion:
node_taxid = merged_conversion[node_taxid]
n.add_features(sci_name = tax2name.get(node_taxid, getattr(n, taxid_attr, '')),
common_name = tax2common_name.get(node_taxid, ''),
lineage = tax2track[node_taxid],
rank = tax2rank.get(node_taxid, 'Unknown'),
named_lineage = [tax2name.get(tax, str(tax)) for tax in tax2track[node_taxid]])
elif n.is_leaf():
n.add_features(sci_name = getattr(n, taxid_attr, 'NA'),
common_name = '',
lineage = [],
rank = 'Unknown',
named_lineage = [])
else:
lineage = self._common_lineage([lf.lineage for lf in n2leaves[n]])
ancestor = lineage[-1]
n.add_features(sci_name = tax2name.get(ancestor, str(ancestor)),
common_name = tax2common_name.get(ancestor, ''),
taxid = ancestor,
lineage = lineage,
rank = tax2rank.get(ancestor, 'Unknown'),
named_lineage = [tax2name.get(tax, str(tax)) for tax in lineage])
return tax2name, tax2track, tax2rank
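The taxids - set(map(int, list(tax2name.keys()))) check asks whether the caller-supplied translator already covers every taxid in the tree; only the missing ones trigger a new lookup. A minimal sketch of that coverage test with invented taxids:
from six.moves import map

taxids = set([9606, 10090, 7227])
tax2name = {'9606': 'Homo sapiens', '10090': 'Mus musculus'}   # keys may be strings

missing = taxids - set(map(int, list(tax2name.keys())))
print(missing)   # {7227} -> only these need a database lookup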