re.sub

Here are examples of the Python API re.sub taken from open source projects, illustrating how the call is used in practice.
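
For readers new to the call: re.sub(pattern, repl, string, count=0, flags=0) returns a copy of string with every non-overlapping match of pattern replaced by repl, which may be a plain string or a callable that receives the match object. A minimal sketch (the sample strings below are illustrative, not taken from the projects):

import re

# String replacement: collapse runs of whitespace into single dashes.
print(re.sub(r"\s+", "-", "python  api   examples"))   # python-api-examples

# Callable replacement: the function is called once per match.
print(re.sub(r"\d+", lambda m: str(int(m.group(0)) * 2), "3 cats, 12 dogs"))   # 6 cats, 24 dogs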

200 Examples

Example 1

Project: tp-libvirt
Source File: virsh_domtime.py
@error.context_aware
def run(test, params, env):
    """
    Test the virsh domtime command and its options.

    1) Start a guest with/without guest agent configured;
    2) Record guest times;
    3) Do some operation to stop VM;
    4) Run virsh domtime command with different options;
    5) Check the command result;
    6) Check the guest times against expectation;
    7) Cleanup test environment.
    """
    epoch = datetime.datetime(1970, 1, 1, 0, 0, 0)
    # Max time that can be set successfully with domtime in newer qemu-ga
    time_max_1 = 3155731199
    # Max time that can be set successfully with domtime in older qemu-ga
    time_max_2 = 3155759999
    # Max time that can be set with domtime in older qemu-ga, although a bug
    # causes setting the RTC to fail
    time_max_3 = 9223372035

    def init_time(session):
        """
        Initialize guest RTC time to epoch + 1234567890 and system time
        one day later.

        :param session: Session from which to access guest
        """
        res = virsh.domtime(vm_name, time=1234567890)
        if res.exit_status:
            logging.debug("Failed to init time to 1234567890:\n%s", res)
        status, output = session.cmd_status_output('date -s "1 day"')
        if status:
            raise error.TestError("Failed to set guest time:\n%s" % output)

    def get_host_utc_time():
        """
        Get host UTC time from date command.
        """
        res = utils.run("date -u")
        # Strip timezone info from output
        # e.g. 'Sun Feb 15 07:31:40 CST 2009' -> 'Sun Feb 15 07:31:40 2009'
        time_str = re.sub(r'\S+ (?=\S+$)', '', res.stdout.strip())
        return datetime.datetime.strptime(time_str,
                                          r"%a %b %d %H:%M:%S %Y")

    def run_cmd(session, cmd):
        """
        Run a command in a session and record duration of call.
        """
        start = time.time()
        output = session.cmd_output(cmd)
        duration = time.time() - start
        logging.info('Result of command "%s". Duration: %s. Output:%s',
                     cmd, duration, output.strip())
        return output, duration

    def get_guest_times(session):
        """
        Retrieve different guest times as a dict for checking.
        Keys:
            local_hw: Guest RTC time in local timezone
            local_sys: Guest system time in local timezone
            utc_sys: Guest system time in UTC
            domtime: Guest system time in UTC as reported by virsh domtime

        :param session: Session from which to access guest
        """
        times = {}
        get_begin = time.time()
        # Guest RTC local timezone time
        output, _ = run_cmd(session, 'hwclock')
        time_str, _ = re.search(r"(.+)  (\S+ seconds)", output).groups()

        try:
            # output format 1: Tue 01 Mar 2016 01:53:46 PM CST
            # Remove timezone info from output
            new_str = re.sub(r'\S+$', '', time_str)
            times['local_hw'] = datetime.datetime.strptime(
                new_str, r"%a %d %b %Y %I:%M:%S %p")
        except ValueError:
            # There are two possible output format for `hwclock`
            # output format 2: Sat Feb 14 07:31:33 2009
            times['local_hw'] = datetime.datetime.strptime(
                time_str, r"%a %b %d %H:%M:%S %Y")
        delta = time.time() - get_begin
        times['local_hw'] -= datetime.timedelta(seconds=delta)

        # Guest system local timezone time
        output, _ = run_cmd(session, 'date')
        # Strip timezone info from output
        # e.g. 'Sun Feb 15 07:31:40 CST 2009' -> 'Sun Feb 15 07:31:40 2009'
        time_str = re.sub(r'\S+ (?=\S+$)', '', output.strip())
        times['local_sys'] = datetime.datetime.strptime(
            time_str, r"%a %b %d %H:%M:%S %Y")
        delta = time.time() - get_begin
        times['local_sys'] -= datetime.timedelta(seconds=delta)

        # Guest system UTC timezone time
        output, _ = run_cmd(session, 'date -u')
        # Strip timezone info from output
        # e.g. 'Sun Feb 15 07:31:40 CST 2009' -> 'Sun Feb 15 07:31:40 2009'
        time_str = re.sub(r'\S+ (?=\S+$)', '', output.strip())
        times['utc_sys'] = datetime.datetime.strptime(
            time_str, r"%a %b %d %H:%M:%S %Y")
        delta = time.time() - get_begin
        times['utc_sys'] -= datetime.timedelta(seconds=delta)

        # Guest UTC time from virsh domtime
        res = virsh.domtime(vm_name, pretty=True, ignore_status=True)
        if not res.exit_status:
            logging.info('Result of "domtime". Duration: %s. Output:%s',
                         res.duration, res.stdout.strip())
            _, time_str = res.stdout.split(" ", 1)
            times['domtime'] = datetime.datetime.strptime(
                time_str.strip(), r"%Y-%m-%d %H:%M:%S")
            delta = time.time() - get_begin
            times['domtime'] -= datetime.timedelta(seconds=delta)
        else:
            logging.debug("Unable to get domain time:\n%s", res)
            times['domtime'] = None

        return times, time.time() - get_begin

    def check_get_success(expected_times):
        """
        Check virsh command get result against expected times

        :param expected_times: Expected time for checking
        """
        _, time_str = res.stdout.split(" ", 1)
        if pretty:
            # Time: 2015-01-13 06:29:18
            domtime = datetime.datetime.strptime(time_str.strip(),
                                                 r"%Y-%m-%d %H:%M:%S")
        else:
            # Time: 1421130740
            domtime = epoch + datetime.timedelta(seconds=int(time_str))
        time_shift = time.time() - start
        logging.debug("Time shift is %s", time_shift)
        result_diff = (domtime - expected_times['domtime']).total_seconds()
        if abs(result_diff) > 2.0:
            raise error.TestFail("Expect get time %s, but got %s, time "
                                 "diff: %s" % (org_times['domtime'],
                                               domtime, result_diff))

    def check_guest_times(expected_times, cur_times):
        """
        Check guest times after test against expected times

        :param expected_times: Expected times for checking
        :param cur_times: Current guest times to check against
        """
        time_shift = time.time() - start
        logging.debug("Time shift is %s", time_shift)

        error_msgs = []
        for key in cur_times:
            if cur_times[key] is not None:
                cur = cur_times[key]
                expect = expected_times[key]

                diff = (cur - expect).total_seconds()
                msg = "For %s, expect get time %s, got %s, time diff: %s" % (
                    key, expect, cur, diff)
                logging.debug(msg)
                if abs(diff) > 2.0:
                    error_msgs.append(msg)
        if error_msgs:
            raise error.TestFail('\n'.join(error_msgs))

    def check_time(result, org_times, cur_times):
        """
        Check whether domain time has been changed accordingly.

        :param result: virsh domtime CmdResult instance
        :param org_times: Original guest times
        :param cur_times: Current guest times
        """
        action = "get"
        if now or sync or (set_time is not None):
            action = "set"

        tz_diff = org_times['local_sys'] - org_times['utc_sys']
        logging.debug("Timezone diff on guest is %d hours.",
                      (tz_diff.total_seconds() / 3600))

        # Hardware time will never stop
        logging.info('Add %ss to expected guest time', interval)
        if action == 'get':
            expected_times = org_times
        elif action == 'set':
            if result.exit_status:
                # Time does not change if domtime fails
                expected_times = org_times
            else:
                # Time changes accordingly on success.
                if now:
                    utc_time = org_host_time
                    local_time = utc_time + tz_diff
                elif sync:
                    local_time = org_times["local_hw"]
                    utc_time = local_time - tz_diff
                elif set_time is not None:
                    utc_time = epoch + datetime.timedelta(
                        seconds=(int(set_time) - guest_duration))
                    local_time = utc_time + tz_diff
                expected_times = {}
                expected_times['local_hw'] = local_time
                expected_times['local_sys'] = local_time
                expected_times["utc_sys"] = utc_time
                expected_times["domtime"] = utc_time

        # Add interval between two checks of guest time
        for key in expected_times:
            if expected_times[key] is not None:
                expected_times[key] += interval

        # Hardware time will never stop
        # Software time will stop if suspended or managed-saved
        if suspend or managedsave:
            logging.info('Remove %ss from expected guest software time',
                         stop_time)
            expected_times["domtime"] -= stop_time
            expected_times["local_sys"] -= stop_time
            expected_times["utc_sys"] -= stop_time

        # Check guest time if domtime succeeded
        check_guest_times(expected_times, cur_times)

        # Check if output of domtime is correct
        if action == 'get' and not result.exit_status:
            check_get_success(expected_times)

    def prepare_fail_patts():
        """
        Predict fail pattern from test parameters.
        """
        fail_patts = []
        if not channel:
            fail_patts.append(r"QEMU guest agent is not configured")
        if not agent:
            # For older version
            fail_patts.append(r"Guest agent not available for now")
            # For newer version
            fail_patts.append(r"Guest agent is not responding")
        if int(now) + int(sync) + int(bool(set_time)) > 1:
            fail_patts.append(r"Options \S+ and \S+ are mutually exclusive")
        if shutdown:
            fail_patts.append(r"domain is not running")

        if set_time is not None:
            if int(set_time) < 0:
                fail_patts.append(r"Invalid argument")
            elif time_max_1 < int(set_time) <= time_max_2:
                fail_patts.append(r"Invalid time")
            elif time_max_2 < int(set_time) <= time_max_3:
                fail_patts.append(r"Invalid time")
            elif time_max_3 < int(set_time):
                fail_patts.append(r"too big for guest agent")
        return fail_patts

    def stop_vm():
        """
        Suspend, managedsave, pmsuspend or shutdown a VM for a period of time
        """
        stop_start = time.time()
        if suspend:
            vm.pause()
            time.sleep(10)
            vm.resume()
        elif managedsave:
            vm.managedsave()
            time.sleep(10)
            vm.start()
            vm.wait_for_login()
        elif pmsuspend:
            vm.pmsuspend()
            time.sleep(10)
            vm.pmwakeup()
            vm.wait_for_login()
        elif shutdown:
            vm.destroy()

        # Check real guest stop time
        stop_seconds = time.time() - stop_start
        stop_time = datetime.timedelta(seconds=stop_seconds)
        logging.debug("Guest stopped: %s", stop_time)
        return stop_time

    # Check availability of virsh command domtime
    if not virsh.has_help_command('domtime'):
        raise error.TestNAError("This version of libvirt does not support "
                                "the domtime test")

    channel = (params.get("prepare_channel", "yes") == 'yes')
    agent = (params.get("start_agent", "yes") == 'yes')
    pretty = (params.get("domtime_pretty", "no") == 'yes')
    now = (params.get("domtime_now", "no") == 'yes')
    sync = (params.get("domtime_sync", "no") == 'yes')
    set_time = params.get("domtime_time", None)

    shutdown = (params.get("shutdown_vm", "no") == 'yes')
    suspend = (params.get("suspend_vm", "no") == 'yes')
    managedsave = (params.get("managedsave_vm", "no") == 'yes')
    pmsuspend = (params.get("pmsuspend_vm", "no") == 'yes')

    vm_name = params.get("main_vm")
    vm = env.get_vm(vm_name)

    # Backup domain XML
    xml_backup = vm_xml.VMXML.new_from_inactive_dumpxml(vm_name)
    try:
        if pmsuspend:
            vm_xml.VMXML.set_pm_suspend(vm_name)
        # Add or remove qemu-agent from guest before test
        vm.prepare_guest_agent(channel=channel, start=agent)
        session = vm.wait_for_login()
        try:
            if channel and agent:
                init_time(session)

            # Expected fail message patterns
            fail_patts = prepare_fail_patts()

            # Message patterns test should skip when met
            skip_patts = [
                r'The command \S+ has not been found',
            ]

            # Record start time
            start = time.time()

            # Record host time before testing
            org_host_time = get_host_utc_time()
            # Get original guest times
            org_times, guest_duration = get_guest_times(session)

            # Run some operations to stop guest system
            stop_time = stop_vm()

            # Run command with specified options.
            res = virsh.domtime(vm_name, now=now, pretty=pretty, sync=sync,
                                time=set_time)
            libvirt.check_result(res, fail_patts, skip_patts)

            # Interval between the two checks of guest time
            interval = datetime.timedelta(
                seconds=(time.time() - start))
            logging.debug("Interval between guest checking: %s", interval)

            if not shutdown:
                # Get current guest times
                cur_times, _ = get_guest_times(session)

                check_time(res, org_times, cur_times)
        finally:
            # Sync guest time with host
            if channel and agent and not shutdown:
                res = virsh.domtime(vm_name, now=True)
                if res.exit_status:
                    session.close()
                    raise error.TestError("Failed to recover guest time:\n%s"
                                          % res)
            session.close()
    finally:
        # Restore VM XML
        xml_backup.sync()
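
A note on the re.sub usage above: r'\S+ (?=\S+$)' strips the timezone field from date output by matching the second-to-last whitespace-delimited token (the lookahead anchors on the trailing year), while r'\S+$' simply drops the last token from hwclock output. A standalone sketch of both patterns, assuming strings in the formats shown in the comments above:

import re

date_out = "Sun Feb 15 07:31:40 CST 2009"
# Remove the token just before the final one (the timezone).
print(re.sub(r'\S+ (?=\S+$)', '', date_out))    # Sun Feb 15 07:31:40 2009

hwclock_out = "Tue 01 Mar 2016 01:53:46 PM CST"
# Remove the trailing token (the timezone); the trailing space remains.
print(re.sub(r'\S+$', '', hwclock_out))         # 'Tue 01 Mar 2016 01:53:46 PM '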

Example 2

Project: tp-qemu
Source File: openflow_acl_test.py
@error.context_aware
def run(test, params, env):
    """
    Test Step:
        1. Boot up guest using the openvswitch bridge
        2. Setup related services in the test environment (http, ftp, etc.) (optional)
        3. Access the service in guest
        4. Setup access control rules in ovs to disable the access
        5. Access the service in guest
        6. Setup access control rules in ovs to enable the access
        7. Access the service in guest
        8. Delete the access control rules in ovs
        9. Access the service in guest

    Params:
        :param test: QEMU test object
        :param params: Dictionary with the test parameters
        :param env: Dictionary with test environment.
    """
    def access_service(access_sys, access_targets, disabled, host_ip,
                       ref=False):
        err_msg = ""
        err_type = ""
        for asys in access_sys:
            for atgt in access_targets:
                logging.debug("Try to access target %s from %s" % (atgt, asys))

                access_params = access_sys[asys]
                atgt_disabled = access_params['disabled_%s' % atgt]
                if asys in vms_tags:
                    vm = env.get_vm(asys)
                    session = vm.wait_for_login(timeout=timeout)
                    run_func = session.cmd
                    remote_src = vm
                    ssh_src_ip = vm.get_address()
                else:
                    run_func = utils.system_output
                    remote_src = "localhost"
                    ssh_src_ip = host_ip
                if atgt in vms_tags:
                    vm = env.get_vm(atgt)
                    access_re_sub_string = vm.wait_for_get_address(0)
                else:
                    access_re_sub_string = host_ip

                access_cmd = re.sub("ACCESS_TARGET", access_re_sub_string,
                                    access_params['access_cmd'])
                ref_cmd = re.sub("ACCESS_TARGET", access_re_sub_string,
                                 access_params['ref_cmd'])

                if access_cmd in ["ssh", "telnet"]:
                    if atgt in vms_tags:
                        target_vm = env.get_vm(atgt)
                        target_ip = target_vm.get_address()
                    else:
                        target_vm = "localhost"
                        target_ip = host_ip
                    out = ""
                    out_err = ""
                    try:
                        out = remote_login(access_cmd, target_ip,
                                           remote_src, params, host_ip)
                        stat = 0
                    except remote.LoginError, err:
                        stat = 1
                        out_err = "Failed to login %s " % atgt
                        out_err += "from %s, err: %s" % (asys, err.output)
                    try:
                        out += remote_login(access_cmd, ssh_src_ip,
                                            target_vm, params, host_ip)
                    except remote.LoginError, err:
                        stat += 1
                        out_err += "Failed to login %s " % asys
                        out_err += "from %s, err: %s" % (atgt, err.output)
                    if out_err:
                        out = out_err
                else:
                    try:
                        out = run_func(access_cmd, timeout=op_timeout)
                        stat = 0
                        check_string = access_params.get("check_from_output")
                        if check_string and check_string in out:
                            stat = 1
                    except (aexpect.ShellCmdError, error.CmdError,
                            aexpect.ShellTimeoutError), err:
                        if isinstance(err, error.CmdError):
                            out = err.result_obj.stderr
                            stat = err.result_obj.exit_status
                        else:
                            out = err.output
                            if isinstance(err, aexpect.ShellTimeoutError):
                                stat = 1
                                session.close()
                                session = vm.wait_for_login(timeout=timeout)
                                run_func = session.cmd
                            else:
                                stat = err.status
                    if access_params.get("clean_cmd"):
                        try:
                            run_func(access_params['clean_cmd'])
                        except Exception:
                            pass

                if disabled and atgt_disabled and stat == 0:
                    err_msg += "Still can access %s after" % atgt
                    err_msg += " disabling it from ovs. "
                    err_msg += "Command: %s. " % access_cmd
                    err_msg += "Output: %s" % out
                if disabled and atgt_disabled and stat != 0:
                    logging.debug("Can not access target as expected.")
                if not disabled and stat != 0:
                    if ref:
                        err_msg += "Can not access %s at the" % atgt
                        err_msg += " beginning. Please check your setup."
                        err_type = "ref"
                    else:
                        err_msg += "Still can not access %s" % atgt
                        err_msg += " after enable the access"
                    err_msg += "Command: %s. " % access_cmd
                    err_msg += "Output: %s" % out
                if err_msg:
                    session.close()
                    if err_type == "ref":
                        raise error.TestNAError(err_msg)
                    raise error.TestFail(err_msg)

                if not ref_cmd:
                    session.close()
                    return

                try:
                    out = run_func(ref_cmd, timeout=op_timeout)
                    stat = 0
                except (aexpect.ShellCmdError, error.CmdError,
                        aexpect.ShellTimeoutError), err:
                    if isinstance(err, error.CmdError):
                        out = err.result_obj.stderr
                        stat = err.result_obj.exit_status
                    else:
                        out = err.output
                        if isinstance(err, aexpect.ShellTimeoutError):
                            stat = 1
                        else:
                            stat = err.status

                if stat != 0:
                    if ref:
                        err_msg += "Refernce command failed at beginning."
                        err_type = "ref"
                    else:
                        err_msg += "Refernce command failed after setup"
                        err_msg += " the rules"
                    err_msg += "Command: %s. " % ref_cmd
                    err_msg += "Output: %s" % out
                if err_msg:
                    session.close()
                    if err_type == "ref":
                        raise error.TestNAError(err_msg)
                    raise error.TestFail(err_msg)
                session.close()

    def get_acl_cmd(protocol, in_port, action, extra_options):
        acl_cmd = protocol.strip()
        acl_cmd += ",in_port=%s" % in_port.strip()
        if extra_options.strip():
            acl_cmd += ",%s" % ",".join(extra_options.strip().split())
        if action.strip():
            acl_cmd += ",action=%s" % action.strip()
        return acl_cmd

    def acl_rules_check(acl_rules, acl_setup_cmd):
        acl_setup_cmd = re.sub("action=", "actions=", acl_setup_cmd)
        acl_option = re.split(",", acl_setup_cmd)
        for line in acl_rules.splitlines():
            rule = [_.lower() for _ in re.split("[ ,]", line) if _]
            item_in_rule = 0

            for acl_item in acl_option:
                if acl_item.lower() in rule:
                    item_in_rule += 1

            if item_in_rule == len(acl_option):
                return True
        return False

    def remote_login(client, host, src, params_login, host_ip):
        src_name = src
        if src != "localhost":
            src_name = src.name
        logging.info("Login %s from %s" % (host, src))
        port = params_login["target_port"]
        username = params_login["username"]
        password = params_login["password"]
        prompt = params_login["shell_prompt"]
        linesep = eval("'%s'" % params_login.get("shell_linesep", r"\n"))
        quit_cmd = params.get("quit_cmd", "exit")
        if host == host_ip:
            # Try to login from guest to host.
            prompt = "^\[.*\][\#\$]\s*$"
            linesep = "\n"
            username = params_login["host_username"]
            password = params_login["host_password"]
            quit_cmd = "exit"

        if client == "ssh":
            # We only support ssh for Linux in this test
            cmd = ("ssh -o UserKnownHostsFile=/dev/null "
                   "-o StrictHostKeyChecking=no "
                   "-o PreferredAuthentications=password -p %s %[email protected]%s" %
                   (port, username, host))
        elif client == "telnet":
            cmd = "telnet -l %s %s %s" % (username, host, port)
        else:
            raise remote.LoginBadClientError(client)

        if src == "localhost":
            logging.debug("Login with command %s" % cmd)
            session = aexpect.ShellSession(cmd, linesep=linesep, prompt=prompt)
        else:
            if params_login.get("os_type") == "windows":
                if client == "telnet":
                    cmd = "C:\\telnet.py %s %s " % (host, username)
                    cmd += "%s \"%s\" && " % (password, prompt)
                    cmd += "C:\\wait_for_quit.py"
                cmd = "%s || ping 127.0.0.1 -n 5 -w 1000 > nul" % cmd
            else:
                cmd += " || sleep 5"
            session = src.wait_for_login()
            logging.debug("Sending login command: %s" % cmd)
            session.sendline(cmd)
        try:
            out = remote.handle_prompts(session, username, password,
                                        prompt, timeout, debug=True)
        except Exception, err:
            session.close()
            raise err
        try:
            session.cmd(quit_cmd)
            session.close()
        except Exception:
            pass
        return out

    def setup_service(setup_target):
        setup_timeout = int(params.get("setup_timeout", 360))
        if setup_target == "localhost":
            setup_func = utils.system_output
            os_type = "linux"
        else:
            setup_vm = env.get_vm(setup_target)
            setup_session = setup_vm.wait_for_login(timeout=timeout)
            setup_func = setup_session.cmd
            os_type = params["os_type"]

        setup_params = params.object_params(os_type)
        setup_cmd = setup_params.get("setup_cmd", "service SERVICE restart")
        prepare_cmd = setup_params.get("prepare_cmd")
        setup_cmd = re.sub("SERVICE", setup_params.get("service", ""),
                           setup_cmd)

        error.context("Set up %s service in %s" % (setup_params.get("service"),
                                                   setup_target),
                      logging.info)
        if prepare_cmd:
            setup_func(prepare_cmd, timeout=setup_timeout)
        setup_func(setup_cmd, timeout=setup_timeout)
        if setup_target != "localhost":
            setup_session.close()

    def stop_service(setup_target):
        setup_timeout = int(params.get("setup_timeout", 360))
        if setup_target == "localhost":
            setup_func = utils.system_output
            os_type = "linux"
        else:
            setup_vm = env.get_vm(setup_target)
            setup_session = setup_vm.wait_for_login(timeout=timeout)
            setup_func = setup_session.cmd
            os_type = params["os_type"]

        setup_params = params.object_params(os_type)
        stop_cmd = setup_params.get("stop_cmd", "service SERVICE stop")
        cleanup_cmd = setup_params.get("cleanup_cmd")
        stop_cmd = re.sub("SERVICE", setup_params.get("service", ""),
                          stop_cmd)

        error.context("Stop %s service in %s" % (setup_params.get("service"),
                                                 setup_target),
                      logging.info)
        if stop_cmd:
            setup_func(stop_cmd, timeout=setup_timeout)

        if cleanup_cmd:
            setup_func(cleanup_cmd, timeout=setup_timeout)

        if setup_target != "localhost":
            setup_session.close()

    timeout = int(params.get("login_timeout", '360'))
    op_timeout = int(params.get("op_timeout", "360"))
    acl_protocol = params['acl_protocol']
    acl_extra_options = params.get("acl_extra_options", "")

    for vm in env.get_all_vms():
        session = vm.wait_for_login(timeout=timeout)
        if params.get("disable_iptables") == "yes":
            session.cmd("iptables -F")
            #session.cmd_status_output("service iptables stop")
        if params.get("copy_scripts"):
            root_dir = data_dir.get_root_dir()
            script_dir = os.path.join(root_dir, "shared", "scripts")
            tmp_dir = params.get("tmp_dir", "C:\\")
            for script in params.get("copy_scripts").split():
                script_path = os.path.join(script_dir, script)
                vm.copy_files_to(script_path, tmp_dir)
        session.close()

    vms_tags = params.objects("vms")
    br_name = params.get("netdst")
    if br_name == "private":
        br_name = params.get("priv_brname", 'autotest-prbr0')

    for setup_target in params.get("setup_targets", "").split():
        setup_service(setup_target)

    access_targets = params.get("access_targets", "localhost").split()
    deny_target = params.get("deny_target", "localhost")
    all_target = params.get("extra_target", "").split() + vms_tags
    target_port = params["target_port"]
    vm = env.get_vm(vms_tags[0])
    nic = vm.virtnet[0]
    if_name = nic.ifname
    params_nic = params.object_params("nic1")
    if params["netdst"] == "private":
        params_nic["netdst"] = params_nic.get("priv_brname", "atbr0")
    host_ip = utils_net.get_host_ip_address(params_nic)
    if deny_target in vms_tags:
        deny_vm = env.get_vm(deny_target)
        deny_vm_ip = deny_vm.wait_for_get_address(0)
    elif deny_target == "localhost":
        deny_vm_ip = host_ip
    if "NW_DST" in acl_extra_options:
        acl_extra_options = re.sub("NW_DST", deny_vm_ip, acl_extra_options)
    acl_extra_options = re.sub("TARGET_PORT", target_port, acl_extra_options)

    access_sys = {}
    for target in all_target:
        if target not in access_targets:
            if target in vms_tags:
                os_type = params["os_type"]
            else:
                os_type = "linux"
            os_params = params.object_params(os_type)
            access_param = os_params.object_params(target)
            check_from_output = access_param.get("check_from_output")

            access_sys[target] = {}
            access_sys[target]['access_cmd'] = access_param['access_cmd']
            access_sys[target]['ref_cmd'] = access_param.get('ref_cmd', "")
            access_sys[target]['clean_cmd'] = access_param.get('clean_guest',
                                                               "")
            if check_from_output:
                access_sys[target]['check_from_output'] = check_from_output
            for tgt in access_targets:
                tgt_param = access_param.object_params(tgt)
                acl_disabled = tgt_param.get("acl_disabled") == "yes"
                access_sys[target]['disabled_%s' % tgt] = acl_disabled

    error.context("Try to access target before setup the rules", logging.info)
    access_service(access_sys, access_targets, False, host_ip, ref=True)
    error.context("Disable the access in ovs", logging.info)
    br_infos = utils_net.openflow_manager(br_name, "show").stdout
    if_port = re.findall("(\d+)\(%s\)" % if_name, br_infos)
    if not if_port:
        raise error.TestNAError("Can not find %s in bridge %s" % (if_name,
                                                                  br_name))
    if_port = if_port[0]

    acl_cmd = get_acl_cmd(acl_protocol, if_port, "drop", acl_extra_options)
    utils_net.openflow_manager(br_name, "add-flow", acl_cmd)
    acl_rules = utils_net.openflow_manager(br_name, "dump-flows").stdout
    if not acl_rules_check(acl_rules, acl_cmd):
        raise error.TestFail("Can not find the rules from"
                             " ovs-ofctl: %s" % acl_rules)

    error.context("Try to acess target to exam the disable rules",
                  logging.info)
    access_service(access_sys, access_targets, True, host_ip)
    error.context("Enable the access in ovs", logging.info)
    acl_cmd = get_acl_cmd(acl_protocol, if_port, "normal", acl_extra_options)
    utils_net.openflow_manager(br_name, "mod-flows", acl_cmd)
    acl_rules = utils_net.openflow_manager(br_name, "dump-flows").stdout
    if not acl_rules_check(acl_rules, acl_cmd):
        raise error.TestFail("Can not find the rules from"
                             " ovs-ofctl: %s" % acl_rules)
    error.context("Try to acess target to exam the enable rules",
                  logging.info)
    access_service(access_sys, access_targets, False, host_ip)
    error.context("Delete the access rules in ovs", logging.info)
    acl_cmd = get_acl_cmd(acl_protocol, if_port, "", acl_extra_options)
    utils_net.openflow_manager(br_name, "del-flows", acl_cmd)
    acl_rules = utils_net.openflow_manager(br_name, "dump-flows").stdout
    if acl_rules_check(acl_rules, acl_cmd):
        raise error.TestFail("Still can find the rules from"
                             " ovs-ofctl: %s" % acl_rules)
    error.context("Try to acess target to exam after delete the rules",
                  logging.info)
    access_service(access_sys, access_targets, False, host_ip)

    for setup_target in params.get("setup_targets", "").split():
        stop_service(setup_target)
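
A note on the re.sub usage above: literal patterns expand placeholders such as ACCESS_TARGET, SERVICE, NW_DST and TARGET_PORT inside command templates from the test configuration, and "action=" is rewritten to "actions=" before comparing against ovs-ofctl output. A minimal sketch of the placeholder idiom (the template and values are illustrative):

import re

access_cmd_template = "ping -c 3 ACCESS_TARGET"
acl_extra_options = "nw_dst=NW_DST,tp_dst=TARGET_PORT"

access_cmd = re.sub("ACCESS_TARGET", "192.168.122.10", access_cmd_template)
acl_extra_options = re.sub("NW_DST", "192.168.122.10", acl_extra_options)
acl_extra_options = re.sub("TARGET_PORT", "8080", acl_extra_options)

print(access_cmd)         # ping -c 3 192.168.122.10
print(acl_extra_options)  # nw_dst=192.168.122.10,tp_dst=8080

Since these placeholders contain no regex metacharacters, str.replace would behave identically here; re.sub simply keeps the code uniform with the pattern-based substitutions used elsewhere in the test.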

Example 3

Project: virt-test
Source File: unattended_install.py
    def setup_boot_disk(self):
        if self.unattended_file.endswith('.sif'):
            dest_fname = 'winnt.sif'
            setup_file = 'winnt.bat'
            boot_disk = utils_disk.FloppyDisk(self.floppy,
                                              self.qemu_img_binary,
                                              self.tmpdir, self.vfd_size)
            answer_path = boot_disk.get_answer_file_path(dest_fname)
            self.answer_windows_ini(answer_path)
            setup_file_path = os.path.join(self.unattended_dir, setup_file)
            boot_disk.copy_to(setup_file_path)
            if self.install_virtio == "yes":
                boot_disk.setup_virtio_win2003(self.virtio_floppy,
                                               self.virtio_oemsetup_id)
            boot_disk.copy_to(self.finish_program)

        elif self.unattended_file.endswith('.ks'):
            # Red Hat kickstart install
            dest_fname = 'ks.cfg'
            if self.params.get('unattended_delivery_method') == 'integrated':
                ks_param = 'ks=cdrom:/dev/sr0:/isolinux/%s' % dest_fname
                kernel_params = self.kernel_params
                if 'ks=' in kernel_params:
                    kernel_params = re.sub('ks\=[\w\d\:\.\/]+',
                                           ks_param,
                                           kernel_params)
                else:
                    kernel_params = '%s %s' % (kernel_params, ks_param)

                # Standard setting is kickstart disk in /dev/sr0 and
                # install cdrom in /dev/sr1. As we merge them together,
                # we need to change repo configuration to /dev/sr0
                if 'repo=cdrom' in kernel_params:
                    kernel_params = re.sub('repo\=cdrom[\:\w\d\/]*',
                                           'repo=cdrom:/dev/sr0',
                                           kernel_params)

                self.kernel_params = None
                boot_disk = utils_disk.CdromInstallDisk(
                    self.cdrom_unattended,
                    self.tmpdir,
                    self.cdrom_cd1_mount,
                    kernel_params)
            elif self.params.get('unattended_delivery_method') == 'url':
                if self.unattended_server_port is None:
                    self.unattended_server_port = utils_misc.find_free_port(
                        8000,
                        8099,
                        self.url_auto_content_ip)
                path = os.path.join(os.path.dirname(self.cdrom_unattended),
                                    'ks')
                boot_disk = RemoteInstall(path, self.url_auto_content_ip,
                                          self.unattended_server_port,
                                          dest_fname)
                ks_param = 'ks=%s' % boot_disk.get_url()
                kernel_params = self.kernel_params
                if 'ks=' in kernel_params:
                    kernel_params = re.sub('ks\=[\w\d\:\.\/]+',
                                           ks_param,
                                           kernel_params)
                else:
                    kernel_params = '%s %s' % (kernel_params, ks_param)

                # Standard setting is kickstart disk in /dev/sr0 and
                # install cdrom in /dev/sr1. When we get ks via http,
                # we need to change repo configuration to /dev/sr0
                kernel_params = re.sub('repo\=cdrom[\:\w\d\/]*',
                                       'repo=cdrom:/dev/sr0',
                                       kernel_params)

                self.kernel_params = kernel_params
            elif self.params.get('unattended_delivery_method') == 'cdrom':
                boot_disk = utils_disk.CdromDisk(self.cdrom_unattended,
                                                 self.tmpdir)
            elif self.params.get('unattended_delivery_method') == 'floppy':
                boot_disk = utils_disk.FloppyDisk(self.floppy,
                                                  self.qemu_img_binary,
                                                  self.tmpdir, self.vfd_size)
                ks_param = 'ks=floppy'
                kernel_params = self.kernel_params
                if 'ks=' in kernel_params:
                    # Reading ks from the floppy directly doesn't work in
                    # some OSes; the option 'ks=hd:/dev/fd0' can read ks from
                    # the mounted floppy, so skip replacing it here.
                    if not re.search("fd\d+", kernel_params):
                        kernel_params = re.sub('ks\=[\w\d\:\.\/]+',
                                               ks_param,
                                               kernel_params)
                else:
                    kernel_params = '%s %s' % (kernel_params, ks_param)

                kernel_params = re.sub('repo\=cdrom[\:\w\d\/]*',
                                       'repo=cdrom:/dev/sr0',
                                       kernel_params)

                self.kernel_params = kernel_params
            else:
                raise ValueError("Neither cdrom_unattended nor floppy set "
                                 "on the config file, please verify")
            answer_path = boot_disk.get_answer_file_path(dest_fname)
            self.answer_kickstart(answer_path)

        elif self.unattended_file.endswith('.xml'):
            if "autoyast" in self.kernel_params:
                # SUSE autoyast install
                dest_fname = "autoinst.xml"
                if (self.cdrom_unattended and
                        self.params.get('unattended_delivery_method') == 'cdrom'):
                    boot_disk = utils_disk.CdromDisk(self.cdrom_unattended,
                                                     self.tmpdir)
                elif self.floppy:
                    autoyast_param = 'autoyast=device://fd0/autoinst.xml'
                    kernel_params = self.kernel_params
                    if 'autoyast=' in kernel_params:
                        kernel_params = re.sub('autoyast\=[\w\d\:\.\/]+',
                                               autoyast_param,
                                               kernel_params)
                    else:
                        kernel_params = '%s %s' % (
                            kernel_params, autoyast_param)

                    self.kernel_params = kernel_params
                    boot_disk = utils_disk.FloppyDisk(self.floppy,
                                                      self.qemu_img_binary,
                                                      self.tmpdir,
                                                      self.vfd_size)
                else:
                    raise ValueError("Neither cdrom_unattended nor floppy set "
                                     "on the config file, please verify")
                answer_path = boot_disk.get_answer_file_path(dest_fname)
                self.answer_suse_xml(answer_path)

            else:
                # Windows unattended install
                dest_fname = "autounattend.xml"
                if self.params.get('unattended_delivery_method') == 'cdrom':
                    boot_disk = utils_disk.CdromDisk(self.cdrom_unattended,
                                                     self.tmpdir)
                    if self.install_virtio == "yes":
                        boot_disk.setup_virtio_win2008(self.virtio_floppy,
                                                       self.cdrom_virtio)
                    else:
                        self.cdrom_virtio = None
                else:
                    boot_disk = utils_disk.FloppyDisk(self.floppy,
                                                      self.qemu_img_binary,
                                                      self.tmpdir,
                                                      self.vfd_size)
                    if self.install_virtio == "yes":
                        boot_disk.setup_virtio_win2008(self.virtio_floppy)
                answer_path = boot_disk.get_answer_file_path(dest_fname)
                self.answer_windows_xml(answer_path)

                boot_disk.copy_to(self.finish_program)

        else:
            raise ValueError('Unknown answer file type: %s' %
                             self.unattended_file)

        boot_disk.close()
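
A note on the re.sub usage above: the substitutions rewrite kernel command-line arguments in place, replacing an existing ks= option with the chosen delivery method and repointing repo=cdrom at /dev/sr0. A standalone sketch using the same patterns (the command line is illustrative):

import re

kernel_params = "quiet ks=cdrom:/dev/sr0:/isolinux/ks.cfg repo=cdrom:/dev/sr1"
# Replace the existing ks= argument with the new delivery method.
kernel_params = re.sub(r'ks\=[\w\d\:\.\/]+', 'ks=floppy', kernel_params)
# Repoint the repo=cdrom argument at /dev/sr0.
kernel_params = re.sub(r'repo\=cdrom[\:\w\d\/]*', 'repo=cdrom:/dev/sr0',
                       kernel_params)
print(kernel_params)   # quiet ks=floppy repo=cdrom:/dev/sr0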

Example 4

Project: pysb
Source File: matlab.py
    def export(self):
        """Generate a MATLAB class definition containing the ODEs for the PySB
        model associated with the exporter.

        Returns
        -------
        string
            String containing the MATLAB code for an implementation of the
            model's ODEs.
        """
        output = StringIO()
        pysb.bng.generate_equations(self.model)

        docstring = ''
        if self.docstring:
            docstring += self.docstring.replace('\n', '\n    % ')

        # Substitute underscores for any dots in the model name
        model_name = self.model.name.replace('.', '_')

        # -- Parameters and Initial conditions -------
        # Declare the list of parameters as a struct
        params_str = 'self.parameters = struct( ...\n'+' '*16
        params_str_list = []
        for i, p in enumerate(self.model.parameters):
            # Add parameter to struct along with nominal value
            cur_p_str = "'%s', %.17g" % (_fix_underscores(p.name), p.value)
            # Decide whether to continue or terminate the struct declaration:
            if i == len(self.model.parameters) - 1:
                cur_p_str += ');'    # terminate
            else:
                cur_p_str += ', ...' # continue

            params_str_list.append(cur_p_str)

        # Format and indent the params struct declaration
        params_str += ('\n'+' '*16).join(params_str_list)

        # Fill in an array of the initial conditions based on the named
        # parameter values
        initial_values_str = ('initial_values = zeros(1,%d);\n'+' '*12) % \
                             len(self.model.species)
        initial_values_str += ('\n'+' '*12).join(
                ['initial_values(%d) = self.parameters.%s; %% %s' %
                 (i+1, _fix_underscores(ic[1].name), ic[0])
                 for i, ic in enumerate(self.model.initial_conditions)])

        # -- Build observables declaration --
        observables_str = 'self.observables = struct( ...\n'+' '*16
        observables_str_list = []
        for i, obs in enumerate(self.model.observables):
            # Associate species and coefficient lists with observable names,
            # changing from zero- to one-based indexing
            cur_obs_str = "'%s', [%s; %s]" % \
                          (_fix_underscores(obs.name),
                           ' '.join([str(sp+1) for sp in obs.species]),
                           ' '.join([str(c) for c in obs.coefficients]))
            # Decide whether to continue or terminate the struct declaration:
            if i == len(self.model.observables) - 1:
                cur_obs_str += ');'    # terminate
            else:
                cur_obs_str += ', ...' # continue

            observables_str_list.append(cur_obs_str)
        # Format and indent the observables struct declaration
        observables_str += ('\n'+' '*16).join(observables_str_list)

        # -- Build ODEs -------
        # Build a stringified list of species
        species_list = ['%% %s;' % s for i, s in enumerate(self.model.species)]
        # Build the ODEs as strings from the model.odes array
        odes_list = ['y(%d,1) = %s;' % (i+1, sympy.ccode(self.model.odes[i])) 
                     for i in range(len(self.model.odes))] 
        # Zip the ODEs and species string lists and then flatten them
        # (results in the interleaving of the two lists)
        odes_species_list = [item for sublist in zip(species_list, odes_list)
                                  for item in sublist]
        # Flatten to a string and add correct indentation
        odes_str = ('\n'+' '*12).join(odes_species_list)

        # Change species names from, e.g., '__s(0)' to 'y0(1)' (note change
        # from zero-based indexing to 1-based indexing)
        odes_str = re.sub(r'__s(\d+)', \
                          lambda m: 'y0(%s)' % (int(m.group(1))+1), odes_str)
        # Change C code 'pow' function to MATLAB 'power' function
        odes_str = re.sub(r'pow\(', 'power(', odes_str)
        # Prepend 'p.' to named parameters and fix any underscores
        for i, p in enumerate(self.model.parameters):
            odes_str = re.sub(r'\b(%s)\b' % p.name,
                              'p.%s' % _fix_underscores(p.name), odes_str)

        # -- Build final output --
        output.write(pad(r"""
            classdef %(model_name)s
                %% %(docstring)s
                %% A class implementing the ordinary differential equations
                %% for the %(model_name)s model.
                %%
                %% Save as %(model_name)s.m.
                %%
                %% Generated by pysb.export.matlab.MatlabExporter.
                %%
                %% Properties
                %% ----------
                %% observables : struct
                %%     A struct containing the names of the observables from the
                %%     PySB model as field names. Each field in the struct
                %%     maps the observable name to a matrix with two rows:
                %%     the first row specifies the indices of the species
                %%     associated with the observable, and the second row
                %%     specifies the coefficients associated with the species.
                %%     For any given timecourse of model species resulting from
                %%     integration, the timecourse for an observable can be
                %%     retrieved using the get_observable method, described
                %%     below.
                %%
                %% parameters : struct
                %%     A struct containing the names of the parameters from the
                %%     PySB model as field names. The nominal values are set by
                %%     the constructor and their values can be overridden
                %%     explicitly once an instance has been created.
                %%
                %% Methods
                %% -------
                %% %(model_name)s.odes(tspan, y0)
                %%     The right-hand side function for the ODEs of the model,
                %%     for use with MATLAB ODE solvers (see Examples).
                %%
                %% %(model_name)s.get_initial_values()
                %%     Returns a vector of initial values for all species,
                %%     specified in the order that they occur in the original
                %%     PySB model (i.e., in the order found in model.species).
                %%     Non-zero initial conditions are specified using the
                %%     named parameters included as properties of the instance.
                %%     Hence initial conditions other than the defaults can be
                %%     used by assigning a value to the named parameter and then
                %%     calling this method. The vector returned by the method
                %%     is used for integration by passing it to the MATLAB
                %%     solver as the y0 argument.
                %%
                %% %(model_name)s.get_observables(y)
                %%     Given a matrix of timecourses for all model species
                %%     (i.e., resulting from an integration of the model),
                %%     get the trajectories corresponding to the observables.
                %%     Timecourses are returned as a struct which can be
                %%     indexed by observable name.
                %%
                %% Examples
                %% --------
                %% Example integration using default initial and parameter
                %% values:
                %%
                %% >> m = %(model_name)s();
                %% >> tspan = [0 100];
                %% >> [t y] = ode15s(@m.odes, tspan, m.get_initial_values());
                %%
                %% Retrieving the observables:
                %%
                %% >> y_obs = m.get_observables(y)
                %%
                properties
                    observables
                    parameters
                end

                methods
                    function self = %(model_name)s()
                        %% Assign default parameter values
                        %(params_str)s

                        %% Define species indices (first row) and coefficients
                        %% (second row) of named observables
                        %(observables_str)s
                    end

                    function initial_values = get_initial_values(self)
                        %% Return the vector of initial conditions for all
                        %% species based on the values of the parameters
                        %% as currently defined in the instance.

                        %(initial_values_str)s
                    end

                    function y = odes(self, tspan, y0)
                        %% Right hand side function for the ODEs

                        %% Shorthand for the struct of model parameters
                        p = self.parameters;

                        %(odes_str)s
                    end

                    function y_obs = get_observables(self, y)
                        %% Retrieve the trajectories for the model observables
                        %% from a matrix of the trajectories of all model
                        %% species.

                        %% Initialize the struct of observable timecourses
                        %% that we will return
                        y_obs = struct();

                        %% Iterate over the observables;
                        observable_names = fieldnames(self.observables);
                        for i = 1:numel(observable_names)
                            obs_matrix = self.observables.(observable_names{i});
                            species = obs_matrix(1, :);
                            coefficients = obs_matrix(2, :);
                            y_obs.(observable_names{i}) = ...
                                            y(:, species) * coefficients';
                        end
                    end
                end
            end
            """, 0) %
            {'docstring': docstring,
             'model_name': model_name,
             'params_str': params_str,
             'initial_values_str': initial_values_str,
             'observables_str': observables_str,
             'odes_str': odes_str})

        return output.getvalue()
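
A note on the re.sub usage above: a callable replacement converts the zero-based species symbols (__s0, __s1, ...) into one-based MATLAB references (y0(1), y0(2), ...), and \b word boundaries keep one parameter name from matching inside another name that contains it as a prefix. A minimal sketch (the ODE string and parameter names are illustrative):

import re

ode_str = "k_deg*__s0 - k_deg_fast*__s12"
# Callable replacement: shift each captured species index from 0-based to 1-based.
ode_str = re.sub(r'__s(\d+)', lambda m: 'y0(%s)' % (int(m.group(1)) + 1), ode_str)
# Word boundaries: 'k_deg' is rewritten, but the 'k_deg' prefix of 'k_deg_fast' is not.
ode_str = re.sub(r'\b(k_deg)\b', 'p.k_deg', ode_str)
print(ode_str)   # p.k_deg*y0(1) - k_deg_fast*y0(13)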

Example 5

Project: pysb
Source File: scipyode.py
    def __init__(self, model, tspan=None, initials=None, param_values=None,
                 verbose=False, **kwargs):
        super(ScipyOdeSimulator, self).__init__(model,
                                                tspan=tspan,
                                                initials=initials,
                                                param_values=param_values,
                                                verbose=verbose,
                                                **kwargs)
        # We'll need to know if we're using the Jacobian when we get to run()
        self._use_analytic_jacobian = kwargs.get('use_analytic_jacobian',
                                                 False)
        self.cleanup = kwargs.get('cleanup', True)
        integrator = kwargs.get('integrator', 'vode')
        # Generate the equations for the model
        pysb.bng.generate_equations(self._model, self.cleanup, self.verbose)

        def _eqn_substitutions(eqns):
            """String substitutions on the sympy C code for the ODE RHS and
            Jacobian functions to use appropriate terms for variables and
            parameters."""
            # Substitute expanded parameter formulas for any named expressions
            for e in self._model.expressions:
                eqns = re.sub(r'\b(%s)\b' % e.name, '(' + sympy.ccode(
                    e.expand_expr()) + ')', eqns)

            # Substitute sums of observable species that could've been added
            # by expressions
            for obs in self._model.observables:
                obs_string = ''
                for i in range(len(obs.coefficients)):
                    if i > 0:
                        obs_string += "+"
                    if obs.coefficients[i] > 1:
                        obs_string += str(obs.coefficients[i]) + "*"
                    obs_string += "__s" + str(obs.species[i])
                if len(obs.coefficients) > 1:
                    obs_string = '(' + obs_string + ')'
                eqns = re.sub(r'\b(%s)\b' % obs.name, obs_string, eqns)

            # Substitute 'y[i]' for 'si'
            eqns = re.sub(r'\b__s(\d+)\b',
                          lambda m: 'y[%s]' % (int(m.group(1))),
                          eqns)

            # Substitute 'p[i]' for any named parameters
            for i, p in enumerate(self._model.parameters):
                eqns = re.sub(r'\b(%s)\b' % p.name, 'p[%d]' % i, eqns)
            return eqns

        # ODE RHS -----------------------------------------------
        # Prepare the string representations of the RHS equations
        code_eqs = '\n'.join(['ydot[%d] = %s;' %
                              (i, sympy.ccode(self._model.odes[i]))
                              for i in range(len(self._model.odes))])
        code_eqs = _eqn_substitutions(code_eqs)

        self._test_inline()

        # If we can't use weave.inline to run the C code, compile it as
        # Python code instead for use with exec. Note: C code with array
        # indexing, basic math operations, and pow() just happens to also
        # be valid Python. If the equations ever have more complex things
        # in them, this might fail.
        if not self._use_inline:
            code_eqs_py = compile(code_eqs, '<%s odes>' % self._model.name,
                                  'exec')
        else:
            for arr_name in ('ydot', 'y', 'p'):
                macro = arr_name.upper() + '1'
                code_eqs = re.sub(r'\b%s\[(\d+)\]' % arr_name,
                                  '%s(\\1)' % macro, code_eqs)

        def rhs(t, y, p):
            ydot = self.ydot
            # note that the evaluated code sets ydot as a side effect
            if self._use_inline:
                weave_inline(code_eqs, ['ydot', 't', 'y', 'p'])
            else:
                _exec(code_eqs_py, locals())
            return ydot

        # JACOBIAN -----------------------------------------------
        # We'll keep the code for putting together the matrix in Sympy
        # in case we want to do manipulations of the matrix later (e.g., to
        # put together the sensitivity matrix)
        jac_fn = None
        if self._use_analytic_jacobian:
            species_names = ['__s%d' % i for i in
                             range(len(self._model.species))]
            jac_matrix = []
            # Rows of jac_matrix are by equation f_i:
            # [[df1/x1, df1/x2, ..., df1/xn],
            #  [   ...                     ],
            #  [dfn/x1, dfn/x2, ..., dfn/xn]]
            for eqn in self._model.odes:
                # Derivatives for f_i...
                jac_row = []
                for species_name in species_names:
                    # ... with respect to s_j
                    d = sympy.diff(eqn, species_name)
                    jac_row.append(d)
                jac_matrix.append(jac_row)

            # Next, prepare the stringified Jacobian equations
            jac_eqs_list = []
            for i, row in enumerate(jac_matrix):
                for j, entry in enumerate(row):
                    # Skip zero entries in the Jacobian
                    if entry == 0:
                        continue
                    jac_eq_str = 'jac[%d, %d] = %s;' % (
                        i, j, sympy.ccode(entry))
                    jac_eqs_list.append(jac_eq_str)
            jac_eqs = _eqn_substitutions('\n'.join(jac_eqs_list))

            # Try to inline the Jacobian if possible (as above for RHS)
            if not self._use_inline:
                jac_eqs_py = compile(jac_eqs,
                                     '<%s jacobian>' % self._model.name, 'exec')
            else:
                # Substitute 2-D array refs with calls to the JAC2 macro for inline
                jac_eqs = re.sub(r'\bjac\[(\d+), (\d+)\]',
                                 r'JAC2(\1, \2)', jac_eqs)
                # Substitute calls to the Y1 and P1 macros
                for arr_name in ('y', 'p'):
                    macro = arr_name.upper() + '1'
                    jac_eqs = re.sub(r'\b%s\[(\d+)\]' % arr_name,
                                     '%s(\\1)' % macro, jac_eqs)

            def jacobian(t, y, p):
                jac = self.jac
                # note that the evaluated code sets jac as a side effect
                if self._use_inline:
                    weave_inline(jac_eqs, ['jac', 't', 'y', 'p'])
                else:
                    _exec(jac_eqs_py, locals())
                return jac

            # Initialize the matrix used to store the Jacobian; jacobian()
            # above fills self.jac in place each time it is called.
            self.jac = np.zeros(
                (len(self._model.odes), len(self._model.species)))
            jac_fn = jacobian

        # build integrator options list from our defaults and any kwargs
        # passed to this function
        options = {}
        if self.default_integrator_options.get(integrator):
            # start from the default options for this integrator
            options.update(self.default_integrator_options[integrator])

        # then overwrite the defaults with any user-supplied options
        options.update(kwargs.get('integrator_options', {}))
        self.opts = options
        self.ydot = np.ndarray(len(self._model.species))

        # Integrator
        if integrator == 'lsoda':
            # lsoda is accessed via scipy.integrate.odeint which, as a
            # function, requires that we pass its args at the point of call.
            # Thus we need to stash stuff like the rhs and jacobian functions
            # in self so we can pass them in later.
            self.integrator = integrator
            # lsoda's rhs and jacobian function arguments are in a different
            # order to other integrators, so we define these shims that swizzle
            # the argument order appropriately.
            self.func = lambda t, y, p: rhs(y, t, p)
            if jac_fn is None:
                self.jac_fn = None
            else:
                self.jac_fn = lambda t, y, p: jac_fn(y, t, p)
        else:
            # The scipy.integrate.ode integrators on the other hand are object
            # oriented and hold the functions and such internally. Once we set
            # up the integrator object we only need to retain a reference to it
            # and can forget about the other bits.
            self.integrator = scipy.integrate.ode(rhs, jac=jac_fn)
            with warnings.catch_warnings():
                warnings.filterwarnings('error', 'No integrator name match')
                self.integrator.set_integrator(integrator, **options)
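
A note on the re.sub calls above: the generated equation strings use canonical species symbols like __s0, which _eqn_substitutions rewrites into y[0] array lookups with a replacement callable, and the weave branch then rewrites array indexing into macro calls using a \1 backreference. Below is a minimal standalone sketch of those two passes; the equation string and parameter list are invented for illustration, and the re.escape guard is an addition not present in the original.

import re

code_eqs = "ydot[0] = -k1*__s0*__s1;\nydot[1] = k1*__s0*__s1;"

# '__s<n>' -> 'y[<n>]' via a replacement callable
code_eqs = re.sub(r'\b__s(\d+)\b', lambda m: 'y[%s]' % m.group(1), code_eqs)

# parameter names -> 'p[<i>]'; re.escape protects names containing
# regex metacharacters (the original interpolates p.name directly)
for i, name in enumerate(['k1']):
    code_eqs = re.sub(r'\b%s\b' % re.escape(name), 'p[%d]' % i, code_eqs)

# array refs -> weave-style macros, e.g. 'ydot[0]' -> 'YDOT1(0)'
for arr_name in ('ydot', 'y', 'p'):
    code_eqs = re.sub(r'\b%s\[(\d+)\]' % arr_name,
                      r'%s1(\1)' % arr_name.upper(), code_eqs)

print(code_eqs)  # YDOT1(0) = -P1(0)*Y1(0)*Y1(1); ...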

Example 6

Project: avocado
Source File: gdbmi_parser.py
View license
def __private():
    class Token:

        def __init__(self, type, value=None):
            self.type = type
            self.value = value

        def __cmp__(self, o):
            return cmp(self.type, o)

        def __repr__(self):
            return self.value or self.type

    class AST:

        def __init__(self, type):
            self.type = type
            self._kids = []

        def __getitem__(self, i):
            return self._kids[i]

        def __len__(self):
            return len(self._kids)

        def __setslice__(self, low, high, seq):
            self._kids[low:high] = seq

        def __cmp__(self, o):
            return cmp(self.type, o)

    class GdbMiScannerBase(spark.GenericScanner):

        def tokenize(self, input):
            self.rv = []
            spark.GenericScanner.tokenize(self, input)
            return self.rv

        def t_nl(self, s):
            r'\n|\r\n'
            self.rv.append(Token('nl'))

        def t_whitespace(self, s):
            r'[ \t\f\v]+'
            pass

        def t_symbol(self, s):
            r',|\{|\}|\[|\]|\='
            self.rv.append(Token(s, s))

        def t_result_type(self, s):
            r'\*|\+|\^'
            self.rv.append(Token('result_type', s))

        def t_stream_type(self, s):
            r'\@|\&|\~'
            self.rv.append(Token('stream_type', s))

        def t_string(self, s):
            r'[\w-]+'
            self.rv.append(Token('string', s))

        def t_c_string(self, s):
            r'\".*?(?<![\\\\])\"'
            inner = self.__unescape(s[1:len(s) - 1])
            self.rv.append(Token('c_string', inner))

        def t_default(self, s):
            r'( . | \n )+'
            raise Exception("Specification error: unmatched input for '%s'" % s)

        def __unescape(self, s):
            s = re.sub(r'\\r', r'\r', s)
            s = re.sub(r'\\n', r'\n', s)
            s = re.sub(r'\\t', r'\t', s)
            return re.sub(r'\\(.)', r'\1', s)

    class GdbMiScanner(GdbMiScannerBase):

        def t_token(self, s):
            r'\d+'
            self.rv.append(Token('token', s))

    class GdbMiParser(spark.GenericASTBuilder):

        def __init__(self):
            spark.GenericASTBuilder.__init__(self, AST, 'output')

        def p_output(self, args):
            """
            output ::= record_list
            record_list ::= generic_record
            record_list ::= generic_record record_list
            generic_record ::= result_record
            generic_record ::= stream_record
            result_record ::= result_header result_list nl
            result_record ::= result_header nl
            result_header ::= token result_type class
            result_header ::= result_type class
            result_header ::= token = class
            result_header ::= = class
            stream_record ::= stream_type c_string nl
            result_list ::= , result result_list
            result_list ::= , result
            result ::= variable = value
            class ::= string
            variable ::= string
            value ::= const
            value ::= tuple
            value ::= list
            value_list ::= , value
            value_list ::= , value value_list
            const ::= c_string
            tuple ::= { }
            tuple ::= { result }
            tuple ::= { result result_list }
            list ::= [ ]
            list ::= [ value ]
            list ::= [ value value_list ]
            list ::= [ result ]
            list ::= [ result result_list ]
            list ::= { value }
            list ::= { value value_list }
            """
            pass

        def terminal(self, token):
            #  Homogeneous AST.
            rv = AST(token.type)
            rv.value = token.value
            return rv

        def nonterminal(self, type, args):
            #  Flatten AST a bit by not making nodes if there's only one child.
            exclude = [
                'record_list'
            ]
            if len(args) == 1 and type not in exclude:
                return args[0]
            return spark.GenericASTBuilder.nonterminal(self, type, args)

        def error(self, token, i=0, tokens=None):
            if i > 2:
                print('%s %s %s %s' % (tokens[i - 3],
                                       tokens[i - 2],
                                       tokens[i - 1],
                                       tokens[i]))
            raise Exception("Syntax error at or near %d:'%s' token" % (i, token))

    class GdbMiInterpreter(spark.GenericASTTraversal):

        def __init__(self, ast):
            spark.GenericASTTraversal.__init__(self, ast)
            self.postorder()

        def __translate_type(self, type):
            table = {
                '^': 'result',
                '=': 'notify',
                '+': 'status',
                '*': 'exec',
                '~': 'console',
                '@': 'target',
                '&': 'log'
            }
            return table[type]

        def n_result(self, node):
            # result ::= variable = value
            node.value = {node[0].value: node[2].value}

        def n_tuple(self, node):
            if len(node) == 2:
                # tuple ::= {}
                node.value = {}
            elif len(node) == 3:
                # tuple ::= { result }
                node.value = node[1].value
            elif len(node) == 4:
                # tuple ::= { result result_list }
                node.value = node[1].value
                for result in node[2].value:
                    for n, v in result.items():
                        if node.value.has_key(n):
                            old = node.value[n]
                            if not isinstance(old, list):
                                node.value[n] = [node.value[n]]
                            node.value[n].append(v)
                        else:
                            node.value[n] = v
            else:
                raise Exception('Invalid tuple')

        def n_list(self, node):
            if len(node) == 2:
                # list ::= []
                node.value = []
            elif len(node) == 3:
                # list ::= [ value ]
                node.value = [node[1].value]
            elif len(node) == 4:
                # list ::= [ value value_list ]
                node.value = [node[1].value] + node[2].value
                #list ::= [ result ]
                #list ::= [ result result_list ]
                #list ::= { value }
                #list ::= { value value_list }

        def n_value_list(self, node):
            if len(node) == 2:
                #value_list ::= , value
                node.value = [node[1].value]
            elif len(node) == 3:
                #value_list ::= , value value_list
                node.value = [node[1].value] + node[2].value

        def n_result_list(self, node):
            if len(node) == 2:
                # result_list ::= , result
                node.value = [node[1].value]
            else:
                # result_list ::= , result result_list
                node.value = [node[1].value] + node[2].value

        def n_result_record(self, node):
            node.value = node[0].value
            if len(node) == 3:
                # result_record ::= result_header result_list nl
                node.value['results'] = node[1].value
            elif len(node) == 2:
                # result_record ::= result_header nl
                pass

        def n_result_header(self, node):
            if len(node) == 3:
                # result_header ::= token result_type class
                node.value = {
                    'token': node[0].value,
                    'type': self.__translate_type(node[1].value),
                    'class_': node[2].value,
                    'record_type': 'result'
                }
            elif len(node) == 2:
                # result_header ::= result_type class
                node.value = {
                    'token': None,
                    'type': self.__translate_type(node[0].value),
                    'class_': node[1].value,
                    'record_type': 'result'
                }

        def n_stream_record(self, node):
            # stream_record ::= stream_type c_string nl
            node.value = {
                'type': self.__translate_type(node[0].value),
                'value': node[1].value,
                'record_type': 'stream'
            }

        def n_record_list(self, node):
            if len(node) == 1:
                # record_list ::= generic_record
                node.value = [node[0].value]
            elif len(node) == 2:
                # record_list ::= generic_record record_list
                node.value = [node[0].value] + node[1].value

    class GdbDynamicObject:

        def __init__(self, dict_):
            self.graft(dict_)

        def __repr__(self):
            return pprint.pformat(self.__dict__)

        def __nonzero__(self):
            return len(self.__dict__) > 0

        def __getitem__(self, i):
            if i == 0 and len(self.__dict__) > 0:
                return self
            else:
                raise IndexError

        def __getattr__(self, name):
            if name.startswith('__'):
                raise AttributeError
            return None

        def graft(self, dict_):
            for name, value in dict_.items():
                name = name.replace('-', '_')
                if isinstance(value, dict):
                    value = GdbDynamicObject(value)
                elif isinstance(value, list):
                    x = value
                    value = []
                    for item in x:
                        if isinstance(item, dict):
                            item = GdbDynamicObject(item)
                        value.append(item)
                setattr(self, name, value)

    class GdbMiRecord:

        def __init__(self, record):
            self.result = None
            for name, value in record[0].items():
                name = name.replace('-', '_')
                if name == 'results':
                    for result in value:
                        if not self.result:
                            self.result = GdbDynamicObject(result)
                        else:
                            # graft this result to self.results
                            self.result.graft(result)
                else:
                    setattr(self, name, value)

        def __repr__(self):
            return pprint.pformat(self.__dict__)

    return (GdbMiScanner(), GdbMiParser(), GdbMiInterpreter, GdbMiRecord)
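
The re.sub usage here is concentrated in GdbMiScannerBase.__unescape, which turns the literal \r, \n and \t escape sequences inside a GDB/MI c_string token into real characters and then strips any remaining backslash escape with a \1 backreference. A small standalone sketch of the same chain (the sample payload is invented for illustration):

import re

def unescape(s):
    s = re.sub(r'\\r', '\r', s)           # literal backslash-r -> carriage return
    s = re.sub(r'\\n', '\n', s)           # literal backslash-n -> newline
    s = re.sub(r'\\t', '\t', s)           # literal backslash-t -> tab
    return re.sub(r'\\(.)', r'\1', s)     # drop any other backslash escape

print(unescape(r'value=\"hello\"\n'))     # value="hello" followed by a real newline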

Example 7

Project: rst2pdf
Source File: tenjin.py
View license
def _create_helpers_module():

    def to_str(val):
        """Convert value into string. Return '' if val is None.
           ex.
             >>> to_str(None)
             ''
             >>> to_str("foo")
             'foo'
             >>> to_str(u"\u65e5\u672c\u8a9e")
             u'\u65e5\u672c\u8a9e'
             >>> to_str(123)
             '123'
        """
        if val is None:
            return ''
        if isinstance(val, str):
            return val
        return str(val, 'utf-8')

    def generate_tostrfunc(encoding):
        """Generate 'to_str' function which encodes unicode to str.
           ex.
              import tenjin
              from tenjin.helpers import escape
              to_str = tenjin.generate_tostrfunc('utf-8')
              engine = tenjin.Engine()
              context = { 'items': [u'AAA', u'BBB', u'CCC'] }
              output = engine.render('example.pyhtml')
              print output
        """
        def to_str(val):
            if val is None:
                return ''
            if isinstance(val, str):
                return val
            return str(val, 'utf-8')
        return to_str

    def echo(string):
        """Add string value into _buf. This is equivalent to '#{string}'."""
        frame = sys._getframe(1)
        context = frame.f_locals
        context['_buf'].append(string)

    def start_capture(varname=None):
        """
        start capturing with name.

        ex. list.rbhtml
          <html><body>
          <?py start_capture('itemlist') ?>
            <ul>
              <?py for item in list: ?>
              <li>${item}</li>
              <?py #end ?>
            </ul>
          <?py stop_capture() ?>
          </body></html>

        ex. layout.rbhtml
          <html xml:lang="en" lang="en">
           <head>
            <title>Capture Example</title>
           </head>
           <body>
            <!-- content -->
          #{itemlist}
            <!-- /content -->
           </body>
          </html>
        """
        frame = sys._getframe(1)
        context = frame.f_locals
        context['_buf_tmp'] = context['_buf']
        context['_capture_varname'] = varname
        context['_buf'] = []

    def stop_capture(store_to_context=True):
        """
        stop capturing and return the result of capturing.
        if store_to_context is True then the result is stored into _context[varname].
        """
        frame = sys._getframe(1)
        context = frame.f_locals
        result = ''.join(context['_buf'])
        context['_buf'] = context.pop('_buf_tmp')
        varname = context.pop('_capture_varname')
        if varname:
            context[varname] = result
            if store_to_context:
                context['_context'][varname] = result
        return result

    def captured_as(name):
        """
        helper method for layout template.
        if captured string is found then append it to _buf and return True,
        else return False.
        """
        frame = sys._getframe(1)
        context = frame.f_locals
        if name in context:
            _buf = context['_buf']
            _buf.append(context[name])
            return True
        return False

    def _p(arg):
        """ex. '/show/'+_p("item['id']") => "/show/#{item['id']}" """
        return '<`#%s#`>' % arg    # decoded into #{...} by preprocessor

    def _P(arg):
        """ex. '<b>%s</b>' % _P("item['id']") => "<b>${item['id']}</b>" """
        return '<`$%s$`>' % arg    # decoded into ${...} by preprocessor

    def _decode_params(s):
        """decode <`#...#`> and <`$...$`> into #{...} and ${...}"""
        from urllib.parse import unquote
        dct = { 'lt':'<', 'gt':'>', 'amp':'&', 'quot':'"', '#039':"'", }
        def unescape(s):
            #return s.replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"').replace('&#039;', "'").replace('&amp;',  '&')
            return re.sub(r'&(lt|gt|quot|amp|#039);',  lambda m: dct[m.group(1)],  s)
        s = re.sub(r'%3C%60%23(.*?)%23%60%3E', lambda m: '#{%s}' % unquote(m.group(1)), s)
        s = re.sub(r'%3C%60%24(.*?)%24%60%3E', lambda m: '${%s}' % unquote(m.group(1)), s)
        s = re.sub(r'&lt;`#(.*?)#`&gt;',   lambda m: '#{%s}' % unescape(m.group(1)), s)
        s = re.sub(r'&lt;`\$(.*?)\$`&gt;', lambda m: '${%s}' % unescape(m.group(1)), s)
        s = re.sub(r'<`#(.*?)#`>', r'#{\1}', s)
        s = re.sub(r'<`\$(.*?)\$`>', r'${\1}', s)
        return s

    mod = _create_module('tenjin.helpers')
    mod.to_str             = to_str
    mod.generate_tostrfunc = generate_tostrfunc
    mod.echo               = echo
    mod.start_capture      = start_capture
    mod.stop_capture       = stop_capture
    mod.captured_as        = captured_as
    mod._p                 = _p
    mod._P                 = _P
    mod._decode_params     = _decode_params
    mod.__all__ = ['escape', 'to_str', 'echo', 'generate_tostrfunc',
                   'start_capture', 'stop_capture', 'captured_as',
                   '_p', '_P', '_decode_params',
                   ]
    return mod
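
The interesting re.sub calls here are in _decode_params, which combine a lambda replacement with urllib unquote to turn URL-encoded <`#...#`> and <`$...$`> markers back into #{...} and ${...} template expressions. A standalone sketch of the same idea (the sample string is invented for illustration):

import re
from urllib.parse import unquote

sample = 'href="/show/%3C%60%23item%5B%27id%27%5D%23%60%3E"'

# %3C%60%23 ... %23%60%3E is the URL-encoded form of <`# ... #`>
decoded = re.sub(r'%3C%60%23(.*?)%23%60%3E',
                 lambda m: '#{%s}' % unquote(m.group(1)),
                 sample)
print(decoded)  # href="/show/#{item['id']}"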

Example 8

Project: kamaelia_
Source File: Requester.py
View license
    def doStuff(self, channel):
        # Check what's on for each channel
        self.send(channel, "whatson")
        while not self.dataReady("whatson"):
            pass
        data = self.recv("whatson")
        if data == None:
            pid = None
        else:
            pid = data[0]
            title = data[1]
            offset = data[2]
            duration = data[3]
            expectedstart = data[4]
        if pid != self.channels[channel]:
            # Perhaps just do a duplicate scan before creating Twitter stream
            if pid == None:
                self.channels[channel] = None
                Print (channel, ": Off Air")
            else:
                self.channels[channel] = pid
                self.send(["http://www.bbc.co.uk/programmes/" + pid + ".rdf"], "dataout")
                while not self.dataReady("datain"):
                    pass
                recvdata = self.recv("datain")
                
                if recvdata[0] == "OK":
                    programmedata = recvdata[1]
                else:
                    # Fake programme data to prevent crash - not ideal
                    programmedata = '<?xml version="1.0" encoding="utf-8"?> \
                                    <rdf:RDF xmlns:rdf      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \
                                             xmlns:rdfs     = "http://www.w3.org/2000/01/rdf-schema#" \
                                             xmlns:owl      = "http://www.w3.org/2002/07/owl#" \
                                             xmlns:foaf     = "http://xmlns.com/foaf/0.1/" \
                                             xmlns:po       = "http://purl.org/ontology/po/" \
                                             xmlns:mo       = "http://purl.org/ontology/mo/" \
                                             xmlns:skos     = "http://www.w3.org/2008/05/skos#" \
                                             xmlns:time     = "http://www.w3.org/2006/time#" \
                                             xmlns:dc       = "http://purl.org/dc/elements/1.1/" \
                                             xmlns:dcterms  = "http://purl.org/dc/terms/" \
                                             xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \
                                             xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \
                                             xmlns:event    = "http://purl.org/NET/c4dm/event.owl#"> \
                                    </rdf:RDF>'

                # RDF reader needs to read from a file so write out first
                # Alternative is to read from a URL, but this lacks proper proxy support
                filepath = "tempRDF.txt"
                file = open(filepath, 'w')
                file.write(programmedata)
                file.close()

                g = Graph()
                # This is a temporary proxy fix. A URL could be put here instead
                g.parse("tempRDF.txt")

                # Identify the brand and whether there are any official hashtags
                twittags = list()
                for bid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Brand')):
                    # bid is Brand ID
                    bidmod = bid.replace("#programme","")
                    bidmod = str(bidmod.replace("file:///programmes/",""))
                    if (bidmod in self.officialbrandtags):
                        twittags = self.officialbrandtags[bidmod]
                        break

                # Identify the series and whether there are any official hashtags
                if len(twittags) == 0:
                    # Identify the brand and whether there are any official hashtags
                    for sid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Series')):
                        # sid is Series ID
                        sidmod = sid.replace("#programme","")
                        sidmod = str(sidmod.replace("file:///programmes/",""))
                        if (sidmod in self.officialseriestags):
                            twittags = self.officialseriestags[sidmod]
                            break

                vidmod = ""
                so = g.subject_objects(predicate=rdflib.URIRef('http://purl.org/ontology/po/version'))
                # Pick a version, any version - for this which one doesn't matter
                for x in so:
                    # vid is version id
                    vid = x[1]
                    vidmod = vid.replace("#programme","")
                    vidmod = vidmod.replace("file:///programmes/","")
                    break

                # Got version, now get people

                self.send(["http://www.bbc.co.uk/programmes/" + vidmod + ".rdf"], "dataout")
                while not self.dataReady("datain"):
                    pass
                recvdata = self.recv("datain")
                if recvdata[0] == "OK":
                    versiondata = recvdata[1]
                else:
                    versiondata = '<?xml version="1.0" encoding="utf-8"?> \
                                    <rdf:RDF xmlns:rdf      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \
                                             xmlns:rdfs     = "http://www.w3.org/2000/01/rdf-schema#" \
                                             xmlns:owl      = "http://www.w3.org/2002/07/owl#" \
                                             xmlns:foaf     = "http://xmlns.com/foaf/0.1/" \
                                             xmlns:po       = "http://purl.org/ontology/po/" \
                                             xmlns:mo       = "http://purl.org/ontology/mo/" \
                                             xmlns:skos     = "http://www.w3.org/2008/05/skos#" \
                                             xmlns:time     = "http://www.w3.org/2006/time#" \
                                             xmlns:dc       = "http://purl.org/dc/elements/1.1/" \
                                             xmlns:dcterms  = "http://purl.org/dc/terms/" \
                                             xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \
                                             xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \
                                             xmlns:event    = "http://purl.org/NET/c4dm/event.owl#"> \
                                    </rdf:RDF>'

                filepath = "tempRDF.txt"
                file = open(filepath, 'w')
                file.write(versiondata)
                file.close()

                g = Graph()
                g.parse("tempRDF.txt")

                # Identify if this is a change of programme, or the first time we've checked what's on for Print clarity
                if self.firstrun:
                    Print (channel , ": " + title)
                else:
                    Print (channel , ": Changed to - " , title)

                # Minor alterations
                title = title.replace("&","and")

                if ":" in title:
                    titlebits = title.split(":")
                    title = titlebits[0]

                # Saving a copy here so apostrophes etc can be used in the Twitter people search
                titlesave = title

                # Remove punctuation
                for item in """!"#$%()*+,-./:;<=>?@[\\]^_'`{|}~""":
                    title = title.replace(item,"")

                keywords = dict()
                # Save keywords next to a descriptor of what they are
                keywords[pid] = "PID"

                # Add official hashtags to the list
                for tag in twittags:
                    keywords[tag] = "Twitter"

                # Duplicates will be removed later
                if string.find(title,"The",0,3) != -1:
                    newtitle = string.replace(re.sub("\s+","",title),"The ","",1)
                    keywords[channel] = "Channel"
                    keywords["#" + string.lower(re.sub("\s+","",title))] = "Title"
                    # Check for and remove year too
                    keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title"
                    keywords['#' + string.lower(re.sub("\s+","",newtitle))] = "Title"
                    # Check for and remove year too
                    keywords['#' + string.replace(string.lower(re.sub("\s+","",newtitle))," " + str(date.today().year),"",1)] = "Title"
                else:
                    keywords[channel] = "Channel"
                    keywords["#" + string.lower(re.sub("\s+","",title))] = "Title"
                    keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title"

                allwordtitle = string.replace(title,"The ","",1)
                allwordtitle = allwordtitle.lower()
                # Remove current year from events
                allwordtitle = allwordtitle.replace(" " + str(date.today().year),"",1)
                titlewords = allwordtitle.split()
                if len(titlewords) > 1:
                    keywords[allwordtitle] = "Title"
                else:
                    # Trial fix for issue of one word titles producing huge amounts of data
                    keywords[allwordtitle + "^" + "bbc"] = "Title"
                keywords["#" + re.sub("\s+","",allwordtitle)] = "Title"

                numwords = dict({"one" : 1, "two" : 2, "three": 3, "four" : 4, "five": 5, "six" : 6, "seven": 7})
                for word in numwords:
                    if word in channel.lower() and channel != "asiannetwork": # Bug fix! asianne2rk
                        numchannel = string.replace(channel.lower(),word,str(numwords[word]))
                        keywords[numchannel] = "Channel"
                        break
                    if str(numwords[word]) in channel.lower():
                        numchannel = string.replace(channel.lower(),str(numwords[word]),word)
                        keywords[numchannel] = "Channel"
                        break

                # Load NameCache (people we've already searched for on Twitter to avoid hammering PeopleSearch)
                save = False
                try:
                    homedir = os.path.expanduser("~")
                    file = open(homedir + "/namecache.conf",'r')
                    save = True
                except IOError:
                    e = sys.exc_info()[1]
                    Print ("Failed to load name cache - will attempt to create a new file: " ,  e)

                if save:
                    raw_config = file.read()
                    file.close()
                    try:
                        config = cjson.decode(raw_config)
                    except cjson.DecodeError:
                        e = sys.exc_info()[1]
                        config = dict()
                else:
                    config = dict()

                s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Role'))

                for x in s:
                    rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x))
                    pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant'))
                    firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName')))
                    lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName')))

                    if ((firstname + " " + lastname) in config):
                        # Found a cached value
                        if config[firstname + " " + lastname] != "":
                            keywords[config[firstname + " " + lastname]] = "Twitter"
                    else:
                        # Not cached yet - new request
                        self.send(firstname + " " + lastname, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                # Only use this Twitter screen name if there's a good chance they're the person we're after
                                if ("verified" in user):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname):
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError:
                            pass
                        config[firstname + " " + lastname] = screenname
                    keywords[firstname + " " + lastname] = "Participant"

                s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Character'))

                for x in s:
                    character = str(g.value(subject=rdflib.BNode(x),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/name')))
                    rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x))
                    pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant'))
                    firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName')))
                    lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName')))
                    # This ^ is a temporary fix until I work out a better DB structure
                    keywords[character + "^" + channel] = "Character"
                    keywords[character + "^" + title] = "Character"
                    if " " in character:
                        # Looks like we have a firstname + surname situation
                        charwords = character.split()
                        if charwords[0] != "Dr" and charwords[0] != "Miss" and charwords[0] != "Mr" and charwords[0] != "Mrs" and charwords[0] != "Ms" and charwords[0] != "The":
                            # As long as the first word isn't a title, add it as a first name
                            # This ^ is a temporary fix until I work out a better DB structure
                            keywords[charwords[0] + "^" + channel] = "Character"
                            keywords[charwords[0] + "^" + title] = "Character"
                        elif len(charwords) > 2:
                            # If the first word was a title, and the second word isn't a surname (checked by > 2) add the first name
                            # This ^ is a temporary fix until I work out a better DB structure
                            keywords[charwords[1] + "^" + channel] = "Character"
                            keywords[charwords[1] + "^" + title] = "Character"
                    if ((firstname + " " + lastname) in config):
                        # Found a cached value
                        if config[firstname + " " + lastname] != "":
                            keywords[config[firstname + " " + lastname]] = "Actor"
                    else:
                        # Not cached yet - new request
                        self.send(firstname + " " + lastname, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                if ("verified" in user):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname):
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError:
                            pass
                        config[firstname + " " + lastname] = screenname
                    keywords[firstname + " " + lastname] = "Actor"

                # Radio appears to have been forgotten about a bit in RDF / scheduling at the mo
                # So, let's do some extra queries and see if the show title is a person's name on Twitter
                if "radio" in channel or "6music" in channel or "asiannetwork" in channel or "sportsextra" in channel or "worldservice" in channel:
                    # However, radio shows are often named using the DJ - The cases where this isn't true will cause problems however as they'll be saved in json - DOH! TODO
                    if (titlesave in config):
                        # Found a cached value
                        if config[titlesave] != "":
                            keywords[config[titlesave]] = "Twitter"
                    elif len(titlesave.split()) < 4: # Prevent some shows getting through at least - restricts people's names to three words
                        self.send(titlesave, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                if ("verified" in user):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and  string.lower(user['name']) == titlesave.lower():
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError:
                            pass
                        config[titlesave] = screenname

                try:
                    file = open(homedir + "/namecache.conf",'w')
                    raw_config = cjson.encode(config)
                    file.write(raw_config)
                    file.close()
                except IOError:
                    Print ("Failed to save name cache - could cause rate limit problems")

                return [keywords,data]
            
        else:
            if pid == None:
                Print(channel , ": No change - Off Air")
            else:
                Print (channel , ": No change - " , title)
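
Every re.sub call in this example uses the same pattern, re.sub(r"\s+", "", ...), to collapse a programme title into a single hashtag token. A standalone sketch of that step, with an invented title and str methods standing in for the Python 2 string-module calls used in the listing:

import re

title = "The Great British Bake Off"

# '#thegreatbritishbakeoff' - all whitespace collapsed, then lowercased
print("#" + re.sub(r"\s+", "", title).lower())

# '#greatbritishbakeoff' - same, with the leading 'The ' dropped first
print("#" + re.sub(r"\s+", "", title.replace("The ", "", 1)).lower())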

Example 9

Project: kamaelia_
Source File: Requester.py
View license
    def doStuff(self, channel):
        # Check what's on for each channel
        self.send(channel, "whatson")
        while not self.dataReady("whatson"):
            pass
        data = self.recv("whatson")
        if data == None:
            pid = None
        else:
            pid = data[0]
            title = data[1]
            offset = data[2]
            duration = data[3]
            expectedstart = data[4]
        if pid != self.channels[channel]:
            # Perhaps just do a duplicate scan before creating Twitter stream
            if pid == None:
                self.channels[channel] = None
                print (channel + ": Off Air")
            else:
                self.channels[channel] = pid
                self.send(["http://www.bbc.co.uk/programmes/" + pid + ".rdf"], "dataout")
                while not self.dataReady("datain"):
                    pass
                recvdata = self.recv("datain")
                
                if recvdata[0] == "OK":
                    programmedata = recvdata[1]
                else:
                    # Fake programme data to prevent crash - not ideal
                    programmedata = '<?xml version="1.0" encoding="utf-8"?> \
                                    <rdf:RDF xmlns:rdf      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \
                                             xmlns:rdfs     = "http://www.w3.org/2000/01/rdf-schema#" \
                                             xmlns:owl      = "http://www.w3.org/2002/07/owl#" \
                                             xmlns:foaf     = "http://xmlns.com/foaf/0.1/" \
                                             xmlns:po       = "http://purl.org/ontology/po/" \
                                             xmlns:mo       = "http://purl.org/ontology/mo/" \
                                             xmlns:skos     = "http://www.w3.org/2008/05/skos#" \
                                             xmlns:time     = "http://www.w3.org/2006/time#" \
                                             xmlns:dc       = "http://purl.org/dc/elements/1.1/" \
                                             xmlns:dcterms  = "http://purl.org/dc/terms/" \
                                             xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \
                                             xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \
                                             xmlns:event    = "http://purl.org/NET/c4dm/event.owl#"> \
                                    </rdf:RDF>'

                # RDF reader needs to read from a file so write out first
                # Alternative is to read from a URL, but this lacks proper proxy support
                filepath = "tempRDF.txt"
                file = open(filepath, 'w')
                file.write(programmedata)
                file.close()

                g = Graph()
                # This is a temporary proxy fix. A URL could be put here instead
                g.parse("tempRDF.txt")

                # Identify the brand and whether there are any official hashtags
                twittags = list()
                for bid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Brand')):
                    # bid is Brand ID
                    bidmod = bid.replace("#programme","")
                    bidmod = str(bidmod.replace("file:///programmes/",""))
                    if self.officialbrandtags.has_key(bidmod):
                        twittags = self.officialbrandtags[bidmod]
                        break

                # Identify the series and whether there are any official hashtags
                if len(twittags) == 0:
                    # Identify the brand and whether there are any official hashtags
                    for sid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Series')):
                        # sid is Series ID
                        sidmod = sid.replace("#programme","")
                        sidmod = str(sidmod.replace("file:///programmes/",""))
                        if self.officialseriestags.has_key(sidmod):
                            twittags = self.officialseriestags[sidmod]
                            break

                vidmod = ""
                so = g.subject_objects(predicate=rdflib.URIRef('http://purl.org/ontology/po/version'))
                # Pick a version, any version - for this which one doesn't matter
                for x in so:
                    # vid is version id
                    vid = x[1]
                    vidmod = vid.replace("#programme","")
                    vidmod = vidmod.replace("file:///programmes/","")
                    break

                # Got version, now get people

                self.send(["http://www.bbc.co.uk/programmes/" + vidmod + ".rdf"], "dataout")
                while not self.dataReady("datain"):
                    pass
                recvdata = self.recv("datain")
                if recvdata[0] == "OK":
                    versiondata = recvdata[1]
                else:
                    versiondata = '<?xml version="1.0" encoding="utf-8"?> \
                                    <rdf:RDF xmlns:rdf      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \
                                             xmlns:rdfs     = "http://www.w3.org/2000/01/rdf-schema#" \
                                             xmlns:owl      = "http://www.w3.org/2002/07/owl#" \
                                             xmlns:foaf     = "http://xmlns.com/foaf/0.1/" \
                                             xmlns:po       = "http://purl.org/ontology/po/" \
                                             xmlns:mo       = "http://purl.org/ontology/mo/" \
                                             xmlns:skos     = "http://www.w3.org/2008/05/skos#" \
                                             xmlns:time     = "http://www.w3.org/2006/time#" \
                                             xmlns:dc       = "http://purl.org/dc/elements/1.1/" \
                                             xmlns:dcterms  = "http://purl.org/dc/terms/" \
                                             xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \
                                             xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \
                                             xmlns:event    = "http://purl.org/NET/c4dm/event.owl#"> \
                                    </rdf:RDF>'

                filepath = "tempRDF.txt"
                file = open(filepath, 'w')
                file.write(versiondata)
                file.close()

                g = Graph()
                g.parse("tempRDF.txt")

                # Identify if this is a change of programme, or the first time we've checked what's on for print clarity
                if self.firstrun:
                    print (channel + ": " + title)
                else:
                    print (channel + ": Changed to - " + title)

                # Minor alterations
                title = title.replace("&","and")

                if ":" in title:
                    titlebits = title.split(":")
                    title = titlebits[0]

                # Saving a copy here so apostrophes etc can be used in the Twitter people search
                titlesave = title

                # Remove punctuation
                for item in """!"#$%()*+,-./:;<=>?@[\\]^_'`{|}~""":
                    title = title.replace(item,"")

                keywords = dict()
                # Save keywords next to a descriptor of what they are
                keywords[pid] = "PID"

                # Add official hashtags to the list
                for tag in twittags:
                    keywords[tag] = "Twitter"

                # Duplicates will be removed later
                # If the title has 'The' in it, add hashtags both with and without the 'the' to the keyword list
                # This simply broadens the list of search terms
                if string.find(title,"The",0,3) != -1:
                    newtitle = string.replace(re.sub("\s+","",title),"The ","",1)
                    keywords[channel] = "Channel"
                    keywords["#" + string.lower(re.sub("\s+","",title))] = "Title"
                    # Check for and remove year too - some programmes contain a year which may be undesirable from a search point of view
                    keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title"
                    keywords['#' + string.lower(re.sub("\s+","",newtitle))] = "Title"
                    # Check for and remove year too
                    keywords['#' + string.replace(string.lower(re.sub("\s+","",newtitle))," " + str(date.today().year),"",1)] = "Title"
                else:
                    keywords[channel] = "Channel"
                    keywords["#" + string.lower(re.sub("\s+","",title))] = "Title"
                    keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title"

                allwordtitle = string.replace(title,"The ","",1)
                allwordtitle = allwordtitle.lower()
                # Remove current year from events
                allwordtitle = allwordtitle.replace(" " + str(date.today().year),"",1)
                titlewords = allwordtitle.split()
                if len(titlewords) > 1:
                    keywords[allwordtitle] = "Title"
                else:
                    # Trial fix for issue of one word titles producing huge amounts of data
                    # This occurs for keywords like 'Weather' and 'Breakfast' which aren't BBC limited terms
                    keywords[allwordtitle + "^" + "bbc"] = "Title"
                keywords["#" + re.sub("\s+","",allwordtitle)] = "Title"

                # Where a channel uses text for a number, we also want to search using the numeric representation
                numwords = dict({"one" : 1, "two" : 2, "three": 3, "four" : 4, "five": 5, "six" : 6, "seven": 7})
                for word in numwords:
                    if word in channel.lower() and channel != "asiannetwork": # Bug fix! asianne2rk
                        numchannel = string.replace(channel.lower(),word,str(numwords[word]))
                        keywords[numchannel] = "Channel"
                        break
                    if str(numwords[word]) in channel.lower():
                        numchannel = string.replace(channel.lower(),str(numwords[word]),word)
                        keywords[numchannel] = "Channel"
                        break

                # Load NameCache (people we've already searched for on Twitter to avoid hammering PeopleSearch)
                save = False
                try:
                    homedir = os.path.expanduser("~")
                    file = open(homedir + "/namecache.conf",'r')
                    save = True
                except IOError, e:
                    print ("Failed to load name cache - will attempt to create a new file: " + str(e))

                if save:
                    raw_config = file.read()
                    file.close()
                    try:
                        config = cjson.decode(raw_config)
                    except cjson.DecodeError, e:
                        config = dict()
                else:
                    config = dict()

                # Find people's names in retrieved RDF
                s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Role'))

                for x in s:
                    rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x))
                    pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant'))
                    firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName')))
                    lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName')))

                    if config.has_key(firstname + " " + lastname):
                        # Found a cached value - this person has been searched for using Twitter
                        if config[firstname + " " + lastname] != "":
                            keywords[config[firstname + " " + lastname]] = "Twitter"
                    else:
                        # Not cached yet - new request to Twitter people search
                        self.send(firstname + " " + lastname, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                # Only use this Twitter screen name if there's a good chance they're the person we're after
                                if user.has_key('verified'):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname):
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError, e:
                            pass
                        config[firstname + " " + lastname] = screenname
                    keywords[firstname + " " + lastname] = "Participant"

                s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Character'))

                for x in s:
                    character = str(g.value(subject=rdflib.BNode(x),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/name')))
                    rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x))
                    pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant'))
                    firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName')))
                    lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName')))
                    # This ^ is a temporary fix until I work out a better DB structure
                    # Character names can sometimes be single common words, like 'James'.
                    # For this reason, using this as a search term we require that either the channel name or programme title also appears in the tweet
                    # The ^ signals to later states of this program that the channel name / title doesn't necessarily have to appear next to the character name
                    keywords[character + "^" + channel] = "Character"
                    keywords[character + "^" + title] = "Character"
                    if " " in character:
                        # Looks like we have a firstname + surname situation
                        charwords = character.split()
                        if charwords[0] != "Dr" and charwords[0] != "Miss" and charwords[0] != "Mr" and charwords[0] != "Mrs" and charwords[0] != "Ms" and charwords[0] != "The":
                            # As long as the first word isn't a title, add it as a first name
                            # This ^ is a temporary fix until I work out a better DB structure
                            keywords[charwords[0] + "^" + channel] = "Character"
                            keywords[charwords[0] + "^" + title] = "Character"
                        elif len(charwords) > 2:
                            # If the first word was a title, and the second word isn't a surname (checked by > 2) add the first name
                            # This ^ is a temporary fix until I work out a better DB structure
                            keywords[charwords[1] + "^" + channel] = "Character"
                            keywords[charwords[1] + "^" + title] = "Character"
                    if config.has_key(firstname + " " + lastname):
                        # Found a cached value
                        if config[firstname + " " + lastname] != "":
                            keywords[config[firstname + " " + lastname]] = "Actor"
                    else:
                        # Not cached yet - new request
                        self.send(firstname + " " + lastname, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                if user.has_key('verified'):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname):
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError, e:
                            pass
                        config[firstname + " " + lastname] = screenname
                    keywords[firstname + " " + lastname] = "Actor"

                # Radio appears to have been forgotten about a bit in RDF / scheduling at the mo
                # So, let's do some extra queries and see if the show title is a person's name on Twitter
                if "radio" in channel or "6music" in channel or "asiannetwork" in channel or "sportsextra" in channel or "worldservice" in channel:
                    # However, radio shows are often named using the DJ - The cases where this isn't true will cause problems however as they'll be saved in json - DOH! TODO
                    if config.has_key(titlesave):
                        # Found a cached value
                        if config[titlesave] != "":
                            keywords[config[titlesave]] = "Twitter"
                    elif len(titlesave.split()) < 4: # Prevent some shows getting through at least - restricts people's names to three words
                        self.send(titlesave, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                if user.has_key('verified'):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and  string.lower(user['name']) == titlesave.lower():
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError, e:
                            pass
                        config[titlesave] = screenname

                try:
                    file = open(homedir + "/namecache.conf",'w')
                    raw_config = cjson.encode(config)
                    file.write(raw_config)
                    file.close()
                except IOError, e:
                    print ("Failed to save name cache - could cause rate limit problems")

                return [keywords,data]
            
        else:
            if pid == None:
                print(channel + ": No change - Off Air")
            else:
                print (channel + ": No change - " + title)
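
The comments above describe the "^" convention used for character keywords: a character name on its own can be a common word, so it is stored as name^channel or name^title and later stages require the second term to appear somewhere in the tweet. Below is a minimal standalone sketch of that keyword-building step; HONORIFICS and character_keywords are illustrative names, not part of the original component:

HONORIFICS = set(["Dr", "Miss", "Mr", "Mrs", "Ms", "The"])

def character_keywords(character, channel, title):
    # "^" marks a term that must co-occur with the channel or programme title
    keys = {character + "^" + channel: "Character",
            character + "^" + title: "Character"}
    words = character.split()
    if len(words) > 1:
        if words[0] not in HONORIFICS:
            first = words[0]        # plain "Firstname Surname"
        elif len(words) > 2:
            first = words[1]        # e.g. "Dr John Watson" -> "John"
        else:
            first = None            # e.g. "Dr Watson" -> no usable first name
        if first:
            keys[first + "^" + channel] = "Character"
            keys[first + "^" + title] = "Character"
    return keys

# character_keywords("Dr John Watson", "bbcone", "Sherlock") adds
# "John^bbcone" and "John^Sherlock" alongside the full-name keys.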

Example 10

Project: kamaelia_
Source File: Requester.py
View license
    def doStuff(self, channel):
        # Check what's on for each channel
        self.send(channel, "whatson")
        while not self.dataReady("whatson"):
            pass
        data = self.recv("whatson")
        if data == None:
            pid = None
        else:
            pid = data[0]
            title = data[1]
            offset = data[2]
            duration = data[3]
            expectedstart = data[4]
        if pid != self.channels[channel]:
            # Perhaps just do a duplicate scan before creating Twitter stream
            if pid == None:
                self.channels[channel] = None
                print (channel + ": Off Air")
            else:
                self.channels[channel] = pid
                self.send(["http://www.bbc.co.uk/programmes/" + pid + ".rdf"], "dataout")
                while not self.dataReady("datain"):
                    pass
                recvdata = self.recv("datain")
                
                if recvdata[0] == "OK":
                    programmedata = recvdata[1]
                else:
                    # Fake programme data to prevent crash - not ideal
                    programmedata = '<?xml version="1.0" encoding="utf-8"?> \
                                    <rdf:RDF xmlns:rdf      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \
                                             xmlns:rdfs     = "http://www.w3.org/2000/01/rdf-schema#" \
                                             xmlns:owl      = "http://www.w3.org/2002/07/owl#" \
                                             xmlns:foaf     = "http://xmlns.com/foaf/0.1/" \
                                             xmlns:po       = "http://purl.org/ontology/po/" \
                                             xmlns:mo       = "http://purl.org/ontology/mo/" \
                                             xmlns:skos     = "http://www.w3.org/2008/05/skos#" \
                                             xmlns:time     = "http://www.w3.org/2006/time#" \
                                             xmlns:dc       = "http://purl.org/dc/elements/1.1/" \
                                             xmlns:dcterms  = "http://purl.org/dc/terms/" \
                                             xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \
                                             xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \
                                             xmlns:event    = "http://purl.org/NET/c4dm/event.owl#"> \
                                    </rdf:RDF>'

                # RDF reader needs to read from a file so write out first
                # Alternative is to read from a URL, but this lacks proper proxy support
                filepath = "tempRDF.txt"
                file = open(filepath, 'w')
                file.write(programmedata)
                file.close()

                g = Graph()
                # This is a temporary proxy fix. A URL could be put here instead
                g.parse("tempRDF.txt")

                # Identify the brand and whether there are any official hashtags
                twittags = list()
                for bid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Brand')):
                    # bid is Brand ID
                    bidmod = bid.replace("#programme","")
                    bidmod = str(bidmod.replace("file:///programmes/",""))
                    if self.officialbrandtags.has_key(bidmod):
                        twittags = self.officialbrandtags[bidmod]
                        break

                # Identify the series and whether there are any official hashtags
                if len(twittags) == 0:
                    # Identify the brand and whether there are any official hashtags
                    for sid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Series')):
                        # sid is Series ID
                        sidmod = sid.replace("#programme","")
                        sidmod = str(sidmod.replace("file:///programmes/",""))
                        if self.officialseriestags.has_key(sidmod):
                            twittags = self.officialseriestags[sidmod]
                            break

                vidmod = ""
                so = g.subject_objects(predicate=rdflib.URIRef('http://purl.org/ontology/po/version'))
                # Pick a version, any version - for this which one doesn't matter
                for x in so:
                    # vid is version id
                    vid = x[1]
                    vidmod = vid.replace("#programme","")
                    vidmod = vidmod.replace("file:///programmes/","")
                    break

                # Got version, now get people

                self.send(["http://www.bbc.co.uk/programmes/" + vidmod + ".rdf"], "dataout")
                while not self.dataReady("datain"):
                    pass
                recvdata = self.recv("datain")
                if recvdata[0] == "OK":
                    versiondata = recvdata[1]
                else:
                    versiondata = '<?xml version="1.0" encoding="utf-8"?> \
                                    <rdf:RDF xmlns:rdf      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \
                                             xmlns:rdfs     = "http://www.w3.org/2000/01/rdf-schema#" \
                                             xmlns:owl      = "http://www.w3.org/2002/07/owl#" \
                                             xmlns:foaf     = "http://xmlns.com/foaf/0.1/" \
                                             xmlns:po       = "http://purl.org/ontology/po/" \
                                             xmlns:mo       = "http://purl.org/ontology/mo/" \
                                             xmlns:skos     = "http://www.w3.org/2008/05/skos#" \
                                             xmlns:time     = "http://www.w3.org/2006/time#" \
                                             xmlns:dc       = "http://purl.org/dc/elements/1.1/" \
                                             xmlns:dcterms  = "http://purl.org/dc/terms/" \
                                             xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \
                                             xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \
                                             xmlns:event    = "http://purl.org/NET/c4dm/event.owl#"> \
                                    </rdf:RDF>'

                filepath = "tempRDF.txt"
                file = open(filepath, 'w')
                file.write(versiondata)
                file.close()

                g = Graph()
                g.parse("tempRDF.txt")

                # Identify if this is a change of programme, or the first time we've checked what's on for print clarity
                if self.firstrun:
                    print (channel + ": " + title)
                else:
                    print (channel + ": Changed to - " + title)

                # Minor alterations
                title = title.replace("&","and")

                if ":" in title:
                    titlebits = title.split(":")
                    title = titlebits[0]

                # Saving a copy here so apostrophes etc can be used in the Twitter people search
                titlesave = title

                # Remove punctuation
                for item in """!"#$%()*+,-./;<=>?@[\\]^_'`{|}~""":
                    title = title.replace(item,"")

                keywords = dict()
                # Save keywords next to a descriptor of what they are
                keywords[pid] = "PID"

                # Add official hashtags to the list
                for tag in twittags:
                    keywords[tag] = "Twitter"

                # Duplicates will be removed later
                if string.find(title,"The",0,3) != -1:
                    newtitle = string.replace(re.sub("\s+","",title),"The ","",1)
                    keywords[channel] = "Channel"
                    keywords["#" + string.lower(re.sub("\s+","",title))] = "Title"
                    # Check for and remove year too
                    keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title"
                    keywords['#' + string.lower(re.sub("\s+","",newtitle))] = "Title"
                    # Check for and remove year too
                    keywords['#' + string.replace(string.lower(re.sub("\s+","",newtitle))," " + str(date.today().year),"",1)] = "Title"
                else:
                    keywords[channel] = "Channel"
                    keywords["#" + string.lower(re.sub("\s+","",title))] = "Title"
                    keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title"

                allwordtitle = string.replace(title,"The ","",1)
                allwordtitle = allwordtitle.lower()
                # Remove current year from events
                allwordtitle = allwordtitle.replace(" " + str(date.today().year),"",1)
                titlewords = allwordtitle.split()
                if len(titlewords) > 1:
                    keywords[allwordtitle] = "Title"
                else:
                    # Trial fix for issue of one word titles producing huge amounts of data
                    keywords[allwordtitle + "^" + "bbc"] = "Title"
                keywords["#" + re.sub("\s+","",allwordtitle)] = "Title"

                numwords = dict({"one" : 1, "two" : 2, "three": 3, "four" : 4, "five": 5, "six" : 6, "seven": 7})
                for word in numwords:
                    if word in channel.lower() and channel != "asiannetwork": # Bug fix: otherwise "asiannetwork" would become "asianne2rk"
                        numchannel = string.replace(channel.lower(),word,str(numwords[word]))
                        keywords[numchannel] = "Channel"
                        break
                    if str(numwords[word]) in channel.lower():
                        numchannel = string.replace(channel.lower(),str(numwords[word]),word)
                        keywords[numchannel] = "Channel"
                        break

                # Load NameCache (people we've already searched for on Twitter to avoid hammering PeopleSearch)
                save = False
                try:
                    homedir = os.path.expanduser("~")
                    file = open(homedir + "/namecache.conf",'r')
                    save = True
                except IOError, e:
                    print ("Failed to load name cache - will attempt to create a new file: " + str(e))

                if save:
                    raw_config = file.read()
                    file.close()
                    try:
                        config = cjson.decode(raw_config)
                    except cjson.DecodeError, e:
                        config = dict()
                else:
                    config = dict()

                s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Role'))

                for x in s:
                    rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x))
                    pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant'))
                    firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName')))
                    lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName')))

                    if config.has_key(firstname + " " + lastname):
                        # Found a cached value
                        if config[firstname + " " + lastname] != "":
                            keywords[config[firstname + " " + lastname]] = "Twitter"
                    else:
                        # Not cached yet - new request
                        self.send(firstname + " " + lastname, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                # Only use this Twitter screen name if there's a good chance they're the person we're after
                                if user.has_key('verified'):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname):
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError, e:
                            pass
                        config[firstname + " " + lastname] = screenname
                    keywords[firstname + " " + lastname] = "Participant"

                s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Character'))

                for x in s:
                    character = str(g.value(subject=rdflib.BNode(x),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/name')))
                    rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x))
                    pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant'))
                    firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName')))
                    lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName')))
                    # This ^ is a temporary fix until I work out a better DB structure
                    keywords[character + "^" + channel] = "Character"
                    keywords[character + "^" + title] = "Character"
                    if " " in character:
                        # Looks like we have a firstname + surname situation
                        charwords = character.split()
                        if charwords[0] != "Dr" and charwords[0] != "Miss" and charwords[0] != "Mr" and charwords[0] != "Mrs" and charwords[0] != "Ms" and charwords[0] != "The":
                            # As long as the first word isn't a title, add it as a first name
                            # This ^ is a temporary fix until I work out a better DB structure
                            keywords[charwords[0] + "^" + channel] = "Character"
                            keywords[charwords[0] + "^" + title] = "Character"
                        elif len(charwords) > 2:
                            # If the first word was a title, and the second word isn't a surname (checked by > 2) add the first name
                            # This ^ is a temporary fix until I work out a better DB structure
                            keywords[charwords[1] + "^" + channel] = "Character"
                            keywords[charwords[1] + "^" + title] = "Character"
                    if config.has_key(firstname + " " + lastname):
                        # Found a cached value
                        if config[firstname + " " + lastname] != "":
                            keywords[config[firstname + " " + lastname]] = "Actor"
                    else:
                        # Not cached yet - new request
                        self.send(firstname + " " + lastname, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                if user.has_key('verified'):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname):
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError, e:
                            pass
                        config[firstname + " " + lastname] = screenname
                    keywords[firstname + " " + lastname] = "Actor"

                # Radio appears to have been forgotten about a bit in RDF / scheduling at the mo
                # So, let's do some extra queries and see if the show title is a person's name on Twitter
                if "radio" in channel or "6music" in channel or "asiannetwork" in channel or "sportsextra" in channel or "worldservice" in channel:
                    # However, radio shows are often named after the DJ - the cases where this isn't true will cause problems, as they'll still be saved in the JSON cache - DOH! TODO
                    if config.has_key(titlesave):
                        # Found a cached value
                        if config[titlesave] != "":
                            keywords[config[titlesave]] = "Twitter"
                    elif len(titlesave.split()) < 4: # Prevent some shows getting through at least - restricts people's names to three words
                        self.send(titlesave, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                if user.has_key('verified'):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and  string.lower(user['name']) == titlesave.lower():
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError, e:
                            pass
                        config[titlesave] = screenname

                try:
                    file = open(homedir + "/namecache.conf",'w')
                    raw_config = cjson.encode(config)
                    file.write(raw_config)
                    file.close()
                except IOError, e:
                    print ("Failed to save name cache - could cause rate limit problems")

                return [keywords,data]
            
        else:
            if pid == None:
                print(channel + ": No change - Off Air")
            else:
                print (channel + ": No change - " + title)
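
The hashtag keywords above are built by collapsing all whitespace with re.sub("\s+", "", title) and lowercasing the result. Note that the year-stripping and "The "-stripping replacements in the original run on the already-squashed string, where the space they search for no longer exists, so the sketch below strips those parts before collapsing, which appears to be the intent. Function and variable names here are illustrative, not from the original component:

import re
from datetime import date

def title_hashtags(title, year=None):
    # Build hashtag-style keyword variants from a programme title (sketch only)
    year = str(year or date.today().year)
    variants = {title,
                title.replace("The ", "", 1),       # variant without a leading "The "
                title.replace(" " + year, "", 1)}   # variant without a trailing year
    # Collapse whitespace and lowercase each variant, as the code above does
    return {"#" + re.sub(r"\s+", "", v).lower() for v in variants}

# title_hashtags("The Apprentice 2011")
# -> {'#theapprentice2011', '#apprentice2011', '#theapprentice'} (order may vary)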

Example 11

Project: kamaelia_
Source File: Requester.py
View license
    def doStuff(self, channel):
        # Check what's on for each channel
        self.send(channel, "whatson")
        while not self.dataReady("whatson"):
            pass
        data = self.recv("whatson")
        if data == None:
            pid = None
        else:
            pid = data[0]
            title = data[1]
            offset = data[2]
            duration = data[3]
            expectedstart = data[4]
        if pid != self.channels[channel]:
            # Perhaps just do a duplicate scan before creating Twitter stream
            if pid == None:
                self.channels[channel] = None
                Print (channel, ": Off Air")
            else:
                self.channels[channel] = pid
                self.send(["http://www.bbc.co.uk/programmes/" + pid + ".rdf"], "dataout")
                while not self.dataReady("datain"):
                    pass
                recvdata = self.recv("datain")
                
                if recvdata[0] == "OK":
                    programmedata = recvdata[1]
                else:
                    # Fake programme data to prevent crash - not ideal
                    programmedata = '<?xml version="1.0" encoding="utf-8"?> \
                                    <rdf:RDF xmlns:rdf      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \
                                             xmlns:rdfs     = "http://www.w3.org/2000/01/rdf-schema#" \
                                             xmlns:owl      = "http://www.w3.org/2002/07/owl#" \
                                             xmlns:foaf     = "http://xmlns.com/foaf/0.1/" \
                                             xmlns:po       = "http://purl.org/ontology/po/" \
                                             xmlns:mo       = "http://purl.org/ontology/mo/" \
                                             xmlns:skos     = "http://www.w3.org/2008/05/skos#" \
                                             xmlns:time     = "http://www.w3.org/2006/time#" \
                                             xmlns:dc       = "http://purl.org/dc/elements/1.1/" \
                                             xmlns:dcterms  = "http://purl.org/dc/terms/" \
                                             xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \
                                             xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \
                                             xmlns:event    = "http://purl.org/NET/c4dm/event.owl#"> \
                                    </rdf:RDF>'

                # RDF reader needs to read from a file so write out first
                # Alternative is to read from a URL, but this lacks proper proxy support
                filepath = "tempRDF.txt"
                file = open(filepath, 'w')
                file.write(programmedata)
                file.close()

                g = Graph()
                # This is a temporary proxy fix. A URL could be put here instead
                g.parse("tempRDF.txt")

                # Identify the brand and whether there are any official hashtags
                twittags = list()
                for bid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Brand')):
                    # bid is Brand ID
                    bidmod = bid.replace("#programme","")
                    bidmod = str(bidmod.replace("file:///programmes/",""))
                    if self.officialbrandtags.has_key(bidmod):
                        twittags = self.officialbrandtags[bidmod]
                        break

                # Identify the series and whether there are any official hashtags
                if len(twittags) == 0:
                    # Identify the brand and whether there are any official hashtags
                    for sid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Series')):
                        # sid is Series ID
                        sidmod = sid.replace("#programme","")
                        sidmod = str(sidmod.replace("file:///programmes/",""))
                        if self.officialseriestags.has_key(sidmod):
                            twittags = self.officialseriestags[sidmod]
                            break

                vidmod = ""
                so = g.subject_objects(predicate=rdflib.URIRef('http://purl.org/ontology/po/version'))
                # Pick a version, any version - for this which one doesn't matter
                for x in so:
                    # vid is version id
                    vid = x[1]
                    vidmod = vid.replace("#programme","")
                    vidmod = vidmod.replace("file:///programmes/","")
                    break

                # Got version, now get people

                self.send(["http://www.bbc.co.uk/programmes/" + vidmod + ".rdf"], "dataout")
                while not self.dataReady("datain"):
                    pass
                recvdata = self.recv("datain")
                if recvdata[0] == "OK":
                    versiondata = recvdata[1]
                else:
                    versiondata = '<?xml version="1.0" encoding="utf-8"?> \
                                    <rdf:RDF xmlns:rdf      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \
                                             xmlns:rdfs     = "http://www.w3.org/2000/01/rdf-schema#" \
                                             xmlns:owl      = "http://www.w3.org/2002/07/owl#" \
                                             xmlns:foaf     = "http://xmlns.com/foaf/0.1/" \
                                             xmlns:po       = "http://purl.org/ontology/po/" \
                                             xmlns:mo       = "http://purl.org/ontology/mo/" \
                                             xmlns:skos     = "http://www.w3.org/2008/05/skos#" \
                                             xmlns:time     = "http://www.w3.org/2006/time#" \
                                             xmlns:dc       = "http://purl.org/dc/elements/1.1/" \
                                             xmlns:dcterms  = "http://purl.org/dc/terms/" \
                                             xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \
                                             xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \
                                             xmlns:event    = "http://purl.org/NET/c4dm/event.owl#"> \
                                    </rdf:RDF>'

                filepath = "tempRDF.txt"
                file = open(filepath, 'w')
                file.write(versiondata)
                file.close()

                g = Graph()
                g.parse("tempRDF.txt")

                # Identify if this is a change of programme, or the first time we've checked what's on for Print clarity
                if self.firstrun:
                    Print (channel , ": " + title)
                else:
                    Print (channel , ": Changed to - " , title)

                # Minor alterations
                title = title.replace("&","and")

                if ":" in title:
                    titlebits = title.split(":")
                    title = titlebits[0]

                # Saving a copy here so apostrophes etc can be used in the Twitter people search
                titlesave = title

                # Remove punctuation
                for item in """!"#$%()*+,-./;<=>?@[\\]^_'`{|}~""":
                    title = title.replace(item,"")

                keywords = dict()
                # Save keywords next to a descriptor of what they are
                keywords[pid] = "PID"

                # Add official hashtags to the list
                for tag in twittags:
                    keywords[tag] = "Twitter"

                # Duplicates will be removed later
                if string.find(title,"The",0,3) != -1:
                    newtitle = string.replace(re.sub("\s+","",title),"The ","",1)
                    keywords[channel] = "Channel"
                    keywords["#" + string.lower(re.sub("\s+","",title))] = "Title"
                    # Check for and remove year too
                    keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title"
                    keywords['#' + string.lower(re.sub("\s+","",newtitle))] = "Title"
                    # Check for and remove year too
                    keywords['#' + string.replace(string.lower(re.sub("\s+","",newtitle))," " + str(date.today().year),"",1)] = "Title"
                else:
                    keywords[channel] = "Channel"
                    keywords["#" + string.lower(re.sub("\s+","",title))] = "Title"
                    keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title"

                allwordtitle = string.replace(title,"The ","",1)
                allwordtitle = allwordtitle.lower()
                # Remove current year from events
                allwordtitle = allwordtitle.replace(" " + str(date.today().year),"",1)
                titlewords = allwordtitle.split()
                if len(titlewords) > 1:
                    keywords[allwordtitle] = "Title"
                else:
                    # Trial fix for issue of one word titles producing huge amounts of data
                    keywords[allwordtitle + "^" + "bbc"] = "Title"
                keywords["#" + re.sub("\s+","",allwordtitle)] = "Title"

                numwords = dict({"one" : 1, "two" : 2, "three": 3, "four" : 4, "five": 5, "six" : 6, "seven": 7})
                for word in numwords:
                    if word in channel.lower() and channel != "asiannetwork": # Bug fix: otherwise "asiannetwork" would become "asianne2rk"
                        numchannel = string.replace(channel.lower(),word,str(numwords[word]))
                        keywords[numchannel] = "Channel"
                        break
                    if str(numwords[word]) in channel.lower():
                        numchannel = string.replace(channel.lower(),str(numwords[word]),word)
                        keywords[numchannel] = "Channel"
                        break

                # Load NameCache (people we've already searched for on Twitter to avoid hammering PeopleSearch)
                save = False
                try:
                    homedir = os.path.expanduser("~")
                    file = open(homedir + "/namecache.conf",'r')
                    save = True
                except IOError, e:
                    Print ("Failed to load name cache - will attempt to create a new file: " ,  e)

                if save:
                    raw_config = file.read()
                    file.close()
                    try:
                        config = cjson.decode(raw_config)
                    except cjson.DecodeError, e:
                        config = dict()
                else:
                    config = dict()

                s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Role'))

                for x in s:
                    rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x))
                    pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant'))
                    firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName')))
                    lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName')))

                    if config.has_key(firstname + " " + lastname):
                        # Found a cached value
                        if config[firstname + " " + lastname] != "":
                            keywords[config[firstname + " " + lastname]] = "Twitter"
                    else:
                        # Not cached yet - new request
                        self.send(firstname + " " + lastname, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                # Only use this Twitter screen name if there's a good chance they're the person we're after
                                if user.has_key('verified'):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname):
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError, e:
                            pass
                        config[firstname + " " + lastname] = screenname
                    keywords[firstname + " " + lastname] = "Participant"

                s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Character'))

                for x in s:
                    character = str(g.value(subject=rdflib.BNode(x),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/name')))
                    rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x))
                    pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant'))
                    firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName')))
                    lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName')))
                    # This ^ is a temporary fix until I work out a better DB structure
                    keywords[character + "^" + channel] = "Character"
                    keywords[character + "^" + title] = "Character"
                    if " " in character:
                        # Looks like we have a firstname + surname situation
                        charwords = character.split()
                        if charwords[0] != "Dr" and charwords[0] != "Miss" and charwords[0] != "Mr" and charwords[0] != "Mrs" and charwords[0] != "Ms" and charwords[0] != "The":
                            # As long as the first word isn't a title, add it as a first name
                            # This ^ is a temporary fix until I work out a better DB structure
                            keywords[charwords[0] + "^" + channel] = "Character"
                            keywords[charwords[0] + "^" + title] = "Character"
                        elif len(charwords) > 2:
                            # If the first word was a title, and the second word isn't a surname (checked by > 2) add the first name
                            # This ^ is a temporary fix until I work out a better DB structure
                            keywords[charwords[1] + "^" + channel] = "Character"
                            keywords[charwords[1] + "^" + title] = "Character"
                    if config.has_key(firstname + " " + lastname):
                        # Found a cached value
                        if config[firstname + " " + lastname] != "":
                            keywords[config[firstname + " " + lastname]] = "Actor"
                    else:
                        # Not cached yet - new request
                        self.send(firstname + " " + lastname, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                if user.has_key('verified'):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname):
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError, e:
                            pass
                        config[firstname + " " + lastname] = screenname
                    keywords[firstname + " " + lastname] = "Actor"

                # Radio appears to have been forgotten about a bit in RDF / scheduling at the mo
                # So, let's do some extra queries and see if the show title is a person's name on Twitter
                if "radio" in channel or "6music" in channel or "asiannetwork" in channel or "sportsextra" in channel or "worldservice" in channel:
                    # However, radio shows are often named after the DJ - the cases where this isn't true will cause problems, as they'll still be saved in the JSON cache - DOH! TODO
                    if config.has_key(titlesave):
                        # Found a cached value
                        if config[titlesave] != "":
                            keywords[config[titlesave]] = "Twitter"
                    elif len(titlesave.split()) < 4: # Prevent some shows getting through at least - restricts people's names to three words
                        self.send(titlesave, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                if user.has_key('verified'):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and  string.lower(user['name']) == titlesave.lower():
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError, e:
                            pass
                        config[titlesave] = screenname

                try:
                    file = open(homedir + "/namecache.conf",'w')
                    raw_config = cjson.encode(config)
                    file.write(raw_config)
                    file.close()
                except IOError, e:
                    Print ("Failed to save name cache - could cause rate limit problems")

                return [keywords,data]
            
        else:
            if pid == None:
                Print(channel , ": No change - Off Air")
            else:
                Print (channel , ": No change - " , title)
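
Both versions above cache Twitter lookups in ~/namecache.conf so that PeopleSearch is not hit repeatedly for the same name. Here is a minimal sketch of that load/save pattern using the standard-library json module in place of cjson; the helper names are illustrative, while the cache path follows the code above:

import json
import os

CACHE_PATH = os.path.join(os.path.expanduser("~"), "namecache.conf")

def load_name_cache():
    # Return the cached name -> screen_name mapping, or an empty dict on any failure
    try:
        with open(CACHE_PATH) as f:
            return json.load(f)
    except (IOError, ValueError):
        return {}

def save_name_cache(cache):
    # Persist the mapping; a failed save only costs extra API calls, as the message above notes
    try:
        with open(CACHE_PATH, "w") as f:
            json.dump(cache, f)
    except IOError:
        print("Failed to save name cache - could cause rate limit problems")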

Example 12

View license
@ModuleInfo.plugin('wb.util.exportSQLite',
                   caption='Export SQLite CREATE script',
                   input=[wbinputs.currentCatalog()],
                   groups=['Catalog/Utilities', 'Menu/Catalog'])
@ModuleInfo.export(grt.INT, grt.classes.db_Catalog)
def exportSQLite(cat):
    """Go through all schemata in the catalog and write a SQLite CREATE
    script for their tables
    """

    def validate_for_sqlite_export(cat):
        """Check uniqueness of schema, table and index names. Return True on
        success, otherwise return False (the export process should abort)
        """

        have_errors = False
        idt = {}
        for i, schema in enumerate(cat.schemata):
            if schema.name in idt:
                have_errors = True
                if Workbench.confirm('Name conflict',
                        'Schemas %d and %d have the same name "%s".'
                        ' Please rename one of them.\n'
                        'Search for more such errors?' % (
                            idt[schema.name], i, schema.name)) == 0:
                    return False
            else:
                idt[schema.name] = i

        # Do not continue looking for errors on schema name error
        if have_errors:
            return False

        for schema in cat.schemata:
            idt = {}
            for i, tbl in enumerate(schema.tables):
                if tbl.name == '':
                    have_errors = True
                    if Workbench.confirm('Name conflict',
                            'Table %d in schema "%s". has no name.'
                            ' Please rename.\n'
                            'Search for more such errors?' % (
                                i, schema.name)) == 0:
                        return False
                if tbl.name in idt:
                    have_errors = True
                    if Workbench.confirm('Name conflict',
                            'Tables %d and %d in schema "%s"'
                            ' have the same name "%s".'
                            ' Please rename one of them.\n'
                            'Search for more such errors?' % (
                                idt[tbl.name], i, schema.name, tbl.name)) == 0:
                        return False
                else:
                    idt[tbl.name] = i

        if have_errors:
            return False

        for schema in cat.schemata:
            for tbl in schema.tables:
                idt = {}
                for i, column in enumerate(tbl.columns):
                    if column.name == '':
                        have_errors = True
                        if Workbench.confirm('Name conflict',
                                'Column %d in table "%s"."%s". has no name.'
                                ' Please rename.\n'
                                'Search for more such errors?' % (
                                    i, schema.name, tbl.name)) == 0:
                            return False
                    if column.name in idt:
                        have_errors = True
                        if Workbench.confirm('Name conflict',
                                'Columns %d and %d in table "%s"."%s"'
                                ' have the same name "%s".'
                                ' Please rename one of them.\n'
                                'Search for more such errors?' % (
                                    idt[column.name],
                                    i,
                                    schema.name,
                                    tbl.name,
                                    column.name)) == 0:
                            return False
                    else:
                        idt[column.name] = i

                # Now check indices (except primary/unique)
                idt = {}
                for i, index in enumerate(tbl.indices):
                    if index.indexType == 'INDEX':
                        if index.name == '':
                            have_errors = True
                            if Workbench.confirm('Name conflict',
                                    'Index %d in table "%s"."%s". has no name.'
                                    ' Please rename.\n'
                                    'Search for more such errors?' % (
                                        i, schema.name, tbl.name)) == 0:
                                return False
                        if index.name in idt:
                            have_errors = True
                            if Workbench.confirm('Name conflict',
                                    'Indices %d and %d in table "%s"."%s"'
                                    ' have the same name "%s".'
                                    ' Please rename one of them.\n'
                                    'Search for more such errors?' % (
                                        idt[index.name],
                                        i,
                                        schema.name,
                                        tbl.name,
                                        index.name)) == 0:
                                return False
                        else:
                            idt[index.name] = i

        if have_errors:
            return False

        return True

    def is_deferred(fkey):
        # Hack: if the comment starts with "Defer..." we make it a deferred FK.
        # We could use the member 'deferability' (WB has it), but there is no GUI for it
        return fkey.comment.lstrip().lower()[0:5] == 'defer'

    def export_table(out, db_name, schema, tbl):
        if len(tbl.columns) == 0:
            return

        out.write('CREATE TABLE %s%s(\n%s' % (
                  db_name, dq(tbl.name), schema_comment_format(tbl.comment)))

        primary_key = [i for i in tbl.indices if i.isPrimary == 1]
        primary_key = primary_key[0] if len(primary_key) > 0 else None

        pk_column = None
        if primary_key and len(primary_key.columns) == 1:
            pk_column = primary_key.columns[0].referencedColumn

        col_comment = ''
        for i, column in enumerate(tbl.columns):
            check, sqlite_type, flags = '', None, None
            if column.simpleType:
                sqlite_type = column.simpleType.name
                flags = column.simpleType.flags
            else:
                sqlite_type = column.userType.name
                flags = column.flags
            length = column.length
            # For an INTEGER PRIMARY KEY column to become an alias for the rowid,
            # the type needs to be "INTEGER", not "INT";
            # we fix it for other columns as well
            if 'INT' in sqlite_type or sqlite_type == 'LONG':
                sqlite_type = 'INTEGER'
                length = -1
                # Check flags for "unsigned"
                if 'UNSIGNED' in column.flags:
                    check = dq(column.name) + '>=0'
            # We even implement ENUM (because we can)
            if sqlite_type == 'ENUM':
                sqlite_type = 'TEXT'
                if column.datatypeExplicitParams:
                    check = (dq(column.name) + ' IN' +
                             column.datatypeExplicitParams)
            if i > 0:
                out.write(',' + comment_format(col_comment) + '\n')
            out.write('  ' + dq(column.name))
            # Type is optional in SQLite
            if sqlite_type != '':
                out.write(' ' + sqlite_type)
            # For [VAR]CHAR and such types specify length even though this is
            # not used in SQLite
            if length > 0:
                out.write('(%d)' % length)

            # Must specify single-column PKs as column-constraints for AI/rowid
            # behaviour
            if column == pk_column:
                out.write(' PRIMARY KEY')
                if primary_key.columns[0].descend == 1:
                    out.write(' DESC')
                # Only PK columns can be AI in SQLite
                if column.autoIncrement == 1:
                    out.write(' AUTOINCREMENT')
            # Check for NotNull
            if column.isNotNull == 1:
                out.write(' NOT NULL')

            if check != '':
                out.write(' CHECK(' + check + ')')

            if column.defaultValue != '':
                out.write(' DEFAULT ' + column.defaultValue)

            col_comment = column.comment

        # For multicolumn PKs
        if primary_key and not pk_column:
            out.write(',%s\n  PRIMARY KEY(%s)' % (
                      comment_format(col_comment),
                      print_index_columns(primary_key)))
            col_comment = ''

        # Put non-primary, UNIQUE Keys in CREATE TABLE as well (because we can)
        for index in tbl.indices:
            if index != primary_key and index.indexType == 'UNIQUE':
                out.write(',%s\n' % comment_format(col_comment))
                col_comment = ''
                if index.name != '':
                    out.write('  CONSTRAINT %s\n  ' % dq(index.name))
                out.write('  UNIQUE(%s)' % print_index_columns(index))

        for fkey in tbl.foreignKeys:
            have_fkeys = 1
            out.write(',%s\n' % comment_format(col_comment))
            col_comment = ''
            if fkey.name != '':
                out.write('  CONSTRAINT %s\n  ' % dq(fkey.name))
            out.write('  FOREIGN KEY(%s)\n' % print_fk_columns(fkey.columns))
            out.write('    REFERENCES %s(%s)' % (
                      dq(fkey.referencedTable.name),
                      print_fk_columns(fkey.referencedColumns)))
            if fkey.deleteRule in ['RESTRICT', 'CASCADE', 'SET NULL']:
                out.write('\n    ON DELETE ' + fkey.deleteRule)
            if fkey.updateRule in ['RESTRICT', 'CASCADE', 'SET NULL']:
                out.write('\n    ON UPDATE ' + fkey.updateRule)
            if is_deferred(fkey):
                out.write(' DEFERRABLE INITIALLY DEFERRED')

        out.write(comment_format(col_comment) + '\n);\n')

        # CREATE INDEX statements for all non-primary, non-unique, non-foreign
        # indexes
        for i, index in enumerate(tbl.indices):
            if index.indexType == 'INDEX':
                index_name = tbl.name + '.' + index.name
                if index.name == '':
                    index_name = tbl.name + '.index' + str(i)
                out.write('CREATE INDEX %s%s ON %s (%s);\n' % (
                          db_name,
                          dq(index_name),
                          dq(tbl.name),
                          print_index_columns(index)))

        # Write the INSERTS (currently always)
        for insert in tbl.inserts().splitlines():
            columns_values = ''
            insert_start = 'insert into `%s`.`%s` (' % (schema.name, tbl.name)
            if insert[0:len(insert_start)].lower() == insert_start.lower():
                columns_values = insert[len(insert_start):]
            else:
                raise ExportSQLiteError(
                        'Error', 'Unrecognized command in insert')
            last_column = 0
            for i, column in enumerate(tbl.columns):
                column_name = '`' + column.name + '`'
                if columns_values[0:len(column_name)] == column_name:
                    columns_values = columns_values[len(column_name):]
                    if columns_values[0:1] == ')':
                        columns_values = columns_values[1:]
                        last_column = i
                        break
                    else:
                        if columns_values[0:2] == ', ':
                            columns_values = columns_values[2:]
                        else:
                            raise ExportSQLiteError(
                                    'Error',
                                    'Unrecognized character in column list')
                else:
                    raise ExportSQLiteError(
                            'Error', 'Unrecognized column in inserts')

            out.write('INSERT INTO %s(' % dq(tbl.name))
            for i in range(last_column + 1):
                if i > 0:
                    out.write(',')
                out.write(dq(tbl.columns[i].name))

            if columns_values[0:9].lower() != ' values (':
                raise ExportSQLiteError(
                        'Error', 'Unrecognized SQL in insert')
            columns_values = columns_values[9:]

            out.write(') VALUES(')
            out.write(columns_values.replace("\\'", "''"))
            out.write('\n')

    def order_tables(out, db_name, schema, unordered, respect_deferredness):
        have_ordered = False
        while not have_ordered:
            if len(unordered) == 0:
                have_ordered = True
            for tbl in unordered.values():
                has_forward_reference = False
                for fkey in tbl.foreignKeys:
                    if (fkey.referencedTable.name in unordered and
                            fkey.referencedTable.name != tbl.name and not (
                                respect_deferredness and is_deferred(fkey))):
                        has_forward_reference = True
                        break
                if not has_forward_reference:
                    export_table(out, db_name, schema, tbl)
                    del unordered[tbl.name]
                    have_ordered = True

    def export_schema(out, schema, is_main_schema):
        if len(schema.tables) == 0:
            return

        out.write('\n-- Schema: %s\n' % schema.name)
        out.write(schema_comment_format(schema.comment))

        db_name = ''
        if not is_main_schema:
            db_name = dq(schema.name) + '.'
            out.write('ATTACH "%s" AS %s;\n' % (
                    safe_file_name(schema.name + '.sdb'),
                    dq(schema.name)))
        out.write('BEGIN;\n')

        # Find a valid table order for inserts from FK constraints
        unordered = {t.name: t for t in schema.tables}

        # Try treating deferred keys like non-deferred keys first for ordering
        order_tables(out, db_name, schema, unordered, False)
        # Now try harder (leave out deferred keys from determining an order)
        order_tables(out, db_name, schema, unordered, True)

        # Loop through any remaining tables - these have circular FK references.
        # How to handle?
        for tbl in unordered.values():
            export_table(out, db_name, schema, tbl)

        out.write('COMMIT;\n')

    def print_index_columns(index):
        s = ''
        for i, column in enumerate(index.columns):
            if i > 0:
                s += ','
            s += dq(column.referencedColumn.name)
            if column.descend == 1:
                s += ' DESC'
        return s

    def print_fk_columns(columns):
        s = ''
        for i, column in enumerate(columns):
            if i > 0:
                s += ','
            s += dq(column.name)
        return s

    def dq(ident):
        """Double quote an identifier, replacing " by "" """
        return '"' + re.sub(r'"', '""', ident) + '"'

    def safe_file_name(ident):
        """Create a safe filename from an identifier"""

        def repl(match):
            # Percent-encode each byte of the matched reserved character
            return ''.join("%%%02x" % b for b in bytearray(match.group(0), 'ascii'))

        return re.sub(r'[/\:*?"<>|%]', repl, ident)

    def info_format(header, body):
        """Format an info field as an SQL comment"""
        body = body.strip()
        if body == '':
            return ''
        elif '\n' in body:
            # Multiline comment
            return '-- %s:\n--   %s\n' % (
                        header, re.sub(r'\n', '\n--   ', body))
        else:
            # Single line
            return '-- %-14s %s\n' % (header + ':', body)

    def schema_comment_format(body):
        """Format a schema or table comment as an SQL comment;
        table comments end up stored in the SQLite schema
        """
        body = body.strip()
        if body == '':
            return ''
        else:
            # Multiline comment
            return '--   %s\n' % re.sub(r'\n', '\n--   ', body)

    def comment_format(body):
        body = body.strip()
        if body == '':
            return ''
        elif '\n' in body:
            # Multiline comment
            return '\n--   %s' % re.sub(r'\n', '\n--   ', body)
        else:
            # Single line
            return '-- %s' % body

    if not validate_for_sqlite_export(cat):
        return 1

    out = StringIO.StringIO()
    out.write(info_format(
                'Creator',
                'MySQL Workbench %d.%d.%d/ExportSQLite Plugin %s\n' % (
                    grt.root.wb.info.version.majorNumber,
                    grt.root.wb.info.version.minorNumber,
                    grt.root.wb.info.version.releaseNumber,
                    ModuleInfo.version)))
    out.write(info_format('Author', grt.root.wb.doc.info.author))
    out.write(info_format('Caption', grt.root.wb.doc.info.caption))
    out.write(info_format('Project', grt.root.wb.doc.info.project))
    out.write(info_format('Changed', grt.root.wb.doc.info.dateChanged))
    out.write(info_format('Created', grt.root.wb.doc.info.dateCreated))
    out.write(info_format('Description', grt.root.wb.doc.info.description))

    out.write('PRAGMA foreign_keys = OFF;\n')

    # Loop over all schemata in the catalog; the main schema is the first
    # nonempty schema, or the nonempty schema named "main"
    try:
        for schema in [(s, s.name == 'main') for s in cat.schemata]:
            export_schema(out, schema[0], schema[1])
    except ExportSQLiteError as e:
        Workbench.confirm(e.typ, e.message)
        return 1

    sql_text = out.getvalue()
    out.close()

    wizard = ExportSQLiteWizard(sql_text)
    wizard.run()

    return 0
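
The two helpers above, dq() and safe_file_name(), show the two replacement styles re.sub accepts: a plain string (dq doubles every ") and a callable that receives a match object (safe_file_name percent-encodes characters that are unsafe in file names). A minimal standalone sketch of both idioms, independent of the Workbench/grt environment:

import re

def dq(ident):
    """Double-quote an identifier, doubling any embedded quotes."""
    return '"' + re.sub(r'"', '""', ident) + '"'

def safe_file_name(ident):
    """Percent-encode characters that are unsafe in file names."""
    def repl(m):
        # The replacement callable gets a match object and must return a string.
        return ''.join("%%%02x" % b for b in bytearray(m.group(0), 'ascii'))
    return re.sub(r'[/\:*?"<>|%]', repl, ident)

print(dq('my "quoted" schema'))       # "my ""quoted"" schema"
print(safe_file_name('a/b:c*d.sdb'))  # a%2fb%3ac%2ad.sdb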

Example 13

View license
	def SetKernPairsMain( self, sender ):
		try:
			thisFont = Glyphs.font # frontmost font
			groupsUC = {
				"A" : ["UC_A", "UC_A"],
				"Aacute" : ["UC_A", "UC_A"],
				"Abreve" : ["UC_A", "UC_A"],
				"Acircumflex" : ["UC_A", "UC_A"],
				"Adieresis" : ["UC_A", "UC_A"],
				"Agrave" : ["UC_A", "UC_A"],
				"Amacron" : ["UC_A", "UC_A"],
				"Aogonek" : ["UC_A", "UC_A"],
				"Aring" : ["UC_A", "UC_A"],
				"Aringacute" : ["UC_A", "UC_A"],
				"Atilde" : ["UC_A", "UC_A"],
				"AE" : ["UC_AE", "UC_E"],
				"AEacute" : ["UC_AE", "UC_E"],
				"B" : ["UC_Stem", "UC_B"],
				"C" : ["UC_Round", "UC_C"],
				"Cacute" : ["UC_Round", "UC_C"],
				"Ccaron" : ["UC_Round", "UC_C"],
				"Ccedilla" : ["UC_Round", "UC_C"],
				"Ccircumflex" : ["UC_Round", "UC_C"],
				"Cdotaccent" : ["UC_Round", "UC_C"],
				"D" : ["UC_Stem", "UC_Round"],
				"Eth" : ["UC_Eth", "UC_Round"],
				"Dcaron" : ["UC_Stem", "UC_Round"],
				"Dcroat" : ["UC_Eth", "UC_Round"],
				"E" : ["UC_Stem", "UC_E"],
				"Eacute" : ["UC_Stem", "UC_E"],
				"Ebreve" : ["UC_Stem", "UC_E"],
				"Ecaron" : ["UC_Stem", "UC_E"],
				"Ecircumflex" : ["UC_Stem", "UC_E"],
				"Edieresis" : ["UC_Stem", "UC_E"],
				"Edotaccent" : ["UC_Stem", "UC_E"],
				"Egrave" : ["UC_Stem", "UC_E"],
				"Emacron" : ["UC_Stem", "UC_E"],
				"Eogonek" : ["UC_Stem", "UC_E"],
				"F" : ["UC_Stem", ""],
				"G" : ["UC_Round", "UC_G"],
				"Gbreve" : ["UC_Round", "UC_G"],
				"Gcircumflex" : ["UC_Round", "UC_G"],
				"Gcommaaccent" : ["UC_Round", "UC_G"],
				"Gcaron" : ["UC_Round", "UC_G"],
				"Gdotaccent" : ["UC_Round", "UC_G"],
				"H" : ["UC_Stem", "UC_Stem"],
				"Hbar" : ["UC_Stem", "UC_Stem"],
				"Hcircumflex" : ["UC_Stem", "UC_Stem"],
				"I" : ["UC_Stem", "UC_Stem"],
				"IJ" : ["UC_Stem", "UC_J"],
				"Iacute" : ["UC_Stem", "UC_Stem"],
				"Ibreve" : ["UC_Stem", "UC_Stem"],
				"Icircumflex" : ["UC_Stem", "UC_Stem"],
				"Idieresis" : ["UC_Stem", "UC_Stem"],
				"Idotaccent" : ["UC_Stem", "UC_Stem"],
				"Igrave" : ["UC_Stem", "UC_Stem"],
				"Imacron" : ["UC_Stem", "UC_Stem"],
				"Iogonek" : ["UC_Stem", "UC_Stem"],
				"Itilde" : ["UC_Stem", "UC_Stem"],
				"J" : ["UC_J", "UC_J"],
				"Jcircumflex" : ["UC_J", "UC_J"],
				"K" : ["UC_Stem", "UC_K"],
				"Kcommaaccent" : ["UC_Stem", "UC_K"],
				"L" : ["UC_Stem", "UC_L"],
				"Lacute" : ["UC_Stem", "UC_L"],
				"Lcaron" : ["UC_Stem", "UC_L"],
				"Lcommaaccent" : ["UC_Stem", "UC_L"],
				"Ldot" : ["UC_Stem", ""],
				"Lslash" : ["UC_Eth", "UC_L"],
				"M" : ["UC_Stem", "UC_Stem"],
				"N" : ["UC_Stem", "UC_Stem"],
				"Nacute" : ["UC_Stem", "UC_Stem"],
				"Ncaron" : ["UC_Stem", "UC_Stem"],
				"Ncommaaccent" : ["UC_Stem", "UC_Stem"],
				"Eng" : ["UC_Stem", ""],
				"Ntilde" : ["UC_Stem", "UC_Stem"],
				"O" : ["UC_Round", "UC_Round"],
				"Oacute" : ["UC_Round", "UC_Round"],
				"Obreve" : ["UC_Round", "UC_Round"],
				"Ocircumflex" : ["UC_Round", "UC_Round"],
				"Odieresis" : ["UC_Round", "UC_Round"],
				"Ograve" : ["UC_Round", "UC_Round"],
				"Ohungarumlaut" : ["UC_Round", "UC_Round"],
				"Omacron" : ["UC_Round", "UC_Round"],
				"Oslash" : ["UC_Round", "UC_Round"],
				"Oslashacute" : ["UC_Round", "UC_Round"],
				"Otilde" : ["UC_Round", "UC_Round"],
				"OE" : ["UC_Round", "UC_E"],
				"P" : ["UC_Stem", "UC_P"],
				"Thorn" : ["UC_Stem", ""],
				"Q" : ["UC_Round", ""],
				"R" : ["UC_Stem", "UC_R"],
				"Racute" : ["UC_Stem", "UC_R"],
				"Rcaron" : ["UC_Stem", "UC_R"],
				"Rcommaaccent" : ["UC_Stem", "UC_R"],
				"S" : ["UC_S", "UC_S"],
				"Sacute" : ["UC_S", "UC_S"],
				"Scaron" : ["UC_S", "UC_S"],
				"Scedilla" : ["UC_S", "UC_S"],
				"Scircumflex" : ["UC_S", "UC_S"],
				"Scommaaccent" : ["UC_S", "UC_S"],
				"T" : ["UC_T", "UC_T"],
				"Tbar" : ["UC_T", "UC_T"],
				"Tcaron" : ["UC_T", "UC_T"],
				"Tcedilla" : ["UC_T", "UC_T"],
				"Tcommaaccent" : ["UC_T", "UC_T"],
				"U" : ["UC_U", "UC_U"],
				"Uacute" : ["UC_U", "UC_U"],
				"Ubreve" : ["UC_U", "UC_U"],
				"Ucircumflex" : ["UC_U", "UC_U"],
				"Udieresis" : ["UC_U", "UC_U"],
				"Ugrave" : ["UC_U", "UC_U"],
				"Uhungarumlaut" : ["UC_U", "UC_U"],
				"Umacron" : ["UC_U", "UC_U"],
				"Uogonek" : ["UC_U", "UC_U"],
				"Uring" : ["UC_U", "UC_U"],
				"Utilde" : ["UC_U", "UC_U"],
				"W" : ["UC_W", "UC_W"],
				"Wacute" : ["UC_W", "UC_W"],
				"Wcircumflex" : ["UC_W", "UC_W"],
				"Wdieresis" : ["UC_W", "UC_W"],
				"Wgrave" : ["UC_W", "UC_W"],
				"X" : ["UC_X", "UC_X"],
				"Y" : ["UC_Y", "UC_Y"],
				"Yacute" : ["UC_Y", "UC_Y"],
				"Ycircumflex" : ["UC_Y", "UC_Y"],
				"Ydieresis" : ["UC_Y", "UC_Y"],
				"Ygrave" : ["UC_Y", "UC_Y"],
				"Z" : ["UC_Z", "UC_Z"],
				"Zacute" : ["UC_Z", "UC_Z"],
				"Zcaron" : ["UC_Z", "UC_Z"],
				"Zdotaccent" : ["UC_Z", "UC_Z"],
				"Schwa" : ["UC_Schwa", "UC_Round"],
				"A-cy" : ["UC_A", "UC_A"],
				"Be-cy" : ["UC_Stem", ""],
				"Ve-cy" : ["UC_Stem", "UC_B"],
				"Ge-cy" : ["UC_Stem", "UC_T"],
				"Gje-cy" : ["UC_Stem", "UC_T"],
				"Gheupturn-cy" : ["UC_Stem", "UC_T"],
				"De-cy" : ["", "UC_StemTooth"],
				"Ie-cy" : ["UC_Stem", "Stem_E"],
				"Iegrave-cy" : ["UC_Stem", "Stem_E"],
				"Io-cy" : ["UC_Stem", "Stem_E"],
				"Zhe-cy" : ["UC_Zhe", "UC_K"],
				"Ze-cy" : ["UC_Ze", "UC_B"],
				"Ii-cy" : ["UC_Stem", "UC_Stem"],
				"Iishort-cy" : ["UC_Stem", "UC_Stem"],
				"Iigrave-cy" : ["UC_Stem", "UC_Stem"],
				"Ka-cy" : ["UC_Stem", "UC_K"],
				"Kje-cy" : ["UC_Stem", "UC_K"],
				"El-cy" : ["UC_El", "UC_Stem"],
				"Em-cy" : ["UC_Stem", "UC_Stem"],
				"En-cy" : ["UC_Stem", "UC_Stem"],
				"O-cy" : ["UC_Round", "UC_Round"],
				"Pe-cy" : ["UC_Stem", "UC_Stem"],
				"Er-cy" : ["UC_Stem", "UC_P"],
				"Es-cy" : ["UC_Round", "UC_C"],
				"Te-cy" : ["UC_T", "UC_T"],
				"U-cy" : ["UC_CyrU", "UC_CyrU"],
				"Ushort-cy" : ["UC_CyrU", "UC_CyrU"],
				"Ef-cy" : ["UC_Ef", "UC_Ef"],
				"Ha-cy" : ["UC_X", "UC_X"],
				"Che-cy" : ["UC_Che", "UC_Stem"],
				"Tse-cy" : ["UC_Stem", "UC_StemTooth"],
				"Sha-cy" : ["UC_Stem", "UC_Stem"],
				"Shcha-cy" : ["UC_Stem", "UC_StemTooth"],
				"Dzhe-cy" : ["UC_Stem", "UC_Stem"],
				"Ia-cy" : ["", "UC_Stem"],
				"Softsign-cy" : ["UC_Stem", "UC_Softsign"],
				"Hardsign-cy" : ["UC_T", "UC_Softsign"],
				"Yeru-cy" : ["UC_Stem", "UC_Stem"],
				"Lje-cy" : ["UC_El", "UC_Softsign"],
				"Nje-cy" : ["UC_Stem", "UC_Softsign"],
				"Dze-cy" : ["UC_S", "UC_S"],
				"E-cy" : ["UC_Round", "UC_C"],
				"Ereversed-cy" : ["UC_Ze", "UC_Round"],
				"I-cy" : ["UC_Stem", "UC_Stem"],
				"Yi-cy" : ["UC_Stem", "UC_Stem"],
				"Je-cy" : ["UC_J", "UC_J"],
				"Tshe-cy" : ["UC_T", "UC_Shha"],
				"Iu-cy" : ["UC_Stem", "UC_Round"],
				"Dje-cy" : ["UC_T", "UC_Softsign"],
				"Fita-cy" : ["UC_Round", "UC_Round"],
				"Izhitsa-cy" : ["UC_V", ""],
				"Ghestroke-cy" : ["UC_Eth", "UC_Te"],
				"Ghemiddlehook-cy" : ["UC_Stem", ""],
				"Zhedescender-cy" : ["UC_Zhe", "UC_K"],
				"Zedescender-cy" : ["UC_Ze", "UC_B"],
				"Kadescender-cy" : ["UC_Stem", "UC_K"],
				"Kaverticalstroke-cy" : ["UC_Stem", "UC_K"],
				"Kastroke-cy" : ["UC_Stem", "UC_K"],
				"Kabashkir-cy" : ["UC_T", "UC_K"],
				"Endescender-cy" : ["UC_Stem", "UC_StemTooth"],
				"Pemiddlehook-cy" : ["UC_Stem", ""],
				"Haabkhasian-cy" : ["UC_Round", ""],
				"Esdescender-cy" : ["UC_Round", "UC_C"],
				"Tedescender-cy" : ["UC_T", "UC_T"],
				"Ustrait-cy" : ["UC_Y", "UC_Y"],
				"Ustraitstroke-cy" : ["UC_Y", "UC_Y"],
				"Hadescender-cy" : ["UC_X", "UC_X"],
				"Chedescender-cy" : ["UC_Che", "UC_StemTooth"],
				"Cheverticalstroke-cy" : ["UC_Che", "UC_Stem"],
				"Shha-cy" : ["UC_Stem", "UC_Shha"],
				"Cheabkhasian-cy" : ["UC_Cheabkhaz", "UC_Cheabkhaz"],
				"Chedescenderabkhasian-cy" : ["UC_Cheabkhaz", "UC_Cheabkhaz"],
				"Palochka-cy" : ["UC_Stem", "UC_Stem"],
				"Zhebreve-cy" : ["UC_Zhe", "UC_K"],
				"Kahook-cy" : ["UC_Stem", ""],
				"Eltail-cy" : ["UC_El", "UC_Stem"],
				"Enhook-cy" : ["UC_Stem", "UC_StemHook"],
				"Entail-cy" : ["UC_Stem", "UC_Stem"],
				"Chekhakassian-cy" : ["UC_Che", "UC_Stem"],
				"Emtail-cy" : ["UC_Stem", "UC_Stem"],
				"Abreve-cy" : ["UC_A", "UC_A"],
				"Adieresis-cy" : ["UC_A", "UC_A"],
				"Iebreve-cy" : ["UC_Stem", "UC_E"],
				"Schwa-cy" : ["UC_Schwa", "UC_Round"],
				"Schwadieresis-cy" : ["UC_Schwa", "UC_Round"],
				"Zhedieresis-cy" : ["UC_Zhe", "UC_K"],
				"Zedieresis-cy" : ["UC_Ze", "UC_B"],
				"Imacron-cy" : ["UC_Stem", "UC_Stem"],
				"Idieresis-cy" : ["UC_Stem", "UC_Stem"],
				"Odieresis-cy" : ["UC_Round", "UC_Round"],
				"Obarred-cy" : ["UC_Round", "UC_Round"],
				"Obarreddieresis-cy" : ["UC_Round", "UC_Round"],
				"Edieresis-cy" : ["UC_Ze", "UC_Round"],
				"Umacron-cy" : ["UC_CyrU", "UC_CyrU"],
				"Udieresis-cy" : ["UC_CyrU", "UC_CyrU"],
				"Uhungarumlaut-cy" : ["UC_CyrU", "UC_CyrU"],
				"Chedieresis-cy" : ["UC_Che", "UC_Stem"],
				"Ghedescender-cy" : ["UC_Stem", "UC_T"],
				"Yerudieresis-cy" : ["UC_Stem", "UC_Stem"],
				"Hahook-cy" : ["UC_X", "UC_X"],
				"Komide-cy" : ["", "UC_Stem"],
				"Elhook-cy" : ["UC_El", "UC_StemHook"],
				"Qa-cy" : ["UC_Round", "UC_Round"],
				"We-cy" : ["UC_W", ""],
				"Pedescender-cy" : ["UC_Stem", "UC_StemTooth"],
				"Shhadescender-cy" : ["UC_Stem", "UC_Shha"],
				"Ishorttail-cy" : ["UC_Stem", "UC_StemTooth"],
				"Enghe-cy" : ["UC_Stem", "UC_T"],
				"Tetse-cy" : ["UC_T", "UC_StemTooth"],
				"Ertick-cy" : ["UC_Stem","UC_P"],
				"Aie-cy" : ["", "UC_E"],
				"Alpha" : ["UC_A", "UC_A"],
				"Beta" : ["UC_Stem", "UC_B"],
				"Gamma" : ["UC_Stem", "UC_T"],
				"Delta" : ["UC_A", "UC_A"],
				"Epsilon" : ["UC_Stem", "UC_E"],
				"Zeta" : ["UC_Z", "UC_Z"],
				"Eta" : ["UC_Stem", "UC_Stem"],
				"Theta" : ["UC_Round", "UC_Round"],
				"Iota" : ["UC_Stem", "UC_Stem"],
				"Kappa" : ["UC_Stem", "UC_K"],
				"Lambda" : ["UC_A", "UC_A"],
				"Mu" : ["UC_Stem", "UC_Stem"],
				"Nu" : ["UC_Stem", "UC_Stem"],
				"Xi" : ["", "UC_E"],
				"Omicron" : ["UC_Round", "UC_Round"],
				"Pi" : ["UC_Stem", "UC_Stem"],
				"Rho" : ["UC_Stem", "UC_P"],
				"Sigma" : ["", "UC_E"],
				"Tau" : ["UC_T", "UC_T"],
				"Upsilon" : ["UC_Y", "UC_Y"],
				"Phi" : ["UC_Ef", "UC_Ef"],
				"Chi" : ["UC_X", "UC_X"],
				"Omega" : ["UC_Omega", "UC_Omega"],
				"Alphatonos" : ["", "UC_A"],
				"Epsilontonos" : ["UC_StemTonos", "UC_E"],
				"Etatonos" : ["UC_StemTonos", "UC_Stem"],
				"Iotatonos" : ["UC_StemTonos", "UC_Stem"],
				"Omicrontonos" : ["", "UC_Round"],
				"Upsilontonos" : ["", "UC_Y"],
				"Omegatonos" : ["", "UC_Omega"],
				"Iotadieresis" : ["UC_Stem", "UC_Stem"],
				"Upsilondieresis" : ["UC_Y", "UC_Y"]
			}

			groupsLCnormal = {
				"a" : ["lc_a", "lc_a"],
				"aacute" : ["lc_a", "lc_a"],
				"abreve" : ["lc_a", "lc_a"],
				"acircumflex" : ["lc_a", "lc_a"],
				"adieresis" : ["lc_a", "lc_a"],
				"agrave" : ["lc_a", "lc_a"],
				"amacron" : ["lc_a", "lc_a"],
				"aogonek" : ["lc_a", "lc_a"],
				"aring" : ["lc_a", "lc_a"],
				"aringacute" : ["lc_a", "lc_a"],
				"atilde" : ["lc_a", "lc_a"],
				"ae" : ["lc_a", "lc_e"],
				"aeacute" : ["lc_a", "lc_e"],
				"b" : ["lc_LongStem", "lc_Round"],
				"c" : ["lc_Round", "lc_c"],
				"cacute" : ["lc_Round", "lc_c"],
				"ccaron" : ["lc_Round", "lc_c"],
				"ccedilla" : ["lc_Round", "lc_c"],
				"ccircumflex" : ["lc_Round", "lc_c"],
				"cdotaccent" : ["lc_Round", "lc_c"],
				"d" : ["lc_Round", "lc_LongStem"],
				"eth" : ["lc_Round", ""],
				"dcaron" : ["lc_Round", "lc_Caron"],
				"dcroat" : ["lc_Round", "lc_LongStem"],
				"e" : ["lc_Round", "lc_e"],
				"eacute" : ["lc_Round", "lc_e"],
				"ebreve" : ["lc_Round", "lc_e"],
				"ecaron" : ["lc_Round", "lc_e"],
				"ecircumflex" : ["lc_Round", "lc_e"],
				"edieresis" : ["lc_Round", "lc_e"],
				"edotaccent" : ["lc_Round", "lc_e"],
				"egrave" : ["lc_Round", "lc_e"],
				"emacron" : ["lc_Round", "lc_e"],
				"eogonek" : ["lc_Round", "lc_e"],
				"f" : ["lc_f", "lc_f"],
				"g" : ["lc_g", "lc_g"],
				"gbreve" : ["lc_g", "lc_g"],
				"gcircumflex" : ["lc_g", "lc_g"],
				"gcommaaccent" : ["lc_g", "lc_g"],
				"gdotaccent" : ["lc_g", "lc_g"],
				"h" : ["lc_LongStem", "lc_Shoulder"],
				"hbar" : ["lc_LongStem", "lc_Shoulder"],
				"hcircumflex" : ["lc_LongStem", "lc_Shoulder"],
				"i" : ["lc_ShortStem", "lc_ShortStem"],
				"dotlessi" : ["lc_ShortStem", "lc_ShortStem"],
				"idotless" : ["lc_ShortStem", "lc_ShortStem"],
				"iacute" : ["lc_ShortStem", "lc_ShortStem"],
				"ibreve" : ["lc_ShortStem", "lc_ShortStem"],
				"icircumflex" : ["lc_ShortStem", "lc_ShortStem"],
				"idieresis" : ["lc_ShortStem", "lc_ShortStem"],
				"idotaccent" : ["lc_ShortStem", "lc_ShortStem"],
				"igrave" : ["lc_ShortStem", "lc_ShortStem"],
				"ij" : ["lc_ShortStem", "lc_j"],
				"imacron" : ["lc_ShortStem", "lc_ShortStem"],
				"iogonek" : ["lc_ShortStem", "lc_ShortStem"],
				"itilde" : ["lc_ShortStem", "lc_ShortStem"],
				"j" : ["lc_j", "lc_j"],
				"dotlessj" : ["lc_j", "lc_j"],
				"jdotless" : ["lc_j", "lc_j"],
				"jcircumflex" : ["lc_j", "lc_j"],
				"k" : ["lc_LongStem", "lc_k"],
				"kcommaaccent" : ["lc_LongStem", "lc_k"],
				"kgreenlandic" : ["lc_ShortStem", "lc_k"],
				"l" : ["lc_LongStem", "lc_LongStem"],
				"lacute" : ["lc_LongStem", "lc_LongStem"],
				"lcaron" : ["lc_LongStem", "lc_Caron"],
				"lcommaaccent" : ["lc_LongStem", "lc_LongStem"],
				"ldot" : ["lc_LongStem", ""],
				"lslash" : ["lc_lslash", "lc_lslash"],
				"m" : ["lc_ShortStem", "lc_Shoulder"],
				"n" : ["lc_ShortStem", "lc_Shoulder"],
				"nacute" : ["lc_ShortStem", "lc_Shoulder"],
				"napostrophe" : ["MSC_quoteright", "lc_Shoulder"],
				"ncaron" : ["lc_ShortStem", "lc_Shoulder"],
				"ncommaaccent" : ["lc_ShortStem", "lc_Shoulder"],
				"eng" : ["lc_ShortStem", "lc_Shoulder"],
				"ntilde" : ["lc_ShortStem", "lc_Shoulder"],
				"o" : ["lc_Round", "lc_Round"],
				"oacute" : ["lc_Round", "lc_Round"],
				"obreve" : ["lc_Round", "lc_Round"],
				"ocircumflex" : ["lc_Round", "lc_Round"],
				"odieresis" : ["lc_Round", "lc_Round"],
				"ograve" : ["lc_Round", "lc_Round"],
				"ohungarumlaut" : ["lc_Round", "lc_Round"],
				"omacron" : ["lc_Round", "lc_Round"],
				"oslash" : ["lc_Round", "lc_Round"],
				"oslashacute" : ["lc_Round", "lc_Round"],
				"otilde" : ["lc_Round", "lc_Round"],
				"oe" : ["lc_Round", "lc_e"],
				"p" : ["lc_p", "lc_Round"],
				"thorn" : ["lc_LongStem", "lc_Round"],
				"q" : ["lc_Round", ""],
				"r" : ["lc_ShortStem", "lc_r"],
				"racute" : ["lc_ShortStem", "lc_r"],
				"rcaron" : ["lc_ShortStem", "lc_r"],
				"rcommaaccent" : ["lc_ShortStem", "lc_r"],
				"s" : ["lc_s", "lc_s"],
				"sacute" : ["lc_s", "lc_s"],
				"scaron" : ["lc_s", "lc_s"],
				"scedilla" : ["lc_s", "lc_s"],
				"scircumflex" : ["lc_s", "lc_s"],
				"scommaaccent" : ["lc_s", "lc_s"],
				"t" : ["lc_t", "lc_t"],
				"tbar" : ["lc_t", ""],
				"tcaron" : ["lc_t", "lc_t"],
				"tcedilla" : ["lc_t", "lc_t"],
				"tcommaaccent" : ["lc_t", "lc_t"],
				"u" : ["lc_u", "lc_u"],
				"uacute" : ["lc_u", "lc_u"],
				"ubreve" : ["lc_u", "lc_u"],
				"ucircumflex" : ["lc_u", "lc_u"],
				"udieresis" : ["lc_u", "lc_u"],
				"ugrave" : ["lc_u", "lc_u"],
				"uhungarumlaut" : ["lc_u", "lc_u"],
				"umacron" : ["lc_u", "lc_u"],
				"uogonek" : ["lc_u", "lc_u"],
				"uring" : ["lc_u", "lc_u"],
				"utilde" : ["lc_u", "lc_u"],
				"v" : ["lc_vwy", "lc_vwy"],
				"w" : ["lc_vwy", "lc_vwy"],
				"wacute" : ["lc_vwy", "lc_vwy"],
				"wcircumflex" : ["lc_vwy", "lc_vwy"],
				"wdieresis" : ["lc_vwy", "lc_vwy"],
				"wgrave" : ["lc_vwy", "lc_vwy"],
				"x" : ["lc_x", "lc_x"],
				"y" : ["lc_vwy", "lc_vwy"],
				"yacute" : ["lc_vwy", "lc_vwy"],
				"ycircumflex" : ["lc_vwy", "lc_vwy"],
				"ydieresis" : ["lc_vwy", "lc_vwy"],
				"ygrave" : ["lc_vwy", "lc_vwy"],
				"z" : ["lc_z", "lc_z"],
				"zacute" : ["lc_z", "lc_z"],
				"zcaron" : ["lc_z", "lc_z"],
				"zdotaccent" : ["lc_z", "lc_z"],
				"schwa" : ["lc_schwa", "lc_Round"],
				"f_f" : ["lc_f", "lc_f"],
				"f_f_i" : ["lc_f", "lc_ShortStem"],
				"f_f_l" : ["lc_f", "lc_LongStem"],
				"f_i" : ["lc_f", "lc_ShortStem"],
				"f_l" : ["lc_f", "lc_LongStem"],
				"fi" : ["lc_f", "lc_ShortStem"],
				"fl" : ["lc_f", "lc_LongStem"],
				"a-cy" : ["lc_a", "lc_a"],
				"be-cy" : ["", "lc_Round"],
				"ve-cy" : ["lc_ShortStem", "lc_ze"],
				"ge-cy" : ["lc_ShortStem", "lc_te"],
				"gje-cy" : ["lc_ShortStem", "lc_te"],
				"gheupturn-cy" : ["lc_ShortStem", "lc_te"],
				"de-cy" : ["", "lc_StemTooth"],
				"ie-cy" : ["lc_Round", "lc_e"],
				"iegrave-cy" : ["lc_Round", "lc_e"],
				"io-cy" : ["lc_Round", "lc_e"],
				"zhe-cy" : ["lc_zhe", "lc_k"],
				"ze-cy" : ["lc_ze", "lc_ze"],
				"ii-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"iishort-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"iigrave-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"ka-cy" : ["lc_ShortStem", "lc_k"],
				"kje-cy" : ["lc_ShortStem", "lc_k"],
				"el-cy" : ["lc_el", "lc_ShortStem"],
				"em-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"en-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"o-cy" : ["lc_Round", "lc_Round"],
				"pe-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"er-cy" : ["lc_p", "lc_Round"],
				"es-cy" : ["lc_Round", "lc_c"],
				"te-cy" : ["lc_te", "lc_te"],
				"u-cy" : ["lc_vwy", "lc_vwy"],
				"ushort-cy" : ["lc_vwy", "lc_vwy"],
				"ef-cy" : ["lc_Round", "lc_Round"],
				"ha-cy" : ["lc_x", "lc_x"],
				"che-cy" : ["lc_che", "lc_ShortStem"],
				"tse-cy" : ["lc_ShortStem", "lc_StemTooth"],
				"sha-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"shcha-cy" : ["lc_ShortStem", "lc_StemTooth"],
				"dzhe-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"ia-cy" : ["", "lc_ShortStem"],
				"softsign-cy" : ["lc_ShortStem", "lc_softsign"],
				"hardsign-cy" : ["lc_te", "lc_softsign"],
				"yeru-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"lje-cy" : ["lc_el", "lc_softsign"],
				"nje-cy" : ["lc_ShortStem", "lc_softsign"],
				"ereversed-cy" : ["lc_ze", "lc_Round"],
				"dze-cy" : ["lc_s", "lc_s"],
				"e-cy" : ["lc_Round", "lc_c"],
				"yi-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"i-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"je-cy" : ["lc_j", "lc_j"],
				"tshe-cy" : ["lc_LongStem", "lc_Shoulder"],
				"iu-cy" : ["lc_ShortStem", "lc_Round"],
				"dje-cy" : ["lc_LongStem", "lc_Shoulder"],
				"fita-cy" : ["lc_Round", "lc_Round"],
				"izhitsa-cy" : ["lc_vwy", ""],
				"ghestroke-cy" : ["lc_ShortStem", "lc_te"],
				"ghemiddlehook-cy" : ["lc_ShortStem", ""],
				"zhedescender-cy" : ["lc_zhe", "lc_k"],
				"zedescender-cy" : ["lc_ze", "lc_ze"],
				"kadescender-cy" : ["lc_ShortStem", "lc_k"],
				"kaverticalstroke-cy" : ["lc_ShortStem", "lc_k"],
				"kastroke-cy" : ["lc_LongStem", "lc_k"],
				"kabashkir-cy" : ["lc_te", "lc_k"],
				"endescender-cy" : ["lc_ShortStem", "lc_StemTooth"],
				"pemiddlehook-cy" : ["lc_ShortStem", ""],
				"haabkhasian-cy" : ["lc_Round", ""],
				"esdescender-cy" : ["lc_Round", "lc_c"],
				"tedescender-cy" : ["lc_te", "lc_te"],
				"ustrait-cy" : ["lc_vwy", "lc_vwy"],
				"hadescender-cy" : ["lc_x", "lc_x"],
				"chedescender-cy" : ["lc_che", "lc_StemTooth"],
				"cheverticalstroke-cy" : ["lc_che", "lc_ShortStem"],
				"shha-cy" : ["lc_LongStem", "lc_Shoulder"],
				"cheabkhasian-cy" : ["lc_cheabkhaz", "lc_e"],
				"chedescenderabkhasian-cy" : ["lc_cheabkhaz", "lc_e"],
				"palochka-cy" : ["lc_LongStem", "lc_LongStem"],
				"zhebreve-cy" : ["lc_zhe", "lc_k"],
				"kahook-cy" : ["lc_ShortStem", ""],
				"eltail-cy" : ["lc_el", "lc_StemTooth"],
				"enhook-cy" : ["lc_ShortStem", "lc_StemHook"],
				"entail-cy" : ["lc_ShortStem", "lc_StemTooth"],
				"chekhakassian-cy" : ["lc_che", "lc_ShortStem"],
				"emtail-cy" : ["lc_ShortStem", "lc_StemTooth"],
				"abreve-cy" : ["lc_a", "lc_a"],
				"adieresis-cy" : ["lc_a", "lc_a"],
				"iebreve-cy" : ["lc_Round", "lc_e"],
				"schwa-cy" : ["lc_schwa", "lc_Round"],
				"schwadieresis-cy" : ["lc_schwa", "lc_Round"],
				"zhedieresis-cy" : ["lc_zhe", "lc_k"],
				"zedieresis-cy" : ["lc_ze", "lc_ze"],
				"imacron-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"idieresis-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"odieresis-cy" : ["lc_Round", "lc_Round"],
				"obarred-cy" : ["lc_Round", "lc_Round"],
				"obarreddieresis-cy" : ["lc_Round", "lc_Round"],
				"edieresis-cy" : ["lc_ereversed", "lc_Round"],
				"umacron-cy" : ["lc_vwy", "lc_vwy"],
				"udieresis-cy" : ["lc_vwy", "lc_vwy"],
				"uhungarumlaut-cy" : ["lc_vwy", "lc_vwy"],
				"chedieresis-cy" : ["lc_che", "lc_ShortStem"],
				"ghedescender-cy" : ["lc_ShortStem", "lc_te"],
				"yerudieresis-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"hahook-cy" : ["lc_x", "lc_x"],
				"komide-cy" : ["lc_Round", "lc_LongStem"],
				"elhook-cy" : ["lc_el", "lc_StemHook"],
				"we-cy" : ["lc_vwy", "lc_vwy"],
				"pedescender-cy" : ["lc_ShortStem", "lc_StemTooth"],
				"shhadescender-cy" : ["lc_LongStem", "lc_Shoulder"],
				"ishorttail-cy" : ["lc_ShortStem", "lc_StemTooth"],
				"ertick-cy" : ["lc_p", "p"],
				"enghe-cy" : ["lc_ShortStem", "lc_te"],
				"tetse-cy" : ["lc_te", "lc_StemTooth"],
				"aie-cy" : ["lc_a", "lc_e"],
				"alpha" : ["lc_Round", "lc_alpha"],
				"delta" : ["", "lc_Round"],
				"epsilon" : ["lc_epsilon", "lc_epsilon"],
				"eta" : ["lc_eta", "lc_eta"],
				"iota" : ["lc_iota", "lc_iota"],
				"mu" : ["lc_ShortStem", "lc_alpha"],
				"omicron" : ["lc_Round", "lc_Round"],
				"rho" : ["", "lc_Round"],
				"sigmafinal" : ["lc_Round", ""],
				"sigma" : ["lc_Round", ""],
				"upsilon" : ["lc_upsilon", "lc_upsilon"],
				"phi" : ["lc_Round", "lc_Round"],
				"psi" : ["", "lc_upsilon"],
				"omega" : ["lc_omega", "lc_upsilon"],
				"iotatonos" : ["lc_iota", "lc_iota"],
				"iotadieresis" : ["lc_iota", "lc_iota"],
				"iotadieresistonos" : ["lc_iota", "lc_iota"],
				"upsilontonos" : ["lc_upsilon", "lc_upsilon"],
				"upsilondieresis" : ["lc_upsilon", "lc_upsilon"],
				"upsilondieresistonos" : ["lc_upsilon", "lc_upsilon"],
				"omicrontonos" : ["lc_Round", "lc_Round"],
				"omegatonos" : ["lc_omega", "lc_upsilon"],
				"alphatonos" : ["lc_Round", "lc_alpha"],
				"epsilontonos" : ["lc_epsilon", "lc_epsilon"],
				"etatonos" : ["lc_eta", "lc_eta"]
			}

			groupsLCcursive = {
				"a" : ["lc_Round", "lc_ShortStem"],
				"aacute" : ["lc_Round", "lc_ShortStem"],
				"abreve" : ["lc_Round", "lc_ShortStem"],
				"acircumflex" : ["lc_Round", "lc_ShortStem"],
				"adieresis" : ["lc_Round", "lc_ShortStem"],
				"agrave" : ["lc_Round", "lc_ShortStem"],
				"amacron" : ["lc_Round", "lc_ShortStem"],
				"aogonek" : ["lc_Round", "lc_ShortStem"],
				"aring" : ["lc_Round", "lc_ShortStem"],
				"aringacute" : ["lc_Round", "lc_ShortStem"],
				"atilde" : ["lc_Round", "lc_ShortStem"],
				"ae" : ["lc_Round", "lc_e"],
				"aeacute" : ["lc_Round", "lc_e"],
				"b" : ["lc_LongStem1", "lc_Round"],
				"c" : ["lc_Round", "lc_c"],
				"cacute" : ["lc_Round", "lc_c"],
				"ccaron" : ["lc_Round", "lc_c"],
				"ccedilla" : ["lc_Round", "lc_c"],
				"ccircumflex" : ["lc_Round", "lc_c"],
				"cdotaccent" : ["lc_Round", "lc_c"],
				"d" : ["lc_Round", "lc_LongStem"],
				"eth" : ["lc_Round", ""],
				"dcaron" : ["lc_Round", "lc_Caron"],
				"dcroat" : ["lc_Round", "lc_LongStem"],
				"e" : ["lc_Round", "lc_e"],
				"eacute" : ["lc_Round", "lc_e"],
				"ebreve" : ["lc_Round", "lc_e"],
				"ecaron" : ["lc_Round", "lc_e"],
				"ecircumflex" : ["lc_Round", "lc_e"],
				"edieresis" : ["lc_Round", "lc_e"],
				"edotaccent" : ["lc_Round", "lc_e"],
				"egrave" : ["lc_Round", "lc_e"],
				"emacron" : ["lc_Round", "lc_e"],
				"eogonek" : ["lc_Round", "lc_e"],
				"f" : ["lc_f", "lc_f"],
				"g" : ["lc_g", "lc_g"],
				"gbreve" : ["lc_g", "lc_g"],
				"gcaron" : ["lc_g", "lc_g"],
				"gcircumflex" : ["lc_g", "lc_g"],
				"gcommaaccent" : ["lc_g", "lc_g"],
				"gdotaccent" : ["lc_g", "lc_g"],
				"h" : ["lc_LongStem2", "lc_Shoulder"],
				"hbar" : ["lc_LongStem2", "lc_Shoulder"],
				"hcircumflex" : ["lc_LongStem2", "lc_Shoulder"],
				"i" : ["lc_i", "lc_ShortStem"],
				"dotlessi" : ["lc_i", "lc_ShortStem"],
				"idotless" : ["lc_i", "lc_ShortStem"],
				"iacute" : ["lc_i", "lc_ShortStem"],
				"ibreve" : ["lc_i", "lc_ShortStem"],
				"icircumflex" : ["lc_i", "lc_ShortStem"],
				"idieresis" : ["lc_i", "lc_ShortStem"],
				"idotaccent" : ["lc_i", "lc_ShortStem"],
				"igrave" : ["lc_i", "lc_ShortStem"],
				"ij" : ["lc_i", "lc_j"],
				"imacron" : ["lc_i", "lc_ShortStem"],
				"iogonek" : ["lc_i", "lc_ShortStem"],
				"itilde" : ["lc_i", "lc_ShortStem"],
				"j" : ["lc_j", "lc_j"],
				"dotlessj" : ["lc_j", "lc_j"],
				"jdotless" : ["lc_j", "lc_j"],
				"jcircumflex" : ["lc_j", "lc_j"],
				"k" : ["lc_LongStem2", "lc_k"],
				"kcommaaccent" : ["lc_LongStem2", "lc_k"],
				"kgreenlandic" : ["lc_ShortStem", "lc_k"],
				"l" : ["lc_LongStem1", "lc_LongStem"],
				"lacute" : ["lc_LongStem1", "lc_LongStem"],
				"lcaron" : ["lc_LongStem1", "lc_Caron"],
				"lcommaaccent" : ["lc_LongStem1", "lc_LongStem"],
				"ldot" : ["lc_LongStem1", ""],
				"lslash" : ["lc_lslash", "lc_lslash"],
				"m" : ["lc_ShortStem", "lc_Shoulder"],
				"n" : ["lc_ShortStem", "lc_Shoulder"],
				"nacute" : ["lc_ShortStem", "lc_Shoulder"],
				"napostrophe" : ["MSC_quoteright", "lc_Shoulder"],
				"ncaron" : ["lc_ShortStem", "lc_Shoulder"],
				"ncommaaccent" : ["lc_ShortStem", "lc_Shoulder"],
				"eng" : ["lc_ShortStem", ""],
				"ntilde" : ["lc_ShortStem", "lc_Shoulder"],
				"o" : ["lc_Round", "lc_Round"],
				"oacute" : ["lc_Round", "lc_Round"],
				"obreve" : ["lc_Round", "lc_Round"],
				"ocircumflex" : ["lc_Round", "lc_Round"],
				"odieresis" : ["lc_Round", "lc_Round"],
				"ograve" : ["lc_Round", "lc_Round"],
				"ohungarumlaut" : ["lc_Round", "lc_Round"],
				"omacron" : ["lc_Round", "lc_Round"],
				"oslash" : ["lc_Round", "lc_Round"],
				"oslashacute" : ["lc_Round", "lc_Round"],
				"otilde" : ["lc_Round", "lc_Round"],
				"oe" : ["lc_Round", "lc_e"],
				"p" : ["", "lc_Round"],
				"thorn" : ["", "lc_Round"],
				"q" : ["lc_Round", ""],
				"r" : ["lc_ShortStem", "lc_r"],
				"racute" : ["lc_ShortStem", "lc_r"],
				"rcaron" : ["lc_ShortStem", "lc_r"],
				"rcommaaccent" : ["lc_ShortStem", "lc_r"],
				"s" : ["lc_s", "lc_s"],
				"sacute" : ["lc_s", "lc_s"],
				"scaron" : ["lc_s", "lc_s"],
				"scedilla" : ["lc_s", "lc_s"],
				"scircumflex" : ["lc_s", "lc_s"],
				"scommaaccent" : ["lc_s", "lc_s"],
				"t" : ["lc_t", "lc_t"],
				"tbar" : ["lc_t", "lc_t"],
				"tcaron" : ["lc_t", "lc_t"],
				"tcedilla" : ["lc_t", "lc_t"],
				"tcommaaccent" : ["lc_t", "lc_t"],
				"u" : ["lc_i", "lc_ShortStem"],
				"uacute" : ["lc_i", "lc_ShortStem"],
				"ubreve" : ["lc_i", "lc_ShortStem"],
				"ucircumflex" : ["lc_i", "lc_ShortStem"],
				"udieresis" : ["lc_i", "lc_ShortStem"],
				"ugrave" : ["lc_i", "lc_ShortStem"],
				"uhungarumlaut" : ["lc_i", "lc_ShortStem"],
				"umacron" : ["lc_i", "lc_ShortStem"],
				"uogonek" : ["lc_i", "lc_ShortStem"],
				"uring" : ["lc_i", "lc_ShortStem"],
				"utilde" : ["lc_i", "lc_ShortStem"],
				"v" : ["lc_vw", "lc_vw"],
				"w" : ["lc_vw", "lc_vw"],
				"wacute" : ["lc_vw", "lc_vw"],
				"wcircumflex" : ["lc_vw", "lc_vw"],
				"wdieresis" : ["lc_vw", "lc_vw"],
				"wgrave" : ["lc_vw", "lc_vw"],
				"x" : ["lc_x", "lc_x"],
				"y" : ["lc_y", "lc_y"],
				"yacute" : ["lc_y", "lc_y"],
				"ycircumflex" : ["lc_y", "lc_y"],
				"ydieresis" : ["lc_y", "lc_y"],
				"ygrave" : ["lc_y", "lc_y"],
				"z" : ["lc_z", "lc_z"],
				"zacute" : ["lc_z", "lc_z"],
				"zcaron" : ["lc_z", "lc_z"],
				"zdotaccent" : ["lc_z", "lc_z"],
				"schwa" : ["lc_schwa", "lc_Round"],
				"a-cy" : ["lc_Round", "lc_ShortStem"],
				"be-cy" : ["", "lc_Round"],
				"ve-cy" : ["lc_Round", "lc_ze"],
				"ge-cy" : ["lc_ge", "lc_ge"],
				"gje-cy" : ["lc_ge", "lc_ge"],
				"gheupturn-cy" : ["lc_ShortStem", ""],
				"de-cy" : ["lc_Round", ""],
				"ie-cy" : ["lc_Round", "lc_e"],
				"iegrave-cy" : ["lc_Round", "lc_e"],
				"io-cy" : ["lc_Round", "lc_e"],
				"zhe-cy" : ["lc_zhe", "lc_zhe"],
				"ze-cy" : ["lc_ze", "lc_ze"],
				"ii-cy" : ["lc_i", "lc_ShortStem"],
				"iishort-cy" : ["lc_i", "lc_ShortStem"],
				"iigrave-cy" : ["lc_i", "lc_ShortStem"],
				"ka-cy" : ["lc_ShortStem", "lc_k"],
				"kje-cy" : ["lc_ShortStem", "lc_k"],
				"el-cy" : ["lc_el", "lc_ShortStem"],
				"em-cy" : ["lc_el", "lc_ShortStem"],
				"en-cy" : ["lc_ShortStem", "lc_ShortStem"],
				"o-cy" : ["lc_Round", "lc_Round"],
				"pe-cy" : ["lc_ShortStem", "lc_Shoulder"],
				"er-cy" : ["lc_p", "lc_Round"],
				"es-cy" : ["lc_Round", "lc_c"],
				"te-cy" : ["lc_ShortStem", "lc_Shoulder"],
				"u-cy" : ["lc_vwy", "lc_vwy"],
				"ushort-cy" : ["lc_vwy", "lc_vwy"],
				"ef-cy" : ["lc_Round", "lc_Round"],
				"ha-cy" : ["lc_x", "lc_x"],
				"che-cy" : ["lc_che", "lc_ShortStem"],
				"tse-cy" : ["lc_i", "lc_StemTooth"],
				"sha-cy" : ["lc_i", "lc_ShortStem"],
				"shcha-cy" : ["lc_i", "lc_StemTooth"],
				"dzhe-cy" : ["lc_i", "lc_ShortStem"],
				"ia-cy" : ["", "lc_ShortStem"],
				"softsign-cy" : ["lc_i", "lc_softsign"],
				"hardsign-cy" : ["lc_hardsign", "lc_softsign"],
				"yeru-cy" : ["lc_i", "lc_ShortStem"],
				"lje-cy" : ["lc_el", "lc_softsign"],
				"nje-cy" : ["lc_ShortStem", "lc_softsign"],
				"ereversed-cy" : ["lc_ereversed", "lc_Round"],
				"e-cy" : ["lc_Round", "lc_c"],
				"yi-cy" : ["lc_i", "lc_ShortStem"],
				"i-cy" : ["lc_i", "lc_ShortStem"],
				"tshe-cy" : ["lc_LongStem", "lc_Shoulder"],
				"iu-cy" : ["lc_ShortStem", "lc_Round"],
				"dje-cy" : ["lc_LongStem", ""],
				"yat-cy" : ["lc_ShortStem", "lc_softsign"],
				"fita-cy" : ["lc_Round", "lc_Round"],
				"izhitsa-cy" : ["lc_vwy", ""],
				"ghestroke-cy" : ["lc_ge", "lc_ge"],
				"ghemiddlehook-cy" : ["lc_ShortStem", ""],
				"zhedescender-cy" : ["lc_zhe", "lc_zhe"],
				"zedescender-cy" : ["lc_ze", "lc_ze"],
				"kadescender-cy" : ["lc_ShortStem", "lc_k"],
				"kaverticalstroke-cy" : ["lc_ShortStem", "lc_k"],
				"kastroke-cy" : ["lc_LongStem", "lc_k"],
				"kabashkir-cy" : ["lc_hardsign", "lc_k"],
				"endescender-cy" : ["lc_ShortStem", "lc_StemTooth"],
				"pemiddlehook-cy" : ["lc_ShortStem", ""],
				"haabkhasian-cy" : ["lc_Round", ""],
				"esdescender-cy" : ["lc_Round", "lc_c"],
				"tedescender-cy" : ["lc_ShortStem", "lc_ShoulderTooth"],
				"ustrait-cy" : ["lc_ustait", "lc_ustait"],
				"ustraitstroke-cy" : ["lc_ustait", "lc_ustait"],
				"hadescender-cy" : ["lc_x", "lc_x"],
				"chedescender-cy" : ["lc_che", "lc_StemTooth"],
				"cheverticalstroke-cy" : ["lc_che", "lc_ShortStem"],
				"shha-cy" : ["lc_LongStem", "lc_Shoulder"],
				"cheabkhasian-cy" : ["lc_cheabkhaz", "lc_e"],
				"chedescenderabkhasian-cy" : ["lc_cheabkhaz", "lc_e"],
				"palochka-cy" : ["lc_LongStem", "lc_LongStem"],
				"zhebreve-cy" : ["lc_zhe", "lc_zhe"],
				"kahook-cy" : ["lc_ShortStem", ""],
				"eltail-cy" : ["lc_el", "lc_StemTooth"],
				"enhook-cy" : ["lc_ShortStem", "lc_StemHook"],
				"entail-cy" : ["lc_ShortStem", "lc_StemTooth"],
				"chekhakassian-cy" : ["lc_che", "lc_ShortStem"],
				"emtail-cy" : ["lc_el", "lc_StemTooth"],
				"abreve-cy" : ["lc_Round", "lc_ShortStem"],
				"adieresis-cy" : ["lc_Round", "lc_ShortStem"],
				"iebreve-cy" : ["lc_Round", "lc_e"],
				"schwa-cy" : ["lc_schwa", "lc_Round"],
				"schwadieresis-cy" : ["lc_schwa", "lc_Round"],
				"zhedieresis-cy" : ["lc_zhe", "lc_zhe"],
				"zedieresis-cy" : ["lc_ze", "lc_ze"],
				"imacron-cy" : ["lc_i", "lc_ShortStem"],
				"idieresis-cy" : ["lc_i", "lc_ShortStem"],
				"odieresis-cy" : ["lc_Round", "lc_Round"],
				"obarred-cy" : ["lc_Round", "lc_Round"],
				"obarreddieresis-cy" : ["lc_Round", "lc_Round"],
				"edieresis-cy" : ["lc_ze", "lc_Round"],
				"umacron-cy" : ["lc_vwy", "lc_vwy"],
				"udieresis-cy" : ["lc_vwy", "lc_vwy"],
				"uhungarumlaut-cy" : ["lc_vwy", "lc_vwy"],
				"chedieresis-cy" : ["lc_che", "lc_ShortStem"],
				"ghedescender-cy" : ["lc_ge", "lc_ge"],
				"yerudieresis-cy" : ["lc_i", "lc_ShortStem"],
				"hahook-cy" : ["lc_x", "lc_x"],
				"komide-cy" : ["lc_Round", "lc_LongStem"],
				"reversedze-cy" : ["", "lc_c"],
				"elhook-cy" : ["lc_el", "lc_StemHook"],
				"we-cy" : ["lc_vwy", ""],
				"pedescender-cy" : ["lc_ShortStem", "lc_ShoulderTooth"],
				"shhadescender-cy" : ["lc_LongStem", "lc_ShoulderTooth"],
				"ishorttail-cy" : ["lc_i", "lc_StemTooth"],
				"ertick-cy" : ["lc_er", ""],
				"enghe-cy" : ["lc_ShortStem", ""],
				"tetse-cy" : ["lc_te", ""],
				"aie-cy" : ["lc_Round", "lc_e"]
			}
	
			groupsMS = {
				"colon" : ["MSC_colon", "MSC_colon"],
				"comma" : ["MSC_period", "MSC_period"],
				"ellipsis" : ["MSC_period", "MSC_period"],
				"period" : ["MSC_period", "MSC_period"],
				"exclam" : ["MSC_exclam", "MSC_eclam"],
				"exclamdouble" : ["MSC_exclam", "MSC_eclam"],
				"quotedbl" : ["MSC_VertQuote", "MSC_VertQuote"],
				"quotesingle" : ["MSC_VertQuote", "MSC_VertQuote"],
				"semicolon" : ["MSC_colon", "MSC_colon"],
				"slash" : ["MSC_slash", "MSC_slash"],
				"braceleft" : ["", "MSC_bracketleft"],
				"braceright" : ["MSC_bracketright", ""],
				"bracketleft" : ["", "MSC_bracketleft"],
				"bracketright" : ["MSC_bracketright", ""],
				"parenleft" : ["", "MSC_bracketleft"],
				"parenright" : ["MSC_bracketright", ""],
				"emdash" : ["MSC_dash", "MSC_dash"],
				"endash" : ["MSC_dash", "MSC_dash"],
				"hyphen" : ["MSC_dash", "MSC_dash"],
				"horizontalbar" : ["MSC_dash", "MSC_dash"],
				"hyphentwo" : ["MSC_dash", "MSC_dash"],
				"softhyphen" : ["MSC_dash", "MSC_dash"],
				"guillemetleft" : ["MSC_guillemetleft", "MSC_guillemetleft"],
				"guillemetright" : ["MSC_guillemetright", "MSC_guillemetright"],
				"guilsinglleft" : ["MSC_guillemetleft", "MSC_guillemetleft"],
				"guilsinglright" : ["MSC_guillemetright", "MSC_guillemetright"],
				"quotedblbase" : ["MSC_period", "MSC_period"],
				"quotedblleft" : ["MSC_quoteleft", "MSC_quoteleft"],
				"quotedblright" : ["MSC_quoteright", "MSC_quoteright"],
				"quoteleft" : ["MSC_quoteleft", "MSC_quoteleft"],
				"quoteright" : ["MSC_quoteright", "MSC_quoteright"],
				"quotesinglbase" : ["MSC_period", "MSC_period"],
				"questiongreek" : ["MSC_colon", "MSC_colon"],
				"space" : ["MSC_space", "MSC_space"],
				"nbspace" : ["MSC_space", "MSC_space"],
				"divide" : ["MSC_minus", "MSC_minus"],
				"equal" : ["MSC_equal", "MSC_equal"],
				"greater" : ["", "MSC_minus"],
				"less" : ["MSC_minus", ""],
				"minus" : ["MSC_minus", "MSC_minus"],
				"notequal" : ["MSC_equal", "MSC_equal"],
				"percent" : ["MSC_percent", ""],
				"perthousand" : ["MSC_percent", ""],
				"plus" : ["MSC_minus", "MSC_minus"]
			}
	
			thisFont.disableUpdateInterface() # suppresses UI updates in Font View
			isNeeded = {}
			for glyph in thisFont.glyphs:
				isNeeded[glyph.name] = False
			if self.w.radioButton.get() == 1:
				for layer in thisFont.selectedLayers:
					isNeeded[layer.parent.name] = True
			else:
				for glyph in thisFont.glyphs:
					isNeeded[glyph.name] = True

			for key in groupsUC:
				if thisFont.glyphs[key] and isNeeded[key]:
					thisFont.glyphs[key].setLeftKerningGroup_(groupsUC[key][0])
					thisFont.glyphs[key].setRightKerningGroup_(groupsUC[key][1])
				if thisFont.glyphs[key.lower()+".sc"] and isNeeded[key]:
					thisFont.glyphs[key.lower()+".sc"].setLeftKerningGroup_(re.sub("UC_", "SC_",groupsUC[key][0]))
					thisFont.glyphs[key.lower()+".sc"].setRightKerningGroup_(re.sub("UC_", "SC_",groupsUC[key][1]))
				elif thisFont.glyphs[key.lower()+".smcp"] and isNeeded[key]:
					thisFont.glyphs[key.lower()+".smcp"].setLeftKerningGroup_(re.sub("UC_", "SC_",groupsUC[key][0]))
					thisFont.glyphs[key.lower()+".smcp"].setRightKerningGroup_(re.sub("UC_", "SC_",groupsUC[key][1]))

				if sender == self.w.allcapButton:
					if thisFont.glyphs[key] and isNeeded[key]:
						try:
							thisFont.glyphs[key.lower()].setLeftKerningGroup_(groupsUC[key][0])
							thisFont.glyphs[key.lower()].setRightKerningGroup_(groupsUC[key][1])
						except:
							print key.lower()

			for key in groupsMS:
				if thisFont.glyphs[key] and isNeeded[key]:
					thisFont.glyphs[key].setLeftKerningGroup_(groupsMS[key][0])
					thisFont.glyphs[key].setRightKerningGroup_(groupsMS[key][1])
				if (thisFont.glyphs[key.lower()+".case"] or thisFont.glyphs[key.lower()+".smcp"]) and isNeeded[key]:
					thisFont.glyphs[key].setLeftKerningGroup_(re.sub("MSC_", "MSC_UC_",groupsMS[key][0]))
					thisFont.glyphs[key].setRightKerningGroup_(re.sub("MSC_", "MSC_UC_",groupsMS[key][1]))
			
			if sender == self.w.normalButton:
				for key in groupsLCnormal:
					if thisFont.glyphs[key] and isNeeded[key]:
						thisFont.glyphs[key].setLeftKerningGroup_(groupsLCnormal[key][0])
						thisFont.glyphs[key].setRightKerningGroup_(groupsLCnormal[key][1])
			elif sender == self.w.cursiveButton:
				for key in groupsLCcursive:
					if thisFont.glyphs[key] and isNeeded[key]:
						thisFont.glyphs[key].setLeftKerningGroup_(groupsLCcursive[key][0])
						thisFont.glyphs[key].setRightKerningGroup_(groupsLCcursive[key][1])

			thisFont.enableUpdateInterface() # re-enables UI updates in Font View
					
			
			self.w.close() # delete if you want window to stay open
		except Exception, e:
			# brings macro window to front and reports error:
			Glyphs.showMacroWindow()
			print " SetKernPairsMain Error: %s" % e

Example 14

Project: PyIB
Source File: tenjin.py
View license
def _create_helpers_module():

    def to_str(val):
        """Convert value into string. Return '' if val is None.
           ex.
             >>> to_str(None)
             ''
             >>> to_str("foo")
             'foo'
             >>> to_str(u"\u65e5\u672c\u8a9e")
             u'\u65e5\u672c\u8a9e'
             >>> to_str(123)
             '123'
        """
        if val is None:              return ''
        if isinstance(val, str):     return val
        if isinstance(val, unicode): return val
        return str(val)

    def generate_tostrfunc(encoding):
        """Generate 'to_str' function which encodes unicode to str.
           ex.
              import tenjin
              from tenjin.helpers import escape
              to_str = tenjin.generate_tostrfunc('utf-8')
              engine = tenjin.Engine()
              context = { 'items': [u'AAA', u'BBB', u'CCC'] }
              output = engine.render('example.pyhtml')
              print output
        """
        def to_str(val):
            if val is None:               return ''
            if isinstance(val, str):      return val
            if isinstance(val, unicode):  return val.encode(encoding)
            return str(val)
        return to_str

    def echo(string):
        """add string value into _buf. this is equivarent to '#{string}'."""
        frame = sys._getframe(1)
        context = frame.f_locals
        context['_buf'].append(string)

    def start_capture(varname=None):
        """
        start capturing with name.

        ex. list.rbhtml
          <html><body>
          <?py start_capture('itemlist') ?>
            <ul>
              <?py for item in list: ?>
              <li>${item}</li>
              <?py #end ?>
            </ul>
          <?py stop_capture() ?>
          </body></html>

        ex. layout.rbhtml
          <html xml:lang="en" lang="en">
           <head>
            <title>Capture Example</title>
           </head>
           <body>
            <!-- content -->
          #{itemlist}
            <!-- /content -->
           </body>
          </html>
        """
        frame = sys._getframe(1)
        context = frame.f_locals
        context['_buf_tmp'] = context['_buf']
        context['_capture_varname'] = varname
        context['_buf'] = []

    def stop_capture(store_to_context=True):
        """
        stop capturing and return the result of capturing.
        if store_to_context is True then the result is stored into _context[varname].
        """
        frame = sys._getframe(1)
        context = frame.f_locals
        result = ''.join(context['_buf'])
        context['_buf'] = context.pop('_buf_tmp')
        varname = context.pop('_capture_varname')
        if varname:
            context[varname] = result
            if store_to_context:
                context['_context'][varname] = result
        return result

    def captured_as(name):
        """
        helper method for layout template.
        if captured string is found then append it to _buf and return True,
        else return False.
        """
        frame = sys._getframe(1)
        context = frame.f_locals
        if context.has_key(name):
            _buf = context['_buf']
            _buf.append(context[name])
            return True
        return False

    def _p(arg):
        """ex. '/show/'+_p("item['id']") => "/show/#{item['id']}" """
        return '<`#%s#`>' % arg    # decoded into #{...} by preprocessor

    def _P(arg):
        """ex. '<b>%s</b>' % _P("item['id']") => "<b>${item['id']}</b>" """
        return '<`$%s$`>' % arg    # decoded into ${...} by preprocessor

    def _decode_params(s):
        """decode <`#...#`> and <`$...$`> into #{...} and ${...}"""
        from urllib import unquote
        dct = { 'lt':'<', 'gt':'>', 'amp':'&', 'quot':'"', '#039':"'", }
        def unescape(s):
            #return s.replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"').replace('&#039;', "'").replace('&amp;',  '&')
            return re.sub(r'&(lt|gt|quot|amp|#039);',  lambda m: dct[m.group(1)],  s)
        s = re.sub(r'%3C%60%23(.*?)%23%60%3E', lambda m: '#{%s}' % unquote(m.group(1)), s)
        s = re.sub(r'%3C%60%24(.*?)%24%60%3E', lambda m: '${%s}' % unquote(m.group(1)), s)
        s = re.sub(r'&lt;`#(.*?)#`&gt;',   lambda m: '#{%s}' % unescape(m.group(1)), s)
        s = re.sub(r'&lt;`\$(.*?)\$`&gt;', lambda m: '${%s}' % unescape(m.group(1)), s)
        s = re.sub(r'<`#(.*?)#`>', r'#{\1}', s)
        s = re.sub(r'<`\$(.*?)\$`>', r'${\1}', s)
        return s

    mod = _create_module('tenjin.helpers')
    mod.to_str             = to_str
    mod.generate_tostrfunc = generate_tostrfunc
    mod.echo               = echo
    mod.start_capture      = start_capture
    mod.stop_capture       = stop_capture
    mod.captured_as        = captured_as
    mod._p                 = _p
    mod._P                 = _P
    mod._decode_params     = _decode_params
    mod.__all__ = ['escape', 'to_str', 'echo', 'generate_tostrfunc',
                   'start_capture', 'stop_capture', 'captured_as',
                   '_p', '_P', '_decode_params',
                   ]
    return mod
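
The unescape helper inside _decode_params is a common re.sub pattern: a single alternation over the entity names and a lambda that maps each match through a dict. A standalone sketch of the same idea, using only the entities tenjin handles:

import re

ENTITIES = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', '#039': "'"}

def unescape(s):
    # m.group(1) is the entity name without the surrounding '&' and ';'
    return re.sub(r'&(lt|gt|quot|amp|#039);', lambda m: ENTITIES[m.group(1)], s)

print(unescape('&lt;b&gt;&quot;x&quot; &amp; &#039;y&#039;&lt;/b&gt;'))
# <b>"x" & 'y'</b>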

Example 15

Project: pelisalacarta
Source File: peliculasdk.py
View license
def fanart(item):
    logger.info("pelisalacarta.peliculasdk fanart")
    itemlist = []
    url = item.url
    
    data = scrapertools.cachePage(url)
    data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
    title= scrapertools.get_match(data,'<title>Ver Película(.*?) \(')
    title= re.sub(r"3D|SBS|-|","",title)
    title= title.replace('á','a')
    title= title.replace('Á','A')
    title= title.replace('é','e')
    title= title.replace('í','i')
    title= title.replace('ó','o')
    title= title.replace('ú','u')
    title= title.replace('ñ','n')
    title= title.replace('Crepusculo','Twilight')
    title= title.replace(' ','%20')
    url="http://api.themoviedb.org/3/search/movie?api_key="+Tmdb_key+"&query=" + title + "&language=es&include_adult=false"
    data = scrapertools.cachePage(url)
    data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
    patron = '"page":1.*?,"id":(.*?),.*?"backdrop_path":"\\\(.*?)"'
    matches = re.compile(patron,re.DOTALL).findall(data)
    if len(matches)==0:
        extra=item.thumbnail
        show= item.thumbnail
        posterdb = item.thumbnail
        fanart_info = item.thumbnail
        fanart_trailer = item.thumbnail
        category= item.thumbnail
        itemlist.append( Item(channel=item.channel, title=item.title, url=item.url, action="findvideos", thumbnail=item.thumbnail, fanart=item.thumbnail ,extra=extra, show=show, category= category, folder=True) )
    else:
        for id, fan in matches:
            try:
                posterdb = scrapertools.get_match(data,'"page":1,.*?"poster_path":"\\\(.*?)"')
                posterdb =  "https://image.tmdb.org/t/p/original" + posterdb
            except:
                posterdb = item.thumbnail
            fanart="https://image.tmdb.org/t/p/original" + fan
            item.extra= fanart
            url ="http://api.themoviedb.org/3/movie/"+id+"/images?api_key="+Tmdb_key
            data = scrapertools.cachePage( url )
            data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
            
            patron = '"backdrops".*?"file_path":".*?",.*?"file_path":"(.*?)",.*?"file_path":"(.*?)",.*?"file_path":"(.*?)"'
            matches = re.compile(patron,re.DOTALL).findall(data)
                    
            if len(matches) == 0:
                patron = '"backdrops".*?"file_path":"(.*?)",.*?"file_path":"(.*?)",.*?"file_path":"(.*?)"'
                matches = re.compile(patron,re.DOTALL).findall(data)
                if len(matches) == 0:
                    fanart_info = item.extra
                    fanart_trailer = item.extra
                    fanart_2 = item.extra
            for fanart_info, fanart_trailer, fanart_2 in matches:
                        fanart_info = "https://image.tmdb.org/t/p/original" + fanart_info
                        fanart_trailer = "https://image.tmdb.org/t/p/original" + fanart_trailer
                        fanart_2 = "https://image.tmdb.org/t/p/original" + fanart_2
        
    #fanart_2 and arts
    
            url ="http://webservice.fanart.tv/v3/movies/"+id+"?api_key=dffe90fba4d02c199ae7a9e71330c987"
            data = scrapertools.cachePage(url)
            data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
            patron = '"hdmovielogo":.*?"url": "([^"]+)"'
            matches = re.compile(patron,re.DOTALL).findall(data)
            if '"moviedisc"' in data:
                disc = scrapertools.get_match(data,'"moviedisc":.*?"url": "([^"]+)"')
            if '"movieposter"' in data:
                poster = scrapertools.get_match(data,'"movieposter":.*?"url": "([^"]+)"')
            if '"moviethumb"' in data:
                thumb = scrapertools.get_match(data,'"moviethumb":.*?"url": "([^"]+)"')
            if '"moviebanner"' in data:
                 banner= scrapertools.get_match(data,'"moviebanner":.*?"url": "([^"]+)"')
        
            if len(matches)==0:
               extra=  posterdb
               show = fanart_2
               category = item.extra
               itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=posterdb, fanart=item.extra,  extra=extra, show=show, category= category, folder=True) )
        for logo in matches:
            if '"hdmovieclearart"' in data:
                clear=scrapertools.get_match(data,'"hdmovieclearart":.*?"url": "([^"]+)"')
                if '"moviebackground"' in data:
                     extra=clear
                     show= fanart_2
                     if '"moviedisc"' in data:
                        category= disc
                     else:
                         category= clear
                     itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=logo, fanart=item.extra, extra=extra,show=show, category= category, folder=True) )
                else:
                    extra= clear
                    show=fanart_2
                    if '"moviedisc"' in data:
                       category = disc
                    else:
                        category = clear
                    itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=logo, fanart=item.extra, extra=extra,show=show, category= category, folder=True) )
                
            if '"moviebackground"' in data:
                
                if '"hdmovieclearart"' in data:
                    clear=scrapertools.get_match(data,'"hdmovieclearart":.*?"url": "([^"]+)"')
                    extra=clear
                    show= fanart_2
                    if '"moviedisc"' in data:
                        category= disc
                    else:
                        category= clear
                    
                else:
                    extra=logo
                    show= fanart_2
                    if '"moviedisc"' in data:
                        category= disc
                    else:
                        category= logo
                    itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=logo, fanart=item.extra, extra=extra,show=show, category= category,  folder=True) )

            if not '"hdmovieclearart"' in data and not '"moviebackground"' in data:
                    extra= logo
                    show=  fanart_2
                    if '"moviedisc"' in data:
                        category= disc
                    else:
                         category= item.extra
                    itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=logo, fanart=item.extra,category= category, extra=extra,show=show ,  folder=True) )

    title ="Info"
   
    if posterdb == item.thumbnail:
       if '"movieposter"' in data:
           thumbnail= poster
       else:
           thumbnail = item.thumbnail
    else:
        thumbnail = posterdb



    

    title = title.replace(title,bbcode_kodi2html("[COLOR skyblue]"+title+"[/COLOR]"))
    itemlist.append( Item(channel=item.channel, action="info" , title=title , url=item.url, thumbnail=posterdb, fanart=fanart_info, extra = extra, show = show,folder=False ))

    title= bbcode_kodi2html("[COLOR crimson]Trailer[/COLOR]")
    
    if len(item.extra)==0:
        fanart=item.thumbnail
    else:
        fanart = item.extra



    if '"moviethumb"' in data:
        thumbnail = thumb
    else:
        thumbnail = posterdb

    if '"moviebanner"' in data:
        extra= banner
    else:
        if 'hdmovieclearart"' in data:
            extra = clear
        
        else:
            extra = posterdb



    itemlist.append( Item(channel=item.channel, action="trailer", title=title , url=item.url , thumbnail=thumbnail , fulltitle = item.title , fanart=fanart_trailer, extra=extra, plot = item.plot,folder=True) )


    return itemlist
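
Before every match, the scraper above flattens the fetched page with the same call, re.sub(r"\n|\r|\t|\s{2}|&nbsp;", "", data). A standalone sketch of that cleanup step on a small, made-up HTML snippet:

import re

def clean(data):
    # Collapse the page onto one line: drop newlines, carriage returns, tabs,
    # two-space runs and &nbsp; entities so the scraping patterns can match.
    return re.sub(r"\n|\r|\t|\s{2}|&nbsp;", "", data)

html = "<title>\n\tVer Pelicula Example&nbsp;(2016)\n</title>"
print(clean(html))
# <title>Ver Pelicula Example(2016)</title>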

Example 16

Project: aceproxy
Source File: p2pproxy_plugin.py
View license
    def handle(self, connection):
        P2pproxy.logger.debug('Handling request')

        hostport = connection.headers['Host']

        query = urlparse.urlparse(connection.path).query
        self.params = urlparse.parse_qs(query)

        if connection.reqtype == 'channels':  # /channels/ branch
            if len(connection.splittedpath) == 3 and connection.splittedpath[2].split('?')[
                0] == 'play':  # /channels/play?id=[id]
                channel_id = self.get_param('id')
                if not channel_id:
                    # /channels/play?id=&_=[epoch timestamp] is Torrent-TV widget proxy check
                    # P2pProxy simply closes connection on this request sending Server header, so do we
                    if self.get_param('_'):
                        P2pproxy.logger.debug('Status check')
                        connection.send_response(200)
                        connection.send_header('Access-Control-Allow-Origin', '*')
                        connection.send_header('Connection', 'close')
                        connection.send_header('Content-Type', 'text/plain;charset=utf-8')
                        connection.send_header('Server', 'P2pProxy/1.0.3.1 AceProxy')
                        connection.wfile.write('\r\n')
                        return
                    else:
                        connection.dieWithError()  # Bad request
                        return

                stream_url = None

                session = TorrentTvApi.auth(config.email, config.password)
                stream_type, stream = TorrentTvApi.stream_source(session, channel_id)

                if stream_type == 'torrent':
                    stream_url = re.sub('^(http.+)$',
                                        lambda match: '/torrent/' + urllib2.quote(match.group(0), '') + '/stream.mp4',
                                        stream)
                elif stream_type == 'contentid':
                    stream_url = re.sub('^([0-9a-f]{40})',
                                        lambda match: '/pid/' + urllib2.quote(match.group(0), '') + '/stream.mp4',
                                        stream)
                connection.path = stream_url
                connection.splittedpath = stream_url.split('/')
                connection.reqtype = connection.splittedpath[1].lower()
                connection.handleRequest(False)
            elif self.get_param('type') == 'm3u':  # /channels/?filter=[filter]&group=[group]&type=m3u
                connection.send_response(200)
                connection.send_header('Content-Type', 'application/x-mpegurl')
                connection.end_headers()

                param_group = self.get_param('group')
                param_filter = self.get_param('filter')
                if not param_filter:
                    param_filter = 'all'  # default filter

                session = TorrentTvApi.auth(config.email, config.password)
                translations_list = TorrentTvApi.translations(session, param_filter)

                playlistgen = PlaylistGenerator()
                P2pproxy.logger.debug('Generating requested m3u playlist')
                for channel in translations_list:
                    group_id = channel.getAttribute('group')

                    if param_group and param_group != 'all' and param_group != group_id:  # filter channels by group
                        continue

                    name = channel.getAttribute('name')
                    group = TorrentTvApi.CATEGORIES[int(group_id)].decode('utf-8')
                    cid = channel.getAttribute('id')
                    logo = channel.getAttribute('logo')
                    if config.fullpathlogo:
                        logo = 'http://torrent-tv.ru/uploads/' + logo

                    fields = {'name': name, 'id': cid, 'url': cid, 'group': group, 'logo': logo}
                    fields['tvgid'] = config.tvgid %fields
                    playlistgen.addItem(fields)

                P2pproxy.logger.debug('Exporting')
                header = '#EXTM3U url-tvg="%s" tvg-shift=%d\n' %(config.tvgurl, config.tvgshift)
                exported = playlistgen.exportm3u(hostport=hostport, header=header)
                exported = exported.encode('utf-8')
                connection.wfile.write(exported)
            else:  # /channels/?filter=[filter]
                param_filter = self.get_param('filter')
                if not param_filter:
                    param_filter = 'all'  # default filter

                session = TorrentTvApi.auth(config.email, config.password)
                translations_list = TorrentTvApi.translations(session, param_filter, True)

                P2pproxy.logger.debug('Exporting')

                connection.send_response(200)
                connection.send_header('Access-Control-Allow-Origin', '*')
                connection.send_header('Connection', 'close')
                connection.send_header('Content-Length', str(len(translations_list)))
                connection.send_header('Content-Type', 'text/xml;charset=utf-8')
                connection.end_headers()
                connection.wfile.write(translations_list)
        elif connection.reqtype == 'xbmc.pvr':  # same as /channels request
            if len(connection.splittedpath) == 3 and connection.splittedpath[2] == 'playlist':
                session = TorrentTvApi.auth(config.email, config.password)
                translations_list = TorrentTvApi.translations(session, 'all', True)

                P2pproxy.logger.debug('Exporting')

                connection.send_response(200)
                connection.send_header('Access-Control-Allow-Origin', '*')
                connection.send_header('Connection', 'close')
                connection.send_header('Content-Length', str(len(translations_list)))
                connection.send_header('Content-Type', 'text/xml;charset=utf-8')
                connection.end_headers()
                connection.wfile.write(translations_list)
        elif connection.reqtype == 'archive':  # /archive/ branch
            if len(connection.splittedpath) == 3 and connection.splittedpath[2] == 'channels':  # /archive/channels

                session = TorrentTvApi.auth(config.email, config.password)
                archive_channels = TorrentTvApi.archive_channels(session, True)

                P2pproxy.logger.debug('Exporting')

                connection.send_response(200)
                connection.send_header('Access-Control-Allow-Origin', '*')
                connection.send_header('Connection', 'close')
                connection.send_header('Content-Length', str(len(archive_channels)))
                connection.send_header('Content-Type', 'text/xml;charset=utf-8')
                connection.end_headers()
                connection.wfile.write(archive_channels)
            elif len(connection.splittedpath) == 3 and connection.splittedpath[2].split('?')[0] == 'play':  # /archive/play?id=[record_id]
                record_id = self.get_param('id')
                if not record_id:
                    connection.dieWithError()  # Bad request
                    return

                stream_url = None

                session = TorrentTvApi.auth(config.email, config.password)
                stream_type, stream = TorrentTvApi.archive_stream_source(session, record_id)

                if stream_type == 'torrent':
                    stream_url = re.sub('^(http.+)$',
                                        lambda match: '/torrent/' + urllib2.quote(match.group(0), '') + '/stream.mp4',
                                        stream)
                elif stream_type == 'contentid':
                    stream_url = re.sub('^([0-9a-f]{40})',
                                        lambda match: '/pid/' + urllib2.quote(match.group(0), '') + '/stream.mp4',
                                        stream)
                connection.path = stream_url
                connection.splittedpath = stream_url.split('/')
                connection.reqtype = connection.splittedpath[1].lower()
                connection.handleRequest(False)
            elif self.get_param('type') == 'm3u':  # /archive/?type=m3u&date=[param_date]&channel_id=[param_channel]
                connection.send_response(200)
                connection.send_header('Content-Type', 'application/x-mpegurl')
                connection.end_headers()

                param_date = self.get_param('date')
                if not param_date:
                    d = date.today()  # consider default date as today if not given
                else:
                    try:
                        param_date = param_date.split('-')
                        d = date(int(param_date[2]), int(param_date[1]), int(param_date[0]))
                    except (IndexError, ValueError):
                        P2pproxy.logger.error('date param is not correct!')
                        connection.dieWithError()
                        return
                param_channel = self.get_param('channel_id')
                if not param_channel:
                    P2pproxy.logger.error('Got /archive/ request but no channel_id specified!')
                    connection.dieWithError()
                    return

                session = TorrentTvApi.auth(config.email, config.password)
                records_list = TorrentTvApi.records(session, param_channel, d.strftime('%d-%m-%Y'))
                channels_list = TorrentTvApi.archive_channels(session)

                playlistgen = PlaylistGenerator()
                P2pproxy.logger.debug('Generating archive m3u playlist')
                for record in records_list:
                    record_id = record.getAttribute('record_id')
                    name = record.getAttribute('name')
                    channel_id = record.getAttribute('channel_id')
                    channel_name = ''
                    logo = ''
                    for channel in channels_list:
                        if channel.getAttribute('channel_id') == channel_id:
                            channel_name = channel.getAttribute('name')
                            logo = channel.getAttribute('logo')

                    if channel_name != '':
                        name = '(' + channel_name + ') ' + name
                    if logo != '' and config.fullpathlogo:
                        logo = 'http://torrent-tv.ru/uploads/' + logo

                    playlistgen.addItem({'name': name, 'url': record_id, 'logo': logo})

                P2pproxy.logger.debug('Exporting')
                exported = playlistgen.exportm3u(hostport, empty_header=True, archive=True)
                exported = exported.encode('utf-8')
                connection.wfile.write(exported)
            else:  # /archive/?date=[param_date]&channel_id=[param_channel]
                param_date = self.get_param('date')
                if not param_date:
                    d = date.today()
                else:
                    try:
                        param_date = param_date.split('-')
                        d = date(int(param_date[2]), int(param_date[1]), int(param_date[0]))
                    except (IndexError, ValueError):
                        P2pproxy.logger.error('date param is not correct!')
                        connection.dieWithError()
                        return
                param_channel = self.get_param('channel_id')
                if not param_channel:
                    P2pproxy.logger.error('Got /archive/ request but no channel_id specified!')
                    connection.dieWithError()
                    return

                session = TorrentTvApi.auth(config.email, config.password)
                records_list = TorrentTvApi.records(session, param_channel, d.strftime('%d-%m-%Y'), True)

                P2pproxy.logger.debug('Exporting')

                connection.send_response(200)
                connection.send_header('Access-Control-Allow-Origin', '*')
                connection.send_header('Connection', 'close')
                connection.send_header('Content-Length', str(len(records_list)))
                connection.send_header('Content-Type', 'text/xml;charset=utf-8')
                connection.end_headers()
                connection.wfile.write(records_list)
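
Both the live-channel branch and the archive branch above convert a stream source returned by the API into a local proxy path with the same re.sub call: a lambda replacement URL-quotes the whole match and wraps it in either /torrent/<quoted>/stream.mp4 or /pid/<quoted>/stream.mp4. Below is a minimal, self-contained sketch of that rewrite; the helper name to_proxy_path and the sample URL are illustrative, while the patterns, prefixes and use of urllib2.quote are taken from the listing.

import re
import urllib2  # Python 2, as in the listing; on Python 3 use urllib.parse.quote


def to_proxy_path(stream_type, stream):
    """Rewrite a torrent URL or a 40-character content id into a local proxy path."""
    if stream_type == 'torrent':
        # quote the whole URL (safe='') so it survives as a single path segment
        return re.sub('^(http.+)$',
                      lambda m: '/torrent/' + urllib2.quote(m.group(0), '') + '/stream.mp4',
                      stream)
    elif stream_type == 'contentid':
        return re.sub('^([0-9a-f]{40})',
                      lambda m: '/pid/' + urllib2.quote(m.group(0), '') + '/stream.mp4',
                      stream)
    return None

# to_proxy_path('torrent', 'http://example.org/file.torrent')
# -> '/torrent/http%3A%2F%2Fexample.org%2Ffile.torrent/stream.mp4'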

Example 17

View license
def update_foreign_fields(old_id, node):
    dry_run = '--dry' in sys.argv
    logger.info('* Updating ForeignFields for node {}->{}'.format(old_id, node))

    bns_owner = list(database['boxnodesettings'].find({'owner': old_id}))
    if bns_owner:
        logger.info('** Updating {} BoxNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in bns_owner]))
        for doc in bns_owner:
            database['boxnodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    bus_og = list(database['boxusersettings'].find({'oauth_grants.{}'.format(old_id): {'$ne': None}}))
    if bus_og:
        logger.info('** Updating {} BoxUserSettings (oauth_grants) {}'.format(old_id, [d['_id'] for d in bus_og]))
        for doc in bus_og:
            og = doc['oauth_grants']
            og[node._id] = og.pop(old_id)
            database['boxusersettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'oauth_grants': og
                }}
            )
    advns_o = list(database['addondataversenodesettings'].find({'owner': old_id}))        
    if advns_o:
        logger.info('** Updating {} AddonDataverseNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in advns_o]))
        for doc in advns_o:
            database['addondataversenodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    advus_og = list(database['addondataverseusersettings'].find({'oauth_grants.{}'.format(old_id): {'$ne': None}}))
    if advus_og:
        logger.info('** Updating {} AddonDataverseUserSettings (oauth_grants) {}'.format(old_id, [d['_id'] for d in advus_og]))
        for doc in advus_og:
            og = doc['oauth_grants']
            og[node._id] = og.pop(old_id)
            database['addondataverseusersettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'oauth_grants': og
                }}
            )

    dbns_o = list(database['dropboxnodesettings'].find({'owner': old_id}))
    if dbns_o:
        logger.info('** Updating {} DropboxNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in dbns_o]))
        for doc in dbns_o:
            database['dropboxnodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    dbus_og = list(database['dropboxusersettings'].find({'oauth_grants.{}'.format(old_id): {'$ne': None}}))
    if dbus_og:
        logger.info('** Updating {} DropboxUserSettings (oauth_grants) {}'.format(old_id, [d['_id'] for d in dbus_og]))
        for doc in dbus_og:
            og = doc['oauth_grants']
            og[node._id] = og.pop(old_id)
            database['dropboxusersettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'oauth_grants': og
                }}
            )

    afsns_o = list(database['addonfigsharenodesettings'].find({'owner': old_id}))
    if afsns_o:
        logger.info('** Updating {} AddonFigShareNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in afsns_o]))
        for doc in afsns_o:
            database['addonfigsharenodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    ## Figshare has no oauth_grants

    fwns_o = list(database['forwardnodesettings'].find({'owner': old_id}))
    if fwns_o:
        logger.info('** Updating {} ForwardNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in fwns_o]))
        for doc in fwns_o:
            database['forwardnodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    ghns_o = list(database['githubnodesettings'].find({'owner': old_id}))
    if ghns_o:
        logger.info('** Updating {} GithubNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in ghns_o]))
        for doc in ghns_o:
            database['githubnodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    ghus_og = list(database['githubusersettings'].find({'oauth_grants.{}'.format(old_id): {'$ne': None}}))
    if ghus_og:
        logger.info('** Updating {} GithubUserSettings (oauth_grants) {}'.format(old_id, [d['_id'] for d in ghus_og]))
        for doc in ghus_og:
            og = doc['oauth_grants']
            og[node._id] = og.pop(old_id)
            database['githubusersettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'oauth_grants': og
                }}
            )

    gdns_o = list(database['googledrivenodesettings'].find({'owner': old_id}))
    if gdns_o:
        logger.info('** Updating {} GoogleDriveNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in gdns_o]))
        for doc in gdns_o:
            database['googledrivenodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    gdus_og = list(database['googledriveusersettings'].find({'oauth_grants.{}'.format(old_id): {'$ne': None}}))
    if gdus_og:
        logger.info('** Updating {} GoogleDriveUserSettings (oauth_grants) {}'.format(old_id, [d['_id'] for d in gdus_og]))
        for doc in gdus_og:
            og = doc['oauth_grants']
            og[node._id] = og.pop(old_id)
            database['googledriveusersettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'oauth_grants': og
                }}
            )

    mns_o = list(database['mendeleynodesettings'].find({'owner': old_id}))
    if mns_o:
        logger.info('** Updating {} MendeleyNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in mns_o]))
        for doc in mns_o:
            database['mendeleynodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    mus_og = list(database['mendeleyusersettings'].find({'oauth_grants.{}'.format(old_id): {'$ne': None}}))
    if mus_og:
        logger.info('** Updating {} MendeleyUserSettings (oauth_grants) {}'.format(old_id, [d['_id'] for d in mus_og]))
        for doc in mus_og:
            og = doc['oauth_grants']
            og[node._id] = og.pop(old_id)
            database['mendeleyusersettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'oauth_grants': og
                }}
            )

    osfsns_o = list(database['osfstoragenodesettings'].find({'owner': old_id}))
    if osfsns_o:
        logger.info('** Updating {} OsfStorageNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in osfsns_o]))
        for doc in osfsns_o:
            database['osfstoragenodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    ocns_o = list(database['addonowncloudnodesettings'].find({'owner': old_id}))
    if ocns_o:
        logger.info('** Updating {} AddonOwnCloudNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in ocns_o]))
        for doc in ocns_o:
            database['addonowncloudnodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    ocus_og = list(database['addonowncloudusersettings'].find({'oauth_grants.{}'.format(old_id): {'$ne': None}}))
    if ocus_og:
        logger.info('** Updating {} AddonOwnCloudUserSettings (oauth_grants) {}'.format(old_id, [d['_id'] for d in ocus_og]))
        for doc in ocus_og:
            og = doc['oauth_grants']
            og[node._id] = og.pop(old_id)
            database['addonowncloudusersettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'oauth_grants': og
                }}
            )

    s3ns_o = list(database['s3nodesettings'].find({'owner': old_id}))
    if s3ns_o:
        logger.info('** Updating {} s3NodeSettings (owner) {}'.format(old_id, [d['_id'] for d in s3ns_o]))
        for doc in s3ns_o:
            database['s3nodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    s3us_og = list(database['s3usersettings'].find({'oauth_grants.{}'.format(old_id): {'$ne': None}}))
    if s3us_og:
        logger.info('** Updating {} S3UserSettings (oauth_grants) {}'.format(old_id, [d['_id'] for d in s3us_og]))
        for doc in s3us_og:
            og = doc['oauth_grants']
            og[node._id] = og.pop(old_id)
            database['s3usersettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'oauth_grants': og
                }}
            )

    awns_o = list(database['addonwikinodesettings'].find({'owner': old_id}))
    if awns_o:
        logger.info('** Updating {} AddonWikiNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in awns_o]))
        for doc in awns_o:
            database['addonwikinodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    nwp_n = list(database['nodewikipage'].find({'node': old_id}))
    if nwp_n:
        logger.info('** Updating {} NodeWikiPage (node) {}'.format(old_id, [d['_id'] for d in nwp_n]))
        for doc in nwp_n:
            database['nodewikipage'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'node': node._id
                }}
            )

    zns_o = list(database['zoteronodesettings'].find({'owner': old_id}))
    if zns_o:
        logger.info('** Updating {} ZoteroNodeSettings (owner) {}'.format(old_id, [d['_id'] for d in zns_o]))
        for doc in zns_o:
            database['zoteronodesettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'owner': node._id
                }}
            )

    zus_og = list(database['zoterousersettings'].find({'oauth_grants.{}'.format(old_id): {'$ne': None}}))
    if zus_og:
        logger.info('** Updating {} ZoteroUserSettings (oauth_grants) {}'.format(old_id, [d['_id'] for d in zus_og]))
        for doc in zus_og:
            og = doc['oauth_grants']
            og[node._id] = og.pop(old_id)
            database['zoterousersettings'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'oauth_grants': og
                }}
            )

    aj_sn = list(database['archivejob'].find({'src_node': old_id}))
    if aj_sn:
        logger.info('** Updating {} ArchiveJobs (src_node) {}'.format(old_id, [d['_id'] for d in aj_sn]))
        for doc in aj_sn:
            database['archivejob'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'src_node': node._id
                }}
            )

    tfn_n = list(database['trashedfilenode'].find({'node': old_id}))
    if tfn_n:
        logger.info('** Updating {} TrashedFileNodes (node) {}'.format(old_id, [d['_id'] for d in tfn_n]))
        for doc in tfn_n:
            doc.pop('deleted_on')  # Remove non-JSON-serializable datetime fields
            doc.pop('last_touched')
            hist_mods = [h.pop('modified') for h in doc['history']]
            replacement = json.loads(re.sub(r'\b{}\b'.format(old_id), node._id, json.dumps(doc)))
            for i, mod in enumerate(hist_mods):
                replacement['history'][i]['modified'] = mod
            database['trashedfilenode'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'node': replacement['node'],
                    'history': replacement['history']
                }}
            )

    sfn_n = list(database['storedfilenode'].find({'node': old_id}))
    if sfn_n:
        logger.info('** Updating {} StoredFileNodes (node) {}'.format(old_id, [d['_id'] for d in sfn_n]))
        for doc in sfn_n:
            doc.pop('last_touched')  # Remove non-JSON-serializable datetime fields
            hist_mods = [h.pop('modified') for h in doc['history']]
            replacement = json.loads(re.sub(r'\b{}\b'.format(old_id), node._id, json.dumps(doc)))
            for i, mod in enumerate(hist_mods):
                replacement['history'][i]['modified'] = mod
            database['storedfilenode'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'node': replacement['node'],
                    'history': replacement['history']
                }}
            )

    com_n = list(database['comment'].find({'node': old_id}))
    if com_n:
        logger.info('** Updating {} Comments (node) {}'.format(old_id, [d['_id'] for d in com_n]))
        for doc in com_n:
            database['comment'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'node': node._id
                }}
            )

    com_t = list(database['comment'].find({'target': {'$in': [old_id]}}))
    if com_t:
        logger.info('** Updating {} Comments (target) {}'.format(old_id, [d['_id'] for d in com_t]))
        for doc in com_t:
            targ = doc['target']
            targ.insert(targ.index(old_id), node._id)
            targ.remove(old_id)
            database['comment'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'target': targ
                }}
            )

    com_t = list(database['comment'].find({'root_target': {'$in': [old_id]}}))
    if com_t:
        logger.info('** Updating {} Comments (root_target) {}'.format(old_id, [d['_id'] for d in com_t]))
        for doc in com_t:
            rtarg = doc['root_target']
            rtarg.insert(rtarg.index(old_id), node._id)
            rtarg.remove(old_id)
            database['comment'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'root_target': rtarg
                }}
            )

    nl_on = list(database['nodelog'].find({'original_node': old_id}))
    if nl_on:
        logger.info('** Updating {} NodeLogs (original_node) {}'.format(old_id, [d['_id'] for d in nl_on]))
        for doc in nl_on:
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'original_node': node._id
                }}
            )

    nl_n = list(database['nodelog'].find({'node': old_id}))
    if nl_n:
        logger.info('** Updating {} NodeLogs (node) {}'.format(old_id, [d['_id'] for d in nl_n]))
        for doc in nl_n:
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'node': node._id
                }}
            )

    nl_pac = list(database['nodelog'].find({'params.auth.callback_url': {'$regex': '/{}/'.format(old_id)}}))
    if nl_pac:
        logger.info('** Updating {} NodeLogs (params.auth.callback_url) {}'.format(old_id, [d['_id'] for d in nl_pac]))
        for doc in nl_pac:
            params = doc['params']
            params['auth']['callback_url'] = params['auth']['callback_url'].replace('{}/'.format(old_id), '{}/'.format(node._id))
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_pn = list(database['nodelog'].find({'params.node': old_id}))
    if nl_pn:
        logger.info('** Updating {} NodeLogs (params.node) {}'.format(old_id, [d['_id'] for d in nl_pn]))
        for doc in nl_pn:
            params = doc['params']
            params['node'] = node._id
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_ppar = list(database['nodelog'].find({'params.parent': old_id}))
    if nl_ppar:
        logger.info('** Updating {} NodeLogs (params.parent) {}'.format(old_id, [d['_id'] for d in nl_ppar]))
        for doc in nl_ppar:
            params = doc['params']
            params['parent'] = node._id
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_ppro = list(database['nodelog'].find({'params.project': old_id}))
    if nl_ppro:
        logger.info('** Updating {} NodeLogs (params.project) {}'.format(old_id, [d['_id'] for d in nl_ppro]))
        for doc in nl_ppro:
            params = doc['params']
            params['project'] = node._id
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_ppn = list(database['nodelog'].find({'params.parent_node': old_id}))
    if nl_ppn:
        logger.info('** Updating {} NodeLogs (params.parent_node) {}'.format(old_id, [d['_id'] for d in nl_ppn]))
        for doc in nl_ppn:
            params = doc['params']
            params['parent_node'] = node._id
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_pdn = list(database['nodelog'].find({'params.destination.nid': old_id}))
    if nl_pdn:
        logger.info('** Updating {} NodeLogs (params.destination.nid) {}'.format(old_id, [d['_id'] for d in nl_pdn]))
        for doc in nl_pdn:
            params = doc['params']
            params['destination']['nid'] = node._id
            if params['destination'].get('url', None):
                params['destination']['url'] = params['destination']['url'].replace('{}/'.format(old_id), '{}/'.format(node._id))
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_pdr = list(database['nodelog'].find({'params.destination.resource': old_id}))
    if nl_pdr:
        logger.info('** Updating {} NodeLogs (params.destination.resource) {}'.format(old_id, [d['_id'] for d in nl_pdr]))
        for doc in nl_pdr:
            params = doc['params']
            params['destination']['resource'] = node._id
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_pdni = list(database['nodelog'].find({'params.destination.node._id': old_id}))
    if nl_pdni:
        logger.info('** Updating {} NodeLogs (params.destination.node._id) {}'.format(old_id, [d['_id'] for d in nl_pdni]))
        for doc in nl_pdni:
            params = doc['params']
            params['destination']['node']['_id'] = node._id
            if params['destination']['node'].get('url', None):
                params['destination']['node']['url'] = params['destination']['node']['url'].replace('{}/'.format(old_id), '{}/'.format(node._id))
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_ppi = list(database['nodelog'].find({'params.pointer.id': old_id}))
    if nl_ppi:
        logger.info('** Updating {} NodeLogs (params.pointer.id) {}'.format(old_id, [d['_id'] for d in nl_ppi]))
        for doc in nl_ppi:
            params = doc['params']
            params['pointer']['id'] = node._id
            if params['pointer'].get('url', None):
                params['pointer']['url'] = params['pointer']['url'].replace('{}/'.format(old_id), '{}/'.format(node._id))
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_psn = list(database['nodelog'].find({'params.source.nid': old_id}))
    if nl_psn:
        logger.info('** Updating {} NodeLogs (params.source.nid) {}'.format(old_id, [d['_id'] for d in nl_psn]))
        for doc in nl_psn:
            params = doc['params']
            params['source']['nid'] = node._id
            if params['source'].get('url', None):
                params['source']['url'] = params['source']['url'].replace('{}/'.format(old_id), '{}/'.format(node._id))
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_psni = list(database['nodelog'].find({'params.source.node._id': old_id}))
    if nl_psni:
        logger.info('** Updating {} NodeLogs (params.source.node._id) {}'.format(old_id, [d['_id'] for d in nl_psni]))
        for doc in nl_psni:
            params = doc['params']
            params['source']['node']['_id'] = node._id
            if params['source']['node'].get('url', None):
                params['source']['node']['url'] = params['source']['node']['url'].replace('{}/'.format(old_id), '{}/'.format(node._id))
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_psr = list(database['nodelog'].find({'params.source.resource': old_id}))
    if nl_psr:
        logger.info('** Updating {} NodeLogs (params.source.resource) {}'.format(old_id, [d['_id'] for d in nl_psr]))
        for doc in nl_psr:
            params = doc['params']
            params['source']['resource'] = node._id
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_ptni = list(database['nodelog'].find({'params.template_node._id': old_id}))
    if nl_ptni:
        logger.info('** Updating {} NodeLogs (params.template_node._id) {}'.format(old_id, [d['_id'] for d in nl_ptni]))
        for doc in nl_ptni:
            params = doc['params']
            params['template_node']['_id'] = node._id
            if params['template_node'].get('url', None):
                params['template_node']['url'] = params['template_node']['url'].replace('{}/'.format(old_id), '{}/'.format(node._id))
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    nl_pud = list(database['nodelog'].find({'params.urls.download': {'$regex': '/{}/'.format(old_id)}}))
    if nl_pud:
        logger.info('** Updating {} NodeLogs (params.urls.download) {}'.format(old_id, [d['_id'] for d in nl_pud]))
        for doc in nl_pud:
            params = doc['params']
            params['urls']['download'] = params['urls']['download'].replace('{}/'.format(old_id), '{}/'.format(node._id))
            if params['urls'].get('view', None):
                params['urls']['view'] = params['urls']['view'].replace('{}/'.format(old_id), '{}/'.format(node._id))
            database['nodelog'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'params': params
                }}
            )

    ptr_n = list(database['pointer'].find({'node': old_id}))
    if ptr_n:
        logger.info('** Updating {} Pointers (node) {}'.format(old_id, [d['_id'] for d in ptr_n]))
        for doc in ptr_n:
            database['pointer'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'node': node._id
                }}
            )

    n_ff = list(database['node'].find({'forked_from': old_id}))
    if n_ff:
        logger.info('** Updating {} Nodes (forked_from) {}'.format(old_id, [d['_id'] for d in n_ff]))
        for doc in n_ff:
            database['node'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'forked_from': node._id
                }}
            )

    n_rf = list(database['node'].find({'registered_from': old_id}))
    if n_rf:
        logger.info('** Updating {} Nodes (registered_from) {}'.format(old_id, [d['_id'] for d in n_rf]))
        for doc in n_rf:
            database['node'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'registered_from': node._id
                }}
            )

    n_root = list(database['node'].find({'root': old_id}))
    if n_root:
        logger.info('** Updating {} Nodes (root) {}'.format(old_id, [d['_id'] for d in n_root]))
        for doc in n_root:
            database['node'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'root': node._id
                }}
            )

    n_par = list(database['node'].find({'parent': old_id}))
    if n_par:
        logger.info('** Updating {} Nodes (parent) {}'.format(old_id, [d['_id'] for d in n_par]))
        for doc in n_par:
            database['node'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'parent': node._id
                }}
            )

    n_cns = list(database['node'].find({'$where': 'if (this.child_node_subscriptions!==undefined){{var keys=Object.keys(this.child_node_subscriptions);for(var i=0;i<keys.length;i+=1){{if(this.child_node_subscriptions[keys[i]].indexOf("{}")!==-1){{return true}}}}}}return false;'.format(old_id)}))
    if n_cns:
        docs = list(n_cns)
        logger.info('** Updating {} Nodes (child_node_subscriptions) {}'.format(old_id, [d['_id'] for d in docs]))
        for doc in docs:
            if doc['_id'] in cns_dict_to_update:
                cns = cns_dict_to_update[doc['_id']]
            else:
                cns = doc['child_node_subscriptions']
            replacement = json.loads(re.sub(r'\b{}\b'.format(old_id), node._id, json.dumps(cns)))
            cns_dict_to_update[doc['_id']] = replacement
            database['node'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'child_node_subscriptions': replacement
                }}
            )

    nd_nl = list(database['notificationdigest'].find({'node_lineage': {'$in': [old_id]}}))
    if nd_nl:
        logger.info('** Updating {} NotificationDigest (node_lineage) {}'.format(old_id, [d['_id'] for d in nd_nl]))
        for doc in nd_nl:
            nl = doc['node_lineage']
            nl.insert(nl.index(old_id), node._id)
            nl.remove(old_id)
            if doc['message'].find('/{}/'.format(old_id)) != -1:  # avoid html regexes
                message = doc['message'].replace('/{}/'.format(old_id), '/{}/'.format(node._id))
                database['notificationdigest'].find_and_modify(
                    {'_id': doc['_id']},
                    {'$set':{
                        'message': message,
                        'node_lineage': nl
                    }}
                )
            else:
                database['notificationdigest'].find_and_modify(
                    {'_id': doc['_id']},
                    {'$set':{
                        'node_lineage': nl
                    }}
                )

    ns_i = list(database['notificationsubscription'].find({'_id': {'$regex': old_id}}))
    if ns_i:
        logger.info('** Updating {} NotificationSubscription (_id, owner) {}'.format(old_id, [d['_id'] for d in ns_i]))
        for doc in ns_i:
            replacement = json.loads(re.sub(r'\b{}\b'.format(old_id), node._id, json.dumps(doc)))
            new_id = replacement.pop('_id')
            database['notificationsubscription'].find_and_modify(
                {'_id': new_id},
                {'$set':replacement},
                upsert=True
            )
            database['notificationsubscription'].remove({'_id': doc['_id']})

    u_uc = list(database['user'].find({'unclaimed_records.{}'.format(old_id): {'$ne': None}}))
    if u_uc:
        logger.info('** Updating {} Users (unclaimed_records) {}'.format(old_id, [d['_id'] for d in u_uc]))
        for doc in u_uc:
            ucr = doc['unclaimed_records']
            ucr[node._id] = ucr.pop(old_id)
            database['user'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'unclaimed_records': ucr
                }}
            )

    u_caer = list(database['user'].find({'contributor_added_email_records.{}'.format(old_id): {'$ne': None}}))
    if u_caer:
        logger.info('** Updating {} Users (contributor_added_email_records) {}'.format(old_id, [d['_id'] for d in u_caer]))
        for doc in u_caer:
            caer = doc['contributor_added_email_records']
            caer[node._id] = caer.pop(old_id)
            database['user'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'contributor_added_email_records': caer
                }}
            )

    u_nc = list(database['user'].find({'notifications_configured.{}'.format(old_id): {'$ne': None}}))
    if u_nc:
        logger.info('** Updating {} Users (notifications_configured) {}'.format(old_id, [d['_id'] for d in u_nc]))
        for doc in u_nc:
            nc = doc['notifications_configured']
            nc[node._id] = nc.pop(old_id)
            database['user'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'notifications_configured': nc
                }}
            )

    u_cvt = list(database['user'].find({'comments_viewed_timestamp.{}'.format(old_id): {'$ne': None}}))
    if u_cvt:
        logger.info('** Updating {} Users (comments_viewed_timestamp) {}'.format(old_id, [d['_id'] for d in u_cvt]))
        for doc in u_cvt:
            nc = doc['comments_viewed_timestamp']
            nc[node._id] = nc.pop(old_id)
            database['user'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'comments_viewed_timestamp': nc
                }}
            )

    pc_i = list(database['pagecounters'].find({'_id': {'$regex': ':{}:'.format(old_id)}}))
    if pc_i:
        logger.info('** Updating {} PageCounters (_id) {}'.format(old_id, [d['_id'] for d in pc_i]))
        for doc in pc_i:
            replacement = json.loads(re.sub(r'\b{}\b'.format(old_id), node._id, json.dumps(doc)))
            new_id = replacement.pop('_id')
            database['pagecounters'].find_and_modify(
                {'_id': new_id},
                {'$set':replacement},
                upsert=True
            )
            database['pagecounters'].remove({'_id': doc['_id']})

    ss_dv = list(database['session'].find({'data.visited': {'$regex': ':{}:'.format(old_id)}}))
    if ss_dv:
        logger.info('** Updating {} Session (data) {}'.format(old_id, [d['_id'] for d in ss_dv]))
        for doc in ss_dv:
            repl_data = json.loads(re.sub(r'\b{}\b'.format(old_id), node._id, json.dumps(doc['data'])))
            database['session'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'data': repl_data
                }}
            )

    wc_n = list(database['watchconfig'].find({'node': old_id}))
    if wc_n:
        logger.info('** Updating {} WatchConfigs (node) {}'.format(old_id, [d['_id'] for d in wc_n]))
        for doc in wc_n:
            database['watchconfig'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'node': node._id
                }}
            )

    pl_n = list(database['privatelink'].find({'nodes': old_id}))
    if pl_n:
        logger.info('** Updating {} PrivateLinks (nodes) {}'.format(old_id, [d['_id'] for d in pl_n]))
        for d in pl_n:
            new_nodes = d['nodes']
            new_nodes.remove(old_id)
            new_nodes.append(node._id) 
            database['privatelink'].find_and_modify(
                {'_id': d['_id']},
                {'$set':{
                    'nodes': new_nodes
                }}
            )

    dr_bf = list(database['draftregistration'].find({'branched_from': old_id}))
    if dr_bf:
        logger.info('** Updating {} DraftRegistrations (branched_from) {}'.format(old_id, [d['_id'] for d in dr_bf]))
        for doc in dr_bf:
            database['draftregistration'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'branched_from': node._id
                }}
            )

    dr_rn = list(database['draftregistration'].find({'registered_node': old_id}))
    if dr_rn:
        logger.info('** Updating {} DraftRegistrations (registered_node) {}'.format(old_id, [d['_id'] for d in dr_rn]))
        for doc in dr_rn:
            database['draftregistration'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'registered_node': node._id
                }}
            )

    eta_er = list(database['embargoterminationapproval'].find({'embargoed_registration': old_id}))
    if eta_er:
        logger.info('** Updating {} EmbargoTerminationApprovals (embargoed_registration) {}'.format(old_id, [d['_id'] for d in eta_er]))
        for doc in eta_er:
            database['embargoterminationapproval'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'embargoed_registration': node._id
                }}
            )

    ra_su = list(database['registrationapproval'].find({'$where': 'var keys=Object.keys(this.stashed_urls);for(var i=0;i<keys.length;i+=1){{if(this.stashed_urls[keys[i]].view.indexOf("{}")!==-1){{return true}}if(this.stashed_urls[keys[i]].approve.indexOf("{}")!==-1){{return true}}if(this.stashed_urls[keys[i]].reject.indexOf("{}")!==-1){{return true}}}}return false;'.format(old_id, old_id, old_id)}))
    if ra_su:
        logger.info('** Updating {} RegistrationApprovals (stashed_urls) {}'.format(old_id, [d['_id'] for d in ra_su]))
        for doc in ra_su:
            updated_stash = json.loads(re.sub(r'\b{}\b'.format(old_id), node._id, json.dumps(doc['stashed_urls'])))
            database['registrationapproval'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'stashed_urls': updated_stash
                }}
            )

    idf_r = list(database['identifier'].find({'referent': old_id}))
    if idf_r:
        logger.info('** Updating {} Identifiers (referent) {}'.format(old_id, [d['_id'] for d in idf_r]))
        for doc in idf_r:
            ref = doc['referent']
            ref[1] = 'preprintservice'
            database['identifier'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'referent': ref
                }}
            )

    qm_dn = list(database['queuedmail'].find({'data.nid': old_id}))
    if qm_dn:
        logger.info('** Updating {} QueuedMails (data.nid) {}'.format(old_id, [d['_id'] for d in qm_dn]))
        for doc in qm_dn:
            repl_data = json.loads(re.sub(r'\b{}\b'.format(old_id), node._id, json.dumps(doc['data'])))
            database['queuedmail'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'data': repl_data
                }}
            )

    ps_n = list(database['preprintservice'].find({'node': old_id}))
    if ps_n:
        logger.info('** Updating {} PreprintServices (node) {}'.format(old_id, [d['_id'] for d in ps_n]))
        for doc in ps_n:
            database['preprintservice'].find_and_modify(
                {'_id': doc['_id']},
                {'$set':{
                    'node': node._id
                }}
            )
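
The migration in Example 17 leans on one re.sub idiom throughout: dump a Mongo document (or sub-document) to JSON, replace every whole-word occurrence of the old node id with the new one via a \b-bounded pattern, and load the result back. A small sketch of that idiom follows; the document and ids are invented for illustration, and re.escape is added here as a precaution the original can skip because its ids are plain alphanumerics. As in the listing, any non-JSON-serializable fields (datetimes such as last_touched) must be popped off the document before the round trip.

import json
import re


def swap_id(doc, old_id, new_id):
    """Replace whole-word occurrences of old_id anywhere inside a JSON-serializable document."""
    pattern = r'\b{}\b'.format(re.escape(old_id))
    return json.loads(re.sub(pattern, new_id, json.dumps(doc)))

doc = {'node': 'abc12', 'history': [{'path': '/abc12/file.txt', 'modified': None}]}
swapped = swap_id(doc, 'abc12', 'xyz89')
# swapped['node'] == 'xyz89' and the path becomes '/xyz89/file.txt'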

Example 18

Project: cgat
Source File: r_table2scatter.py
View license
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take from table. Choices are 'all', 'all-but-first' or a ','-separated list of columns.")

    parser.add_option("--logscale", dest="logscale", type="string",
                      help="log-transform one or both axes [default=%Default].")

    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file [default=%default].",
                      metavar="FILE")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="filename with table data [default=%default].",
                      metavar="FILE")

    parser.add_option("-2", "--file2", dest="input_filename2", type="string",
                      help="additional data file [default=%default].",
                      metavar="FILE")

    parser.add_option("-s", "--stats", dest="statistics", type="choice",
                      choices=("correlation", "spearman", "pearson", "count"),
                      help="statistical quantities to compute [default=%default]",
                      action="append")

    parser.add_option("-p", "--plot", dest="plot", type="choice",
                      choices=("scatter", "pairs", "panel", "bar", "bar-stacked",
                               "bar-besides", "1_vs_x", "matched", "boxplot", "scatter+marginal",
                               "scatter-regression"),
                      help="plots to plot [default=%default]",
                      action="append")

    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      help="min threshold to use for counting method [default=%default].")

    parser.add_option("-o", "--colours", dest="colours", type="int",
                      help="column with colour information [default=%default].")

    parser.add_option("-l", "--plot-labels", dest="labels", type="string",
                      help="column labels for x and y in matched plots [default=%default].")

    parser.add_option("-d", "--add-diagonal", dest="add_diagonal", action="store_true",
                      help="add diagonal to plot [default=%default].")

    parser.add_option("-e", "--plot-legend", dest="legend", type="int",
                      help="column with legend [default=%default].")

    parser.add_option("-r", "--options", dest="r_options", type="string",
                      help="R plotting options [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("full", "sparse"),
                      help="output format [default=%default].")

    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.add_option("", "--xrange", dest="xrange", type="string",
                      help="x viewing range of plot [default=%default].")

    parser.add_option("", "--yrange", dest="yrange", type="string",
                      help="y viewing range of plot[default=%default].")

    parser.add_option("--allow-empty-file", dest="fail_on_empty", action="store_false",
                      help="do not fail on empty input [default=%default].")

    parser.add_option("--fail-on-empty", dest="fail_on_empty", action="store_true",
                      help="fail on empty input [default=%default].")

    parser.set_defaults(
        hardcopy=None,
        input_filename="",
        input_filename2=None,
        columns="all",
        logscale=None,
        statistics=[],
        plot=[],
        threshold=0.0,
        labels="x,y",
        colours=None,
        diagonal=False,
        legend=None,
        title=None,
        xrange=None,
        yrange=None,
        r_options="",
        fail_on_empty=True,
        format="full")

    (options, args) = E.Start(parser)

    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.colours:
        options.colours -= 1
    if options.legend:
        options.legend -= 1

    table = {}
    headers = []

    # read data matrix
    if options.input_filename:
        lines = IOTools.openFile(options.input_filename, "r").readlines()
    else:
        # note: this will not work for interactive viewing, but
        # creating hardcopy plots works.
        lines = sys.stdin.readlines()

    lines = [x for x in lines if x[0] != "#"]

    if len(lines) == 0:
        if options.fail_on_empty:
            raise IOError("no input")
        E.warn("empty input")
        E.Stop()
        return

    matrix, headers, colours, legend = readTable(lines,
                                                 "matrix",
                                                 take_columns=options.columns,
                                                 headers=True,
                                                 colours=options.colours,
                                                 row_names=options.legend)

    if options.input_filename2:
        # read a second matrix (assumed to be in the same format);
        # read it from the second file rather than re-using the first file's lines
        lines2 = [x for x in IOTools.openFile(options.input_filename2, "r").readlines()
                  if x[0] != "#"]
        matrix2, headers2, colours2, legend2 = readTable(
            lines2,
            "matrix2",
            take_columns=options.columns,
            headers=True,
            colours=options.colours,
            row_names=options.legend)

    R.assign("headers", headers)

    ndata = R("""length( matrix[,1] )""")[0]

    if options.loglevel >= 1:
        options.stdlog.write("# read matrix: %ix%i\n" % (len(headers), ndata))

    if colours:
        R.assign("colours", colours)

    for method in options.statistics:

        if method == "correlation":
            cor = R.cor(matrix, use="pairwise.complete.obs")
            writeMatrix(sys.stdout, cor, headers=headers, format="%5.2f")

        elif method == "pearson":
            options.stdout.write("\t".join(("var1",
                                            "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "n",
                                            "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    try:
                        result = R(
                            """cor.test( matrix[,%i], matrix[,%i] )""" % (x + 1, y + 1))
                    except rpy.RPyException as msg:
                        E.warn("correlation not computed for columns %i(%s) and %i(%s): %s" % (
                            x, headers[x], y, headers[y], msg))
                        options.stdout.write("%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" %
                                             (headers[x], headers[y],
                                              "na",
                                              "na",
                                              "na",
                                              0,
                                              "na",
                                              "na"))

                    else:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             result.rx2('estimate').rx2(
                                 'cor')[0],
                             Stats.getSignificance(
                                 float(result.rx2('p.value')[0])),
                             result.rx2('p.value')[0],
                             result.rx2('parameter').rx2(
                                 'df')[0],
                             result.rx2('method')[0],
                             result.rx2('alternative')[0]))

        elif method == "spearman":
            options.stdout.write("\t".join(("var1", "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    result = R(
                        """cor.test( matrix[,%i], matrix[,%i], method='spearman')""" % (x + 1, y + 1))
                    options.stdout.write(
                        "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                        (headers[x], headers[y],
                         result['estimate']['rho'],
                         Stats.getSignificance(float(result['p.value'])),
                         result['p.value'],
                         result['parameter']['df'],
                         result['method'],
                         result['alternative']))

        elif method == "count":
            # number of shared elements > threshold
            m, r, c = MatlabTools.ReadMatrix(open(options.input_filename, "r"),
                                             take=options.columns,
                                             headers=True)
            mask = numpy.greater(m, options.threshold)
            counts = numpy.dot(numpy.transpose(mask), mask)
            writeMatrix(options.stdout, counts, headers=c, format="%i")

    if options.plot:

        # remove columns that are completely empty
        if "pairs" in options.plot:
            colsums = R('''colSums( is.na(matrix ))''')
            take = [x for x in range(len(colsums)) if colsums[x] != ndata]
            if take:
                E.warn("removing empty columns %s before plotting" % str(take))
                matrix = R.subset(matrix, select=[x + 1 for x in take])
                R.assign("""matrix""", matrix)
                headers = [headers[x] for x in take]
                if legend:
                    legend = [headers[x] for x in take]

        if options.r_options:
            extra_options = ", %s" % options.r_options
        else:
            extra_options = ""

        if options.legend is not None and len(legend):
            extra_options += ", legend=c('%s')" % "','".join(legend)

        if options.labels:
            xlabel, ylabel = options.labels.split(",")
            extra_options += ", xlab='%s', ylab='%s'" % (xlabel, ylabel)
        else:
            xlabel, ylabel = "", ""

        if options.colours:
            extra_options += ", col=colours"

        if options.logscale:
            extra_options += ", log='%s'" % options.logscale

        if options.xrange:
            extra_options += ", xlim=c(%f,%f)" % tuple(
                map(float, options.xrange.split(",")))

        if options.yrange:
            extra_options += ", ylim=c(%f,%f)" % tuple(
                map(float, options.yrange.split(",")))

        if options.hardcopy:
            if options.hardcopy.endswith(".eps"):
                R.postscript(options.hardcopy)
            elif options.hardcopy.endswith(".png"):
                R.png(options.hardcopy, width=1024, height=768, type="cairo")
            elif options.hardcopy.endswith(".jpg"):
                R.jpg(options.hardcopy, width=1024, height=768, type="cairo")

        for method in options.plot:

            if ndata < 100:
                point_size = "1"
                pch = "o"
            elif ndata < 1000:
                point_size = "1"
                pch = "o"
            else:
                point_size = "0.5"
                pch = "."

            if method == "scatter":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, extra_options))

            if method == "scatter-regression":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, extra_options))
                dat = R(
                    """dat <- data.frame(x = matrix[,1], y = matrix[,2])""")
                R(
                    """new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))""")
                mod = R("""mod <- lm( y ~ x, dat)""")
                R("""predict(mod, new, se.fit = TRUE)""")
                R("""pred.w.plim <- predict(mod, new, interval="prediction")""")
                R("""pred.w.clim <- predict(mod, new, interval="confidence")""")
                R(
                    """matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")""")
                R.mtext(
                    "y = %f * x + %f, r=%6.4f, n=%i" % (mod["coefficients"]["x"],
                                                        mod["coefficients"][
                                                            "(Intercept)"],
                                                        R("""cor( dat )[2]"""),
                                                        ndata),
                    3,
                    cex=1.0)

            elif method == "pairs":
                if options.add_diagonal:
                    R(
                        """panel.hist <- function( x,y,...  ) { points(x,y,...); abline(0,1); }""")
                else:
                    R(
                        """panel.hist <- function( x,y,...  ) { points(x,y,...); }""")

                # There used to be an argument na_action="na.omit", but it
                # was removed because it produced error messages saying
                # "na.action is not a graphical parameter" and the plots
                # occasionally showed the wrong scale.
                # cex=point_size also caused trouble (error message:
                # "X11 used font size 8 when 2 was requested" or
                # similar)
                if options.colours:
                    R.pairs(matrix,
                            pch=pch,
                            col=colours,
                            main=options.title,
                            panel="panel.hist",
                            labels=headers,
                            cex_labels=2.0)
                else:
                    R.pairs(matrix,
                            pch=pch,
                            panel="panel.hist",
                            main=options.title,
                            labels=headers,
                            cex_labels=2.0)

            elif method == "boxplot":
                extra_options += ",main='%s'" % options.title

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                R("""boxplot( matrix %s)""" % extra_options)

            elif method == "bar" or method == "bar-stacked":
                if not options.colours:
                    extra_options += ", col=rainbow(5)"

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), %s)""" % extra_options)

            elif method == "bar-besides":
                if not options.colours:
                    extra_options += ", col=rainbow(%i)" % ndata

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    extra_options = re.sub(", xlab='[^']+'", "", extra_options)
                    extra_options += ", names.arg=headers, las=2"
                    R(
                        """op <- par(mar=c(11,4,4,2))""")  # the 10 allows the names.arg below the barplot

                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" %
                  extra_options)

            elif method == "scatter+marginal":

                if options.title:
                    # set the size of the outer margins - the title needs to be added at the end
                    # after plots have been created
                    R.par(oma=R.c(0, 0, 4, 0))

                R("""matrix""")
                R("""
x <- matrix[,1];
y <- matrix[,2];
xhist <- hist(x, breaks=20, plot=FALSE);
yhist <- hist(y, breaks=20, plot=FALSE);
top <- max(c(xhist$counts, yhist$counts));
nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE );
par(mar=c(3,3,1,1)) ;
plot(x, y, cex=%s, pch="o" %s) ;
par(mar=c(0,3,1,1)) ;
barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ;
par(mar=c(3,0,1,1)) ;
title(main='%s');
barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE ) ;
title(main='%s');
""" % (point_size, extra_options, xlabel, ylabel))

                if options.title:
                    R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

            elif method in ("panel", "1_vs_x", "matched"):

                if method == "panel":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            pairs.append((x, y))

                elif method == "1_vs_x":
                    pairs = []
                    for x in range(1, len(headers)):
                        pairs.append((0, x))

                # print matching columns
                elif method == "matched":
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            if headers[x] == headers[y]:
                                pairs.append((x, y))
                                break

                w = int(math.ceil(math.sqrt(len(pairs))))
                h = int(math.ceil(float(len(pairs)) / w))

                PosInf = 1e300000
                NegInf = -1e300000

                xlabel, ylabel = options.labels.split(",")

                R("""layout(matrix(seq(1,%i), %i, %i, byrow = TRUE))""" %
                  (w * h, w, h))
                for a, b in pairs:
                    new_matrix = [x for x in zip(
                        list(matrix[a].values())[0],
                        list(matrix[b].values())[0])
                                  if x[0] not in (float("nan"), PosInf, NegInf) and
                                  x[1] not in (float("nan"), PosInf, NegInf)]
                    try:
                        R("""plot(matrix[,%i], matrix[,%i], main='%s versus %s', cex=0.5, pch=".", xlab='%s', ylab='%s' )""" % (
                            a + 1, b + 1, headers[b], headers[a], xlabel, ylabel))
                    except rpy.RException as msg:
                        print("could not plot %s versus %s: %s" % (headers[b], headers[a], msg))

        if options.hardcopy:
            R['dev.off']()

    E.info("matrix added as >matrix< in R.")

    if not options.hardcopy:
        if options.input_filename:
            interpreter = code.InteractiveConsole(globals())
            interpreter.interact()
        else:
            E.info(
                "can not start new interactive session as input has come from stdin.")

    E.Stop()
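
The re.sub calls in the boxplot and barplot branches above all strip a previously appended xlab option from the accumulated R option string before switching to vertical axis labels. The short sketch below isolates just that string manipulation; the option values are made up for illustration and no R session is required:

import re

# Option string assembled the same way as in the plotting code above
extra_options = ", xlab='length', ylab='count', col=colours"

# Drop the xlab=... option, then request vertical labels (las=2),
# mirroring the boxplot/bar branches above
extra_options = re.sub(", xlab='[^']+'", "", extra_options)
extra_options += ", names.arg=headers, las=2"

print(extra_options)
# -> , ylab='count', col=colours, names.arg=headers, las=2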

Example 19

View license
def main(argv=None):
    '''
    Process the command line arguments and transfer projects/repositories from GitLab to Stash.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Transfer all projects/repositories from GitLab to Stash. \
                     Note: This script assumes you have your SSH key \
                     registered with both GitLab and Stash.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('gitlab_url',
                        help='The full URL to your GitLab instance.')
    parser.add_argument('stash_url',
                        help='The full URL to your Stash instance.')
    parser.add_argument('-p', '--password',
                        help='The password to use to authenticate if token is \
                              not specified. If password and token are both \
                              unspecified, you will be prompted to enter a \
                              password.')
    parser.add_argument('-P', '--page_size',
                        help='When retrieving results from GitLab, how many \
                              results should be included in a given page?',
                        type=int, default=20)
    parser.add_argument('-s', '--verify_ssl',
                        help='Enable SSL certificate verification',
                        action='store_true')
    parser.add_argument('-S', '--skip_existing',
                        help='Do not update existing repositories and just \
                              skip them.',
                        action='store_true')
    parser.add_argument('-t', '--token',
                        help='The private GitLab API token to use for \
                              authentication. Either this or username and \
                              password must be set.')
    parser.add_argument('-u', '--username',
                        help='The username to use for authentication, if token\
                              is unspecified.')
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                             'additional time this flag is specified, ' +
                             'output gets more verbose.',
                        default=0, action='count')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    args.page_size = max(100, args.page_size)

    # Convert verbose flag to actually logging level
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=log_level)

    # Setup authenticated GitLab and Stash instances
    if args.token:
        git = GitLab(args.gitlab_url, token=args.token,
                     verify_ssl=args.verify_ssl)
    else:
        git = None
    if not args.username:
        print('Username: ', end="", file=sys.stderr)
        args.username = input('').strip()
    if not args.password:
        args.password = getpass.getpass('Password: ')
    stash = stashy.connect(args.stash_url, args.username, args.password)
    if git is None:
        git = GitLab(args.gitlab_url, verify_ssl=args.verify_ssl)
        git.login(args.username, args.password)

    print('Retrieving existing Stash projects...', end="", file=sys.stderr)
    sys.stderr.flush()
    key_set = {proj['key'] for proj in stash.projects}
    stash_project_names = {proj['name'] for proj in stash.projects}
    names_to_keys = {proj['name']: proj['key'] for proj in stash.projects}
    print('done', file=sys.stderr)
    sys.stderr.flush()
    updated_projects = set()
    repo_to_slugs = {}
    failed_to_clone = set()
    cwd = os.getcwd()
    transfer_count = 0
    skipped_count = 0
    print('Processing GitLab projects...', file=sys.stderr)
    sys.stderr.flush()
    for project in gen_all_results(git.getallprojects,
                                   per_page=args.page_size):
        print('\n' + ('=' * 80) + '\n', file=sys.stderr)
        sys.stderr.flush()
        proj_name = project['namespace']['name']
        # Create Stash project if it doesn't already exist
        if proj_name not in stash_project_names:
            # Create Stash project key
            key = proj_name
            if key.islower():
                key = key.title()
            key = re.sub(r'[^A-Z]', '', key)
            if len(key) < 2:
                key = re.sub(r'[^A-Za-z]', '', proj_name)[0:2].upper()
            added = False
            suffix = 65
            while key in key_set:
                if not added:
                    key += 'A'
                    added = True
                else:
                    suffix += 1
                    key = key[:-1] + chr(suffix)
            key_set.add(key)

            # Actually add the project to Stash
            print('Creating Stash project "%s" with key %s...' %
                  (proj_name, key), end="", file=sys.stderr)
            sys.stderr.flush()
            stash.projects.create(key, proj_name)
            names_to_keys[proj_name] = key
            stash_project_names.add(proj_name)
            print('done', file=sys.stderr)
            sys.stderr.flush()
        else:
            key = names_to_keys[proj_name]

        stash_project = stash.projects[key]

        # Initialize mapping from repository names to slugs for later
        if key not in repo_to_slugs:
            repo_to_slugs[key] = {repo['name']: repo['slug'] for repo in
                                  stash_project.repos}

        # Create Stash-compatible name for repository
        # Repository names are limited to 128 characters.
        # They must start with a letter or number and may contain spaces,
        # hyphens, underscores and periods
        repo_name = project['name']
        if not repo_name[0].isalnum():
            repo_name = 'A ' + repo_name
        repo_name = re.sub(r'[^A-Za-z0-9 _.-]', ' ', repo_name)
        if len(repo_name) > 128:
            repo_name = repo_name[0:128]

        # Add repository to Stash project if it's not already there
        if repo_name not in repo_to_slugs[key]:
            print('Creating Stash repository "%s" in project "%s"...' %
                  (repo_name, proj_name), end="", file=sys.stderr)
            sys.stderr.flush()
            stash_repo = stash_project.repos.create(repo_name)
            repo_to_slugs[key][repo_name] = stash_repo['slug']
            print('done', file=sys.stderr)
            sys.stderr.flush()
        elif args.skip_existing:
            print('Skipping existing Stash repository "%s" in project "%s"' %
                  (repo_name, proj_name), file=sys.stderr)
            sys.stderr.flush()
            skipped_count += 1
            continue
        else:
            print('Updating existing Stash repository "%s" in project "%s"' %
                  (repo_name, proj_name), file=sys.stderr)
            sys.stderr.flush()
            repo_slug = repo_to_slugs[key][repo_name]
            stash_repo = stash_project.repos[repo_slug].get()

        for clone_link in stash_repo['links']['clone']:
            if clone_link['name'] == 'ssh':
                stash_repo_url = clone_link['href']
                break

        with tempfile.TemporaryDirectory() as temp_dir:
            # Clone repository to temporary directory
            print('\nCloning GitLab repository...', file=sys.stderr)
            sys.stderr.flush()
            try:
                subprocess.check_call(['git', 'clone', '--mirror',
                                       project['ssh_url_to_repo'],
                                       temp_dir])
            except subprocess.CalledProcessError:
                print('Failed to clone GitLab repository. This usually ' +
                      'happens when it does not exist.', file=sys.stderr)
                failed_to_clone.add(project['name_with_namespace'])
                skipped_count += 1
                continue
            os.chdir(temp_dir)

            # Check that repository is not empty
            try:
                subprocess.check_call(['git', 'log', '--format=oneline', '-1'],
                                      stdout=subprocess.DEVNULL,
                                      stderr=subprocess.DEVNULL)
            except subprocess.CalledProcessError:
                print('Repository is empty, so skipping push to Stash.',
                      file=sys.stderr)
                skipped_count += 1
            else:
                # Change remote to Stash and push
                print('\nPushing repository to Stash...', file=sys.stderr)
                sys.stderr.flush()
                subprocess.check_call(['git', 'remote', 'set-url', 'origin',
                                       stash_repo_url])
                subprocess.check_call(['git', 'push', '--mirror'])
                transfer_count += 1

            os.chdir(cwd)

        updated_projects.add(proj_name)


    print('\n' + ('=' * 35) + 'SUMMARY' + ('=' * 35), file=sys.stderr)
    print('{} repositories transferred.\n'.format(transfer_count),
          file=sys.stderr)
    print('{} repositories skipped.\n'.format(skipped_count),
          file=sys.stderr)
    print('Projects created/updated:', file=sys.stderr)
    for proj in sorted(updated_projects):
        print('\t' + proj, file=sys.stderr)
    print('Repositories that we could not clone:', file=sys.stderr)
    for repo_name in sorted(failed_to_clone):
        print('\t' + repo_name, file=sys.stderr)
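
The two sanitisation steps in the example above (deriving a Stash project key and a Stash-safe repository name) are both plain re.sub character-class filters. The helper below is a minimal, self-contained sketch of the same idea; the function name and the sample inputs are invented and it is not part of the script above:

import re

def stash_key_and_repo_name(proj_name, repo_name):
    # Project key: keep only the capital letters, falling back to the
    # first two alphabetic characters if fewer than two remain.
    key = proj_name.title() if proj_name.islower() else proj_name
    key = re.sub(r'[^A-Z]', '', key)
    if len(key) < 2:
        key = re.sub(r'[^A-Za-z]', '', proj_name)[0:2].upper()

    # Repository name: must start with a letter or number and may only
    # contain letters, digits, spaces, hyphens, underscores and periods.
    if not repo_name[0].isalnum():
        repo_name = 'A ' + repo_name
    repo_name = re.sub(r'[^A-Za-z0-9 _.-]', ' ', repo_name)[:128]
    return key, repo_name

print(stash_key_and_repo_name('my team', 'tools/v2 (legacy)'))
# -> ('MT', 'tools v2  legacy ')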

Example 20

Project: Flexget
Source File: plex.py
View license
    def on_task_input(self, task, config):
        config = self.prepare_config(config)
        urlconfig = {}
        urlappend = "?"
        entries = []
        if config['unwatched_only'] and config['section'] != 'recentlyViewedShows' and config['section'] != 'all':
            urlconfig['unwatched'] = '1'
        if config['username'] and config['password']:
            accesstoken = self.plex_get_accesstoken(config)
            log.debug("Got accesstoken: %s" % accesstoken)
            urlconfig['X-Plex-Token'] = accesstoken

        for key in urlconfig:
            urlappend += '%s=%s&' % (key, urlconfig[key])
        if not self.plex_section_is_int(config['section']):
            try:
                path = "/library/sections/"
                r = requests.get("http://%s:%d%s%s" % (config['plexserver'], config['port'], path, urlappend))
            except requests.RequestException as e:
                raise plugin.PluginError('Error retrieving source: %s' % e)
            dom = parseString(r.text.encode("utf-8"))
            for node in dom.getElementsByTagName('Directory'):
                if node.getAttribute('title') == config['section']:
                    config['section'] = int(node.getAttribute('key'))
        if not self.plex_section_is_int(config['section']):
            raise plugin.PluginError('Could not find section \'%s\'' % config['section'])

        log.debug("Fetching http://%s:%d/library/sections/%s/%s%s" %
                  (config['server'], config['port'], config['section'], config['selection'], urlappend))
        try:
            path = "/library/sections/%s/%s" % (config['section'], config['selection'])
            r = requests.get("http://%s:%d%s%s" % (config['plexserver'], config['port'], path, urlappend))
        except requests.RequestException as e:
            raise plugin.PluginError('There is no section with number %d. (%s)' % (config['section'], e))
        dom = parseString(r.text.encode("utf-8"))
        plexsectionname = dom.getElementsByTagName('MediaContainer')[0].getAttribute('title1')
        viewgroup = dom.getElementsByTagName('MediaContainer')[0].getAttribute('viewGroup')

        log.debug("Plex section \"%s\" is a \"%s\" section" % (plexsectionname, viewgroup))
        if viewgroup != "movie" and viewgroup != "show" and viewgroup != "episode":
            raise plugin.PluginError("Section is neither a movie nor tv show section!")
        domroot = "Directory"
        titletag = "title"
        if viewgroup == "episode":
            domroot = "Video"
            titletag = "grandparentTitle"
            thumbtag = "thumb"
            arttag = "art"
            seasoncovertag = "parentThumb"
            covertag = "grandparentThumb"
        elif viewgroup == "movie":
            domroot = "Video"
            titletag = "title"
            arttag = "art"
            seasoncovertag = "thumb"
            covertag = "thumb"
            if config['fetch'] == "thumb":
                raise plugin.PluginError("Movie sections does not have any thumbnails to download!")
        for node in dom.getElementsByTagName(domroot):
            e = Entry()
            e['plex_server'] = config['plexserver']
            e['plex_port'] = config['port']
            e['plex_section'] = config['section']
            e['plex_section_name'] = plexsectionname
            e['plex_episode_thumb'] = ''

            title = node.getAttribute(titletag)
            if config['strip_year']:
                title = re.sub(r'^(.*)\(\d{4}\)(.*)', r'\1\2', title)
            if config['strip_parens']:
                title = re.sub(r'\(.*?\)', r'', title)
                title = title.strip()
            if config['strip_non_alpha']:
                title = re.sub(r'[\(\)]', r'', title)
                title = re.sub(r'&', r'And', title)
                title = re.sub(r'[^A-Za-z0-9- \']', r'', title)
            if config['lowercase_title']:
                title = title.lower()
            if viewgroup == "show":
                e['title'] = title
                e['url'] = 'NULL'
                entries.append(e)
                # show ends here.
                continue
            e['plex_art'] = "http://%s:%d%s%s" % (config['server'], config['port'],
                                                  node.getAttribute(arttag), urlappend)
            e['plex_cover'] = "http://%s:%d%s%s" % (config['server'], config['port'],
                                                    node.getAttribute(covertag), urlappend)
            e['plex_season_cover'] = "http://%s:%d%s%s" % (config['server'], config['port'],
                                                           node.getAttribute(seasoncovertag), urlappend)
            if viewgroup == "episode":
                e['plex_thumb'] = "http://%s:%d%s%s" % (
                    config['server'], config['port'], node.getAttribute('thumb'), urlappend)
                e['series_name'] = title
                season = int(node.getAttribute('parentIndex'))
                if node.getAttribute('parentIndex') == node.getAttribute('year'):
                    season = node.getAttribute('originallyAvailableAt')
                    filenamemap = "%s_%s%s_%s_%s_%s.%s"
                    episode = ""
                    e['series_id_type'] = 'date'
                    e['series_date'] = season
                elif node.getAttribute('index'):
                    episode = int(node.getAttribute('index'))
                    filenamemap = "%s_%02dx%02d_%s_%s_%s.%s"
                    e['series_season'] = season
                    e['series_episode'] = episode
                    e['series_id_type'] = 'ep'
                    e['series_id'] = 'S%02dE%02d' % (season, episode)
                else:
                    log.debug("Could not get episode number for '%s' (Hint, ratingKey: %s)"
                              % (title, node.getAttribute('ratingKey')))
                    break
            elif viewgroup == "movie":
                filenamemap = "%s_%s_%s_%s.%s"

            e['plex_duration'] = node.getAttribute('duration')
            e['plex_summary'] = node.getAttribute('summary')
            e['plex_userrating'] = node.getAttribute('userrating')
            e['plex_key'] = node.getAttribute('ratingKey')
            count = node.getAttribute('viewCount')
            offset = node.getAttribute('viewOffset')
            if count:
                e['plex_status'] = "seen"
            elif offset:
                e['plex_status'] = "inprogress"
            else:
                e['plex_status'] = "unwatched"
            for media in node.getElementsByTagName('Media'):
                vcodec = media.getAttribute('videoCodec')
                acodec = media.getAttribute('audioCodec')
                if config['fetch'] == "file" or not config['fetch']:
                    container = media.getAttribute('container')
                else:
                    container = "jpg"
                resolution = media.getAttribute('videoResolution') + "p"
                for part in media.getElementsByTagName('Part'):
                    if config['fetch'] == "file" or not config['fetch']:
                        key = part.getAttribute('key')
                    elif config['fetch'] == "art":
                        key = node.getAttribute(arttag)
                    elif config['fetch'] == "cover":
                        key = node.getAttribute(covertag)
                    elif config['fetch'] == "season_cover":
                        key = node.getAttribute(seasoncovertag)
                    elif config['fetch'] == "thumb":
                        key = node.getAttribute(thumbtag)
                    # key = part.getAttribute('key')
                    duration = part.getAttribute('duration')
                    e['plex_title'] = title
                    if config['original_filename']:
                        filename, fileext = os.path.splitext(basename(part.getAttribute('file')))
                        if config['fetch'] != 'file':
                            filename += ".jpg"
                        else:
                            filename = "%s.%s" % (filename, fileext)
                    else:
                        if viewgroup == "episode":
                            filename = filenamemap % (title.replace(" ", "."), season, episode, resolution, vcodec,
                                                      acodec, container)
                            title = filename
                        elif viewgroup == "movie":
                            filename = filenamemap % (title.replace(" ", "."), resolution, vcodec,
                                                      acodec, container)
                    e['plex_url'] = "http://%s:%d%s%s" % (config['server'], config['port'], key, urlappend)
                    e['plex_path'] = key
                    e['url'] = "http://%s:%d%s%s" % (config['server'], config['port'], key, urlappend)
                    e['plex_duration'] = duration
                    e['filename'] = filename
                    e['title'] = title
            if key == "":
                log.debug("Could not find anything in PMS to download. Next!")
            else:
                entries.append(e)
        return entries
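
The title clean-up in the Plex example above is a chain of small re.sub substitutions driven by config flags. The helper below condenses that chain into a standalone sketch; the flag defaults and the sample title are invented for illustration and the function is not part of Flexget:

import re

def clean_plex_title(title, strip_year=True, strip_parens=True,
                     strip_non_alpha=True, lowercase=False):
    if strip_year:
        # Drop a (YYYY) year anywhere in the title
        title = re.sub(r'^(.*)\(\d{4}\)(.*)', r'\1\2', title)
    if strip_parens:
        # Drop any remaining parenthesised chunks
        title = re.sub(r'\(.*?\)', r'', title).strip()
    if strip_non_alpha:
        title = re.sub(r'[\(\)]', r'', title)
        title = re.sub(r'&', r'And', title)
        title = re.sub(r'[^A-Za-z0-9- \']', r'', title)
    return title.lower() if lowercase else title

print(clean_plex_title("Fast & Furious (2009) [HD]"))
# -> 'Fast And Furious  HD' (removed characters can leave double spaces)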

Example 21

View license
def build_annotated_tgm(closest_gene_output,distance_to_tss,logistic_score_output,fasta_file,motif_ids,makeWindow=True,tgm_file='',do_pkl=True):
    '''
    Takes existing tgm, and maps to gene names and TF ids within a specific window
    '''
    from chipsequtil import Fasta
    ##get fasta file events, since these are columns in the logistic_score matrix
    seq_ids=Fasta.load(fasta_file,key_func=lambda x: x)

    ##need to get sequence mids in the order they are processed
    ##in the file, this is the index into the score_output file
    ##. ASSUMES GALAXY-formatted FASTA!!!!
    seq_mids=[] ##list of FASTA regions, in their appropriate order in the file
    filtered_events={}##gene name of closest gene to event within window
    for k in seq_ids.keys():
        vals=k.split(';')
        if len(vals)==1:
            vals=k.split()
        if ':' in vals[0]: #bed tools used 
            chr,range=vals[0].split(':')
            low,high=range.split('-')
            mid=str(int(low)+((int(high)-int(low))/2))
            seq_mids.append(chr+':'+mid)
        elif 'random' not in vals[0]: #galaxy tools used
            genome,chr,low,high,strand=vals[0].split('_')
            mid=str(int(low)+((int(high)-int(low))/2))
            seq_mids.append(chr+':'+mid)
        
        if len(vals)==3:            
            filtered_events[chr+':'+mid]=vals[2]
    print 'Found %d events, of which %d have gene names'%(len(seq_mids),len(filtered_events))
    ##this next section relies on xls 
    ##filter events that are within distance from closest_gene_output to get gene mapping
    ##
    filtered_fc={}##FC of events within window, in case we want to use in the future

    event_indexes=[] ##

    
 #    ###open the closest_gene_output and determine
#     try:
#         cgo=open(closest_gene_output,'rU').readlines()
#     except:
#         print "Error opening file:", sys.exc_info()[0]
#         print "Check to make sure file exists at %s"%(closest_gene_output)
#         raise
#     inds=cgo[0].strip().split('\t')
#     for row in cgo[1:]:
#         arr=row.strip().split('\t')
#         if 'geneSymbol' in inds: #this is true if we used an xref file
#             gene=arr[inds.index('geneSymbol')]        
# #            mid=arr[2]+':'+str(int(arr[3])+(int(arr[4])-int(arr[3]))/2)
#         else: #otherwise we just gene id
#             gene=arr[inds.index('knownGeneID')]
#         #position mapping is different
#         if 'Position' in inds: #this is for GPS
#             mid='chr'+arr[inds.index('Position')]
#         elif 'chrom' in inds: #this is for BED
#             mid=arr[inds.index('chrom')]+':'+str(int(arr[inds.index('chromStart')])+(int(arr[inds.index('chromEnd')])-int(arr[inds.index('chromStart')]))/2)
#         else: #this is for MACS
#             mid=arr[inds.index('chr')]+':'+str(int(arr[inds.index('start')])+(int(arr[inds.index('end')])-int(arr[inds.index('start')]))/2)

        
#         #print gene,mid
#         dist=arr[inds.index('dist from feature')]
#         try:
#             sv=arr[inds.index('score')]
#         except:
#             try:
#                 sv=arr[inds.index('IPvsCTR')]
#             except:
#                 fc=0.0
#         if sv!='':
#             fc=float(sv)
#         else:
#             next
                
#         #check absolute distance if we're doing a window, or negative distance if we're looking upstream
#         if distance_to_tss=='' or (makeWindow and np.absolute(int(dist))<int(distance_to_tss)) or int(dist)>(-1*int(distance_to_tss)):
# #            filtered_events[mid]=gene #(this was out of if clause, should it be there?) 1/2
#             if mid in seq_mids:
#                 event_indexes.append(seq_mids.index(mid))##index into fasta file value/maps to array
                
#                 ##UPDATE: moved these to within if clause - so that unrelated scores are not included
#                 filtered_events[mid]=gene ##gene name of event
#                 filtered_fc[mid]=float(fc) ##fc value of event
# #            filtered_fc[mid]=float(fc) #see above, 2/2

                
  #  print 'Got '+str(len(filtered_events))+' per-gene events within '+distance_to_tss+' bp window out of '+str(len(cgo))

 #   print 'These map to '+str(len(event_indexes))+' regions in the FASTA file'

    ##get gene ids, or just use mid of sequence region
    gene_names=[t for t in set(filtered_events.values())]
    print gene_names[0:10]

    #get gene ids for all matrices list loaded in
    mi_files=motif_ids.split(',')
    if len(mi_files)>0:
        #open first motif name file that contains names for each element in TAMO file
        all_tf_names=[a.strip() for a in open(mi_files[0],'rU').readlines()]
    if len(mi_files)>1:
        #if we have additional files, check to see if names already exist
        for i,f in enumerate(mi_files):
            if i==0:
                next
            try:
                #open file and read in extra ids
                newfs=[a.strip() for a in open(f,'rU').readlines()]
            except:
                print "Error opening file:", sys.exc_info()[0]
                print "Check to make sure file exists at %s"%(f)
                raise
               
            if len(newfs)==len(all_tf_names):
                #combine existing tf names with these with . delimiter....
                all_tf_names=['.'.join((a,b)) for a,b in zip(all_tf_names,newfs)]

    ##now go through and clean up TF names
    cleaned_tf_names=[]
    for i,a in enumerate(all_tf_names):
        tfn=set([b for b in a.split('.') if '$' not in b and b!=''])
        if(len(tfn)==0):
            tfn=a.split('.')
#        else:
#            print 'Replacing %s with %s'%(a,'.'.join(tfn))
        cleaned_tf_names.append('.'.join(tfn))

    all_tf_names=cleaned_tf_names
    #print len(cleaned_tf_names)

    
    ##now actually map events to scores
    ##load motif matrix scanning output that maps matrices to regions
    print 'Loading complete motif score file...'
    event_scores=np.loadtxt(logistic_score_output)
    print '\t...Loaded!'
                      
    #create new tgm matrix with appropriate file name
    newmat=np.zeros((len(all_tf_names),len(gene_names)),dtype='float') ##fill in gene length
    if makeWindow:
        distance_to_tss=distance_to_tss+'_bpWindow'
    else:
        distance_to_tss=distance_to_tss+'_bpUpstream'

    if tgm_file=='': 
        tgm_file=re.sub('.txt','_'+distance_to_tss+'.tgm',os.path.basename(logistic_score_output))
    if do_pkl:
        pkl_file=re.sub('.tgm','.pkl',tgm_file)
    else:
        pkl_file=''
        
    ##sort event indexes from seq_mids that are in the filtered_events file
    event_indexes.sort()
    
    #populate matrix with greatest score attributed to that gene/tf combo
    for ind,arr in enumerate(event_scores):
        ##name of matrix/motif
        mat=all_tf_names[ind]

        #tfnames=[mat]
        ##here we enumerate which sequences were mapped to a gene within the window
        for k,val in enumerate(seq_mids):#k in event_indexes:
            
            #here we want the event midpoint for the index
#            val=seq_mids[k]
            
            #get score for that index
            score=arr[k]
            
            #now map it to closest gene for that midpoint
            cg=filtered_events[val]

            fc=1.0 ##update this if we want to normalize score by fold change
            score=float(score)*float(fc) ##this should do nothing since fc generally = 1

            #if len(tfnames)==1:
            curscore=newmat[all_tf_names.index(mat),gene_names.index(cg)]
            ##updated to include maximum score!!

            if np.abs(score)>np.abs(curscore):
                newmat[all_tf_names.index(mat),gene_names.index(cg)]=score
            #else:
            #    for t in tfnames:
            #        curscore=newmat[all_tf_names.index(t),gene_names.index(cg)]
            #    ##updated to include maximum score!!
            #        if np.abs(float(score))>np.abs(curscore):
            #            newmat[all_tf_names.index(t),gene_names.index(cg)]=float(score)

                
    ###save these intermediate files for debugging purposes
    np.savetxt(tgm_file,newmat)
    gin=re.sub('.tgm','_geneids.txt',tgm_file)
    tin=re.sub('.tgm','_tfids.txt',tgm_file)

    try:
        open(gin,'w').writelines([g+'\n' for g in gene_names])
        open(tin,'w').writelines([t+'\n' for t in all_tf_names])
    except:
        print "Error opening file:", sys.exc_info()[0]
        print "Check to make sure file exists at %s"%(closest_gene_output)
        raise
    
    if pkl_file!='':
        zipcmd='python '+os.path.join(progdir,'zipTgms.py')+' '+tgm_file+' '+tin+' '+gin+' --pkl='+pkl_file
        print 'Compressing matrix file into pkl'
        print zipcmd
        os.system(zipcmd)
        return pkl_file
    else:
        return tgm_file
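
The output file names above are derived with re.sub('.txt', ...) and re.sub('.tgm', ...). Note that the dot in those patterns is unescaped, so it matches any character rather than only a literal period; it works here because 'txt' and 'tgm' only occur as extensions. The snippet below replays that renaming on an invented file name and shows the anchored pattern that would be safer in general:

import os
import re

score_file = '/some/dir/motif_scores.txt'   # hypothetical input path

# Same renaming chain as above (unescaped '.' matches any character)
tgm_file = re.sub('.txt', '_2000_bpWindow.tgm', os.path.basename(score_file))
pkl_file = re.sub('.tgm', '.pkl', tgm_file)
gene_ids = re.sub('.tgm', '_geneids.txt', tgm_file)

print(tgm_file)   # motif_scores_2000_bpWindow.tgm
print(pkl_file)   # motif_scores_2000_bpWindow.pkl
print(gene_ids)   # motif_scores_2000_bpWindow_geneids.txt

# A stricter variant that only rewrites a trailing extension:
print(re.sub(r'\.txt$', '.tgm', 'motif_scores.txt'))  # motif_scores.tgm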

Example 22

Project: courtlistener
Source File: import_law_box.py
View license
def find_duplicates(doc, case_path):
    """Return True if it should be saved, else False"""
    log_print("Running duplicate checks...")

    # 1. Is the item completely outside of the current corpus?
    if not needs_dup_check(doc):
        log_print(
            "  - Not a duplicate: Outside of date range for selected court.")
        return []
    else:
        log_print(
            "  - Could be a duplicate: Inside of date range for selected court.")

    # 2. Can we find any duplicates and information about them?
    stats, candidates = dup_finder.get_dup_stats(doc)
    if len(candidates) == 0:
        log_print("  - Not a duplicate: No candidate matches found.")
        return []
    elif len(candidates) == 1:

        if doc.docket.docket_number and candidates[0].get(
                'docketNumber') is not None:
            # One in the other or vice versa
            if (re.sub("(\D|0)", "", candidates[0]['docketNumber']) in
                    re.sub("(\D|0)", "", doc.docket.docket_number)) or \
                    (re.sub("(\D|0)", "", doc.docket.docket_number) in
                         re.sub("(\D|0)", "", candidates[0]['docketNumber'])):
                log_print(
                    "  - Duplicate found: Only one candidate returned and docket number matches.")
                return [candidates[0]['id']]
            else:
                if doc.docket.court_id == 'cit':
                    # CIT documents have neutral citations in the database. Look that up and compare against that.
                    candidate_doc = Document.objects.get(
                        pk=candidates[0]['id'])
                    if doc.citation.neutral_cite and candidate_doc.citation.neutral_cite:
                        if candidate_doc.neutral_cite in doc.docket.docket_number:
                            log_print(
                                '  - Duplicate found: One candidate from CIT and its neutral citation matches the new document\'s docket number.')
                            return [candidates[0]['id']]
                else:
                    log_print(
                        "  - Not a duplicate: Only one candidate but docket number differs.")
                return []
        else:
            log_print("  - Skipping docket_number dup check.")

        if doc.case_name == candidates[0].get('caseName'):
            log_print(
                "  - Duplicate found: Only one candidate and case name is a perfect match.")
            return [candidates[0]['id']]

        if dup_helpers.case_name_in_candidate(doc.case_name,
                                              candidates[0].get('caseName')):
            log_print(
                "  - Duplicate found: All words in new document's case name are in the candidate's case name (%s)" %
                candidates[0].get('caseName'))
            return [candidates[0]['id']]

    else:
        # More than one candidate.
        if doc.docket.docket_number:
            dups_by_docket_number = dup_helpers.find_same_docket_numbers(doc,
                                                                         candidates)
            if len(dups_by_docket_number) > 1:
                log_print(
                    "  - Duplicates found: %s candidates matched by docket number." % len(
                        dups_by_docket_number))
                return [can['id'] for can in dups_by_docket_number]
            elif len(dups_by_docket_number) == 1:
                log_print(
                    "  - Duplicate found: Multiple candidates returned, but one matched by docket number.")
                return [dups_by_docket_number[0]['id']]
            else:
                log_print(
                    "  - Could be a duplicate: Unable to find good match via docket number.")
        else:
            log_print("  - Skipping docket_number dup check.")

    # 3. Filter out obviously bad cases and then pass remainder forward for manual review.

    filtered_candidates, filtered_stats = dup_helpers.filter_by_stats(
        candidates, stats)
    log_print("  - %s candidates before filtering. With stats: %s" % (
        stats['candidate_count'], stats['cos_sims']))
    log_print("  - %s candidates after filtering. Using filtered stats: %s" % (
        filtered_stats['candidate_count'],
        filtered_stats['cos_sims']))
    if len(filtered_candidates) == 0:
        log_print(
            "  - Not a duplicate: After filtering no good candidates remained.")
        return []
    elif len(filtered_candidates) == 1 and filtered_stats['cos_sims'][
        0] > 0.93:
        log_print(
            "  - Duplicate found: One candidate after filtering and cosine similarity is high (%s)" %
            filtered_stats['cos_sims'][0])
        return [filtered_candidates[0]['id']]
    else:
        duplicates = []
        high_sims_count = len(
            [sim for sim in filtered_stats['cos_sims'] if sim > 0.98])
        low_sims_count = len(
            [sim for sim in filtered_stats['cos_sims'] if sim < 0.95])
        for k in range(0, len(filtered_candidates)):
            if all([(high_sims_count == 1),  # Only one high score
                    (low_sims_count == filtered_stats['candidate_count'] - 1)
                    # All but one have low scores
                    ]):
                # If only one of the items is very high, then we can ignore the others and assume it's right
                if filtered_stats['cos_sims'][k] > 0.98:
                    duplicates.append(filtered_candidates[k]['id'])
                    break
                else:
                    # ignore the others
                    continue
            else:
                # Have to determine by "hand"
                log_print("  %s) Case name: %s" % (k + 1, doc.case_name))
                log_print(
                    "                 %s" % filtered_candidates[k]['caseName'])
                log_print("      Docket nums: %s" % doc.docket.docket_number)
                log_print("                   %s" % filtered_candidates[k].get(
                    'docketNumber', 'None'))
                log_print(
                    "      Cosine Similarity: %s" % filtered_stats['cos_sims'][
                        k])
                log_print("      Candidate URL: file://%s" % case_path)
                log_print("      Match URL: https://www.courtlistener.com%s" %
                          (filtered_candidates[k]['absolute_url']))

                choice = raw_input("Is this a duplicate? [Y/n]: ")
                choice = choice or "y"
                if choice == 'y':
                    duplicates.append(filtered_candidates[k]['id'])

        if len(duplicates) == 0:
            log_print(
                "  - Not a duplicate: Manual determination found no matches.")
            return []
        elif len(duplicates) == 1:
            log_print(
                "  - Duplicate found: Manual determination found one match.")
            return [duplicates[0]]
        elif len(duplicates) > 1:
            log_print(
                "  - Duplicates found: Manual determination found %s matches." % len(
                    duplicates))
            return duplicates
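
The docket-number comparison above normalises both numbers with re.sub("(\D|0)", "", ...) before testing containment in either direction, so punctuation and leading zeros do not matter. A minimal sketch of that test, with invented docket numbers:

import re

def docket_numbers_overlap(a, b):
    # Strip everything that is not a non-zero digit, as in the check above
    a_norm = re.sub(r"(\D|0)", "", a)
    b_norm = re.sub(r"(\D|0)", "", b)
    # One contained in the other counts as a match
    return a_norm in b_norm or b_norm in a_norm

print(docket_numbers_overlap("No. 04-10056", "4-1056"))   # True (both normalise to '4156')
print(docket_numbers_overlap("No. 04-10056", "99-1234"))  # False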

Example 23

Project: agdc
Source File: fc_stacker.py
View license
    def derive_datasets(self, input_dataset_dict, stack_output_info, tile_type_info):
        """ Overrides abstract function in stacker class. Called in Stacker.stack_derived() function. 
        Creates a PQA-masked fractional cover (FC) stack
        
        Arguments:
            fc_dataset_dict: Dict keyed by processing level (e.g. ORTHO, FC, PQA, DEM)
                containing all tile info which can be used within the function
                A sample is shown below (including superfluous band-specific information):
                
{
'FC': {'band_name': 'Visible Blue',
    'band_tag': 'B10',
    'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
    'end_row': 77,
    'level_name': 'FC',
    'nodata_value': -999L,
    'path': 91,
    'satellite_tag': 'LS7',
    'sensor_name': 'ETM+',
    'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
    'start_row': 77,
    'tile_layer': 1,
    'tile_pathname': '/g/data/v10/datacube/EPSG4326_1deg_0.00025pixel/LS7_ETM/150_-025/2000/LS7_ETM_FC_150_-025_2000-02-09T23-46-12.722217.tif',
    'x_index': 150,
    'y_index': -25},
'ORTHO': {'band_name': 'Thermal Infrared (Low Gain)',
     'band_tag': 'B61',
     'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
     'end_row': 77,
     'level_name': 'ORTHO',
     'nodata_value': 0L,
     'path': 91,
     'satellite_tag': 'LS7',
     'sensor_name': 'ETM+',
     'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
     'start_row': 77,
     'tile_layer': 1,
     'tile_pathname': '/g/data/v10/datacube/EPSG4326_1deg_0.00025pixel/LS7_ETM/150_-025/2000/LS7_ETM_ORTHO_150_-025_2000-02-09T23-46-12.722217.tif',
     'x_index': 150,
     'y_index': -25},
'PQA': {'band_name': 'Pixel Quality Assurance',
    'band_tag': 'PQA',
    'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
    'end_row': 77,
    'level_name': 'PQA',
    'nodata_value': None,
    'path': 91,
    'satellite_tag': 'LS7',
    'sensor_name': 'ETM+',
    'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
    'start_row': 77,
    'tile_layer': 1,
    'tile_pathname': '/g/data/v10/datacube/EPSG4326_1deg_0.00025pixel/LS7_ETM/150_-025/2000/LS7_ETM_PQA_150_-025_2000-02-09T23-46-12.722217.tif',
    'x_index': 150,
    'y_index': -25}
}                
                
        Arguments (Cont'd):
            stack_output_info: dict containing stack output information. 
                Obtained from stacker object. 
                A sample is shown below
                
stack_output_info = {'x_index': 144, 
                      'y_index': -36,
                      'stack_output_dir': '/g/data/v10/tmp/ndvi',
                      'start_datetime': None, # Datetime object or None
                      'end_datetime': None, # Datetime object or None 
                      'satellite': None, # String or None 
                      'sensor': None} # String or None 
                      
        Arguments (Cont'd):
            tile_type_info: dict containing tile type information. 
                Obtained from stacker object (e.g: stacker.tile_type_dict[tile_type_id]). 
                A sample is shown below
                
{'crs': 'EPSG:4326',
    'file_extension': '.tif',
    'file_format': 'GTiff',
    'format_options': 'COMPRESS=LZW,BIGTIFF=YES',
    'tile_directory': 'EPSG4326_1deg_0.00025pixel',
    'tile_type_id': 1L,
    'tile_type_name': 'Unprojected WGS84 1-degree at 4000 pixels/degree',
    'unit': 'degree',
    'x_origin': 0.0,
    'x_pixel_size': Decimal('0.00025000000000000000'),
    'x_pixels': 4000L,
    'x_size': 1.0,
    'y_origin': 0.0,
    'y_pixel_size': Decimal('0.00025000000000000000'),
    'y_pixels': 4000L,
    'y_size': 1.0}
                            
        Function must create one or more GDAL-supported output datasets. Useful functions in the
        Stacker class include Stacker.get_pqa_mask(), but it is left to the coder to produce exactly
        what is required for a single slice of the temporal stack of derived quantities.
            
        Returns:
            output_dataset_info: Dict keyed by stack filename
                containing metadata info for GDAL-supported output datasets created by this function.
                Note that the key(s) will be used as the output filename for the VRT temporal stack
                and each dataset created must contain only a single band. An example is as follows:
{'/g/data/v10/tmp/ndvi/NDVI_stack_150_-025.vrt': 
    {'band_name': 'Normalised Differential Vegetation Index with PQA applied',
    'band_tag': 'NDVI',
    'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
    'end_row': 77,
    'level_name': 'NDVI',
    'nodata_value': None,
    'path': 91,
    'satellite_tag': 'LS7',
    'sensor_name': 'ETM+',
    'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
    'start_row': 77,
    'tile_layer': 1,
    'tile_pathname': '/g/data/v10/tmp/ndvi/LS7_ETM_NDVI_150_-025_2000-02-09T23-46-12.722217.tif',
    'x_index': 150,
    'y_index': -25}
}
        """
        assert type(input_dataset_dict) == dict, 'input_dataset_dict must be a dict'
        
        def create_rgb_tif(input_dataset_path, output_dataset_path, pqa_mask=None, rgb_bands=None, 
                           input_no_data_value=-999, output_no_data_value=0,
                           input_range=()):
            if os.path.exists(output_dataset_path):
                logger.info('Output dataset %s already exists - skipping', output_dataset_path)
                return
            
            if not self.lock_object(output_dataset_path):
                logger.info('Output dataset %s already locked - skipping', output_dataset_path)
                return
            
            if not rgb_bands:
                rgb_bands = [3, 1, 2]
                
            scale_factor = 10000.0 / 255.0 # Scale factor to translate from +ve int16 to byte
            
            input_gdal_dataset = gdal.Open(input_dataset_path) 
            assert input_gdal_dataset, 'Unable to open input dataset %s' % (input_dataset_path)
        
            try:
                # Create multi-band dataset for masked data
                logger.debug('output_dataset path = %s', output_dataset_path)
                gdal_driver = gdal.GetDriverByName('GTiff')
                log_multiline(logger.debug, gdal_driver.GetMetadata(), 'gdal_driver.GetMetadata()')
                output_gdal_dataset = gdal_driver.Create(output_dataset_path, 
                    input_gdal_dataset.RasterXSize, input_gdal_dataset.RasterYSize,
                    len(rgb_bands), gdal.GDT_Byte, ['INTERLEAVE=PIXEL']) #['INTERLEAVE=PIXEL','COMPRESS=NONE','BIGTIFF=YES'])
                assert output_gdal_dataset, 'Unable to open input dataset %s' % output_dataset_path
                output_gdal_dataset.SetGeoTransform(input_gdal_dataset.GetGeoTransform())
                output_gdal_dataset.SetProjection(input_gdal_dataset.GetProjection())
                
                dest_band_no = 0
                for source_band_no in rgb_bands:
                    dest_band_no += 1  
                    logger.debug('Processing source band %d, destination band %d', source_band_no, dest_band_no)
                    input_band_array = input_gdal_dataset.GetRasterBand(source_band_no).ReadAsArray()
                    input_gdal_dataset.FlushCache()
                    
                    output_band_array = (input_band_array / scale_factor).astype(numpy.byte)
                    
                    output_band_array[numpy.logical_or((input_band_array < 0), (input_band_array > 10000))] = output_no_data_value # Set any out-of-bounds values to no-data
                    
                    if pqa_mask is not None: # Need to perform masking
                        output_band_array[numpy.logical_or((input_band_array == input_no_data_value), ~pqa_mask)] = output_no_data_value # Apply PQA mask and no-data value
                    else:
                        output_band_array[(input_band_array == input_no_data_value)] = output_no_data_value # Re-apply no-data value
                    
                    output_band = output_gdal_dataset.GetRasterBand(dest_band_no)
                    output_band.SetNoDataValue(output_no_data_value)
                    output_band.WriteArray(output_band_array)
                    output_band.FlushCache()
                    
                output_gdal_dataset.FlushCache()
            finally:
                self.unlock_object(output_dataset_path)



                
        dtype = {'FC_PV' : gdalconst.GDT_Int16,
                 'FC_NPV' : gdalconst.GDT_Int16,
                 'FC_BS' : gdalconst.GDT_Int16}

        no_data_value = {'FC_PV' : -999,
                         'FC_NPV' : -999,
                         'FC_BS' : -999}
    
        log_multiline(logger.debug, input_dataset_dict, 'input_dataset_dict', '\t')    
       
        # Test function to copy ORTHO & FC band datasets with pixel quality mask applied
        # to an output directory for stacking

        output_dataset_dict = {}
        fc_dataset_info = input_dataset_dict['FC'] # Only need FC data for NDVI
        #thermal_dataset_info = input_dataset_dict['ORTHO'] # Could have one or two thermal bands
        
        if fc_dataset_info is None:
            logger.info('FC dataset does not exist')
            return 
        
        fc_dataset_path = fc_dataset_info['tile_pathname']
        
        if input_dataset_dict['PQA'] is None:
            logger.info('PQA dataset for %s does not exist', fc_dataset_path)
            return 
        
        # Get a boolean mask from the PQA dataset (use default parameters for mask and dilation)
        pqa_mask = self.get_pqa_mask(input_dataset_dict['PQA']['tile_pathname']) 
        
        fc_dataset = gdal.Open(fc_dataset_path)
        assert fc_dataset, 'Unable to open dataset %s' % fc_dataset
        
        band_array = None;
        # List of outputs to generate from each file
        output_tag_list = ['FC_PV', 'FC_NPV', 'FC_BS']
        input_band_index = 0
        for output_tag in output_tag_list:
            # TODO: Make the stack file name reflect the date range                    
            output_stack_path = os.path.join(self.output_dir, 
                                             re.sub('\+', '', '%s_%+04d_%+04d' % (output_tag,
                                                                                   stack_output_info['x_index'],
                                                                                    stack_output_info['y_index'])))
                                                                                    
            if stack_output_info['start_datetime']:
                output_stack_path += '_%s' % stack_output_info['start_datetime'].strftime('%Y%m%d')
            if stack_output_info['end_datetime']:
                output_stack_path += '_%s' % stack_output_info['end_datetime'].strftime('%Y%m%d')
                
            output_stack_path += '_pqa_stack.vrt'
            
            output_tile_path = os.path.join(self.output_dir, re.sub('\.\w+$', tile_type_info['file_extension'],
                                                                    re.sub('FC', 
                                                                           output_tag,
                                                                           os.path.basename(fc_dataset_path)
                                                                           )
                                                                   )
                                           )
                
            # Copy metadata for eventual inclusion in stack file output
            # This could also be written to the output tile if required
            output_dataset_info = dict(fc_dataset_info)
            output_dataset_info['tile_pathname'] = output_tile_path # This is the most important modification - used to find tiles to stack
            output_dataset_info['band_name'] = '%s with PQA mask applied' % output_tag
            output_dataset_info['band_tag'] = '%s-PQA' % output_tag
            output_dataset_info['tile_layer'] = 1
            output_dataset_info['nodata_value'] = no_data_value[output_tag]

            # Check for existing, valid file
            if self.refresh or not os.path.exists(output_tile_path):

                if self.lock_object(output_tile_path): # Test for concurrent writes to the same file
                    try:
                        # Read whole fc_dataset into one array. 
                        # 62MB for float32 data should be OK for memory depending on what else happens downstream
                        if band_array is None:
                            band_array = fc_dataset.ReadAsArray()

                            # Re-project issues with PQ. REDO the contiguity layer.
                            non_contiguous = (band_array < 0).any(0)
                            pqa_mask[non_contiguous] = False
                                                
                        gdal_driver = gdal.GetDriverByName(tile_type_info['file_format'])
                        #output_dataset = gdal_driver.Create(output_tile_path, 
                        #                                    fc_dataset.RasterXSize, fc_dataset.RasterYSize,
                        #                                    1, fc_dataset.GetRasterBand(1).DataType,
                        #                                    tile_type_info['format_options'].split(','))
                        output_dataset = gdal_driver.Create(output_tile_path, 
                                                            fc_dataset.RasterXSize, fc_dataset.RasterYSize,
                                                            1, dtype[output_tag],
                                                            tile_type_info['format_options'].split(','))
                        assert output_dataset, 'Unable to open output dataset %s'% output_dataset                                   
                        output_dataset.SetGeoTransform(fc_dataset.GetGeoTransform())
                        output_dataset.SetProjection(fc_dataset.GetProjection()) 
            
                        output_band = output_dataset.GetRasterBand(1)
            
                        # Calculate each output here
                        # Remember band_array indices are zero-based

                        data_array = band_array[input_band_index].copy()
                                            
                        if no_data_value[output_tag]:
                            self.apply_pqa_mask(data_array=data_array, pqa_mask=pqa_mask, no_data_value=no_data_value[output_tag])
                        
                        gdal_driver = gdal.GetDriverByName(tile_type_info['file_format'])
                        #output_dataset = gdal_driver.Create(output_tile_path, 
                        #                                    fc_dataset.RasterXSize, fc_dataset.RasterYSize,
                        #                                    1, fc_dataset.GetRasterBand(1).DataType,
                        #                                    tile_type_info['format_options'].split(','))
                        output_dataset = gdal_driver.Create(output_tile_path, 
                                                            fc_dataset.RasterXSize, fc_dataset.RasterYSize,
                                                            1, dtype[output_tag],
                                                            tile_type_info['format_options'].split(','))
                        assert output_dataset, 'Unable to open output dataset %s' % output_tile_path
                        output_dataset.SetGeoTransform(fc_dataset.GetGeoTransform())
                        output_dataset.SetProjection(fc_dataset.GetProjection()) 
            
                        output_band = output_dataset.GetRasterBand(1)
            
                        output_band.WriteArray(data_array)
                        output_band.SetNoDataValue(output_dataset_info['nodata_value'])
                        output_band.FlushCache()
                        
                        # This is not strictly necessary - copy metadata to output dataset
                        output_dataset_metadata = fc_dataset.GetMetadata()
                        if output_dataset_metadata:
                            output_dataset.SetMetadata(output_dataset_metadata) 
                            log_multiline(logger.debug, output_dataset_metadata, 'output_dataset_metadata', '\t')    
                        
                        output_dataset.FlushCache()
                        logger.info('Finished writing dataset %s', output_tile_path)
                    finally:
                        self.unlock_object(output_tile_path)
                else:
                    logger.info('Skipped locked dataset %s', output_tile_path)
                    sleep(5) #TODO: Find a nicer way of dealing with contention for the same output tile
                    
            else:
                logger.info('Skipped existing dataset %s', output_tile_path)
        
            output_dataset_dict[output_stack_path] = output_dataset_info
            input_band_index += 1
#                    log_multiline(logger.debug, output_dataset_info, 'output_dataset_info', '\t') 
            # End of loop  
 
        fc_rgb_path = os.path.join(self.output_dir, re.sub('\.\w+$', '.tif', # Write to .tif file
                                                                re.sub('^LS\d_[^_]+_', '', # Remove satellite & sensor reference to allow proper sorting by filename
                                                                       re.sub('FC', # Write to FC_RGB file
                                                                              'FC_RGB',
                                                                              os.path.basename(fc_dataset_path)
                                                                              )
                                                                       )
                                                               )
                                       )
                
        logger.info('Creating FC RGB output file %s', fc_rgb_path)
        create_rgb_tif(input_dataset_path=fc_dataset_path, output_dataset_path=fc_rgb_path, pqa_mask=pqa_mask)
        
        log_multiline(logger.debug, output_dataset_dict, 'output_dataset_dict', '\t')    

        # Datasets processed - return info
        return output_dataset_dict
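
The output-path construction in this FC stacker is built from nested re.sub calls: the innermost call swaps the 'FC' processing-level token, the next one strips the satellite & sensor prefix so files sort by date, and the outermost one forces a .tif extension. A minimal standalone sketch of the same chain follows; the sample filename is hypothetical, not taken from the datacube, and a plain 'FC' pattern is only safe here because the tile naming convention keeps that token delimited by underscores.

import os
import re

fc_dataset_path = '/tiles/LS7_ETM_FC_150_-025_2000-02-09.tif'  # hypothetical tile path

name = os.path.basename(fc_dataset_path)
name = re.sub('FC', 'FC_RGB', name)        # swap the processing-level token
name = re.sub(r'^LS\d_[^_]+_', '', name)   # drop the satellite & sensor prefix
name = re.sub(r'\.\w+$', '.tif', name)     # normalise the file extension
print(name)  # -> 'FC_RGB_150_-025_2000-02-09.tif'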

Example 24

Project: agdc
Source File: index_stacker.py
View license
    def derive_datasets(self, input_dataset_dict, stack_output_info, tile_type_info):
        """ Overrides abstract function in stacker class. Called in Stacker.stack_derived() function.
        Creates PQA-masked NDVI stack

        Arguments:
            input_dataset_dict: Dict keyed by processing level (e.g. ORTHO, NBAR, PQA, DEM)
                containing all tile info which can be used within the function
                A sample is shown below (including superfluous band-specific information):

{
'NBAR': {'band_name': 'Visible Blue',
    'band_tag': 'B10',
    'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
    'end_row': 77,
    'level_name': 'NBAR',
    'nodata_value': -999L,
    'path': 91,
    'satellite_tag': 'LS7',
    'sensor_name': 'ETM+',
    'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
    'start_row': 77,
    'tile_layer': 1,
    'tile_pathname': '/g/data/v10/datacube/EPSG4326_1deg_0.00025pixel/LS7_ETM/150_-025/2000/LS7_ETM_NBAR_150_-025_2000-02-09T23-46-12.722217.tif',
    'x_index': 150,
    'y_index': -25},
'ORTHO': {'band_name': 'Thermal Infrared (Low Gain)',
     'band_tag': 'B61',
     'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
     'end_row': 77,
     'level_name': 'ORTHO',
     'nodata_value': 0L,
     'path': 91,
     'satellite_tag': 'LS7',
     'sensor_name': 'ETM+',
     'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
     'start_row': 77,
     'tile_layer': 1,
     'tile_pathname': '/g/data/v10/datacube/EPSG4326_1deg_0.00025pixel/LS7_ETM/150_-025/2000/LS7_ETM_ORTHO_150_-025_2000-02-09T23-46-12.722217.tif',
     'x_index': 150,
     'y_index': -25},
'PQA': {'band_name': 'Pixel Quality Assurance',
    'band_tag': 'PQA',
    'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
    'end_row': 77,
    'level_name': 'PQA',
    'nodata_value': None,
    'path': 91,
    'satellite_tag': 'LS7',
    'sensor_name': 'ETM+',
    'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
    'start_row': 77,
    'tile_layer': 1,
    'tile_pathname': '/g/data/v10/datacube/EPSG4326_1deg_0.00025pixel/LS7_ETM/150_-025/2000/LS7_ETM_PQA_150_-025_2000-02-09T23-46-12.722217.tif',
    'x_index': 150,
    'y_index': -25}
}

        Arguments (Cont'd):
            stack_output_info: dict containing stack output information.
                Obtained from stacker object.
                A sample is shown below

stack_output_info = {'x_index': 144,
                      'y_index': -36,
                      'stack_output_dir': '/g/data/v10/tmp/ndvi',
                      'start_datetime': None, # Datetime object or None
                      'end_datetime': None, # Datetime object or None
                      'satellite': None, # String or None
                      'sensor': None} # String or None

        Arguments (Cont'd):
            tile_type_info: dict containing tile type information.
                Obtained from stacker object (e.g: stacker.tile_type_dict[tile_type_id]).
                A sample is shown below

{'crs': 'EPSG:4326',
    'file_extension': '.tif',
    'file_format': 'GTiff',
    'format_options': 'COMPRESS=LZW,BIGTIFF=YES',
    'tile_directory': 'EPSG4326_1deg_0.00025pixel',
    'tile_type_id': 1L,
    'tile_type_name': 'Unprojected WGS84 1-degree at 4000 pixels/degree',
    'unit': 'degree',
    'x_origin': 0.0,
    'x_pixel_size': Decimal('0.00025000000000000000'),
    'x_pixels': 4000L,
    'x_size': 1.0,
    'y_origin': 0.0,
    'y_pixel_size': Decimal('0.00025000000000000000'),
    'y_pixels': 4000L,
    'y_size': 1.0}

        Function must create one or more GDAL-supported output datasets. Useful functions in the
        Stacker class include Stacker.get_pqa_mask(), but it is left to the coder to produce exactly
        what is required for a single slice of the temporal stack of derived quantities.

        Returns:
            output_dataset_info: Dict keyed by stack filename
                containing metadata info for GDAL-supported output datasets created by this function.
                Note that the key(s) will be used as the output filename for the VRT temporal stack
                and each dataset created must contain only a single band. An example is as follows:
{'/g/data/v10/tmp/ndvi/NDVI_stack_150_-025.vrt':
    {'band_name': 'Normalised Differential Vegetation Index with PQA applied',
    'band_tag': 'NDVI',
    'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
    'end_row': 77,
    'level_name': 'NDVI',
    'nodata_value': None,
    'path': 91,
    'satellite_tag': 'LS7',
    'sensor_name': 'ETM+',
    'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
    'start_row': 77,
    'tile_layer': 1,
    'tile_pathname': '/g/data/v10/tmp/ndvi/LS7_ETM_NDVI_150_-025_2000-02-09T23-46-12.722217.tif',
    'x_index': 150,
    'y_index': -25}
}
        """
        assert type(input_dataset_dict) == dict, 'input_dataset_dict must be a dict'

        dtype = gdalconst.GDT_Float32 # All output is to be float32
        no_data_value = numpy.nan

        log_multiline(logger.debug, input_dataset_dict, 'input_dataset_dict', '\t')

        # Test function to copy ORTHO & NBAR band datasets with pixel quality mask applied
        # to an output directory for stacking

        output_dataset_dict = {}
        nbar_dataset_info = input_dataset_dict.get('NBAR') # Only need NBAR data for NDVI
        #thermal_dataset_info = input_dataset_dict['ORTHO'] # Could have one or two thermal bands

        # Need to skip tiles which don't have an NBAR tile (i.e. for non-mosaiced FC tiles at W & E sides of test area)
        if nbar_dataset_info is None:
            logger.warning('NBAR tile does not exist')
            return None

        # Nasty work-around for bad PQA due to missing thermal bands for LS8-OLI
        if nbar_dataset_info['satellite_tag'] == 'LS8' and nbar_dataset_info['sensor_name'] == 'OLI':
            logger.debug('Work-around for LS8-OLI PQA issue applied: TILE SKIPPED')
            return None

        # Instantiate band lookup object with all required lookup parameters
        lookup = BandLookup(data_cube=self,
                            lookup_scheme_name='LANDSAT-LS5/7',
                            tile_type_id=tile_type_info['tile_type_id'],
                            satellite_tag=nbar_dataset_info['satellite_tag'],
                            sensor_name=nbar_dataset_info['sensor_name'],
                            level_name=nbar_dataset_info['level_name']
                            )

        nbar_dataset_path = nbar_dataset_info['tile_pathname']
        
        if input_dataset_dict.get('PQA') is None: # No PQA tile available
            return

        # Get a boolean mask from the PQA dataset (use default parameters for mask and dilation)
        pqa_mask = self.get_pqa_mask(input_dataset_dict['PQA']['tile_pathname'])

        log_multiline(logger.debug, pqa_mask, 'pqa_mask', '\t')

        nbar_dataset = gdal.Open(nbar_dataset_path)
        assert nbar_dataset, 'Unable to open dataset %s' % nbar_dataset_path

        band_array = None
        # List of outputs to generate from each file
        output_tag_list = ['B', 'G', 'R', 'NIR', 'SWIR1', 'SWIR2',
                           'NDVI', 'EVI', 'NDSI', 'NDMI', 'SLAVI', 'SATVI']
        for output_tag in sorted(output_tag_list):
            # TODO: Make the stack file name reflect the date range
            output_stack_path = os.path.join(self.output_dir,
                                             re.sub('\+', '', '%s_%+04d_%+04d' % (output_tag,
                                                                                   stack_output_info['x_index'],
                                                                                    stack_output_info['y_index'])))

            if stack_output_info['start_datetime']:
                output_stack_path += '_%s' % stack_output_info['start_datetime'].strftime('%Y%m%d')
            if stack_output_info['end_datetime']:
                output_stack_path += '_%s' % stack_output_info['end_datetime'].strftime('%Y%m%d')

            output_stack_path += '_pqa_stack.vrt'

            output_tile_path = os.path.join(self.output_dir, re.sub('\.\w+$', tile_type_info['file_extension'],
                                                                    re.sub('NBAR',
                                                                           output_tag,
                                                                           os.path.basename(nbar_dataset_path)
                                                                           )
                                                                   )
                                           )

            # Copy metadata for eventual inclusion in stack file output
            # This could also be written to the output tile if required
            output_dataset_info = dict(nbar_dataset_info)
            output_dataset_info['tile_pathname'] = output_tile_path # This is the most important modification - used to find tiles to stack
            output_dataset_info['band_name'] = '%s with PQA mask applied' % output_tag
            output_dataset_info['band_tag'] = '%s-PQA' % output_tag
            output_dataset_info['tile_layer'] = 1
            output_dataset_info['nodata_value'] = no_data_value

            # Check for existing, valid file
            if self.refresh or not os.path.exists(output_tile_path):

                if self.lock_object(output_tile_path): # Test for concurrent writes to the same file
                    try:
                        # Read whole nbar_dataset into one array.
                        # 62MB for float32 data should be OK for memory depending on what else happens downstream
                        if band_array is None:
                            # Convert to float32 for arithmetic and scale back to 0~1 reflectance
                            band_array = (nbar_dataset.ReadAsArray().astype(numpy.float32)) / SCALE_FACTOR
                            
                            log_multiline(logger.debug, band_array, 'band_array', '\t')
                            
                            # Adjust bands if required
                            for band_tag in lookup.bands:
                                if lookup.adjustment_multiplier[band_tag] != 1.0 or lookup.adjustment_offset[band_tag] != 0.0:
                                    logger.debug('Band values adjusted: %s = %s * %s + %s', 
                                                 band_tag, band_tag, lookup.adjustment_multiplier[band_tag], lookup.adjustment_offset[band_tag])
                                    band_array[lookup.band_index[band_tag]] = band_array[lookup.band_index[band_tag]] * lookup.adjustment_multiplier[band_tag] + lookup.adjustment_offset[band_tag]
                            log_multiline(logger.debug, band_array, 'adjusted band_array', '\t')
                            
                            # Re-project issues with PQ. REDO the contiguity layer.
                            non_contiguous = (band_array < 0).any(0)
                            pqa_mask[non_contiguous] = False

                            log_multiline(logger.debug, pqa_mask, 'enhanced pqa_mask', '\t')

                        gdal_driver = gdal.GetDriverByName(tile_type_info['file_format'])
                        #output_dataset = gdal_driver.Create(output_tile_path,
                        #                                    nbar_dataset.RasterXSize, nbar_dataset.RasterYSize,
                        #                                    1, nbar_dataset.GetRasterBand(1).DataType,
                        #                                    tile_type_info['format_options'].split(','))
                        output_dataset = gdal_driver.Create(output_tile_path,
                                                            nbar_dataset.RasterXSize, nbar_dataset.RasterYSize,
                                                            1, dtype,
                                                            tile_type_info['format_options'].split(','))
                        assert output_dataset, 'Unable to open output dataset %s' % output_tile_path
                        output_dataset.SetGeoTransform(nbar_dataset.GetGeoTransform())
                        output_dataset.SetProjection(nbar_dataset.GetProjection())

                        output_band = output_dataset.GetRasterBand(1)

                        # Calculate each output here
                        # Remember band_array indices are zero-based

                        if output_tag in lookup.bands: # One of the band tags
                            # Copy values
                            data_array = band_array[lookup.band_index[output_tag]].copy()
                        elif output_tag == 'NDVI':
                            data_array = numexpr.evaluate("((NIR_array - R_array) / (NIR_array + R_array)) + 1", 
                                                          {'NIR_array': band_array[lookup.band_index['NIR']], 
                                                           'R_array': band_array[lookup.band_index['R']]
                                                           })
                        elif output_tag == 'EVI':
                            data_array = numexpr.evaluate("(2.5 * ((NIR_array - R_array) / (NIR_array + (6 * R_array) - (7.5 * B_array) + 1))) + 1", 
                                                          {'NIR_array': band_array[lookup.band_index['NIR']], 
                                                           'R_array':band_array[lookup.band_index['R']], 
                                                           'B_array':band_array[lookup.band_index['B']]
                                                           })
                        elif output_tag == 'NDSI':
                            data_array = numexpr.evaluate("((R_array - SWIR1_array) / (R_array + SWIR1_array)) + 1", 
                                                          {'SWIR1_array': band_array[lookup.band_index['SWIR1']], 
                                                           'R_array': band_array[lookup.band_index['R']]
                                                           })
                        elif output_tag == 'NDMI':
                            data_array = numexpr.evaluate("((NIR_array - SWIR1_array) / (NIR_array + SWIR1_array)) + 1", 
                                                          {'SWIR1_array': band_array[lookup.band_index['SWIR1']], 
                                                           'NIR_array': band_array[lookup.band_index['NIR']]
                                                           })
                        elif output_tag == 'SLAVI':
                            data_array = numexpr.evaluate("NIR_array / (R_array + SWIR1_array)", 
                                                          {'SWIR1_array': band_array[lookup.band_index['SWIR1']], 
                                                           'NIR_array': band_array[lookup.band_index['NIR']], 
                                                           'R_array': band_array[lookup.band_index['R']]
                                                           })
                        elif output_tag == 'SATVI':
                            data_array = numexpr.evaluate("(((SWIR1_array - R_array) / (SWIR1_array + R_array + 0.5)) * 1.5 - (SWIR2_array / 2)) + 1", 
                                                          {'SWIR1_array': band_array[lookup.band_index['SWIR1']], 
                                                           'SWIR2_array':band_array[lookup.band_index['SWIR2']], 
                                                           'R_array':band_array[lookup.band_index['R']]
                                                           })
                        else:
                            raise Exception('Invalid operation')

                        log_multiline(logger.debug, data_array, 'data_array', '\t')
                        
                        if no_data_value:
                            self.apply_pqa_mask(data_array=data_array, pqa_mask=pqa_mask, no_data_value=no_data_value)

                        log_multiline(logger.debug, data_array, 'masked data_array', '\t')
                        
                        gdal_driver = gdal.GetDriverByName(tile_type_info['file_format'])
                        #output_dataset = gdal_driver.Create(output_tile_path,
                        #                                    nbar_dataset.RasterXSize, nbar_dataset.RasterYSize,
                        #                                    1, nbar_dataset.GetRasterBand(1).DataType,
                        #                                    tile_type_info['format_options'].split(','))
                        output_dataset = gdal_driver.Create(output_tile_path,
                                                            nbar_dataset.RasterXSize, nbar_dataset.RasterYSize,
                                                            1, dtype,
                                                            tile_type_info['format_options'].split(','))
                        assert output_dataset, 'Unable to open output dataset %s' % output_tile_path
                        output_dataset.SetGeoTransform(nbar_dataset.GetGeoTransform())
                        output_dataset.SetProjection(nbar_dataset.GetProjection())

                        output_band = output_dataset.GetRasterBand(1)

                        output_band.WriteArray(data_array)
                        output_band.SetNoDataValue(output_dataset_info['nodata_value'])
                        output_band.FlushCache()

                        # This is not strictly necessary - copy metadata to output dataset
                        output_dataset_metadata = nbar_dataset.GetMetadata()
                        if output_dataset_metadata:
                            output_dataset.SetMetadata(output_dataset_metadata)
                            log_multiline(logger.debug, output_dataset_metadata, 'output_dataset_metadata', '\t')

                        output_dataset.FlushCache()
                        logger.info('Finished writing dataset %s', output_tile_path)
                    finally:
                        self.unlock_object(output_tile_path)
                else:
                    logger.info('Skipped locked dataset %s', output_tile_path)
                    sleep(5) #TODO: Find a nicer way of dealing with contention for the same output tile

            else:
                logger.info('Skipped existing dataset %s', output_tile_path)

            output_dataset_dict[output_stack_path] = output_dataset_info
#                    log_multiline(logger.debug, output_dataset_info, 'output_dataset_info', '\t')

        log_multiline(logger.debug, output_dataset_dict, 'output_dataset_dict', '\t')
        # NDVI dataset processed - return info
        return output_dataset_dict
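
One small re.sub idiom in the stack-path construction above is easy to miss: the tile coordinates are formatted with '%+04d' so that negative indices keep their sign and zero padding, and re.sub then strips the redundant '+' from positive values. A standalone sketch with illustrative values:

import re

output_tag, x_index, y_index = 'NDVI', 150, -25  # illustrative values only
stack_name = re.sub(r'\+', '', '%s_%+04d_%+04d' % (output_tag, x_index, y_index))
print(stack_name)  # '+150' loses its sign, '-025' keeps it -> 'NDVI_150_-025'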

Example 25

Project: agdc
Source File: season_stacker.py
View license
    def derive_datasets(self, input_dataset_dict, stack_output_info, tile_type_info):
        """ Overrides abstract function in stacker class. Called in Stacker.stack_derived() function. 
        Creates PQA-masked NDVI stack
        
        Arguments:
            input_dataset_dict: Dict keyed by processing level (e.g. ORTHO, NBAR, PQA, DEM)
                containing all tile info which can be used within the function
                A sample is shown below (including superfluous band-specific information):
                
{
'NBAR': {'band_name': 'Visible Blue',
    'band_tag': 'B10',
    'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
    'end_row': 77,
    'level_name': 'NBAR',
    'nodata_value': -999L,
    'path': 91,
    'satellite_tag': 'LS7',
    'sensor_name': 'ETM+',
    'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
    'start_row': 77,
    'tile_layer': 1,
    'tile_pathname': '/g/data/v10/datacube/EPSG4326_1deg_0.00025pixel/LS7_ETM/150_-025/2000/LS7_ETM_NBAR_150_-025_2000-02-09T23-46-12.722217.tif',
    'x_index': 150,
    'y_index': -25},
'ORTHO': {'band_name': 'Thermal Infrared (Low Gain)',
     'band_tag': 'B61',
     'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
     'end_row': 77,
     'level_name': 'ORTHO',
     'nodata_value': 0L,
     'path': 91,
     'satellite_tag': 'LS7',
     'sensor_name': 'ETM+',
     'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
     'start_row': 77,
     'tile_layer': 1,
     'tile_pathname': '/g/data/v10/datacube/EPSG4326_1deg_0.00025pixel/LS7_ETM/150_-025/2000/LS7_ETM_ORTHO_150_-025_2000-02-09T23-46-12.722217.tif',
     'x_index': 150,
     'y_index': -25},
'PQA': {'band_name': 'Pixel Quality Assurance',
    'band_tag': 'PQA',
    'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
    'end_row': 77,
    'level_name': 'PQA',
    'nodata_value': None,
    'path': 91,
    'satellite_tag': 'LS7',
    'sensor_name': 'ETM+',
    'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
    'start_row': 77,
    'tile_layer': 1,
    'tile_pathname': '/g/data/v10/datacube/EPSG4326_1deg_0.00025pixel/LS7_ETM/150_-025/2000/LS7_ETM_PQA_150_-025_2000-02-09T23-46-12.722217.tif',
    'x_index': 150,
    'y_index': -25}
}                
                
        Arguments (Cont'd):
            stack_output_info: dict containing stack output information. 
                Obtained from stacker object. 
                A sample is shown below
                
stack_output_info = {'x_index': 144, 
                      'y_index': -36,
                      'stack_output_dir': '/g/data/v10/tmp/ndvi',
                      'start_datetime': None, # Datetime object or None
                      'end_datetime': None, # Datetime object or None 
                      'satellite': None, # String or None 
                      'sensor': None} # String or None 
                      
        Arguments (Cont'd):
            tile_type_info: dict containing tile type information. 
                Obtained from stacker object (e.g: stacker.tile_type_dict[tile_type_id]). 
                A sample is shown below
                
{'crs': 'EPSG:4326',
    'file_extension': '.tif',
    'file_format': 'GTiff',
    'format_options': 'COMPRESS=LZW,BIGTIFF=YES',
    'tile_directory': 'EPSG4326_1deg_0.00025pixel',
    'tile_type_id': 1L,
    'tile_type_name': 'Unprojected WGS84 1-degree at 4000 pixels/degree',
    'unit': 'degree',
    'x_origin': 0.0,
    'x_pixel_size': Decimal('0.00025000000000000000'),
    'x_pixels': 4000L,
    'x_size': 1.0,
    'y_origin': 0.0,
    'y_pixel_size': Decimal('0.00025000000000000000'),
    'y_pixels': 4000L,
    'y_size': 1.0}
                            
        Function must create one or more GDAL-supported output datasets. Useful functions in the
        Stacker class include Stacker.get_pqa_mask(), but it is left to the coder to produce exactly
        what is required for a single slice of the temporal stack of derived quantities.
            
        Returns:
            output_dataset_info: Dict keyed by stack filename
                containing metadata info for GDAL-supported output datasets created by this function.
                Note that the key(s) will be used as the output filename for the VRT temporal stack
                and each dataset created must contain only a single band. An example is as follows:
{'/g/data/v10/tmp/ndvi/NDVI_stack_150_-025.vrt': 
    {'band_name': 'Normalised Differential Vegetation Index with PQA applied',
    'band_tag': 'NDVI',
    'end_datetime': datetime.datetime(2000, 2, 9, 23, 46, 36, 722217),
    'end_row': 77,
    'level_name': 'NDVI',
    'nodata_value': None,
    'path': 91,
    'satellite_tag': 'LS7',
    'sensor_name': 'ETM+',
    'start_datetime': datetime.datetime(2000, 2, 9, 23, 46, 12, 722217),
    'start_row': 77,
    'tile_layer': 1,
    'tile_pathname': '/g/data/v10/tmp/ndvi/LS7_ETM_NDVI_150_-025_2000-02-09T23-46-12.722217.tif',
    'x_index': 150,
    'y_index': -25}
}
        """
        assert type(input_dataset_dict) == dict, 'input_dataset_dict must be a dict'
                
        dtype = {'B10' : gdalconst.GDT_Float32,
                 'B20' : gdalconst.GDT_Float32,
                 'B30' : gdalconst.GDT_Float32,
                 'B40' : gdalconst.GDT_Float32,
                 'B50' : gdalconst.GDT_Float32,
                 'B70' : gdalconst.GDT_Float32,
                 'NDVI' : gdalconst.GDT_Float32,
                 'EVI' : gdalconst.GDT_Float32,
                 'NDSI' : gdalconst.GDT_Float32,
                 'NDMI' : gdalconst.GDT_Float32,
                 'SLAVI' : gdalconst.GDT_Float32,
                 'SATVI' : gdalconst.GDT_Float32,
                 'WATER' : gdalconst.GDT_Int16}

        no_data_value = {'B10' : numpy.nan,
                 'B20' : numpy.nan,
                 'B30' : numpy.nan,
                 'B40' : numpy.nan,
                 'B50' : numpy.nan,
                 'B70' : numpy.nan,
                 'NDVI' : numpy.nan,
                 'EVI' : numpy.nan,
                 'NDSI' : numpy.nan,
                 'NDMI' : numpy.nan,
                 'SLAVI' : numpy.nan,
                 'SATVI' : numpy.nan,
                 'WATER' : -1}
    
        log_multiline(logger.debug, input_dataset_dict, 'input_dataset_dict', '\t')
       
        # Test function to copy ORTHO & NBAR band datasets with pixel quality mask applied
        # to an output directory for stacking

        output_dataset_dict = {}
        nbar_dataset_info = input_dataset_dict['NBAR'] # Only need NBAR data for NDVI
        #thermal_dataset_info = input_dataset_dict['ORTHO'] # Could have one or two thermal bands
        
        nbar_dataset_path = nbar_dataset_info['tile_pathname']
        
        # Get a boolean mask from the PQA dataset (use default parameters for mask and dilation)
        pqa_mask = self.get_pqa_mask(input_dataset_dict['PQA']['tile_pathname']) 
        
        nbar_dataset = gdal.Open(nbar_dataset_path)
        assert nbar_dataset, 'Unable to open NBAR dataset %s' % nbar_dataset_path
        
        band_array = None
        # List of outputs to generate from each file
        output_tag_list = ['B10', 'B20', 'B30', 'B40', 'B50', 'B70', 
                           'NDVI', 'EVI', 'NDSI', 'NDMI', 'SLAVI', 'SATVI']
        for output_tag in sorted(output_tag_list):
            # TODO: Make the stack file name reflect the date range                    
            output_stack_path = os.path.join(self.output_dir, 
                                             re.sub('\+', '', '%s_%+04d_%+04d' % (output_tag,
                                                                                   stack_output_info['x_index'],
                                                                                    stack_output_info['y_index'])))
                                                                                    
            if stack_output_info['start_datetime']:
                output_stack_path += '_%s' % stack_output_info['start_datetime'].strftime('%m%d')
            if stack_output_info['end_datetime']:
                output_stack_path += '_%s' % stack_output_info['end_datetime'].strftime('%m%d')
                
            output_stack_path += '_pqa_stack.vrt'
            
            output_tile_path = os.path.join(self.output_dir, re.sub('\.\w+$', tile_type_info['file_extension'],
                                                                    re.sub('NBAR', 
                                                                           output_tag,
                                                                           os.path.basename(nbar_dataset_path)
                                                                           )
                                                                   )
                                           )
                
            # Copy metadata for eventual inclusion in stack file output
            # This could also be written to the output tile if required
            output_dataset_info = dict(nbar_dataset_info)
            output_dataset_info['tile_pathname'] = output_tile_path # This is the most important modification - used to find tiles to stack
            output_dataset_info['band_name'] = '%s with PQA mask applied' % output_tag
            output_dataset_info['band_tag'] = '%s-PQA' % output_tag
            output_dataset_info['tile_layer'] = 1
            output_dataset_info['nodata_value'] = no_data_value[output_tag]

            # Check for existing, valid file
            if self.refresh or not os.path.exists(output_tile_path):

                if self.lock_object(output_tile_path): # Test for concurrent writes to the same file
                    try:
                        # Read whole nbar_dataset into one array. 
                        # 62MB for float32 data should be OK for memory depending on what else happens downstream
                        if band_array is None:
                            # Convert to float32 for arithmetic and scale back to 0~1 reflectance
                            band_array = (nbar_dataset.ReadAsArray().astype(numpy.float32)) / SCALE_FACTOR

                            # Re-project issues with PQ. REDO the contiguity layer.
                            non_contiguous = (band_array < 0).any(0)
                            pqa_mask[non_contiguous] = False
                    
                        gdal_driver = gdal.GetDriverByName(tile_type_info['file_format'])
                        #output_dataset = gdal_driver.Create(output_tile_path, 
                        #                                    nbar_dataset.RasterXSize, nbar_dataset.RasterYSize,
                        #                                    1, nbar_dataset.GetRasterBand(1).DataType,
                        #                                    tile_type_info['format_options'].split(','))
                        output_dataset = gdal_driver.Create(output_tile_path, 
                                                            nbar_dataset.RasterXSize, nbar_dataset.RasterYSize,
                                                            1, dtype[output_tag],
                                                            tile_type_info['format_options'].split(','))
                        logger.debug('gdal_driver.Create(%s, %s, %s, %s, %s, %s)',
                                                            output_tile_path, 
                                                            nbar_dataset.RasterXSize, nbar_dataset.RasterYSize,
                                                            1, dtype[output_tag],
                                                            tile_type_info['format_options'].split(','))
                        assert output_dataset, 'Unable to open output dataset %s' % output_tile_path                                   
                        output_dataset.SetGeoTransform(nbar_dataset.GetGeoTransform())
                        output_dataset.SetProjection(nbar_dataset.GetProjection()) 
            
                        output_band = output_dataset.GetRasterBand(1)
            
                        # Calculate each output here
                        # Remember band_array indices are zero-based

                        if output_tag[0] == 'B': # One of the band tags
                            band_file_no = int(output_tag[1:])
                            # Look up tile_layer (i.e. band number) for specified spectral band in tile dataset
                            tile_layer = self.bands[tile_type_info['tile_type_id']][(nbar_dataset_info['satellite_tag'], nbar_dataset_info['sensor_name'])][band_file_no]['tile_layer']
                            # Copy values 
                            data_array = band_array[tile_layer - 1].copy()
                        elif output_tag == 'NDVI':
                            data_array = numexpr.evaluate("((b4 - b3) / (b4 + b3)) + 1", {'b4':band_array[3], 'b3':band_array[2]})
                        elif output_tag == 'EVI':
                            data_array = numexpr.evaluate("(2.5 * ((b4 - b3) / (b4 + (6 * b3) - (7.5 * b1) + 1))) + 1", {'b4':band_array[3], 'b3':band_array[2], 'b1':band_array[0]})
                        elif output_tag == 'NDSI':   
                            data_array = numexpr.evaluate("((b3 - b5) / (b3 + b5)) + 1", {'b5':band_array[4], 'b3':band_array[2]})
                        elif output_tag == 'NDMI':
                            data_array = numexpr.evaluate("((b4 - b5) / (b4 + b5)) + 1", {'b5':band_array[4], 'b4':band_array[3]})
                        elif output_tag == 'SLAVI':
                            data_array = numexpr.evaluate("b4 / (b3 + b5)", {'b5':band_array[4], 'b4':band_array[3], 'b3':band_array[2]})
                        elif output_tag == 'SATVI':
                            data_array = numexpr.evaluate("(((b5 - b3) / (b5 + b3 + 0.5)) * 1.5 - (b7 / 2)) + 1", {'b5':band_array[4], 'b7':band_array[5], 'b3':band_array[2]})
                        elif output_tag == 'WATER':
                            data_array = numpy.zeros(band_array[0].shape, dtype=numpy.int16)
                            #TODO: Call water analysis code here
                        else:
                            raise Exception('Invalid operation')
                                            
                        if no_data_value[output_tag]:
                            self.apply_pqa_mask(data_array=data_array, pqa_mask=pqa_mask, no_data_value=no_data_value[output_tag])
                        
                        gdal_driver = gdal.GetDriverByName(tile_type_info['file_format'])
                        #output_dataset = gdal_driver.Create(output_tile_path, 
                        #                                    nbar_dataset.RasterXSize, nbar_dataset.RasterYSize,
                        #                                    1, nbar_dataset.GetRasterBand(1).DataType,
                        #                                    tile_type_info['format_options'].split(','))
                        output_dataset = gdal_driver.Create(output_tile_path, 
                                                            nbar_dataset.RasterXSize, nbar_dataset.RasterYSize,
                                                            1, dtype[output_tag],
                                                            tile_type_info['format_options'].split(','))
                        assert output_dataset, 'Unable to open output dataset %s' % output_tile_path
                        output_dataset.SetGeoTransform(nbar_dataset.GetGeoTransform())
                        output_dataset.SetProjection(nbar_dataset.GetProjection()) 
            
                        output_band = output_dataset.GetRasterBand(1)
            
                        output_band.WriteArray(data_array)
                        output_band.SetNoDataValue(output_dataset_info['nodata_value'])
                        output_band.FlushCache()
                        
                        # This is not strictly necessary - copy metadata to output dataset
                        output_dataset_metadata = nbar_dataset.GetMetadata()
                        if output_dataset_metadata:
                            output_dataset.SetMetadata(output_dataset_metadata) 
                            log_multiline(logger.debug, output_dataset_metadata, 'output_dataset_metadata', '\t')    
                        
                        output_dataset.FlushCache()
                        logger.info('Finished writing dataset %s', output_tile_path)
                    finally:
                        self.unlock_object(output_tile_path)
                else:
                    logger.info('Skipped locked dataset %s', output_tile_path)
                    sleep(5) #TODO: Find a nicer way of dealing with contention for the same output tile
                    
            else:
                logger.info('Skipped existing dataset %s', output_tile_path)
        
            output_dataset_dict[output_stack_path] = output_dataset_info
#                    log_multiline(logger.debug, output_dataset_info, 'output_dataset_info', '\t')    

        log_multiline(logger.debug, output_dataset_dict, 'output_dataset_dict', '\t')    
        # NDVI dataset processed - return info
        return output_dataset_dict
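
season_stacker.py repeats the same two-step re.sub pattern as the previous examples: replace the 'NBAR' level token with the output tag, then replace the extension with tile_type_info['file_extension']. A hedged sketch of that idiom factored into a helper; the function name and sample values are illustrative only, not part of the agdc code, and since the first argument of re.sub is a regex the level token should contain no metacharacters (or be passed through re.escape).

import os
import re

def derive_tile_name(src_path, level_token, output_tag, extension):
    """Swap the processing-level token and the file extension in a tile filename."""
    name = os.path.basename(src_path)
    name = re.sub(level_token, output_tag, name)  # e.g. 'NBAR' -> 'NDVI'
    return re.sub(r'\.\w+$', extension, name)     # e.g. '.tif' -> '.tif' or '.vrt'

print(derive_tile_name('/tiles/LS7_ETM_NBAR_150_-025.tif', 'NBAR', 'NDVI', '.tif'))
# -> 'LS7_ETM_NDVI_150_-025.tif'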

Example 26

Project: dash-hack
Source File: check_docs2.py
View license
def process_module(module, path):
    hppparser = hp.CppHeaderParser()
    rstparser = rp.RstParser(hppparser)

    rstparser.parse(module, path)
    rst = rstparser.definitions

    hdrlist = []
    for root, dirs, files in os.walk(os.path.join(path, "include")):
        for filename in fnmatch.filter(files, "*.h*"):
            hdrlist.append(os.path.join(root, filename))

    if module == "gpu":
        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "cuda_devptrs.hpp"))
        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpumat.hpp"))

    decls = []
    for hname in hdrlist:
        if not "ts_gtest.h" in hname:
            decls += hppparser.parse(hname, wmode=False)

    funcs = []
    # not really needed to hardcode all the namespaces; normally they are all collected automatically
    namespaces = ['cv', 'cv.gpu', 'cvflann', 'cvflann.anyimpl', 'cvflann.lsh', 'cv.flann', 'cv.linemod', 'cv.detail', 'cvtest', 'perf', 'cv.videostab']
    classes = []
    structs = []

    # collect namespaces and classes/structs
    for decl in decls:
        if decl[0].startswith("const"):
            pass
        elif decl[0].startswith("class") or decl[0].startswith("struct"):
            if decl[0][0] == 'c':
                classes.append(decl)
            else:
                structs.append(decl)
            dotIdx = decl[0].rfind('.')
            if dotIdx > 0:
                namespace = decl[0][decl[0].find(' ')+1:dotIdx]
                if not [c for c in classes if c[0].endswith(namespace)] and not [s for s in structs if s[0].endswith(namespace)]:
                    if namespace not in namespaces:
                        namespaces.append(namespace)
        else:
            funcs.append(decl)

    clsnamespaces = []
    # process classes
    for cl in classes:
        name = cl[0][cl[0].find(' ')+1:]
        if name.find('.') < 0 and not name.startswith("Cv"):
            logerror(ERROR_004_MISSEDNAMESPACE, "class " + name + " from opencv_" + module + " is placed in global namespace but violates C-style naming convention")
        clsnamespaces.append(name)
        if do_python_crosscheck and not name.startswith("cv.") and name.startswith("Cv"):
            clsnamespaces.append("cv." + name[2:])
        if name.startswith("cv."):
            name = name[3:]
        name = name.replace(".", "::")
        sns = synonims.get(name, [])
        sns.append(name)
        for name in sns:
            doc = rst.get(name)
            if not doc:
                #TODO: class is not documented
                continue
            doc[DOCUMENTED_MARKER] = True
            # verify class marker
            if not doc.get("isclass"):
                logerror(ERROR_001_NOTACLASS, "class " + name + " is not marked as \"class\" in documentation", doc)
            else:
                # verify base
                signature = doc.get("class", "")
                signature = signature.replace(" public ", " ")
                namespaceIdx = signature.rfind("::")

                signature = ("class " + signature).strip()
                hdrsignature = ("class " + name + " " +  cl[1]).replace(".", "::").replace("cv::","").strip()
                if signature != hdrsignature:
                    logerror(ERROR_003_INCORRECTBASE, "invalid base class documentation\ndocumented: " + signature + "\nactual:     " + hdrsignature, doc)

    # process structs
    for st in structs:
        name = st[0][st[0].find(' ')+1:]
        if name.find('.') < 0 and not name.startswith("Cv"):
            logerror(ERROR_004_MISSEDNAMESPACE, "struct " + name + " from opencv_" + module + " is placed in global namespace but violates C-style naming convention")
        clsnamespaces.append(name)
        if name.startswith("cv."):
            name = name[3:]
        name = name.replace(".", "::")
        doc = rst.get(name)
        if not doc:
            #TODO: struct is not documented
            continue
        doc[DOCUMENTED_MARKER] = True
        # verify struct marker
        if not doc.get("isstruct"):
            logerror(ERROR_002_NOTASTRUCT, "struct " + name + " is not marked as \"struct\" in documentation", doc)
        else:
            # verify base
            signature = doc.get("class", "")
            signature = signature.replace(", public ", " ").replace(" public ", " ")
            signature = signature.replace(", protected ", " ").replace(" protected ", " ")
            signature = signature.replace(", private ", " ").replace(" private ", " ")
            signature = ("struct " + signature).strip()
            hdrsignature = (st[0] + " " +  st[1]).replace("struct cv.", "struct ").replace(".", "::").strip()
            if signature != hdrsignature:
                logerror(ERROR_003_INCORRECTBASE, "invalid base struct documentation\ndocumented: " + signature + "\nactual:     " + hdrsignature, doc)
                print st, doc

    # process functions and methods
    flookup = {}
    for fn in funcs:
        name = fn[0]
        parent = None
        namespace = None
        for cl in clsnamespaces:
            if name.startswith(cl + "."):
                if cl.startswith(parent or ""):
                    parent = cl
        if parent:
            name = name[len(parent) + 1:]
            for nm in namespaces:
                if parent.startswith(nm + "."):
                    if nm.startswith(namespace or ""):
                        namespace = nm
            if namespace:
                parent = parent[len(namespace) + 1:]
        else:
            for nm in namespaces:
                if name.startswith(nm + "."):
                    if nm.startswith(namespace or ""):
                        namespace = nm
            if namespace:
                name = name[len(namespace) + 1:]
        #print namespace, parent, name, fn[0]
        if not namespace and not parent and not name.startswith("cv") and not name.startswith("icv") and not name.startswith("CV_"):
            logerror(ERROR_004_MISSEDNAMESPACE, "function " + name + " from opencv_" + module + " is placed in global namespace but violates C-style naming convention")
        else:
            fdescr = (namespace, parent, name, fn)
            flookup_entry = flookup.get(fn[0], [])
            flookup_entry.append(fdescr)
            flookup[fn[0]] = flookup_entry

    if do_python_crosscheck:
        for name, doc in rst.iteritems():
            decls = doc.get("decls")
            if not decls:
                continue
            for signature in decls:
                if signature[0] == "Python1":
                    pname = signature[1][:signature[1].find('(')]
                    try:
                        fn = getattr(cv2.cv, pname[3:])
                        docstr = "cv." + fn.__doc__
                    except AttributeError:
                        logerror(ERROR_005_MISSINGPYFUNC, "could not load documented function: cv2." + pname, doc)
                        continue
                    docstring = docstr
                    sign = signature[1]
                    signature.append(DOCUMENTED_MARKER)
                    # convert old signature to pydoc style
                    if docstring.endswith("*"):
                        docstring = docstring[:-1]
                    s = None
                    while s != sign:
                        s = sign
                        sign = re.sub(r"^(.*\(.*)\(.*?\)(.*\) *->)", "\\1_\\2", sign)
                    s = None
                    while s != sign:
                        s = sign
                        sign = re.sub(r"\s*,\s*([^,]+)\s*=\s*[^,]+\s*(( \[.*\])?)\)", " [, \\1\\2])", sign)
                    sign = re.sub(r"\(\s*([^,]+)\s*=\s*[^,]+\s*(( \[.*\])?)\)", "([\\1\\2])", sign)

                    sign = re.sub(r"\)\s*->\s*", ") -> ", sign)
                    sign = sign.replace("-> convexHull", "-> CvSeq")
                    sign = sign.replace("-> lines", "-> CvSeq")
                    sign = sign.replace("-> boundingRects", "-> CvSeq")
                    sign = sign.replace("-> contours", "-> CvSeq")
                    sign = sign.replace("-> retval", "-> int")
                    sign = sign.replace("-> detectedObjects", "-> CvSeqOfCvAvgComp")

                    def retvalRplace(match):
                        m = match.group(1)
                        m = m.replace("CvScalar", "scalar")
                        m = m.replace("CvMemStorage", "memstorage")
                        m = m.replace("ROIplImage", "image")
                        m = m.replace("IplImage", "image")
                        m = m.replace("ROCvMat", "mat")
                        m = m.replace("CvMat", "mat")
                        m = m.replace("double", "float")
                        m = m.replace("CvSubdiv2DPoint", "point")
                        m = m.replace("CvBox2D", "Box2D")
                        m = m.replace("IplConvKernel", "kernel")
                        m = m.replace("CvHistogram", "hist")
                        m = m.replace("CvSize", "width,height")
                        m = m.replace("cvmatnd", "matND")
                        m = m.replace("CvSeqOfCvConvexityDefect", "convexityDefects")
                        mm = m.split(',')
                        if len(mm) > 1:
                            return "(" + ", ".join(mm) + ")"
                        else:
                            return m

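                    # re.sub also accepts a callable as the replacement: retvalRplace is called
                    # with the match object and its return value is substituted for the match,
                    # while the (?<=-> ) lookbehind restricts the rewrite to the text that
                    # follows the "-> " return-value marker.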
                    docstring = re.sub(r"(?<=-> )(.*)$", retvalRplace, docstring)
                    docstring = docstring.replace("( [, ", "([")

                    if sign != docstring:
                        logerror(ERROR_006_INVALIDPYOLDDOC, "old-style documentation differs from pydoc\npydoc: " + docstring + "\nfixup: " + sign + "\ncvdoc: " + signature[1], doc)
                elif signature[0] == "Python2":
                    pname = signature[1][4:signature[1].find('(')]
                    cvname = "cv." + pname
                    parent = None
                    for cl in clsnamespaces:
                        if cvname.startswith(cl + "."):
                            if cl.startswith(parent or ""):
                                parent = cl
                    try:
                        if parent:
                            instance, clsname = get_cv2_object(parent)
                            fn = getattr(instance, cvname[len(parent)+1:])
                            docstr = fn.__doc__
                            docprefix = "cv2." + clsname + "."
                        else:
                            fn = getattr(cv2, pname)
                            docstr = fn.__doc__
                            docprefix = "cv2."
                    except AttributeError:
                        if parent:
                            logerror(ERROR_005_MISSINGPYFUNC, "could not load documented member of " + parent + " class: cv2." + pname, doc)
                        else:
                            logerror(ERROR_005_MISSINGPYFUNC, "could not load documented function cv2." + pname, doc)
                        signature.append(DOCUMENTED_MARKER) # stop subsequent errors
                        continue
                    docstrings = [docprefix + s.replace("([, ", "([") for s in docstr.split("  or  ")]
                    if not signature[1] in docstrings:
                        pydocs = "\npydoc: ".join(docstrings)
                        logerror(ERROR_007_INVALIDPYDOC, "documentation differs from pydoc\npydoc: " + pydocs + "\ncvdoc: " + signature[1], doc)
                    signature.append(DOCUMENTED_MARKER)

    # verify C/C++ signatures
    for name, doc in rst.iteritems():
        decls = doc.get("decls")
        if not decls:
            continue
        for signature in decls:
            if signature[0] == "C" or signature[0] == "C++":
                if "template" in (signature[2][1] or ""):
                    # TODO find a way to validate templates
                    signature.append(DOCUMENTED_MARKER)
                    continue
                fd = flookup.get(signature[2][0])
                if not fd:
                    if signature[2][0].startswith("cv."):
                        fd = flookup.get(signature[2][0][3:])
                    if not fd:
                        continue
                    else:
                        signature[2][0] = signature[2][0][3:]
                if signature[0] == "C":
                    ffd = [f for f in fd if not f[0] and not f[1]] # filter out C++ stuff
                    if not ffd:
                        if fd[0][1]:
                            logerror(ERROR_008_CFUNCISNOTGLOBAL, "function " + fd[0][2] + " is documented as C function but is actually member of " + fd[0][1] + " class", doc)
                        elif fd[0][0]:
                            logerror(ERROR_008_CFUNCISNOTGLOBAL, "function " + fd[0][2] + " is documented as C function but is actually placed in " + fd[0][0] + " namespace", doc)
                    fd = ffd
                error = None
                for f in fd:
                    match, error = compareSignatures(signature[2], f[3])
                    if match:
                        signature.append(DOCUMENTED_MARKER)
                        break
                if signature[-1] != DOCUMENTED_MARKER:
                    candidates = "\n\t".join([formatSignature(f[3]) for f in fd])
                    logerror(ERROR_009_OVERLOADNOTFOUND, signature[0] + " function " + signature[2][0].replace(".","::") + " is documented but misses in headers (" + error + ").\nDocumented as:\n\t" + signature[1] + "\nCandidates are:\n\t" + candidates, doc)
                    signature.append(DOCUMENTED_MARKER) # to stop subsequent error on this function

    # verify that all signatures was found in the library headers
    for name, doc in rst.iteritems():
        # if doc.get(DOCUMENTED_MARKER, False):
        #     continue # this class/struct was found
        if not doc.get(DOCUMENTED_MARKER, False) and (doc.get("isclass", False) or doc.get("isstruct", False)):
            if name in doc_signatures_whitelist:
                continue
            logerror(ERROR_010_UNKNOWNCLASS, "class/struct " + name + " is mentioned in documentation but is not found in OpenCV headers", doc)
        for d in doc.get("decls", []):
            if d[-1] != DOCUMENTED_MARKER:
                if d[0] == "C" or d[0] == "C++" or (do_python_crosscheck and d[0].startswith("Python")):
                    if d[0][0] == 'C':
                        sname = d[2][0][3:].replace(".", "::")
                        if sname in defines:
                            #TODO: need to find a way to verify #define's
                            continue
                    else:
                        sname = d[1][:d[1].find("(")]
                    prefixes = [x for x in doc_signatures_whitelist if sname.startswith(x)]
                    if prefixes:
                        # TODO: member of template class
                        continue
                    logerror(ERROR_011_UNKNOWNFUNC, d[0] + " function " + sname + " is documented but is not found in OpenCV headers. It is documented as:\n\t" + d[1], doc)
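
check_docs2.py applies re.sub inside 'while s != sign' loops because a single pass of a pattern such as the default-argument rewrite can leave further matches in the string. A minimal sketch of that run-until-stable idiom; the pattern and sample text are simplified stand-ins, not the OpenCV signatures.

import re

def sub_until_stable(pattern, repl, text):
    """Apply re.sub repeatedly until the result stops changing."""
    prev = None
    while prev != text:
        prev = text
        text = re.sub(pattern, repl, text)
    return text

# One pass of re.sub('aa', 'a', 'aaaa') leaves 'aa'; iterating reaches the fixed point.
print(sub_until_stable(r'aa', 'a', 'aaaa'))  # -> 'a'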

Example 27

Project: easybuild-easyblocks
Source File: aladin.py
View license
    def configure_step(self):
        """Custom configuration procedure for ALADIN."""

        # unset $LIBRARY_PATH set by modules of dependencies, because it may screw up linking
        if 'LIBRARY_PATH' in os.environ:
            self.log.debug("Unsetting $LIBRARY_PATH (was: %s)" % os.environ['LIBRARY_PATH'])
            self.orig_library_path = os.environ.pop('LIBRARY_PATH')
        
        # build auxiliary libraries
        auxlibs_dir = None

        my_gnu = None
        if self.toolchain.comp_family() == toolchain.GCC:
            my_gnu = 'y'  # gfortran
            for var in ['CFLAGS', 'CXXFLAGS', 'F90FLAGS', 'FFLAGS']:
                flags = os.getenv(var)
                env.setvar(var, "%s -fdefault-real-8 -fdefault-double-8" % flags)
                self.log.info("Updated %s to '%s'" % (var, os.getenv(var)))
        elif self.toolchain.comp_family() == toolchain.INTELCOMP:
            my_gnu = 'i'  # icc/ifort
        else:
            raise EasyBuildError("Don't know how to set 'my_gnu' variable in auxlibs build script.")
        self.log.info("my_gnu set to '%s'" % my_gnu)

        tmp_installroot = tempfile.mkdtemp(prefix='aladin_auxlibs_')

        try:
            cwd = os.getcwd()

            os.chdir(self.builddir)
            builddirs = os.listdir(self.builddir)

            auxlibs_dir = [x for x in builddirs if x.startswith('auxlibs_installer')][0]

            os.chdir(auxlibs_dir)

            auto_driver = 'driver_automatic'
            for line in fileinput.input(auto_driver, inplace=1, backup='.orig.eb'):

                line = re.sub(r"^(my_gnu\s*=\s*).*$", r"\1%s" % my_gnu, line)
                line = re.sub(r"^(my_r32\s*=\s*).*$", r"\1n", line)  # always 64-bit real precision
                line = re.sub(r"^(my_readonly\s*=\s*).*$", r"\1y", line)  # make libs read-only after build
                line = re.sub(r"^(my_installroot\s*=\s*).*$", r"\1%s" % tmp_installroot, line)

                sys.stdout.write(line)

            run_cmd("./%s" % auto_driver)

            os.chdir(cwd)

        except OSError, err:
            raise EasyBuildError("Failed to build ALADIN: %s", err)

        # build gmkpack, update PATH and set GMKROOT
        # we build gmkpack here because a config file is generated in the gmkpack install path
        try:
            gmkpack_dir = [x for x in builddirs if x.startswith('gmkpack')][0]
            os.chdir(os.path.join(self.builddir, gmkpack_dir))

            qa = {
                  'Do you want to run the configuration file maker assistant now (y) or later [n] ?': 'n',
                 }

            run_cmd_qa("./build_gmkpack", qa)
 
            os.chdir(cwd)

            paths = os.getenv('PATH').split(':')
            paths.append(os.path.join(self.builddir, gmkpack_dir, 'util'))
            env.setvar('PATH', ':'.join(paths))

            env.setvar('GMKROOT', os.path.join(self.builddir, gmkpack_dir))

        except OSError, err:
            raise EasyBuildError("Failed to build gmkpack: %s", err)

        # generate gmkpack configuration file
        self.conf_file = 'ALADIN_%s' % self.version
        self.conf_filepath = os.path.join(self.builddir, 'gmkpack_support', 'arch', '%s.x' % self.conf_file)

        try:
            if os.path.exists(self.conf_filepath):
                os.remove(self.conf_filepath)
                self.log.info("Removed existing gmkpack config file %s" % self.conf_filepath)

            archdir = os.path.dirname(self.conf_filepath)
            if not os.path.exists(archdir):
                mkdir(archdir, parents=True)

        except OSError, err:
            raise EasyBuildError("Failed to remove existing file %s: %s", self.conf_filepath, err)

        mpich = 'n'
        known_mpi_libs = [toolchain.MPICH, toolchain.MPICH2, toolchain.INTELMPI]
        if self.toolchain.options.get('usempi', None) and self.toolchain.mpi_family() in known_mpi_libs:
            mpich = 'y'

        qpref = 'Please type the ABSOLUTE name of '
        qsuff = ', or ignore (environment variables allowed) :'
        qsuff2 = ', or ignore : (environment variables allowed) :'

        comp_fam = self.toolchain.comp_family()
        if comp_fam == toolchain.GCC:
            gribdir = 'GNU'
        elif comp_fam == toolchain.INTELCOMP:
            gribdir = 'INTEL'
        else:
            raise EasyBuildError("Don't know which grib lib dir to use for compiler %s", comp_fam)

        aux_lib_gribex = os.path.join(tmp_installroot, gribdir, 'lib', 'libgribex.a')
        aux_lib_ibm = os.path.join(tmp_installroot, gribdir, 'lib', 'libibmdummy.a')
        grib_api_lib = os.path.join(get_software_root('grib_api'), 'lib', 'libgrib_api.a')
        grib_api_f90_lib = os.path.join(get_software_root('grib_api'), 'lib', 'libgrib_api_f90.a')
        grib_api_inc = os.path.join(get_software_root('grib_api'), 'include')
        jasperlib = os.path.join(get_software_root('JasPer'), 'lib', 'libjasper.a')
        mpilib = os.path.join(os.getenv('MPI_LIB_DIR'), os.getenv('MPI_LIB_SHARED'))

        # netCDF
        netcdf = get_software_root('netCDF')
        netcdf_fortran = get_software_root('netCDF-Fortran')
        if netcdf:
            netcdfinc = os.path.join(netcdf, 'include')
            if netcdf_fortran:
                netcdflib = os.path.join(netcdf_fortran, get_software_libdir('netCDF-Fortran'), 'libnetcdff.a')
            else:
                netcdflib = os.path.join(netcdf, get_software_libdir('netCDF'), 'libnetcdff.a')
            if not os.path.exists(netcdflib):
                raise EasyBuildError("%s does not exist", netcdflib)
        else:
            raise EasyBuildError("netCDF(-Fortran) not available")

        ldpaths = [ldflag[2:] for ldflag in os.getenv('LDFLAGS').split(' ')]  # LDFLAGS have form '-L/path/to'

        lapacklibs = []
        for lib in os.getenv('LAPACK_STATIC_LIBS').split(','):
            libpaths = [os.path.join(ldpath, lib) for ldpath in ldpaths]
            lapacklibs.append([libpath for libpath in libpaths if os.path.exists(libpath)][0])
        lapacklib = ' '.join(lapacklibs)
        blaslibs = []
        for lib in os.getenv('BLAS_STATIC_LIBS').split(','):
            libpaths = [os.path.join(ldpath, lib) for ldpath in ldpaths]
            blaslibs.append([libpath for libpath in libpaths if os.path.exists(libpath)][0])
        blaslib = ' '.join(blaslibs)

        qa = {
            'Do you want to run the configuration file maker assistant now (y) or later [n] ?': 'y',
            'Do you want to setup your configuration file for MPICH (y/n) [n] ?': mpich,
            'Please type the directory name where to find a dummy file mpif.h or ignore :': os.getenv('MPI_INC_DIR'),
            '%sthe library gribex or emos%s' % (qpref, qsuff2): aux_lib_gribex,
            '%sthe library ibm%s' % (qpref, qsuff): aux_lib_ibm,
            '%sthe library grib_api%s' % (qpref, qsuff): grib_api_lib,
            '%sthe library grib_api_f90%s' % (qpref, qsuff): grib_api_f90_lib,
            '%sthe JPEG auxilary library if enabled by Grib_api%s' % (qpref, qsuff2): jasperlib,
            '%sthe library netcdf%s' % (qpref, qsuff): netcdflib,
            '%sthe library lapack%s' % (qpref, qsuff): lapacklib,
            '%sthe library blas%s' % (qpref, qsuff): blaslib,
            '%sthe library mpi%s' % (qpref, qsuff): mpilib,
            '%sa MPI dummy library for serial executions, or ignore :' % qpref: '',
            'Please type the directory name where to find grib_api headers, or ignore :': grib_api_inc,
            'Please type the directory name where to find fortint.h or ignore :': '',
            'Please type the directory name where to find netcdf headers, or ignore :': netcdfinc,
            'Do you want to define CANARI (y/n) [y] ?': 'y',
            'Please type the name of the script file used to generate a preprocessed blacklist file, or ignore :': '',
            'Please type the name of the script file used to recover local libraries (gget), or ignore :': '',
            'Please type the options to tune the gnu compilers, or ignore :': os.getenv('F90FLAGS'),
        }

        f90_seq = os.getenv('F90_SEQ')
        if not f90_seq:
            # F90_SEQ is only defined when usempi is enabled
            f90_seq = os.getenv('F90')

        stdqa = OrderedDict([
            (r'Confirm library .* is .*', 'y'),  # this one needs to be tried first!
            (r'.*fortran 90 compiler name .*\s*:\n\(suggestions\s*: .*\)', os.getenv('F90')),
            (r'.*fortran 90 compiler interfaced with .*\s*:\n\(suggestions\s*: .*\)', f90_seq),
            (r'Please type the ABSOLUTE name of .*library.*, or ignore\s*[:]*\s*[\n]*.*', ''),
            (r'Please .* to save this draft configuration file :\n.*', '%s.x' % self.conf_file),
        ])

        no_qa = [
            ".*ignored.",
        ]

        env.setvar('GMKTMP', self.builddir)
        env.setvar('GMKFILE', self.conf_file)

        run_cmd_qa("gmkfilemaker", qa, std_qa=stdqa, no_qa=no_qa)

        # set environment variables for installation dirs
        env.setvar('ROOTPACK', os.path.join(self.installdir, 'rootpack'))
        env.setvar('ROOTBIN', os.path.join(self.installdir, 'rootpack'))
        env.setvar('HOMEPACK', os.path.join(self.installdir, 'pack'))
        env.setvar('HOMEBIN', os.path.join(self.installdir, 'pack'))

        # patch config file to include right Fortran compiler flags
        regex_subs = [(r"^(FRTFLAGS\s*=.*)$", r"\1 %s" % os.getenv('FFLAGS'))]
        apply_regex_substitutions(self.conf_filepath, regex_subs)
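
The driver edits above use anchored patterns of the form ^(name\s*=\s*).*$ with a backreference, so only the value to the right of the equals sign is rewritten. A minimal sketch of the same idea on an in-memory string (the file contents and values are invented); \g<1> is used instead of \1 so the replacement stays unambiguous even if a value starts with a digit:

import re

# Toy stand-in for the auxlibs driver file patched above.
driver_text = """my_gnu = x
my_r32 = y
my_installroot = /tmp/old
"""

settings = {"my_gnu": "y", "my_r32": "n", "my_installroot": "/tmp/aladin_auxlibs"}

for key, value in settings.items():
    # Group 1 keeps "name = "; only the right-hand side is replaced, one line at a time.
    driver_text = re.sub(r"^(%s\s*=\s*).*$" % key, r"\g<1>%s" % value,
                         driver_text, flags=re.M)

print(driver_text)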

Example 28

View license
    def configure_step(self):
        """Custom configuration procedure for Quantum ESPRESSO."""

        if self.toolchain.options.get('openmp', False) or self.cfg['hybrid']:
            self.cfg.update('configopts', '--enable-openmp')

        if not self.toolchain.options.get('usempi', None):
            self.cfg.update('configopts', '--disable-parallel')

        if not self.cfg['with_scalapack']:
            self.cfg.update('configopts', '--without-scalapack')

        repls = []

        if self.toolchain.comp_family() in [toolchain.INTELCOMP]:
            # set preprocessor command (-E to stop after preprocessing, -C to preserve comments)
            cpp = "%s -E -C" % os.getenv('CC')
            repls.append(('CPP', cpp, False))
            env.setvar('CPP', cpp)

            # also define $FCCPP, but do *not* include -C (comments should not be preserved when preprocessing Fortran)
            env.setvar('FCCPP', "%s -E" % os.getenv('CC'))

        super(EB_QuantumESPRESSO, self).configure_step()

        # compose list of DFLAGS (flag, value, keep_stuff)
        # for guidelines, see include/defs.h.README in sources
        dflags = []

        comp_fam_dflags = {
            toolchain.INTELCOMP: '-D__INTEL',
            toolchain.GCC: '-D__GFORTRAN -D__STD_F95',
        }
        dflags.append(comp_fam_dflags[self.toolchain.comp_family()])

        if self.toolchain.options.get('openmp', False):
            libfft = os.getenv('LIBFFT_MT')
        else:
            libfft = os.getenv('LIBFFT')
        if libfft:
            if "fftw3" in libfft:
                dflags.append('-D__FFTW3')
            else:
                dflags.append('-D__FFTW')
            env.setvar('FFTW_LIBS', libfft)

        if get_software_root('ACML'):
            dflags.append('-D__ACML')

        if self.toolchain.options.get('usempi', None):
            dflags.append('-D__MPI -D__PARA')

        if self.toolchain.options.get('openmp', False) or self.cfg['hybrid']:
            dflags.append(" -D__OPENMP")

        if self.cfg['with_scalapack']:
            dflags.append(" -D__SCALAPACK")

        # always include -w to suppress warnings
        dflags.append('-w')

        repls.append(('DFLAGS', ' '.join(dflags), False))

        # complete C/Fortran compiler and LD flags
        if self.toolchain.options.get('openmp', False) or self.cfg['hybrid']:
            repls.append(('LDFLAGS', self.toolchain.get_flag('openmp'), True))
            repls.append(('(?:C|F90|F)FLAGS', self.toolchain.get_flag('openmp'), True))

        # obtain library settings
        libs = []
        for lib in ['BLAS', 'LAPACK', 'FFT', 'SCALAPACK']:
            if self.toolchain.options.get('openmp', False):
                val = os.getenv('LIB%s_MT' % lib)
            else:
                val = os.getenv('LIB%s' % lib)
            repls.append(('%s_LIBS' % lib, val, False))
            libs.append(val)
        libs = ' '.join(libs)

        repls.append(('BLAS_LIBS_SWITCH', 'external', False))
        repls.append(('LAPACK_LIBS_SWITCH', 'external', False))
        repls.append(('LD_LIBS', os.getenv('LIBS'), False))

        self.log.debug("List of replacements to perform: %s" % repls)

        # patch make.sys file
        fn = os.path.join(self.cfg['start_dir'], 'make.sys')
        try:
            for line in fileinput.input(fn, inplace=1, backup='.orig.eb'):
                for (k, v, keep) in repls:
                    # need to use [ \t]* instead of \s*, because vars may be undefined as empty,
                    # and we don't want to include newlines
                    if keep:
                        line = re.sub(r"^(%s\s*=[ \t]*)(.*)$" % k, r"\1\2 %s" % v, line)
                    else:
                        line = re.sub(r"^(%s\s*=[ \t]*).*$" % k, r"\1%s" % v, line)

                # fix preprocessing directives for .f90 files in make.sys if required
                if self.toolchain.comp_family() in [toolchain.GCC]:
                    line = re.sub(r"\$\(MPIF90\) \$\(F90FLAGS\) -c \$<",
                                  "$(CPP) -C $(CPPFLAGS) $< -o $*.F90\n" +
                                  "\t$(MPIF90) $(F90FLAGS) -c $*.F90 -o $*.o",
                                  line)

                sys.stdout.write(line)
        except IOError, err:
            raise EasyBuildError("Failed to patch %s: %s", fn, err)

        self.log.debug("Contents of patched %s: %s" % (fn, open(fn, "r").read()))

        # patch default make.sys for wannier
        if LooseVersion(self.version) >= LooseVersion("5"):
            fn = os.path.join(self.cfg['start_dir'], 'install', 'make_wannier90.sys')
        else:
            fn = os.path.join(self.cfg['start_dir'], 'plugins', 'install', 'make_wannier90.sys')
        try:
            for line in fileinput.input(fn, inplace=1, backup='.orig.eb'):
                line = re.sub(r"^(LIBS\s*=\s*).*", r"\1%s" % libs, line)

                sys.stdout.write(line)

        except IOError, err:
            raise EasyBuildError("Failed to patch %s: %s", fn, err)

        self.log.debug("Contents of patched %s: %s" % (fn, open(fn, "r").read()))

        # patch Makefile of want plugin
        wantprefix = 'want-'
        wantdirs = [d for d in os.listdir(self.builddir) if d.startswith(wantprefix)]

        if len(wantdirs) > 1:
            raise EasyBuildError("Found more than one directory with %s prefix, help!", wantprefix)

        if len(wantdirs) != 0:
            wantdir = os.path.join(self.builddir, wantdirs[0])
            make_sys_in_path = None
            cand_paths = [os.path.join('conf', 'make.sys.in'), os.path.join('config', 'make.sys.in')]
            for path in cand_paths:
                full_path = os.path.join(wantdir, path)
                if os.path.exists(full_path):
                    make_sys_in_path = full_path
                    break
            if make_sys_in_path is None:
                raise EasyBuildError("Failed to find make.sys.in in want directory %s, paths considered: %s",
                                     wantdir, ', '.join(cand_paths))

            try:
                for line in fileinput.input(make_sys_in_path, inplace=1, backup='.orig.eb'):
                    # fix preprocessing directives for .f90 files in make.sys if required
                    if self.toolchain.comp_family() in [toolchain.GCC]:
                        line = re.sub("@f90rule@",
                                      "$(CPP) -C $(CPPFLAGS) $< -o $*.F90\n" +
                                      "\t$(MPIF90) $(F90FLAGS) -c $*.F90 -o $*.o",
                                      line)

                    sys.stdout.write(line)
            except IOError, err:
                raise EasyBuildError("Failed to patch %s: %s", fn, err)

        # move non-espresso directories to where they're expected and create symlinks
        try:
            dirnames = [d for d in os.listdir(self.builddir) if not d.startswith('espresso')]
            targetdir = os.path.join(self.builddir, "espresso-%s" % self.version)
            for dirname in dirnames:
                shutil.move(os.path.join(self.builddir, dirname), os.path.join(targetdir, dirname))
                self.log.info("Moved %s into %s" % (dirname, targetdir))

                dirname_head = dirname.split('-')[0]
                linkname = None
                if dirname_head == 'sax':
                    linkname = 'SaX'
                if dirname_head == 'wannier90':
                    linkname = 'W90'
                elif dirname_head in ['gipaw', 'plumed', 'want', 'yambo']:
                    linkname = dirname_head.upper()
                if linkname:
                    os.symlink(os.path.join(targetdir, dirname), os.path.join(targetdir, linkname))

        except OSError, err:
            raise EasyBuildError("Failed to move non-espresso directories: %s", err)
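
The keep flag in the replacement loop above decides whether an existing VAR = value assignment is extended (the OpenMP flag is appended to the current FLAGS) or overwritten. A small sketch of both behaviors, assuming a made-up make.sys fragment:

import re

# Toy make.sys fragment (contents invented for illustration).
make_sys = """DFLAGS = -D__OLD
LDFLAGS = -O2
"""

def set_var(text, key, value, keep=False):
    """Replace or extend a 'KEY = ...' assignment, mirroring the keep flag above."""
    if keep:
        # Keep the existing value and append the new one after it.
        return re.sub(r"^(%s[ \t]*=[ \t]*)(.*)$" % key, r"\g<1>\g<2> %s" % value, text, flags=re.M)
    # Drop the existing value entirely.
    return re.sub(r"^(%s[ \t]*=[ \t]*).*$" % key, r"\g<1>%s" % value, text, flags=re.M)

make_sys = set_var(make_sys, "DFLAGS", "-D__INTEL -w")
make_sys = set_var(make_sys, "LDFLAGS", "-fopenmp", keep=True)
print(make_sys)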

Example 29

Project: pysd
Source File: utils.py
View license
def make_python_identifier(string, namespace=None, reserved_words=None,
                           convert='drop', handle='force'):
    """
    Takes an arbitrary string and creates a valid Python identifier.

    If the input string is in the namespace, return its value.

    If the python identifier created is already in the namespace,
    but the input string is not (ie, two similar strings resolve to
    the same python identifier)

    or if the identifier is a reserved word in the reserved_words
    list, or is a python default reserved word,
    adds _1, or if _1 is in the namespace, _2, etc.

    Parameters
    ----------
    string : <basestring>
        The text to be converted into a valid python identifier
    namespace : <dictionary>
        Map of existing translations into python safe identifiers.
        This is to ensure that two strings are not translated into
        the same python identifier
    reserved_words : <list of strings>
        List of words that are reserved (because they have other meanings
        in this particular program, such as also being the names of
        libraries, etc.)
    convert : <string>
        Tells the function what to do with characters that are not
        valid in python identifiers
        - 'hex' implies that they will be converted to their hexadecimal
                representation. This is handy if you have variables that
                have a lot of reserved characters, or you don't want the
                name to be dependent on when things were added to the
                namespace
        - 'drop' implies that they will just be dropped altogether
    handle : <string>
        Tells the function how to deal with namespace conflicts
        - 'force' will create a representation which is not in conflict
                  by appending _n to the resulting variable where n is
                  the lowest number necessary to avoid a conflict
        - 'throw' will raise an exception

    Returns
    -------
    identifier : <string>
        A valid python identifier based on the input string
    namespace : <dictionary>
        An updated map of the translations of words to python identifiers,
        including the passed in 'string'.

    Examples
    --------
    >>> make_python_identifier('Capital')
    ('capital', {'Capital': 'capital'})

    >>> make_python_identifier('multiple words')
    ('multiple_words', {'multiple words': 'multiple_words'})

    >>> make_python_identifier('multiple     spaces')
    ('multiple_spaces', {'multiple     spaces': 'multiple_spaces'})

    When the name is a python keyword, add '_1' to differentiate it
    >>> make_python_identifier('for')
    ('for_1', {'for': 'for_1'})

    Remove leading and trailing whitespace
    >>> make_python_identifier('  whitespace  ')
    ('whitespace', {'  whitespace  ': 'whitespace'})

    Remove most special characters outright:
    >>> make_python_identifier('h@t tr!ck')
    ('ht_trck', {'h@t tr!ck': 'ht_trck'})

    Replace special characters with their hex representations
    >>> make_python_identifier('h@t tr!ck', convert='hex')
    ('h40t_tr21ck', {'h@t tr!ck': 'h40t_tr21ck'})

    remove leading digits
    >>> make_python_identifier('123abc')
    ('abc', {'123abc': 'abc'})

    already in namespace
    >>> make_python_identifier('Variable$', namespace={'Variable$': 'variable'})
    ('variable', {'Variable$': 'variable'})

    namespace conflicts
    >>> make_python_identifier('Variable$', namespace={'Variable@': 'variable'})
    ('variable_1', {'Variable@': 'variable', 'Variable$': 'variable_1'})

    >>> make_python_identifier('Variable$', namespace={'Variable@': 'variable',
    >>>                                                'Variable%': 'variable_1'})
    ('variable_2', {'Variable@': 'variable', 'Variable%': 'variable_1', 'Variable$': 'variable_2'})

    throw exception instead
    >>> make_python_identifier('Variable$', namespace={'Variable@': 'variable'}, handle='throw')
    Traceback (most recent call last):
     ...
    NameError: variable already exists in namespace or is a reserved word


    References
    ----------
    Identifiers must follow the convention outlined here:
        https://docs.python.org/2/reference/lexical_analysis.html#identifiers
    """

    if namespace is None:
        namespace = dict()

    if reserved_words is None:
        reserved_words = list()

    if string in namespace:
        return namespace[string], namespace

    # create a working copy (and make it lowercase, while we're at it)
    s = string.lower()

    # remove leading and trailing whitespace
    s = s.strip()

    # Make spaces into underscores
    s = re.sub('[\\s\\t\\n]+', '_', s)

    if convert == 'hex':
        # Convert invalid characters to hex
        s = ''.join([c.encode("hex") if re.findall('[^0-9a-zA-Z_]', c) else c for c in s])

    elif convert == 'drop':
        # Remove invalid characters
        s = re.sub('[^0-9a-zA-Z_]', '', s)

    # Remove leading characters until we find a letter or underscore
    s = re.sub('^[^a-zA-Z_]+', '', s)

    # Check that the string is not a python identifier
    while (s in keyword.kwlist or
                   s in namespace.values() or
                   s in reserved_words):
        if handle == 'throw':
            raise NameError(s + ' already exists in namespace or is a reserved word')
        if handle == 'force':
            if re.match(".*?_\d+$", s):
                i = re.match(".*?_(\d+)$", s).groups()[0]
                s = s.strip('_' + i) + '_' + str(int(i) + 1)
            else:
                s += '_1'

    namespace[string] = s

    return s, namespace
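
The identifier sanitisation above reduces to three re.sub passes: collapse whitespace to underscores, drop characters that are not legal in identifiers, and strip everything before the first letter or underscore. A minimal sketch of just that pipeline (the function name and inputs are invented):

import re

def slugify_identifier(text):
    """A minimal sketch of the three re.sub passes used above (name invented)."""
    s = text.strip().lower()
    s = re.sub(r"\s+", "_", s)           # whitespace runs -> single underscore
    s = re.sub(r"[^0-9a-zA-Z_]", "", s)  # drop characters not legal in identifiers
    s = re.sub(r"^[^a-zA-Z_]+", "", s)   # identifiers cannot start with a digit
    return s

print(slugify_identifier("  123abc  rate  "))  # -> abc_rate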

Example 30

Project: plexpy
Source File: __init__.py
View license
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
            parse_email=False, tokenizer=HTMLSanitizer):
    """Convert URL-like strings in an HTML fragment to links.

    linkify() converts strings that look like URLs or domain names in a
    blob of text that may be an HTML fragment to links, while preserving
    (a) links already in the string, (b) urls found in attributes, and
    (c) email addresses.
    """
    text = force_unicode(text)

    if not text:
        return ''

    parser = html5lib.HTMLParser(tokenizer=tokenizer)

    forest = parser.parseFragment(text)
    _seen = set([])

    def replace_nodes(tree, new_frag, node, index=0):
        """
        Doesn't really replace nodes, but inserts the nodes contained in
        new_frag into the tree at position index and returns the number
        of nodes inserted.
        If node is passed in, it is removed from the tree
        """
        count = 0
        new_tree = parser.parseFragment(new_frag)
        # capture any non-tag text at the start of the fragment
        if new_tree.text:
            if index == 0:
                tree.text = tree.text or ''
                tree.text += new_tree.text
            else:
                tree[index - 1].tail = tree[index - 1].tail or ''
                tree[index - 1].tail += new_tree.text
        # then put the tagged elements into the old tree
        for n in new_tree:
            if n.tag == ETREE_TAG('a'):
                _seen.add(n)
            tree.insert(index + count, n)
            count += 1
        # if we got a node to remove...
        if node is not None:
            tree.remove(node)
        return count

    def strip_wrapping_parentheses(fragment):
        """Strips wrapping parentheses.

        Returns a tuple of the following format::

            (string stripped from wrapping parentheses,
             count of stripped opening parentheses,
             count of stripped closing parentheses)
        """
        opening_parentheses = closing_parentheses = 0
        # Count consecutive opening parentheses
        # at the beginning of the fragment (string).
        for char in fragment:
            if char == '(':
                opening_parentheses += 1
            else:
                break

        if opening_parentheses:
            newer_frag = ''
            # Cut the consecutive opening brackets from the fragment.
            fragment = fragment[opening_parentheses:]
            # Reverse the fragment for easier detection of parentheses
            # inside the URL.
            reverse_fragment = fragment[::-1]
            skip = False
            for char in reverse_fragment:
                # Remove the closing parentheses if it has a matching
                # opening parentheses (they are balanced).
                if (char == ')' and
                        closing_parentheses < opening_parentheses and
                        not skip):
                    closing_parentheses += 1
                    continue
                # Do not remove ')' from the URL itself.
                elif char != ')':
                    skip = True
                newer_frag += char
            fragment = newer_frag[::-1]

        return fragment, opening_parentheses, closing_parentheses

    def apply_callbacks(attrs, new):
        for cb in callbacks:
            attrs = cb(attrs, new)
            if attrs is None:
                return None
        return attrs

    def _render_inner(node):
        out = ['' if node.text is None else node.text]
        for subnode in node:
            out.append(_render(subnode))
            if subnode.tail:
                out.append(subnode.tail)
        return ''.join(out)

    def linkify_nodes(tree, parse_text=True):
        children = len(tree)
        current_child = -1
        # start at -1 to process the parent first
        while current_child < len(tree):
            if current_child < 0:
                node = tree
                if parse_text and node.text:
                    new_txt = old_txt = node.text
                    if parse_email:
                        new_txt = re.sub(email_re, email_repl, node.text)
                        if new_txt and new_txt != node.text:
                            node.text = ''
                            adj = replace_nodes(tree, new_txt, None, 0)
                            children += adj
                            current_child += adj
                            linkify_nodes(tree, True)
                            continue

                    new_txt = re.sub(url_re, link_repl, new_txt)
                    if new_txt != old_txt:
                        node.text = ''
                        adj = replace_nodes(tree, new_txt, None, 0)
                        children += adj
                        current_child += adj
                        continue
            else:
                node = tree[current_child]

            if parse_text and node.tail:
                new_tail = old_tail = node.tail
                if parse_email:
                    new_tail = re.sub(email_re, email_repl, new_tail)
                    if new_tail != node.tail:
                        node.tail = ''
                        adj = replace_nodes(tree, new_tail, None,
                                            current_child + 1)
                        # Insert the new nodes made from my tail into
                        # the tree right after me. current_child+1
                        children += adj
                        continue

                new_tail = re.sub(url_re, link_repl, new_tail)
                if new_tail != old_tail:
                    node.tail = ''
                    adj = replace_nodes(tree, new_tail, None,
                                        current_child + 1)
                    children += adj

            if node.tag == ETREE_TAG('a') and not (node in _seen):
                if not node.get('href', None) is None:
                    attrs = dict(node.items())

                    _text = attrs['_text'] = _render_inner(node)

                    attrs = apply_callbacks(attrs, False)

                    if attrs is None:
                        # <a> tag replaced by the text within it
                        adj = replace_nodes(tree, _text, node,
                                            current_child)
                        current_child -= 1
                        # pull back current_child by 1 to scan the
                        # new nodes again.
                    else:
                        text = force_unicode(attrs.pop('_text'))
                        for attr_key, attr_val in attrs.items():
                            node.set(attr_key, attr_val)

                        for n in reversed(list(node)):
                            node.remove(n)
                        text = parser.parseFragment(text)
                        node.text = text.text
                        for n in text:
                            node.append(n)
                        _seen.add(node)

            elif current_child >= 0:
                if node.tag == ETREE_TAG('pre') and skip_pre:
                    linkify_nodes(node, False)
                elif not (node in _seen):
                    linkify_nodes(node, True)

            current_child += 1

    def email_repl(match):
        addr = match.group(0).replace('"', '&quot;')
        link = {
            '_text': addr,
            'href': 'mailto:{0!s}'.format(addr),
        }
        link = apply_callbacks(link, True)

        if link is None:
            return addr

        _href = link.pop('href')
        _text = link.pop('_text')

        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
        return repl.format(_href, attribs, _text)

    def link_repl(match):
        url = match.group(0)
        open_brackets = close_brackets = 0
        if url.startswith('('):
            _wrapping = strip_wrapping_parentheses(url)
            url, open_brackets, close_brackets = _wrapping
        end = ''
        m = re.search(punct_re, url)
        if m:
            end = m.group(0)
            url = url[0:m.start()]
        if re.search(proto_re, url):
            href = url
        else:
            href = ''.join(['http://', url])

        link = {
            '_text': url,
            'href': href,
        }

        link = apply_callbacks(link, True)

        if link is None:
            return '(' * open_brackets + url + ')' * close_brackets

        _text = link.pop('_text')
        _href = link.pop('href')

        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())

        return repl.format('(' * open_brackets,
                           _href, attribs, _text, end,
                           ')' * close_brackets)

    try:
        linkify_nodes(forest)
    except RuntimeError as e:
        # If we hit the max recursion depth, just return what we've got.
        log.exception('Probable recursion error: {0!r}'.format(e))

    return _render(forest)
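
Both email_repl and link_repl above are handed to re.sub as callables, so each match can be rewritten with full access to the matched text. A stripped-down sketch of the email path, using a deliberately simplified pattern rather than the real email_re:

import re

# Simplified stand-in for the email_re used above; not suitable for real validation.
email_re = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")

def email_repl(match):
    addr = match.group(0)
    return '<a href="mailto:{0}">{0}</a>'.format(addr)

text = "Contact support@example.com for help."
print(re.sub(email_re, email_repl, text))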

Example 31

View license
def clean_html(content_fn, out_fn, author_names):
    import lxml.etree

    # The HTML file contains the entire HTML page from CRS.gov that the report was
    # scraped from. Extract just the report content, dropping the CRS.gov header/footer.

    with open(content_fn) as f:
        content = f.read()

    # Some reports are invalid HTML with a whole doctype and html node inside
    # the main report container element. See if this is one of those documents.
    extract_blockquote = ('<div class="Report"><!DOCTYPE' in content)

    # Extract the report itself from the whole page.
    content = html5lib.parse(content, treebuilder="lxml")
    content = content.find(".//*[@class='Report']")

    if content is None:
        raise ValueError("HTML page doesn't contain an element with the Report CSS class")

    if extract_blockquote:
        content = content.find("{http://www.w3.org/1999/xhtml}blockquote")
        if content is None:
            raise ValueError("HTML page didn't have the expected blockquote.")
        content.tag = "div"

    # Remove the XHTML namespace to make processing easier.
    for tag in [content] + content.findall(".//*"):
        if isinstance(tag.tag, str): # is an element
            tag.tag = tag.tag.replace("{http://www.w3.org/1999/xhtml}", "")

    # Scrub content and adjust some tags.

    allowed_classes = { 'ReportHeader' }

    def scrub_text(text):
        # Scrub crs.gov email addresses from the text.
        # There's a separate filter later for addresses in mailto: links.
        text = re.sub(r"[a-zA-Z0-9_!#\$%&\'\*\+\-/=\?\^`\{\|\}~]+@crs\.(loc\.)?gov", "[email address scrubbed]", text)

        # Scrub CRS telephone numbers --- in 7-xxxx format. We have to exclude
        # cases that have a preceding digit, because otherwise we match
        # strings like "2007-2009". But the number can also occur at the start
        # of a node, so it may be the start of a string.
        text = re.sub(r"(^|[^\d])7-\d\d\d\d", r"\1[phone number scrubbed]", text)

        # Scrub all telephone numbers --- in (xxx) xxx-xxxx format.
        text = re.sub(r"\(\d\d\d\) \d\d\d-\d\d\d\d", "[phone number scrubbed]", text)

        # Scrub all author names.
        text = re.sub("|".join([re.escape(an) for an in author_names]), "[author name scrubbed]", text)

        return text

    for tag in [content] + content.findall(".//*"):
        # Skip non-element nodes.
        if not isinstance(tag.tag, str): continue

        # Scrub the text.
        if tag.text is not None: tag.text = scrub_text(tag.text)
        if tag.tail is not None: tag.tail = scrub_text(tag.tail)

        css_classes = set(tag.attrib.get('class', '').split(" "))

        # Modern reports have a ReportHeader node with title, authors, date, report number,
        # and an internal link to just past the table of contents. Since we are scrubbing
        # author names, we must remove at least that. We also want to remove that internal
        # link and replace the title with an <h1> tag.
        if "ReportHeader" in css_classes:
            for node in tag:
                node_css_classes = set(node.attrib.get('class', '').split(" "))
                if "Title" in node_css_classes:
                    node.tag = "h1"
                elif "CoverDate" in node_css_classes:
                    pass # keep this one
                else:
                    node.getparent().remove(node)

        # Older reports had a "titleline" class for the title.
        if "titleline" in css_classes:
            tag.tag = "h1"
            css_classes.add("Title") # so the h1 doesn't get demoted below

        # Older reports had an "authorline" with author names, which we scrub by
        # removing completely.
        if "authorline" in css_classes:
            tag.getparent().remove(tag)

        # Older reports had a "Print Version" link, which we can remove.
        if tag.tag == "a" and tag.text == "Print Version":
            tag.getparent().remove(tag)

        # Scrub mailto: links, which have author emails, which we want to scrub,
        # as well as email addresses of other people mentioned in the reports.
        if 'href' in tag.attrib and tag.attrib['href'].lower().startswith("mailto:"):
            tag.tag = "span"
            del tag.attrib['href']
            tag.text = "[email address scrubbed]"
            for n in tag: # remove all child nodes
                tag.remove(n)

        # Demote h#s. These seem to occur around the table of contents only. Don't
        # demote the one we just made above for the title.
        if tag.tag in ("h1", "h2", "h3", "h4", "h5") and "Title" not in css_classes:
            tag.tag = "h" + str(int(tag.tag[1:])+1)

        # Turn some classes into h#s.
        for cls in css_classes:
            if cls in ("Heading1", "Heading2", "Heading3", "Heading4", "Heading5"):
                tag.tag = "h" + str(int(cls[7:])+1)
            if cls == "SummaryHeading":
                tag.tag = "h2"

        # Sanitize CSS classes using the whitelist above.
        if "class" in tag.attrib:
            new_classes = " ".join(sorted(set(tag.attrib["class"].split(" ")) & allowed_classes))
            if new_classes:
                tag.attrib["class"] = new_classes
            else:
                del tag.attrib["class"]

    # Serialize back to XHTML.
    content = lxml.etree.tostring(content, encoding=str, method="html")

    # Guard against unsafe content.
    import bleach
    def link_filter(name, value):
        if name in ("name", "class"):
            return True # "name" is for link targets
        if name == "href" and (value.startswith("http:") or value.startswith("https:") or value.startswith("#")):
            return True
        return False
    def image_filter(name, value):
        if name in ("class",):
            return True
        if name == "src" and (value.startswith("http:") or value.startswith("https:")):
            return True
        return False
    content = bleach.clean(
        content,
        tags=["a", "img", "b", "strong", "i", "em", "u", "sup", "sub", "span", "div", "p", "br", "ul", "ol", "li", "table", "thead", "tbody", "tr", "th", "td", "hr", "h1", "h2", "h3", "h4", "h5", "h6"],
        attributes={
            "*": ["title", "class"],
            "a": link_filter,
            "img": image_filter,
            "td": ["colspan", "rowspan"],
            "th": ["colspan", "rowspan"],
        }
    )

    # Write it out.
    with open(out_fn, "w") as f2:
        f2.write(content)
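
The scrub_text helper above chains three re.sub calls: extension-style phone numbers guarded against year ranges, full phone numbers, and one alternation of escaped author names. A self-contained sketch of that chain on invented input:

import re

# Illustrative inputs; the author names and phone numbers are invented.
author_names = ["Jane Doe", "John Q. Public"]
text = "Prepared by Jane Doe, 7-1234, or call (202) 555-0101 during 2007-2009."

# Internal 7-xxxx extensions, but not year ranges like 2007-2009 (no digit may precede the 7).
text = re.sub(r"(^|[^\d])7-\d{4}", r"\1[phone number scrubbed]", text)
# Full (xxx) xxx-xxxx numbers.
text = re.sub(r"\(\d{3}\) \d{3}-\d{4}", "[phone number scrubbed]", text)
# One alternation of author names, with re.escape neutralizing the "." in "Q."
text = re.sub("|".join(re.escape(a) for a in author_names), "[author name scrubbed]", text)

print(text)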

Example 32

View license
	def _loadProgress(self):
		#print('Progress')
		#print(self.url)
		
		if 'kissanime' in self.url:
			cookie_file = '/tmp/AnimeWatch/kcookie.txt'
		elif 'kisscartoon' in self.url:
			cookie_file = '/tmp/AnimeWatch/kcookieC.txt'
		elif 'kissasian' in self.url:
			cookie_file = '/tmp/AnimeWatch/kcookieD.txt'
		elif 'masterani' in self.url:
			cookie_file = '/tmp/AnimeWatch/animeSquare.txt'
		elif 'animeget' in self.url:
			cookie_file = '/tmp/AnimeWatch/animeget.txt'
		elif 'animeplace' in self.url:
			cookie_file = '/tmp/AnimeWatch/animeplace.txt'
		elif 'moetube' in self.url:
			cookie_file = '/tmp/AnimeWatch/animeHQ.txt'
		elif 'nyaa' in self.url:
			cookie_file = '/tmp/AnimeWatch/nyaa.txt'
		if 'moetube' in self.url:
			txt_file = '/tmp/AnimeWatch/moetube.txt'
			frame = self.mainFrame()  
			html = frame.toHtml()
			#print(html)
			if 'var glink = ' in html:
				if os.path.exists(txt_file):
					f = open(txt_file,'a')
				else:
					f = open(txt_file,'w')
				f.write(html)
				f.close()
		
		if self.cnt == 0 and os.path.exists(cookie_file) and ('kisscartoon' in self.url or 'kissasian' in self.url):
			frame = self.mainFrame()
			html = frame.toHtml()
			soup = BeautifulSoup(html,'lxml')
			m = soup.findAll('select',{'id':'selectQuality'})
			if m:
				print(m)
				arr = []
				for i in m:
					j = i.findAll('option')
					for k in j:
						l = k['value']
						#print(l)
						arr.append(l)
				total_q = len(arr)
				
					
						
				if arr:
					print('----------total Different Quality Video------',total_q)
					if self.quality == 'sd':
						txt = arr[-1]
					elif self.quality == 'hd':
						if total_q == 1:
							txt = arr[-1]
						elif total_q == 2:
							txt = arr[-2]
						elif total_q == 3 or total_q == 4:
							txt = arr[-3]
							
					elif self.quality == 'sd480p':
						if total_q == 1:
							txt = arr[-1]
						elif total_q == 2 or total_q == 3 or total_q == 4:
							txt = arr[-2]
						
					doc = frame.documentElement()
					bt = doc.findFirst("select[id=selectQuality]")
					#txt = arr[-1]
					bt.evaluateJavaScript('this.value="'+txt+'"')
					self.cnt = 1
		
		
		
		listCookies = self.networkAccessManager().cookieJar().allCookies()
		#print(listCookies)
		n = []
		m = ''
		o = ''
		for cookie in  listCookies:
			k=cookie.toRawForm()
			#k = getContentUnicode(k)
			k = re.sub("b'","'",str(k))
			#print(k)
			j = re.findall("'[^']*",k)
			for i in j:
				i = re.sub("'",'',i)
				if 'kissanime.to' in i or 'kissasian.com' in i or 'kisscartoon.me' in i or 'masterani.me' in i or 'animeget.io' in i or 'animeplace.co' in i or 'moetube.net' in i or 'nyaa.se' in i:
					j = re.findall('expires=[^;]*',i)
					if j:
						l = re.sub('expires=','',j[0])
						d = datetime.strptime(l,"%a, %d-%b-%Y %H:%M:%S %Z")
						t = calendar.timegm(d.timetuple())
						i = i+'; expiry='+str(int(t))
					else:
						i = i+'; expiry='+str(0)
					n.append(i)
		#print(n)
		cfc=''
		cfd =''
		asp = ''
		idt = ''
		test_idt = ''
		clr = False
		for i in n:
			if 'cf_clearance' in i:
				clr = True
				#print(n)
		if clr:
			for i in n:
				if 'cf_clearance' in i:
					cfc = self.cookie_split(i)
				elif '__cfduid' in i:
					cfd = self.cookie_split(i)
				elif 'ASP.NET_SessionId' in i:
					asp = self.cookie_split(i)
				elif 'idtz' in i:
					idt = self.cookie_split(i)
				
		if cfc and cfd:
			#print(cfc)
			#print(cfd)
			#print(asp)
			str1 = cfc['domain']+'	'+cfc['HttpOnly']+'	'+cfc['path']+'	'+'FALSE'+'	'+cfc['expiry']+'	'+'cf_clearance'+'	'+cfc['cf_clearance']
			str2 = cfd['domain']+'	'+cfd['HttpOnly']+'	'+cfd['path']+'	'+'FALSE'+'	'+cfd['expiry']+'	'+'__cfduid'+'	'+cfd['__cfduid']
			if asp:
				str3 = asp['domain']+'	'+'FALSE'+'	'+asp['path']+'	'+'FALSE'+'	'+asp['expiry']+'	'+'ASP.NET_SessionId'+'	'+asp['ASP.NET_SessionId']
			else:
				str3 = ''
			if idt:
				str3 = idt['domain']+'	'+'FALSE'+'	'+idt['path']+'	'+'FALSE'+'	'+idt['expiry']+'	'+'idtz'+'	'+idt['idtz']
			else:
				str3 = ''
			if 'kissasian' in self.url:
				str3 = 'kissasian.com	FALSE	/	FALSE	0		__test'
			
			if not os.path.exists('/tmp/AnimeWatch'):
				os.makedirs('/tmp/AnimeWatch')
			f = open(cookie_file,'w')
			if str3:
				f.write(str2+'\n'+str1+'\n'+str3)
			else:
				f.write(str2+'\n'+str1)
			f.close()
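
The cookie handling above stringifies each raw cookie (which yields a b'...' repr under Python 3), strips the wrapper with re.sub, and converts the expires= field into an epoch timestamp. A minimal sketch of that conversion with a made-up cookie line:

import calendar
import re
from datetime import datetime

# Invented raw cookie in the shape returned by QNetworkCookie.toRawForm().
raw = b"cf_clearance=abc123; expires=Sun, 15-Feb-2026 07:31:40 GMT; path=/; domain=.example.com"

# str(bytes) gives "b'...'"; drop the wrapper, much as the code above does with re.sub.
line = re.sub(r"^b'|'$", "", str(raw))

m = re.search(r"expires=([^;]*)", line)
if m:
    d = datetime.strptime(m.group(1), "%a, %d-%b-%Y %H:%M:%S %Z")
    print("expiry epoch:", calendar.timegm(d.timetuple()))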

Example 33

Project: Maildb
Source File: parse.py
View license
	def run(self):
		x = open(os.path.join(self.reportDir, self.msgFile))
		msg = email.message_from_file(x) # open the eml file so we can parse it
		x.close()
		
		################# Header Information ###################
		# Get me all the sections then write them as one big sql line
		
		dateLine = msg.get('Date')
		
		msg_id = int(os.path.basename(self.reportDir)) # Unique id for this email used to cross ref other tables
		try:
			fromAdd = msg['from'] # might need to tidy this up a little bit using the address parse option
		except:
			fromAdd = msg['from']
		stringIt = str(fromAdd)
		dbFrom = stringIt[stringIt.find("<")+1:stringIt.find(">")]
		# very messy need to fix this.
		addDomain = dbFrom[dbFrom.find("@")+1:] 
		try:
			subjectLine = unicode(msg['subject'], errors = 'replace')
		except:
			subjectLine = msg['subject']
		x_mailer = msg['X-Mailer']
		x_priority = msg['X-Priority']
		try:
			message_id = re.sub('[<>]', '', msg['Message-ID'])
		except:
			message_id = msg['Message-ID']
		hops = msg.get_all('Received')
		if hops:
			for hop in hops:
				hop = re.sub('[<>]', '', hop)
				sqlHop = (msg_id, hop)
				db.parseHops(sqlHop)
		try:
			sender = re.sub('[<>]', '', msg.get('From')) # remove <> so it renders correctly in the HTML
		except:
			sender = dbFrom
		try:
			to_add = re.sub('[<>]', '', msg.get('To')) #
		except:
			to_add = msg.get('To')
		try:
			cc_add = re.sub('[<>]', '', msg.get('cc'))
		except:
			cc_add = msg.get('cc')
		try:
			bcc_add = re.sub('[<>]', '', msg.get('Bcc'))
		except:
			bcc_add = msg.get('bcc')
		sqlHeader = ( msg_id, dateLine, sender, addDomain, subjectLine, x_mailer, x_priority, message_id, cc_add, bcc_add, to_add)
		db.parseHeader(sqlHeader)
		
		counter = 0
		for part in msg.walk():
			if part.get_content_maintype() == 'multipart':				
				continue
			
			if part.get_content_type() == 'text/plain': # Plain Text Body				
				contents = part.get_payload(decode=True)
				links = re.findall(r'(https?://\S+)', contents)
				link_type = "url"
				for urls in links:
					sqlUrl = (msg_id, link_type, urls)
					db.parseLinks(sqlUrl)
								
				from core.cleanHtml import cleanHTML
				htmlStrip = cleanHTML().safe_html(contents)
				if htmlStrip is not None:
					fp = open(os.path.join(self.reportDir, "attatchments", "body.txt"), 'wb')
					fp.write(htmlStrip.encode('ascii', 'ignore'))
					fp.close()
				
			if part.get_content_type() == 'text/html': # HTML Body
				contents = part.get_payload(decode=True)
				soup = BeautifulSoup(contents)
				for link in soup.find_all('a'):
					link_type = "url"
					urls = link.get('href')
					sqlUrl = (msg_id, link_type, urls)
					db.parseLinks(sqlUrl)
				for images in soup.find_all('img'):
					link_type = "img"
					image = images.get('src')
					sqlImg = (msg_id, link_type, image)
					db.parseLinks(sqlImg)
				for iframes in soup.find_all('iframe'):
					link_type = "iframe"
					frames = "Fix Me"
					sqlFrames = (msg_id, link_type, frames)
					db.parseLinks(sqlFrames)
								
				from core.cleanHtml import cleanHTML
				htmlStrip = cleanHTML().safe_html(contents)
				if htmlStrip is not None:
					fp = open(os.path.join(self.reportDir, "attatchments", "htmlbody.txt"), 'wb')
					fp.write(htmlStrip.encode('ascii', 'ignore'))
					fp.close()
 			
			if part.get('Content-Disposition') is None: # Actual File attatchments here
				continue
												
			from bs4 import UnicodeDammit
			filenameraw = str(part.get_filename())			
			dammit = UnicodeDammit(filenameraw)
			enctype = dammit.original_encoding
			if enctype == "ascii":
				filename = dammit.unicode_markup
			else:
				ext = mimetypes.guess_extension(part.get_content_type())
				filename = '%s-encoded-File-%s.%s' % (enctype, counter, ext)
								
			if filename == 'None': # if there's no name then guess the extension and make something up
				ext = mimetypes.guess_extension(part.get_content_type())
				if not ext:
					ext = ".bin"
				filename = 'part-%03d%s' % (counter, ext)
			counter +=1
			fp = open(os.path.join(self.reportDir, "attatchments", filename), 'wb') # write the attatchment out to a folder
			# Deal With Zero Size Files
			if part.get_payload(decode=True) is None:
				part_data = "This is a Zero Byte File"
				fp.write(part_data)
				fp.close()			
			else:
				fp.write(part.get_payload(decode=True))
				fp.close()
				part_data = part.get_payload(decode=True)
			fileSize = os.path.getsize(os.path.join(self.reportDir, "attatchments", filename))
			fileExt = os.path.splitext(os.path.join(self.reportDir, "attatchments", filename))		
			md5Hash = MailHash().HashMD5(part_data)
			sha256Hash = MailHash().HashSha256(part_data)
				
			if ssdeepcheck == '1': # check to see if the user has enabled ssdeep
				try:  # gracefully fail if the python wrapper is not installed.
				
					ssdHash = MailHash().Hashssdeep(part_data)
				except:
					ssdHash = "0"
			else:
				ssdHash = "0"
			
			import core.yarascan
			filetoScan = os.path.join(self.reportDir, "attatchments", filename)
			result = core.yarascan.fileScan(filetoScan, md5Hash, msg_id)
			match = '0'
			if result:
				yaraMatch = '3'
				match = '3'
			else:
				yaraMatch = '0'
					
			# database stuff here
		
			sqlAttatchments = (msg_id, str(filename), fileExt[1][1:], fileSize, md5Hash, sha256Hash, ssdHash, yaraMatch)			
			db.parseAttatch(sqlAttatchments)
			sqlYara = (counter, match, msg_id)
			db.parseYara(sqlYara)
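
Most of the re.sub calls above simply strip angle brackets from header values so they render cleanly in the HTML report. A short sketch with invented header values:

import re

# Invented header values in the shape produced by email.message_from_file().
headers = {
    "Message-ID": "<20240101123456.ABCDEF@mail.example.com>",
    "From": "Alice Example <alice@example.com>",
}

# Drop the angle brackets so the values render cleanly in HTML, as the parser above does.
cleaned = {key: re.sub(r"[<>]", "", value) for key, value in headers.items()}
print(cleaned["Message-ID"])
print(cleaned["From"])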

Example 35

Project: ensure
Source File: test.py
View license
    def test_basic_ensure_statements(self):
        ensure(range(10)).contains(5)
        with self.assertRaises(EnsureError):
            ensure(range(10)).contains(-1)

        ensure("abc").is_in("abcdef")
        with self.assertRaises(EnsureError):
            ensure(range(10)).is_in(-1)

        ensure("abc").matches("(abc|def)")
        with self.assertRaises(EnsureError):
            ensure(range(10)).is_in(-1)

        x = {x: x for x in range(10)}
        ok_clauses = ('Ensure(x).contains(0)',
                      'Ensure(x).contains_all_of(range(10))',
                      'Ensure(x).contains_no(str)',
                      'Ensure(x).contains_none_of(range(20, 30))',
                      'Ensure(x).contains_one_of(range(1))',
                      'Ensure(x).contains_some_of(range(2))',
                      'Ensure(x).contains_only(range(10))',
                      'Ensure(x).does_not_contain(-1)',
                      'Ensure(x).does_not_equal(range(10))',
                      'Ensure(x).has_attribute("__iter__")',
                      'Ensure(x).has_length(10).also.is_nonempty()',
                      'Ensure(x).has_length(length=10, min=9, max=10)',
                      'Ensure(x).has_length(max=99.9)',
                      'Ensure(x).is_nonempty().also.has_length(10)',
                      'Ensure(x).is_a(collections.Mapping)',
                      'Ensure(x).is_a_dict_of(int).to(int)',
                      'Ensure(x).is_a(collections.Mapping).of(int).to(int)',
                      'Ensure(6).is_greater_than(5)',
                      'Ensure(6).exceeds(5)',
                      'Ensure(1.1).is_greater_than_or_equal_to(1.1)',
                      'Ensure(1.1).is_less_than_or_equal_to(1.1)',
                      'Ensure(1).is_less_than(1.1)',
                      'Ensure(1).is_positive()',
                      'Ensure(1.1).is_a_positive(float)',
                      'Ensure(-1).is_negative()',
                      'Ensure(-1).is_a_negative(int)',
                      'Ensure(0).is_nonnegative()',
                      'Ensure(0).is_a_nonnegative(int)',
                      'Ensure(1).is_a_positive(int).which.equals(1.0)',
                      'Ensure((collections.namedtuple("Thing", ["x"]))(x={})).has_attribute("x").which.is_a(dict)',
                      'Ensure({1:"a"}).has_key(1).whose_value.has_length(1)',
                      'Ensure({1: "a", 2: "b", 3: "c"}).has_keys((1, 2))',
                      'Ensure({1: "a", 2: "b", 3: "c"}).has_only_keys((1, 2, 3))',
                      'Ensure({}).is_empty()',
                      'Ensure(os.path.join).called_with("a", "b").returns(os.path.join("a", "b"))',
                      'Ensure(int).called_with("1100101", base=2).returns(101)',
                      'Ensure.each_of([1,2,3]).is_an(int)',
                      'Ensure.each_of([lambda x: x, lambda y: y]).called_with(1).returns(1)',
                      'Ensure(True).is_none_or.is_an(int)',  # See https://www.python.org/dev/peps/pep-0285/ (section 6)
                      'Ensure(None).is_none_or.is_a_negative(int)',
                      'Ensure(-5).is_none_or.is_a_negative(int)',
                      'Ensure({"a": "b"}).is_none_or.has_key("a")',
                      'Ensure("A").satisfies(str.isupper)',
                      'Ensure("A").satisfies(".isupper")',
                      'Ensure("ABC").satisfies(str.startswith, "AB")',
                      'Ensure("ABC").satisfies(".startswith", "AB")',
                      'Ensure(3).satisfies(lambda x, y: x < y, y=4)')

        for clause in ok_clauses:
            print("Testing OK clause", clause)
            eval(clause)
            if 'each_of' not in clause:
                for sub in r'Check\1.otherwise(Exception)', r'Check\1.or_raise(Exception)', r'Check\1.or_call(self.assertTrue, False)':
                    print("Testing OK clause", re.sub(r'^Ensure(.+)', sub, clause))
                    eval(re.sub(r'^Ensure(.+)', sub, clause))

        bad_clauses = ('Ensure(x).contains(-1)',
                       'Ensure(x).has_length(10).also.is_empty()',
                       'Ensure(x).contains_all_of(range(20))',
                       'Ensure(x).contains_no(int)',
                       'Ensure(x).contains_none_of(range(0, 30))',
                       'Ensure(x).contains_one_of(range(2))',
                       'Ensure(x).contains_some_of(range(20, 30))',
                       'Ensure(x).contains_only(range(11))',
                       'Ensure(x).does_not_contain(1)',
                       'Ensure(x).does_not_equal(x)',
                       'Ensure(x).does_not_equal(copy.deepcopy(x))',
                       'Ensure(x).has_attribute("y")',
                       'Ensure(x).has_length(1)',
                       'Ensure(x).has_length(length=1, min=9, max=10)',
                       'Ensure(x).has_length(min=11)',
                       'Ensure(x).has_length(max=1.1)',
                       'Ensure(x).is_a(str)',
                       'Ensure(x).is_empty()',
                       'Ensure(6).is_greater_than(7)',
                       'Ensure(6).exceeds(7)',
                       'Ensure(1).is_greater_than_or_equal_to(1.1)',
                       'Ensure(None).is_greater_than_or_equal_to(1.1)',
                       'Ensure(5).is_less_than_or_equal_to(1)',
                       'Ensure(1).is_less_than(None)',
                       'Ensure(0).is_positive()',
                       'Ensure(1).is_a_positive(float)',
                       'Ensure(1).is_negative()',
                       'Ensure(-0).is_a_negative(int)',
                       'Ensure(-0.1).is_nonnegative()',
                       'Ensure(None).is_a_nonnegative(int)',
                       'Ensure({1: "a"}).has_key(1).whose_value.has_length(2)',
                       'Ensure({1: "a"}).has_keys((1, 2))',
                       'Ensure({1: "a", 2: "b"}).has_only_keys([1])',
                       'Ensure({1: "a", 2: "b"}).has_only_keys([1, 2, 3])',
                       'Ensure([1, 2, 3]).has_only_keys([1, 2, 3])',
                       'Ensure(os.path.join).called_with("a", "b").returns(None)',
                       'Ensure(1).is_a_positive(int).which.equals(1.2)',
                       'Ensure.each_of([lambda x: x, lambda y: y]).called_with(2).returns(1)',
                       'Ensure(5).is_none_or.is_a_negative(int)',
                       'Ensure(None).is_a_negative(int)',
                       'Ensure("a").satisfies(str.isupper)',
                       'Ensure("a").satisfies(".isupper")',
                       'Ensure("ABC").satisfies(str.startswith, "Z")',
                       'Ensure("ABC").satisfies(".startswith", "Z")',
                       'Ensure(5).satisfies(str.isupper)',
                       'Ensure(5).satisfies(".isupper")')

        for clause in bad_clauses:
            print("Testing bad clause", clause)
            with self.assertRaises(EnsureError):
                eval(clause)
            if 'each_of' not in clause:
                for sub in r'Check\1.otherwise(Exception)', r'Check\1.or_raise(Exception)', r'Check\1.or_call(self.assertTrue, False)':
                    with self.assertRaises(Exception):
                        print("Testing bad clause", re.sub(r'^Ensure(.+)', sub, clause))
                        eval(re.sub(r'^Ensure(.+)', sub, clause))

        with self.assertRaises(EnsureError):
            Ensure(x).is_a_dict_of(int).to(str)
        with self.assertRaises(EnsureError):
            Ensure(x).is_a_dict_of(str).to(int)
        with self.assertRaises(EnsureError):
            Ensure(x).called_with().is_an(int)
        Ensure(lambda: True).is_callable()

        Ensure("1.1").is_a_numeric_string()
        with self.assertRaises(EnsureError):
            Ensure(b"1").is_a_numeric_string()
        with self.assertRaises(EnsureError):
            Ensure("").is_a_numeric_string()
        with self.assertRaises(EnsureError):
            Ensure(None).is_a_numeric_string()

        Ensure(b"1").is_a_numeric_bytestring()
        Ensure(b"1.1").is_a_numeric_bytestring()
        with self.assertRaises(EnsureError):
            Ensure("1").is_a_numeric_bytestring()
        with self.assertRaises(EnsureError):
            Ensure(b"").is_a_numeric_bytestring()
        with self.assertRaises(EnsureError):
            Ensure(None).is_a_numeric_bytestring()

        Ensure("1").is_an_integer_string()
        with self.assertRaises(EnsureError):
            Ensure("1.1").is_an_integer_string()

        Ensure(b"1").is_an_integer_bytestring()
        with self.assertRaises(EnsureError):
            Ensure(b"1.1").is_an_integer_bytestring()
        with self.assertRaises(EnsureError):
            Ensure("1").is_an_integer_bytestring()
        with self.assertRaises(EnsureError):
            Ensure(b"").is_an_integer_bytestring()
        with self.assertRaises(EnsureError):
            Ensure(None).is_an_integer_bytestring()
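
The re.sub calls in the test above rewrite each evaluated "Ensure(...)" clause string into its "Check(...)" counterpart by capturing everything after the leading "Ensure" and splicing it into a new template with a backreference. A minimal standalone sketch of that substitution, using one clause string from the list above and one of the test's own replacement templates:

import re

clause = 'Ensure(x).contains(0)'
# Capture everything after the "Ensure" prefix and re-emit it under "Check",
# appending the .otherwise(...) call used by the Check variants in the test.
rewritten = re.sub(r'^Ensure(.+)', r'Check\1.otherwise(Exception)', clause)
print(rewritten)  # Check(x).contains(0).otherwise(Exception)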

Example 37

Project: calibre
Source File: bibtex.py
View license
    def run(self, path_to_output, opts, db, notification=DummyReporter()):
        from calibre.utils.date import isoformat
        from calibre.utils.html2text import html2text
        from calibre.utils.bibtex import BibTeX
        from calibre.library.save_to_disk import preprocess_template
        from calibre.utils.date import now as nowf
        from calibre.utils.logging import default_log as log

        library_name = os.path.basename(db.library_path)

        def create_bibtex_entry(entry, fields, mode, template_citation,
                                    bibtexdict, db, citation_bibtex=True, calibre_files=True):

            # Bibtex doesn't like UTF-8 but keep unicode until writing
            # Start the entry chain; in strict 'book' mode an invalid book entry returns an empty string

            bibtex_entry = []
            if mode != "misc" and check_entry_book_valid(entry) :
                bibtex_entry.append(u'@book{')
            elif mode != "book" :
                bibtex_entry.append(u'@misc{')
            else :
                # case strict book
                return ''

            if citation_bibtex :
                # Citation tag
                bibtex_entry.append(make_bibtex_citation(entry, template_citation,
                    bibtexdict))
                bibtex_entry = [u' '.join(bibtex_entry)]

            for field in fields:
                if field.startswith('#'):
                    item = db.get_field(entry['id'],field,index_is_id=True)
                    if isinstance(item, (bool, float, int)):
                        item = repr(item)
                elif field == 'title_sort':
                    item = entry['sort']
                elif field == 'library_name':
                    item = library_name
                else:
                    item = entry[field]

                # check if the field should be included (none or empty)
                if item is None:
                    continue
                try:
                    if len(item) == 0 :
                        continue
                except TypeError:
                    pass

                if field == 'authors' :
                    bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item))

                elif field == 'id' :
                    bibtex_entry.append(u'calibreid = "%s"' % int(item))

                elif field == 'rating' :
                    bibtex_entry.append(u'rating = "%s"' % int(item))

                elif field == 'size' :
                    bibtex_entry.append(u'%s = "%s octets"' % (field, int(item)))

                elif field == 'tags' :
                    # A list to flatten
                    bibtex_entry.append(u'tags = "%s"' % bibtexdict.utf8ToBibtex(u', '.join(item)))

                elif field == 'comments' :
                    # \n removal
                    item = item.replace(u'\r\n',u' ')
                    item = item.replace(u'\n',u' ')
                    # unmatched brace removal (users should use \leftbrace or \rightbrace for single braces)
                    item = bibtexdict.stripUnmatchedSyntax(item, u'{', u'}')
                    # html to text
                    try:
                        item = html2text(item)
                    except:
                        log.warn("Failed to convert comments to text")
                    bibtex_entry.append(u'note = "%s"' % bibtexdict.utf8ToBibtex(item))

                elif field == 'isbn' :
                    # Could be 9, 10 or 13 digits
                    bibtex_entry.append(u'isbn = "%s"' % format_isbn(item))

                elif field == 'formats' :
                    # Add file path if format is selected
                    formats = [format.rpartition('.')[2].lower() for format in item]
                    bibtex_entry.append(u'formats = "%s"' % u', '.join(formats))
                    if calibre_files:
                        files = [u':%s:%s' % (format, format.rpartition('.')[2].upper())
                            for format in item]
                        bibtex_entry.append(u'file = "%s"' % u', '.join(files))

                elif field == 'series_index' :
                    bibtex_entry.append(u'volume = "%s"' % int(item))

                elif field == 'timestamp' :
                    bibtex_entry.append(u'timestamp = "%s"' % isoformat(item).partition('T')[0])

                elif field == 'pubdate' :
                    bibtex_entry.append(u'year = "%s"' % item.year)
                    bibtex_entry.append(u'month = "%s"' % bibtexdict.utf8ToBibtex(strftime("%b", item)))

                elif field.startswith('#') and isinstance(item, basestring):
                    bibtex_entry.append(u'custom_%s = "%s"' % (field[1:],
                        bibtexdict.utf8ToBibtex(item)))

                elif isinstance(item, basestring):
                    # elif field in ['title', 'publisher', 'cover', 'uuid', 'ondevice',
                        # 'author_sort', 'series', 'title_sort'] :
                    bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item)))

            bibtex_entry = u',\n    '.join(bibtex_entry)
            bibtex_entry += u' }\n\n'

            return bibtex_entry

        def check_entry_book_valid(entry):
            # Check that the required fields are ok for a book entry
            for field in ['title', 'authors', 'publisher'] :
                if entry[field] is None or len(entry[field]) == 0 :
                    return False
            if entry['pubdate'] is None :
                return False
            else :
                return True

        def make_bibtex_citation(entry, template_citation, bibtexclass):

            # define a function to replace the template entry by its value
            def tpl_replace(objtplname) :

                tpl_field = re.sub(u'[\{\}]', u'', objtplname.group())

                if tpl_field in TEMPLATE_ALLOWED_FIELDS :
                    if tpl_field in ['pubdate', 'timestamp'] :
                        tpl_field = isoformat(entry[tpl_field]).partition('T')[0]
                    elif tpl_field in ['tags', 'authors'] :
                        tpl_field =entry[tpl_field][0]
                    elif tpl_field in ['id', 'series_index'] :
                        tpl_field = str(entry[tpl_field])
                    else :
                        tpl_field = entry[tpl_field]
                    return tpl_field
                else:
                    return u''

            if len(template_citation) >0 :
                tpl_citation = bibtexclass.utf8ToBibtex(
                    bibtexclass.ValidateCitationKey(re.sub(u'\{[^{}]*\}',
                        tpl_replace, template_citation)))

                if len(tpl_citation) >0 :
                    return tpl_citation

            if len(entry["isbn"]) > 0 :
                template_citation = u'%s' % re.sub(u'[\D]',u'', entry["isbn"])

            else :
                template_citation = u'%s' % str(entry["id"])

            return bibtexclass.ValidateCitationKey(template_citation)

        self.fmt = path_to_output.rpartition('.')[2]
        self.notification = notification

        # Combobox options
        bibfile_enc = ['utf8', 'cp1252', 'ascii']
        bibfile_enctag = ['strict', 'replace', 'ignore', 'backslashreplace']
        bib_entry = ['mixed', 'misc', 'book']

        # Needed because the CLI returns a str while the GUI widget returns an int
        try:
            bibfile_enc = bibfile_enc[opts.bibfile_enc]
            bibfile_enctag = bibfile_enctag[opts.bibfile_enctag]
            bib_entry = bib_entry[opts.bib_entry]
        except:
            if opts.bibfile_enc in bibfile_enc :
                bibfile_enc = opts.bibfile_enc
            else :
                log.warn("Incorrect --choose-encoding flag, revert to default")
                bibfile_enc = bibfile_enc[0]
            if opts.bibfile_enctag in bibfile_enctag :
                bibfile_enctag = opts.bibfile_enctag
            else :
                log.warn("Incorrect --choose-encoding-configuration flag, revert to default")
                bibfile_enctag = bibfile_enctag[0]
            if opts.bib_entry in bib_entry :
                bib_entry = opts.bib_entry
            else :
                log.warn("Incorrect --entry-type flag, revert to default")
                bib_entry = bib_entry[0]

        if opts.verbose:
            opts_dict = vars(opts)
            log("%s(): Generating %s" % (self.name,self.fmt))
            if opts.connected_device['is_device_connected']:
                log(" connected_device: %s" % opts.connected_device['name'])
            if opts_dict['search_text']:
                log(" --search='%s'" % opts_dict['search_text'])

            if opts_dict['ids']:
                log(" Book count: %d" % len(opts_dict['ids']))
                if opts_dict['search_text']:
                    log(" (--search ignored when a subset of the database is specified)")

            if opts_dict['fields']:
                if opts_dict['fields'] == 'all':
                    log(" Fields: %s" % ', '.join(FIELDS[1:]))
                else:
                    log(" Fields: %s" % opts_dict['fields'])

            log(" Output file will be encoded in %s with %s flag" % (bibfile_enc, bibfile_enctag))

            log(" BibTeX entry type is %s with a citation like '%s' flag" % (bib_entry, opts_dict['bib_cit']))

        # If a list of ids are provided, don't use search_text
        if opts.ids:
            opts.search_text = None

        data = self.search_sort_db(db, opts)

        if not len(data):
            log.error("\nNo matching database entries for search criteria '%s'" % opts.search_text)

        # Get the requested output fields as a list
        fields = self.get_output_fields(db, opts)

        if not len(data):
            log.error("\nNo matching database entries for search criteria '%s'" % opts.search_text)

        # Initialize BibTeX class
        bibtexc = BibTeX()

        # Entries are written after BibTeX formatting (or not)
        if bibfile_enc != 'ascii' :
            bibtexc.ascii_bibtex = False
        else :
            bibtexc.ascii_bibtex = True

        # Check citation choice and go to default in case of bad CLI
        if isinstance(opts.impcit, (StringType, UnicodeType)) :
            if opts.impcit == 'False' :
                citation_bibtex= False
            elif opts.impcit == 'True' :
                citation_bibtex= True
            else :
                log.warn("Incorrect --create-citation, revert to default")
                citation_bibtex= True
        else :
            citation_bibtex= opts.impcit

        # Check add file entry and go to default in case of bad CLI
        if isinstance(opts.addfiles, (StringType, UnicodeType)) :
            if opts.addfiles == 'False' :
                addfiles_bibtex = False
            elif opts.addfiles == 'True' :
                addfiles_bibtex = True
            else :
                log.warn("Incorrect --add-files-path, revert to default")
                addfiles_bibtex= True
        else :
            addfiles_bibtex = opts.addfiles

        # Preprocess for error and light correction
        template_citation = preprocess_template(opts.bib_cit)

        # Open output and write entries
        with codecs.open(path_to_output, 'w', bibfile_enc, bibfile_enctag)\
            as outfile:
            # File header
            nb_entries = len(data)

            # check in book strict if all is ok else throw a warning into log
            if bib_entry == 'book' :
                nb_books = len(filter(check_entry_book_valid, data))
                if nb_books < nb_entries :
                    log.warn("Only %d entries in %d are book compatible" % (nb_books, nb_entries))
                    nb_entries = nb_books

            # If connected device, add 'On Device' values to data
            if opts.connected_device['is_device_connected'] and 'ondevice' in fields:
                for entry in data:
                    entry['ondevice'] = db.catalog_plugin_on_device_temp_mapping[entry['id']]['ondevice']

            outfile.write(u'%%%Calibre catalog\n%%%{0} entries in catalog\n\n'.format(nb_entries))
            outfile.write(u'@preamble{"This catalog of %d entries was generated by calibre on %s"}\n\n'
                % (nb_entries, nowf().strftime("%A, %d. %B %Y %H:%M").decode(preferred_encoding)))

            for entry in data:
                outfile.write(create_bibtex_entry(entry, fields, bib_entry, template_citation,
                    bibtexc, db, citation_bibtex, addfiles_bibtex))
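
Three re.sub variants appear in the catalog plugin above: stripping brace characters with a character class, expanding {field} placeholders in the citation template by passing a callable as the replacement argument, and falling back to deleting every non-digit from the ISBN. A minimal sketch of the callable-replacement pattern, with a made-up entry dict and template string standing in for calibre's own data:

import re

entry = {'title': 'Example Book', 'id': 42}  # hypothetical stand-in for a catalog entry

def tpl_replace(match):
    # Strip the surrounding braces, then look the field name up in the entry.
    field = re.sub(r'[{}]', '', match.group())
    return str(entry.get(field, ''))

citation = re.sub(r'\{[^{}]*\}', tpl_replace, '{title}_{id}')
print(citation)                                # Example Book_42
print(re.sub(r'\D', '', '978-0-306-40615-7'))  # 9780306406157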

Example 39

Project: theHarvester
Source File: theHarvester.py
View license
def start(argv):
    if len(sys.argv) < 4:
        usage()
        sys.exit()
    try:
        opts, args = getopt.getopt(argv, "l:d:b:s:vf:nhcte:")
    except getopt.GetoptError:
        usage()
        sys.exit()
    start = 0
    host_ip = []
    filename = ""
    bingapi = "yes"
    dnslookup = False
    dnsbrute = False
    dnstld = False
    shodan = False
    vhost = []
    virtual = False
    limit = 100
    dnsserver = ""
    for opt, arg in opts:
        if opt == '-l':
            limit = int(arg)
        elif opt == '-d':
            word = arg
        elif opt == '-s':
            start = int(arg)
        elif opt == '-v':
            virtual = "basic"
        elif opt == '-f':
            filename = arg
        elif opt == '-n':
            dnslookup = True
        elif opt == '-c':
            dnsbrute = True
        elif opt == '-h':
            shodan = True
        elif opt == '-e':
            dnsserver = arg
        elif opt == '-t':
            dnstld = True
        elif opt == '-b':
            engine = arg
            if engine not in ("baidu", "bing", "bingapi","dogpile", "google", "googleCSE", "googleplus", "google-profiles","linkedin", "pgp", "twitter", "vhost", "yahoo", "all"):
                usage()
                print "Invalid search engine, try with: baidu, bing, bingapi, dogpile, google, googleCSE, googleplus, google-profiles, linkedin, pgp, twitter, vhost, yahoo, all"
                sys.exit()
            else:
                pass
    if engine == "google":
        print "[-] Searching in Google:"
        search = googlesearch.search_google(word, limit, start)
        search.process()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    if engine == "googleCSE":
        print "[-] Searching in Google Custom Search:"
        search = googleCSE.search_googleCSE(word, limit, start)
        search.process()
        search.store_results()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "bing" or engine == "bingapi":
        print "[-] Searching in Bing:"
        search = bingsearch.search_bing(word, limit, start)
        if engine == "bingapi":
            bingapi = "yes"
        else:
            bingapi = "no"
        search.process(bingapi)
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "dogpile":
        print "[-] Searching in Dogpilesearch.."
        search = dogpilesearch.search_dogpile(word, limit)
        search.process()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "pgp":
        print "[-] Searching in PGP key server.."
        search = pgpsearch.search_pgp(word)
        search.process()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "yahoo":
        print "[-] Searching in Yahoo.."
        search = yahoosearch.search_yahoo(word, limit)
        search.process()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "baidu":
        print "[-] Searching in Baidu.."
        search = baidusearch.search_baidu(word, limit)
        search.process()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "googleplus":
        print "[-] Searching in Google+ .."
        search = googleplussearch.search_googleplus(word, limit)
        search.process()
        people = search.get_people()
        print "Users from Google+:"
       	print "===================="
       	for user in people:
            print user
        sys.exit()

    elif engine == "twitter":
        print "[-] Searching in Twitter .."
        search = twittersearch.search_twitter(word, limit)
        search.process()
        people = search.get_people()
        print "Users from Twitter:"
       	print "===================="
       	for user in people:
            print user
        sys.exit()

    elif engine == "linkedin":
        print "[-] Searching in Linkedin.."
        search = linkedinsearch.search_linkedin(word, limit)
        search.process()
        people = search.get_people()
        print "Users from Linkedin:"
       	print "===================="
       	for user in people:
            print user
        sys.exit()
    elif engine == "google-profiles":
        print "[-] Searching in Google profiles.."
        search = googlesearch.search_google(word, limit, start)
        search.process_profiles()
        people = search.get_profiles()
        print "Users from Google profiles:"
        print "---------------------------"
        for users in people:
            print users
        sys.exit()
    elif engine == "all":
        print "Full harvest.."
        all_emails = []
        all_hosts = []
        virtual = "basic"
        print "[-] Searching in Google.."
        search = googlesearch.search_google(word, limit, start)
        search.process()
        emails = search.get_emails()
        hosts = search.get_hostnames()
        all_emails.extend(emails)
        all_hosts.extend(hosts)
        print "[-] Searching in PGP Key server.."
        search = pgpsearch.search_pgp(word)
        search.process()
        emails = search.get_emails()
        hosts = search.get_hostnames()
        all_hosts.extend(hosts)
        all_emails.extend(emails)
        print "[-] Searching in Bing.."
        bingapi = "no"
        search = bingsearch.search_bing(word, limit, start)
        search.process(bingapi)
        emails = search.get_emails()
        hosts = search.get_hostnames()
        all_hosts.extend(hosts)
        all_emails.extend(emails)
        print "[-] Searching in Exalead.."
        search = exaleadsearch.search_exalead(word, limit, start)
        search.process()
        emails = search.get_emails()
        hosts = search.get_hostnames()
        all_hosts.extend(hosts)
        all_emails.extend(emails)

        #Clean up email list, sort and uniq
        all_emails=sorted(set(all_emails))
    #Results############################################################
    print "\n\n[+] Emails found:"
    print "------------------"
    if all_emails == []:
        print "No emails found"
    else:
        print "\n".join(all_emails)

    print "\n[+] Hosts found in search engines:"
    print "------------------------------------"
    if all_hosts == []:
        print "No hosts found"
    else:
        all_hosts=sorted(set(all_hosts))
        print "[-] Resolving hostnames IPs... "
        full_host = hostchecker.Checker(all_hosts)
        full = full_host.check()
        for host in full:
            ip = host.split(':')[0]
            print host
            if host_ip.count(ip.lower()):
                pass
            else:
                host_ip.append(ip.lower())

    #DNS reverse lookup#################################################
    dnsrev = []
    if dnslookup == True:
        print "\n[+] Starting active queries:"
        analyzed_ranges = []
        for x in full:
            ip = x.split(":")[0]
            range = ip.split(".")
            range[3] = "0/24"
            range = string.join(range, '.')
            if not analyzed_ranges.count(range):
                print "[-]Performing reverse lookup in :" + range
                a = dnssearch.dns_reverse(range, True)
                a.list()
                res = a.process()
                analyzed_ranges.append(range)
            else:
                continue
            for x in res:
                if x.count(word):
                    dnsrev.append(x)
                    if x not in full:
                        full.append(x)
        print "Hosts found after reverse lookup:"
        print "---------------------------------"
        for xh in dnsrev:
            print xh
    #DNS Brute force####################################################
    dnsres = []
    if dnsbrute == True:
        print "\n[-] Starting DNS brute force:"
        a = dnssearch.dns_force(word, dnsserver, verbose=True)
        res = a.process()
        print "\n[+] Hosts found after DNS brute force:\n"
        for y in res:
            print y
            dnsres.append(y)
            if y not in full:
                full.append(y)
    #DNS TLD expansion###################################################
    dnstldres = []
    if dnstld == True:
        print "[-] Starting DNS TLD expansion:"
        a = dnssearch.dns_tld(word, dnsserver, verbose=True)
        res = a.process()
        print "\n[+] Hosts found after DNS TLD expansion:"
        print "=========================================="
        for y in res:
            print y
            dnstldres.append(y)
            if y not in full:
                full.append(y)

    #Virtual hosts search###############################################
    if virtual == "basic":
        print "[+] Virtual hosts:"
        print "=================="
        for l in host_ip:
            search = bingsearch.search_bing(l, limit, start)
            search.process_vhost()
            res = search.get_allhostnames()
            for x in res:
                x = re.sub(r'[[\<\/?]*[\w]*>]*','',x)
                x = re.sub('<','',x)
                x = re.sub('>','',x)
                print l + "\t" + x
                vhost.append(l + ":" + x)
                full.append(l + ":" + x)
        vhost=sorted(set(vhost))
    else:
        pass
    shodanres = []
    shodanvisited = []
    if shodan == True:
        print "[+] Shodan Database search:"
        for x in full:
            print x
            try:
                ip = x.split(":")[0]
                if not shodanvisited.count(ip):
                    print "\tSearching for: " + x
                    a = shodansearch.search_shodan(ip)
                    shodanvisited.append(ip)
                    results = a.run()
                    for res in results:
                        shodanres.append(
                            x + "SAPO" + str(res['banner']) + "SAPO" + str(res['port']))
            except:
                pass
        print "[+] Shodan results:"
        print "==================="
        for x in shodanres:
            print x.split("SAPO")[0] + ":" + x.split("SAPO")[1]
    else:
        pass

    ###################################################################
    # Here I need to add explosion mode.
    # I need to extract the TLDs to do this.
    recursion = None
    if recursion:
        start = 0
        for word in vhost:
            search = googlesearch.search_google(word, limit, start)
            search.process()
            emails = search.get_emails()
            hosts = search.get_hostnames()
            print emails
            print hosts
    else:
        pass

    #Reporting#######################################################
    if filename != "":
        try:
            print "[+] Saving files..."
            html = htmlExport.htmlExport(
                all_emails,
                full,
                vhost,
                dnsres,
                dnsrev,
                filename,
                word,
                shodanres,
                dnstldres)
            save = html.writehtml()
        except Exception as e:
            print e
            print "Error creating the file"
        try:
            filename = filename.split(".")[0] + ".xml"
            file = open(filename, 'w')
            file.write('<?xml version="1.0" encoding="UTF-8"?><theHarvester>')
            for x in all_emails:
                file.write('<email>' + x + '</email>')

            for x in full:
                x = x.split(":")
                if len(x) == 2:
                    file.write('<host>' + '<ip>' + x[0] + '</ip><hostname>' + x[1]  + '</hostname>' + '</host>')
                else:
                    file.write('<host>' + ':'.join(x) + '</host>')
            for x in vhost:
                x = x.split(":")
                if len(x) == 2:
                    file.write('<vhost>' + '<ip>' + x[0] + '</ip><hostname>' + x[1]  + '</hostname>' + '</vhost>')
                else:
                    file.write('<vhost>' + ':'.join(x) + '</vhost>')

            if shodanres != []:
                shodanalysis = []
                for x in shodanres:
                    res = x.split("SAPO")
                    # print " res[0] " + res[0] # ip/host
                    # print " res[1] " + res[1] # banner/info
                    # print " res[2] " + res[2] # port
                    file.write('<shodan>')
                    #page.h3(res[0])
                    file.write('<host>' + res[0] + '</host>')
                    #page.a("Port :" + res[2])
                    file.write('<port>' + res[2] + '</port>')
                    #page.pre(res[1])
                    file.write('<banner><!--' + res[1] + '--></banner>')
                    
                    
                    reg_server = re.compile('Server:.*')
                    temp = reg_server.findall(res[1])
                    if temp != []:
                        shodanalysis.append(res[0] + ":" + temp[0])
                    
                    file.write('</shodan>')
                if shodanalysis != []:
                    shodanalysis=sorted(set(shodanalysis))
                    file.write('<servers>')
                    for x in shodanalysis:
                        #page.pre(x)
                        file.write('<server>' + x + '</server>')
                    file.write('</servers>')
                    

            file.write('</theHarvester>')
            file.flush()
            file.close()
            print "Files saved!"
        except Exception as er:
            print "Error saving XML file: " + er
        sys.exit()
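
In the virtual-hosts loop above, three chained re.sub calls scrub leftover markup out of the hostnames scraped from Bing before they are printed and stored. A minimal sketch of the same clean-up, applied to a hypothetical scraped fragment:

import re

x = '<b>mail.example.com</b>'  # hypothetical scraped hostname fragment
x = re.sub(r'[[\<\/?]*[\w]*>]*', '', x)  # drop tag-like runs such as "<b>" or "</b>"
x = re.sub('<', '', x)                   # sweep up any stray angle brackets
x = re.sub('>', '', x)
print(x)  # mail.example.com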

Example 40

Project: theHarvester
Source File: theHarvester.py
View license
def start(argv):
    if len(sys.argv) < 4:
        usage()
        sys.exit()
    try:
        opts, args = getopt.getopt(argv, "l:d:b:s:vf:nhcte:")
    except getopt.GetoptError:
        usage()
        sys.exit()
    start = 0
    host_ip = []
    filename = ""
    bingapi = "yes"
    dnslookup = False
    dnsbrute = False
    dnstld = False
    shodan = False
    vhost = []
    virtual = False
    limit = 100
    dnsserver = ""
    for opt, arg in opts:
        if opt == '-l':
            limit = int(arg)
        elif opt == '-d':
            word = arg
        elif opt == '-s':
            start = int(arg)
        elif opt == '-v':
            virtual = "basic"
        elif opt == '-f':
            filename = arg
        elif opt == '-n':
            dnslookup = True
        elif opt == '-c':
            dnsbrute = True
        elif opt == '-h':
            shodan = True
        elif opt == '-e':
            dnsserver = arg
        elif opt == '-t':
            dnstld = True
        elif opt == '-b':
            engine = arg
            if engine not in ("baidu", "bing", "bingapi","dogpile", "google", "googleCSE", "googleplus", "google-profiles","linkedin", "pgp", "twitter", "vhost", "yahoo", "all"):
                usage()
                print "Invalid search engine, try with: baidu, bing, bingapi, dogpile, google, googleCSE, googleplus, google-profiles, linkedin, pgp, twitter, vhost, yahoo, all"
                sys.exit()
            else:
                pass
    if engine == "google":
        print "[-] Searching in Google:"
        search = googlesearch.search_google(word, limit, start)
        search.process()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    if engine == "googleCSE":
        print "[-] Searching in Google Custom Search:"
        search = googleCSE.search_googleCSE(word, limit, start)
        search.process()
        search.store_results()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "bing" or engine == "bingapi":
        print "[-] Searching in Bing:"
        search = bingsearch.search_bing(word, limit, start)
        if engine == "bingapi":
            bingapi = "yes"
        else:
            bingapi = "no"
        search.process(bingapi)
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "dogpile":
        print "[-] Searching in Dogpilesearch.."
        search = dogpilesearch.search_dogpile(word, limit)
        search.process()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "pgp":
        print "[-] Searching in PGP key server.."
        search = pgpsearch.search_pgp(word)
        search.process()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "yahoo":
        print "[-] Searching in Yahoo.."
        search = yahoosearch.search_yahoo(word, limit)
        search.process()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "baidu":
        print "[-] Searching in Baidu.."
        search = baidusearch.search_baidu(word, limit)
        search.process()
        all_emails = search.get_emails()
        all_hosts = search.get_hostnames()

    elif engine == "googleplus":
        print "[-] Searching in Google+ .."
        search = googleplussearch.search_googleplus(word, limit)
        search.process()
        people = search.get_people()
        print "Users from Google+:"
       	print "===================="
       	for user in people:
            print user
        sys.exit()

    elif engine == "twitter":
        print "[-] Searching in Twitter .."
        search = twittersearch.search_twitter(word, limit)
        search.process()
        people = search.get_people()
        print "Users from Twitter:"
       	print "===================="
       	for user in people:
            print user
        sys.exit()

    elif engine == "linkedin":
        print "[-] Searching in Linkedin.."
        search = linkedinsearch.search_linkedin(word, limit)
        search.process()
        people = search.get_people()
        print "Users from Linkedin:"
       	print "===================="
       	for user in people:
            print user
        sys.exit()
    elif engine == "google-profiles":
        print "[-] Searching in Google profiles.."
        search = googlesearch.search_google(word, limit, start)
        search.process_profiles()
        people = search.get_profiles()
        print "Users from Google profiles:"
        print "---------------------------"
        for users in people:
            print users
        sys.exit()
    elif engine == "all":
        print "Full harvest.."
        all_emails = []
        all_hosts = []
        virtual = "basic"
        print "[-] Searching in Google.."
        search = googlesearch.search_google(word, limit, start)
        search.process()
        emails = search.get_emails()
        hosts = search.get_hostnames()
        all_emails.extend(emails)
        all_hosts.extend(hosts)
        print "[-] Searching in PGP Key server.."
        search = pgpsearch.search_pgp(word)
        search.process()
        emails = search.get_emails()
        hosts = search.get_hostnames()
        all_hosts.extend(hosts)
        all_emails.extend(emails)
        print "[-] Searching in Bing.."
        bingapi = "no"
        search = bingsearch.search_bing(word, limit, start)
        search.process(bingapi)
        emails = search.get_emails()
        hosts = search.get_hostnames()
        all_hosts.extend(hosts)
        all_emails.extend(emails)
        print "[-] Searching in Exalead.."
        search = exaleadsearch.search_exalead(word, limit, start)
        search.process()
        emails = search.get_emails()
        hosts = search.get_hostnames()
        all_hosts.extend(hosts)
        all_emails.extend(emails)

        #Clean up email list, sort and uniq
        all_emails=sorted(set(all_emails))
    #Results############################################################
    print "\n\n[+] Emails found:"
    print "------------------"
    if all_emails == []:
        print "No emails found"
    else:
        print "\n".join(all_emails)

    print "\n[+] Hosts found in search engines:"
    print "------------------------------------"
    if all_hosts == []:
        print "No hosts found"
    else:
        all_hosts=sorted(set(all_hosts))
        print "[-] Resolving hostnames IPs... "
        full_host = hostchecker.Checker(all_hosts)
        full = full_host.check()
        for host in full:
            ip = host.split(':')[0]
            print host
            if host_ip.count(ip.lower()):
                pass
            else:
                host_ip.append(ip.lower())

    #DNS reverse lookup#################################################
    dnsrev = []
    if dnslookup == True:
        print "\n[+] Starting active queries:"
        analyzed_ranges = []
        for x in full:
            ip = x.split(":")[0]
            range = ip.split(".")
            range[3] = "0/24"
            range = string.join(range, '.')
            if not analyzed_ranges.count(range):
                print "[-]Performing reverse lookup in :" + range
                a = dnssearch.dns_reverse(range, True)
                a.list()
                res = a.process()
                analyzed_ranges.append(range)
            else:
                continue
            for x in res:
                if x.count(word):
                    dnsrev.append(x)
                    if x not in full:
                        full.append(x)
        print "Hosts found after reverse lookup:"
        print "---------------------------------"
        for xh in dnsrev:
            print xh
    #DNS Brute force####################################################
    dnsres = []
    if dnsbrute == True:
        print "\n[-] Starting DNS brute force:"
        a = dnssearch.dns_force(word, dnsserver, verbose=True)
        res = a.process()
        print "\n[+] Hosts found after DNS brute force:\n"
        for y in res:
            print y
            dnsres.append(y)
            if y not in full:
                full.append(y)
    #DNS TLD expansion###################################################
    dnstldres = []
    if dnstld == True:
        print "[-] Starting DNS TLD expansion:"
        a = dnssearch.dns_tld(word, dnsserver, verbose=True)
        res = a.process()
        print "\n[+] Hosts found after DNS TLD expansion:"
        print "=========================================="
        for y in res:
            print y
            dnstldres.append(y)
            if y not in full:
                full.append(y)

    #Virtual hosts search###############################################
    if virtual == "basic":
        print "[+] Virtual hosts:"
        print "=================="
        for l in host_ip:
            search = bingsearch.search_bing(l, limit, start)
            search.process_vhost()
            res = search.get_allhostnames()
            for x in res:
                x = re.sub(r'[[\<\/?]*[\w]*>]*','',x)
                x = re.sub('<','',x)
                x = re.sub('>','',x)
                print l + "\t" + x
                vhost.append(l + ":" + x)
                full.append(l + ":" + x)
        vhost=sorted(set(vhost))
    else:
        pass
    shodanres = []
    shodanvisited = []
    if shodan == True:
        print "[+] Shodan Database search:"
        for x in full:
            print x
            try:
                ip = x.split(":")[0]
                if not shodanvisited.count(ip):
                    print "\tSearching for: " + x
                    a = shodansearch.search_shodan(ip)
                    shodanvisited.append(ip)
                    results = a.run()
                    for res in results:
                        shodanres.append(
                            x + "SAPO" + str(res['banner']) + "SAPO" + str(res['port']))
            except:
                pass
        print "[+] Shodan results:"
        print "==================="
        for x in shodanres:
            print x.split("SAPO")[0] + ":" + x.split("SAPO")[1]
    else:
        pass

    ###################################################################
    # Here I need to add explosion mode.
    # I need to extract the TLDs to do this.
    recursion = None
    if recursion:
        start = 0
        for word in vhost:
            search = googlesearch.search_google(word, limit, start)
            search.process()
            emails = search.get_emails()
            hosts = search.get_hostnames()
            print emails
            print hosts
    else:
        pass

    #Reporting#######################################################
    if filename != "":
        try:
            print "[+] Saving files..."
            html = htmlExport.htmlExport(
                all_emails,
                full,
                vhost,
                dnsres,
                dnsrev,
                filename,
                word,
                shodanres,
                dnstldres)
            save = html.writehtml()
        except Exception as e:
            print e
            print "Error creating the file"
        try:
            filename = filename.split(".")[0] + ".xml"
            file = open(filename, 'w')
            file.write('<?xml version="1.0" encoding="UTF-8"?><theHarvester>')
            for x in all_emails:
                file.write('<email>' + x + '</email>')

            for x in full:
                x = x.split(":")
                if len(x) == 2:
                    file.write('<host>' + '<ip>' + x[0] + '</ip><hostname>' + x[1]  + '</hostname>' + '</host>')
                else:
                    file.write('<host>' + x[0] + '</host>')
            for x in vhost:
                x = x.split(":")
                if len(x) == 2:
                    file.write('<vhost>' + '<ip>' + x[0] + '</ip><hostname>' + x[1]  + '</hostname>' + '</vhost>')
                else:
                    file.write('<vhost>' + x[0] + '</vhost>')

            if shodanres != []:
                shodanalysis = []
                for x in shodanres:
                    res = x.split("SAPO")
                    # print " res[0] " + res[0] # ip/host
                    # print " res[1] " + res[1] # banner/info
                    # print " res[2] " + res[2] # port
                    file.write('<shodan>')
                    #page.h3(res[0])
                    file.write('<host>' + res[0] + '</host>')
                    #page.a("Port :" + res[2])
                    file.write('<port>' + res[2] + '</port>')
                    #page.pre(res[1])
                    file.write('<banner><!--' + res[1] + '--></banner>')
                    
                    
                    reg_server = re.compile('Server:.*')
                    temp = reg_server.findall(res[1])
                    if temp != []:
                        shodanalysis.append(res[0] + ":" + temp[0])
                    
                    file.write('</shodan>')
                if shodanalysis != []:
                    shodanalysis=sorted(set(shodanalysis))
                    file.write('<servers>')
                    for x in shodanalysis:
                        #page.pre(x)
                        file.write('<server>' + x + '</server>')
                    file.write('</servers>')
                    

            file.write('</theHarvester>')
            file.flush()
            file.close()
            print "Files saved!"
        except Exception as er:
            print "Error saving XML file: " + er
        sys.exit()
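
In the virtual-hosts step above, every hostname returned by Bing is cleaned with chained re.sub calls before it is printed and stored. A minimal sketch of that cleanup; the result string is a made-up placeholder:

import re

raw = "<strong>mail.example.com</strong>"
clean = re.sub(r'[[\<\/?]*[\w]*>]*', '', raw)  # same pattern the script applies first
clean = re.sub('<', '', clean)                 # then any leftover angle brackets
clean = re.sub('>', '', clean)
print(clean)  # mail.example.com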

Example 41

Project: yalih
Source File: executemechanize.py
View license
def executemechanize(urldict):

        url = urldict["url"]
        url_no = urldict["counter"]

        #Array of redirections
        threadlocal.__setattr__('redirection_list', [])

        # Mechanize Settings
        br = mechanize.Browser()
        cj = cookielib.LWPCookieJar()
        br.set_cookiejar(cj)
        br.set_handle_equiv(True)
        br.set_handle_gzip(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(False)
        br.set_handle_robots(False)
        br.set_debug_responses(True)
        br.set_debug_redirects(True)
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=0)
        br.set_proxies(honeypotconfig.proxy)
        br.encoding = "UTF-8"
        # Add HTTP Basic/Digest auth username and password for HTTP proxy access.
        # (equivalent to using "joe:[email protected]" form above)
        #       br.add_proxy_password("username", "password")

        # Set header, referrer, accept language from honeypotconfig
        if honeypotconfig.referrer:
                br.addheaders = [('User-Agent', honeypotconfig.useragent),('Accept', 'text/html,application/xhtml+xml,application/xml,text/javascript;q=0.9,*/*;q=0.8'),('Accept-Language', honeypotconfig.acceptlang),('Accept-Encoding', 'gzip,deflate'),('Referer', honeypotconfig.referrer)]
        else:
                br.addheaders = [('User-Agent', honeypotconfig.useragent),('Accept', 'text/html,application/xhtml+xml,application/xml,text/javascript;q=0.9,*/*;q=0.8'),('Accept-Language', honeypotconfig.acceptlang),('Accept-Encoding', 'gzip,deflate'),('Referer', host)] #'https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source='+url)]

        cj.add_cookie_header(br)

        try:
                r = br.open(url, timeout=12.0)

                # Parse url (url after redirections)
                scheme, host, path, params, query, fragment = urlparse.urlparse(r.geturl())

                # Print redirection route if exist
                threadlocal.__setattr__('redirect', mechanize._redirection.redirection())

                # Extract and format URL
                extracted = tldextract.extract(url)
#               print extracted
                #formatted = "{}.{}".format(extracted.domain, extracted.tld)
                formatted = "{}.{}.{}".format(extracted.subdomain, extracted.domain, extracted.suffix)
                print formatted

                # Extract each link in the redirection list and match it aginst the formatted URL
                for eachredirect in threadlocal.redirection_list:
                        list_extract = tldextract.extract(eachredirect)
                        list_format = "{}.{}.{}".format(list_extract.subdomain, list_extract.domain, list_extract.suffix)
#                       print list_format
                        if list_format == formatted:
                                pass
                        if not list_format == formatted:
                                if threadlocal.redirection_list:
                                        logger.info(str(url_no) + ",\t" + url + ",\t" + "Redirection Route" + ",\t" +str(threadlocal.redirection_list))
                                        break

                #if threadlocal.redirection_list:
                        #logger.info(str(url_no) + ",\t" + url + ",\t" + "Redirection Route" + ",\t" +str(threadlocal.redirection_list))

                # Convert url into valid file name
                fdirname = urllib.quote_plus(url)
                if (len(fdirname) > 250):
                        fdirname = fdirname[:247]

# Folder Generation

                # Gets first character of website to store alphabetically
                first_char = re.sub(r"(http://|https://)?(www.)?", "", url)[:1]
                second_char = re.sub(r"(http://|https://)?(www.)?", "", url)[1:3]

                directory_name = os.path.join(honeypotconfig.wdir, honeypotconfig.tmpfolder, first_char,  second_char, fdirname)

                # If using proxy, names directory in the format ip_address:port
                if honeypotconfig.proxy:
                        proxy_name = re.search(r":\s?['\"](.*)\s?['\"]", str(honeypotconfig.proxy)).group(1)
                        directory_name = os.path.join(honeypotconfig.wdir, proxy_name, first_char, second_char, fdirname)

                create_directory(directory_name)

                # Fetch array of javascript url
                jsurl_list_old, jsurl_list, url_list = js_extraction(br.response().read(), scheme, host)

                # Remove duplicates
                jsurl_list_unique = set(jsurl_list)
                del jsurl_list[:]

                # Modify javascript paths in html if relative path
                fp = open(os.path.join(directory_name, fdirname), "wb")
                new_js_path = br.response().read()
                for link in jsurl_list_old:
                        if not link.lower().startswith(("www.","http://","https://")):
                                js_name=link[link.rfind("/") + 1:]
                                new_js_path = re.sub(re.escape(link), "./javascripts/" + js_name, new_js_path)

                fp.write(new_js_path)
                fp.close()

                del jsurl_list_old[:]

                # Grab the current extension of the file and check the true extension
                # Rename if differ
                current_ext = os.path.splitext(os.path.join(directory_name, fdirname))[1]
                guess_ext = mimetypes.guess_extension(magic.from_file(os.path.join(directory_name, fdirname), mime=True))
                if (guess_ext != current_ext and guess_ext is not None):
                        os.rename((os.path.join(directory_name, fdirname)), (os.path.join(directory_name, fdirname)) + str(guess_ext))

#Fetching .js Files

                if len(jsurl_list_unique) != 0:
                        create_directory(os.path.join(directory_name,  "javascripts"))


                for link in jsurl_list_unique:
                        try:
                                r = br.open(link, timeout=12.0)
                                logger.info(str(url_no) + ",\t" + url + ",\tJS retrieve,\t" + link)
                                js_name = link[link.rfind("/") + 1:]
                                response = br.response().read()

                                # If it doesn't end with ".js" eg. "abc.js?key=123" truncate after "?"
                                if not js_name.endswith(".js"):
                                        js_name = js_name[0:js_name.rfind("?")]

                                # Writes js file
                                js_file_path = os.path.join(honeypotconfig.wdir, honeypotconfig.tmpfolder, first_char, second_char, fdirname, "javascripts", js_name)
                                if honeypotconfig.proxy:
                                        proxyname = re.search(r":\s?['\"](.*)\s?['\"]", str(honeypotconfig.proxy)).group(1)
                                        js_file_path = os.path.join(honeypotconfig.wdir, proxyname, first_char, second_char, fdirname, "javascripts", js_name)
                                jswrite = open(js_file_path, 'w')
                                jswrite.write(response)

                                if honeypotconfig.jsbeautifier:
                                        jswrite.write("\n====================================================\n")
                                        jswrite.write("====================Beautified Below================\n")
                                        with open(js_file_path , 'a') as f:
                                                beautify_script_string = jsbeautifier.beautify(response, opts)
                                                f.write(str(beautify_script_string))
                                jswrite.close()

                        except Exception, e:
                                try:
                                        logger.error(str(url_no) + ",\t" + url.strip() + ",\t" + str(e) + ",\t" + link,  extra = {'error_code' : str(e.code)})
                                except AttributeError:
                                        logger.error(str(url_no) + ",\t" + url.strip() + ",\t" + str(e) + ",\t" + link,  extra = {'error_code' : ""})

                jsurl_list_unique.clear()

                # Check for executable files and saves them
                exe_list = []

                if exe_crawler:
                        exe_list = exe_extraction(url_list)

                if len(exe_list) != 0:
                        create_directory(os.path.join(directory_name,  "exe"))

                for link in exe_list:
                        try:
                                # Read header to check for exe size
                                # Only downloads if less than a threshold (set in honeypotconfig)
                                r = urllib2.urlopen(link, timeout=12)
                                size = int(r.headers["Content-Length"]) / 1024
                                exename = link[link.rfind("/") + 1:]
                                if size < honeypotconfig.exe_max_size:
                                        logger.info(str(url_no) + ",\t" + url + ",\t" + "EXE retrieve,\t" + link)
                                        exe_file_path = os.path.join(honeypotconfig.wdir, honeypotconfig.tmpfolder, first_char, second_char, fdirname, "exe", exename)
                                        if honeypotconfig.proxy:
                                                proxyname = re.search(r":\s?['\"](.*)\s?['\"]", str(honeypotconfig.proxy)).group(1)
                                                exe_file_path = os.path.join(honeypotconfig.wdir, proxyname, first_char, second_char, fdirname, "exe", exename)
                                        exewrite = open(exe_file_path, 'w')
                                        exewrite.write(br.response().read())
                                        exewrite.close()
                                else:
                                        logger.info(str(url_no) + ",\t" + url + ",\t" + "EXE " + str(size) + "KB above exe_max_size" + ",\t" + link)
                        except Exception, e:
                                try:
                                        logger.error(str(url_no) + ",\t" + url.strip() + ",\t" + str(e) + ",\t" + link,  extra = {'error_code' : str(e.code)})
                                except AttributeError:
                                        logger.error(str(url_no) + ",\t" + url.strip() + ",\t" + str(e) + ",\t" + link,  extra = {'error_code' : ""})

                del exe_list[:]
                del url_list[:]

        except Exception, e:
                try:
                        logger.error(str(url_no) + ",\t" + url.strip() + "\tpoop" + ",\t" + str(e), extra = {'error_code' : str(e.code)})
                except AttributeError:
#                       logger.error(str(url_no) + ",\t" + url.strip() + "\tpoop" +",\t" + "Error 418: I'm a teapot", extra = {'error_code' : ""})
#                       else:
                        logger.error(str(url_no) + ",\t" + url.strip() + "\tpoop" +",\t" + str(e), extra = {'error_code' : ""})
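
One detail worth isolating from the example above: fetched pages are filed alphabetically, so the scheme and any leading "www." are stripped from the URL with re.sub and the first characters of what remains become the directory keys. A small sketch of that derivation; the URL is an example value:

import re

url = "https://www.example.com/index.html"
stripped = re.sub(r"(http://|https://)?(www.)?", "", url)
first_char = stripped[:1]    # 'e'
second_char = stripped[1:3]  # 'xa'
print(first_char + " " + second_char)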

Example 42

Project: maglica
Source File: vm.py
View license
def clone(args):
    image = args["image"]
    hostname = args["hostname"]

    format = None
    if args.has_key("format"):
        format = args["format"]

    conn = libvirt.open(None)
    dom = conn.lookupByName(image)
    desc = fromstring(dom.XMLDesc(libvirt.VIR_DOMAIN_XML_SECURE))

    sources = []
    for disk in desc.findall(".//disk"):
        if disk.get("device") == "disk":
            sources.append(disk.find(".//source").get("file"))

    target_paths = []
    for source in sources:
        target_file = os.path.basename(source)
        target_file = target_file.replace(image, hostname)
        target_dir = _select_most_free_dir(conn)
        if not target_dir:
            target_dir = os.path.dirname(source)

        target_paths.append(os.path.join(target_dir, target_file))

    cmdline = [
        "virt-clone",
        "-o",
        image,
        "-n",
        hostname,
    ]

    for path in target_paths:
        cmdline.append("-f")
        cmdline.append(path)

    proc = subprocess.Popen(
        cmdline, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()

    message = None
    status = 1
    if proc.returncode:
        status = 2
        message = stderr

    g = guestfs.GuestFS()
    for path in target_paths:
        g.add_drive(path)
    g.launch()

    roots = g.inspect_os()
    mountpoints = g.inspect_get_mountpoints(roots[0])
    mountpoints.sort()

    for mountpoint in mountpoints:
        g.mount(mountpoint[1], mountpoint[0])

    ostype = None
    if g.exists('/etc/redhat-release'):
        ostype = 'redhat'
    elif g.exists('/etc/debian_version'):
        ostype = 'debian'

    # TODO: split this into a separate module per OS
    if ostype == 'redhat':
        ifcfg = '''DEVICE=%s
BOOTPROTO=dhcp
ONBOOT=yes
TYPE="Ethernet"
DHCP_HOSTNAME=%s
'''

        network = '''NETWORKING=yes
HOSTNAME=%s
'''

        ifcfg0 = ifcfg % ('eth0', hostname)
        network = network % (hostname)

        g.write_file('/etc/sysconfig/network-scripts/ifcfg-eth0', ifcfg0, 0)
        g.write_file('/etc/sysconfig/network', network, 0)
        g.write_file('/etc/udev/rules.d/70-persistent-net.rules', '', 0)

        if g.exists('/etc/sysconfig/network-scripts/ifcfg-eth1'):
            ifcfg1 = ifcfg % ('eth1', re.sub(r"\.pb$", ".pblan", hostname))
            g.write_file(
                '/etc/sysconfig/network-scripts/ifcfg-eth1', ifcfg1, 0)

    elif ostype == 'debian':
        g.write_file('/etc/hosts', '127.0.0.1    localhost', 0)
        g.write_file('/etc/hostname', hostname, 0)
        g.write_file('/etc/udev/rules.d/70-persistent-net.rules', '', 0)

        interface = '''
interface "%s" {
  send host-name "%s";
}
'''
        eth0 = interface % ('eth0', hostname)
        eth1 = interface % ('eth1', re.sub(r"\.pb$", ".pblan", hostname))

        conf = g.read_file('/etc/dhcp/dhclient.conf')
        g.write_file('/etc/dhcp/dhclient.conf', conf + eth0 + eth1, 0)

    shadow = g.read_file("/etc/shadow")
    g.write_file("/etc/shadow", re.sub(
        r"^root:[^:]+:", "root:$1$ZJsvbRbB$dWzQZuu8dDFR8wr6PTPjp0:", shadow), 0)

    if format == "vmdk":
        grub = g.read_file("/boot/grub/grub.conf")
        g.write_file("/boot/grub/grub.conf", re.sub(
            r"console=[^\s]+", "", grub), 0)

    g.sync()
    g.umount_all()

    dom = conn.lookupByName(hostname)
    if args["start"] and format != "vmdk":
        dom.create()

    if format == "vmdk":
        vmdk_path = "/var/www/html/maglica/%s.vmdk" % hostname
        cmdline = [
            "qemu-img",
            "convert",
            "-f",
            "raw",
            "-O",
            "vmdk",
            target_paths[0],
            vmdk_path,
        ]

        proc = subprocess.Popen(
            cmdline, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()

        if proc.returncode:
            status = 2
            message = stderr
        else:
            message = "Get vmdk file from http://%s/maglica/%s.vmdk" % (
                socket.gethostname(), hostname)

        remove({"name": hostname})

    if status == 1 and not message:
        message = "%s was cloned from %s on %s successfully" % (
            hostname, image, socket.gethostname())

    return {
        "message": message,
        "status": status,
    }
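
Two of the re.sub calls above do the interesting work: one rewrites a hostname ending in ".pb" to ".pblan" for the guest's second interface, and one replaces the root password hash field in /etc/shadow. A minimal sketch of both substitutions; the hostname, salt and hash below are placeholders, not values the project uses:

import re

hostname = "web01.pb"
print(re.sub(r"\.pb$", ".pblan", hostname))  # web01.pblan

shadow = "root:$6$oldsalt$oldhash:17000:0:99999:7:::\nbin:*:17000:0:99999:7:::"
shadow = re.sub(r"^root:[^:]+:", "root:$1$newsalt$newhash:", shadow)
print(shadow.splitlines()[0])  # root:$1$newsalt$newhash:17000:0:99999:7:::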

Example 44

Project: parlparse
Source File: createhansardindex.py
View license
def CmIndexFromPage(urllinkpage):
	urlinkpage = urllib.urlopen(urllinkpage)
        #print "urllinkpage ", urllinkpage
	srlinkpage = urlinkpage.read()
	urlinkpage.close()

	# remove comments because they sometimes contain wrong links
	srlinkpage = re.sub('<!--[\s\S]*?-->', ' ', srlinkpage)


	# <b>Wednesday 5 November 2003</b>
	#<td colspan=2><font size=+1><b>Wednesday 5 November 2003</b></font></td>
	# <a href="../cm199900/cmhansrd/vo000309/debindx/00309-x.htm">Oral Questions and Debates</a>

	# this was when I didn't use the match objects, and preferred this more direct detection thing
	datelinks = redateindexlinks.findall(srlinkpage)

	# read the dates and links in order, and associate last date with each matching link
	sdate = ''
	for link1 in datelinks:
		if link1[0]:
			odate = re.sub('\s', ' ', link1[0])
                        if odate == 'Wednesday 1 November' and urllinkpage == 'http://www.publications.parliament.uk/pa/cm/cmhn0611.htm':
                                odate = 'Wednesday 1 November 2006'
                        if odate == 'Tuesday 9 November 2008' and sdate=='':
                                odate = 'Tuesday 9 December 2008'
                        if odate == 'Wednesday 10 November 2008' and sdate=='':
                                odate = 'Wednesday 10 December 2008'
                        if odate == 'Tuesday 8 June 2008' and sdate=='':
                                odate = 'Tuesday 8 July 2008'
			sdate = mx.DateTime.DateTimeFrom(odate).date
			continue

		# these come from the special dates (of ranges) listed from above.
		# any more of these and I'll have to make special code to handle them
		if link1[1]:
			if link1[1][0:22] == "<b>Friday 23 July 2004":
				odate = "1 Sept 2004" # the date quoted on the wrans page
			elif link1[1][0:27] == "<b>Friday 17 September 2004":
				odate = "4 October 2004" # the date quoted on the wrans page
			else:
				assert False
			sdate = mx.DateTime.DateTimeFrom(odate).date
			continue

                if link1[2]:
                        odate = re.sub('&nbsp;', ' ', link1[3])
                        if link1[3] == 'Friday, 6 February 2003':
                                odate = '7 February 2003'
                        if link1[3] == 'Thursday, 24th February 1999':
                                odate = '25 February 1999'
                        sdate = mx.DateTime.DateTimeFrom(odate).date
                        if sdate < earliestdate:
                                continue
                        uind = urlparse.urljoin(urllinkpage, re.sub('\s', '', link1[2]))
                        typ = "Votes and Proceedings"
                elif link1[4]:
                        odate = re.sub('\s+', ' ', link1[5].replace('&nbsp;', ' '))
                        sdate = mx.DateTime.DateTimeFrom(odate).date
                        if sdate < earliestdate:
                                continue
                        uind = urlparse.urljoin(urllinkpage, re.sub('\s', '', link1[4]))
                        typ = "Question Book"
                elif link1[6]:
                        linkhref = link1[6]
                        linktext = link1[7]
                	# the link types by name
		        if not re.search('debate|westminster|written(?i)', linktext):
			        continue

               		if re.search('Chronology', linktext):
        			# print "Chronology:", link
	        		continue

		        # get rid of the new index pages
        		if re.search('/indexes/|cmordbk|/business/', linkhref):
	        		continue

                        if (re.search('Written Answers received between Friday 26 May and Thursday 1 June\s+2006', linktext)):
                                odate = '2 June 2006'
                                sdate = mx.DateTime.DateTimeFrom(odate).date

        		if not sdate:
        			raise Exception, 'No date for link 1 in: ' + urllinkpage + ' ' + ','.join(link1)
        		if sdate < earliestdate:
        			continue

        		# take out spaces and linefeeds we don't want
        		uind = urlparse.urljoin(urllinkpage, re.sub('\s', '', linkhref))
        		typ = string.strip(re.sub('\s+', ' ', linktext))
                        if typ == 'Recess Written Answers':
                                typ = 'Written Answers'

                elif link1[8]:
                        linkhref = link1[8]
                        linktext = link1[9]

                        if re.match('Written Answers and Statements received between<br>\s*Monday 4 September and Friday 8 September 2006', linktext):
                                odate = '11 September 2006'
                        elif re.match('Written Answers received between<br>\s*Wednesday 26 July and Friday 1 September 2006', linktext):
                                odate = '4 September 2006'
                        elif re.match('Written Answers and Statements received between<br>\s*Monday 11 September and Wednesday 13 September 2006', linktext):
                                odate = '13 September 2006'
                        elif re.match('Written Answers and Statements received between<br>\s*Thursday 14 September and Monday 18 September 2006', linktext):
                                odate = '18 September 2006'
                        elif re.match('Written Answers received between<br>\s*Tuesday 19 September and Friday 29 September 2006', linktext):
                                odate = '2 October 2006'
                        elif re.match('Written Answers received between<br>\s*Wednesday 20 December 2006 and Friday 5 January 2007', linktext):
                                odate = '5 January 2007'
                        elif re.match('Written Answers received between<br>\s*Monday 12 February 2007 and Friday 16 February 2007', linktext):
                                odate = '16 February 2007'
                        elif re.match('Written Answers received between<br>\s*Wednesday 12 February 2007 and Friday 16 February 2007', linktext):
                                odate = '16 February 2007'
                        else:
        			raise Exception, 'No date for link 2 in: ' + urllinkpage + ' ' + ','.join(link1)

                        sdate = mx.DateTime.DateTimeFrom(odate).date
        		uind = urlparse.urljoin(urllinkpage, re.sub('\s', '', linkhref))
        		typ = 'Written Answers'

                uind = uind.replace('080227a', '080227')

                # 21st July 2005 has a link, but there was none
                if uind == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo050721/hallindx/50721-x.htm':
                        continue
                if uind == 'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm071203/hallindx/71203-x.htm':
                        continue
                if uind == 'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm080218/hallindx/80218-x.htm':
                        continue
                if uind == 'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm080225/hallindx/80225-x.htm':
                        continue
                if uind == 'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm080229/hallindx/80229-x.htm':
                        continue
                # 21st June 2005 WHall links to 22nd June
                if sdate=='2005-06-21' and uind=='http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo050622/hallindx/50622-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo050621/hallindx/50621-x.htm'
                # 25th May 2006 WMS links to 4th May
                if sdate=='2006-05-25' and uind=='http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060504/wmsindx/60504-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060525/wmsindx/60525-x.htm'
                if sdate=='2007-06-28' and uind=='http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070627/wmsindx/70628-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070628/wmsindx/70628-x.htm'

                if sdate=='2007-02-26' and uind=='http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070129/index/70129-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070226/index/70226-x.htm'
                if sdate=='2007-02-26' and uind=='http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070129/wmsindx/70129-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070226/wmsindx/70226-x.htm'
                if sdate=='2007-02-26' and uind=='http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070129/debindx/70129-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070226/debindx/70226-x.htm'

                if sdate=='2007-01-19' and uind=='http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070126/index/70126-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070119/index/70119-x.htm'
                if sdate=='2007-01-19' and uind=='http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070126/wmsindx/70126-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070119/wmsindx/70119-x.htm'
                if sdate=='2007-01-19' and uind=='http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070126/debindx/70126-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070119/debindx/70119-x.htm'

                if sdate=='2007-10-23' and uind=='http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm071016/debindx/71023-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm071023/debindx/71023-x.htm'
                if sdate=='2007-11-15' and uind=='http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm071114/debindx/71115-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm071115/debindx/71115-x.htm'
                if sdate=='2008-01-15' and uind=='http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm080116/index/80115-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm080115/index/80115-x.htm'

                # 7th May 2008 debates links to 8th May
                if sdate=='2008-05-07' and uind=='http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm080508/debindx/80508-x.htm':
                        uind = 'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm080507/debindx/80507-x.htm'
                if sdate>='2006-12-05' and sdate<='2006-12-14' and typ=='Westminster Hall':
                        uind = uind.replace('200506', '200607')

		# check for repeats where the URLs differ
		if (sdate, typ) in reses:

			rc = reses[(sdate, typ)]
			otheruind = rc[0]
			if otheruind == uind:
				continue

			# sometimes they have old links to the cm edition as
			# well as the vo edition, we pick the newer vo ones
			# make sure that discrepancies are explainable
			test1 = uind.replace('cmhansrd/cm', 'cmhansrd/vo')
			test2 = otheruind.replace('cmhansrd/cm', 'cmhansrd/vo')
			if test1 != test2:
				raise Exception, '------\nRepeated link to %s %s differs:\nurl1: %s\nurl2: %s\nfrom index page1: %s\nindex2: %s\n------' % (sdate, typ, uind, otheruind, urllinkpage, rc[1])

			# case of two URLs the same only vo/cm differ like this:
			# (which is a bug in Hansard, should never happen)
			#http://www.publications.parliament.uk/pa/cm200203/cmhansrd/vo031006/index/31006-x.htm
			#http://www.publications.parliament.uk/pa/cm200203/cmhansrd/cm031006/index/31006-x.htm
			# we replace both with just the vo edition:
			#print "done replace of these two URLs into the vo one\nurl1: %s\nurl2: %s" % (uind, otheruind)
			uind = test1

		reses[(sdate, typ)] = (uind, urllinkpage)
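
The substitutions in the example above all serve one purpose: cleaning the scraped index page before parsing it. HTML comments are blanked out because they sometimes hold stale links, whitespace runs in the visible date text are collapsed to single spaces, and whitespace inside link targets is removed entirely. A short standalone sketch of those three calls on a made-up fragment:

import re

page = '<!-- old, wrong link --><b>Wednesday 5 November 2003</b>'
page = re.sub(r'<!--[\s\S]*?-->', ' ', page)                 # drop comments before parsing
date = re.sub(r'\s+', ' ', 'Wednesday  5\nNovember 2003')    # collapse whitespace in date text
href = re.sub(r'\s', '', ' vo031105/debindx/31105-x.htm\n')  # strip whitespace from a link target
print(page)  #  <b>Wednesday 5 November 2003</b>
print(date)  # Wednesday 5 November 2003
print(href)  # vo031105/debindx/31105-x.htm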

Example 46

Project: pombola
Source File: kenya_parser.py
View license
    @classmethod
    def convert_html_to_data(cls, html):

        # Clean out all the &nbsp; now. pdftohtml puts them to preserve the lines
        html = re.sub( r'&nbsp;', ' ', html )
        html = re.sub( r'&#160;', ' ', html )

        # create a soup out of the html
        soup = BeautifulSoup(
            html,
            convertEntities=BeautifulStoneSoup.HTML_ENTITIES
        )

        if not soup.body:
            raise Exception, "No <body> was found - output probably isn't HTML"
        contents = soup.body.contents

        # counters to use in the loops below
        br_count    = 0
        page_number = 1

        filtered_contents = []

        while len(contents):
            line = contents.pop(0)

            # get the tag name if there is one
            tag_name = line.name if type(line) == Tag else None

            # count <br> tags - we use two or more in succession to decide that
            # we've moved on to a new bit of text
            if tag_name == 'br':
                br_count += 1
                continue

            # skip empty lines
            if tag_name == None:
                text_content = unicode(line)
            else:
                text_content = line.text

            if re.match( r'\s*$', text_content ):
                continue


            # For Assembly
            # check for something that looks like the page number - when found
            # delete it and the two lines that follow
            if tag_name == 'b':
                page_number_match = re.match( r'(\d+)\s{10,}', line.text )
                if page_number_match:
                    # up the page number - the match is the page that we are leaving
                    page_number = int(page_number_match.group(0)) + 1
                    # skip on to the next page
                    while len(contents):
                        item = contents.pop(0)
                        if type(item) == Tag and item.name == 'hr': break
                    continue

            # For Senate
            # check for something that looks like the page number
            if tag_name == 'b':
                page_number_match = re.search( r'\s{10,}(\d+)', line.text )
                if page_number_match:
                    # set the page number - the match is the page that we are on
                    page_number = int(page_number_match.group(0))
                    continue

            if tag_name == 'b':
                if re.search( r'\s*Disclaimer:', line.text ):
                    # This is a disclaimer line that we can skip
                    continue

            # if br_count > 0:
            #     print 'br_count: ' + str(br_count)
            # print type( line )
            # # if type(line) == Tag: print line.name
            # print "%s: >>>%s<<<" % (tag_name, text_content)
            # print '------------------------------------------------------'

            text_content = text_content.strip()
            text_content = re.sub( r'\s+', ' ', text_content )

            filtered_contents.append(dict(
                tag_name     = tag_name,
                text_content = text_content,
                br_count     = br_count,
                page_number  = page_number,
            ))

            br_count = 0

        # go through all the filtered_content and using the br_count determine
        # when lines should be merged
        merged_contents = []

        for line in filtered_contents:

            # print line
            br_count = line['br_count']

            # Join lines that have the same tag_name and are not too far apart
            same_tag_name_test = (
                    br_count <= 1
                and len(merged_contents)
                and line['tag_name'] == merged_contents[-1]['tag_name']
            )

            # Italic text in the current unstyled text
            inline_italic_test = (
                    br_count == 0
                and len(merged_contents)
                and line['tag_name'] == 'i'
                and merged_contents[-1]['tag_name'] == None
            )

            # Merge lines that meet one of the above tests
            if ( same_tag_name_test or inline_italic_test ):
                new_content = ' '.join( [ merged_contents[-1]['text_content'], line['text_content'] ] )
                new_content = re.sub( r'\s+,', ',', new_content )
                merged_contents[-1]['text_content'] = new_content
            else:
                merged_contents.append( line )

        # now go through and create some meaningful chunks from what we see
        meaningful_content = []
        last_speaker_name  = ''
        last_speaker_title = ''

        while len(merged_contents):

            line = merged_contents.pop(0)
            next_line = merged_contents[0] if len(merged_contents) else None

            # print '----------------------------------------'
            # print line

            # if the content is italic then it is a scene
            if line['tag_name'] == 'i':
                meaningful_content.append({
                    'type': 'scene',
                    'text': line['text_content'],
                    'page_number': line['page_number'],
                })
                continue

            # if the content is all caps then it is a heading
            if line['text_content'] == line['text_content'].upper():
                meaningful_content.append({
                    'type': 'heading',
                    'text': line['text_content'],
                    'page_number': line['page_number'],
                })
                last_speaker_name  = ''
                last_speaker_title = ''
                continue

            # It is a speech if we have a speaker and it is not formatted
            if line['tag_name'] == None and last_speaker_name:

                # do some quick smarts to see if we can extract a name from the
                # start of the speech.
                speech = line['text_content']

                matches = re.match( r'\(([^\)]+)\):(.*)', speech )
                if matches:
                    last_speaker_title = last_speaker_name
                    last_speaker_name = matches.group(1)
                    speech = matches.group(2)
                else:
                    # strip leading colons that may have been missed when the
                    # name was extracted (usually the colon was outside the
                    # bold tags around the name)
                    speech = re.sub( r'^:\s*', '', speech)

                meaningful_content.append({
                    'speaker_name':  last_speaker_name,
                    'speaker_title': last_speaker_title,
                    'text': speech,
                    'type': 'speech',
                    'page_number': line['page_number'],
                })

                # print meaningful_content[-1]

                continue

            # If it is a bold line and the next line is untagged (tag_name None)
            # with no <br> in between (br_count == 0) then we have the start of a speech.
            if (
                    line['tag_name']      == 'b'
                and next_line
                and next_line['tag_name'] == None
                and next_line['br_count'] == 0
            ):
                last_speaker_name = line['text_content'].strip(':')
                last_speaker_title = ''
                continue

            meaningful_content.append({
                'type': 'other',
                'text': line['text_content'],
                'page_number': line['page_number'],
            })
            last_speaker_name  = ''
            last_speaker_title = ''

        hansard_data = {
            'meta': cls.extract_meta_from_transcript( meaningful_content ),
            'transcript': meaningful_content,
        }

        return hansard_data
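
The parsing above leans on a few small re.sub calls: the non-breaking-space entities are replaced before the soup is built, and each line's text is later collapsed to single spaces. A minimal sketch of those two clean-up steps on their own, with an invented fragment standing in for pdftohtml output:

import re

# Invented fragment standing in for pdftohtml output.
html = 'MR.&nbsp;SPEAKER:&#160;  Order,   order!'

# Replace the non-breaking-space entities, as in convert_html_to_data above.
html = re.sub(r'&nbsp;', ' ', html)
html = re.sub(r'&#160;', ' ', html)

# Collapse runs of whitespace to a single space, as done per line later on.
text = re.sub(r'\s+', ' ', html.strip())

assert text == 'MR. SPEAKER: Order, order!'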

Example 48

View license
    def handle(self, username=None, **options):
        from slumber.exceptions import HttpClientError
        from candidates.election_specific import PARTY_DATA, shorten_post_label
        from candidates.models import PopItPerson
        from candidates.popit import create_popit_api_object

        election_data = {
            'prv-2015': 'listedescandidatsauxelectionslegislativeslisteprovincialeanptic.csv',
            'nat-2015': 'listedescandidatsauxelectionslegislativesanptic.csv'
            }

        field_map = {
            'prv-2015': {
                'region': 1,
                'party': 4,
                'list_order': 5,
                'first_name': 7,
                'last_name': 6,
                'gender': 8,
                'birth_date': 9,
                'party_short': 3
            },
            'nat-2015': {
                'region': 0,
                'party': 2,
                'list_order': 3,
                'first_name': 5,
                'last_name': 4,
                'gender': 6,
                'birth_date': 7,
                'party_short': 2
            }
        }

        api = create_popit_api_object()

        party_id_missing = {}
        party_name_to_id = {}
        for party_id, party_name in PARTY_DATA.party_id_to_name.items():
            party_name_to_id[party_name] = party_id

        for election_id, filename in election_data.items():
            csv_filename = join(
                dirname(__file__), '..', '..', 'data', filename
            )

            fields = field_map[election_id]

            with codecs.open(csv_filename, 'r', encoding='windows-1252') as f:

                initial = True
                for candidate in unicode_csv_reader(f):
                    # skip header line
                    if initial:
                        initial = False
                        continue

                    region = candidate[fields['region']]
                    party = candidate[fields['party']]
                    party_list_order = candidate[fields['list_order']]
                    first_name = string.capwords(candidate[fields['first_name']])
                    last_name = string.capwords(candidate[fields['last_name']])
                    gender = candidate[fields['gender']]
                    birth_date = None

                    if candidate[fields['birth_date']] is not None:
                        birth_date = str(dateutil.parser.parse(
                            candidate[fields['birth_date']], dayfirst=True
                        ).date())

                    name = first_name + ' ' + last_name

                    id = '-'.join([
                        re.sub('[^\w]*', '', re.sub(r' ', '-', strip_accents(name.lower()))),
                        re.sub('[^\w]*', '', candidate[fields['party_short']].lower()),
                        birth_date
                    ])

                    # national candidate
                    if region == 'PAYS':
                        region = 'Burkina Faso'
                    election_data, post_data = get_post_data(
                        api, election_id, region
                    )

                    # debug
                    # tmp = '%s %s %s (%s) - %s (%s)' % ( id, first_name, last_name, party, region, post_data['label'] )
                    # print(tmp)

                    person = get_existing_popit_person(id)
                    if person:
                        # print("Found an existing person:", person.get_absolute_url())
                        pass
                    else:
                        print("No existing person, creating a new one:", name)
                        person = PopItPerson()

                    person.set_identifier('import-id', id)
                    person.family_name = last_name
                    person.given_name = first_name
                    person.name = name
                    person.gender = gender
                    if birth_date:
                        person.birth_date = str(birth_date)
                    else:
                        person.birth_date = None

                    standing_in_election = {
                        'post_id': post_data['id'],
                        'name': shorten_post_label(post_data['label']),
                        'party_list_position': party_list_order,
                    }

                    if 'area' in post_data:
                        standing_in_election['mapit_url'] = post_data['area']['identifier']

                    person.standing_in = {
                        election_data.slug: standing_in_election
                    }

                    change_metadata = get_change_metadata(
                        None,
                        'Imported candidate from CSV',
                    )

                    party_comp = re.sub(' +', ' ', party)
                    party_id = UNKNOWN_PARTY_ID
                    if party_comp in party_name_to_id.keys():
                        party_id = party_name_to_id[party_comp]
                        party = party_comp
                    else:
                        party_id = party_name_to_id['Unknown Party']
                        party = 'Unknown Party'

                    if party_id == UNKNOWN_PARTY_ID and party_comp not in party_id_missing.keys():
                        party_id_missing[party_comp] = 1

                    person.party_memberships = {
                        election_data.slug: {
                            'id': party_id,
                            'name': party,
                            'imported_name': party_comp
                        }
                    }

                    person.record_version(change_metadata)
                    try:
                        person.save_to_popit(api)
                    except HttpClientError as hce:
                        print("Got an HttpClientError:", hce.content)
                        raise

        if len(party_id_missing) > 0:
            print("Unmatched party names:")
            for name in party_id_missing.keys():
                print(name)
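
The candidate id above is built from nested re.sub calls over the lower-cased name, the short party name and the birth date. A standalone sketch of just that id step, with invented values and the strip_accents helper left out (plain ASCII input assumed). Note that the outer [^\w]* substitution also strips the hyphens the inner substitution inserts, so the parts end up separated only by the final '-'.join:

import re

# Invented name and party values; the real script also strips accents first.
name = 'Jean Kabore'
party_short = 'ANPTIC'
birth_date = '1970-01-01'

candidate_id = '-'.join([
    re.sub(r'[^\w]*', '', re.sub(r' ', '-', name.lower())),
    re.sub(r'[^\w]*', '', party_short.lower()),
    birth_date,
])

assert candidate_id == 'jeankabore-anptic-1970-01-01'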

Example 49

Project: labuildings
Source File: convert.py
View license
def convert(buildingsFile, osmOut):
    with open(buildingsFile) as f:
        features = json.load(f)
    allAddresses = {}
    buildings = []
    buildingShapes = []
    buildingIdx = index.Index()

    # Returns the coordinates for this address
    def keyFromAddress(address):
        return str(address['geometry']['coordinates'][0]) + "," + str(address['geometry']['coordinates'][1])

    for feature in features:
        if feature['geometry']['type'] == 'Polygon' or feature['geometry']['type'] == 'MultiPolygon':
            extra_tags = osm_tags.get_osm_tags(feature)
            feature['properties']['osm'] = extra_tags
            buildings.append(feature)
            shape = asShape(feature['geometry'])
            buildingShapes.append(shape)
            buildingIdx.add(len(buildingShapes) - 1, shape.bounds)

        # These are the addresses that don't overlap any buildings
        elif feature['geometry']['type'] == 'Point':
            # The key is the coordinates of this address. Track how many addresses share these coords.
            key = keyFromAddress(feature)
            if key in allAddresses:
                allAddresses[key].append(feature)
            else:
                allAddresses[key] = [feature]

        else:
            print "geometry of unknown type:", feature['geometry']['type']

    # Generates a new osm id.
    osmIds = dict(node = -1, way = -1, rel = -1)
    def newOsmId(type):
        osmIds[type] = osmIds[type] - 1
        return osmIds[type]

    ## Formats multi part house numbers
    def formatHousenumber(p):
        def suffix(part1, part2, hyphen_type=None):
            #part1 = stripZeroes(part1)
            if not part2:
                return str(part1)
            #part2 = stripZeroes(part2)
            return str(part1) + ' ' + str(part2)
        #def stripZeroes(addr): # strip leading zeroes from numbers
        #    if addr.isdigit():
        #        addr = str(int(addr))
        #    if '-' in addr:
        #        try:
        #            addr2 = addr.split('-')
        #            if len(addr2) == 2:
        #                addr = str(int(addr2[0])) + '-' + str(int(addr2[1])).zfill(2)
        #        except:
        #            pass
        #    return addr
        number = suffix(p['Number'], p['NumSuffix'])
        if p['NumPrefix']:
            number = p['NumPrefix'] + number
        return number

    # Converts an address
    def convertAddress(address):
        result = dict()
        if all (k in address for k in ('Number', 'StreetName')):
            if address['Number']:
                result['addr:housenumber'] = formatHousenumber(address)
            if address['StreetName']:

                # Titlecase
                streetname = address['StreetName'].title()
                if address['StArticle']:
                    streetname = address['StArticle'].title() + " " + streetname
                if address['PreType']:
                    streetname = address['PreType'].title() + " " + streetname
                if address['PreDir']:
                    streetname = address['PreDir'].title() + " " + streetname
                if address['PreMod']:
                    streetname = address['PreMod'].title() + " " + streetname
                if address['PostType']:
                    streetname = streetname + " " + address['PostType'].title()
                if address['PostDir']:
                    streetname = streetname + " " + address['PostDir'].title()
                if address['PostMod']:
                    streetname = streetname + " " + address['PostMod'].title()

                # Fix titlecase on 1St, 2Nd, 3Rd, 4Th, etc
                streetname = re.sub(r"(.*)(\d+)St\s*(.*)", r"\1\2st \3", streetname)
                streetname = re.sub(r"(.*)(\d+)Nd\s*(.*)", r"\1\2nd \3", streetname)
                streetname = re.sub(r"(.*)(\d+)Rd\s*(.*)", r"\1\2rd \3", streetname)
                streetname = re.sub(r"(.*)(\d+)Th\s*(.*)", r"\1\2th \3", streetname)

                # Expand 'St ' -> 'Saint'
                # relevant for:
                #   'St Clair'
                #   'St Louis'
                #   'St James'
                #   'St James Park'
                #   'St Andrews'
                #   'St Nicolas'
                #   'St Cloud'
                #   'St Ambrose'
                #   'St Bonaventure'
                #   'St Joseph'
                #   'St Tropez'
                if streetname[0:3] == 'St ': streetname = 'Saint ' + streetname[3:]
                # Middle name expansions
                streetname = streetname.replace(' St ', ' Street ')
                streetname = streetname.replace(' Rd ', ' Road ')
                streetname = streetname.replace(' Blvd ', ' Boulevard ')
                result['addr:street'] = streetname
            if address['PCITY1']:
                result['addr:city'] = address['PCITY1'].title()
            elif address['LegalComm']:
                result['addr:city'] = address['LegalComm'].title()
            if address['ZipCode']:
                result['addr:postcode'] = str(int(address['ZipCode']))
            if address['UnitName']:
                result['addr:unit'] = address['UnitName']
        return result

    # Distills coincident addresses into one address where possible.
    # Takes an array of addresses and returns an array of 1 or more addresses
    def distillAddresses(addresses):
        # Only distill addresses if the following conditions are true:
        # 1) the addresses share the same coordinates.
        # AND
        # 2a) all the attributes are the same _except_ the unit number/name
        # OR
        # 2b) the street number is the same but the street names are referring to the same thing

        outputAddresses = []

        # First, group the addresses into separate lists for each unique location
        addressesByCoords = {}
        for address in addresses:
            key = keyFromAddress(address)
            if key in addressesByCoords:
                addressesByCoords[key].append(address)
            else:
                addressesByCoords[key] = [address]

        # loop over unique coordinates
        for key in addressesByCoords:
            # Here see if we can collapse any of these addresses at the same coords.

            # addressesByCoords[key] is an array of addresses at this location.

            # We are only looking for the 2 possibilities above (2a) and (2b).
            # If the situation is more complicated, change nothing.
            outputAddresses.extend(distillAddressesAtPoint(addressesByCoords[key]))

        return outputAddresses

    # This function is called by distillAddresses.
    # It assumes all addresses are at the same coordinates.
    # Returns an array of 1 or more addresses
    def distillAddressesAtPoint(addresses):

        if len(addresses) == 1:
            return addresses

        firstAddress = addresses[0]

        # (2a) If the first address is an apartment, see if all the rest are too.

        # NOTE: sometimes an apartment building has a few address points that lack a UnitName...
        # ...so checking for the presence of UnitName in firstAddress wouldn't always work.
        props = firstAddress['properties']
        if debug: print "Testing to see if these are apartments...", '\t'.join([str(props['Number']), str(props['NumSuffix']), str(props['PreType']), str(props['StreetName']), str(props['PostType']), str(props['UnitName'])])
        # Compare subsequent addresses in the array to the first address.
        # Hence, range starts at 1.
        for i in range(1, len(addresses)):
            if not areSameAddressExceptUnit(firstAddress, addresses[i]):
                props = addresses[i]['properties']
                if debug: print "No, this address was different...........", '\t'.join([str(props['Number']), str(props['NumSuffix']), str(props['PreType']), str(props['StreetName']), str(props['PostType']), str(props['UnitName'])])
                #print firstAddress
                #print addresses[i]
                break
            # else, keep going

        else: # else for the `for` statement. Executes only if `break` never did.
            # We checked them all, and they're all the same except UnitName.
            # In this case the apartment data is useless to OSM because the
            # apartment nodes are all on top of each other.
            # So, discard the unit information and return just one address.
            firstAddress['properties']['UnitName'] = None
            if debug: print "Yes they were apartments! Collapsed", len(addresses), "into one"
            return [firstAddress]

        # (2b) Check if the street number is all the same.
        # For this, we use a list of alternative names (like HWY 1, etc)...
        # ...and we need to know which canonical name to keep.
        if debug: print "Testing to see if the street names are synonyms.."
        canonicalStreetName = None
        for i in range(1, len(addresses)):
            props = addresses[i]['properties']
            if not areSameAddressExceptStreet(firstAddress, addresses[i]):
                if debug: print "No, this address was different...........", '\t'.join([str(props['Number']), str(props['NumSuffix']), str(props['PreType']), str(props['StreetName']), str(props['PostType']), str(props['UnitName'])])
                #print firstAddress
                #print addresses[i]
                break
            compoundStreetName = (str(props['PreType']),str(props['StreetName']),str(props['PostType']))
            currentCanonicalStreetName = getCanonicalName(compoundStreetName)
            if currentCanonicalStreetName:
                if debug: print "found canonical name", currentCanonicalStreetName
                if ((currentCanonicalStreetName == canonicalStreetName) or (canonicalStreetName == None)):
                    canonicalStreetName = currentCanonicalStreetName
                else:
                    if debug: print "canonicalStreetNames didn't match:", canonicalStreetName, currentCanonicalStreetName
                    break
            else:
                print "couldn't find canonicalStreetName for", compoundStreetName
                break

        else: # else for the `for` statement. Executes only if `break` never did.
            # We checked them all, and they're all the same except StreetName.
            # If we can determine that they are all the same synonym, we can
            # overwrite the other streetname information and return just one address.
            firstAddress['properties']['PreType'] = canonicalStreetName[0]
            firstAddress['properties']['StreetName'] = canonicalStreetName[1]
            firstAddress['properties']['PostType'] = canonicalStreetName[2]
            if debug: print "Yes they were synonyms! Collapsed", len(addresses), "into one"
            return [firstAddress]

        # This is only executed if neither of the two `else` statements executed
        # for the two `for` statements above. That means we were unable to collapse
        # separate apartments into one, or collapse synonymous street names into one.
        # So, instead of returning just one address, we fail and return all of them.
        return addresses

    def areSameAddressExceptUnit(a1, a2):
        for key in ['NumPrefix', 'Number', 'NumSuffix', 'PreMod', 'PreDir', 'PreType', 'StArticle', 'StreetName', 'PostType', 'PostDir', 'PostMod', 'ZipCode', 'LegalComm', 'PCITY1']:
            if a1['properties'][key] != a2['properties'][key]:
                #print key, a1['properties'][key], "!=", a2['properties'][key]
                return False
        return True

    def areSameAddressExceptStreet(a1, a2):
        for key in ['NumPrefix', 'Number', 'NumSuffix', 'PreMod', 'PreDir', 'StArticle', 'UnitName', 'PostDir', 'PostMod', 'ZipCode', 'LegalComm', 'PCITY1']:
            if a1['properties'][key] != a2['properties'][key]:
                #print key, a1['properties'][key], "!=", a2['properties'][key]
                return False
        return True

    # Sometimes we have identical addresses that differ only by street name.
    # Usually these are because the street name is also a highway. We want to 
    # remove all the highway names and only use the street name for the address
    canonicalNames = {
        ("None", "LINCOLN", "BOULEVARD"): (None, "LINCOLN", "BOULEVARD"),
        ("ROUTE", "1", "None"): (None, "LINCOLN", "BOULEVARD"),
        ("HIGHWAY", "1", "None"): (None, "LINCOLN", "BOULEVARD"),
        ("None", "SR-1", "None"): (None, "LINCOLN", "BOULEVARD"),
        ("None", "PCH", "None"): (None, "LINCOLN", "BOULEVARD"),
    }

    def getCanonicalName(compoundStreetName):
        result = None
        try:
            result = canonicalNames[compoundStreetName]
        except KeyError:
            return None
        return result

    # Appends new node or returns existing if exists.
    nodes = {}
    def appendNewNode(coords, osmXml):
        rlon = int(float(coords[0]*10**7))
        rlat = int(float(coords[1]*10**7))
        if (rlon, rlat) in nodes:
            return nodes[(rlon, rlat)]
        node = etree.Element('node', visible = 'true', id = str(newOsmId('node')))
        node.set('lon', str(Decimal(coords[0])*Decimal(1)))
        node.set('lat', str(Decimal(coords[1])*Decimal(1)))
        nodes[(rlon, rlat)] = node
        osmXml.append(node)
        return node

    # Sometimes we want to force overlapping nodes, such as with addresses.
    # This way they'll show up in JOSM and the contributor can deal with them manually.
    # Otherwise, we might try to apply multiple address tags to the same node...
    # ...which is also incorrect, but harder to detect.
    def appendNewNodeIgnoringExisting(coords, osmXml):
        rlon = int(float(coords[0]*10**7))
        rlat = int(float(coords[1]*10**7))
        #if (rlon, rlat) in nodes:
        #    return nodes[(rlon, rlat)]
        node = etree.Element('node', visible = 'true', id = str(newOsmId('node')))
        node.set('lon', str(Decimal(coords[0])*Decimal(1)))
        node.set('lat', str(Decimal(coords[1])*Decimal(1)))
        nodes[(rlon, rlat)] = node
        osmXml.append(node)
        return node

    def appendNewWay(coords, intersects, osmXml):
        way = etree.Element('way', visible='true', id=str(newOsmId('way')))
        firstNid = 0
        for i, coord in enumerate(coords):
            if i == 0: continue # the first and last coordinate are the same
            node = appendNewNode(coord, osmXml)
            if i == 1: firstNid = node.get('id')
            way.append(etree.Element('nd', ref=node.get('id')))

            # Check each way segment for intersecting nodes
            int_nodes = {}
            try:
                line = LineString([coord, coords[i+1]])
            except IndexError:
                line = LineString([coord, coords[1]])
            for idx, c in enumerate(intersects):
                if line.buffer(0.000001).contains(Point(c[0], c[1])) and c not in coords:
                    t_node = appendNewNode(c, osmXml)
                    for n in way.iter('nd'):
                        if n.get('ref') == t_node.get('id'):
                            break
                    else:
                        int_nodes[t_node.get('id')] = Point(c).distance(Point(coord))
            for n in sorted(int_nodes, key=lambda key: int_nodes[key]): # add intersecting nodes in order
                way.append(etree.Element('nd', ref=n))
            
        way.append(etree.Element('nd', ref=firstNid)) # close way
        osmXml.append(way)
        return way

    # Appends an address to a given node or way.
    def appendAddress(address, element):
    #    # Need to check if these tags already exist on this element
        for k, v in convertAddress(address['properties']).iteritems():
            # TODO: is this doing anything useful?
            #for child in element:
            #    if child.tag == 'tag':
            #        #print k, v
            #        if child.attrib.get('k') == k:
            #            print "found key", k
            #            if child.attrib.get('v') == v:
            #                print "found matching value", v
            element.append(etree.Element('tag', k=k, v=v))

    # Appends a building to a given OSM xml document.
    def appendBuilding(building, shape, address, osmXml):
        # Check for intersecting buildings
        intersects = []
        for i in buildingIdx.intersection(shape.bounds):
            try:
                for c in buildingShapes[i].exterior.coords:
                    if Point(c[0], c[1]).buffer(0.000001).intersects(shape):
                        intersects.append(c)
            except AttributeError:
                for c in buildingShapes[i][0].exterior.coords:
                    if Point(c[0], c[1]).buffer(0.000001).intersects(shape):
                        intersects.append(c)

        # Export building, create multipolygon if there are interior shapes.
        interiors = []
        try:
            way = appendNewWay(list(shape.exterior.coords), intersects, osmXml)
            for interior in shape.interiors:
                interiors.append(appendNewWay(list(interior.coords), [], osmXml))
        except AttributeError:
            way = appendNewWay(list(shape[0].exterior.coords), intersects, osmXml)
            for interior in shape[0].interiors:
                interiors.append(appendNewWay(list(interior.coords), [], osmXml))
        if len(interiors) > 0:
            relation = etree.Element('relation', visible='true', id=str(newOsmId('way')))
            relation.append(etree.Element('member', type='way', role='outer', ref=way.get('id')))
            for interior in interiors:
                relation.append(etree.Element('member', type='way', role='inner', ref=interior.get('id')))
            relation.append(etree.Element('tag', k='type', v='multipolygon'))
            osmXml.append(relation)
            way = relation
        for tag in building['properties']['osm']:
            value = building['properties']['osm'][tag]
            way.append(etree.Element('tag', k=tag, v=value))
        # if 'GeneralUse' in building['properties']:
        #     way.append(etree.Element('tag', k='building', v=building['properties']['GeneralUse']))
        # else:
        #     way.append(etree.Element('tag', k='building', v='yes'))
        # if 'SpecificUs' in building['properties']:
        #     way.append(etree.Element('tag', k='building:use', v=building['properties']['GeneralUse']))
        if 'YearBuilt' in building['properties'] and building['properties']['YearBuilt'] is not None:
            YearBuilt = int(building['properties']['YearBuilt'])
            if YearBuilt > 0:
                    way.append(etree.Element('tag', k='start_date', v=str(YearBuilt)))
        # if 'Specific_1' in building['properties']:
        #         way.append(etree.Element('tag', k='amenity', v=building['properties']['Specific_1']))
        if 'Units' in building['properties'] and building['properties']['Units'] is not None:
            units = int(round(float(building['properties']['Units']), 0))
            if units > 0:
                way.append(etree.Element('tag', k='building:units', v=str(units)))
        if 'HEIGHT' in building['properties']:
            height = round(((building['properties']['HEIGHT'] * 12) * 0.0254), 1)
            if height > 0:
                way.append(etree.Element('tag', k='height', v=str(height)))
        if 'ELEV' in building['properties']:
            elevation = round(((building['properties']['ELEV'] * 12) * 0.0254), 1)
            if elevation > 0:
                way.append(etree.Element('tag', k='ele', v=str(elevation)))
        if 'BLD_ID' in building['properties']:
            way.append(etree.Element('tag', k='lacounty:bld_id', v=str(building['properties']['BLD_ID'])))
        if 'AIN' in building['properties'] and building['properties']['AIN'] is not None:
            way.append(etree.Element('tag', k='lacounty:ain', v=str(building['properties']['AIN'])))
#        if address:
#            appendAddress(address, way)

    # Export buildings & addresses. Only export address with building if there is exactly
    # one address per building. Export remaining addresses as individual nodes.
    # The remaining addresses are added to a dictionary hashed by their coordinates.
    # This way we catch any addresses that have the same coordinates.
    osmXml = etree.Element('osm', version='0.6', generator='[email protected]')
    for i in range(0, len(buildings)):

        buildingAddresses = []
        for address in buildings[i]['properties']['addresses']:
            buildingAddresses.append(address)
        address = None
        if len(buildingAddresses) == 1:
            # There's only one address in the building footprint
            address = buildingAddresses[0]
        elif len(buildingAddresses) > 1:
            # If there are multiple addresses, first try to distill them.
            # If we can distill them to one address, we can still add it to this building.
            distilledAddresses = distillAddresses(buildingAddresses)
            if len(distilledAddresses) == 1:
                # We distilled down to one address. Add it to the building.
                address = distilledAddresses[0]
            else:
                # We could not distill down to one address. Instead export them as nodes.
                for address in distilledAddresses:
                    # The key is the coordinates of this address. Track how many addresses share these coords.
                    key = keyFromAddress(address)
                    if key in allAddresses:
                        allAddresses[key].append(address)
                    else:
                        allAddresses[key] = [address]

        appendBuilding(buildings[i], buildingShapes[i], address, osmXml)


    # Export any addresses that aren't the only address for a building.
    if (len(allAddresses) > 0):

        # Iterate over the list of distinct coordinates found in the address data
        for coordskey in allAddresses:
            # if a distinct coordinate has only one associated address,
            # then export that address as a new node
            if len(allAddresses[coordskey]) == 1:
                address = allAddresses[coordskey][0]
                coordinates = address['geometry']['coordinates']
#                node = appendNewNode(coordinates, osmXml) # returns old node if one exists at these coords
#                appendAddress(address, node)

            # If there is more than one address at these coordinates, do something.
            # ...but do what exactly?
            else:
                distilledAddresses = distillAddresses(allAddresses[coordskey])
                if len(distilledAddresses) == 1:
                    # We distilled down to one address. Append it.
                    address = distilledAddresses[0]
                    coordinates = address['geometry']['coordinates']
#                    node = appendNewNode(coordinates, osmXml) # returns old node if one exists at these coords
#                    appendAddress(address, node)
                else:
                    if debug: print "found duplicate coordinates that could not be distilled:", coordskey, "has", len(allAddresses[coordskey]), "addresses"
                    if debug: print '\t'.join(["num", "numsufx", "pretype", "street", "posttype", "unit"])
                    for address in distilledAddresses:
                        # TODO: do something smart here. These are overlapping addresses that we couldn't distill.
                        # TODO: maybe jitter them, or leave stacked but with FIXME?
                        # TODO: For now, we use appendNewNodeIgnoringExisting to pile the nodes on top of each other.
                        #print address
                        props = address['properties']
                        if debug: print '\t'.join([str(props['Number']), str(props['NumSuffix']), str(props['PreType']), str(props['StreetName']), str(props['PostType']), str(props['UnitName'])])
                        coordinates = address['geometry']['coordinates']
#                        node = appendNewNodeIgnoringExisting(coordinates, osmXml) # Force overlapping nodes so JOSM will catch them
#                        appendAddress(address, node)

    with open(osmOut, 'w') as outFile:
        outFile.writelines(tostring(osmXml, pretty_print=True, xml_declaration=True, encoding='UTF-8'))
        print 'Exported ' + osmOut
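
Several of the re.sub calls above undo the damage that .title() does to ordinal street numbers ('1St', '2Nd', '3Rd', '4Th'). A minimal sketch of that fix in isolation, on an invented street name:

import re

streetname = 'West 42Nd Street'  # invented example, already title-cased

# Lower-case the ordinal suffixes that .title() capitalised, as in convert() above.
streetname = re.sub(r"(.*)(\d+)St\s*(.*)", r"\1\2st \3", streetname)
streetname = re.sub(r"(.*)(\d+)Nd\s*(.*)", r"\1\2nd \3", streetname)
streetname = re.sub(r"(.*)(\d+)Rd\s*(.*)", r"\1\2rd \3", streetname)
streetname = re.sub(r"(.*)(\d+)Th\s*(.*)", r"\1\2th \3", streetname)

assert streetname == 'West 42nd Street'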

Example 50

Project: python-netsnmpagent
Source File: netsnmpagent.py
View license
	def __init__(self, **args):
		"""Initializes a new netsnmpAgent instance.
		
		"args" is a dictionary that can contain the following
		optional parameters:
		
		- AgentName     : The agent's name used for registration with net-snmp.
		- MasterSocket  : The transport specification of the AgentX socket of
		                  the running snmpd instance to connect to (see the
		                  "LISTENING ADDRESSES" section in the snmpd(8) manpage).
		                  Change this if you want to use eg. a TCP transport or
		                  access a custom snmpd instance, eg. as shown in
		                  run_simple_agent.sh, or for automatic testing.
		- PersistenceDir: The directory to use to store persistence information.
		                  Change this if you want to use a custom snmpd
		                  instance, eg. for automatic testing.
		- MIBFiles      : A list of filenames of MIBs to be loaded. Required if
		                  the OIDs, for which variables will be registered, do
		                  not belong to standard MIBs and the custom MIBs are not
		                  located in net-snmp's default MIB path
		                  (/usr/share/snmp/mibs).
		- UseMIBFiles   : Whether to use MIB files at all or not. When False,
		                  the parser for MIB files will not be initialized, so
		                  neither system-wide MIB files nor the ones provided
		                  in the MIBFiles argument will be in use.
		- LogHandler    : An optional Python function that will be registered
		                  with net-snmp as a custom log handler. If specified,
		                  this function will be called for every log message
		                  net-snmp itself generates, with parameters as follows:
		                  1. a string indicating the message's priority: one of
		                  "Emergency", "Alert", "Critical", "Error", "Warning",
		                  "Notice", "Info" or "Debug".
		                  2. the actual log message. Note that heading strings
		                  such as "Warning: " and "Error: " will be stripped off
		                  since the priority level is explicitly known and can
		                  be used to prefix the log message, if desired.
		                  Trailing linefeeds will also have been stripped off.
		                  If undefined, log messages will be written to stderr
		                  instead. """

		# Default settings
		defaults = {
			"AgentName"     : os.path.splitext(os.path.basename(sys.argv[0]))[0],
			"MasterSocket"  : None,
			"PersistenceDir": None,
			"UseMIBFiles"   : True,
			"MIBFiles"      : None,
			"LogHandler"    : None,
		}
		for key in defaults:
			setattr(self, key, args.get(key, defaults[key]))
		if self.UseMIBFiles and self.MIBFiles is not None and type(self.MIBFiles) not in (list, tuple):
			self.MIBFiles = (self.MIBFiles,)

		# Initialize status attribute -- until start() is called we will accept
		# SNMP object registrations
		self._status = netsnmpAgentStatus.REGISTRATION

		# Unfortunately net-snmp does not give callers of init_snmp() (used
		# in the start() method) any feedback about success or failure of
		# connection establishment. But for AgentX clients this information is
		# quite essential, thus we need to implement some more or less ugly
		# workarounds.

		# For net-snmp 5.7.x, we can derive success and failure from the log
		# messages it generates. Normally these go to stderr, in the absence
		# of other so-called log handlers. Alas we define a callback function
		# that we will register with net-snmp as a custom log handler later on,
		# hereby effectively gaining access to the desired information.
		def _py_log_handler(majorID, minorID, serverarg, clientarg):
			# "majorID" and "minorID" are the callback IDs with which this
			# callback function was registered. They are useful if the same
			# callback was registered multiple times.
			# Both "serverarg" and "clientarg" are pointers that can be used to
			# convey information from the calling context to the callback
			# function: "serverarg" gets passed individually to every call of
			# snmp_call_callbacks() while "clientarg" was initially passed to
			# snmp_register_callback().

			# In this case, "majorID" and "minorID" are always the same (see the
			# registration code below). "serverarg" needs to be cast back to
			# become a pointer to a "snmp_log_message" C structure (passed by
			# net-snmp's log_handler_callback() in snmplib/snmp_logging.c) while
			# "clientarg" will be None (see the registration code below).
			logmsg = ctypes.cast(serverarg, snmp_log_message_p)

			# Generate textual description of priority level
			priorities = {
				LOG_EMERG: "Emergency",
				LOG_ALERT: "Alert",
				LOG_CRIT: "Critical",
				LOG_ERR: "Error",
				LOG_WARNING: "Warning",
				LOG_NOTICE: "Notice",
				LOG_INFO: "Info",
				LOG_DEBUG: "Debug"
			}
			msgprio = priorities[logmsg.contents.priority]

			# Strip trailing linefeeds and in addition "Warning: " and "Error: "
			# from msgtext as these conditions are already indicated through
			# msgprio
			msgtext = re.sub(
				"^(Warning|Error): *",
				"",
				u(logmsg.contents.msg.rstrip(b"\n"))
			)

			# Intercept log messages related to connection establishment and
			# failure to update the status of this netsnmpAgent object. This is
			# really an ugly hack, introducing a dependency on the particular
			# text of log messages -- hopefully the net-snmp guys won't
			# translate them one day.
			if  msgprio == "Warning" \
			or  msgprio == "Error" \
			and re.match("Failed to .* the agentx master agent.*", msgtext):
				# If this was the first connection attempt, we consider the
				# condition fatal: it is more likely that an invalid
				# "MasterSocket" was specified than that we've got concurrency
				# issues with our agent being erroneously started before snmpd.
				if self._status == netsnmpAgentStatus.FIRSTCONNECT:
					self._status = netsnmpAgentStatus.CONNECTFAILED

					# No need to log this message -- we'll generate our own when
					# throwing a netsnmpAgentException as consequence of the
					# ECONNECT
					return 0

				# Otherwise we'll stay at status RECONNECTING and log net-snmp's
				# message like any other. net-snmp code will keep retrying to
				# connect.
			elif msgprio == "Info" \
			and  re.match("AgentX subagent connected", msgtext):
				self._status = netsnmpAgentStatus.CONNECTED
			elif msgprio == "Info" \
			and  re.match("AgentX master disconnected us.*", msgtext):
				self._status = netsnmpAgentStatus.RECONNECTING

			# If "LogHandler" was defined, call it to take care of logging.
			# Otherwise print all log messages to stderr to resemble net-snmp
			# standard behavior (but add log message's associated priority in
			# plain text as well)
			if self.LogHandler:
				self.LogHandler(msgprio, msgtext)
			else:
				print("[{0}] {1}".format(msgprio, msgtext))

			return 0

		# We defined a Python function that needs a ctypes conversion so it can
		# be called by C code such as net-snmp. That's what SNMPCallback() is
		# used for. However we also need to store the reference in "self" as it
		# will otherwise be lost at the exit of this function so that net-snmp's
		# attempt to call it would end in nirvana...
		self._log_handler = SNMPCallback(_py_log_handler)

		# Now register our custom log handler with majorID SNMP_CALLBACK_LIBRARY
		# and minorID SNMP_CALLBACK_LOGGING.
		if libnsa.snmp_register_callback(
			SNMP_CALLBACK_LIBRARY,
			SNMP_CALLBACK_LOGGING,
			self._log_handler,
			None
		) != SNMPERR_SUCCESS:
			raise netsnmpAgentException(
				"snmp_register_callback() failed for _netsnmp_log_handler!"
			)

		# Finally the net-snmp logging system needs to be told to enable
		# logging through callback functions. This will actually register a
		# NETSNMP_LOGHANDLER_CALLBACK log handler that will call out to any
		# callback functions with the majorID and minorID shown above, such as
		# ours.
		libnsa.snmp_enable_calllog()

		# Unfortunately our custom log handler above is still not enough: in
		# net-snmp 5.4.x there were no "AgentX master disconnected" log
		# messages yet. So we need another workaround to be able to detect
		# disconnects for this release. Both net-snmp 5.4.x and 5.7.x support
		# a callback mechanism using the "majorID" SNMP_CALLBACK_APPLICATION and
		# the "minorID" SNMPD_CALLBACK_INDEX_STOP, which we can abuse for our
		# purposes. Again, we start by defining a callback function.
		def _py_index_stop_callback(majorID, minorID, serverarg, clientarg):
			# For "majorID" and "minorID" see our log handler above.
			# "serverarg" is a disguised pointer to a "netsnmp_session"
			# structure (passed by net-snmp's subagent_open_master_session() and
			# agentx_check_session() in agent/mibgroup/agentx/subagent.c). We
			# can ignore it here since we have a single session only anyway.
			# "clientarg" will be None again (see the registration code below).

			# We only care about SNMPD_CALLBACK_INDEX_STOP as our custom log
			# handler above already took care of all other events.
			if minorID == SNMPD_CALLBACK_INDEX_STOP:
				self._status = netsnmpAgentStatus.RECONNECTING

			return 0

		# Convert it to a C callable function and store its reference
		self._index_stop_callback = SNMPCallback(_py_index_stop_callback)

		# Register it with net-snmp
		if libnsa.snmp_register_callback(
			SNMP_CALLBACK_APPLICATION,
			SNMPD_CALLBACK_INDEX_STOP,
			self._index_stop_callback,
			None
		) != SNMPERR_SUCCESS:
			raise netsnmpAgentException(
				"snmp_register_callback() failed for _netsnmp_index_callback!"
			)

		# No enabling necessary here

		# Make us an AgentX client
		if libnsa.netsnmp_ds_set_boolean(
			NETSNMP_DS_APPLICATION_ID,
			NETSNMP_DS_AGENT_ROLE,
			1
		) != SNMPERR_SUCCESS:
			raise netsnmpAgentException(
				"netsnmp_ds_set_boolean() failed for NETSNMP_DS_AGENT_ROLE!"
			)

		# Use an alternative transport specification to connect to the master?
		# Defaults to "/var/run/agentx/master".
		# (See the "LISTENING ADDRESSES" section in the snmpd(8) manpage)
		if self.MasterSocket:
			if libnsa.netsnmp_ds_set_string(
				NETSNMP_DS_APPLICATION_ID,
				NETSNMP_DS_AGENT_X_SOCKET,
				b(self.MasterSocket)
			) != SNMPERR_SUCCESS:
				raise netsnmpAgentException(
					"netsnmp_ds_set_string() failed for NETSNMP_DS_AGENT_X_SOCKET!"
				)

		# Use an alternative persistence directory?
		if self.PersistenceDir:
			if libnsa.netsnmp_ds_set_string(
				NETSNMP_DS_LIBRARY_ID,
				NETSNMP_DS_LIB_PERSISTENT_DIR,
				b(self.PersistenceDir)
			) != SNMPERR_SUCCESS:
				raise netsnmpAgentException(
					"netsnmp_ds_set_string() failed for NETSNMP_DS_LIB_PERSISTENT_DIR!"
				)

		# Initialize net-snmp library (see netsnmp_agent_api(3))
		if libnsa.init_agent(b(self.AgentName)) != 0:
			raise netsnmpAgentException("init_agent() failed!")

		# Initialize MIB parser
		if self.UseMIBFiles:
			libnsa.netsnmp_init_mib()

		# If MIBFiles were specified (ie. MIBs that can not be found in
		# net-snmp's default MIB directory /usr/share/snmp/mibs), read
		# them in so we can translate OID strings to net-snmp's internal OID
		# format.
		if self.UseMIBFiles and self.MIBFiles:
			for mib in self.MIBFiles:
				if libnsa.read_mib(b(mib)) == 0:
					raise netsnmpAgentException(
					    "netsnmp_read_module({0}) failed!".format(mib))

		# Initialize our SNMP object registry
		self._objs = defaultdict(dict)