collections.defaultdict

Here are examples of the Python API collections.defaultdict, taken from open source projects.

160 Examples
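
As a quick orientation before the project examples (this snippet is not taken from any of the projects below), here is a minimal sketch of the two most common defaultdict patterns, grouping and counting: a missing key is created on first access using the factory passed to the constructor.

    from collections import defaultdict

    words = ["apple", "avocado", "banana", "cherry", "clementine"]

    # Group words by first letter; a missing key starts as an empty list.
    by_letter = defaultdict(list)
    for word in words:
        by_letter[word[0]].append(word)

    # Count words by length; a missing key starts at 0.
    by_length = defaultdict(int)
    for word in words:
        by_length[len(word)] += 1

    print(dict(by_letter))   # {'a': ['apple', 'avocado'], 'b': ['banana'], 'c': ['cherry', 'clementine']}
    print(dict(by_length))   # {5: 1, 7: 1, 6: 2, 10: 1}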

Example 151

Project: geoinference Source File: geocoder.py
    def __init__(self,dataset="geonames"):
        """
        Initializes the "reverse_geocoder" and "geocoder" dictionaries,
        based on the dataset selected. By default "geonames" is selected.
        """
        self.reverse_geocoder = defaultdict(list)
        self.geocoder = {}
        self.abbv_to_state = state_abbv_data()
        self.state_abbv_regex = re.compile(r'(\b' + (r'\b|\b'.join(self.abbv_to_state.keys())) + r'\b)')
        self.all_city_names = set()

        LOGGER.debug("Geocoder loading city-location mapping from %s" % (dataset))

        # If the user specifies GeoLite data or if they are using GPS data, for
        # which GeoLite is the default gazetteer
        if dataset == "geolite" or dataset == "geo-median":
            data = geolite_data()

            city_to_latlon = {}
            city_name_counts = collections.Counter()

            for line in data[2:]:
                country_name = line[2].lower()
                region_name = line[3].lower()
                city_name = line[4].lower()

                if not city_name:
                    continue

                lat = float(line[0])
                lon = float(line[1])

                # Keep track of how many times city names occur
                city_to_latlon[city_name] = (lat,lon)
                city_name_counts[city_name] += 1


                #sets bins of 0.01 accuracy of lat/lon for reverse_geocoding
                rounded_lat = round(lat,2)
                rounded_lon = round(lon,2)

                #builds the geocoder dictionary based on a city\tregion\tcountry format
                if city_name and region_name and country_name:
                    city_region_country = city_name+"\t"+region_name+"\t"+country_name
                    city_region = city_name+"\t"+region_name
                    city_country = city_name+"\t"+country_name
                    self.geocoder[city_region_country] = (lat,lon)
                    self.geocoder[city_region] = (lat,lon)
                    self.geocoder[city_country] = (lat,lon)
                    self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_region_country))
                    self.all_city_names.add(city_region_country)
                elif city_name and region_name:
                    city_region = city_name+"\t"+region_name
                    self.geocoder[city_region] = (lat,lon)
                    self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_region))

                elif city_name and country_name:
                    if city_name == country_name:
                        self.geocoder[city_name] = (lat,lon)
                        self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_name))
                        self.all_city_names.add(city_name)
                    else:
                        city_country = city_name+"\t"+country_name
                        self.geocoder[city_country] = (lat,lon)
                        self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_country))
                        self.all_city_names.add(city_country)

            # If there was only ever one city with this name, allow it to be an
            # unambiguous lookup with just the city name
            unambiguous_cities = 0
            for city_name, (lat,lon) in city_to_latlon.iteritems():
                if city_name_counts[city_name] == 1:
                    self.geocoder[city_name] = (lat,lon)
                    unambiguous_cities += 1
            #print "Saw %d unambiguous cities in %s" % (unambiguous_cities, dataset)


        elif dataset == "google":
            data = google_data()
            
            city_to_latlon = {}
            city_name_counts = collections.Counter()

            for line in data[1:]:
                #TODO this city name should be formatted the same as incoming tweets
                city_name = line[6].lower()
                if not city_name:
                    continue

                country_name = line[2].lower()
                region_name = line[3].lower()
                lat = float(line[0])
                lon = float(line[1])
                rounded_lat = round(lat,2)
                rounded_lon = round(lon,2)
                #self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,country_name,region_name,city_name))

                # Keep track of how many times city names occur
                city_to_latlon[city_name] = (lat,lon)
                city_name_counts[city_name] += 1

                if city_name and region_name and country_name:
                    city_region_country = city_name+"\t"+region_name+"\t"+country_name
                    city_region = city_name+"\t"+region_name
                    city_country = city_name+"\t"+country_name
                    self.geocoder[city_region_country] = (lat,lon)
                    self.geocoder[city_region] = (lat,lon)
                    self.geocoder[city_country] = (lat,lon)
                    self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_region_country))
                    self.all_city_names.add(city_region_country)

                elif city_name and region_name:
                    city_region = city_name+"\t"+region_name
                    self.geocoder[city_region] = (lat,lon)
                    self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_region))
                    self.all_city_names.add(city_region)

                elif city_name and country_name:
                    if city_name == country_name:
                        self.geocoder[city_name] = (lat,lon)
                        self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_name))
                        self.all_city_names.add(city_name)
                    else:
                        city_country = city_name+"\t"+country_name
                        self.geocoder[city_country] = (lat,lon)
                        self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_country))
                        self.all_city_names.add(city_country)

            # If there was only ever one city with this name, allow it to be an
            # unambiguous lookup with just the city name
            unambiguous_cities = 0
            for city_name, (lat,lon) in city_to_latlon.iteritems():
                if city_name_counts[city_name] == 1:
                    self.geocoder[city_name] = (lat,lon)
                    unambiguous_cities += 1
            #print "Saw %d unambiguous cities in %s" % (unambiguous_cities, dataset)

        elif dataset == "dbpedia":
            data = dbpedia_data()
            
            city_to_latlon = {}
            city_name_counts = collections.Counter()
            already_entered = set()
            line_no = 0

            for cols in data[1:]:

                line_no += 1
                if line_no % 1000000 == 0:
                    LOGGER.debug("currently read %d locations from %s" %
                                 (line_no, dataset))


                lat = cols[3]
                lon = cols[4]

                # Guard against weirdness
                if lat == 'NAN' or lon == 'NAN':
                    continue
                try:
                    lat = float(lat)
                    lon = float(lon)
                except ValueError:
                    continue

                # Ensure we can use this location if we're not allowing duplicates
                lat_lon = (lat, lon)
                already_entered.add(lat_lon)
                
                city = cols[0].lower()
                country = cols[2].lower()
                states = cols[1].lower().split('|')
                
                city_to_latlon[city] = (lat,lon)
                city_name_counts[city] += 1

                self.__add_name(city + "\t" + country, lat_lon)
                if city == country:
                    self.__add_name(city, lat_lon)
                for state in states:
                    self.__add_name(city + "\t" + state + "\t" + country, lat_lon)

            unambiguous_cities = 0
            for city_name, (lat,lon) in city_to_latlon.iteritems():
                if city_name_counts[city_name] == 1:
                    self.geocoder[city_name] = (lat,lon)
                    unambiguous_cities += 1

        elif dataset == "geonames":
            data = geonames_data()
            
            city_to_latlon = {}
            city_name_counts = collections.Counter()

            line_no = 0
            for line in data[1:]:
                #TODO this city name should be formatted the same as incoming tweets
                city_name = line[0].lower()
                if not city_name:
                    continue

                line_no += 1
                if line_no % 1000000 == 0:
                    LOGGER.debug("currently read %d locations from %s" %
                                 (line_no, dataset))

                country_name = line[2].lower()
                region_name = line[1].lower()
                lat = float(line[3])
                lon = float(line[4])
                rounded_lat = round(lat,2)
                rounded_lon = round(lon,2)
                
                # Keep track of how many times city names occur
                city_to_latlon[city_name] = (lat,lon)
                city_name_counts[city_name] += 1

                if city_name and region_name and country_name:
                    city_region_country = city_name+"\t"+region_name+"\t"+country_name
                    city_region = city_name+"\t"+region_name
                    city_country = city_name+"\t"+country_name
                    self.geocoder[city_region_country] = (lat,lon)
                    self.geocoder[city_region] = (lat,lon)
                    self.geocoder[city_country] = (lat,lon)
                    self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_region_country))
                    self.all_city_names.add(city_region_country)

                elif city_name and region_name:
                    city_region = city_name+"\t"+region_name
                    self.geocoder[city_region] = (lat,lon)
                    self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_region))
                    self.all_city_names.add(city_region)

                elif city_name and country_name:
                    if city_name == country_name:
                        self.geocoder[city_name] = (lat,lon)
                        self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_name))
                        self.all_city_names.add(city_name)
                    else:
                        city_country = city_name+"\t"+country_name
                        self.geocoder[city_country] = (lat,lon)
                        self.reverse_geocoder[(rounded_lat,rounded_lon)].append((lat,lon,city_country))
                        self.all_city_names.add(city_country)

            # If there was only ever one city with this name, allow it to be an
            # unambiguous lookup with just the city name
            unambiguous_cities = 0
            for city_name, (lat,lon) in city_to_latlon.iteritems():
                if city_name_counts[city_name] == 1:
                    self.geocoder[city_name] = (lat,lon)
                    unambiguous_cities += 1
            #print "Saw %d unambiguous cities in %s" % (unambiguous_cities, dataset)


        else:
            raise NotImplementedError(dataset)


        # create a lower-case dictionary for noisy lookups
        self.lc_name_to_location = {}
        for name, (lat, lon) in self.geocoder.iteritems():
            self.lc_name_to_location[name.lower()] = (lat, lon)

        LOGGER.debug("Geocoder loaded %d locations from %s" %
                     (len(self.geocoder), dataset))

Example 152

Project: ansible-snmp-facts Source File: snmp_facts.py
def main():
    module = AnsibleModule(
        argument_spec=dict(
            host=dict(required=True),
            version=dict(required=True, choices=['v2', 'v2c', 'v3']),
            community=dict(required=False, default=False),
            username=dict(required=False),
            level=dict(required=False, choices=['authNoPriv', 'authPriv']),
            integrity=dict(required=False, choices=['md5', 'sha']),
            privacy=dict(required=False, choices=['des', 'aes']),
            authkey=dict(required=False),
            privkey=dict(required=False),
            removeplaceholder=dict(required=False)),
        required_together=(['username', 'level', 'integrity', 'authkey'], ['privacy', 'privkey'],),
        supports_check_mode=False)

    m_args = module.params

    if not has_pysnmp:
        module.fail_json(msg='Missing required pysnmp module (check docs)')

    cmdGen = cmdgen.CommandGenerator()

    # Verify that we receive a community when using snmp v2
    if m_args['version'] == "v2" or m_args['version'] == "v2c":
        if m_args['community'] == False:
            module.fail_json(msg='Community not set when using snmp version 2')
            
    if m_args['version'] == "v3":
        if m_args['username'] == None:
            module.fail_json(msg='Username not set when using snmp version 3')

        if m_args['level'] == "authPriv" and m_args['privacy'] == None:
            module.fail_json(msg='Privacy algorithm not set when using authPriv')

            
        if m_args['integrity'] == "sha":
            integrity_proto = cmdgen.usmHMACSHAAuthProtocol
        elif m_args['integrity'] == "md5":
            integrity_proto = cmdgen.usmHMACMD5AuthProtocol

        if m_args['privacy'] == "aes":
            privacy_proto = cmdgen.usmAesCfb128Protocol
        elif m_args['privacy'] == "des":
            privacy_proto = cmdgen.usmDESPrivProtocol
    
    # Use SNMP Version 2
    if m_args['version'] == "v2" or m_args['version'] == "v2c":
        snmp_auth = cmdgen.CommunityData(m_args['community'])

    # Use SNMP Version 3 with authNoPriv
    elif m_args['level'] == "authNoPriv":
        snmp_auth = cmdgen.UsmUserData(m_args['username'], authKey=m_args['authkey'], authProtocol=integrity_proto)

    # Use SNMP Version 3 with authPriv
    else:
        snmp_auth = cmdgen.UsmUserData(m_args['username'], authKey=m_args['authkey'], privKey=m_args['privkey'], authProtocol=integrity_proto, privProtocol=privacy_proto)

    # Use p to prefix OIDs with a dot for polling
    p = DefineOid(dotprefix=True)
    # Use v without a prefix to use with return values
    v = DefineOid(dotprefix=False)

    Tree = lambda: defaultdict(Tree)
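    # Tree is a recursive defaultdict: any missing key yields another Tree, so
    # nested assignments like results['ansible_interfaces'][ifIndex]['mtu'] = ...
    # work without pre-creating the intermediate dicts.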
                               
    results = Tree()
            
    errorIndication, errorStatus, errorIndex, varBinds = cmdGen.getCmd(
        snmp_auth,
        cmdgen.UdpTransportTarget((m_args['host'], 161)),
        cmdgen.MibVariable(p.sysDescr,),
        cmdgen.MibVariable(p.sysObjectId,), 
        cmdgen.MibVariable(p.sysUpTime,),
        cmdgen.MibVariable(p.sysContact,), 
        cmdgen.MibVariable(p.sysName,),
        cmdgen.MibVariable(p.sysLocation,),
    )


    if errorIndication:
        module.fail_json(msg=str(errorIndication))

    for oid, val in varBinds:
        current_oid = oid.prettyPrint()
        current_val = val.prettyPrint()
        if current_oid == v.sysDescr:
            results['ansible_sysdescr'] = decode_hex(current_val)
        elif current_oid == v.sysObjectId:
            results['ansible_sysobjectid'] = current_val
        elif current_oid == v.sysUpTime:
            results['ansible_sysuptime'] = current_val
        elif current_oid == v.sysContact:
            results['ansible_syscontact'] = current_val
        elif current_oid == v.sysName:
            results['ansible_sysname'] = current_val
        elif current_oid == v.sysLocation:
            results['ansible_syslocation'] = current_val

    errorIndication, errorStatus, errorIndex, varTable = cmdGen.nextCmd(
        snmp_auth,
        cmdgen.UdpTransportTarget((m_args['host'], 161)), 
        cmdgen.MibVariable(p.ifIndex,),
        cmdgen.MibVariable(p.ifDescr,),
        cmdgen.MibVariable(p.ifMtu,),
        cmdgen.MibVariable(p.ifSpeed,),
        cmdgen.MibVariable(p.ifPhysAddress,),
        cmdgen.MibVariable(p.ifAdminStatus,),
        cmdgen.MibVariable(p.ifOperStatus,),
        cmdgen.MibVariable(p.ipAdEntAddr,), 
        cmdgen.MibVariable(p.ipAdEntIfIndex,), 
        cmdgen.MibVariable(p.ipAdEntNetMask,), 

        cmdgen.MibVariable(p.ifAlias,),
    )
 

    if errorIndication:
        module.fail_json(msg=str(errorIndication))

    interface_indexes = []
    
    all_ipv4_addresses = []     
    ipv4_networks = Tree()

    for varBinds in varTable:
        for oid, val in varBinds:
            current_oid = oid.prettyPrint()
            current_val = val.prettyPrint()
            if v.ifIndex in current_oid:
                ifIndex = int(current_oid.rsplit('.', 1)[-1])
                results['ansible_interfaces'][ifIndex]['ifindex'] = current_val
                interface_indexes.append(ifIndex)
            if v.ifDescr in current_oid:
                ifIndex = int(current_oid.rsplit('.', 1)[-1])
                results['ansible_interfaces'][ifIndex]['name'] = current_val
            if v.ifMtu in current_oid:
                ifIndex = int(current_oid.rsplit('.', 1)[-1])
                results['ansible_interfaces'][ifIndex]['mtu'] = current_val
            if v.ifSpeed in current_oid:
                ifIndex = int(current_oid.rsplit('.', 1)[-1])
                results['ansible_interfaces'][ifIndex]['speed'] = current_val
            if v.ifPhysAddress in current_oid:
                ifIndex = int(current_oid.rsplit('.', 1)[-1])
                results['ansible_interfaces'][ifIndex]['mac'] = decode_mac(current_val)
            if v.ifAdminStatus in current_oid:
                ifIndex = int(current_oid.rsplit('.', 1)[-1])
                results['ansible_interfaces'][ifIndex]['adminstatus'] = lookup_adminstatus(int(current_val))
            if v.ifOperStatus in current_oid:
                ifIndex = int(current_oid.rsplit('.', 1)[-1])
                results['ansible_interfaces'][ifIndex]['operstatus'] = lookup_operstatus(int(current_val))
            if v.ipAdEntAddr in current_oid:
                curIPList = current_oid.rsplit('.', 4)[-4:]
                curIP = ".".join(curIPList)
                ipv4_networks[curIP]['address'] = current_val
                all_ipv4_addresses.append(current_val)
            if v.ipAdEntIfIndex in current_oid:
                curIPList = current_oid.rsplit('.', 4)[-4:]
                curIP = ".".join(curIPList)
                ipv4_networks[curIP]['interface'] = current_val
            if v.ipAdEntNetMask in current_oid:
                curIPList = current_oid.rsplit('.', 4)[-4:]
                curIP = ".".join(curIPList)
                ipv4_networks[curIP]['netmask'] = current_val

            if v.ifAlias in current_oid:
                ifIndex = int(current_oid.rsplit('.', 1)[-1])
                results['ansible_interfaces'][ifIndex]['description'] = current_val

    interface_to_ipv4 = {}
    for ipv4_network in ipv4_networks:
        current_interface = ipv4_networks[ipv4_network]['interface']
        current_network = {
                            'address':  ipv4_networks[ipv4_network]['address'],
                            'netmask':  ipv4_networks[ipv4_network]['netmask']
                          }
        if current_interface not in interface_to_ipv4:
            interface_to_ipv4[current_interface] = []
        interface_to_ipv4[current_interface].append(current_network)

    for interface in interface_to_ipv4:
        results['ansible_interfaces'][int(interface)]['ipv4'] = interface_to_ipv4[interface]

    results['ansible_all_ipv4_addresses'] = all_ipv4_addresses
 
    module.exit_json(ansible_facts=results)

Example 153

Project: networkx Source File: gml.py
def parse_gml_lines(lines, label, destringizer):
    """Parse GML into a graph.
    """
    def tokenize():
        patterns = [
            r'[A-Za-z][0-9A-Za-z_]*\b',  # keys
            r'[+-]?(?:[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)(?:[Ee][+-]?[0-9]+)?',  # reals
            r'[+-]?[0-9]+',   # ints
            r'".*?"',         # strings
            r'\[',            # dict start
            r'\]',            # dict end
            r'#.*$|\s+'       # comments and whitespaces
            ]
        tokens = re.compile(
            '|'.join('(' + pattern + ')' for pattern in patterns))
        lineno = 0
        for line in lines:
            length = len(line)
            pos = 0
            while pos < length:
                match = tokens.match(line, pos)
                if match is not None:
                    for i in range(len(patterns)):
                        group = match.group(i + 1)
                        if group is not None:
                            if i == 0:    # keys
                                value = group.rstrip()
                            elif i == 1:  # reals
                                value = float(group)
                            elif i == 2:  # ints
                                value = int(group)
                            else:
                                value = group
                            if i != 6:    # comments and whitespaces
                                yield (i, value, lineno + 1, pos + 1)
                            pos += len(group)
                            break
                else:
                    raise NetworkXError('cannot tokenize %r at (%d, %d)' %
                                        (line[pos:], lineno + 1, pos + 1))
            lineno += 1
        yield (None, None, lineno + 1, 1)  # EOF

    def unexpected(curr_token, expected):
        category, value, lineno, pos = curr_token
        raise NetworkXError(
            'expected %s, found %s at (%d, %d)' %
            (expected, repr(value) if value is not None else 'EOF', lineno,
             pos))

    def consume(curr_token, category, expected):
        if curr_token[0] == category:
            return next(tokens)
        unexpected(curr_token, expected)

    def parse_kv(curr_token):
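        # defaultdict(list) accumulates values for repeated GML keys; after the
        # loop, single-element lists are unwrapped back into scalars.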
        dct = defaultdict(list)
        while curr_token[0] == 0:  # keys
            key = curr_token[1]
            curr_token = next(tokens)
            category = curr_token[0]
            if category == 1 or category == 2:  # reals or ints
                value = curr_token[1]
                curr_token = next(tokens)
            elif category == 3:  # strings
                value = unescape(curr_token[1][1:-1])
                if destringizer:
                    try:
                        value = destringizer(value)
                    except ValueError:
                        pass
                curr_token = next(tokens)
            elif category == 4:  # dict start
                curr_token, value = parse_dict(curr_token)
            else:
                unexpected(curr_token, "an int, float, string or '['")
            dct[key].append(value)
        dct = {key: (value if not isinstance(value, list) or len(value) != 1
                     else value[0]) for key, value in dct.items()}
        return curr_token, dct

    def parse_dict(curr_token):
        curr_token = consume(curr_token, 4, "'['")    # dict start
        curr_token, dct = parse_kv(curr_token)
        curr_token = consume(curr_token, 5, "']'")  # dict end
        return curr_token, dct

    def parse_graph():
        curr_token, dct = parse_kv(next(tokens))
        if curr_token[0] is not None:  # EOF
            unexpected(curr_token, 'EOF')
        if 'graph' not in dct:
            raise NetworkXError('input contains no graph')
        graph = dct['graph']
        if isinstance(graph, list):
            raise NetworkXError('input contains more than one graph')
        return graph

    tokens = tokenize()
    graph = parse_graph()

    directed = graph.pop('directed', False)
    multigraph = graph.pop('multigraph', False)
    if not multigraph:
        G = nx.DiGraph() if directed else nx.Graph()
    else:
        G = nx.MultiDiGraph() if directed else nx.MultiGraph()
    G.graph.update((key, value) for key, value in graph.items()
                   if key != 'node' and key != 'edge')

    def pop_attr(dct, category, attr, i):
        try:
            return dct.pop(attr)
        except KeyError:
            raise NetworkXError(
                "%s #%d has no '%s' attribute" % (category, i, attr))

    nodes = graph.get('node', [])
    mapping = {}
    labels = set()
    for i, node in enumerate(nodes if isinstance(nodes, list) else [nodes]):
        id = pop_attr(node, 'node', 'id', i)
        if id in G:
            raise NetworkXError('node id %r is duplicated' % (id,))
        if label != 'id':
            label = pop_attr(node, 'node', 'label', i)
            if label in labels:
                raise NetworkXError('node label %r is duplicated' % (label,))
            labels.add(label)
            mapping[id] = label
        G.add_node(id, **node)

    edges = graph.get('edge', [])
    for i, edge in enumerate(edges if isinstance(edges, list) else [edges]):
        source = pop_attr(edge, 'edge', 'source', i)
        target = pop_attr(edge, 'edge', 'target', i)
        if source not in G:
            raise NetworkXError(
                'edge #%d has an undefined source %r' % (i, source))
        if target not in G:
            raise NetworkXError(
                'edge #%d has an undefined target %r' % (i, target))
        if not multigraph:
            if not G.has_edge(source, target):
                G.add_edge(source, target, **edge)
            else:
                raise nx.NetworkXError(
                    'edge #%d (%r%s%r) is duplicated' %
                    (i, source, '->' if directed else '--', target))
        else:
            key = edge.pop('key', None)
            if key is not None and G.has_edge(source, target, key):
                raise nx.NetworkXError(
                    'edge #%d (%r%s%r, %r) is duplicated' %
                    (i, source, '->' if directed else '--', target, key))
            G.add_edge(source, target, key, **edge)

    if label != 'id':
        G = nx.relabel_nodes(G, mapping)
    return G

Example 154

Project: scancode-toolkit Source File: index.py
    def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
        """
        Add a list of Rule objects to the index and constructs optimized and
        immutable index structures.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: this is the index in the list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule token strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjoint sets of rids
            rul_len = rul.length
            if rul.false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
                if rul_len > self.largest_false_positive_length:
                    self.largest_false_positive_length = rul_len
            elif rul.negative():
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Create the tokens lookup structure at once. Note that token ids are
        # assigned randomly here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

        # initial dictionary mapping to old/random token ids
        self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) random integer ids
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok] for rule_tok in token_strings_by_rid]

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
        self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
        len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
        self.len_good = len_good = len_tokens - len_junk

        #######################################################################
        # build index structures
        #######################################################################

        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            if rule.false_positive:
                # FP rules are not used for any matching
                # there is nothing else for these rules
                self.false_positive_rid_by_hash[rule_hash] = rid
            else:
                # negative, small and regular

                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {tid: array('h', value) for tid, value in postings.items()}
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = index_token_sets(rule_token_ids, len_junk, len_good)
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automatons...
                if rule.negative():
                    # ... with only the whole rule tokens sequence
                    negative_automaton_add(tids=rule_token_ids, rid=rid)
                else:
                    # ... or with the whole rule tokens sequence
                    rules_automaton_add(tids=rule_token_ids, rid=rid)
                    # ... and ngrams: compute ngrams and populate the automaton with ngrams
                    if USE_AHO_FRAGMENTS and not rule.is_url and not rule.solid and len(rule_token_ids) > NGRAM_LEN:
                        all_ngrams = ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                        selected_ngrams = select_ngrams(all_ngrams, with_pos=True)
                        for pos, ngram in selected_ngrams:
                            rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # update rule thresholds
                rule.low_unique = tids_set_counter(rlow_set)
                rule.high_unique = tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = tids_multiset_counter(rlow_mset)
                rule.high_length = tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)
        sparsify(self.false_positive_rid_by_hash)

        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
        if dupe_rules:
            dupe_rule_paths = [['file://' + rule.text_file for rule in rules] for rules in dupe_rules]
            msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
            raise AssertionError(msg)

        self.optimized = True

Example 155

Project: eyed3 Source File: fixup.py
    def handleDirectory(self, directory, _):
        if not self._file_cache:
            return

        directory = os.path.abspath(directory)
        print("\n" + Style.BRIGHT + Fore.GREY +
              "Scanning directory%s %s" % (Style.RESET_ALL, directory))

        def _path(af):
            return af.path

        self._handled_one = True

        # Make sure all of the audio files have a tag.
        for f in self._file_cache:
            if f.tag is None:
                f.initTag()

        audio_files = sorted(list(self._file_cache), key=_path)

        self._file_cache = []
        edited_files = set()
        self._curr_dir_type = self.args.dir_type
        if self._curr_dir_type is None:
            types = set([a.tag.album_type for a in audio_files])
            if len(types) == 1:
                self._curr_dir_type = types.pop()

        # Check for corrections to LP, EP, COMP
        if (self._curr_dir_type is None and len(audio_files) < EP_MAX_HINT):
            # Do you want EP?
            if False in [a.tag.album_type == EP_TYPE for a in audio_files]:
                if prompt("Only %d audio files, process directory as an EP" %
                          len(audio_files),
                          default=True):
                    self._curr_dir_type = EP_TYPE
            else:
                self._curr_dir_type = EP_TYPE
        elif (self._curr_dir_type in (EP_TYPE, DEMO_TYPE) and
                len(audio_files) > EP_MAX_HINT):
            # Do you want LP?
            if prompt("%d audio files is large for type %s, process "
                      "directory as an LP" % (len(audio_files),
                                              self._curr_dir_type),
                      default=True):
                self._curr_dir_type = LP_TYPE

        last = defaultdict(lambda: None)
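        # defaultdict(lambda: None) means last["artist"] is None until a value
        # has been entered, so prompt(..., default=last["artist"]) always works.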

        album_artist = None
        artists = set()
        album = None

        if self._curr_dir_type != SINGLE_TYPE:
            album_artist, artists = self._resolveArtistInfo(audio_files)
            print(Fore.BLUE + u"Album artist: " + Style.RESET_ALL +
                  (album_artist or u""))
            print(Fore.BLUE + "Artist" + ("s" if len(artists) > 1 else "") +
                  ": " + Style.RESET_ALL + u", ".join(artists))

            album = self._getAlbum(audio_files)
            print(Fore.BLUE + "Album: " + Style.RESET_ALL + album)

            rel_date, orel_date, rec_date = self._getDates(audio_files)
            for what, d in [("Release", rel_date),
                            ("Original", orel_date),
                            ("Recording", rec_date)]:
                print(Fore.BLUE + ("%s date: " % what) + Style.RESET_ALL +
                        str(d))

            num_audio_files = len(audio_files)
            track_nums = set([f.tag.track_num[0] for f in audio_files])
            fix_track_nums = set(range(1, num_audio_files + 1)) != track_nums
            new_track_nums = []

        dir_type = self._curr_dir_type
        for f in sorted(audio_files, key=_path):
            print(Style.BRIGHT + Fore.GREEN + u"Checking" + Fore.RESET +
                  Fore.GREY + (" %s" % os.path.basename(f.path)) +
                  Style.RESET_ALL)

            if not f.tag:
                print("\tAdding new tag")
                f.initTag()
                edited_files.add(f)
            tag = f.tag

            if tag.version != ID3_V2_4:
                print("\tConverting to ID3 v2.4")
                tag.version = ID3_V2_4
                edited_files.add(f)

            if (dir_type != SINGLE_TYPE and album_artist != tag.album_artist):
                print(u"\tSetting album artist: %s" % album_artist)
                tag.album_artist = album_artist
                edited_files.add(f)

            if not tag.artist and dir_type in (VARIOUS_TYPE, SINGLE_TYPE):
                # Prompt artist
                tag.artist = prompt("Artist name", default=last["artist"])
                last["artist"] = tag.artist
            elif len(artists) == 1 and tag.artist != artists[0]:
                assert(dir_type != SINGLE_TYPE)
                print(u"\tSetting artist: %s" % artists[0])
                tag.artist = artists[0]
                edited_files.add(f)

            if tag.album != album and dir_type != SINGLE_TYPE:
                print(u"\tSetting album: %s" % album)
                tag.album = album
                edited_files.add(f)

            orig_title = tag.title
            if not tag.title:
                tag.title = prompt("Track title")
            tag.title = tag.title.strip()
            if self.args.fix_case:
                tag.title = _fixCase(tag.title)
            if orig_title != tag.title:
                print(u"\tSetting title: %s" % tag.title)
                edited_files.add(f)

            if dir_type != SINGLE_TYPE:
                # Track numbers
                tnum, ttot = tag.track_num
                update = False
                if ttot != num_audio_files:
                    update = True
                    ttot = num_audio_files

                if fix_track_nums or not (1 <= tnum <= num_audio_files):
                    tnum = None
                    while tnum is None:
                        tnum = int(prompt("Track #", type_=int))
                        if not (1 <= tnum <= num_audio_files):
                            print(Fore.RED + "Out of range: " + Fore.RESET +
                                  "1 <= %d <= %d" % (tnum, num_audio_files))
                            tnum = None
                        elif tnum in new_track_nums:
                            print(Fore.RED + "Duplicate value: " + Fore.RESET +
                                    str(tnum))
                            tnum = None
                        else:
                            update = True
                            new_track_nums.append(tnum)

                if update:
                    tag.track_num = (tnum, ttot)
                    print("\tSetting track numbers: %s" % str(tag.track_num))
                    edited_files.add(f)
            else:
                # Singles
                if tag.track_num != (None, None):
                    tag.track_num = (None, None)
                    edited_files.add(f)

            if dir_type != SINGLE_TYPE:
                # Dates
                if rec_date and tag.recording_date != rec_date:
                    print("\tSetting %s date (%s)" %
                            ("recording", str(rec_date)))
                    tag.recording_date = rec_date
                    edited_files.add(f)
                if rel_date and tag.release_date != rel_date:
                    print("\tSetting %s date (%s)" % ("release", str(rel_date)))
                    tag.release_date = rel_date
                    edited_files.add(f)
                if orel_date and tag.original_release_date != orel_date:
                    print("\tSetting %s date (%s)" % ("original release",
                                                      str(orel_date)))
                    tag.original_release_date = orel_date
                    edited_files.add(f)

            for frame in list(tag.frameiter(["USER", "PRIV"])):
                print("\tRemoving %s frames: %s" %
                        (frame.id,
                         frame.owner_id if frame.id == b"PRIV" else frame.text))
                tag.frame_set[frame.id].remove(frame)
                edited_files.add(f)

            # Add TLEN
            tlen = tag.getTextFrame("TLEN")
            real_tlen = f.info.time_secs * 1000
            if tlen is None or int(tlen) != real_tlen:
                print("\tSetting TLEN (%d)" % real_tlen)
                tag.setTextFrame("TLEN", UnicodeType(real_tlen))
                edited_files.add(f)

            # Add custom album type if special and otherwise not able to be
            # determined.
            curr_type = tag.album_type
            if curr_type != dir_type:
                print("\tSetting %s = %s" % (TXXX_ALBUM_TYPE, dir_type))
                tag.album_type = dir_type
                edited_files.add(f)

        try:
            if not self._checkCoverArt(directory, audio_files):
                if not prompt("Proceed without valid cover file", default=True):
                    return
        finally:
            self._dir_images = []

        # Determine other changes, like file and/or directory renames
        # so they can be reported before save confirmation.

        # File renaming
        file_renames = []
        if self.args.file_rename_pattern:
            format_str = self.args.file_rename_pattern
        else:
            if dir_type == SINGLE_TYPE:
                format_str = SINGLE_FNAME_FORMAT
            elif dir_type in (VARIOUS_TYPE, COMP_TYPE):
                format_str = VARIOUS_FNAME_FORMAT
            else:
                format_str = NORMAL_FNAME_FORMAT

        for f in audio_files:
            orig_name, orig_ext = os.path.splitext(os.path.basename(f.path))
            new_name = TagTemplate(format_str).substitute(f.tag, zeropad=True)
            if orig_name != new_name:
                printMsg(u"Rename file to %s%s" % (new_name, orig_ext))
                file_renames.append((f, new_name, orig_ext))

        # Directory renaming
        dir_rename = None
        if dir_type != SINGLE_TYPE:
            if self.args.dir_rename_pattern:
                dir_format = self.args.dir_rename_pattern
            else:
                if dir_type == LIVE_TYPE:
                    dir_format = LIVE_DNAME_FORMAT
                else:
                    dir_format = NORMAL_DNAME_FORMAT
            template = TagTemplate(dir_format,
                                   dotted_dates=self.args.dotted_dates)

            pref_dir = template.substitute(audio_files[0].tag, zeropad=True)
            if os.path.basename(directory) != pref_dir:
                new_dir = os.path.join(os.path.dirname(directory), pref_dir)
                printMsg("Rename directory to %s" % new_dir)
                dir_rename = (directory, new_dir)

        # Cruft files to remove
        file_removes = []
        if self._dir_files_to_remove:
            for f in self._dir_files_to_remove:
                print("Remove file: " + os.path.basename(f))
                file_removes.append(f)
        self._dir_files_to_remove = set()

        if not self.args.dry_run:
            confirmed = False

            if (edited_files or file_renames or dir_rename or file_removes):
                confirmed = prompt("\nSave changes", default=True)

            if confirmed:
                for f in edited_files:
                    print(u"Saving %s" % os.path.basename(f.path))
                    f.tag.save(version=ID3_V2_4, preserve_file_time=True)

                for f, new_name, orig_ext in file_renames:
                    printMsg(u"Renaming file to %s%s" % (new_name, orig_ext))
                    f.rename(new_name, preserve_file_time=True)

                if file_removes:
                    for f in file_removes:
                        printMsg("Removing file %s" % os.path.basename(f))
                        os.remove(f)

                if dir_rename:
                    printMsg("Renaming directory to %s" % dir_rename[1])
                    s = os.stat(dir_rename[0])
                    os.rename(dir_rename[0], dir_rename[1])
                    # With a rename use the original access time
                    os.utime(dir_rename[1], (s.st_atime, s.st_atime))

        else:
            printMsg("\nNo changes made (run without -n/--dry-run)")

Example 156

Project: kmodes Source File: kprototypes.py
def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
                 gamma, init, n_init, verbose):
    """k-prototypes algorithm"""

    if sparse.issparse(X):
        raise TypeError("k-prototypes does not support sparse data.")

    if categorical is None or not categorical:
        raise NotImplementedError(
            "No categorical data selected, effectively doing k-means. "
            "Present a list of categorical columns, or use scikit-learn's "
            "KMeans instead."
        )
    if isinstance(categorical, int):
        categorical = [categorical]
    assert len(categorical) != X.shape[1], \
        "All columns are categorical, use k-modes instead of k-prototypes."
    assert max(categorical) < X.shape[1], \
        "Categorical index larger than number of columns."

    ncatattrs = len(categorical)
    nnumattrs = X.shape[1] - ncatattrs
    npoints = X.shape[0]
    assert n_clusters <= npoints, "More clusters than data points?"

    Xnum, Xcat = _split_num_cat(X, categorical)
    Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)

    # Convert the categorical values in Xcat to integers for speed.
    # Based on the unique values in Xcat, we can make a mapping to achieve this.
    Xcat, enc_map = encode_features(Xcat)

    # Are there more n_clusters than unique rows? Then set the unique
    # rows as initial values and skip iteration.
    unique = get_unique_rows(X)
    n_unique = unique.shape[0]
    if n_unique <= n_clusters:
        max_iter = 0
        n_init = 1
        n_clusters = n_unique
        init = list(_split_num_cat(unique, categorical))
        init[1], _ = encode_features(init[1], enc_map)

    # Estimate a good value for gamma, which determines the weighing of
    # categorical values in clusters (see Huang [1997]).
    if gamma is None:
        gamma = 0.5 * Xnum.std()

    all_centroids = []
    all_labels = []
    all_costs = []
    all_n_iters = []
    for init_no in range(n_init):

        # For numerical part of initialization, we don't have a guarantee
        # that there is not an empty cluster, so we need to retry until
        # there is none.
        init_tries = 0
        while True:
            init_tries += 1
            # _____ INIT _____
            if verbose:
                print("Init: initializing centroids")
            if isinstance(init, str) and init == 'Huang':
                centroids = kmodes.init_huang(Xcat, n_clusters, cat_dissim)
            elif isinstance(init, str) and init == 'Cao':
                centroids = kmodes.init_cao(Xcat, n_clusters, cat_dissim)
            elif isinstance(init, str) and init == 'random':
                seeds = np.random.choice(range(npoints), n_clusters)
                centroids = Xcat[seeds]
            elif isinstance(init, list):
                # Make sure inits are 2D arrays.
                init = [np.atleast_2d(cur_init).T if len(cur_init.shape) == 1
                        else cur_init
                        for cur_init in init]
                assert init[0].shape[0] == n_clusters, \
                    "Wrong number of initial numerical centroids in init " \
                    "({}, should be {}).".format(init[0].shape[0], n_clusters)
                assert init[0].shape[1] == nnumattrs, \
                    "Wrong number of numerical attributes in init ({}, should be {})."\
                    .format(init[0].shape[1], nnumattrs)
                assert init[1].shape[0] == n_clusters, \
                    "Wrong number of initial categorical centroids in init ({}, " \
                    "should be {}).".format(init[1].shape[0], n_clusters)
                assert init[1].shape[1] == ncatattrs, \
                    "Wrong number of categorical attributes in init ({}, should be {})."\
                    .format(init[1].shape[1], ncatattrs)
                centroids = [np.asarray(init[0], dtype=np.float64),
                             np.asarray(init[1], dtype=np.uint8)]
            else:
                raise NotImplementedError("Initialization method not supported.")

            if not isinstance(init, list):
                # Numerical is initialized by drawing from normal distribution,
                # categorical following the k-modes methods.
                meanx = np.mean(Xnum, axis=0)
                stdx = np.std(Xnum, axis=0)
                centroids = [
                    meanx + np.random.randn(n_clusters, nnumattrs) * stdx,
                    centroids
                ]

            if verbose:
                print("Init: initializing clusters")
            membship = np.zeros((n_clusters, npoints), dtype=np.uint8)
            # Keep track of the sum of attribute values per cluster so that we
            # can do k-means on the numerical attributes.
            cl_attr_sum = np.zeros((n_clusters, nnumattrs), dtype=np.float64)
            # cl_attr_freq is a list of lists with dictionaries that contain
            # the frequencies of values per cluster and attribute.
            cl_attr_freq = [[defaultdict(int) for _ in range(ncatattrs)]
                            for _ in range(n_clusters)]
            for ipoint in range(npoints):
                # Initial assignment to clusters
                clust = np.argmin(
                    num_dissim(centroids[0], Xnum[ipoint]) +
                    gamma * cat_dissim(centroids[1], Xcat[ipoint])
                )
                membship[clust, ipoint] = 1
                # Count attribute values per cluster.
                for iattr, curattr in enumerate(Xnum[ipoint]):
                    cl_attr_sum[clust, iattr] += curattr
                for iattr, curattr in enumerate(Xcat[ipoint]):
                    cl_attr_freq[clust][iattr][curattr] += 1

            # If no empty clusters, then consider initialization finalized.
            if membship.sum(axis=1).min() > 0:
                break

            if init_tries == MAX_INIT_TRIES:
                # Could not get rid of empty clusters. Randomly
                # initialize instead.
                init = 'random'
            elif init_tries == RAISE_INIT_TRIES:
                raise ValueError(
                    "Clustering algorithm could not initialize. "
                    "Consider assigning the initial clusters manually."
                )

        # Perform an initial centroid update.
        for ik in range(n_clusters):
            for iattr in range(nnumattrs):
                centroids[0][ik, iattr] = \
                    cl_attr_sum[ik, iattr] / sum(membship[ik, :])
            for iattr in range(ncatattrs):
                centroids[1][ik, iattr] = \
                    get_max_value_key(cl_attr_freq[ik][iattr])

        # _____ ITERATION _____
        if verbose:
            print("Starting iterations...")
        itr = 0
        converged = False
        cost = np.Inf
        while itr <= max_iter and not converged:
            itr += 1
            centroids, moves = _k_prototypes_iter(Xnum, Xcat, centroids,
                                                  cl_attr_sum, cl_attr_freq,
                                                  membship, num_dissim, cat_dissim, gamma)

            # All points seen in this iteration
            labels, ncost = _labels_cost(Xnum, Xcat, centroids,
                                         num_dissim, cat_dissim, gamma)
            converged = (moves == 0) or (ncost >= cost)
            cost = ncost
            if verbose:
                print("Run: {}, iteration: {}/{}, moves: {}, ncost: {}"
                      .format(init_no + 1, itr, max_iter, moves, ncost))

        # Store results of current run.
        all_centroids.append(centroids)
        all_labels.append(labels)
        all_costs.append(cost)
        all_n_iters.append(itr)

    best = np.argmin(all_costs)
    if n_init > 1 and verbose:
        print("Best run was number {}".format(best + 1))

    # Note: return gamma in case it was automatically determined.
    return all_centroids[best], enc_map, all_labels[best], \
        all_costs[best], all_n_iters[best], gamma

Example 157

Project: nupic Source File: aggregator.py
Function: next
  def next(self, record, curInputBookmark):
    """ Return the next aggregated record, if any

    Parameters:
    ------------------------------------------------------------------------
    record:         The input record (values only) from the input source, or
                    None if the input has reached EOF (this will cause this
                    method to force completion of and return any partially
                    aggregated time period)
    curInputBookmark: The bookmark to the next input record
    retval:
      (outputRecord, inputBookmark)

      outputRecord: the aggregated record
      inputBookmark: a bookmark to the last position from the input that
                      contributed to this aggregated record.

      If we don't have any aggregated records yet, returns (None, None)


    The caller should generally do a loop like this:
      while True:
        inRecord = reader.getNextRecord()
        bookmark = reader.getBookmark()

        (aggRecord, aggBookmark) = aggregator.next(inRecord, bookmark)

        # reached EOF?
        if inRecord is None and aggRecord is None:
          break

        if aggRecord is not None:
          processRecord(aggRecord, aggBookmark)


    This method makes use of the self._slice member variable to build up
    the values we need to aggregate. This is a dict of lists. The keys are
    the field indices and the elements of each list are the values for that
    field. For example:

      self._slice = { 0: [42, 53], 1: [4.0, 5.1] }

    """

    # This will hold the aggregated record we return
    outRecord = None

    # This will hold the bookmark of the last input used within the
    #  aggregated record we return.
    retInputBookmark = None

    if record is not None:

      # Increment input count
      self._inIdx += 1

      #print self._inIdx, record

      # Apply the filter, ignore the record if any field is unacceptable
      if self._filter != None and not self._filter[0](self._filter[1], record):
        return (None, None)

      # If no aggregation info just return as-is
      if self._nullAggregation:
        return (record, curInputBookmark)


      # ----------------------------------------------------------------------
      # Do aggregation

      #
      # Remember the very first record time stamp - it will be used as
      # the timestamp for all first records in all sequences to align
      # times for the aggregation/join of sequences.
      #
      # For a set of aggregated records, it will use the beginning of the time
      # window as a timestamp for the set
      #
      t = record[self._timeFieldIdx]

      if self._firstSequenceStartTime == None:
        self._firstSequenceStartTime = t

      # Create initial startTime and endTime if needed
      if self._startTime is None:
        self._startTime = t
      if self._endTime is None:
        self._endTime = self._getEndTime(t)
        assert self._endTime > t

      #print 'Processing line:', i, t, endTime
      #from dbgp.client import brk; brk(port=9011)


      # ----------------------------------------------------------------------
      # Does this record have a reset signal or sequence Id associated with it?
      # If so, see if we've reached a sequence boundary
      if self._resetFieldIdx is not None:
        resetSignal = record[self._resetFieldIdx]
      else:
        resetSignal = None

      if self._sequenceIdFieldIdx is not None:
        currSequenceId = record[self._sequenceIdFieldIdx]
      else:
        currSequenceId = None

      newSequence = (resetSignal == 1 and self._inIdx > 0) \
                      or self._sequenceId != currSequenceId \
                      or self._inIdx == 0

      if newSequence:
        self._sequenceId = currSequenceId


      # --------------------------------------------------------------------
      # We end the aggregation chunk if we go past the end time
      # -OR- we get an out of order record (t < startTime)
      sliceEnded = (t >= self._endTime or t < self._startTime)


      # -------------------------------------------------------------------
      # Time to generate a new output record?
      if (newSequence or sliceEnded) and len(self._slice) > 0:
        # Create aggregated record
        # print 'Creating aggregate record...'

        # Use the start of the time period as the timestamp of the first
        # record, in case the first record didn't fall exactly on the
        # period boundary
        for j, f in enumerate(self._fields):
          index = f[0]
          if index == self._timeFieldIdx:
            self._slice[j][0] = self._startTime
            break

        # Generate the aggregated record
        outRecord = self._createAggregateRecord()
        retInputBookmark = self._aggrInputBookmark

        # Reset the slice
        self._slice = defaultdict(list)


      # --------------------------------------------------------------------
      # Add current record to slice (Note keeping slices in memory). Each
      # field in the slice is a list of field values from all the sliced
      # records
      for j, f in enumerate(self._fields):
        index = f[0]
        # append the parsed field value to the proper aggregated slice field.
        self._slice[j].append(record[index])
        self._aggrInputBookmark = curInputBookmark


      # --------------------------------------------------------------------
      # If we've encountered a new sequence, start aggregation over again
      if newSequence:
        # TODO: May use self._firstSequenceStartTime as a start for the new
        # sequence (to align all sequences)
        self._startTime = t
        self._endTime = self._getEndTime(t)


      # --------------------------------------------------------------------
      # If a slice just ended, re-compute the start and end time for the
      #  next aggregated record
      if sliceEnded:
        # Did we receive an out of order record? If so, go back and iterate
        #   till we get to the next end time boundary.
        if t < self._startTime:
          self._endTime = self._firstSequenceStartTime
        while t >= self._endTime:
          self._startTime = self._endTime
          self._endTime = self._getEndTime(self._endTime)


      # If we have a record to return, do it now
      if outRecord is not None:
        return (outRecord, retInputBookmark)


    # ---------------------------------------------------------------------
    # Input reached EOF
    # Aggregate one last time in the end if necessary
    elif self._slice:

      # Use the start of the time period as the timestamp of the first
      # record, in case the first record didn't fall exactly on the
      # period boundary
      for j, f in enumerate(self._fields):
        index = f[0]
        if index == self._timeFieldIdx:
          self._slice[j][0] = self._startTime
          break

      outRecord = self._createAggregateRecord()
      retInputBookmark = self._aggrInputBookmark

      self._slice = defaultdict(list)


    # Return aggregated record
    return (outRecord, retInputBookmark)
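
The core of this method is the self._slice defaultdict(list): each field index maps to the list of values seen in the current aggregation window, and _createAggregateRecord() reduces each list with that field's aggregation function. A stripped-down sketch of that accumulate-then-reduce pattern (the field layout and aggregation functions here are invented for illustration, not nupic's internals):

from collections import defaultdict

# Hypothetical fields: (field index, aggregation function) pairs.
fields = [(0, lambda vals: vals[0]),   # timestamp: take the first value
          (1, sum)]                    # consumption: sum over the window

slice_ = defaultdict(list)

# Accumulate a window of records, one list of values per field index.
for record in [(1, 4.0), (2, 5.1), (3, 0.9)]:
    for j, (index, _) in enumerate(fields):
        slice_[j].append(record[index])

# "Aggregate" the window: apply each field's function to its value list.
out_record = [func(slice_[j]) for j, (_, func) in enumerate(fields)]
print(out_record)   # [1, 10.0]

# next() then resets the slice for the following window:
slice_ = defaultdict(list)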

Example 158

Project: nupic Source File: aggregator.py
  def __init__(self, aggregationInfo, inputFields, timeFieldName=None,
               sequenceIdFieldName=None, resetFieldName=None, filterInfo=None):
    """ Construct an aggregator instance

    Params:

    - aggregationInfo: a dictionary that contains the following entries
      - fields: a list of pairs. Each pair is a field name and an
        aggregation function (e.g. sum). The function will be used to aggregate
        multiple values during the aggregation period.

      - aggregation period: 0 or more of unit=value fields; allowed units are:
          [years months] | [weeks days hours minutes seconds milliseconds
          microseconds]
          NOTE: years and months are mutually-exclusive with the other units.  See
                getEndTime() and _aggregate() for more details.
          Example1: years=1, months=6,
          Example2: hours=1, minutes=30,
          If none of the period fields are specified or if all that are specified
          have values of 0, then aggregation will be suppressed, and the given
          inputFile parameter value will be returned.

    - inputFields: The fields from the data source. This is a sequence of
      `nupic.data.fieldmeta.FieldMetaInfo` instances.

    - timeFieldName: name of the field to use as the time field. If None,
          then the time field will be queried from the reader.

    - sequenceIdFieldName: name of the field to use as the sequenceId. If None,
          then the sequence id field will be queried from the reader.

    - resetFieldName: name of the field to use as the reset field. If None,
          then the reset field will be queried from the reader.

    - filterInfo: a structure with rules for filtering records out


    If the input file contains a time field, sequence id field or reset field
    that were not specified in aggregationInfo fields, those fields will be
    added automatically with the following rules:

    1. The order will be R, S, T, rest of the fields
    2. The aggregation function for these will be to pick the first:
       lambda x: x[0]

    """

    # -----------------------------------------------------------------------
    # Save member variables.

    # The same aggregationInfo dict may be used by the caller for generating
    # more datasets (with slight changes), so it is safer to copy it here and
    # all changes made here will not affect the input aggregationInfo
    self._filterInfo = filterInfo
    self._nullAggregation = False
    self._inputFields = inputFields


    # See if this is a null aggregation
    self._nullAggregation = False
    if aggregationInfo is None:
      self._nullAggregation = True
    else:
      aggDef = defaultdict(lambda: 0, aggregationInfo)
      if (aggDef['years'] == aggDef['months'] == aggDef['weeks'] ==
          aggDef['days'] == aggDef['hours'] == aggDef['minutes'] ==
          aggDef['seconds'] == aggDef['milliseconds'] ==
          aggDef['microseconds'] == 0):
        self._nullAggregation = True


    # Prepare the field filtering info. The filter allows us to ignore records
    #  based on specified min or max values for each field.
    self._filter = initFilter(self._inputFields, self._filterInfo)

    # ----------------------------------------------------------------------
    # Fill in defaults
    self._fields = None
    self._resetFieldIdx = None
    self._timeFieldIdx = None
    self._sequenceIdFieldIdx = None
    self._aggTimeDelta = datetime.timedelta()
    self._aggYears = 0
    self._aggMonths = 0

    # Init state variables used within next()
    self._aggrInputBookmark = None
    self._startTime = None
    self._endTime = None
    self._sequenceId = None
    self._firstSequenceStartTime = None
    self._inIdx = -1
    self._slice = defaultdict(list)


    # ========================================================================
    # Get aggregation params
    # self._fields will be a list of tuples: (fieldIdx, funcPtr, funcParam)
    if not self._nullAggregation:

      # ---------------------------------------------------------------------
      # Verify that all aggregation field names exist in the input
      fieldNames = [f[0] for f in aggregationInfo['fields']]
      readerFieldNames = [f[0] for f in self._inputFields]
      for name in fieldNames:
        if not name in readerFieldNames:
          raise Exception('No such input field: %s' % (name))


      # ---------------------------------------------------------------------
      # Get the indices of the special fields, if given to our constructor
      if timeFieldName is not None:
        self._timeFieldIdx = readerFieldNames.index(timeFieldName)
      if resetFieldName is not None:
        self._resetFieldIdx = readerFieldNames.index(resetFieldName)
      if sequenceIdFieldName is not None:
        self._sequenceIdFieldIdx = readerFieldNames.index(sequenceIdFieldName)


      # ---------------------------------------------------------------------
      # Re-order the fields to match the order in the reader and add in any
      #  fields from the reader that were not already in the aggregationInfo
      #  fields list.
      self._fields = []
      fieldIdx = -1
      for (name, type, special) in self._inputFields:

        fieldIdx += 1

        # See if it exists in the aggregationInfo
        found = False
        for field in aggregationInfo['fields']:
          if field[0] == name:
            aggFunctionName = field[1]
            found = True
            break
        if not found:
          aggFunctionName = 'first'

        # Convert to a function pointer and optional params
        (funcPtr, params) = self._getFuncPtrAndParams(aggFunctionName)

        # Add it
        self._fields.append((fieldIdx, funcPtr, params))

        # Is it a special field that we are still looking for?
        if special == FieldMetaSpecial.reset and self._resetFieldIdx is None:
          self._resetFieldIdx = fieldIdx
        if special == FieldMetaSpecial.timestamp and self._timeFieldIdx is None:
          self._timeFieldIdx = fieldIdx
        if (special == FieldMetaSpecial.sequence and
            self._sequenceIdFieldIdx is None):
          self._sequenceIdFieldIdx = fieldIdx


      assert self._timeFieldIdx is not None, "No time field was found"

      # Create an instance of _AggregationPeriod with the aggregation period
      self._aggTimeDelta = datetime.timedelta(days=aggDef['days'],
                                     hours=aggDef['hours'],
                                     minutes=aggDef['minutes'],
                                     seconds=aggDef['seconds'],
                                     milliseconds=aggDef['milliseconds'],
                                     microseconds=aggDef['microseconds'],
                                     weeks=aggDef['weeks'])
      self._aggYears = aggDef['years']
      self._aggMonths = aggDef['months']
      if self._aggTimeDelta:
        assert self._aggYears == 0
        assert self._aggMonths == 0
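
The aggDef = defaultdict(lambda: 0, aggregationInfo) line above is a handy idiom: seeding a defaultdict with an existing mapping means any period unit the caller left out simply reads as 0 instead of raising KeyError. A minimal sketch of that idiom in isolation (the period values below are made up):

import datetime
from collections import defaultdict

aggregation_info = {'hours': 1, 'minutes': 30}   # hypothetical period

# Missing units ('days', 'weeks', ...) fall back to 0 instead of KeyError.
agg_def = defaultdict(lambda: 0, aggregation_info)

delta = datetime.timedelta(days=agg_def['days'],
                           hours=agg_def['hours'],
                           minutes=agg_def['minutes'],
                           seconds=agg_def['seconds'],
                           weeks=agg_def['weeks'])
print(delta)   # 1:30:00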

Example 159

Project: openstates Source File: bills.py
    def scrape(self, session, chambers):
        bill_type_map = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
        }

        chamber_map = {
            'H': 'lower',
            'S': 'upper',
            'J': 'joint',
            'E': 'other', # Effective date
        }

        action_code_map = {
            'HI': ['other'],
            'SI': ['other'],
            'HH': ['other'],
            'SH': ['other'],
            'HPF': ['bill:introduced'],
            'HDSAS': ['other'],
            'SPF': ['bill:introduced'],
            'HSR': ['bill:reading:2'],
            'SSR': ['bill:reading:2'],
            'HFR': ['bill:reading:1'],
            'SFR': ['bill:reading:1'],
            'HRECM': ['bill:withdrawn', 'committee:referred'],
            'SRECM': ['bill:withdrawn', 'committee:referred'],
            'SW&C': ['bill:withdrawn', 'committee:referred'],
            'HW&C': ['bill:withdrawn', 'committee:referred'],
            'HRA': ['bill:passed'],
            'SRA': ['bill:passed'],
            'HPA': ['bill:passed'],
            'HRECO': ['other'],
            'SPA': ['bill:passed'],
            'HTABL': ['other'],  # 'House Tabled' - what is this?
            'SDHAS': ['other'],
            'HCFR': ['committee:passed:favorable'],
            'SCFR': ['committee:passed:favorable'],
            'HRAR': ['committee:referred'],
            'SRAR': ['committee:referred'],
            'STR': ['bill:reading:3'],
            'SAHAS': ['other'],
            'SE': ['bill:passed'],
            'SR': ['committee:referred'],
            'HTRL': ['bill:reading:3', 'bill:failed'],
            'HTR': ['bill:reading:3'],
            'S3RLT': ['bill:reading:3', 'bill:failed'],
            'HASAS': ['other'],
            'S3RPP': ['other'],
            'STAB': ['other'],
            'SRECO': ['other'],
            'SAPPT': ['other'],
            'HCA': ['other'],
            'HNOM': ['other'],
            'HTT': ['other'],
            'STT': ['other'],
            'SRECP': ['other'],
            'SCRA': ['other'],
            'SNOM': ['other'],
            'S2R': ['bill:reading:2'],
            'H2R': ['bill:reading:2'],
            'SENG': ['bill:passed'],
            'HENG': ['bill:passed'],
            'HPOST': ['other'],
            'HCAP': ['other'],
            'SDSG': ['governor:signed'],
            'SSG': ['governor:received'],
            'Signed Gov': ['governor:signed'],
            'HDSG': ['governor:signed'],
            'HSG': ['governor:received'],
            'EFF': ['other'],
            'HRP': ['other'],
            'STH': ['other'],
            'HTS': ['other'],
        }

        sid = self.metadata['session_details'][session]['_guid']
        legislation = backoff(
            self.lservice.GetLegislationForSession,
            sid
        )['LegislationIndex']

        for leg in legislation:
            lid = leg['Id']
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            history = [x for x in instrument['StatusHistory'][0]]

            actions = reversed([{
                'code': x['Code'],
                'action': x['Description'],
                '_guid': x['Id'],
                'date': x['Date']
            } for x in history])

            guid = instrument['Id']

            # A little bit hacky.
            bill_prefix = instrument['DocuementType']
            bill_chamber = chamber_map[bill_prefix[0]]
            bill_type = bill_type_map[bill_prefix[1:]]

            bill_id = '%s %s' % (
                bill_prefix,
                instrument['Number'],
            )
            if instrument['Suffix']:
                bill_id += instrument['Suffix']

            title = instrument['Caption']
            description = instrument['Summary']

            if title is None:
                continue

            bill = Bill(session, bill_chamber, bill_id, title, type=bill_type,
                description=description, _guid=guid)

            if instrument['Votes']:
                for vote_ in instrument['Votes']:
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                    vote = Vote(
                        {'House': 'lower', 'Senate': 'upper'}[vote_['Branch']],
                        vote_['Date'],
                        vote_['Caption'] or 'Vote on Bill',
                        (vote_['Yeas'] > vote_['Nays']),
                        vote_['Yeas'],
                        vote_['Nays'],
                        (vote_['Excused'] + vote_['NotVoting']),
                        session=session,
                        bill_id=bill_id,
                        bill_chamber=bill_chamber)

                    vote.add_source(self.vsource)

                    methods = {'Yea': vote.yes, 'Nay': vote.no,}

                    for vdetail in vote_['Votes'][0]:
                        whom = vdetail['Member']
                        how = vdetail['MemberVoted']
                        try:
                            m = methods[how]
                        except KeyError:
                            m = vote.other
                        m(whom['Name'])

                    bill.add_vote(vote)

            ccommittees = defaultdict(list)
            committees = instrument['Committees']
            if committees:
                for committee in committees[0]:
                    ccommittees[{
                        'House': 'lower',
                        'Senate': 'upper',
                    }[committee['Type']]].append(committee['Name'])

            for action in actions:
                action_chamber = chamber_map[action['code'][0]]

                try:
                    action_types = action_code_map[action['code']]
                except KeyError:
                    error_msg = ('Code {code} for action {action} not '
                        'recognized.'.format(
                            code=action['code'],
                            action=action['action']))

                    self.logger.warning(error_msg)

                    action_types = ['other']

                committees = []
                if any(('committee' in x for x in action_types)):
                    committees = [str(x) for x in ccommittees.get(
                        action_chamber, [])]

                bill.add_action(action_chamber, action['action'],
                    action['date'], action_types, committees=committees,
                    _code=action['code'], _code_id=action['_guid'])

            sponsors = []
            if instrument['Authors']:
                sponsors = instrument['Authors']['Sponsorship']
                if 'Sponsors' in instrument and instrument['Sponsors']:
                    sponsors += instrument['Sponsors']['Sponsorship']

            sponsors = [
                (x['Type'], self.get_member(x['MemberId'])) for x in sponsors
            ]

            for typ, sponsor in sponsors:
                name = '{First} {Last}'.format(**dict(sponsor['Name']))
                bill.add_sponsor(
                    'primary' if 'Author' in typ else 'secondary',
                     name
                )

            for version in instrument['Versions']['DocuementDescription']:
                name, url, doc_id, version_id = [
                    version[x] for x in [
                        'Description',
                        'Url',
                        'Id',
                        'Version'
                    ]
                ]
                bill.add_version(
                    name,
                    url,
                    mimetype='application/pdf',
                    _internal_docuement_id=doc_id,
                    _version_id=version_id
                )

            versions = sorted(
                bill['versions'],
                key=lambda x: x['_internal_docuement_id']
            )
            bill['versions'] = versions

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(SOURCE_URL.format(**{
                'session': session,
                'bid': guid,
            }))
            self.save_bill(bill)
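
The scraper's ccommittees = defaultdict(list) is a plain group-by: committee names keyed by chamber, with .get(chamber, []) used later so a chamber with no referrals reads as an empty list. A small standalone sketch of the same grouping (the committee data below is invented):

from collections import defaultdict

# Hypothetical committee records as they might come back from the API.
committees = [
    {'Type': 'House',  'Name': 'Appropriations'},
    {'Type': 'Senate', 'Name': 'Finance'},
    {'Type': 'House',  'Name': 'Rules'},
]
chamber_map = {'House': 'lower', 'Senate': 'upper'}

ccommittees = defaultdict(list)
for committee in committees:
    ccommittees[chamber_map[committee['Type']]].append(committee['Name'])

print(ccommittees['lower'])           # ['Appropriations', 'Rules']
print(ccommittees.get('joint', []))   # [] - no entries, no KeyError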

Example 160

Project: unisubs Source File: deploy.py
    def setup_images(self):
        timer = LoggingTimer()
        # make sure amara-enterprise is on the correct commit
        subprocess.check_call(['bin/update-integration.py', '--skip-fetch'])
        subprocess.check_call(['bin/build.py', self.image_name])
        timer.log_time('image build')
        # Send the image from builder to the other docker hosts
        log('sending image from builder to docker hosts')
        save_proc = self.docker.Popen(BUILDER_DOCKER_HOST, 'save',
                                      self.image_name, stdout=subprocess.PIPE)
        load_procs = [
            self.docker.Popen(host, 'load', stdin=subprocess.PIPE)
            for host in self.env.docker_hosts()
        ]
        BUF_SIZE = 4096
        while True:
            data = save_proc.stdout.read(BUF_SIZE)
            if not data:
                break
            for proc in load_procs:
                proc.stdin.write(data)
        for proc in load_procs:
            proc.stdin.close()
        if save_proc.wait() != 0:
            # CalledProcessError expects (returncode, cmd); Popen exposes the
            # exit status as .returncode
            raise subprocess.CalledProcessError(save_proc.returncode,
                                                'docker save')
        for proc in load_procs:
            if proc.wait() != 0:
                raise subprocess.CalledProcessError(proc.returncode,
                                                    'docker load')
        timer.log_time('image save/load')
        if self.env.BRANCH in ('staging', 'production'):
            # Tag the image with amara-<branch>:latest
            # We use the tags to run periodic tasks, like updating
            # translations
            for host in [BUILDER_DOCKER_HOST] + self.env.docker_hosts():
                self.docker.run(host, 'tag', self.image_name, self.tag_name())

class ContainerManager(object):
    """Start/stop docker containers """

    def __init__(self, env, commit_id, image_name):
        self.docker = Docker()
        self.env = env
        self.commit_id = commit_id
        self.image_name = image_name
        self.containers_started = []
        self.containers_stopped = []

    def building_preview(self):
        return self.env.BRANCH not in ('staging', 'production')

    def app_params(self, limited=False):
        """Get docker params to used for both app containers and workers
        """
        aws_access_id, aws_secret_key = self.env.aws_credentials(limited)
        params = [
            # AWS Auth info
            '-e', 'AWS_ACCESS_ID=' + aws_access_id,
            '-e', 'AWS_SECRET_KEY=' + aws_secret_key,
            # REVISION controls the git revision we check out before starting
            # this is actually somewhat redundant since we already copy the
            # files into the docker image
            '-e', 'REVISION=' + self.env.BRANCH,
            # mount the workspace volume inside our container
            '-v', '/var/workspace:/var/workspace',
        ]
        if self.env.DB_NAME:
            params.extend([
                '-e', 'DB_NAME=' + self.env.DB_NAME,
            ])

        if self.building_preview():
            # SETTINGS_REVISION controls how to download the
            # server_local_settings.py file (see .docker/config_env.sh)
            params.extend(['-e', 'SETTINGS_REVISION=staging'])
        return params

    def interlock_params(self):
        if self.env.BRANCH == 'production':
            return [
                '-e', ('INTERLOCK_DATA={"alias_domains": '
                       '["www.amara.org"]}'),
                '-e', 'NEW_RELIC_APP_NAME=AmaraVPC',
                '-e', ('NEW_RELIC_LICENSE_KEY=' +
                       self.env.NEW_RELIC_LICENSE_KEY)
            ]
        else:
            return []

    def app_hostname(self):
        """Hostname for app containers.

        The hostname sets an entry in the hosts file for the container.  But
        more importantly, it tells Interlock what web traffic should be routed
        to the container.
        """
        if self.env.BRANCH == 'production':
            return 'amara.org'
        else:
            return '{}.amara.org'.format(self.env.BRANCH)

    def container_name_prefix_for_branch(self):
        """Start of docker container names for this git branch."""
        return 'app-amara-{}-'.format(self.env.BRANCH)

    def container_name_prefix_for_build(self):
        """Start of docker container names for this particular build. """
        # Include both the git commit ID and the build number since both could
        # be useful.
        return self.container_name_prefix_for_branch() + '{}-{}-'.format(
            self.commit_id[:6], self.env.BUILD_NUMBER)

    def run_app_command(self, command, argument=None):
        """Run a command using the app container

        Use this to run a command that does something then quits like
        build_media or migrate.

        Args:
            command: command to pass to our entrypoint.  The entrypoint is a
                copy of .docker/entry.sh
        """
        cmd_line = [ 'run', '-t', '--rm', ]
        cmd_line += self.app_params()
        cmd_line += [self.image_name, command]
        if argument is not None:
            cmd_line += [argument]
        self.docker.run(self.env.DOCKER_HOST_1, *cmd_line)

    def start_worker_container(self, host, name, command):
        """Start an app contanier running the feed/master worker

        Args:
            host: docker host
            name: docker name suffix of the container.  This is what shows up
                in docker ps.  All names will be prefixed with
                container_name_prefix_for_build().
            command: command to pass to our entry point (feed_worker, or
                master_worker)

        """
        host_name = '{}-{}.amara.org'.format(self.env.BRANCH, name)
        name = self.container_name_prefix_for_build() + name
        cmd_line = [
            'run', '-t', '-d',
            '-h', host_name,
            '--name', name,
            '--restart=always',
        ] + self.app_params(limited=True) + [self.image_name, command]
        cid = self.docker.run_and_return_output(host, *cmd_line).strip()
        log("container id: {}", cid)
        self.containers_started.append(ContainerInfo(host, name, cid))

    def start_app_container(self, host, name):
        """Start an app contanier running a web server

        Args:
            host: docker host
            name: docker name suffix of the container.  This is what shows up
                in docker ps.  All names will be prefixed with
                container_name_prefix_for_build().
        """
        name = self.container_name_prefix_for_build() + name
        cmd_line = [
            'run', '-t', '-d', '-P',
            '-h', self.app_hostname(),
            '--name', name,
            '--restart=always',
        ] + self.app_params(limited=True) + self.interlock_params() + [self.image_name]
        cid = self.docker.run_and_return_output(host, *cmd_line).strip()
        log("container id: {}", cid)
        self.containers_started.append(ContainerInfo(host, name, cid))

    def start_new_containers(self):
        """Start docker containers for this deploy."""

        if self.env.BRANCH == 'production':
            # for production we start up many instances, spread across the
            # hosts
            host_iter = itertools.cycle(self.env.docker_hosts())
            for i in range(int(self.env.PRODUCTION_NUM_INSTANCES)):
                host = host_iter.next()
                self.start_app_container(host, str(i + 1))
        elif self.env.BRANCH == 'staging':
            # for staging we start up 1 instance per host
            for i, host in enumerate(self.env.docker_hosts()):
                self.start_app_container(host, str(i + 1))
        else:
            # for preview branches we start 1 instance on the builder host.
            # Also we don't start up the workers
            self.start_app_container(BUILDER_DOCKER_HOST, 'preview')
            return

        self.start_worker_container(self.env.DOCKER_HOST_1, 'master-worker',
                                    'master_worker')
        self.start_worker_container(self.env.DOCKER_HOST_2, 'feed-worker',
                                    'feed_worker')

    def find_old_containers(self):
        """Find containers started by previous deploys.

        Returns a list of (host, container_id) tuples.
        """
        old_containers = []
        hosts_to_search = [BUILDER_DOCKER_HOST] + self.env.docker_hosts()
        for host in hosts_to_search:
            for container in self.docker.get_containers(host):
                try:
                    name = container['Names'][0]
                except KeyError:
                    # no name for this container; skip it rather than reusing
                    # a stale name from the previous iteration
                    continue
                if self.container_name_prefix_for_branch() in name:
                    cid = container['Id']
                    old_containers.append(ContainerInfo(host, name, cid))
        log("old app servers: {}", old_containers)
        return old_containers

    def shutdown_old_containers(self, old_containers):
        for container in old_containers:
            self.docker.run(container.host, 'kill', container.cid)
            self.containers_stopped.append(container)

    def remove_old_app_servers(self, old_containers):
        for container in old_containers:
            self.docker.run(container.host, 'rm', '-v', container.cid)

    def print_report(self):
        line_fmt = "{:<60} {:<60} {:<64}"
        log("------------- Containers Stopped ---------------")
        log(line_fmt, 'Host', 'Name', 'Container ID')
        for container in self.containers_stopped:
            log(line_fmt, *container)
        log("------------- Containers Started ---------------")
        log(line_fmt, 'Host', 'Name', 'Container ID')
        for container in self.containers_started:
            log(line_fmt, *container)
        log("------------- Shell Command Line ---------------")
        cmd_line = [
            'docker', 'run', '-it', '--rm',
        ] + self.app_params() + [self.image_name, 'shell']
        log_nostar(' '.join(cmd_line))

class Deploy(object):
    """Top-level manager for the deploy."""
    def run(self):
        self.setup()
        if not self.env.ROLLBACK_ID:
            self.image_builder.setup_images()
        if self.env.RESET_DB == 'true':
            if self.container_manager.building_preview():
                self.container_manager.run_app_command("reset_db")
            else:
                log("Not calling reset_db since we are not "
                    "building a preview.")
        if not self.env.ROLLBACK_ID:
            self.container_manager.run_app_command("build_media")
        self.start_and_stop_containers()
        if self.container_manager.building_preview():
            self.container_manager.run_app_command("setup_preview_site", argument=self.container_manager.app_hostname())
        self.container_manager.print_report()

    def build(self):
        self.setup(needs_migrations=False)
        self.image_builder.setup_images()
        self.container_manager.run_app_command("build_media")
        self.container_manager.print_report()

    def stop_old_containers(self):
        self.setup()
        old_containers = self.container_manager.find_old_containers()
        self.container_manager.shutdown_old_containers(old_containers)

    def setup(self, needs_migrations=True):
        self.cd_to_project_root()
        self.env = Environment(needs_migrations)
        commit_id = self.get_commit_id()
        self.image_builder = ImageBuilder(self.env, commit_id)
        self.container_manager = ContainerManager(
            self.env, commit_id, self.image_builder.image_name)

    def get_commit_id(self):
        if self.env.ROLLBACK_ID:
            log("Getting commit ID from ROLLBACK_ID")
            commit_id = self.env.ROLLBACK_ID
        else:
            cmd = ["git", "rev-parse", "HEAD"]
            commit_id = subprocess.check_output(cmd).strip()
        if not re.match('^[0-9a-f]{40}$', commit_id):
            raise ValueError("Invalid commit id: {}".format(commit_id))
        return commit_id

    def cd_to_project_root(self):
        project_root = os.path.abspath(
            os.path.dirname(os.path.dirname(__file__))
        )
        log('cd to {}', project_root)
        os.chdir(project_root)

    def start_and_stop_containers(self):
        old_containers = self.container_manager.find_old_containers()
        if self.env.MIGRATIONS == 'DONT_MIGRATE':
            self.container_manager.start_new_containers()
            time.sleep(30)
            self.container_manager.shutdown_old_containers(old_containers)
        elif self.env.MIGRATIONS == 'MIGRATE_WHILE_RUNNING_OLD_CODE':
            self.container_manager.run_app_command('migrate')
            self.container_manager.start_new_containers()
            time.sleep(30)
            self.container_manager.shutdown_old_containers(old_containers)
        elif self.env.MIGRATIONS == 'STOP_SERVERS_TO_MIGRATE':
            self.container_manager.shutdown_old_containers(old_containers)
            self.container_manager.run_app_command('migrate')
            self.container_manager.start_new_containers()
        else:
            raise ValueError("Unknown MIGRATIONS value: {}".format(
                self.env.MIGRATIONS))
        # give containers some time to shutdown before we remove them
        time.sleep(5)
        self.container_manager.remove_old_app_servers(old_containers)


class Cleanup(object):
    def run(self):
        docker_hosts = self.get_docker_hosts()
        self.docker = Docker()
        for host in [BUILDER_DOCKER_HOST] + docker_hosts:
            log("Host: {}", host)
            self.remove_stopped_containers(host)
            self.remove_unused_images(host)

    def get_docker_hosts(self):
        try:
            return os.environ['DOCKER_HOSTS'].split()
        except KeyError:
            log("DOCKER_HOSTS ENV variable missing")
            sys.exit(1)

    def remove_stopped_containers(self, host):
        log("checking for stoped containers")
        for container in self.docker.get_containers(host, status=['exited']):
            log("removing stopped container: {}", container['Id'])
            self.docker.run(host, 'rm', '-v', container['Id'])


    def remove_image(self, host, image):
        try:
            self.docker.run(host, 'rmi', image)
        except subprocess.CalledProcessError:
            # removing an image can fail if another container that uses the
            # image was started after we fetched the list.  Just print a
            # warning and continue.
            log("Warning: error removing image {}", image)

    def remove_unused_images(self, host):
        log("checking for unused images")
        container_ids = [
        ]

        used_images = collections.defaultdict(list)
        for container in self.docker.get_containers(host):
            cid = container['Id']
            details = self.docker.get_container_details(host, cid)
            used_images[details['Image']].append(details['Name'])

        for image_info in self.docker.get_images(host):
            image = image_info['Id']
            tags = [
                tag for tag in image_info['RepoTags']
                if tag != '<none>:<none>'
            ]
            if self.should_skip_image(image, tags):
                continue

            if image in used_images:
                log("Image {} in use {}", image, used_images[image])
            else:
                for tag in tags:
                    log("Untagging {}", tag)
                    self.remove_image(host, tag)
                if self.docker.image_exists(host, image):
                    log("removing unused image: {}", image)
                    self.remove_image(host, image)
                else:
                    log("image removed from untagging: {}", image)

    def should_skip_image(self, image, tags):
        for tag in tags:
            if tag.endswith(":latest"):
                log("skipping {} because of tag {}", image, tag)
                return True
        return False

def main(argv):
    try:
        try:
            command = argv[1].lower()
        except IndexError:
            command = 'deploy'
        if command == 'deploy':
            Deploy().run()
        elif command == 'stop-deploy':
            Deploy().stop_old_containers()
        elif command == 'build':
            Deploy().build()
        elif command == 'cleanup':
            Cleanup().run()
        else:
            log("Unknown command: {}", command)
            sys.exit(1)
    except Exception, e:
        sys.stderr.write("Error: {}\n{}".format(
            e, ''.join(traceback.format_exc())))
        sys.exit(1)

if __name__ == '__main__':
    main(sys.argv)
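
In Cleanup.remove_unused_images() above, collections.defaultdict(list) maps each image ID to the names of the containers still using it, so the later "image in used_images" test doubles as an in-use check. A sketch of that bookkeeping with fabricated container data (the dict layout mirrors the fields the code reads, not the full docker inspect output):

import collections

# Hypothetical container details for the containers running on one host.
containers = [
    {'Image': 'sha256:aaa', 'Name': '/app-amara-production-1'},
    {'Image': 'sha256:aaa', 'Name': '/app-amara-production-2'},
    {'Image': 'sha256:bbb', 'Name': '/feed-worker'},
]

used_images = collections.defaultdict(list)
for details in containers:
    used_images[details['Image']].append(details['Name'])

for image in ['sha256:aaa', 'sha256:ccc']:
    if image in used_images:          # membership test does not add a key
        print("Image", image, "in use by", used_images[image])
    else:
        print("Image", image, "is unused and can be removed")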